/* libs/pixelflinger/codeflinger/GGLAssembler.cpp
**
** Copyright 2006, The Android Open Source Project
**
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
**
** http://www.apache.org/licenses/LICENSE-2.0
**
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*/
#define LOG_TAG "GGLAssembler"
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <sys/types.h>
#include <cutils/log.h>
#include "GGLAssembler.h"
namespace android {
// ----------------------------------------------------------------------------
// Wraps `target` in the assembler proxy, configures the register allocator
// with the value returned by ARMAssemblerProxy::getCodegenArch(), and starts
// with optimization level 7.
GGLAssembler::GGLAssembler(ARMAssemblerInterface* target)
    : ARMAssemblerProxy(target),
      RegisterAllocator(ARMAssemblerProxy::getCodegenArch()),
      mOptLevel(7)
{
}
// Nothing to release explicitly here.
GGLAssembler::~GGLAssembler() {}
void GGLAssembler::prolog()
{
ARMAssemblerProxy::prolog();
}
void GGLAssembler::epilog(uint32_t touched)
{
ARMAssemblerProxy::epilog(touched);
}
void GGLAssembler::reset(int opt_level)
{
ARMAssemblerProxy::reset();
RegisterAllocator::reset();
mOptLevel = opt_level;
}
// ---------------------------------------------------------------------------
// Generates the scanline routine for `needs`.
//
// scanline_core() is attempted starting at the current optimization level and
// retried with decreasing levels (down to 0) until it returns 0.
//
// Returns -1 (after logging and disassembling) when every attempt failed,
// otherwise the result of generate().
int GGLAssembler::scanline(const needs_t& needs, context_t const* c)
{
    int err = 0;
    for (int opt_level = mOptLevel; opt_level >= 0; --opt_level) {
        reset(opt_level);
        err = scanline_core(needs, c);
        if (err == 0)
            break;
    }

    // XXX: in theory, pcForLabel is not valid before generate()
    uint32_t* fragment_start_pc = pcForLabel("fragment_loop");
    uint32_t* fragment_end_pc = pcForLabel("epilog");
    const int per_fragment_ops = int(fragment_end_pc - fragment_start_pc);

    // build a name for our pipeline
    // The worst-case expansion of this format (an 11-character
    // per_fragment_ops) fills the buffer exactly, so use the bounded variant
    // to guarantee we can never write past the end of `name`.
    char name[64];
    snprintf(name, sizeof(name),
            "scanline__%08X:%08X_%08X_%08X [%3d ipp]",
            needs.p, needs.n, needs.t[0], needs.t[1], per_fragment_ops);

    if (err) {
        ALOGE("Error while generating ""%s""\n", name);
        disassemble(name);
        return -1;
    }

    return generate(name);
}
// Emits the scanline routine for `needs`: decodes the needs into member
// state, emits the prolog, the per-fragment loop (label "fragment_loop"), the
// epilog (label "epilog") and, when an alpha or depth test is enabled, the
// out-of-line discard paths.
//
// Returns registerFile().status(); the caller (scanline()) treats any
// non-zero value as a failure.
int GGLAssembler::scanline_core(const needs_t& needs, context_t const* c)
{
// NOTE(review): `duration` is never read in this function.
int64_t duration = ggl_system_time();
mBlendFactorCached = 0;
mBlending = 0;
mMasking = 0;
// Decode the per-pipeline flags from the needs words.
mAA = GGL_READ_NEEDS(P_AA, needs.p);
mDithering = GGL_READ_NEEDS(P_DITHER, needs.p);
mAlphaTest = GGL_READ_NEEDS(P_ALPHA_TEST, needs.p) + GGL_NEVER;
mDepthTest = GGL_READ_NEEDS(P_DEPTH_TEST, needs.p) + GGL_NEVER;
mFog = GGL_READ_NEEDS(P_FOG, needs.p) != 0;
mSmooth = GGL_READ_NEEDS(SHADE, needs.n) != 0;
mBuilderContext.needs = needs;
mBuilderContext.c = c;
mBuilderContext.Rctx = reserveReg(R0); // context always in R0
mCbFormat = c->formats[ GGL_READ_NEEDS(CB_FORMAT, needs.n) ];
// ------------------------------------------------------------------------
decodeLogicOpNeeds(needs);
decodeTMUNeeds(needs, c);
mBlendSrc = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRC, needs.n));
mBlendDst = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DST, needs.n));
mBlendSrcA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_SRCA, needs.n));
mBlendDstA = ggl_needs_to_blendfactor(GGL_READ_NEEDS(BLEND_DSTA, needs.n));
// When the color-buffer format's alpha component has h == 0, every blend
// factor that refers to destination alpha is replaced by GGL_ONE.
if (!mCbFormat.c[GGLFormat::ALPHA].h) {
if ((mBlendSrc == GGL_ONE_MINUS_DST_ALPHA) ||
(mBlendSrc == GGL_DST_ALPHA)) {
mBlendSrc = GGL_ONE;
}
if ((mBlendSrcA == GGL_ONE_MINUS_DST_ALPHA) ||
(mBlendSrcA == GGL_DST_ALPHA)) {
mBlendSrcA = GGL_ONE;
}
if ((mBlendDst == GGL_ONE_MINUS_DST_ALPHA) ||
(mBlendDst == GGL_DST_ALPHA)) {
mBlendDst = GGL_ONE;
}
if ((mBlendDstA == GGL_ONE_MINUS_DST_ALPHA) ||
(mBlendDstA == GGL_DST_ALPHA)) {
mBlendDstA = GGL_ONE;
}
}
// if we need the framebuffer, read it now
const int blending = blending_codes(mBlendSrc, mBlendDst) |
blending_codes(mBlendSrcA, mBlendDstA);
// XXX: handle special cases, destination not modified...
// NOTE(review): both branches below are currently empty placeholders.
if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
(mBlendDst==GGL_ONE) && (mBlendDstA==GGL_ONE)) {
// Destination unmodified (beware of logic ops)
} else if ((mBlendSrc==GGL_ZERO) && (mBlendSrcA==GGL_ZERO) &&
(mBlendDst==GGL_ZERO) && (mBlendDstA==GGL_ZERO)) {
// Destination is zero (beware of logic ops)
}
// Fill in the per-component info (mInfo[i]) and accumulate the blending,
// masking and "present in the framebuffer format" bit masks.
int fbComponents = 0;
const int masking = GGL_READ_NEEDS(MASK_ARGB, needs.n);
for (int i=0 ; i<4 ; i++) {
const int mask = 1<<i;
component_info_t& info = mInfo[i];
int fs = i==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
int fd = i==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
// For the alpha component, GGL_SRC_ALPHA_SATURATE is handled as GGL_ONE.
if (fs==GGL_SRC_ALPHA_SATURATE && i==GGLFormat::ALPHA)
fs = GGL_ONE;
info.masked = !!(masking & mask);
info.inDest = !info.masked && mCbFormat.c[i].h &&
((mLogicOp & LOGIC_OP_SRC) || (!mLogicOp));
// For formats whose `components` value is >= GGL_LUMINANCE, green and blue
// are never written to the destination.
if (mCbFormat.components >= GGL_LUMINANCE &&
(i==GGLFormat::GREEN || i==GGLFormat::BLUE)) {
info.inDest = false;
}
info.needed = (i==GGLFormat::ALPHA) &&
(isAlphaSourceNeeded() || mAlphaTest != GGL_ALWAYS);
info.replaced = !!(mTextureMachine.replaced & mask);
info.iterated = (!info.replaced && (info.inDest || info.needed));
info.smooth = mSmooth && info.iterated;
info.fog = mFog && info.inDest && (i != GGLFormat::ALPHA);
info.blend = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
mBlending |= (info.blend ? mask : 0);
mMasking |= (mCbFormat.c[i].h && info.masked) ? mask : 0;
fbComponents |= mCbFormat.c[i].h ? mask : 0;
}
// Every component that exists in the color buffer is masked: nothing will be
// stored, so dithering is pointless.
mAllMasked = (mMasking == fbComponents);
if (mAllMasked) {
mDithering = 0;
}
fragment_parts_t parts;
// ------------------------------------------------------------------------
prolog();
// ------------------------------------------------------------------------
build_scanline_prolog(parts, needs);
if (registerFile().status())
return registerFile().status();
// ------------------------------------------------------------------------
label("fragment_loop");
// ------------------------------------------------------------------------
{
Scratch regs(registerFile());
if (mDithering) {
// update the dither index.
// (rotate, add 1 in the top bits, rotate back)
MOV(AL, 0, parts.count.reg,
reg_imm(parts.count.reg, ROR, GGL_DITHER_ORDER_SHIFT));
ADD(AL, 0, parts.count.reg, parts.count.reg,
imm( 1 << (32 - GGL_DITHER_ORDER_SHIFT)));
MOV(AL, 0, parts.count.reg,
reg_imm(parts.count.reg, ROR, 32 - GGL_DITHER_ORDER_SHIFT));
}
// XXX: could we do an early alpha-test here in some cases?
// It would probably be used only with smooth-alpha and no texture
// (or no alpha component in the texture).
// Early z-test
if (mAlphaTest==GGL_ALWAYS) {
build_depth_test(parts, Z_TEST|Z_WRITE);
} else {
// we cannot do the z-write here, because
// it might be killed by the alpha-test later
build_depth_test(parts, Z_TEST);
}
{ // texture coordinates
Scratch scratches(registerFile());
// texel generation
build_textures(parts, regs);
if (registerFile().status())
return registerFile().status();
}
if ((blending & (FACTOR_DST|BLEND_DST)) ||
(mMasking && !mAllMasked) ||
(mLogicOp & LOGIC_OP_DST))
{
// blending / logic_op / masking need the framebuffer
mDstPixel.setTo(regs.obtain(), &mCbFormat);
// load the framebuffer pixel
comment("fetch color-buffer");
load(parts.cbPtr, mDstPixel);
}
if (registerFile().status())
return registerFile().status();
pixel_t pixel;
int directTex = mTextureMachine.directTexture;
if (directTex | parts.packed) {
// note: we can't have both here
// iterated color or direct texture
pixel = directTex ? parts.texel[directTex-1] : parts.iterated;
pixel.flags &= ~CORRUPTIBLE;
} else {
if (mDithering) {
// Load the dither value for this pixel: a byte from the context's
// ditherMatrix, indexed by the low bits of parts.count.reg.
const int ctxtReg = mBuilderContext.Rctx;
const int mask = GGL_DITHER_SIZE-1;
parts.dither = reg_t(regs.obtain());
AND(AL, 0, parts.dither.reg, parts.count.reg, imm(mask));
ADD(AL, 0, parts.dither.reg, parts.dither.reg, ctxtReg);
LDRB(AL, parts.dither.reg, parts.dither.reg,
immed12_pre(GGL_OFFSETOF(ditherMatrix)));
}
// allocate a register for the resulting pixel
pixel.setTo(regs.obtain(), &mCbFormat, FIRST);
// Alpha is built first; the alpha test (if any) is emitted as part of it.
build_component(pixel, parts, GGLFormat::ALPHA, regs);
if (mAlphaTest!=GGL_ALWAYS) {
// only handle the z-write part here. We know z-test
// was successful, as well as alpha-test.
build_depth_test(parts, Z_WRITE);
}
build_component(pixel, parts, GGLFormat::RED, regs);
build_component(pixel, parts, GGLFormat::GREEN, regs);
build_component(pixel, parts, GGLFormat::BLUE, regs);
pixel.flags |= CORRUPTIBLE;
}
if (registerFile().status())
return registerFile().status();
if (pixel.reg == -1) {
// be defensive here. if we're here it's probably
// that this whole fragment is a no-op.
pixel = mDstPixel;
}
if (!mAllMasked) {
// logic operation
build_logic_op(pixel, regs);
// masking
build_masking(pixel, regs);
comment("store");
store(parts.cbPtr, pixel, WRITE_BACK);
}
}
if (registerFile().status())
return registerFile().status();
// update the iterated color...
// NOTE(review): skipped when both reload bits are set (reload == 3);
// presumably the update is handled elsewhere in that case -- confirm.
if (parts.reload != 3) {
build_smooth_shade(parts);
}
// update iterated z
build_iterate_z(parts);
// update iterated fog
build_iterate_f(parts);
// The pixel counter lives in the upper 16 bits of parts.count.reg (see
// build_scanline_prolog): decrement it and loop while the result is >= 0.
SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
B(PL, "fragment_loop");
label("epilog");
epilog(registerFile().touched());
// Out-of-line discard paths: they update the iterators for a fragment that
// was rejected by the depth test ("discard_before_textures") or by the
// alpha test ("discard_after_textures"), then rejoin the main loop.
if ((mAlphaTest!=GGL_ALWAYS) || (mDepthTest!=GGL_ALWAYS)) {
if (mDepthTest!=GGL_ALWAYS) {
label("discard_before_textures");
build_iterate_texture_coordinates(parts);
}
label("discard_after_textures");
build_smooth_shade(parts);
build_iterate_z(parts);
build_iterate_f(parts);
if (!mAllMasked) {
// No store happened for this fragment, so advance the color-buffer
// pointer by hand (cbPtr.size is in bits, hence the >>3).
ADD(AL, 0, parts.cbPtr.reg, parts.cbPtr.reg, imm(parts.cbPtr.size>>3));
}
SUB(AL, S, parts.count.reg, parts.count.reg, imm(1<<16));
B(PL, "fragment_loop");
epilog(registerFile().touched());
}
return registerFile().status();
}
// ---------------------------------------------------------------------------
// Emits the code that runs once per scanline, before the fragment loop:
// computes the packed pixel counter (parts.count), the color-buffer pointer
// (parts.cbPtr), the initial fog and Z values, the texture coordinates, the
// iterated color and, when anti-aliasing is on, the coverage pointer
// (parts.covPtr).
//
// Registers obtained with obtainReg() stay allocated after this function
// returns; those taken from a Scratch are released when it goes out of scope.
void GGLAssembler::build_scanline_prolog(
fragment_parts_t& parts, const needs_t& needs)
{
Scratch scratches(registerFile());
// NOTE(review): Rctx is not referenced by name below.
int Rctx = mBuilderContext.Rctx;
// compute count
comment("compute ct (# of pixels to process)");
parts.count.setTo(obtainReg());
int Rx = scratches.obtain();
int Ry = scratches.obtain();
CONTEXT_LOAD(Rx, iterators.xl);
CONTEXT_LOAD(parts.count.reg, iterators.xr);
CONTEXT_LOAD(Ry, iterators.y);
// parts.count = iterators.xr - Rx - 1
SUB(AL, 0, parts.count.reg, parts.count.reg, Rx);
SUB(AL, 0, parts.count.reg, parts.count.reg, imm(1));
if (mDithering) {
// parts.count.reg = 0xNNNNXXDD
// NNNN = count-1
// DD = dither offset
// XX = 0xxxxxxx (x = garbage)
Scratch scratches(registerFile());
int tx = scratches.obtain();
int ty = scratches.obtain();
AND(AL, 0, tx, Rx, imm(GGL_DITHER_MASK));
AND(AL, 0, ty, Ry, imm(GGL_DITHER_MASK));
ADD(AL, 0, tx, tx, reg_imm(ty, LSL, GGL_DITHER_ORDER_SHIFT));
ORR(AL, 0, parts.count.reg, tx, reg_imm(parts.count.reg, LSL, 16));
} else {
// parts.count.reg = 0xNNNN0000
// NNNN = count-1
MOV(AL, 0, parts.count.reg, reg_imm(parts.count.reg, LSL, 16));
}
if (!mAllMasked) {
// compute dst ptr
comment("compute color-buffer pointer");
const int cb_bits = mCbFormat.size*8;
int Rs = scratches.obtain();
parts.cbPtr.setTo(obtainReg(), cb_bits);
CONTEXT_LOAD(Rs, state.buffers.color.stride);
CONTEXT_LOAD(parts.cbPtr.reg, state.buffers.color.data);
SMLABB(AL, Rs, Ry, Rs, Rx); // Rs = Rx + Ry*Rs
// cbPtr = data + Rs scaled by the pixel size
base_offset(parts.cbPtr, parts.cbPtr, Rs);
scratches.recycle(Rs);
}
// init fog
const int need_fog = GGL_READ_NEEDS(P_FOG, needs.p);
if (need_fog) {
comment("compute initial fog coordinate");
Scratch scratches(registerFile());
int dfdx = scratches.obtain();
int ydfdy = scratches.obtain();
// f shares the register holding ydfdy; the MLA below overwrites it.
int f = ydfdy;
CONTEXT_LOAD(dfdx, generated_vars.dfdx);
CONTEXT_LOAD(ydfdy, iterators.ydfdy);
MLA(AL, 0, f, Rx, dfdx, ydfdy);
CONTEXT_STORE(f, generated_vars.f);
}
// init Z coordinate
if ((mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p)) {
parts.z = reg_t(obtainReg());
comment("compute initial Z coordinate");
Scratch scratches(registerFile());
int dzdx = scratches.obtain();
// ydzdy is loaded straight into parts.z.reg; the MLA below overwrites it.
int ydzdy = parts.z.reg;
CONTEXT_LOAD(dzdx, generated_vars.dzdx); // 1.31 fixed-point
CONTEXT_LOAD(ydzdy, iterators.ydzdy); // 1.31 fixed-point
MLA(AL, 0, parts.z.reg, Rx, dzdx, ydzdy);
// we're going to index zbase of parts.count
// zbase = base + (xl + (count.reg >> 16) + stride*y)*2
// (build_depth_test later subtracts the remaining count from it)
int Rs = dzdx;
int zbase = scratches.obtain();
CONTEXT_LOAD(Rs, state.buffers.depth.stride);
CONTEXT_LOAD(zbase, state.buffers.depth.data);
SMLABB(AL, Rs, Ry, Rs, Rx);
ADD(AL, 0, Rs, Rs, reg_imm(parts.count.reg, LSR, 16));
ADD(AL, 0, zbase, zbase, reg_imm(Rs, LSL, 1));
CONTEXT_STORE(zbase, generated_vars.zbase);
}
// init texture coordinates
init_textures(parts.coords, reg_t(Rx), reg_t(Ry));
// Ry is not used past this point.
scratches.recycle(Ry);
// iterated color
init_iterated_color(parts, reg_t(Rx));
// init coverage factor application (anti-aliasing)
if (mAA) {
// covPtr = coverage buffer + x*2
parts.covPtr.setTo(obtainReg(), 16);
CONTEXT_LOAD(parts.covPtr.reg, state.buffers.coverage);
ADD(AL, 0, parts.covPtr.reg, parts.covPtr.reg, reg_imm(Rx, LSL, 1));
}
}
// ---------------------------------------------------------------------------
// Emits the code producing one color component of `pixel`: the incoming
// value is built first and, when the component is written to the
// destination, it is blended and then packed into `pixel` (with dithering).
void GGLAssembler::build_component( pixel_t& pixel,
        const fragment_parts_t& parts,
        int component,
        Scratch& regs)
{
    static char const * kComponentNames[] = {
        "alpha", "red", "green", "blue"
    };
    comment(kComponentNames[component]);

    // registers local to this component
    Scratch localRegs(registerFile());
    const int dstBits = pixel.component_size(component);

    component_t incoming(-1);
    build_incoming_component(incoming, dstBits,
            parts, component, localRegs, regs);

    if (!mInfo[component].inDest) {
        return;
    }
    // blend against the destination pixel...
    build_blending(incoming, mDstPixel, component, localRegs);
    // ...then downshift the component and merge it into the pixel
    downshift(pixel, component, incoming, parts.dither);
}
// Emits the code that produces the incoming (source) value of `component`
// into `temp`, before blending.
//
//  temp         receives the resulting component
//  dst_size     size in bits of this component in the destination pixel
//  parts        per-fragment registers (iterated color, texels, ...)
//  component    GGLFormat component index
//  scratches    register pool local to this component
//  global_regs  register pool that outlives this component; used when the
//               alpha value must be kept as the blending alpha source
//
// For the alpha component this is also where coverage application and the
// alpha test are emitted, and where mAlphaSource is set up.
void GGLAssembler::build_incoming_component(
component_t& temp,
int dst_size,
const fragment_parts_t& parts,
int component,
Scratch& scratches,
Scratch& global_regs)
{
const uint32_t component_mask = 1<<component;
// Figure out what we need for the blending stage...
int fs = component==GGLFormat::ALPHA ? mBlendSrcA : mBlendSrc;
int fd = component==GGLFormat::ALPHA ? mBlendDstA : mBlendDst;
if (fs==GGL_SRC_ALPHA_SATURATE && component==GGLFormat::ALPHA) {
fs = GGL_ONE;
}
// Figure out what we need to extract and for what reason
const int blending = blending_codes(fs, fd);
// Are we actually going to blend?
const int need_blending = (fs != int(GGL_ONE)) || (fd > int(GGL_ZERO));
// expand the source if the destination has more bits
// NOTE(review): this loop stops at GGL_TEXTURE_UNIT_COUNT-1 while the loop
// in the "extraction not needed" branch below covers every unit -- confirm
// that skipping the last unit here is intended.
int need_expander = false;
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT-1 ; i++) {
texture_unit_t& tmu = mTextureMachine.tmu[i];
if ((tmu.format_idx) &&
(parts.texel[i].component_size(component) < dst_size)) {
need_expander = true;
}
}
// do we need to extract this component?
const bool multiTexture = mTextureMachine.activeUnits > 1;
const int blend_needs_alpha_source = (component==GGLFormat::ALPHA) &&
(isAlphaSourceNeeded());
int need_extract = mInfo[component].needed;
if (mInfo[component].inDest)
{
need_extract |= ((need_blending ?
(blending & (BLEND_SRC|FACTOR_SRC)) : need_expander));
need_extract |= (mTextureMachine.mask != mTextureMachine.replaced);
need_extract |= mInfo[component].smooth;
need_extract |= mInfo[component].fog;
need_extract |= mDithering;
need_extract |= multiTexture;
}
if (need_extract) {
// When the value is going to be kept as the alpha source, allocate from
// the longer-lived pool.
Scratch& regs = blend_needs_alpha_source ? global_regs : scratches;
component_t fragment;
// iterated color
build_iterated_color(fragment, parts, component, regs);
// texture environment (decal, modulate, replace)
build_texture_environment(fragment, parts, component, regs);
// expand the source if the destination has more bits
if (need_expander && (fragment.size() < dst_size)) {
// we're here only if we fetched a texel
// (so we know for sure fragment is CORRUPTIBLE)
expand(fragment, fragment, dst_size);
}
// We have a few specific things to do for the alpha-channel
if ((component==GGLFormat::ALPHA) &&
(mInfo[component].needed || fragment.size()<dst_size))
{
// convert to integer_t first and make sure
// we don't corrupt a needed register
if (fragment.l) {
component_t incoming(fragment);
modify(fragment, regs);
MOV(AL, 0, fragment.reg, reg_imm(incoming.reg, LSR, incoming.l));
fragment.h -= fragment.l;
fragment.l = 0;
}
// coverage factor application
build_coverage_application(fragment, parts, regs);
// alpha-test
build_alpha_test(fragment, parts);
if (blend_needs_alpha_source) {
// We keep only 8 bits for the blending stage
const int shift = fragment.h <= 8 ? 0 : fragment.h-8;
if (fragment.flags & CORRUPTIBLE) {
// The fragment register becomes the alpha source, so it must no
// longer be treated as corruptible.
fragment.flags &= ~CORRUPTIBLE;
mAlphaSource.setTo(fragment.reg,
fragment.size(), fragment.flags);
if (shift) {
MOV(AL, 0, mAlphaSource.reg,
reg_imm(mAlphaSource.reg, LSR, shift));
}
} else {
// XXX: it would better to do this in build_blend_factor()
// so we can avoid the extra MOV below.
mAlphaSource.setTo(regs.obtain(),
fragment.size(), CORRUPTIBLE);
if (shift) {
MOV(AL, 0, mAlphaSource.reg,
reg_imm(fragment.reg, LSR, shift));
} else {
MOV(AL, 0, mAlphaSource.reg, fragment.reg);
}
}
// account for the bits dropped by the shift above
mAlphaSource.s -= shift;
}
}
// fog...
build_fog( fragment, component, regs );
temp = fragment;
} else {
if (mInfo[component].inDest) {
// extraction not needed and replace
// we just select the right component
if ((mTextureMachine.replaced & component_mask) == 0) {
// component wasn't replaced, so use it!
temp = component_t(parts.iterated, component);
}
// A later texture unit that provides this component overrides the
// selection made so far.
for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) {
const texture_unit_t& tmu = mTextureMachine.tmu[i];
if ((tmu.mask & component_mask) &&
((tmu.replaced & component_mask) == 0)) {
temp = component_t(parts.texel[i], component);
}
}
}
}
}
// Tells whether the color blend factors (mBlendSrc / mBlendDst) refer to the
// source alpha.
bool GGLAssembler::isAlphaSourceNeeded() const
{
    // XXX: also needed for alpha-test
    const int srcFactor = mBlendSrc;
    if (srcFactor == GGL_SRC_ALPHA_SATURATE ||
        srcFactor == GGL_SRC_ALPHA ||
        srcFactor == GGL_ONE_MINUS_SRC_ALPHA) {
        return true;
    }
    const int dstFactor = mBlendDst;
    return dstFactor == GGL_SRC_ALPHA ||
           dstFactor == GGL_ONE_MINUS_SRC_ALPHA;
}
// ---------------------------------------------------------------------------
// Emits the per-fragment update of the iterated colors (c += dx) for every
// component flagged as iterated. Does nothing unless smooth shading is on
// and the iterated color is not packed.
//
// parts.reload bit 0: the color is reloaded from / stored back to the context
// parts.reload bit 1: the delta is reloaded from the context
void GGLAssembler::build_smooth_shade(const fragment_parts_t& parts)
{
    if (!mSmooth || parts.iterated_packed) {
        return;
    }

    // update the iterated color in a pipelined way...
    comment("update iterated color");
    Scratch scratches(registerFile());

    const bool reloadColor = (parts.reload & 1) != 0;
    const bool reloadDelta = (parts.reload & 2) != 0;

    for (int ch = 0; ch < 4; ++ch) {
        if (mInfo[ch].iterated) {
            int color = parts.argb[ch].reg;
            int delta = parts.argb_dx[ch].reg;

            if (reloadColor) {
                color = scratches.obtain();
                CONTEXT_LOAD(color, generated_vars.argb[ch].c);
            }
            if (reloadDelta) {
                delta = scratches.obtain();
                CONTEXT_LOAD(delta, generated_vars.argb[ch].dx);
            }

            // mSmooth is known to be set here (see the guard above)
            ADD(AL, 0, color, color, delta);

            if (reloadColor) {
                CONTEXT_STORE(color, generated_vars.argb[ch].c);
                scratches.recycle(color);
            }
            if (reloadDelta) {
                scratches.recycle(delta);
            }
        }
    }
}
// ---------------------------------------------------------------------------
// When anti-aliasing is enabled, emits the code that multiplies `fragment`
// by the next coverage value, read through parts.covPtr (post-incremented
// by 2).
// On entry fragment.l is guaranteed to be 0.
void GGLAssembler::build_coverage_application(component_t& fragment,
        const fragment_parts_t& parts, Scratch& regs)
{
    if (!mAA) {
        return;
    }

    // coverages are 1.15 fixed-point numbers
    comment("coverage application");

    const component_t source(fragment);
    modify(fragment, regs);

    Scratch scratches(registerFile());
    const int coverage = scratches.obtain();
    LDRH(AL, coverage, parts.covPtr.reg, immed8_post(2));

    if (fragment.h <= 31) {
        // pre-shift left by one, then multiply
        MOV(AL, 0, fragment.reg, reg_imm(source.reg, LSL, 1));
        SMULWB(AL, fragment.reg, fragment.reg, coverage);
    } else {
        // no room for the pre-shift: drop one bit of the result instead
        fragment.h--;
        SMULWB(AL, fragment.reg, source.reg, coverage);
    }
}
// ---------------------------------------------------------------------------
// Emits the alpha test: compares `fragment` with the reference value stored
// in the context (scaled down to the fragment's size) and branches to
// "discard_after_textures" when the test fails. Emits nothing when the test
// is GGL_ALWAYS.
void GGLAssembler::build_alpha_test(component_t& fragment,
        const fragment_parts_t& parts)
{
    if (mAlphaTest == GGL_ALWAYS) {
        return;
    }

    comment("Alpha Test");
    Scratch scratches(registerFile());
    const int refReg = scratches.obtain();
    const int shift = GGL_COLOR_BITS-fragment.size();
    CONTEXT_LOAD(refReg, state.alpha_test.ref);

    if (shift == 0) {
        CMP(AL, fragment.reg, refReg);
    } else {
        CMP(AL, fragment.reg, reg_imm(refReg, LSR, shift));
    }

    // condition under which the fragment PASSES the test
    int passCond = NV;
    switch (mAlphaTest) {
        case GGL_NEVER:
            passCond = NV;
            break;
        case GGL_LESS:
            passCond = LT;
            break;
        case GGL_EQUAL:
            passCond = EQ;
            break;
        case GGL_LEQUAL:
            passCond = LS;
            break;
        case GGL_GREATER:
            passCond = HI;
            break;
        case GGL_NOTEQUAL:
            passCond = NE;
            break;
        case GGL_GEQUAL:
            passCond = HS;
            break;
    }

    // branch away on the opposite condition
    B(passCond^1, "discard_after_textures");
}
// ---------------------------------------------------------------------------
// Emits the depth test and/or the depth write for the current fragment.
//
//  parts  per-fragment registers (uses parts.z and parts.count)
//  mask   any combination of Z_TEST and Z_WRITE selecting what to emit
//
// A failed test branches to "discard_before_textures". Nothing is emitted
// when the depth test is GGL_ALWAYS and the P_MASK_Z need is clear.
void GGLAssembler::build_depth_test(
const fragment_parts_t& parts, uint32_t mask)
{
mask &= Z_TEST|Z_WRITE;
const needs_t& needs = mBuilderContext.needs;
const int zmask = GGL_READ_NEEDS(P_MASK_Z, needs.p);
Scratch scratches(registerFile());
if (mDepthTest != GGL_ALWAYS || zmask) {
// ic is the condition under which the fragment passes (as seen after
// "CMP depth, z"); cc is its inverse, used to branch to the discard path.
int cc=AL, ic=AL;
switch (mDepthTest) {
case GGL_LESS: ic = HI; break;
case GGL_EQUAL: ic = EQ; break;
case GGL_LEQUAL: ic = HS; break;
case GGL_GREATER: ic = LT; break;
case GGL_NOTEQUAL: ic = NE; break;
case GGL_GEQUAL: ic = LS; break;
case GGL_NEVER:
// this never happens, because it's taken care of when
// computing the needs. but we keep it for completeness.
comment("Depth Test (NEVER)");
B(AL, "discard_before_textures");
return;
case GGL_ALWAYS:
// we're here because zmask is enabled
mask &= ~Z_TEST; // test always passes.
break;
}
// inverse the condition
cc = ic^1;
// Depth writes are only emitted when the P_MASK_Z need is set.
if ((mask & Z_WRITE) && !zmask) {
mask &= ~Z_WRITE;
}
if (!mask)
return;
comment("Depth Test");
int zbase = scratches.obtain();
int depth = scratches.obtain();
int z = parts.z.reg;
CONTEXT_LOAD(zbase, generated_vars.zbase); // stall
SUB(AL, 0, zbase, zbase, reg_imm(parts.count.reg, LSR, 15));
// above does zbase = zbase - ((count >> 16) << 1)
// (this relies on bit 15 of parts.count.reg being 0, see the layout
// described in build_scanline_prolog)
if (mask & Z_TEST) {
LDRH(AL, depth, zbase); // stall
CMP(AL, depth, reg_imm(z, LSR, 16));
B(cc, "discard_before_textures");
}
if (mask & Z_WRITE) {
if (mask == Z_WRITE) {
// only z-write asked, the condition is meaningless: always store
ic = AL;
}
MOV(AL, 0, depth, reg_imm(z, LSR, 16));
STRH(ic, depth, zbase);
}
}
}
// Emits "z += dzdx" when depth is in use (depth test other than GGL_ALWAYS,
// or the P_MASK_Z need is set).
void GGLAssembler::build_iterate_z(const fragment_parts_t& parts)
{
    const needs_t& needs = mBuilderContext.needs;
    const bool depthInUse =
            (mDepthTest != GGL_ALWAYS) || GGL_READ_NEEDS(P_MASK_Z, needs.p);
    if (!depthInUse) {
        return;
    }
    Scratch scratches(registerFile());
    const int step = scratches.obtain();
    CONTEXT_LOAD(step, generated_vars.dzdx);    // stall
    ADD(AL, 0, parts.z.reg, parts.z.reg, step);
}
// Emits "f += dfdx" on the fog coordinate kept in the context, when the
// P_FOG need is set.
void GGLAssembler::build_iterate_f(const fragment_parts_t& parts)
{
    const needs_t& needs = mBuilderContext.needs;
    if (!GGL_READ_NEEDS(P_FOG, needs.p)) {
        return;
    }
    Scratch scratches(registerFile());
    const int step = scratches.obtain();
    const int fog = scratches.obtain();
    CONTEXT_LOAD(fog, generated_vars.f);
    CONTEXT_LOAD(step, generated_vars.dfdx);    // stall
    ADD(AL, 0, fog, fog, step);
    CONTEXT_STORE(fog, generated_vars.f);
}
// ---------------------------------------------------------------------------
// Emits the logic operation combining `pixel` (source) with mDstPixel
// (destination). GGL_COPY emits nothing. If `pixel` is not corruptible, a
// fresh register is obtained from `regs` to hold the result.
void GGLAssembler::build_logic_op(pixel_t& pixel, Scratch& regs)
{
    const needs_t& needs = mBuilderContext.needs;
    const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR;
    if (opcode == GGL_COPY) {
        return;
    }

    comment("logic operation");

    // remember the source register before the result register may change
    const int src = pixel.reg;
    if ((pixel.flags & CORRUPTIBLE) == 0) {
        pixel.reg = regs.obtain();
        pixel.flags |= CORRUPTIBLE;
    }
    const int dst = mDstPixel.reg;
    const int out = pixel.reg;

    // Several operations are "some op, then complement the result".
    bool complement = false;
    switch (opcode) {
        case GGL_CLEAR:
            MOV(AL, 0, out, imm(0));
            break;
        case GGL_SET:
            MVN(AL, 0, out, imm(0));
            break;
        case GGL_COPY:
            break;
        case GGL_COPY_INVERTED:
            MVN(AL, 0, out, src);
            break;
        case GGL_NOOP:
            MOV(AL, 0, out, dst);
            break;
        case GGL_INVERT:
            MVN(AL, 0, out, dst);
            break;
        case GGL_AND:
            AND(AL, 0, out, src, dst);
            break;
        case GGL_NAND:
            AND(AL, 0, out, src, dst);
            complement = true;
            break;
        case GGL_OR:
            ORR(AL, 0, out, src, dst);
            break;
        case GGL_NOR:
            ORR(AL, 0, out, src, dst);
            complement = true;
            break;
        case GGL_XOR:
            EOR(AL, 0, out, src, dst);
            break;
        case GGL_EQUIV:
            EOR(AL, 0, out, src, dst);
            complement = true;
            break;
        case GGL_AND_REVERSE:       // s & ~d
            BIC(AL, 0, out, src, dst);
            break;
        case GGL_AND_INVERTED:      // ~s & d
            BIC(AL, 0, out, dst, src);
            break;
        case GGL_OR_REVERSE:        // s | ~d == ~(~s & d)
            BIC(AL, 0, out, dst, src);
            complement = true;
            break;
        case GGL_OR_INVERTED:       // ~s | d == ~(s & ~d)
            BIC(AL, 0, out, src, dst);
            complement = true;
            break;
    }
    if (complement) {
        MVN(AL, 0, out, out);
    }
}
// ---------------------------------------------------------------------------
// Returns the position (always even) of the lowest 2-bit group of `val` that
// contains a set bit, or 32 when `val` is zero.
//
// The scan is bounded so that a zero input terminates instead of shifting
// by 32 or more, which is undefined behaviour.
static uint32_t find_bottom(uint32_t val)
{
    uint32_t i = 0;
    while (i < 32 && !(val & (3u << i)))
        i += 2;
    return i;
}
// Rotates `val` right, two bits at a time, until its two lowest bits are not
// both zero and its six highest bits are all zero. `rot` receives the total
// rotation applied. If no such position exists after a full turn, `val` is
// back to its original value and `rot` is reported as 0.
static void normalize(uint32_t& val, uint32_t& rot)
{
    rot = 0;
    for (;;) {
        const bool lowPairSet = (val & 3) != 0;
        const bool topBitsClear = (val & 0xFC000000) == 0;
        if (lowPairSet && topBitsClear) {
            return;
        }
        // rotate right by two
        const uint32_t carried = (val & 3) << 30;
        val = (val >> 2) | carried;
        rot += 2;
        if (rot == 32) {
            rot = 0;
            return;
        }
    }
}
// Emits "d = s & mask", where only the low `bits` bits of `mask` matter.
//
//  d, s   destination and source registers
//  mask   bits to keep
//  bits   width of the value held in `s` (>= 32 means the full register)
//
// The mask is split into chunks of at most 8 bits and one AND (or BIC when
// the complemented mask is used) is emitted per chunk. When the mask keeps
// everything, a plain MOV (or nothing, if d == s) is emitted instead.
void GGLAssembler::build_and_immediate(int d, int s, uint32_t mask, int bits)
{
    uint32_t rot;
    uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1;
    mask &= size;

    if (mask == size) {
        // every bit is kept
        if (d != s)
            MOV( AL, 0, d, s);
        return;
    }

    if (getCodegenArch() == CODEGEN_ARCH_MIPS) {
        // MIPS can do 16-bit imm in 1 instr, 32-bit in 3 instr
        // the below ' while (mask)' code is buggy on mips
        // since mips returns true on isValidImmediate()
        // then we get multiple AND instr (positive logic)
        AND( AL, 0, d, s, imm(mask) );
        return;
    }

    // If the mask itself cannot be encoded, clear the complement with BIC.
    int negative_logic = !isValidImmediate(mask);
    if (negative_logic) {
        mask = ~mask & size;
    }
    normalize(mask, rot);

    if (mask) {
        while (mask) {
            uint32_t bitpos = find_bottom(mask);
            uint32_t m = mask & (0xffu << bitpos);
            mask &= ~m;
            m >>= bitpos;
            // Rotate the chunk left by (rot + bitpos) to put it back where it
            // was before normalize(). The amount is reduced modulo 32 and a
            // zero rotation is handled separately: shifting a 32-bit value by
            // 32 or more is undefined behaviour.
            const uint32_t shift = (rot + bitpos) & 31;
            int32_t newMask = shift ? ((m<<shift) | (m>>(32-shift))) : m;
            if (!negative_logic) {
                AND( AL, 0, d, s, imm(newMask) );
            } else {
                BIC( AL, 0, d, s, imm(newMask) );
            }
            // following chunks operate on the partial result
            s = d;
        }
    } else {
        MOV( AL, 0, d, imm(0));
    }
}
// Emits the color-mask merge: components flagged in mMasking are taken from
// the framebuffer pixel (mDstPixel), the others from `pixel`. Nothing is
// emitted when there is no masking or when every component is masked. If
// `pixel` is not corruptible, a fresh register is obtained from `regs` to
// hold the result.
void GGLAssembler::build_masking(pixel_t& pixel, Scratch& regs)
{
    if (mAllMasked || !mMasking) {
        return;
    }

    comment("color mask");

    pixel_t dest(mDstPixel);
    pixel_t src(pixel);
    if ((pixel.flags & CORRUPTIBLE) == 0) {
        pixel.reg = regs.obtain();
        pixel.flags |= CORRUPTIBLE;
    }

    // bit mask covering the components that are NOT masked
    int mask = 0;
    for (int comp = 0; comp < 4; ++comp) {
        const int hi = dest.format.c[comp].h;
        const int lo = dest.format.c[comp].l;
        if (!hi) {
            continue;
        }
        if (mMasking & (1<<comp)) {
            continue;
        }
        mask |= ((1<<(hi-lo))-1) << lo;
    }

    // There is no need to clear the masked components of the source
    // (unless we applied a logic op), because they're already zeroed
    // by construction (masked components are not computed)
    if (mLogicOp) {
        const needs_t& needs = mBuilderContext.needs;
        if ((GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR) != GGL_CLEAR) {
            // clear masked component of source
            build_and_immediate(pixel.reg, src.reg, mask, dest.size());
            src = pixel;
        }
    }

    // clear non masked components of destination
    build_and_immediate(dest.reg, dest.reg, ~mask, dest.size());

    // or back the channels that were masked
    if (src.reg != dest.reg) {
        ORR(AL, 0, pixel.reg, src.reg, dest.reg);
    } else if (src.reg != pixel.reg) {
        // source and destination share a register: this is in fact a MOV
        MOV(AL, 0, pixel.reg, dest.reg);
    }
    // otherwise all three registers are the same and nothing has to be done
}
// ---------------------------------------------------------------------------
void GGLAssembler::base_offset(
const pointer_t& d, const pointer_t& b, const reg_t& o)
{
switch (b.size) {
case 32:
ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 2));
break;
case 24:
if (d.reg == b.reg) {
ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
ADD(AL, 0, d.reg, d.reg, o.reg);
} else {
ADD(AL, 0, d.reg, o.reg, reg_imm(o.reg, LSL, 1));
ADD(AL, 0, d.reg, d.reg, b.reg);
}
break;
case 16:
ADD(AL, 0, d.reg, b.reg, reg_imm(o.reg, LSL, 1));
break;
case 8:
ADD(AL, 0, d.reg, b.reg, o.reg);
break;
}
}
// ----------------------------------------------------------------------------
// cheezy register allocator...
// ----------------------------------------------------------------------------
// Modified to support MIPS processors, in a very simple way. We retain the
// (ARM) limit of 16 total registers, but shift the mapping of those registers
// from 0-15 to 2-17. Register 0 on MIPS cannot be used as a general-purpose
// register, and register 1 is traditionally used as a temporary.
// Creates the underlying register file for the given code-generation
// architecture.
RegisterAllocator::RegisterAllocator(int arch)
    : mRegs(arch)
{
}
void RegisterAllocator::reset()
{
mRegs.reset();
}
// Reserves `reg` in the register file and returns the register actually
// reserved.
int RegisterAllocator::reserveReg(int reg)
{
    const int reserved = mRegs.reserve(reg);
    return reserved;
}
// Obtains a register from the register file.
int RegisterAllocator::obtainReg()
{
    const int obtained = mRegs.obtain();
    return obtained;
}
void RegisterAllocator::recycleReg(int reg)
{
mRegs.recycle(reg);
}
// Gives access to the underlying register file.
RegisterAllocator::RegisterFile& RegisterAllocator::registerFile()
{
    RegisterFile& file = mRegs;
    return file;
}
// ----------------------------------------------------------------------------
// Creates an empty register file for `codegen_arch` with SP and PC already
// reserved. On MIPS every register number is offset by 2.
RegisterAllocator::RegisterFile::RegisterFile(int codegen_arch)
    : mRegs(0),
      mTouched(0),
      mStatus(0),
      mArch(codegen_arch),
      mRegisterOffset(
          codegen_arch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS
              ? 2       // ARM has regs 0..15, MIPS offset to 2..17
              : 0)
{
    reserve(ARMAssemblerInterface::SP);
    reserve(ARMAssemblerInterface::PC);
}
// Creates a copy of `rhs` (allocation state, touched mask and status) for
// `codegen_arch`. On MIPS every register number is offset by 2.
//
// mStatus is copied along with the rest of the state; it used to be left
// uninitialized, so reading the status of a copy yielded an indeterminate
// value.
RegisterAllocator::RegisterFile::RegisterFile(const RegisterFile& rhs, int codegen_arch)
    : mRegs(rhs.mRegs), mTouched(rhs.mTouched), mStatus(rhs.mStatus),
      mArch(codegen_arch), mRegisterOffset(0)
{
    if (mArch == ARMAssemblerInterface::CODEGEN_ARCH_MIPS) {
        mRegisterOffset = 2;    // ARM has regs 0..15, MIPS offset to 2..17
    }
}
// Nothing to release.
RegisterAllocator::RegisterFile::~RegisterFile() {}
// Two register files compare equal when the same registers are allocated in
// both; the touched mask and the status are not compared.
bool RegisterAllocator::RegisterFile::operator == (const RegisterFile& rhs) const
{
    const bool sameAllocation = (mRegs == rhs.mRegs);
    return sameAllocation;
}
// Clears the allocation, touched and status state, then reserves SP and PC
// again.
void RegisterAllocator::RegisterFile::reset()
{
    mStatus = 0;
    mTouched = 0;
    mRegs = 0;
    reserve(ARMAssemblerInterface::SP);
    reserve(ARMAssemblerInterface::PC);
}
// RegisterFile::reserve() takes a register parameter in the
// range 0-15 (ARM compatible) but, on a MIPS processor, it will
// return the actual allocated register in the range 2-17.
// Marks `reg` (given without the architecture offset) as allocated and
// touched, and returns the offset register number. Aborts if the register is
// already in use.
int RegisterAllocator::RegisterFile::reserve(int reg)
{
    reg += mRegisterOffset;
    LOG_ALWAYS_FATAL_IF(isUsed(reg),
                        "reserving register %d, but already in use",
                        reg);
    const int bit = (1<<reg);
    mRegs |= bit;
    // everything currently allocated counts as touched
    mTouched |= mRegs;
    return reg;
}
// This interface uses regMask in range 2-17 on MIPS, no translation.
// Marks every register in `regMask` as allocated and touched. The mask is
// used as is (no architecture offset is applied, no "already in use" check).
void RegisterAllocator::RegisterFile::reserveSeveral(uint32_t regMask)
{
    mTouched = mTouched | regMask;
    mRegs = mRegs | regMask;
}
// Returns non-zero when `reg` (an offset register number) is currently
// allocated. Aborts on a register number past the last valid one.
int RegisterAllocator::RegisterFile::isUsed(int reg) const
{
    LOG_ALWAYS_FATAL_IF(reg>=16+(int)mRegisterOffset, "invalid register %d", reg);
    const int bit = (1<<reg);
    return mRegs & bit;
}
// Reserves and returns the first free register, trying them in a fixed
// priority order. When none is free, OUT_OF_REGISTERS is recorded in the
// status and SP is returned.
int RegisterAllocator::RegisterFile::obtain()
{
    // registers in the order in which they are tried (ARM numbering)
    static const char kPriority[14] = {
        0, 1, 2, 3, 12, 14, 4, 5, 6, 7, 8, 9, 10, 11
    };
    const int count = sizeof(kPriority);

    for (int idx = 0; idx < count; ++idx) {
        const int candidate = kPriority[idx];
        if (isUsed(candidate + mRegisterOffset)) {
            continue;
        }
        // Param in Arm range 0-15, returns range 2-17 on Mips.
        return reserve(candidate);
    }

    // this is not an error anymore because, we'll try again with
    // a lower optimization level.
    mStatus |= OUT_OF_REGISTERS;
    // we return SP so we can more easily debug things
    // the code will never be run anyway.
    return ARMAssemblerInterface::SP;
}
// Returns true when at least one of the 16 registers is not allocated.
bool RegisterAllocator::RegisterFile::hasFreeRegs() const
{
    // drop the architecture offset (MIPS) before looking at the 16 registers
    const uint32_t allocated = (mRegs >> mRegisterOffset) & 0xFFFF;
    return allocated != 0xFFFF;
}
// Returns how many of the 16 registers are not allocated.
int RegisterAllocator::RegisterFile::countFreeRegs() const
{
    // drop the architecture offset (MIPS), keep one bit per free register
    const uint32_t allocated = mRegs >> mRegisterOffset;
    int freeBits = ~allocated & 0xFFFF;

    // count the set bits, clearing the lowest one at each step
    int count = 0;
    while (freeBits != 0) {
        freeBits &= freeBits - 1;
        ++count;
    }
    return count;
}
// Marks `reg` (an offset register number) as free again.
void RegisterAllocator::RegisterFile::recycle(int reg)
{
    // There is deliberately no "register was allocated" assertion here
    // (the original LOG_FATAL_IF(!isUsed(reg), ...) check is disabled):
    // running out of registers commonly triggers it, and since the code is
    // not executed in that case there is no reason to abort.
    const uint32_t bit = (1<<reg);
    mRegs &= ~bit;
}
// Marks every register in `regMask` as free again.
void RegisterAllocator::RegisterFile::recycleSeveral(uint32_t regMask)
{
    // There is deliberately no assertion that (mRegs & regMask) == regMask
    // here (the original LOG_FATAL_IF check is disabled): running out of
    // registers commonly triggers it, and since the code is not executed in
    // that case there is no reason to abort.
    mRegs = mRegs & ~regMask;
}
// Returns the mask of every register recorded as touched since the last
// reset.
uint32_t RegisterAllocator::RegisterFile::touched() const
{
    const uint32_t mask = mTouched;
    return mask;
}
// ----------------------------------------------------------------------------
}; // namespace android