/**************************************************************************
*
* Copyright 2009-2010 VMware, Inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* @file
* Depth/stencil testing to LLVM IR translation.
*
* To be done accurately/efficiently the depth/stencil test must be done with
* the same type/format as the depth/stencil buffer, which implies massaging
* the incoming depths to fit into place. Using a more straightforward
* type/format for depth/stencil values internally and only converting when
* flushing would avoid this, but it would most likely result in depth-fighting
* artifacts.
*
* Since we're using a linear layout for everything, but need to deal with
* 2x2 quads, we need to load/store multiple values and swizzle them into
* place (we could avoid this by doing depth/stencil testing in linear format,
* which would be easy for the late depth/stencil test, as we could do it
* after the fragment shader loop just as we do for color buffers, but
* trickier for the early depth test, as we'd need both the masks and the
* interpolated depths in linear format).
*
*
* @author Jose Fonseca <jfonseca@vmware.com>
* @author Brian Paul <brianp@vmware.com>
*/
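/*
* Illustration of the 4-wide case: the fragment shader loop walks a 4x4
* block as four 2x2 quads, iteration i covering rows (i & 2)..(i & 2) + 1
* and columns (i & 1) * 2..(i & 1) * 2 + 1. Loading one quad from the
* linear buffer therefore reads two values from each of two adjacent rows
* and concatenates them:
*
*    row0: z00 z01 z02 z03      quad 0 = { z00, z01, z10, z11 }
*    row1: z10 z11 z12 z13      quad 1 = { z02, z03, z12, z13 }
*    row2: z20 z21 z22 z23      quad 2 = { z20, z21, z30, z31 }
*    row3: z30 z31 z32 z33      quad 3 = { z22, z23, z32, z33 }
*/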
#include "pipe/p_state.h"
#include "util/u_format.h"
#include "util/u_cpu_detect.h"
#include "gallivm/lp_bld_type.h"
#include "gallivm/lp_bld_arit.h"
#include "gallivm/lp_bld_bitarit.h"
#include "gallivm/lp_bld_const.h"
#include "gallivm/lp_bld_conv.h"
#include "gallivm/lp_bld_logic.h"
#include "gallivm/lp_bld_flow.h"
#include "gallivm/lp_bld_intr.h"
#include "gallivm/lp_bld_debug.h"
#include "gallivm/lp_bld_swizzle.h"
#include "gallivm/lp_bld_pack.h"
#include "lp_bld_depth.h"
/** Used to select fields from pipe_stencil_state */
enum stencil_op {
S_FAIL_OP,
Z_FAIL_OP,
Z_PASS_OP
};
/**
* Do the stencil test comparison (compare FB stencil values against ref value).
* This will be used twice when generating two-sided stencil code.
* \param stencil the front/back stencil state
* \param stencilRef the stencil reference value, replicated as a vector
* \param stencilVals vector of stencil values from framebuffer
* \return vector mask of pass/fail values (~0 or 0)
*/
static LLVMValueRef
lp_build_stencil_test_single(struct lp_build_context *bld,
const struct pipe_stencil_state *stencil,
LLVMValueRef stencilRef,
LLVMValueRef stencilVals)
{
LLVMBuilderRef builder = bld->gallivm->builder;
const unsigned stencilMax = 255; /* XXX fix */
struct lp_type type = bld->type;
LLVMValueRef res;
/*
* SSE2 has intrinsics for signed comparisons, but not unsigned ones. Since
* the values are between 0..255, a signed type compares correctly and lets
* us generate the fastest comparisons for wider elements; 8-bit elements
* must stay unsigned.
*/
if (type.width <= 8) {
assert(!type.sign);
} else {
assert(type.sign);
}
assert(stencil->enabled);
if (stencil->valuemask != stencilMax) {
/* compute stencilRef = stencilRef & valuemask */
LLVMValueRef valuemask = lp_build_const_int_vec(bld->gallivm, type, stencil->valuemask);
stencilRef = LLVMBuildAnd(builder, stencilRef, valuemask, "");
/* compute stencilVals = stencilVals & valuemask */
stencilVals = LLVMBuildAnd(builder, stencilVals, valuemask, "");
}
res = lp_build_cmp(bld, stencil->func, stencilRef, stencilVals);
return res;
}
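/*
* Per-element sketch of what the generated code computes (the vector code
* does the same thing lane-wise, where compare() applies stencil->func):
*
*    uint8_t ref = stencilRef & stencil->valuemask;
*    uint8_t val = stencilVals & stencil->valuemask;
*    res = compare(stencil->func, ref, val) ? ~0 : 0;
*/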
/**
* Do the one- or two-sided stencil test comparison.
* \sa lp_build_stencil_test_single
* \param front_facing an integer vector mask, indicating front (~0) or back
* (0) facing polygon. If NULL, assume front-facing.
*/
static LLVMValueRef
lp_build_stencil_test(struct lp_build_context *bld,
const struct pipe_stencil_state stencil[2],
LLVMValueRef stencilRefs[2],
LLVMValueRef stencilVals,
LLVMValueRef front_facing)
{
LLVMValueRef res;
assert(stencil[0].enabled);
/* do front face test */
res = lp_build_stencil_test_single(bld, &stencil[0],
stencilRefs[0], stencilVals);
if (stencil[1].enabled && front_facing != NULL) {
/* do back face test */
LLVMValueRef back_res;
back_res = lp_build_stencil_test_single(bld, &stencil[1],
stencilRefs[1], stencilVals);
res = lp_build_select(bld, front_facing, res, back_res);
}
return res;
}
/**
* Apply the stencil operator (add/sub/keep/etc) to the given vector
* of stencil values.
* \return new stencil values vector
*/
static LLVMValueRef
lp_build_stencil_op_single(struct lp_build_context *bld,
const struct pipe_stencil_state *stencil,
enum stencil_op op,
LLVMValueRef stencilRef,
LLVMValueRef stencilVals)
{
LLVMBuilderRef builder = bld->gallivm->builder;
struct lp_type type = bld->type;
LLVMValueRef res;
LLVMValueRef max = lp_build_const_int_vec(bld->gallivm, type, 0xff);
unsigned stencil_op;
assert(type.sign);
switch (op) {
case S_FAIL_OP:
stencil_op = stencil->fail_op;
break;
case Z_FAIL_OP:
stencil_op = stencil->zfail_op;
break;
case Z_PASS_OP:
stencil_op = stencil->zpass_op;
break;
default:
assert(0 && "Invalid stencil_op mode");
stencil_op = PIPE_STENCIL_OP_KEEP;
}
switch (stencil_op) {
case PIPE_STENCIL_OP_KEEP:
res = stencilVals;
/* we can return early for this case */
return res;
case PIPE_STENCIL_OP_ZERO:
res = bld->zero;
break;
case PIPE_STENCIL_OP_REPLACE:
res = stencilRef;
break;
case PIPE_STENCIL_OP_INCR:
res = lp_build_add(bld, stencilVals, bld->one);
res = lp_build_min(bld, res, max);
break;
case PIPE_STENCIL_OP_DECR:
res = lp_build_sub(bld, stencilVals, bld->one);
res = lp_build_max(bld, res, bld->zero);
break;
case PIPE_STENCIL_OP_INCR_WRAP:
res = lp_build_add(bld, stencilVals, bld->one);
res = LLVMBuildAnd(builder, res, max, "");
break;
case PIPE_STENCIL_OP_DECR_WRAP:
res = lp_build_sub(bld, stencilVals, bld->one);
res = LLVMBuildAnd(builder, res, max, "");
break;
case PIPE_STENCIL_OP_INVERT:
res = LLVMBuildNot(builder, stencilVals, "");
res = LLVMBuildAnd(builder, res, max, "");
break;
default:
assert(0 && "bad stencil op mode");
res = bld->undef;
}
return res;
}
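/*
* E.g., in scalar terms, with a stencil value of 0xff:
* PIPE_STENCIL_OP_INCR yields min(0xff + 1, 0xff) = 0xff (clamped), while
* PIPE_STENCIL_OP_INCR_WRAP yields (0xff + 1) & 0xff = 0x00 (wrapped).
*/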
/**
* Do the one- or two-sided stencil test op/update.
*/
static LLVMValueRef
lp_build_stencil_op(struct lp_build_context *bld,
const struct pipe_stencil_state stencil[2],
enum stencil_op op,
LLVMValueRef stencilRefs[2],
LLVMValueRef stencilVals,
LLVMValueRef mask,
LLVMValueRef front_facing)
{
LLVMBuilderRef builder = bld->gallivm->builder;
LLVMValueRef res;
assert(stencil[0].enabled);
/* do front face op */
res = lp_build_stencil_op_single(bld, &stencil[0], op,
stencilRefs[0], stencilVals);
if (stencil[1].enabled && front_facing != NULL) {
/* do back face op */
LLVMValueRef back_res;
back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
stencilRefs[1], stencilVals);
res = lp_build_select(bld, front_facing, res, back_res);
}
if (stencil[0].writemask != 0xff ||
(stencil[1].enabled && front_facing != NULL && stencil[1].writemask != 0xff)) {
/* mask &= stencil[0].writemask */
LLVMValueRef writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
stencil[0].writemask);
if (stencil[1].enabled && stencil[1].writemask != stencil[0].writemask && front_facing != NULL) {
LLVMValueRef back_writemask = lp_build_const_int_vec(bld->gallivm, bld->type,
stencil[1].writemask);
writemask = lp_build_select(bld, front_facing, writemask, back_writemask);
}
mask = LLVMBuildAnd(builder, mask, writemask, "");
/* res = (res & mask) | (stencilVals & ~mask) */
res = lp_build_select_bitwise(bld, mask, res, stencilVals);
}
else {
/* res = mask ? res : stencilVals */
res = lp_build_select(bld, mask, res, stencilVals);
}
return res;
}
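/*
* With a writemask, the merge above is bitwise per element. E.g., for a
* lane where mask is all ones, writemask = 0x0f, old value 0xab and new
* value 0xcd:
*
*    res = (0xcd & 0x0f) | (0xab & ~0x0f) = 0xad
*/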
/**
* Return a type that matches the depth/stencil format.
*/
struct lp_type
lp_depth_type(const struct util_format_description *format_desc,
unsigned length)
{
struct lp_type type;
unsigned z_swizzle;
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
assert(format_desc->block.width == 1);
assert(format_desc->block.height == 1);
memset(&type, 0, sizeof type);
type.width = format_desc->block.bits;
z_swizzle = format_desc->swizzle[0];
if (z_swizzle < 4) {
if (format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_FLOAT) {
type.floating = TRUE;
assert(z_swizzle == 0);
assert(format_desc->channel[z_swizzle].size == 32);
}
else if(format_desc->channel[z_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
assert(format_desc->block.bits <= 32);
assert(format_desc->channel[z_swizzle].normalized);
if (format_desc->channel[z_swizzle].size < format_desc->block.bits) {
/* Prefer signed integers when possible, as SSE has less support
* for unsigned comparisons.
*/
type.sign = TRUE;
}
}
else
assert(0);
}
type.length = length;
return type;
}
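/*
* For example, this returns:
*   PIPE_FORMAT_Z32_FLOAT         -> 32-bit float elements
*   PIPE_FORMAT_Z24_UNORM_S8_UINT -> 32-bit signed int elements (24-bit Z)
*   PIPE_FORMAT_Z16_UNORM         -> 16-bit unsigned int elements
*/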
/**
* Compute bitmask and bit shift to apply to the incoming fragment Z values
* and the Z buffer values needed before doing the Z comparison.
*
* Note that we leave the Z bits in the position that we find them
* in the Z buffer (typically 0xffffff00 or 0x00ffffff). That lets us
* get by with fewer bit twiddling steps.
*/
static boolean
get_z_shift_and_mask(const struct util_format_description *format_desc,
unsigned *shift, unsigned *width, unsigned *mask)
{
unsigned total_bits;
unsigned z_swizzle;
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
assert(format_desc->block.width == 1);
assert(format_desc->block.height == 1);
/* The 64-bit d/s format is special: the Z value has already been
* extracted into its own 32 bits.
*/
total_bits = format_desc->block.bits > 32 ? 32 : format_desc->block.bits;
z_swizzle = format_desc->swizzle[0];
if (z_swizzle == PIPE_SWIZZLE_NONE)
return FALSE;
*width = format_desc->channel[z_swizzle].size;
/* & 31 is for the same reason as the 32-bit limit above */
*shift = format_desc->channel[z_swizzle].shift & 31;
if (*width == total_bits) {
*mask = 0xffffffff;
} else {
*mask = ((1 << *width) - 1) << *shift;
}
return TRUE;
}
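/*
* Worked example (gallium format names list channels from the LSB up):
*   PIPE_FORMAT_Z24_UNORM_S8_UINT: *shift = 0, *width = 24, *mask = 0x00ffffff
*   PIPE_FORMAT_S8_UINT_Z24_UNORM: *shift = 8, *width = 24, *mask = 0xffffff00
*/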
/**
* Compute bitmask and bit shift to apply to the framebuffer pixel values
* to put the stencil bits in the least significant position.
* (i.e. 0x000000ff)
*/
static boolean
get_s_shift_and_mask(const struct util_format_description *format_desc,
unsigned *shift, unsigned *mask)
{
unsigned s_swizzle;
unsigned sz;
s_swizzle = format_desc->swizzle[1];
if (s_swizzle == PIPE_SWIZZLE_NONE)
return FALSE;
/* just special-case the 64-bit d/s format */
if (format_desc->block.bits > 32) {
/* XXX big-endian? */
assert(format_desc->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT);
*shift = 0;
*mask = 0xff;
return TRUE;
}
*shift = format_desc->channel[s_swizzle].shift;
sz = format_desc->channel[s_swizzle].size;
*mask = (1U << sz) - 1U;
return TRUE;
}
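/*
* E.g. PIPE_FORMAT_Z24_UNORM_S8_UINT gives *shift = 24, *mask = 0xff;
* PIPE_FORMAT_S8_UINT_Z24_UNORM gives *shift = 0, *mask = 0xff.
*/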
/**
* Perform the occlusion test and increment the counter.
* Test the depth mask and add the number of channels with a non-zero mask
* to the occlusion counter; e.g., if maskvalue is {-1, -1, -1, -1}, the
* counter is incremented by 4.
* TODO: could move this out of the fs loop.
*
* \param type holds the element type of the mask vector.
* \param maskvalue is the depth test mask.
* \param counter is a pointer to the 64-bit counter.
*/
void
lp_build_occlusion_count(struct gallivm_state *gallivm,
struct lp_type type,
LLVMValueRef maskvalue,
LLVMValueRef counter)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMContextRef context = gallivm->context;
LLVMValueRef countmask = lp_build_const_int_vec(gallivm, type, 1);
LLVMValueRef count, newcount;
assert(type.length <= 16);
assert(type.floating);
if(util_cpu_caps.has_sse && type.length == 4) {
const char *movmskintr = "llvm.x86.sse.movmsk.ps";
const char *popcntintr = "llvm.ctpop.i32";
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
lp_build_vec_type(gallivm, type), "");
bits = lp_build_intrinsic_unary(builder, movmskintr,
LLVMInt32TypeInContext(context), bits);
count = lp_build_intrinsic_unary(builder, popcntintr,
LLVMInt32TypeInContext(context), bits);
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
}
else if(util_cpu_caps.has_avx && type.length == 8) {
const char *movmskintr = "llvm.x86.avx.movmsk.ps.256";
const char *popcntintr = "llvm.ctpop.i32";
LLVMValueRef bits = LLVMBuildBitCast(builder, maskvalue,
lp_build_vec_type(gallivm, type), "");
bits = lp_build_intrinsic_unary(builder, movmskintr,
LLVMInt32TypeInContext(context), bits);
count = lp_build_intrinsic_unary(builder, popcntintr,
LLVMInt32TypeInContext(context), bits);
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
}
else {
unsigned i;
LLVMValueRef countv = LLVMBuildAnd(builder, maskvalue, countmask, "countv");
LLVMTypeRef counttype = LLVMIntTypeInContext(context, type.length * 8);
LLVMTypeRef i8vntype = LLVMVectorType(LLVMInt8TypeInContext(context), type.length * 4);
LLVMValueRef shufflev, countd;
LLVMValueRef shuffles[16];
const char *popcntintr = NULL;
countv = LLVMBuildBitCast(builder, countv, i8vntype, "");
for (i = 0; i < type.length; i++) {
shuffles[i] = lp_build_const_int32(gallivm, 4*i);
}
shufflev = LLVMConstVector(shuffles, type.length);
countd = LLVMBuildShuffleVector(builder, countv, LLVMGetUndef(i8vntype), shufflev, "");
countd = LLVMBuildBitCast(builder, countd, counttype, "countd");
/*
* XXX FIXME
* this is bad on cpus without popcount (on x86 supported by intel
* nehalem, amd barcelona, and up - not tied to sse42).
* Would be much faster to just sum the 4 elements of the vector with
* some horizontal add (shuffle/add/shuffle/add after the initial and).
*/
switch (type.length) {
case 4:
popcntintr = "llvm.ctpop.i32";
break;
case 8:
popcntintr = "llvm.ctpop.i64";
break;
case 16:
popcntintr = "llvm.ctpop.i128";
break;
default:
assert(0);
}
count = lp_build_intrinsic_unary(builder, popcntintr, counttype, countd);
if (type.length > 8) {
count = LLVMBuildTrunc(builder, count, LLVMIntTypeInContext(context, 64), "");
}
else if (type.length < 8) {
count = LLVMBuildZExt(builder, count, LLVMIntTypeInContext(context, 64), "");
}
}
newcount = LLVMBuildLoad(builder, counter, "origcount");
newcount = LLVMBuildAdd(builder, newcount, count, "newcount");
LLVMBuildStore(builder, newcount, counter);
}
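/*
* Worked example for the generic path above (assuming little-endian byte
* order), with type.length == 4 and maskvalue = {~0, 0, ~0, ~0}:
*
*    countv = maskvalue & {1,1,1,1}             = {1, 0, 1, 1}
*    countd = low byte of each element gathered = 0x01010001
*    count  = llvm.ctpop.i32(countd)            = 3
*
* i.e. three fragments are added to the occlusion counter.
*/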
/**
* Load depth/stencil values.
* The stored values are linear; swizzle them into quad order.
*
* \param z_src_type type of the incoming fragment depth/stencil values
* \param format_desc description of the depth/stencil surface
* \param is_1d whether this resource has only one dimension
* \param depth_ptr pointer to the depth/stencil values of this 4x4 block
* \param depth_stride stride of the depth/stencil buffer
* \param z_fb contains z values loaded from fb (may include padding)
* \param s_fb contains s values loaded from fb (may include padding)
* \param loop_counter the current loop iteration
*/
void
lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
struct lp_type z_src_type,
const struct util_format_description *format_desc,
boolean is_1d,
LLVMValueRef depth_ptr,
LLVMValueRef depth_stride,
LLVMValueRef *z_fb,
LLVMValueRef *s_fb,
LLVMValueRef loop_counter)
{
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
LLVMValueRef zs_dst1, zs_dst2;
LLVMValueRef zs_dst_ptr;
LLVMValueRef depth_offset1, depth_offset2;
LLVMTypeRef load_ptr_type;
unsigned depth_bytes = format_desc->block.bits / 8;
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
struct lp_type zs_load_type = zs_type;
zs_load_type.length = zs_load_type.length / 2;
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
if (z_src_type.length == 4) {
unsigned i;
LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
lp_build_const_int32(gallivm, 1), "");
LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
lp_build_const_int32(gallivm, 2), "");
LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
depth_stride, "");
depth_offset1 = LLVMBuildMul(builder, looplsb,
lp_build_const_int32(gallivm, depth_bytes * 2), "");
depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
/* just concatenate the loaded 2x2 values into a 4-wide vector */
for (i = 0; i < 4; i++) {
shuffles[i] = lp_build_const_int32(gallivm, i);
}
}
else {
unsigned i;
LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
lp_build_const_int32(gallivm, 1), "");
assert(z_src_type.length == 8);
depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
/*
* We load 2x4 values, and need to swizzle them (order
* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
*/
for (i = 0; i < 8; i++) {
shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
}
}
depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
/* Load current z/stencil values from z/stencil buffer */
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
if (is_1d) {
zs_dst2 = lp_build_undef(gallivm, zs_load_type);
}
else {
zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
}
*z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
LLVMConstVector(shuffles, zs_type.length), "");
*s_fb = *z_fb;
if (format_desc->block.bits < z_src_type.width) {
/* Extend destination ZS values (e.g., when reading from Z16_UNORM) */
*z_fb = LLVMBuildZExt(builder, *z_fb,
lp_build_int_vec_type(gallivm, z_src_type), "");
}
else if (format_desc->block.bits > 32) {
/* rely on llvm to nicely handle the too-wide vector we have here */
unsigned i;
struct lp_type typex2 = zs_type;
struct lp_type s_type = zs_type;
LLVMValueRef shuffles1[LP_MAX_VECTOR_LENGTH / 4];
LLVMValueRef shuffles2[LP_MAX_VECTOR_LENGTH / 4];
LLVMValueRef tmp;
typex2.width = typex2.width / 2;
typex2.length = typex2.length * 2;
s_type.width = s_type.width / 2;
s_type.floating = 0;
tmp = LLVMBuildBitCast(builder, *z_fb,
lp_build_vec_type(gallivm, typex2), "");
for (i = 0; i < zs_type.length; i++) {
shuffles1[i] = lp_build_const_int32(gallivm, i * 2);
shuffles2[i] = lp_build_const_int32(gallivm, i * 2 + 1);
}
*z_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
LLVMConstVector(shuffles1, zs_type.length), "");
*s_fb = LLVMBuildShuffleVector(builder, tmp, tmp,
LLVMConstVector(shuffles2, zs_type.length), "");
*s_fb = LLVMBuildBitCast(builder, *s_fb,
lp_build_vec_type(gallivm, s_type), "");
lp_build_name(*s_fb, "s_dst");
}
lp_build_name(*z_fb, "z_dst");
lp_build_name(*s_fb, "s_dst");
}
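/*
* For the 8-wide path above, the index expression
* (i&1) + (i&2)*2 + (i&4)/2 yields the shuffle order 0,1,4,5,2,3,6,7:
* the two loaded row halves { a0 a1 a2 a3 } and { b0 b1 b2 b3 } are
* interleaved per 2x2 quad into { a0 a1 b0 b1 a2 a3 b2 b3 }.
*/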
/**
* Store depth/stencil values.
* Incoming values are swizzled (typically n 2x2 quads); they are stored linearly.
* If there's a mask it will do a select/store, otherwise just a store.
*
* \param z_src_type type of the incoming fragment depth/stencil values
* \param format_desc description of the depth/stencil surface
* \param is_1d whether this resource has only one dimension
* \param mask the alive/dead pixel mask for the quad (vector)
* \param z_fb z values read from fb (with padding)
* \param s_fb s values read from fb (with padding)
* \param loop_counter the current loop iteration
* \param depth_ptr pointer to the depth/stencil values of this 4x4 block
* \param depth_stride stride of the depth/stencil buffer
* \param z_value the depth values to store (with padding)
* \param s_value the stencil values to store (with padding)
*/
void
lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
struct lp_type z_src_type,
const struct util_format_description *format_desc,
boolean is_1d,
struct lp_build_mask_context *mask,
LLVMValueRef z_fb,
LLVMValueRef s_fb,
LLVMValueRef loop_counter,
LLVMValueRef depth_ptr,
LLVMValueRef depth_stride,
LLVMValueRef z_value,
LLVMValueRef s_value)
{
struct lp_build_context z_bld;
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 4];
LLVMBuilderRef builder = gallivm->builder;
LLVMValueRef mask_value = NULL;
LLVMValueRef zs_dst1, zs_dst2;
LLVMValueRef zs_dst_ptr1, zs_dst_ptr2;
LLVMValueRef depth_offset1, depth_offset2;
LLVMTypeRef load_ptr_type;
unsigned depth_bytes = format_desc->block.bits / 8;
struct lp_type zs_type = lp_depth_type(format_desc, z_src_type.length);
struct lp_type z_type = zs_type;
struct lp_type zs_load_type = zs_type;
zs_load_type.length = zs_load_type.length / 2;
load_ptr_type = LLVMPointerType(lp_build_vec_type(gallivm, zs_load_type), 0);
z_type.width = z_src_type.width;
lp_build_context_init(&z_bld, gallivm, z_type);
/*
* This is far from ideal; at least for the late depth write we should do
* this outside the fs loop, to avoid all the swizzle stuff.
*/
if (z_src_type.length == 4) {
LLVMValueRef looplsb = LLVMBuildAnd(builder, loop_counter,
lp_build_const_int32(gallivm, 1), "");
LLVMValueRef loopmsb = LLVMBuildAnd(builder, loop_counter,
lp_build_const_int32(gallivm, 2), "");
LLVMValueRef offset2 = LLVMBuildMul(builder, loopmsb,
depth_stride, "");
depth_offset1 = LLVMBuildMul(builder, looplsb,
lp_build_const_int32(gallivm, depth_bytes * 2), "");
depth_offset1 = LLVMBuildAdd(builder, depth_offset1, offset2, "");
}
else {
unsigned i;
LLVMValueRef loopx2 = LLVMBuildShl(builder, loop_counter,
lp_build_const_int32(gallivm, 1), "");
assert(z_src_type.length == 8);
depth_offset1 = LLVMBuildMul(builder, loopx2, depth_stride, "");
/*
* We load 2x4 values, and need to swizzle them (order
* 0,1,4,5,2,3,6,7) - not so hot with avx unfortunately.
*/
for (i = 0; i < 8; i++) {
shuffles[i] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
}
}
depth_offset2 = LLVMBuildAdd(builder, depth_offset1, depth_stride, "");
zs_dst_ptr1 = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
zs_dst_ptr1 = LLVMBuildBitCast(builder, zs_dst_ptr1, load_ptr_type, "");
zs_dst_ptr2 = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
zs_dst_ptr2 = LLVMBuildBitCast(builder, zs_dst_ptr2, load_ptr_type, "");
if (format_desc->block.bits > 32) {
s_value = LLVMBuildBitCast(builder, s_value, z_bld.vec_type, "");
}
if (mask) {
mask_value = lp_build_mask_value(mask);
z_value = lp_build_select(&z_bld, mask_value, z_value, z_fb);
if (format_desc->block.bits > 32) {
s_fb = LLVMBuildBitCast(builder, s_fb, z_bld.vec_type, "");
s_value = lp_build_select(&z_bld, mask_value, s_value, s_fb);
}
}
if (zs_type.width < z_src_type.width) {
/* Truncate ZS values (e.g., when writing to Z16_UNORM) */
z_value = LLVMBuildTrunc(builder, z_value,
lp_build_int_vec_type(gallivm, zs_type), "");
}
if (format_desc->block.bits <= 32) {
if (z_src_type.length == 4) {
zs_dst1 = lp_build_extract_range(gallivm, z_value, 0, 2);
zs_dst2 = lp_build_extract_range(gallivm, z_value, 2, 2);
}
else {
assert(z_src_type.length == 8);
zs_dst1 = LLVMBuildShuffleVector(builder, z_value, z_value,
LLVMConstVector(&shuffles[0],
zs_load_type.length), "");
zs_dst2 = LLVMBuildShuffleVector(builder, z_value, z_value,
LLVMConstVector(&shuffles[4],
zs_load_type.length), "");
}
}
else {
if (z_src_type.length == 4) {
zs_dst1 = lp_build_interleave2(gallivm, z_type,
z_value, s_value, 0);
zs_dst2 = lp_build_interleave2(gallivm, z_type,
z_value, s_value, 1);
}
else {
unsigned i;
LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH / 2];
assert(z_src_type.length == 8);
for (i = 0; i < 8; i++) {
shuffles[i*2] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2);
shuffles[i*2+1] = lp_build_const_int32(gallivm, (i&1) + (i&2) * 2 + (i&4) / 2 +
z_src_type.length);
}
zs_dst1 = LLVMBuildShuffleVector(builder, z_value, s_value,
LLVMConstVector(&shuffles[0],
z_src_type.length), "");
zs_dst2 = LLVMBuildShuffleVector(builder, z_value, s_value,
LLVMConstVector(&shuffles[8],
z_src_type.length), "");
}
zs_dst1 = LLVMBuildBitCast(builder, zs_dst1,
lp_build_vec_type(gallivm, zs_load_type), "");
zs_dst2 = LLVMBuildBitCast(builder, zs_dst2,
lp_build_vec_type(gallivm, zs_load_type), "");
}
LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
if (!is_1d) {
LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
}
}
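/*
* Note the same index expression inverts the load swizzle in the <= 32 bit
* path above: picking elements {0,1,4,5} of the quad-ordered vector
* { a0 a1 b0 b1 a2 a3 b2 b3 } recovers the first linear row half
* { a0 a1 a2 a3 }, and {2,3,6,7} the second.
*/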
/**
* Generate code for performing depth and/or stencil tests.
* We operate on a vector of values (typically n 2x2 quads).
*
* \param depth the depth test state
* \param stencil the front/back stencil state
* \param z_src_type type of the incoming fragment depth/stencil values
* \param format_desc description of the depth/stencil surface
* \param mask the alive/dead pixel mask for the quad (vector)
* \param stencil_refs the front/back stencil ref values (scalar)
* \param z_src the incoming depth values (n 2x2 quad values, float32)
* \param z_fb the depth values currently in the framebuffer
* \param s_fb the stencil values currently in the framebuffer
* \param face contains boolean value indicating front/back facing polygon
*/
void
lp_build_depth_stencil_test(struct gallivm_state *gallivm,
const struct pipe_depth_state *depth,
const struct pipe_stencil_state stencil[2],
struct lp_type z_src_type,
const struct util_format_description *format_desc,
struct lp_build_mask_context *mask,
LLVMValueRef stencil_refs[2],
LLVMValueRef z_src,
LLVMValueRef z_fb,
LLVMValueRef s_fb,
LLVMValueRef face,
LLVMValueRef *z_value,
LLVMValueRef *s_value,
boolean do_branch)
{
LLVMBuilderRef builder = gallivm->builder;
struct lp_type z_type;
struct lp_build_context z_bld;
struct lp_build_context s_bld;
struct lp_type s_type;
unsigned z_shift = 0, z_width = 0, z_mask = 0;
LLVMValueRef z_dst = NULL;
LLVMValueRef stencil_vals = NULL;
LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
LLVMValueRef current_mask = lp_build_mask_value(mask);
LLVMValueRef front_facing = NULL;
boolean have_z, have_s;
/*
* Depths are expected to be between 0 and 1, even if they are stored in
* floats. Setting these bits here ensures that the conversion below won't
* try to unnecessarily clamp the incoming values.
*/
if(z_src_type.floating) {
z_src_type.sign = FALSE;
z_src_type.norm = TRUE;
}
else {
assert(!z_src_type.sign);
assert(z_src_type.norm);
}
/* Pick the type matching the depth-stencil format. */
z_type = lp_depth_type(format_desc, z_src_type.length);
/* Pick the intermediate type for depth operations. */
z_type.width = z_src_type.width;
assert(z_type.length == z_src_type.length);
/* FIXME: for non-float depth/stencil might generate better code
* if we'd always split it up to use 128bit operations.
* For stencil we'd almost certainly want to pack to 8xi16 values,
* for z just run twice.
*/
/* Sanity checking */
{
const unsigned z_swizzle = format_desc->swizzle[0];
const unsigned s_swizzle = format_desc->swizzle[1];
assert(z_swizzle != PIPE_SWIZZLE_NONE ||
s_swizzle != PIPE_SWIZZLE_NONE);
assert(depth->enabled || stencil[0].enabled);
assert(format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS);
assert(format_desc->block.width == 1);
assert(format_desc->block.height == 1);
if (stencil[0].enabled) {
assert(s_swizzle < 4);
assert(format_desc->channel[s_swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED);
assert(format_desc->channel[s_swizzle].pure_integer);
assert(!format_desc->channel[s_swizzle].normalized);
assert(format_desc->channel[s_swizzle].size == 8);
}
if (depth->enabled) {
assert(z_swizzle < 4);
if (z_type.floating) {
assert(z_swizzle == 0);
assert(format_desc->channel[z_swizzle].type ==
UTIL_FORMAT_TYPE_FLOAT);
assert(format_desc->channel[z_swizzle].size == 32);
}
else {
assert(format_desc->channel[z_swizzle].type ==
UTIL_FORMAT_TYPE_UNSIGNED);
assert(format_desc->channel[z_swizzle].normalized);
assert(!z_type.fixed);
}
}
}
/* Setup build context for Z vals */
lp_build_context_init(&z_bld, gallivm, z_type);
/* Setup build context for stencil vals */
s_type = lp_int_type(z_type);
lp_build_context_init(&s_bld, gallivm, s_type);
/* Compute and apply the Z/stencil bitmasks and shifts.
*/
{
unsigned s_shift, s_mask;
z_dst = z_fb;
stencil_vals = s_fb;
have_z = get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask);
have_s = get_s_shift_and_mask(format_desc, &s_shift, &s_mask);
if (have_z) {
if (z_mask != 0xffffffff) {
z_bitmask = lp_build_const_int_vec(gallivm, z_type, z_mask);
}
/*
* Align the framebuffer Z's LSB to the right.
*/
if (z_shift) {
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
z_dst = LLVMBuildLShr(builder, z_dst, shift, "z_dst");
} else if (z_bitmask) {
z_dst = LLVMBuildAnd(builder, z_dst, z_bitmask, "z_dst");
} else {
lp_build_name(z_dst, "z_dst");
}
}
if (have_s) {
if (s_shift) {
LLVMValueRef shift = lp_build_const_int_vec(gallivm, s_type, s_shift);
stencil_vals = LLVMBuildLShr(builder, stencil_vals, shift, "");
stencil_shift = shift; /* used below */
}
if (s_mask != 0xffffffff) {
LLVMValueRef mask = lp_build_const_int_vec(gallivm, s_type, s_mask);
stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
}
lp_build_name(stencil_vals, "s_dst");
}
}
if (stencil[0].enabled) {
if (face) {
if (0) {
/*
* XXX: the scalar expansion below produces atrocious code
* (basically producing a 64bit scalar value, then moving the 2
* 32bit pieces separately to simd, plus 4 shuffles, which is
* seriously lame). But the scalar-simd transitions are always
* tricky, so no big surprise there.
* This here would be way better, however llvm has some serious
* trouble later using it in the select, probably because it will
* recognize the expression as constant and move the simd value
* away (out of the loop) - and then it will suddenly try
* constructing i1 high-bit masks out of it later...
* (Try piglit stencil-twoside.)
* Note this is NOT due to using SExt/Trunc, it fails exactly the
* same even when using native compare/select.
* I cannot reproduce this problem when using stand-alone compiler
* though, suggesting some problem with optimization passes...
* (With stand-alone compilation, the construction of this mask
* value, no matter if the easy 3 instruction here or the complex
* 16+ one below, never gets separated from where it's used.)
* The scalar code still has the same problem, but the generated
* code looks a bit better at least for some reason, even if
* mostly by luck (the fundamental issue clearly is the same).
*/
front_facing = lp_build_broadcast(gallivm, s_bld.vec_type, face);
/* front_facing = face != 0 ? ~0 : 0 */
front_facing = lp_build_compare(gallivm, s_bld.type,
PIPE_FUNC_NOTEQUAL,
front_facing, s_bld.zero);
} else {
LLVMValueRef zero = lp_build_const_int32(gallivm, 0);
/* front_facing = face != 0 ? ~0 : 0 */
front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
front_facing = LLVMBuildSExt(builder, front_facing,
LLVMIntTypeInContext(gallivm->context,
s_bld.type.length*s_bld.type.width),
"");
front_facing = LLVMBuildBitCast(builder, front_facing,
s_bld.int_vec_type, "");
}
}
s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
stencil_refs, stencil_vals,
front_facing);
/* apply stencil-fail operator */
{
LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, current_mask, s_pass_mask);
stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
stencil_refs, stencil_vals,
s_fail_mask, front_facing);
}
}
if (depth->enabled) {
/*
* Convert fragment Z to the desired type, aligning the LSB to the right.
*/
assert(z_type.width == z_src_type.width);
assert(z_type.length == z_src_type.length);
assert(lp_check_value(z_src_type, z_src));
if (z_src_type.floating) {
/*
* Convert from floating point values
*/
if (!z_type.floating) {
z_src = lp_build_clamped_float_to_unsigned_norm(gallivm,
z_src_type,
z_width,
z_src);
}
} else {
/*
* Convert from unsigned normalized values.
*/
assert(!z_src_type.sign);
assert(!z_src_type.fixed);
assert(z_src_type.norm);
assert(!z_type.floating);
if (z_src_type.width > z_width) {
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_src_type,
z_src_type.width - z_width);
z_src = LLVMBuildLShr(builder, z_src, shift, "");
}
}
assert(lp_check_value(z_type, z_src));
lp_build_name(z_src, "z_src");
/* compare src Z to dst Z, returning 'pass' mask */
z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
/* mask off bits that failed stencil test */
if (s_pass_mask) {
current_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
}
if (!stencil[0].enabled) {
/* We can potentially skip all remaining operations here, but only if
* stencil is disabled, because otherwise we'd still need to update the
* stencil buffer values. The Z buffer values don't need updating.
*/
lp_build_mask_update(mask, z_pass);
if (do_branch) {
lp_build_mask_check(mask);
}
}
if (depth->writemask) {
LLVMValueRef z_pass_mask;
/* mask off bits that failed Z test */
z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
/* Mix the old and new Z buffer values.
* z_dst[i] = z_pass_mask[i] ? z_src[i] : z_dst[i]
*/
z_dst = lp_build_select(&z_bld, z_pass_mask, z_src, z_dst);
}
if (stencil[0].enabled) {
/* update stencil buffer values according to z pass/fail result */
LLVMValueRef z_fail_mask, z_pass_mask;
/* apply Z-fail operator */
z_fail_mask = lp_build_andnot(&s_bld, current_mask, z_pass);
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
stencil_refs, stencil_vals,
z_fail_mask, front_facing);
/* apply Z-pass operator */
z_pass_mask = LLVMBuildAnd(builder, current_mask, z_pass, "");
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
stencil_refs, stencil_vals,
z_pass_mask, front_facing);
}
}
else {
/* No depth test: apply Z-pass operator to stencil buffer values which
* passed the stencil test.
*/
s_pass_mask = LLVMBuildAnd(builder, current_mask, s_pass_mask, "");
stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
stencil_refs, stencil_vals,
s_pass_mask, front_facing);
}
/* Put Z and stencil bits in the right place */
if (have_z && z_shift) {
LLVMValueRef shift = lp_build_const_int_vec(gallivm, z_type, z_shift);
z_dst = LLVMBuildShl(builder, z_dst, shift, "");
}
if (stencil_vals && stencil_shift)
stencil_vals = LLVMBuildShl(builder, stencil_vals,
stencil_shift, "");
/* Finally, merge the z/stencil values */
if (format_desc->block.bits <= 32) {
if (have_z && have_s)
*z_value = LLVMBuildOr(builder, z_dst, stencil_vals, "");
else if (have_z)
*z_value = z_dst;
else
*z_value = stencil_vals;
*s_value = *z_value;
}
else {
*z_value = z_dst;
*s_value = stencil_vals;
}
if (s_pass_mask)
lp_build_mask_update(mask, s_pass_mask);
if (depth->enabled && stencil[0].enabled)
lp_build_mask_update(mask, z_pass);
}
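/*
* Scalar summary of the sequence generated above when both depth and
* stencil tests are enabled (the actual code is fully vectorized and
* mask-driven rather than branchy):
*
*    s_pass = stencil_test(ref, s_dst);
*    z_pass = z_cmp(depth->func, z_src, z_dst);
*    if (!s_pass)
*       s_dst = stencil_op(S_FAIL_OP, s_dst);
*    else if (!z_pass)
*       s_dst = stencil_op(Z_FAIL_OP, s_dst);
*    else {
*       s_dst = stencil_op(Z_PASS_OP, s_dst);
*       if (depth->writemask)
*          z_dst = z_src;
*    }
*    mask &= s_pass & z_pass;
*/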