/**************************************************************************
 *
 * Copyright 2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/


/**
 * @file
 * Helper functions for logical operations.
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 */

#include "util/u_cpu_detect.h"
#include "util/u_memory.h"
#include "util/u_debug.h"

#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_init.h"
#include "lp_bld_intr.h"
#include "lp_bld_debug.h"
#include "lp_bld_logic.h"


/*
 * XXX
 *
 * Selection with vector conditional like
 *
 *    select <4 x i1> %C, %A, %B
 *
 * is valid IR (e.g. llvm/test/Assembler/vector-select.ll), but it is only
 * supported on some backends (x86) starting with llvm 3.1.
 *
 * Expanding the boolean vector to full SIMD register width, as in
 *
 *    sext <4 x i1> %C to <4 x i32>
 *
 * is valid and supported (e.g., llvm/test/CodeGen/X86/vec_compare.ll), but
 * it causes assertion failures in LLVM 2.6.  It appears to work correctly on
 * LLVM 2.7.
 */


/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func one of PIPE_FUNC_x
 * If the ordered argument is true the function will use LLVM's ordered
 * comparisons, otherwise unordered comparisons will be used.
 * The result values will be 0 for false or ~0 for true.
 */
static LLVMValueRef
lp_build_compare_ext(struct gallivm_state *gallivm,
                     const struct lp_type type,
                     unsigned func,
                     LLVMValueRef a,
                     LLVMValueRef b,
                     boolean ordered)
{
   LLVMBuilderRef builder = gallivm->builder;
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);
   LLVMValueRef cond;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

   if(type.floating) {
      LLVMRealPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = ordered ? LLVMRealOEQ : LLVMRealUEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = ordered ? LLVMRealONE : LLVMRealUNE;
         break;
      case PIPE_FUNC_LESS:
         op = ordered ? LLVMRealOLT : LLVMRealULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = ordered ? LLVMRealOLE : LLVMRealULE;
         break;
      case PIPE_FUNC_GREATER:
         op = ordered ? LLVMRealOGT : LLVMRealUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = ordered ? LLVMRealOGE : LLVMRealUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildFCmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }
   else {
      LLVMIntPredicate op;
      switch(func) {
      case PIPE_FUNC_EQUAL:
         op = LLVMIntEQ;
         break;
      case PIPE_FUNC_NOTEQUAL:
         op = LLVMIntNE;
         break;
      case PIPE_FUNC_LESS:
         op = type.sign ? LLVMIntSLT : LLVMIntULT;
         break;
      case PIPE_FUNC_LEQUAL:
         op = type.sign ? LLVMIntSLE : LLVMIntULE;
         break;
      case PIPE_FUNC_GREATER:
         op = type.sign ? LLVMIntSGT : LLVMIntUGT;
         break;
      case PIPE_FUNC_GEQUAL:
         op = type.sign ? LLVMIntSGE : LLVMIntUGE;
         break;
      default:
         assert(0);
         return lp_build_undef(gallivm, type);
      }

      cond = LLVMBuildICmp(builder, op, a, b, "");
      res = LLVMBuildSExt(builder, cond, int_vec_type, "");
   }

   return res;
}


/**
 * Build code to compare two values 'a' and 'b' of 'type' using the given func.
 * \param func one of PIPE_FUNC_x
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_compare(struct gallivm_state *gallivm,
                 const struct lp_type type,
                 unsigned func,
                 LLVMValueRef a,
                 LLVMValueRef b)
{
   LLVMTypeRef int_vec_type = lp_build_int_vec_type(gallivm, type);
   LLVMValueRef zeros = LLVMConstNull(int_vec_type);
   LLVMValueRef ones = LLVMConstAllOnes(int_vec_type);

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(func == PIPE_FUNC_NEVER)
      return zeros;
   if(func == PIPE_FUNC_ALWAYS)
      return ones;

   assert(func > PIPE_FUNC_NEVER);
   assert(func < PIPE_FUNC_ALWAYS);

#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
   /*
    * There are no unsigned integer comparison instructions in SSE.
    */
   if (!type.floating && !type.sign &&
       type.width * type.length == 128 &&
       util_cpu_caps.has_sse2 &&
       (func == PIPE_FUNC_LESS ||
        func == PIPE_FUNC_LEQUAL ||
        func == PIPE_FUNC_GREATER ||
        func == PIPE_FUNC_GEQUAL) &&
       (gallivm_debug & GALLIVM_DEBUG_PERF)) {
      debug_printf("%s: inefficient <%u x i%u> unsigned comparison\n",
                   __FUNCTION__, type.length, type.width);
   }
#endif

   return lp_build_compare_ext(gallivm, type, func, a, b, FALSE);
}


/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use
 * ordered comparison which means that it will return true if both
 * operands are not a NaN and the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
 */
LLVMValueRef
lp_build_cmp_ordered(struct lp_build_context *bld,
                     unsigned func,
                     LLVMValueRef a,
                     LLVMValueRef b)
{
   return lp_build_compare_ext(bld->gallivm, bld->type, func, a, b, TRUE);
}


/**
 * Build code to compare two values 'a' and 'b' using the given func.
 * \param func one of PIPE_FUNC_x
 * If the operands are floating point numbers, the function will use
 * unordered comparison which means that it will return true if either
 * operand is a NaN or the specified condition evaluates to true.
 * The result values will be 0 for false or ~0 for true.
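 *
 * For example (an illustrative sketch, not taken from a real caller, and
 * assuming 'bld' wraps a 4 x float type), the call
 *
 *    mask = lp_build_cmp(bld, PIPE_FUNC_LESS, a, b);
 *
 * yields a <4 x i32> value holding ~0 in every element where a < b (or where
 * either operand is a NaN, since the comparison is unordered) and 0 in all
 * other elements.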
 */
LLVMValueRef
lp_build_cmp(struct lp_build_context *bld,
             unsigned func,
             LLVMValueRef a,
             LLVMValueRef b)
{
   return lp_build_compare(bld->gallivm, bld->type, func, a, b);
}


/**
 * Return (mask & a) | (~mask & b);
 */
LLVMValueRef
lp_build_select_bitwise(struct lp_build_context *bld,
                        LLVMValueRef mask,
                        LLVMValueRef a,
                        LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if (a == b) {
      return a;
   }

   if(type.floating) {
      LLVMTypeRef int_vec_type = lp_build_int_vec_type(bld->gallivm, type);
      a = LLVMBuildBitCast(builder, a, int_vec_type, "");
      b = LLVMBuildBitCast(builder, b, int_vec_type, "");
   }

   a = LLVMBuildAnd(builder, a, mask, "");

   /* This often gets translated to PANDN, but sometimes the NOT is
    * pre-computed and stored in another constant. The best strategy depends
    * on available registers, so it is not a big deal -- hopefully LLVM makes
    * the right decision, taking the rest of the program into account.
    */
   b = LLVMBuildAnd(builder, b, LLVMBuildNot(builder, mask, ""), "");

   res = LLVMBuildOr(builder, a, b, "");

   if(type.floating) {
      LLVMTypeRef vec_type = lp_build_vec_type(bld->gallivm, type);
      res = LLVMBuildBitCast(builder, res, vec_type, "");
   }

   return res;
}


/**
 * Return mask ? a : b;
 *
 * mask is a bitwise mask, composed of 0 or ~0 for each element. Any other value
 * will yield unpredictable results.
 */
LLVMValueRef
lp_build_select(struct lp_build_context *bld,
                LLVMValueRef mask,
                LLVMValueRef a,
                LLVMValueRef b)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMContextRef lc = bld->gallivm->context;
   struct lp_type type = bld->type;
   LLVMValueRef res;

   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == b)
      return a;

   if (type.length == 1) {
      mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (!(HAVE_LLVM == 0x0307) &&
            (LLVMIsConstant(mask) ||
             LLVMGetInstructionOpcode(mask) == LLVMSExt)) {
      /* Generate a vector select.
       *
       * Using vector selects should avoid emitting intrinsics hence avoid
       * hindering optimization passes, but vector selects were not properly
       * supported for a long time, and LLVM will generate poor code when
       * the mask is not the result of a comparison.
       * Also, llvm 3.7 may miscompile them (bug 94972).
       * XXX: Even if the instruction was an SExt, this may still produce
       * terrible code. Try piglit stencil-twoside.
       */

      /* Convert the mask to a vector of booleans.
       *
       * XXX: On x86 the mask is controlled by the MSB, so if we shifted the
       * mask by `type.width - 1`, LLVM should realize the mask is ready.
       * Alas, what really happens is that LLVM will emit two shifts back to
       * back.
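       *
       * (For a well-formed mask each element is either 0 or ~0, so the low
       * bit kept by the plain trunc below already matches the MSB for those
       * two values.)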
       */
      if (0) {
         LLVMValueRef shift = LLVMConstInt(bld->int_elem_type,
                                           bld->type.width - 1,
                                           0);
         shift = lp_build_broadcast(bld->gallivm, bld->int_vec_type, shift);
         mask = LLVMBuildLShr(builder, mask, shift, "");
      }

      LLVMTypeRef bool_vec_type =
         LLVMVectorType(LLVMInt1TypeInContext(lc), type.length);
      mask = LLVMBuildTrunc(builder, mask, bool_vec_type, "");
      res = LLVMBuildSelect(builder, mask, a, b, "");
   }
   else if (((util_cpu_caps.has_sse4_1 &&
              type.width * type.length == 128) ||
             (util_cpu_caps.has_avx &&
              type.width * type.length == 256 && type.width >= 32) ||
             (util_cpu_caps.has_avx2 &&
              type.width * type.length == 256)) &&
            !LLVMIsConstant(a) &&
            !LLVMIsConstant(b) &&
            !LLVMIsConstant(mask)) {
      const char *intrinsic;
      LLVMTypeRef arg_type;
      LLVMValueRef args[3];

      /*
       * AVX only has float blends, but we can simply cast i32/i64
       * to float.
       */
      if (type.width * type.length == 256) {
         if (type.width == 64) {
            intrinsic = "llvm.x86.avx.blendv.pd.256";
            arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 4);
         }
         else if (type.width == 32) {
            intrinsic = "llvm.x86.avx.blendv.ps.256";
            arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 8);
         }
         else {
            assert(util_cpu_caps.has_avx2);
            intrinsic = "llvm.x86.avx2.pblendvb";
            arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 32);
         }
      }
      else if (type.floating && type.width == 64) {
         intrinsic = "llvm.x86.sse41.blendvpd";
         arg_type = LLVMVectorType(LLVMDoubleTypeInContext(lc), 2);
      }
      else if (type.floating && type.width == 32) {
         intrinsic = "llvm.x86.sse41.blendvps";
         arg_type = LLVMVectorType(LLVMFloatTypeInContext(lc), 4);
      }
      else {
         intrinsic = "llvm.x86.sse41.pblendvb";
         arg_type = LLVMVectorType(LLVMInt8TypeInContext(lc), 16);
      }

      if (arg_type != bld->int_vec_type) {
         mask = LLVMBuildBitCast(builder, mask, arg_type, "");
      }

      if (arg_type != bld->vec_type) {
         a = LLVMBuildBitCast(builder, a, arg_type, "");
         b = LLVMBuildBitCast(builder, b, arg_type, "");
      }

      args[0] = b;
      args[1] = a;
      args[2] = mask;

      res = lp_build_intrinsic(builder, intrinsic,
                               arg_type, args, ARRAY_SIZE(args), 0);

      if (arg_type != bld->vec_type) {
         res = LLVMBuildBitCast(builder, res, bld->vec_type, "");
      }
   }
   else {
      res = lp_build_select_bitwise(bld, mask, a, b);
   }

   return res;
}


/**
 * Return mask ? a : b;
 *
 * mask is a TGSI_WRITEMASK_xxx.
 */
LLVMValueRef
lp_build_select_aos(struct lp_build_context *bld,
                    unsigned mask,
                    LLVMValueRef a,
                    LLVMValueRef b,
                    unsigned num_channels)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const struct lp_type type = bld->type;
   const unsigned n = type.length;
   unsigned i, j;

   assert((mask & ~0xf) == 0);
   assert(lp_check_value(type, a));
   assert(lp_check_value(type, b));

   if(a == b)
      return a;
   if((mask & 0xf) == 0xf)
      return a;
   if((mask & 0xf) == 0x0)
      return b;
   if(a == bld->undef || b == bld->undef)
      return bld->undef;

   /*
    * There are two major ways of accomplishing this:
    * - with a shuffle
    * - with a select
    *
    * The flip between these is empirical and might need to be adjusted.
    */
   if (n <= 4) {
      /*
       * Shuffle.
       */
      LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
      LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];

      for(j = 0; j < n; j += num_channels)
         for(i = 0; i < num_channels; ++i)
            shuffles[j + i] = LLVMConstInt(elem_type,
                                           (mask & (1 << i) ? 0 : n) + j + i,
                                           0);

      return LLVMBuildShuffleVector(builder, a, b,
                                    LLVMConstVector(shuffles, n), "");
   }
   else {
      LLVMValueRef mask_vec = lp_build_const_mask_aos(bld->gallivm, type,
                                                      mask, num_channels);
      return lp_build_select(bld, mask_vec, a, b);
   }
}


/**
 * Return (scalar-cast)val ? true : false;
 */
LLVMValueRef
lp_build_any_true_range(struct lp_build_context *bld,
                        unsigned real_length,
                        LLVMValueRef val)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMTypeRef scalar_type;
   LLVMTypeRef true_type;

   assert(real_length <= bld->type.length);

   true_type = LLVMIntTypeInContext(bld->gallivm->context,
                                    bld->type.width * real_length);
   scalar_type = LLVMIntTypeInContext(bld->gallivm->context,
                                      bld->type.width * bld->type.length);
   val = LLVMBuildBitCast(builder, val, scalar_type, "");
   /*
    * We're always using native types here, so we can use intrinsics.
    * However, if we don't do per-element calculations, we must ensure
    * the excess elements aren't used since they may contain garbage.
    */
   if (real_length < bld->type.length) {
      val = LLVMBuildTrunc(builder, val, true_type, "");
   }
   return LLVMBuildICmp(builder, LLVMIntNE, val,
                        LLVMConstNull(true_type), "");
}
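

/*
 * Usage sketch (illustrative only; the lp_build_context setup and the value
 * names below are hypothetical, not part of this file):
 *
 *    // per-element mask, ~0 where src_z <= dst_z, 0 elsewhere
 *    LLVMValueRef mask = lp_build_cmp(&bld, PIPE_FUNC_LEQUAL, src_z, dst_z);
 *
 *    // pick the new color where the test passed, keep the old one elsewhere
 *    LLVMValueRef color = lp_build_select(&bld, mask, new_color, old_color);
 *
 *    // single i1 telling whether any element passed at all
 *    LLVMValueRef any = lp_build_any_true_range(&bld, bld.type.length, mask);
 */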