HELLO·Android
系统源代码
IT资讯
技术文章
我的收藏
注册
登录
-
我收藏的文章
创建代码块
我的代码块
我的账号
Lollipop MR1
|
5.1.0_r3
下载
查看原文件
收藏
根目录
external
llvm
lib
Target
X86
X86ISelLowering.cpp
//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// // // This file defines the interfaces that X86 uses to lower LLVM code into a // selection DAG. // //===----------------------------------------------------------------------===// #include "X86ISelLowering.h" #include "Utils/X86ShuffleDecode.h" #include "X86CallingConv.h" #include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/ADT/VariadicFunction.h" #include "llvm/CodeGen/IntrinsicLowering.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Target/TargetOptions.h" #include
#include
#include
using namespace llvm; #define DEBUG_TYPE "x86-isel" STATISTIC(NumTailCalls, "Number of tail calls"); static cl::opt
ExperimentalVectorWideningLegalization( "x86-experimental-vector-widening-legalization", cl::init(false), cl::desc("Enable an experimental vector type legalization through widening " "rather than promotion."), cl::Hidden); static cl::opt
ExperimentalVectorShuffleLowering( "x86-experimental-vector-shuffle-lowering", cl::init(false), cl::desc("Enable an experimental vector shuffle lowering code path."), cl::Hidden); // Forward declarations. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2); static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl, unsigned vectorWidth) { assert((vectorWidth == 128 || vectorWidth == 256) && "Unsupported vector width"); EVT VT = Vec.getValueType(); EVT ElVT = VT.getVectorElementType(); unsigned Factor = VT.getSizeInBits()/vectorWidth; EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT, VT.getVectorNumElements()/Factor); // Extract from UNDEF is UNDEF. if (Vec.getOpcode() == ISD::UNDEF) return DAG.getUNDEF(ResultVT); // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits(); // This is the index of the first element of the vectorWidth-bit chunk // we want. unsigned NormalizedIdxVal = (((IdxVal * ElVT.getSizeInBits()) / vectorWidth) * ElemsPerChunk); // If the input is a buildvector just emit a smaller one. if (Vec.getOpcode() == ISD::BUILD_VECTOR) return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT, makeArrayRef(Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal); SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); return Result; } /// Generate a DAG to grab 128-bits from a vector > 128 bits. This /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4 /// instructions or a simple subregister reference. Idx is an index in the /// 128 bits we want. It need not be aligned to a 128-bit bounday. That makes /// lowering EXTRACT_VECTOR_ELT operations easier. 
static SDValue Extract128BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  // The source is expected to be a 256-bit or a 512-bit vector; the actual
  // work is delegated to the width-generic helper.
  assert((Vec.getValueType().is256BitVector() ||
          Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 128);
}

/// Build the DAG that pulls a 256-bit piece out of a 512-bit vector.
/// IdxVal is forwarded unchanged to ExtractSubVector.
static SDValue Extract256BitVector(SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, SDLoc dl) {
  assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
  return ExtractSubVector(Vec, IdxVal, DAG, dl, 256);
}

/// Insert the vectorWidth-bit vector Vec into Result at the chunk that
/// contains element IdxVal, producing an INSERT_SUBVECTOR node whose type is
/// the type of Result. vectorWidth must be 128 or 256. When Vec is UNDEF,
/// Result is handed back unchanged.
static SDValue InsertSubVector(SDValue Result, SDValue Vec,
                               unsigned IdxVal, SelectionDAG &DAG,
                               SDLoc dl, unsigned vectorWidth) {
  assert((vectorWidth == 128 || vectorWidth == 256) &&
         "Unsupported vector width");

  // An UNDEF source contributes nothing, so the destination is the answer.
  if (Vec.getOpcode() == ISD::UNDEF)
    return Result;

  // Width in bits of one element of the vector being inserted.
  unsigned EltBits = Vec.getValueType().getVectorElementType().getSizeInBits();

  // How many such elements fit into one vectorWidth-bit chunk.
  unsigned EltsPerChunk = vectorWidth / EltBits;

  // Which chunk IdxVal falls into; the division discards any offset inside
  // the chunk, so the insertion always starts on a chunk boundary.
  unsigned ChunkNo = (IdxVal * EltBits) / vectorWidth;

  SDValue ChunkStart = DAG.getIntPtrConstant(ChunkNo * EltsPerChunk);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, Result.getValueType(), Result,
                     Vec, ChunkStart);
}

/// Generate a DAG to put 128-bits into a vector > 128 bits.  This
/// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
/// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
/// simple superregister reference.  Idx is an index in the 128 bits
/// we want.  It need not be aligned to a 128-bit boundary.  That makes
/// lowering INSERT_VECTOR_ELT operations easier.
static SDValue Insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 128); } static SDValue Insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, SDLoc dl) { assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!"); return InsertSubVector(Result, Vec, IdxVal, DAG, dl, 256); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 /// instructions. This is used because creating CONCAT_VECTOR nodes of /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower /// large BUILD_VECTORS. static SDValue Concat128BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert128BitVector(V, V2, NumElems/2, DAG, dl); } static SDValue Concat256BitVectors(SDValue V1, SDValue V2, EVT VT, unsigned NumElems, SelectionDAG &DAG, SDLoc dl) { SDValue V = Insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl); return Insert256BitVector(V, V2, NumElems/2, DAG, dl); } static TargetLoweringObjectFile *createTLOF(const Triple &TT) { if (TT.isOSBinFormatMachO()) { if (TT.getArch() == Triple::x86_64) return new X86_64MachoTargetObjectFile(); return new TargetLoweringObjectFileMachO(); } if (TT.isOSLinux()) return new X86LinuxTargetObjectFile(); if (TT.isOSBinFormatELF()) return new TargetLoweringObjectFileELF(); if (TT.isKnownWindowsMSVCEnvironment()) return new X86WindowsTargetObjectFile(); if (TT.isOSBinFormatCOFF()) return new TargetLoweringObjectFileCOFF(); llvm_unreachable("unknown subtarget type"); } // FIXME: This should stop caching the target machine as soon as // we can remove resetOperationActions et al. 
X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) : TargetLowering(TM, createTLOF(Triple(TM.getTargetTriple()))) { Subtarget = &TM.getSubtarget
(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); TD = getDataLayout(); resetOperationActions(); } void X86TargetLowering::resetOperationActions() { const TargetMachine &TM = getTargetMachine(); static bool FirstTimeThrough = true; // If none of the target options have changed, then we don't need to reset the // operation actions. if (!FirstTimeThrough && TO == TM.Options) return; if (!FirstTimeThrough) { // Reinitialize the actions. initActions(); FirstTimeThrough = false; } TO = TM.Options; // Set up the TargetLowering object. static const MVT IntVTs[] = { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }; // X86 is weird, it always uses i8 for shift amounts and setcc results. setBooleanContents(ZeroOrOneBooleanContent); // X86-SSE is even stranger. It uses -1 or 0 for vector masks. setBooleanVectorContents(ZeroOrNegativeOneBooleanContent); // For 64-bit since we have so many registers use the ILP scheduler, for // 32-bit code use the register pressure specific scheduling. // For Atom, always use ILP scheduling. if (Subtarget->isAtom()) setSchedulingPreference(Sched::ILP); else if (Subtarget->is64Bit()) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); const X86RegisterInfo *RegInfo = static_cast
(TM.getRegisterInfo()); setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); // Bypass expensive divides on Atom when compiling with O2 if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) { addBypassSlowDiv(32, 8); if (Subtarget->is64Bit()) addBypassSlowDiv(64, 16); } if (Subtarget->isTargetKnownWindowsMSVC()) { // Setup Windows compiler runtime calls. setLibcallName(RTLIB::SDIV_I64, "_alldiv"); setLibcallName(RTLIB::UDIV_I64, "_aulldiv"); setLibcallName(RTLIB::SREM_I64, "_allrem"); setLibcallName(RTLIB::UREM_I64, "_aullrem"); setLibcallName(RTLIB::MUL_I64, "_allmul"); setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall); setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall); // The _ftol2 runtime function has an unusual calling conv, which // is modeled by a special pseudo-instruction. setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr); setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr); setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr); setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr); } if (Subtarget->isTargetDarwin()) { // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp. setUseUnderscoreSetJmp(false); setUseUnderscoreLongJmp(false); } else if (Subtarget->isTargetWindowsGNU()) { // MS runtime is weird: it exports _setjmp, but longjmp! setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(false); } else { setUseUnderscoreSetJmp(true); setUseUnderscoreLongJmp(true); } // Set up the register classes. 
addRegisterClass(MVT::i8, &X86::GR8RegClass); addRegisterClass(MVT::i16, &X86::GR16RegClass); addRegisterClass(MVT::i32, &X86::GR32RegClass); if (Subtarget->is64Bit()) addRegisterClass(MVT::i64, &X86::GR64RegClass); setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote); // We don't accept any truncstore of integer registers. setTruncStoreAction(MVT::i64, MVT::i32, Expand); setTruncStoreAction(MVT::i64, MVT::i16, Expand); setTruncStoreAction(MVT::i64, MVT::i8 , Expand); setTruncStoreAction(MVT::i32, MVT::i16, Expand); setTruncStoreAction(MVT::i32, MVT::i8 , Expand); setTruncStoreAction(MVT::i16, MVT::i8, Expand); // SETOEQ and SETUNE require checking two conditions. setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand); setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand); setCondCodeAction(ISD::SETUNE, MVT::f32, Expand); setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote); if (Subtarget->is64Bit()) { setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote); setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); } else if (!TM.Options.UseSoftFloat) { // We have an algorithm for SSE2->double, and we turn this into a // 64-bit FILD followed by conditional FADD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom); // We have an algorithm for SSE2, and we turn this into a 64-bit // FILD for other targets. setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom); } // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have // this operation. 
setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote); if (!TM.Options.UseSoftFloat) { // SSE has no i16 to fp conversion, only i32 if (X86ScalarSSEf32) { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom); } } else { setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote); setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote); } // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64 // are Legal, f80 is custom lowered. setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom); setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom); // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have // this operation. setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote); if (X86ScalarSSEf32) { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote); // f32 and f64 cases are Legal, f80 case is not setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } else { setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom); setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom); } // Handle FP_TO_UINT by promoting the destination to a larger signed // conversion. setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote); setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote); if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand); setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote); } else if (!TM.Options.UseSoftFloat) { // Since AVX is a superset of SSE3, only check for SSE here. if (Subtarget->hasSSE1() && !Subtarget->hasSSE3()) // Expand FP_TO_UINT into a select. 
// FIXME: We would like to use a Custom expander here eventually to do // the optimal thing for SSE vs. the default expansion in the legalizer. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand); else // With SSE3 we can use fisttpll to convert to a signed i64; without // SSE, we're stuck with a fistpll. setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom); } if (isTargetFTOL()) { // Use the _ftol2 runtime function, which has a pseudo-instruction // to handle its weird calling convention. setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom); } // TODO: when we have SSE, these could be more efficient, by using movd/movq. if (!X86ScalarSSEf64) { setOperationAction(ISD::BITCAST , MVT::f32 , Expand); setOperationAction(ISD::BITCAST , MVT::i32 , Expand); if (Subtarget->is64Bit()) { setOperationAction(ISD::BITCAST , MVT::f64 , Expand); // Without SSE, i64->f64 goes through memory. setOperationAction(ISD::BITCAST , MVT::i64 , Expand); } } // Scalar integer divide and remainder are lowered to use operations that // produce two results, to match the available instructions. This exposes // the two-result form to trivial CSE, which is able to combine x/y and x%y // into a single instruction. // // Scalar integer multiply-high is also lowered to use two-result // operations, to match the available instructions. However, plain multiply // (low) operations are left as Legal, as there are single-result // instructions for this in x86. Using the two-result multiply instructions // when both high and low results are needed must be arranged by dagcombine. for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences. 
setOperationAction(ISD::ADDC, VT, Custom); setOperationAction(ISD::ADDE, VT, Custom); setOperationAction(ISD::SUBC, VT, Custom); setOperationAction(ISD::SUBE, VT, Custom); } setOperationAction(ISD::BR_JT , MVT::Other, Expand); setOperationAction(ISD::BRCOND , MVT::Other, Custom); setOperationAction(ISD::BR_CC , MVT::f32, Expand); setOperationAction(ISD::BR_CC , MVT::f64, Expand); setOperationAction(ISD::BR_CC , MVT::f80, Expand); setOperationAction(ISD::BR_CC , MVT::i8, Expand); setOperationAction(ISD::BR_CC , MVT::i16, Expand); setOperationAction(ISD::BR_CC , MVT::i32, Expand); setOperationAction(ISD::BR_CC , MVT::i64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f32, Expand); setOperationAction(ISD::SELECT_CC , MVT::f64, Expand); setOperationAction(ISD::SELECT_CC , MVT::f80, Expand); setOperationAction(ISD::SELECT_CC , MVT::i8, Expand); setOperationAction(ISD::SELECT_CC , MVT::i16, Expand); setOperationAction(ISD::SELECT_CC , MVT::i32, Expand); setOperationAction(ISD::SELECT_CC , MVT::i64, Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal); setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand); setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f32 , Expand); setOperationAction(ISD::FREM , MVT::f64 , Expand); setOperationAction(ISD::FREM , MVT::f80 , Expand); setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom); // Promote the i8 variants and force them on up to i32 which has a shorter // encoding. 
setOperationAction(ISD::CTTZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTTZ_ZERO_UNDEF , MVT::i8 , Promote); AddPromotedToType (ISD::CTTZ_ZERO_UNDEF , MVT::i8 , MVT::i32); if (Subtarget->hasBMI()) { setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTTZ , MVT::i16 , Custom); setOperationAction(ISD::CTTZ , MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::CTTZ , MVT::i64 , Custom); } if (Subtarget->hasLZCNT()) { // When promoting the i8 variants, force them to i32 for a shorter // encoding. setOperationAction(ISD::CTLZ , MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ , MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Promote); AddPromotedToType (ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand); } else { setOperationAction(ISD::CTLZ , MVT::i8 , Custom); setOperationAction(ISD::CTLZ , MVT::i16 , Custom); setOperationAction(ISD::CTLZ , MVT::i32 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::CTLZ , MVT::i64 , Custom); setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom); } } // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. 
if (TM.Options.UseSoftFloat || !Subtarget->hasF16C()) { setOperationAction(ISD::FP16_TO_FP32, MVT::f32, Expand); setOperationAction(ISD::FP32_TO_FP16, MVT::i16, Expand); } if (Subtarget->hasPOPCNT()) { setOperationAction(ISD::CTPOP , MVT::i8 , Promote); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); setOperationAction(ISD::CTPOP , MVT::i32 , Expand); if (Subtarget->is64Bit()) setOperationAction(ISD::CTPOP , MVT::i64 , Expand); } setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom); if (!Subtarget->hasMOVBE()) setOperationAction(ISD::BSWAP , MVT::i16 , Expand); // These should be promoted to a larger select which is supported. setOperationAction(ISD::SELECT , MVT::i1 , Promote); // X86 wants to expand cmov itself. setOperationAction(ISD::SELECT , MVT::i8 , Custom); setOperationAction(ISD::SELECT , MVT::i16 , Custom); setOperationAction(ISD::SELECT , MVT::i32 , Custom); setOperationAction(ISD::SELECT , MVT::f32 , Custom); setOperationAction(ISD::SELECT , MVT::f64 , Custom); setOperationAction(ISD::SELECT , MVT::f80 , Custom); setOperationAction(ISD::SETCC , MVT::i8 , Custom); setOperationAction(ISD::SETCC , MVT::i16 , Custom); setOperationAction(ISD::SETCC , MVT::i32 , Custom); setOperationAction(ISD::SETCC , MVT::f32 , Custom); setOperationAction(ISD::SETCC , MVT::f64 , Custom); setOperationAction(ISD::SETCC , MVT::f80 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SELECT , MVT::i64 , Custom); setOperationAction(ISD::SETCC , MVT::i64 , Custom); } setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support // SjLj exception handling but a light-weight setjmp/longjmp replacement to // support continuation, user-level threading, and etc.. As a result, no // other SjLj exception interfaces are implemented and please don't build // your own exception handling based on them. 
// LLVM/Clang supports zero-cost DWARF exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); // Darwin ABI issue. setOperationAction(ISD::ConstantPool , MVT::i32 , Custom); setOperationAction(ISD::JumpTable , MVT::i32 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i32 , Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32 , Custom); if (Subtarget->is64Bit()) setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom); setOperationAction(ISD::ExternalSymbol , MVT::i32 , Custom); setOperationAction(ISD::BlockAddress , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::ConstantPool , MVT::i64 , Custom); setOperationAction(ISD::JumpTable , MVT::i64 , Custom); setOperationAction(ISD::GlobalAddress , MVT::i64 , Custom); setOperationAction(ISD::ExternalSymbol, MVT::i64 , Custom); setOperationAction(ISD::BlockAddress , MVT::i64 , Custom); } // 64-bit addm sub, shl, sra, srl (iff 32-bit x86) setOperationAction(ISD::SHL_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i32 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i32 , Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::SHL_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRA_PARTS , MVT::i64 , Custom); setOperationAction(ISD::SRL_PARTS , MVT::i64 , Custom); } if (Subtarget->hasSSE1()) setOperationAction(ISD::PREFETCH , MVT::Other, Legal); setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom); // Expand certain atomics for (unsigned i = 0; i != array_lengthof(IntVTs); ++i) { MVT VT = IntVTs[i]; setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom); setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom); setOperationAction(ISD::ATOMIC_STORE, VT, Custom); } if (Subtarget->hasCmpxchg16b()) { setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom); } // FIXME - use subtarget debug flags if (!Subtarget->isTargetDarwin() && 
!Subtarget->isTargetELF() && !Subtarget->isTargetCygMing() && !Subtarget->isTargetWin64()) { setOperationAction(ISD::EH_LABEL, MVT::Other, Expand); } if (Subtarget->is64Bit()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); } else { setExceptionPointerRegister(X86::EAX); setExceptionSelectorRegister(X86::EDX); } setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom); setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom); setOperationAction(ISD::TRAP, MVT::Other, Legal); setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal); // VASTART needs to be custom lowered to use the VarArgsFrameIndex setOperationAction(ISD::VASTART , MVT::Other, Custom); setOperationAction(ISD::VAEND , MVT::Other, Expand); if (Subtarget->is64Bit() && !Subtarget->isTargetWin64()) { // TargetInfo::X86_64ABIBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Custom); setOperationAction(ISD::VACOPY , MVT::Other, Custom); } else { // TargetInfo::CharPtrBuiltinVaList setOperationAction(ISD::VAARG , MVT::Other, Expand); setOperationAction(ISD::VACOPY , MVT::Other, Expand); } setOperationAction(ISD::STACKSAVE, MVT::Other, Expand); setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? MVT::i64 : MVT::i32, Custom); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { // f32 and f64 use SSE. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::FR64RegClass); // Use ANDPD to simulate FABS. setOperationAction(ISD::FABS , MVT::f64, Custom); setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f64, Custom); setOperationAction(ISD::FNEG , MVT::f32, Custom); // Use ANDPD and ORPD to simulate FCOPYSIGN. 
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // Lower this to FGETSIGNx86 plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Expand FP immediates into loads from the stack, except for the special // cases we handle. addLegalFPImmediate(APFloat(+0.0)); // xorpd addLegalFPImmediate(APFloat(+0.0f)); // xorps } else if (!TM.Options.UseSoftFloat && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. setOperationAction(ISD::FABS , MVT::f32, Custom); // Use XORP to simulate FNEG. setOperationAction(ISD::FNEG , MVT::f32, Custom); setOperationAction(ISD::UNDEF, MVT::f64, Expand); // Use ANDPS and ORPS to simulate FCOPYSIGN. setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom); // We don't support sin/cos/fmod setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); // Special cases we handle for FP constants. 
addLegalFPImmediate(APFloat(+0.0f)); // xorps addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } } else if (!TM.Options.UseSoftFloat) { // f32 and f64 in x87. // Set up the FP register classes. addRegisterClass(MVT::f64, &X86::RFP64RegClass); addRegisterClass(MVT::f32, &X86::RFP32RegClass); setOperationAction(ISD::UNDEF, MVT::f64, Expand); setOperationAction(ISD::UNDEF, MVT::f32, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand); if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f64, Expand); setOperationAction(ISD::FSIN , MVT::f32, Expand); setOperationAction(ISD::FCOS , MVT::f64, Expand); setOperationAction(ISD::FCOS , MVT::f32, Expand); setOperationAction(ISD::FSINCOS, MVT::f64, Expand); setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS addLegalFPImmediate(APFloat(+0.0f)); // FLD0 addLegalFPImmediate(APFloat(+1.0f)); // FLD1 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS } // We don't support FMA. setOperationAction(ISD::FMA, MVT::f64, Expand); setOperationAction(ISD::FMA, MVT::f32, Expand); // Long double always uses X87. 
if (!TM.Options.UseSoftFloat) { addRegisterClass(MVT::f80, &X86::RFP80RegClass); setOperationAction(ISD::UNDEF, MVT::f80, Expand); setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand); { APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended); addLegalFPImmediate(TmpFlt); // FLD0 TmpFlt.changeSign(); addLegalFPImmediate(TmpFlt); // FLD0/FCHS bool ignored; APFloat TmpFlt2(+1.0); TmpFlt2.convert(APFloat::x87DoubleExtended, APFloat::rmNearestTiesToEven, &ignored); addLegalFPImmediate(TmpFlt2); // FLD1 TmpFlt2.changeSign(); addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } if (!TM.Options.UnsafeFPMath) { setOperationAction(ISD::FSIN , MVT::f80, Expand); setOperationAction(ISD::FCOS , MVT::f80, Expand); setOperationAction(ISD::FSINCOS, MVT::f80, Expand); } setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); setOperationAction(ISD::FTRUNC, MVT::f80, Expand); setOperationAction(ISD::FRINT, MVT::f80, Expand); setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand); setOperationAction(ISD::FMA, MVT::f80, Expand); } // Always use a library call for pow. setOperationAction(ISD::FPOW , MVT::f32 , Expand); setOperationAction(ISD::FPOW , MVT::f64 , Expand); setOperationAction(ISD::FPOW , MVT::f80 , Expand); setOperationAction(ISD::FLOG, MVT::f80, Expand); setOperationAction(ISD::FLOG2, MVT::f80, Expand); setOperationAction(ISD::FLOG10, MVT::f80, Expand); setOperationAction(ISD::FEXP, MVT::f80, Expand); setOperationAction(ISD::FEXP2, MVT::f80, Expand); // First set operation action for all vector types to either promote // (for widening) or expand (for scalarization). Then we will selectively // turn on ones that can be effectively codegen'd. 
for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT VT = (MVT::SimpleValueType)i; setOperationAction(ISD::ADD , VT, Expand); setOperationAction(ISD::SUB , VT, Expand); setOperationAction(ISD::FADD, VT, Expand); setOperationAction(ISD::FNEG, VT, Expand); setOperationAction(ISD::FSUB, VT, Expand); setOperationAction(ISD::MUL , VT, Expand); setOperationAction(ISD::FMUL, VT, Expand); setOperationAction(ISD::SDIV, VT, Expand); setOperationAction(ISD::UDIV, VT, Expand); setOperationAction(ISD::FDIV, VT, Expand); setOperationAction(ISD::SREM, VT, Expand); setOperationAction(ISD::UREM, VT, Expand); setOperationAction(ISD::LOAD, VT, Expand); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand); setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand); setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand); setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FSIN, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FCOS, VT, Expand); setOperationAction(ISD::FSINCOS, VT, Expand); setOperationAction(ISD::FREM, VT, Expand); setOperationAction(ISD::FMA, VT, Expand); setOperationAction(ISD::FPOWI, VT, Expand); setOperationAction(ISD::FSQRT, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); setOperationAction(ISD::FFLOOR, VT, Expand); setOperationAction(ISD::FCEIL, VT, Expand); setOperationAction(ISD::FTRUNC, VT, Expand); setOperationAction(ISD::FRINT, VT, Expand); setOperationAction(ISD::FNEARBYINT, VT, Expand); setOperationAction(ISD::SMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHS, VT, Expand); setOperationAction(ISD::UMUL_LOHI, VT, Expand); setOperationAction(ISD::MULHU, VT, Expand); setOperationAction(ISD::SDIVREM, VT, Expand); setOperationAction(ISD::UDIVREM, VT, Expand); setOperationAction(ISD::FPOW, VT, Expand); setOperationAction(ISD::CTPOP, VT, Expand); 
setOperationAction(ISD::CTTZ, VT, Expand); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::CTLZ, VT, Expand); setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand); setOperationAction(ISD::SHL, VT, Expand); setOperationAction(ISD::SRA, VT, Expand); setOperationAction(ISD::SRL, VT, Expand); setOperationAction(ISD::ROTL, VT, Expand); setOperationAction(ISD::ROTR, VT, Expand); setOperationAction(ISD::BSWAP, VT, Expand); setOperationAction(ISD::SETCC, VT, Expand); setOperationAction(ISD::FLOG, VT, Expand); setOperationAction(ISD::FLOG2, VT, Expand); setOperationAction(ISD::FLOG10, VT, Expand); setOperationAction(ISD::FEXP, VT, Expand); setOperationAction(ISD::FEXP2, VT, Expand); setOperationAction(ISD::FP_TO_UINT, VT, Expand); setOperationAction(ISD::FP_TO_SINT, VT, Expand); setOperationAction(ISD::UINT_TO_FP, VT, Expand); setOperationAction(ISD::SINT_TO_FP, VT, Expand); setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand); setOperationAction(ISD::TRUNCATE, VT, Expand); setOperationAction(ISD::SIGN_EXTEND, VT, Expand); setOperationAction(ISD::ZERO_EXTEND, VT, Expand); setOperationAction(ISD::ANY_EXTEND, VT, Expand); setOperationAction(ISD::VSELECT, VT, Expand); setOperationAction(ISD::SELECT_CC, VT, Expand); for (int InnerVT = MVT::FIRST_VECTOR_VALUETYPE; InnerVT <= MVT::LAST_VECTOR_VALUETYPE; ++InnerVT) setTruncStoreAction(VT, (MVT::SimpleValueType)InnerVT, Expand); setLoadExtAction(ISD::SEXTLOAD, VT, Expand); setLoadExtAction(ISD::ZEXTLOAD, VT, Expand); setLoadExtAction(ISD::EXTLOAD, VT, Expand); } // FIXME: In order to prevent SSE instructions being expanded to MMX ones // with -msoft-float, disable use of MMX as well. if (!TM.Options.UseSoftFloat && Subtarget->hasMMX()) { addRegisterClass(MVT::x86mmx, &X86::VR64RegClass); // No operations on x86mmx supported, everything uses intrinsics. } // MMX-sized vectors (other than x86mmx) are expected to be expanded // into smaller operations. 
setOperationAction(ISD::MULHS, MVT::v8i8, Expand); setOperationAction(ISD::MULHS, MVT::v4i16, Expand); setOperationAction(ISD::MULHS, MVT::v2i32, Expand); setOperationAction(ISD::MULHS, MVT::v1i64, Expand); setOperationAction(ISD::AND, MVT::v8i8, Expand); setOperationAction(ISD::AND, MVT::v4i16, Expand); setOperationAction(ISD::AND, MVT::v2i32, Expand); setOperationAction(ISD::AND, MVT::v1i64, Expand); setOperationAction(ISD::OR, MVT::v8i8, Expand); setOperationAction(ISD::OR, MVT::v4i16, Expand); setOperationAction(ISD::OR, MVT::v2i32, Expand); setOperationAction(ISD::OR, MVT::v1i64, Expand); setOperationAction(ISD::XOR, MVT::v8i8, Expand); setOperationAction(ISD::XOR, MVT::v4i16, Expand); setOperationAction(ISD::XOR, MVT::v2i32, Expand); setOperationAction(ISD::XOR, MVT::v1i64, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i8, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4i16, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v2i32, Expand); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v1i64, Expand); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v1i64, Expand); setOperationAction(ISD::SELECT, MVT::v8i8, Expand); setOperationAction(ISD::SELECT, MVT::v4i16, Expand); setOperationAction(ISD::SELECT, MVT::v2i32, Expand); setOperationAction(ISD::SELECT, MVT::v1i64, Expand); setOperationAction(ISD::BITCAST, MVT::v8i8, Expand); setOperationAction(ISD::BITCAST, MVT::v4i16, Expand); setOperationAction(ISD::BITCAST, MVT::v2i32, Expand); setOperationAction(ISD::BITCAST, MVT::v1i64, Expand); if (!TM.Options.UseSoftFloat && Subtarget->hasSSE1()) { addRegisterClass(MVT::v4f32, &X86::VR128RegClass); setOperationAction(ISD::FADD, MVT::v4f32, Legal); setOperationAction(ISD::FSUB, MVT::v4f32, Legal); setOperationAction(ISD::FMUL, MVT::v4f32, Legal); setOperationAction(ISD::FDIV, MVT::v4f32, Legal); setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); setOperationAction(ISD::FNEG, MVT::v4f32, Custom); setOperationAction(ISD::FABS, MVT::v4f32, 
Custom); setOperationAction(ISD::LOAD, MVT::v4f32, Legal); setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::SELECT, MVT::v4f32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE2()) { addRegisterClass(MVT::v2f64, &X86::VR128RegClass); // FIXME: Unfortunately -soft-float and -no-implicit-float means XMM // registers cannot be used even for integer operations. addRegisterClass(MVT::v16i8, &X86::VR128RegClass); addRegisterClass(MVT::v8i16, &X86::VR128RegClass); addRegisterClass(MVT::v4i32, &X86::VR128RegClass); addRegisterClass(MVT::v2i64, &X86::VR128RegClass); setOperationAction(ISD::ADD, MVT::v16i8, Legal); setOperationAction(ISD::ADD, MVT::v8i16, Legal); setOperationAction(ISD::ADD, MVT::v4i32, Legal); setOperationAction(ISD::ADD, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom); setOperationAction(ISD::MULHU, MVT::v8i16, Legal); setOperationAction(ISD::MULHS, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v16i8, Legal); setOperationAction(ISD::SUB, MVT::v8i16, Legal); setOperationAction(ISD::SUB, MVT::v4i32, Legal); setOperationAction(ISD::SUB, MVT::v2i64, Legal); setOperationAction(ISD::MUL, MVT::v8i16, Legal); setOperationAction(ISD::FADD, MVT::v2f64, Legal); setOperationAction(ISD::FSUB, MVT::v2f64, Legal); setOperationAction(ISD::FMUL, MVT::v2f64, Legal); setOperationAction(ISD::FDIV, MVT::v2f64, Legal); setOperationAction(ISD::FSQRT, MVT::v2f64, Legal); setOperationAction(ISD::FNEG, MVT::v2f64, Custom); setOperationAction(ISD::FABS, MVT::v2f64, Custom); setOperationAction(ISD::SETCC, MVT::v2i64, Custom); setOperationAction(ISD::SETCC, MVT::v16i8, Custom); setOperationAction(ISD::SETCC, MVT::v8i16, Custom); 
setOperationAction(ISD::SETCC, MVT::v4i32, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); // Custom lower build_vector, vector_shuffle, and extract_vector_elt. for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to custom lower non-power-of-2 vectors if (!isPowerOf2_32(VT.getVectorNumElements())) continue; // Do not attempt to custom lower non-128-bit vectors if (!VT.is128BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); } setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f64, Custom); if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64. 
for (int i = MVT::v16i8; i != MVT::v2i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-128-bit vectors if (!VT.is128BitVector()) continue; setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v2i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v2i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v2i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v2i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v2i64); } setTruncStoreAction(MVT::f64, MVT::f32, Expand); // Custom lower v2i64 and v2f64 selects. setOperationAction(ISD::LOAD, MVT::v2f64, Legal); setOperationAction(ISD::LOAD, MVT::v2i64, Legal); setOperationAction(ISD::SELECT, MVT::v2f64, Custom); setOperationAction(ISD::SELECT, MVT::v2i64, Custom); setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); // As there is no 64-bit GPR available, we need build a special custom // sequence to convert from v2i32 to v2f32. 
if (!Subtarget->is64Bit()) setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, Legal); setOperationAction(ISD::BITCAST, MVT::v2i32, Custom); setOperationAction(ISD::BITCAST, MVT::v4i16, Custom); setOperationAction(ISD::BITCAST, MVT::v8i8, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f64, Legal); setOperationAction(ISD::FCEIL, MVT::f64, Legal); setOperationAction(ISD::FTRUNC, MVT::f64, Legal); setOperationAction(ISD::FRINT, MVT::f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); setOperationAction(ISD::FRINT, MVT::v4f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal); setOperationAction(ISD::FCEIL, MVT::v2f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal); setOperationAction(ISD::FRINT, MVT::v2f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal); // FIXME: Do we need to handle scalar-to-vector here? setOperationAction(ISD::MUL, MVT::v4i32, Legal); setOperationAction(ISD::VSELECT, MVT::v2f64, Custom); setOperationAction(ISD::VSELECT, MVT::v2i64, Custom); setOperationAction(ISD::VSELECT, MVT::v4i32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f32, Custom); setOperationAction(ISD::VSELECT, MVT::v8i16, Custom); // There is no BLENDI for byte vectors. We don't need to custom lower // some vselects for now. 
setOperationAction(ISD::VSELECT, MVT::v16i8, Legal); // i8 and i16 vectors are custom , because the source register and source // source memory operand types are not the same width. f32 vectors are // custom since the immediate controlling the insert encodes additional // information. setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i8, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom); // FIXME: these should be Legal but thats only for the case where // the index is constant. For now custom expand to deal with that. if (Subtarget->is64Bit()) { setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i64, Custom); } } if (Subtarget->hasSSE2()) { setOperationAction(ISD::SRL, MVT::v8i16, Custom); setOperationAction(ISD::SRL, MVT::v16i8, Custom); setOperationAction(ISD::SHL, MVT::v8i16, Custom); setOperationAction(ISD::SHL, MVT::v16i8, Custom); setOperationAction(ISD::SRA, MVT::v8i16, Custom); setOperationAction(ISD::SRA, MVT::v16i8, Custom); // In the customized shift lowering, the legal cases in AVX2 will be // recognized. 
setOperationAction(ISD::SRL, MVT::v2i64, Custom); setOperationAction(ISD::SRL, MVT::v4i32, Custom); setOperationAction(ISD::SHL, MVT::v2i64, Custom); setOperationAction(ISD::SHL, MVT::v4i32, Custom); setOperationAction(ISD::SRA, MVT::v4i32, Custom); } if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) { addRegisterClass(MVT::v32i8, &X86::VR256RegClass); addRegisterClass(MVT::v16i16, &X86::VR256RegClass); addRegisterClass(MVT::v8i32, &X86::VR256RegClass); addRegisterClass(MVT::v8f32, &X86::VR256RegClass); addRegisterClass(MVT::v4i64, &X86::VR256RegClass); addRegisterClass(MVT::v4f64, &X86::VR256RegClass); setOperationAction(ISD::LOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v4f64, Legal); setOperationAction(ISD::LOAD, MVT::v4i64, Legal); setOperationAction(ISD::FADD, MVT::v8f32, Legal); setOperationAction(ISD::FSUB, MVT::v8f32, Legal); setOperationAction(ISD::FMUL, MVT::v8f32, Legal); setOperationAction(ISD::FDIV, MVT::v8f32, Legal); setOperationAction(ISD::FSQRT, MVT::v8f32, Legal); setOperationAction(ISD::FFLOOR, MVT::v8f32, Legal); setOperationAction(ISD::FCEIL, MVT::v8f32, Legal); setOperationAction(ISD::FTRUNC, MVT::v8f32, Legal); setOperationAction(ISD::FRINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v8f32, Legal); setOperationAction(ISD::FNEG, MVT::v8f32, Custom); setOperationAction(ISD::FABS, MVT::v8f32, Custom); setOperationAction(ISD::FADD, MVT::v4f64, Legal); setOperationAction(ISD::FSUB, MVT::v4f64, Legal); setOperationAction(ISD::FMUL, MVT::v4f64, Legal); setOperationAction(ISD::FDIV, MVT::v4f64, Legal); setOperationAction(ISD::FSQRT, MVT::v4f64, Legal); setOperationAction(ISD::FFLOOR, MVT::v4f64, Legal); setOperationAction(ISD::FCEIL, MVT::v4f64, Legal); setOperationAction(ISD::FTRUNC, MVT::v4f64, Legal); setOperationAction(ISD::FRINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEARBYINT, MVT::v4f64, Legal); setOperationAction(ISD::FNEG, MVT::v4f64, Custom); setOperationAction(ISD::FABS, MVT::v4f64, Custom); // 
(fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted // even though v8i16 is a legal type. setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); setOperationAction(ISD::SRL, MVT::v32i8, Custom); setOperationAction(ISD::SHL, MVT::v16i16, Custom); setOperationAction(ISD::SHL, MVT::v32i8, Custom); setOperationAction(ISD::SRA, MVT::v16i16, Custom); setOperationAction(ISD::SRA, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v32i8, Custom); setOperationAction(ISD::SETCC, MVT::v16i16, Custom); setOperationAction(ISD::SETCC, MVT::v8i32, Custom); setOperationAction(ISD::SETCC, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v4f64, Custom); setOperationAction(ISD::SELECT, MVT::v4i64, Custom); setOperationAction(ISD::SELECT, MVT::v8f32, Custom); setOperationAction(ISD::VSELECT, MVT::v4f64, Custom); setOperationAction(ISD::VSELECT, MVT::v4i64, Custom); setOperationAction(ISD::VSELECT, MVT::v8i32, Custom); setOperationAction(ISD::VSELECT, MVT::v8f32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom); setOperationAction(ISD::ANY_EXTEND, 
MVT::v16i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom); setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom); if (Subtarget->hasFMA() || Subtarget->hasFMA4()) { setOperationAction(ISD::FMA, MVT::v8f32, Legal); setOperationAction(ISD::FMA, MVT::v4f64, Legal); setOperationAction(ISD::FMA, MVT::v4f32, Legal); setOperationAction(ISD::FMA, MVT::v2f64, Legal); setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); } if (Subtarget->hasInt256()) { setOperationAction(ISD::ADD, MVT::v4i64, Legal); setOperationAction(ISD::ADD, MVT::v8i32, Legal); setOperationAction(ISD::ADD, MVT::v16i16, Legal); setOperationAction(ISD::ADD, MVT::v32i8, Legal); setOperationAction(ISD::SUB, MVT::v4i64, Legal); setOperationAction(ISD::SUB, MVT::v8i32, Legal); setOperationAction(ISD::SUB, MVT::v16i16, Legal); setOperationAction(ISD::SUB, MVT::v32i8, Legal); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Legal); setOperationAction(ISD::MUL, MVT::v16i16, Legal); // Don't lower v32i8 because there is no 128-bit byte mul setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom); setOperationAction(ISD::MULHU, MVT::v16i16, Legal); setOperationAction(ISD::MULHS, MVT::v16i16, Legal); setOperationAction(ISD::VSELECT, MVT::v16i16, Custom); setOperationAction(ISD::VSELECT, MVT::v32i8, Legal); } else { setOperationAction(ISD::ADD, MVT::v4i64, Custom); setOperationAction(ISD::ADD, MVT::v8i32, Custom); setOperationAction(ISD::ADD, MVT::v16i16, Custom); setOperationAction(ISD::ADD, MVT::v32i8, Custom); setOperationAction(ISD::SUB, MVT::v4i64, Custom); setOperationAction(ISD::SUB, MVT::v8i32, Custom); setOperationAction(ISD::SUB, MVT::v16i16, Custom); setOperationAction(ISD::SUB, MVT::v32i8, Custom); setOperationAction(ISD::MUL, MVT::v4i64, Custom); setOperationAction(ISD::MUL, MVT::v8i32, Custom); 
setOperationAction(ISD::MUL, MVT::v16i16, Custom); // Don't lower v32i8 because there is no 128-bit byte mul } // In the customized shift lowering, the legal cases in AVX2 will be // recognized. setOperationAction(ISD::SRL, MVT::v4i64, Custom); setOperationAction(ISD::SRL, MVT::v8i32, Custom); setOperationAction(ISD::SHL, MVT::v4i64, Custom); setOperationAction(ISD::SHL, MVT::v8i32, Custom); setOperationAction(ISD::SRA, MVT::v8i32, Custom); // Custom lower several nodes for 256-bit types. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT VT = (MVT::SimpleValueType)i; // Extract subvector is special because the value type // (result) is 128-bit but the source is 256-bit wide. if (VT.is128BitVector()) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); // Do not attempt to custom lower other non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); } // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64. 
for (int i = MVT::v32i8; i != MVT::v4i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-256-bit vectors if (!VT.is256BitVector()) continue; setOperationAction(ISD::AND, VT, Promote); AddPromotedToType (ISD::AND, VT, MVT::v4i64); setOperationAction(ISD::OR, VT, Promote); AddPromotedToType (ISD::OR, VT, MVT::v4i64); setOperationAction(ISD::XOR, VT, Promote); AddPromotedToType (ISD::XOR, VT, MVT::v4i64); setOperationAction(ISD::LOAD, VT, Promote); AddPromotedToType (ISD::LOAD, VT, MVT::v4i64); setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v4i64); } } if (!TM.Options.UseSoftFloat && Subtarget->hasAVX512()) { addRegisterClass(MVT::v16i32, &X86::VR512RegClass); addRegisterClass(MVT::v16f32, &X86::VR512RegClass); addRegisterClass(MVT::v8i64, &X86::VR512RegClass); addRegisterClass(MVT::v8f64, &X86::VR512RegClass); addRegisterClass(MVT::i1, &X86::VK1RegClass); addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); setOperationAction(ISD::BR_CC, MVT::i1, Expand); setOperationAction(ISD::SETCC, MVT::i1, Custom); setOperationAction(ISD::XOR, MVT::i1, Legal); setOperationAction(ISD::OR, MVT::i1, Legal); setOperationAction(ISD::AND, MVT::i1, Legal); setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, Legal); setOperationAction(ISD::LOAD, MVT::v16f32, Legal); setOperationAction(ISD::LOAD, MVT::v8f64, Legal); setOperationAction(ISD::LOAD, MVT::v8i64, Legal); setOperationAction(ISD::LOAD, MVT::v16i32, Legal); setOperationAction(ISD::LOAD, MVT::v16i1, Legal); setOperationAction(ISD::FADD, MVT::v16f32, Legal); setOperationAction(ISD::FSUB, MVT::v16f32, Legal); setOperationAction(ISD::FMUL, MVT::v16f32, Legal); setOperationAction(ISD::FDIV, MVT::v16f32, Legal); setOperationAction(ISD::FSQRT, MVT::v16f32, Legal); setOperationAction(ISD::FNEG, MVT::v16f32, Custom); setOperationAction(ISD::FADD, MVT::v8f64, Legal); setOperationAction(ISD::FSUB, MVT::v8f64, Legal); 
setOperationAction(ISD::FMUL, MVT::v8f64, Legal); setOperationAction(ISD::FDIV, MVT::v8f64, Legal); setOperationAction(ISD::FSQRT, MVT::v8f64, Legal); setOperationAction(ISD::FNEG, MVT::v8f64, Custom); setOperationAction(ISD::FMA, MVT::v8f64, Legal); setOperationAction(ISD::FMA, MVT::v16f32, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal); if (Subtarget->is64Bit()) { setOperationAction(ISD::FP_TO_UINT, MVT::i64, Legal); setOperationAction(ISD::FP_TO_SINT, MVT::i64, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::i64, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::i64, Legal); } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setOperationAction(ISD::TRUNCATE, MVT::i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, 
Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Legal); setOperationAction(ISD::SETCC, MVT::v16i1, Custom); setOperationAction(ISD::SETCC, MVT::v8i1, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom); setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); setOperationAction(ISD::ADD, MVT::v8i64, Legal); setOperationAction(ISD::ADD, MVT::v16i32, Legal); setOperationAction(ISD::SUB, MVT::v8i64, Legal); setOperationAction(ISD::SUB, MVT::v16i32, Legal); setOperationAction(ISD::MUL, MVT::v16i32, Legal); setOperationAction(ISD::SRL, MVT::v8i64, Custom); setOperationAction(ISD::SRL, MVT::v16i32, Custom); setOperationAction(ISD::SHL, MVT::v8i64, Custom); setOperationAction(ISD::SHL, MVT::v16i32, Custom); setOperationAction(ISD::SRA, MVT::v8i64, Custom); setOperationAction(ISD::SRA, MVT::v16i32, Custom); setOperationAction(ISD::AND, MVT::v8i64, Legal); setOperationAction(ISD::OR, MVT::v8i64, Legal); setOperationAction(ISD::XOR, MVT::v8i64, Legal); setOperationAction(ISD::AND, MVT::v16i32, Legal); setOperationAction(ISD::OR, MVT::v16i32, Legal); setOperationAction(ISD::XOR, MVT::v16i32, Legal); if (Subtarget->hasCDI()) { 
setOperationAction(ISD::CTLZ, MVT::v8i64, Legal); setOperationAction(ISD::CTLZ, MVT::v16i32, Legal); } // Custom lower several nodes. for (int i = MVT::FIRST_VECTOR_VALUETYPE; i <= MVT::LAST_VECTOR_VALUETYPE; ++i) { MVT VT = (MVT::SimpleValueType)i; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); // Extract subvector is special because the value type // (result) is 256/128-bit but the source is 512-bit wide. if (VT.is128BitVector() || VT.is256BitVector()) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); if (VT.getVectorElementType() == MVT::i1) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); // Do not attempt to custom lower other non-512-bit vectors if (!VT.is512BitVector()) continue; if ( EltSize >= 32) { setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); setOperationAction(ISD::VSELECT, VT, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); } } for (int i = MVT::v32i8; i != MVT::v8i64; ++i) { MVT VT = (MVT::SimpleValueType)i; // Do not attempt to promote non-256-bit vectors if (!VT.is512BitVector()) continue; setOperationAction(ISD::SELECT, VT, Promote); AddPromotedToType (ISD::SELECT, VT, MVT::v8i64); } }// has AVX-512 // SIGN_EXTEND_INREGs are evaluated by the extend type. Handle the expansion // of this type with custom code. for (int VT = MVT::FIRST_VECTOR_VALUETYPE; VT != MVT::LAST_VECTOR_VALUETYPE; VT++) { setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT, Custom); } // We want to custom lower some of our intrinsics. 
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom); setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); if (!Subtarget->is64Bit()) setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom); // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't // handle type legalization for these operations here. // // FIXME: We really should do custom legalization for addition and // subtraction on x86-32 once PR3203 is fixed. We really can't do much better // than generic legalization for 64-bit multiplication-with-overflow, though. for (unsigned i = 0, e = 3+Subtarget->is64Bit(); i != e; ++i) { // Add/Sub/Mul with overflow operations are custom lowered. MVT VT = IntVTs[i]; setOperationAction(ISD::SADDO, VT, Custom); setOperationAction(ISD::UADDO, VT, Custom); setOperationAction(ISD::SSUBO, VT, Custom); setOperationAction(ISD::USUBO, VT, Custom); setOperationAction(ISD::SMULO, VT, Custom); setOperationAction(ISD::UMULO, VT, Custom); } // There are no 8-bit 3-address imul/mul instructions setOperationAction(ISD::SMULO, MVT::i8, Expand); setOperationAction(ISD::UMULO, MVT::i8, Expand); if (!Subtarget->is64Bit()) { // These libcalls are not available in 32-bit. setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. if (Subtarget->hasSinCos()) { setLibcallName(RTLIB::SINCOS_F32, "sincosf"); setLibcallName(RTLIB::SINCOS_F64, "sincos"); if (Subtarget->isTargetDarwin()) { // For MacOSX, we don't want to the normal expansion of a libcall to // sincos. We want to issue a libcall to __sincos_stret to avoid memory // traffic. 
setOperationAction(ISD::FSINCOS, MVT::f64, Custom); setOperationAction(ISD::FSINCOS, MVT::f32, Custom); } } if (Subtarget->isTargetWin64()) { setOperationAction(ISD::SDIV, MVT::i128, Custom); setOperationAction(ISD::UDIV, MVT::i128, Custom); setOperationAction(ISD::SREM, MVT::i128, Custom); setOperationAction(ISD::UREM, MVT::i128, Custom); setOperationAction(ISD::SDIVREM, MVT::i128, Custom); setOperationAction(ISD::UDIVREM, MVT::i128, Custom); } // We have target-specific dag combine patterns for the following nodes: setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); setTargetDAGCombine(ISD::SHL); setTargetDAGCombine(ISD::SRA); setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::ADD); setTargetDAGCombine(ISD::FADD); setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FMA); setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::LOAD); setTargetDAGCombine(ISD::STORE); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::TRUNCATE); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget->is64Bit()) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); computeRegisterProperties(); // On Darwin, -Os means optimize for size without hurting performance, // do not reduce the limit. MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores MaxStoresPerMemsetOptSize = Subtarget->isTargetDarwin() ? 16 : 8; MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores MaxStoresPerMemcpyOptSize = Subtarget->isTargetDarwin() ? 
8 : 4; MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = Subtarget->isTargetDarwin() ? 8 : 4; setPrefLoopAlignment(4); // 2^4 bytes. // Predictable cmov don't hurt on atom because it's in-order. PredictableSelectIsExpensive = !Subtarget->isAtom(); setPrefFunctionAlignment(4); // 2^4 bytes. } TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { if (ExperimentalVectorWideningLegalization && VT.getVectorNumElements() != 1 && VT.getVectorElementType().getSimpleVT() != MVT::i1) return TypeWidenVector; return TargetLoweringBase::getPreferredVectorAction(VT); } EVT X86TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const { if (!VT.isVector()) return Subtarget->hasAVX512() ? MVT::i1: MVT::i8; if (Subtarget->hasAVX512()) switch(VT.getVectorNumElements()) { case 8: return MVT::v8i1; case 16: return MVT::v16i1; } return VT.changeVectorElementTypeToInteger(); } /// getMaxByValAlign - Helper for getByValTypeAlignment to determine /// the desired ByVal argument alignment. static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) { if (MaxAlign == 16) return; if (VectorType *VTy = dyn_cast
(Ty)) { if (VTy->getBitWidth() == 128) MaxAlign = 16; } else if (ArrayType *ATy = dyn_cast
(Ty)) { unsigned EltAlign = 0; getMaxByValAlign(ATy->getElementType(), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; } else if (StructType *STy = dyn_cast
(Ty)) { for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) { unsigned EltAlign = 0; getMaxByValAlign(STy->getElementType(i), EltAlign); if (EltAlign > MaxAlign) MaxAlign = EltAlign; if (MaxAlign == 16) break; } } } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. For X86, aggregates /// that contain SSE vectors are placed at 16-byte boundaries while the rest /// are at 4-byte boundaries. unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { if (Subtarget->is64Bit()) { // Max of 8 and alignment of type. unsigned TyAlign = TD->getABITypeAlignment(Ty); if (TyAlign > 8) return TyAlign; return 8; } unsigned Align = 4; if (Subtarget->hasSSE1()) getMaxByValAlign(Ty, Align); return Align; } /// getOptimalMemOpType - Returns the target specific optimal type for load /// and store operations as a result of memset, memcpy, and memmove /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, /// probably because the source does not need to be loaded. If 'IsMemset' is /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. 
EVT X86TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); if ((!IsMemset || ZeroMemset) && !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat)) { if (Size >= 16 && (Subtarget->isUnalignedMemAccessFast() || ((DstAlign == 0 || DstAlign >= 16) && (SrcAlign == 0 || SrcAlign >= 16)))) { if (Size >= 32) { if (Subtarget->hasInt256()) return MVT::v8i32; if (Subtarget->hasFp256()) return MVT::v8f32; } if (Subtarget->hasSSE2()) return MVT::v4i32; if (Subtarget->hasSSE1()) return MVT::v4f32; } else if (!MemcpyStrSrc && Size >= 8 && !Subtarget->is64Bit() && Subtarget->hasSSE2()) { // Do not use f64 to lower memcpy if source is string constant. It's // better to use i32 to avoid the loads. return MVT::f64; } } if (Subtarget->is64Bit() && Size >= 8) return MVT::i64; return MVT::i32; } bool X86TargetLowering::isSafeMemOpType(MVT VT) const { if (VT == MVT::f32) return X86ScalarSSEf32; else if (VT == MVT::f64) return X86ScalarSSEf64; return true; } bool X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, unsigned, bool *Fast) const { if (Fast) *Fast = Subtarget->isUnalignedMemAccessFast(); return true; } /// getJumpTableEncoding - Return the entry encoding for a jump table in the /// current function. The returned value is a member of the /// MachineJumpTableInfo::JTEntryKind enum. unsigned X86TargetLowering::getJumpTableEncoding() const { // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF // symbol. if (getTargetMachine().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()) return MachineJumpTableInfo::EK_Custom32; // Otherwise, use the normal jump table encoding heuristics. 
return TargetLowering::getJumpTableEncoding(); } const MCExpr * X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned uid,MCContext &Ctx) const{ assert(MBB->getParent()->getTarget().getRelocationModel() == Reloc::PIC_ && Subtarget->isPICStyleGOT()); // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF // entries. return MCSymbolRefExpr::Create(MBB->getSymbol(), MCSymbolRefExpr::VK_GOTOFF, Ctx); } /// getPICJumpTableRelocaBase - Returns relocation base for the given PIC /// jumptable. SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table, SelectionDAG &DAG) const { if (!Subtarget->is64Bit()) // This doesn't have SDLoc associated with it, but is not really the // same as a Register. return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy()); return Table; } /// getPICJumpTableRelocBaseExpr - This returns the relocation base for the /// given PIC jumptable, the same as getPICJumpTableRelocBase, but as an /// MCExpr. const MCExpr *X86TargetLowering:: getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI, MCContext &Ctx) const { // X86-64 uses RIP relative addressing based on the jump table label. if (Subtarget->isPICStyleRIPRel()) return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx); // Otherwise, the reference is relative to the PIC base. return MCSymbolRefExpr::Create(MF->getPICBaseSymbol(), Ctx); } // FIXME: Why this routine is here? Move to RegInfo! std::pair
X86TargetLowering::findRepresentativeClass(MVT VT) const{ const TargetRegisterClass *RRC = nullptr; uint8_t Cost = 1; switch (VT.SimpleTy) { default: return TargetLowering::findRepresentativeClass(VT); case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64: RRC = Subtarget->is64Bit() ? (const TargetRegisterClass*)&X86::GR64RegClass : (const TargetRegisterClass*)&X86::GR32RegClass; break; case MVT::x86mmx: RRC = &X86::VR64RegClass; break; case MVT::f32: case MVT::f64: case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64: case MVT::v4f32: case MVT::v2f64: case MVT::v32i8: case MVT::v8i32: case MVT::v4i64: case MVT::v8f32: case MVT::v4f64: RRC = &X86::VR128RegClass; break; } return std::make_pair(RRC, Cost); } bool X86TargetLowering::getStackCookieLocation(unsigned &AddressSpace, unsigned &Offset) const { if (!Subtarget->isTargetLinux()) return false; if (Subtarget->is64Bit()) { // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs: Offset = 0x28; if (getTargetMachine().getCodeModel() == CodeModel::Kernel) AddressSpace = 256; else AddressSpace = 257; } else { // %gs:0x14 on i386 Offset = 0x14; AddressSpace = 256; } return true; } bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const { assert(SrcAS != DestAS && "Expected different address spaces!"); return SrcAS < 256 && DestAS < 256; } //===----------------------------------------------------------------------===// // Return Value Calling Convention Implementation //===----------------------------------------------------------------------===// #include "X86GenCallingConv.inc" bool X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl
&Outs, LLVMContext &Context) const { SmallVector
RVLocs; CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(), RVLocs, Context); return CCInfo.CheckReturn(Outs, RetCC_X86); } const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const { static const MCPhysReg ScratchRegs[] = { X86::R11, 0 }; return ScratchRegs; } SDValue X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl
&Outs, const SmallVectorImpl
&OutVals, SDLoc dl, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo
(); SmallVector
RVLocs; CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeReturn(Outs, RetCC_X86); SDValue Flag; SmallVector
RetOps; RetOps.push_back(Chain); // Operand #0 = Chain (updated below) // Operand #1 = Bytes To Pop RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), MVT::i16)); // Copy the result values into the output registers. for (unsigned i = 0; i != RVLocs.size(); ++i) { CCValAssign &VA = RVLocs[i]; assert(VA.isRegLoc() && "Can only return in registers!"); SDValue ValToCopy = OutVals[i]; EVT ValVT = ValToCopy.getValueType(); // Promote values to the appropriate types if (VA.getLocInfo() == CCValAssign::SExt) ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::ZExt) ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::AExt) ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy); else if (VA.getLocInfo() == CCValAssign::BCvt) ValToCopy = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), ValToCopy); assert(VA.getLocInfo() != CCValAssign::FPExt && "Unexpected FP-extend for return value."); // If this is x86-64, and we disabled SSE, we can't return FP values, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && (Subtarget->is64Bit() && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } // Likewise we can't return F64 values with SSE1 only. gcc does so, but // llvm-gcc has never done it right and no one has noticed, so this // should be OK for now. if (ValVT == MVT::f64 && (Subtarget->is64Bit() && !Subtarget->hasSSE2())) report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { // If this is a copy from an xmm register to ST(0), use an FPExtend to // change the value to the FP stack register class. 
if (isScalarFPTypeInSSEReg(VA.getValVT())) ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy); RetOps.push_back(ValToCopy); // Don't emit a copytoreg. continue; } // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64 // which is returned in RAX / RDX. if (Subtarget->is64Bit()) { if (ValVT == MVT::x86mmx) { if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) { ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ValToCopy); ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, ValToCopy); // If we don't have SSE2 available, convert to v4f32 so the generated // register is legal. if (!Subtarget->hasSSE2()) ValToCopy = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32,ValToCopy); } } } Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), ValToCopy, Flag); Flag = Chain.getValue(1); RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT())); } // The x86-64 ABIs require that for returning structs by value we copy // the sret argument into %rax/%eax (depending on ABI) for the return. // Win32 requires us to put the sret argument to %eax as well. // We saved the argument into a virtual register in the entry block, // so now we copy the value out and into %rax/%eax. if (DAG.getMachineFunction().getFunction()->hasStructRetAttr() && (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) { MachineFunction &MF = DAG.getMachineFunction(); X86MachineFunctionInfo *FuncInfo = MF.getInfo
(); unsigned Reg = FuncInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); unsigned RetValReg = (Subtarget->is64Bit() && !Subtarget->isTarget64BitILP32()) ? X86::RAX : X86::EAX; Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag); Flag = Chain.getValue(1); // RAX/EAX now acts like a return value. RetOps.push_back(DAG.getRegister(RetValReg, getPointerTy())); } RetOps[0] = Chain; // Update chain. // Add the flag if we have it. if (Flag.getNode()) RetOps.push_back(Flag); return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps); } bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const { if (N->getNumValues() != 1) return false; if (!N->hasNUsesOfValue(1, 0)) return false; SDValue TCChain = Chain; SDNode *Copy = *N->use_begin(); if (Copy->getOpcode() == ISD::CopyToReg) { // If the copy has a glue operand, we conservatively assume it isn't safe to // perform a tail call. if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue) return false; TCChain = Copy->getOperand(0); } else if (Copy->getOpcode() != ISD::FP_EXTEND) return false; bool HasRet = false; for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end(); UI != UE; ++UI) { if (UI->getOpcode() != X86ISD::RET_FLAG) return false; HasRet = true; } if (!HasRet) return false; Chain = TCChain; return true; } MVT X86TargetLowering::getTypeForExtArgOrReturn(MVT VT, ISD::NodeType ExtendKind) const { MVT ReturnMVT; // TODO: Is this also valid on 32-bit? if (Subtarget->is64Bit() && VT == MVT::i1 && ExtendKind == ISD::ZERO_EXTEND) ReturnMVT = MVT::i8; else ReturnMVT = MVT::i32; MVT MinVT = getRegisterType(ReturnMVT); return VT.bitsLT(MinVT) ? MinVT : VT; } /// LowerCallResult - Lower the result values of a call into the /// appropriate copies out of appropriate physical registers. 
/// SDValue X86TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl
&Ins, SDLoc dl, SelectionDAG &DAG, SmallVectorImpl
&InVals) const { // Assign locations to each value returned by this call. SmallVector
RVLocs; bool Is64Bit = Subtarget->is64Bit(); CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); // Copy all of the result registers out of their specified physreg. for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; EVT CopyVT = VA.getValVT(); // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) && ((Is64Bit || Ins[i].Flags.isInReg()) && !Subtarget->hasSSE1())) { report_fatal_error("SSE register return with SSE disabled"); } SDValue Val; // If this is a call to a function that returns an fp value on the floating // point stack, we must guarantee the value is popped from the stack, so // a CopyFromReg is not good enough - the copy instruction may be eliminated // if the return value is not used. We use the FpPOP_RETVAL instruction // instead. if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) { // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. if (isScalarFPTypeInSSEReg(VA.getValVT())) CopyVT = MVT::f80; SDValue Ops[] = { Chain, InFlag }; Chain = SDValue(DAG.getMachineNode(X86::FpPOP_RETVAL, dl, CopyVT, MVT::Other, MVT::Glue, Ops), 1); Val = Chain.getValue(0); // Round the f80 to the right size, which also moves it to the appropriate // xmm register. if (CopyVT != VA.getValVT()) Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val, // This truncation won't change the value. 
DAG.getIntPtrConstant(1)); } else { Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag).getValue(1); Val = Chain.getValue(0); } InFlag = Chain.getValue(2); InVals.push_back(Val); } return Chain; } //===----------------------------------------------------------------------===// // C & StdCall & Fast Calling Convention implementation //===----------------------------------------------------------------------===// // StdCall calling convention seems to be standard for many Windows' API // routines and around. It differs from C calling convention just a little: // callee should clean up the stack, not caller. Symbols should be also // decorated in some fancy way :) It doesn't support any vector arguments. // For info on fast calling convention see Fast Calling Convention (tail call) // implementation LowerX86_32FastCCCallTo. /// CallIsStructReturn - Determines whether a call uses struct return /// semantics. enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn }; static StructReturnType callIsStructReturn(const SmallVectorImpl
&Outs) { if (Outs.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Outs[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg()) return RegStructReturn; return StackStructReturn; } /// ArgsAreStructReturn - Determines whether a function uses struct /// return semantics. static StructReturnType argsAreStructReturn(const SmallVectorImpl
&Ins) { if (Ins.empty()) return NotStructReturn; const ISD::ArgFlagsTy &Flags = Ins[0].Flags; if (!Flags.isSRet()) return NotStructReturn; if (Flags.isInReg()) return RegStructReturn; return StackStructReturn; } /// CreateCopyOfByValArgument - Make a copy of an aggregate at address specified /// by "Src" to address "Dst" with size and alignment information specified by /// the specific parameter attribute. The copy will be passed as a byval /// function parameter. static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl) { SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32); return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(), /*isVolatile*/false, /*AlwaysInline=*/true, MachinePointerInfo(), MachinePointerInfo()); } /// IsTailCallConvention - Return true if the calling convention is one that /// supports tail call optimization. static bool IsTailCallConvention(CallingConv::ID CC) { return (CC == CallingConv::Fast || CC == CallingConv::GHC || CC == CallingConv::HiPE); } /// \brief Return true if the calling convention is a C calling convention. static bool IsCCallConvention(CallingConv::ID CC) { return (CC == CallingConv::C || CC == CallingConv::X86_64_Win64 || CC == CallingConv::X86_64_SysV); } bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const { if (!CI->isTailCall() || getTargetMachine().Options.DisableTailCalls) return false; CallSite CS(CI); CallingConv::ID CalleeCC = CS.getCallingConv(); if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; return true; } /// FuncIsMadeTailCallSafe - Return true if the function is being made into /// a tailcall target by changing its ABI. 
static bool FuncIsMadeTailCallSafe(CallingConv::ID CC,
                                   bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && IsTailCallConvention(CC);
}

/// LowerMemArgument - Produce the value of incoming argument number \p i,
/// which the calling-convention analysis assigned to the stack location
/// described by \p VA.
///
/// For a byval argument the result is the frame index of a fixed stack object
/// covering the byval bytes; otherwise a fixed object of the value's size is
/// created and the result is a load from it. The object is marked immutable
/// only when the argument is not byval and the function is not being made
/// tail-call safe.
SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain,
                                    CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    SDLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo *MFI,
                                    unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = FuncIsMadeTailCallSafe(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;

  // If value is passed by pointer we have address passed instead of the value
  // itself.
  if (VA.getLocInfo() == CCValAssign::Indirect)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // FIXME: For now, all byval parameter objects are marked mutable. This can be
  // changed with more analysis.
  // In case of tail call optimization mark all arguments mutable. Since they
  // could be overwritten by lowering of arguments in case of a tail call.
  if (Flags.isByVal()) {
    unsigned Bytes = Flags.getByValSize();
    if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
    int FI = MFI->CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
    return DAG.getFrameIndex(FI, getPointerTy());
  } else {
    int FI = MFI->CreateFixedObject(ValVT.getSizeInBits()/8,
                                    VA.getLocMemOffset(), isImmutable);
    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
    return DAG.getLoad(ValVT, dl, Chain, FIN,
                       MachinePointerInfo::getFixedStack(FI),
                       false, false, false, 0);
  }
}

/// LowerFormalArguments - Lower the incoming arguments described by \p Ins
/// into DAG values appended to \p InVals, and return the updated chain.
///
/// Besides materializing each argument from its register or stack location,
/// this records per-function state in X86MachineFunctionInfo: the sret return
/// register, the varargs frame index and register save area, the number of
/// bytes the callee pops on return, and the argument stack size.
SDValue
X86TargetLowering::LowerFormalArguments(SDValue Chain,
                                        CallingConv::ID CallConv,
                                        bool isVarArg,
                                      const SmallVectorImpl<ISD::InputArg> &Ins,
                                        SDLoc dl,
                                        SelectionDAG &DAG,
                                        SmallVectorImpl<SDValue> &InVals)
                                          const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Function* Fn = MF.getFunction();
  if (Fn->hasExternalLinkage() &&
      Subtarget->isTargetCygMing() &&
      Fn->getName() == "main")
    FuncInfo->setForceFramePointer(true);

  MachineFrameInfo *MFI = MF.getFrameInfo();
  bool Is64Bit = Subtarget->is64Bit();
  bool IsWin64 = Subtarget->isCallingConvWin64(CallConv);

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Assign locations to all of the incoming arguments.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, DAG.getTarget(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeFormalArguments(Ins, CC_X86);

  unsigned LastVal = ~0U;
  SDValue ArgValue;
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    CCValAssign &VA = ArgLocs[i];
    // TODO: If an arg is passed in two places (e.g. reg and stack), skip later
    // places.
    assert(VA.getValNo() != LastVal &&
           "Don't support value assigned to multiple locs yet");
    (void)LastVal;
    LastVal = VA.getValNo();

    if (VA.isRegLoc()) {
      EVT RegVT = VA.getLocVT();
      const TargetRegisterClass *RC;
      if (RegVT == MVT::i32)
        RC = &X86::GR32RegClass;
      else if (Is64Bit && RegVT == MVT::i64)
        RC = &X86::GR64RegClass;
      else if (RegVT == MVT::f32)
        RC = &X86::FR32RegClass;
      else if (RegVT == MVT::f64)
        RC = &X86::FR64RegClass;
      else if (RegVT.is512BitVector())
        RC = &X86::VR512RegClass;
      else if (RegVT.is256BitVector())
        RC = &X86::VR256RegClass;
      else if (RegVT.is128BitVector())
        RC = &X86::VR128RegClass;
      else if (RegVT == MVT::x86mmx)
        RC = &X86::VR64RegClass;
      else if (RegVT == MVT::i1)
        RC = &X86::VK1RegClass;
      else if (RegVT == MVT::v8i1)
        RC = &X86::VK8RegClass;
      else if (RegVT == MVT::v16i1)
        RC = &X86::VK16RegClass;
      else
        llvm_unreachable("Unknown argument type!");

      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);

      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits.  Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
      if (VA.getLocInfo() == CCValAssign::SExt)
        ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::ZExt)
        ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
                               DAG.getValueType(VA.getValVT()));
      else if (VA.getLocInfo() == CCValAssign::BCvt)
        ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);

      if (VA.isExtInLoc()) {
        // Handle MMX values passed in XMM regs.
        if (RegVT.isVector())
          ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
        else
          ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue = LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, i);
    }

    // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue = DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue,
                             MachinePointerInfo(), false, false, false, 0);

    InVals.push_back(ArgValue);
  }

  if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      // The x86-64 ABIs require that for returning structs by value we copy
      // the sret argument into %rax/%eax (depending on ABI) for the return.
      // Win32 requires us to put the sret argument to %eax as well.
      // Save the argument into a virtual register so that we can access it
      // from the return points.
      if (Ins[i].Flags.isSRet()) {
        unsigned Reg = FuncInfo->getSRetReturnReg();
        if (!Reg) {
          MVT PtrTy = getPointerTy();
          Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
          FuncInfo->setSRetReturnReg(Reg);
        }
        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
        break;
      }
    }
  }

  unsigned StackSize = CCInfo.getNextStackOffset();
  // Align stack specially for tail calls.
  if (FuncIsMadeTailCallSafe(CallConv,
                             MF.getTarget().Options.GuaranteedTailCallOpt))
    StackSize = GetAlignedArgumentStackSize(StackSize, DAG);

  // If the function takes variable number of arguments, make a frame index for
  // the start of the first vararg value... for expansion of llvm.va_start.
  if (isVarArg) {
    if (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
                    CallConv != CallingConv::X86_ThisCall)) {
      FuncInfo->setVarArgsFrameIndex(MFI->CreateFixedObject(1, StackSize,true));
    }
    if (Is64Bit) {
      unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;

      // FIXME: We should really autogenerate these arrays
      static const MCPhysReg GPR64ArgRegsWin64[] = {
        X86::RCX, X86::RDX, X86::R8,  X86::R9
      };
      static const MCPhysReg GPR64ArgRegs64Bit[] = {
        X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
      };
      static const MCPhysReg XMMArgRegs64Bit[] = {
        X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
        X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
      };
      const MCPhysReg *GPR64ArgRegs;
      unsigned NumXMMRegs = 0;

      if (IsWin64) {
        // The XMM registers which might contain var arg parameters are shadowed
        // in their paired GPR.  So we only need to save the GPR to their home
        // slots.
        TotalNumIntRegs = 4;
        GPR64ArgRegs = GPR64ArgRegsWin64;
      } else {
        TotalNumIntRegs = 6; TotalNumXMMRegs = 8;
        GPR64ArgRegs = GPR64ArgRegs64Bit;

        NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs64Bit,
                                                TotalNumXMMRegs);
      }
      unsigned NumIntRegs = CCInfo.getFirstUnallocated(GPR64ArgRegs,
                                                       TotalNumIntRegs);

      bool NoImplicitFloatOps = Fn->getAttributes().
        hasAttribute(AttributeSet::FunctionIndex, Attribute::NoImplicitFloat);
      assert(!(NumXMMRegs && !Subtarget->hasSSE1()) &&
             "SSE register cannot be used when SSE is disabled!");
      assert(!(NumXMMRegs && MF.getTarget().Options.UseSoftFloat &&
               NoImplicitFloatOps) &&
             "SSE register cannot be used when SSE is disabled!");
      if (MF.getTarget().Options.UseSoftFloat || NoImplicitFloatOps ||
          !Subtarget->hasSSE1())
        // Kernel mode asks for SSE to be disabled, so don't push them
        // on the stack.
        TotalNumXMMRegs = 0;

      if (IsWin64) {
        const TargetFrameLowering &TFI = *MF.getTarget().getFrameLowering();
        // Get to the caller-allocated home save location.  Add 8 to account
        // for the return address.
        int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
        FuncInfo->setRegSaveFrameIndex(
          MFI->CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
        // Fixup to set vararg frame on shadow area (4 x i64).
        if (NumIntRegs < 4)
          FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
      } else {
        // For X86-64, if there are vararg parameters that are passed via
        // registers, then we must store them to their spots on the stack so
        // they may be loaded by dereferencing the result of va_next.
        FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
        FuncInfo->setVarArgsFPOffset(TotalNumIntRegs * 8 + NumXMMRegs * 16);
        FuncInfo->setRegSaveFrameIndex(
          MFI->CreateStackObject(TotalNumIntRegs * 8 + TotalNumXMMRegs * 16, 16,
                               false));
      }

      // Store the integer parameter registers.
      SmallVector<SDValue, 8> MemOps;
      SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
                                        getPointerTy());
      unsigned Offset = FuncInfo->getVarArgsGPOffset();
      for (; NumIntRegs != TotalNumIntRegs; ++NumIntRegs) {
        SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(), RSFIN,
                                  DAG.getIntPtrConstant(Offset));
        unsigned VReg = MF.addLiveIn(GPR64ArgRegs[NumIntRegs],
                                     &X86::GR64RegClass);
        SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i64);
        SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                         FuncInfo->getRegSaveFrameIndex(), Offset),
                       false, false, 0);
        MemOps.push_back(Store);
        Offset += 8;
      }

      if (TotalNumXMMRegs != 0 && NumXMMRegs != TotalNumXMMRegs) {
        // Now store the XMM (fp + vector) parameter registers.
        SmallVector<SDValue, 8> SaveXMMOps;
        SaveXMMOps.push_back(Chain);

        unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
        SDValue ALVal = DAG.getCopyFromReg(DAG.getEntryNode(), dl, AL, MVT::i8);
        SaveXMMOps.push_back(ALVal);

        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getRegSaveFrameIndex()));
        SaveXMMOps.push_back(DAG.getIntPtrConstant(
                               FuncInfo->getVarArgsFPOffset()));

        for (; NumXMMRegs != TotalNumXMMRegs; ++NumXMMRegs) {
          unsigned VReg = MF.addLiveIn(XMMArgRegs64Bit[NumXMMRegs],
                                       &X86::VR128RegClass);
          SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::v4f32);
          SaveXMMOps.push_back(Val);
        }
        MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                     MVT::Other, SaveXMMOps));
      }

      if (!MemOps.empty())
        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
    }
  }

  // Some CCs need callee pop.
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       MF.getTarget().Options.GuaranteedTailCallOpt)) {
    FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
    // If this is an sret function, the return should pop the hidden pointer.
    if (!Is64Bit && !IsTailCallConvention(CallConv) &&
        !Subtarget->getTargetTriple().isOSMSVCRT() &&
        argsAreStructReturn(Ins) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
    // RegSaveFrameIndex is X86-64 only.
    FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
    if (CallConv == CallingConv::X86_FastCall ||
        CallConv == CallingConv::X86_ThisCall)
      // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }

  FuncInfo->setArgumentStackSize(StackSize);

  return Chain;
}

/// LowerMemOpCallTo - Emit the node that places outgoing argument \p Arg in
/// its stack slot at \p StackPtr plus the memory offset recorded in \p VA.
/// Byval arguments are copied with CreateCopyOfByValArgument; every other
/// argument is written with a plain store.
SDValue
X86TargetLowering::LowerMemOpCallTo(SDValue Chain,
                                    SDValue StackPtr, SDValue Arg,
                                    SDLoc dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    ISD::ArgFlagsTy Flags) const {
  unsigned LocMemOffset = VA.getLocMemOffset();
  SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
  if (Flags.isByVal())
    return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);

  return DAG.getStore(Chain, dl, Arg, PtrOff,
                      MachinePointerInfo::getStack(LocMemOffset),
                      false, false, 0);
}

/// EmitTailCallLoadRetAddr - Emit a load of return address if tail call
/// optimization is performed and it is required.
///
/// On return \p OutRetAddr holds the loaded return address value; the function
/// result is the chain output of that load. IsTailCall, Is64Bit and FPDiff are
/// not consulted by this implementation.
SDValue
X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
                                           SDValue &OutRetAddr, SDValue Chain,
                                           bool IsTailCall, bool Is64Bit,
                                           int FPDiff, SDLoc dl) const {
  // Adjust the Return address stack slot.
  EVT VT = getPointerTy();
  OutRetAddr = getReturnAddressFrameIndex(DAG);

  // Load the "old" Return address.
  OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo(),
                           false, false, false, 0);
  return SDValue(OutRetAddr.getNode(), 1);
}

/// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
/// optimization is performed and it is required (FPDiff!=0).
static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
                                        SDValue Chain, SDValue RetAddrFrIdx,
                                        EVT PtrVT, unsigned SlotSize,
                                        int FPDiff, SDLoc dl) {
  // Store the return address to the appropriate stack slot.
  if (!FPDiff) return Chain;
  // Calculate the new stack slot for the return address.
  int NewReturnAddrFI =
    MF.getFrameInfo()->CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
                                         false);
  SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
  Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
                       MachinePointerInfo::getFixedStack(NewReturnAddrFI),
                       false, false, 0);
  return Chain;
}

/// LowerCall - Lower the outgoing call described by \p CLI into DAG nodes.
///
/// Argument locations are assigned with CC_X86, register arguments are
/// collected for glued copy-to-reg nodes and memory arguments are stored to
/// the outgoing area (or, for non-sibcall tail calls, to fixed objects offset
/// by FPDiff). A tail call returns an X86ISD::TC_RETURN node directly.
/// Otherwise an X86ISD::CALL node is emitted, followed by CALLSEQ_END (omitted
/// for sibcalls), and the results are copied into \p InVals by
/// LowerCallResult. CLI.IsTailCall is updated in place when the call turns out
/// not to be eligible, or is forced by musttail.
SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                             SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG                     = CLI.DAG;
  SDLoc &dl                             = CLI.DL;
  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
  SDValue Chain                         = CLI.Chain;
  SDValue Callee                        = CLI.Callee;
  CallingConv::ID CallConv              = CLI.CallConv;
  bool &isTailCall                      = CLI.IsTailCall;
  bool isVarArg                         = CLI.IsVarArg;

  MachineFunction &MF = DAG.getMachineFunction();
  bool Is64Bit        = Subtarget->is64Bit();
  bool IsWin64        = Subtarget->isCallingConvWin64(CallConv);
  StructReturnType SR = callIsStructReturn(Outs);
  bool IsSibcall      = false;

  if (MF.getTarget().Options.DisableTailCalls)
    isTailCall = false;

  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call.  The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
  } else if (isTailCall) {
    // Check if it's really possible to do a tail call.
    isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                    isVarArg, SR != NotStructReturn,
                    MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
                    Outs, OutVals, Ins, DAG);

    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }

  assert(!(isVarArg && IsTailCallConvention(CallConv)) &&
         "Var args not supported with calling convention fastcc, ghc or hipe");

  // Analyze operands of the call, assigning locations to each operand.
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CallConv, isVarArg, MF, MF.getTarget(),
                 ArgLocs, *DAG.getContext());

  // Allocate shadow area for Win64
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);

  CCInfo.AnalyzeCallOperands(Outs, CC_X86);

  // Get a count of how many bytes are to be pushed on the stack.
  unsigned NumBytes = CCInfo.getNextStackOffset();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in caller's
    // own caller's stack.
    NumBytes = 0;
  else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
           IsTailCallConvention(CallConv))
    NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);

  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
    // Lower arguments at fp - stackoffset + fpdiff.
    X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
    unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();

    FPDiff = NumBytesCallerPushed - NumBytes;

    // Set the delta of movement of the returnaddr stackslot.
    // But only set if delta is greater than previous delta.
    if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }

  unsigned NumBytesToPush = NumBytes;
  unsigned NumBytesToPop = NumBytes;

  // If we have an inalloca argument, all stack space has already been allocated
  // for us and be right at the top of the stack.  We don't support multiple
  // arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    assert(ArgLocs.back().getLocMemOffset() == 0 &&
           "an inalloca argument must be the only memory argument");
  }

  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(
        Chain, DAG.getIntPtrConstant(NumBytesToPush, true), dl);

  SDValue RetAddrFrIdx;
  // Load return address for tail calls.
  if (isTailCall && FPDiff)
    Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
                                    Is64Bit, FPDiff, dl);

  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;

  // Walk the register/memloc assignments, inserting copies/loads.  In the case
  // of tail call optimization arguments are handled later.
  const X86RegisterInfo *RegInfo =
    static_cast<const X86RegisterInfo*>(DAG.getTarget().getRegisterInfo());
  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
    // Skip inalloca arguments, they have already been written.
    ISD::ArgFlagsTy Flags = Outs[i].Flags;
    if (Flags.isInAlloca())
      continue;

    CCValAssign &VA = ArgLocs[i];
    EVT RegVT = VA.getLocVT();
    SDValue Arg = OutVals[i];
    bool isByVal = Flags.isByVal();

    // Promote the value if needed.
    switch (VA.getLocInfo()) {
    default: llvm_unreachable("Unknown loc info!");
    case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
      if (RegVT.is128BitVector()) {
        // Special case: passing MMX values in XMM registers.
        Arg = DAG.getNode(ISD::BITCAST, dl, MVT::i64, Arg);
        Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getNode(ISD::BITCAST, dl, RegVT, Arg);
      break;
    case CCValAssign::Indirect: {
      // Store the argument.
      SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
      int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
      Chain = DAG.getStore(Chain, dl, Arg, SpillSlot,
                           MachinePointerInfo::getFixedStack(FI),
                           false, false, 0);
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.isRegLoc()) {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
      if (isVarArg && IsWin64) {
        // Win64 ABI requires argument XMM reg to be copied to the corresponding
        // shadow reg if callee is a varargs function.
        unsigned ShadowReg = 0;
        switch (VA.getLocReg()) {
        case X86::XMM0: ShadowReg = X86::RCX; break;
        case X86::XMM1: ShadowReg = X86::RDX; break;
        case X86::XMM2: ShadowReg = X86::R8; break;
        case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
      assert(VA.isMemLoc());
      if (!StackPtr.getNode())
        StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                      getPointerTy());
      MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }

  if (!MemOpChains.empty())
    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);

  if (Subtarget->isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via PLT
    // GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(unsigned(X86::EBX),
               DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), getPointerTy())));
    } else {
      // If we are tail calling and generating PIC/GOT style code load the
      // address of the callee into ECX. The value in ecx is used as target of
      // the tail jump. This is done to circumvent the ebx/callee-saved problem
      // for tail calls on PIC/GOT architectures. Normally we would just put the
      // address of GOT into ebx and then call target@PLT. But for tail calls
      // ebx would be restored (since ebx is callee saved) before jumping to the
      // target@PLT.

      // Note: The actual moving to ECX is done further down.
      GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
      if (G && !G->getGlobal()->hasHiddenVisibility() &&
          !G->getGlobal()->hasProtectedVisibility())
        Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }

  if (Is64Bit && isVarArg && !IsWin64) {
    // From AMD64 ABI document:
    // For calls that may call functions that use varargs or stdargs
    // (prototype-less calls or calls to functions containing ellipsis (...) in
    // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an ubound on the number of SSE
    // registers used and is in the range 0 - 8 inclusive.

    // Count the number of XMM registers allocated.
    static const MCPhysReg XMMArgRegs[] = {
      X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
      X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
    };
    unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs, 8);
    assert((Subtarget->hasSSE1() || !NumXMMRegs)
           && "SSE registers cannot be used when SSE is disabled");

    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, MVT::i8)));
  }

  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
  // don't need this because the eligibility check rejects calls that require
  // shuffling arguments passed in memory.
  if (!IsSibcall && isTailCall) {
    // Force all the incoming stack arguments to be loaded from the stack
    // before any new outgoing arguments are stored to the stack, because the
    // outgoing stack slots may alias the incoming argument stack slots, and
    // the alias isn't otherwise explicit. This is slightly more conservative
    // than necessary, because it means that each store effectively depends
    // on every argument instead of just those arguments it would clobber.
    SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);

    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI = 0;
    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
      CCValAssign &VA = ArgLocs[i];
      if (VA.isRegLoc())
        continue;
      assert(VA.isMemLoc());
      SDValue Arg = OutVals[i];
      ISD::ArgFlagsTy Flags = Outs[i].Flags;
      // Skip inalloca arguments.  They don't require any work.
      if (Flags.isInAlloca())
        continue;
      // Create frame index.
      int32_t Offset = VA.getLocMemOffset()+FPDiff;
      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
      FIN = DAG.getFrameIndex(FI, getPointerTy());

      if (Flags.isByVal()) {
        // Copy relative to framepointer.
        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
        if (!StackPtr.getNode())
          StackPtr = DAG.getCopyFromReg(Chain, dl,
                                        RegInfo->getStackRegister(),
                                        getPointerTy());
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);

        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                         ArgChain,
                                                         Flags, DAG, dl));
      } else {
        // Store relative to framepointer.
        MemOpChains2.push_back(
          DAG.getStore(ArgChain, dl, Arg, FIN,
                       MachinePointerInfo::getFixedStack(FI),
                       false, false, 0));
      }
    }

    if (!MemOpChains2.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);

    // Store the return address to the appropriate stack slot.
    Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
                                     getPointerTy(), RegInfo->getSlotSize(),
                                     FPDiff, dl);
  }

  // Build a sequence of copy-to-reg nodes chained together with token chain
  // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
                             RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }

  if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
    assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
    // In the 64-bit large code model, we have to make all calls
    // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
    // it.

    // We should use extra load for direct calls to dllimported functions in
    // non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
    if (!GV->hasDLLImportStorageClass()) {
      unsigned char OpFlags = 0;
      bool ExtraLoad = false;
      unsigned WrapperKind = ISD::DELETED_NODE;

      // On ELF targets, in both X86-64 and X86-32 mode, direct calls to
      // external symbols must go through the PLT in PIC mode.  If the symbol
      // has hidden or protected visibility, or if it is static or local, then
      // we don't need to use the PLT - we can directly call it.
      if (Subtarget->isTargetELF() &&
          DAG.getTarget().getRelocationModel() == Reloc::PIC_ &&
          GV->hasDefaultVisibility() && !GV->hasLocalLinkage()) {
        OpFlags = X86II::MO_PLT;
      } else if (Subtarget->isPICStyleStubAny() &&
                 (GV->isDeclaration() || GV->isWeakForLinker()) &&
                 (!Subtarget->getTargetTriple().isMacOSX() ||
                  Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
        // PC-relative references to external symbols should go through $stub,
        // unless we're building with the leopard linker or later, which
        // automatically synthesizes these stubs.
        OpFlags = X86II::MO_DARWIN_STUB;
      } else if (Subtarget->isPICStyleRIPRel() &&
                 isa<Function>(GV) &&
                 cast<Function>(GV)->getAttributes().
                   hasAttribute(AttributeSet::FunctionIndex,
                                Attribute::NonLazyBind)) {
        // If the function is marked as non-lazy, generate an indirect call
        // which loads from the GOT directly. This avoids runtime overhead
        // at the cost of eager binding (and one extra byte of encoding).
        OpFlags = X86II::MO_GOTPCREL;
        WrapperKind = X86ISD::WrapperRIP;
        ExtraLoad = true;
      }

      Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy(),
                                          G->getOffset(), OpFlags);

      // Add a wrapper if needed.
      if (WrapperKind != ISD::DELETED_NODE)
        Callee = DAG.getNode(X86ISD::WrapperRIP, dl, getPointerTy(), Callee);
      // Add extra indirection if needed.
      if (ExtraLoad)
        Callee = DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), Callee,
                             MachinePointerInfo::getGOT(),
                             false, false, false, 0);
    }
  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
    unsigned char OpFlags = 0;

    // On ELF targets, in either X86-64 or X86-32 mode, direct calls to
    // external symbols should go through the PLT.
    if (Subtarget->isTargetELF() &&
        DAG.getTarget().getRelocationModel() == Reloc::PIC_) {
      OpFlags = X86II::MO_PLT;
    } else if (Subtarget->isPICStyleStubAny() &&
               (!Subtarget->getTargetTriple().isMacOSX() ||
                Subtarget->getTargetTriple().isMacOSXVersionLT(10, 5))) {
      // PC-relative references to external symbols should go through $stub,
      // unless we're building with the leopard linker or later, which
      // automatically synthesizes these stubs.
      OpFlags = X86II::MO_DARWIN_STUB;
    }

    Callee = DAG.getTargetExternalSymbol(S->getSymbol(), getPointerTy(),
                                         OpFlags);
  }

  // Returns a chain & a flag for retval copy to use.
  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
  SmallVector<SDValue, 8> Ops;

  if (!IsSibcall && isTailCall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, true),
                               DAG.getIntPtrConstant(0, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  Ops.push_back(Chain);
  Ops.push_back(Callee);

  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, MVT::i32));

  // Add argument registers to the end of the list so that they are known live
  // into the call.
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
                                  RegsToPass[i].second.getValueType()));

  // Add a register mask operand representing the call-preserved registers.
  const TargetRegisterInfo *TRI = DAG.getTarget().getRegisterInfo();
  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");
  Ops.push_back(DAG.getRegisterMask(Mask));

  if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
    // We used to do:
    //// If this is the first return lowered for this function, add the regs
    //// to the liveout set for the function.
    // This isn't right, although it's probably harmless on x86; liveouts
    // should be computed from returns not tail calls.  Consider a void
    // function making a tail call to a function returning int.
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }

  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
  InFlag = Chain.getValue(1);

  // Create the CALLSEQ_END node.
  unsigned NumBytesForCalleeToPop;
  if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
                       DAG.getTarget().Options.GuaranteedTailCallOpt))
    NumBytesForCalleeToPop = NumBytes;    // Callee pops everything
  else if (!Is64Bit && !IsTailCallConvention(CallConv) &&
           !Subtarget->getTargetTriple().isOSMSVCRT() &&
           SR == StackStructReturn)
    // If this is a call to a struct-return function, the callee
    // pops the hidden struct pointer, so we have to push it back.
    // This is common for Darwin/X86, Linux & Mingw32 targets.
    // For MSVC Win32 targets, the caller pops the hidden struct pointer.
    NumBytesForCalleeToPop = 4;
  else
    NumBytesForCalleeToPop = 0;  // Callee pops nothing.

  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }

  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
                         Ins, dl, DAG, InVals);
}

//===----------------------------------------------------------------------===//
//                Fast Calling Convention (tail call) implementation
//===----------------------------------------------------------------------===//

//  Like std call, callee cleans arguments, convention except that ECX is
//  reserved for storing the tail called function address. Only 2 registers are
//  free for argument passing (inreg). Tail call optimization is performed
//  provided:
//                * tailcallopt is enabled
//                * caller/callee are fastcc
//  On X86_64 architecture with GOT-style position independent code only local
//  (within module) calls are supported at the moment.
//  To keep the stack aligned according to platform abi the function
//  GetAlignedArgumentStackSize ensures that argument delta is always multiples
//  of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
//  If a tail called function callee has more arguments than the caller the
//  caller needs to make sure that there is room to move the RETADDR to. This is
//  achieved by reserving an area the size of the argument delta right after the
//  original RETADDR, but before the saved framepointer or the spilled registers
//  e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
//  stack layout:
//    arg1
//    arg2
//    RETADDR
//    [ new RETADDR
//      move area ]
//    (possible EBP)
//    ESI
//    EDI
//    local1 ..

/// GetAlignedArgumentStackSize - Make the stack size align e.g 16n + 12 aligned
/// for a 16 byte align requirement.
unsigned X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, SelectionDAG& DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const TargetMachine &TM = MF.getTarget(); const X86RegisterInfo *RegInfo = static_cast
(TM.getRegisterInfo()); const TargetFrameLowering &TFI = *TM.getFrameLowering(); unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); } else { // Mask out lower bits, add stackalignment once plus the 12 bytes. Offset = ((~AlignMask) & Offset) + StackAlignment + (StackAlignment-SlotSize); } return Offset; } /// MatchingStackOffset - Return true if the given stack call argument is /// already available in the same position (relatively) of the caller's /// incoming argument stack. static bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags, MachineFrameInfo *MFI, const MachineRegisterInfo *MRI, const X86InstrInfo *TII) { unsigned Bytes = Arg.getValueType().getSizeInBits() / 8; int FI = INT_MAX; if (Arg.getOpcode() == ISD::CopyFromReg) { unsigned VR = cast
(Arg.getOperand(1))->getReg(); if (!TargetRegisterInfo::isVirtualRegister(VR)) return false; MachineInstr *Def = MRI->getVRegDef(VR); if (!Def) return false; if (!Flags.isByVal()) { if (!TII->isLoadFromStackSlot(Def, FI)) return false; } else { unsigned Opcode = Def->getOpcode(); if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r) && Def->getOperand(1).isFI()) { FI = Def->getOperand(1).getIndex(); Bytes = Flags.getByValSize(); } else return false; } } else if (LoadSDNode *Ld = dyn_cast
(Arg)) { if (Flags.isByVal()) // ByVal argument is passed in as a pointer but it's now being // dereferenced. e.g. // define @foo(%struct.X* %A) { // tail call @bar(%struct.X* byval %A) // } return false; SDValue Ptr = Ld->getBasePtr(); FrameIndexSDNode *FINode = dyn_cast
(Ptr); if (!FINode) return false; FI = FINode->getIndex(); } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) { FrameIndexSDNode *FINode = cast
(Arg); FI = FINode->getIndex(); Bytes = Flags.getByValSize(); } else return false; assert(FI != INT_MAX); if (!MFI->isFixedObjectIndex(FI)) return false; return Offset == MFI->getObjectOffset(FI) && Bytes == MFI->getObjectSize(FI); } /// IsEligibleForTailCallOptimization - Check whether the call is eligible /// for tail call optimization. Targets which want to do tail call /// optimization should implement this function. bool X86TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy, const SmallVectorImpl
&Outs, const SmallVectorImpl
&OutVals, const SmallVectorImpl
&Ins, SelectionDAG &DAG) const { if (!IsTailCallConvention(CalleeCC) && !IsCCallConvention(CalleeCC)) return false; // If -tailcallopt is specified, make fastcc functions tail-callable. const MachineFunction &MF = DAG.getMachineFunction(); const Function *CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; CallingConv::ID CallerCC = CallerF->getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget->isCallingConvWin64(CallerCC); if (DAG.getTarget().Options.GuaranteedTailCallOpt) { if (IsTailCallConvention(CalleeCC) && CCMatch) return true; return false; } // Look for obvious safe cases to perform tail call optimization that do not // require ABI changes. This is what gcc calls sibcall. // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to // emit a special epilogue. const X86RegisterInfo *RegInfo = static_cast
(DAG.getTarget().getRegisterInfo()); if (RegInfo->needsStackRealignment(MF)) return false; // Also avoid sibcall optimization if either caller or callee uses struct // return semantics. if (isCalleeStructRet || isCallerStructRet) return false; // An stdcall/thiscall caller is expected to clean up its arguments; the // callee isn't going to do that. // FIXME: this is more restrictive than needed. We could produce a tailcall // when the stack adjustment matches. For example, with a thiscall that takes // only one argument. if (!CCMatch && (CallerCC == CallingConv::X86_StdCall || CallerCC == CallingConv::X86_ThisCall)) return false; // Do not sibcall optimize vararg calls unless all arguments are passed via // registers. if (isVarArg && !Outs.empty()) { // Optimizing for varargs on Win64 is unlikely to be safe without // additional testing. if (IsCalleeWin64 || IsCallerWin64) return false; SmallVector
ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), DAG.getTarget(), ArgLocs, *DAG.getContext()); CCInfo.AnalyzeCallOperands(Outs, CC_X86); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) if (!ArgLocs[i].isRegLoc()) return false; } // If the call result is in ST0 / ST1, it needs to be popped off the x87 // stack. Therefore, if it's not used by the call it is not safe to optimize // this into a sibcall. bool Unused = false; for (unsigned i = 0, e = Ins.size(); i != e; ++i) { if (!Ins[i].Used) { Unused = true; break; } } if (Unused) { SmallVector
RVLocs; CCState CCInfo(CalleeCC, false, DAG.getMachineFunction(), DAG.getTarget(), RVLocs, *DAG.getContext()); CCInfo.AnalyzeCallResult(Ins, RetCC_X86); for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) { CCValAssign &VA = RVLocs[i]; if (VA.getLocReg() == X86::ST0 || VA.getLocReg() == X86::ST1) return false; } } // If the calling conventions do not match, then we'd better make sure the // results are returned in the same way as what the caller expects. if (!CCMatch) { SmallVector
RVLocs1; CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(), DAG.getTarget(), RVLocs1, *DAG.getContext()); CCInfo1.AnalyzeCallResult(Ins, RetCC_X86); SmallVector
RVLocs2; CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(), DAG.getTarget(), RVLocs2, *DAG.getContext()); CCInfo2.AnalyzeCallResult(Ins, RetCC_X86); if (RVLocs1.size() != RVLocs2.size()) return false; for (unsigned i = 0, e = RVLocs1.size(); i != e; ++i) { if (RVLocs1[i].isRegLoc() != RVLocs2[i].isRegLoc()) return false; if (RVLocs1[i].getLocInfo() != RVLocs2[i].getLocInfo()) return false; if (RVLocs1[i].isRegLoc()) { if (RVLocs1[i].getLocReg() != RVLocs2[i].getLocReg()) return false; } else { if (RVLocs1[i].getLocMemOffset() != RVLocs2[i].getLocMemOffset()) return false; } } } // If the callee takes no arguments then go on to check the results of the // call. if (!Outs.empty()) { // Check if stack adjustment is needed. For now, do not do this if any // argument is passed on the stack. SmallVector
ArgLocs; CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(), DAG.getTarget(), ArgLocs, *DAG.getContext()); // Allocate shadow area for Win64 if (IsCalleeWin64) CCInfo.AllocateStack(32, 8); CCInfo.AnalyzeCallOperands(Outs, CC_X86); if (CCInfo.getNextStackOffset()) { MachineFunction &MF = DAG.getMachineFunction(); if (MF.getInfo
()->getBytesToPopOnReturn()) return false; // Check if the arguments are already laid out in the right way as // the caller's fixed stack objects. MachineFrameInfo *MFI = MF.getFrameInfo(); const MachineRegisterInfo *MRI = &MF.getRegInfo(); const X86InstrInfo *TII = static_cast
(DAG.getTarget().getInstrInfo()); for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; SDValue Arg = OutVals[i]; ISD::ArgFlagsTy Flags = Outs[i].Flags; if (VA.getLocInfo() == CCValAssign::Indirect) return false; if (!VA.isRegLoc()) { if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags, MFI, MRI, TII)) return false; } } } // If the tailcall address may be in a register, then make sure it's // possible to register allocate for it. In 32-bit, the call address can // only target EAX, EDX, or ECX since the tail call must be scheduled after // callee-saved registers are restored. These happen to be the same // registers used to pass 'inreg' arguments so watch out for those. if (!Subtarget->is64Bit() && ((!isa
(Callee) && !isa
(Callee)) || DAG.getTarget().getRelocationModel() == Reloc::PIC_)) { unsigned NumInRegs = 0; // In PIC we need an extra register to formulate the address computation // for the callee. unsigned MaxInRegs = (DAG.getTarget().getRelocationModel() == Reloc::PIC_) ? 2 : 3; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; if (!VA.isRegLoc()) continue; unsigned Reg = VA.getLocReg(); switch (Reg) { default: break; case X86::EAX: case X86::EDX: case X86::ECX: if (++NumInRegs == MaxInRegs) return false; break; } } } } return true; } FastISel * X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { return X86::createFastISel(funcInfo, libInfo); } //===----------------------------------------------------------------------===// // Other Lowering Hooks //===----------------------------------------------------------------------===// static bool MayFoldLoad(SDValue Op) { return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode()); } static bool MayFoldIntoStore(SDValue Op) { return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin()); } static bool isTargetShuffle(unsigned Opcode) { switch(Opcode) { default: return false; case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::PALIGNR: case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: case X86ISD::VPERMILP: case X86ISD::VPERM2X128: case X86ISD::VPERMI: return true; } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: case X86ISD::MOVDDUP: return DAG.getNode(Opc, dl, VT, V1); } } static SDValue getTargetShuffleNode(unsigned 
Opc, SDLoc dl, EVT VT, SDValue V1, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: case X86ISD::VPERMILP: case X86ISD::VPERMI: return DAG.getNode(Opc, dl, VT, V1, DAG.getConstant(TargetMask, MVT::i8)); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, unsigned TargetMask, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::PALIGNR: case X86ISD::SHUFP: case X86ISD::VPERM2X128: return DAG.getNode(Opc, dl, VT, V1, V2, DAG.getConstant(TargetMask, MVT::i8)); } } static SDValue getTargetShuffleNode(unsigned Opc, SDLoc dl, EVT VT, SDValue V1, SDValue V2, SelectionDAG &DAG) { switch(Opc) { default: llvm_unreachable("Unknown x86 shuffle node"); case X86ISD::MOVLHPS: case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: case X86ISD::MOVSS: case X86ISD::MOVSD: case X86ISD::UNPCKL: case X86ISD::UNPCKH: return DAG.getNode(Opc, dl, VT, V1, V2); } } SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); const X86RegisterInfo *RegInfo = static_cast
(DAG.getTarget().getRegisterInfo()); X86MachineFunctionInfo *FuncInfo = MF.getInfo
(); int ReturnAddrIndex = FuncInfo->getRAIndex(); if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -(int64_t)SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); } return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); } bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M, bool hasSymbolicDisplacement) { // Offset should fit into 32 bit immediate field. if (!isInt<32>(Offset)) return false; // If we don't have a symbolic displacement - we don't have any extra // restrictions. if (!hasSymbolicDisplacement) return true; // FIXME: Some tweaks might be needed for medium code model. if (M != CodeModel::Small && M != CodeModel::Kernel) return false; // For small code model we assume that latest object is 16MB before end of 31 // bits boundary. We may also accept pretty large negative constants knowing // that all objects are in the positive half of address space. if (M == CodeModel::Small && Offset < 16*1024*1024) return true; // For kernel code model we know that all object resist in the negative half // of 32bits address space. We may not accept negative offsets, since they may // be just off and we may accept pretty large positive ones. if (M == CodeModel::Kernel && Offset > 0) return true; return false; } /// isCalleePop - Determines whether the callee is required to pop its /// own arguments. Callee pop is necessary to support tail calls. 
bool X86::isCalleePop(CallingConv::ID CallingConv, bool is64Bit, bool IsVarArg, bool TailCallOpt) { if (IsVarArg) return false; switch (CallingConv) { default: return false; case CallingConv::X86_StdCall: return !is64Bit; case CallingConv::X86_FastCall: return !is64Bit; case CallingConv::X86_ThisCall: return !is64Bit; case CallingConv::Fast: return TailCallOpt; case CallingConv::GHC: return TailCallOpt; case CallingConv::HiPE: return TailCallOpt; } } /// \brief Return true if the condition is an unsigned comparison operation. static bool isX86CCUnsigned(unsigned X86CC) { switch (X86CC) { default: llvm_unreachable("Invalid integer condition!"); case X86::COND_E: return true; case X86::COND_G: return false; case X86::COND_GE: return false; case X86::COND_L: return false; case X86::COND_LE: return false; case X86::COND_NE: return true; case X86::COND_B: return true; case X86::COND_A: return true; case X86::COND_BE: return true; case X86::COND_AE: return true; } llvm_unreachable("covered switch fell through?!"); } /// TranslateX86CC - do a one to one translation of a ISD::CondCode to the X86 /// specific condition code, returning the condition code and the LHS/RHS of the /// comparison to make. static unsigned TranslateX86CC(ISD::CondCode SetCCOpcode, bool isFP, SDValue &LHS, SDValue &RHS, SelectionDAG &DAG) { if (!isFP) { if (ConstantSDNode *RHSC = dyn_cast
(RHS)) { if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) { // X > -1 -> X == 0, jump !sign. RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_NS; } if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) { // X < 0 -> X == 0, jump on sign. return X86::COND_S; } if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) { // X < 1 -> X <= 0 RHS = DAG.getConstant(0, RHS.getValueType()); return X86::COND_LE; } } switch (SetCCOpcode) { default: llvm_unreachable("Invalid integer condition!"); case ISD::SETEQ: return X86::COND_E; case ISD::SETGT: return X86::COND_G; case ISD::SETGE: return X86::COND_GE; case ISD::SETLT: return X86::COND_L; case ISD::SETLE: return X86::COND_LE; case ISD::SETNE: return X86::COND_NE; case ISD::SETULT: return X86::COND_B; case ISD::SETUGT: return X86::COND_A; case ISD::SETULE: return X86::COND_BE; case ISD::SETUGE: return X86::COND_AE; } } // First determine if it is required or is profitable to flip the operands. // If LHS is a foldable load, but RHS is not, flip the condition. 
if (ISD::isNON_EXTLoad(LHS.getNode()) && !ISD::isNON_EXTLoad(RHS.getNode())) { SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode); std::swap(LHS, RHS); } switch (SetCCOpcode) { default: break; case ISD::SETOLT: case ISD::SETOLE: case ISD::SETUGT: case ISD::SETUGE: std::swap(LHS, RHS); break; } // On a floating point condition, the flags are set as follows: // ZF PF CF op // 0 | 0 | 0 | X > Y // 0 | 0 | 1 | X < Y // 1 | 0 | 0 | X == Y // 1 | 1 | 1 | unordered switch (SetCCOpcode) { default: llvm_unreachable("Condcode should be pre-legalized away"); case ISD::SETUEQ: case ISD::SETEQ: return X86::COND_E; case ISD::SETOLT: // flipped case ISD::SETOGT: case ISD::SETGT: return X86::COND_A; case ISD::SETOLE: // flipped case ISD::SETOGE: case ISD::SETGE: return X86::COND_AE; case ISD::SETUGT: // flipped case ISD::SETULT: case ISD::SETLT: return X86::COND_B; case ISD::SETUGE: // flipped case ISD::SETULE: case ISD::SETLE: return X86::COND_BE; case ISD::SETONE: case ISD::SETNE: return X86::COND_NE; case ISD::SETUO: return X86::COND_P; case ISD::SETO: return X86::COND_NP; case ISD::SETOEQ: case ISD::SETUNE: return X86::COND_INVALID; } } /// hasFPCMov - is there a floating point cmov for the specific X86 condition /// code. Current x86 isa includes the following FP cmov instructions: /// fcmovb, fcomvbe, fcomve, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu. static bool hasFPCMov(unsigned X86CC) { switch (X86CC) { default: return false; case X86::COND_B: case X86::COND_BE: case X86::COND_E: case X86::COND_P: case X86::COND_A: case X86::COND_AE: case X86::COND_NE: case X86::COND_NP: return true; } } /// isFPImmLegal - Returns true if the target can instruction select the /// specified FP immediate natively. If false, the legalizer will /// materialize the FP immediate as a load from a constant pool. 
bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) { if (Imm.bitwiseIsEqual(LegalFPImmediates[i])) return true; } return false; } /// \brief Returns true if it is beneficial to convert a load of a constant /// to just the constant itself. bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const { assert(Ty->isIntegerTy()); unsigned BitSize = Ty->getPrimitiveSizeInBits(); if (BitSize == 0 || BitSize > 64) return false; return true; } /// isUndefOrInRange - Return true if Val is undef or if its value falls within /// the specified range (L, H]. static bool isUndefOrInRange(int Val, int Low, int Hi) { return (Val < 0) || (Val >= Low && Val < Hi); } /// isUndefOrEqual - Val is either less than zero (undef) or equal to the /// specified value. static bool isUndefOrEqual(int Val, int CmpVal) { return (Val < 0 || Val == CmpVal); } /// isSequentialOrUndefInRange - Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (L, L+Pos]. or is undef. static bool isSequentialOrUndefInRange(ArrayRef
Mask, unsigned Pos, unsigned Size, int Low) { for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low) if (!isUndefOrEqual(Mask[i], Low)) return false; return true; } /// isPSHUFDMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFD or PSHUFW. That is, it doesn't reference /// the second operand. static bool isPSHUFDMask(ArrayRef
Mask, MVT VT) { if (VT == MVT::v4f32 || VT == MVT::v4i32 ) return (Mask[0] < 4 && Mask[1] < 4 && Mask[2] < 4 && Mask[3] < 4); if (VT == MVT::v2f64 || VT == MVT::v2i64) return (Mask[0] < 2 && Mask[1] < 2); return false; } /// isPSHUFHWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFHW. static bool isPSHUFHWMask(ArrayRef
Mask, MVT VT, bool HasInt256) { if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; // Lower quadword copied in order or undef. if (!isSequentialOrUndefInRange(Mask, 0, 4, 0)) return false; // Upper quadword shuffled. for (unsigned i = 4; i != 8; ++i) if (!isUndefOrInRange(Mask[i], 4, 8)) return false; if (VT == MVT::v16i16) { // Lower quadword copied in order or undef. if (!isSequentialOrUndefInRange(Mask, 8, 4, 8)) return false; // Upper quadword shuffled. for (unsigned i = 12; i != 16; ++i) if (!isUndefOrInRange(Mask[i], 12, 16)) return false; } return true; } /// isPSHUFLWMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PSHUFLW. static bool isPSHUFLWMask(ArrayRef
Mask, MVT VT, bool HasInt256) { if (VT != MVT::v8i16 && (!HasInt256 || VT != MVT::v16i16)) return false; // Upper quadword copied in order. if (!isSequentialOrUndefInRange(Mask, 4, 4, 4)) return false; // Lower quadword shuffled. for (unsigned i = 0; i != 4; ++i) if (!isUndefOrInRange(Mask[i], 0, 4)) return false; if (VT == MVT::v16i16) { // Upper quadword copied in order. if (!isSequentialOrUndefInRange(Mask, 12, 4, 12)) return false; // Lower quadword shuffled. for (unsigned i = 8; i != 12; ++i) if (!isUndefOrInRange(Mask[i], 8, 12)) return false; } return true; } /// isPALIGNRMask - Return true if the node specifies a shuffle of elements that /// is suitable for input to PALIGNR. static bool isPALIGNRMask(ArrayRef
Mask, MVT VT, const X86Subtarget *Subtarget) { if ((VT.is128BitVector() && !Subtarget->hasSSSE3()) || (VT.is256BitVector() && !Subtarget->hasInt256())) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.is512BitVector() ? 1: VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; // Do not handle 64-bit element shuffles with palignr. if (NumLaneElts == 2) return false; for (unsigned l = 0; l != NumElts; l+=NumLaneElts) { unsigned i; for (i = 0; i != NumLaneElts; ++i) { if (Mask[i+l] >= 0) break; } // Lane is all undef, go to next lane if (i == NumLaneElts) continue; int Start = Mask[i+l]; // Make sure its in this lane in one of the sources if (!isUndefOrInRange(Start, l, l+NumLaneElts) && !isUndefOrInRange(Start, l+NumElts, l+NumElts+NumLaneElts)) return false; // If not lane 0, then we must match lane 0 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Start, Mask[i]+l)) return false; // Correct second source to be contiguous with first source if (Start >= (int)NumElts) Start -= NumElts - NumLaneElts; // Make sure we're shifting in the right direction. if (Start <= (int)(i+l)) return false; Start -= i; // Check the rest of the elements to see if they are consecutive. for (++i; i != NumLaneElts; ++i) { int Idx = Mask[i+l]; // Make sure its in this lane if (!isUndefOrInRange(Idx, l, l+NumLaneElts) && !isUndefOrInRange(Idx, l+NumElts, l+NumElts+NumLaneElts)) return false; // If not lane 0, then we must match lane 0 if (l != 0 && Mask[i] >= 0 && !isUndefOrEqual(Idx, Mask[i]+l)) return false; if (Idx >= (int)NumElts) Idx -= NumElts - NumLaneElts; if (!isUndefOrEqual(Idx, Start+i)) return false; } } return true; } /// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming /// the two vector operands have swapped position. static void CommuteVectorShuffleMask(SmallVectorImpl
&Mask, unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { int idx = Mask[i]; if (idx < 0) continue; else if (idx < (int)NumElems) Mask[i] = idx + NumElems; else Mask[i] = idx - NumElems; } } /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128/256-bit /// SHUFPS and SHUFPD. If Commuted is true, then it checks for sources to be /// reverse of what x86 shuffles want. static bool isSHUFPMask(ArrayRef
Mask, MVT VT, bool Commuted = false) { unsigned NumElems = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElems = NumElems/NumLanes; if (NumLaneElems != 2 && NumLaneElems != 4) return false; unsigned EltSize = VT.getVectorElementType().getSizeInBits(); bool symetricMaskRequired = (VT.getSizeInBits() >= 256) && (EltSize == 32); // VSHUFPSY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. // // SRC1 => X7 X6 X5 X4 X3 X2 X1 X0 // SRC2 => Y7 Y6 Y5 Y4 Y3 Y2 Y1 Y9 // // DST => Y7..Y4, Y7..Y4, X7..X4, X7..X4, // Y3..Y0, Y3..Y0, X3..X0, X3..X0 // // VSHUFPDY divides the resulting vector into 4 chunks. // The sources are also splitted into 4 chunks, and each destination // chunk must come from a different source chunk. // // SRC1 => X3 X2 X1 X0 // SRC2 => Y3 Y2 Y1 Y0 // // DST => Y3..Y2, X3..X2, Y1..Y0, X1..X0 // SmallVector
MaskVal(NumLaneElems, -1); unsigned HalfLaneElems = NumLaneElems/2; for (unsigned l = 0; l != NumElems; l += NumLaneElems) { for (unsigned i = 0; i != NumLaneElems; ++i) { int Idx = Mask[i+l]; unsigned RngStart = l + ((Commuted == (i
Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 4) return false; // Expect bit0 == 6, bit1 == 7, bit2 == 2, bit3 == 3 return isUndefOrEqual(Mask[0], 6) && isUndefOrEqual(Mask[1], 7) && isUndefOrEqual(Mask[2], 2) && isUndefOrEqual(Mask[3], 3); } /// isMOVHLPS_v_undef_Mask - Special case of isMOVHLPSMask for canonical form /// of vector_shuffle v, v, <2, 3, 2, 3>, i.e. vector_shuffle v, undef, /// <2, 3, 2, 3> static bool isMOVHLPS_v_undef_Mask(ArrayRef
Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 4) return false; return isUndefOrEqual(Mask[0], 2) && isUndefOrEqual(Mask[1], 3) && isUndefOrEqual(Mask[2], 2) && isUndefOrEqual(Mask[3], 3); } /// isMOVLPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLP{S|D}. static bool isMOVLPMask(ArrayRef
Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i + NumElems)) return false; for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } /// isMOVLHPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVLHPS. static bool isMOVLHPSMask(ArrayRef
Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i + e], i + NumElems)) return false; return true; } /// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to INSERTPS. /// i. e: If all but one element come from the same vector. static bool isINSERTPSMask(ArrayRef
Mask, MVT VT) { // TODO: Deal with AVX's VINSERTPS if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32)) return false; unsigned CorrectPosV1 = 0; unsigned CorrectPosV2 = 0; for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i) { if (Mask[i] == -1) { ++CorrectPosV1; ++CorrectPosV2; continue; } if (Mask[i] == i) ++CorrectPosV1; else if (Mask[i] == i + 4) ++CorrectPosV2; } if (CorrectPosV1 == 3 || CorrectPosV2 == 3) // We have 3 elements (undefs count as elements from any vector) from one // vector, and one from another. return true; return false; } // // Some special combinations that can be optimized. // static SDValue Compact8x32ShuffleNode(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); SDLoc dl(SVOp); if (VT != MVT::v8i32 && VT != MVT::v8f32) return SDValue(); ArrayRef
Mask = SVOp->getMask(); // These are the special masks that may be optimized. static const int MaskToOptimizeEven[] = {0, 8, 2, 10, 4, 12, 6, 14}; static const int MaskToOptimizeOdd[] = {1, 9, 3, 11, 5, 13, 7, 15}; bool MatchEvenMask = true; bool MatchOddMask = true; for (int i=0; i<8; ++i) { if (!isUndefOrEqual(Mask[i], MaskToOptimizeEven[i])) MatchEvenMask = false; if (!isUndefOrEqual(Mask[i], MaskToOptimizeOdd[i])) MatchOddMask = false; } if (!MatchEvenMask && !MatchOddMask) return SDValue(); SDValue UndefNode = DAG.getNode(ISD::UNDEF, dl, VT); SDValue Op0 = SVOp->getOperand(0); SDValue Op1 = SVOp->getOperand(1); if (MatchEvenMask) { // Shift the second operand right to 32 bits. static const int ShiftRightMask[] = {-1, 0, -1, 2, -1, 4, -1, 6 }; Op1 = DAG.getVectorShuffle(VT, dl, Op1, UndefNode, ShiftRightMask); } else { // Shift the first operand left to 32 bits. static const int ShiftLeftMask[] = {1, -1, 3, -1, 5, -1, 7, -1 }; Op0 = DAG.getVectorShuffle(VT, dl, Op0, UndefNode, ShiftLeftMask); } static const int BlendMask[] = {0, 9, 2, 11, 4, 13, 6, 15}; return DAG.getVectorShuffle(VT, dl, Op0, Op1, BlendMask); } /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(ArrayRef
Mask, MVT VT, bool HasInt256, bool V2IsSplat = false) { assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckl"); // AVX defines UNPCK* to operate independently on 128-bit lanes. unsigned NumLanes; unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); if (VT.is256BitVector()) { if (NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; NumLanes = 2; NumOf256BitLanes = 1; } else if (VT.is512BitVector()) { assert(VT.getScalarType().getSizeInBits() >= 32 && "Unsupported vector type for unpckh"); NumLanes = 2; NumOf256BitLanes = 2; } else { NumLanes = 1; NumOf256BitLanes = 1; } unsigned NumEltsInStride = NumElts/NumOf256BitLanes; unsigned NumLaneElts = NumEltsInStride/NumLanes; for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l256*NumEltsInStride+l+i]; int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; if (!isUndefOrEqual(BitI, j+l256*NumElts)) return false; if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) return false; if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) return false; } } } return true; } /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. static bool isUNPCKHMask(ArrayRef
Mask, MVT VT, bool HasInt256, bool V2IsSplat = false) { assert(VT.getSizeInBits() >= 128 && "Unsupported vector type for unpckh"); // AVX defines UNPCK* to operate independently on 128-bit lanes. unsigned NumLanes; unsigned NumOf256BitLanes; unsigned NumElts = VT.getVectorNumElements(); if (VT.is256BitVector()) { if (NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; NumLanes = 2; NumOf256BitLanes = 1; } else if (VT.is512BitVector()) { assert(VT.getScalarType().getSizeInBits() >= 32 && "Unsupported vector type for unpckh"); NumLanes = 2; NumOf256BitLanes = 2; } else { NumLanes = 1; NumOf256BitLanes = 1; } unsigned NumEltsInStride = NumElts/NumOf256BitLanes; unsigned NumLaneElts = NumEltsInStride/NumLanes; for (unsigned l256 = 0; l256 < NumOf256BitLanes; l256 += 1) { for (unsigned l = 0; l != NumEltsInStride; l += NumLaneElts) { for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l256*NumEltsInStride+l+i]; int BitI1 = Mask[l256*NumEltsInStride+l+i+1]; if (!isUndefOrEqual(BitI, j+l256*NumElts)) return false; if (V2IsSplat && !isUndefOrEqual(BitI1, NumElts)) return false; if (!isUndefOrEqual(BitI1, j+l256*NumElts+NumEltsInStride)) return false; } } } return true; } /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. vector_shuffle v, undef, /// <0, 0, 1, 1> static bool isUNPCKL_v_undef_Mask(ArrayRef
Mask, MVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); bool Is256BitVec = VT.is256BitVector(); if (VT.is512BitVector()) return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (Is256BitVec && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // For 256-bit i64/f64, use MOVDDUPY instead, so reject the matching pattern // FIXME: Need a better way to get rid of this, there's no latency difference // between UNPCKLPD and MOVDDUP, the later should always be checked first and // the former later. We should also remove the "_undef" special mask. if (NumElts == 4 && Is256BitVec) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0, j = l; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l+i]; int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) return false; } } return true; } /// isUNPCKH_v_undef_Mask - Special case of isUNPCKHMask for canonical form /// of vector_shuffle v, v, <2, 6, 3, 7>, i.e. vector_shuffle v, undef, /// <2, 2, 3, 3> static bool isUNPCKH_v_undef_Mask(ArrayRef
Mask, MVT VT, bool HasInt256) { unsigned NumElts = VT.getVectorNumElements(); if (VT.is512BitVector()) return false; assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); if (VT.is256BitVector() && NumElts != 4 && NumElts != 8 && (!HasInt256 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate // independently on 128-bit lanes. unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; for (unsigned l = 0; l != NumElts; l += NumLaneElts) { for (unsigned i = 0, j = l+NumLaneElts/2; i != NumLaneElts; i += 2, ++j) { int BitI = Mask[l+i]; int BitI1 = Mask[l+i+1]; if (!isUndefOrEqual(BitI, j)) return false; if (!isUndefOrEqual(BitI1, j)) return false; } } return true; } // Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or // (src1[0], src0[1]), manipulation with 256-bit sub-vectors static bool isINSERT64x4Mask(ArrayRef
Mask, MVT VT, unsigned int *Imm) { if (!VT.is512BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); unsigned HalfSize = NumElts/2; if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) { *Imm = 1; return true; } } if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) { *Imm = 0; return true; } } return false; } /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSS, /// MOVSD, and MOVD, i.e. setting the lowest element. static bool isMOVLMask(ArrayRef
Mask, EVT VT) { if (VT.getVectorElementType().getSizeInBits() < 32) return false; if (!VT.is128BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); if (!isUndefOrEqual(Mask[0], NumElts)) return false; for (unsigned i = 1; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; return true; } /// isVPERM2X128Mask - Match 256-bit shuffles where the elements are considered /// as permutations between 128-bit chunks or halves. As an example: this /// shuffle bellow: /// vector_shuffle <4, 5, 6, 7, 12, 13, 14, 15> /// The first half comes from the second half of V1 and the second half from the /// the second half of V2. static bool isVPERM2X128Mask(ArrayRef
Mask, MVT VT, bool HasFp256) { if (!HasFp256 || !VT.is256BitVector()) return false; // The shuffle result is divided into half A and half B. In total the two // sources have 4 halves, namely: C, D, E, F. The final values of A and // B must come from C, D, E or F. unsigned HalfSize = VT.getVectorNumElements()/2; bool MatchA = false, MatchB = false; // Check if A comes from one of C, D, E, F. for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, 0, HalfSize, Half*HalfSize)) { MatchA = true; break; } } // Check if B comes from one of C, D, E, F. for (unsigned Half = 0; Half != 4; ++Half) { if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, Half*HalfSize)) { MatchB = true; break; } } return MatchA && MatchB; } /// getShuffleVPERM2X128Immediate - Return the appropriate immediate to shuffle /// the specified VECTOR_MASK mask with VPERM2F128/VPERM2I128 instructions. static unsigned getShuffleVPERM2X128Immediate(ShuffleVectorSDNode *SVOp) { MVT VT = SVOp->getSimpleValueType(0); unsigned HalfSize = VT.getVectorNumElements()/2; unsigned FstHalf = 0, SndHalf = 0; for (unsigned i = 0; i < HalfSize; ++i) { if (SVOp->getMaskElt(i) > 0) { FstHalf = SVOp->getMaskElt(i)/HalfSize; break; } } for (unsigned i = HalfSize; i < HalfSize*2; ++i) { if (SVOp->getMaskElt(i) > 0) { SndHalf = SVOp->getMaskElt(i)/HalfSize; break; } } return (FstHalf | (SndHalf << 4)); } // Symetric in-lane mask. Each lane has 4 elements (for imm8) static bool isPermImmMask(ArrayRef
Mask, MVT VT, unsigned& Imm8) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (EltSize < 32) return false; unsigned NumElts = VT.getVectorNumElements(); Imm8 = 0; if (VT.is128BitVector() || (VT.is256BitVector() && EltSize == 64)) { for (unsigned i = 0; i != NumElts; ++i) { if (Mask[i] < 0) continue; Imm8 |= Mask[i] << (i*2); } return true; } unsigned LaneSize = 4; SmallVector
MaskVal(LaneSize, -1); for (unsigned l = 0; l != NumElts; l += LaneSize) { for (unsigned i = 0; i != LaneSize; ++i) { if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; if (Mask[i+l] < 0) continue; if (MaskVal[i] < 0) { MaskVal[i] = Mask[i+l] - l; Imm8 |= MaskVal[i] << (i*2); continue; } if (Mask[i+l] != (signed)(MaskVal[i]+l)) return false; } } return true; } /// isVPERMILPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to VPERMILPD*. /// Note that VPERMIL mask matching is different depending whether theunderlying /// type is 32 or 64. In the VPERMILPS the high half of the mask should point /// to the same elements of the low, but to the higher half of the source. /// In VPERMILPD the two lanes could be shuffled independently of each other /// with the same restriction that lanes can't be crossed. Also handles PSHUFDY. static bool isVPERMILPMask(ArrayRef
Mask, MVT VT) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); if (VT.getSizeInBits() < 256 || EltSize < 32) return false; bool symetricMaskRequired = (EltSize == 32); unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned LaneSize = NumElts/NumLanes; // 2 or 4 elements in one lane SmallVector
ExpectedMaskVal(LaneSize, -1); for (unsigned l = 0; l != NumElts; l += LaneSize) { for (unsigned i = 0; i != LaneSize; ++i) { if (!isUndefOrInRange(Mask[i+l], l, l+LaneSize)) return false; if (symetricMaskRequired) { if (ExpectedMaskVal[i] < 0 && Mask[i+l] >= 0) { ExpectedMaskVal[i] = Mask[i+l] - l; continue; } if (!isUndefOrEqual(Mask[i+l], ExpectedMaskVal[i]+l)) return false; } } } return true; } /// isCommutedMOVLMask - Returns true if the shuffle mask is except the reverse /// of what x86 movss want. X86 movs requires the lowest element to be lowest /// element of vector 2 and the other elements to come from vector 1 in order. static bool isCommutedMOVLMask(ArrayRef
Mask, MVT VT, bool V2IsSplat = false, bool V2IsUndef = false) { if (!VT.is128BitVector()) return false; unsigned NumOps = VT.getVectorNumElements(); if (NumOps != 2 && NumOps != 4 && NumOps != 8 && NumOps != 16) return false; if (!isUndefOrEqual(Mask[0], 0)) return false; for (unsigned i = 1; i != NumOps; ++i) if (!(isUndefOrEqual(Mask[i], i+NumOps) || (V2IsUndef && isUndefOrInRange(Mask[i], NumOps, NumOps*2)) || (V2IsSplat && isUndefOrEqual(Mask[i], NumOps)))) return false; return true; } /// isMOVSHDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSHDUP. /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7> static bool isMOVSHDUPMask(ArrayRef
Mask, MVT VT, const X86Subtarget *Subtarget) { if (!Subtarget->hasSSE3()) return false; unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || (VT.is256BitVector() && NumElems != 8) || (VT.is512BitVector() && NumElems != 16)) return false; // "i+1" is the value the indexed mask element must have for (unsigned i = 0; i != NumElems; i += 2) if (!isUndefOrEqual(Mask[i], i+1) || !isUndefOrEqual(Mask[i+1], i+1)) return false; return true; } /// isMOVSLDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to MOVSLDUP. /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6> static bool isMOVSLDUPMask(ArrayRef
Mask, MVT VT, const X86Subtarget *Subtarget) { if (!Subtarget->hasSSE3()) return false; unsigned NumElems = VT.getVectorNumElements(); if ((VT.is128BitVector() && NumElems != 4) || (VT.is256BitVector() && NumElems != 8) || (VT.is512BitVector() && NumElems != 16)) return false; // "i" is the value the indexed mask element must have for (unsigned i = 0; i != NumElems; i += 2) if (!isUndefOrEqual(Mask[i], i) || !isUndefOrEqual(Mask[i+1], i)) return false; return true; } /// isMOVDDUPYMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 256-bit /// version of MOVDDUP. static bool isMOVDDUPYMask(ArrayRef
Mask, MVT VT, bool HasFp256) { if (!HasFp256 || !VT.is256BitVector()) return false; unsigned NumElts = VT.getVectorNumElements(); if (NumElts != 4) return false; for (unsigned i = 0; i != NumElts/2; ++i) if (!isUndefOrEqual(Mask[i], 0)) return false; for (unsigned i = NumElts/2; i != NumElts; ++i) if (!isUndefOrEqual(Mask[i], NumElts/2)) return false; return true; } /// isMOVDDUPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// version of MOVDDUP. static bool isMOVDDUPMask(ArrayRef
Mask, MVT VT) { if (!VT.is128BitVector()) return false; unsigned e = VT.getVectorNumElements() / 2; for (unsigned i = 0; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = 0; i != e; ++i) if (!isUndefOrEqual(Mask[e+i], i)) return false; return true; } /// isVEXTRACTIndex - Return true if the specified /// EXTRACT_SUBVECTOR operand specifies a vector extract that is /// suitable for instruction that extract 128 or 256 bit vectors static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa
(N->getOperand(1).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast
(N->getOperand(1).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } /// isVINSERTIndex - Return true if the specified INSERT_SUBVECTOR /// operand specifies a subvector insert that is suitable for input to /// insertion of 128 or 256-bit subvectors static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); if (!isa
(N->getOperand(2).getNode())) return false; // The index should be aligned on a vecWidth-bit boundary. uint64_t Index = cast
(N->getOperand(2).getNode())->getZExtValue(); MVT VT = N->getSimpleValueType(0); unsigned ElSize = VT.getVectorElementType().getSizeInBits(); bool Result = (Index * ElSize) % vecWidth == 0; return Result; } bool X86::isVINSERT128Index(SDNode *N) { return isVINSERTIndex(N, 128); } bool X86::isVINSERT256Index(SDNode *N) { return isVINSERTIndex(N, 256); } bool X86::isVEXTRACT128Index(SDNode *N) { return isVEXTRACTIndex(N, 128); } bool X86::isVEXTRACT256Index(SDNode *N) { return isVEXTRACTIndex(N, 256); } /// getShuffleSHUFImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with PSHUF* and SHUFP* instructions. /// Handles 128-bit and 256-bit. static unsigned getShuffleSHUFImmediate(ShuffleVectorSDNode *N) { MVT VT = N->getSimpleValueType(0); assert((VT.getSizeInBits() >= 128) && "Unsupported vector type for PSHUF/SHUFP"); // Handle 128 and 256-bit vector lengths. AVX defines PSHUF/SHUFP to operate // independently on 128-bit lanes. unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; assert((NumLaneElts == 2 || NumLaneElts == 4 || NumLaneElts == 8) && "Only supports 2, 4 or 8 elements per lane"); unsigned Shift = (NumLaneElts >= 4) ? 1 : 0; unsigned Mask = 0; for (unsigned i = 0; i != NumElts; ++i) { int Elt = N->getMaskElt(i); if (Elt < 0) continue; Elt &= NumLaneElts - 1; unsigned ShAmt = (i << Shift) % 8; Mask |= Elt << ShAmt; } return Mask; } /// getShufflePSHUFHWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFHW instruction. static unsigned getShufflePSHUFHWImmediate(ShuffleVectorSDNode *N) { MVT VT = N->getSimpleValueType(0); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); unsigned NumElts = VT.getVectorNumElements(); unsigned Mask = 0; for (unsigned l = 0; l != NumElts; l += 8) { // 8 nodes per lane, but we only care about the last 4. 
for (unsigned i = 0; i < 4; ++i) { int Elt = N->getMaskElt(l+i+4); if (Elt < 0) continue; Elt &= 0x3; // only 2-bits. Mask |= Elt << (i * 2); } } return Mask; } /// getShufflePSHUFLWImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PSHUFLW instruction. static unsigned getShufflePSHUFLWImmediate(ShuffleVectorSDNode *N) { MVT VT = N->getSimpleValueType(0); assert((VT == MVT::v8i16 || VT == MVT::v16i16) && "Unsupported vector type for PSHUFHW"); unsigned NumElts = VT.getVectorNumElements(); unsigned Mask = 0; for (unsigned l = 0; l != NumElts; l += 8) { // 8 nodes per lane, but we only care about the first 4. for (unsigned i = 0; i < 4; ++i) { int Elt = N->getMaskElt(l+i); if (Elt < 0) continue; Elt &= 0x3; // only 2-bits Mask |= Elt << (i * 2); } } return Mask; } /// getShufflePALIGNRImmediate - Return the appropriate immediate to shuffle /// the specified VECTOR_SHUFFLE mask with the PALIGNR instruction. static unsigned getShufflePALIGNRImmediate(ShuffleVectorSDNode *SVOp) { MVT VT = SVOp->getSimpleValueType(0); unsigned EltSize = VT.is512BitVector() ? 1 : VT.getVectorElementType().getSizeInBits() >> 3; unsigned NumElts = VT.getVectorNumElements(); unsigned NumLanes = VT.is512BitVector() ? 1 : VT.getSizeInBits()/128; unsigned NumLaneElts = NumElts/NumLanes; int Val = 0; unsigned i; for (i = 0; i != NumElts; ++i) { Val = SVOp->getMaskElt(i); if (Val >= 0) break; } if (Val >= (int)NumElts) Val -= NumElts - NumLaneElts; assert(Val - i > 0 && "PALIGNR imm should be positive"); return (Val - i) * EltSize; } static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa
(N->getOperand(1).getNode())) llvm_unreachable("Illegal extract subvector for VEXTRACT"); uint64_t Index = cast
(N->getOperand(1).getNode())->getZExtValue(); MVT VecVT = N->getOperand(0).getSimpleValueType(); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); if (!isa
(N->getOperand(2).getNode())) llvm_unreachable("Illegal insert subvector for VINSERT"); uint64_t Index = cast
(N->getOperand(2).getNode())->getZExtValue(); MVT VecVT = N->getSimpleValueType(0); MVT ElVT = VecVT.getVectorElementType(); unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits(); return Index / NumElemsPerChunk; } /// getExtractVEXTRACT128Immediate - Return the appropriate immediate /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF128 /// and VINSERTI128 instructions. unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 128); } /// getExtractVEXTRACT256Immediate - Return the appropriate immediate /// to extract the specified EXTRACT_SUBVECTOR index with VEXTRACTF64x4 /// and VINSERTI64x4 instructions. unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { return getExtractVEXTRACTImmediate(N, 256); } /// getInsertVINSERT128Immediate - Return the appropriate immediate /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF128 /// and VINSERTI128 instructions. unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 128); } /// getInsertVINSERT256Immediate - Return the appropriate immediate /// to insert at the specified INSERT_SUBVECTOR index with VINSERTF46x4 /// and VINSERTI64x4 instructions. unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { return getInsertVINSERTImmediate(N, 256); } /// isZero - Returns true if Elt is a constant integer zero static bool isZero(SDValue V) { ConstantSDNode *C = dyn_cast
(V); return C && C->isNullValue(); } /// isZeroNode - Returns true if Elt is a constant zero or a floating point /// constant +0.0. bool X86::isZeroNode(SDValue Elt) { if (isZero(Elt)) return true; if (ConstantFPSDNode *CFP = dyn_cast
(Elt)) return CFP->getValueAPF().isPosZero(); return false; } /// CommuteVectorShuffle - Swap vector_shuffle operands as well as values in /// their permute mask. static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); unsigned NumElems = VT.getVectorNumElements(); SmallVector
MaskVec; for (unsigned i = 0; i != NumElems; ++i) { int Idx = SVOp->getMaskElt(i); if (Idx >= 0) { if (Idx < (int)NumElems) Idx += NumElems; else Idx -= NumElems; } MaskVec.push_back(Idx); } return DAG.getVectorShuffle(VT, SDLoc(SVOp), SVOp->getOperand(1), SVOp->getOperand(0), &MaskVec[0]); } /// ShouldXformToMOVHLPS - Return true if the node should be transformed to /// match movhlps. The lower half elements should come from upper half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). static bool ShouldXformToMOVHLPS(ArrayRef
Mask, MVT VT) { if (!VT.is128BitVector()) return false; if (VT.getVectorNumElements() != 4) return false; for (unsigned i = 0, e = 2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+2)) return false; for (unsigned i = 2; i != 4; ++i) if (!isUndefOrEqual(Mask[i], i+4)) return false; return true; } /// isScalarLoadToVector - Returns true if the node is a scalar load that /// is promoted to a vector. It also returns the LoadSDNode by reference if /// required. static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) { if (N->getOpcode() != ISD::SCALAR_TO_VECTOR) return false; N = N->getOperand(0).getNode(); if (!ISD::isNON_EXTLoad(N)) return false; if (LD) *LD = cast
(N); return true; } // Test whether the given value is a vector value which will be legalized // into a load. static bool WillBeConstantPoolLoad(SDNode *N) { if (N->getOpcode() != ISD::BUILD_VECTOR) return false; // Check for any non-constant elements. for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) switch (N->getOperand(i).getNode()->getOpcode()) { case ISD::UNDEF: case ISD::ConstantFP: case ISD::Constant: break; default: return false; } // Vectors of all-zeros and all-ones are materialized with special // instructions rather than being loaded. return !ISD::isBuildVectorAllZeros(N) && !ISD::isBuildVectorAllOnes(N); } /// ShouldXformToMOVLP{S|D} - Return true if the node should be transformed to /// match movlp{s|d}. The lower half elements should come from lower half of /// V1 (and in order), and the upper half elements should come from the upper /// half of V2 (and in order). And since V1 will become the source of the /// MOVLP, it must be either a vector load or a scalar load to vector. static bool ShouldXformToMOVLP(SDNode *V1, SDNode *V2, ArrayRef
Mask, MVT VT) { if (!VT.is128BitVector()) return false; if (!ISD::isNON_EXTLoad(V1) && !isScalarLoadToVector(V1)) return false; // Is V2 is a vector load, don't do this transformation. We will try to use // load folding shufps op. if (ISD::isNON_EXTLoad(V2) || WillBeConstantPoolLoad(V2)) return false; unsigned NumElems = VT.getVectorNumElements(); if (NumElems != 2 && NumElems != 4) return false; for (unsigned i = 0, e = NumElems/2; i != e; ++i) if (!isUndefOrEqual(Mask[i], i)) return false; for (unsigned i = NumElems/2, e = NumElems; i != e; ++i) if (!isUndefOrEqual(Mask[i], i+NumElems)) return false; return true; } /// isZeroShuffle - Returns true if N is a VECTOR_SHUFFLE that can be resolved /// to an zero vector. /// FIXME: move to dag combiner / method on ShuffleVectorSDNode static bool isZeroShuffle(ShuffleVectorSDNode *N) { SDValue V1 = N->getOperand(0); SDValue V2 = N->getOperand(1); unsigned NumElems = N->getValueType(0).getVectorNumElements(); for (unsigned i = 0; i != NumElems; ++i) { int Idx = N->getMaskElt(i); if (Idx >= (int)NumElems) { unsigned Opc = V2.getOpcode(); if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) continue; if (Opc != ISD::BUILD_VECTOR || !X86::isZeroNode(V2.getOperand(Idx-NumElems))) return false; } else if (Idx >= 0) { unsigned Opc = V1.getOpcode(); if (Opc == ISD::UNDEF || ISD::isBuildVectorAllZeros(V1.getNode())) continue; if (Opc != ISD::BUILD_VECTOR || !X86::isZeroNode(V1.getOperand(Idx))) return false; } } return true; } /// getZeroVector - Returns a vector of specified type with all zero elements. /// static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); // Always build SSE zero vectors as <4 x i32> bitcasted // to their dest type. This ensures they get CSE'd. 
SDValue Vec; if (VT.is128BitVector()) { // SSE if (Subtarget->hasSSE2()) { // SSE2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else { // SSE1 SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4f32, Cst, Cst, Cst, Cst); } } else if (VT.is256BitVector()) { // AVX if (Subtarget->hasInt256()) { // AVX2 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // 256-bit logic and arithmetic instructions in AVX are all // floating-point, no support for integer ops. Emit fp zeroed vectors. SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops); } } else if (VT.is512BitVector()) { // AVX-512 SDValue Cst = DAG.getTargetConstant(0, MVT::i32); SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops); } else if (VT.getScalarType() == MVT::i1) { assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type"); SDValue Cst = DAG.getTargetConstant(0, MVT::i1); SmallVector
Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } else llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// getOnesVector - Returns a vector of specified type with all bits set. /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with /// no AVX2 supprt, use two <4 x i32> inserted in a <8 x i32> appropriately. /// Then bitcast to their original type, ensuring they get CSE'd. static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG, SDLoc dl) { assert(VT.isVector() && "Expected a vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); SDValue Vec; if (VT.is256BitVector()) { if (HasInt256) { // AVX2 SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops); } else { // AVX Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl); } } else if (VT.is128BitVector()) { Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } else llvm_unreachable("Unexpected vector type"); return DAG.getNode(ISD::BITCAST, dl, VT, Vec); } /// NormalizeMask - V2 is a splat, modify the mask (if needed) so all elements /// that point to V2 points to its first element. static void NormalizeMask(SmallVectorImpl
&Mask, unsigned NumElems) { for (unsigned i = 0; i != NumElems; ++i) { if (Mask[i] > (int)NumElems) { Mask[i] = NumElems; } } } /// getMOVLMask - Returns a vector_shuffle mask for an movs{s|d}, movd /// operation of specified width. static SDValue getMOVL(SelectionDAG &DAG, SDLoc dl, EVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector
Mask; Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) Mask.push_back(i); return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// getUnpackl - Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector
Mask; for (unsigned i = 0, e = NumElems/2; i != e; ++i) { Mask.push_back(i); Mask.push_back(i + NumElems); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } /// getUnpackh - Returns a vector_shuffle node for an unpackh operation. static SDValue getUnpackh(SelectionDAG &DAG, SDLoc dl, MVT VT, SDValue V1, SDValue V2) { unsigned NumElems = VT.getVectorNumElements(); SmallVector
Mask; for (unsigned i = 0, Half = NumElems/2; i != Half; ++i) { Mask.push_back(i + Half); Mask.push_back(i + NumElems + Half); } return DAG.getVectorShuffle(VT, dl, V1, V2, &Mask[0]); } // PromoteSplati8i16 - All i16 and i8 vector types can't be used directly by // a generic shuffle instruction because the target has no such instructions. // Generate shuffles which repeat i16 and i8 several times until they can be // represented by v4f32 and then be manipulated by target suported shuffles. static SDValue PromoteSplati8i16(SDValue V, SelectionDAG &DAG, int &EltNo) { MVT VT = V.getSimpleValueType(); int NumElems = VT.getVectorNumElements(); SDLoc dl(V); while (NumElems > 4) { if (EltNo < NumElems/2) { V = getUnpackl(DAG, dl, VT, V, V); } else { V = getUnpackh(DAG, dl, VT, V, V); EltNo -= NumElems/2; } NumElems >>= 1; } return V; } /// getLegalSplat - Generate a legal splat with supported x86 shuffles static SDValue getLegalSplat(SelectionDAG &DAG, SDValue V, int EltNo) { MVT VT = V.getSimpleValueType(); SDLoc dl(V); if (VT.is128BitVector()) { V = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, V); int SplatMask[4] = { EltNo, EltNo, EltNo, EltNo }; V = DAG.getVectorShuffle(MVT::v4f32, dl, V, DAG.getUNDEF(MVT::v4f32), &SplatMask[0]); } else if (VT.is256BitVector()) { // To use VPERMILPS to splat scalars, the second half of indicies must // refer to the higher part, which is a duplication of the lower one, // because VPERMILPS can only handle in-lane permutations. int SplatMask[8] = { EltNo, EltNo, EltNo, EltNo, EltNo+4, EltNo+4, EltNo+4, EltNo+4 }; V = DAG.getNode(ISD::BITCAST, dl, MVT::v8f32, V); V = DAG.getVectorShuffle(MVT::v8f32, dl, V, DAG.getUNDEF(MVT::v8f32), &SplatMask[0]); } else llvm_unreachable("Vector size not supported"); return DAG.getNode(ISD::BITCAST, dl, VT, V); } /// PromoteSplat - Splat is promoted to target supported vector shuffles. 
static SDValue PromoteSplat(ShuffleVectorSDNode *SV, SelectionDAG &DAG) { MVT SrcVT = SV->getSimpleValueType(0); SDValue V1 = SV->getOperand(0); SDLoc dl(SV); int EltNo = SV->getSplatIndex(); int NumElems = SrcVT.getVectorNumElements(); bool Is256BitVec = SrcVT.is256BitVector(); assert(((SrcVT.is128BitVector() && NumElems > 4) || Is256BitVec) && "Unknown how to promote splat for type"); // Extract the 128-bit part containing the splat element and update // the splat element index when it refers to the higher register. if (Is256BitVec) { V1 = Extract128BitVector(V1, EltNo, DAG, dl); if (EltNo >= NumElems/2) EltNo -= NumElems/2; } // All i16 and i8 vector types can't be used directly by a generic shuffle // instruction because the target has no such instruction. Generate shuffles // which repeat i16 and i8 several times until they fit in i32, and then can // be manipulated by target suported shuffles. MVT EltVT = SrcVT.getVectorElementType(); if (EltVT == MVT::i8 || EltVT == MVT::i16) V1 = PromoteSplati8i16(V1, DAG, EltNo); // Recreate the 256-bit vector and place the same 128-bit vector // into the low and high part. This is necessary because we want // to use VPERM* to shuffle the vectors if (Is256BitVec) { V1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT, V1, V1); } return getLegalSplat(DAG, V1, EltNo); } /// getShuffleVectorZeroOrUndef - Return a vector_shuffle of the specified /// vector of zero or undef vector. This produces a shuffle where the low /// element of V2 is swizzled into the zero/undef vector, landing at element /// Idx. This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3). static SDValue getShuffleVectorZeroOrUndef(SDValue V2, unsigned Idx, bool IsZero, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = V2.getSimpleValueType(); SDValue V1 = IsZero ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT); unsigned NumElems = VT.getVectorNumElements(); SmallVector
MaskVec; for (unsigned i = 0; i != NumElems; ++i) // If this is the insertion idx, put the low elt of V2 here. MaskVec.push_back(i == Idx ? NumElems : i); return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, &MaskVec[0]); } /// getTargetShuffleMask - Calculates the shuffle mask corresponding to the /// target specific opcode. Returns true if the Mask could be calculated. /// Sets IsUnary to true if only uses one source. static bool getTargetShuffleMask(SDNode *N, MVT VT, SmallVectorImpl
&Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; IsUnary = false; switch(N->getOpcode()) { case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast
(ImmN)->getZExtValue(), Mask); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast
(ImmN)->getZExtValue(), Mask); break; case X86ISD::PSHUFD: case X86ISD::VPERMILP: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VPERMI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast
(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: { // The index 0 always comes from the first element of the second source, // this is why MOVSS and MOVSD are used in the first place. The other // elements come from the other positions of the first source vector Mask.push_back(NumElems); for (unsigned i = 1; i != NumElems; ++i) { Mask.push_back(i); } break; } case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast
(ImmN)->getZExtValue(), Mask); if (Mask.empty()) return false; break; case X86ISD::MOVDDUP: case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: case X86ISD::MOVSHDUP: case X86ISD::MOVSLDUP: // Not yet implemented return false; default: llvm_unreachable("unknown target shuffle node"); } return true; } /// getShuffleScalarElt - Returns the scalar element that will make up the ith /// element of the result of the vector shuffle. static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, unsigned Depth) { if (Depth == 6) return SDValue(); // Limit search depth. SDValue V = SDValue(N, 0); EVT VT = V.getValueType(); unsigned Opcode = V.getOpcode(); // Recurse into ISD::VECTOR_SHUFFLE node to find scalars. if (const ShuffleVectorSDNode *SV = dyn_cast
(N)) { int Elt = SV->getMaskElt(Index); if (Elt < 0) return DAG.getUNDEF(VT.getVectorElementType()); unsigned NumElems = VT.getVectorNumElements(); SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Recurse into target specific vector shuffles to find scalars. if (isTargetShuffle(Opcode)) { MVT ShufVT = V.getSimpleValueType(); unsigned NumElems = ShufVT.getVectorNumElements(); SmallVector
ShuffleMask; bool IsUnary; if (!getTargetShuffleMask(N, ShufVT, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; if (Elt < 0) return DAG.getUNDEF(ShufVT.getVectorElementType()); SDValue NewV = (Elt < (int)NumElems) ? N->getOperand(0) : N->getOperand(1); return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } // Actual nodes that may contain scalar elements if (Opcode == ISD::BITCAST) { V = V.getOperand(0); EVT SrcVT = V.getValueType(); unsigned NumElems = VT.getVectorNumElements(); if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems) return SDValue(); } if (V.getOpcode() == ISD::SCALAR_TO_VECTOR) return (Index == 0) ? V.getOperand(0) : DAG.getUNDEF(VT.getVectorElementType()); if (V.getOpcode() == ISD::BUILD_VECTOR) return V.getOperand(Index); return SDValue(); } /// getNumOfConsecutiveZeros - Return the number of elements of a vector /// shuffle operation which come from a consecutively from a zero. The /// search can start in two different directions, from left or right. /// We count undefs as zeros until PreferredNum is reached. static unsigned getNumOfConsecutiveZeros(ShuffleVectorSDNode *SVOp, unsigned NumElems, bool ZerosFromLeft, SelectionDAG &DAG, unsigned PreferredNum = -1U) { unsigned NumZeros = 0; for (unsigned i = 0; i != NumElems; ++i) { unsigned Index = ZerosFromLeft ? i : NumElems - i - 1; SDValue Elt = getShuffleScalarElt(SVOp, Index, DAG, 0); if (!Elt.getNode()) break; if (X86::isZeroNode(Elt)) ++NumZeros; else if (Elt.getOpcode() == ISD::UNDEF) // Undef as zero up to PreferredNum. NumZeros = std::min(NumZeros + 1, PreferredNum); else break; } return NumZeros; } /// isShuffleMaskConsecutive - Check if the shuffle mask indicies [MaskI, MaskE) /// correspond consecutively to elements from one of the vector operands, /// starting from its index OpIdx. Also tell OpNum which source vector operand. 
static bool isShuffleMaskConsecutive(ShuffleVectorSDNode *SVOp, unsigned MaskI, unsigned MaskE, unsigned OpIdx, unsigned NumElems, unsigned &OpNum) { bool SeenV1 = false; bool SeenV2 = false; for (unsigned i = MaskI; i != MaskE; ++i, ++OpIdx) { int Idx = SVOp->getMaskElt(i); // Ignore undef indicies if (Idx < 0) continue; if (Idx < (int)NumElems) SeenV1 = true; else SeenV2 = true; // Only accept consecutive elements from the same vector if ((Idx % NumElems != OpIdx) || (SeenV1 && SeenV2)) return false; } OpNum = SeenV1 ? 0 : 1; return true; } /// isVectorShiftRight - Returns true if the shuffle can be implemented as a /// logical left shift of a vector. static bool isVectorShiftRight(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getSimpleValueType(0).getVectorNumElements(); unsigned NumZeros = getNumOfConsecutiveZeros( SVOp, NumElems, false /* check zeros from right */, DAG, SVOp->getMaskElt(0)); unsigned OpSrc; if (!NumZeros) return false; // Considering the elements in the mask that are not consecutive zeros, // check if they consecutively come from only one of the source vectors. // // V1 = {X, A, B, C} 0 // \ \ \ / // vector_shuffle V1, V2 <1, 2, 3, X> // if (!isShuffleMaskConsecutive(SVOp, 0, // Mask Start Index NumElems-NumZeros, // Mask End Index(exclusive) NumZeros, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? return false; isLeft = false; ShAmt = NumZeros; ShVal = SVOp->getOperand(OpSrc); return true; } /// isVectorShiftLeft - Returns true if the shuffle can be implemented as a /// logical left shift of a vector. 
static bool isVectorShiftLeft(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { unsigned NumElems = SVOp->getSimpleValueType(0).getVectorNumElements(); unsigned NumZeros = getNumOfConsecutiveZeros( SVOp, NumElems, true /* check zeros from left */, DAG, NumElems - SVOp->getMaskElt(NumElems - 1) - 1); unsigned OpSrc; if (!NumZeros) return false; // Considering the elements in the mask that are not consecutive zeros, // check if they consecutively come from only one of the source vectors. // // 0 { A, B, X, X } = V2 // / \ / / // vector_shuffle V1, V2
// if (!isShuffleMaskConsecutive(SVOp, NumZeros, // Mask Start Index NumElems, // Mask End Index(exclusive) 0, // Where to start looking in the src vector NumElems, // Number of elements in vector OpSrc)) // Which source operand ? return false; isLeft = true; ShAmt = NumZeros; ShVal = SVOp->getOperand(OpSrc); return true; } /// isVectorShift - Returns true if the shuffle can be implemented as a /// logical left or right shift of a vector. static bool isVectorShift(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG, bool &isLeft, SDValue &ShVal, unsigned &ShAmt) { // Although the logic below support any bitwidth size, there are no // shift instructions which handle more than 128-bit vectors. if (!SVOp->getSimpleValueType(0).is128BitVector()) return false; if (isVectorShiftLeft(SVOp, DAG, isLeft, ShVal, ShAmt) || isVectorShiftRight(SVOp, DAG, isLeft, ShVal, ShAmt)) return true; return false; } /// LowerBuildVectorv16i8 - Custom lower build_vector of v16i8. /// static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 8) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; if (ThisIsNonZero && First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } if ((i & 1) != 0) { SDValue ThisElt, LastElt; bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0; if (LastIsNonZero) { LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i-1)); } if (ThisIsNonZero) { ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i)); ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt, DAG.getConstant(8, MVT::i8)); if (LastIsNonZero) ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt); } else ThisElt = LastElt; if (ThisElt.getNode()) V = 
DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt, DAG.getIntPtrConstant(i/2)); } } return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, V); } /// LowerBuildVectorv8i16 - Custom lower build_vector of v8i16. /// static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget* Subtarget, const TargetLowering &TLI) { if (NumNonZero > 4) return SDValue(); SDLoc dl(Op); SDValue V; bool First = true; for (unsigned i = 0; i < 8; ++i) { bool isNonZero = (NonZeros & (1 << i)) != 0; if (isNonZero) { if (First) { if (NumZero) V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); else V = DAG.getUNDEF(MVT::v8i16); First = false; } V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, Op.getOperand(i), DAG.getIntPtrConstant(i)); } } return V; } /// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32. static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, SelectionDAG &DAG, const X86Subtarget *Subtarget, const TargetLowering &TLI) { // We know there's at least one non-zero element unsigned FirstNonZeroIdx = 0; SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx); while (FirstNonZero.getOpcode() == ISD::UNDEF || X86::isZeroNode(FirstNonZero)) { ++FirstNonZeroIdx; FirstNonZero = Op->getOperand(FirstNonZeroIdx); } if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT || !isa
(FirstNonZero.getOperand(1))) return SDValue(); SDValue V = FirstNonZero.getOperand(0); MVT VVT = V.getSimpleValueType(); if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32)) return SDValue(); unsigned FirstNonZeroDst = cast
(FirstNonZero.getOperand(1))->getZExtValue(); unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx; unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx; unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst; for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) { SDValue Elem = Op.getOperand(Idx); if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem)) continue; // TODO: What else can be here? Deal with it. if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); // TODO: Some optimizations are still possible here // ex: Getting one element from a vector, and the rest from another. if (Elem.getOperand(0) != V) return SDValue(); unsigned Dst = cast
(Elem.getOperand(1))->getZExtValue(); if (Dst == Idx) ++CorrectIdx; else if (IncorrectIdx == -1U) { IncorrectIdx = Idx; IncorrectDst = Dst; } else // There was already one element with an incorrect index. // We can't optimize this case to an insertps. return SDValue(); } if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) { SDLoc dl(Op); EVT VT = Op.getSimpleValueType(); unsigned ElementMoveMask = 0; if (IncorrectIdx == -1U) ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4; else ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4; SDValue InsertpsMask = DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf)); return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask); } return SDValue(); } /// getVShift - Return a vector logical shift node. /// static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits, SelectionDAG &DAG, const TargetLowering &TLI, SDLoc dl) { assert(VT.is128BitVector() && "Unknown type for VShift"); EVT ShVT = MVT::v2i64; unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ; SrcOp = DAG.getNode(ISD::BITCAST, dl, ShVT, SrcOp); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getNode(Opc, dl, ShVT, SrcOp, DAG.getConstant(NumBits, TLI.getScalarShiftAmountTy(SrcOp.getValueType())))); } static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, SDLoc dl, SelectionDAG &DAG) { // Check if the scalar load can be widened into a vector load. And if // the address is "base + cst" see if the cst can be "absorbed" into // the shuffle mask. if (LoadSDNode *LD = dyn_cast
(SrcOp)) { SDValue Ptr = LD->getBasePtr(); if (!ISD::isNormalLoad(LD) || LD->isVolatile()) return SDValue(); EVT PVT = LD->getValueType(0); if (PVT != MVT::i32 && PVT != MVT::f32) return SDValue(); int FI = -1; int64_t Offset = 0; if (FrameIndexSDNode *FINode = dyn_cast
(Ptr)) { FI = FINode->getIndex(); Offset = 0; } else if (DAG.isBaseWithConstantOffset(Ptr) && isa
(Ptr.getOperand(0))) { FI = cast
(Ptr.getOperand(0))->getIndex(); Offset = Ptr.getConstantOperandVal(1); Ptr = Ptr.getOperand(0); } else { return SDValue(); } // FIXME: 256-bit vector instructions don't require a strict alignment, // improve this code to support it better. unsigned RequiredAlign = VT.getSizeInBits()/8; SDValue Chain = LD->getChain(); // Make sure the stack object alignment is at least 16 or 32. MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) { if (MFI->isFixedObjectIndex(FI)) { // Can't change the alignment. FIXME: It's possible to compute // the exact stack offset and reference FI + adjust offset instead. // If someone *really* cares about this. That's the way to implement it. return SDValue(); } else { MFI->setObjectAlignment(FI, RequiredAlign); } } // (Offset % 16 or 32) must be multiple of 4. Then address is then // Ptr + (Offset & ~15). if (Offset < 0) return SDValue(); if ((Offset % RequiredAlign) & 3) return SDValue(); int64_t StartOffset = Offset & ~(RequiredAlign-1); if (StartOffset) Ptr = DAG.getNode(ISD::ADD, SDLoc(Ptr), Ptr.getValueType(), Ptr,DAG.getConstant(StartOffset, Ptr.getValueType())); int EltNo = (Offset - StartOffset) >> 2; unsigned NumElems = VT.getVectorNumElements(); EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems); SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(StartOffset), false, false, false, 0); SmallVector
Mask; for (unsigned i = 0; i != NumElems; ++i) Mask.push_back(EltNo); return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), &Mask[0]); } return SDValue(); } /// EltsFromConsecutiveLoads - Given the initializing elements 'Elts' of a /// vector of type 'VT', see if the elements can be replaced by a single large /// load which has the same value as a build_vector whose operands are 'elts'. /// /// Example:
-> zextload a /// /// FIXME: we'd also like to handle the case where the last elements are zero /// rather than undef via VZEXT_LOAD, but we do not detect that case today. /// There's even a handy isZeroNode for that purpose. static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl
&Elts, SDLoc &DL, SelectionDAG &DAG, bool isAfterLegalize) { EVT EltVT = VT.getVectorElementType(); unsigned NumElems = Elts.size(); LoadSDNode *LDBase = nullptr; unsigned LastLoadedElt = -1U; // For each element in the initializer, see if we've found a load or an undef. // If we don't find an initial load element, or later load elements are // non-consecutive, bail out. for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Elts[i]; if (!Elt.getNode() || (Elt.getOpcode() != ISD::UNDEF && !ISD::isNON_EXTLoad(Elt.getNode()))) return SDValue(); if (!LDBase) { if (Elt.getNode()->getOpcode() == ISD::UNDEF) return SDValue(); LDBase = cast
(Elt.getNode()); LastLoadedElt = i; continue; } if (Elt.getOpcode() == ISD::UNDEF) continue; LoadSDNode *LD = cast
(Elt); if (!DAG.isConsecutiveLoad(LD, LDBase, EltVT.getSizeInBits()/8, i)) return SDValue(); LastLoadedElt = i; } // If we have found an entire vector of loads and undefs, then return a large // load of the entire vector width starting at the base pointer. If we found // consecutive loads for the low half, generate a vzext_load node. if (LastLoadedElt == NumElems - 1) { if (isAfterLegalize && !DAG.getTargetLoweringInfo().isOperationLegal(ISD::LOAD, VT)) return SDValue(); SDValue NewLd = SDValue(); if (DAG.InferPtrAlignment(LDBase->getBasePtr()) >= 16) NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), 0); NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->isVolatile(), LDBase->isNonTemporal(), LDBase->isInvariant(), LDBase->getAlignment()); if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(NewLd.getNode(), 1)); } return NewLd; } if (NumElems == 4 && LastLoadedElt == 1 && DAG.getTargetLoweringInfo().isTypeLegal(MVT::v2i64)) { SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other); SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() }; SDValue ResNode = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64, LDBase->getPointerInfo(), LDBase->getAlignment(), false/*isVolatile*/, true/*ReadMem*/, false/*WriteMem*/); // Make sure the newly-created LOAD is in the same position as LDBase in // terms of dependency. We create a TokenFactor for LDBase and ResNode, and // update uses of LDBase's output chain to use the TokenFactor. 
if (LDBase->hasAnyUseOfValue(1)) { SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain); DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1), SDValue(ResNode.getNode(), 1)); } return DAG.getNode(ISD::BITCAST, DL, VT, ResNode); } return SDValue(); } /// LowerVectorBroadcast - Attempt to use the vbroadcast instruction /// to generate a splat value for the following cases: /// 1. A splat BUILD_VECTOR which uses a single scalar load, or a constant. /// 2. A splat shuffle which uses a scalar_to_vector node which comes from /// a scalar load, or a constant. /// The VBROADCAST node is returned when a pattern is found, /// or SDValue() otherwise. static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget, SelectionDAG &DAG) { if (!Subtarget->hasFp256()) return SDValue(); MVT VT = Op.getSimpleValueType(); SDLoc dl(Op); assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) && "Unsupported vector type for broadcast."); SDValue Ld; bool ConstSplatVal; switch (Op.getOpcode()) { default: // Unknown pattern found. return SDValue(); case ISD::BUILD_VECTOR: { auto *BVOp = cast
(Op.getNode()); BitVector UndefElements; SDValue Splat = BVOp->getSplatValue(&UndefElements); // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Splat || (VT.getVectorNumElements() - UndefElements.count()) <= 1) return SDValue(); Ld = Splat; ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // Make sure that all of the users of a non-constant load are from the // BUILD_VECTOR node. if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode())) return SDValue(); break; } case ISD::VECTOR_SHUFFLE: { ShuffleVectorSDNode *SVOp = cast
(Op); // Shuffles must have a splat mask where the first element is // broadcasted. if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) return SDValue(); SDValue Sc = Op.getOperand(0); if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR && Sc.getOpcode() != ISD::BUILD_VECTOR) { if (!Subtarget->hasInt256()) return SDValue(); // Use the register form of the broadcast instruction available on AVX2. if (VT.getSizeInBits() >= 256) Sc = Extract128BitVector(Sc, 0, DAG, dl); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Sc); } Ld = Sc.getOperand(0); ConstSplatVal = (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP); // The scalar_to_vector node and the suspected // load node must have exactly one user. // Constants may have multiple users. // AVX-512 has register version of the broadcast bool hasRegVer = Subtarget->hasAVX512() && VT.is512BitVector() && Ld.getValueType().getSizeInBits() >= 32; if (!ConstSplatVal && ((!Sc.hasOneUse() || !Ld.hasOneUse()) && !hasRegVer)) return SDValue(); break; } } bool IsGE256 = (VT.getSizeInBits() >= 256); // Handle the broadcasting a single constant scalar from the constant pool // into a vector. On Sandybridge it is still better to load a constant vector // from the constant pool and not to broadcast it from a scalar. if (ConstSplatVal && Subtarget->hasInt256()) { EVT CVT = Ld.getValueType(); assert(!CVT.isVector() && "Must not broadcast a vector type"); unsigned ScalarSize = CVT.getSizeInBits(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) { const Constant *C = nullptr; if (ConstantSDNode *CI = dyn_cast
(Ld)) C = CI->getConstantIntValue(); else if (ConstantFPSDNode *CF = dyn_cast
(Ld)) C = CF->getConstantFPValue(); assert(C && "Invalid constant type"); const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue CP = DAG.getConstantPool(C, TLI.getPointerTy()); unsigned Alignment = cast
(CP)->getAlignment(); Ld = DAG.getLoad(CVT, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, Alignment); return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } } bool IsLoad = ISD::isNormalLoad(Ld.getNode()); unsigned ScalarSize = Ld.getValueType().getSizeInBits(); // Handle AVX2 in-register broadcasts. if (!IsLoad && Subtarget->hasInt256() && (ScalarSize == 32 || (IsGE256 && ScalarSize == 64))) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The scalar source must be a normal load. if (!IsLoad) return SDValue(); if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); // The integer check is needed for the 64-bit into 128-bit so it doesn't match // double since there is no vbroadcastsd xmm if (Subtarget->hasInt256() && Ld.getValueType().isInteger()) { if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64) return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld); } // Unsupported broadcast. return SDValue(); } /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real /// underlying vector and index. /// /// Modifies \p ExtractedFromVec to the real vector and returns the real /// index. static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, SDValue ExtIdx) { int Idx = cast
(ExtIdx)->getZExtValue(); if (!isa
(ExtractedFromVec)) return Idx; // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: // (extract_vector_elt (v8f32 %vreg1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> // (extract_subvector (v8f32 %vreg0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index // is 2, as specified by the shuffle. ShuffleVectorSDNode *SVOp = cast
(ExtractedFromVec); SDValue ShuffleVec = SVOp->getOperand(0); MVT ShuffleVecVT = ShuffleVec.getSimpleValueType(); assert(ShuffleVecVT.getVectorElementType() == ExtractedFromVec.getSimpleValueType().getVectorElementType()); int ShuffleIdx = SVOp->getMaskElt(Idx); if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) { ExtractedFromVec = ShuffleVec; return ShuffleIdx; } return Idx; } static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) { MVT VT = Op.getSimpleValueType(); // Skip if insert_vec_elt is not supported. const TargetLowering &TLI = DAG.getTargetLoweringInfo(); if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT)) return SDValue(); SDLoc DL(Op); unsigned NumElems = Op.getNumOperands(); SDValue VecIn1; SDValue VecIn2; SmallVector
InsertIndices; SmallVector
Mask(NumElems, -1); for (unsigned i = 0; i != NumElems; ++i) { unsigned Opc = Op.getOperand(i).getOpcode(); if (Opc == ISD::UNDEF) continue; if (Opc != ISD::EXTRACT_VECTOR_ELT) { // Quit if more than 1 elements need inserting. if (InsertIndices.size() > 1) return SDValue(); InsertIndices.push_back(i); continue; } SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0); SDValue ExtIdx = Op.getOperand(i).getOperand(1); // Quit if non-constant index. if (!isa
(ExtIdx)) return SDValue(); int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx); // Quit if extracted from vector of different type. if (ExtractedFromVec.getValueType() != VT) return SDValue(); if (!VecIn1.getNode()) VecIn1 = ExtractedFromVec; else if (VecIn1 != ExtractedFromVec) { if (!VecIn2.getNode()) VecIn2 = ExtractedFromVec; else if (VecIn2 != ExtractedFromVec) // Quit if more than 2 vectors to shuffle return SDValue(); } if (ExtractedFromVec == VecIn1) Mask[i] = Idx; else if (ExtractedFromVec == VecIn2) Mask[i] = Idx + NumElems; } if (!VecIn1.getNode()) return SDValue(); VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT); SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, &Mask[0]); for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) { unsigned Idx = InsertIndices[i]; NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx), DAG.getIntPtrConstant(Idx)); } return NV; } // Lower BUILD_VECTOR operation for v8i1 and v16i1 types. SDValue X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); assert((VT.getVectorElementType() == MVT::i1) && (VT.getSizeInBits() <= 16) && "Unexpected type in LowerBUILD_VECTORvXi1!"); SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(0, MVT::i1); SmallVector
Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } if (ISD::isBuildVectorAllOnes(Op.getNode())) { SDValue Cst = DAG.getTargetConstant(1, MVT::i1); SmallVector
Ops(VT.getVectorNumElements(), Cst); return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops); } bool AllContants = true; uint64_t Immediate = 0; int NonConstIdx = -1; bool IsSplat = true; unsigned NumNonConsts = 0; unsigned NumConsts = 0; for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) { SDValue In = Op.getOperand(idx); if (In.getOpcode() == ISD::UNDEF) continue; if (!isa
(In)) { AllContants = false; NonConstIdx = idx; NumNonConsts++; } else { NumConsts++; if (cast
(In)->getZExtValue()) Immediate |= (1ULL << idx); } if (In != Op.getOperand(0)) IsSplat = false; } if (AllContants) { SDValue FullMask = DAG.getNode(ISD::BITCAST, dl, MVT::v16i1, DAG.getConstant(Immediate, MVT::i16)); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, FullMask, DAG.getIntPtrConstant(0)); } if (NumNonConsts == 1 && NonConstIdx != 0) { SDValue DstVec; if (NumConsts) { SDValue VecAsImm = DAG.getConstant(Immediate, MVT::getIntegerVT(VT.getSizeInBits())); DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm); } else DstVec = DAG.getUNDEF(VT); return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec, Op.getOperand(NonConstIdx), DAG.getIntPtrConstant(NonConstIdx)); } if (!IsSplat && (NonConstIdx != 0)) llvm_unreachable("Unsupported BUILD_VECTOR operation"); MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8; SDValue Select; if (IsSplat) Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), DAG.getConstant(-1, SelectVT), DAG.getConstant(0, SelectVT)); else Select = DAG.getNode(ISD::SELECT, dl, SelectVT, Op.getOperand(0), DAG.getConstant((Immediate | 1), SelectVT), DAG.getConstant(Immediate, SelectVT)); return DAG.getNode(ISD::BITCAST, dl, VT, Select); } /// \brief Return true if \p N implements a horizontal binop and return the /// operands for the horizontal binop into V0 and V1. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function checks that the build_vector \p N in input implements a /// horizontal operation. Parameter \p Opcode defines the kind of horizontal /// operation to match. /// For example, if \p Opcode is equal to ISD::ADD, then this function /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode /// is equal to ISD::SUB, then this function checks if this is a horizontal /// arithmetic sub. /// /// This function only analyzes elements of \p N whose indices are /// in range [BaseIdx, LastIdx). 
static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode, SelectionDAG &DAG, unsigned BaseIdx, unsigned LastIdx, SDValue &V0, SDValue &V1) { EVT VT = N->getValueType(0); assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!"); assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx && "Invalid Vector in input!"); bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD); bool CanFold = true; unsigned ExpectedVExtractIdx = BaseIdx; unsigned NumElts = LastIdx - BaseIdx; V0 = DAG.getUNDEF(VT); V1 = DAG.getUNDEF(VT); // Check if N implements a horizontal binop. for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) { SDValue Op = N->getOperand(i + BaseIdx); // Skip UNDEFs. if (Op->getOpcode() == ISD::UNDEF) { // Update the expected vector extract index. if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; ExpectedVExtractIdx += 2; continue; } CanFold = Op->getOpcode() == Opcode && Op->hasOneUse(); if (!CanFold) break; SDValue Op0 = Op.getOperand(0); SDValue Op1 = Op.getOperand(1); // Try to match the following pattern: // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1)) CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT && Op0.getOperand(0) == Op1.getOperand(0) && isa
(Op0.getOperand(1)) && isa
(Op1.getOperand(1))); if (!CanFold) break; unsigned I0 = cast
(Op0.getOperand(1))->getZExtValue(); unsigned I1 = cast
(Op1.getOperand(1))->getZExtValue(); if (i * 2 < NumElts) { if (V0.getOpcode() == ISD::UNDEF) V0 = Op0.getOperand(0); } else { if (V1.getOpcode() == ISD::UNDEF) V1 = Op0.getOperand(0); if (i * 2 == NumElts) ExpectedVExtractIdx = BaseIdx; } SDValue Expected = (i * 2 < NumElts) ? V0 : V1; if (I0 == ExpectedVExtractIdx) CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected; else if (IsCommutable && I1 == ExpectedVExtractIdx) { // Try to match the following dag sequence: // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I)) CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected; } else CanFold = false; ExpectedVExtractIdx += 2; } return CanFold; } /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by /// a concat_vector. /// /// This is a helper function of PerformBUILD_VECTORCombine. /// This function expects two 256-bit vectors called V0 and V1. /// At first, each vector is split into two separate 128-bit vectors. /// Then, the resulting 128-bit vectors are used to implement two /// horizontal binary operations. /// /// The kind of horizontal binary operation is defined by \p X86Opcode. /// /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to /// the two new horizontal binop. /// When Mode is set, the first horizontal binop dag node would take as input /// the lower 128-bit of V0 and the upper 128-bit of V0. The second /// horizontal binop dag node would take as input the lower 128-bit of V1 /// and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V0_HI /// HADD V1_LO, V1_HI /// /// Otherwise, the first horizontal binop dag node takes as input the lower /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop /// dag node takes the the upper 128-bit of V0 and the upper 128-bit of V1. /// Example: /// HADD V0_LO, V1_LO /// HADD V0_HI, V1_HI /// /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower /// 128-bits of the result. 
If \p isUndefHI is set, then UNDEF is propagated to /// the upper 128-bits of the result. static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, SDLoc DL, SelectionDAG &DAG, unsigned X86Opcode, bool Mode, bool isUndefLO, bool isUndefHI) { EVT VT = V0.getValueType(); assert(VT.is256BitVector() && VT == V1.getValueType() && "Invalid nodes in input!"); unsigned NumElts = VT.getVectorNumElements(); SDValue V0_LO = Extract128BitVector(V0, 0, DAG, DL); SDValue V0_HI = Extract128BitVector(V0, NumElts/2, DAG, DL); SDValue V1_LO = Extract128BitVector(V1, 0, DAG, DL); SDValue V1_HI = Extract128BitVector(V1, NumElts/2, DAG, DL); EVT NewVT = V0_LO.getValueType(); SDValue LO = DAG.getUNDEF(NewVT); SDValue HI = DAG.getUNDEF(NewVT); if (Mode) { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && V0->getOpcode() != ISD::UNDEF) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI); if (!isUndefHI && V1->getOpcode() != ISD::UNDEF) HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI); } else { // Don't emit a horizontal binop if the result is expected to be UNDEF. if (!isUndefLO && (V0_LO->getOpcode() != ISD::UNDEF || V1_LO->getOpcode() != ISD::UNDEF)) LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO); if (!isUndefHI && (V0_HI->getOpcode() != ISD::UNDEF || V1_HI->getOpcode() != ISD::UNDEF)) HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI); } return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI); } /// \brief Try to fold a build_vector that performs an 'addsub' into the /// sequence of 'vadd + vsub + blendi'. 
static SDValue matchAddSub(const BuildVectorSDNode *BV,
                           SelectionDAG &DAG, const X86Subtarget *Subtarget) {
  SDLoc DL(BV);
  EVT VT = BV->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  SDValue InVec0 = DAG.getUNDEF(VT);
  SDValue InVec1 = DAG.getUNDEF(VT);

  assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v2f64) && "build_vector with an invalid type found!");

  // Don't try to emit a VSELECT that cannot be lowered into a blend.
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
    return SDValue();

  // Odd-numbered elements in the input build vector are obtained from
  // adding two integer/float elements.
  // Even-numbered elements in the input build vector are obtained from
  // subtracting two integer/float elements.
  unsigned ExpectedOpcode = ISD::FSUB;
  unsigned NextExpectedOpcode = ISD::FADD;
  bool AddFound = false;
  bool SubFound = false;

  for (unsigned i = 0, e = NumElts; i != e; i++) {
    SDValue Op = BV->getOperand(i);

    // Skip 'undef' values.
    unsigned Opcode = Op.getOpcode();
    if (Opcode == ISD::UNDEF) {
      // Keep the expected opcode in step with the element parity.
      std::swap(ExpectedOpcode, NextExpectedOpcode);
      continue;
    }

    // Early exit if we found an unexpected opcode.
    if (Opcode != ExpectedOpcode)
      return SDValue();

    SDValue Op0 = Op.getOperand(0);
    SDValue Op1 = Op.getOperand(1);

    // Try to match the following pattern:
    // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
    // Early exit if we cannot match that sequence.
    if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
        !isa<ConstantSDNode>(Op0.getOperand(1)) ||
        !isa<ConstantSDNode>(Op1.getOperand(1)) ||
        Op0.getOperand(1) != Op1.getOperand(1))
      return SDValue();

    unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
    if (I0 != i)
      return SDValue();

    // We found a valid add/sub node. Update the information accordingly.
    if (i & 1)
      AddFound = true;
    else
      SubFound = true;

    // Update InVec0 and InVec1.
    if (InVec0.getOpcode() == ISD::UNDEF)
      InVec0 = Op0.getOperand(0);
    if (InVec1.getOpcode() == ISD::UNDEF)
      InVec1 = Op1.getOperand(0);

    // Make sure that operands in input to each add/sub node always
    // come from a same pair of vectors.
    if (InVec0 != Op0.getOperand(0)) {
      if (ExpectedOpcode == ISD::FSUB)
        return SDValue();

      // FADD is commutable. Try to commute the operands
      // and then test again.
      std::swap(Op0, Op1);
      if (InVec0 != Op0.getOperand(0))
        return SDValue();
    }

    if (InVec1 != Op1.getOperand(0))
      return SDValue();

    // Update the pair of expected opcodes.
    std::swap(ExpectedOpcode, NextExpectedOpcode);
  }

  // Don't try to fold this build_vector into a VSELECT if it has
  // too many UNDEF operands.
  if (AddFound && SubFound && InVec0.getOpcode() != ISD::UNDEF &&
      InVec1.getOpcode() != ISD::UNDEF) {
    // Emit a sequence of vector add and sub followed by a VSELECT.
    // The new VSELECT will be lowered into a BLENDI.
    // At ISel stage, we pattern-match the sequence 'add + sub + BLENDI'
    // and emit a single ADDSUB instruction.
    SDValue Sub = DAG.getNode(ExpectedOpcode, DL, VT, InVec0, InVec1);
    SDValue Add = DAG.getNode(NextExpectedOpcode, DL, VT, InVec0, InVec1);

    // Construct the VSELECT mask.
    EVT MaskVT = VT.changeVectorElementTypeToInteger();
    EVT SVT = MaskVT.getVectorElementType();
    unsigned SVTBits = SVT.getSizeInBits();
    SmallVector<SDValue, 8> Ops;

    // Even lanes select the subtraction (all-ones), odd lanes the addition.
    for (unsigned i = 0, e = NumElts; i != e; ++i) {
      APInt Value = i & 1 ? APInt::getNullValue(SVTBits) :
                            APInt::getAllOnesValue(SVTBits);
      SDValue Constant = DAG.getConstant(Value, SVT);
      Ops.push_back(Constant);
    }

    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVT, Ops);
    return DAG.getSelect(DL, VT, Mask, Sub, Add);
  }

  return SDValue();
}

/// \brief Try to combine a BUILD_VECTOR node into an 'addsub' select (see
/// matchAddSub) or into horizontal add/sub nodes.
///
/// For 256-bit types without a single matching instruction, the node is
/// expanded into two 128-bit horizontal binops via ExpandHorizontalBinOp.
/// \returns the replacement value, or an empty SDValue if nothing matched.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget *Subtarget) {
  SDLoc DL(N);
  EVT VT = N->getValueType(0);
  unsigned NumElts = VT.getVectorNumElements();
  BuildVectorSDNode *BV = cast<BuildVectorSDNode>(N);
  SDValue InVec0, InVec1;

  // Try to match an ADDSUB.
  if ((Subtarget->hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
      (Subtarget->hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))) {
    SDValue Value = matchAddSub(BV, DAG, Subtarget);
    if (Value.getNode())
      return Value;
  }

  // Try to match horizontal ADD/SUB.
  unsigned NumUndefsLO = 0;
  unsigned NumUndefsHI = 0;
  unsigned Half = NumElts/2;

  // Count the number of UNDEF operands in the build_vector in input.
  for (unsigned i = 0, e = Half; i != e; ++i)
    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
      NumUndefsLO++;

  for (unsigned i = Half, e = NumElts; i != e; ++i)
    if (BV->getOperand(i)->getOpcode() == ISD::UNDEF)
      NumUndefsHI++;

  // Early exit if this is either a build_vector of all UNDEFs or all the
  // operands but one are UNDEF.
  if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
    return SDValue();

  if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget->hasSSE3()) {
    // Try to match an SSE3 float HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget->hasSSSE3()) {
    // Try to match an SSSE3 integer HADD/HSUB.
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
  }

  if (!Subtarget->hasAVX())
    return SDValue();

  if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
    // Try to match an AVX horizontal add/sub of packed single/double
    // precision floating point values from 256-bit vectors.
    SDValue InVec2, InVec3;
    if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);

    if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
  } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
    // Try to match an AVX2 horizontal add/sub of signed integers.
    SDValue InVec2, InVec3;
    unsigned X86Opcode;
    bool CanFold = true;

    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
        isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
        ((InVec0.getOpcode() == ISD::UNDEF ||
          InVec2.getOpcode() == ISD::UNDEF) || InVec0 == InVec2) &&
        ((InVec1.getOpcode() == ISD::UNDEF ||
          InVec3.getOpcode() == ISD::UNDEF) || InVec1 == InVec3))
      X86Opcode = X86ISD::HSUB;
    else
      CanFold = false;

    if (CanFold) {
      // Fold this build_vector into a single horizontal add/sub.
      // Do this only if the target has AVX2.
      if (Subtarget->hasAVX2())
        return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);

      // Do not try to expand this build_vector into a pair of horizontal
      // add/sub if we can emit a pair of scalar add/sub.
      if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
        return SDValue();

      // Convert this build_vector into a pair of horizontal binop followed by
      // a concat vector.
      bool isUndefLO = NumUndefsLO == Half;
      bool isUndefHI = NumUndefsHI == Half;
      return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
                                   isUndefLO, isUndefHI);
    }
  }

  if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
       VT == MVT::v16i16) && Subtarget->hasAVX()) {
    unsigned X86Opcode;
    if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HADD;
    else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::HSUB;
    else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHADD;
    else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
      X86Opcode = X86ISD::FHSUB;
    else
      return SDValue();

    // Don't try to expand this build_vector into a pair of horizontal add/sub
    // if we can simply emit a pair of scalar add/sub.
    if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
      return SDValue();

    // Convert this build_vector into two horizontal add/sub followed by
    // a concat vector.
    bool isUndefLO = NumUndefsLO == Half;
    bool isUndefHI = NumUndefsHI == Half;
    return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
                                 isUndefLO, isUndefHI);
  }

  return SDValue();
}

/// \brief Custom lowering for BUILD_VECTOR nodes.
///
/// Works through a series of special cases (predicate vectors, all-zeros and
/// all-ones vectors, broadcasts, a single non-zero element, splats, wide AVX
/// vectors, and finally element-by-element construction for 128-bit vectors).
/// \returns the lowered value, \p Op itself when it is already in the
/// canonical all-zeros/all-ones form, or an empty SDValue when none of the
/// cases handled here apply.
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  SDLoc dl(Op);

  MVT VT = Op.getSimpleValueType();
  MVT ExtVT = VT.getVectorElementType();
  unsigned NumElems = Op.getNumOperands();

  // Generate vectors for predicate vectors.
  if (VT.getScalarType() == MVT::i1 && Subtarget->hasAVX512())
    return LowerBUILD_VECTORvXi1(Op, DAG);

  // Vectors containing all zeros can be matched by pxor and xorps later
  if (ISD::isBuildVectorAllZeros(Op.getNode())) {
    // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
    // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
    if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
      return Op;

    return getZeroVector(VT, Subtarget, DAG, dl);
  }

  // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
  // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
  // vpcmpeqd on 256-bit vectors.
  if (Subtarget->hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
    if (VT == MVT::v4i32 || (VT == MVT::v8i32 && Subtarget->hasInt256()))
      return Op;

    if (!VT.is512BitVector())
      return getOnesVector(VT, Subtarget->hasInt256(), DAG, dl);
  }

  SDValue Broadcast = LowerVectorBroadcast(Op, Subtarget, DAG);
  if (Broadcast.getNode())
    return Broadcast;

  unsigned EVTBits = ExtVT.getSizeInBits();

  unsigned NumZero  = 0;
  unsigned NumNonZero = 0;
  unsigned NonZeros = 0;
  bool IsAllConstants = true;
  SmallSet<SDValue, 8> Values;
  for (unsigned i = 0; i < NumElems; ++i) {
    SDValue Elt = Op.getOperand(i);
    if (Elt.getOpcode() == ISD::UNDEF)
      continue;
    Values.insert(Elt);
    if (Elt.getOpcode() != ISD::Constant &&
        Elt.getOpcode() != ISD::ConstantFP)
      IsAllConstants = false;
    if (X86::isZeroNode(Elt))
      NumZero++;
    else {
      // Use an unsigned literal: shifting a signed 1 into bit 31 overflows.
      NonZeros |= (1u << i);
      NumNonZero++;
    }
  }

  // All undef vector. Return an UNDEF.  All zero vectors were handled above.
  if (NumNonZero == 0)
    return DAG.getUNDEF(VT);

  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits of
    // the value are obviously zero, truncate the value to i32 and do the
    // insertion that way.  Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0.  It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget->is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        EVT VecVT = MVT::v4i32;
        unsigned VecElts = 4;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);

        // Now we have our 32-bit value zero extended in the low element of
        // a vector.  If Idx != 0, swizzle it into place.
        if (Idx != 0) {
          SmallVector<int, 4> Mask;
          Mask.push_back(Idx);
          for (unsigned i = 1; i != VecElts; ++i)
            Mask.push_back(i);
          Item = DAG.getVectorShuffle(VecVT, dl, Item, DAG.getUNDEF(VecVT),
                                      &Mask[0]);
        }
        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements.  This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget->is64Bit())) {
        if (VT.is256BitVector() || VT.is512BitVector()) {
          SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
          return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
                             Item, DAG.getIntPtrConstant(0));
        }
        assert(VT.is128BitVector() && "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
        if (VT.is256BitVector()) {
          SDValue ZeroVec = getZeroVector(MVT::v8i32, Subtarget, DAG, dl);
          Item = Insert128BitVector(ZeroVec, Item, 0, DAG, dl);
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getNode(ISD::BITCAST, dl, VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the element
    // is a non-constant being inserted into an element other than the low one,
    // we can't use a constant pool load.  Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      // Turn it into a shuffle of zero and zero-extended scalar to vector.
      Item = getShuffleVectorZeroOrUndef(Item, 0, NumZero > 0, Subtarget, DAG);
      SmallVector<int, 8> MaskVec;
      for (unsigned i = 0; i != NumElems; ++i)
        MaskVec.push_back(i == Idx ? 0 : 1);
      return DAG.getVectorShuffle(VT, dl, Item, DAG.getUNDEF(VT), &MaskVec[0]);
    }
  }

  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }

  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> V;
    for (unsigned i = 0; i != NumElems; ++i)
      V.push_back(Op.getOperand(i));

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
                                makeArrayRef(&V[0], NumElems/2));
    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
                                makeArrayRef(&V[NumElems / 2], NumElems/2));

    // Recreate the wider vector with the lower and upper part.
    if (VT.is256BitVector())
      return Concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    return Concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }

  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                                 Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }

  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16) {
    SDValue V = LowerBuildVectorv16i8(Op, NonZeros,NumNonZero,NumZero, DAG,
                                        Subtarget, *this);
    if (V.getNode()) return V;
  }

  if (EVTBits == 16 && NumElems == 8) {
    SDValue V = LowerBuildVectorv8i16(Op, NonZeros,NumNonZero,NumZero, DAG,
                                      Subtarget, *this);
    if (V.getNode()) return V;
  }

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4) {
    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
                                      NumZero, DAG, Subtarget, *this);
    if (V.getNode())
      return V;
  }

  // If element VT is == 32 bits, turn it into a number of shuffles.
  SmallVector<SDValue, 8> V(NumElems);
  if (NumElems == 4 && NumZero > 0) {
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1 << i));
      if (isZero)
        V[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          V[i] = V[i*2];  // Must be a zero vector.
          break;
        case 1:
          V[i] = getMOVL(DAG, dl, VT, V[i*2+1], V[i*2]);
          break;
        case 2:
          V[i] = getMOVL(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
        case 3:
          V[i] = getUnpackl(DAG, dl, VT, V[i*2], V[i*2+1]);
          break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, V[0], V[1], &MaskVec[0]);
  }

  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector of consecutive loads.
    for (unsigned i = 0; i < NumElems; ++i)
      V[i] = Op.getOperand(i);

    // Check for elements which are consecutive loads.
    SDValue LD = EltsFromConsecutiveLoads(VT, V, dl, DAG, false);
    if (LD.getNode())
      return LD;

    // Check for a build vector from mostly shuffle plus few inserting.
    SDValue Sh = buildFromShuffleMostly(Op, DAG);
    if (Sh.getNode())
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (getSubtarget()->hasSSE41()) {
      SDValue Result;
      if (Op.getOperand(0).getOpcode() != ISD::UNDEF)
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).getOpcode() == ISD::UNDEF) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in the
    // bottom slot of the vector (which generates no code for SSE).
    for (unsigned i = 0; i < NumElems; ++i) {
      if (Op.getOperand(i).getOpcode() != ISD::UNDEF)
        V[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        V[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
    //         : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
    //   Step 2: unpcklps X, Y ==>    <3, 2, 1, 0>
    unsigned EltStride = NumElems >> 1;
    while (EltStride != 0) {
      for (unsigned i = 0; i < EltStride; ++i) {
        // If V[i+EltStride] is undef and this is the first round of mixing,
        // then it is safe to just drop this shuffle: V[i] is already in the
        // right place, the one element (since it's the first round) being
        // inserted as undef can be dropped.  This isn't safe for successive
        // rounds because they will permute elements within both vectors.
        if (V[i+EltStride].getOpcode() == ISD::UNDEF &&
            EltStride == NumElems/2)
          continue;

        V[i] = getUnpackl(DAG, dl, VT, V[i], V[i + EltStride]);
      }
      EltStride >>= 1;
    }
    return V[0];
  }
  return SDValue();
}

// LowerAVXCONCAT_VECTORS - 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
// A 512-bit result is built either from two operands with
// Concat256BitVectors, or from four operands by first concatenating them
// pairwise with Concat128BitVectors.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if(ResVT.is256BitVector())
    return Concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getScalarType(),
                                ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return Concat256BitVectors(
      Concat128BitVectors(V1, V2, HalfVT, NumElems/2, DAG, dl),
      Concat128BitVectors(V3, V4, HalfVT, NumElems/2, DAG, dl), ResVT,
      NumElems, DAG, dl);
  }
  return Concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}

// LowerCONCAT_VECTORS - Checks the operand count against the result width
// (debug builds only) and forwards to LowerAVXCONCAT_VECTORS.
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  MVT LLVM_ATTRIBUTE_UNUSED VT = Op.getSimpleValueType();
  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
          Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}


//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86.
It is // designed to handle arbitrary vector shuffles and blends, gracefully // degrading performance as necessary. It works hard to recognize idiomatic // shuffles and lower them to optimal instruction patterns without leaving // a framework that allows reasonably efficient handling of all vector shuffle // patterns. //===----------------------------------------------------------------------===// /// \brief Tiny helper function to identify a no-op mask. /// /// This is a somewhat boring predicate function. It checks whether the mask /// array input, which is assumed to be a single-input shuffle mask of the kind /// used by the X86 shuffle instructions (not a fully general /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an /// in-place shuffle are 'no-op's. static bool isNoopShuffleMask(ArrayRef
Mask) { for (int i = 0, Size = Mask.size(); i < Size; ++i) if (Mask[i] != -1 && Mask[i] != i) return false; return true; } /// \brief Helper function to classify a mask as a single-input mask. /// /// This isn't a generic single-input test because in the vector shuffle /// lowering we canonicalize single inputs to be the first input operand. This /// means we can more quickly test for a single input by only checking whether /// an input from the second operand exists. We also assume that the size of /// mask corresponds to the size of the input vectors which isn't true in the /// fully general case. static bool isSingleInputShuffleMask(ArrayRef
Mask) { for (int M : Mask) if (M >= (int)Mask.size()) return false; return true; } /// \brief Get a 4-lane 8-bit shuffle immediate for a mask. /// /// This helper function produces an 8-bit shuffle immediate corresponding to /// the ubiquitous shuffle encoding scheme used in x86 instructions for /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for /// example. /// /// NB: We rely heavily on "undef" masks preserving the input lane. static SDValue getV4X86ShuffleImm8ForMask(ArrayRef
Mask, SelectionDAG &DAG) { assert(Mask.size() == 4 && "Only 4-lane shuffle masks"); assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!"); assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!"); assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!"); assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!"); unsigned Imm = 0; Imm |= (Mask[0] == -1 ? 0 : Mask[0]) << 0; Imm |= (Mask[1] == -1 ? 1 : Mask[1]) << 2; Imm |= (Mask[2] == -1 ? 2 : Mask[2]) << 4; Imm |= (Mask[3] == -1 ? 3 : Mask[3]) << 6; return DAG.getConstant(Imm, MVT::i8); } /// \brief Handle lowering of 2-lane 64-bit floating point shuffles. /// /// This is the basis function for the 2-lane 64-bit shuffles as we have full /// support for floating point shuffles but not integer shuffles. These /// instructions will incur a domain crossing penalty on some chips though so /// it is better to avoid lowering through this for integer vectors where /// possible. static SDValue lowerV2F64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2f64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Straight shuffle of a single input vector. Simulate this by using the // single input as both of the "inputs" to this instruction.. unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V1, DAG.getConstant(SHUFPDMask, MVT::i8)); } assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!"); assert(Mask[1] >= 2 && "Non-canonicalized blend!"); unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1); return DAG.getNode(X86ISD::SHUFP, SDLoc(Op), MVT::v2f64, V1, V2, DAG.getConstant(SHUFPDMask, MVT::i8)); } /// \brief Handle lowering of 2-lane 64-bit integer shuffles. /// /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by /// the integer unit to minimize domain crossing penalties. However, for blends /// it falls back to the floating point shuffle operation with appropriate bit /// casting. static SDValue lowerV2I64VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v2i64 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!"); if (isSingleInputShuffleMask(Mask)) { // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. // We have to map the mask as it is actually a v4i32 shuffle instruction. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V1); int WidenedMask[4] = { std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1, std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1}; return DAG.getNode( ISD::BITCAST, DL, MVT::v2i64, DAG.getNode(X86ISD::PSHUFD, SDLoc(Op), MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(WidenedMask, DAG))); } // We implement this with SHUFPD which is pretty lame because it will likely // incur 2 cycles of stall for integer vectors on Nehalem and older chips. // However, all the alternatives are still more cycles and newer chips don't // have this problem. It would be really nice if x86 had better shuffles here. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V1); V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, V2); return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask)); } /// \brief Lower 4-lane 32-bit floating point shuffles. /// /// Uses instructions exclusively from the floating point unit to minimize /// domain crossing penalties, as these are sufficient to implement all v4f32 /// shuffles. static SDValue lowerV4F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4f32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); SDValue LowV = V1, HighV = V2; int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]}; int NumV2Elements = std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }); if (NumV2Elements == 0) // Straight shuffle of a single input vector. We pass the input vector to // both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); if (NumV2Elements == 1) { int V2Index = std::find_if(Mask.begin(), Mask.end(), [](int M) { return M >= 4; }) - Mask.begin(); // Compute the index adjacent to V2Index and in the same half by toggling // the low bit. int V2AdjIndex = V2Index ^ 1; if (Mask[V2AdjIndex] == -1) { // Handles all the cases where we have a single V2 element and an undef. // This will only ever happen in the high lanes because we commute the // vector otherwise. if (V2Index < 2) std::swap(LowV, HighV); NewMask[V2Index] -= 4; } else { // Handle the case where the V2 element ends up adjacent to a V1 element. // To make this work, blend them together as the first step. int V1Index = V2AdjIndex; int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0}; V2 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V2, V1, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now proceed to reconstruct the final blend as we have the necessary // high or low half formed. if (V2Index < 2) { LowV = V2; HighV = V1; } else { HighV = V2; } NewMask[V1Index] = 2; // We put the V1 element in V2[2]. NewMask[V2Index] = 0; // We shifted the V2 element into V2[0]. } } else if (NumV2Elements == 2) { if (Mask[0] < 4 && Mask[1] < 4) { // Handle the easy case where we have V1 in the low lanes and V2 in the // high lanes. We never see this reversed because we sort the shuffle. NewMask[2] -= 4; NewMask[3] -= 4; } else { // We have a mixture of V1 and V2 in both low and high lanes. 
Rather than // trying to place elements directly, just blend them and set up the final // shuffle to place them. // The first two blend mask elements are for V1, the second two are for // V2. int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1], Mask[2] < 4 ? Mask[2] : Mask[3], (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4, (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4}; V1 = DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V2, getV4X86ShuffleImm8ForMask(BlendMask, DAG)); // Now we do a normal shuffle of V1 by giving V1 as both operands to // a blend. LowV = HighV = V1; NewMask[0] = Mask[0] < 4 ? 0 : 2; NewMask[1] = Mask[0] < 4 ? 2 : 0; NewMask[2] = Mask[2] < 4 ? 1 : 3; NewMask[3] = Mask[2] < 4 ? 3 : 1; } } return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, LowV, HighV, getV4X86ShuffleImm8ForMask(NewMask, DAG)); } /// \brief Lower 4-lane i32 vector shuffles. /// /// We try to handle these with integer-domain shuffles where we can, but for /// blends we use the floating point domain blend instructions. static SDValue lowerV4I32VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v4i32 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!"); if (isSingleInputShuffleMask(Mask)) // Straight shuffle of a single input vector. For everything from SSE2 // onward this has a single fast instruction with no scary immediates. return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1, getV4X86ShuffleImm8ForMask(Mask, DAG)); // We implement this with SHUFPS because it can blend from two vectors. // Because we're going to eventually use SHUFPS, we use SHUFPS even to build // up the inputs, bypassing domain shift penalties that we would encur if we // directly used PSHUFD on Nehalem and older. For newer chips, this isn't // relevant. return DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, DAG.getVectorShuffle( MVT::v4f32, DL, DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V1), DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, V2), Mask)); } /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2 /// shuffle lowering, and the most complex part. /// /// The lowering strategy is to try to form pairs of input lanes which are /// targeted at the same half of the final vector, and then use a dword shuffle /// to place them onto the right half, and finally unpack the paired lanes into /// their final position. /// /// The exact breakdown of how to form these dword pairs and align them on the /// correct sides is really tricky. See the comments within the function for /// more of the details. static SDValue lowerV8I16SingleInputVectorShuffle( SDLoc DL, SDValue V, MutableArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(V.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); MutableArrayRef
LoMask = Mask.slice(0, 4); MutableArrayRef
HiMask = Mask.slice(4, 4); SmallVector
LoInputs; std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector
HiInputs; std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 0; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); int NumLToL = std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin(); int NumHToL = LoInputs.size() - NumLToL; int NumLToH = std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin(); int NumHToH = HiInputs.size() - NumLToH; MutableArrayRef
LToLInputs(LoInputs.data(), NumLToL); MutableArrayRef
LToHInputs(HiInputs.data(), NumLToH); MutableArrayRef
HToLInputs(LoInputs.data() + NumLToL, NumHToL); MutableArrayRef
HToHInputs(HiInputs.data() + NumLToH, NumHToH); // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all // such inputs we can swap two of the dwords across the half mark and end up // with <=2 inputs to each half in each half. Once there, we can fall through // to the generic code below. For example: // // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h] // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5] // // Before we had 3-1 in the low half and 3-1 in the high half. Afterward, 2-2 // and 2-2. auto balanceSides = [&](ArrayRef
ThreeInputs, int OneInput, int ThreeInputHalfSum, int OneInputHalfOffset) { // Compute the index of dword with only one word among the three inputs in // a half by taking the sum of the half with three inputs and subtracting // the sum of the actual three inputs. The difference is the remaining // slot. int DWordA = (ThreeInputHalfSum - std::accumulate(ThreeInputs.begin(), ThreeInputs.end(), 0)) / 2; int DWordB = OneInputHalfOffset / 2 + (OneInput / 2 + 1) % 2; int PSHUFDMask[] = {0, 1, 2, 3}; PSHUFDMask[DWordA] = DWordB; PSHUFDMask[DWordB] = DWordA; V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // Adjust the mask to match the new locations of A and B. for (int &M : Mask) if (M != -1 && M/2 == DWordA) M = 2 * DWordB + M % 2; else if (M != -1 && M/2 == DWordB) M = 2 * DWordA + M % 2; // Recurse back into this routine to re-compute state now that this isn't // a 3 and 1 problem. return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), Mask); }; if (NumLToL == 3 && NumHToL == 1) return balanceSides(LToLInputs, HToLInputs[0], 0 + 1 + 2 + 3, 4); else if (NumLToL == 1 && NumHToL == 3) return balanceSides(HToLInputs, LToLInputs[0], 4 + 5 + 6 + 7, 0); else if (NumLToH == 1 && NumHToH == 3) return balanceSides(HToHInputs, LToHInputs[0], 4 + 5 + 6 + 7, 0); else if (NumLToH == 3 && NumHToH == 1) return balanceSides(LToHInputs, HToHInputs[0], 0 + 1 + 2 + 3, 4); // At this point there are at most two inputs to the low and high halves from // each half. That means the inputs can always be grouped into dwords and // those dwords can then be moved to the correct half with a dword shuffle. // We use at most one low and one high word shuffle to collect these paired // inputs into dwords, and finally a dword shuffle to place them. 
int PSHUFLMask[4] = {-1, -1, -1, -1}; int PSHUFHMask[4] = {-1, -1, -1, -1}; int PSHUFDMask[4] = {-1, -1, -1, -1}; // First fix the masks for all the inputs that are staying in their // original halves. This will then dictate the targets of the cross-half // shuffles. auto fixInPlaceInputs = [&PSHUFDMask]( ArrayRef
InPlaceInputs, MutableArrayRef
SourceHalfMask, MutableArrayRef
HalfMask, int HalfOffset) { if (InPlaceInputs.empty()) return; if (InPlaceInputs.size() == 1) { SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2; return; } assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!"); SourceHalfMask[InPlaceInputs[0] - HalfOffset] = InPlaceInputs[0] - HalfOffset; // Put the second input next to the first so that they are packed into // a dword. We find the adjacent index by toggling the low bit. int AdjIndex = InPlaceInputs[0] ^ 1; SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset; std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex); PSHUFDMask[AdjIndex / 2] = AdjIndex / 2; }; if (!HToLInputs.empty()) fixInPlaceInputs(LToLInputs, PSHUFLMask, LoMask, 0); if (!LToHInputs.empty()) fixInPlaceInputs(HToHInputs, PSHUFHMask, HiMask, 4); // Now gather the cross-half inputs and place them into a free dword of // their target half. // FIXME: This operation could almost certainly be simplified dramatically to // look more like the 3-1 fixing operation. auto moveInputsToRightHalf = [&PSHUFDMask]( MutableArrayRef
IncomingInputs, ArrayRef
ExistingInputs, MutableArrayRef
SourceHalfMask, MutableArrayRef
HalfMask, int SourceOffset, int DestOffset) { auto isWordClobbered = [](ArrayRef
SourceHalfMask, int Word) { return SourceHalfMask[Word] != -1 && SourceHalfMask[Word] != Word; }; auto isDWordClobbered = [&isWordClobbered](ArrayRef
SourceHalfMask, int Word) { int LowWord = Word & ~1; int HighWord = Word | 1; return isWordClobbered(SourceHalfMask, LowWord) || isWordClobbered(SourceHalfMask, HighWord); }; if (IncomingInputs.empty()) return; if (ExistingInputs.empty()) { // Map any dwords with inputs from them into the right half. for (int Input : IncomingInputs) { // If the source half mask maps over the inputs, turn those into // swaps and use the swapped lane. if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) { if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == -1) { SourceHalfMask[SourceHalfMask[Input - SourceOffset]] = Input - SourceOffset; // We have to swap the uses in our half mask in one sweep. for (int &M : HalfMask) if (M == SourceHalfMask[Input - SourceOffset]) M = Input; else if (M == Input) M = SourceHalfMask[Input - SourceOffset] + SourceOffset; } else { assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] == Input - SourceOffset && "Previous placement doesn't match!"); } // Note that this correctly re-maps both when we do a swap and when // we observe the other side of the swap above. We rely on that to // avoid swapping the members of the input list directly. Input = SourceHalfMask[Input - SourceOffset] + SourceOffset; } // Map the input's dword into the correct half. if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == -1) PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2; else assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] == Input / 2 && "Previous placement doesn't match!"); } // And just directly shift any other-half mask elements to be same-half // as we will have mirrored the dword containing the element into the // same position within that half. for (int &M : HalfMask) if (M >= SourceOffset && M < SourceOffset + 4) { M = M - SourceOffset + DestOffset; assert(M >= 0 && "This should never wrap below zero!"); } return; } // Ensure we have the input in a viable dword of its current half. 
This // is particularly tricky because the original position may be clobbered // by inputs being moved and *staying* in that half. if (IncomingInputs.size() == 1) { if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int InputFixed = std::find(std::begin(SourceHalfMask), std::end(SourceHalfMask), -1) - std::begin(SourceHalfMask) + SourceOffset; SourceHalfMask[InputFixed - SourceOffset] = IncomingInputs[0] - SourceOffset; std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0], InputFixed); IncomingInputs[0] = InputFixed; } } else if (IncomingInputs.size() == 2) { if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 || isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) { int SourceDWordBase = !isDWordClobbered(SourceHalfMask, 0) ? 0 : 2; assert(!isDWordClobbered(SourceHalfMask, SourceDWordBase) && "Not all dwords can be clobbered!"); SourceHalfMask[SourceDWordBase] = IncomingInputs[0] - SourceOffset; SourceHalfMask[SourceDWordBase + 1] = IncomingInputs[1] - SourceOffset; for (int &M : HalfMask) if (M == IncomingInputs[0]) M = SourceDWordBase + SourceOffset; else if (M == IncomingInputs[1]) M = SourceDWordBase + 1 + SourceOffset; IncomingInputs[0] = SourceDWordBase + SourceOffset; IncomingInputs[1] = SourceDWordBase + 1 + SourceOffset; } } else { llvm_unreachable("Unhandled input size!"); } // Now hoist the DWord down to the right half. int FreeDWord = (PSHUFDMask[DestOffset / 2] == -1 ? 
0 : 1) + DestOffset / 2; assert(PSHUFDMask[FreeDWord] == -1 && "DWord not free"); PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2; for (int Input : IncomingInputs) std::replace(HalfMask.begin(), HalfMask.end(), Input, FreeDWord * 2 + Input % 2); }; moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, /*SourceOffset*/ 4, /*DestOffset*/ 0); moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, /*SourceOffset*/ 0, /*DestOffset*/ 4); // Now enact all the shuffles we've computed to move the inputs into their // target half. if (!isNoopShuffleMask(PSHUFLMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFLMask, DAG)); if (!isNoopShuffleMask(PSHUFHMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(PSHUFHMask, DAG)); if (!isNoopShuffleMask(PSHUFDMask)) V = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, DAG.getNode(ISD::BITCAST, DL, MVT::v4i32, V), getV4X86ShuffleImm8ForMask(PSHUFDMask, DAG))); // At this point, each half should contain all its inputs, and we can then // just shuffle them into their final position. assert(std::count_if(LoMask.begin(), LoMask.end(), [](int M) { return M >= 4; }) == 0 && "Failed to lift all the high half inputs to the low mask!"); assert(std::count_if(HiMask.begin(), HiMask.end(), [](int M) { return M >= 0 && M < 4; }) == 0 && "Failed to lift all the low half inputs to the high mask!"); // Do a half shuffle for the low mask. if (!isNoopShuffleMask(LoMask)) V = DAG.getNode(X86ISD::PSHUFLW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(LoMask, DAG)); // Do a half shuffle with the high mask after shifting its values down. for (int &M : HiMask) if (M >= 0) M -= 4; if (!isNoopShuffleMask(HiMask)) V = DAG.getNode(X86ISD::PSHUFHW, DL, MVT::v8i16, V, getV4X86ShuffleImm8ForMask(HiMask, DAG)); return V; } /// \brief Detect whether the mask pattern should be lowered through /// interleaving. 
/// /// This essentially tests whether viewing the mask as an interleaving of two /// sub-sequences reduces the cross-input traffic of a blend operation. If so, /// lowering it through interleaving is a significantly better strategy. static bool shouldLowerAsInterleaving(ArrayRef
Mask) { int NumEvenInputs[2] = {0, 0}; int NumOddInputs[2] = {0, 0}; int NumLoInputs[2] = {0, 0}; int NumHiInputs[2] = {0, 0}; for (int i = 0, Size = Mask.size(); i < Size; ++i) { if (Mask[i] < 0) continue; int InputIdx = Mask[i] >= Size; if (i < Size / 2) ++NumLoInputs[InputIdx]; else ++NumHiInputs[InputIdx]; if ((i % 2) == 0) ++NumEvenInputs[InputIdx]; else ++NumOddInputs[InputIdx]; } // The minimum number of cross-input results for both the interleaved and // split cases. If interleaving results in fewer cross-input results, return // true. int InterleavedCrosses = std::min(NumEvenInputs[1] + NumOddInputs[0], NumEvenInputs[0] + NumOddInputs[1]); int SplitCrosses = std::min(NumLoInputs[1] + NumHiInputs[0], NumLoInputs[0] + NumHiInputs[1]); return InterleavedCrosses < SplitCrosses; } /// \brief Blend two v8i16 vectors using a naive unpack strategy. /// /// This strategy only works when the inputs from each vector fit into a single /// half of that vector, and generally there are not so many inputs as to leave /// the in-place shuffles required highly constrained (and thus expensive). It /// shifts all the inputs into a single side of both input vectors and then /// uses an unpack to interleave these inputs in a single vector. At that /// point, we will fall back on the generic single input shuffle lowering. static SDValue lowerV8I16BasicBlendVectorShuffle(SDLoc DL, SDValue V1, SDValue V2, MutableArrayRef
Mask, const X86Subtarget *Subtarget, SelectionDAG &DAG) { assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad input type!"); SmallVector
LoV1Inputs, HiV1Inputs, LoV2Inputs, HiV2Inputs; for (int i = 0; i < 8; ++i) if (Mask[i] >= 0 && Mask[i] < 4) LoV1Inputs.push_back(i); else if (Mask[i] >= 4 && Mask[i] < 8) HiV1Inputs.push_back(i); else if (Mask[i] >= 8 && Mask[i] < 12) LoV2Inputs.push_back(i); else if (Mask[i] >= 12) HiV2Inputs.push_back(i); int NumV1Inputs = LoV1Inputs.size() + HiV1Inputs.size(); int NumV2Inputs = LoV2Inputs.size() + HiV2Inputs.size(); (void)NumV1Inputs; (void)NumV2Inputs; assert(NumV1Inputs > 0 && NumV1Inputs <= 3 && "At most 3 inputs supported"); assert(NumV2Inputs > 0 && NumV2Inputs <= 3 && "At most 3 inputs supported"); assert(NumV1Inputs + NumV2Inputs <= 4 && "At most 4 combined inputs"); bool MergeFromLo = LoV1Inputs.size() + LoV2Inputs.size() >= HiV1Inputs.size() + HiV2Inputs.size(); auto moveInputsToHalf = [&](SDValue V, ArrayRef
LoInputs, ArrayRef
HiInputs, bool MoveToLo, int MaskOffset) { ArrayRef
GoodInputs = MoveToLo ? LoInputs : HiInputs; ArrayRef
BadInputs = MoveToLo ? HiInputs : LoInputs; if (BadInputs.empty()) return V; int MoveMask[] = {-1, -1, -1, -1, -1, -1, -1, -1}; int MoveOffset = MoveToLo ? 0 : 4; if (GoodInputs.empty()) { for (int BadInput : BadInputs) { MoveMask[Mask[BadInput] % 4 + MoveOffset] = Mask[BadInput] - MaskOffset; Mask[BadInput] = Mask[BadInput] % 4 + MoveOffset + MaskOffset; } } else { if (GoodInputs.size() == 2) { // If the low inputs are spread across two dwords, pack them into // a single dword. MoveMask[Mask[GoodInputs[0]] % 2 + MoveOffset] = Mask[GoodInputs[0]] - MaskOffset; MoveMask[Mask[GoodInputs[1]] % 2 + MoveOffset] = Mask[GoodInputs[1]] - MaskOffset; Mask[GoodInputs[0]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; Mask[GoodInputs[1]] = Mask[GoodInputs[0]] % 2 + MoveOffset + MaskOffset; } else { // Otherwise pin the low inputs. for (int GoodInput : GoodInputs) MoveMask[Mask[GoodInput] - MaskOffset] = Mask[GoodInput] - MaskOffset; } int MoveMaskIdx = std::find(std::begin(MoveMask) + MoveOffset, std::end(MoveMask), -1) - std::begin(MoveMask); assert(MoveMaskIdx >= MoveOffset && "Established above"); if (BadInputs.size() == 2) { assert(MoveMask[MoveMaskIdx] == -1 && "Expected empty slot"); assert(MoveMask[MoveMaskIdx + 1] == -1 && "Expected empty slot"); MoveMask[MoveMaskIdx + Mask[BadInputs[0]] % 2] = Mask[BadInputs[0]] - MaskOffset; MoveMask[MoveMaskIdx + Mask[BadInputs[1]] % 2] = Mask[BadInputs[1]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + Mask[BadInputs[0]] % 2 + MaskOffset; Mask[BadInputs[1]] = MoveMaskIdx + Mask[BadInputs[1]] % 2 + MaskOffset; } else { assert(BadInputs.size() == 1 && "All sizes handled"); MoveMask[MoveMaskIdx] = Mask[BadInputs[0]] - MaskOffset; Mask[BadInputs[0]] = MoveMaskIdx + MaskOffset; } } return DAG.getVectorShuffle(MVT::v8i16, DL, V, DAG.getUNDEF(MVT::v8i16), MoveMask); }; V1 = moveInputsToHalf(V1, LoV1Inputs, HiV1Inputs, MergeFromLo, /*MaskOffset*/ 0); V2 = moveInputsToHalf(V2, LoV2Inputs, HiV2Inputs, MergeFromLo, /*MaskOffset*/ 8); 
// FIXME: Select an interleaving of the merge of V1 and V2 that minimizes // cross-half traffic in the final shuffle. // Munge the mask to be a single-input mask after the unpack merges the // results. for (int &M : Mask) if (M != -1) M = 2 * (M % 4) + (M / 8); return DAG.getVectorShuffle( MVT::v8i16, DL, DAG.getNode(MergeFromLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v8i16, V1, V2), DAG.getUNDEF(MVT::v8i16), Mask); } /// \brief Generic lowering of 8-lane i16 shuffles. /// /// This handles both single-input shuffles and combined shuffle/blends with /// two inputs. The single input shuffles are immediately delegated to /// a dedicated lowering routine. /// /// The blends are lowered in one of three fundamental ways. If there are few /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle /// of the input is significantly cheaper when lowered as an interleaving of /// the two inputs, try to interleave them. Otherwise, blend the low and high /// halves of the inputs separately (making them have relatively few inputs) /// and then concatenate them. static SDValue lowerV8I16VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v8i16 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
OrigMask = SVOp->getMask(); int MaskStorage[8] = {OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7]}; MutableArrayRef
Mask(MaskStorage); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); auto isV1 = [](int M) { return M >= 0 && M < 8; }; auto isV2 = [](int M) { return M >= 8; }; int NumV1Inputs = std::count_if(Mask.begin(), Mask.end(), isV1); int NumV2Inputs = std::count_if(Mask.begin(), Mask.end(), isV2); if (NumV2Inputs == 0) return lowerV8I16SingleInputVectorShuffle(DL, V1, Mask, Subtarget, DAG); assert(NumV1Inputs > 0 && "All single-input shuffles should be canonicalized " "to be V1-input shuffles."); if (NumV1Inputs + NumV2Inputs <= 4) return lowerV8I16BasicBlendVectorShuffle(DL, V1, V2, Mask, Subtarget, DAG); // Check whether an interleaving lowering is likely to be more efficient. // This isn't perfect but it is a strong heuristic that tends to work well on // the kinds of shuffles that show up in practice. // // FIXME: Handle 1x, 2x, and 4x interleaving. if (shouldLowerAsInterleaving(Mask)) { // FIXME: Figure out whether we should pack these into the low or high // halves. int EMask[8], OMask[8]; for (int i = 0; i < 4; ++i) { EMask[i] = Mask[2*i]; OMask[i] = Mask[2*i + 1]; EMask[i + 4] = -1; OMask[i + 4] = -1; } SDValue Evens = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, OMask); return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v8i16, Evens, Odds); } int LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 4; ++i) { LoBlendMask[i] = Mask[i]; HiBlendMask[i] = Mask[i + 4]; } SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); LoV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, LoV); HiV = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, HiV); return DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, LoV, HiV)); } /// \brief Generic lowering of v16i8 shuffles. 
/// /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to /// detect any complexity reducing interleaving. If that doesn't help, it uses /// UNPCK to spread the i8 elements across two i16-element vectors, and uses /// the existing lowering for v8i16 blends on each half, finally PACK-ing them /// back together. static SDValue lowerV16I8VectorShuffle(SDValue Op, SDValue V1, SDValue V2, const X86Subtarget *Subtarget, SelectionDAG &DAG) { SDLoc DL(Op); assert(Op.getSimpleValueType() == MVT::v16i8 && "Bad shuffle type!"); assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!"); ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
OrigMask = SVOp->getMask(); assert(OrigMask.size() == 16 && "Unexpected mask size for v16 shuffle!"); int MaskStorage[16] = { OrigMask[0], OrigMask[1], OrigMask[2], OrigMask[3], OrigMask[4], OrigMask[5], OrigMask[6], OrigMask[7], OrigMask[8], OrigMask[9], OrigMask[10], OrigMask[11], OrigMask[12], OrigMask[13], OrigMask[14], OrigMask[15]}; MutableArrayRef
Mask(MaskStorage); MutableArrayRef
LoMask = Mask.slice(0, 8); MutableArrayRef
HiMask = Mask.slice(8, 8); // For single-input shuffles, there are some nicer lowering tricks we can use. if (isSingleInputShuffleMask(Mask)) { // Check whether we can widen this to an i16 shuffle by duplicating bytes. // Notably, this handles splat and partial-splat shuffles more efficiently. // However, it only makes sense if the pre-duplication shuffle simplifies // things significantly. Currently, this means we need to be able to // express the pre-duplication shuffle as an i16 shuffle. // // FIXME: We should check for other patterns which can be widened into an // i16 shuffle as well. auto canWidenViaDuplication = [](ArrayRef
Mask) { for (int i = 0; i < 16; i += 2) { if (Mask[i] != Mask[i + 1]) return false; } return true; }; auto tryToWidenViaDuplication = [&]() -> SDValue { if (!canWidenViaDuplication(Mask)) return SDValue(); SmallVector
LoInputs; std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs), [](int M) { return M >= 0 && M < 8; }); std::sort(LoInputs.begin(), LoInputs.end()); LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end()); SmallVector
HiInputs; std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs), [](int M) { return M >= 8; }); std::sort(HiInputs.begin(), HiInputs.end()); HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end()); bool TargetLo = LoInputs.size() >= HiInputs.size(); ArrayRef
InPlaceInputs = TargetLo ? LoInputs : HiInputs; ArrayRef
MovingInputs = TargetLo ? HiInputs : LoInputs; int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1}; SmallDenseMap
LaneMap; for (int I : InPlaceInputs) { PreDupI16Shuffle[I/2] = I/2; LaneMap[I] = I; } int j = TargetLo ? 0 : 4, je = j + 4; for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) { // Check if j is already a shuffle of this input. This happens when // there are two adjacent bytes after we move the low one. if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) { // If we haven't yet mapped the input, search for a slot into which // we can map it. while (j < je && PreDupI16Shuffle[j] != -1) ++j; if (j == je) // We can't place the inputs into a single half with a simple i16 shuffle, so bail. return SDValue(); // Map this input with the i16 shuffle. PreDupI16Shuffle[j] = MovingInputs[i] / 2; } // Update the lane map based on the mapping we ended up with. LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2; } V1 = DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle)); // Unpack the bytes to form the i16s that will be shuffled into place. V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL, MVT::v16i8, V1, V1); int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; for (int i = 0; i < 16; i += 2) { if (Mask[i] != -1) PostDupI16Shuffle[i / 2] = LaneMap[Mask[i]] - (TargetLo ? 0 : 8); assert(PostDupI16Shuffle[i / 2] < 8 && "Invalid v8 shuffle mask!"); } return DAG.getNode( ISD::BITCAST, DL, MVT::v16i8, DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V1), DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle)); }; if (SDValue V = tryToWidenViaDuplication()) return V; } // Check whether an interleaving lowering is likely to be more efficient. // This isn't perfect but it is a strong heuristic that tends to work well on // the kinds of shuffles that show up in practice. // // FIXME: We need to handle other interleaving widths (i16, i32, ...). 
if (shouldLowerAsInterleaving(Mask)) { // FIXME: Figure out whether we should pack these into the low or high // halves. int EMask[16], OMask[16]; for (int i = 0; i < 8; ++i) { EMask[i] = Mask[2*i]; OMask[i] = Mask[2*i + 1]; EMask[i + 8] = -1; OMask[i + 8] = -1; } SDValue Evens = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, EMask); SDValue Odds = DAG.getVectorShuffle(MVT::v16i8, DL, V1, V2, OMask); return DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, Evens, Odds); } int V1LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V1HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V2LoBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; int V2HiBlendMask[8] = {-1, -1, -1, -1, -1, -1, -1, -1}; auto buildBlendMasks = [](MutableArrayRef
HalfMask, MutableArrayRef
V1HalfBlendMask, MutableArrayRef
V2HalfBlendMask) { for (int i = 0; i < 8; ++i) if (HalfMask[i] >= 0 && HalfMask[i] < 16) { V1HalfBlendMask[i] = HalfMask[i]; HalfMask[i] = i; } else if (HalfMask[i] >= 16) { V2HalfBlendMask[i] = HalfMask[i] - 16; HalfMask[i] = i + 8; } }; buildBlendMasks(LoMask, V1LoBlendMask, V2LoBlendMask); buildBlendMasks(HiMask, V1HiBlendMask, V2HiBlendMask); SDValue Zero = getZeroVector(MVT::v8i16, Subtarget, DAG, DL); auto buildLoAndHiV8s = [&](SDValue V, MutableArrayRef
LoBlendMask, MutableArrayRef
HiBlendMask) { SDValue V1, V2; // Check if any of the odd lanes in the v16i8 are used. If not, we can mask // them out and avoid using UNPCK{L,H} to extract the elements of V as // i16s. if (std::none_of(LoBlendMask.begin(), LoBlendMask.end(), [](int M) { return M >= 0 && M % 2 == 1; }) && std::none_of(HiBlendMask.begin(), HiBlendMask.end(), [](int M) { return M >= 0 && M % 2 == 1; })) { // Use a mask to drop the high bytes. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, V); V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, V1, DAG.getConstant(0x00FF, MVT::v8i16)); // This will be a single vector shuffle instead of a blend so nuke V2. V2 = DAG.getUNDEF(MVT::v8i16); // Squash the masks to point directly into V1. for (int &M : LoBlendMask) if (M >= 0) M /= 2; for (int &M : HiBlendMask) if (M >= 0) M /= 2; } else { // Otherwise just unpack the low half of V into V1 and the high half into // V2 so that we can blend them as i16s. V1 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero)); V2 = DAG.getNode(ISD::BITCAST, DL, MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero)); } SDValue BlendedLo = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, LoBlendMask); SDValue BlendedHi = DAG.getVectorShuffle(MVT::v8i16, DL, V1, V2, HiBlendMask); return std::make_pair(BlendedLo, BlendedHi); }; SDValue V1Lo, V1Hi, V2Lo, V2Hi; std::tie(V1Lo, V1Hi) = buildLoAndHiV8s(V1, V1LoBlendMask, V1HiBlendMask); std::tie(V2Lo, V2Hi) = buildLoAndHiV8s(V2, V2LoBlendMask, V2HiBlendMask); SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Lo, V2Lo, LoMask); SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, V1Hi, V2Hi, HiMask); return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV); } /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles. /// /// This routine breaks down the specific type of 128-bit shuffle and /// dispatches to the lowering routines accordingly. 
static SDValue lower128BitVectorShuffle(SDValue Op, SDValue V1, SDValue V2, MVT VT, const X86Subtarget *Subtarget, SelectionDAG &DAG) { switch (VT.SimpleTy) { case MVT::v2i64: return lowerV2I64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v2f64: return lowerV2F64VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4i32: return lowerV4I32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v4f32: return lowerV4F32VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v8i16: return lowerV8I16VectorShuffle(Op, V1, V2, Subtarget, DAG); case MVT::v16i8: return lowerV16I8VectorShuffle(Op, V1, V2, Subtarget, DAG); default: llvm_unreachable("Unimplemented!"); } } /// \brief Tiny helper function to test whether adjacent masks are sequential. static bool areAdjacentMasksSequential(ArrayRef
Mask) { for (int i = 0, Size = Mask.size(); i < Size; i += 2) if (Mask[i] + 1 != Mask[i+1]) return false; return true; } /// \brief Top-level lowering for x86 vector shuffles. /// /// This handles decomposition, canonicalization, and lowering of all x86 /// vector shuffles. Most of the specific lowering strategies are encapsulated /// above in helper routines. The canonicalization attempts to widen shuffles /// to involve fewer lanes of wider elements, consolidate symmetric patterns /// s.t. only one of the two inputs needs to be tested, etc. static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast
(Op); ArrayRef
Mask = SVOp->getMask(); SDValue V1 = Op.getOperand(0); SDValue V2 = Op.getOperand(1); MVT VT = Op.getSimpleValueType(); int NumElements = VT.getVectorNumElements(); SDLoc dl(Op); assert(VT.getSizeInBits() != 64 && "Can't lower MMX shuffles"); bool V1IsUndef = V1.getOpcode() == ISD::UNDEF; bool V2IsUndef = V2.getOpcode() == ISD::UNDEF; if (V1IsUndef && V2IsUndef) return DAG.getUNDEF(VT); // When we create a shuffle node we put the UNDEF node to second operand, // but in some cases the first operand may be transformed to UNDEF. // In this case we should just commute the node. if (V1IsUndef) return CommuteVectorShuffle(SVOp, DAG); // Check for non-undef masks pointing at an undef vector and make the masks // undef as well. This makes it easier to match the shuffle based solely on // the mask. if (V2IsUndef) for (int M : Mask) if (M >= NumElements) { SmallVector
NewMask(Mask.begin(), Mask.end()); for (int &M : NewMask) if (M >= NumElements) M = -1; return DAG.getVectorShuffle(VT, dl, V1, V2, NewMask); } // For integer vector shuffles, try to collapse them into a shuffle of fewer // lanes but wider integers. We cap this to not form integers larger than i64 // but it might be interesting to form i128 integers to handle flipping the // low and high halves of AVX 256-bit vectors. if (VT.isInteger() && VT.getScalarSizeInBits() < 64 && areAdjacentMasksSequential(Mask)) { SmallVector
NewMask; for (int i = 0, Size = Mask.size(); i < Size; i += 2) NewMask.push_back(Mask[i] / 2); MVT NewVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits() * 2), VT.getVectorNumElements() / 2); V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, V2); return DAG.getNode(ISD::BITCAST, dl, VT, DAG.getVectorShuffle(NewVT, dl, V1, V2, NewMask)); } int NumV1Elements = 0, NumUndefElements = 0, NumV2Elements = 0; for (int M : SVOp->getMask()) if (M < 0) ++NumUndefElements; else if (M < NumElements) ++NumV1Elements; else ++NumV2Elements; // Commute the shuffle as needed such that more elements come from V1 than // V2. This allows us to match the shuffle pattern strictly on how many // elements come from V1 without handling the symmetric cases. if (NumV2Elements > NumV1Elements) return CommuteVectorShuffle(SVOp, DAG); // When the number of V1 and V2 elements are the same, try to minimize the // number of uses of V2 in the low half of the vector. if (NumV1Elements == NumV2Elements) { int LowV1Elements = 0, LowV2Elements = 0; for (int M : SVOp->getMask().slice(0, NumElements / 2)) if (M >= NumElements) ++LowV2Elements; else if (M >= 0) ++LowV1Elements; if (LowV2Elements > LowV1Elements) return CommuteVectorShuffle(SVOp, DAG); } // For each vector width, delegate to a specialized lowering routine. if (VT.getSizeInBits() == 128) return lower128BitVectorShuffle(Op, V1, V2, VT, Subtarget, DAG); llvm_unreachable("Unimplemented!"); } //===----------------------------------------------------------------------===// // Legacy vector shuffle lowering // // This code is the legacy code handling vector shuffles until the above // replaces its functionality and performance. //===----------------------------------------------------------------------===// static bool isBlendMask(ArrayRef
MaskVals, MVT VT, bool hasSSE41, bool hasInt256, unsigned *MaskOut = nullptr) { MVT EltVT = VT.getVectorElementType(); // There is no blend with immediate in AVX-512. if (VT.is512BitVector()) return false; if (!hasSSE41 || EltVT == MVT::i8) return false; if (!hasInt256 && VT == MVT::v16i16) return false; unsigned MaskValue = 0; unsigned NumElems = VT.getVectorNumElements(); // There are 2 lanes if (NumElems > 8), and 1 lane otherwise. unsigned NumLanes = (NumElems - 1) / 8 + 1; unsigned NumElemsInLane = NumElems / NumLanes; // Blend for v16i16 should be symetric for the both lanes. for (unsigned i = 0; i < NumElemsInLane; ++i) { int SndLaneEltIdx = (NumLanes == 2) ? MaskVals[i + NumElemsInLane] : -1; int EltIdx = MaskVals[i]; if ((EltIdx < 0 || EltIdx == (int)i) && (SndLaneEltIdx < 0 || SndLaneEltIdx == (int)(i + NumElemsInLane))) continue; if (((unsigned)EltIdx == (i + NumElems)) && (SndLaneEltIdx < 0 || (unsigned)SndLaneEltIdx == i + NumElems + NumElemsInLane)) MaskValue |= (1 << i); else return false; } if (MaskOut) *MaskOut = MaskValue; return true; } // Try to lower a shuffle node into a simple blend instruction. // This function assumes isBlendMask returns true for this // SuffleVectorSDNode static SDValue LowerVECTOR_SHUFFLEtoBlend(ShuffleVectorSDNode *SVOp, unsigned MaskValue, const X86Subtarget *Subtarget, SelectionDAG &DAG) { MVT VT = SVOp->getSimpleValueType(0); MVT EltVT = VT.getVectorElementType(); assert(isBlendMask(SVOp->getMask(), VT, Subtarget->hasSSE41(), Subtarget->hasInt256() && "Trying to lower a " "VECTOR_SHUFFLE to a Blend but " "with the wrong mask")); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); unsigned NumElems = VT.getVectorNumElements(); // Convert i32 vectors to floating point if it is not AVX2. // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors. 
MVT BlendVT = VT; if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) { BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()), NumElems); V1 = DAG.getNode(ISD::BITCAST, dl, VT, V1); V2 = DAG.getNode(ISD::BITCAST, dl, VT, V2); } SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, V1, V2, DAG.getConstant(MaskValue, MVT::i32)); return DAG.getNode(ISD::BITCAST, dl, VT, Ret); } /// In vector type \p VT, return true if the element at index \p InputIdx /// falls on a different 128-bit lane than \p OutputIdx. static bool ShuffleCrosses128bitLane(MVT VT, unsigned InputIdx, unsigned OutputIdx) { unsigned EltSize = VT.getVectorElementType().getSizeInBits(); return InputIdx * EltSize / 128 != OutputIdx * EltSize / 128; } /// Generate a PSHUFB if possible. Selects elements from \p V1 according to /// \p MaskVals. MaskVals[OutputIdx] = InputIdx specifies that we want to /// shuffle the element at InputIdx in V1 to OutputIdx in the result. If \p /// MaskVals refers to elements outside of \p V1 or is undef (-1), insert a /// zero. static SDValue getPSHUFB(ArrayRef
MaskVals, SDValue V1, SDLoc &dl, SelectionDAG &DAG) { MVT VT = V1.getSimpleValueType(); assert(VT.is128BitVector() || VT.is256BitVector()); MVT EltVT = VT.getVectorElementType(); unsigned EltSizeInBytes = EltVT.getSizeInBits() / 8; unsigned NumElts = VT.getVectorNumElements(); SmallVector
PshufbMask; for (unsigned OutputIdx = 0; OutputIdx < NumElts; ++OutputIdx) { int InputIdx = MaskVals[OutputIdx]; unsigned InputByteIdx; if (InputIdx < 0 || NumElts <= (unsigned)InputIdx) InputByteIdx = 0x80; else { // Cross lane is not allowed. if (ShuffleCrosses128bitLane(VT, InputIdx, OutputIdx)) return SDValue(); InputByteIdx = InputIdx * EltSizeInBytes; // Index is an byte offset within the 128-bit lane. InputByteIdx &= 0xf; } for (unsigned j = 0; j < EltSizeInBytes; ++j) { PshufbMask.push_back(DAG.getConstant(InputByteIdx, MVT::i8)); if (InputByteIdx != 0x80) ++InputByteIdx; } } MVT ShufVT = MVT::getVectorVT(MVT::i8, PshufbMask.size()); if (ShufVT != VT) V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1); return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask)); } // v8i16 shuffles - Prefer shuffles in the following order: // 1. [all] pshuflw, pshufhw, optional move // 2. [ssse3] 1 x pshufb // 3. [ssse3] 2 x pshufb + 1 x por // 4. [all] mov + pshuflw + pshufhw + N x (pextrw + pinsrw) static SDValue LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast
(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); SmallVector
MaskVals; // Determine if more than 1 of the words in each of the low and high quadwords // of the result come from the same quadword of one of the two inputs. Undef // mask values count as coming from any quadword, for better codegen. // // Lo/HiQuad[i] = j indicates how many words from the ith quad of the input // feeds this quad. For i, 0 and 1 refer to V1, 2 and 3 refer to V2. unsigned LoQuad[] = { 0, 0, 0, 0 }; unsigned HiQuad[] = { 0, 0, 0, 0 }; // Indices of quads used. std::bitset<4> InputQuads; for (unsigned i = 0; i < 8; ++i) { unsigned *Quad = i < 4 ? LoQuad : HiQuad; int EltIdx = SVOp->getMaskElt(i); MaskVals.push_back(EltIdx); if (EltIdx < 0) { ++Quad[0]; ++Quad[1]; ++Quad[2]; ++Quad[3]; continue; } ++Quad[EltIdx / 4]; InputQuads.set(EltIdx / 4); } int BestLoQuad = -1; unsigned MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { if (LoQuad[i] > MaxQuad) { BestLoQuad = i; MaxQuad = LoQuad[i]; } } int BestHiQuad = -1; MaxQuad = 1; for (unsigned i = 0; i < 4; ++i) { if (HiQuad[i] > MaxQuad) { BestHiQuad = i; MaxQuad = HiQuad[i]; } } // For SSSE3, If all 8 words of the result come from only 1 quadword of each // of the two input vectors, shuffle them into one input vector so only a // single pshufb instruction is necessary. If there are more than 2 input // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; if (Subtarget->hasSSSE3()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads[0] ? 0 : 1; BestHiQuad = InputQuads[2] ? 2 : 3; } if (InputQuads.count() > 2) { BestLoQuad = -1; BestHiQuad = -1; } } // If BestLoQuad or BestHiQuad are set, shuffle the quads together and update // the shuffle mask. If a quad is scored as -1, that means that it contains // words from all 4 input quadwords. SDValue NewV; if (BestLoQuad >= 0 || BestHiQuad >= 0) { int MaskV[] = { BestLoQuad < 0 ? 0 : BestLoQuad, BestHiQuad < 0 ? 
1 : BestHiQuad }; NewV = DAG.getVectorShuffle(MVT::v2i64, dl, DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V1), DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, V2), &MaskV[0]); NewV = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, NewV); // Rewrite the MaskVals and assign NewV to V1 if NewV now contains all the // source words for the shuffle, to aid later transformations. bool AllWordsInNewV = true; bool InOrder[2] = { true, true }; for (unsigned i = 0; i != 8; ++i) { int idx = MaskVals[i]; if (idx != (int)i) InOrder[i/4] = false; if (idx < 0 || (idx/4) == BestLoQuad || (idx/4) == BestHiQuad) continue; AllWordsInNewV = false; break; } bool pshuflw = AllWordsInNewV, pshufhw = AllWordsInNewV; if (AllWordsInNewV) { for (int i = 0; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) continue; idx = MaskVals[i] = (idx / 4) == BestLoQuad ? (idx & 3) : (idx & 3) + 4; if ((idx != i) && idx < 4) pshufhw = false; if ((idx != i) && idx > 3) pshuflw = false; } V1 = NewV; V2Used = false; BestLoQuad = 0; BestHiQuad = 1; } // If we've eliminated the use of V2, and the new mask is a pshuflw or // pshufhw, that's as cheap as it gets. Return the new shuffle. if ((pshufhw && InOrder[0]) || (pshuflw && InOrder[1])) { unsigned Opc = pshufhw ? X86ISD::PSHUFHW : X86ISD::PSHUFLW; unsigned TargetMask = 0; NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskVals[0]); ShuffleVectorSDNode *SVOp = cast
(NewV.getNode()); TargetMask = pshufhw ? getShufflePSHUFHWImmediate(SVOp): getShufflePSHUFLWImmediate(SVOp); V1 = NewV.getOperand(0); return getTargetShuffleNode(Opc, dl, MVT::v8i16, V1, TargetMask, DAG); } } // Promote splats to a larger type which usually leads to more efficient code. // FIXME: Is this true if pshufb is available? if (SVOp->isSplat()) return PromoteSplat(SVOp, DAG); // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. if (Subtarget->hasSSSE3()) { SmallVector
pshufbMask; // If we have elements from both input vectors, set the high bit of the // shuffle mask element to zero out elements that come from V2 in the V1 // mask, and elements that come from V1 in the V2 mask, so that the two // results can be OR'd together. bool TwoInputs = V1Used && V2Used; V1 = getPSHUFB(MaskVals, V1, dl, DAG); if (!TwoInputs) return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); // Calculate the shuffle mask for the second input, shuffle it, and // OR it with the first shuffled input. CommuteVectorShuffleMask(MaskVals, 8); V2 = getPSHUFB(MaskVals, V2, dl, DAG); V1 = DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); return DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); } // If BestLoQuad >= 0, generate a pshuflw to put the low elements in order, // and update MaskVals with new element order. std::bitset<8> InOrder; if (BestLoQuad >= 0) { int MaskV[] = { -1, -1, -1, -1, 4, 5, 6, 7 }; for (int i = 0; i != 4; ++i) { int idx = MaskVals[i]; if (idx < 0) { InOrder.set(i); } else if ((idx / 4) == BestLoQuad) { MaskV[i] = idx & 3; InOrder.set(i); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast
(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), getShufflePSHUFLWImmediate(SVOp), DAG); } } // If BestHi >= 0, generate a pshufhw to put the high elements in order, // and update MaskVals with the new element order. if (BestHiQuad >= 0) { int MaskV[] = { 0, 1, 2, 3, -1, -1, -1, -1 }; for (unsigned i = 4; i != 8; ++i) { int idx = MaskVals[i]; if (idx < 0) { InOrder.set(i); } else if ((idx / 4) == BestHiQuad) { MaskV[i] = (idx & 3) + 4; InOrder.set(i); } } NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) { ShuffleVectorSDNode *SVOp = cast
(NewV.getNode()); NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), getShufflePSHUFHWImmediate(SVOp), DAG); } } // In case BestHi & BestLo were both -1, which means each quadword has a word // from each of the four input quadwords, calculate the InOrder bitvector now // before falling through to the insert/extract cleanup. if (BestLoQuad == -1 && BestHiQuad == -1) { NewV = V1; for (int i = 0; i != 8; ++i) if (MaskVals[i] < 0 || MaskVals[i] == i) InOrder.set(i); } // The other elements are put in the right place using pextrw and pinsrw. for (unsigned i = 0; i != 8; ++i) { if (InOrder[i]) continue; int EltIdx = MaskVals[i]; if (EltIdx < 0) continue; SDValue ExtOp = (EltIdx < 8) ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V1, DAG.getIntPtrConstant(EltIdx)) : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, V2, DAG.getIntPtrConstant(EltIdx - 8)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, ExtOp, DAG.getIntPtrConstant(i)); } return NewV; } /// \brief v16i16 shuffles /// /// FIXME: We only support generation of a single pshufb currently. We can /// generalize the other applicable cases from LowerVECTOR_SHUFFLEv8i16 as /// well (e.g 2 x pshufb + 1 x por). static SDValue LowerVECTOR_SHUFFLEv16i16(SDValue Op, SelectionDAG &DAG) { ShuffleVectorSDNode *SVOp = cast
(Op); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); if (V2.getOpcode() != ISD::UNDEF) return SDValue(); SmallVector
MaskVals(SVOp->getMask().begin(), SVOp->getMask().end()); return getPSHUFB(MaskVals, V1, dl, DAG); } // v16i8 shuffles - Prefer shuffles in the following order: // 1. [ssse3] 1 x pshufb // 2. [ssse3] 2 x pshufb + 1 x por // 3. [all] v8i16 shuffle + N x pextrw + rotate + pinsrw static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, const X86Subtarget* Subtarget, SelectionDAG &DAG) { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); SDValue V1 = SVOp->getOperand(0); SDValue V2 = SVOp->getOperand(1); SDLoc dl(SVOp); ArrayRef
MaskVals = SVOp->getMask(); // Promote splats to a larger type which usually leads to more efficient code. // FIXME: Is this true if pshufb is available? if (SVOp->isSplat()) return PromoteSplat(SVOp, DAG); // If we have SSSE3, case 1 is generated when all result bytes come from // one of the inputs. Otherwise, case 2 is generated. If no SSSE3 is // present, fall back to case 3. // If SSSE3, use 1 pshufb instruction per vector with elements in the result. if (Subtarget->hasSSSE3()) { SmallVector
pshufbMask; // If all result elements are from one input vector, then only translate // undef mask values to 0x80 (zero out result) in the pshufb mask. // // Otherwise, we have elements from both input vectors, and must zero out // elements that come from V2 in the first mask, and V1 in the second mask // so that we can OR them together. for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; if (EltIdx < 0 || EltIdx >= 16) EltIdx = 0x80; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, pshufbMask)); // As PSHUFB will zero elements with negative indices, it's safe to ignore // the 2nd operand if it's undefined or zero. if (V2.getOpcode() == ISD::UNDEF || ISD::isBuildVectorAllZeros(V2.getNode())) return V1; // Calculate the shuffle mask for the second input, shuffle it, and // OR it with the first shuffled input. pshufbMask.clear(); for (unsigned i = 0; i != 16; ++i) { int EltIdx = MaskVals[i]; EltIdx = (EltIdx < 16) ? 0x80 : EltIdx - 16; pshufbMask.push_back(DAG.getConstant(EltIdx, MVT::i8)); } V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2, DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8, pshufbMask)); return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2); } // No SSSE3 - Calculate in place words and then fix all out of place words // With 0-16 extracts & inserts. Worst case is 16 bytes out of order from // the 16 different words that comprise the two doublequadword input vectors. V1 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V1); V2 = DAG.getNode(ISD::BITCAST, dl, MVT::v8i16, V2); SDValue NewV = V1; for (int i = 0; i != 8; ++i) { int Elt0 = MaskVals[i*2]; int Elt1 = MaskVals[i*2+1]; // This word of the result is all undef, skip it. if (Elt0 < 0 && Elt1 < 0) continue; // This word of the result is already in the correct place, skip it. if ((Elt0 == i*2) && (Elt1 == i*2+1)) continue; SDValue Elt0Src = Elt0 < 16 ? 
V1 : V2; SDValue Elt1Src = Elt1 < 16 ? V1 : V2; SDValue InsElt; // If Elt0 and Elt1 are defined, are consecutive, and can be load // using a single extract together, load it and store it. if ((Elt0 >= 0) && ((Elt0 + 1) == Elt1) && ((Elt0 & 1) == 0)) { InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, DAG.getIntPtrConstant(Elt1 / 2)); NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, DAG.getIntPtrConstant(i)); continue; } // If Elt1 is defined, extract it from the appropriate source. If the // source byte is not also odd, shift the extracted word left 8 bits // otherwise clear the bottom 8 bits if we need to do an or. if (Elt1 >= 0) { InsElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt1Src, DAG.getIntPtrConstant(Elt1 / 2)); if ((Elt1 & 1) == 0) InsElt = DAG.getNode(ISD::SHL, dl, MVT::i16, InsElt, DAG.getConstant(8, TLI.getShiftAmountTy(InsElt.getValueType()))); else if (Elt0 >= 0) InsElt = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt, DAG.getConstant(0xFF00, MVT::i16)); } // If Elt0 is defined, extract it from the appropriate source. If the // source byte is not also even, shift the extracted word right 8 bits. If // Elt1 was also defined, OR the extracted values together before // inserting them in the result. if (Elt0 >= 0) { SDValue InsElt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Elt0Src, DAG.getIntPtrConstant(Elt0 / 2)); if ((Elt0 & 1) != 0) InsElt0 = DAG.getNode(ISD::SRL, dl, MVT::i16, InsElt0, DAG.getConstant(8, TLI.getShiftAmountTy(InsElt0.getValueType()))); else if (Elt1 >= 0) InsElt0 = DAG.getNode(ISD::AND, dl, MVT::i16, InsElt0, DAG.getConstant(0x00FF, MVT::i16)); InsElt = Elt1 >= 0 ? DAG.getNode(ISD::OR, dl, MVT::i16, InsElt, InsElt0) : InsElt0; } NewV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, NewV, InsElt, DAG.getIntPtrConstant(i)); } return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, NewV); } // v32i8 shuffles - Translate to VPSHUFB if possible. 
static SDValue LowerVECTOR_SHUFFLEv32i8(ShuffleVectorSDNode *SVOp,
                                        const X86Subtarget *Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  SmallVector<int, 32> MaskVals(SVOp->getMask().begin(), SVOp->getMask().end());

  bool V2IsUndef = V2.getOpcode() == ISD::UNDEF;
  bool V1IsAllZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsAllZero = ISD::isBuildVectorAllZeros(V2.getNode());

  // VPSHUFB may be generated if
  // (1) one of input vector is undefined or zeroinitializer.
  // The mask value 0x80 puts 0 in the corresponding slot of the vector.
  // And (2) the mask indexes don't cross the 128-bit lane.
  if (VT != MVT::v32i8 || !Subtarget->hasInt256() ||
      (!V2IsUndef && !V2IsAllZero && !V1IsAllZero))
    return SDValue();

  // When V1 is the all-zero input, swap roles so the live vector is the one
  // handed to getPSHUFB.
  if (V1IsAllZero && !V2IsAllZero) {
    CommuteVectorShuffleMask(MaskVals, 32);
    V1 = V2;
  }
  return getPSHUFB(MaskVals, V1, dl, DAG);
}

/// RewriteAsNarrowerShuffle - Try rewriting v8i16 and v16i8 shuffles as 4 wide
/// ones, or rewriting v4i32 / v4f32 as 2 wide ones if possible. This can be
/// done when every pair / quad of shuffle mask elements point to elements in
/// the right sequence. e.g.
/// vector_shuffle X, Y, <2, 3, | 10, 11, | 0, 1, | 14, 15>
///
/// Returns an empty SDValue when some group of mask elements is not in
/// sequence; two-element types are returned unchanged.
static
SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
                                 SelectionDAG &DAG) {
  MVT VT = SVOp->getSimpleValueType(0);
  SDLoc dl(SVOp);
  unsigned NumElems = VT.getVectorNumElements();
  MVT NewVT;
  unsigned Scale;
  switch (VT.SimpleTy) {
  default: llvm_unreachable("Unexpected!");
  case MVT::v2i64:
  case MVT::v2f64:
           return SDValue(SVOp, 0);
  case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
  case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
  case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
  case MVT::v16i8:  NewVT = MVT::v4i32; Scale = 4; break;
  case MVT::v16i16: NewVT = MVT::v8i32; Scale = 2; break;
  case MVT::v32i8:  NewVT = MVT::v8i32; Scale = 4; break;
  }

  SmallVector<int, 8> MaskVec;
  for (unsigned i = 0; i != NumElems; i += Scale) {
    // StartIdx is the wide-element index this group maps to; it stays -1 when
    // every element of the group is undef.
    int StartIdx = -1;
    for (unsigned j = 0; j != Scale; ++j) {
      int EltIdx = SVOp->getMaskElt(i+j);
      if (EltIdx < 0)
        continue;
      if (StartIdx < 0)
        StartIdx = (EltIdx / Scale);
      if (EltIdx != (int)(StartIdx*Scale + j))
        return SDValue();
    }
    MaskVec.push_back(StartIdx);
  }

  SDValue V1 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(0));
  SDValue V2 = DAG.getNode(ISD::BITCAST, dl, NewVT, SVOp->getOperand(1));
  return DAG.getVectorShuffle(NewVT, dl, V1, V2, &MaskVec[0]);
}

/// getVZextMovL - Return a zero-extending vector move low node.
///
static SDValue getVZextMovL(MVT VT, MVT OpVT,
                            SDValue SrcOp, SelectionDAG &DAG,
                            const X86Subtarget *Subtarget, SDLoc dl) {
  if (VT == MVT::v2f64 || VT == MVT::v4f32) {
    LoadSDNode *LD = nullptr;
    if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
      LD = dyn_cast<LoadSDNode>(SrcOp);
    if (!LD) {
      // movssrr and movsdrr do not clear top bits. Try to use movd, movq
      // instead.
      MVT ExtVT = (OpVT == MVT::v2f64) ? MVT::i64 : MVT::i32;
      if ((ExtVT != MVT::i64 || Subtarget->is64Bit()) &&
          SrcOp.getOpcode() == ISD::SCALAR_TO_VECTOR &&
          SrcOp.getOperand(0).getOpcode() == ISD::BITCAST &&
          SrcOp.getOperand(0).getOperand(0).getValueType() == ExtVT) {
        // PR2108
        OpVT = (OpVT == MVT::v2f64) ? MVT::v2i64 : MVT::v4i32;
        return DAG.getNode(ISD::BITCAST, dl, VT,
                           DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                                   OpVT,
                                                   SrcOp.getOperand(0)
                                                          .getOperand(0))));
      }
    }
  }

  return DAG.getNode(ISD::BITCAST, dl, VT,
                     DAG.getNode(X86ISD::VZEXT_MOVL, dl, OpVT,
                                 DAG.getNode(ISD::BITCAST, dl,
                                             OpVT, SrcOp)));
}

/// LowerVECTOR_SHUFFLE_256 - Handle all 256-bit wide vectors shuffles
/// which could not be matched by any known target specific shuffle
///
/// Each 128-bit half of the result is built separately, either as a shuffle
/// of at most two 128-bit input halves or, failing that, as a BUILD_VECTOR of
/// individually extracted elements; the halves are then concatenated.
static SDValue
LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {

  SDValue NewOp = Compact8x32ShuffleNode(SVOp, DAG);
  if (NewOp.getNode())
    return NewOp;

  MVT VT = SVOp->getSimpleValueType(0);

  unsigned NumElems = VT.getVectorNumElements();
  unsigned NumLaneElems = NumElems / 2;

  SDLoc dl(SVOp);
  MVT EltVT = VT.getVectorElementType();
  MVT NVT = MVT::getVectorVT(EltVT, NumLaneElems);
  SDValue Output[2];

  SmallVector<int, 16> Mask;
  for (unsigned l = 0; l < 2; ++l) {
    // Build a shuffle mask for the output, discovering on the fly which
    // input vectors to use as shuffle operands (recorded in InputUsed).
    // If building a suitable shuffle vector proves too hard, then bail
    // out with UseBuildVector set.
    bool UseBuildVector = false;
    int InputUsed[2] = { -1, -1 }; // Not yet discovered.
    unsigned LaneStart = l * NumLaneElems;
    for (unsigned i = 0; i != NumLaneElems; ++i) {
      // The mask element.  This indexes into the input.
      int Idx = SVOp->getMaskElt(i+LaneStart);
      if (Idx < 0) {
        // the mask element does not index into any input vector.
        Mask.push_back(-1);
        continue;
      }

      // The input vector this mask element indexes into.
      int Input = Idx / NumLaneElems;

      // Turn the index into an offset from the start of the input vector.
      Idx -= Input * NumLaneElems;

      // Find or create a shuffle vector operand to hold this input.
      unsigned OpNo;
      for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
        if (InputUsed[OpNo] == Input)
          // This input vector is already an operand.
          break;
        if (InputUsed[OpNo] < 0) {
          // Create a new operand for this input vector.
          InputUsed[OpNo] = Input;
          break;
        }
      }

      if (OpNo >= array_lengthof(InputUsed)) {
        // More than two input vectors used!  Give up on trying to create a
        // shuffle vector.  Insert all elements into a BUILD_VECTOR instead.
        UseBuildVector = true;
        break;
      }

      // Add the mask index for the new shuffle vector.
      Mask.push_back(Idx + OpNo * NumLaneElems);
    }

    if (UseBuildVector) {
      SmallVector<SDValue, 16> SVOps;
      for (unsigned i = 0; i != NumLaneElems; ++i) {
        // The mask element.  This indexes into the input.
        int Idx = SVOp->getMaskElt(i+LaneStart);
        if (Idx < 0) {
          SVOps.push_back(DAG.getUNDEF(EltVT));
          continue;
        }

        // The input vector this mask element indexes into.
        int Input = Idx / NumElems;

        // Turn the index into an offset from the start of the input vector.
        Idx -= Input * NumElems;

        // Extract the vector element by hand.
        SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
                                    SVOp->getOperand(Input),
                                    DAG.getIntPtrConstant(Idx)));
      }

      // Construct the output using a BUILD_VECTOR.
      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
    } else if (InputUsed[0] < 0) {
      // No input vectors were used! The result is undefined.
      Output[l] = DAG.getUNDEF(NVT);
    } else {
      SDValue Op0 = Extract128BitVector(SVOp->getOperand(InputUsed[0] / 2),
                                        (InputUsed[0] % 2) * NumLaneElems,
                                        DAG, dl);
      // If only one input was used, use an undefined vector for the other.
      SDValue Op1 = (InputUsed[1] < 0) ? DAG.getUNDEF(NVT) :
        Extract128BitVector(SVOp->getOperand(InputUsed[1] / 2),
                            (InputUsed[1] % 2) * NumLaneElems, DAG, dl);
      // At least one input vector was used. Create a new shuffle vector.
      Output[l] = DAG.getVectorShuffle(NVT, dl, Op0, Op1, &Mask[0]);
    }

    // The mask buffer is shared between the two halves.
    Mask.clear();
  }

  // Concatenate the result back
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Output[0], Output[1]);
}

/// LowerVECTOR_SHUFFLE_128v4 - Handle all 128-bit wide vectors with
/// 4 elements, and match them with several different shuffle types.
static SDValue
LowerVECTOR_SHUFFLE_128v4(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
  SDValue V1 = SVOp->getOperand(0);
  SDValue V2 = SVOp->getOperand(1);
  SDLoc dl(SVOp);
  MVT VT = SVOp->getSimpleValueType(0);

  assert(VT.is128BitVector() && "Unsupported vector size");

  std::pair