//===- subzero/src/IceCfg.cpp - Control flow graph implementation ---------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Implements the Cfg class.
///
//===----------------------------------------------------------------------===//

#include "IceCfg.h"

#include "IceAssembler.h"
#include "IceBitVector.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInst.h"
#include "IceInstVarIter.h"
#include "IceInstrumentation.h"
#include "IceLiveness.h"
#include "IceLoopAnalyzer.h"
#include "IceOperand.h"
#include "IceTargetLowering.h"

#include <memory>
#include <utility>

namespace Ice {

Cfg::Cfg(GlobalContext *Ctx, uint32_t SequenceNumber)
    : Allocator(createAllocator()), Ctx(Ctx), SequenceNumber(SequenceNumber),
      VMask(getFlags().getVerbose()), FunctionName(),
      NextInstNumber(Inst::NumberInitial), Live(nullptr) {
  NodeStrings.reset(new StringPool);
  VarStrings.reset(new StringPool);
  CfgLocalAllocatorScope _(this);
  Target = TargetLowering::createLowering(getFlags().getTargetArch(), this);
  VMetadata.reset(new VariablesMetadata(this));
  TargetAssembler = Target->createAssembler();

  if (getFlags().getRandomizeAndPoolImmediatesOption() == RPI_Randomize) {
    // If -randomize-pool-immediates=randomize, create a random number
    // generator to generate a cookie for constant blinding.
    RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_ConstantBlinding,
                              this->SequenceNumber);
    ConstantBlindingCookie =
        (uint32_t)RNG.next((uint64_t)std::numeric_limits<uint32_t>::max() + 1);
  }
}

Cfg::~Cfg() {
  assert(CfgAllocatorTraits::current() == nullptr);
  if (getFlags().getDumpStrings()) {
    OstreamLocker _(Ctx);
    Ostream &Str = Ctx->getStrDump();
    getNodeStrings()->dump(Str);
    getVarStrings()->dump(Str);
  }
}

// Called in the initializer list of Cfg's constructor to create the Allocator
// and set it as TLS before any other member fields are constructed, since
// they may depend on it.
ArenaAllocator *Cfg::createAllocator() {
  ArenaAllocator *Allocator = new ArenaAllocator();
  CfgAllocatorTraits::set_current(Allocator);
  return Allocator;
}
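// A usage sketch of the TLS arena installed above: a pass that populates
// CfgVector/CfgList containers for some function Func enters the function's
// allocator scope first, e.g.:
//   CfgLocalAllocatorScope _(Func); // Cfg-local containers now allocate
//                                   // from Func's arena until scope exit.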
/// Create a string like "foo(i=123:b=9)" indicating the function name, number
/// of high-level instructions, and number of basic blocks. This string is
/// only used for dumping and other diagnostics, and the idea is that given a
/// set of functions to debug a problem on, it's easy to find the smallest or
/// simplest function to attack. Note that the counts may change somewhat
/// depending on what point it is called during the translation passes.
std::string Cfg::getFunctionNameAndSize() const {
  if (!BuildDefs::dump())
    return getFunctionName().toString();
  SizeT NodeCount = 0;
  SizeT InstCount = 0;
  for (CfgNode *Node : getNodes()) {
    ++NodeCount;
    // Note: deleted instructions are *not* ignored.
    InstCount += Node->getPhis().size();
    for (Inst &I : Node->getInsts()) {
      if (!llvm::isa<InstTarget>(&I))
        ++InstCount;
    }
  }
  return getFunctionName() + "(i=" + std::to_string(InstCount) +
         ":b=" + std::to_string(NodeCount) + ")";
}

void Cfg::setError(const std::string &Message) {
  HasError = true;
  ErrorMessage = Message;
}

CfgNode *Cfg::makeNode() {
  SizeT LabelIndex = Nodes.size();
  auto *Node = CfgNode::create(this, LabelIndex);
  Nodes.push_back(Node);
  return Node;
}

void Cfg::swapNodes(NodeList &NewNodes) {
  assert(Nodes.size() == NewNodes.size());
  Nodes.swap(NewNodes);
  for (SizeT I = 0, NumNodes = getNumNodes(); I < NumNodes; ++I)
    Nodes[I]->resetIndex(I);
}

template <> Variable *Cfg::makeVariable<Variable>(Type Ty) {
  SizeT Index = Variables.size();
  Variable *Var;
  if (Target->shouldSplitToVariableVecOn32(Ty)) {
    Var = VariableVecOn32::create(this, Ty, Index);
  } else if (Target->shouldSplitToVariable64On32(Ty)) {
    Var = Variable64On32::create(this, Ty, Index);
  } else {
    Var = Variable::create(this, Ty, Index);
  }
  Variables.push_back(Var);
  return Var;
}

void Cfg::addArg(Variable *Arg) {
  Arg->setIsArg();
  Args.push_back(Arg);
}

void Cfg::addImplicitArg(Variable *Arg) {
  Arg->setIsImplicitArg();
  ImplicitArgs.push_back(Arg);
}

// Returns whether the stack frame layout has been computed yet. This is used
// for dumping the stack frame location of Variables.
bool Cfg::hasComputedFrame() const { return getTarget()->hasComputedFrame(); }

namespace {
constexpr char BlockNameGlobalPrefix[] = ".L$profiler$block_name$";
constexpr char BlockStatsGlobalPrefix[] = ".L$profiler$block_info$";
} // end of anonymous namespace

void Cfg::createNodeNameDeclaration(const std::string &NodeAsmName) {
  auto *Var = VariableDeclaration::create(GlobalInits.get());
  Var->setName(Ctx, BlockNameGlobalPrefix + NodeAsmName);
  Var->setIsConstant(true);
  Var->addInitializer(VariableDeclaration::DataInitializer::create(
      GlobalInits.get(), NodeAsmName.data(), NodeAsmName.size() + 1));
  const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
  Var->setAlignment(Int64ByteSize); // Wasteful, 32-bit could use 4 bytes.
  GlobalInits->push_back(Var);
}

void Cfg::createBlockProfilingInfoDeclaration(
    const std::string &NodeAsmName, VariableDeclaration *NodeNameDeclaration) {
  auto *Var = VariableDeclaration::create(GlobalInits.get());
  Var->setName(Ctx, BlockStatsGlobalPrefix + NodeAsmName);
  const SizeT Int64ByteSize = typeWidthInBytes(IceType_i64);
  Var->addInitializer(VariableDeclaration::ZeroInitializer::create(
      GlobalInits.get(), Int64ByteSize));
  const RelocOffsetT NodeNameDeclarationOffset = 0;
  Var->addInitializer(VariableDeclaration::RelocInitializer::create(
      GlobalInits.get(), NodeNameDeclaration,
      {RelocOffset::create(Ctx, NodeNameDeclarationOffset)}));
  Var->setAlignment(Int64ByteSize);
  GlobalInits->push_back(Var);
}

void Cfg::profileBlocks() {
  if (GlobalInits == nullptr)
    GlobalInits.reset(new VariableDeclarationList());
  for (CfgNode *Node : Nodes) {
    const std::string NodeAsmName = Node->getAsmName();
    createNodeNameDeclaration(NodeAsmName);
    createBlockProfilingInfoDeclaration(NodeAsmName, GlobalInits->back());
    Node->profileExecutionCount(GlobalInits->back());
  }
}

bool Cfg::isProfileGlobal(const VariableDeclaration &Var) {
  if (!Var.getName().hasStdString())
    return false;
  return Var.getName().toString().find(BlockStatsGlobalPrefix) == 0;
}

void Cfg::addCallToProfileSummary() {
  // The call(s) to __Sz_profile_summary are added by the profiler in
  // functions that cause the program to exit. This function is defined in
  // runtime/szrt_profiler.c.
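  // The resulting entry block looks roughly like this (a sketch; the actual
  // form depends on the target lowering):
  //   entry:
  //     call void @__Sz_profile_summary()
  //     <original entry instructions>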
  Constant *ProfileSummarySym =
      Ctx->getConstantExternSym(Ctx->getGlobalString("__Sz_profile_summary"));
  constexpr SizeT NumArgs = 0;
  constexpr Variable *Void = nullptr;
  constexpr bool HasTailCall = false;
  auto *Call =
      InstCall::create(this, NumArgs, Void, ProfileSummarySym, HasTailCall);
  getEntryNode()->getInsts().push_front(Call);
}

void Cfg::translate() {
  if (hasError())
    return;
  // Cache the possibly-overridden optimization level once translation begins.
  // It would be nicer to do this in the constructor, but we need to wait
  // until after setFunctionName() has a chance to be called.
  OptimizationLevel =
      getFlags().matchForceO2(getFunctionName(), getSequenceNumber())
          ? Opt_2
          : getFlags().getOptLevel();
  if (BuildDefs::timers()) {
    if (getFlags().matchTimingFocus(getFunctionName(), getSequenceNumber())) {
      setFocusedTiming();
      getContext()->resetTimer(GlobalContext::TSK_Default);
    }
  }
  if (BuildDefs::dump()) {
    if (isVerbose(IceV_Status) &&
        getFlags().matchTestStatus(getFunctionName(), getSequenceNumber())) {
      getContext()->getStrDump() << ">>>Translating "
                                 << getFunctionNameAndSize()
                                 << " seq=" << getSequenceNumber() << "\n";
    }
  }
  TimerMarker T_func(getContext(), getFunctionName().toStringOrEmpty());
  TimerMarker T(TimerStack::TT_translate, this);

  dump("Initial CFG");

  if (getFlags().getEnableBlockProfile()) {
    profileBlocks();
    // TODO(jpp): this is fragile, at best. Figure out a better way of
    // detecting exit functions.
    if (getFunctionName().toStringOrEmpty() == "exit") {
      addCallToProfileSummary();
    }
    dump("Profiled CFG");
  }

  // Create the Hi and Lo variables where a split was needed.
  for (Variable *Var : Variables) {
    if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Var)) {
      Var64On32->initHiLo(this);
    } else if (auto *VarVecOn32 = llvm::dyn_cast<VariableVecOn32>(Var)) {
      VarVecOn32->initVecElement(this);
    }
  }

  // Instrument the Cfg, e.g. with AddressSanitizer.
  if (!BuildDefs::minimal() && getFlags().getSanitizeAddresses()) {
    getContext()->instrumentFunc(this);
    dump("Instrumented CFG");
  }

  // The set of translation passes and their order are determined by the
  // target.
  getTarget()->translate();

  dump("Final output");
  if (getFocusedTiming()) {
    getContext()->dumpLocalTimers(getFunctionName().toString());
  }
}

void Cfg::fixPhiNodes() {
  for (auto *Node : Nodes) {
    // Fix all the phi edges since WASM can't tell how to make them correctly
    // at the beginning.
    assert(Node);
    const auto &InEdges = Node->getInEdges();
    for (auto &Instr : Node->getPhis()) {
      auto *Phi = llvm::cast<InstPhi>(&Instr);
      assert(Phi);
      for (SizeT i = 0; i < InEdges.size(); ++i) {
        Phi->setLabel(i, InEdges[i]);
      }
    }
  }
}

void Cfg::computeInOutEdges() {
  // Compute the out-edges.
  for (CfgNode *Node : Nodes) {
    Node->computeSuccessors();
  }

  // Prune any unreachable nodes before computing in-edges.
  SizeT NumNodes = getNumNodes();
  BitVector Reachable(NumNodes);
  BitVector Pending(NumNodes);
  Pending.set(getEntryNode()->getIndex());
  while (true) {
    int Index = Pending.find_first();
    if (Index == -1)
      break;
    Pending.reset(Index);
    Reachable.set(Index);
    CfgNode *Node = Nodes[Index];
    assert(Node->getIndex() == (SizeT)Index);
    for (CfgNode *Succ : Node->getOutEdges()) {
      SizeT SuccIndex = Succ->getIndex();
      if (!Reachable.test(SuccIndex))
        Pending.set(SuccIndex);
    }
  }
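  // A sketch of the compaction that follows: with NumNodes == 5 and nodes 2
  // and 4 unreachable, Reachable == {0, 1, 3}, so the loop rewrites Nodes in
  // place to [n0, n1, n3], resets each survivor's index to its new position,
  // and the resize() drops the pruned tail.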
  SizeT Dest = 0;
  for (SizeT Source = 0; Source < NumNodes; ++Source) {
    if (Reachable.test(Source)) {
      Nodes[Dest] = Nodes[Source];
      Nodes[Dest]->resetIndex(Dest);
      // Compute the in-edges.
      Nodes[Dest]->computePredecessors();
      ++Dest;
    }
  }
  Nodes.resize(Dest);

  TimerMarker T(TimerStack::TT_phiValidation, this);
  for (CfgNode *Node : Nodes)
    Node->enforcePhiConsistency();
}

void Cfg::renumberInstructions() {
  TimerMarker T(TimerStack::TT_renumberInstructions, this);
  NextInstNumber = Inst::NumberInitial;
  for (CfgNode *Node : Nodes)
    Node->renumberInstructions();
  // Make sure the entry node is the first node and therefore gets the lowest
  // instruction numbers, to facilitate live range computation of function
  // arguments. We want to model function arguments as being live on entry
  // to the function, otherwise an argument whose only use is in the first
  // instruction will be assigned a trivial live range and the register
  // allocator will not recognize its live range as overlapping another
  // variable's live range.
  assert(Nodes.empty() || (*Nodes.begin() == getEntryNode()));
}

// placePhiLoads() must be called before placePhiStores().
void Cfg::placePhiLoads() {
  TimerMarker T(TimerStack::TT_placePhiLoads, this);
  for (CfgNode *Node : Nodes)
    Node->placePhiLoads();
}

// placePhiStores() must be called after placePhiLoads().
void Cfg::placePhiStores() {
  TimerMarker T(TimerStack::TT_placePhiStores, this);
  for (CfgNode *Node : Nodes)
    Node->placePhiStores();
}

void Cfg::deletePhis() {
  TimerMarker T(TimerStack::TT_deletePhis, this);
  for (CfgNode *Node : Nodes)
    Node->deletePhis();
}

void Cfg::advancedPhiLowering() {
  TimerMarker T(TimerStack::TT_advancedPhiLowering, this);
  // Clear all previously computed live ranges (but not live-in/live-out bit
  // vectors or last-use markers), because the follow-on register allocation
  // is only concerned with live ranges across the newly created blocks.
  for (Variable *Var : Variables) {
    Var->getLiveRange().reset();
  }
  // This splits edges and appends new nodes to the end of the node list.
  // This can invalidate iterators, so don't use an iterator.
  SizeT NumNodes = getNumNodes();
  SizeT NumVars = getNumVariables();
  for (SizeT I = 0; I < NumNodes; ++I)
    Nodes[I]->advancedPhiLowering();

  TimerMarker TT(TimerStack::TT_lowerPhiAssignments, this);
  if (true) {
    // The following code does an in-place update of liveness and live ranges
    // as a result of adding the new phi edge split nodes.
    getLiveness()->initPhiEdgeSplits(Nodes.begin() + NumNodes,
                                     Variables.begin() + NumVars);
    TimerMarker TTT(TimerStack::TT_liveness, this);
    // Iterate over the newly added nodes to add their liveness info.
    for (auto I = Nodes.begin() + NumNodes, E = Nodes.end(); I != E; ++I) {
      InstNumberT FirstInstNum = getNextInstNumber();
      (*I)->renumberInstructions();
      InstNumberT LastInstNum = getNextInstNumber() - 1;
      (*I)->liveness(getLiveness());
      (*I)->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    }
  } else {
    // The following code does a brute-force recalculation of live ranges as
    // a result of adding the new phi edge split nodes. The liveness
    // calculation is particularly expensive because the new nodes are not
    // yet in a proper topological order and so convergence is slower.
    //
    // This code is kept here for reference and can be temporarily enabled in
    // case the incremental code is under suspicion.
    renumberInstructions();
    liveness(Liveness_Intervals);
    getVMetadata()->init(VMK_All);
  }
  Target->regAlloc(RAK_Phi);
}
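// A sketch of the edge splits performed above: a critical edge B1 -> B3
// whose destination carries a phi operand gets a new node B1a holding only
// the phi assignment(s) and an unconditional branch:
//   B1 -> B1a -> B3
// Only these appended nodes are renumbered and folded into liveness.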
// Find a reasonable placement for nodes that have not yet been placed, while
// maintaining the same relative ordering among already placed nodes.
void Cfg::reorderNodes() {
  // TODO(ascull): it would be nice if the switch tests were always followed
  // by the default case to allow for fall through.
  using PlacedList = CfgList<CfgNode *>;
  PlacedList Placed;      // Nodes with relative placement locked down
  PlacedList Unreachable; // Unreachable nodes
  PlacedList::iterator NoPlace = Placed.end();
  // Keep track of where each node has been tentatively placed so that we can
  // manage insertions into the middle.
  CfgVector<PlacedList::iterator> PlaceIndex(Nodes.size(), NoPlace);
  for (CfgNode *Node : Nodes) {
    // The "do ... while(0);" construct is to factor out the --PlaceIndex and
    // assert() statements before moving to the next node.
    do {
      if (Node != getEntryNode() && Node->getInEdges().empty()) {
        // The node has essentially been deleted since it is not a successor
        // of any other node.
        Unreachable.push_back(Node);
        PlaceIndex[Node->getIndex()] = Unreachable.end();
        Node->setNeedsPlacement(false);
        continue;
      }
      if (!Node->needsPlacement()) {
        // Add to the end of the Placed list.
        Placed.push_back(Node);
        PlaceIndex[Node->getIndex()] = Placed.end();
        continue;
      }
      Node->setNeedsPlacement(false);
      // Assume for now that the unplaced node is from edge-splitting and
      // therefore has 1 in-edge and 1 out-edge (actually, possibly more than
      // 1 in-edge if the predecessor node was contracted). If this changes
      // in the future, rethink the strategy.
      assert(Node->getInEdges().size() >= 1);
      assert(Node->hasSingleOutEdge());

      // If it's a (non-critical) edge where the successor has a single
      // in-edge, then place it before the successor.
      CfgNode *Succ = Node->getOutEdges().front();
      if (Succ->getInEdges().size() == 1 &&
          PlaceIndex[Succ->getIndex()] != NoPlace) {
        Placed.insert(PlaceIndex[Succ->getIndex()], Node);
        PlaceIndex[Node->getIndex()] = PlaceIndex[Succ->getIndex()];
        continue;
      }

      // Otherwise, place it after the (first) predecessor.
      CfgNode *Pred = Node->getInEdges().front();
      auto PredPosition = PlaceIndex[Pred->getIndex()];
      // It shouldn't be the case that PredPosition==NoPlace, but if that
      // somehow turns out to be true, we just insert Node before
      // PredPosition=NoPlace=Placed.end().
      if (PredPosition != NoPlace)
        ++PredPosition;
      Placed.insert(PredPosition, Node);
      PlaceIndex[Node->getIndex()] = PredPosition;
    } while (0);

    --PlaceIndex[Node->getIndex()];
    assert(*PlaceIndex[Node->getIndex()] == Node);
  }

  // Reorder Nodes according to the built-up lists.
  NodeList Reordered;
  Reordered.reserve(Placed.size() + Unreachable.size());
  for (CfgNode *Node : Placed)
    Reordered.push_back(Node);
  for (CfgNode *Node : Unreachable)
    Reordered.push_back(Node);
  assert(getNumNodes() == Reordered.size());
  swapNodes(Reordered);
}

namespace {

void getRandomPostOrder(CfgNode *Node, BitVector &ToVisit,
                        Ice::NodeList &PostOrder,
                        Ice::RandomNumberGenerator *RNG) {
  assert(ToVisit[Node->getIndex()]);
  ToVisit[Node->getIndex()] = false;
  NodeList Outs = Node->getOutEdges();
  Ice::RandomShuffle(Outs.begin(), Outs.end(),
                     [RNG](int N) { return RNG->next(N); });
  for (CfgNode *Next : Outs) {
    if (ToVisit[Next->getIndex()])
      getRandomPostOrder(Next, ToVisit, PostOrder, RNG);
  }
  PostOrder.push_back(Node);
}

} // end of anonymous namespace
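// A sketch of the effect of shuffleNodes() below: reachable nodes are laid
// out in a randomized reverse post-order (entry block first), and
// unreachable nodes keep their relative order at the end, so control flow
// is unchanged while the block layout varies with the random seed.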
void Cfg::shuffleNodes() {
  if (!getFlags().getReorderBasicBlocks())
    return;

  NodeList ReversedReachable;
  NodeList Unreachable;
  BitVector ToVisit(Nodes.size(), true);
  // Create a random number generator for basic block reordering.
  RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                            RPE_BasicBlockReordering, SequenceNumber);
  // Traverse from entry node.
  getRandomPostOrder(getEntryNode(), ToVisit, ReversedReachable, &RNG);
  // Collect the unreachable nodes.
  for (CfgNode *Node : Nodes)
    if (ToVisit[Node->getIndex()])
      Unreachable.push_back(Node);
  // Copy the layout list to the Nodes.
  NodeList Shuffled;
  Shuffled.reserve(ReversedReachable.size() + Unreachable.size());
  for (CfgNode *Node : reverse_range(ReversedReachable))
    Shuffled.push_back(Node);
  for (CfgNode *Node : Unreachable)
    Shuffled.push_back(Node);
  assert(Nodes.size() == Shuffled.size());
  swapNodes(Shuffled);

  dump("After basic block shuffling");
}

void Cfg::localCSE(bool AssumeSSA) {
  // Performs basic-block local common-subexpression elimination. If we have
  //   t1 = op b c
  //   t2 = op b c
  // this pass will replace future references to t2 in a basic block by t1.
  // Points to note:
  // 1. Assumes SSA by default. To change this, use -lcse=no-ssa. This is
  //    needed if this pass is moved to a point later in the pipeline. If
  //    variables have a single definition (in the node), CSE can work just
  //    on the basis of an equality compare on instructions (sans Dest). When
  //    variables can be updated (hence, non-SSA), the result of a previous
  //    instruction which used that variable as an operand can not be reused.
  // 2. Leaves removal of instructions to DCE.
  // 3. Only enabled on arithmetic instructions. pnacl-clang (-O2) is
  //    expected to take care of cases not arising from GEP simplification.
  // 4. By default, a single pass is made over each basic block. Control this
  //    with -lcse-max-iters=N.

  TimerMarker T(TimerStack::TT_localCse, this);
  struct VariableHash {
    size_t operator()(const Variable *Var) const { return Var->hashValue(); }
  };

  struct InstHash {
    size_t operator()(const Inst *Instr) const {
      auto Kind = Instr->getKind();
      auto Result =
          std::hash<typename std::underlying_type<Inst::InstKind>::type>()(
              Kind);
      for (SizeT i = 0; i < Instr->getSrcSize(); ++i) {
        Result ^= Instr->getSrc(i)->hashValue();
      }
      return Result;
    }
  };

  struct InstEq {
    bool srcEq(const Operand *A, const Operand *B) const {
      if (llvm::isa<Variable>(A) || llvm::isa<Constant>(A))
        return (A == B);
      return false;
    }
    bool operator()(const Inst *InstrA, const Inst *InstrB) const {
      if ((InstrA->getKind() != InstrB->getKind()) ||
          (InstrA->getSrcSize() != InstrB->getSrcSize()))
        return false;

      if (auto *A = llvm::dyn_cast<InstArithmetic>(InstrA)) {
        auto *B = llvm::cast<InstArithmetic>(InstrB);
        // A and B are guaranteed to be of the same 'kind' at this point,
        // so dyn_cast is not needed.
        if (A->getOp() != B->getOp())
          return false;
      }

      // Does not enter the loop if the kind or number of operands differ.
      for (SizeT i = 0; i < InstrA->getSrcSize(); ++i) {
        if (!srcEq(InstrA->getSrc(i), InstrB->getSrc(i)))
          return false;
      }
      return true;
    }
  };

  for (CfgNode *Node : getNodes()) {
    CfgUnorderedSet<Inst *, InstHash, InstEq> Seen;
    // Stores currently available instructions.

    CfgUnorderedMap<Variable *, Variable *, VariableHash> Replacements;
    // Combining the above two into a single data structure might consume
    // less memory but will be slower, i.e. a map of Instruction -> Set of
    // Variables.

    CfgUnorderedMap<Variable *, std::vector<Inst *>, VariableHash> Dependency;
    // Maps a variable to the instructions that depend on it.
    //   a = op1 b c
    //   x = op2 c d
    // will result in the map: b -> {a}, c -> {a, x}, d -> {x}.
    // Not necessary for SSA, as dependencies will never be invalidated, and
    // the container will use minimal memory when left unused.
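    // A sketch of the non-SSA invalidation (-lcse=no-ssa): once a variable
    // is redefined, Seen entries that read it can no longer be reused:
    //   t1 = add b c   ; added to Seen; Dependency[b] and [c] record t1
    //   b  = add p q   ; redefinition of b erases t1 from Seen
    //   t2 = add b c   ; *not* replaced by t1; added to Seen afresh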
    auto IterCount = getFlags().getLocalCseMaxIterations();

    for (uint32_t i = 0; i < IterCount; ++i) {
      // TODO(manasijm): Stats on IterCount -> performance
      for (Inst &Instr : Node->getInsts()) {
        if (Instr.isDeleted() || !llvm::isa<InstArithmetic>(&Instr))
          continue;
        if (!AssumeSSA) {
          // Invalidate replacements.
          auto Iter = Replacements.find(Instr.getDest());
          if (Iter != Replacements.end()) {
            Replacements.erase(Iter);
          }

          // Invalidate 'seen' instructions whose operands were just updated.
          auto DepIter = Dependency.find(Instr.getDest());
          if (DepIter != Dependency.end()) {
            for (auto *DepInst : DepIter->second) {
              Seen.erase(DepInst);
            }
          }
        }

        // Replace - doing this before checking for repetitions might enable
        // more optimizations.
        for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
          auto *Opnd = Instr.getSrc(i);
          if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
            if (Replacements.find(Var) != Replacements.end()) {
              Instr.replaceSource(i, Replacements[Var]);
            }
          }
        }

        // Check for repetitions.
        auto SeenIter = Seen.find(&Instr);
        if (SeenIter != Seen.end()) { // seen before
          const Inst *Found = *SeenIter;
          Replacements[Instr.getDest()] = Found->getDest();
        } else { // new
          Seen.insert(&Instr);

          if (!AssumeSSA) {
            // Update dependencies.
            for (SizeT i = 0; i < Instr.getSrcSize(); ++i) {
              auto *Opnd = Instr.getSrc(i);
              if (auto *Var = llvm::dyn_cast<Variable>(Opnd)) {
                Dependency[Var].push_back(&Instr);
              }
            }
          }
        }
      }
    }
  }
}

void Cfg::loopInvariantCodeMotion() {
  TimerMarker T(TimerStack::TT_loopInvariantCodeMotion, this);
  // Does not introduce new nodes as of now.
  for (auto &Loop : LoopInfo) {
    CfgNode *Header = Loop.Header;
    assert(Header);
    if (Header->getLoopNestDepth() < 1)
      return;
    CfgNode *PreHeader = Loop.PreHeader;
    if (PreHeader == nullptr || PreHeader->getInsts().size() == 0) {
      continue; // try next loop
    }

    auto &Insts = PreHeader->getInsts();
    auto &LastInst = Insts.back();
    Insts.pop_back();

    for (auto *Inst : findLoopInvariantInstructions(Loop.Body)) {
      PreHeader->appendInst(Inst);
    }
    PreHeader->appendInst(&LastInst);
  }
}

CfgVector<Inst *>
Cfg::findLoopInvariantInstructions(const CfgUnorderedSet<SizeT> &Body) {
  CfgUnorderedSet<Inst *> InvariantInsts;
  CfgUnorderedSet<Variable *> InvariantVars;
  for (auto *Var : getArgs()) {
    InvariantVars.insert(Var);
  }
  bool Changed = false;
  do {
    Changed = false;
    for (auto NodeIndex : Body) {
      auto *Node = Nodes[NodeIndex];
      CfgVector<std::reference_wrapper<Inst>> Insts(Node->getInsts().begin(),
                                                    Node->getInsts().end());

      for (auto &InstRef : Insts) {
        auto &Inst = InstRef.get();
        if (Inst.isDeleted() ||
            InvariantInsts.find(&Inst) != InvariantInsts.end())
          continue;
        switch (Inst.getKind()) {
        case Inst::InstKind::Alloca:
        case Inst::InstKind::Br:
        case Inst::InstKind::Ret:
        case Inst::InstKind::Phi:
        case Inst::InstKind::Call:
        case Inst::InstKind::IntrinsicCall:
        case Inst::InstKind::Load:
        case Inst::InstKind::Store:
        case Inst::InstKind::Switch:
          continue;
        default:
          break;
        }

        bool IsInvariant = true;
        for (SizeT i = 0; i < Inst.getSrcSize(); ++i) {
          if (auto *Var = llvm::dyn_cast<Variable>(Inst.getSrc(i))) {
            if (InvariantVars.find(Var) == InvariantVars.end()) {
              IsInvariant = false;
            }
          }
        }
        if (IsInvariant) {
          Changed = true;
          InvariantInsts.insert(&Inst);
          Node->getInsts().remove(Inst);
          if (Inst.getDest() != nullptr) {
            InvariantVars.insert(Inst.getDest());
          }
        }
      }
    }
  } while (Changed);

  CfgVector<Inst *> InstVector(InvariantInsts.begin(), InvariantInsts.end());
  std::sort(InstVector.begin(), InstVector.end(),
            [](Inst *A, Inst *B) { return A->getNumber() < B->getNumber(); });
  return InstVector;
}
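// A sketch of the motion performed above: given a loop whose body contains
//   t = add a b   ; a and b defined outside the loop
// the instruction is removed from the body and appended to the preheader,
// re-appending the preheader's terminating branch so it stays last.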
void Cfg::shortCircuitJumps() {
  // Split Nodes whenever an early jump is possible.
  // __N :
  //   a = <something>
  //   Instruction 1 without side effect
  //   ... b = <something> ...
  //   Instruction N without side effect
  //   t1 = or a b
  //   br t1 __X __Y
  //
  // is transformed into:
  // __N :
  //   a = <something>
  //   br a __X __N_ext
  //
  // __N_ext :
  //   Instruction 1 without side effect
  //   ... b = <something> ...
  //   Instruction N without side effect
  //   br b __X __Y
  // (Similar logic for AND, jump to false instead of true target.)

  TimerMarker T(TimerStack::TT_shortCircuit, this);
  getVMetadata()->init(VMK_Uses);
  auto NodeStack = this->getNodes();
  CfgUnorderedMap<SizeT, CfgVector<CfgNode *>> Splits;
  while (!NodeStack.empty()) {
    auto *Node = NodeStack.back();
    NodeStack.pop_back();
    auto NewNode = Node->shortCircuit();
    if (NewNode) {
      NodeStack.push_back(NewNode);
      NodeStack.push_back(Node);
      Splits[Node->getIndex()].push_back(NewNode);
    }
  }

  // Insert nodes in the right place.
  NodeList NewList;
  NewList.reserve(Nodes.size());
  CfgUnorderedSet<SizeT> Inserted;
  for (auto *Node : Nodes) {
    if (Inserted.find(Node->getIndex()) != Inserted.end())
      continue; // already inserted
    NodeList Stack{Node};
    while (!Stack.empty()) {
      auto *Current = Stack.back();
      Stack.pop_back();
      Inserted.insert(Current->getIndex());
      NewList.push_back(Current);
      for (auto *Next : Splits[Current->getIndex()]) {
        Stack.push_back(Next);
      }
    }
  }

  SizeT NodeIndex = 0;
  for (auto *Node : NewList) {
    Node->resetIndex(NodeIndex++);
  }
  Nodes = NewList;
}

void Cfg::floatConstantCSE() {
  // Load multiple uses of a floating point constant (between two call
  // instructions or block start/end) into a variable before its first use.
  //   t1 = b + 1.0
  //   t2 = c + 1.0
  // gets transformed to:
  //   t0 = 1.0
  //   t0_1 = t0
  //   t1 = b + t0_1
  //   t2 = c + t0_1
  // Call instructions reset the procedure, but use the same variable, just
  // in case it got a register. We are assuming floating point registers are
  // not callee saved in general. Example, continuing from before:
  //   result = call <some function>
  //   t3 = d + 1.0
  // gets transformed to:
  //   result = call <some function>
  //   t0_2 = t0
  //   t3 = d + t0_2
  // TODO(manasijm, stichnot): Figure out how to 'link' t0 to the stack slot
  // of 1.0. When t0 does not get a register, introducing an extra assignment
  // statement does not make sense. The relevant portion is marked below.
  TimerMarker _(TimerStack::TT_floatConstantCse, this);

  for (CfgNode *Node : getNodes()) {
    CfgUnorderedMap<Constant *, Variable *> ConstCache;
    auto Current = Node->getInsts().begin();
    auto End = Node->getInsts().end();
    while (Current != End) {
      CfgUnorderedMap<Constant *, CfgVector<InstList::iterator>> FloatUses;
      if (llvm::isa<InstCall>(iteratorToInst(Current))) {
        ++Current;
        assert(Current != End); // Block should not end with a call.
      }
      while (Current != End && !llvm::isa<InstCall>(iteratorToInst(Current))) {
        if (!Current->isDeleted()) {
          for (SizeT i = 0; i < Current->getSrcSize(); ++i) {
            if (auto *Const = llvm::dyn_cast<Constant>(Current->getSrc(i))) {
              if (Const->getType() == IceType_f32 ||
                  Const->getType() == IceType_f64) {
                FloatUses[Const].push_back(Current);
              }
            }
          }
        }
        ++Current;
      }
      for (auto &Pair : FloatUses) {
        static constexpr SizeT MinUseThreshold = 3;
        // Only consider constants with at least `MinUseThreshold` uses.
        if (Pair.second.size() < MinUseThreshold)
          continue;
        auto &Insts = Node->getInsts();
        if (ConstCache.find(Pair.first) == ConstCache.end()) {
          // Saw a constant with enough uses for the first time.
          auto *NewVar = makeVariable(Pair.first->getType());
          // NewVar->setLinkedTo(Pair.first);
          // TODO(manasijm): Plumbing for linking to an Operand.
          auto *Assign = InstAssign::create(Node->getCfg(), NewVar, Pair.first);
          Insts.insert(Pair.second[0], Assign);
          ConstCache[Pair.first] = NewVar;
        }
        auto *NewVar = makeVariable(Pair.first->getType());
        NewVar->setLinkedTo(ConstCache[Pair.first]);
        auto *Assign =
            InstAssign::create(Node->getCfg(), NewVar, ConstCache[Pair.first]);
        Insts.insert(Pair.second[0], Assign);
        for (auto InstUse : Pair.second) {
          for (SizeT i = 0; i < InstUse->getSrcSize(); ++i) {
            if (auto *Const = llvm::dyn_cast<Constant>(InstUse->getSrc(i))) {
              if (Const == Pair.first) {
                InstUse->replaceSource(i, NewVar);
              }
            }
          }
        }
      }
    }
  }
}

void Cfg::doArgLowering() {
  TimerMarker T(TimerStack::TT_doArgLowering, this);
  getTarget()->lowerArguments();
}

void Cfg::sortAndCombineAllocas(CfgVector<InstAlloca *> &Allocas,
                                uint32_t CombinedAlignment, InstList &Insts,
                                AllocaBaseVariableType BaseVariableType) {
  if (Allocas.empty())
    return;
  // Sort by decreasing alignment.
  std::sort(Allocas.begin(), Allocas.end(),
            [](InstAlloca *A1, InstAlloca *A2) {
              uint32_t Align1 = A1->getAlignInBytes();
              uint32_t Align2 = A2->getAlignInBytes();
              if (Align1 == Align2)
                return A1->getNumber() < A2->getNumber();
              else
                return Align1 > Align2;
            });
  // Process the allocas in order of decreasing stack alignment. This allows
  // us to pack less-aligned pieces after more-aligned ones, resulting in
  // less stack growth. It also allows there to be at most one stack
  // alignment "and" instruction for a whole list of allocas.
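  // Worked example (a sketch, assuming stack-pointer addressing with a zero
  // out-args area): allocas with (align, size) = (16, 20), (8, 8), (4, 4)
  // get offsets 0, 32, and 40 -- the 20-byte object's size rounds up to 32
  // per its own 16-byte alignment -- and the combined region is
  // applyAlignment(44, 16) == 48 bytes.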
  uint32_t CurrentOffset = 0;
  CfgVector<int32_t> Offsets;
  for (Inst *Instr : Allocas) {
    auto *Alloca = llvm::cast<InstAlloca>(Instr);
    // Adjust the size of the allocation up to the next multiple of the
    // object's alignment.
    uint32_t Alignment = std::max(Alloca->getAlignInBytes(), 1u);
    auto *ConstSize =
        llvm::dyn_cast<ConstantInteger32>(Alloca->getSizeInBytes());
    uint32_t Size = Utils::applyAlignment(ConstSize->getValue(), Alignment);
    if (BaseVariableType == BVT_FramePointer) {
      // Addressing is relative to the frame pointer. Subtract the offset
      // after adding the size of the alloca, because it grows downwards from
      // the frame pointer.
      Offsets.push_back(Target->getFramePointerOffset(CurrentOffset, Size));
    } else {
      // Addressing is relative to the stack pointer or to a user pointer.
      // Add the offset before adding the size of the object, because it
      // grows upwards from the stack pointer. In addition, if the addressing
      // is relative to the stack pointer, we need to add the pre-computed
      // max out args size bytes.
      const uint32_t OutArgsOffsetOrZero =
          (BaseVariableType == BVT_StackPointer)
              ? getTarget()->maxOutArgsSizeBytes()
              : 0;
      Offsets.push_back(CurrentOffset + OutArgsOffsetOrZero);
    }
    // Update the running offset of the fused alloca region.
    CurrentOffset += Size;
  }
  // Round the offset up to the alignment granularity to use as the size.
  uint32_t TotalSize = Utils::applyAlignment(CurrentOffset, CombinedAlignment);
  // Ensure every alloca was assigned an offset.
  assert(Allocas.size() == Offsets.size());

  switch (BaseVariableType) {
  case BVT_UserPointer: {
    Variable *BaseVariable = makeVariable(IceType_i32);
    for (SizeT i = 0; i < Allocas.size(); ++i) {
      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
      // Emit a new addition operation to replace the alloca.
      Operand *AllocaOffset = Ctx->getConstantInt32(Offsets[i]);
      InstArithmetic *Add =
          InstArithmetic::create(this, InstArithmetic::Add, Alloca->getDest(),
                                 BaseVariable, AllocaOffset);
      Insts.push_front(Add);
      Alloca->setDeleted();
    }
    Operand *AllocaSize = Ctx->getConstantInt32(TotalSize);
    InstAlloca *CombinedAlloca =
        InstAlloca::create(this, BaseVariable, AllocaSize, CombinedAlignment);
    CombinedAlloca->setKnownFrameOffset();
    Insts.push_front(CombinedAlloca);
  } break;
  case BVT_StackPointer:
  case BVT_FramePointer: {
    for (SizeT i = 0; i < Allocas.size(); ++i) {
      auto *Alloca = llvm::cast<InstAlloca>(Allocas[i]);
      // Emit a fake definition of the rematerializable variable.
      Variable *Dest = Alloca->getDest();
      auto *Def = InstFakeDef::create(this, Dest);
      if (BaseVariableType == BVT_StackPointer)
        Dest->setRematerializable(getTarget()->getStackReg(), Offsets[i]);
      else
        Dest->setRematerializable(getTarget()->getFrameReg(), Offsets[i]);
      Insts.push_front(Def);
      Alloca->setDeleted();
    }
    // Allocate the fixed area in the function prolog.
    getTarget()->reserveFixedAllocaArea(TotalSize, CombinedAlignment);
  } break;
  }
}

void Cfg::processAllocas(bool SortAndCombine) {
  TimerMarker _(TimerStack::TT_alloca, this);
  const uint32_t StackAlignment = getTarget()->getStackAlignment();
  CfgNode *EntryNode = getEntryNode();
  assert(EntryNode);
  // LLVM enforces power of 2 alignment.
  assert(llvm::isPowerOf2_32(StackAlignment));
  // If the ABI's stack alignment is smaller than the vector size (16 bytes),
  // conservatively use a frame pointer to allow for explicit alignment of
  // the stack pointer. This needs to happen before register allocation so
  // the frame pointer can be reserved.
  if (getTarget()->needsStackPointerAlignment()) {
    getTarget()->setHasFramePointer();
  }
  // Determine if there are large alignment allocations in the entry block or
  // dynamic allocations (variable size in the entry block).
  bool HasLargeAlignment = false;
  bool HasDynamicAllocation = false;
  for (Inst &Instr : EntryNode->getInsts()) {
    if (Instr.isDeleted())
      continue;
    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
      uint32_t AlignmentParam = Alloca->getAlignInBytes();
      if (AlignmentParam > StackAlignment)
        HasLargeAlignment = true;
      if (llvm::isa<Constant>(Alloca->getSizeInBytes()))
        Alloca->setKnownFrameOffset();
      else {
        HasDynamicAllocation = true;
        // If Allocas are not sorted, the first dynamic allocation causes
        // later Allocas to be at unknown offsets relative to the stack/frame.
        if (!SortAndCombine)
          break;
      }
    }
  }
  // Don't do the heavyweight sorting and layout for low optimization levels.
  if (!SortAndCombine)
    return;
  // Any alloca outside the entry block is a dynamic allocation.
  for (CfgNode *Node : Nodes) {
    if (Node == EntryNode)
      continue;
    for (Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      if (llvm::isa<InstAlloca>(&Instr)) {
        // Allocations outside the entry block require a frame pointer.
        HasDynamicAllocation = true;
        break;
      }
    }
    if (HasDynamicAllocation)
      break;
  }
  // Mark the target as requiring a frame pointer.
  if (HasLargeAlignment || HasDynamicAllocation)
    getTarget()->setHasFramePointer();
  // Collect the Allocas into the two vectors.
  // Allocas in the entry block that have constant size and alignment less
  // than or equal to the function's stack alignment.
  CfgVector<InstAlloca *> FixedAllocas;
  // Allocas in the entry block that have constant size and alignment greater
  // than the function's stack alignment.
  CfgVector<InstAlloca *> AlignedAllocas;
  // Maximum alignment used by any alloca.
  uint32_t MaxAlignment = StackAlignment;
  for (Inst &Instr : EntryNode->getInsts()) {
    if (Instr.isDeleted())
      continue;
    if (auto *Alloca = llvm::dyn_cast<InstAlloca>(&Instr)) {
      if (!llvm::isa<Constant>(Alloca->getSizeInBytes()))
        continue;
      uint32_t AlignmentParam = Alloca->getAlignInBytes();
      // For default align=0, set it to the real value 1, to avoid any
      // bit-manipulation problems below.
      AlignmentParam = std::max(AlignmentParam, 1u);
      assert(llvm::isPowerOf2_32(AlignmentParam));
      if (HasDynamicAllocation && AlignmentParam > StackAlignment) {
        // If we have both dynamic allocations and large stack alignments,
        // high-alignment allocations are pulled out with their own base.
        AlignedAllocas.push_back(Alloca);
      } else {
        FixedAllocas.push_back(Alloca);
      }
      MaxAlignment = std::max(AlignmentParam, MaxAlignment);
    }
  }
  // Add instructions to the head of the entry block in reverse order.
  InstList &Insts = getEntryNode()->getInsts();
  if (HasDynamicAllocation && HasLargeAlignment) {
    // We are using a frame pointer, but fixed large-alignment alloca
    // addresses do not have a known offset from either the stack or frame
    // pointer. They grow up from a user pointer from an alloca.
    sortAndCombineAllocas(AlignedAllocas, MaxAlignment, Insts,
                          BVT_UserPointer);
    // Fixed size allocas are addressed relative to the frame pointer.
    sortAndCombineAllocas(FixedAllocas, StackAlignment, Insts,
                          BVT_FramePointer);
  } else {
    // Otherwise, fixed size allocas are addressed relative to the stack
    // unless there are dynamic allocas.
    const AllocaBaseVariableType BasePointerType =
        (HasDynamicAllocation ? BVT_FramePointer : BVT_StackPointer);
    sortAndCombineAllocas(FixedAllocas, MaxAlignment, Insts, BasePointerType);
  }
  if (!FixedAllocas.empty() || !AlignedAllocas.empty())
    // No use calling findRematerializable() unless there is some
    // rematerializable alloca instruction to seed it.
    findRematerializable();
}
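// A sketch of the rematerialization chains recognized below, after the
// entry-block allocas above have been fused (the offsets are illustrative):
//   a = alloca 16        ; a becomes rematerializable as [sp + 32]
//   b = a + 8            ; rematerializeArithmetic: b == [sp + 40]
//   c = bitcast i8* b    ; rematerializeCast: c == [sp + 40]
// None of a, b, or c then needs its own register or stack slot.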
namespace {

// Helpers for findRematerializable(). For each of them, if a suitable
// rematerialization is found, the instruction's Dest variable is set to be
// rematerializable and it returns true, otherwise it returns false.

bool rematerializeArithmetic(const Inst *Instr) {
  // Check that it's an Arithmetic instruction with an Add operation.
  auto *Arith = llvm::dyn_cast<InstArithmetic>(Instr);
  if (Arith == nullptr || Arith->getOp() != InstArithmetic::Add)
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Arith->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  // Check that Src(1) is an immediate.
  auto *Src1Imm = llvm::dyn_cast<ConstantInteger32>(Arith->getSrc(1));
  if (Src1Imm == nullptr)
    return false;
  Arith->getDest()->setRematerializable(
      Src0Var->getRegNum(), Src0Var->getStackOffset() + Src1Imm->getValue());
  return true;
}

bool rematerializeAssign(const Inst *Instr) {
  // An InstAssign only originates from an inttoptr or ptrtoint instruction,
  // which never occurs in a MINIMAL build.
  if (BuildDefs::minimal())
    return false;
  // Check that it's an Assign instruction.
  if (!llvm::isa<InstAssign>(Instr))
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Instr->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  Instr->getDest()->setRematerializable(Src0Var->getRegNum(),
                                        Src0Var->getStackOffset());
  return true;
}

bool rematerializeCast(const Inst *Instr) {
  // A pointer-type bitcast never occurs in a MINIMAL build.
  if (BuildDefs::minimal())
    return false;
  // Check that it's a Cast instruction with a Bitcast operation.
  auto *Cast = llvm::dyn_cast<InstCast>(Instr);
  if (Cast == nullptr || Cast->getCastKind() != InstCast::Bitcast)
    return false;
  // Check that Src(0) is rematerializable.
  auto *Src0Var = llvm::dyn_cast<Variable>(Cast->getSrc(0));
  if (Src0Var == nullptr || !Src0Var->isRematerializable())
    return false;
  // Check that Dest and Src(0) have the same type.
  Variable *Dest = Cast->getDest();
  if (Dest->getType() != Src0Var->getType())
    return false;
  Dest->setRematerializable(Src0Var->getRegNum(), Src0Var->getStackOffset());
  return true;
}

} // end of anonymous namespace

/// Scan the function to find additional rematerializable variables. This is
/// possible when the source operand of an InstAssignment is a
/// rematerializable variable, or the same for a pointer-type
/// InstCast::Bitcast, or when an InstArithmetic is an add of a
/// rematerializable variable and an immediate. Note that InstAssignment
/// instructions and pointer-type InstCast::Bitcast instructions generally
/// only come about from the IceConverter's treatment of inttoptr, ptrtoint,
/// and bitcast instructions. TODO(stichnot): Consider other possibilities,
/// however unlikely, such as InstArithmetic::Sub, or commutativity.
void Cfg::findRematerializable() {
  // Scan the instructions in order, and repeat until no new opportunities
  // are found. It may take more than one iteration because a variable's
  // defining block may happen to come after a block where it is used,
  // depending on the CfgNode linearization order.
  bool FoundNewAssignment;
  do {
    FoundNewAssignment = false;
    for (CfgNode *Node : getNodes()) {
      // No need to process Phi instructions.
      for (Inst &Instr : Node->getInsts()) {
        if (Instr.isDeleted())
          continue;
        Variable *Dest = Instr.getDest();
        if (Dest == nullptr || Dest->isRematerializable())
          continue;
        if (rematerializeArithmetic(&Instr) || rematerializeAssign(&Instr) ||
            rematerializeCast(&Instr)) {
          FoundNewAssignment = true;
        }
      }
    }
  } while (FoundNewAssignment);
}

void Cfg::doAddressOpt() {
  TimerMarker T(TimerStack::TT_doAddressOpt, this);
  for (CfgNode *Node : Nodes)
    Node->doAddressOpt();
}

namespace {

// ShuffleVectorUtils implements helper functions for rematerializing
// shufflevector instructions from a sequence of extractelement/insertelement
// instructions. It looks for the following pattern:
//
// %t0 = extractelement A, %n0
// %t1 = extractelement B, %n1
// %t2 = extractelement C, %n2
// ...
// %tN = extractelement N, %nN
// %d0 = insertelement undef, %t0, 0
// %d1 = insertelement %d0, %t1, 1
// %d2 = insertelement %d1, %t2, 2
// ...
// %dest = insertelement %d_N-1, %tN, N
//
// where N is num_element(typeof(%dest)), and A, B, C, ... N are at most two
// distinct variables.
namespace ShuffleVectorUtils {

// findAllInserts is used when searching for all the insertelements that are
// used in a shufflevector operation. This function works recursively; when
// invoked with I = i, the function assumes Insts[i] is the last found
// insertelement in the chain. The next insertelement instruction is saved
// in Insts[i+1].
bool findAllInserts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                    CfgVector<const Inst *> *Insts, SizeT I = 0) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  if (I > Insts->size()) {
    if (Verbose) {
      Ctx->getStrDump() << "\tToo many inserts.\n";
    }
    return false;
  }

  const auto *LastInsert = Insts->at(I);
  assert(llvm::isa<InstInsertElement>(LastInsert));

  if (I == Insts->size() - 1) {
    // Matching against undef is not really needed because the value in
    // Src(0) will be totally overwritten. We still enforce it anyway because
    // the PNaCl toolchain generates the bitcode with it.
    if (!llvm::isa<ConstantUndef>(LastInsert->getSrc(0))) {
      if (Verbose) {
        Ctx->getStrDump() << "\tSrc0 is not undef: " << I << " "
                          << Insts->size();
        LastInsert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    // The following loop ensures that the insertelements are sorted. In
    // theory, we could relax this restriction and allow any order. As long
    // as each index appears exactly once, this chain is still a candidate
    // for becoming a shufflevector. The Insts vector is traversed backwards
    // because the instructions are "enqueued" in reverse order.
    int32_t ExpectedElement = 0;
    for (const auto *I : reverse_range(*Insts)) {
      if (llvm::cast<ConstantInteger32>(I->getSrc(2))->getValue() !=
          ExpectedElement) {
        return false;
      }
      ++ExpectedElement;
    }
    return true;
  }

  const auto *Src0V = llvm::cast<Variable>(LastInsert->getSrc(0));
  const auto *Def = VM->getSingleDefinition(Src0V);

  // Only optimize if the first operand in
  //
  //   Dest = insertelement A, B, 10
  //
  // is singly-def'ed.
  if (Def == nullptr) {
    if (Verbose) {
      Ctx->getStrDump() << "\tmulti-def: ";
      (*Insts)[I]->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // We also require the (single) definition to come from an insertelement
  // instruction.
  if (!llvm::isa<InstInsertElement>(Def)) {
    if (Verbose) {
      Ctx->getStrDump() << "\tnot insert element: ";
      Def->dump(Func);
      Ctx->getStrDump() << "\n";
    }
    return false;
  }

  // Everything seems fine, so we save Def in Insts, and delegate the
  // decision to findAllInserts.
  (*Insts)[I + 1] = Def;
  return findAllInserts(Func, Ctx, VM, Insts, I + 1);
}

// insertsLastElement returns true if Insert is inserting an element in the
// last position of a vector.
bool insertsLastElement(const Inst &Insert) {
  const Type DestTy = Insert.getDest()->getType();
  assert(isVectorType(DestTy));
  const SizeT Elem =
      llvm::cast<ConstantInteger32>(Insert.getSrc(2))->getValue();
  return Elem == typeNumElements(DestTy) - 1;
}

// findAllExtracts goes over all the insertelement instructions that are
// candidates to be replaced by a shufflevector, and searches for all the
// definitions of the elements being inserted.
// If all of the elements are the result of an extractelement instruction,
// and all of the extractelements operate on at most two different sources,
// then the instructions can be replaced by a shufflevector.
bool findAllExtracts(Cfg *Func, GlobalContext *Ctx, VariablesMetadata *VM,
                     const CfgVector<const Inst *> &Insts, Variable **Src0,
                     Variable **Src1, CfgVector<const Inst *> *Extracts) {
  const bool Verbose = BuildDefs::dump() && Func->isVerbose(IceV_ShufMat);

  *Src0 = nullptr;
  *Src1 = nullptr;
  assert(Insts.size() > 0);
  for (SizeT I = 0; I < Insts.size(); ++I) {
    const auto *Insert = Insts.at(I);
    const auto *Src1V = llvm::dyn_cast<Variable>(Insert->getSrc(1));
    if (Src1V == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "src(1) is not a variable: ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    const auto *Def = VM->getSingleDefinition(Src1V);
    if (Def == nullptr) {
      if (Verbose) {
        Ctx->getStrDump() << "multi-def src(1): ";
        Insert->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    if (!llvm::isa<InstExtractElement>(Def)) {
      if (Verbose) {
        Ctx->getStrDump() << "not extractelement: ";
        Def->dump(Func);
        Ctx->getStrDump() << "\n";
      }
      return false;
    }

    auto *Src = llvm::cast<Variable>(Def->getSrc(0));
    if (*Src0 == nullptr) {
      // No sources yet. Save Src to Src0.
      *Src0 = Src;
    } else if (*Src1 == nullptr) {
      // We already have a source, so we might save Src in Src1 -- but only
      // if Src0 is not Src.
      if (*Src0 != Src) {
        *Src1 = Src;
      }
    } else if (Src != *Src0 && Src != *Src1) {
      // More than two sources, so we can't rematerialize the shufflevector
      // instruction.
      if (Verbose) {
        Ctx->getStrDump() << "Can't shuffle more than two sources.\n";
      }
      return false;
    }

    (*Extracts)[I] = Def;
  }

  // We should have seen at least one source operand.
  assert(*Src0 != nullptr);

  // If a second source was not seen, then we just make Src1 = Src0 to
  // simplify things downstream. This should not matter, as all of the
  // indexes in the shufflevector instruction will point to Src0.
  if (*Src1 == nullptr) {
    *Src1 = *Src0;
  }

  return true;
}

} // end of namespace ShuffleVectorUtils

} // end of anonymous namespace

void Cfg::materializeVectorShuffles() {
  const bool Verbose = BuildDefs::dump() && isVerbose(IceV_ShufMat);

  std::unique_ptr<OstreamLocker> L;
  if (Verbose) {
    L.reset(new OstreamLocker(getContext()));
    getContext()->getStrDump() << "\nShuffle materialization:\n";
  }

  // MaxVectorElements is the maximum number of elements in the vector types
  // handled by Subzero. We use it to create the Inserts and Extracts vectors
  // with the appropriate size, thus avoiding resize() calls.
  const SizeT MaxVectorElements = typeNumElements(IceType_v16i8);
  CfgVector<const Inst *> Inserts(MaxVectorElements);
  CfgVector<const Inst *> Extracts(MaxVectorElements);

  TimerMarker T(TimerStack::TT_materializeVectorShuffles, this);
  for (CfgNode *Node : Nodes) {
    for (auto &Instr : Node->getInsts()) {
      if (!llvm::isa<InstInsertElement>(Instr)) {
        continue;
      }
      if (!ShuffleVectorUtils::insertsLastElement(Instr)) {
        // To avoid wasting time, we only start the pattern match at the last
        // insertelement instruction -- and go backwards from there.
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump() << "\tCandidate: ";
        Instr.dump(this);
        getContext()->getStrDump() << "\n";
      }
      Inserts.resize(typeNumElements(Instr.getDest()->getType()));
      Inserts[0] = &Instr;
      if (!ShuffleVectorUtils::findAllInserts(this, getContext(),
                                              VMetadata.get(), &Inserts)) {
        // If we fail to find a sequence of insertelements, we stop the
        // optimization.
        if (Verbose) {
          getContext()->getStrDump() << "\tFalse alarm.\n";
        }
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump() << "\tFound the following insertelement: \n";
        for (auto *I : reverse_range(Inserts)) {
          getContext()->getStrDump() << "\t\t";
          I->dump(this);
          getContext()->getStrDump() << "\n";
        }
      }
      Extracts.resize(Inserts.size());
      Variable *Src0;
      Variable *Src1;
      if (!ShuffleVectorUtils::findAllExtracts(this, getContext(),
                                               VMetadata.get(), Inserts, &Src0,
                                               &Src1, &Extracts)) {
        // If we fail to match the definitions of the insertelements' sources
        // with extractelement instructions -- or if those instructions
        // operate on more than two different variables -- we stop the
        // optimization.
        if (Verbose) {
          getContext()->getStrDump() << "\tFailed to match extractelements.\n";
        }
        continue;
      }
      if (Verbose) {
        getContext()->getStrDump()
            << "\tFound the following insert/extract element pairs: \n";
        for (SizeT I = 0; I < Inserts.size(); ++I) {
          const SizeT Pos = Inserts.size() - I - 1;
          getContext()->getStrDump() << "\t\tInsert : ";
          Inserts[Pos]->dump(this);
          getContext()->getStrDump() << "\n\t\tExtract: ";
          Extracts[Pos]->dump(this);
          getContext()->getStrDump() << "\n";
        }
      }

      assert(Src0 != nullptr);
      assert(Src1 != nullptr);

      auto *ShuffleVector =
          InstShuffleVector::create(this, Instr.getDest(), Src0, Src1);
      assert(ShuffleVector->getSrc(0) == Src0);
      assert(ShuffleVector->getSrc(1) == Src1);
      for (SizeT I = 0; I < Extracts.size(); ++I) {
        const SizeT Pos = Extracts.size() - I - 1;
        auto *Index = llvm::cast<ConstantInteger32>(Extracts[Pos]->getSrc(1));
        if (Src0 == Extracts[Pos]->getSrc(0)) {
          ShuffleVector->addIndex(Index);
        } else {
          ShuffleVector->addIndex(llvm::cast<ConstantInteger32>(
              Ctx->getConstantInt32(Index->getValue() + Extracts.size())));
        }
      }

      if (Verbose) {
        getContext()->getStrDump() << "Created: ";
        ShuffleVector->dump(this);
        getContext()->getStrDump() << "\n";
      }

      Instr.setDeleted();
      auto &LoweringContext = getTarget()->getContext();
      LoweringContext.setInsertPoint(instToIterator(&Instr));
      LoweringContext.insert(ShuffleVector);
    }
  }
}

void Cfg::doNopInsertion() {
  if (!getFlags().getShouldDoNopInsertion())
    return;
  TimerMarker T(TimerStack::TT_doNopInsertion, this);
  RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_NopInsertion,
                            SequenceNumber);
  for (CfgNode *Node : Nodes)
    Node->doNopInsertion(RNG);
}

void Cfg::genCode() {
  TimerMarker T(TimerStack::TT_genCode, this);
  for (CfgNode *Node : Nodes)
    Node->genCode();
}

// Compute the stack frame layout.
void Cfg::genFrame() {
  TimerMarker T(TimerStack::TT_genFrame, this);
  getTarget()->addProlog(Entry);
  for (CfgNode *Node : Nodes)
    if (Node->getHasReturn())
      getTarget()->addEpilog(Node);
}

void Cfg::generateLoopInfo() {
  TimerMarker T(TimerStack::TT_computeLoopNestDepth, this);
  LoopInfo = ComputeLoopInfo(this);
}

// This is a lightweight version of live-range-end calculation. Marks the
// last use of only those variables whose definition and uses are completely
// within a single block. It is a quick single pass and doesn't need to
// iterate until convergence.
void Cfg::livenessLightweight() {
  TimerMarker T(TimerStack::TT_livenessLightweight, this);
  getVMetadata()->init(VMK_Uses);
  for (CfgNode *Node : Nodes)
    Node->livenessLightweight();
}
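// A sketch of the fixed-point iteration in liveness() below, for a back
// edge B2 -> B1: processing the nodes in reverse order computes B2's
// live-in first; if B1's live-in then changes, B1's predecessors (including
// B2) are re-queued, so liveness propagates around the loop until no
// live-in set changes.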
void Cfg::liveness(LivenessMode Mode) {
  TimerMarker T(TimerStack::TT_liveness, this);
  // Destroying the previous (if any) Liveness information clears the
  // Liveness allocator TLS pointer.
  Live = nullptr;
  Live = Liveness::create(this, Mode);
  getVMetadata()->init(VMK_Uses);
  Live->init();
  // Initialize with all nodes needing to be processed.
  BitVector NeedToProcess(Nodes.size(), true);
  while (NeedToProcess.any()) {
    // Iterate in reverse topological order to speed up convergence.
    for (CfgNode *Node : reverse_range(Nodes)) {
      if (NeedToProcess[Node->getIndex()]) {
        NeedToProcess[Node->getIndex()] = false;
        bool Changed = Node->liveness(getLiveness());
        if (Changed) {
          // If the beginning-of-block liveness changed since the last
          // iteration, mark all in-edges as needing to be processed.
          for (CfgNode *Pred : Node->getInEdges())
            NeedToProcess[Pred->getIndex()] = true;
        }
      }
    }
  }
  if (Mode == Liveness_Intervals) {
    // Reset each variable's live range.
    for (Variable *Var : Variables)
      Var->resetLiveRange();
  }
  // Make a final pass over each node to delete dead instructions, collect
  // the first and last instruction numbers, and add live range segments for
  // that node.
  for (CfgNode *Node : Nodes) {
    InstNumberT FirstInstNum = Inst::NumberSentinel;
    InstNumberT LastInstNum = Inst::NumberSentinel;
    for (Inst &I : Node->getPhis()) {
      I.deleteIfDead();
      if (Mode == Liveness_Intervals && !I.isDeleted()) {
        if (FirstInstNum == Inst::NumberSentinel)
          FirstInstNum = I.getNumber();
        assert(I.getNumber() > LastInstNum);
        LastInstNum = I.getNumber();
      }
    }
    for (Inst &I : Node->getInsts()) {
      I.deleteIfDead();
      if (Mode == Liveness_Intervals && !I.isDeleted()) {
        if (FirstInstNum == Inst::NumberSentinel)
          FirstInstNum = I.getNumber();
        assert(I.getNumber() > LastInstNum);
        LastInstNum = I.getNumber();
      }
    }
    if (Mode == Liveness_Intervals) {
      // Special treatment for live in-args. Their liveness needs to extend
      // beyond the beginning of the function, otherwise an arg whose only
      // use is in the first instruction will end up having the trivial live
      // range [2,2) and will *not* interfere with other arguments. So if the
      // first instruction of the method is "r=arg1+arg2", both args may be
      // assigned the same register. This is accomplished by extending the
      // entry block's instruction range from [2,n) to [1,n) which will
      // transform the problematic [2,2) live ranges into [1,2). This
      // extension works because the entry node is guaranteed to have the
      // lowest instruction numbers.
      if (Node == getEntryNode()) {
        FirstInstNum = Inst::NumberExtended;
        // Just in case the entry node somehow contains no instructions...
        if (LastInstNum == Inst::NumberSentinel)
          LastInstNum = FirstInstNum;
      }
      // If this node somehow contains no instructions, don't bother trying
      // to add liveness intervals for it, because variables that are live-in
      // and live-out will have a bogus interval added.
      if (FirstInstNum != Inst::NumberSentinel)
        Node->livenessAddIntervals(getLiveness(), FirstInstNum, LastInstNum);
    }
  }
}

// Traverse every Variable of every Inst and verify that it appears within
// the Variable's computed live range.
bool Cfg::validateLiveness() const {
  TimerMarker T(TimerStack::TT_validateLiveness, this);
  bool Valid = true;
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrDump();
  for (CfgNode *Node : Nodes) {
    Inst *FirstInst = nullptr;
    for (Inst &Instr : Node->getInsts()) {
      if (Instr.isDeleted())
        continue;
      if (FirstInst == nullptr)
        FirstInst = &Instr;
      InstNumberT InstNumber = Instr.getNumber();
      if (Variable *Dest = Instr.getDest()) {
        if (!Dest->getIgnoreLiveness()) {
          bool Invalid = false;
          constexpr bool IsDest = true;
          if (!Dest->getLiveRange().containsValue(InstNumber, IsDest))
            Invalid = true;
          // Check that this instruction actually *begins* Dest's live range,
          // by checking that Dest is not live in the previous instruction.
          // As a special exception, we don't check this for the first
          // instruction of the block, because a Phi temporary may be live at
          // the end of the previous block, and if it is also assigned in the
          // first instruction of this block, the adjacent live ranges get
          // merged.
          if (&Instr != FirstInst && !Instr.isDestRedefined() &&
              Dest->getLiveRange().containsValue(InstNumber - 1, IsDest))
            Invalid = true;
          if (Invalid) {
            Valid = false;
            Str << "Liveness error: inst " << Instr.getNumber() << " dest ";
            Dest->dump(this);
            Str << " live range " << Dest->getLiveRange() << "\n";
          }
        }
      }
      FOREACH_VAR_IN_INST(Var, Instr) {
        static constexpr bool IsDest = false;
        if (!Var->getIgnoreLiveness() &&
            !Var->getLiveRange().containsValue(InstNumber, IsDest)) {
          Valid = false;
          Str << "Liveness error: inst " << Instr.getNumber() << " var ";
          Var->dump(this);
          Str << " live range " << Var->getLiveRange() << "\n";
        }
      }
    }
  }
  return Valid;
}

void Cfg::contractEmptyNodes() {
  // If we're decorating the asm output with register liveness info, this
  // information may become corrupted or incorrect after contracting nodes
  // that contain only redundant assignments. As such, we disable this pass
  // when DecorateAsm is specified. This may make the resulting code look
  // more branchy, but it should have no effect on the register assignments.
  if (getFlags().getDecorateAsm())
    return;
  for (CfgNode *Node : Nodes) {
    Node->contractIfEmpty();
  }
}

void Cfg::doBranchOpt() {
  TimerMarker T(TimerStack::TT_doBranchOpt, this);
  for (auto I = Nodes.begin(), E = Nodes.end(); I != E; ++I) {
    auto NextNode = I + 1;
    (*I)->doBranchOpt(NextNode == E ? nullptr : *NextNode);
  }
}

void Cfg::markNodesForSandboxing() {
  for (const InstJumpTable *JT : JumpTables)
    for (SizeT I = 0; I < JT->getNumTargets(); ++I)
      JT->getTarget(I)->setNeedsAlignment();
}

// ======================== Dump routines ======================== //
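// A sketch of what emitTextHeader() below produces for a global function
// "foo" (the exact directives and bundle padding bytes vary by target and
// assembler; these values are illustrative):
//   .text
//   .globl  foo
//   .type   foo,%function
//   .p2align        5,0xf4
//   foo: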
// emitTextHeader() is not target-specific (apart from what is abstracted by
// the Assembler), so it is defined here rather than in the target lowering
// class.
void Cfg::emitTextHeader(GlobalString Name, GlobalContext *Ctx,
                         const Assembler *Asm) {
  if (!BuildDefs::dump())
    return;
  Ostream &Str = Ctx->getStrEmit();
  Str << "\t.text\n";
  if (getFlags().getFunctionSections())
    Str << "\t.section\t.text." << Name << ",\"ax\",%progbits\n";
  if (!Asm->getInternal() || getFlags().getDisableInternal()) {
    Str << "\t.globl\t" << Name << "\n";
    Str << "\t.type\t" << Name << ",%function\n";
  }
  Str << "\t" << Asm->getAlignDirective() << " "
      << Asm->getBundleAlignLog2Bytes() << ",0x";
  for (uint8_t I : Asm->getNonExecBundlePadding())
    Str.write_hex(I);
  Str << "\n";
  Str << Name << ":\n";
}

void Cfg::emitJumpTables() {
  switch (getFlags().getOutFileType()) {
  case FT_Elf:
  case FT_Iasm: {
    // The emission needs to be delayed until after the text section, so we
    // save the offsets in the global context.
    for (const InstJumpTable *JumpTable : JumpTables) {
      Ctx->addJumpTableData(JumpTable->toJumpTableData(getAssembler()));
    }
  } break;
  case FT_Asm: {
    // Emit the assembly directly, so we don't need to hang on to all the
    // names.
    for (const InstJumpTable *JumpTable : JumpTables)
      getTarget()->emitJumpTable(this, JumpTable);
  } break;
  }
}

void Cfg::emit() {
  if (!BuildDefs::dump())
    return;
  TimerMarker T(TimerStack::TT_emitAsm, this);
  if (getFlags().getDecorateAsm()) {
    renumberInstructions();
    getVMetadata()->init(VMK_Uses);
    liveness(Liveness_Basic);
    dump("After recomputing liveness for -decorate-asm");
  }
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrEmit();
  const Assembler *Asm = getAssembler<>();
  const bool NeedSandboxing = getFlags().getUseSandboxing();

  emitTextHeader(FunctionName, Ctx, Asm);
  if (getFlags().getDecorateAsm()) {
    for (Variable *Var : getVariables()) {
      if (Var->hasKnownStackOffset() && !Var->isRematerializable()) {
        Str << "\t" << Var->getSymbolicStackOffset() << " = "
            << Var->getStackOffset() << "\n";
      }
    }
  }
  for (CfgNode *Node : Nodes) {
    if (NeedSandboxing && Node->needsAlignment()) {
      Str << "\t" << Asm->getAlignDirective() << " "
          << Asm->getBundleAlignLog2Bytes() << "\n";
    }
    Node->emit(this);
  }
  emitJumpTables();
  Str << "\n";
}

void Cfg::emitIAS() {
  TimerMarker T(TimerStack::TT_emitAsm, this);
  // The emitIAS() routines emit into the internal assembler buffer, so
  // there's no need to lock the streams.
  const bool NeedSandboxing = getFlags().getUseSandboxing();
  for (CfgNode *Node : Nodes) {
    if (NeedSandboxing && Node->needsAlignment())
      getAssembler()->alignCfgNode();
    Node->emitIAS(this);
  }
  emitJumpTables();
}

size_t Cfg::getTotalMemoryMB() const {
  constexpr size_t _1MB = 1024 * 1024;
  assert(Allocator != nullptr);
  assert(CfgAllocatorTraits::current() == Allocator.get());
  return Allocator->getTotalMemory() / _1MB;
}

size_t Cfg::getLivenessMemoryMB() const {
  constexpr size_t _1MB = 1024 * 1024;
  if (Live == nullptr) {
    return 0;
  }
  return Live->getAllocator()->getTotalMemory() / _1MB;
}

// Dumps the IR with an optional introductory message.
void Cfg::dump(const char *Message) {
  if (!BuildDefs::dump())
    return;
  if (!isVerbose())
    return;
  OstreamLocker L(Ctx);
  Ostream &Str = Ctx->getStrDump();
  if (Message[0])
    Str << "================ " << Message << " ================\n";
  if (isVerbose(IceV_Mem)) {
    Str << "Memory size = " << getTotalMemoryMB() << " MB\n";
  }
  setCurrentNode(getEntryNode());
  // Print function name+args.
  if (isVerbose(IceV_Instructions)) {
    Str << "define ";
    if (getInternal() && !getFlags().getDisableInternal())
      Str << "internal ";
    Str << ReturnType << " @" << getFunctionName() << "(";
    for (SizeT i = 0; i < Args.size(); ++i) {
      if (i > 0)
        Str << ", ";
      Str << Args[i]->getType() << " ";
      Args[i]->dump(this);
    }
    // Append an extra copy of the function name here, in order to print its
    // size stats but not mess up lit tests.
    Str << ") { # " << getFunctionNameAndSize() << "\n";
  }
  resetCurrentNode();
  if (isVerbose(IceV_Liveness)) {
    // Print summary info about variables.
    for (Variable *Var : Variables) {
      Str << "// multiblock=";
      if (getVMetadata()->isTracked(Var))
        Str << getVMetadata()->isMultiBlock(Var);
      else
        Str << "?";
      Str << " defs=";
      bool FirstPrint = true;
      if (VMetadata->getKind() != VMK_Uses) {
        if (const Inst *FirstDef = VMetadata->getFirstDefinition(Var)) {
          Str << FirstDef->getNumber();
          FirstPrint = false;
        }
      }
      if (VMetadata->getKind() == VMK_All) {
        for (const Inst *Instr : VMetadata->getLatterDefinitions(Var)) {
          if (!FirstPrint)
            Str << ",";
          Str << Instr->getNumber();
          FirstPrint = false;
        }
      }
      Str << " weight=" << Var->getWeight(this) << " ";
      Var->dump(this);
      Str << " LIVE=" << Var->getLiveRange() << "\n";
    }
  }
  // Print each basic block.
  for (CfgNode *Node : Nodes)
    Node->dump(this);
  if (isVerbose(IceV_Instructions))
    Str << "}\n";
}

} // end of namespace Ice