//===- subzero/src/IceGlobalContext.cpp - Global context defs -------------===//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Defines aspects of the compilation that persist across multiple
/// functions.
///
//===----------------------------------------------------------------------===//

#include "IceGlobalContext.h"

#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IceRevision.h"
#include "IceTargetLowering.h"
#include "IceTimerTree.h"
#include "IceTypes.def"
#include "IceTypes.h"

#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-parameter"
#endif // __clang__

#include "llvm/Support/Timer.h"

#ifdef __clang__
#pragma clang diagnostic pop
#endif // __clang__

#include <algorithm> // max()
#include <cstring>   // memcmp(), memcpy()

namespace std {
template <> struct hash<Ice::RelocatableTuple> {
  size_t operator()(const Ice::RelocatableTuple &Key) const {
    // Use the hash of the relocatable's name, plus the hash of the number of
    // OffsetExprs combined with the known, fixed offset for the reloc. The
    // known, fixed offset is left-shifted by 5 to minimize the interaction
    // between the bits of OffsetExpr.size() and Key.Offset.
    return hash<Ice::SizeT>()(Key.Name.getID()) +
           hash<std::size_t>()(Key.OffsetExpr.size() + (Key.Offset << 5));
  }
};
} // end of namespace std

namespace Ice {

namespace {

// Define the key comparison function for the constant pool's unordered_map,
// but only for key types of interest: integer types, floating point types, and
// the special RelocatableTuple.
template <typename KeyType, class Enable = void> struct KeyCompare {};

template <typename KeyType>
struct KeyCompare<KeyType,
                  typename std::enable_if<
                      std::is_integral<KeyType>::value ||
                      std::is_same<KeyType, RelocatableTuple>::value>::type> {
  bool operator()(const KeyType &Value1, const KeyType &Value2) const {
    return Value1 == Value2;
  }
};
template <typename KeyType>
struct KeyCompare<KeyType, typename std::enable_if<
                               std::is_floating_point<KeyType>::value>::type> {
  bool operator()(const KeyType &Value1, const KeyType &Value2) const {
    return !memcmp(&Value1, &Value2, sizeof(KeyType));
  }
};
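// Note: the bitwise comparison above is deliberate. Under operator==, +0.0
// compares equal to -0.0 even though they must pool as distinct constants,
// and a NaN never compares equal to itself even though identical NaN bit
// patterns should share a single pool entry.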

// Define a key comparison function for sorting the constant pool's values
// after they are dumped to a vector. This covers integer types, floating point
// types, and ConstantRelocatable values.
template <typename ValueType, class Enable = void> struct KeyCompareLess {};

template <typename ValueType>
struct KeyCompareLess<ValueType,
                      typename std::enable_if<std::is_floating_point<
                          typename ValueType::PrimType>::value>::type> {
  bool operator()(const Constant *Const1, const Constant *Const2) const {
    using CompareType = uint64_t;
    static_assert(sizeof(typename ValueType::PrimType) <= sizeof(CompareType),
                  "Expected a floating-point type no wider than 64 bits");
    typename ValueType::PrimType V1 = llvm::cast<ValueType>(Const1)->getValue();
    typename ValueType::PrimType V2 = llvm::cast<ValueType>(Const2)->getValue();
    // Avoid "V1 < V2" because of NaN, and avoid
    // "memcmp(&V1, &V2, sizeof(V1)) < 0" because it depends on the endianness
    // of the host running Subzero. Instead, compare the bit patterns after
    // copying them into a CompareType.
    CompareType I1 = 0, I2 = 0;
    memcpy(&I1, &V1, sizeof(V1));
    memcpy(&I2, &V2, sizeof(V2));
    return I1 < I2;
  }
};
template <typename ValueType>
struct KeyCompareLess<ValueType,
                      typename std::enable_if<std::is_integral<
                          typename ValueType::PrimType>::value>::type> {
  bool operator()(const Constant *Const1, const Constant *Const2) const {
    typename ValueType::PrimType V1 = llvm::cast<ValueType>(Const1)->getValue();
    typename ValueType::PrimType V2 = llvm::cast<ValueType>(Const2)->getValue();
    return V1 < V2;
  }
};
template <typename ValueType>
struct KeyCompareLess<
    ValueType, typename std::enable_if<
                   std::is_same<ValueType, ConstantRelocatable>::value>::type> {
  bool operator()(const Constant *Const1, const Constant *Const2) const {
    auto *V1 = llvm::cast<ValueType>(Const1);
    auto *V2 = llvm::cast<ValueType>(Const2);
    if (V1->getName() == V2->getName())
      return V1->getOffset() < V2->getOffset();
    return V1->getName() < V2->getName();
  }
};

// TypePool maps constants of type KeyType (e.g. float) to pointers to
// type ValueType (e.g. ConstantFloat).
template <Type Ty, typename KeyType, typename ValueType> class TypePool {
  TypePool(const TypePool &) = delete;
  TypePool &operator=(const TypePool &) = delete;

public:
  TypePool() = default;
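  // Returns the pooled constant for Key, creating and caching it on first
  // use. The constant's lookup count is bumped on every call.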
  ValueType *getOrAdd(GlobalContext *Ctx, KeyType Key) {
    auto Iter = Pool.find(Key);
    if (Iter != Pool.end()) {
      Iter->second->updateLookupCount();
      return Iter->second;
    }
    auto *Result = ValueType::create(Ctx, Ty, Key);
    Pool[Key] = Result;
    Result->updateLookupCount();
    return Result;
  }
  ConstantList getConstantPool() const {
    ConstantList Constants;
    Constants.reserve(Pool.size());
    for (auto &I : Pool)
      Constants.push_back(I.second);
    // The sort (and its KeyCompareLess machinery) is not strictly necessary,
    // but is desirable for producing output that is deterministic across
    // unordered_map::iterator implementations.
    std::sort(Constants.begin(), Constants.end(), KeyCompareLess<ValueType>());
    return Constants;
  }
  size_t size() const { return Pool.size(); }

private:
  // Use the default hash function, and a custom key comparison function. The
  // key comparison function for floating-point keys can't use the default
  // ==-based implementation because of IEEE semantics regarding +0.0, -0.0,
  // and NaN comparison. However, it's OK to use the default hash for
  // floating-point values because KeyCompare is the final source of truth: in
  // the worst case a "false" collision just gets resolved by comparison.
  using ContainerType =
      std::unordered_map<KeyType, ValueType *, std::hash<KeyType>,
                         KeyCompare<KeyType>>;
  ContainerType Pool;
};

// UndefPool maps ICE types to the corresponding ConstantUndef values.
class UndefPool {
  UndefPool(const UndefPool &) = delete;
  UndefPool &operator=(const UndefPool &) = delete;

public:
  UndefPool() : Pool(IceType_NUM) {}

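  // Lazily creates, caches, and returns the ConstantUndef for type Ty.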
  ConstantUndef *getOrAdd(GlobalContext *Ctx, Type Ty) {
    if (Pool[Ty] == nullptr)
      Pool[Ty] = ConstantUndef::create(Ctx, Ty);
    return Pool[Ty];
  }

private:
  std::vector<ConstantUndef *> Pool;
};

} // end of anonymous namespace

// The global constant pool bundles individual pools of each type of
// interest.
class ConstantPool {
  ConstantPool(const ConstantPool &) = delete;
  ConstantPool &operator=(const ConstantPool &) = delete;

public:
  ConstantPool() = default;
  TypePool<IceType_f32, float, ConstantFloat> Floats;
  TypePool<IceType_f64, double, ConstantDouble> Doubles;
  TypePool<IceType_i1, int8_t, ConstantInteger32> Integers1;
  TypePool<IceType_i8, int8_t, ConstantInteger32> Integers8;
  TypePool<IceType_i16, int16_t, ConstantInteger32> Integers16;
  TypePool<IceType_i32, int32_t, ConstantInteger32> Integers32;
  TypePool<IceType_i64, int64_t, ConstantInteger64> Integers64;
  TypePool<IceType_i32, RelocatableTuple, ConstantRelocatable> Relocatables;
  TypePool<IceType_i32, RelocatableTuple, ConstantRelocatable>
      ExternRelocatables;
  UndefPool Undefs;
};

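// Drains and joins the worker threads. The translation queue is notified to
// end first and the translation threads are joined; only then is the emit
// queue notified, so the emitter sees every translated item before its end
// marker. Afterwards, per-thread timers and stats are merged into the global
// accumulators. The WaitForWorkerThreadsCalled flag makes repeated calls
// harmless.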
void GlobalContext::waitForWorkerThreads() {
  if (WaitForWorkerThreadsCalled.exchange(true))
    return;
  optQueueNotifyEnd();
  for (std::thread &Worker : TranslationThreads) {
    Worker.join();
  }
  TranslationThreads.clear();

  // Only notify the emit queue to end after all the translation threads have
  // ended.
  emitQueueNotifyEnd();
  for (std::thread &Worker : EmitterThreads) {
    Worker.join();
  }
  EmitterThreads.clear();

  if (BuildDefs::timers()) {
    auto Timers = getTimers();
    for (ThreadContext *TLS : AllThreadContexts)
      Timers->mergeFrom(TLS->Timers);
  }
  if (BuildDefs::dump()) {
    // Do a separate loop over AllThreadContexts to avoid holding two locks at
    // once.
    auto Stats = getStatsCumulative();
    for (ThreadContext *TLS : AllThreadContexts)
      Stats->add(TLS->StatsCumulative);
  }
}

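// Dumps the collected code statistics in a pipe-delimited, grep-friendly
// format. A null Func selects the cumulative ("_FINAL_") statistics.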
void GlobalContext::CodeStats::dump(const Cfg *Func, GlobalContext *Ctx) {
  if (!BuildDefs::dump())
    return;
  OstreamLocker _(Ctx);
  Ostream &Str = Ctx->getStrDump();
  const std::string Name =
      (Func == nullptr ? "_FINAL_" : Func->getFunctionNameAndSize());
#define X(str, tag)                                                            \
  Str << "|" << Name << "|" str "|" << Stats[CS_##tag] << "\n";
  CODESTATS_TABLE
#undef X
  Str << "|" << Name << "|Spills+Fills|"
      << Stats[CS_NumSpills] + Stats[CS_NumFills] << "\n";
  Str << "|" << Name << "|Memory Usage     |";
  if (const auto MemUsed = static_cast<size_t>(
          llvm::TimeRecord::getCurrentTime(false).getMemUsed())) {
    static constexpr size_t _1MB = 1024 * 1024;
    Str << (MemUsed / _1MB) << " MB";
  } else {
    Str << "(requires '-track-memory')";
  }
  Str << "\n";
  Str << "|" << Name << "|CPool Sizes ";
  {
    auto Pool = Ctx->getConstPool();
    Str << "|f32=" << Pool->Floats.size();
    Str << "|f64=" << Pool->Doubles.size();
    Str << "|i1=" << Pool->Integers1.size();
    Str << "|i8=" << Pool->Integers8.size();
    Str << "|i16=" << Pool->Integers16.size();
    Str << "|i32=" << Pool->Integers32.size();
    Str << "|i64=" << Pool->Integers64.size();
    Str << "|Rel=" << Pool->Relocatables.size();
    Str << "|ExtRel=" << Pool->ExternRelocatables.size();
  }
  Str << "\n";
  if (Func != nullptr) {
    Str << "|" << Name << "|Cfg Memory       |" << Func->getTotalMemoryMB()
        << " MB\n";
    Str << "|" << Name << "|Liveness Memory  |" << Func->getLivenessMemoryMB()
        << " MB\n";
  }
}

namespace {

// By default, wake up the main parser thread when the OptQ gets half empty.
static constexpr size_t DefaultOptQWakeupSize = GlobalContext::MaxOptQSize >> 1;

} // end of anonymous namespace

GlobalContext::GlobalContext(Ostream *OsDump, Ostream *OsEmit, Ostream *OsError,
                             ELFStreamer *ELFStr)
    : Strings(new StringPool()), ConstPool(new ConstantPool()), ErrorStatus(),
      StrDump(OsDump), StrEmit(OsEmit), StrError(OsError), IntrinsicsInfo(this),
      ObjectWriter(),
      OptQWakeupSize(std::max(DefaultOptQWakeupSize,
                              size_t(getFlags().getNumTranslationThreads()))),
      OptQ(/*Sequential=*/getFlags().isSequential(),
           /*MaxSize=*/
           getFlags().isParseParallel()
               ? MaxOptQSize
               : getFlags().getNumTranslationThreads()),
      // EmitQ is allowed unlimited size.
      EmitQ(/*Sequential=*/getFlags().isSequential()),
      DataLowering(TargetDataLowering::createLowering(this)) {
  assert(OsDump && "OsDump is not defined for GlobalContext");
  assert(OsEmit && "OsEmit is not defined for GlobalContext");
  assert(OsError && "OsError is not defined for GlobalContext");
  // Make sure thread_local fields are properly initialized before any
  // accesses are made.  Do this here instead of at the start of
  // main() so that all clients (e.g. unit tests) can benefit for
  // free.
  GlobalContext::TlsInit();
  Cfg::TlsInit();
  Liveness::TlsInit();
  // Create a new ThreadContext for the current thread.  No need to
  // lock AllThreadContexts at this point since no other threads have
  // access yet to this GlobalContext object.
  ThreadContext *MyTLS = new ThreadContext();
  AllThreadContexts.push_back(MyTLS);
  ICE_TLS_SET_FIELD(TLS, MyTLS);
  // Pre-register built-in stack names.
  if (BuildDefs::timers()) {
    // TODO(stichnot): There needs to be a strong relationship between
    // the newTimerStackID() return values and TSK_Default/TSK_Funcs.
    newTimerStackID("Total across all functions");
    newTimerStackID("Per-function summary");
  }
  Timers.initInto(MyTLS->Timers);
  switch (getFlags().getOutFileType()) {
  case FT_Elf:
    ObjectWriter.reset(new ELFObjectWriter(*this, *ELFStr));
    break;
  case FT_Asm:
  case FT_Iasm:
    break;
  }
// Cache common constants up front.
#define X(tag, sizeLog2, align, elts, elty, str, rcstr)                        \
  ConstZeroForType[IceType_##tag] = getConstantZeroInternal(IceType_##tag);
  ICETYPE_TABLE;
#undef X
  ConstantTrue = getConstantInt1Internal(1);
// Define runtime helper functions.
#define X(Tag, Name)                                                           \
  RuntimeHelperFunc[static_cast<size_t>(RuntimeHelper::H_##Tag)] =             \
      getConstantExternSym(getGlobalString(Name));
  RUNTIME_HELPER_FUNCTIONS_TABLE
#undef X

  TargetLowering::staticInit(this);

  if (getFlags().getEmitRevision()) {
    // Embed the Subzero revision into the compiled binary by creating a special
    // global variable initialized with the revision string.
    auto *Revision = VariableDeclaration::create(&Globals, true);
    Revision->setName(this, "__Sz_revision");
    Revision->setIsConstant(true);
    const char *RevisionString = getSubzeroRevision();
    Revision->addInitializer(VariableDeclaration::DataInitializer::create(
        &Globals, RevisionString, 1 + strlen(RevisionString)));
    Globals.push_back(Revision);
  }
}

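// Thread-entry wrapper for a translation worker: installs the worker's
// ThreadContext into TLS before entering the translation loop.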
void GlobalContext::translateFunctionsWrapper(ThreadContext *MyTLS) {
  ICE_TLS_SET_FIELD(TLS, MyTLS);
  translateFunctions();
}

void GlobalContext::translateFunctions() {
  TimerMarker Timer(TimerStack::TT_translateFunctions, this);
  while (std::unique_ptr<OptWorkItem> OptItem = optQueueBlockingPop()) {
    std::unique_ptr<EmitterWorkItem> Item;
    auto Func = OptItem->getParsedCfg();
    // Install Func in TLS for Cfg-specific container allocators.
    CfgLocalAllocatorScope _(Func.get());
    // Reset per-function stats being accumulated in TLS.
    resetStats();
    // Set verbose level to none if the current function does NOT match the
    // -verbose-focus command-line option.
    if (!getFlags().matchVerboseFocusOn(Func->getFunctionName(),
                                        Func->getSequenceNumber()))
      Func->setVerbose(IceV_None);
    // Disable translation if -notranslate is specified, or if the current
    // function does not match the -translate-only option.  If translation is
    // disabled, just dump the high-level IR and continue.
    if (getFlags().getDisableTranslation() ||
        !getFlags().matchTranslateOnly(Func->getFunctionName(),
                                       Func->getSequenceNumber())) {
      Func->dump();
      // Add a dummy work item as a placeholder.  This maintains sequence
      // numbers so that the emitter thread will emit subsequent functions.
      Item = makeUnique<EmitterWorkItem>(Func->getSequenceNumber());
      emitQueueBlockingPush(std::move(Item));
      continue; // Func goes out of scope and gets deleted
    }

    Func->translate();
    if (Func->hasError()) {
      getErrorStatus()->assign(EC_Translation);
      OstreamLocker L(this);
      getStrError() << "ICE translation error: " << Func->getFunctionName()
                    << ": " << Func->getError() << ": "
                    << Func->getFunctionNameAndSize() << "\n";
      Item = makeUnique<EmitterWorkItem>(Func->getSequenceNumber());
    } else {
      Func->getAssembler<>()->setInternal(Func->getInternal());
      switch (getFlags().getOutFileType()) {
      case FT_Elf:
      case FT_Iasm: {
        Func->emitIAS();
        // The Cfg has already emitted into the assembly buffer, so
        // stats have been fully collected into this thread's TLS.
        // Dump them before TLS is reset for the next Cfg.
        if (BuildDefs::dump())
          dumpStats(Func.get());
        auto Asm = Func->releaseAssembler();
        // Copy relevant fields into Asm before Func is deleted.
        Asm->setFunctionName(Func->getFunctionName());
        Item = makeUnique<EmitterWorkItem>(Func->getSequenceNumber(),
                                           std::move(Asm));
        Item->setGlobalInits(Func->getGlobalInits());
      } break;
      case FT_Asm:
        // The Cfg has not been emitted yet, so stats are not ready
        // to be dumped.
        std::unique_ptr<VariableDeclarationList> GlobalInits =
            Func->getGlobalInits();
        Item = makeUnique<EmitterWorkItem>(Func->getSequenceNumber(),
                                           std::move(Func));
        Item->setGlobalInits(std::move(GlobalInits));
        break;
      }
    }
    assert(Item != nullptr);
    emitQueueBlockingPush(std::move(Item));
    // The Cfg now gets deleted as Func goes out of scope.
  }
}

namespace {

// Ensure Pending is large enough that Pending[Index] is valid.
void resizePending(std::vector<std::unique_ptr<EmitterWorkItem>> *Pending,
                   uint32_t Index) {
  if (Index >= Pending->size())
    Utils::reserveAndResize(*Pending, Index + 1);
}

} // end of anonymous namespace

// static
void GlobalContext::TlsInit() { ICE_TLS_INIT_FIELD(TLS); }

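// Emits the output file's header: the initial ELF header when producing an
// ELF object, otherwise the target-specific assembly file header.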
void GlobalContext::emitFileHeader() {
  TimerMarker T1(Ice::TimerStack::TT_emitAsm, this);
  if (getFlags().getOutFileType() == FT_Elf) {
    getObjectWriter()->writeInitialELFHeader();
  } else {
    if (!BuildDefs::dump()) {
      getStrError() << "emitFileHeader for non-ELF";
      getErrorStatus()->assign(EC_Translation);
    }
    TargetHeaderLowering::createLowering(this)->lower();
  }
}

void GlobalContext::lowerConstants() { DataLowering->lowerConstants(); }

void GlobalContext::lowerJumpTables() { DataLowering->lowerJumpTables(); }

void GlobalContext::emitTargetRODataSections() {
  DataLowering->emitTargetRODataSections();
}

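// Stashes pointers to the profiling globals before lowerGlobals() clears the
// Globals list, so that lowerProfileData() can still reference them.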
void GlobalContext::saveBlockInfoPtrs() {
  for (VariableDeclaration *Global : Globals) {
    if (Cfg::isProfileGlobal(*Global)) {
      ProfileBlockInfos.push_back(Global);
    }
  }
}

void GlobalContext::lowerGlobals(const std::string &SectionSuffix) {
  TimerMarker T(TimerStack::TT_emitGlobalInitializers, this);
  const bool DumpGlobalVariables =
      BuildDefs::dump() && (getFlags().getVerbose() & IceV_GlobalInit) &&
      getFlags().matchVerboseFocusOn("", 0);
  if (DumpGlobalVariables) {
    OstreamLocker L(this);
    Ostream &Stream = getStrDump();
    for (const Ice::VariableDeclaration *Global : Globals) {
      Global->dump(Stream);
    }
  }
  if (getFlags().getDisableTranslation())
    return;

  saveBlockInfoPtrs();
  // If we need to shuffle the layout of global variables, shuffle them now.
  if (getFlags().getReorderGlobalVariables()) {
    // Create a random number generator for global variable reordering.
    RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                              RPE_GlobalVariableReordering);
    RandomShuffle(Globals.begin(), Globals.end(),
                  [&RNG](int N) { return (uint32_t)RNG.next(N); });
  }

  if (!BuildDefs::minimal() && Instrumentor)
    Instrumentor->instrumentGlobals(Globals);

  DataLowering->lowerGlobals(Globals, SectionSuffix);
  if (ProfileBlockInfos.empty() && DisposeGlobalVariablesAfterLowering) {
    Globals.clearAndPurge();
  } else {
    Globals.clear();
  }
}

void GlobalContext::lowerProfileData() {
  // ProfileBlockInfoVarDecl starts out as nullptr, and this method is the only
  // place that assigns it. The assertion is thus a convoluted way of ensuring
  // lowerProfileData is invoked at most once.
  assert(ProfileBlockInfoVarDecl == nullptr);

  auto GlobalVariablePool = getInitializerAllocator();
  ProfileBlockInfoVarDecl =
      VariableDeclaration::createExternal(GlobalVariablePool.get());
  ProfileBlockInfoVarDecl->setAlignment(typeWidthInBytes(IceType_i64));
  ProfileBlockInfoVarDecl->setIsConstant(true);

  // Note: if you change this symbol, make sure to update
  // runtime/szrt_profiler.c as well.
  ProfileBlockInfoVarDecl->setName(this, "__Sz_block_profile_info");

  for (const VariableDeclaration *PBI : ProfileBlockInfos) {
    if (Cfg::isProfileGlobal(*PBI)) {
      constexpr RelocOffsetT BlockExecutionCounterOffset = 0;
      ProfileBlockInfoVarDecl->addInitializer(
          VariableDeclaration::RelocInitializer::create(
              GlobalVariablePool.get(), PBI,
              {RelocOffset::create(this, BlockExecutionCounterOffset)}));
    }
  }

  // This adds a 64-bit sentinel entry to the end of our array. For 32-bit
  // architectures this will waste 4 bytes.
  const SizeT Sizeof64BitNullPtr = typeWidthInBytes(IceType_i64);
  ProfileBlockInfoVarDecl->addInitializer(
      VariableDeclaration::ZeroInitializer::create(GlobalVariablePool.get(),
                                                   Sizeof64BitNullPtr));
  Globals.push_back(ProfileBlockInfoVarDecl);
  constexpr char ProfileDataSection[] = "$sz_profiler$";
  lowerGlobals(ProfileDataSection);
}

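// Thread-entry wrapper for the emitter thread: installs its ThreadContext
// into TLS before entering the emit loop.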
void GlobalContext::emitterWrapper(ThreadContext *MyTLS) {
  ICE_TLS_SET_FIELD(TLS, MyTLS);
  emitItems();
}

void GlobalContext::emitItems() {
  const bool Threaded = !getFlags().isSequential();
  // Pending is a vector containing the reassembled, ordered list of
  // work items.  When we're ready for the next item, we first check
  // whether it's in the Pending list.  If not, we take an item from
  // the work queue, and if it's not the item we're waiting for, we
  // insert it into Pending and repeat.  The work item is deleted
  // after it is processed.
  std::vector<std::unique_ptr<EmitterWorkItem>> Pending;
  uint32_t DesiredSequenceNumber = getFirstSequenceNumber();
  uint32_t ShuffleStartIndex = DesiredSequenceNumber;
  uint32_t ShuffleEndIndex = DesiredSequenceNumber;
  bool EmitQueueEmpty = false;
  const uint32_t ShuffleWindowSize =
      std::max(1u, getFlags().getReorderFunctionsWindowSize());
  bool Shuffle = Threaded && getFlags().getReorderFunctions();
  // Create a random number generator for function reordering.
  RandomNumberGenerator RNG(getFlags().getRandomSeed(), RPE_FunctionReordering);

  while (!EmitQueueEmpty) {
    resizePending(&Pending, DesiredSequenceNumber);
    // See if Pending contains DesiredSequenceNumber.
    if (Pending[DesiredSequenceNumber] == nullptr) {
      // We need to fetch an EmitterWorkItem from the queue.
      auto RawItem = emitQueueBlockingPop();
      if (RawItem == nullptr) {
        // This is the notifier for an empty queue.
        EmitQueueEmpty = true;
      } else {
        // We got an EmitterWorkItem; add it to Pending.
        uint32_t ItemSeq = RawItem->getSequenceNumber();
        if (Threaded && ItemSeq != DesiredSequenceNumber) {
          // Not the desired item; add it to Pending without advancing
          // DesiredSequenceNumber, and continue the loop without emitting.
          resizePending(&Pending, ItemSeq);
          Pending[ItemSeq] = std::move(RawItem);
          continue;
        }
        // ItemSeq == DesiredSequenceNumber, so store the item for emission
        // below. (When !Threaded, ItemSeq may legitimately differ from
        // DesiredSequenceNumber, and the item is emitted anyway.)
        Pending[DesiredSequenceNumber] = std::move(RawItem);
      }
    }
    const auto *CurrentWorkItem = Pending[DesiredSequenceNumber].get();

    // We have the desired EmitterWorkItem or nullptr as the end notifier.
    // If the emitter queue is not empty, increase DesiredSequenceNumber and
    // ShuffleEndIndex.
    if (!EmitQueueEmpty) {
      DesiredSequenceNumber++;
      ShuffleEndIndex++;
    }

    if (Shuffle) {
      // Keep fetching EmitterWorkItems while the emit queue is not empty, the
      // number of consecutive pending items is smaller than the window size,
      // and the current item is not of WI_GlobalInits kind. WI_GlobalInits
      // items are flushed immediately to avoid holding an arbitrarily large
      // GlobalDeclarationList.
      if (!EmitQueueEmpty &&
          ShuffleEndIndex - ShuffleStartIndex < ShuffleWindowSize &&
          CurrentWorkItem->getKind() != EmitterWorkItem::WI_GlobalInits)
        continue;

      // Shuffle the pending items in [ShuffleStartIndex, ShuffleEndIndex)
      // before they are emitted below.
      RandomShuffle(Pending.begin() + ShuffleStartIndex,
                    Pending.begin() + ShuffleEndIndex,
                    [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
    }

    // Emit the items in [ShuffleStartIndex, ShuffleEndIndex).
    for (uint32_t I = ShuffleStartIndex; I < ShuffleEndIndex; I++) {
      std::unique_ptr<EmitterWorkItem> Item = std::move(Pending[I]);

      switch (Item->getKind()) {
      case EmitterWorkItem::WI_Nop:
        break;
      case EmitterWorkItem::WI_GlobalInits: {
        accumulateGlobals(Item->getGlobalInits());
      } break;
      case EmitterWorkItem::WI_Asm: {
        lowerGlobalsIfNoCodeHasBeenSeen();
        accumulateGlobals(Item->getGlobalInits());

        std::unique_ptr<Assembler> Asm = Item->getAsm();
        Asm->alignFunction();
        GlobalString Name = Asm->getFunctionName();
        switch (getFlags().getOutFileType()) {
        case FT_Elf:
          getObjectWriter()->writeFunctionCode(Name, Asm->getInternal(),
                                               Asm.get());
          break;
        case FT_Iasm: {
          OstreamLocker L(this);
          Cfg::emitTextHeader(Name, this, Asm.get());
          Asm->emitIASBytes(this);
        } break;
        case FT_Asm:
          llvm::report_fatal_error("Unexpected FT_Asm");
          break;
        }
      } break;
      case EmitterWorkItem::WI_Cfg: {
        if (!BuildDefs::dump())
          llvm::report_fatal_error("WI_Cfg work item created inappropriately");
        lowerGlobalsIfNoCodeHasBeenSeen();
        accumulateGlobals(Item->getGlobalInits());

        assert(getFlags().getOutFileType() == FT_Asm);
        std::unique_ptr<Cfg> Func = Item->getCfg();
        // Unfortunately, we have to temporarily install the Cfg in TLS
        // because Variable::asType() uses the allocator to create the
        // differently-typed copy.
        CfgLocalAllocatorScope _(Func.get());
        Func->emit();
        dumpStats(Func.get());
      } break;
      }
    }
    // Update the start index for the next shuffle window.
    ShuffleStartIndex = ShuffleEndIndex;
  }

  // In case no code was generated, invoke the conditional lowerGlobals again
  // -- this is a no-op if code has already been emitted.
  lowerGlobalsIfNoCodeHasBeenSeen();
}

GlobalContext::~GlobalContext() {
  llvm::DeleteContainerPointers(AllThreadContexts);
  LockedPtr<DestructorArray> Dtors = getDestructors();
  // Destructors are invoked in the opposite object construction order.
  for (const auto &Dtor : reverse_range(*Dtors))
    Dtor();
}

void GlobalContext::dumpStrings() {
  if (!getFlags().getDumpStrings())
    return;
  OstreamLocker _(this);
  Ostream &Str = getStrDump();
  Str << "GlobalContext strings:\n";
  getStrings()->dump(Str);
}

void GlobalContext::dumpConstantLookupCounts() {
  if (!BuildDefs::dump())
    return;
  const bool DumpCounts = (getFlags().getVerbose() & IceV_ConstPoolStats) &&
                          getFlags().matchVerboseFocusOn("", 0);
  if (!DumpCounts)
    return;

  OstreamLocker _(this);
  Ostream &Str = getStrDump();
  Str << "Constant pool use stats: count+value+type\n";
#define X(WhichPool)                                                           \
  for (auto *C : getConstPool()->WhichPool.getConstantPool()) {                \
    Str << C->getLookupCount() << " ";                                         \
    C->dump(Str);                                                              \
    Str << " " << C->getType() << "\n";                                        \
  }
  X(Integers1);
  X(Integers8);
  X(Integers16);
  X(Integers32);
  X(Integers64);
  X(Floats);
  X(Doubles);
  X(Relocatables);
  X(ExternRelocatables);
#undef X
}

// TODO(stichnot): Consider adding thread-local caches of constant pool entries
// to reduce contention.

// All locking is done by the getConstantInt[0-9]+() target function.
Constant *GlobalContext::getConstantInt(Type Ty, int64_t Value) {
  switch (Ty) {
  case IceType_i1:
    return getConstantInt1(Value);
  case IceType_i8:
    return getConstantInt8(Value);
  case IceType_i16:
    return getConstantInt16(Value);
  case IceType_i32:
    return getConstantInt32(Value);
  case IceType_i64:
    return getConstantInt64(Value);
  default:
    llvm_unreachable("Bad integer type for getConstant");
  }
  return nullptr;
}

Constant *GlobalContext::getConstantInt1Internal(int8_t ConstantInt1) {
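  // Reduce the value to its low bit so that all true (and all false) i1
  // constants pool to a single entry.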
  ConstantInt1 &= INT8_C(1);
  return getConstPool()->Integers1.getOrAdd(this, ConstantInt1);
}

Constant *GlobalContext::getConstantInt8Internal(int8_t ConstantInt8) {
  return getConstPool()->Integers8.getOrAdd(this, ConstantInt8);
}

Constant *GlobalContext::getConstantInt16Internal(int16_t ConstantInt16) {
  return getConstPool()->Integers16.getOrAdd(this, ConstantInt16);
}

Constant *GlobalContext::getConstantInt32Internal(int32_t ConstantInt32) {
  return getConstPool()->Integers32.getOrAdd(this, ConstantInt32);
}

Constant *GlobalContext::getConstantInt64Internal(int64_t ConstantInt64) {
  return getConstPool()->Integers64.getOrAdd(this, ConstantInt64);
}

Constant *GlobalContext::getConstantFloat(float ConstantFloat) {
  return getConstPool()->Floats.getOrAdd(this, ConstantFloat);
}

Constant *GlobalContext::getConstantDouble(double ConstantDouble) {
  return getConstPool()->Doubles.getOrAdd(this, ConstantDouble);
}

Constant *GlobalContext::getConstantSymWithEmitString(
    const RelocOffsetT Offset, const RelocOffsetArray &OffsetExpr,
    GlobalString Name, const std::string &EmitString) {
  return getConstPool()->Relocatables.getOrAdd(
      this, RelocatableTuple(Offset, OffsetExpr, Name, EmitString));
}

Constant *GlobalContext::getConstantSym(RelocOffsetT Offset,
                                        GlobalString Name) {
  constexpr char EmptyEmitString[] = "";
  return getConstantSymWithEmitString(Offset, {}, Name, EmptyEmitString);
}

Constant *GlobalContext::getConstantExternSym(GlobalString Name) {
  constexpr RelocOffsetT Offset = 0;
  return getConstPool()->ExternRelocatables.getOrAdd(
      this, RelocatableTuple(Offset, {}, Name));
}

Constant *GlobalContext::getConstantUndef(Type Ty) {
  return getConstPool()->Undefs.getOrAdd(this, Ty);
}

Constant *GlobalContext::getConstantZero(Type Ty) {
  Constant *Zero = ConstZeroForType[Ty];
  if (Zero == nullptr)
    llvm::report_fatal_error("Unsupported constant type: " + typeStdString(Ty));
  return Zero;
}

// All locking is done by the getConstant*() target function.
Constant *GlobalContext::getConstantZeroInternal(Type Ty) {
  switch (Ty) {
  case IceType_i1:
    return getConstantInt1Internal(0);
  case IceType_i8:
    return getConstantInt8Internal(0);
  case IceType_i16:
    return getConstantInt16Internal(0);
  case IceType_i32:
    return getConstantInt32Internal(0);
  case IceType_i64:
    return getConstantInt64Internal(0);
  case IceType_f32:
    return getConstantFloat(0);
  case IceType_f64:
    return getConstantDouble(0);
  default:
    return nullptr;
  }
}

ConstantList GlobalContext::getConstantPool(Type Ty) {
  switch (Ty) {
  case IceType_i1:
  case IceType_i8:
    return getConstPool()->Integers8.getConstantPool();
  case IceType_i16:
    return getConstPool()->Integers16.getConstantPool();
  case IceType_i32:
    return getConstPool()->Integers32.getConstantPool();
  case IceType_i64:
    return getConstPool()->Integers64.getConstantPool();
  case IceType_f32:
    return getConstPool()->Floats.getConstantPool();
  case IceType_f64:
    return getConstPool()->Doubles.getConstantPool();
  case IceType_v4i1:
  case IceType_v8i1:
  case IceType_v16i1:
  case IceType_v16i8:
  case IceType_v8i16:
  case IceType_v4i32:
  case IceType_v4f32:
    llvm::report_fatal_error("Unsupported constant type: " + typeStdString(Ty));
    break;
  case IceType_void:
  case IceType_NUM:
    break;
  }
  llvm_unreachable("Unknown type");
}

ConstantList GlobalContext::getConstantExternSyms() {
  return getConstPool()->ExternRelocatables.getConstantPool();
}

GlobalString GlobalContext::getGlobalString(const std::string &Name) {
  return GlobalString::createWithString(this, Name);
}

JumpTableDataList GlobalContext::getJumpTables() {
  JumpTableDataList JumpTables(*getJumpTableList());
  // Make the order deterministic by sorting by function name, and then by the
  // jump table's ID within that function.
  std::sort(JumpTables.begin(), JumpTables.end(),
            [](const JumpTableData &A, const JumpTableData &B) {
              if (A.getFunctionName() != B.getFunctionName())
                return A.getFunctionName() < B.getFunctionName();
              return A.getId() < B.getId();
            });

  if (getFlags().getReorderPooledConstants()) {
    // If the reorder-pooled-constants option is enabled, also shuffle the
    // jump tables before emitting them, treating them as pooled constants.

    // Create a random number generator for jump table reordering.
    RandomNumberGenerator RNG(getFlags().getRandomSeed(),
                              RPE_PooledConstantReordering);
    RandomShuffle(JumpTables.begin(), JumpTables.end(),
                  [&RNG](uint64_t N) { return (uint32_t)RNG.next(N); });
  }
  return JumpTables;
}

void GlobalContext::addJumpTableData(JumpTableData JumpTable) {
  getJumpTableList()->emplace_back(std::move(JumpTable));
}

TimerStackIdT GlobalContext::newTimerStackID(const std::string &Name) {
  if (!BuildDefs::timers())
    return 0;
  auto Timers = getTimers();
  TimerStackIdT NewID = Timers->size();
  Timers->push_back(TimerStack(Name));
  return NewID;
}

TimerIdT GlobalContext::getTimerID(TimerStackIdT StackID,
                                   const std::string &Name) {
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(StackID < Timers->size());
  return Timers->at(StackID).getTimerID(Name);
}

void GlobalContext::pushTimer(TimerIdT ID, TimerStackIdT StackID) {
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(StackID < Timers->size());
  Timers->at(StackID).push(ID);
}

void GlobalContext::popTimer(TimerIdT ID, TimerStackIdT StackID) {
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(StackID < Timers->size());
  Timers->at(StackID).pop(ID);
}

void GlobalContext::resetTimer(TimerStackIdT StackID) {
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(StackID < Timers->size());
  Timers->at(StackID).reset();
}

std::string GlobalContext::getTimerName(TimerStackIdT StackID) {
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(StackID < Timers->size());
  return Timers->at(StackID).getName();
}

void GlobalContext::setTimerName(TimerStackIdT StackID,
                                 const std::string &NewName) {
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(StackID < Timers->size());
  Timers->at(StackID).setName(NewName);
}

// Note: optQueueBlockingPush and optQueueBlockingPop use unique_ptr at the
// interface to take and transfer ownership, but they internally store the raw
// Cfg pointer in the work queue. This allows e.g. future queue optimizations
// such as the use of atomics to modify queue elements.
void GlobalContext::optQueueBlockingPush(std::unique_ptr<OptWorkItem> Item) {
  assert(Item);
  {
    TimerMarker _(TimerStack::TT_qTransPush, this);
    OptQ.blockingPush(std::move(Item));
  }
  if (getFlags().isSequential())
    translateFunctions();
}

std::unique_ptr<OptWorkItem> GlobalContext::optQueueBlockingPop() {
  TimerMarker _(TimerStack::TT_qTransPop, this);
  return OptQ.blockingPop(OptQWakeupSize);
}

void GlobalContext::emitQueueBlockingPush(
    std::unique_ptr<EmitterWorkItem> Item) {
  assert(Item);
  {
    TimerMarker _(TimerStack::TT_qEmitPush, this);
    EmitQ.blockingPush(std::move(Item));
  }
  if (getFlags().isSequential())
    emitItems();
}

std::unique_ptr<EmitterWorkItem> GlobalContext::emitQueueBlockingPop() {
  TimerMarker _(TimerStack::TT_qEmitPop, this);
  return EmitQ.blockingPop();
}

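// Creates and registers a ThreadContext for the parser thread, and installs
// it into TLS.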
void GlobalContext::initParserThread() {
  ThreadContext *Tls = new ThreadContext();
  auto Timers = getTimers();
  Timers->initInto(Tls->Timers);
  AllThreadContexts.push_back(Tls);
  ICE_TLS_SET_FIELD(TLS, Tls);
}

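// Spawns the requested number of translation threads, plus a single emitter
// thread whenever translation is threaded. Using one emitter serializes
// output; emitItems() restores the original function order from sequence
// numbers.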
void GlobalContext::startWorkerThreads() {
  size_t NumWorkers = getFlags().getNumTranslationThreads();
  auto Timers = getTimers();
  for (size_t i = 0; i < NumWorkers; ++i) {
    ThreadContext *WorkerTLS = new ThreadContext();
    Timers->initInto(WorkerTLS->Timers);
    AllThreadContexts.push_back(WorkerTLS);
    TranslationThreads.push_back(std::thread(
        &GlobalContext::translateFunctionsWrapper, this, WorkerTLS));
  }
  if (NumWorkers) {
    ThreadContext *WorkerTLS = new ThreadContext();
    Timers->initInto(WorkerTLS->Timers);
    AllThreadContexts.push_back(WorkerTLS);
    EmitterThreads.push_back(
        std::thread(&GlobalContext::emitterWrapper, this, WorkerTLS));
  }
}

void GlobalContext::resetStats() {
  if (BuildDefs::dump())
    ICE_TLS_GET_FIELD(TLS)->StatsFunction.reset();
}

void GlobalContext::dumpStats(const Cfg *Func) {
  if (!getFlags().getDumpStats())
    return;
  if (Func == nullptr) {
    getStatsCumulative()->dump(Func, this);
  } else {
    ICE_TLS_GET_FIELD(TLS)->StatsFunction.dump(Func, this);
  }
}

void GlobalContext::statsUpdateEmitted(uint32_t InstCount) {
  if (!getFlags().getDumpStats())
    return;
  ThreadContext *Tls = ICE_TLS_GET_FIELD(TLS);
  Tls->StatsFunction.update(CodeStats::CS_InstCount, InstCount);
  Tls->StatsCumulative.update(CodeStats::CS_InstCount, InstCount);
}

void GlobalContext::statsUpdateRegistersSaved(uint32_t Num) {
  if (!getFlags().getDumpStats())
    return;
  ThreadContext *Tls = ICE_TLS_GET_FIELD(TLS);
  Tls->StatsFunction.update(CodeStats::CS_RegsSaved, Num);
  Tls->StatsCumulative.update(CodeStats::CS_RegsSaved, Num);
}

void GlobalContext::statsUpdateFrameBytes(uint32_t Bytes) {
  if (!getFlags().getDumpStats())
    return;
  ThreadContext *Tls = ICE_TLS_GET_FIELD(TLS);
  Tls->StatsFunction.update(CodeStats::CS_FrameByte, Bytes);
  Tls->StatsCumulative.update(CodeStats::CS_FrameByte, Bytes);
}

void GlobalContext::statsUpdateSpills() {
  if (!getFlags().getDumpStats())
    return;
  ThreadContext *Tls = ICE_TLS_GET_FIELD(TLS);
  Tls->StatsFunction.update(CodeStats::CS_NumSpills);
  Tls->StatsCumulative.update(CodeStats::CS_NumSpills);
}

void GlobalContext::statsUpdateFills() {
  if (!getFlags().getDumpStats())
    return;
  ThreadContext *Tls = ICE_TLS_GET_FIELD(TLS);
  Tls->StatsFunction.update(CodeStats::CS_NumFills);
  Tls->StatsCumulative.update(CodeStats::CS_NumFills);
}

void GlobalContext::statsUpdateRPImms() {
  if (!getFlags().getDumpStats())
    return;
  ThreadContext *Tls = ICE_TLS_GET_FIELD(TLS);
  Tls->StatsFunction.update(CodeStats::CS_NumRPImms);
  Tls->StatsCumulative.update(CodeStats::CS_NumRPImms);
}

void GlobalContext::dumpTimers(TimerStackIdT StackID, bool DumpCumulative) {
  if (!BuildDefs::timers())
    return;
  auto Timers = getTimers();
  assert(Timers->size() > StackID);
  OstreamLocker L(this);
  Timers->at(StackID).dump(getStrDump(), DumpCumulative);
}

void GlobalContext::dumpLocalTimers(const std::string &TimerNameOverride,
                                    TimerStackIdT StackID,
                                    bool DumpCumulative) {
  if (!BuildDefs::timers())
    return;
  auto *Timers = &ICE_TLS_GET_FIELD(TLS)->Timers;
  assert(Timers->size() > StackID);
  // Temporarily override the thread-local timer name with the given name.
  // Don't do it permanently because the final timer merge at the end expects
  // the thread-local timer names to be the same as the global timer name.
  auto OrigName = getTimerName(StackID);
  setTimerName(StackID, TimerNameOverride);
  {
    OstreamLocker _(this);
    Timers->at(StackID).dump(getStrDump(), DumpCumulative);
  }
  setTimerName(StackID, OrigName);
}

LockedPtr<StringPool>
GlobalStringPoolTraits::getStrings(const GlobalContext *PoolOwner) {
  return PoolOwner->getStrings();
}

TimerIdT TimerMarker::getTimerIdFromFuncName(GlobalContext *Ctx,
                                             const std::string &FuncName) {
  if (!BuildDefs::timers())
    return 0;
  if (!getFlags().getTimeEachFunction())
    return 0;
  return Ctx->getTimerID(GlobalContext::TSK_Funcs, FuncName);
}

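// Activates the timer only when timing is enabled for the relevant stack:
// global timing flags for TSK_Default, per-function timing for TSK_Funcs.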
void TimerMarker::push() {
  switch (StackID) {
  case GlobalContext::TSK_Default:
    Active = getFlags().getSubzeroTimingEnabled() ||
             !getFlags().getTimingFocusOnString().empty();
    break;
  case GlobalContext::TSK_Funcs:
    Active = getFlags().getTimeEachFunction();
    break;
  default:
    break;
  }
  if (Active)
    Ctx->pushTimer(ID, StackID);
}

void TimerMarker::pushCfg(const Cfg *Func) {
  Ctx = Func->getContext();
  Active = Func->getFocusedTiming() || getFlags().getSubzeroTimingEnabled();
  if (Active)
    Ctx->pushTimer(ID, StackID);
}

ICE_TLS_DEFINE_FIELD(GlobalContext::ThreadContext *, GlobalContext, TLS);

} // end of namespace Ice