//===-- XCoreLowerThreadLocal - Lower thread local variables --------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief This file contains a pass that lowers thread local variables on the
///        XCore.
///
//===----------------------------------------------------------------------===//

#include "XCore.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/NoFolder.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

#define DEBUG_TYPE "xcore-lower-thread-local"

using namespace llvm;

static cl::opt<unsigned> MaxThreads(
  "xcore-max-threads", cl::Optional,
  cl::desc("Maximum number of threads (for emulation thread-local storage)"),
  cl::Hidden, cl::value_desc("number"), cl::init(8));

namespace {
  /// Lowers thread local variables on the XCore. Each thread local variable is
  /// expanded to an array of n elements indexed by the thread ID where n is the
  /// fixed number hardware threads supported by the device.
  struct XCoreLowerThreadLocal : public ModulePass {
    static char ID;

    XCoreLowerThreadLocal() : ModulePass(ID) {
      initializeXCoreLowerThreadLocalPass(*PassRegistry::getPassRegistry());
    }

    bool lowerGlobal(GlobalVariable *GV);

    bool runOnModule(Module &M) override;
  };
}

char XCoreLowerThreadLocal::ID = 0;

INITIALIZE_PASS(XCoreLowerThreadLocal, "xcore-lower-thread-local",
                "Lower thread local variables", false, false)

ModulePass *llvm::createXCoreLowerThreadLocalPass() {
  return new XCoreLowerThreadLocal();
}

static ArrayType *createLoweredType(Type *OriginalType) {
  return ArrayType::get(OriginalType, MaxThreads);
}

static Constant *
createLoweredInitializer(ArrayType *NewType, Constant *OriginalInitializer) {
  SmallVector<Constant *, 8> Elements(MaxThreads);
  for (unsigned i = 0; i != MaxThreads; ++i) {
    Elements[i] = OriginalInitializer;
  }
  return ConstantArray::get(NewType, Elements);
}

static Instruction *
createReplacementInstr(ConstantExpr *CE, Instruction *Instr) {
  IRBuilder<true,NoFolder> Builder(Instr);
  unsigned OpCode = CE->getOpcode();
  switch (OpCode) {
    case Instruction::GetElementPtr: {
      SmallVector<Value *,4> CEOpVec(CE->op_begin(), CE->op_end());
      ArrayRef<Value *> CEOps(CEOpVec);
      return dyn_cast<Instruction>(Builder.CreateInBoundsGEP(
          cast<GEPOperator>(CE)->getSourceElementType(), CEOps[0],
          CEOps.slice(1)));
    }
    case Instruction::Add:
    case Instruction::Sub:
    case Instruction::Mul:
    case Instruction::UDiv:
    case Instruction::SDiv:
    case Instruction::FDiv:
    case Instruction::URem:
    case Instruction::SRem:
    case Instruction::FRem:
    case Instruction::Shl:
    case Instruction::LShr:
    case Instruction::AShr:
    case Instruction::And:
    case Instruction::Or:
    case Instruction::Xor:
      return dyn_cast<Instruction>(
                  Builder.CreateBinOp((Instruction::BinaryOps)OpCode,
                                      CE->getOperand(0), CE->getOperand(1),
                                      CE->getName()));
    case Instruction::Trunc:
    case Instruction::ZExt:
    case Instruction::SExt:
    case Instruction::FPToUI:
    case Instruction::FPToSI:
    case Instruction::UIToFP:
    case Instruction::SIToFP:
    case Instruction::FPTrunc:
    case Instruction::FPExt:
    case Instruction::PtrToInt:
    case Instruction::IntToPtr:
    case Instruction::BitCast:
      return dyn_cast<Instruction>(
                  Builder.CreateCast((Instruction::CastOps)OpCode,
                                     CE->getOperand(0), CE->getType(),
                                     CE->getName()));
    default:
      llvm_unreachable("Unhandled constant expression!\n");
  }
}

static bool replaceConstantExprOp(ConstantExpr *CE, Pass *P) {
  do {
    SmallVector<WeakVH,8> WUsers(CE->user_begin(), CE->user_end());
    std::sort(WUsers.begin(), WUsers.end());
    WUsers.erase(std::unique(WUsers.begin(), WUsers.end()), WUsers.end());
    while (!WUsers.empty())
      if (WeakVH WU = WUsers.pop_back_val()) {
        if (PHINode *PN = dyn_cast<PHINode>(WU)) {
          for (int I = 0, E = PN->getNumIncomingValues(); I < E; ++I)
            if (PN->getIncomingValue(I) == CE) {
              BasicBlock *PredBB = PN->getIncomingBlock(I);
              if (PredBB->getTerminator()->getNumSuccessors() > 1)
                PredBB = SplitEdge(PredBB, PN->getParent());
              Instruction *InsertPos = PredBB->getTerminator();
              Instruction *NewInst = createReplacementInstr(CE, InsertPos);
              PN->setOperand(I, NewInst);
            }
        } else if (Instruction *Instr = dyn_cast<Instruction>(WU)) {
          Instruction *NewInst = createReplacementInstr(CE, Instr);
          Instr->replaceUsesOfWith(CE, NewInst);
        } else {
          ConstantExpr *CExpr = dyn_cast<ConstantExpr>(WU);
          if (!CExpr || !replaceConstantExprOp(CExpr, P))
            return false;
        }
      }
  } while (CE->hasNUsesOrMore(1)); // We need to check because a recursive
  // sibling may have used 'CE' when createReplacementInstr was called.
  CE->destroyConstant();
  return true;
}

static bool rewriteNonInstructionUses(GlobalVariable *GV, Pass *P) {
  SmallVector<WeakVH,8> WUsers;
  for (User *U : GV->users())
    if (!isa<Instruction>(U))
      WUsers.push_back(WeakVH(U));
  while (!WUsers.empty())
    if (WeakVH WU = WUsers.pop_back_val()) {
      ConstantExpr *CE = dyn_cast<ConstantExpr>(WU);
      if (!CE || !replaceConstantExprOp(CE, P))
        return false;
    }
  return true;
}

static bool isZeroLengthArray(Type *Ty) {
  ArrayType *AT = dyn_cast<ArrayType>(Ty);
  return AT && (AT->getNumElements() == 0);
}

bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
  Module *M = GV->getParent();
  LLVMContext &Ctx = M->getContext();
  if (!GV->isThreadLocal())
    return false;

  // Skip globals that we can't lower and leave it for the backend to error.
  if (!rewriteNonInstructionUses(GV, this) ||
      !GV->getType()->isSized() || isZeroLengthArray(GV->getType()))
    return false;

  // Create replacement global.
  ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
  Constant *NewInitializer = nullptr;
  if (GV->hasInitializer())
    NewInitializer = createLoweredInitializer(NewType,
                                              GV->getInitializer());
  GlobalVariable *NewGV =
    new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
                       NewInitializer, "", nullptr,
                       GlobalVariable::NotThreadLocal,
                       GV->getType()->getAddressSpace(),
                       GV->isExternallyInitialized());

  // Update uses.
  SmallVector<User *, 16> Users(GV->user_begin(), GV->user_end());
  for (unsigned I = 0, E = Users.size(); I != E; ++I) {
    User *U = Users[I];
    Instruction *Inst = cast<Instruction>(U);
    IRBuilder<> Builder(Inst);
    Function *GetID = Intrinsic::getDeclaration(GV->getParent(),
                                                Intrinsic::xcore_getid);
    Value *ThreadID = Builder.CreateCall(GetID, {});
    SmallVector<Value *, 2> Indices;
    Indices.push_back(Constant::getNullValue(Type::getInt64Ty(Ctx)));
    Indices.push_back(ThreadID);
    Value *Addr =
        Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV, Indices);
    U->replaceUsesOfWith(GV, Addr);
  }

  // Remove old global.
  NewGV->takeName(GV);
  GV->eraseFromParent();
  return true;
}

bool XCoreLowerThreadLocal::runOnModule(Module &M) {
  // Find thread local globals.
  bool MadeChange = false;
  SmallVector<GlobalVariable *, 16> ThreadLocalGlobals;
  for (GlobalVariable &GV : M.globals())
    if (GV.isThreadLocal())
      ThreadLocalGlobals.push_back(&GV);
  for (unsigned I = 0, E = ThreadLocalGlobals.size(); I != E; ++I) {
    MadeChange |= lowerGlobal(ThreadLocalGlobals[I]);
  }
  return MadeChange;
}