//===-- BypassSlowDivision.cpp - Bypass slow division ---------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains an optimization for div and rem on architectures that
// execute short instructions significantly faster than longer instructions.
// For example, on Intel Atom 32-bit divides are slow enough that during
// runtime it is profitable to check the value of the operands, and if they are
// positive and less than 256 use an unsigned 8-bit divide.
//
//===----------------------------------------------------------------------===//

#define DEBUG_TYPE "bypass-slow-division"
#include "llvm/Transforms/Utils/BypassSlowDivision.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

namespace {
  struct DivOpInfo {
    bool SignedOp;
    Value *Dividend;
    Value *Divisor;

    DivOpInfo(bool InSignedOp, Value *InDividend, Value *InDivisor)
      : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
  };

  struct DivPhiNodes {
    PHINode *Quotient;
    PHINode *Remainder;

    DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
      : Quotient(InQuotient), Remainder(InRemainder) {}
  };
}

namespace llvm {
  template<>
  struct DenseMapInfo<DivOpInfo> {
    static bool isEqual(const DivOpInfo &Val1, const DivOpInfo &Val2) {
      return Val1.SignedOp == Val2.SignedOp &&
             Val1.Dividend == Val2.Dividend &&
             Val1.Divisor == Val2.Divisor;
    }

    static DivOpInfo getEmptyKey() {
      return DivOpInfo(false, 0, 0);
    }

    static DivOpInfo getTombstoneKey() {
      return DivOpInfo(true, 0, 0);
    }

    static unsigned getHashValue(const DivOpInfo &Val) {
      return (unsigned)(reinterpret_cast<uintptr_t>(Val.Dividend) ^
                        reinterpret_cast<uintptr_t>(Val.Divisor)) ^
                        (unsigned)Val.SignedOp;
    }
  };

  typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
}

// insertFastDiv - Substitutes the div/rem instruction with code that checks the
// value of the operands and uses a shorter-faster div/rem instruction when
// possible and the longer-slower div/rem instruction otherwise.
static bool insertFastDiv(Function &F,
                          Function::iterator &I,
                          BasicBlock::iterator &J,
                          IntegerType *BypassType,
                          bool UseDivOp,
                          bool UseSignedOp,
                          DivCacheTy &PerBBDivCache) {
  // Get instruction operands
  Instruction *Instr = J;
  Value *Dividend = Instr->getOperand(0);
  Value *Divisor = Instr->getOperand(1);

  if (isa<ConstantInt>(Divisor) ||
      (isa<ConstantInt>(Dividend) && isa<ConstantInt>(Divisor))) {
    // Operations with immediate values should have
    // been solved and replaced during compile time.
    return false;
  }

  // Basic Block is split before divide
  BasicBlock *MainBB = I;
  BasicBlock *SuccessorBB = I->splitBasicBlock(J);
  ++I; //advance iterator I to successorBB

  // Add new basic block for slow divide operation
  BasicBlock *SlowBB = BasicBlock::Create(F.getContext(), "",
                                          MainBB->getParent(), SuccessorBB);
  SlowBB->moveBefore(SuccessorBB);
  IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
  Value *SlowQuotientV;
  Value *SlowRemainderV;
  if (UseSignedOp) {
    SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
    SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
  } else {
    SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
    SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
  }
  SlowBuilder.CreateBr(SuccessorBB);

  // Add new basic block for fast divide operation
  BasicBlock *FastBB = BasicBlock::Create(F.getContext(), "",
                                          MainBB->getParent(), SuccessorBB);
  FastBB->moveBefore(SlowBB);
  IRBuilder<> FastBuilder(FastBB, FastBB->begin());
  Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
                                                BypassType);
  Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
                                                 BypassType);

  // udiv/urem because optimization only handles positive numbers
  Value *ShortQuotientV = FastBuilder.CreateExactUDiv(ShortDividendV,
                                                      ShortDivisorV);
  Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
                                                  ShortDivisorV);
  Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
                                                ShortQuotientV,
                                                Dividend->getType());
  Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
                                                 ShortRemainderV,
                                                 Dividend->getType());
  FastBuilder.CreateBr(SuccessorBB);

  // Phi nodes for result of div and rem
  IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
  PHINode *QuoPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
  QuoPhi->addIncoming(SlowQuotientV, SlowBB);
  QuoPhi->addIncoming(FastQuotientV, FastBB);
  PHINode *RemPhi = SuccessorBuilder.CreatePHI(Instr->getType(), 2);
  RemPhi->addIncoming(SlowRemainderV, SlowBB);
  RemPhi->addIncoming(FastRemainderV, FastBB);

  // Replace Instr with appropriate phi node
  if (UseDivOp)
    Instr->replaceAllUsesWith(QuoPhi);
  else
    Instr->replaceAllUsesWith(RemPhi);
  Instr->eraseFromParent();

  // Combine operands into a single value with OR for value testing below
  MainBB->getInstList().back().eraseFromParent();
  IRBuilder<> MainBuilder(MainBB, MainBB->end());
  Value *OrV = MainBuilder.CreateOr(Dividend, Divisor);

  // BitMask is inverted to check if the operands are
  // larger than the bypass type
  uint64_t BitMask = ~BypassType->getBitMask();
  Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);

  // Compare operand values and branch
  Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
  Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
  MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);

  // point iterator J at first instruction of successorBB
  J = I->begin();

  // Cache phi nodes to be used later in place of other instances
  // of div or rem with the same sign, dividend, and divisor
  DivOpInfo Key(UseSignedOp, Dividend, Divisor);
  DivPhiNodes Value(QuoPhi, RemPhi);
  PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
  return true;
}

// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder if
// operands and operation are identical. Otherwise call insertFastDiv to perform
// the optimization and cache the resulting dividend and remainder.
static bool reuseOrInsertFastDiv(Function &F,
                                 Function::iterator &I,
                                 BasicBlock::iterator &J,
                                 IntegerType *BypassType,
                                 bool UseDivOp,
                                 bool UseSignedOp,
                                 DivCacheTy &PerBBDivCache) {
  // Get instruction operands
  Instruction *Instr = J;
  DivOpInfo Key(UseSignedOp, Instr->getOperand(0), Instr->getOperand(1));
  DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);

  if (CacheI == PerBBDivCache.end()) {
    // If previous instance does not exist, insert fast div
    return insertFastDiv(F, I, J, BypassType, UseDivOp, UseSignedOp,
                         PerBBDivCache);
  }

  // Replace operation value with previously generated phi node
  DivPhiNodes &Value = CacheI->second;
  if (UseDivOp) {
    // Replace all uses of div instruction with quotient phi node
    J->replaceAllUsesWith(Value.Quotient);
  } else {
    // Replace all uses of rem instruction with remainder phi node
    J->replaceAllUsesWith(Value.Remainder);
  }

  // Advance to next operation
  ++J;

  // Remove redundant operation
  Instr->eraseFromParent();
  return true;
}

// bypassSlowDivision - This optimization identifies DIV instructions that can
// be profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(Function &F,
                              Function::iterator &I,
                              const DenseMap<unsigned int, unsigned int> &BypassWidths) {
  DivCacheTy DivCache;

  bool MadeChange = false;
  for (BasicBlock::iterator J = I->begin(); J != I->end(); J++) {

    // Get instruction details
    unsigned Opcode = J->getOpcode();
    bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
    bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
    bool UseSignedOp = Opcode == Instruction::SDiv ||
                       Opcode == Instruction::SRem;

    // Only optimize div or rem ops
    if (!UseDivOp && !UseRemOp)
      continue;

    // Skip division on vector types, only optimize integer instructions
    if (!J->getType()->isIntegerTy())
      continue;

    // Get bitwidth of div/rem instruction
    IntegerType *T = cast<IntegerType>(J->getType());
    unsigned int bitwidth = T->getBitWidth();

    // Continue if bitwidth is not bypassed
    DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
    if (BI == BypassWidths.end())
      continue;

    // Get type for div/rem instruction with bypass bitwidth
    IntegerType *BT = IntegerType::get(J->getContext(), BI->second);

    MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp,
                                       UseSignedOp, DivCache);
  }

  return MadeChange;
}