//===- subzero/src/IceTargetLoweringARM32.h - ARM32 lowering ----*- C++ -*-===//
//
//                        The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// \brief Declares the TargetLoweringARM32 class, which implements the
/// TargetLowering interface for the ARM 32-bit architecture.
///
//===----------------------------------------------------------------------===//

#ifndef SUBZERO_SRC_ICETARGETLOWERINGARM32_H
#define SUBZERO_SRC_ICETARGETLOWERINGARM32_H

#include "IceAssemblerARM32.h"
#include "IceDefs.h"
#include "IceInstARM32.h"
#include "IceRegistersARM32.h"
#include "IceTargetLowering.h"

#include <utility>

namespace Ice {
namespace ARM32 {

/// Describes which ARM instruction-set features the target CPU provides.
///
/// The enumerators of ARM32InstructionSet are ordered, and hasFeature() treats
/// the stored InstructionSet as a threshold: every enumerator that compares
/// less than or equal to it is reported as available.
class TargetARM32Features {
  TargetARM32Features() = delete;
  TargetARM32Features(const TargetARM32Features &) = delete;
  TargetARM32Features &operator=(const TargetARM32Features &) = delete;

public:
  enum ARM32InstructionSet {
    Begin,
    // Neon is the PNaCl baseline instruction set.
    Neon = Begin,
    HWDivArm, // HW divide in ARM mode (not just Thumb mode).
    End
  };

  /// Builds the feature description from the command-line flags.
  explicit TargetARM32Features(const ClFlags &Flags);

  /// Returns true when feature I is at or below the stored instruction set.
  bool hasFeature(ARM32InstructionSet I) const { return InstructionSet >= I; }

private:
  /// Starts at the baseline (Begin).
  ARM32InstructionSet InstructionSet = Begin;
};

// The target lowering logic for ARM32.
class TargetARM32 : public TargetLowering {
  TargetARM32() = delete;
  TargetARM32(const TargetARM32 &) = delete;
  TargetARM32 &operator=(const TargetARM32 &) = delete;

public:
  static void staticInit(GlobalContext *Ctx);

  static bool shouldBePooled(const Constant *C) {
    if (auto *ConstDouble = llvm::dyn_cast<ConstantDouble>(C)) {
      return !Utils::isPositiveZero(ConstDouble->getValue());
    }
    if (llvm::isa<ConstantFloat>(C))
      return true;
    return false;
  }

  /// Returns the type used for pointers on this target: IceType_i32.
  static ::Ice::Type getPointerType() { return ::Ice::IceType_i32; }

  /// Factory: builds a TargetARM32 for Func and returns it to the caller as a
  /// std::unique_ptr to the TargetLowering base.
  static std::unique_ptr<::Ice::TargetLowering> create(Cfg *Func) {
    return makeUnique<TargetARM32>(Func);
  }

  /// Creates the ARM32 assembler, telling it whether SandboxingType is
  /// ST_Nonsfi.
  std::unique_ptr<::Ice::Assembler> createAssembler() const override {
    return makeUnique<ARM32::AssemblerARM32>(SandboxingType == ST_Nonsfi);
  }

  /// Prepares the Computations bookkeeping for lowering Node: previously
  /// recorded producers are forgotten, the producers of Node are recorded, and
  /// the result is dumped for Func.
  void initNodeForLowering(CfgNode *Node) override {
    // Drop state from the previously lowered node before recording anew.
    Computations.forgetProducers();
    Computations.recordProducers(Node);
    Computations.dump(Func);
  }

  void translateOm1() override;
  void translateO2() override;
  bool doBranchOpt(Inst *I, const CfgNode *NextNode) override;

  /// Returns the number of registers known to this target (RegARM32::Reg_NUM).
  SizeT getNumRegisters() const override { return RegARM32::Reg_NUM; }
  Variable *getPhysicalRegister(RegNumT RegNum,
                                Type Ty = IceType_void) override;
  const char *getRegName(RegNumT RegNum, Type Ty) const override;
  SmallBitVector getRegisterSet(RegSetMask Include,
                                RegSetMask Exclude) const override;
  /// Returns the TypeToRegisterSet entry for Var's register class. The class
  /// must be below RC_Target, with RCARM32_QtoS as the single accepted
  /// exception.
  const SmallBitVector &
  getRegistersForVariable(const Variable *Var) const override {
    const RegClass RC = Var->getRegClass();
    assert(static_cast<int>(RC) ==
               static_cast<int>(RegARM32::RCARM32_QtoS) ||
           RC < RC_Target);
    return TypeToRegisterSet[RC];
  }
  /// Returns the TypeToRegisterSetUnfiltered entry for Var's register class.
  /// The class, viewed as a RegClassARM32, must be below RCARM32_NUM.
  const SmallBitVector &
  getAllRegistersForVariable(const Variable *Var) const override {
    const RegClass RC = Var->getRegClass();
    assert(static_cast<RegARM32::RegClassARM32>(RC) < RegARM32::RCARM32_NUM);
    return TypeToRegisterSetUnfiltered[RC];
  }
  /// Returns the RegisterAliases entry for Reg.
  const SmallBitVector &getAliasesForRegister(RegNumT Reg) const override {
    return RegisterAliases[Reg];
  }
  /// Returns the current value of UsesFramePointer.
  bool hasFramePointer() const override { return UsesFramePointer; }
  /// Sets UsesFramePointer to true.
  void setHasFramePointer() override { UsesFramePointer = true; }
  /// Returns the stack register, RegARM32::Reg_sp.
  RegNumT getStackReg() const override { return RegARM32::Reg_sp; }
  /// Returns the frame register, RegARM32::Reg_fp.
  RegNumT getFrameReg() const override { return RegARM32::Reg_fp; }
  /// Returns the frame register when UsesFramePointer is set, and the stack
  /// register otherwise.
  RegNumT getFrameOrStackReg() const override {
    if (UsesFramePointer)
      return getFrameReg();
    return getStackReg();
  }
  /// Returns the register reserved as a temporary, RegARM32::Reg_ip.
  RegNumT getReservedTmpReg() const { return RegARM32::Reg_ip; }

  /// Returns the width of Ty rounded up to the next multiple of 4 bytes, so
  /// that i1, i8, and i16 each take 4 bytes.
  size_t typeWidthInBytesOnStack(Type Ty) const override {
    const auto Unrounded = typeWidthInBytes(Ty);
    // Add 3, then clear the two low bits.
    return (Unrounded + 3) & ~3;
  }
  uint32_t getStackAlignment() const override;
  /// Records the size and alignment of the fixed alloca area and sets
  /// PrologEmitsFixedAllocas. Align must be a power of two.
  void reserveFixedAllocaArea(size_t Size, size_t Align) override {
    assert(llvm::isPowerOf2_32(Align));
    PrologEmitsFixedAllocas = true;
    FixedAllocaAlignBytes = Align;
    FixedAllocaSizeBytes = Size;
  }
  /// Returns FixedAllocaSizeBytes minus the part of the spill area that is not
  /// taken by outgoing arguments (SpillAreaSizeBytes - MaxOutArgsSizeBytes).
  int32_t getFrameFixedAllocaOffset() const override {
    const auto Adjustment = SpillAreaSizeBytes - MaxOutArgsSizeBytes;
    return FixedAllocaSizeBytes - Adjustment;
  }
  /// Returns the current value of MaxOutArgsSizeBytes.
  uint32_t maxOutArgsSizeBytes() const override { return MaxOutArgsSizeBytes; }

  /// Returns true only for IceType_i64.
  bool shouldSplitToVariable64On32(Type Ty) const override {
    return Ty == IceType_i64;
  }

  /// Returns the minimum jump table size for this target; currently the
  /// constant 3.
  // TODO(ascull): what size is best for ARM?
  SizeT getMinJumpTableSize() const override { return 3; }
  void emitJumpTable(const Cfg *Func,
                     const InstJumpTable *JumpTable) const override;

  void emitVariable(const Variable *Var) const override;

  void emit(const ConstantUndef *C) const final;
  void emit(const ConstantInteger32 *C) const final;
  void emit(const ConstantInteger64 *C) const final;
  void emit(const ConstantFloat *C) const final;
  void emit(const ConstantDouble *C) const final;
  void emit(const ConstantRelocatable *C) const final;

  void lowerArguments() override;
  void addProlog(CfgNode *Node) override;
  void addEpilog(CfgNode *Node) override;

  Operand *loOperand(Operand *Operand);
  Operand *hiOperand(Operand *Operand);
  void finishArgumentLowering(Variable *Arg, Variable *FramePtr,
                              size_t BasicFrameOffset, size_t *InArgsSizeBytes);

  /// Forwards to CPUFeatures.hasFeature(I).
  bool hasCPUFeature(TargetARM32Features::ARM32InstructionSet I) const {
    return CPUFeatures.hasFeature(I);
  }

  /// Bit flags naming the operand forms that legalize() is allowed to return.
  /// They are passed to legalize() as a LegalMask.
  enum OperandLegalization {
    Legal_Reg = 1 << 0,  /// physical register, not stack location
    Legal_Flex = 1 << 1, /// A flexible operand2, which can hold rotated small
                         /// immediates, shifted registers, or modified fp imm.
    Legal_Mem = 1 << 2,  /// includes [r0, r1 lsl #2] as well as [sp, #12]
    // NOTE(review): presumably permits rematerializable variables to pass
    // through unchanged -- confirm against legalize().
    Legal_Rematerializable = 1 << 3,
    // Every bit set except Legal_Rematerializable.
    Legal_Default = ~Legal_Rematerializable,
  };

  using LegalMask = uint32_t;
  Operand *legalizeUndef(Operand *From, RegNumT RegNum = RegNumT());
  Operand *legalize(Operand *From, LegalMask Allowed = Legal_Default,
                    RegNumT RegNum = RegNumT());
  Variable *legalizeToReg(Operand *From, RegNumT RegNum = RegNumT());

  /// Creates an OperandARM32ShAmtImm for the shift amount ShAmtImm, which must
  /// be below 32. The amount is masked with 0x1F before being turned into a
  /// 32-bit integer constant.
  OperandARM32ShAmtImm *shAmtImm(uint32_t ShAmtImm) const {
    assert(ShAmtImm < 32);
    auto *Amount =
        llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(ShAmtImm & 0x1F));
    return OperandARM32ShAmtImm::create(Func, Amount);
  }

  /// Returns the Ctx pointer held by this object.
  GlobalContext *getCtx() const { return Ctx; }

protected:
  explicit TargetARM32(Cfg *Func);

  void postLower() override;

  /// Result type of lowerInt1() and lowerInt1Arithmetic().
  // NOTE(review): the precise meaning of SBC_Yes / SBC_No is not visible in
  // this header -- confirm against the implementation file.
  enum SafeBoolChain {
    SBC_No,
    SBC_Yes,
  };

  void lowerAlloca(const InstAlloca *Instr) override;
  SafeBoolChain lowerInt1Arithmetic(const InstArithmetic *Instr);
  void lowerInt64Arithmetic(InstArithmetic::OpKind Op, Variable *Dest,
                            Operand *Src0, Operand *Src1);
  void lowerArithmetic(const InstArithmetic *Instr) override;
  void lowerAssign(const InstAssign *Instr) override;
  void lowerBr(const InstBr *Instr) override;
  void lowerCall(const InstCall *Instr) override;
  void lowerCast(const InstCast *Instr) override;
  void lowerExtractElement(const InstExtractElement *Instr) override;

  /// CondWhenTrue is a helper type returned by every method in the lowering
  /// that emits code to set the condition codes.
  class CondWhenTrue {
  public:
    explicit CondWhenTrue(CondARM32::Cond T0,
                          CondARM32::Cond T1 = CondARM32::kNone)
        : WhenTrue0(T0), WhenTrue1(T1) {
      assert(T1 == CondARM32::kNone || T0 != CondARM32::kNone);
      assert(T1 != T0 || T0 == CondARM32::kNone);
    }
    CondARM32::Cond WhenTrue0;
    CondARM32::Cond WhenTrue1;

    /// invert returns a new object with WhenTrue0 and WhenTrue1 inverted.
    CondWhenTrue invert() const {
      switch (WhenTrue0) {
      default:
        if (WhenTrue1 == CondARM32::kNone)
          return CondWhenTrue(InstARM32::getOppositeCondition(WhenTrue0));
        return CondWhenTrue(InstARM32::getOppositeCondition(WhenTrue0),
                            InstARM32::getOppositeCondition(WhenTrue1));
      case CondARM32::AL:
        return CondWhenTrue(CondARM32::kNone);
      case CondARM32::kNone:
        return CondWhenTrue(CondARM32::AL);
      }
    }
  };

  CondWhenTrue lowerFcmpCond(const InstFcmp *Instr);
  void lowerFcmp(const InstFcmp *Instr) override;
  CondWhenTrue lowerInt8AndInt16IcmpCond(InstIcmp::ICond Condition,
                                         Operand *Src0, Operand *Src1);
  CondWhenTrue lowerInt32IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
                                  Operand *Src1);
  CondWhenTrue lowerInt64IcmpCond(InstIcmp::ICond Condition, Operand *Src0,
                                  Operand *Src1);
  CondWhenTrue lowerIcmpCond(InstIcmp::ICond Condition, Operand *Src0,
                             Operand *Src1);
  CondWhenTrue lowerIcmpCond(const InstIcmp *Instr);
  void lowerIcmp(const InstIcmp *Instr) override;
  /// Emits the basic sequence for load-linked/store-exclusive loops:
  ///
  /// retry:
  ///        ldrex tmp, [Addr]
  ///        StoreValue = Operation(tmp)
  ///        strexCond success, StoreValue, [Addr]
  ///        cmpCond success, #0
  ///        bne retry
  ///
  /// Operation needs to return which value to strex in Addr, it must not change
  /// the flags if Cond is not AL, and must not emit any instructions that could
  /// end up writing to memory. Operation also needs to handle fake-defing for
  /// i64 handling.
  void
  lowerLoadLinkedStoreExclusive(Type Ty, Operand *Addr,
                                std::function<Variable *(Variable *)> Operation,
                                CondARM32::Cond Cond = CondARM32::AL);
  void lowerInt64AtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                           Operand *Val);
  void lowerAtomicRMW(Variable *Dest, uint32_t Operation, Operand *Ptr,
                      Operand *Val);
  void lowerBreakpoint(const InstBreakpoint *Instr) override;
  void lowerIntrinsicCall(const InstIntrinsicCall *Instr) override;
  void lowerInsertElement(const InstInsertElement *Instr) override;
  void lowerLoad(const InstLoad *Instr) override;
  void lowerPhi(const InstPhi *Instr) override;
  void lowerRet(const InstRet *Instr) override;
  void lowerSelect(const InstSelect *Instr) override;
  void lowerShuffleVector(const InstShuffleVector *Instr) override;
  void lowerStore(const InstStore *Instr) override;
  void lowerSwitch(const InstSwitch *Instr) override;
  void lowerUnreachable(const InstUnreachable *Instr) override;
  void prelowerPhis() override;
  uint32_t getCallStackArgumentsSizeBytes(const InstCall *Instr) override;
  void genTargetHelperCallFor(Inst *Instr) override;
  void doAddressOptLoad() override;
  void doAddressOptStore() override;
  void randomlyInsertNop(float Probability,
                         RandomNumberGenerator &RNG) override;

  OperandARM32Mem *formMemoryOperand(Operand *Ptr, Type Ty);

  Variable64On32 *makeI64RegPair();
  Variable *makeReg(Type Ty, RegNumT RegNum = RegNumT());
  static Type stackSlotType();
  Variable *copyToReg(Operand *Src, RegNumT RegNum = RegNumT());
  void alignRegisterPow2(Variable *Reg, uint32_t Align,
                         RegNumT TmpRegNum = RegNumT());

  /// Returns a vector in a register with the given constant entries.
  Variable *makeVectorOfZeros(Type Ty, RegNumT RegNum = RegNumT());

  void
  makeRandomRegisterPermutation(llvm::SmallVectorImpl<RegNumT> &Permutation,
                                const SmallBitVector &ExcludeRegisters,
                                uint64_t Salt) const override;

  // If a divide-by-zero check is needed, inserts a: test; branch .LSKIP; trap;
  // .LSKIP: <continuation>. If no check is needed nothing is inserted.
  void div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi);
  using ExtInstr = void (TargetARM32::*)(Variable *, Variable *,
                                         CondARM32::Cond);
  using DivInstr = void (TargetARM32::*)(Variable *, Variable *, Variable *,
                                         CondARM32::Cond);
  void lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R, Operand *Src1,
                    ExtInstr ExtFunc, DivInstr DivFunc, bool IsRemainder);

  void lowerCLZ(Variable *Dest, Variable *ValLo, Variable *ValHi);

  // The following are helpers that insert lowered ARM32 instructions with
  // minimal syntactic overhead, so that the lowering code can look as close to
  // assembly as practical.
  /// Inserts an InstARM32Add(Dest, Src0, Src1, Pred) via Context.insert.
  void _add(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Add>(Dest, Src0, Src1, Pred);
  }
  /// Flag-setting variant of _add: inserts an InstARM32Add with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _adds(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Add>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Adc(Dest, Src0, Src1, Pred) via Context.insert.
  void _adc(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Adc>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32And(Dest, Src0, Src1, Pred) via Context.insert.
  void _and(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32And>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Asr(Dest, Src0, Src1, Pred) via Context.insert.
  void _asr(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Asr>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Bic(Dest, Src0, Src1, Pred) via Context.insert.
  void _bic(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Bic>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Br with both a true and a false target node and the
  /// given Condition.
  void _br(CfgNode *TargetTrue, CfgNode *TargetFalse,
           CondARM32::Cond Condition) {
    Context.insert<InstARM32Br>(TargetTrue, TargetFalse, Condition);
  }
  /// Inserts an InstARM32Br to Target with no condition argument.
  void _br(CfgNode *Target) { Context.insert<InstARM32Br>(Target); }
  /// Inserts an InstARM32Br to the node Target with the given Condition.
  void _br(CfgNode *Target, CondARM32::Cond Condition) {
    Context.insert<InstARM32Br>(Target, Condition);
  }
  /// Inserts an InstARM32Br to the intra-node Label with the given Condition.
  void _br(InstARM32Label *Label, CondARM32::Cond Condition) {
    Context.insert<InstARM32Br>(Label, Condition);
  }
  /// Inserts an InstARM32Cmn(Src0, Src1, Pred) via Context.insert.
  void _cmn(Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Cmn>(Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Cmp(Src0, Src1, Pred) via Context.insert.
  void _cmp(Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Cmp>(Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Clz(Dest, Src0, Pred) via Context.insert.
  void _clz(Variable *Dest, Variable *Src0,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Clz>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Dmb via Context.insert.
  void _dmb() { Context.insert<InstARM32Dmb>(); }
  /// Inserts an InstARM32Eor(Dest, Src0, Src1, Pred) via Context.insert.
  void _eor(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Eor>(Dest, Src0, Src1, Pred);
  }
  /// _ldr, for all your memory to Variable data moves. It handles all types
  /// (integer, floating point, and vectors.) Addr needs to be valid for Dest's
  /// type (e.g., no immediates for vector loads, and no index registers for fp
  /// loads.) Implemented by inserting an InstARM32Ldr(Dest, Addr, Pred).
  void _ldr(Variable *Dest, OperandARM32Mem *Addr,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Ldr>(Dest, Addr, Pred);
  }
  /// Inserts an InstARM32Ldrex(Dest, Addr, Pred) and returns it. When Dest is
  /// a Variable64On32, fake-defs of its lo and hi halves (each with Dest as
  /// source) are inserted right after the load.
  InstARM32Ldrex *_ldrex(Variable *Dest, OperandARM32Mem *Addr,
                         CondARM32::Cond Pred = CondARM32::AL) {
    auto *Inserted = Context.insert<InstARM32Ldrex>(Dest, Addr, Pred);
    auto *Pair = llvm::dyn_cast<Variable64On32>(Dest);
    if (Pair == nullptr)
      return Inserted;
    Context.insert<InstFakeDef>(Pair->getLo(), Dest);
    Context.insert<InstFakeDef>(Pair->getHi(), Dest);
    return Inserted;
  }
  /// Inserts an InstARM32Lsl(Dest, Src0, Src1, Pred) via Context.insert.
  void _lsl(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Lsl>(Dest, Src0, Src1, Pred);
  }
  /// Flag-setting variant of _lsl: inserts an InstARM32Lsl with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _lsls(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Lsl>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Lsr(Dest, Src0, Src1, Pred) via Context.insert.
  void _lsr(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Lsr>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Mla(Dest, Src0, Src1, Acc, Pred) via Context.insert.
  void _mla(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Mla>(Dest, Src0, Src1, Acc, Pred);
  }
  /// Inserts an InstARM32Mls(Dest, Src0, Src1, Acc, Pred) via Context.insert.
  void _mls(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Mls>(Dest, Src0, Src1, Acc, Pred);
  }
  /// _mov, for all your Variable to Variable data movement needs. It handles
  /// all types (integer, floating point, and vectors), as well as moves between
  /// Core and VFP registers. This is not a panacea: you must obey the (weird,
  /// confusing, non-uniform) rules for data moves in ARM. Src0 must not be an
  /// OperandARM32Mem.
  void _mov(Variable *Dest, Operand *Src0,
            CondARM32::Cond Pred = CondARM32::AL) {
    // A nullptr Dest used to make _mov create a temporary on its own. That is
    // no longer supported; the assert guards any untested path that still
    // relies on it.
    assert(Dest != nullptr);
    assert(!llvm::isa<OperandARM32Mem>(Src0));
    auto *Mov = Context.insert<InstARM32Mov>(Dest, Src0, Pred);
    if (!Mov->isMultiDest())
      return;

    // A multi-dest move requires Dest to be a Variable64On32; its DestHi is
    // modelled with a fake-def.
    assert(llvm::isa<Variable64On32>(Dest));
    Context.insert<InstFakeDef>(Mov->getDestHi());
  }

  /// Like _mov, but marks the inserted InstARM32Mov with setDestRedefined().
  /// For a multi-dest move a fake-def of DestHi is added as well.
  void _mov_redefined(Variable *Dest, Operand *Src0,
                      CondARM32::Cond Pred = CondARM32::AL) {
    auto *Mov = Context.insert<InstARM32Mov>(Dest, Src0, Pred);
    Mov->setDestRedefined();
    if (!Mov->isMultiDest())
      return;

    // A multi-dest move requires Dest to be a Variable64On32; its DestHi is
    // modelled with a fake-def.
    assert(llvm::isa<Variable64On32>(Dest));
    Context.insert<InstFakeDef>(Mov->getDestHi());
  }

  /// Inserts an InstARM32Nop via Context.insert.
  void _nop() { Context.insert<InstARM32Nop>(); }

  // Generates a vmov instruction to extract the given index from a vector
  // register. Implemented by inserting an
  // InstARM32Extract(Dest, Src0, Index, Pred).
  void _extractelement(Variable *Dest, Variable *Src0, uint32_t Index,
                       CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Extract>(Dest, Src0, Index, Pred);
  }

  // Generates a vmov instruction to insert a value into the given index of a
  // vector register. Implemented by inserting an
  // InstARM32Insert(Dest, Src0, Index, Pred).
  void _insertelement(Variable *Dest, Variable *Src0, uint32_t Index,
                      CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Insert>(Dest, Src0, Index, Pred);
  }

  // --------------------------------------------------------------------------
  // Begin bool folding machinery.
  //
  // There are three types of boolean lowerings handled by this target:
  //
  // 1) Boolean expressions leading to a boolean Variable definition
  // ---------------------------------------------------------------
  //
  // Whenever a i1 Variable is live out (i.e., its live range extends beyond
  // the defining basic block) we do not fold the operation. We instead
  // materialize (i.e., compute) the variable normally, so that it can be used
  // when needed. We also materialize i1 values that are not single use to
  // avoid code duplication. These expressions are not short circuited.
  //
  // 2) Boolean expressions leading to a select
  // ------------------------------------------
  //
  // These include boolean chains leading to a select instruction, as well as
  // i1 Sexts. These boolean expressions are lowered to:
  //
  // mov T, <false value>
  // CC <- eval(Boolean Expression)
  // movCC T, <true value>
  //
  // For Sexts, <false value> is 0, and <true value> is -1.
  //
  // 3) Boolean expressions leading to a br i1
  // -----------------------------------------
  //
  // These are the boolean chains leading to a branch. These chains are
  // short-circuited, i.e.:
  //
  //   A = or i1 B, C
  //   br i1 A, label %T, label %F
  //
  // becomes
  //
  //   tst B
  //   jne %T
  //   tst C
  //   jne %T
  //   j %F
  //
  // and
  //
  //   A = and i1 B, C
  //   br i1 A, label %T, label %F
  //
  // becomes
  //
  //   tst B
  //   jeq %F
  //   tst C
  //   jeq %F
  //   j %T
  //
  // Arbitrarily long chains are short circuited, e.g
  //
  //   A = or  i1 B, C
  //   D = and i1 A, E
  //   F = and i1 G, H
  //   I = or i1 D, F
  //   br i1 I, label %True, label %False
  //
  // becomes
  //
  // Label[A]:
  //   tst B, 1
  //   bne Label[D]
  //   tst C, 1
  //   beq Label[I]
  // Label[D]:
  //   tst E, 1
  //   bne %True
  // Label[I]
  //   tst G, 1
  //   beq %False
  //   tst H, 1
  //   beq %False (bne %True)

  /// lowerInt1 materializes Boolean to a Variable.
  SafeBoolChain lowerInt1(Variable *Dest, Operand *Boolean);

  /// lowerInt1ForSelect generates the following instruction sequence:
  ///
  ///   mov T, FalseValue
  ///   CC <- eval(Boolean)
  ///   movCC T, TrueValue
  ///   mov Dest, T
  ///
  /// It is used for lowering select i1, as well as i1 Sext.
  void lowerInt1ForSelect(Variable *Dest, Operand *Boolean, Operand *TrueValue,
                          Operand *FalseValue);

  /// LowerInt1BranchTarget is used by lowerInt1ForBranch. It wraps either a
  /// CfgNode or an InstARM32Label (never both) so that, during br i1 lowering,
  /// auxiliary labels can be created for short circuiting the condition
  /// evaluation. The pointer that is not wrapped stays nullptr.
  class LowerInt1BranchTarget {
  public:
    explicit LowerInt1BranchTarget(CfgNode *const Target)
        : NodeTarget(Target) {}
    explicit LowerInt1BranchTarget(InstARM32Label *const Target)
        : LabelTarget(Target) {}

    /// createForLabelOrDuplicate returns a target wrapping Label when Label is
    /// not nullptr; otherwise it returns an exact copy of this object.
    LowerInt1BranchTarget
    createForLabelOrDuplicate(InstARM32Label *Label) const {
      if (Label)
        return LowerInt1BranchTarget(Label);
      return NodeTarget ? LowerInt1BranchTarget(NodeTarget)
                        : LowerInt1BranchTarget(LabelTarget);
    }

    CfgNode *const NodeTarget = nullptr;
    InstARM32Label *const LabelTarget = nullptr;
  };

  /// LowerInt1AllowShortCircuit is a helper type used by lowerInt1ForBranch for
  /// determining which type of arithmetic is allowed to be short circuited.
  /// This is useful for lowering
  ///
  ///   t1 = and i1 A, B
  ///   t2 = and i1 t1, C
  ///   br i1 t2, label %False, label %True
  ///
  /// to
  ///
  ///   tst A, 1
  ///   beq %False
  ///   tst B, 1
  ///   beq %False
  ///   tst C, 1
  ///   bne %True
  ///   b %False
  ///
  /// Without this information, short circuiting would only allow to short
  /// circuit a single high level instruction. For example:
  ///
  ///   t1 = or i1 A, B
  ///   t2 = and i1 t1, C
  ///   br i1 t2, label %False, label %True
  ///
  /// cannot be lowered to
  ///
  ///   tst A, 1
  ///   bne %True
  ///   tst B, 1
  ///   bne %True
  ///   tst C, 1
  ///   beq %True
  ///   b %False
  ///
  /// It needs to be lowered to
  ///
  ///   tst A, 1
  ///   bne Aux
  ///   tst B, 1
  ///   beq %False
  /// Aux:
  ///   tst C, 1
  ///   bne %True
  ///   b %False
  ///
  /// TODO(jpp): evaluate if this kind of short circuiting hurts performance (it
  /// might.)
  enum LowerInt1AllowShortCircuit {
    SC_And = 1,
    SC_Or = 2,
    SC_All = SC_And | SC_Or,
  };

  /// ShortCircuitCondAndLabel bundles what a lowerInt1ForBranch call hands
  /// back: the condition codes to use when branching to the
  /// TrueTarget/FalseTarget and, when that call created an internal (i.e.,
  /// short-circuit) label, the label itself. ShortCircuitTarget is nullptr
  /// when no such label was created.
  class ShortCircuitCondAndLabel {
  public:
    explicit ShortCircuitCondAndLabel(CondWhenTrue &&C,
                                      InstARM32Label *L = nullptr)
        : Cond(std::move(C)), ShortCircuitTarget(L) {}

    /// Returns Cond after asserting that no short-circuit label is present.
    CondWhenTrue assertNoLabelAndReturnCond() const {
      assert(!ShortCircuitTarget);
      return Cond;
    }

    const CondWhenTrue Cond;
    InstARM32Label *const ShortCircuitTarget;
  };

  /// lowerInt1ForBranch expands Boolean, and returns the condition codes that
  /// are to be used for branching to the branch's TrueTarget. It may return a
  /// label that the expansion of Boolean used to short circuit the chain's
  /// evaluation.
  ShortCircuitCondAndLabel
  lowerInt1ForBranch(Operand *Boolean, const LowerInt1BranchTarget &TargetTrue,
                     const LowerInt1BranchTarget &TargetFalse,
                     uint32_t ShortCircuitable);

  // _br is a convenience wrapper that emits br instructions to Target.
  void _br(const LowerInt1BranchTarget &BrTarget,
           CondARM32::Cond Cond = CondARM32::AL) {
    assert((BrTarget.NodeTarget == nullptr) !=
           (BrTarget.LabelTarget == nullptr));
    if (BrTarget.NodeTarget != nullptr)
      _br(BrTarget.NodeTarget, Cond);
    else
      _br(BrTarget.LabelTarget, Cond);
  }

  // _br_short_circuit is used when lowering InstArithmetic::And and
  // InstArithmetic::Or and a short circuit branch is needed.
  void _br_short_circuit(const LowerInt1BranchTarget &Target,
                         const CondWhenTrue &Cond) {
    if (Cond.WhenTrue1 != CondARM32::kNone) {
      _br(Target, Cond.WhenTrue1);
    }
    if (Cond.WhenTrue0 != CondARM32::kNone) {
      _br(Target, Cond.WhenTrue0);
    }
  }
  // End of bool folding machinery
  // --------------------------------------------------------------------------

  /// Inserts an InstARM32Movt(Dest, Src0, Pred). The Operand can only be a
  /// 16-bit immediate or a ConstantRelocatable (with an upper16 relocation).
  void _movt(Variable *Dest, Operand *Src0,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Movt>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Movw(Dest, Src0, Pred) via Context.insert.
  void _movw(Variable *Dest, Operand *Src0,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Movw>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Mul(Dest, Src0, Src1, Pred) via Context.insert.
  void _mul(Variable *Dest, Variable *Src0, Variable *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Mul>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Mvn(Dest, Src0, Pred) via Context.insert.
  void _mvn(Variable *Dest, Operand *Src0,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Mvn>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Orr(Dest, Src0, Src1, Pred) via Context.insert.
  void _orr(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Orr>(Dest, Src0, Src1, Pred);
  }
  /// Flag-setting variant of _orr: inserts an InstARM32Orr with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _orrs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Orr>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Push of Sources via Context.insert.
  void _push(const VarList &Sources) { Context.insert<InstARM32Push>(Sources); }
  void _pop(const VarList &Dests) {
    Context.insert<InstARM32Pop>(Dests);
    // Mark dests as modified.
    for (Variable *Dest : Dests)
      Context.insert<InstFakeDef>(Dest);
  }
  /// Inserts an InstARM32Rbit(Dest, Src0, Pred) via Context.insert.
  void _rbit(Variable *Dest, Variable *Src0,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Rbit>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Rev(Dest, Src0, Pred) via Context.insert.
  void _rev(Variable *Dest, Variable *Src0,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Rev>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Ret(LR, Src0); Src0 is optional and defaults to
  /// nullptr.
  void _ret(Variable *LR, Variable *Src0 = nullptr) {
    Context.insert<InstARM32Ret>(LR, Src0);
  }
  /// Flag-setting variant of _rsc: inserts an InstARM32Rsc with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _rscs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Rsc>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Rsc(Dest, Src0, Src1, Pred) via Context.insert.
  void _rsc(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Rsc>(Dest, Src0, Src1, Pred);
  }
  /// Flag-setting variant of _rsb: inserts an InstARM32Rsb with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _rsbs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Rsb>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Rsb(Dest, Src0, Src1, Pred) via Context.insert.
  void _rsb(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Rsb>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Sbc(Dest, Src0, Src1, Pred) via Context.insert.
  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Sbc>(Dest, Src0, Src1, Pred);
  }
  /// Flag-setting variant of _sbc: inserts an InstARM32Sbc with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _sbcs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Sbc>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Sdiv(Dest, Src0, Src1, Pred) via Context.insert.
  void _sdiv(Variable *Dest, Variable *Src0, Variable *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Sdiv>(Dest, Src0, Src1, Pred);
  }
  /// _str, for all your Variable to memory transfers. Addr has the same
  /// restrictions that it does in _ldr. Implemented by inserting an
  /// InstARM32Str(Value, Addr, Pred).
  void _str(Variable *Value, OperandARM32Mem *Addr,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Str>(Value, Addr, Pred);
  }
  /// Inserts an InstARM32Strex(Dest, Value, Addr, Pred) and returns it. When
  /// Value is a Variable64On32, fake-uses of its lo and hi halves are inserted
  /// ahead of the store.
  InstARM32Strex *_strex(Variable *Dest, Variable *Value, OperandARM32Mem *Addr,
                         CondARM32::Cond Pred = CondARM32::AL) {
    auto *Pair = llvm::dyn_cast<Variable64On32>(Value);
    if (Pair != nullptr) {
      Context.insert<InstFakeUse>(Pair->getLo());
      Context.insert<InstFakeUse>(Pair->getHi());
    }
    auto *Inserted = Context.insert<InstARM32Strex>(Dest, Value, Addr, Pred);
    return Inserted;
  }
  /// Inserts an InstARM32Sub(Dest, Src0, Src1, Pred) via Context.insert.
  void _sub(Variable *Dest, Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Sub>(Dest, Src0, Src1, Pred);
  }
  /// Flag-setting variant of _sub: inserts an InstARM32Sub with its SetFlags
  /// argument true, followed by an InstFakeUse of Dest.
  void _subs(Variable *Dest, Variable *Src0, Operand *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Sub>(Dest, Src0, Src1, Pred, /*SetFlags=*/true);
    Context.insert<InstFakeUse>(Dest);
  }
  /// Inserts an InstARM32Sxt(Dest, Src0, Pred) via Context.insert.
  void _sxt(Variable *Dest, Variable *Src0,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Sxt>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Tst(Src0, Src1, Pred) via Context.insert.
  void _tst(Variable *Src0, Operand *Src1,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Tst>(Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Trap via Context.insert.
  void _trap() { Context.insert<InstARM32Trap>(); }
  /// Inserts an InstARM32Udiv(Dest, Src0, Src1, Pred) via Context.insert.
  void _udiv(Variable *Dest, Variable *Src0, Variable *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Udiv>(Dest, Src0, Src1, Pred);
  }
  /// Inserts an InstARM32Umull(DestLo, DestHi, Src0, Src1, Pred), bracketed by
  /// fake-def/fake-use instructions on DestHi. The insertion order below is
  /// significant.
  void _umull(Variable *DestLo, Variable *DestHi, Variable *Src0,
              Variable *Src1, CondARM32::Cond Pred = CondARM32::AL) {
    // umull requires DestLo and DestHi to be assigned to different GPRs. The
    // following lines create overlapping liveness ranges for both variables. If
    // either one of them is live, then they are both going to be live, and thus
    // assigned to different registers; if they are both dead, then DCE will
    // kick in and delete the following three instructions.
    Context.insert<InstFakeDef>(DestHi);
    Context.insert<InstARM32Umull>(DestLo, DestHi, Src0, Src1, Pred);
    // Redefinition of DestHi with DestLo as the fake-def's source.
    Context.insert<InstFakeDef>(DestHi, DestLo)->setDestRedefined();
    Context.insert<InstFakeUse>(DestHi);
  }
  /// Inserts an InstARM32Uxt with destination Dest and source Src0, predicated
  /// on Pred (CondARM32::AL by default).
  void _uxt(Variable *Dest, Variable *Src0,
            CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Uxt>(Dest, Src0, Pred);
  }
  /// Inserts an InstARM32Vabs with destination Dest and source Src, predicated
  /// on Pred (CondARM32::AL by default).
  void _vabs(Variable *Dest, Variable *Src,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vabs>(Dest, Src, Pred);
  }
  /// Inserts an InstARM32Vadd with destination Dest and sources Src0/Src1.
  /// This helper takes no predicate argument.
  void _vadd(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vadd>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vand with destination Dest and sources Src0/Src1.
  void _vand(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vand>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vbsl with destination Dest and sources Src0/Src1, and
  /// returns the inserted instruction so the caller can adjust it further.
  InstARM32Vbsl *_vbsl(Variable *Dest, Variable *Src0, Variable *Src1) {
    return Context.insert<InstARM32Vbsl>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vceq with destination Dest and sources Src0/Src1.
  void _vceq(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vceq>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vcge with destination Dest and sources Src0/Src1, and
  /// returns the inserted instruction.
  InstARM32Vcge *_vcge(Variable *Dest, Variable *Src0, Variable *Src1) {
    return Context.insert<InstARM32Vcge>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vcgt with destination Dest and sources Src0/Src1, and
  /// returns the inserted instruction.
  InstARM32Vcgt *_vcgt(Variable *Dest, Variable *Src0, Variable *Src1) {
    return Context.insert<InstARM32Vcgt>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vcvt from Src into Dest. Variant selects which
  /// InstARM32Vcvt::VcvtVariant is emitted; Pred defaults to CondARM32::AL.
  void _vcvt(Variable *Dest, Variable *Src, InstARM32Vcvt::VcvtVariant Variant,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vcvt>(Dest, Src, Variant, Pred);
  }
  /// Inserts an InstARM32Vdiv with destination Dest and sources Src0/Src1.
  void _vdiv(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vdiv>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vcmp on two variables, Src0 and Src1, predicated on
  /// Pred (CondARM32::AL by default). The helper takes no destination operand.
  void _vcmp(Variable *Src0, Variable *Src1,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vcmp>(Src0, Src1, Pred);
  }
  /// Overload of _vcmp whose second operand is an OperandARM32FlexFpZero
  /// instead of a Variable; otherwise forwards to InstARM32Vcmp the same way.
  void _vcmp(Variable *Src0, OperandARM32FlexFpZero *FpZero,
             CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vcmp>(Src0, FpZero, Pred);
  }
  /// Inserts an InstARM32Vdup with destination Dest, source Src and index Idx.
  /// NOTE(review): Idx presumably selects the source lane -- confirm against
  /// InstARM32Vdup.
  void _vdup(Variable *Dest, Variable *Src, int Idx) {
    Context.insert<InstARM32Vdup>(Dest, Src, Idx);
  }
  /// Inserts an InstARM32Veor with destination Dest and sources Src0/Src1.
  void _veor(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Veor>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vldr1d with destination Dest and memory operand Addr,
  /// predicated on Pred (CondARM32::AL by default).
  void _vldr1d(Variable *Dest, OperandARM32Mem *Addr,
               CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vldr1d>(Dest, Addr, Pred);
  }
  /// Inserts an InstARM32Vldr1q with destination Dest and memory operand Addr,
  /// predicated on Pred (CondARM32::AL by default).
  void _vldr1q(Variable *Dest, OperandARM32Mem *Addr,
               CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vldr1q>(Dest, Addr, Pred);
  }
  /// Inserts an InstARM32Vmrs predicated on Pred (CondARM32::AL by default).
  /// The helper takes no explicit operands.
  void _vmrs(CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vmrs>(Pred);
  }
  /// Inserts an InstARM32Vmla with destination Dest and sources Src0/Src1.
  void _vmla(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmla>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmlap with destination Dest and sources Src0/Src1.
  void _vmlap(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmlap>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmls with destination Dest and sources Src0/Src1.
  void _vmls(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmls>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmovl with destination Dest and sources Src0/Src1.
  void _vmovl(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmovl>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmovh with destination Dest and sources Src0/Src1.
  void _vmovh(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmovh>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmovhl with destination Dest and sources Src0/Src1.
  void _vmovhl(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmovhl>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmovlh with destination Dest and sources Src0/Src1.
  void _vmovlh(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmovlh>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmul with destination Dest and sources Src0/Src1.
  void _vmul(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vmul>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vmulh(Dest, Src0, Src1) and tags it with
  /// InstARM32::FS_Unsigned when Unsigned is true, InstARM32::FS_Signed
  /// otherwise.
  void _vmulh(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned) {
    const auto Sign = Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed;
    auto *Instr = Context.insert<InstARM32Vmulh>(Dest, Src0, Src1);
    Instr->setSignType(Sign);
  }
  /// Inserts an InstARM32Vmvn with destination Dest and source Src0. The
  /// predicate is fixed to CondARM32::AL; callers cannot override it here.
  void _vmvn(Variable *Dest, Variable *Src0) {
    Context.insert<InstARM32Vmvn>(Dest, Src0, CondARM32::AL);
  }
  /// Inserts an InstARM32Vneg(Dest, Src0) with the predicate fixed to
  /// CondARM32::AL, and marks the instruction as InstARM32::FS_Signed.
  void _vneg(Variable *Dest, Variable *Src0) {
    auto *Instr = Context.insert<InstARM32Vneg>(Dest, Src0, CondARM32::AL);
    Instr->setSignType(InstARM32::FS_Signed);
  }
  /// Inserts an InstARM32Vorr with destination Dest and sources Src0/Src1.
  void _vorr(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vorr>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vqadd(Dest, Src0, Src1) and tags it with
  /// InstARM32::FS_Unsigned when Unsigned is true, InstARM32::FS_Signed
  /// otherwise.
  void _vqadd(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned) {
    const auto Sign = Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed;
    auto *Instr = Context.insert<InstARM32Vqadd>(Dest, Src0, Src1);
    Instr->setSignType(Sign);
  }
  /// Inserts an InstARM32Vqmovn2(Dest, Src0, Src1) and sets its sign type:
  /// InstARM32::FS_None when Saturating is false; otherwise FS_Unsigned or
  /// FS_Signed according to Unsigned.
  void _vqmovn2(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned,
                bool Saturating) {
    auto Sign = InstARM32::FS_None;
    if (Saturating) {
      Sign = Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed;
    }
    auto *Instr = Context.insert<InstARM32Vqmovn2>(Dest, Src0, Src1);
    Instr->setSignType(Sign);
  }
  /// Inserts an InstARM32Vqsub(Dest, Src0, Src1) and tags it with
  /// InstARM32::FS_Unsigned when Unsigned is true, InstARM32::FS_Signed
  /// otherwise.
  void _vqsub(Variable *Dest, Variable *Src0, Variable *Src1, bool Unsigned) {
    const auto Sign = Unsigned ? InstARM32::FS_Unsigned : InstARM32::FS_Signed;
    auto *Instr = Context.insert<InstARM32Vqsub>(Dest, Src0, Src1);
    Instr->setSignType(Sign);
  }
  /// Inserts an InstARM32Vshl whose second source is a Variable and returns the
  /// inserted instruction. Unlike the ConstantInteger32 overload, no sign type
  /// is set here.
  InstARM32Vshl *_vshl(Variable *Dest, Variable *Src0, Variable *Src1) {
    return Context.insert<InstARM32Vshl>(Dest, Src0, Src1);
  }
  /// Overload of _vshl taking a ConstantInteger32 as its second source. The
  /// inserted InstARM32Vshl is always marked InstARM32::FS_Unsigned.
  void _vshl(Variable *Dest, Variable *Src0, ConstantInteger32 *Src1) {
    auto *Instr = Context.insert<InstARM32Vshl>(Dest, Src0, Src1);
    Instr->setSignType(InstARM32::FS_Unsigned);
  }
  /// Inserts an InstARM32Vshr with destination Dest, source Src0 and constant
  /// operand Src1, and returns the inserted instruction. No sign type is set
  /// by this helper.
  InstARM32Vshr *_vshr(Variable *Dest, Variable *Src0,
                       ConstantInteger32 *Src1) {
    return Context.insert<InstARM32Vshr>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vsqrt with destination Dest and source Src, predicated
  /// on Pred (CondARM32::AL by default).
  void _vsqrt(Variable *Dest, Variable *Src,
              CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vsqrt>(Dest, Src, Pred);
  }
  /// Inserts an InstARM32Vstr1 for Value and memory operand Addr, predicated on
  /// Pred, passing 32 as the instruction's trailing argument.
  /// NOTE(review): the meaning/unit of that constant is defined by
  /// InstARM32Vstr1 -- confirm there.
  void _vstr1d(Variable *Value, OperandARM32Mem *Addr,
               CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vstr1>(Value, Addr, Pred, 32);
  }
  /// Inserts an InstARM32Vstr1 for Value and memory operand Addr, predicated on
  /// Pred, passing 64 as the instruction's trailing argument (compare _vstr1d,
  /// which passes 32).
  /// NOTE(review): the meaning/unit of that constant is defined by
  /// InstARM32Vstr1 -- confirm there.
  void _vstr1q(Variable *Value, OperandARM32Mem *Addr,
               CondARM32::Cond Pred = CondARM32::AL) {
    Context.insert<InstARM32Vstr1>(Value, Addr, Pred, 64);
  }
  /// Inserts an InstARM32Vsub with destination Dest and sources Src0/Src1.
  void _vsub(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vsub>(Dest, Src0, Src1);
  }
  /// Inserts an InstARM32Vzip with destination Dest and sources Src0/Src1.
  void _vzip(Variable *Dest, Variable *Src0, Variable *Src1) {
    Context.insert<InstARM32Vzip>(Dest, Src0, Src1);
  }

  // Iterates over the CFG and determines the maximum outgoing stack arguments
  // bytes. This information is later used during addProlog() to pre-allocate
  // the outargs area.
  // TODO(jpp): This could live in the Parser, if we provided a Target-specific
  // method that the Parser could call.
  void findMaxStackOutArgsSize();

  /// Returns true if the given Offset can be represented in a Load/Store Mem
  /// Operand.
  bool isLegalMemOffset(Type Ty, int32_t Offset) const;

  void postLowerLegalization();

  /// Manages the GotPtr variable, which is used for Nonsfi sandboxing.
  /// @{
  void createGotPtr();
  void insertGotPtrInitPlaceholder();
  VariableDeclaration *createGotRelocation(RelocOffset *AddPcReloc);
  void materializeGotAddr(CfgNode *Node);
  Variable *GotPtr = nullptr;
  // TODO(jpp): use CfgLocalAllocator.
  /// @}

  /// Manages the Gotoff relocations created during the function lowering. A
  /// single Gotoff relocation is created for each global variable used by the
  /// function being lowered.
  /// @{
  // TODO(jpp): if the same global G is used in different functions, then this
  // method will emit one G(gotoff) relocation per function.
  GlobalString createGotoffRelocation(const ConstantRelocatable *CR);
  CfgUnorderedSet<GlobalString> KnownGotoffs;
  /// @}

  /// Loads the constant relocatable Name into Register, then invokes Finish to
  /// complete the relocatable lowering. Finish **must** use PC in its first
  /// emitted instruction, or the relocatable in Register will contain the wrong
  /// value.
  //
  // Lowered sequence:
  //
  // Movw:
  //     movw Register, #:lower16:Name - (End - Movw) - 8 .
  // Movt:
  //     movt Register, #:upper16:Name - (End - Movt) - 8 .
  //     PC = fake-def
  // End:
  //     Finish(PC)
  //
  // The -8 in movw/movt above is to account for the PC value that the first
  // instruction emitted by Finish(PC) will read.
  void
  loadNamedConstantRelocatablePIC(GlobalString Name, Variable *Register,
                                  std::function<void(Variable *PC)> Finish);

  /// Sandboxer defines methods for ensuring that "dangerous" operations are
  /// masked during sandboxed code emission. For regular, non-sandboxed code
  /// emission, its methods are simple pass-through methods.
  ///
  /// The Sandboxer also emits BundleLock/BundleUnlock pseudo-instructions
  /// in the constructor/destructor during sandboxed code emission. Therefore,
  /// it is a bad idea to create an object of this type and "keep it around."
  /// The recommended usage is:
  ///
  /// AutoSandboxing(this).<<operation>>(...);
  ///
  /// This usage ensures that no other instructions are inadvertently added to
  /// the bundle.
  class Sandboxer {
    Sandboxer() = delete;
    Sandboxer(const Sandboxer &) = delete;
    Sandboxer &operator=(const Sandboxer &) = delete;

  public:
    /// Creates a Sandboxer for Target. BundleOption defaults to
    /// InstBundleLock::Opt_None.
    explicit Sandboxer(
        TargetARM32 *Target,
        InstBundleLock::Option BundleOption = InstBundleLock::Opt_None);
    ~Sandboxer();

    /// Increments sp:
    ///
    ///   add sp, sp, AddAmount
    ///   bic sp, sp, 0xc0000000
    ///
    /// (for the rationale, see the ARM 32-bit Sandbox Specification.)
    void add_sp(Operand *AddAmount);

    /// Emits code to align sp to the specified alignment:
    ///
    ///   bic/and sp, sp, Alignment
    ///   bic sp, sp, 0xc0000000
    void align_sp(size_t Alignment);

    /// Emits a call instruction. If CallTarget is a Variable, it emits
    ///
    ///   bic CallTarget, CallTarget, 0xc000000f
    ///   bl CallTarget
    ///
    /// Otherwise, it emits
    ///
    ///   bl CallTarget
    ///
    /// Note: in sandboxed code calls are always emitted in addresses 12 mod 16.
    InstARM32Call *bl(Variable *ReturnReg, Operand *CallTarget);

    /// Emits a load:
    ///
    ///   bic rBase, rBase, 0xc0000000
    ///   ldr rDest, [rBase, #Offset]
    ///
    /// Exception: if rBase is r9 or sp, then the load is emitted as:
    ///
    ///   ldr rDest, [rBase, #Offset]
    ///
    /// because the NaCl ARM 32-bit Sandbox Specification guarantees they are
    /// always valid.
    void ldr(Variable *Dest, OperandARM32Mem *Mem, CondARM32::Cond Pred);

    /// Emits a load exclusive:
    ///
    ///   bic rBase, rBase, 0xc0000000
    ///   ldrex rDest, [rBase]
    ///
    /// Exception: if rBase is r9 or sp, then the load is emitted as:
    ///
    ///   ldrex rDest, [rBase]
    ///
    /// because the NaCl ARM 32-bit Sandbox Specification guarantees they are
    /// always valid.
    void ldrex(Variable *Dest, OperandARM32Mem *Mem, CondARM32::Cond Pred);

    /// Resets sp to Src:
    ///
    ///   mov sp, Src
    ///   bic sp, sp, 0xc0000000
    void reset_sp(Variable *Src);

    /// Emits code to return from a function:
    ///
    ///   bic lr, lr, 0xc000000f
    ///   bx lr
    void ret(Variable *RetAddr, Variable *RetValue);

    /// Emits a store:
    ///
    ///   bic rBase, rBase, 0xc0000000
    ///   str rSrc, [rBase, #Offset]
    ///
    /// Exception: if rBase is r9 or sp, then the store is emitted as:
    ///
    ///   str rSrc, [rBase, #Offset]
    ///
    /// because the NaCl ARM 32-bit Sandbox Specification guarantees they are
    /// always valid.
    void str(Variable *Src, OperandARM32Mem *Mem, CondARM32::Cond Pred);

    /// Emits a store exclusive:
    ///
    ///   bic rBase, rBase, 0xc0000000
    ///   strex rDest, rSrc, [rBase]
    ///
    /// Exception: if rBase is r9 or sp, then the store is emitted as:
    ///
    ///   strex rDest, rSrc, [rBase]
    ///
    /// because the NaCl ARM 32-bit Sandbox Specification guarantees they are
    /// always valid.
    void strex(Variable *Dest, Variable *Src, OperandARM32Mem *Mem,
               CondARM32::Cond Pred);

    /// Decrements sp:
    ///
    ///   sub sp, sp, SubAmount
    ///   bic sp, sp, 0xc0000000
    void sub_sp(Operand *SubAmount);

  private:
    // The lowering target and bundle option supplied at construction; both are
    // immutable for the lifetime of the Sandboxer.
    TargetARM32 *const Target;
    const InstBundleLock::Option BundleOption;
    // NOTE(review): presumably populated by createAutoBundle(), which is
    // defined out of line -- confirm in the .cpp file.
    std::unique_ptr<AutoBundle> Bundler;

    void createAutoBundle();
  };

  /// Helper that legalizes memory operands and Mov instructions (see
  /// legalizeMemOperand() and legalizeMov()), while keeping track of an
  /// optional temporary base register (TempBaseReg / TempBaseOffset).
  class PostLoweringLegalizer {
    PostLoweringLegalizer() = delete;
    PostLoweringLegalizer(const PostLoweringLegalizer &) = delete;
    PostLoweringLegalizer &operator=(const PostLoweringLegalizer &) = delete;

  public:
    /// Binds the legalizer to Target and caches the physical register that
    /// corresponds to Target->getFrameOrStackReg().
    explicit PostLoweringLegalizer(TargetARM32 *Target)
        : Target(Target), StackOrFrameReg(Target->getPhysicalRegister(
                              Target->getFrameOrStackReg())) {}

    // NOTE(review): defined out of line; judging by its name it presumably
    // drops TempBaseReg when Instr overwrites it -- confirm in the .cpp file.
    void resetTempBaseIfClobberedBy(const Inst *Instr);

    // Ensures that the TempBase register held by this legalizer (if any) is
    // assigned to IP.
    void assertNoTempOrAssignedToIP() const {
      assert(TempBaseReg == nullptr ||
             TempBaseReg->getRegNum() == Target->getReservedTmpReg());
    }

    // Legalizes Mem. If Mem.Base is a Rematerializable variable, Mem.Offset is
    // fixed up.
    OperandARM32Mem *legalizeMemOperand(OperandARM32Mem *Mem,
                                        bool AllowOffsets = true);

    /// Legalizes Mov if its Source (or Destination) is a spilled Variable, or
    /// if its Source is a Rematerializable variable (this form is used in lieu
    /// of lea, which is not available in ARM.)
    ///
    /// Moves to memory become store instructions, and moves from memory, loads.
    void legalizeMov(InstARM32Mov *Mov);

  private:
    /// Creates a new Base register centered around [Base, +/- Offset].
    Variable *newBaseRegister(Variable *Base, int32_t Offset,
                              RegNumT ScratchRegNum);

    /// Creates a new, legal OperandARM32Mem for accessing Base + Offset.
    /// The returned mem operand is a legal operand for accessing memory that is
    /// of type Ty.
    ///
    /// If [Base, #Offset] is encodable, then the method returns a Mem operand
    /// expressing it. Otherwise,
    ///
    /// if [TempBaseReg, #Offset-TempBaseOffset] is a valid memory operand, the
    /// method will return that. Otherwise,
    ///
    /// a new base register ip=Base+Offset is created, and the method returns a
    /// memory operand expressing [ip, #0].
    OperandARM32Mem *createMemOperand(Type Ty, Variable *Base, int32_t Offset,
                                      bool AllowOffsets = true);
    // Set once in the constructor and never reseated.
    TargetARM32 *const Target;
    Variable *const StackOrFrameReg;
    // Current temporary base register (nullptr when there is none) and the
    // offset associated with it; see createMemOperand().
    Variable *TempBaseReg = nullptr;
    int32_t TempBaseOffset = 0;
  };

  const bool NeedSandboxing;
  TargetARM32Features CPUFeatures;
  bool UsesFramePointer = false;
  bool NeedsStackAlignment = false;
  bool MaybeLeafFunc = true;
  size_t SpillAreaSizeBytes = 0;
  size_t FixedAllocaSizeBytes = 0;
  size_t FixedAllocaAlignBytes = 0;
  bool PrologEmitsFixedAllocas = false;
  uint32_t MaxOutArgsSizeBytes = 0;
  // TODO(jpp): std::array instead of array.
  static SmallBitVector TypeToRegisterSet[RegARM32::RCARM32_NUM];
  static SmallBitVector TypeToRegisterSetUnfiltered[RegARM32::RCARM32_NUM];
  static SmallBitVector RegisterAliases[RegARM32::Reg_NUM];
  SmallBitVector RegsUsed;
  VarList PhysicalRegisters[IceType_NUM];
  VarList PreservedGPRs;
  VarList PreservedSRegs;

  /// Helper class that understands the Calling Convention and register
  /// assignments. The first few integer type parameters can use r0-r3,
  /// regardless of their position relative to the floating-point/vector
  /// arguments in the argument list. Floating-point and vector arguments
  /// can use q0-q3 (aka d0-d7, s0-s15). For more information on the topic,
  /// see the ARM Architecture Procedure Calling Standards (AAPCS).
  ///
  /// Technically, arguments that can start with registers but extend beyond the
  /// available registers can be split between the registers and the stack.
  /// However, this is typically for passing GPR structs by value, and PNaCl
  /// transforms expand this out.
  ///
  /// At (public) function entry, the stack must be 8-byte aligned.
  class CallingConv {
    CallingConv(const CallingConv &) = delete;
    CallingConv &operator=(const CallingConv &) = delete;

  public:
    // The constructor is defined out of line; the destructor is the
    // compiler-generated one.
    CallingConv();
    ~CallingConv() = default;

    /// argInGPR returns true if there is a GPR available for the requested
    /// type, and false otherwise. If it returns true, Reg is set to the
    /// appropriate register number. Note that, when Ty == IceType_i64, Reg will
    /// be an I64 register pair.
    bool argInGPR(Type Ty, RegNumT *Reg);

    /// argInVFP is to floating-point/vector types what argInGPR is for integer
    /// types.
    bool argInVFP(Type Ty, RegNumT *Reg);

  private:
    // State for integer arguments (see argInGPR).
    // NOTE(review): GPRArgs/I64Args presumably hold the candidate registers
    // that are still available -- confirm in the .cpp file.
    void discardUnavailableGPRsAndTheirAliases(CfgVector<RegNumT> *Regs);
    SmallBitVector GPRegsUsed;
    CfgVector<RegNumT> GPRArgs;
    CfgVector<RegNumT> I64Args;

    // State for floating-point/vector arguments (see argInVFP).
    void discardUnavailableVFPRegs(CfgVector<RegNumT> *Regs);
    SmallBitVector VFPRegsUsed;
    CfgVector<RegNumT> FP32Args;
    CfgVector<RegNumT> FP64Args;
    CfgVector<RegNumT> Vec128Args;
  };

private:
  ENABLE_MAKE_UNIQUE;

  OperandARM32Mem *formAddressingMode(Type Ty, Cfg *Func, const Inst *LdSt,
                                      Operand *Base);

  void postambleCtpop64(const InstCall *Instr);
  void preambleDivRem(const InstCall *Instr);
  CfgUnorderedMap<Operand *, void (TargetARM32::*)(const InstCall *Instr)>
      ARM32HelpersPreamble;
  CfgUnorderedMap<Operand *, void (TargetARM32::*)(const InstCall *Instr)>
      ARM32HelpersPostamble;

  /// Keeps a map from variable index to the instruction recorded as that
  /// variable's producer, so the producer can be looked up later through
  /// getProducerOf().
  class ComputationTracker {
  public:
    ComputationTracker() = default;
    ~ComputationTracker() = default;

    /// Discards every recorded producer.
    void forgetProducers() { KnownComputations.clear(); }
    /// Defined out of line.
    void recordProducers(CfgNode *Node);

    /// Returns the instruction recorded for Opnd, or nullptr when Opnd is not
    /// a Variable or when no entry exists for its index.
    const Inst *getProducerOf(const Operand *Opnd) const {
      if (auto *AsVar = llvm::dyn_cast<Variable>(Opnd)) {
        const auto Found = KnownComputations.find(AsVar->getIndex());
        if (Found != KnownComputations.end()) {
          return Found->second.Instr;
        }
      }
      return nullptr;
    }

    /// Prints every recorded producer to the context's dump stream. Does
    /// nothing unless dumping is built in and Func is verbose for
    /// IceV_Folding.
    void dump(const Cfg *Func) const {
      const bool DumpEnabled =
          BuildDefs::dump() && Func->isVerbose(IceV_Folding);
      if (!DumpEnabled) {
        return;
      }
      OstreamLocker L(Func->getContext());
      Ostream &Str = Func->getContext()->getStrDump();
      Str << "foldable producer:\n";
      for (const auto &Entry : KnownComputations) {
        auto *Producer = Entry.second.Instr;
        Str << "    ";
        Producer->dump(Func);
        Str << "\n";
      }
      Str << "\n";
    }

  private:
    /// Payload stored per tracked variable.
    class ComputationEntry {
    public:
      ComputationEntry(Inst *I, Type Ty) : Instr(I), ComputationType(Ty) {}
      Inst *const Instr;
      // Boolean folding is disabled for variables whose live range spans
      // multiple blocks. IsLiveOut starts out conservatively as true and is
      // cleared once the end of the live range of the variable defined by
      // Instr is found. Without liveness analysis (e.g., in Om1 mode) it is
      // never cleared, which leaves folding disabled.
      bool IsLiveOut = true;
      int32_t NumUses = 0;
      Type ComputationType;
    };

    // Maps a Variable number to the payload identifying the instruction that
    // defined it.
    using ComputationMap = CfgUnorderedMap<SizeT, ComputationEntry>;
    ComputationMap KnownComputations;
  };

  ComputationTracker Computations;

  // AllowTemporaryWithNoReg indicates if TargetARM32::makeReg() can be invoked
  // without specifying a physical register. This is needed for creating unbound
  // temporaries during Ice -> ARM lowering, but before register allocation.
  // This is a safeguard ensuring that no unbound temporaries are created
  // during the legalization post-passes.
  bool AllowTemporaryWithNoReg = true;
  // ForbidTemporaryWithoutReg is a RAII class that manages
  // AllowTemporaryWithNoReg.
  /// Scoped guard that clears Target->AllowTemporaryWithNoReg on construction
  /// and, on destruction, restores the value the flag had beforehand.
  /// Restoring the saved value (rather than unconditionally writing true)
  /// keeps the flag cleared while an enclosing guard is still alive, so guards
  /// may be nested safely.
  class ForbidTemporaryWithoutReg {
    ForbidTemporaryWithoutReg() = delete;
    ForbidTemporaryWithoutReg(const ForbidTemporaryWithoutReg &) = delete;
    ForbidTemporaryWithoutReg &
    operator=(const ForbidTemporaryWithoutReg &) = delete;

  public:
    /// Remembers the current flag value of Target, then forbids temporaries
    /// without a register.
    explicit ForbidTemporaryWithoutReg(TargetARM32 *Target)
        : Target(Target), PreviousValue(Target->AllowTemporaryWithNoReg) {
      Target->AllowTemporaryWithNoReg = false;
    }
    /// Puts the flag back to what it was when this guard was created.
    ~ForbidTemporaryWithoutReg() {
      Target->AllowTemporaryWithNoReg = PreviousValue;
    }

  private:
    TargetARM32 *const Target;
    // Value of Target->AllowTemporaryWithNoReg observed at construction.
    const bool PreviousValue;
  };
};

/// TargetDataLowering implementation for ARM32. Objects are obtained through
/// create(); the constructor is protected and the destructor is private.
class TargetDataARM32 final : public TargetDataLowering {
  TargetDataARM32() = delete;
  TargetDataARM32(const TargetDataARM32 &) = delete;
  TargetDataARM32 &operator=(const TargetDataARM32 &) = delete;

public:
  /// Allocates a TargetDataARM32 for Ctx and returns it owned through the
  /// base-class pointer type.
  static std::unique_ptr<TargetDataLowering> create(GlobalContext *Ctx) {
    std::unique_ptr<TargetDataLowering> Lowering(new TargetDataARM32(Ctx));
    return Lowering;
  }

  // Overrides of the TargetDataLowering interface; defined out of line.
  void lowerGlobals(const VariableDeclarationList &Vars,
                    const std::string &SectionSuffix) override;
  void lowerConstants() override;
  void lowerJumpTables() override;

protected:
  explicit TargetDataARM32(GlobalContext *Ctx);

private:
  ~TargetDataARM32() override = default;
};

class TargetHeaderARM32 final : public TargetHeaderLowering {
  TargetHeaderARM32() = delete;
  TargetHeaderARM32(const TargetHeaderARM32 &) = delete;
  TargetHeaderARM32 &operator=(const TargetHeaderARM32 &) = delete;

public:
  static std::unique_ptr<TargetHeaderLowering> create(GlobalContext *Ctx) {
    return std::unique_ptr<TargetHeaderLowering>(new TargetHeaderARM32(Ctx));
  }

  void lower() override;

protected:
  explicit TargetHeaderARM32(GlobalContext *Ctx);

private:
  ~TargetHeaderARM32() = default;

  TargetARM32Features CPUFeatures;
};

} // end of namespace ARM32
} // end of namespace Ice

#endif // SUBZERO_SRC_ICETARGETLOWERINGARM32_H