//===- subzero/src/IceTargetLoweringARM32.cpp - ARM32 lowering ------------===//
//
// The Subzero Code Generator
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file implements the TargetLoweringARM32 class, which consists almost
/// entirely of the lowering sequence for each high-level instruction.
///
//===----------------------------------------------------------------------===//
#include "IceTargetLoweringARM32.h"
#include "IceCfg.h"
#include "IceCfgNode.h"
#include "IceClFlags.h"
#include "IceDefs.h"
#include "IceELFObjectWriter.h"
#include "IceGlobalInits.h"
#include "IceInstARM32.def"
#include "IceInstARM32.h"
#include "IceLiveness.h"
#include "IceOperand.h"
#include "IcePhiLoweringImpl.h"
#include "IceRegistersARM32.h"
#include "IceTargetLoweringARM32.def"
#include "IceUtils.h"
#include "llvm/Support/MathExtras.h"
#include <algorithm>
#include <utility>
namespace Ice {
namespace {
// The following table summarizes the logic for lowering the icmp instruction
// for i32 and narrower types. Each icmp condition has a clear mapping to an
// ARM32 conditional move instruction.
const struct TableIcmp32_ {
CondARM32::Cond Mapping;
} TableIcmp32[] = {
#define X(val, is_signed, swapped64, C_32, C1_64, C2_64) \
{ CondARM32::C_32 } \
,
ICMPARM32_TABLE
#undef X
};
// The following table summarizes the logic for lowering the icmp instruction
// for the i64 type. Two conditional moves are needed for setting the result
// to 1 or 0. The operands may need to be swapped, and there is a slight
// difference for signed vs unsigned (comparing hi vs lo first, and using cmp
// vs sbc).
const struct TableIcmp64_ {
bool IsSigned;
bool Swapped;
CondARM32::Cond C1, C2;
} TableIcmp64[] = {
#define X(val, is_signed, swapped64, C_32, C1_64, C2_64) \
{ is_signed, swapped64, CondARM32::C1_64, CondARM32::C2_64 } \
,
ICMPARM32_TABLE
#undef X
};
CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) {
size_t Index = static_cast<size_t>(Cond);
assert(Index < llvm::array_lengthof(TableIcmp32));
return TableIcmp32[Index].Mapping;
}
// In some cases, there are x-macros tables for both high-level and low-level
// instructions/operands that use the same enum key value. The tables are kept
// separate to maintain a proper separation between abstraction layers. There
// is a risk that the tables could get out of sync if enum values are reordered
// or if entries are added or deleted. The following anonymous namespaces use
// static_asserts to ensure everything is kept in sync.
// Validate the enum values in ICMPARM32_TABLE.
namespace {
// Define a temporary set of enum values based on low-level table entries.
enum _icmp_ll_enum {
#define X(val, signed, swapped64, C_32, C1_64, C2_64) _icmp_ll_##val,
ICMPARM32_TABLE
#undef X
_num
};
// Define a set of constants based on high-level table entries.
#define X(tag, str) static constexpr int _icmp_hl_##tag = InstIcmp::tag;
ICEINSTICMP_TABLE
#undef X
// Define a set of constants based on low-level table entries, and ensure the
// table entry keys are consistent.
#define X(val, signed, swapped64, C_32, C1_64, C2_64) \
static_assert( \
_icmp_ll_##val == _icmp_hl_##val, \
"Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #val);
ICMPARM32_TABLE
#undef X
// Repeat the static asserts with respect to the high-level table entries in
// case the high-level table has extra entries.
#define X(tag, str) \
static_assert( \
_icmp_hl_##tag == _icmp_ll_##tag, \
"Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE: " #tag);
ICEINSTICMP_TABLE
#undef X
} // end of anonymous namespace
// Stack alignment
const uint32_t ARM32_STACK_ALIGNMENT_BYTES = 16;
// Value is in bytes. Return Value adjusted to the next highest multiple of the
// stack alignment.
uint32_t applyStackAlignment(uint32_t Value) {
return Utils::applyAlignment(Value, ARM32_STACK_ALIGNMENT_BYTES);
}
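// Worked example (illustrative): with the 16-byte stack alignment above,
// applyStackAlignment(1) == 16, applyStackAlignment(16) == 16, and
// applyStackAlignment(20) == 32.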
// Value is in bytes. Return Value adjusted to the next highest multiple of the
// stack alignment required for the given type.
uint32_t applyStackAlignmentTy(uint32_t Value, Type Ty) {
// Use natural alignment, except that normally (non-NaCl) ARM only aligns
// vectors to 8 bytes.
// TODO(jvoung): Check this ...
size_t typeAlignInBytes = typeWidthInBytes(Ty);
if (isVectorType(Ty))
typeAlignInBytes = 8;
return Utils::applyAlignment(Value, typeAlignInBytes);
}
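// E.g. (illustrative): applyStackAlignmentTy(5, IceType_i32) == 8 (natural
// 4-byte alignment), while applyStackAlignmentTy(4, IceType_v4i32) == 8
// because vector alignment is capped at 8 bytes here.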
// Conservatively check whether the operand is known at compile time to be a
// non-zero integer.
bool isGuaranteedNonzeroInt(const Operand *Op) {
if (auto *Const = llvm::dyn_cast_or_null<ConstantInteger32>(Op)) {
return Const->getValue() != 0;
}
return false;
}
} // end of anonymous namespace
TargetARM32Features::TargetARM32Features(const ClFlags &Flags) {
static_assert(
(ARM32InstructionSet::End - ARM32InstructionSet::Begin) ==
(TargetInstructionSet::ARM32InstructionSet_End -
TargetInstructionSet::ARM32InstructionSet_Begin),
"ARM32InstructionSet range different from TargetInstructionSet");
if (Flags.getTargetInstructionSet() !=
TargetInstructionSet::BaseInstructionSet) {
InstructionSet = static_cast<ARM32InstructionSet>(
(Flags.getTargetInstructionSet() -
TargetInstructionSet::ARM32InstructionSet_Begin) +
ARM32InstructionSet::Begin);
}
}
TargetARM32::TargetARM32(Cfg *Func)
: TargetLowering(Func), CPUFeatures(Func->getContext()->getFlags()) {
// TODO: Don't initialize IntegerRegisters and friends every time. Instead,
// initialize in some sort of static initializer for the class.
// Limit this size, or do all bitsets need to be the same width?
llvm::SmallBitVector IntegerRegisters(RegARM32::Reg_NUM);
llvm::SmallBitVector I64PairRegisters(RegARM32::Reg_NUM);
llvm::SmallBitVector Float32Registers(RegARM32::Reg_NUM);
llvm::SmallBitVector Float64Registers(RegARM32::Reg_NUM);
llvm::SmallBitVector VectorRegisters(RegARM32::Reg_NUM);
llvm::SmallBitVector InvalidRegisters(RegARM32::Reg_NUM);
ScratchRegs.resize(RegARM32::Reg_NUM);
#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt, \
isI64Pair, isFP32, isFP64, isVec128, alias_init) \
IntegerRegisters[RegARM32::val] = isInt; \
I64PairRegisters[RegARM32::val] = isI64Pair; \
Float32Registers[RegARM32::val] = isFP32; \
Float64Registers[RegARM32::val] = isFP64; \
VectorRegisters[RegARM32::val] = isVec128; \
RegisterAliases[RegARM32::val].resize(RegARM32::Reg_NUM); \
for (SizeT RegAlias : alias_init) { \
assert(!RegisterAliases[RegARM32::val][RegAlias] && \
"Duplicate alias for " #val); \
RegisterAliases[RegARM32::val].set(RegAlias); \
} \
assert(RegisterAliases[RegARM32::val][RegARM32::val]); \
ScratchRegs[RegARM32::val] = scratch;
REGARM32_TABLE;
#undef X
TypeToRegisterSet[IceType_void] = InvalidRegisters;
TypeToRegisterSet[IceType_i1] = IntegerRegisters;
TypeToRegisterSet[IceType_i8] = IntegerRegisters;
TypeToRegisterSet[IceType_i16] = IntegerRegisters;
TypeToRegisterSet[IceType_i32] = IntegerRegisters;
TypeToRegisterSet[IceType_i64] = I64PairRegisters;
TypeToRegisterSet[IceType_f32] = Float32Registers;
TypeToRegisterSet[IceType_f64] = Float64Registers;
TypeToRegisterSet[IceType_v4i1] = VectorRegisters;
TypeToRegisterSet[IceType_v8i1] = VectorRegisters;
TypeToRegisterSet[IceType_v16i1] = VectorRegisters;
TypeToRegisterSet[IceType_v16i8] = VectorRegisters;
TypeToRegisterSet[IceType_v8i16] = VectorRegisters;
TypeToRegisterSet[IceType_v4i32] = VectorRegisters;
TypeToRegisterSet[IceType_v4f32] = VectorRegisters;
}
namespace {
void copyRegAllocFromInfWeightVariable64On32(const VarList &Vars) {
for (Variable *Var : Vars) {
auto *Var64 = llvm::dyn_cast<Variable64On32>(Var);
if (!Var64) {
// This is not the variable we are looking for.
continue;
}
assert(Var64->hasReg() || !Var64->mustHaveReg());
if (!Var64->hasReg()) {
continue;
}
SizeT FirstReg = RegARM32::getI64PairFirstGPRNum(Var->getRegNum());
// This assumes little endian.
Variable *Lo = Var64->getLo();
Variable *Hi = Var64->getHi();
assert(Lo->hasReg() == Hi->hasReg());
if (Lo->hasReg()) {
continue;
}
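// E.g. (illustrative): if Var64 was allocated the r2/r3 pair, Lo is pinned
// to r2 and Hi to r3 below.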
Lo->setRegNum(FirstReg);
Lo->setMustHaveReg();
Hi->setRegNum(FirstReg + 1);
Hi->setMustHaveReg();
}
}
} // end of anonymous namespace
void TargetARM32::translateO2() {
TimerMarker T(TimerStack::TT_O2, Func);
// TODO(stichnot): share passes with X86?
// https://code.google.com/p/nativeclient/issues/detail?id=4094
if (!Ctx->getFlags().getPhiEdgeSplit()) {
// Lower Phi instructions.
Func->placePhiLoads();
if (Func->hasError())
return;
Func->placePhiStores();
if (Func->hasError())
return;
Func->deletePhis();
if (Func->hasError())
return;
Func->dump("After Phi lowering");
}
// Address mode optimization.
Func->getVMetadata()->init(VMK_SingleDefs);
Func->doAddressOpt();
// Argument lowering
Func->doArgLowering();
// Target lowering. This requires liveness analysis for some parts of the
// lowering decisions, such as compare/branch fusing. If non-lightweight
// liveness analysis is used, the instructions need to be renumbered first.
// TODO: This renumbering should only be necessary if we're actually
// calculating live intervals, which we only do for register allocation.
Func->renumberInstructions();
if (Func->hasError())
return;
// TODO: It should be sufficient to use the fastest liveness calculation,
// i.e. livenessLightweight(). However, for some reason that slows down the
// rest of the translation. Investigate.
Func->liveness(Liveness_Basic);
if (Func->hasError())
return;
Func->dump("After ARM32 address mode opt");
Func->genCode();
if (Func->hasError())
return;
Func->dump("After ARM32 codegen");
// Register allocation. This requires instruction renumbering and full
// liveness analysis.
Func->renumberInstructions();
if (Func->hasError())
return;
Func->liveness(Liveness_Intervals);
if (Func->hasError())
return;
// Validate the live range computations. The expensive validation call is
// deliberately only made when assertions are enabled.
assert(Func->validateLiveness());
// The post-codegen dump is done here, after liveness analysis and associated
// cleanup, to make the dump cleaner and more useful.
Func->dump("After initial ARM32 codegen");
Func->getVMetadata()->init(VMK_All);
regAlloc(RAK_Global);
if (Func->hasError())
return;
copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
Func->dump("After linear scan regalloc");
if (Ctx->getFlags().getPhiEdgeSplit()) {
Func->advancedPhiLowering();
Func->dump("After advanced Phi lowering");
}
// Stack frame mapping.
Func->genFrame();
if (Func->hasError())
return;
Func->dump("After stack frame mapping");
legalizeStackSlots();
if (Func->hasError())
return;
Func->dump("After legalizeStackSlots");
Func->contractEmptyNodes();
Func->reorderNodes();
// Branch optimization. This needs to be done just before code emission. In
// particular, no transformations that insert or reorder CfgNodes should be
// done after branch optimization. We go ahead and do it before nop insertion
// to reduce the amount of work needed for searching for opportunities.
Func->doBranchOpt();
Func->dump("After branch optimization");
// Nop insertion
if (Ctx->getFlags().shouldDoNopInsertion()) {
Func->doNopInsertion();
}
}
void TargetARM32::translateOm1() {
TimerMarker T(TimerStack::TT_Om1, Func);
// TODO: share passes with X86?
Func->placePhiLoads();
if (Func->hasError())
return;
Func->placePhiStores();
if (Func->hasError())
return;
Func->deletePhis();
if (Func->hasError())
return;
Func->dump("After Phi lowering");
Func->doArgLowering();
Func->genCode();
if (Func->hasError())
return;
Func->dump("After initial ARM32 codegen");
regAlloc(RAK_InfOnly);
if (Func->hasError())
return;
copyRegAllocFromInfWeightVariable64On32(Func->getVariables());
Func->dump("After regalloc of infinite-weight variables");
Func->genFrame();
if (Func->hasError())
return;
Func->dump("After stack frame mapping");
legalizeStackSlots();
if (Func->hasError())
return;
Func->dump("After legalizeStackSlots");
// Nop insertion
if (Ctx->getFlags().shouldDoNopInsertion()) {
Func->doNopInsertion();
}
}
bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) {
if (InstARM32Br *Br = llvm::dyn_cast<InstARM32Br>(I)) {
return Br->optimizeBranch(NextNode);
}
return false;
}
IceString TargetARM32::getRegName(SizeT RegNum, Type Ty) const {
assert(RegNum < RegARM32::Reg_NUM);
(void)Ty;
static const char *RegNames[] = {
#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt, \
isI64Pair, isFP32, isFP64, isVec128, alias_init) \
name,
REGARM32_TABLE
#undef X
};
return RegNames[RegNum];
}
Variable *TargetARM32::getPhysicalRegister(SizeT RegNum, Type Ty) {
static const Type DefaultType[] = {
#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt, \
isI64Pair, isFP32, isFP64, isVec128, alias_init) \
(isFP32) \
? IceType_f32 \
: ((isFP64) ? IceType_f64 : ((isVec128 ? IceType_v4i32 : IceType_i32))),
REGARM32_TABLE
#undef X
};
assert(RegNum < RegARM32::Reg_NUM);
if (Ty == IceType_void) {
assert(RegNum < llvm::array_lengthof(DefaultType));
Ty = DefaultType[RegNum];
}
if (PhysicalRegisters[Ty].empty())
PhysicalRegisters[Ty].resize(RegARM32::Reg_NUM);
assert(RegNum < PhysicalRegisters[Ty].size());
Variable *Reg = PhysicalRegisters[Ty][RegNum];
if (Reg == nullptr) {
Reg = Func->makeVariable(Ty);
Reg->setRegNum(RegNum);
PhysicalRegisters[Ty][RegNum] = Reg;
// Specially mark a named physical register as an "argument" so that it is
// considered live upon function entry. Otherwise it's possible to get
// liveness validation errors for saving callee-save registers.
Func->addImplicitArg(Reg);
// Don't bother tracking the live range of a named physical register.
Reg->setIgnoreLiveness();
}
return Reg;
}
void TargetARM32::emitJumpTable(const Cfg *Func,
const InstJumpTable *JumpTable) const {
(void)JumpTable;
UnimplementedError(Func->getContext()->getFlags());
}
void TargetARM32::emitVariable(const Variable *Var) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Ctx->getStrEmit();
if (Var->hasReg()) {
Str << getRegName(Var->getRegNum(), Var->getType());
return;
}
if (Var->mustHaveReg()) {
llvm::report_fatal_error(
"Infinite-weight Variable has no register assigned");
}
int32_t Offset = Var->getStackOffset();
int32_t BaseRegNum = Var->getBaseRegNum();
if (BaseRegNum == Variable::NoRegister) {
BaseRegNum = getFrameOrStackReg();
if (!hasFramePointer())
Offset += getStackAdjustment();
}
const Type VarTy = Var->getType();
if (!isLegalVariableStackOffset(VarTy, Offset)) {
llvm::report_fatal_error("Illegal stack offset");
}
Str << "[" << getRegName(BaseRegNum, VarTy);
if (Offset != 0) {
Str << ", " << getConstantPrefix() << Offset;
}
Str << "]";
}
bool TargetARM32::CallingConv::I64InRegs(std::pair<int32_t, int32_t> *Regs) {
if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
return false;
int32_t RegLo, RegHi;
// Always start i64 registers at an even register, so this may end up padding
// away a register.
NumGPRRegsUsed = Utils::applyAlignment(NumGPRRegsUsed, 2);
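// E.g. (illustrative): with r0 already consumed by an i32 argument, the
// alignment step skips r1 and the i64 argument takes the r2/r3 pair; with
// three GPRs consumed, the pair would run past the last argument register
// and the argument goes on the stack instead.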
RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
++NumGPRRegsUsed;
RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
++NumGPRRegsUsed;
// If this bumps us past the boundary, don't allocate to a register and leave
// any previously speculatively consumed registers as consumed.
if (NumGPRRegsUsed > ARM32_MAX_GPR_ARG)
return false;
Regs->first = RegLo;
Regs->second = RegHi;
return true;
}
bool TargetARM32::CallingConv::I32InReg(int32_t *Reg) {
if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
return false;
*Reg = RegARM32::Reg_r0 + NumGPRRegsUsed;
++NumGPRRegsUsed;
return true;
}
bool TargetARM32::CallingConv::FPInReg(Type Ty, int32_t *Reg) {
if (!VFPRegsFree.any()) {
return false;
}
if (isVectorType(Ty)) {
// Q registers are declared in reverse order, so RegARM32::Reg_q0 >
// RegARM32::Reg_q1. Therefore, we need to subtract QRegStart from Reg_q0.
// Same thing goes for D registers.
static_assert(RegARM32::Reg_q0 > RegARM32::Reg_q1,
"ARM32 Q registers are possibly declared incorrectly.");
int32_t QRegStart = (VFPRegsFree & ValidV128Regs).find_first();
if (QRegStart >= 0) {
VFPRegsFree.reset(QRegStart, QRegStart + 4);
*Reg = RegARM32::Reg_q0 - (QRegStart / 4);
return true;
}
} else if (Ty == IceType_f64) {
static_assert(RegARM32::Reg_d0 > RegARM32::Reg_d1,
"ARM32 D registers are possibly declared incorrectly.");
int32_t DRegStart = (VFPRegsFree & ValidF64Regs).find_first();
if (DRegStart >= 0) {
VFPRegsFree.reset(DRegStart, DRegStart + 2);
*Reg = RegARM32::Reg_d0 - (DRegStart / 2);
return true;
}
} else {
static_assert(RegARM32::Reg_s0 < RegARM32::Reg_s1,
"ARM32 S registers are possibly declared incorrectly.");
assert(Ty == IceType_f32);
int32_t SReg = VFPRegsFree.find_first();
assert(SReg >= 0);
VFPRegsFree.reset(SReg);
*Reg = RegARM32::Reg_s0 + SReg;
return true;
}
// Parameter allocation failed. From now on, every fp register must be placed
// on the stack. We clear VFPRegsFree in case there are any "holes" from S and
// D registers.
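// E.g. (illustrative): after an f32 argument takes s0 and an f64 argument
// takes d1 (s2/s3), a later f32 argument can still back-fill the s1 "hole";
// but once any fp argument spills, the clearing below prevents back-filling.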
VFPRegsFree.clear();
return false;
}
void TargetARM32::lowerArguments() {
VarList &Args = Func->getArgs();
TargetARM32::CallingConv CC;
// For each register argument, replace Arg in the argument list with the home
// register. Then generate an instruction in the prolog to copy the home
// register to the assigned location of Arg.
Context.init(Func->getEntryNode());
Context.setInsertPoint(Context.getCur());
for (SizeT I = 0, E = Args.size(); I < E; ++I) {
Variable *Arg = Args[I];
Type Ty = Arg->getType();
if (Ty == IceType_i64) {
std::pair<int32_t, int32_t> RegPair;
if (!CC.I64InRegs(&RegPair))
continue;
Variable *RegisterArg = Func->makeVariable(Ty);
auto *RegisterArg64On32 = llvm::cast<Variable64On32>(RegisterArg);
if (BuildDefs::dump())
RegisterArg64On32->setName(Func, "home_reg:" + Arg->getName(Func));
RegisterArg64On32->initHiLo(Func);
RegisterArg64On32->setIsArg();
RegisterArg64On32->getLo()->setRegNum(RegPair.first);
RegisterArg64On32->getHi()->setRegNum(RegPair.second);
Arg->setIsArg(false);
Args[I] = RegisterArg64On32;
Context.insert(InstAssign::create(Func, Arg, RegisterArg));
continue;
} else {
int32_t RegNum;
if (isVectorType(Ty) || isFloatingType(Ty)) {
if (!CC.FPInReg(Ty, &RegNum))
continue;
} else {
assert(Ty == IceType_i32);
if (!CC.I32InReg(&RegNum))
continue;
}
Variable *RegisterArg = Func->makeVariable(Ty);
if (BuildDefs::dump()) {
RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
}
RegisterArg->setRegNum(RegNum);
RegisterArg->setIsArg();
Arg->setIsArg(false);
Args[I] = RegisterArg;
Context.insert(InstAssign::create(Func, Arg, RegisterArg));
continue;
}
}
}
// Helper function for addProlog().
//
// This assumes Arg is an argument passed on the stack. This sets the frame
// offset for Arg and updates InArgsSizeBytes according to Arg's width. For an
// I64 arg that has been split into Lo and Hi components, it calls itself
// recursively on the components, taking care to handle Lo first because of the
// little-endian architecture. Lastly, this function generates an instruction
// to copy Arg into its assigned register if applicable.
void TargetARM32::finishArgumentLowering(Variable *Arg, Variable *FramePtr,
size_t BasicFrameOffset,
size_t &InArgsSizeBytes) {
if (auto *Arg64On32 = llvm::dyn_cast<Variable64On32>(Arg)) {
Variable *Lo = Arg64On32->getLo();
Variable *Hi = Arg64On32->getHi();
finishArgumentLowering(Lo, FramePtr, BasicFrameOffset, InArgsSizeBytes);
finishArgumentLowering(Hi, FramePtr, BasicFrameOffset, InArgsSizeBytes);
return;
}
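// E.g. (illustrative): an i64 stack argument with InArgsSizeBytes == 0 on
// entry places Lo at BasicFrameOffset + 0 and Hi at BasicFrameOffset + 4.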
Type Ty = Arg->getType();
InArgsSizeBytes = applyStackAlignmentTy(InArgsSizeBytes, Ty);
Arg->setStackOffset(BasicFrameOffset + InArgsSizeBytes);
InArgsSizeBytes += typeWidthInBytesOnStack(Ty);
// If the argument variable has been assigned a register, we need to load the
// value from the stack slot.
if (Arg->hasReg()) {
assert(Ty != IceType_i64);
// This should be simple: just load the parameter off the stack using an
// sp + imm addressing mode. On ARM we can't always do that (e.g., VLDR, used
// for fp types, cannot have an index register), so we legalize the memory
// operand instead.
auto *Mem = OperandARM32Mem::create(
Func, Ty, FramePtr, llvm::cast<ConstantInteger32>(
Ctx->getConstantInt32(Arg->getStackOffset())));
_mov(Arg, legalizeToReg(Mem, Arg->getRegNum()));
// This argument-copying instruction uses an explicit OperandARM32Mem
// operand instead of a Variable, so its fill-from-stack operation has to
// be tracked separately for statistics.
Ctx->statsUpdateFills();
}
}
Type TargetARM32::stackSlotType() { return IceType_i32; }
void TargetARM32::addProlog(CfgNode *Node) {
// Stack frame layout:
//
// +------------------------+
// | 1. preserved registers |
// +------------------------+
// | 2. padding |
// +------------------------+ <--- FramePointer (if used)
// | 3. global spill area |
// +------------------------+
// | 4. padding |
// +------------------------+
// | 5. local spill area |
// +------------------------+
// | 6. padding |
// +------------------------+
// | 7. allocas |
// +------------------------+ <--- StackPointer
//
// The following variables record the size in bytes of the given areas:
// * PreservedRegsSizeBytes: area 1
// * SpillAreaPaddingBytes: area 2
// * GlobalsSize: area 3
// * GlobalsAndSubsequentPaddingSize: areas 3 - 4
// * LocalsSpillAreaSize: area 5
// * SpillAreaSizeBytes: areas 2 - 6
// Determine stack frame offsets for each Variable without a register
// assignment. This can be done as one variable per stack slot. Or, do
// coalescing by running the register allocator again with an infinite set of
// registers (as a side effect, this gives variables a second chance at
// physical register assignment).
//
// A middle ground approach is to leverage sparsity and allocate one block of
// space on the frame for globals (variables with multi-block lifetime), and
// one block to share for locals (single-block lifetime).
Context.init(Node);
Context.setInsertPoint(Context.getCur());
llvm::SmallBitVector CalleeSaves =
getRegisterSet(RegSet_CalleeSave, RegSet_None);
RegsUsed = llvm::SmallBitVector(CalleeSaves.size());
VarList SortedSpilledVariables;
size_t GlobalsSize = 0;
// If there is a separate locals area, this represents that area. Otherwise
// it counts any variable not counted by GlobalsSize.
SpillAreaSizeBytes = 0;
// If there is a separate locals area, this specifies the alignment for it.
uint32_t LocalsSlotsAlignmentBytes = 0;
// The entire spill locations area gets aligned to largest natural alignment
// of the variables that have a spill slot.
uint32_t SpillAreaAlignmentBytes = 0;
// For now, we don't have target-specific variables that need special
// treatment (no stack-slot-linked SpillVariable type).
std::function<bool(Variable *)> TargetVarHook = [](Variable *Var) {
static constexpr bool AssignStackSlot = false;
static constexpr bool DontAssignStackSlot = !AssignStackSlot;
if (llvm::isa<Variable64On32>(Var)) {
return DontAssignStackSlot;
}
return AssignStackSlot;
};
// Compute the list of spilled variables and bounds for GlobalsSize, etc.
getVarStackSlotParams(SortedSpilledVariables, RegsUsed, &GlobalsSize,
&SpillAreaSizeBytes, &SpillAreaAlignmentBytes,
&LocalsSlotsAlignmentBytes, TargetVarHook);
uint32_t LocalsSpillAreaSize = SpillAreaSizeBytes;
SpillAreaSizeBytes += GlobalsSize;
// Add push instructions for preserved registers. On ARM, "push" can push a
// whole list of GPRs via a bitmask (0-15). Unlike x86, ARM also has
// callee-saved float/vector registers. The "vpush" instruction can handle a
// whole list of float/vector registers, but it only handles contiguous
// sequences of registers by specifying the start and the length.
VarList GPRsToPreserve;
GPRsToPreserve.reserve(CalleeSaves.size());
uint32_t NumCallee = 0;
size_t PreservedRegsSizeBytes = 0;
// Consider FP and LR as callee-save / used as needed.
if (UsesFramePointer) {
CalleeSaves[RegARM32::Reg_fp] = true;
assert(RegsUsed[RegARM32::Reg_fp] == false);
RegsUsed[RegARM32::Reg_fp] = true;
}
if (!MaybeLeafFunc) {
CalleeSaves[RegARM32::Reg_lr] = true;
RegsUsed[RegARM32::Reg_lr] = true;
}
for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
if (RegARM32::isI64RegisterPair(i)) {
// We don't save register pairs explicitly. Instead, we rely on the code
// fake-defing/fake-using each register in the pair.
continue;
}
if (CalleeSaves[i] && RegsUsed[i]) {
// TODO(jvoung): Do a separate vpush for each floating point register
// segment, and add 4 or 8 to PreservedRegsSizeBytes depending on the type.
++NumCallee;
PreservedRegsSizeBytes += 4;
GPRsToPreserve.push_back(getPhysicalRegister(i));
}
}
Ctx->statsUpdateRegistersSaved(NumCallee);
if (!GPRsToPreserve.empty())
_push(GPRsToPreserve);
// Generate "mov FP, SP" if needed.
if (UsesFramePointer) {
Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
_mov(FP, SP);
// Keep FP live for late-stage liveness analysis (e.g. asm-verbose mode).
Context.insert(InstFakeUse::create(Func, FP));
}
// Align the variables area. SpillAreaPaddingBytes is the size of the region
// after the preserved registers and before the spill areas.
// LocalsSlotsPaddingBytes is the amount of padding between the globals and
// locals area if they are separate.
assert(SpillAreaAlignmentBytes <= ARM32_STACK_ALIGNMENT_BYTES);
assert(LocalsSlotsAlignmentBytes <= SpillAreaAlignmentBytes);
uint32_t SpillAreaPaddingBytes = 0;
uint32_t LocalsSlotsPaddingBytes = 0;
alignStackSpillAreas(PreservedRegsSizeBytes, SpillAreaAlignmentBytes,
GlobalsSize, LocalsSlotsAlignmentBytes,
&SpillAreaPaddingBytes, &LocalsSlotsPaddingBytes);
SpillAreaSizeBytes += SpillAreaPaddingBytes + LocalsSlotsPaddingBytes;
uint32_t GlobalsAndSubsequentPaddingSize =
GlobalsSize + LocalsSlotsPaddingBytes;
// Align SP if necessary.
if (NeedsStackAlignment) {
uint32_t StackOffset = PreservedRegsSizeBytes;
uint32_t StackSize = applyStackAlignment(StackOffset + SpillAreaSizeBytes);
SpillAreaSizeBytes = StackSize - StackOffset;
}
// Generate "sub sp, SpillAreaSizeBytes"
if (SpillAreaSizeBytes) {
// Use the scratch register if needed to legalize the immediate.
Operand *SubAmount = legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
Legal_Reg | Legal_Flex, getReservedTmpReg());
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
_sub(SP, SP, SubAmount);
}
Ctx->statsUpdateFrameBytes(SpillAreaSizeBytes);
resetStackAdjustment();
// Fill in stack offsets for stack args, and copy args into registers for
// those that were register-allocated. Args are pushed right to left, so
// Arg[0] is closest to the stack/frame pointer.
Variable *FramePtr = getPhysicalRegister(getFrameOrStackReg());
size_t BasicFrameOffset = PreservedRegsSizeBytes;
if (!UsesFramePointer)
BasicFrameOffset += SpillAreaSizeBytes;
const VarList &Args = Func->getArgs();
size_t InArgsSizeBytes = 0;
TargetARM32::CallingConv CC;
for (Variable *Arg : Args) {
Type Ty = Arg->getType();
bool InRegs = false;
// Skip arguments passed in registers.
if (isVectorType(Ty) || isFloatingType(Ty)) {
int32_t DummyReg;
InRegs = CC.FPInReg(Ty, &DummyReg);
} else if (Ty == IceType_i64) {
std::pair<int32_t, int32_t> DummyRegs;
InRegs = CC.I64InRegs(&DummyRegs);
} else {
assert(Ty == IceType_i32);
int32_t DummyReg;
InRegs = CC.I32InReg(&DummyReg);
}
if (!InRegs)
finishArgumentLowering(Arg, FramePtr, BasicFrameOffset, InArgsSizeBytes);
}
// Fill in stack offsets for locals.
assignVarStackSlots(SortedSpilledVariables, SpillAreaPaddingBytes,
SpillAreaSizeBytes, GlobalsAndSubsequentPaddingSize,
UsesFramePointer);
this->HasComputedFrame = true;
if (BuildDefs::dump() && Func->isVerbose(IceV_Frame)) {
OstreamLocker L(Func->getContext());
Ostream &Str = Func->getContext()->getStrDump();
Str << "Stack layout:\n";
uint32_t SPAdjustmentPaddingSize =
SpillAreaSizeBytes - LocalsSpillAreaSize -
GlobalsAndSubsequentPaddingSize - SpillAreaPaddingBytes;
Str << " in-args = " << InArgsSizeBytes << " bytes\n"
<< " preserved registers = " << PreservedRegsSizeBytes << " bytes\n"
<< " spill area padding = " << SpillAreaPaddingBytes << " bytes\n"
<< " globals spill area = " << GlobalsSize << " bytes\n"
<< " globals-locals spill areas intermediate padding = "
<< GlobalsAndSubsequentPaddingSize - GlobalsSize << " bytes\n"
<< " locals spill area = " << LocalsSpillAreaSize << " bytes\n"
<< " SP alignment padding = " << SPAdjustmentPaddingSize << " bytes\n";
Str << "Stack details:\n"
<< " SP adjustment = " << SpillAreaSizeBytes << " bytes\n"
<< " spill area alignment = " << SpillAreaAlignmentBytes << " bytes\n"
<< " locals spill area alignment = " << LocalsSlotsAlignmentBytes
<< " bytes\n"
<< " is FP based = " << UsesFramePointer << "\n";
}
}
void TargetARM32::addEpilog(CfgNode *Node) {
InstList &Insts = Node->getInsts();
InstList::reverse_iterator RI, E;
for (RI = Insts.rbegin(), E = Insts.rend(); RI != E; ++RI) {
if (llvm::isa<InstARM32Ret>(*RI))
break;
}
if (RI == E)
return;
// Convert the reverse_iterator position into its corresponding (forward)
// iterator position.
InstList::iterator InsertPoint = RI.base();
--InsertPoint;
Context.init(Node);
Context.setInsertPoint(InsertPoint);
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
if (UsesFramePointer) {
Variable *FP = getPhysicalRegister(RegARM32::Reg_fp);
// For late-stage liveness analysis (e.g. asm-verbose mode), adding a fake
// use of SP before the assignment of SP=FP keeps previous SP adjustments
// from being dead-code eliminated.
Context.insert(InstFakeUse::create(Func, SP));
_mov(SP, FP);
} else {
// add SP, SpillAreaSizeBytes
if (SpillAreaSizeBytes) {
// Use the scratch register if needed to legalize the immediate.
Operand *AddAmount =
legalize(Ctx->getConstantInt32(SpillAreaSizeBytes),
Legal_Reg | Legal_Flex, getReservedTmpReg());
_add(SP, SP, AddAmount);
}
}
// Add pop instructions for preserved registers.
llvm::SmallBitVector CalleeSaves =
getRegisterSet(RegSet_CalleeSave, RegSet_None);
VarList GPRsToRestore;
GPRsToRestore.reserve(CalleeSaves.size());
// Consider FP and LR as callee-save / used as needed.
if (UsesFramePointer) {
CalleeSaves[RegARM32::Reg_fp] = true;
}
if (!MaybeLeafFunc) {
CalleeSaves[RegARM32::Reg_lr] = true;
}
// Pop registers in ascending order just like push (instead of in reverse
// order).
for (SizeT i = 0; i < CalleeSaves.size(); ++i) {
if (RegARM32::isI64RegisterPair(i)) {
continue;
}
if (CalleeSaves[i] && RegsUsed[i]) {
GPRsToRestore.push_back(getPhysicalRegister(i));
}
}
if (!GPRsToRestore.empty())
_pop(GPRsToRestore);
if (!Ctx->getFlags().getUseSandboxing())
return;
// Change the original ret instruction into a sandboxed return sequence.
// bundle_lock
// bic lr, #0xc000000f
// bx lr
// bundle_unlock
// This isn't just alignment to getBundleAlignLog2Bytes(); the mask also
// restricts the return address to the lower 1GB.
Operand *RetMask =
legalize(Ctx->getConstantInt32(0xc000000f), Legal_Reg | Legal_Flex);
Variable *LR = makeReg(IceType_i32, RegARM32::Reg_lr);
Variable *RetValue = nullptr;
if (RI->getSrcSize())
RetValue = llvm::cast<Variable>(RI->getSrc(0));
_bundle_lock();
_bic(LR, LR, RetMask);
_ret(LR, RetValue);
_bundle_unlock();
RI->setDeleted();
}
bool TargetARM32::isLegalVariableStackOffset(Type Ty, int32_t Offset) const {
constexpr bool SignExt = false;
return OperandARM32Mem::canHoldOffset(Ty, SignExt, Offset);
}
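// For reference (typical A32 immediate-offset ranges, which canHoldOffset()
// is assumed to encode): ldr/str reach +/-4095, ldrh/ldrsh +/-255, and
// vldr/vstr +/-1020 in multiples of 4.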
StackVariable *TargetARM32::legalizeVariableSlot(Variable *Var,
int32_t StackAdjust,
Variable *OrigBaseReg) {
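// Illustrative result: a slot at offset 0x10008 from OrigBaseReg becomes
// movw/movt TMP, #0x10008
// add TMP, OrigBaseReg, TMP
// and the returned StackVariable is addressed as 0[TMP].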
int32_t Offset = Var->getStackOffset() + StackAdjust;
// Legalize will likely need a movw/movt combination, but if the top 16 bits
// are all zero after negating the offset, we can subtract the negated offset
// instead.
if (ShouldSub)
Offset = -Offset;
Operand *OffsetVal = legalize(Ctx->getConstantInt32(Offset),
Legal_Reg | Legal_Flex, getReservedTmpReg());
Variable *ScratchReg = makeReg(IceType_i32, getReservedTmpReg());
if (ShouldSub)
_sub(ScratchReg, OrigBaseReg, OffsetVal);
else
_add(ScratchReg, OrigBaseReg, OffsetVal);
StackVariable *NewVar = Func->makeVariable<StackVariable>(stackSlotType());
NewVar->setMustNotHaveReg();
NewVar->setBaseRegNum(ScratchReg->getRegNum());
constexpr int32_t NewOffset = 0;
NewVar->setStackOffset(NewOffset);
return NewVar;
}
void TargetARM32::legalizeStackSlots() {
// If a stack variable's frame offset doesn't fit, convert from:
// ldr X, OFF[SP]
// to:
// movw/movt TMP, OFF_PART
// add TMP, TMP, SP
// ldr X, OFF_MORE[TMP]
//
// This is safe because we have reserved TMP, and add for ARM does not
// clobber the flags register.
Func->dump("Before legalizeStackSlots");
assert(hasComputedFrame());
// Early exit if SpillAreaSizeBytes is small.
// TODO(jpp): this is not safe -- loads and stores of q registers can't have
// offsets.
if (isLegalVariableStackOffset(IceType_v4i32, SpillAreaSizeBytes))
return;
Variable *OrigBaseReg = getPhysicalRegister(getFrameOrStackReg());
int32_t StackAdjust = 0;
// Do a fairly naive greedy clustering for now. Pick the first stack slot
// that's out of bounds and make a new base reg using the architecture's temp
// register. If that works for the next slot, then great. Otherwise, create a
// new base register, clobbering the previous base register. Never share a
// base reg across different basic blocks. This isn't ideal if local and
// multi-block variables are far apart and their references are interspersed.
// It may help to be more coordinated about assigning stack slot numbers, and
// it may help to assign smaller offsets to higher-weight variables so that
// they don't depend on this legalization.
for (CfgNode *Node : Func->getNodes()) {
Context.init(Node);
StackVariable *NewBaseReg = nullptr;
int32_t NewBaseOffset = 0;
while (!Context.atEnd()) {
PostIncrLoweringContext PostIncrement(Context);
Inst *CurInstr = Context.getCur();
Variable *Dest = CurInstr->getDest();
// Check if the previous NewBaseReg is clobbered, and reset if needed.
if ((Dest && NewBaseReg && Dest->hasReg() &&
Dest->getRegNum() == NewBaseReg->getBaseRegNum()) ||
llvm::isa<InstFakeKill>(CurInstr)) {
NewBaseReg = nullptr;
NewBaseOffset = 0;
}
// The stack adjustment only matters if we are using SP instead of FP.
if (!hasFramePointer()) {
if (auto *AdjInst = llvm::dyn_cast<InstARM32AdjustStack>(CurInstr)) {
StackAdjust += AdjInst->getAmount();
NewBaseOffset += AdjInst->getAmount();
continue;
}
if (llvm::isa<InstARM32Call>(CurInstr)) {
NewBaseOffset -= StackAdjust;
StackAdjust = 0;
continue;
}
}
// For now, only Mov instructions can have stack variables. We need to
// know the type of instruction because we currently create a fresh one
// to replace Dest/Source, rather than mutating in place.
bool MayNeedOffsetRewrite = false;
if (auto *MovInstr = llvm::dyn_cast<InstARM32Mov>(CurInstr)) {
MayNeedOffsetRewrite =
!MovInstr->isMultiDest() && !MovInstr->isMultiSource();
}
if (!MayNeedOffsetRewrite) {
continue;
}
assert(Dest != nullptr);
Type DestTy = Dest->getType();
assert(DestTy != IceType_i64);
if (!Dest->hasReg()) {
int32_t Offset = Dest->getStackOffset();
Offset += StackAdjust;
if (!isLegalVariableStackOffset(DestTy, Offset)) {
if (NewBaseReg) {
int32_t OffsetDiff = Offset - NewBaseOffset;
if (isLegalVariableStackOffset(DestTy, OffsetDiff)) {
StackVariable *NewDest =
Func->makeVariable<StackVariable>(stackSlotType());
NewDest->setMustNotHaveReg();
NewDest->setBaseRegNum(NewBaseReg->getBaseRegNum());
NewDest->setStackOffset(OffsetDiff);
Variable *NewDestVar = NewDest;
_mov(NewDestVar, CurInstr->getSrc(0));
CurInstr->setDeleted();
continue;
}
}
StackVariable *LegalDest =
legalizeVariableSlot(Dest, StackAdjust, OrigBaseReg);
assert(LegalDest != Dest);
Variable *LegalDestVar = LegalDest;
_mov(LegalDestVar, CurInstr->getSrc(0));
CurInstr->setDeleted();
NewBaseReg = LegalDest;
NewBaseOffset = Offset;
continue;
}
}
assert(CurInstr->getSrcSize() == 1);
Variable *Var = llvm::dyn_cast<Variable>(CurInstr->getSrc(0));
if (Var && !Var->hasReg()) {
Type VarTy = Var->getType();
int32_t Offset = Var->getStackOffset();
Offset += StackAdjust;
if (!isLegalVariableStackOffset(VarTy, Offset)) {
if (NewBaseReg) {
int32_t OffsetDiff = Offset - NewBaseOffset;
if (isLegalVariableStackOffset(VarTy, OffsetDiff)) {
StackVariable *NewVar =
Func->makeVariable<StackVariable>(stackSlotType());
NewVar->setMustNotHaveReg();
NewVar->setBaseRegNum(NewBaseReg->getBaseRegNum());
NewVar->setStackOffset(OffsetDiff);
_mov(Dest, NewVar);
CurInstr->setDeleted();
continue;
}
}
StackVariable *LegalVar =
legalizeVariableSlot(Var, StackAdjust, OrigBaseReg);
assert(LegalVar != Var);
_mov(Dest, LegalVar);
CurInstr->setDeleted();
NewBaseReg = LegalVar;
NewBaseOffset = Offset;
continue;
}
}
}
}
}
Operand *TargetARM32::loOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64);
if (Operand->getType() != IceType_i64)
return Operand;
if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
return Var64On32->getLo();
if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand))
return Ctx->getConstantInt32(static_cast<uint32_t>(Const->getValue()));
if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
// Conservatively disallow memory operands with side-effects (pre/post
// increment) in case of duplication.
assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
Mem->getAddrMode() == OperandARM32Mem::NegOffset);
if (Mem->isRegReg()) {
return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
Mem->getIndex(), Mem->getShiftOp(),
Mem->getShiftAmt(), Mem->getAddrMode());
} else {
return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
Mem->getOffset(), Mem->getAddrMode());
}
}
llvm_unreachable("Unsupported operand type");
return nullptr;
}
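// E.g. (illustrative): splitting the i64 constant 0x1122334455667788 yields
// 0x55667788 from loOperand() above and 0x11223344 from hiOperand() below.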
Operand *TargetARM32::hiOperand(Operand *Operand) {
assert(Operand->getType() == IceType_i64);
if (Operand->getType() != IceType_i64)
return Operand;
if (auto *Var64On32 = llvm::dyn_cast<Variable64On32>(Operand))
return Var64On32->getHi();
if (auto *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
return Ctx->getConstantInt32(
static_cast<uint32_t>(Const->getValue() >> 32));
}
if (auto *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
// Conservatively disallow memory operands with side-effects in case of
// duplication.
assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
Mem->getAddrMode() == OperandARM32Mem::NegOffset);
const Type SplitType = IceType_i32;
if (Mem->isRegReg()) {
// We have to make a temp variable T, and add 4 to either Base or Index.
// Since the Index may be shifted, adding 4 to it can mean something else.
// Thus, prefer T := Base + 4, and use T as the new Base.
Variable *Base = Mem->getBase();
Constant *Four = Ctx->getConstantInt32(4);
Variable *NewBase = Func->makeVariable(Base->getType());
lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
Base, Four));
return OperandARM32Mem::create(Func, SplitType, NewBase, Mem->getIndex(),
Mem->getShiftOp(), Mem->getShiftAmt(),
Mem->getAddrMode());
} else {
Variable *Base = Mem->getBase();
ConstantInteger32 *Offset = Mem->getOffset();
assert(!Utils::WouldOverflowAdd(Offset->getValue(), 4));
int32_t NextOffsetVal = Offset->getValue() + 4;
const bool SignExt = false;
if (!OperandARM32Mem::canHoldOffset(SplitType, SignExt, NextOffsetVal)) {
// We have to make a temp variable and add 4 to either Base or Offset.
// If we add 4 to Offset, this will convert a non-RegReg addressing
// mode into a RegReg addressing mode. Since NaCl sandboxing disallows
// RegReg addressing modes, prefer adding to base and replacing
// instead. Thus we leave the old offset alone.
Constant *Four = Ctx->getConstantInt32(4);
Variable *NewBase = Func->makeVariable(Base->getType());
lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
NewBase, Base, Four));
Base = NewBase;
} else {
Offset =
llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
}
return OperandARM32Mem::create(Func, SplitType, Base, Offset,
Mem->getAddrMode());
}
}
llvm_unreachable("Unsupported operand type");
return nullptr;
}
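// Typical use (illustrative): getRegisterSet(RegSet_CalleeSave, RegSet_None),
// as in addProlog()/addEpilog() above, collects every preserved register.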
llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
RegSetMask Exclude) const {
llvm::SmallBitVector Registers(RegARM32::Reg_NUM);
#define X(val, encode, name, scratch, preserved, stackptr, frameptr, isInt, \
isI64Pair, isFP32, isFP64, isVec128, alias_init) \
if (scratch && (Include & RegSet_CallerSave)) \
Registers[RegARM32::val] = true; \
if (preserved && (Include & RegSet_CalleeSave)) \
Registers[RegARM32::val] = true; \
if (stackptr && (Include & RegSet_StackPointer)) \
Registers[RegARM32::val] = true; \
if (frameptr && (Include & RegSet_FramePointer)) \
Registers[RegARM32::val] = true; \
if (scratch && (Exclude & RegSet_CallerSave)) \
Registers[RegARM32::val] = false; \
if (preserved && (Exclude & RegSet_CalleeSave)) \
Registers[RegARM32::val] = false; \
if (stackptr && (Exclude & RegSet_StackPointer)) \
Registers[RegARM32::val] = false; \
if (frameptr && (Exclude & RegSet_FramePointer)) \
Registers[RegARM32::val] = false;
REGARM32_TABLE
#undef X
return Registers;
}
void TargetARM32::lowerAlloca(const InstAlloca *Inst) {
UsesFramePointer = true;
// Conservatively require the stack to be aligned. Some stack adjustment
// operations implemented below assume that the stack is aligned before the
// alloca. All the alloca code ensures that the stack alignment is preserved
// after the alloca. The stack alignment restriction can be relaxed in some
// cases.
NeedsStackAlignment = true;
// TODO(stichnot): minimize the number of adjustments of SP, etc.
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
Variable *Dest = Inst->getDest();
uint32_t AlignmentParam = Inst->getAlignInBytes();
// For default align=0, set it to the real value 1, to avoid any
// bit-manipulation problems below.
AlignmentParam = std::max(AlignmentParam, 1u);
// LLVM enforces power of 2 alignment.
assert(llvm::isPowerOf2_32(AlignmentParam));
assert(llvm::isPowerOf2_32(ARM32_STACK_ALIGNMENT_BYTES));
uint32_t Alignment = std::max(AlignmentParam, ARM32_STACK_ALIGNMENT_BYTES);
if (Alignment > ARM32_STACK_ALIGNMENT_BYTES) {
alignRegisterPow2(SP, Alignment);
}
Operand *TotalSize = Inst->getSizeInBytes();
if (const auto *ConstantTotalSize =
llvm::dyn_cast<ConstantInteger32>(TotalSize)) {
uint32_t Value = ConstantTotalSize->getValue();
Value = Utils::applyAlignment(Value, Alignment);
Operand *SubAmount = legalize(Ctx->getConstantInt32(Value));
_sub(SP, SP, SubAmount);
} else {
// Non-constant sizes need to be adjusted to the next highest multiple of
// the required alignment at runtime.
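// I.e. (illustrative) T = (TotalSize + Alignment - 1) & ~(Alignment - 1),
// assuming alignRegisterPow2() masks off the low-order bits.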
TotalSize = legalize(TotalSize, Legal_Reg | Legal_Flex);
Variable *T = makeReg(IceType_i32);
_mov(T, TotalSize);
Operand *AddAmount = legalize(Ctx->getConstantInt32(Alignment - 1));
_add(T, T, AddAmount);
alignRegisterPow2(T, Alignment);
_sub(SP, SP, T);
}
_mov(Dest, SP);
}
void TargetARM32::div0Check(Type Ty, Operand *SrcLo, Operand *SrcHi) {
if (isGuaranteedNonzeroInt(SrcLo) || isGuaranteedNonzeroInt(SrcHi))
return;
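// The check materializes as (illustrative, i32 case):
// tst rSrc, rSrc
// bne .Lnonzero
// <trap>
// .Lnonzero: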
Variable *SrcLoReg = legalizeToReg(SrcLo);
switch (Ty) {
default:
llvm_unreachable("Unexpected type");
case IceType_i8: {
Operand *Mask =
legalize(Ctx->getConstantInt32(0xFF), Legal_Reg | Legal_Flex);
_tst(SrcLoReg, Mask);
break;
}
case IceType_i16: {
Operand *Mask =
legalize(Ctx->getConstantInt32(0xFFFF), Legal_Reg | Legal_Flex);
_tst(SrcLoReg, Mask);
break;
}
case IceType_i32: {
_tst(SrcLoReg, SrcLoReg);
break;
}
case IceType_i64: {
Variable *ScratchReg = makeReg(IceType_i32);
_orrs(ScratchReg, SrcLoReg, SrcHi);
// ScratchReg isn't going to be used, but we need the side-effect of
// setting flags from this operation.
Context.insert(InstFakeUse::create(Func, ScratchReg));
}
}
InstARM32Label *Label = InstARM32Label::create(Func, this);
_br(Label, CondARM32::NE);
_trap();
Context.insert(Label);
}
void TargetARM32::lowerIDivRem(Variable *Dest, Variable *T, Variable *Src0R,
Operand *Src1, ExtInstr ExtFunc,
DivInstr DivFunc, const char *DivHelperName,
bool IsRemainder) {
div0Check(Dest->getType(), Src1, nullptr);
Variable *Src1R = legalizeToReg(Src1);
Variable *T0R = Src0R;
Variable *T1R = Src1R;
if (Dest->getType() != IceType_i32) {
T0R = makeReg(IceType_i32);
(this->*ExtFunc)(T0R, Src0R, CondARM32::AL);
T1R = makeReg(IceType_i32);
(this->*ExtFunc)(T1R, Src1R, CondARM32::AL);
}
if (hasCPUFeature(TargetARM32Features::HWDivArm)) {
(this->*DivFunc)(T, T0R, T1R, CondARM32::AL);
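// For remainders, mls backs the quotient out of Src0: assuming the operand
// order used here, T2 = T0R - T * T1R, i.e. Src0 - (Src0 / Src1) * Src1.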
if (IsRemainder) {
Variable *T2 = makeReg(IceType_i32);
_mls(T2, T, T1R, T0R);
T = T2;
}
_mov(Dest, T);
} else {
constexpr SizeT MaxSrcs = 2;
InstCall *Call = makeHelperCall(DivHelperName, Dest, MaxSrcs);
Call->addArg(T0R);
Call->addArg(T1R);
lowerCall(Call);
}
return;
}
void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
Variable *Dest = Inst->getDest();
// TODO(jvoung): Should be able to flip Src0 and Src1 if it is easier to
// legalize Src0 to flex or Src1 to flex and there is a reversible
// instruction. E.g., reverse subtract with immediate, register vs register,
// immediate.
// Or it may be the case that the operands aren't swapped, but the bits can
// be flipped and a different operation applied. E.g., use BIC (bit clear)
// instead of AND for some masks.
Operand *Src0 = legalizeUndef(Inst->getSrc(0));
Operand *Src1 = legalizeUndef(Inst->getSrc(1));
if (Dest->getType() == IceType_i64) {
// These helper-call-involved instructions are lowered in this separate
// switch. This is because we would otherwise assume that we need to
// legalize Src0 to Src0RLo and Src0RHi. However, those go unused with
// helper calls, and such unused/redundant instructions will fail liveness
// analysis under the -Om1 setting.
switch (Inst->getOp()) {
default:
break;
case InstArithmetic::Udiv:
case InstArithmetic::Sdiv:
case InstArithmetic::Urem:
case InstArithmetic::Srem: {
// Check for divide by 0 (ARM normally doesn't trap, but we want it to
// trap for NaCl). Src1Lo and Src1Hi may have already been legalized to a
// register, which would hide a constant source operand. Instead, check
// the not-yet-legalized Src1 so a constant divisor can elide the check.
if (auto *C64 = llvm::dyn_cast<ConstantInteger64>(Src1)) {
if (C64->getValue() == 0) {
_trap();
return;
}
} else {
Operand *Src1Lo = legalize(loOperand(Src1), Legal_Reg | Legal_Flex);
Operand *Src1Hi = legalize(hiOperand(Src1), Legal_Reg | Legal_Flex);
div0Check(IceType_i64, Src1Lo, Src1Hi);
}
// Technically, ARM has its own aeabi routines, but we can use the
// non-aeabi routines as well. LLVM uses __aeabi_ldivmod for div, but uses
// the more standard __moddi3 for rem.
const char *HelperName = "";
switch (Inst->getOp()) {
default:
llvm_unreachable("Should have only matched div ops.");
break;
case InstArithmetic::Udiv:
HelperName = H_udiv_i64;
break;
case InstArithmetic::Sdiv:
HelperName = H_sdiv_i64;
break;
case InstArithmetic::Urem:
HelperName = H_urem_i64;
break;
case InstArithmetic::Srem:
HelperName = H_srem_i64;
break;
}
constexpr SizeT MaxSrcs = 2;
InstCall *Call = makeHelperCall(HelperName, Dest, MaxSrcs);
Call->addArg(Src0);
Call->addArg(Src1);
lowerCall(Call);
return;
}
}
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *Src0RLo = legalizeToReg(loOperand(Src0));
Variable *Src0RHi = legalizeToReg(hiOperand(Src0));
Operand *Src1Lo = loOperand(Src1);
Operand *Src1Hi = hiOperand(Src1);
Variable *T_Lo = makeReg(DestLo->getType());
Variable *T_Hi = makeReg(DestHi->getType());
switch (Inst->getOp()) {
case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator");
return;
case InstArithmetic::Add:
Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Flex);
Src1Hi = legalize(Src1Hi, Legal_Reg | Legal_Flex);
_adds(T_Lo, Src0RLo, Src1Lo);
_mov(DestLo, T_Lo);
_adc(T_Hi, Src0RHi, Src1Hi);
_mov(DestHi, T_Hi);
return;
case InstArithmetic::And:
Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Flex);
Src1Hi = legalize(Src1Hi, Legal_Reg | Legal_Flex);
_and(T_Lo, Src0RLo, Src1Lo);
_mov(DestLo, T_Lo);
_and(T_Hi, Src0RHi, Src1Hi);
_mov(DestHi, T_Hi);
return;
case InstArithmetic::Or:
Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Flex);
Src1Hi = legalize(Src1Hi, Legal_Reg | Legal_Flex);
_orr(T_Lo, Src0RLo, Src1Lo);
_mov(DestLo, T_Lo);
_orr(T_Hi, Src0RHi, Src1Hi);
_mov(DestHi, T_Hi);
return;
case InstArithmetic::Xor:
Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Flex);
Src1Hi = legalize(Src1Hi, Legal_Reg | Legal_Flex);
_eor(T_Lo, Src0RLo, Src1Lo);
_mov(DestLo, T_Lo);
_eor(T_Hi, Src0RHi, Src1Hi);
_mov(DestHi, T_Hi);
return;
case InstArithmetic::Sub:
Src1Lo = legalize(Src1Lo, Legal_Reg | Legal_Flex);
Src1Hi = legalize(Src1Hi, Legal_Reg | Legal_Flex);
_subs(T_Lo, Src0RLo, Src1Lo);
_mov(DestLo, T_Lo);
_sbc(T_Hi, Src0RHi, Src1Hi);
_mov(DestHi, T_Hi);
return;
case InstArithmetic::Mul: {
// GCC 4.8 does:
// a=b*c ==>
// t_acc =(mul) (b.lo * c.hi)
// t_acc =(mla) (c.lo * b.hi) + t_acc
// t.hi,t.lo =(umull) b.lo * c.lo
// t.hi += t_acc
// a.lo = t.lo
// a.hi = t.hi
//
// LLVM does:
// t.hi,t.lo =(umull) b.lo * c.lo
// t.hi =(mla) (b.lo * c.hi) + t.hi
// t.hi =(mla) (b.hi * c.lo) + t.hi
// a.lo = t.lo
// a.hi = t.hi
//
// LLVM's lowering has fewer instructions, but more register pressure:
// t.lo is live from beginning to end, while GCC delays the two-dest
// instruction till the end, and kills c.hi immediately.
Variable *T_Acc = makeReg(IceType_i32);
Variable *T_Acc1 = makeReg(IceType_i32);
Variable *T_Hi1 = makeReg(IceType_i32);
Variable *Src1RLo = legalizeToReg(Src1Lo);
Variable *Src1RHi = legalizeToReg(Src1Hi);
_mul(T_Acc, Src0RLo, Src1RHi);
_mla(T_Acc1, Src1RLo, Src0RHi, T_Acc);
_umull(T_Lo, T_Hi1, Src0RLo, Src1RLo);
_add(T_Hi, T_Hi1, T_Acc1);
_mov(DestLo, T_Lo);
_mov(DestHi, T_Hi);
return;
}
case InstArithmetic::Shl: {
// a=b<<c ==>
// pnacl-llc does:
// mov t_b.lo, b.lo
// mov t_b.hi, b.hi
// mov t_c.lo, c.lo
// rsb T0, t_c.lo, #32
// lsr T1, t_b.lo, T0
// orr t_a.hi, T1, t_b.hi, lsl t_c.lo
// sub T2, t_c.lo, #32
// cmp T2, #0
// lslge t_a.hi, t_b.lo, T2
// lsl t_a.lo, t_b.lo, t_c.lo
// mov a.lo, t_a.lo
// mov a.hi, t_a.hi
//
// GCC 4.8 does:
// sub t_c1, c.lo, #32
// lsl t_hi, b.hi, c.lo
// orr t_hi, t_hi, b.lo, lsl t_c1
// rsb t_c2, c.lo, #32
// orr t_hi, t_hi, b.lo, lsr t_c2
// lsl t_lo, b.lo, c.lo
// a.lo = t_lo
// a.hi = t_hi
//
// These are incompatible, so we mimic pnacl-llc.
// This can be strength-reduced for constant shifts, but we don't do that
// for now.
// Given the sub/rsb T_C, C.lo, #32, one of the T_C values will be negative.
// On ARM, shifts only take the lower 8 bits of the shift register, and
// saturate to the range 0-32, so the negative value will saturate to 32.
Constant *_32 = Ctx->getConstantInt32(32);
Constant *_0 = Ctx->getConstantZero(IceType_i32);
Variable *Src1RLo = legalizeToReg(Src1Lo);
Variable *T0 = makeReg(IceType_i32);
Variable *T1 = makeReg(IceType_i32);
Variable *T2 = makeReg(IceType_i32);
Variable *TA_Hi = makeReg(IceType_i32);
Variable *TA_Lo = makeReg(IceType_i32);
_rsb(T0, Src1RLo, _32);
_lsr(T1, Src0RLo, T0);
_orr(TA_Hi, T1, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
OperandARM32::LSL, Src1RLo));
_sub(T2, Src1RLo, _32);
_cmp(T2, _0);
_lsl(TA_Hi, Src0RLo, T2, CondARM32::GE);
_set_dest_redefined();
_lsl(TA_Lo, Src0RLo, Src1RLo);
_mov(DestLo, TA_Lo);
_mov(DestHi, TA_Hi);
return;
}
case InstArithmetic::Lshr:
case InstArithmetic::Ashr: {
// a=b>>c
// pnacl-llc does:
// mov t_b.lo, b.lo
// mov t_b.hi, b.hi
// mov t_c.lo, c.lo
// lsr T0, t_b.lo, t_c.lo
// rsb T1, t_c.lo, #32
// orr t_a.lo, T0, t_b.hi, lsl T1
// sub T2, t_c.lo, #32
// cmp T2, #0
// [al]srge t_a.lo, t_b.hi, T2
// [al]sr t_a.hi, t_b.hi, t_c.lo
// mov a.lo, t_a.lo
// mov a.hi, t_a.hi
//
// GCC 4.8 does (lsr):
// rsb t_c1, c.lo, #32
// lsr t_lo, b.lo, c.lo
// orr t_lo, t_lo, b.hi, lsl t_c1
// sub t_c2, c.lo, #32
// orr t_lo, t_lo, b.hi, lsr t_c2
// lsr t_hi, b.hi, c.lo
// mov a.lo, t_lo
// mov a.hi, t_hi
//
// These are incompatible, so we mimic pnacl-llc.
const bool IsAshr = Inst->getOp() == InstArithmetic::Ashr;
Constant *_32 = Ctx->getConstantInt32(32);
Constant *_0 = Ctx->getConstantZero(IceType_i32);
Variable *Src1RLo = legalizeToReg(Src1Lo);
Variable *T0 = makeReg(IceType_i32);
Variable *T1 = makeReg(IceType_i32);
Variable *T2 = makeReg(IceType_i32);
Variable *TA_Lo = makeReg(IceType_i32);
Variable *TA_Hi = makeReg(IceType_i32);
_lsr(T0, Src0RLo, Src1RLo);
_rsb(T1, Src1RLo, _32);
_orr(TA_Lo, T0, OperandARM32FlexReg::create(Func, IceType_i32, Src0RHi,
OperandARM32::LSL, T1));
_sub(T2, Src1RLo, _32);
_cmp(T2, _0);
if (IsAshr) {
_asr(TA_Lo, Src0RHi, T2, CondARM32::GE);
_set_dest_redefined();
_asr(TA_Hi, Src0RHi, Src1RLo);
} else {
_lsr(TA_Lo, Src0RHi, T2, CondARM32::GE);
_set_dest_redefined();
_lsr(TA_Hi, Src0RHi, Src1RLo);
}
_mov(DestLo, TA_Lo);
_mov(DestHi, TA_Hi);
return;
}
case InstArithmetic::Fadd:
case InstArithmetic::Fsub:
case InstArithmetic::Fmul:
case InstArithmetic::Fdiv:
case InstArithmetic::Frem:
llvm_unreachable("FP instruction with i64 type");
return;
case InstArithmetic::Udiv:
case InstArithmetic::Sdiv:
case InstArithmetic::Urem:
case InstArithmetic::Srem:
llvm_unreachable("Call-helper-involved instruction for i64 type "
"should have already been handled before");
return;
}
return;
} else if (isVectorType(Dest->getType())) {
// Add a fake def to keep liveness consistent in the meantime.
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
// Dest->getType() is a non-i64 scalar.
Variable *Src0R = legalizeToReg(Src0);
Variable *T = makeReg(Dest->getType());
// Handle div/rem separately. They require a non-legalized Src1 so we can
// inspect whether Src1 is a non-zero constant. Once legalized, that is more
// difficult to determine (the constant may have been moved to a register).
switch (Inst->getOp()) {
default:
break;
case InstArithmetic::Udiv: {
constexpr bool IsRemainder = false;
lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt, &TargetARM32::_udiv,
H_udiv_i32, IsRemainder);
return;
}
case InstArithmetic::Sdiv: {
constexpr bool IsRemainder = false;
lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt, &TargetARM32::_sdiv,
H_sdiv_i32, IsRemainder);
return;
}
case InstArithmetic::Urem: {
constexpr bool IsRemainder = true;
lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_uxt, &TargetARM32::_udiv,
H_urem_i32, IsRemainder);
return;
}
case InstArithmetic::Srem: {
constexpr bool IsRemainder = true;
lowerIDivRem(Dest, T, Src0R, Src1, &TargetARM32::_sxt, &TargetARM32::_sdiv,
H_srem_i32, IsRemainder);
return;
}
case InstArithmetic::Frem: {
const SizeT MaxSrcs = 2;
Type Ty = Dest->getType();
InstCall *Call = makeHelperCall(
isFloat32Asserting32Or64(Ty) ? H_frem_f32 : H_frem_f64, Dest, MaxSrcs);
Call->addArg(Src0R);
Call->addArg(Src1);
lowerCall(Call);
return;
}
}
// Handle floating point arithmetic separately: they require Src1 to be
// legalized to a register.
switch (Inst->getOp()) {
default:
break;
case InstArithmetic::Fadd: {
Variable *Src1R = legalizeToReg(Src1);
_vadd(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
case InstArithmetic::Fsub: {
Variable *Src1R = legalizeToReg(Src1);
_vsub(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
case InstArithmetic::Fmul: {
Variable *Src1R = legalizeToReg(Src1);
_vmul(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
case InstArithmetic::Fdiv: {
Variable *Src1R = legalizeToReg(Src1);
_vdiv(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
}
Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
switch (Inst->getOp()) {
case InstArithmetic::_num:
llvm_unreachable("Unknown arithmetic operator");
return;
case InstArithmetic::Add:
_add(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::And:
_and(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Or:
_orr(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Xor:
_eor(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Sub:
_sub(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Mul: {
Variable *Src1R = legalizeToReg(Src1RF);
_mul(T, Src0R, Src1R);
_mov(Dest, T);
return;
}
case InstArithmetic::Shl:
_lsl(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Lshr:
_lsr(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Ashr:
_asr(T, Src0R, Src1RF);
_mov(Dest, T);
return;
case InstArithmetic::Udiv:
case InstArithmetic::Sdiv:
case InstArithmetic::Urem:
case InstArithmetic::Srem:
llvm_unreachable("Integer div/rem should have been handled earlier.");
return;
case InstArithmetic::Fadd:
case InstArithmetic::Fsub:
case InstArithmetic::Fmul:
case InstArithmetic::Fdiv:
case InstArithmetic::Frem:
llvm_unreachable("Floating point arith should have been handled earlier.");
return;
}
}
void TargetARM32::lowerAssign(const InstAssign *Inst) {
Variable *Dest = Inst->getDest();
Operand *Src0 = Inst->getSrc(0);
assert(Dest->getType() == Src0->getType());
if (Dest->getType() == IceType_i64) {
Src0 = legalizeUndef(Src0);
Operand *Src0Lo = legalize(loOperand(Src0), Legal_Reg | Legal_Flex);
Operand *Src0Hi = legalize(hiOperand(Src0), Legal_Reg | Legal_Flex);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(IceType_i32);
Variable *T_Hi = makeReg(IceType_i32);
_mov(T_Lo, Src0Lo);
_mov(DestLo, T_Lo);
_mov(T_Hi, Src0Hi);
_mov(DestHi, T_Hi);
} else {
Operand *NewSrc;
if (Dest->hasReg()) {
// If Dest already has a physical register, then legalize the Src operand
// into a Variable with the same register assignment. In particular, this
// enables the use of Flex operands.
NewSrc = legalize(Src0, Legal_Reg | Legal_Flex, Dest->getRegNum());
} else {
// Dest could be a stack operand. Since we could potentially need to do a
// store (and stores can only take register operands), legalize this to a
// register.
NewSrc = legalize(Src0, Legal_Reg);
}
if (isVectorType(Dest->getType())) {
Variable *SrcR = legalizeToReg(NewSrc);
_mov(Dest, SrcR);
} else if (isFloatingType(Dest->getType())) {
Variable *SrcR = legalizeToReg(NewSrc);
_mov(Dest, SrcR);
} else {
_mov(Dest, NewSrc);
}
}
}
void TargetARM32::lowerBr(const InstBr *Inst) {
if (Inst->isUnconditional()) {
_br(Inst->getTargetUnconditional());
return;
}
Operand *Cond = Inst->getCondition();
// TODO(jvoung): Handle folding opportunities.
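// The expected sequence is, roughly:
//   cmp cond, #0
//   bne <target_true>
//   b   <target_false>  @ may be elided when it is the fallthrough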
Variable *Src0R = legalizeToReg(Cond);
Constant *Zero = Ctx->getConstantZero(IceType_i32);
_cmp(Src0R, Zero);
_br(Inst->getTargetTrue(), Inst->getTargetFalse(), CondARM32::NE);
}
void TargetARM32::lowerCall(const InstCall *Instr) {
MaybeLeafFunc = false;
NeedsStackAlignment = true;
// Assign arguments to registers and stack. Also reserve stack.
TargetARM32::CallingConv CC;
// Pair of Arg Operand -> GPR number assignments.
llvm::SmallVector<std::pair<Operand *, int32_t>,
TargetARM32::CallingConv::ARM32_MAX_GPR_ARG> GPRArgs;
llvm::SmallVector<std::pair<Operand *, int32_t>,
TargetARM32::CallingConv::ARM32_MAX_FP_REG_UNITS> FPArgs;
// Pair of Arg Operand -> stack offset.
llvm::SmallVector<std::pair<Operand *, int32_t>, 8> StackArgs;
int32_t ParameterAreaSizeBytes = 0;
// Classify each argument operand according to the location where the
// argument is passed.
for (SizeT i = 0, NumArgs = Instr->getNumArgs(); i < NumArgs; ++i) {
Operand *Arg = legalizeUndef(Instr->getArg(i));
Type Ty = Arg->getType();
bool InRegs = false;
if (Ty == IceType_i64) {
std::pair<int32_t, int32_t> Regs;
if (CC.I64InRegs(&Regs)) {
InRegs = true;
Operand *Lo = loOperand(Arg);
Operand *Hi = hiOperand(Arg);
GPRArgs.push_back(std::make_pair(Lo, Regs.first));
GPRArgs.push_back(std::make_pair(Hi, Regs.second));
}
} else if (isVectorType(Ty) || isFloatingType(Ty)) {
int32_t Reg;
if (CC.FPInReg(Ty, &Reg)) {
InRegs = true;
FPArgs.push_back(std::make_pair(Arg, Reg));
}
} else {
assert(Ty == IceType_i32);
int32_t Reg;
if (CC.I32InReg(&Reg)) {
InRegs = true;
GPRArgs.push_back(std::make_pair(Arg, Reg));
}
}
if (!InRegs) {
ParameterAreaSizeBytes =
applyStackAlignmentTy(ParameterAreaSizeBytes, Ty);
StackArgs.push_back(std::make_pair(Arg, ParameterAreaSizeBytes));
ParameterAreaSizeBytes += typeWidthInBytesOnStack(Ty);
}
}
// Adjust the parameter area so that the stack is aligned. It is assumed that
// the stack is already aligned at the start of the calling sequence.
ParameterAreaSizeBytes = applyStackAlignment(ParameterAreaSizeBytes);
// Subtract the appropriate amount for the argument area. This also takes
// care of setting the stack adjustment during emission.
//
// TODO: If for some reason the call instruction gets dead-code eliminated
// after lowering, we would need to ensure that the pre-call and the
// post-call sp adjustments get eliminated as well.
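// E.g., a call needing 16 bytes of argument stack is expected to produce,
// roughly, "sub sp, sp, #16" here, matched by "add sp, sp, #16" after the
// call below.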
if (ParameterAreaSizeBytes) {
Operand *SubAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
Legal_Reg | Legal_Flex);
_adjust_stack(ParameterAreaSizeBytes, SubAmount);
}
// Copy arguments that are passed on the stack to the appropriate stack
// locations.
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
for (auto &StackArg : StackArgs) {
ConstantInteger32 *Loc =
llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(StackArg.second));
Type Ty = StackArg.first->getType();
OperandARM32Mem *Addr;
constexpr bool SignExt = false;
if (OperandARM32Mem::canHoldOffset(Ty, SignExt, StackArg.second)) {
Addr = OperandARM32Mem::create(Func, Ty, SP, Loc);
} else {
Variable *NewBase = Func->makeVariable(SP->getType());
lowerArithmetic(
InstArithmetic::create(Func, InstArithmetic::Add, NewBase, SP, Loc));
Addr = formMemoryOperand(NewBase, Ty);
}
lowerStore(InstStore::create(Func, StackArg.first, Addr));
}
// Generate the call instruction. Assign its result to a temporary with high
// register allocation weight.
Variable *Dest = Instr->getDest();
// ReturnReg doubles as ReturnRegLo as necessary.
Variable *ReturnReg = nullptr;
Variable *ReturnRegHi = nullptr;
if (Dest) {
switch (Dest->getType()) {
case IceType_NUM:
llvm_unreachable("Invalid Call dest type");
break;
case IceType_void:
break;
case IceType_i1:
case IceType_i8:
case IceType_i16:
case IceType_i32:
ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_r0);
break;
case IceType_i64:
ReturnReg = makeReg(IceType_i32, RegARM32::Reg_r0);
ReturnRegHi = makeReg(IceType_i32, RegARM32::Reg_r1);
break;
case IceType_f32:
ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_s0);
break;
case IceType_f64:
ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_d0);
break;
case IceType_v4i1:
case IceType_v8i1:
case IceType_v16i1:
case IceType_v16i8:
case IceType_v8i16:
case IceType_v4i32:
case IceType_v4f32:
ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_q0);
break;
}
}
Operand *CallTarget = Instr->getCallTarget();
// TODO(jvoung): Handle sandboxing.
// const bool NeedSandboxing = Ctx->getFlags().getUseSandboxing();
// Allow ConstantRelocatable to be left alone as a direct call, but force
// other constants like ConstantInteger32 to be in a register and make it an
// indirect call.
if (!llvm::isa<ConstantRelocatable>(CallTarget)) {
CallTarget = legalize(CallTarget, Legal_Reg);
}
// Copy arguments to be passed in registers to the appropriate registers.
for (auto &FPArg : FPArgs) {
Variable *Reg = legalizeToReg(FPArg.first, FPArg.second);
Context.insert(InstFakeUse::create(Func, Reg));
}
for (auto &GPRArg : GPRArgs) {
Variable *Reg = legalizeToReg(GPRArg.first, GPRArg.second);
// Generate a FakeUse of register arguments so that they do not get dead
// code eliminated as a result of the FakeKill of scratch registers after
// the call.
Context.insert(InstFakeUse::create(Func, Reg));
}
Inst *NewCall = InstARM32Call::create(Func, ReturnReg, CallTarget);
Context.insert(NewCall);
if (ReturnRegHi)
Context.insert(InstFakeDef::create(Func, ReturnRegHi));
// Add the appropriate offset to SP. The call instruction takes care of
// resetting the stack offset during emission.
if (ParameterAreaSizeBytes) {
Operand *AddAmount = legalize(Ctx->getConstantInt32(ParameterAreaSizeBytes),
Legal_Reg | Legal_Flex);
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
_add(SP, SP, AddAmount);
}
// Insert a register-kill pseudo instruction.
Context.insert(InstFakeKill::create(Func, NewCall));
// Generate a FakeUse to keep the call live if necessary.
if (Instr->hasSideEffects() && ReturnReg) {
Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
Context.insert(FakeUse);
}
if (!Dest)
return;
// Assign the result of the call to Dest.
if (ReturnReg) {
if (ReturnRegHi) {
auto *Dest64On32 = llvm::cast<Variable64On32>(Dest);
Variable *DestLo = Dest64On32->getLo();
Variable *DestHi = Dest64On32->getHi();
_mov(DestLo, ReturnReg);
_mov(DestHi, ReturnRegHi);
} else {
if (isFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
_mov(Dest, ReturnReg);
} else {
assert(isIntegerType(Dest->getType()) &&
typeWidthInBytes(Dest->getType()) <= 4);
_mov(Dest, ReturnReg);
}
}
}
}
namespace {
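// A bitcast staged through a Variable64On32 needs its hi/lo halves in
// registers (so they can feed a core-register-pair vmov), while the combined
// 64-bit variable must stay off the register allocator, since ARM32 has no
// 64-bit GPRs.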
void configureBitcastTemporary(Variable64On32 *Var) {
Var->setMustNotHaveReg();
Var->getHi()->setMustHaveReg();
Var->getLo()->setMustHaveReg();
}
} // end of anonymous namespace
void TargetARM32::lowerCast(const InstCast *Inst) {
InstCast::OpKind CastKind = Inst->getCastKind();
Variable *Dest = Inst->getDest();
Operand *Src0 = legalizeUndef(Inst->getSrc(0));
switch (CastKind) {
default:
Func->setError("Cast type not supported");
return;
case InstCast::Sext: {
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
} else if (Dest->getType() == IceType_i64) {
// t1 = sxtb src; t2 = t1 asr #31; dst.lo = t1; dst.hi = t2
Constant *ShiftAmt = Ctx->getConstantInt32(31);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(DestLo->getType());
if (Src0->getType() == IceType_i32) {
Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
_mov(T_Lo, Src0RF);
} else if (Src0->getType() == IceType_i1) {
Variable *Src0R = legalizeToReg(Src0);
_lsl(T_Lo, Src0R, ShiftAmt);
_asr(T_Lo, T_Lo, ShiftAmt);
} else {
Variable *Src0R = legalizeToReg(Src0);
_sxt(T_Lo, Src0R);
}
_mov(DestLo, T_Lo);
Variable *T_Hi = makeReg(DestHi->getType());
if (Src0->getType() != IceType_i1) {
_mov(T_Hi, OperandARM32FlexReg::create(Func, IceType_i32, T_Lo,
OperandARM32::ASR, ShiftAmt));
} else {
// For i1, the asr instruction is already done above.
_mov(T_Hi, T_Lo);
}
_mov(DestHi, T_Hi);
} else if (Src0->getType() == IceType_i1) {
// GPRs are 32-bit, so just use 31 as dst_bitwidth - 1.
// lsl t1, src_reg, 31
// asr t1, t1, 31
// dst = t1
Variable *Src0R = legalizeToReg(Src0);
Constant *ShiftAmt = Ctx->getConstantInt32(31);
Variable *T = makeReg(Dest->getType());
_lsl(T, Src0R, ShiftAmt);
_asr(T, T, ShiftAmt);
_mov(Dest, T);
} else {
// t1 = sxt src; dst = t1
Variable *Src0R = legalizeToReg(Src0);
Variable *T = makeReg(Dest->getType());
_sxt(T, Src0R);
_mov(Dest, T);
}
break;
}
case InstCast::Zext: {
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
} else if (Dest->getType() == IceType_i64) {
// t1 = uxtb src; dst.lo = t1; dst.hi = 0
Constant *Zero = Ctx->getConstantZero(IceType_i32);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Variable *T_Lo = makeReg(DestLo->getType());
// i32 and i1 can just take up the whole register. i32 doesn't need uxt,
// while i1 will have an and mask later anyway.
if (Src0->getType() == IceType_i32 || Src0->getType() == IceType_i1) {
Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
_mov(T_Lo, Src0RF);
} else {
Variable *Src0R = legalizeToReg(Src0);
_uxt(T_Lo, Src0R);
}
if (Src0->getType() == IceType_i1) {
Constant *One = Ctx->getConstantInt32(1);
_and(T_Lo, T_Lo, One);
}
_mov(DestLo, T_Lo);
Variable *T_Hi = makeReg(DestLo->getType());
_mov(T_Hi, Zero);
_mov(DestHi, T_Hi);
} else if (Src0->getType() == IceType_i1) {
// t = Src0; t &= 1; Dest = t
Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
Constant *One = Ctx->getConstantInt32(1);
Variable *T = makeReg(Dest->getType());
// Just use _mov instead of _uxt since all registers are 32-bit. _uxt
// requires the source to be a register so could have required a _mov
// from legalize anyway.
_mov(T, Src0RF);
_and(T, T, One);
_mov(Dest, T);
} else {
// t1 = uxt src; dst = t1
Variable *Src0R = legalizeToReg(Src0);
Variable *T = makeReg(Dest->getType());
_uxt(T, Src0R);
_mov(Dest, T);
}
break;
}
case InstCast::Trunc: {
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
} else {
if (Src0->getType() == IceType_i64)
Src0 = loOperand(Src0);
Operand *Src0RF = legalize(Src0, Legal_Reg | Legal_Flex);
// t1 = trunc Src0RF; Dest = t1
Variable *T = makeReg(Dest->getType());
_mov(T, Src0RF);
if (Dest->getType() == IceType_i1)
_and(T, T, Ctx->getConstantInt1(1));
_mov(Dest, T);
}
break;
}
case InstCast::Fptrunc:
case InstCast::Fpext: {
// fptrunc: dest.f32 = fptrunc src0.f64
// fpext: dest.f64 = fpext src0.f32
const bool IsTrunc = CastKind == InstCast::Fptrunc;
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
break;
}
assert(Dest->getType() == (IsTrunc ? IceType_f32 : IceType_f64));
assert(Src0->getType() == (IsTrunc ? IceType_f64 : IceType_f32));
Variable *Src0R = legalizeToReg(Src0);
Variable *T = makeReg(Dest->getType());
_vcvt(T, Src0R, IsTrunc ? InstARM32Vcvt::D2s : InstARM32Vcvt::S2d);
_mov(Dest, T);
break;
}
case InstCast::Fptosi:
case InstCast::Fptoui: {
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
break;
}
const bool DestIsSigned = CastKind == InstCast::Fptosi;
const bool Src0IsF32 = isFloat32Asserting32Or64(Src0->getType());
if (llvm::isa<Variable64On32>(Dest)) {
const char *HelperName =
Src0IsF32 ? (DestIsSigned ? H_fptosi_f32_i64 : H_fptoui_f32_i64)
: (DestIsSigned ? H_fptosi_f64_i64 : H_fptoui_f64_i64);
static constexpr SizeT MaxSrcs = 1;
InstCall *Call = makeHelperCall(HelperName, Dest, MaxSrcs);
Call->addArg(Src0);
lowerCall(Call);
break;
}
// fptosi:
// t1.fp = vcvt src0.fp
// t2.i32 = vmov t1.fp
// dest.int = conv t2.i32 @ Truncates the result if needed.
// fptoui:
// t1.fp = vcvt src0.fp
// t2.u32 = vmov t1.fp
// dest.uint = conv t2.u32 @ Truncates the result if needed.
Variable *Src0R = legalizeToReg(Src0);
Variable *T_fp = makeReg(IceType_f32);
const InstARM32Vcvt::VcvtVariant Conversion =
Src0IsF32 ? (DestIsSigned ? InstARM32Vcvt::S2si : InstARM32Vcvt::S2ui)
: (DestIsSigned ? InstARM32Vcvt::D2si : InstARM32Vcvt::D2ui);
_vcvt(T_fp, Src0R, Conversion);
Variable *T = makeReg(IceType_i32);
_mov(T, T_fp);
if (Dest->getType() != IceType_i32) {
Variable *T_1 = makeReg(Dest->getType());
lowerCast(InstCast::create(Func, InstCast::Trunc, T_1, T));
T = T_1;
}
_mov(Dest, T);
break;
}
case InstCast::Sitofp:
case InstCast::Uitofp: {
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
break;
}
const bool SourceIsSigned = CastKind == InstCast::Sitofp;
const bool DestIsF32 = isFloat32Asserting32Or64(Dest->getType());
if (Src0->getType() == IceType_i64) {
const char *HelperName =
DestIsF32 ? (SourceIsSigned ? H_sitofp_i64_f32 : H_uitofp_i64_f32)
: (SourceIsSigned ? H_sitofp_i64_f64 : H_uitofp_i64_f64);
static constexpr SizeT MaxSrcs = 1;
InstCall *Call = makeHelperCall(HelperName, Dest, MaxSrcs);
Call->addArg(Src0);
lowerCall(Call);
break;
}
// sitofp:
// t1.i32 = sext src.int @ sign-extends src0 if needed.
// t2.fp32 = vmov t1.i32
// t3.fp = vcvt.{fp}.s32 @ fp is either f32 or f64
// uitofp:
// t1.i32 = zext src.int @ zero-extends src0 if needed.
// t2.fp32 = vmov t1.i32
// t3.fp = vcvt.{fp}.u32 @ fp is either f32 or f64
if (Src0->getType() != IceType_i32) {
Variable *Src0R_32 = makeReg(IceType_i32);
lowerCast(InstCast::create(Func, SourceIsSigned ? InstCast::Sext
: InstCast::Zext,
Src0R_32, Src0));
Src0 = Src0R_32;
}
Variable *Src0R = legalizeToReg(Src0);
Variable *Src0R_f32 = makeReg(IceType_f32);
_mov(Src0R_f32, Src0R);
Src0R = Src0R_f32;
Variable *T = makeReg(Dest->getType());
const InstARM32Vcvt::VcvtVariant Conversion =
DestIsF32
? (SourceIsSigned ? InstARM32Vcvt::Si2s : InstARM32Vcvt::Ui2s)
: (SourceIsSigned ? InstARM32Vcvt::Si2d : InstARM32Vcvt::Ui2d);
_vcvt(T, Src0R, Conversion);
_mov(Dest, T);
break;
}
case InstCast::Bitcast: {
Operand *Src0 = Inst->getSrc(0);
if (Dest->getType() == Src0->getType()) {
InstAssign *Assign = InstAssign::create(Func, Dest, Src0);
lowerAssign(Assign);
return;
}
Type DestType = Dest->getType();
switch (DestType) {
case IceType_NUM:
case IceType_void:
llvm::report_fatal_error("Unexpected bitcast.");
case IceType_i1:
UnimplementedError(Func->getContext()->getFlags());
break;
case IceType_i8:
UnimplementedError(Func->getContext()->getFlags());
break;
case IceType_i16:
UnimplementedError(Func->getContext()->getFlags());
break;
case IceType_i32:
case IceType_f32: {
Variable *Src0R = legalizeToReg(Src0);
Variable *T = makeReg(DestType);
_mov(T, Src0R);
lowerAssign(InstAssign::create(Func, Dest, T));
break;
}
case IceType_i64: {
// t0, t1 <- src0
// dest[31..0] = t0
// dest[63..32] = t1
assert(Src0->getType() == IceType_f64);
auto *T = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
T->initHiLo(Func);
configureBitcastTemporary(T);
Variable *Src0R = legalizeToReg(Src0);
_mov(T, Src0R);
lowerAssign(InstAssign::create(Func, Dest, T));
break;
}
case IceType_f64: {
// T0 <- lo(src)
// T1 <- hi(src)
// vmov T2, T0, T1
// Dest <- T2
assert(Src0->getType() == IceType_i64);
Variable *T = makeReg(DestType);
auto *Src64 = llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
Src64->initHiLo(Func);
configureBitcastTemporary(Src64);
lowerAssign(InstAssign::create(Func, Src64, Src0));
_mov(T, Src64);
lowerAssign(InstAssign::create(Func, Dest, T));
break;
}
case IceType_v4i1:
case IceType_v8i1:
case IceType_v16i1:
case IceType_v8i16:
case IceType_v16i8:
case IceType_v4f32:
case IceType_v4i32: {
// Insert a fake def to avoid liveness errors while vector bitcasts remain
// unimplemented.
Variable *T = makeReg(DestType);
Context.insert(InstFakeDef::create(Func, T, legalizeToReg(Src0)));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
break;
}
}
break;
}
}
}
void TargetARM32::lowerExtractElement(const InstExtractElement *Inst) {
Variable *Dest = Inst->getDest();
Type DestType = Dest->getType();
Variable *T = makeReg(DestType);
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
}
namespace {
// Validates FCMPARM32_TABLE's declaration w.r.t. InstFcmp::FCondition ordering
// (and naming).
enum {
#define X(val, CC0, CC1) _fcmp_ll_##val,
FCMPARM32_TABLE
#undef X
_fcmp_ll_NUM
};
enum {
#define X(tag, str) _fcmp_hl_##tag = InstFcmp::tag,
ICEINSTFCMP_TABLE
#undef X
_fcmp_hl_NUM
};
static_assert(_fcmp_hl_NUM == _fcmp_ll_NUM,
"Inconsistency between high-level and low-level fcmp tags.");
#define X(tag, str) \
static_assert( \
_fcmp_hl_##tag == _fcmp_ll_##tag, \
"Inconsistency between high-level and low-level fcmp tag " #tag);
ICEINSTFCMP_TABLE
#undef X
struct {
CondARM32::Cond CC0;
CondARM32::Cond CC1;
} TableFcmp[] = {
#define X(val, CC0, CC1) \
{ CondARM32::CC0, CondARM32::CC1 } \
,
FCMPARM32_TABLE
#undef X
};
} // end of anonymous namespace
void TargetARM32::lowerFcmp(const InstFcmp *Inst) {
Variable *Dest = Inst->getDest();
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
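// E.g., "a = fcmp oeq b, c" is expected to produce, roughly:
//   vcmp.f32 sB, sC
//   mov      t, #0
//   vmrs     APSR_nzcv, fpscr
//   moveq    t, #1
//   mov      a, t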
Variable *Src0R = legalizeToReg(Inst->getSrc(0));
Variable *Src1R = legalizeToReg(Inst->getSrc(1));
Variable *T = makeReg(IceType_i32);
_vcmp(Src0R, Src1R);
_mov(T, Ctx->getConstantZero(IceType_i32));
_vmrs();
Operand *One = Ctx->getConstantInt32(1);
InstFcmp::FCond Condition = Inst->getCondition();
assert(Condition < llvm::array_lengthof(TableFcmp));
CondARM32::Cond CC0 = TableFcmp[Condition].CC0;
CondARM32::Cond CC1 = TableFcmp[Condition].CC1;
if (CC0 != CondARM32::kNone) {
_mov(T, One, CC0);
// If this mov is not a maybe mov, but an actual mov (i.e., CC0 == AL), we
// don't want to _set_dest_redefined so that liveness + dead-code
// elimination will get rid of the previous assignment (i.e., T = 0) above.
// TODO(stichnot,jpp): We should be able to conditionally create the "T=0"
// instruction based on CC0, instead of relying on DCE to remove it.
if (CC0 != CondARM32::AL)
_set_dest_redefined();
}
if (CC1 != CondARM32::kNone) {
assert(CC0 != CondARM32::kNone);
assert(CC1 != CondARM32::AL);
_mov_redefined(T, One, CC1);
}
_mov(Dest, T);
}
void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
Variable *Dest = Inst->getDest();
Operand *Src0 = legalizeUndef(Inst->getSrc(0));
Operand *Src1 = legalizeUndef(Inst->getSrc(1));
if (isVectorType(Dest->getType())) {
Variable *T = makeReg(Dest->getType());
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
// a=icmp cond, b, c ==>
// GCC does:
// cmp      b.hi, c.hi           or   cmp  b.lo, c.lo
// cmp.eq   b.lo, c.lo                sbcs t1, b.hi, c.hi
// mov.<C1> t, #1                     mov.<C1> t, #1
// mov.<C2> t, #0                     mov.<C2> t, #0
// mov      a, t                      mov  a, t
// where the "cmp.eq b.lo, c.lo" is used for unsigned and "sbcs t1, hi, hi"
// is used for signed compares. In some cases, b and c need to be swapped as
// well.
//
// LLVM does:
// for EQ and NE:
// eor t1, b.hi, c.hi
// eor t2, b.lo, c.lo
// orrs t, t1, t2
// mov.<C> t, #1
// mov a, t
//
// that's nice in that it's just as short but has fewer dependencies for
// better ILP at the cost of more registers.
//
// Otherwise for signed/unsigned <, <=, etc. LLVM uses a sequence with two
// unconditional mov #0, two cmps, two conditional mov #1, and one
// conditional reg mov. That has few dependencies for good ILP, but is a
// longer sequence.
//
// So, we are going with the GCC version since it's usually better (except
// perhaps for eq/ne). We could revisit special-casing eq/ne later.
Constant *Zero = Ctx->getConstantZero(IceType_i32);
Constant *One = Ctx->getConstantInt32(1);
if (Src0->getType() == IceType_i64) {
InstIcmp::ICond Condition = Inst->getCondition();
size_t Index = static_cast<size_t>(Condition);
assert(Index < llvm::array_lengthof(TableIcmp64));
Variable *Src0Lo, *Src0Hi;
Operand *Src1LoRF, *Src1HiRF;
if (TableIcmp64[Index].Swapped) {
Src0Lo = legalizeToReg(loOperand(Src1));
Src0Hi = legalizeToReg(hiOperand(Src1));
Src1LoRF = legalize(loOperand(Src0), Legal_Reg | Legal_Flex);
Src1HiRF = legalize(hiOperand(Src0), Legal_Reg | Legal_Flex);
} else {
Src0Lo = legalizeToReg(loOperand(Src0));
Src0Hi = legalizeToReg(hiOperand(Src0));
Src1LoRF = legalize(loOperand(Src1), Legal_Reg | Legal_Flex);
Src1HiRF = legalize(hiOperand(Src1), Legal_Reg | Legal_Flex);
}
Variable *T = makeReg(IceType_i32);
if (TableIcmp64[Index].IsSigned) {
Variable *ScratchReg = makeReg(IceType_i32);
_cmp(Src0Lo, Src1LoRF);
_sbcs(ScratchReg, Src0Hi, Src1HiRF);
// ScratchReg isn't going to be used, but we need the side-effect of
// setting flags from this operation.
Context.insert(InstFakeUse::create(Func, ScratchReg));
} else {
_cmp(Src0Hi, Src1HiRF);
_cmp(Src0Lo, Src1LoRF, CondARM32::EQ);
}
_mov(T, One, TableIcmp64[Index].C1);
_mov_redefined(T, Zero, TableIcmp64[Index].C2);
_mov(Dest, T);
return;
}
// a=icmp cond b, c ==>
// GCC does:
// <u/s>xtb tb, b
// <u/s>xtb tc, c
// cmp tb, tc
// mov.C1 t, #0
// mov.C2 t, #1
// mov a, t
// where the unsigned/sign extension is not needed for 32-bit. They also have
// special cases for EQ and NE. E.g., for NE:
// <extend to tb, tc>
// subs t, tb, tc
// movne t, #1
// mov a, t
//
// LLVM does:
// lsl tb, b, #<N>
// mov t, #0
// cmp tb, c, lsl #<N>
// mov.<C> t, #1
// mov a, t
//
// the left shift is by 0, 16, or 24, which allows the comparison to focus on
// the bits that actually matter (for 16-bit or 8-bit signed/unsigned). For
// the unsigned case, for some reason LLVM does something similar to GCC and
// emits a uxtb first. It's not clear to me why that special-casing is needed.
//
// We'll go with the LLVM way for now, since it's shorter and has just as few
// dependencies.
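// E.g., "a = icmp ult i8 b, c" (ShiftAmt == 24) is expected to produce,
// roughly (the predicate comes from getIcmp32Mapping, LO for ult):
//   lsl   t0, b, #24
//   mov   t, #0
//   cmp   t0, c, lsl #24
//   movlo t, #1
//   mov   a, t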
int32_t ShiftAmt = 32 - getScalarIntBitWidth(Src0->getType());
assert(ShiftAmt >= 0);
Constant *ShiftConst = nullptr;
Variable *Src0R = nullptr;
Variable *T = makeReg(IceType_i32);
if (ShiftAmt) {
ShiftConst = Ctx->getConstantInt32(ShiftAmt);
Src0R = makeReg(IceType_i32);
_lsl(Src0R, legalizeToReg(Src0), ShiftConst);
} else {
Src0R = legalizeToReg(Src0);
}
_mov(T, Zero);
if (ShiftAmt) {
Variable *Src1R = legalizeToReg(Src1);
OperandARM32FlexReg *Src1RShifted = OperandARM32FlexReg::create(
Func, IceType_i32, Src1R, OperandARM32::LSL, ShiftConst);
_cmp(Src0R, Src1RShifted);
} else {
Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
_cmp(Src0R, Src1RF);
}
_mov_redefined(T, One, getIcmp32Mapping(Inst->getCondition()));
_mov(Dest, T);
return;
}
void TargetARM32::lowerInsertElement(const InstInsertElement *Inst) {
(void)Inst;
UnimplementedError(Func->getContext()->getFlags());
}
namespace {
inline uint64_t getConstantMemoryOrder(Operand *Opnd) {
if (auto Integer = llvm::dyn_cast<ConstantInteger32>(Opnd))
return Integer->getValue();
return Intrinsics::MemoryOrderInvalid;
}
} // end of anonymous namespace
void TargetARM32::lowerAtomicRMW(Variable *Dest, uint32_t Operation,
Operand *Ptr, Operand *Val) {
// retry:
// ldrex contents, [addr]
// op tmp, contents, operand
// strex success, tmp, [addr]
// jne retry
// fake-use(addr, operand) @ prevents undesirable clobbering.
// mov dest, contents
assert(Dest != nullptr);
Type DestTy = Dest->getType();
(void)Ptr;
(void)Val;
OperandARM32Mem *Mem;
Variable *PtrContentsReg;
Variable *PtrContentsHiReg;
Variable *PtrContentsLoReg;
Variable *Value = Func->makeVariable(DestTy);
Variable *ValueReg;
Variable *ValueHiReg;
Variable *ValueLoReg;
Variable *Success = makeReg(IceType_i32);
Variable *TmpReg;
Variable *TmpHiReg;
Variable *TmpLoReg;
Operand *_0 = Ctx->getConstantZero(IceType_i32);
InstARM32Label *Retry = InstARM32Label::create(Func, this);
if (DestTy == IceType_i64) {
Variable64On32 *PtrContentsReg64 = makeI64RegPair();
PtrContentsHiReg = PtrContentsReg64->getHi();
PtrContentsLoReg = PtrContentsReg64->getLo();
PtrContentsReg = PtrContentsReg64;
llvm::cast<Variable64On32>(Value)->initHiLo(Func);
Variable64On32 *ValueReg64 = makeI64RegPair();
ValueHiReg = ValueReg64->getHi();
ValueLoReg = ValueReg64->getLo();
ValueReg = ValueReg64;
Variable64On32 *TmpReg64 = makeI64RegPair();
TmpHiReg = TmpReg64->getHi();
TmpLoReg = TmpReg64->getLo();
TmpReg = TmpReg64;
} else {
PtrContentsReg = makeReg(DestTy);
PtrContentsHiReg = nullptr;
PtrContentsLoReg = PtrContentsReg;
ValueReg = makeReg(DestTy);
ValueHiReg = nullptr;
ValueLoReg = ValueReg;
TmpReg = makeReg(DestTy);
TmpHiReg = nullptr;
TmpLoReg = TmpReg;
}
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, Value));
}
lowerAssign(InstAssign::create(Func, Value, Val));
Variable *PtrVar = Func->makeVariable(IceType_i32);
lowerAssign(InstAssign::create(Func, PtrVar, Ptr));
_dmb();
Context.insert(Retry);
Mem = formMemoryOperand(PtrVar, DestTy);
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, ValueReg, Value));
}
lowerAssign(InstAssign::create(Func, ValueReg, Value));
if (DestTy == IceType_i8 || DestTy == IceType_i16) {
_uxt(ValueReg, ValueReg);
}
_ldrex(PtrContentsReg, Mem);
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, TmpReg, ValueReg));
}
switch (Operation) {
default:
Func->setError("Unknown AtomicRMW operation");
return;
case Intrinsics::AtomicAdd:
if (DestTy == IceType_i64) {
_adds(TmpLoReg, PtrContentsLoReg, ValueLoReg);
_adc(TmpHiReg, PtrContentsHiReg, ValueHiReg);
} else {
_add(TmpLoReg, PtrContentsLoReg, ValueLoReg);
}
break;
case Intrinsics::AtomicSub:
if (DestTy == IceType_i64) {
_subs(TmpLoReg, PtrContentsLoReg, ValueLoReg);
_sbc(TmpHiReg, PtrContentsHiReg, ValueHiReg);
} else {
_sub(TmpLoReg, PtrContentsLoReg, ValueLoReg);
}
break;
case Intrinsics::AtomicOr:
_orr(TmpLoReg, PtrContentsLoReg, ValueLoReg);
if (DestTy == IceType_i64) {
_orr(TmpHiReg, PtrContentsHiReg, ValueHiReg);
}
break;
case Intrinsics::AtomicAnd:
_and(TmpLoReg, PtrContentsLoReg, ValueLoReg);
if (DestTy == IceType_i64) {
_and(TmpHiReg, PtrContentsHiReg, ValueHiReg);
}
break;
case Intrinsics::AtomicXor:
_eor(TmpLoReg, PtrContentsLoReg, ValueLoReg);
if (DestTy == IceType_i64) {
_eor(TmpHiReg, PtrContentsHiReg, ValueHiReg);
}
break;
case Intrinsics::AtomicExchange:
_mov(TmpLoReg, ValueLoReg);
if (DestTy == IceType_i64) {
_mov(TmpHiReg, ValueHiReg);
}
break;
}
_strex(Success, TmpReg, Mem);
_cmp(Success, _0);
_br(Retry, CondARM32::NE);
// The following fake-uses ensure that Subzero will not clobber them in the
// load-linked/store-conditional loop above. We might have to spill them, but
// spilling is preferable to incorrect behavior.
Context.insert(InstFakeUse::create(Func, PtrVar));
if (auto *Value64 = llvm::dyn_cast<Variable64On32>(Value)) {
Context.insert(InstFakeUse::create(Func, Value64->getHi()));
Context.insert(InstFakeUse::create(Func, Value64->getLo()));
} else {
Context.insert(InstFakeUse::create(Func, Value));
}
_dmb();
if (DestTy == IceType_i8 || DestTy == IceType_i16) {
_uxt(PtrContentsReg, PtrContentsReg);
}
if (DestTy == IceType_i64) {
Context.insert(InstFakeUse::create(Func, PtrContentsReg));
}
lowerAssign(InstAssign::create(Func, Dest, PtrContentsReg));
if (auto *Dest64 = llvm::dyn_cast<Variable64On32>(Dest)) {
Context.insert(InstFakeUse::create(Func, Dest64->getLo()));
Context.insert(InstFakeUse::create(Func, Dest64->getHi()));
} else {
Context.insert(InstFakeUse::create(Func, Dest));
}
}
void TargetARM32::lowerIntrinsicCall(const InstIntrinsicCall *Instr) {
Variable *Dest = Instr->getDest();
Type DestTy = (Dest != nullptr) ? Dest->getType() : IceType_void;
Intrinsics::IntrinsicID ID = Instr->getIntrinsicInfo().ID;
switch (ID) {
case Intrinsics::AtomicFence:
case Intrinsics::AtomicFenceAll:
assert(Dest == nullptr);
_dmb();
return;
case Intrinsics::AtomicIsLockFree: {
Operand *ByteSize = Instr->getArg(0);
auto *CI = llvm::dyn_cast<ConstantInteger32>(ByteSize);
if (CI == nullptr) {
// The PNaCl ABI requires the byte size to be a compile-time constant.
Func->setError("AtomicIsLockFree byte size should be compile-time const");
return;
}
static constexpr int32_t NotLockFree = 0;
static constexpr int32_t LockFree = 1;
int32_t Result = NotLockFree;
switch (CI->getValue()) {
case 1:
case 2:
case 4:
case 8:
Result = LockFree;
break;
}
_mov(Dest, legalizeToReg(Ctx->getConstantInt32(Result)));
return;
}
case Intrinsics::AtomicLoad: {
assert(isScalarIntegerType(DestTy));
// We require the memory address to be naturally aligned. Given that is the
// case, normal loads are atomic.
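// For i32 and narrower, the expected sequence is just:
//   ldr t, [addr]
//   dmb
//   mov dest, t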
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(1)))) {
Func->setError("Unexpected memory ordering for AtomicLoad");
return;
}
Variable *T;
if (DestTy == IceType_i64) {
// ldrexd is the only ARM instruction that is guaranteed to load a 64-bit
// integer atomically. Everything else works with a regular ldr.
T = makeI64RegPair();
_ldrex(T, formMemoryOperand(Instr->getArg(0), IceType_i64));
} else {
T = makeReg(DestTy);
_ldr(T, formMemoryOperand(Instr->getArg(0), DestTy));
}
_dmb();
lowerAssign(InstAssign::create(Func, Dest, T));
// Make sure the atomic load isn't elided when unused, by adding a FakeUse.
// Since lowerLoad may fuse the load w/ an arithmetic instruction, insert
// the FakeUse on the last-inserted instruction's dest.
Context.insert(
InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
return;
}
case Intrinsics::AtomicStore: {
// We require the memory address to be naturally aligned. Given that is the
// case, normal stores are atomic.
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(2)))) {
Func->setError("Unexpected memory ordering for AtomicStore");
return;
}
Operand *Value = Instr->getArg(0);
Type ValueTy = Value->getType();
assert(isScalarIntegerType(ValueTy));
Operand *Addr = Instr->getArg(1);
if (ValueTy == IceType_i64) {
// Atomic 64-bit stores require a load-locked/store-conditional loop using
// ldrexd and strexd. The lowered code is:
//
// retry:
// ldrexd t.lo, t.hi, [addr]
// strexd success, value.lo, value.hi, [addr]
// cmp success, #0
// bne retry
// fake-use(addr, value.lo, value.hi)
//
// The fake-use is needed to prevent those variables from being clobbered
// in the loop (which will happen under register pressure.)
Variable64On32 *Tmp = makeI64RegPair();
Variable64On32 *ValueVar =
llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
Variable *AddrVar = makeReg(IceType_i32);
Variable *Success = makeReg(IceType_i32);
OperandARM32Mem *Mem;
Operand *_0 = Ctx->getConstantZero(IceType_i32);
InstARM32Label *Retry = InstARM32Label::create(Func, this);
Variable64On32 *NewReg = makeI64RegPair();
ValueVar->initHiLo(Func);
ValueVar->mustNotHaveReg();
_dmb();
lowerAssign(InstAssign::create(Func, ValueVar, Value));
lowerAssign(InstAssign::create(Func, AddrVar, Addr));
Context.insert(Retry);
Context.insert(InstFakeDef::create(Func, NewReg));
lowerAssign(InstAssign::create(Func, NewReg, ValueVar));
Mem = formMemoryOperand(AddrVar, IceType_i64);
_ldrex(Tmp, Mem);
// This fake-use prevents the ldrex from being dead-code eliminated,
// while also keeping liveness happy about all defs being used.
Context.insert(
InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
_strex(Success, NewReg, Mem);
_cmp(Success, _0);
_br(Retry, CondARM32::NE);
Context.insert(InstFakeUse::create(Func, ValueVar->getLo()));
Context.insert(InstFakeUse::create(Func, ValueVar->getHi()));
Context.insert(InstFakeUse::create(Func, AddrVar));
_dmb();
return;
}
// Non-64-bit stores are atomic as long as the address is aligned. This
// is PNaCl, so addresses are aligned.
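// The expected sequence is:
//   dmb
//   str value, [addr]
//   dmb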
Variable *T = makeReg(ValueTy);
_dmb();
lowerAssign(InstAssign::create(Func, T, Value));
_str(T, formMemoryOperand(Addr, ValueTy));
_dmb();
return;
}
case Intrinsics::AtomicCmpxchg: {
// The initial lowering for cmpxchg was:
//
// retry:
// ldrex tmp, [addr]
// cmp tmp, expected
// mov expected, tmp
// jne retry
// strex success, new, [addr]
// cmp success, #0
// bne retry
// mov dest, expected
//
// Besides requiring two branches, that lowering could also potentially
// write to memory (in mov expected, tmp) unless we were OK with increasing
// the register pressure and requiring expected to be an infinite-weight
// variable (spoiler alert: that was a problem for i64 cmpxchg.) Through
// careful rewriting, and thanks to predication, we now implement the
// lowering as:
//
// retry:
// ldrex tmp, [addr]
// cmp tmp, expected
// strexeq success, new, [addr]
// movne expected, tmp
// cmpeq success, #0
// bne retry
// mov dest, expected
//
// Predication lets us move the strex ahead of the mov expected, tmp, which
// allows tmp to be a non-infinite weight temporary. We wanted to avoid
// writing to memory between ldrex and strex because, even though most times
// that would cause no issues, if any intervening memory write aliased
// [addr] then we would have undefined behavior. Undefined behavior isn't
// cool, so we try to avoid it. See the "Synchronization and semaphores"
// section of the "ARM Architecture Reference Manual."
assert(isScalarIntegerType(DestTy));
// We require the memory address to be naturally aligned. Given that is the
// case, the ldrex/strex accesses are atomic.
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(3)),
getConstantMemoryOrder(Instr->getArg(4)))) {
Func->setError("Unexpected memory ordering for AtomicCmpxchg");
return;
}
OperandARM32Mem *Mem;
Variable *TmpReg;
Variable *Expected, *ExpectedReg;
Variable *New, *NewReg;
Variable *Success = makeReg(IceType_i32);
Operand *_0 = Ctx->getConstantZero(IceType_i32);
InstARM32Label *Retry = InstARM32Label::create(Func, this);
if (DestTy == IceType_i64) {
Variable64On32 *TmpReg64 = makeI64RegPair();
Variable64On32 *New64 =
llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
Variable64On32 *NewReg64 = makeI64RegPair();
Variable64On32 *Expected64 =
llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
Variable64On32 *ExpectedReg64 = makeI64RegPair();
New64->initHiLo(Func);
New64->mustNotHaveReg();
Expected64->initHiLo(Func);
Expected64->mustNotHaveReg();
TmpReg = TmpReg64;
New = New64;
NewReg = NewReg64;
Expected = Expected64;
ExpectedReg = ExpectedReg64;
} else {
TmpReg = makeReg(DestTy);
New = Func->makeVariable(DestTy);
NewReg = makeReg(DestTy);
Expected = Func->makeVariable(DestTy);
ExpectedReg = makeReg(DestTy);
}
Mem = formMemoryOperand(Instr->getArg(0), DestTy);
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, Expected));
}
lowerAssign(InstAssign::create(Func, Expected, Instr->getArg(1)));
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, New));
}
lowerAssign(InstAssign::create(Func, New, Instr->getArg(2)));
_dmb();
Context.insert(Retry);
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, ExpectedReg, Expected));
}
lowerAssign(InstAssign::create(Func, ExpectedReg, Expected));
if (DestTy == IceType_i64) {
Context.insert(InstFakeDef::create(Func, NewReg, New));
}
lowerAssign(InstAssign::create(Func, NewReg, New));
_ldrex(TmpReg, Mem);
Context.insert(
InstFakeUse::create(Func, Context.getLastInserted()->getDest()));
if (DestTy == IceType_i64) {
auto *TmpReg64 = llvm::cast<Variable64On32>(TmpReg);
auto *ExpectedReg64 = llvm::cast<Variable64On32>(ExpectedReg);
// lowerAssign above has added fake-defs for TmpReg and ExpectedReg. Let's
// keep liveness happy, shall we?
Context.insert(InstFakeUse::create(Func, TmpReg));
Context.insert(InstFakeUse::create(Func, ExpectedReg));
_cmp(TmpReg64->getHi(), ExpectedReg64->getHi());
_cmp(TmpReg64->getLo(), ExpectedReg64->getLo(), CondARM32::EQ);
} else {
_cmp(TmpReg, ExpectedReg);
}
_strex(Success, NewReg, Mem, CondARM32::EQ);
if (DestTy == IceType_i64) {
auto *TmpReg64 = llvm::cast<Variable64On32>(TmpReg);
auto *Expected64 = llvm::cast<Variable64On32>(Expected);
_mov_redefined(Expected64->getHi(), TmpReg64->getHi(), CondARM32::NE);
_mov_redefined(Expected64->getLo(), TmpReg64->getLo(), CondARM32::NE);
auto *FakeDef = InstFakeDef::create(Func, Expected, TmpReg);
Context.insert(FakeDef);
FakeDef->setDestRedefined();
} else {
_mov_redefined(Expected, TmpReg, CondARM32::NE);
}
_cmp(Success, _0, CondARM32::EQ);
_br(Retry, CondARM32::NE);
_dmb();
lowerAssign(InstAssign::create(Func, Dest, Expected));
Context.insert(InstFakeUse::create(Func, Expected));
if (auto *New64 = llvm::dyn_cast<Variable64On32>(New)) {
Context.insert(InstFakeUse::create(Func, New64->getLo()));
Context.insert(InstFakeUse::create(Func, New64->getHi()));
} else {
Context.insert(InstFakeUse::create(Func, New));
}
return;
}
case Intrinsics::AtomicRMW: {
if (!Intrinsics::isMemoryOrderValid(
ID, getConstantMemoryOrder(Instr->getArg(3)))) {
Func->setError("Unexpected memory ordering for AtomicRMW");
return;
}
lowerAtomicRMW(
Dest, static_cast<uint32_t>(
llvm::cast<ConstantInteger32>(Instr->getArg(0))->getValue()),
Instr->getArg(1), Instr->getArg(2));
return;
}
case Intrinsics::Bswap: {
Operand *Val = Instr->getArg(0);
Type Ty = Val->getType();
if (Ty == IceType_i64) {
Val = legalizeUndef(Val);
Variable *Val_Lo = legalizeToReg(loOperand(Val));
Variable *Val_Hi = legalizeToReg(hiOperand(Val));
Variable *T_Lo = makeReg(IceType_i32);
Variable *T_Hi = makeReg(IceType_i32);
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
_rev(T_Lo, Val_Lo);
_rev(T_Hi, Val_Hi);
_mov(DestLo, T_Hi);
_mov(DestHi, T_Lo);
} else {
assert(Ty == IceType_i32 || Ty == IceType_i16);
Variable *ValR = legalizeToReg(Val);
Variable *T = makeReg(Ty);
_rev(T, ValR);
if (Val->getType() == IceType_i16) {
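// rev leaves the swapped halfword in the upper 16 bits, so shift it back
// down. E.g., for 0x1234, rev yields 0x34120000 and lsr #16 yields 0x3412,
// the byte-swapped i16.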
Operand *Sixteen =
legalize(Ctx->getConstantInt32(16), Legal_Reg | Legal_Flex);
_lsr(T, T, Sixteen);
}
_mov(Dest, T);
}
return;
}
case Intrinsics::Ctpop: {
Operand *Val = Instr->getArg(0);
InstCall *Call = makeHelperCall(isInt32Asserting32Or64(Val->getType())
? H_call_ctpop_i32
: H_call_ctpop_i64,
Dest, 1);
Call->addArg(Val);
lowerCall(Call);
// The popcount helpers always return 32-bit values, while the intrinsic's
// signature matches some 64-bit platforms' native instructions and expects
// to fill a 64-bit reg. Thus, clear the upper bits of the dest just in
// case the user doesn't do that in the IR or doesn't toss the bits via
// truncate.
if (Val->getType() == IceType_i64) {
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Constant *Zero = Ctx->getConstantZero(IceType_i32);
Variable *T = makeReg(Zero->getType());
_mov(T, Zero);
_mov(DestHi, T);
}
return;
}
case Intrinsics::Ctlz: {
// The "is zero undef" parameter is ignored and we always return a
// well-defined value.
Operand *Val = Instr->getArg(0);
Variable *ValLoR;
Variable *ValHiR = nullptr;
if (Val->getType() == IceType_i64) {
Val = legalizeUndef(Val);
ValLoR = legalizeToReg(loOperand(Val));
ValHiR = legalizeToReg(hiOperand(Val));
} else {
ValLoR = legalizeToReg(Val);
}
lowerCLZ(Dest, ValLoR, ValHiR);
return;
}
case Intrinsics::Cttz: {
// Essentially like Clz, but reverse the bits first.
Operand *Val = Instr->getArg(0);
Variable *ValLoR;
Variable *ValHiR = nullptr;
if (Val->getType() == IceType_i64) {
Val = legalizeUndef(Val);
ValLoR = legalizeToReg(loOperand(Val));
ValHiR = legalizeToReg(hiOperand(Val));
Variable *TLo = makeReg(IceType_i32);
Variable *THi = makeReg(IceType_i32);
_rbit(TLo, ValLoR);
_rbit(THi, ValHiR);
ValLoR = THi;
ValHiR = TLo;
} else {
ValLoR = legalizeToReg(Val);
Variable *T = makeReg(IceType_i32);
_rbit(T, ValLoR);
ValLoR = T;
}
lowerCLZ(Dest, ValLoR, ValHiR);
return;
}
case Intrinsics::Fabs: {
Type DestTy = Dest->getType();
Variable *T = makeReg(DestTy);
if (isVectorType(DestTy)) {
// Add a fake def to keep liveness consistent in the meantime.
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
_vabs(T, legalizeToReg(Instr->getArg(0)));
_mov(Dest, T);
return;
}
case Intrinsics::Longjmp: {
InstCall *Call = makeHelperCall(H_call_longjmp, nullptr, 2);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
lowerCall(Call);
return;
}
case Intrinsics::Memcpy: {
// In the future, we could potentially emit an inline memcpy/memset, etc.
// for intrinsic calls w/ a known length.
InstCall *Call = makeHelperCall(H_call_memcpy, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return;
}
case Intrinsics::Memmove: {
InstCall *Call = makeHelperCall(H_call_memmove, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(Instr->getArg(1));
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return;
}
case Intrinsics::Memset: {
// The value operand needs to be extended to a stack slot size because the
// PNaCl ABI requires arguments to be at least 32 bits wide.
Operand *ValOp = Instr->getArg(1);
assert(ValOp->getType() == IceType_i8);
Variable *ValExt = Func->makeVariable(stackSlotType());
lowerCast(InstCast::create(Func, InstCast::Zext, ValExt, ValOp));
// Technically, ARM has its own __aeabi_memset, but we can use plain
// memset too. The value and size arguments need to be flipped if we ever
// decide to use __aeabi_memset.
InstCall *Call = makeHelperCall(H_call_memset, nullptr, 3);
Call->addArg(Instr->getArg(0));
Call->addArg(ValExt);
Call->addArg(Instr->getArg(2));
lowerCall(Call);
return;
}
case Intrinsics::NaClReadTP: {
if (Ctx->getFlags().getUseSandboxing()) {
UnimplementedError(Func->getContext()->getFlags());
} else {
InstCall *Call = makeHelperCall(H_call_read_tp, Dest, 0);
lowerCall(Call);
}
return;
}
case Intrinsics::Setjmp: {
InstCall *Call = makeHelperCall(H_call_setjmp, Dest, 1);
Call->addArg(Instr->getArg(0));
lowerCall(Call);
return;
}
case Intrinsics::Sqrt: {
Variable *Src = legalizeToReg(Instr->getArg(0));
Variable *T = makeReg(Dest->getType());
_vsqrt(T, Src);
_mov(Dest, T);
return;
}
case Intrinsics::Stacksave: {
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
_mov(Dest, SP);
return;
}
case Intrinsics::Stackrestore: {
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
Operand *Val = legalize(Instr->getArg(0), Legal_Reg | Legal_Flex);
_mov_redefined(SP, Val);
return;
}
case Intrinsics::Trap:
_trap();
return;
case Intrinsics::UnknownIntrinsic:
Func->setError("Should not be lowering UnknownIntrinsic");
return;
}
return;
}
void TargetARM32::lowerCLZ(Variable *Dest, Variable *ValLoR, Variable *ValHiR) {
Type Ty = Dest->getType();
assert(Ty == IceType_i32 || Ty == IceType_i64);
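// For i64, the result is clz(hi) when the high word is non-zero, and
// 32 + clz(lo) otherwise; the code below computes both and selects via the
// predicated clz.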
Variable *T = makeReg(IceType_i32);
_clz(T, ValLoR);
if (Ty == IceType_i64) {
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *Zero =
legalize(Ctx->getConstantZero(IceType_i32), Legal_Reg | Legal_Flex);
Operand *ThirtyTwo =
legalize(Ctx->getConstantInt32(32), Legal_Reg | Legal_Flex);
_cmp(ValHiR, Zero);
Variable *T2 = makeReg(IceType_i32);
_add(T2, T, ThirtyTwo);
_clz(T2, ValHiR, CondARM32::NE);
// T2 is actually a source as well when the predicate is not AL (since it
// may leave T2 alone). We use _set_dest_redefined to prolong the liveness
// of T2 as if it was used as a source.
_set_dest_redefined();
_mov(DestLo, T2);
Variable *T3 = makeReg(Zero->getType());
_mov(T3, Zero);
_mov(DestHi, T3);
return;
}
_mov(Dest, T);
return;
}
void TargetARM32::lowerLoad(const InstLoad *Load) {
// A Load instruction can be treated the same as an Assign instruction, after
// the source operand is transformed into an OperandARM32Mem operand.
Type Ty = Load->getDest()->getType();
Operand *Src0 = formMemoryOperand(Load->getSourceAddress(), Ty);
Variable *DestLoad = Load->getDest();
// TODO(jvoung): handle folding opportunities. Sign and zero extension can
// be folded into a load.
InstAssign *Assign = InstAssign::create(Func, DestLoad, Src0);
lowerAssign(Assign);
}
void TargetARM32::doAddressOptLoad() {
UnimplementedError(Func->getContext()->getFlags());
}
void TargetARM32::randomlyInsertNop(float Probability,
RandomNumberGenerator &RNG) {
RandomNumberGeneratorWrapper RNGW(RNG);
if (RNGW.getTrueWithProbability(Probability)) {
UnimplementedError(Func->getContext()->getFlags());
}
}
void TargetARM32::lowerPhi(const InstPhi * /*Inst*/) {
Func->setError("Phi found in regular instruction list");
}
void TargetARM32::lowerRet(const InstRet *Inst) {
Variable *Reg = nullptr;
if (Inst->hasRetValue()) {
Operand *Src0 = Inst->getRetValue();
Type Ty = Src0->getType();
if (Ty == IceType_i64) {
Src0 = legalizeUndef(Src0);
Variable *R0 = legalizeToReg(loOperand(Src0), RegARM32::Reg_r0);
Variable *R1 = legalizeToReg(hiOperand(Src0), RegARM32::Reg_r1);
Reg = R0;
Context.insert(InstFakeUse::create(Func, R1));
} else if (Ty == IceType_f32) {
Variable *S0 = legalizeToReg(Src0, RegARM32::Reg_s0);
Reg = S0;
} else if (Ty == IceType_f64) {
Variable *D0 = legalizeToReg(Src0, RegARM32::Reg_d0);
Reg = D0;
} else if (isVectorType(Src0->getType())) {
Variable *Q0 = legalizeToReg(Src0, RegARM32::Reg_q0);
Reg = Q0;
} else {
Operand *Src0F = legalize(Src0, Legal_Reg | Legal_Flex);
Reg = makeReg(Src0F->getType(), RegARM32::Reg_r0);
_mov(Reg, Src0F, CondARM32::AL);
}
}
// Add a ret instruction even if sandboxing is enabled, because addEpilog
// explicitly looks for a ret instruction as a marker for where to insert the
// frame removal instructions. addEpilog is responsible for restoring the
// "lr" register as needed prior to this ret instruction.
_ret(getPhysicalRegister(RegARM32::Reg_lr), Reg);
// Add a fake use of sp to make sure sp stays alive for the entire function.
// Otherwise post-call sp adjustments get dead-code eliminated.
// TODO: Are there more places where the fake use should be inserted? E.g.
// "void f(int n){while(1) g(n);}" may not have a ret instruction.
Variable *SP = getPhysicalRegister(RegARM32::Reg_sp);
Context.insert(InstFakeUse::create(Func, SP));
}
void TargetARM32::lowerSelect(const InstSelect *Inst) {
Variable *Dest = Inst->getDest();
Type DestTy = Dest->getType();
Operand *SrcT = Inst->getTrueOperand();
Operand *SrcF = Inst->getFalseOperand();
Operand *Condition = Inst->getCondition();
if (isVectorType(DestTy)) {
Variable *T = makeReg(DestTy);
Context.insert(InstFakeDef::create(Func, T));
_mov(Dest, T);
UnimplementedError(Func->getContext()->getFlags());
return;
}
// TODO(jvoung): handle folding opportunities.
// cmp cond, #0; mov t, SrcF; mov_cond t, SrcT; mov dest, t
Variable *CmpOpnd0 = legalizeToReg(Condition);
Operand *CmpOpnd1 = Ctx->getConstantZero(IceType_i32);
_cmp(CmpOpnd0, CmpOpnd1);
static constexpr CondARM32::Cond Cond = CondARM32::NE;
if (DestTy == IceType_i64) {
SrcT = legalizeUndef(SrcT);
SrcF = legalizeUndef(SrcF);
// Set the low portion.
Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
Operand *SrcFLo = legalize(loOperand(SrcF), Legal_Reg | Legal_Flex);
Variable *TLo = makeReg(SrcFLo->getType());
_mov(TLo, SrcFLo);
Operand *SrcTLo = legalize(loOperand(SrcT), Legal_Reg | Legal_Flex);
_mov_redefined(TLo, SrcTLo, Cond);
_mov(DestLo, TLo);
// Set the high portion.
Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
Operand *SrcFHi = legalize(hiOperand(SrcF), Legal_Reg | Legal_Flex);
Variable *THi = makeReg(SrcFHi->getType());
_mov(THi, SrcFHi);
Operand *SrcTHi = legalize(hiOperand(SrcT), Legal_Reg | Legal_Flex);
_mov_redefined(THi, SrcTHi, Cond);
_mov(DestHi, THi);
return;
}
if (isFloatingType(DestTy)) {
Variable *T = makeReg(DestTy);
SrcF = legalizeToReg(SrcF);
assert(DestTy == SrcF->getType());
_mov(T, SrcF);
SrcT = legalizeToReg(SrcT);
assert(DestTy == SrcT->getType());
_mov(T, SrcT, Cond);
_set_dest_redefined();
_mov(Dest, T);
return;
}
SrcF = legalize(SrcF, Legal_Reg | Legal_Flex);
Variable *T = makeReg(SrcF->getType());
_mov(T, SrcF);
SrcT = legalize(SrcT, Legal_Reg | Legal_Flex);
_mov_redefined(T, SrcT, Cond);
_mov(Dest, T);
}
void TargetARM32::lowerStore(const InstStore *Inst) {
Operand *Value = Inst->getData();
Operand *Addr = Inst->getAddr();
OperandARM32Mem *NewAddr = formMemoryOperand(Addr, Value->getType());
Type Ty = NewAddr->getType();
if (Ty == IceType_i64) {
Value = legalizeUndef(Value);
Variable *ValueHi = legalizeToReg(hiOperand(Value));
Variable *ValueLo = legalizeToReg(loOperand(Value));
_str(ValueHi, llvm::cast<OperandARM32Mem>(hiOperand(NewAddr)));
_str(ValueLo, llvm::cast<OperandARM32Mem>(loOperand(NewAddr)));
} else {
Variable *ValueR = legalizeToReg(Value);
_str(ValueR, NewAddr);
}
}
void TargetARM32::doAddressOptStore() {
UnimplementedError(Func->getContext()->getFlags());
}
void TargetARM32::lowerSwitch(const InstSwitch *Inst) {
// This implements the most naive possible lowering.
// cmp a,val[0]; jeq label[0]; cmp a,val[1]; jeq label[1]; ... jmp default
Operand *Src0 = Inst->getComparison();
SizeT NumCases = Inst->getNumCases();
if (Src0->getType() == IceType_i64) {
Src0 = legalizeUndef(Src0);
Variable *Src0Lo = legalizeToReg(loOperand(Src0));
Variable *Src0Hi = legalizeToReg(hiOperand(Src0));
for (SizeT I = 0; I < NumCases; ++I) {
Operand *ValueLo = Ctx->getConstantInt32(Inst->getValue(I));
Operand *ValueHi = Ctx->getConstantInt32(Inst->getValue(I) >> 32);
ValueLo = legalize(ValueLo, Legal_Reg | Legal_Flex);
ValueHi = legalize(ValueHi, Legal_Reg | Legal_Flex);
_cmp(Src0Lo, ValueLo);
_cmp(Src0Hi, ValueHi, CondARM32::EQ);
_br(Inst->getLabel(I), CondARM32::EQ);
}
_br(Inst->getLabelDefault());
return;
}
// 32 bit integer
Variable *Src0Var = legalizeToReg(Src0);
for (SizeT I = 0; I < NumCases; ++I) {
Operand *Value = Ctx->getConstantInt32(Inst->getValue(I));
Value = legalize(Value, Legal_Reg | Legal_Flex);
_cmp(Src0Var, Value);
_br(Inst->getLabel(I), CondARM32::EQ);
}
_br(Inst->getLabelDefault());
}
void TargetARM32::lowerUnreachable(const InstUnreachable * /*Inst*/) {
_trap();
}
void TargetARM32::prelowerPhis() {
PhiLowering::prelowerPhis32Bit<TargetARM32>(this, Context.getNode(), Func);
}
Variable *TargetARM32::makeVectorOfZeros(Type Ty, int32_t RegNum) {
Variable *Reg = makeReg(Ty, RegNum);
Context.insert(InstFakeDef::create(Func, Reg));
UnimplementedError(Func->getContext()->getFlags());
return Reg;
}
// Helper for legalize() to emit the right code to lower an operand to a
// register of the appropriate type.
Variable *TargetARM32::copyToReg(Operand *Src, int32_t RegNum) {
Type Ty = Src->getType();
Variable *Reg = makeReg(Ty, RegNum);
_mov(Reg, Src);
return Reg;
}
Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
int32_t RegNum) {
Type Ty = From->getType();
// Assert that a physical register is allowed. To date, all calls to
// legalize() allow a physical register. Legal_Flex converts registers to
// OperandARM32FlexReg as needed.
assert(Allowed & Legal_Reg);
// Go through the various types of operands: OperandARM32Mem,
// OperandARM32Flex, Constant, and Variable. Given the above assertion, if
// the type of operand is not legal (e.g., OperandARM32Mem and !Legal_Mem), we
// can always copy to a register.
if (auto Mem = llvm::dyn_cast<OperandARM32Mem>(From)) {
static const struct {
bool CanHaveOffset;
bool CanHaveIndex;
} MemTraits[] = {
#define X(tag, elementty, int_width, vec_width, sbits, ubits, rraddr) \
{ (ubits) > 0, rraddr } \
,
ICETYPEARM32_TABLE
#undef X
};
// Before doing anything with a Mem operand, we need to ensure that the
// Base and Index components are in physical registers.
Variable *Base = Mem->getBase();
Variable *Index = Mem->getIndex();
ConstantInteger32 *Offset = Mem->getOffset();
assert(Index == nullptr || Offset == nullptr);
Variable *RegBase = nullptr;
Variable *RegIndex = nullptr;
if (Base) {
RegBase = legalizeToReg(Base);
}
if (Index) {
RegIndex = legalizeToReg(Index);
if (!MemTraits[Ty].CanHaveIndex) {
Variable *T = makeReg(IceType_i32, getReservedTmpReg());
_add(T, RegBase, RegIndex);
RegBase = T;
RegIndex = nullptr;
}
}
if (Offset && Offset->getValue() != 0) {
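// E.g., a 32-bit load/store can encode an unsigned 12-bit immediate offset
// (up to 4095) directly; larger offsets are first folded into the base via
// the reserved temporary register.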
static constexpr bool SignExt = false;
if (!MemTraits[Ty].CanHaveOffset ||
!OperandARM32Mem::canHoldOffset(Ty, SignExt, Offset->getValue())) {
Variable *T = legalizeToReg(Offset, getReservedTmpReg());
_add(T, T, RegBase);
RegBase = T;
Offset = llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(0));
}
}
// Create a new operand if there was a change.
if (Base != RegBase || Index != RegIndex) {
// There is only a reg +/- reg or reg + imm form.
// Figure out which to re-create.
if (RegBase && RegIndex) {
Mem = OperandARM32Mem::create(Func, Ty, RegBase, RegIndex,
Mem->getShiftOp(), Mem->getShiftAmt(),
Mem->getAddrMode());
} else {
Mem = OperandARM32Mem::create(Func, Ty, RegBase, Offset,
Mem->getAddrMode());
}
}
if (Allowed & Legal_Mem) {
From = Mem;
} else {
Variable *Reg = makeReg(Ty, RegNum);
_ldr(Reg, Mem);
From = Reg;
}
return From;
}
if (auto Flex = llvm::dyn_cast<OperandARM32Flex>(From)) {
if (!(Allowed & Legal_Flex)) {
if (auto FlexReg = llvm::dyn_cast<OperandARM32FlexReg>(Flex)) {
if (FlexReg->getShiftOp() == OperandARM32::kNoShift) {
From = FlexReg->getReg();
// Fall through and let From be checked as a Variable below, where it
// may or may not need a register.
} else {
return copyToReg(Flex, RegNum);
}
} else {
return copyToReg(Flex, RegNum);
}
} else {
return From;
}
}
if (llvm::isa<Constant>(From)) {
if (llvm::isa<ConstantUndef>(From)) {
From = legalizeUndef(From, RegNum);
if (isVectorType(Ty))
return From;
}
// There should be no constants of vector type (other than undef).
assert(!isVectorType(Ty));
bool CanBeFlex = Allowed & Legal_Flex;
if (auto *C32 = llvm::dyn_cast<ConstantInteger32>(From)) {
uint32_t RotateAmt;
uint32_t Immed_8;
uint32_t Value = static_cast<uint32_t>(C32->getValue());
// Check if the immediate will fit in a Flexible second operand, if a
// Flexible second operand is allowed. We need to know the exact value,
// so that rules out relocatable constants. Also try the inverse and use
// MVN if possible.
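// E.g., 0x00AB0000 encodes directly (0xAB rotated right by 16), while
// 0xFFFFFF00 does not, but its inverse 0x000000FF does, so we can emit
// "mvn reg, #0xFF" instead.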
if (CanBeFlex &&
OperandARM32FlexImm::canHoldImm(Value, &RotateAmt, &Immed_8)) {
return OperandARM32FlexImm::create(Func, Ty, Immed_8, RotateAmt);
} else if (CanBeFlex && OperandARM32FlexImm::canHoldImm(
~Value, &RotateAmt, &Immed_8)) {
auto InvertedFlex =
OperandARM32FlexImm::create(Func, Ty, Immed_8, RotateAmt);
Variable *Reg = makeReg(Ty, RegNum);
_mvn(Reg, InvertedFlex);
return Reg;
} else {
// Do a movw/movt to a register.
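// E.g., 0x12345678 becomes "movw reg, #0x5678" followed by
// "movt reg, #0x1234".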
Variable *Reg = makeReg(Ty, RegNum);
uint32_t UpperBits = (Value >> 16) & 0xFFFF;
_movw(Reg,
UpperBits != 0 ? Ctx->getConstantInt32(Value & 0xFFFF) : C32);
if (UpperBits != 0) {
_movt(Reg, Ctx->getConstantInt32(UpperBits));
}
return Reg;
}
} else if (auto *C = llvm::dyn_cast<ConstantRelocatable>(From)) {
Variable *Reg = makeReg(Ty, RegNum);
_movw(Reg, C);
_movt(Reg, C);
return Reg;
} else {
assert(isScalarFloatingType(Ty));
// Load floats/doubles from literal pool.
// TODO(jvoung): Allow certain immediates to be encoded directly in an
// operand. See Table A7-18 of the ARM manual: "Floating-point modified
// immediate constants". Or, for 32-bit floating point numbers, just
// encode the raw bits into a movw/movt pair to GPR, and vmov to an SREG,
// instead of using a movw/movt pair to get the const-pool address then
// loading to SREG.
std::string Buffer;
llvm::raw_string_ostream StrBuf(Buffer);
llvm::cast<Constant>(From)->emitPoolLabel(StrBuf, Ctx);
llvm::cast<Constant>(From)->setShouldBePooled(true);
Constant *Offset = Ctx->getConstantSym(0, StrBuf.str(), true);
Variable *BaseReg = makeReg(getPointerType());
_movw(BaseReg, Offset);
_movt(BaseReg, Offset);
From = formMemoryOperand(BaseReg, Ty);
return copyToReg(From, RegNum);
}
}
if (auto *Var = llvm::dyn_cast<Variable>(From)) {
// Check if the variable is guaranteed a physical register. This can happen
// either when the variable is pre-colored or when it is assigned infinite
// weight.
bool MustHaveRegister = (Var->hasReg() || Var->mustHaveReg());
// We need a new physical register for the operand if:
//   - Mem is not allowed and Var isn't guaranteed a physical register, or
//   - RegNum is required and Var->getRegNum() doesn't match.
if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
(RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
From = copyToReg(From, RegNum);
}
return From;
}
llvm_unreachable("Unhandled operand kind in legalize()");
return From;
}
/// Provide a trivial wrapper to legalize() for this common usage.
Variable *TargetARM32::legalizeToReg(Operand *From, int32_t RegNum) {
return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
}
/// Legalize undef values to concrete values.
Operand *TargetARM32::legalizeUndef(Operand *From, int32_t RegNum) {
Type Ty = From->getType();
if (llvm::isa<ConstantUndef>(From)) {
// Lower undefs to zero. Another option is to lower undefs to an
// uninitialized register; however, using an uninitialized register results
// in less predictable code.
//
// If in the future the implementation is changed to lower undef values to
// uninitialized registers, a FakeDef will be needed:
//   Context.insert(InstFakeDef::create(Func, Reg));
// This ensures that the live range of Reg is not overestimated. If the
// constant being lowered is a 64-bit value, then the result should be split,
// and the lo and hi components will need to go in uninitialized registers.
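// For illustration: an undef i32 operand becomes the constant 0 below,
// while an undef <4 x i32> operand becomes a register of zeros built by
// makeVectorOfZeros.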
if (isVectorType(Ty))
return makeVectorOfZeros(Ty, RegNum);
return Ctx->getConstantZero(Ty);
}
return From;
}
OperandARM32Mem *TargetARM32::formMemoryOperand(Operand *Operand, Type Ty) {
OperandARM32Mem *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand);
// Address mode optimization may already have created an OperandARM32Mem, in
// which case no further transformation is needed.
if (Mem) {
return llvm::cast<OperandARM32Mem>(legalize(Mem));
}
// If we didn't do address mode optimization, then we only have a base/offset
// to work with. ARM always requires a base register, so just use that to
// hold the operand.
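// For illustration: a pointer value held in variable %p is first copied
// into a base register (say r2), yielding the memory operand [r2, #0].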
Variable *Base = legalizeToReg(Operand);
return OperandARM32Mem::create(
Func, Ty, Base,
llvm::cast<ConstantInteger32>(Ctx->getConstantZero(IceType_i32)));
}
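// makeI64RegPair creates an i64 "container" variable that must be allocated
// to a register pair, with its lo/hi halves pinned to that pair rather than
// given registers of their own. This is needed, e.g., for ldrexd/strexd,
// which require a consecutive register pair.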
Variable64On32 *TargetARM32::makeI64RegPair() {
Variable64On32 *Reg =
llvm::cast<Variable64On32>(Func->makeVariable(IceType_i64));
Reg->setMustHaveReg();
Reg->initHiLo(Func);
Reg->getLo()->setMustNotHaveReg();
Reg->getHi()->setMustNotHaveReg();
return Reg;
}
Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
// There aren't any 64-bit integer registers for ARM32.
assert(Type != IceType_i64);
Variable *Reg = Func->makeVariable(Type);
if (RegNum == Variable::NoRegister)
Reg->setMustHaveReg();
else
Reg->setRegNum(RegNum);
return Reg;
}
void TargetARM32::alignRegisterPow2(Variable *Reg, uint32_t Align) {
assert(llvm::isPowerOf2_32(Align));
uint32_t RotateAmt;
uint32_t Immed_8;
Operand *Mask;
// Use AND or BIC to mask off the bits, depending on which immediate fits (if
// either fits at all). Assume Align is usually small, in which case BIC
// works better. Either way, Reg is rounded down to the nearest multiple of
// Align.
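// For illustration: Align = 16 emits "bic Reg, Reg, #15", while
// Align = 0x1000000 emits "and Reg, Reg, #0xFF000000", since 0xFFFFFF is not
// a valid rotated 8-bit immediate but 0xFF000000 (0xFF ror 8) is.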
if (OperandARM32FlexImm::canHoldImm(Align - 1, &RotateAmt, &Immed_8)) {
Mask = legalize(Ctx->getConstantInt32(Align - 1), Legal_Reg | Legal_Flex);
_bic(Reg, Reg, Mask);
} else {
Mask = legalize(Ctx->getConstantInt32(-Align), Legal_Reg | Legal_Flex);
_and(Reg, Reg, Mask);
}
}
void TargetARM32::postLower() {
if (Ctx->getFlags().getOptLevel() == Opt_m1)
return;
markRedefinitions();
}
void TargetARM32::makeRandomRegisterPermutation(
llvm::SmallVectorImpl<int32_t> &Permutation,
const llvm::SmallBitVector &ExcludeRegisters, uint64_t Salt) const {
(void)Permutation;
(void)ExcludeRegisters;
(void)Salt;
UnimplementedError(Func->getContext()->getFlags());
}
void TargetARM32::emit(const ConstantInteger32 *C) const {
if (!BuildDefs::dump())
return;
Ostream &Str = Ctx->getStrEmit();
Str << getConstantPrefix() << C->getValue();
}
void TargetARM32::emit(const ConstantInteger64 *) const {
llvm::report_fatal_error("Not expecting to emit 64-bit integers");
}
void TargetARM32::emit(const ConstantFloat *C) const {
(void)C;
UnimplementedError(Ctx->getFlags());
}
void TargetARM32::emit(const ConstantDouble *C) const {
(void)C;
UnimplementedError(Ctx->getFlags());
}
void TargetARM32::emit(const ConstantUndef *) const {
llvm::report_fatal_error("undef value encountered by emitter.");
}
TargetDataARM32::TargetDataARM32(GlobalContext *Ctx)
: TargetDataLowering(Ctx) {}
void TargetDataARM32::lowerGlobals(const VariableDeclarationList &Vars,
const IceString &SectionSuffix) {
switch (Ctx->getFlags().getOutFileType()) {
case FT_Elf: {
ELFObjectWriter *Writer = Ctx->getObjectWriter();
Writer->writeDataSection(Vars, llvm::ELF::R_ARM_ABS32, SectionSuffix);
} break;
case FT_Asm:
case FT_Iasm: {
const IceString &TranslateOnly = Ctx->getFlags().getTranslateOnly();
OstreamLocker L(Ctx);
for (const VariableDeclaration *Var : Vars) {
if (GlobalContext::matchSymbolName(Var->getName(), TranslateOnly)) {
emitGlobal(*Var, SectionSuffix);
}
}
} break;
}
}
namespace {
template <typename T> struct ConstantPoolEmitterTraits;
static_assert(sizeof(uint64_t) == 8,
"uint64_t is supposed to be 8 bytes wide.");
// TODO(jpp): implement the following when implementing constant randomization:
// * template <> struct ConstantPoolEmitterTraits<uint8_t>
// * template <> struct ConstantPoolEmitterTraits<uint16_t>
// * template <> struct ConstantPoolEmitterTraits<uint32_t>
template <> struct ConstantPoolEmitterTraits<float> {
using ConstantType = ConstantFloat;
static constexpr Type IceType = IceType_f32;
// AsmTag and TypeName can't be constexpr because llvm::StringRef is unhappy
// about them being constexpr.
static const char AsmTag[];
static const char TypeName[];
static uint64_t bitcastToUint64(float Value) {
static_assert(sizeof(Value) == sizeof(uint32_t),
"Float should be 4 bytes.");
// Use llvm::FloatToBits() (from MathExtras.h, already included) instead of
// a pointer reinterpret_cast, which would violate strict aliasing.
return static_cast<uint64_t>(llvm::FloatToBits(Value));
}
};
const char ConstantPoolEmitterTraits<float>::AsmTag[] = ".long";
const char ConstantPoolEmitterTraits<float>::TypeName[] = "f32";
template <> struct ConstantPoolEmitterTraits<double> {
using ConstantType = ConstantDouble;
static constexpr Type IceType = IceType_f64;
static const char AsmTag[];
static const char TypeName[];
static uint64_t bitcastToUint64(double Value) {
static_assert(sizeof(double) == sizeof(uint64_t),
"Double should be 8 bytes.");
// As above, llvm::DoubleToBits() avoids the strict-aliasing violation of a
// pointer reinterpret_cast.
return llvm::DoubleToBits(Value);
}
};
const char ConstantPoolEmitterTraits<double>::AsmTag[] = ".quad";
const char ConstantPoolEmitterTraits<double>::TypeName[] = "f64";
template <typename T>
void emitConstant(
Ostream &Str, const GlobalContext *Ctx,
const typename ConstantPoolEmitterTraits<T>::ConstantType *Const) {
using Traits = ConstantPoolEmitterTraits<T>;
Const->emitPoolLabel(Str, Ctx);
Str << ":\n\t" << Traits::AsmTag << "\t0x";
T Value = Const->getValue();
Str.write_hex(Traits::bitcastToUint64(Value));
Str << "\t@" << Traits::TypeName << " " << Value << "\n";
}
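// For illustration: for a pooled f32 constant 1.0, emitConstant produces
// roughly (the label text is whatever emitPoolLabel prints):
//   <label>:
//           .long 0x3f800000 @f32 1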
template <typename T> void emitConstantPool(GlobalContext *Ctx) {
if (!BuildDefs::dump()) {
return;
}
using Traits = ConstantPoolEmitterTraits<T>;
static constexpr size_t MinimumAlignment = 4;
SizeT Align = std::max(MinimumAlignment, typeAlignInBytes(Traits::IceType));
assert((Align % 4) == 0 && "Constants should be aligned");
Ostream &Str = Ctx->getStrEmit();
ConstantList Pool = Ctx->getConstantPool(Traits::IceType);
Str << "\t.section\t.rodata.cst" << Align << ",\"aM\",%progbits," << Align
<< "\n"
<< "\t.align\t" << Align << "\n";
if (Ctx->getFlags().shouldReorderPooledConstants()) {
// TODO(jpp): add constant pooling.
UnimplementedError(Ctx->getFlags());
}
for (Constant *C : Pool) {
if (!C->getShouldBePooled()) {
continue;
}
// The pool for Traits::IceType only contains Traits::ConstantType values,
// and emitConstant dereferences its argument unconditionally, so use cast<>
// rather than dyn_cast<> (which could return nullptr).
emitConstant<T>(Str, Ctx, llvm::cast<typename Traits::ConstantType>(C));
}
}
} // end of anonymous namespace
void TargetDataARM32::lowerConstants() {
if (Ctx->getFlags().getDisableTranslation())
return;
switch (Ctx->getFlags().getOutFileType()) {
case FT_Elf:
UnimplementedError(Ctx->getFlags());
break;
case FT_Asm:
case FT_Iasm: {
OstreamLocker L(Ctx);
emitConstantPool<float>(Ctx);
emitConstantPool<double>(Ctx);
break;
}
}
}
void TargetDataARM32::lowerJumpTables() {
if (Ctx->getFlags().getDisableTranslation())
return;
switch (Ctx->getFlags().getOutFileType()) {
case FT_Elf:
UnimplementedError(Ctx->getFlags());
break;
case FT_Asm:
// Already emitted from Cfg
break;
case FT_Iasm: {
// TODO(kschimpf): Fill this in when we get more information.
break;
}
}
}
TargetHeaderARM32::TargetHeaderARM32(GlobalContext *Ctx)
: TargetHeaderLowering(Ctx), CPUFeatures(Ctx->getFlags()) {}
void TargetHeaderARM32::lower() {
OstreamLocker L(Ctx);
Ostream &Str = Ctx->getStrEmit();
Str << ".syntax unified\n";
// Emit build attributes in format: .eabi_attribute TAG, VALUE. See Sec. 2 of
// "Addenda to, and Errata in the ABI for the ARM architecture"
// http://infocenter.arm.com
// /help/topic/com.arm.doc.ihi0045d/IHI0045D_ABI_addenda.pdf
//
// Tag_conformance should be emitted first in a file-scope sub-subsection of
// the first public subsection of the attributes.
Str << ".eabi_attribute 67, \"2.09\" @ Tag_conformance\n";
// Chromebooks are at least Cortex-A15, but use A9 for wider compatibility.
// For some reason, the LLVM ARM asm parser has the .cpu directive override
// the mattr specified on the command line. So to test hwdiv, we need to set
// the .cpu directive higher (can't just rely on --mattr=...).
if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
Str << ".cpu cortex-a15\n";
} else {
Str << ".cpu cortex-a9\n";
}
Str << ".eabi_attribute 6, 10 @ Tag_CPU_arch: ARMv7\n"
<< ".eabi_attribute 7, 65 @ Tag_CPU_arch_profile: App profile\n";
Str << ".eabi_attribute 8, 1 @ Tag_ARM_ISA_use: Yes\n"
<< ".eabi_attribute 9, 2 @ Tag_THUMB_ISA_use: Thumb-2\n";
Str << ".fpu neon\n"
<< ".eabi_attribute 17, 1 @ Tag_ABI_PCS_GOT_use: permit directly\n"
<< ".eabi_attribute 20, 1 @ Tag_ABI_FP_denormal\n"
<< ".eabi_attribute 21, 1 @ Tag_ABI_FP_exceptions\n"
<< ".eabi_attribute 23, 3 @ Tag_ABI_FP_number_model: IEEE 754\n"
<< ".eabi_attribute 34, 1 @ Tag_CPU_unaligned_access\n"
<< ".eabi_attribute 24, 1 @ Tag_ABI_align_needed: 8-byte\n"
<< ".eabi_attribute 25, 1 @ Tag_ABI_align_preserved: 8-byte\n"
<< ".eabi_attribute 28, 1 @ Tag_ABI_VFP_args\n"
<< ".eabi_attribute 36, 1 @ Tag_FP_HP_extension\n"
<< ".eabi_attribute 38, 1 @ Tag_ABI_FP_16bit_format\n"
<< ".eabi_attribute 42, 1 @ Tag_MPextension_use\n"
<< ".eabi_attribute 68, 1 @ Tag_Virtualization_use\n";
if (CPUFeatures.hasFeature(TargetARM32Features::HWDivArm)) {
Str << ".eabi_attribute 44, 2 @ Tag_DIV_use\n";
}
// Technically R9 is used for TLS with Sandboxing, and we reserve it.
// However, for compatibility with current NaCl LLVM, don't claim that.
Str << ".eabi_attribute 14, 3 @ Tag_ABI_PCS_R9_use: Not used\n";
}
} // end of namespace Ice