Subzero ARM: lowerArguments (GPR), basic legalize(), and lowerRet(i32, i64).

Adds basic assignment instructions, mov, mvn, movw, movt,
ldr, etc. in order to copy around the first few integer
(i32, i64) arguments out of r0 - r3, and then return them.

The "mov" instruction is a bit special and can actually
be a "str" when the dest is a stack slot.

Model the Memory operand types, and the "flexible Operand2".
Add a few tests demonstrating the flexibility of the
immediate encoding.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1127963004
diff --git a/src/IceInst.cpp b/src/IceInst.cpp
index b06f6df..0e6be75 100644
--- a/src/IceInst.cpp
+++ b/src/IceInst.cpp
@@ -873,4 +873,20 @@
   Inst::dump(Func);
 }
 
+bool checkForRedundantAssign(const Variable *Dest, const Operand *Source) {
+  const auto SrcVar = llvm::dyn_cast<const Variable>(Source);
+  if (!SrcVar)
+    return false;
+  if (Dest->hasReg() && Dest->getRegNum() == SrcVar->getRegNum()) {
+    // TODO: On x86-64, instructions like "mov eax, eax" are used to
+    // clear the upper 32 bits of rax.  We need to recognize and
+    // preserve these.
+    return true;
+  }
+  if (!Dest->hasReg() && !SrcVar->hasReg() &&
+      Dest->getStackOffset() == SrcVar->getStackOffset())
+    return true;
+  return false;
+}
+
 } // end of namespace Ice
diff --git a/src/IceInst.h b/src/IceInst.h
index 28fb046..b3a4a6d 100644
--- a/src/IceInst.h
+++ b/src/IceInst.h
@@ -67,6 +67,10 @@
     FakeKill,     // not part of LLVM/PNaCl bitcode
     Target        // target-specific low-level ICE
                   // Anything >= Target is an InstTarget subclass.
+                  // Note that the value-spaces are shared across targets.
+                  // To avoid confusion over the definition of shared values,
+                  // an object specific to one target should never be passed
+                  // to a different target.
   };
   InstKind getKind() const { return Kind; }
 
@@ -926,6 +930,8 @@
   ~InstTarget() override {}
 };
 
+bool checkForRedundantAssign(const Variable *Dest, const Operand *Source);
+
 } // end of namespace Ice
 
 namespace llvm {
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 9f54e85..6ac1698 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -37,12 +37,57 @@
 #undef X
 };
 
+const struct InstARM32ShiftAttributes_ {
+  const char *EmitString;
+} InstARM32ShiftAttributes[] = {
+#define X(tag, emit)                                                           \
+  { emit }                                                                     \
+  ,
+    ICEINSTARM32SHIFT_TABLE
+#undef X
+};
+
 } // end of anonymous namespace
 
 const char *InstARM32::getWidthString(Type Ty) {
   return TypeARM32Attributes[Ty].WidthString;
 }
 
+void emitTwoAddr(const char *Opcode, const Inst *Inst, const Cfg *Func) {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(Inst->getSrcSize() == 2);
+  Variable *Dest = Inst->getDest();
+  assert(Dest == Inst->getSrc(0));
+  Operand *Src1 = Inst->getSrc(1);
+  Str << "\t" << Opcode << "\t";
+  Dest->emit(Func);
+  Str << ", ";
+  Src1->emit(Func);
+}
+
+OperandARM32Mem::OperandARM32Mem(Cfg * /* Func */, Type Ty, Variable *Base,
+                                 ConstantInteger32 *ImmOffset, AddrMode Mode)
+    : OperandARM32(kMem, Ty), Base(Base), ImmOffset(ImmOffset), Index(nullptr),
+      ShiftOp(kNoShift), ShiftAmt(0), Mode(Mode) {
+  // The Neg modes are only needed for Reg +/- Reg.
+  assert(!isNegAddrMode());
+  NumVars = 1;
+  Vars = &this->Base;
+}
+
+OperandARM32Mem::OperandARM32Mem(Cfg *Func, Type Ty, Variable *Base,
+                                 Variable *Index, ShiftKind ShiftOp,
+                                 uint16_t ShiftAmt, AddrMode Mode)
+    : OperandARM32(kMem, Ty), Base(Base), ImmOffset(0), Index(Index),
+      ShiftOp(ShiftOp), ShiftAmt(ShiftAmt), Mode(Mode) {
+  NumVars = 2;
+  Vars = Func->allocateArrayOf<Variable *>(2);
+  Vars[0] = Base;
+  Vars[1] = Index;
+}
+
 bool OperandARM32Mem::canHoldOffset(Type Ty, bool SignExt, int32_t Offset) {
   int32_t Bits = SignExt ? TypeARM32Attributes[Ty].SExtAddrOffsetBits
                          : TypeARM32Attributes[Ty].ZExtAddrOffsetBits;
@@ -55,6 +100,52 @@
   return Utils::IsAbsoluteUint(Bits, Offset);
 }
 
+OperandARM32FlexImm::OperandARM32FlexImm(Cfg * /* Func */, Type Ty,
+                                         uint32_t Imm, uint32_t RotateAmt)
+    : OperandARM32Flex(kFlexImm, Ty), Imm(Imm), RotateAmt(RotateAmt) {
+  NumVars = 0;
+  Vars = nullptr;
+}
+
+bool OperandARM32FlexImm::canHoldImm(uint32_t Immediate, uint32_t *RotateAmt,
+                                     uint32_t *Immed_8) {
+  // Avoid the more expensive test for frequent small immediate values.
+  if (Immediate <= 0xFF) {
+    *RotateAmt = 0;
+    *Immed_8 = Immediate;
+    return true;
+  }
+  // Note that immediate must be unsigned for the test to work correctly.
+  for (int Rot = 1; Rot < 16; Rot++) {
+    uint32_t Imm8 = Utils::rotateLeft32(Immediate, 2 * Rot);
+    if (Imm8 <= 0xFF) {
+      *RotateAmt = Rot;
+      *Immed_8 = Imm8;
+      return true;
+    }
+  }
+  return false;
+}
+
+OperandARM32FlexReg::OperandARM32FlexReg(Cfg *Func, Type Ty, Variable *Reg,
+                                         ShiftKind ShiftOp, Operand *ShiftAmt)
+    : OperandARM32Flex(kFlexReg, Ty), Reg(Reg), ShiftOp(ShiftOp),
+      ShiftAmt(ShiftAmt) {
+  NumVars = 1;
+  Variable *ShiftVar = llvm::dyn_cast_or_null<Variable>(ShiftAmt);
+  if (ShiftVar)
+    ++NumVars;
+  Vars = Func->allocateArrayOf<Variable *>(NumVars);
+  Vars[0] = Reg;
+  if (ShiftVar)
+    Vars[1] = ShiftVar;
+}
+
+InstARM32Ldr::InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem)
+    : InstARM32(Func, InstARM32::Ldr, 1, Dest) {
+  addSource(Mem);
+}
+
 InstARM32Ret::InstARM32Ret(Cfg *Func, Variable *LR, Variable *Source)
     : InstARM32(Func, InstARM32::Ret, Source ? 2 : 1, nullptr) {
   addSource(LR);
@@ -64,6 +155,14 @@
 
 // ======================== Dump routines ======================== //
 
+// Two-addr ops
+template <> const char *InstARM32Movt::Opcode = "movt";
+// Unary ops
+template <> const char *InstARM32Movw::Opcode = "movw";
+template <> const char *InstARM32Mvn::Opcode = "mvn";
+// Mov-like ops
+template <> const char *InstARM32Mov::Opcode = "mov";
+
 void InstARM32::dump(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
@@ -72,6 +171,101 @@
   Inst::dump(Func);
 }
 
+template <> void InstARM32Mov::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Variable *Dest = getDest();
+  if (Dest->hasReg()) {
+    Str << "\t"
+        << "mov"
+        << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+    getSrc(0)->emit(Func);
+  } else {
+    Variable *Src0 = llvm::cast<Variable>(getSrc(0));
+    assert(Src0->hasReg());
+    Str << "\t"
+        << "str"
+        << "\t";
+    Src0->emit(Func);
+    Str << ", ";
+    Dest->emit(Func);
+  }
+}
+
+template <> void InstARM32Mov::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Ldr::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  assert(getDest()->hasReg());
+  Type Ty = getSrc(0)->getType();
+  Str << "\t"
+      << "ldr" << getWidthString(Ty) << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  getSrc(0)->emit(Func);
+}
+
+void InstARM32Ldr::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 1); // single OperandARM32Mem source; matches emit()
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Ldr::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpDest(Func);
+  Str << "ldr." << getSrc(0)->getType() << " ";
+  dumpSources(Func);
+}
+
+template <> void InstARM32Movw::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  Str << "\t" << Opcode << "\t";
+  getDest()->emit(Func);
+  Str << ", ";
+  Constant *Src0 = llvm::cast<Constant>(getSrc(0));
+  if (auto CR = llvm::dyn_cast<ConstantRelocatable>(Src0)) {
+    Str << "#:lower16:";
+    CR->emitWithoutPrefix(Func->getTarget());
+  } else {
+    Src0->emit(Func);
+  }
+}
+
+template <> void InstARM32Movt::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Variable *Dest = getDest();
+  Constant *Src1 = llvm::cast<Constant>(getSrc(1));
+  Str << "\t" << Opcode << "\t";
+  Dest->emit(Func);
+  Str << ", ";
+  if (auto CR = llvm::dyn_cast<ConstantRelocatable>(Src1)) {
+    Str << "#:upper16:";
+    CR->emitWithoutPrefix(Func->getTarget());
+  } else {
+    Src1->emit(Func);
+  }
+}
+
 void InstARM32Ret::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
@@ -98,4 +292,119 @@
   dumpSources(Func);
 }
 
+void OperandARM32Mem::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "[";
+  getBase()->emit(Func);
+  switch (getAddrMode()) {
+  case PostIndex:
+  case NegPostIndex:
+    Str << "], ";
+    break;
+  default:
+    Str << ", ";
+    break;
+  }
+  if (isRegReg()) {
+    if (isNegAddrMode()) {
+      Str << "-";
+    }
+    getIndex()->emit(Func);
+    if (getShiftOp() != kNoShift) {
+      Str << ", " << InstARM32ShiftAttributes[getShiftOp()].EmitString << " #"
+          << getShiftAmt();
+    }
+  } else {
+    getOffset()->emit(Func);
+  }
+  switch (getAddrMode()) {
+  case Offset:
+  case NegOffset:
+    Str << "]";
+    break;
+  case PreIndex:
+  case NegPreIndex:
+    Str << "]!";
+    break;
+  case PostIndex:
+  case NegPostIndex:
+    // Brace is already closed off.
+    break;
+  }
+}
+
+void OperandARM32Mem::dump(const Cfg *Func, Ostream &Str) const {
+  if (!ALLOW_DUMP)
+    return;
+  Str << "[";
+  if (Func)
+    getBase()->dump(Func);
+  else
+    getBase()->dump(Str);
+  Str << ", ";
+  if (isRegReg()) {
+    if (isNegAddrMode()) {
+      Str << "-";
+    }
+    if (Func)
+      getIndex()->dump(Func);
+    else
+      getIndex()->dump(Str);
+    if (getShiftOp() != kNoShift) {
+      Str << ", " << InstARM32ShiftAttributes[getShiftOp()].EmitString << " #"
+          << getShiftAmt();
+    }
+  } else {
+    getOffset()->dump(Func, Str);
+  }
+  Str << "] AddrMode==" << getAddrMode() << "\n";
+}
+
+void OperandARM32FlexImm::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  uint32_t Imm = getImm();
+  uint32_t RotateAmt = getRotateAmt();
+  Str << "#" << Utils::rotateRight32(Imm, 2 * RotateAmt);
+}
+
+void OperandARM32FlexImm::dump(const Cfg * /* Func */, Ostream &Str) const {
+  if (!ALLOW_DUMP)
+    return;
+  uint32_t Imm = getImm();
+  uint32_t RotateAmt = getRotateAmt();
+  Str << "#(" << Imm << " ror 2*" << RotateAmt << ")";
+}
+
+void OperandARM32FlexReg::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  getReg()->emit(Func);
+  if (getShiftOp() != kNoShift) {
+    Str << ", " << InstARM32ShiftAttributes[getShiftOp()].EmitString << " ";
+    getShiftAmt()->emit(Func);
+  }
+}
+
+void OperandARM32FlexReg::dump(const Cfg *Func, Ostream &Str) const {
+  if (!ALLOW_DUMP)
+    return;
+  Variable *Reg = getReg();
+  if (Func)
+    Reg->dump(Func);
+  else
+    Reg->dump(Str);
+  if (getShiftOp() != kNoShift) {
+    Str << ", " << InstARM32ShiftAttributes[getShiftOp()].EmitString << " ";
+    if (Func)
+      getShiftAmt()->dump(Func);
+    else
+      getShiftAmt()->dump(Str);
+  }
+}
+
 } // end of namespace Ice
diff --git a/src/IceInstARM32.def b/src/IceInstARM32.def
index 8eca14b..7970e58 100644
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -81,4 +81,14 @@
   X(IceType_v4f32, IceType_f32 , "",  0,  0)                            \
 //#define X(tag, elementty, width, sbits, ubits)
 
+// Shifter types for Data-processing operands as defined in section A5.1.2.
+#define ICEINSTARM32SHIFT_TABLE                 \
+  /* enum value, emit */                        \
+  X(LSL, "lsl")                                 \
+  X(LSR, "lsr")                                 \
+  X(ASR, "asr")                                 \
+  X(ROR, "ror")                                 \
+  X(RRX, "rrx")                                 \
+//#define X(tag, emit)
+
 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index e4e6c49..2e3f764 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -25,15 +25,22 @@
 
 class TargetARM32;
 
-// OperandARM32 extends the Operand hierarchy.
-// TODO(jvoung): Add the OperandARM32Mem and OperandARM32Flex.
+// OperandARM32 extends the Operand hierarchy.  Its subclasses are
+// OperandARM32Mem and OperandARM32Flex.
 class OperandARM32 : public Operand {
   OperandARM32() = delete;
   OperandARM32(const OperandARM32 &) = delete;
   OperandARM32 &operator=(const OperandARM32 &) = delete;
 
 public:
-  enum OperandKindARM32 { k__Start = Operand::kTarget };
+  enum OperandKindARM32 {
+    k__Start = Operand::kTarget,
+    kMem,
+    kFlexStart,
+    kFlexImm = kFlexStart,
+    kFlexReg,
+    kFlexEnd = kFlexReg
+  };
 
   enum ShiftKind {
     kNoShift = -1,
@@ -56,18 +63,178 @@
 
 // OperandARM32Mem represents a memory operand in any of the various ARM32
 // addressing modes.
-// TODO(jvoung): Fill out more.
 class OperandARM32Mem : public OperandARM32 {
   OperandARM32Mem() = delete;
   OperandARM32Mem(const OperandARM32Mem &) = delete;
   OperandARM32Mem &operator=(const OperandARM32Mem &) = delete;
 
 public:
+  // Memory operand addressing mode.
+  // The enum value also carries the encoding.
+  // TODO(jvoung): unify with the assembler.
+  enum AddrMode {
+    // bit encoding P U W
+    Offset = (8 | 4 | 0) << 21,      // offset (w/o writeback to base)
+    PreIndex = (8 | 4 | 1) << 21,    // pre-indexed addressing with writeback
+    PostIndex = (0 | 4 | 0) << 21,   // post-indexed addressing with writeback
+    NegOffset = (8 | 0 | 0) << 21,   // negative offset (w/o writeback to base)
+    NegPreIndex = (8 | 0 | 1) << 21, // negative pre-indexed with writeback
+    NegPostIndex = (0 | 0 | 0) << 21 // negative post-indexed with writeback
+  };
+
+  // Provide two constructors.
+  // NOTE: The Variable-typed operands have to be registers.
+  //
+  // (1) Reg + Imm. The Immediate actually has a limited number of bits
+  // for encoding, so check canHoldOffset first. It cannot handle
+  // general Constant operands like ConstantRelocatable, since a relocatable
+  // can potentially take up too many bits.
+  static OperandARM32Mem *create(Cfg *Func, Type Ty, Variable *Base,
+                                 ConstantInteger32 *ImmOffset = nullptr,
+                                 AddrMode Mode = Offset) {
+    return new (Func->allocate<OperandARM32Mem>())
+        OperandARM32Mem(Func, Ty, Base, ImmOffset, Mode);
+  }
+  // (2) Reg +/- Reg with an optional shift of some kind and amount.
+  // Note that this mode is disallowed in the NaCl sandbox.
+  static OperandARM32Mem *create(Cfg *Func, Type Ty, Variable *Base,
+                                 Variable *Index, ShiftKind ShiftOp = kNoShift,
+                                 uint16_t ShiftAmt = 0,
+                                 AddrMode Mode = Offset) {
+    return new (Func->allocate<OperandARM32Mem>())
+        OperandARM32Mem(Func, Ty, Base, Index, ShiftOp, ShiftAmt, Mode);
+  }
+  Variable *getBase() const { return Base; }
+  ConstantInteger32 *getOffset() const { return ImmOffset; }
+  Variable *getIndex() const { return Index; }
+  ShiftKind getShiftOp() const { return ShiftOp; }
+  uint16_t getShiftAmt() const { return ShiftAmt; }
+  AddrMode getAddrMode() const { return Mode; }
+
+  bool isRegReg() const { return Index != nullptr; }
+  bool isNegAddrMode() const { return (Mode & (4 << 21)) == 0; } // U bit clear
+
+  void emit(const Cfg *Func) const override;
+  using OperandARM32::dump;
+  void dump(const Cfg *Func, Ostream &Str) const override;
+
+  static bool classof(const Operand *Operand) {
+    return Operand->getKind() == static_cast<OperandKind>(kMem);
+  }
+
   // Return true if a load/store instruction for an element of type Ty
   // can encode the Offset directly in the immediate field of the 32-bit
   // ARM instruction. For some types, if the load is Sign extending, then
   // the range is reduced.
   static bool canHoldOffset(Type Ty, bool SignExt, int32_t Offset);
+
+private:
+  OperandARM32Mem(Cfg *Func, Type Ty, Variable *Base,
+                  ConstantInteger32 *ImmOffset, AddrMode Mode);
+  OperandARM32Mem(Cfg *Func, Type Ty, Variable *Base, Variable *Index,
+                  ShiftKind ShiftOp, uint16_t ShiftAmt, AddrMode Mode);
+  ~OperandARM32Mem() override {}
+  Variable *Base;
+  ConstantInteger32 *ImmOffset;
+  Variable *Index;
+  ShiftKind ShiftOp;
+  uint16_t ShiftAmt;
+  AddrMode Mode;
+};
+
+// OperandARM32Flex represent the "flexible second operand" for
+// data-processing instructions. It can be a rotatable 8-bit constant, or
+// a register with an optional shift operand. The shift amount can even be
+// a third register.
+class OperandARM32Flex : public OperandARM32 {
+  OperandARM32Flex() = delete;
+  OperandARM32Flex(const OperandARM32Flex &) = delete;
+  OperandARM32Flex &operator=(const OperandARM32Flex &) = delete;
+
+public:
+  static bool classof(const Operand *Operand) {
+    return static_cast<OperandKind>(kFlexStart) <= Operand->getKind() &&
+           Operand->getKind() <= static_cast<OperandKind>(kFlexEnd);
+  }
+
+protected:
+  OperandARM32Flex(OperandKindARM32 Kind, Type Ty) : OperandARM32(Kind, Ty) {}
+  ~OperandARM32Flex() override {}
+};
+
+// Rotated immediate variant.
+class OperandARM32FlexImm : public OperandARM32Flex {
+  OperandARM32FlexImm() = delete;
+  OperandARM32FlexImm(const OperandARM32FlexImm &) = delete;
+  OperandARM32FlexImm &operator=(const OperandARM32FlexImm &) = delete;
+
+public:
+  // Immed_8 rotated by an even number of bits (2 * RotateAmt).
+  static OperandARM32FlexImm *create(Cfg *Func, Type Ty, uint32_t Imm,
+                                     uint32_t RotateAmt) {
+    return new (Func->allocate<OperandARM32FlexImm>())
+        OperandARM32FlexImm(Func, Ty, Imm, RotateAmt);
+  }
+
+  void emit(const Cfg *Func) const override;
+  using OperandARM32::dump;
+  void dump(const Cfg *Func, Ostream &Str) const override;
+
+  static bool classof(const Operand *Operand) {
+    return Operand->getKind() == static_cast<OperandKind>(kFlexImm);
+  }
+
+  // Return true if the Immediate can fit in the ARM flexible operand.
+  // Fills in the out-params RotateAmt and Immed_8 if Immediate fits.
+  static bool canHoldImm(uint32_t Immediate, uint32_t *RotateAmt,
+                         uint32_t *Immed_8);
+
+  uint32_t getImm() const { return Imm; }
+  uint32_t getRotateAmt() const { return RotateAmt; }
+
+private:
+  OperandARM32FlexImm(Cfg *Func, Type Ty, uint32_t Imm, uint32_t RotateAmt);
+  ~OperandARM32FlexImm() override {}
+
+  uint32_t Imm;
+  uint32_t RotateAmt;
+};
+
+// Shifted register variant.
+class OperandARM32FlexReg : public OperandARM32Flex {
+  OperandARM32FlexReg() = delete;
+  OperandARM32FlexReg(const OperandARM32FlexReg &) = delete;
+  OperandARM32FlexReg &operator=(const OperandARM32FlexReg &) = delete;
+
+public:
+  // Register with immediate/reg shift amount and shift operation.
+  static OperandARM32FlexReg *create(Cfg *Func, Type Ty, Variable *Reg,
+                                     ShiftKind ShiftOp, Operand *ShiftAmt) {
+    return new (Func->allocate<OperandARM32FlexReg>())
+        OperandARM32FlexReg(Func, Ty, Reg, ShiftOp, ShiftAmt);
+  }
+
+  void emit(const Cfg *Func) const override;
+  using OperandARM32::dump;
+  void dump(const Cfg *Func, Ostream &Str) const override;
+
+  static bool classof(const Operand *Operand) {
+    return Operand->getKind() == static_cast<OperandKind>(kFlexReg);
+  }
+
+  Variable *getReg() const { return Reg; }
+  ShiftKind getShiftOp() const { return ShiftOp; }
+  // ShiftAmt can represent an immediate or a register.
+  Operand *getShiftAmt() const { return ShiftAmt; }
+
+private:
+  OperandARM32FlexReg(Cfg *Func, Type Ty, Variable *Reg, ShiftKind ShiftOp,
+                      Operand *ShiftAmt);
+  ~OperandARM32FlexReg() override {}
+
+  Variable *Reg;
+  ShiftKind ShiftOp;
+  Operand *ShiftAmt;
 };
 
 class InstARM32 : public InstTarget {
@@ -76,7 +243,15 @@
   InstARM32 &operator=(const InstARM32 &) = delete;
 
 public:
-  enum InstKindARM32 { k__Start = Inst::Target, Ret };
+  enum InstKindARM32 {
+    k__Start = Inst::Target,
+    Mov,
+    Movt,
+    Movw,
+    Mvn,
+    Ret,
+    Ldr
+  };
 
   static const char *getWidthString(Type Ty);
 
@@ -91,6 +266,168 @@
   }
 };
 
+void emitTwoAddr(const char *Opcode, const Inst *Inst, const Cfg *Func);
+
+// TODO(jvoung): add condition codes if instruction can be predicated.
+
+// Instructions of the form x := op(y).
+template <InstARM32::InstKindARM32 K>
+class InstARM32UnaryopGPR : public InstARM32 {
+  InstARM32UnaryopGPR() = delete;
+  InstARM32UnaryopGPR(const InstARM32UnaryopGPR &) = delete;
+  InstARM32UnaryopGPR &operator=(const InstARM32UnaryopGPR &) = delete;
+
+public:
+  static InstARM32UnaryopGPR *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstARM32UnaryopGPR>())
+        InstARM32UnaryopGPR(Func, Dest, Src);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    Ostream &Str = Func->getContext()->getStrEmit();
+    assert(getSrcSize() == 1);
+    Str << "\t" << Opcode << "\t";
+    getDest()->emit(Func);
+    Str << ", ";
+    getSrc(0)->emit(Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm_unreachable("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32UnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstARM32(Func, K, 1, Dest) {
+    addSource(Src);
+  }
+  ~InstARM32UnaryopGPR() override {}
+  static const char *Opcode;
+};
+
+// Instructions of the form x := x op y.
+template <InstARM32::InstKindARM32 K>
+class InstARM32TwoAddrGPR : public InstARM32 {
+  InstARM32TwoAddrGPR() = delete;
+  InstARM32TwoAddrGPR(const InstARM32TwoAddrGPR &) = delete;
+  InstARM32TwoAddrGPR &operator=(const InstARM32TwoAddrGPR &) = delete;
+
+public:
+  // Dest must be a register.
+  static InstARM32TwoAddrGPR *create(Cfg *Func, Variable *Dest, Operand *Src) {
+    return new (Func->allocate<InstARM32TwoAddrGPR>())
+        InstARM32TwoAddrGPR(Func, Dest, Src);
+  }
+  void emit(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    emitTwoAddr(Opcode, this, Func);
+  }
+  void emitIAS(const Cfg *Func) const override {
+    (void)Func;
+    llvm::report_fatal_error("Not yet implemented");
+  }
+  void dump(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    dumpDest(Func);
+    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32TwoAddrGPR(Cfg *Func, Variable *Dest, Operand *Src)
+      : InstARM32(Func, K, 2, Dest) {
+    addSource(Dest);
+    addSource(Src);
+  }
+  ~InstARM32TwoAddrGPR() override {}
+  static const char *Opcode;
+};
+
+// Base class for assignment instructions.
+// These can be tested for redundancy (and elided if redundant).
+template <InstARM32::InstKindARM32 K>
+class InstARM32Movlike : public InstARM32 {
+  InstARM32Movlike() = delete;
+  InstARM32Movlike(const InstARM32Movlike &) = delete;
+  InstARM32Movlike &operator=(const InstARM32Movlike &) = delete;
+
+public:
+  static InstARM32Movlike *create(Cfg *Func, Variable *Dest, Operand *Source) {
+    return new (Func->allocate<InstARM32Movlike>())
+        InstARM32Movlike(Func, Dest, Source);
+  }
+  bool isRedundantAssign() const override {
+    return checkForRedundantAssign(getDest(), getSrc(0));
+  }
+  bool isSimpleAssign() const override { return true; }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override {
+    if (!ALLOW_DUMP)
+      return;
+    Ostream &Str = Func->getContext()->getStrDump();
+    Str << Opcode << "." << getDest()->getType() << " ";
+    dumpDest(Func);
+    Str << ", ";
+    dumpSources(Func);
+  }
+  static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
+
+private:
+  InstARM32Movlike(Cfg *Func, Variable *Dest, Operand *Source)
+      : InstARM32(Func, K, 1, Dest) {
+    addSource(Source);
+  }
+  ~InstARM32Movlike() override {}
+
+  static const char *Opcode;
+};
+
+// Move instruction (variable <- flex). This is more of a pseudo-inst.
+// If var is a register, then we use "mov". If var is stack, then we use
+// "str" to store to the stack.
+typedef InstARM32Movlike<InstARM32::Mov> InstARM32Mov;
+// MovT leaves the bottom bits alone so dest is also a source.
+// This helps indicate that a previous MovW setting dest is not dead code.
+typedef InstARM32TwoAddrGPR<InstARM32::Movt> InstARM32Movt;
+typedef InstARM32UnaryopGPR<InstARM32::Movw> InstARM32Movw;
+typedef InstARM32UnaryopGPR<InstARM32::Mvn> InstARM32Mvn;
+
+// Load instruction.
+class InstARM32Ldr : public InstARM32 {
+  InstARM32Ldr() = delete;
+  InstARM32Ldr(const InstARM32Ldr &) = delete;
+  InstARM32Ldr &operator=(const InstARM32Ldr &) = delete;
+
+public:
+  // Dest must be a register.
+  static InstARM32Ldr *create(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem) {
+    return new (Func->allocate<InstARM32Ldr>()) InstARM32Ldr(Func, Dest, Mem);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Ldr); }
+
+private:
+  InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem);
+  ~InstARM32Ldr() override {}
+};
+
 // Ret pseudo-instruction.  This is actually a "bx" instruction with
 // an "lr" register operand, but epilogue lowering will search for a Ret
 // instead of a generic "bx". This instruction also takes a Source
@@ -116,6 +453,13 @@
   ~InstARM32Ret() override {}
 };
 
+// Declare partial template specializations of emit() methods that
+// already have default implementations.  Without this, there is the
+// possibility of ODR violations and link errors.
+
+template <> void InstARM32Movw::emit(const Cfg *Func) const;
+template <> void InstARM32Movt::emit(const Cfg *Func) const;
+
 } // end of namespace Ice
 
 #endif // SUBZERO_SRC_ICEINSTARM32_H
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 2ac95db..e7863ca 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -889,22 +889,6 @@
   }
 }
 
-bool checkForRedundantAssign(const Variable *Dest, const Operand *Source) {
-  const auto SrcVar = llvm::dyn_cast<const Variable>(Source);
-  if (!SrcVar)
-    return false;
-  if (Dest->hasReg() && Dest->getRegNum() == SrcVar->getRegNum()) {
-    // TODO: On x86-64, instructions like "mov eax, eax" are used to
-    // clear the upper 32 bits of rax.  We need to recognize and
-    // preserve these.
-    return true;
-  }
-  if (!Dest->hasReg() && !SrcVar->hasReg() &&
-      Dest->getStackOffset() == SrcVar->getStackOffset())
-    return true;
-  return false;
-}
-
 // In-place ops
 template <> const char *InstX8632Bswap::Opcode = "bswap";
 template <> const char *InstX8632Neg::Opcode = "neg";
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 4ea2b36..bee9843 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -956,8 +956,6 @@
   static const char *Opcode;
 };
 
-bool checkForRedundantAssign(const Variable *Dest, const Operand *Source);
-
 // Base class for assignment instructions
 template <InstX8632::InstKindX8632 K>
 class InstX8632Movlike : public InstX8632 {
diff --git a/src/IceOperand.h b/src/IceOperand.h
index 430a565..4ae217c 100644
--- a/src/IceOperand.h
+++ b/src/IceOperand.h
@@ -46,7 +46,10 @@
     kVariable_Target, // leave space for target-specific variable kinds
     kVariable_Num = kVariable_Target + MaxTargetKinds,
     // Target-specific operand classes use kTarget as the starting
-    // point for their Kind enum space.
+    // point for their Kind enum space. Note that the value-spaces are shared
+    // across targets. To avoid confusion over the definition of shared
+    // values, an object specific to one target should never be passed
+    // to a different target.
     kTarget
   };
   OperandKind getKind() const { return Kind; }
diff --git a/src/IceTargetLowering.cpp b/src/IceTargetLowering.cpp
index 2fa0397..db332cb 100644
--- a/src/IceTargetLowering.cpp
+++ b/src/IceTargetLowering.cpp
@@ -228,6 +228,22 @@
   LinearScan.scan(RegMask, Ctx->getFlags().shouldRandomizeRegAlloc());
 }
 
+void TargetLowering::inferTwoAddress() {
+  // Find two-address non-SSA instructions where Dest==Src0, and set
+  // the DestNonKillable flag to keep liveness analysis consistent.
+  for (auto Inst = Context.getCur(), E = Context.getNext(); Inst != E; ++Inst) {
+    if (Inst->isDeleted())
+      continue;
+    if (Variable *Dest = Inst->getDest()) {
+      // TODO(stichnot): We may need to consider all source
+      // operands, not just the first one, if using 3-address
+      // instructions.
+      if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest)
+        Inst->setDestNonKillable();
+    }
+  }
+}
+
 InstCall *TargetLowering::makeHelperCall(const IceString &Name, Variable *Dest,
                                          SizeT MaxSrcs) {
   const bool HasTailCall = false;
diff --git a/src/IceTargetLowering.h b/src/IceTargetLowering.h
index f488930..c875b26 100644
--- a/src/IceTargetLowering.h
+++ b/src/IceTargetLowering.h
@@ -255,6 +255,10 @@
   // expansion before returning.
   virtual void postLower() {}
 
+  // Find two-address non-SSA instructions and set the DestNonKillable flag
+  // to keep liveness analysis consistent.
+  void inferTwoAddress();
+
   // Make a call to an external helper function.
   InstCall *makeHelperCall(const IceString &Name, Variable *Dest,
                            SizeT MaxSrcs);
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 7e96d04..d65b546 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -39,6 +39,10 @@
     abort();
   }
 }
+
+// The maximum number of arguments to pass in GPR registers.
+const uint32_t ARM32_MAX_GPR_ARG = 4;
+
 } // end of anonymous namespace
 
 TargetARM32::TargetARM32(Cfg *Func)
@@ -275,12 +279,92 @@
     llvm::report_fatal_error("Illegal stack offset");
   }
   const Type FrameSPTy = IceType_i32;
-  Str << "[" << getRegName(getFrameOrStackReg(), FrameSPTy) << ", " << Offset
-      << "]";
+  Str << "[" << getRegName(getFrameOrStackReg(), FrameSPTy);
+  if (Offset != 0) {
+    Str << ", " << getConstantPrefix() << Offset;
+  }
+  Str << "]";
 }
 
 void TargetARM32::lowerArguments() {
-  UnimplementedError(Func->getContext()->getFlags());
+  VarList &Args = Func->getArgs();
+  // The first few integer type parameters can use r0-r3, regardless of their
+  // position relative to the floating-point/vector arguments in the argument
+  // list. Floating-point and vector arguments can use q0-q3 (aka d0-d7,
+  // s0-s15).
+  unsigned NumGPRRegsUsed = 0;
+
+  // For each register argument, replace Arg in the argument list with the
+  // home register.  Then generate an instruction in the prolog to copy the
+  // home register to the assigned location of Arg.
+  Context.init(Func->getEntryNode());
+  Context.setInsertPoint(Context.getCur());
+
+  for (SizeT I = 0, E = Args.size(); I < E; ++I) {
+    Variable *Arg = Args[I];
+    Type Ty = Arg->getType();
+    // TODO(jvoung): handle float/vector types.
+    if (isVectorType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+      continue;
+    } else if (isFloatingType(Ty)) {
+      UnimplementedError(Func->getContext()->getFlags());
+      continue;
+    } else if (Ty == IceType_i64) {
+      if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
+        continue;
+      int32_t RegLo = RegARM32::Reg_r0 + NumGPRRegsUsed;
+      int32_t RegHi = 0;
+      ++NumGPRRegsUsed;
+      // Always start i64 registers at an even register, so this may end
+      // up padding away a register.
+      if (RegLo % 2 != 0) {
+        ++RegLo;
+        ++NumGPRRegsUsed;
+      }
+      // If this leaves us without room to consume another register,
+      // leave any previously speculatively consumed registers as consumed.
+      if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
+        continue;
+      RegHi = RegARM32::Reg_r0 + NumGPRRegsUsed;
+      ++NumGPRRegsUsed;
+      Variable *RegisterArg = Func->makeVariable(Ty);
+      Variable *RegisterLo = Func->makeVariable(IceType_i32);
+      Variable *RegisterHi = Func->makeVariable(IceType_i32);
+      if (ALLOW_DUMP) {
+        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+        RegisterLo->setName(Func, "home_reg_lo:" + Arg->getName(Func));
+        RegisterHi->setName(Func, "home_reg_hi:" + Arg->getName(Func));
+      }
+      RegisterLo->setRegNum(RegLo);
+      RegisterLo->setIsArg();
+      RegisterHi->setRegNum(RegHi);
+      RegisterHi->setIsArg();
+      RegisterArg->setLoHi(RegisterLo, RegisterHi);
+      RegisterArg->setIsArg();
+      Arg->setIsArg(false);
+
+      Args[I] = RegisterArg;
+      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+      continue;
+    } else {
+      assert(Ty == IceType_i32);
+      if (NumGPRRegsUsed >= ARM32_MAX_GPR_ARG)
+        continue;
+      int32_t RegNum = RegARM32::Reg_r0 + NumGPRRegsUsed;
+      ++NumGPRRegsUsed;
+      Variable *RegisterArg = Func->makeVariable(Ty);
+      if (ALLOW_DUMP) {
+        RegisterArg->setName(Func, "home_reg:" + Arg->getName(Func));
+      }
+      RegisterArg->setRegNum(RegNum);
+      RegisterArg->setIsArg();
+      Arg->setIsArg(false);
+
+      Args[I] = RegisterArg;
+      Context.insert(InstAssign::create(Func, Arg, RegisterArg));
+    }
+  }
 }
 
 Type TargetARM32::stackSlotType() { return IceType_i32; }
@@ -295,6 +379,116 @@
   UnimplementedError(Func->getContext()->getFlags());
 }
 
+void TargetARM32::split64(Variable *Var) {
+  assert(Var->getType() == IceType_i64);
+  Variable *Lo = Var->getLo();
+  Variable *Hi = Var->getHi();
+  if (Lo) {
+    assert(Hi);
+    return;
+  }
+  assert(Hi == nullptr);
+  Lo = Func->makeVariable(IceType_i32);
+  Hi = Func->makeVariable(IceType_i32);
+  if (ALLOW_DUMP) {
+    Lo->setName(Func, Var->getName(Func) + "__lo");
+    Hi->setName(Func, Var->getName(Func) + "__hi");
+  }
+  Var->setLoHi(Lo, Hi);
+  if (Var->getIsArg()) {
+    Lo->setIsArg();
+    Hi->setIsArg();
+  }
+}
+
+Operand *TargetARM32::loOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64);
+  if (Operand->getType() != IceType_i64)
+    return Operand;
+  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
+    split64(Var);
+    return Var->getLo();
+  }
+  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    return Ctx->getConstantInt32(static_cast<uint32_t>(Const->getValue()));
+  }
+  if (OperandARM32Mem *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
+    // Conservatively disallow memory operands with side-effects (pre/post
+    // increment) in case of duplication.
+    assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
+           Mem->getAddrMode() == OperandARM32Mem::NegOffset);
+    if (Mem->isRegReg()) {
+      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
+                                     Mem->getIndex(), Mem->getShiftOp(),
+                                     Mem->getShiftAmt(), Mem->getAddrMode());
+    } else {
+      return OperandARM32Mem::create(Func, IceType_i32, Mem->getBase(),
+                                     Mem->getOffset(), Mem->getAddrMode());
+    }
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
+Operand *TargetARM32::hiOperand(Operand *Operand) {
+  assert(Operand->getType() == IceType_i64);
+  if (Operand->getType() != IceType_i64)
+    return Operand;
+  if (Variable *Var = llvm::dyn_cast<Variable>(Operand)) {
+    split64(Var);
+    return Var->getHi();
+  }
+  if (ConstantInteger64 *Const = llvm::dyn_cast<ConstantInteger64>(Operand)) {
+    return Ctx->getConstantInt32(
+        static_cast<uint32_t>(Const->getValue() >> 32));
+  }
+  if (OperandARM32Mem *Mem = llvm::dyn_cast<OperandARM32Mem>(Operand)) {
+    // Conservatively disallow memory operands with side-effects
+    // in case of duplication.
+    assert(Mem->getAddrMode() == OperandARM32Mem::Offset ||
+           Mem->getAddrMode() == OperandARM32Mem::NegOffset);
+    const Type SplitType = IceType_i32;
+    if (Mem->isRegReg()) {
+      // We have to make a temp variable T, and add 4 to either Base or Index.
+      // The Index may be shifted, so adding 4 to it would be scaled by the
+      // shift amount. Thus, prefer T := Base + 4, and use T as the new Base.
+      Variable *Base = Mem->getBase();
+      Constant *Four = Ctx->getConstantInt32(4);
+      Variable *NewBase = Func->makeVariable(Base->getType());
+      lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add, NewBase,
+                                             Base, Four));
+      return OperandARM32Mem::create(Func, SplitType, NewBase, Mem->getIndex(),
+                                     Mem->getShiftOp(), Mem->getShiftAmt(),
+                                     Mem->getAddrMode());
+    } else {
+      Variable *Base = Mem->getBase();
+      ConstantInteger32 *Offset = Mem->getOffset();
+      assert(!Utils::WouldOverflowAdd(Offset->getValue(), 4));
+      int32_t NextOffsetVal = Offset->getValue() + 4;
+      const bool SignExt = false;
+      if (!OperandARM32Mem::canHoldOffset(SplitType, SignExt, NextOffsetVal)) {
+        // We have to make a temp variable and add 4 to either Base or Offset.
+        // If we add 4 to Offset, this will convert a non-RegReg addressing
+        // mode into a RegReg addressing mode. Since NaCl sandboxing disallows
+        // RegReg addressing modes, prefer adding to base and replacing instead.
+        // Thus we leave the old offset alone.
+        Constant *Four = Ctx->getConstantInt32(4);
+        Variable *NewBase = Func->makeVariable(Base->getType());
+        lowerArithmetic(InstArithmetic::create(Func, InstArithmetic::Add,
+                                               NewBase, Base, Four));
+        Base = NewBase;
+      } else {
+        Offset =
+            llvm::cast<ConstantInteger32>(Ctx->getConstantInt32(NextOffsetVal));
+      }
+      return OperandARM32Mem::create(Func, SplitType, Base, Offset,
+                                     Mem->getAddrMode());
+    }
+  }
+  llvm_unreachable("Unsupported operand type");
+  return nullptr;
+}
+
 llvm::SmallBitVector TargetARM32::getRegisterSet(RegSetMask Include,
                                                  RegSetMask Exclude) const {
   llvm::SmallBitVector Registers(RegARM32::Reg_NUM);
@@ -338,70 +532,126 @@
 }
 
 void TargetARM32::lowerArithmetic(const InstArithmetic *Inst) {
-  switch (Inst->getOp()) {
-  case InstArithmetic::_num:
-    llvm_unreachable("Unknown arithmetic operator");
-    break;
-  case InstArithmetic::Add:
+  Variable *Dest = Inst->getDest();
+  // TODO(jvoung): Should be able to flip Src0 and Src1 if it is easier
+  // to legalize Src0 to flex or Src1 to flex and there is a reversible
+  // instruction. E.g., reverse subtract with immediate, register vs
+  // register, immediate.
+  // Or it may be the case that the operands aren't swapped, but the
+  // bits can be flipped and a different operation applied.
+  // E.g., use BIC (bit clear) instead of AND for some masks.
+  Variable *Src0 = legalizeToVar(Inst->getSrc(0));
+  Operand *Src1 = legalize(Inst->getSrc(1), Legal_Reg | Legal_Flex);
+  (void)Src0;
+  (void)Src1;
+  if (Dest->getType() == IceType_i64) {
     UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::And:
+  } else if (isVectorType(Dest->getType())) {
     UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Or:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Xor:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Sub:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Mul:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Shl:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Lshr:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Ashr:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Udiv:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Sdiv:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Urem:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Srem:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Fadd:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Fsub:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Fmul:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Fdiv:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
-  case InstArithmetic::Frem:
-    UnimplementedError(Func->getContext()->getFlags());
-    break;
+  } else { // Dest->getType() is non-i64 scalar
+    switch (Inst->getOp()) {
+    case InstArithmetic::_num:
+      llvm_unreachable("Unknown arithmetic operator");
+      break;
+    case InstArithmetic::Add: {
+      UnimplementedError(Func->getContext()->getFlags());
+      // Variable *T = makeReg(Dest->getType());
+      // _add(T, Src0, Src1);
+      // _mov(Dest, T);
+    } break;
+    case InstArithmetic::And:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Or:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Xor:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Sub:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Mul:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Shl:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Lshr:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Ashr:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Udiv:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Sdiv:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Urem:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Srem:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Fadd:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Fsub:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Fmul:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Fdiv:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case InstArithmetic::Frem:
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    }
   }
 }
 
 void TargetARM32::lowerAssign(const InstAssign *Inst) {
-  (void)Inst;
-  UnimplementedError(Func->getContext()->getFlags());
+  Variable *Dest = Inst->getDest();
+  Operand *Src0 = Inst->getSrc(0);
+  assert(Dest->getType() == Src0->getType());
+  if (Dest->getType() == IceType_i64) {
+    Src0 = legalize(Src0);
+    Operand *Src0Lo = loOperand(Src0);
+    Operand *Src0Hi = hiOperand(Src0);
+    Variable *DestLo = llvm::cast<Variable>(loOperand(Dest));
+    Variable *DestHi = llvm::cast<Variable>(hiOperand(Dest));
+    Variable *T_Lo = nullptr, *T_Hi = nullptr;
+    _mov(T_Lo, Src0Lo);
+    _mov(DestLo, T_Lo);
+    _mov(T_Hi, Src0Hi);
+    _mov(DestHi, T_Hi);
+  } else {
+    Operand *SrcR;
+    if (Dest->hasReg()) {
+      // If Dest already has a physical register, then legalize the
+      // Src operand into a Variable with the same register
+      // assignment.  This is mostly a workaround for advanced phi
+      // lowering's ad-hoc register allocation which assumes no
+      // register allocation is needed when at least one of the
+      // operands is non-memory.
+      // TODO(jvoung): check this for ARM.
+      SrcR = legalize(Src0, Legal_Reg, Dest->getRegNum());
+    } else {
+      // Dest could be a stack operand. Since we could potentially need
+      // to do a Store (and store can only have Register operands),
+      // legalize this to a register.
+      SrcR = legalize(Src0, Legal_Reg);
+    }
+    if (isVectorType(Dest->getType())) {
+      UnimplementedError(Func->getContext()->getFlags());
+    } else {
+      _mov(Dest, SrcR);
+    }
+  }
 }
 
 void TargetARM32::lowerBr(const InstBr *Inst) {
@@ -629,7 +879,20 @@
 void TargetARM32::lowerRet(const InstRet *Inst) {
   Variable *Reg = nullptr;
   if (Inst->hasRetValue()) {
-    UnimplementedError(Func->getContext()->getFlags());
+    Operand *Src0 = Inst->getRetValue();
+    if (Src0->getType() == IceType_i64) {
+      Variable *R0 = legalizeToVar(loOperand(Src0), RegARM32::Reg_r0);
+      Variable *R1 = legalizeToVar(hiOperand(Src0), RegARM32::Reg_r1);
+      Reg = R0;
+      Context.insert(InstFakeUse::create(Func, R1));
+    } else if (isScalarFloatingType(Src0->getType())) {
+      UnimplementedError(Func->getContext()->getFlags());
+    } else if (isVectorType(Src0->getType())) {
+      UnimplementedError(Func->getContext()->getFlags());
+    } else {
+      Operand *Src0F = legalize(Src0, Legal_Reg | Legal_Flex);
+      _mov(Reg, Src0F, RegARM32::Reg_r0);
+    }
   }
   // Add a ret instruction even if sandboxing is enabled, because
   // addEpilog explicitly looks for a ret instruction as a marker for
@@ -666,7 +929,7 @@
 }
 
 void TargetARM32::lowerUnreachable(const InstUnreachable * /*Inst*/) {
-  llvm_unreachable("Not yet implemented");
+  UnimplementedError(Func->getContext()->getFlags());
 }
 
 // Turn an i64 Phi instruction into a pair of i32 Phi instructions, to
@@ -686,12 +949,191 @@
   UnimplementedError(Func->getContext()->getFlags());
 }
 
+Variable *TargetARM32::makeVectorOfZeros(Type Ty, int32_t RegNum) {
+  Variable *Reg = makeReg(Ty, RegNum);
+  UnimplementedError(Func->getContext()->getFlags());
+  return Reg;
+}
+
+// Helper for legalize() to emit the right code to lower an operand to a
+// register of the appropriate type.
+Variable *TargetARM32::copyToReg(Operand *Src, int32_t RegNum) {
+  Type Ty = Src->getType();
+  Variable *Reg = makeReg(Ty, RegNum);
+  if (isVectorType(Ty)) {
+    UnimplementedError(Func->getContext()->getFlags());
+  } else {
+    // Mov's Src operand can really only be the flexible second operand type
+    // or a register. Users should guarantee that.
+    _mov(Reg, Src);
+  }
+  return Reg;
+}
+
+Operand *TargetARM32::legalize(Operand *From, LegalMask Allowed,
+                               int32_t RegNum) {
+  // Assert that a physical register is allowed.  To date, all calls
+  // to legalize() allow a physical register. When Legal_Flex is allowed,
+  // a register may be wrapped in an OperandARM32FlexReg as needed.
+  assert(Allowed & Legal_Reg);
+  // Go through the various types of operands:
+  // OperandARM32Mem, OperandARM32Flex, Constant, and Variable.
+  // Given the above assertion, if type of operand is not legal
+  // (e.g., OperandARM32Mem and !Legal_Mem), we can always copy
+  // to a register.
+  if (auto Mem = llvm::dyn_cast<OperandARM32Mem>(From)) {
+    // Before doing anything with a Mem operand, we need to ensure
+    // that the Base and Index components are in physical registers.
+    Variable *Base = Mem->getBase();
+    Variable *Index = Mem->getIndex();
+    Variable *RegBase = nullptr;
+    Variable *RegIndex = nullptr;
+    if (Base) {
+      RegBase = legalizeToVar(Base);
+    }
+    if (Index) {
+      RegIndex = legalizeToVar(Index);
+    }
+    // Create a new operand if there was a change.
+    if (Base != RegBase || Index != RegIndex) {
+      // There is only a reg +/- reg or reg + imm form.
+      // Figure out which to re-create.
+      if (Mem->isRegReg()) {
+        Mem = OperandARM32Mem::create(Func, Mem->getType(), RegBase, RegIndex,
+                                      Mem->getShiftOp(), Mem->getShiftAmt(),
+                                      Mem->getAddrMode());
+      } else {
+        Mem = OperandARM32Mem::create(Func, Mem->getType(), RegBase,
+                                      Mem->getOffset(), Mem->getAddrMode());
+      }
+    }
+    if (!(Allowed & Legal_Mem)) {
+      Type Ty = Mem->getType();
+      Variable *Reg = makeReg(Ty, RegNum);
+      _ldr(Reg, Mem);
+      From = Reg;
+    } else {
+      From = Mem;
+    }
+    return From;
+  }
+
+  if (auto Flex = llvm::dyn_cast<OperandARM32Flex>(From)) {
+    if (!(Allowed & Legal_Flex)) {
+      if (auto FlexReg = llvm::dyn_cast<OperandARM32FlexReg>(Flex)) {
+        if (FlexReg->getShiftOp() == OperandARM32::kNoShift) {
+          From = FlexReg->getReg();
+          // Fall through and let From be checked as a Variable below,
+          // where it may or may not need a register.
+        } else {
+          return copyToReg(Flex, RegNum);
+        }
+      } else {
+        return copyToReg(Flex, RegNum);
+      }
+    } else {
+      return From;
+    }
+  }
+
+  if (llvm::isa<Constant>(From)) {
+    if (llvm::isa<ConstantUndef>(From)) {
+      // Lower undefs to zero.  Another option is to lower undefs to an
+      // uninitialized register; however, using an uninitialized register
+      // results in less predictable code.
+      if (isVectorType(From->getType()))
+        return makeVectorOfZeros(From->getType(), RegNum);
+      From = Ctx->getConstantZero(From->getType());
+    }
+    // There should be no constants of vector type (other than undef).
+    assert(!isVectorType(From->getType()));
+    bool CanBeFlex = Allowed & Legal_Flex;
+    if (auto C32 = llvm::dyn_cast<ConstantInteger32>(From)) {
+      uint32_t RotateAmt;
+      uint32_t Immed_8;
+      uint32_t Value = static_cast<uint32_t>(C32->getValue());
+      // Check if the immediate will fit in a Flexible second operand,
+      // if a Flexible second operand is allowed. We need to know the exact
+      // value, so that rules out relocatable constants.
+      // Also try the inverse and use MVN if possible.
+      if (CanBeFlex &&
+          OperandARM32FlexImm::canHoldImm(Value, &RotateAmt, &Immed_8)) {
+        return OperandARM32FlexImm::create(Func, From->getType(), Immed_8,
+                                           RotateAmt);
+      } else if (CanBeFlex && OperandARM32FlexImm::canHoldImm(
+                                  ~Value, &RotateAmt, &Immed_8)) {
+        auto InvertedFlex = OperandARM32FlexImm::create(Func, From->getType(),
+                                                        Immed_8, RotateAmt);
+        Type Ty = From->getType();
+        Variable *Reg = makeReg(Ty, RegNum);
+        _mvn(Reg, InvertedFlex);
+        return Reg;
+      } else {
+        // Do a movw/movt to a register.
+        Type Ty = From->getType();
+        Variable *Reg = makeReg(Ty, RegNum);
+        uint32_t UpperBits = (Value >> 16) & 0xFFFF;
+        _movw(Reg,
+              UpperBits != 0 ? Ctx->getConstantInt32(Value & 0xFFFF) : C32);
+        if (UpperBits != 0) {
+          _movt(Reg, Ctx->getConstantInt32(UpperBits));
+        }
+        return Reg;
+      }
+    } else if (auto C = llvm::dyn_cast<ConstantRelocatable>(From)) {
+      Type Ty = From->getType();
+      Variable *Reg = makeReg(Ty, RegNum);
+      _movw(Reg, C);
+      _movt(Reg, C);
+      return Reg;
+    } else {
+      // Load floats/doubles from literal pool.
+      UnimplementedError(Func->getContext()->getFlags());
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+
+  if (auto Var = llvm::dyn_cast<Variable>(From)) {
+    // Check if the variable is guaranteed a physical register.  This
+    // can happen either when the variable is pre-colored or when it is
+    // assigned infinite weight.
+    bool MustHaveRegister = (Var->hasReg() || Var->getWeight().isInf());
+    // We need a new physical register for the operand if:
+    //   Mem is not allowed and Var isn't guaranteed a physical
+    //   register, or
+    //   RegNum is required and Var->getRegNum() doesn't match.
+    if ((!(Allowed & Legal_Mem) && !MustHaveRegister) ||
+        (RegNum != Variable::NoRegister && RegNum != Var->getRegNum())) {
+      From = copyToReg(From, RegNum);
+    }
+    return From;
+  }
+  llvm_unreachable("Unhandled operand kind in legalize()");
+
+  return From;
+}
+
+// Provide a trivial wrapper to legalize() for this common usage.
+Variable *TargetARM32::legalizeToVar(Operand *From, int32_t RegNum) {
+  return llvm::cast<Variable>(legalize(From, Legal_Reg, RegNum));
+}
+
+Variable *TargetARM32::makeReg(Type Type, int32_t RegNum) {
+  // There aren't any 64-bit integer registers for ARM32.
+  assert(Type != IceType_i64);
+  Variable *Reg = Func->makeVariable(Type);
+  if (RegNum == Variable::NoRegister)
+    Reg->setWeightInfinite();
+  else
+    Reg->setRegNum(RegNum);
+  return Reg;
+}
+
 void TargetARM32::postLower() {
   if (Ctx->getFlags().getOptLevel() == Opt_m1)
     return;
-  // Find two-address non-SSA instructions where Dest==Src0, and set
-  // the DestNonKillable flag to keep liveness analysis consistent.
-  UnimplementedError(Func->getContext()->getFlags());
+  inferTwoAddress();
 }
 
 void TargetARM32::makeRandomRegisterPermutation(
@@ -714,10 +1156,12 @@
 }
 
 void TargetARM32::emit(const ConstantFloat *C) const {
+  (void)C;
   UnimplementedError(Ctx->getFlags());
 }
 
 void TargetARM32::emit(const ConstantDouble *C) const {
+  (void)C;
   UnimplementedError(Ctx->getFlags());
 }
 
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 99909a3..0c21c7c 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -65,6 +65,13 @@
   void addProlog(CfgNode *Node) override;
   void addEpilog(CfgNode *Node) override;
 
+  // Ensure that a 64-bit Variable has been split into 2 32-bit
+  // Variables, creating them if necessary.  This is needed for all
+  // I64 operations.
+  void split64(Variable *Var);
+  Operand *loOperand(Operand *Operand);
+  Operand *hiOperand(Operand *Operand);
+
 protected:
   explicit TargetARM32(Cfg *Func);
 
@@ -94,16 +101,58 @@
   void doAddressOptLoad() override;
   void doAddressOptStore() override;
   void randomlyInsertNop(float Probability) override;
+
+  enum OperandLegalization {
+    Legal_None = 0,
+    Legal_Reg = 1 << 0,  // physical register, not stack location
+    Legal_Flex = 1 << 1, // A flexible operand2, which can hold rotated
+                         // small immediates, or shifted registers.
+    Legal_Mem = 1 << 2,  // includes [r0, r1 lsl #2] as well as [sp, #12]
+    Legal_All = ~Legal_None
+  };
+  typedef uint32_t LegalMask;
+  Operand *legalize(Operand *From, LegalMask Allowed = Legal_All,
+                    int32_t RegNum = Variable::NoRegister);
+  Variable *legalizeToVar(Operand *From, int32_t RegNum = Variable::NoRegister);
+
+  Variable *makeReg(Type Ty, int32_t RegNum = Variable::NoRegister);
+  static Type stackSlotType();
+  Variable *copyToReg(Operand *Src, int32_t RegNum = Variable::NoRegister);
+
+  // Returns a vector register (of the given type) initialized to all zeros.
+  Variable *makeVectorOfZeros(Type Ty, int32_t RegNum = Variable::NoRegister);
+
   void makeRandomRegisterPermutation(
       llvm::SmallVectorImpl<int32_t> &Permutation,
       const llvm::SmallBitVector &ExcludeRegisters) const override;
 
-  static Type stackSlotType();
-
   // The following are helpers that insert lowered ARM32 instructions
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
 
+  void _ldr(Variable *Dest, OperandARM32Mem *Addr) {
+    Context.insert(InstARM32Ldr::create(Func, Dest, Addr));
+  }
+  // If Dest=nullptr is passed in, then a new variable is created,
+  // marked as infinite register allocation weight, and returned
+  // through the in/out Dest argument.
+  void _mov(Variable *&Dest, Operand *Src0,
+            int32_t RegNum = Variable::NoRegister) {
+    if (Dest == nullptr)
+      Dest = makeReg(Src0->getType(), RegNum);
+    Context.insert(InstARM32Mov::create(Func, Dest, Src0));
+  }
+  // The Operand can only be a 16-bit immediate or a ConstantRelocatable
+  // (with an upper16 relocation).
+  void _movt(Variable *&Dest, Operand *Src0) {
+    Context.insert(InstARM32Movt::create(Func, Dest, Src0));
+  }
+  void _movw(Variable *&Dest, Operand *Src0) {
+    Context.insert(InstARM32Movw::create(Func, Dest, Src0));
+  }
+  void _mvn(Variable *&Dest, Operand *Src0) {
+    Context.insert(InstARM32Mvn::create(Func, Dest, Src0));
+  }
   void _ret(Variable *LR, Variable *Src0 = nullptr) {
     Context.insert(InstARM32Ret::create(Func, LR, Src0));
   }
diff --git a/src/IceTargetLoweringX8632.cpp b/src/IceTargetLoweringX8632.cpp
index f9488ec..0aaafa2 100644
--- a/src/IceTargetLoweringX8632.cpp
+++ b/src/IceTargetLoweringX8632.cpp
@@ -4636,7 +4636,7 @@
   // work, e.g. allow the shl shift amount to be either an immediate
   // or in ecx.)
   assert(RegNum == Variable::NoRegister || Allowed == Legal_Reg);
-  if (OperandX8632Mem *Mem = llvm::dyn_cast<OperandX8632Mem>(From)) {
+  if (auto Mem = llvm::dyn_cast<OperandX8632Mem>(From)) {
     // Before doing anything with a Mem operand, we need to ensure
     // that the Base and Index components are in physical registers.
     Variable *Base = Mem->getBase();
@@ -4691,7 +4691,7 @@
     }
     return From;
   }
-  if (Variable *Var = llvm::dyn_cast<Variable>(From)) {
+  if (auto Var = llvm::dyn_cast<Variable>(From)) {
     // Check if the variable is guaranteed a physical register.  This
     // can happen either when the variable is pre-colored or when it is
     // assigned infinite weight.
@@ -4766,19 +4766,7 @@
 void TargetX8632::postLower() {
   if (Ctx->getFlags().getOptLevel() == Opt_m1)
     return;
-  // Find two-address non-SSA instructions where Dest==Src0, and set
-  // the DestNonKillable flag to keep liveness analysis consistent.
-  for (auto Inst = Context.getCur(), E = Context.getNext(); Inst != E; ++Inst) {
-    if (Inst->isDeleted())
-      continue;
-    if (Variable *Dest = Inst->getDest()) {
-      // TODO(stichnot): We may need to consider all source
-      // operands, not just the first one, if using 3-address
-      // instructions.
-      if (Inst->getSrcSize() > 0 && Inst->getSrc(0) == Dest)
-        Inst->setDestNonKillable();
-    }
-  }
+  inferTwoAddress();
 }
 
 void TargetX8632::makeRandomRegisterPermutation(
diff --git a/src/IceUtils.h b/src/IceUtils.h
index bcbba23..cf0be63 100644
--- a/src/IceUtils.h
+++ b/src/IceUtils.h
@@ -78,6 +78,19 @@
       return 0;
     return Align - Mod;
   }
+
+  // Precondition for both rotate helpers below: 0 <= shift < 32
+  static inline uint32_t rotateLeft32(uint32_t value, uint32_t shift) {
+    if (shift == 0)
+      return value;
+    return (value << shift) | (value >> (32 - shift));
+  }
+
+  static inline uint32_t rotateRight32(uint32_t value, uint32_t shift) {
+    if (shift == 0)
+      return value;
+    return (value >> shift) | (value << (32 - shift));
+  }
 };
 
 } // end of namespace Ice
diff --git a/tests_lit/llvm2ice_tests/int-arg.ll b/tests_lit/llvm2ice_tests/int-arg.ll
new file mode 100644
index 0000000..9be4e1e
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/int-arg.ll
@@ -0,0 +1,243 @@
+; This file checks that Subzero generates code in accordance with the
+; calling convention for integers.
+
+; RUN: %p2i -i %s --filetype=obj --disassemble --args -O2 \
+; RUN:   | FileCheck %s
+
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
+
+; For x86-32, integer arguments use the stack.
+; For ARM32, integer arguments can be r0-r3. i64 arguments occupy two
+; adjacent 32-bit registers, and require the first to be an even register.
+
+
+; i32
+
+define i32 @test_returning32_arg0(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7) {
+entry:
+  ret i32 %arg0
+}
+; CHECK-LABEL: test_returning32_arg0
+; CHECK-NEXT: mov eax,{{.*}} [esp+0x4]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_arg0
+; ARM32-NEXT: bx lr
+
+define i32 @test_returning32_arg1(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7) {
+entry:
+  ret i32 %arg1
+}
+; CHECK-LABEL: test_returning32_arg1
+; CHECK-NEXT: mov eax,{{.*}} [esp+0x8]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_arg1
+; ARM32-NEXT: mov r0, r1
+; ARM32-NEXT: bx lr
+
+
+define i32 @test_returning32_arg2(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7) {
+entry:
+  ret i32 %arg2
+}
+; CHECK-LABEL: test_returning32_arg2
+; CHECK-NEXT: mov eax,{{.*}} [esp+0xc]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_arg2
+; ARM32-NEXT: mov r0, r2
+; ARM32-NEXT: bx lr
+
+
+define i32 @test_returning32_arg3(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7) {
+entry:
+  ret i32 %arg3
+}
+; CHECK-LABEL: test_returning32_arg3
+; CHECK-NEXT: mov eax,{{.*}} [esp+0x10]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_arg3
+; ARM32-NEXT: mov r0, r3
+; ARM32-NEXT: bx lr
+
+
+define i32 @test_returning32_arg4(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7) {
+entry:
+  ret i32 %arg4
+}
+; CHECK-LABEL: test_returning32_arg4
+; CHECK-NEXT: mov eax,{{.*}} [esp+0x14]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_arg4
+; TODO(jvoung): Toggle this on, once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp]
+; ARM32-NEXT: bx lr
+
+
+define i32 @test_returning32_arg5(i32 %arg0, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 %arg7) {
+entry:
+  ret i32 %arg5
+}
+; CHECK-LABEL: test_returning32_arg5
+; CHECK-NEXT: mov eax,{{.*}} [esp+0x18]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_arg5
+; TODO(jvoung): Toggle this on, once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp, #4]
+; ARM32-NEXT: bx lr
+
+; i64
+
+define i64 @test_returning64_arg0(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) {
+entry:
+  ret i64 %arg0
+}
+; CHECK-LABEL: test_returning64_arg0
+; CHECK-NEXT: mov {{.*}} [esp+0x4]
+; CHECK-NEXT: mov {{.*}} [esp+0x8]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_arg0
+; ARM32-NEXT: bx lr
+
+define i64 @test_returning64_arg1(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) {
+entry:
+  ret i64 %arg1
+}
+; CHECK-LABEL: test_returning64_arg1
+; CHECK-NEXT: mov {{.*}} [esp+0xc]
+; CHECK-NEXT: mov {{.*}} [esp+0x10]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_arg1
+; ARM32-NEXT: mov r0, r2
+; ARM32-NEXT: mov r1, r3
+; ARM32-NEXT: bx lr
+
+define i64 @test_returning64_arg2(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) {
+entry:
+  ret i64 %arg2
+}
+; CHECK-LABEL: test_returning64_arg2
+; CHECK-NEXT: mov {{.*}} [esp+0x14]
+; CHECK-NEXT: mov {{.*}} [esp+0x18]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_arg2
+; This could have been an ldm sp, {r0, r1}, but we don't do the ldm optimization.
+; TODO(jvoung): enable this once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp]
+; TODOARM32-NEXT: ldr r1, [sp, #4]
+; ARM32-NEXT: bx lr
+
+define i64 @test_returning64_arg3(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3) {
+entry:
+  ret i64 %arg3
+}
+; CHECK-LABEL: test_returning64_arg3
+; CHECK-NEXT: mov {{.*}} [esp+0x1c]
+; CHECK-NEXT: mov {{.*}} [esp+0x20]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_arg3
+; TODO(jvoung): enable this once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp, #8]
+; TODOARM32-NEXT: ldr r1, [sp, #12]
+; ARM32-NEXT: bx lr
+
+
+; Test that on ARM, the i64 arguments start with an even register.
+
+define i64 @test_returning64_even_arg1(i32 %arg0, i64 %arg1, i64 %arg2) {
+entry:
+  ret i64 %arg1
+}
+; Not padded out on x86-32.
+; CHECK-LABEL: test_returning64_even_arg1
+; CHECK-NEXT: mov {{.*}} [esp+0x8]
+; CHECK-NEXT: mov {{.*}} [esp+0xc]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_even_arg1
+; ARM32-NEXT: mov r0, r2
+; ARM32-NEXT: mov r1, r3
+; ARM32-NEXT: bx lr
+
+define i64 @test_returning64_even_arg1b(i32 %arg0, i32 %arg0b, i64 %arg1, i64 %arg2) {
+entry:
+  ret i64 %arg1
+}
+; CHECK-LABEL: test_returning64_even_arg1b
+; CHECK-NEXT: mov {{.*}} [esp+0xc]
+; CHECK-NEXT: mov {{.*}} [esp+0x10]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_even_arg1b
+; ARM32-NEXT: mov r0, r2
+; ARM32-NEXT: mov r1, r3
+; ARM32-NEXT: bx lr
+
+define i64 @test_returning64_even_arg2(i64 %arg0, i32 %arg1, i64 %arg2) {
+entry:
+  ret i64 %arg2
+}
+; Not padded out on x86-32.
+; CHECK-LABEL: test_returning64_even_arg2
+; CHECK-NEXT: mov {{.*}} [esp+0x10]
+; CHECK-NEXT: mov {{.*}} [esp+0x14]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_even_arg2
+; TODO(jvoung): enable this once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp]
+; TODOARM32-NEXT: ldr r1, [sp, #4]
+; ARM32-NEXT: bx lr
+
+define i64 @test_returning64_even_arg2b(i64 %arg0, i32 %arg1, i32 %arg1b, i64 %arg2) {
+entry:
+  ret i64 %arg2
+}
+; CHECK-LABEL: test_returning64_even_arg2b
+; CHECK-NEXT: mov {{.*}} [esp+0x14]
+; CHECK-NEXT: mov {{.*}} [esp+0x18]
+; CHECK: ret
+; ARM32-LABEL: test_returning64_even_arg2b
+; TODO(jvoung): enable this once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp]
+; TODOARM32-NEXT: ldr r1, [sp, #4]
+; ARM32-NEXT: bx lr
+
+define i32 @test_returning32_even_arg2(i64 %arg0, i32 %arg1, i32 %arg2) {
+entry:
+  ret i32 %arg2
+}
+; CHECK-LABEL: test_returning32_even_arg2
+; CHECK-NEXT: mov {{.*}} [esp+0x10]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_even_arg2
+; ARM32-NEXT: mov r0, r3
+; ARM32-NEXT: bx lr
+
+define i32 @test_returning32_even_arg2b(i32 %arg0, i32 %arg1, i32 %arg2, i64 %arg3) {
+entry:
+  ret i32 %arg2
+}
+; CHECK-LABEL: test_returning32_even_arg2b
+; CHECK-NEXT: mov {{.*}} [esp+0xc]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_even_arg2b
+; ARM32-NEXT: mov r0, r2
+; ARM32-NEXT: bx lr
+
+; The i64 won't fit in a pair of registers, and consumes the last register so a
+; following i32 can't use that free register.
+define i32 @test_returning32_even_arg4(i32 %arg0, i32 %arg1, i32 %arg2, i64 %arg3, i32 %arg4) {
+entry:
+  ret i32 %arg4
+}
+; CHECK-LABEL: test_returning32_even_arg4
+; CHECK-NEXT: mov {{.*}} [esp+0x18]
+; CHECK-NEXT: ret
+; ARM32-LABEL: test_returning32_even_arg4
+; TODO(jvoung): enable this once addProlog is done.
+; TODOARM32-NEXT: ldr r0, [sp, #8]
+; ARM32-NEXT: bx lr
+
+; Test interleaving float/double and integer (different register streams on ARM).
+; TODO(jvoung): Test once the S/D/Q regs are modeled.
diff --git a/tests_lit/llvm2ice_tests/return_immediates.ll b/tests_lit/llvm2ice_tests/return_immediates.ll
new file mode 100644
index 0000000..08ce8dc
--- /dev/null
+++ b/tests_lit/llvm2ice_tests/return_immediates.ll
@@ -0,0 +1,245 @@
+; Simple test that returns various immediates. For fixed-width instruction
+; sets, some immediates are more complicated than others.
+; For x86-32, it shouldn't be a problem.
+
+; RUN: %p2i --filetype=obj --disassemble -i %s --args -O2 | FileCheck %s
+
+; TODO(jvoung): Stop skipping unimplemented parts (via --skip-unimplemented)
+; once enough infrastructure is in. Also, switch to --filetype=obj
+; when possible.
+; RUN: %if --need=target_ARM32 --command %p2i --filetype=asm --assemble \
+; RUN:   --disassemble --target arm32 -i %s --args -O2 --skip-unimplemented \
+; RUN:   | %if --need=target_ARM32 --command FileCheck --check-prefix ARM32 %s
+
+; Test 8-bits of all ones rotated right by various amounts (even vs odd).
+; ARM has a shifter that allows encoding 8-bits rotated right by even amounts.
+; The first few "rotate right" test cases are expressed as shift-left.
+
+define i32 @ret_8bits_shift_left0() {
+  ret i32 255
+}
+; CHECK-LABEL: ret_8bits_shift_left0
+; CHECK-NEXT: mov eax,0xff
+; ARM32-LABEL: ret_8bits_shift_left0
+; ARM32-NEXT: mov r0, #255
+
+define i32 @ret_8bits_shift_left1() {
+  ret i32 510
+}
+; CHECK-LABEL: ret_8bits_shift_left1
+; CHECK-NEXT: mov eax,0x1fe
+; ARM32-LABEL: ret_8bits_shift_left1
+; ARM32-NEXT: movw r0, #510
+
+define i32 @ret_8bits_shift_left2() {
+  ret i32 1020
+}
+; CHECK-LABEL: ret_8bits_shift_left2
+; CHECK-NEXT: mov eax,0x3fc
+; ARM32-LABEL: ret_8bits_shift_left2
+; ARM32-NEXT: mov r0, #1020
+
+define i32 @ret_8bits_shift_left4() {
+  ret i32 4080
+}
+; CHECK-LABEL: ret_8bits_shift_left4
+; CHECK-NEXT: mov eax,0xff0
+; ARM32-LABEL: ret_8bits_shift_left4
+; ARM32-NEXT: mov r0, #4080
+
+define i32 @ret_8bits_shift_left14() {
+  ret i32 4177920
+}
+; CHECK-LABEL: ret_8bits_shift_left14
+; CHECK-NEXT: mov eax,0x3fc000
+; ARM32-LABEL: ret_8bits_shift_left14
+; ARM32-NEXT: mov r0, #4177920
+
+define i32 @ret_8bits_shift_left15() {
+  ret i32 8355840
+}
+; CHECK-LABEL: ret_8bits_shift_left15
+; CHECK-NEXT: mov eax,0x7f8000
+; ARM32-LABEL: ret_8bits_shift_left15
+; ARM32-NEXT: movw r0, #32768
+; ARM32-NEXT: movt r0, #127
+
+; Shift 8 bits left by 24 to the i32 limit. This is also ror by 8 bits.
+
+define i32 @ret_8bits_shift_left24() {
+  ret i32 4278190080
+}
+; CHECK-LABEL: ret_8bits_shift_left24
+; CHECK-NEXT: mov eax,0xff000000
+; ARM32-LABEL: ret_8bits_shift_left24
+; ARM32-NEXT: mov r0, #-16777216
+; ARM32-NEXT: bx lr
+
+; The next few cases wrap around and actually demonstrate the rotation.
+
+define i32 @ret_8bits_ror7() {
+  ret i32 4261412865
+}
+; CHECK-LABEL: ret_8bits_ror7
+; CHECK-NEXT: mov eax,0xfe000001
+; ARM32-LABEL: ret_8bits_ror7
+; ARM32-NEXT: movw r0, #1
+; ARM32-NEXT: movt r0, #65024
+
+define i32 @ret_8bits_ror6() {
+  ret i32 4227858435
+}
+; CHECK-LABEL: ret_8bits_ror6
+; CHECK-NEXT: mov eax,0xfc000003
+; ARM32-LABEL: ret_8bits_ror6
+; ARM32-NEXT: mov r0, #-67108861
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_ror5() {
+  ret i32 4160749575
+}
+; CHECK-LABEL: ret_8bits_ror5
+; CHECK-NEXT: mov eax,0xf8000007
+; ARM32-LABEL: ret_8bits_ror5
+; ARM32-NEXT: movw r0, #7
+; ARM32-NEXT: movt r0, #63488
+
+define i32 @ret_8bits_ror4() {
+  ret i32 4026531855
+}
+; CHECK-LABEL: ret_8bits_ror4
+; CHECK-NEXT: mov eax,0xf000000f
+; ARM32-LABEL: ret_8bits_ror4
+; ARM32-NEXT: mov r0, #-268435441
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_ror3() {
+  ret i32 3758096415
+}
+; CHECK-LABEL: ret_8bits_ror3
+; CHECK-NEXT: mov eax,0xe000001f
+; ARM32-LABEL: ret_8bits_ror3
+; ARM32-NEXT: movw r0, #31
+; ARM32-NEXT: movt r0, #57344
+
+define i32 @ret_8bits_ror2() {
+  ret i32 3221225535
+}
+; CHECK-LABEL: ret_8bits_ror2
+; CHECK-NEXT: mov eax,0xc000003f
+; ARM32-LABEL: ret_8bits_ror2
+; ARM32-NEXT: mov r0, #-1073741761
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_ror1() {
+  ret i32 2147483775
+}
+; CHECK-LABEL: ret_8bits_ror1
+; CHECK-NEXT: mov eax,0x8000007f
+; ARM32-LABEL: ret_8bits_ror1
+; ARM32-NEXT: movw r0, #127
+; ARM32-NEXT: movt r0, #32768
+
+; Some architectures can handle 16-bits at a time efficiently,
+; so also test those.
+
+define i32 @ret_16bits_lower() {
+  ret i32 65535
+}
+; CHECK-LABEL: ret_16bits_lower
+; CHECK-NEXT: mov eax,0xffff
+; ARM32-LABEL: ret_16bits_lower
+; ARM32-NEXT: movw r0, #65535
+; ARM32-NEXT: bx lr
+
+define i32 @ret_17bits_lower() {
+  ret i32 131071
+}
+; CHECK-LABEL: ret_17bits_lower
+; CHECK-NEXT: mov eax,0x1ffff
+; ARM32-LABEL: ret_17bits_lower
+; ARM32-NEXT: movw r0, #65535
+; ARM32-NEXT: movt r0, #1
+
+define i32 @ret_16bits_upper() {
+  ret i32 4294901760
+}
+; CHECK-LABEL: ret_16bits_upper
+; CHECK-NEXT: mov eax,0xffff0000
+; ARM32-LABEL: ret_16bits_upper
+; ARM32-NEXT: movw r0, #0
+; ARM32-NEXT: movt r0, #65535
+
+; Some 32-bit immediates can be inverted, and moved in a single instruction.
+
+define i32 @ret_8bits_inverted_shift_left0() {
+  ret i32 4294967040
+}
+; CHECK-LABEL: ret_8bits_inverted_shift_left0
+; CHECK-NEXT: mov eax,0xffffff00
+; ARM32-LABEL: ret_8bits_inverted_shift_left0
+; ARM32-NEXT: mvn r0, #255
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_inverted_shift_left24() {
+  ret i32 16777215
+}
+; CHECK-LABEL: ret_8bits_inverted_shift_left24
+; CHECK-NEXT: mov eax,0xffffff
+; ARM32-LABEL: ret_8bits_inverted_shift_left24
+; ARM32-NEXT: mvn r0, #-16777216
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_inverted_ror2() {
+  ret i32 1073741760
+}
+; CHECK-LABEL: ret_8bits_inverted_ror2
+; CHECK-NEXT: mov eax,0x3fffffc0
+; ARM32-LABEL: ret_8bits_inverted_ror2
+; ARM32-NEXT: mvn r0, #-1073741761
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_inverted_ror6() {
+  ret i32 67108860
+}
+; CHECK-LABEL: ret_8bits_inverted_ror6
+; CHECK-NEXT: mov eax,0x3fffffc
+; ARM32-LABEL: ret_8bits_inverted_ror6
+; ARM32-NEXT: mvn r0, #-67108861
+; ARM32-NEXT: bx lr
+
+define i32 @ret_8bits_inverted_ror7() {
+  ret i32 33554430
+}
+; CHECK-LABEL: ret_8bits_inverted_ror7
+; CHECK-NEXT: mov eax,0x1fffffe
+; ARM32-LABEL: ret_8bits_inverted_ror7
+; ARM32-NEXT: movw r0, #65534
+; ARM32-NEXT: movt r0, #511
+
+; 64-bit immediates.
+
+define i64 @ret_64bits_shift_left0() {
+  ret i64 1095216660735
+}
+; CHECK-LABEL: ret_64bits_shift_left0
+; CHECK-NEXT: mov eax,0xff
+; CHECK-NEXT: mov edx,0xff
+; ARM32-LABEL: ret_64bits_shift_left0
+; ARM32-NEXT: movw r0, #255
+; ARM32-NEXT: movw r1, #255
+
+; A relocatable constant is assumed to require 32-bits along with
+; relocation directives.
+
+declare void @_start()
+
+define i32 @ret_addr() {
+  %ptr = ptrtoint void ()* @_start to i32
+  ret i32 %ptr
+}
+; CHECK-LABEL: ret_addr
+; CHECK-NEXT: mov eax,0x0 {{.*}} R_386_32 _start
+; ARM32-LABEL: ret_addr
+; ARM32-NEXT: movw r0, #0 {{.*}} R_ARM_MOVW_ABS_NC _start
+; ARM32-NEXT: movt r0, #0 {{.*}} R_ARM_MOVT_ABS    _start