Subzero ARM: do lowerIcmp, lowerBr, and a bit of lowerCall.

Allow instructions to be predicated, and use that in the lowering
of icmp and br. Tracking the predicate for almost every instruction
is a bit of overkill, but technically possible. Add a predicate to
most of the instruction constructors, except ret and call for now.

This doesn't yet do compare + branch fusing, but it does handle
the branch fallthrough to avoid branching twice.

I can't yet test the 8-bit and 16-bit cases, since those values
come from "trunc" (or from a load), and neither "trunc" nor load
is lowered yet.

Adds basic "call(void)" lowering, just to get the call markers
showing up in tests.

64bit.pnacl.ll no longer explodes with liveness consistency errors,
so risk running it, and backfill some of the 64-bit arithmetic tests.

BUG= https://code.google.com/p/nativeclient/issues/detail?id=4076
R=stichnot@chromium.org

Review URL: https://codereview.chromium.org/1151663004
diff --git a/src/IceConditionCodesARM32.h b/src/IceConditionCodesARM32.h
new file mode 100644
index 0000000..b98c770
--- /dev/null
+++ b/src/IceConditionCodesARM32.h
@@ -0,0 +1,39 @@
+//===- subzero/src/IceConditionCodesARM32.h - Condition Codes ---*- C++ -*-===//
+//
+//                        The Subzero Code Generator
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the condition codes for ARM32.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SUBZERO_SRC_ICECONDITIONCODESARM32_H
+#define SUBZERO_SRC_ICECONDITIONCODESARM32_H
+
+#include "IceDefs.h"
+#include "IceInstARM32.def"
+
+namespace Ice {
+
+class CondARM32 {
+  CondARM32() = delete;
+  CondARM32(const CondARM32 &) = delete;
+  CondARM32 &operator=(const CondARM32 &) = delete;
+
+public:
+  // An enum of codes used for conditional instructions. The enum value
+  // should match the value used to encode operands in binary instructions.
+  enum Cond {
+#define X(tag, encode, opp, emit) tag = encode,
+    ICEINSTARM32COND_TABLE
+#undef X
+  };
+};
+
+} // end of namespace Ice
+
+#endif // SUBZERO_SRC_ICECONDITIONCODESARM32_H
diff --git a/src/IceInstARM32.cpp b/src/IceInstARM32.cpp
index 67f0fd4..16d3ac1 100644
--- a/src/IceInstARM32.cpp
+++ b/src/IceInstARM32.cpp
@@ -47,32 +47,58 @@
 #undef X
 };
 
+const struct InstARM32CondAttributes_ {
+  CondARM32::Cond Opposite;
+  const char *EmitString;
+} InstARM32CondAttributes[] = {
+#define X(tag, encode, opp, emit)                                              \
+  { CondARM32::opp, emit }                                                     \
+  ,
+    ICEINSTARM32COND_TABLE
+#undef X
+};
+
 } // end of anonymous namespace
 
 const char *InstARM32::getWidthString(Type Ty) {
   return TypeARM32Attributes[Ty].WidthString;
 }
 
-void emitTwoAddr(const char *Opcode, const Inst *Inst, const Cfg *Func) {
+const char *InstARM32Pred::predString(CondARM32::Cond Pred) {
+  return InstARM32CondAttributes[Pred].EmitString;
+}
+
+void InstARM32Pred::dumpOpcodePred(Ostream &Str, const char *Opcode,
+                                   Type Ty) const {
+  Str << Opcode << getPredicate() << "." << Ty;
+}
+
+CondARM32::Cond InstARM32::getOppositeCondition(CondARM32::Cond Cond) {
+  return InstARM32CondAttributes[Cond].Opposite;
+}
+
+void InstARM32Pred::emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
+                                const Cfg *Func) {
   if (!ALLOW_DUMP)
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Inst->getSrcSize() == 2);
   Variable *Dest = Inst->getDest();
   assert(Dest == Inst->getSrc(0));
-  Str << "\t" << Opcode << "\t";
+  Str << "\t" << Opcode << Inst->getPredicate() << "\t";
   Dest->emit(Func);
   Str << ", ";
   Inst->getSrc(1)->emit(Func);
 }
 
-void emitThreeAddr(const char *Opcode, const Inst *Inst, const Cfg *Func,
-                   bool SetFlags) {
+void InstARM32Pred::emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
+                                  const Cfg *Func, bool SetFlags) {
   if (!ALLOW_DUMP)
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(Inst->getSrcSize() == 2);
-  Str << "\t" << Opcode << (SetFlags ? "s" : "") << "\t";
+  Str << "\t" << Opcode << (SetFlags ? "s" : "") << Inst->getPredicate()
+      << "\t";
   Inst->getDest()->emit(Func);
   Str << ", ";
   Inst->getSrc(0)->emit(Func);
@@ -154,14 +180,81 @@
     Vars[1] = ShiftVar;
 }
 
-InstARM32Ldr::InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem)
-    : InstARM32(Func, InstARM32::Ldr, 1, Dest) {
+InstARM32Br::InstARM32Br(Cfg *Func, const CfgNode *TargetTrue,
+                         const CfgNode *TargetFalse, CondARM32::Cond Pred)
+    : InstARM32Pred(Func, InstARM32::Br, 0, nullptr, Pred),
+      TargetTrue(TargetTrue), TargetFalse(TargetFalse) {}
+
+bool InstARM32Br::optimizeBranch(const CfgNode *NextNode) {
+  // If there is no next block, then there can be no fallthrough to
+  // optimize.
+  if (NextNode == nullptr)
+    return false;
+  // If there is no fallthrough node, such as a non-default case label
+  // for a switch instruction, then there is no opportunity to
+  // optimize.
+  if (getTargetFalse() == nullptr)
+    return false;
+
+  // Unconditional branch to the next node can be removed.
+  if (isUnconditionalBranch() && getTargetFalse() == NextNode) {
+    assert(getTargetTrue() == nullptr);
+    setDeleted();
+    return true;
+  }
+  // If the fallthrough is to the next node, set fallthrough to nullptr
+  // to indicate that no explicit branch needs to be emitted for it.
+  if (getTargetFalse() == NextNode) {
+    TargetFalse = nullptr;
+    return true;
+  }
+  // If TargetTrue is the next node, and TargetFalse is not nullptr
+  // (which was already tested above), then invert the branch
+  // condition, swap the targets, and set new fallthrough to nullptr.
+  if (getTargetTrue() == NextNode) {
+    assert(Predicate != CondARM32::AL);
+    setPredicate(getOppositeCondition(getPredicate()));
+    TargetTrue = getTargetFalse();
+    TargetFalse = nullptr;
+    return true;
+  }
+  return false;
+}
+
+bool InstARM32Br::repointEdge(CfgNode *OldNode, CfgNode *NewNode) {
+  if (TargetFalse == OldNode) {
+    TargetFalse = NewNode;
+    return true;
+  } else if (TargetTrue == OldNode) {
+    TargetTrue = NewNode;
+    return true;
+  }
+  return false;
+}
+
+InstARM32Call::InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget)
+    : InstARM32(Func, InstARM32::Call, 1, Dest) {
+  HasSideEffects = true;
+  addSource(CallTarget);
+}
+
+InstARM32Cmp::InstARM32Cmp(Cfg *Func, Variable *Src1, Operand *Src2,
+                           CondARM32::Cond Predicate)
+    : InstARM32Pred(Func, InstARM32::Cmp, 2, nullptr, Predicate) {
+  addSource(Src1);
+  addSource(Src2);
+}
+
+InstARM32Ldr::InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
+                           CondARM32::Cond Predicate)
+    : InstARM32Pred(Func, InstARM32::Ldr, 1, Dest, Predicate) {
   addSource(Mem);
 }
 
 InstARM32Mla::InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0,
-                           Variable *Src1, Variable *Acc)
-    : InstARM32(Func, InstARM32::Mla, 3, Dest) {
+                           Variable *Src1, Variable *Acc,
+                           CondARM32::Cond Predicate)
+    : InstARM32Pred(Func, InstARM32::Mla, 3, Dest, Predicate) {
   addSource(Src0);
   addSource(Src1);
   addSource(Acc);
@@ -175,8 +268,9 @@
 }
 
 InstARM32Umull::InstARM32Umull(Cfg *Func, Variable *DestLo, Variable *DestHi,
-                               Variable *Src0, Variable *Src1)
-    : InstARM32(Func, InstARM32::Umull, 2, DestLo),
+                               Variable *Src0, Variable *Src1,
+                               CondARM32::Cond Predicate)
+    : InstARM32Pred(Func, InstARM32::Umull, 2, DestLo, Predicate),
       // DestHi is expected to have a FakeDef inserted by the lowering code.
       DestHi(DestHi) {
   addSource(Src0);
@@ -197,6 +291,7 @@
 template <> const char *InstARM32Add::Opcode = "add";
 template <> const char *InstARM32And::Opcode = "and";
 template <> const char *InstARM32Eor::Opcode = "eor";
+template <> const char *InstARM32Lsl::Opcode = "lsl";
 template <> const char *InstARM32Mul::Opcode = "mul";
 template <> const char *InstARM32Orr::Opcode = "orr";
 template <> const char *InstARM32Sbc::Opcode = "sbc";
@@ -218,8 +313,7 @@
   Variable *Dest = getDest();
   if (Dest->hasReg()) {
     Str << "\t"
-        << "mov"
-        << "\t";
+        << "mov" << getPredicate() << "\t";
     getDest()->emit(Func);
     Str << ", ";
     getSrc(0)->emit(Func);
@@ -227,8 +321,7 @@
     Variable *Src0 = llvm::cast<Variable>(getSrc(0));
     assert(Src0->hasReg());
     Str << "\t"
-        << "str"
-        << "\t";
+        << "str" << getPredicate() << "\t";
     Src0->emit(Func);
     Str << ", ";
     Dest->emit(Func);
@@ -241,6 +334,115 @@
   llvm_unreachable("Not yet implemented");
 }
 
+void InstARM32Br::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  Str << "\t"
+      << "b" << getPredicate() << "\t";
+  if (isUnconditionalBranch()) {
+    Str << getTargetFalse()->getAsmName();
+  } else {
+    Str << getTargetTrue()->getAsmName();
+    if (getTargetFalse()) {
+      Str << "\n\t"
+          << "b"
+          << "\t" << getTargetFalse()->getAsmName();
+    }
+  }
+}
+
+void InstARM32Br::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Br::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  Str << "br ";
+
+  if (getPredicate() == CondARM32::AL) {
+    Str << "label %" << getTargetFalse()->getName();
+    return;
+  }
+
+  Str << getPredicate() << ", label %" << getTargetTrue()->getName();
+  if (getTargetFalse()) {
+    Str << ", label %" << getTargetFalse()->getName();
+  }
+}
+
+void InstARM32Call::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 1);
+  if (llvm::isa<ConstantInteger32>(getCallTarget())) {
+    // This shouldn't happen (typically have to copy the full 32-bits
+    // to a register and do an indirect jump).
+    llvm::report_fatal_error("ARM32Call to ConstantInteger32");
+  } else if (const auto CallTarget =
+                 llvm::dyn_cast<ConstantRelocatable>(getCallTarget())) {
+    // Calls only have 24-bits, but the linker should insert veneers to
+    // extend the range if needed.
+    Str << "\t"
+        << "bl"
+        << "\t";
+    CallTarget->emitWithoutPrefix(Func->getTarget());
+  } else {
+    Str << "\t"
+        << "blx"
+        << "\t";
+    getCallTarget()->emit(Func);
+  }
+  Func->getTarget()->resetStackAdjustment();
+}
+
+void InstARM32Call::emitIAS(const Cfg *Func) const {
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Call::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  if (getDest()) {
+    dumpDest(Func);
+    Str << " = ";
+  }
+  Str << "call ";
+  getCallTarget()->dump(Func);
+}
+
+void InstARM32Cmp::emit(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrEmit();
+  assert(getSrcSize() == 2);
+  Str << "\tcmp" << getPredicate() << "\t"; // e.g. "\tcmpne\t"
+  getSrc(0)->emit(Func);
+  Str << ", ";
+  getSrc(1)->emit(Func);
+}
+
+void InstARM32Cmp::emitIAS(const Cfg *Func) const {
+  assert(getSrcSize() == 2);
+  (void)Func;
+  llvm_unreachable("Not yet implemented");
+}
+
+void InstARM32Cmp::dump(const Cfg *Func) const {
+  if (!ALLOW_DUMP)
+    return;
+  Ostream &Str = Func->getContext()->getStrDump();
+  dumpOpcodePred(Str, "cmp", getSrc(0)->getType());
+  Str << " "; // Separate the operands from the opcode, like other dump()s.
+  dumpSources(Func);
+}
+
 void InstARM32Ldr::emit(const Cfg *Func) const {
   if (!ALLOW_DUMP)
     return;
@@ -249,7 +451,7 @@
   assert(getDest()->hasReg());
   Type Ty = getSrc(0)->getType();
   Str << "\t"
-      << "ldr" << getWidthString(Ty) << "\t";
+      << "ldr" << getWidthString(Ty) << getPredicate() << "\t";
   getDest()->emit(Func);
   Str << ", ";
   getSrc(0)->emit(Func);
@@ -266,7 +468,9 @@
     return;
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
-  Str << " = ldr." << getSrc(0)->getType() << " ";
+  Str << " = ";
+  dumpOpcodePred(Str, "ldr", getDest()->getType());
+  Str << " ";
   dumpSources(Func);
 }
 
@@ -277,8 +481,7 @@
   assert(getSrcSize() == 3);
   assert(getDest()->hasReg());
   Str << "\t"
-      << "mla"
-      << "\t";
+      << "mla" << getPredicate() << "\t";
   getDest()->emit(Func);
   Str << ", ";
   getSrc(0)->emit(Func);
@@ -299,7 +502,9 @@
     return;
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
-  Str << " = mla." << getSrc(0)->getType() << " ";
+  Str << " = ";
+  dumpOpcodePred(Str, "mla", getDest()->getType());
+  Str << " ";
   dumpSources(Func);
 }
 
@@ -308,7 +513,7 @@
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
   assert(getSrcSize() == 1);
-  Str << "\t" << Opcode << "\t";
+  Str << "\t" << Opcode << getPredicate() << "\t";
   getDest()->emit(Func);
   Str << ", ";
   Constant *Src0 = llvm::cast<Constant>(getSrc(0));
@@ -327,7 +532,7 @@
   assert(getSrcSize() == 2);
   Variable *Dest = getDest();
   Constant *Src1 = llvm::cast<Constant>(getSrc(1));
-  Str << "\t" << Opcode << "\t";
+  Str << "\t" << Opcode << getPredicate() << "\t";
   Dest->emit(Func);
   Str << ", ";
   if (auto CR = llvm::dyn_cast<ConstantRelocatable>(Src1)) {
@@ -373,8 +578,7 @@
   assert(getSrcSize() == 2);
   assert(getDest()->hasReg());
   Str << "\t"
-      << "umull"
-      << "\t";
+      << "umull" << getPredicate() << "\t";
   getDest()->emit(Func);
   Str << ", ";
   DestHi->emit(Func);
@@ -395,7 +599,9 @@
     return;
   Ostream &Str = Func->getContext()->getStrDump();
   dumpDest(Func);
-  Str << " = umull." << getSrc(0)->getType() << " ";
+  Str << " = ";
+  dumpOpcodePred(Str, "umull", getDest()->getType());
+  Str << " ";
   dumpSources(Func);
 }
 
diff --git a/src/IceInstARM32.def b/src/IceInstARM32.def
index c314305..d381e1b 100644
--- a/src/IceInstARM32.def
+++ b/src/IceInstARM32.def
@@ -91,4 +91,27 @@
   X(RRX, "rrx")                                                         \
 //#define X(tag, emit)
 
+// Attributes for the condition code 4-bit encoding (that is independent
+// of the APSR's NZCV fields). For example, EQ is 0, but corresponds to
+// Z = 1, and NE is 1, but corresponds to Z = 0.
+#define ICEINSTARM32COND_TABLE                                          \
+  /* enum value, encoding, opposite, emit */                            \
+  X(EQ, 0, NE, "eq") /* equal */                                        \
+  X(NE, 1, EQ, "ne") /* not equal */                                    \
+  X(CS, 2, CC, "cs") /* carry set/unsigned (AKA hs: higher or same) */  \
+  X(CC, 3, CS, "cc") /* carry clear/unsigned (AKA lo: lower) */         \
+  X(MI, 4, PL, "mi") /* minus/negative */                               \
+  X(PL, 5, MI, "pl") /* plus/positive or zero */                        \
+  X(VS, 6, VC, "vs") /* overflow (float unordered) */                   \
+  X(VC, 7, VS, "vc") /* no overflow (float not unordered) */            \
+  X(HI, 8, LS, "hi") /* unsigned higher */                              \
+  X(LS, 9, HI, "ls") /* unsigned lower or same */                       \
+  X(GE, 10, LT, "ge") /* signed greater than or equal */                \
+  X(LT, 11, GE, "lt") /* signed less than */                            \
+  X(GT, 12, LE, "gt") /* signed greater than */                         \
+  X(LE, 13, GT, "le") /* signed less than or equal */                   \
+  X(AL, 14, kNone, "") /* always (unconditional) */                     \
+  X(kNone, 15, kNone, "??") /* special condition / none */              \
+//#define X(tag, encode, opp, emit)
+
 #endif // SUBZERO_SRC_ICEINSTARM32_DEF
diff --git a/src/IceInstARM32.h b/src/IceInstARM32.h
index c8d01e6..1ee1831 100644
--- a/src/IceInstARM32.h
+++ b/src/IceInstARM32.h
@@ -16,6 +16,7 @@
 #ifndef SUBZERO_SRC_ICEINSTARM32_H
 #define SUBZERO_SRC_ICEINSTARM32_H
 
+#include "IceConditionCodesARM32.h"
 #include "IceDefs.h"
 #include "IceInst.h"
 #include "IceInstARM32.def"
@@ -237,6 +238,9 @@
   Operand *ShiftAmt;
 };
 
+// Base class for ARM instructions. While most ARM instructions can be
+// conditionally executed, a few of them are not predicable (halt,
+// memory barriers, etc.).
 class InstARM32 : public InstTarget {
   InstARM32() = delete;
   InstARM32(const InstARM32 &) = delete;
@@ -248,8 +252,12 @@
     Adc,
     Add,
     And,
+    Br,
+    Call,
+    Cmp,
     Eor,
     Ldr,
+    Lsl,
     Mla,
     Mov,
     Movt,
@@ -264,6 +272,7 @@
   };
 
   static const char *getWidthString(Type Ty);
+  static CondARM32::Cond getOppositeCondition(CondARM32::Cond Cond);
 
   void dump(const Cfg *Func) const override;
 
@@ -276,23 +285,51 @@
   }
 };
 
-void emitTwoAddr(const char *Opcode, const Inst *Inst, const Cfg *Func);
-void emitThreeAddr(const char *Opcode, const Inst *Inst, const Cfg *Func,
-                   bool SetFlags);
+// A predicable ARM instruction.
+class InstARM32Pred : public InstARM32 {
+  InstARM32Pred() = delete;
+  InstARM32Pred(const InstARM32Pred &) = delete;
+  InstARM32Pred &operator=(const InstARM32Pred &) = delete;
 
-// TODO(jvoung): add condition codes if instruction can be predicated.
+public:
+  InstARM32Pred(Cfg *Func, InstKindARM32 Kind, SizeT Maxsrcs, Variable *Dest,
+                CondARM32::Cond Predicate)
+      : InstARM32(Func, Kind, Maxsrcs, Dest), Predicate(Predicate) {}
+
+  CondARM32::Cond getPredicate() const { return Predicate; }
+  void setPredicate(CondARM32::Cond Pred) { Predicate = Pred; }
+
+  static const char *predString(CondARM32::Cond Predicate);
+  void dumpOpcodePred(Ostream &Str, const char *Opcode, Type Ty) const;
+
+  // Shared emit routines for common forms of instructions.
+  static void emitTwoAddr(const char *Opcode, const InstARM32Pred *Inst,
+                          const Cfg *Func);
+  static void emitThreeAddr(const char *Opcode, const InstARM32Pred *Inst,
+                            const Cfg *Func, bool SetFlags);
+
+protected:
+  CondARM32::Cond Predicate;
+};
+
+template <typename StreamType>
+inline StreamType &operator<<(StreamType &Stream, CondARM32::Cond Predicate) {
+  Stream << InstARM32Pred::predString(Predicate);
+  return Stream;
+}
 
 // Instructions of the form x := op(y).
 template <InstARM32::InstKindARM32 K>
-class InstARM32UnaryopGPR : public InstARM32 {
+class InstARM32UnaryopGPR : public InstARM32Pred {
   InstARM32UnaryopGPR() = delete;
   InstARM32UnaryopGPR(const InstARM32UnaryopGPR &) = delete;
   InstARM32UnaryopGPR &operator=(const InstARM32UnaryopGPR &) = delete;
 
 public:
-  static InstARM32UnaryopGPR *create(Cfg *Func, Variable *Dest, Operand *Src) {
+  static InstARM32UnaryopGPR *create(Cfg *Func, Variable *Dest, Operand *Src,
+                                     CondARM32::Cond Predicate) {
     return new (Func->allocate<InstARM32UnaryopGPR>())
-        InstARM32UnaryopGPR(Func, Dest, Src);
+        InstARM32UnaryopGPR(Func, Dest, Src, Predicate);
   }
   void emit(const Cfg *Func) const override {
     if (!ALLOW_DUMP)
@@ -313,14 +350,17 @@
       return;
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
-    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    Str << " = ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType());
+    Str << " ";
     dumpSources(Func);
   }
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstARM32UnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src)
-      : InstARM32(Func, K, 1, Dest) {
+  InstARM32UnaryopGPR(Cfg *Func, Variable *Dest, Operand *Src,
+                      CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 1, Dest, Predicate) {
     addSource(Src);
   }
   ~InstARM32UnaryopGPR() override {}
@@ -329,16 +369,17 @@
 
 // Instructions of the form x := x op y.
 template <InstARM32::InstKindARM32 K>
-class InstARM32TwoAddrGPR : public InstARM32 {
+class InstARM32TwoAddrGPR : public InstARM32Pred {
   InstARM32TwoAddrGPR() = delete;
   InstARM32TwoAddrGPR(const InstARM32TwoAddrGPR &) = delete;
   InstARM32TwoAddrGPR &operator=(const InstARM32TwoAddrGPR &) = delete;
 
 public:
   // Dest must be a register.
-  static InstARM32TwoAddrGPR *create(Cfg *Func, Variable *Dest, Operand *Src) {
+  static InstARM32TwoAddrGPR *create(Cfg *Func, Variable *Dest, Operand *Src,
+                                     CondARM32::Cond Predicate) {
     return new (Func->allocate<InstARM32TwoAddrGPR>())
-        InstARM32TwoAddrGPR(Func, Dest, Src);
+        InstARM32TwoAddrGPR(Func, Dest, Src, Predicate);
   }
   void emit(const Cfg *Func) const override {
     if (!ALLOW_DUMP)
@@ -354,14 +395,17 @@
       return;
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
-    Str << " = " << Opcode << "." << getDest()->getType() << " ";
+    Str << " = ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType());
+    Str << " ";
     dumpSources(Func);
   }
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstARM32TwoAddrGPR(Cfg *Func, Variable *Dest, Operand *Src)
-      : InstARM32(Func, K, 2, Dest) {
+  InstARM32TwoAddrGPR(Cfg *Func, Variable *Dest, Operand *Src,
+                      CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 2, Dest, Predicate) {
     addSource(Dest);
     addSource(Src);
   }
@@ -372,15 +416,16 @@
 // Base class for assignment instructions.
 // These can be tested for redundancy (and elided if redundant).
 template <InstARM32::InstKindARM32 K>
-class InstARM32Movlike : public InstARM32 {
+class InstARM32Movlike : public InstARM32Pred {
   InstARM32Movlike() = delete;
   InstARM32Movlike(const InstARM32Movlike &) = delete;
   InstARM32Movlike &operator=(const InstARM32Movlike &) = delete;
 
 public:
-  static InstARM32Movlike *create(Cfg *Func, Variable *Dest, Operand *Source) {
+  static InstARM32Movlike *create(Cfg *Func, Variable *Dest, Operand *Source,
+                                  CondARM32::Cond Predicate) {
     return new (Func->allocate<InstARM32Movlike>())
-        InstARM32Movlike(Func, Dest, Source);
+        InstARM32Movlike(Func, Dest, Source, Predicate);
   }
   bool isRedundantAssign() const override {
     return checkForRedundantAssign(getDest(), getSrc(0));
@@ -392,7 +437,8 @@
     if (!ALLOW_DUMP)
       return;
     Ostream &Str = Func->getContext()->getStrDump();
-    Str << Opcode << "." << getDest()->getType() << " ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType());
+    Str << " ";
     dumpDest(Func);
     Str << ", ";
     dumpSources(Func);
@@ -400,8 +446,9 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
-  InstARM32Movlike(Cfg *Func, Variable *Dest, Operand *Source)
-      : InstARM32(Func, K, 1, Dest) {
+  InstARM32Movlike(Cfg *Func, Variable *Dest, Operand *Source,
+                   CondARM32::Cond Predicate)
+      : InstARM32Pred(Func, K, 1, Dest, Predicate) {
     addSource(Source);
   }
   ~InstARM32Movlike() override {}
@@ -412,7 +459,7 @@
 // Instructions of the form x := y op z. May have the side-effect of setting
 // status flags.
 template <InstARM32::InstKindARM32 K>
-class InstARM32ThreeAddrGPR : public InstARM32 {
+class InstARM32ThreeAddrGPR : public InstARM32Pred {
   InstARM32ThreeAddrGPR() = delete;
   InstARM32ThreeAddrGPR(const InstARM32ThreeAddrGPR &) = delete;
   InstARM32ThreeAddrGPR &operator=(const InstARM32ThreeAddrGPR &) = delete;
@@ -422,9 +469,10 @@
   // Dest and Src1 must be registers.
   static InstARM32ThreeAddrGPR *create(Cfg *Func, Variable *Dest,
                                        Variable *Src1, Operand *Src2,
+                                       CondARM32::Cond Predicate,
                                        bool SetFlags = false) {
     return new (Func->allocate<InstARM32ThreeAddrGPR>())
-        InstARM32ThreeAddrGPR(Func, Dest, Src1, Src2, SetFlags);
+        InstARM32ThreeAddrGPR(Func, Dest, Src1, Src2, Predicate, SetFlags);
   }
   void emit(const Cfg *Func) const override {
     if (!ALLOW_DUMP)
@@ -440,16 +488,17 @@
       return;
     Ostream &Str = Func->getContext()->getStrDump();
     dumpDest(Func);
-    Str << " = " << Opcode << (SetFlags ? "s" : "") << "."
-        << getDest()->getType() << " ";
+    Str << " = ";
+    dumpOpcodePred(Str, Opcode, getDest()->getType());
+    Str << (SetFlags ? ".s " : " ");
     dumpSources(Func);
   }
   static bool classof(const Inst *Inst) { return isClassof(Inst, K); }
 
 private:
   InstARM32ThreeAddrGPR(Cfg *Func, Variable *Dest, Variable *Src1,
-                        Operand *Src2, bool SetFlags)
-      : InstARM32(Func, K, 2, Dest), SetFlags(SetFlags) {
+                        Operand *Src2, CondARM32::Cond Predicate, bool SetFlags)
+      : InstARM32Pred(Func, K, 2, Dest, Predicate), SetFlags(SetFlags) {
     addSource(Src1);
     addSource(Src2);
   }
@@ -462,6 +511,7 @@
 typedef InstARM32ThreeAddrGPR<InstARM32::Add> InstARM32Add;
 typedef InstARM32ThreeAddrGPR<InstARM32::And> InstARM32And;
 typedef InstARM32ThreeAddrGPR<InstARM32::Eor> InstARM32Eor;
+typedef InstARM32ThreeAddrGPR<InstARM32::Lsl> InstARM32Lsl;
 typedef InstARM32ThreeAddrGPR<InstARM32::Mul> InstARM32Mul;
 typedef InstARM32ThreeAddrGPR<InstARM32::Orr> InstARM32Orr;
 typedef InstARM32ThreeAddrGPR<InstARM32::Sbc> InstARM32Sbc;
@@ -476,16 +526,123 @@
 typedef InstARM32UnaryopGPR<InstARM32::Movw> InstARM32Movw;
 typedef InstARM32UnaryopGPR<InstARM32::Mvn> InstARM32Mvn;
 
+// Direct branch instruction.
+class InstARM32Br : public InstARM32Pred {
+  InstARM32Br() = delete;
+  InstARM32Br(const InstARM32Br &) = delete;
+  InstARM32Br &operator=(const InstARM32Br &) = delete;
+
+public:
+  // Create a conditional branch to one of two nodes.
+  static InstARM32Br *create(Cfg *Func, CfgNode *TargetTrue,
+                             CfgNode *TargetFalse, CondARM32::Cond Predicate) {
+    assert(Predicate != CondARM32::AL);
+    return new (Func->allocate<InstARM32Br>())
+        InstARM32Br(Func, TargetTrue, TargetFalse, Predicate);
+  }
+  // Create an unconditional branch to a node.
+  static InstARM32Br *create(Cfg *Func, CfgNode *Target) {
+    const CfgNode *NoCondTarget = nullptr;
+    return new (Func->allocate<InstARM32Br>())
+        InstARM32Br(Func, NoCondTarget, Target, CondARM32::AL);
+  }
+  // Create a non-terminator conditional branch to a node, with a
+  // fallthrough to the next instruction in the current node.  This is
+  // used for switch lowering.
+  static InstARM32Br *create(Cfg *Func, CfgNode *Target,
+                             CondARM32::Cond Predicate) {
+    assert(Predicate != CondARM32::AL);
+    const CfgNode *NoUncondTarget = nullptr;
+    return new (Func->allocate<InstARM32Br>())
+        InstARM32Br(Func, Target, NoUncondTarget, Predicate);
+  }
+  const CfgNode *getTargetTrue() const { return TargetTrue; }
+  const CfgNode *getTargetFalse() const { return TargetFalse; }
+  bool optimizeBranch(const CfgNode *NextNode);
+  uint32_t getEmitInstCount() const override {
+    uint32_t Sum = 0;
+    if (getTargetTrue())
+      ++Sum;
+    if (getTargetFalse())
+      ++Sum;
+    return Sum;
+  }
+  bool isUnconditionalBranch() const override {
+    return getPredicate() == CondARM32::AL;
+  }
+  bool repointEdge(CfgNode *OldNode, CfgNode *NewNode) override;
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Br); }
+
+private:
+  InstARM32Br(Cfg *Func, const CfgNode *TargetTrue, const CfgNode *TargetFalse,
+              CondARM32::Cond Predicate);
+  ~InstARM32Br() override {}
+  const CfgNode *TargetTrue;
+  const CfgNode *TargetFalse;
+};
+
+// Call instruction (bl/blx).  Arguments should have already been pushed.
+// Technically bl and the register form of blx can be predicated, but we'll
+// leave that out until needed.
+class InstARM32Call : public InstARM32 {
+  InstARM32Call() = delete;
+  InstARM32Call(const InstARM32Call &) = delete;
+  InstARM32Call &operator=(const InstARM32Call &) = delete;
+
+public:
+  static InstARM32Call *create(Cfg *Func, Variable *Dest, Operand *CallTarget) {
+    return new (Func->allocate<InstARM32Call>())
+        InstARM32Call(Func, Dest, CallTarget);
+  }
+  Operand *getCallTarget() const { return getSrc(0); }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Call); }
+
+private:
+  InstARM32Call(Cfg *Func, Variable *Dest, Operand *CallTarget);
+  ~InstARM32Call() override {}
+};
+
+// Integer compare instruction.
+class InstARM32Cmp : public InstARM32Pred {
+  InstARM32Cmp() = delete;
+  InstARM32Cmp(const InstARM32Cmp &) = delete;
+  InstARM32Cmp &operator=(const InstARM32Cmp &) = delete;
+
+public:
+  static InstARM32Cmp *create(Cfg *Func, Variable *Src1, Operand *Src2,
+                              CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Cmp>())
+        InstARM32Cmp(Func, Src1, Src2, Predicate);
+  }
+  void emit(const Cfg *Func) const override;
+  void emitIAS(const Cfg *Func) const override;
+  void dump(const Cfg *Func) const override;
+  static bool classof(const Inst *Inst) { return isClassof(Inst, Cmp); }
+
+private:
+  InstARM32Cmp(Cfg *Func, Variable *Src1, Operand *Src2,
+               CondARM32::Cond Predicate);
+  ~InstARM32Cmp() override {}
+};
+
 // Load instruction.
-class InstARM32Ldr : public InstARM32 {
+class InstARM32Ldr : public InstARM32Pred {
   InstARM32Ldr() = delete;
   InstARM32Ldr(const InstARM32Ldr &) = delete;
   InstARM32Ldr &operator=(const InstARM32Ldr &) = delete;
 
 public:
   // Dest must be a register.
-  static InstARM32Ldr *create(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem) {
-    return new (Func->allocate<InstARM32Ldr>()) InstARM32Ldr(Func, Dest, Mem);
+  static InstARM32Ldr *create(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
+                              CondARM32::Cond Predicate) {
+    return new (Func->allocate<InstARM32Ldr>())
+        InstARM32Ldr(Func, Dest, Mem, Predicate);
   }
   void emit(const Cfg *Func) const override;
   void emitIAS(const Cfg *Func) const override;
@@ -493,12 +650,13 @@
   static bool classof(const Inst *Inst) { return isClassof(Inst, Ldr); }
 
 private:
-  InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem);
+  InstARM32Ldr(Cfg *Func, Variable *Dest, OperandARM32Mem *Mem,
+               CondARM32::Cond Predicate);
   ~InstARM32Ldr() override {}
 };
 
 // Multiply Accumulate: d := x * y + a
-class InstARM32Mla : public InstARM32 {
+class InstARM32Mla : public InstARM32Pred {
   InstARM32Mla() = delete;
   InstARM32Mla(const InstARM32Mla &) = delete;
   InstARM32Mla &operator=(const InstARM32Mla &) = delete;
@@ -506,9 +664,10 @@
 public:
   // Everything must be a register.
   static InstARM32Mla *create(Cfg *Func, Variable *Dest, Variable *Src0,
-                              Variable *Src1, Variable *Acc) {
+                              Variable *Src1, Variable *Acc,
+                              CondARM32::Cond Predicate) {
     return new (Func->allocate<InstARM32Mla>())
-        InstARM32Mla(Func, Dest, Src0, Src1, Acc);
+        InstARM32Mla(Func, Dest, Src0, Src1, Acc, Predicate);
   }
   void emit(const Cfg *Func) const override;
   void emitIAS(const Cfg *Func) const override;
@@ -517,7 +676,7 @@
 
 private:
   InstARM32Mla(Cfg *Func, Variable *Dest, Variable *Src0, Variable *Src1,
-               Variable *Acc);
+               Variable *Acc, CondARM32::Cond Predicate);
   ~InstARM32Mla() override {}
 };
 
@@ -526,6 +685,10 @@
 // instead of a generic "bx". This instruction also takes a Source
 // operand (for non-void returning functions) for liveness analysis, though
 // a FakeUse before the ret would do just as well.
+//
+// NOTE: Even though "bx" can be predicated, for now leave out the predication
+// since it's not yet known to be useful for Ret. That may complicate finding
+// the terminator instruction if it's not guaranteed to be executed.
 class InstARM32Ret : public InstARM32 {
   InstARM32Ret() = delete;
   InstARM32Ret(const InstARM32Ret &) = delete;
@@ -547,7 +710,7 @@
 };
 
 // Unsigned Multiply Long: d.lo, d.hi := x * y
-class InstARM32Umull : public InstARM32 {
+class InstARM32Umull : public InstARM32Pred {
   InstARM32Umull() = delete;
   InstARM32Umull(const InstARM32Umull &) = delete;
   InstARM32Umull &operator=(const InstARM32Umull &) = delete;
@@ -555,9 +718,10 @@
 public:
   // Everything must be a register.
   static InstARM32Umull *create(Cfg *Func, Variable *DestLo, Variable *DestHi,
-                                Variable *Src0, Variable *Src1) {
+                                Variable *Src0, Variable *Src1,
+                                CondARM32::Cond Predicate) {
     return new (Func->allocate<InstARM32Umull>())
-        InstARM32Umull(Func, DestLo, DestHi, Src0, Src1);
+        InstARM32Umull(Func, DestLo, DestHi, Src0, Src1, Predicate);
   }
   void emit(const Cfg *Func) const override;
   void emitIAS(const Cfg *Func) const override;
@@ -566,7 +730,7 @@
 
 private:
   InstARM32Umull(Cfg *Func, Variable *DestLo, Variable *DestHi, Variable *Src0,
-                 Variable *Src1);
+                 Variable *Src1, CondARM32::Cond Predicate);
   ~InstARM32Umull() override {}
   Variable *DestHi;
 };
diff --git a/src/IceInstX8632.cpp b/src/IceInstX8632.cpp
index 74db41d..38a9b35 100644
--- a/src/IceInstX8632.cpp
+++ b/src/IceInstX8632.cpp
@@ -583,8 +583,8 @@
 // shift instructions, in order to be syntactically valid.  The
 // Opcode parameter needs to be char* and not IceString because of
 // template issues.
-void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
-                    bool ShiftHack) {
+void InstX8632::emitTwoAddress(const char *Opcode, const Inst *Inst,
+                               const Cfg *Func, bool ShiftHack) {
   if (!ALLOW_DUMP)
     return;
   Ostream &Str = Func->getContext()->getStrEmit();
@@ -703,9 +703,9 @@
   }
 }
 
-void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
-                     const Operand *Src,
-                     const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter) {
+void InstX8632::emitIASGPRShift(
+    const Cfg *Func, Type Ty, const Variable *Var, const Operand *Src,
+    const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter) {
   X8632::AssemblerX8632 *Asm = Func->getAssembler<X8632::AssemblerX8632>();
   // Technically, the Dest Var can be mem as well, but we only use Reg.
   // We can extend this to check Dest if we decide to use that form.
diff --git a/src/IceInstX8632.h b/src/IceInstX8632.h
index 976ccc4..2e82052 100644
--- a/src/IceInstX8632.h
+++ b/src/IceInstX8632.h
@@ -17,9 +17,9 @@
 #define SUBZERO_SRC_ICEINSTX8632_H
 
 #include "assembler_ia32.h"
+#include "IceConditionCodesX8632.h"
 #include "IceDefs.h"
 #include "IceInst.h"
-#include "IceConditionCodesX8632.h"
 #include "IceInstX8632.def"
 #include "IceOperand.h"
 
@@ -268,6 +268,17 @@
   static CondX86::BrCond getOppositeCondition(CondX86::BrCond Cond);
   void dump(const Cfg *Func) const override;
 
+  // Shared emit routines for common forms of instructions.
+  // See the definition of emitTwoAddress() for a description of
+  // ShiftHack.
+  static void emitTwoAddress(const char *Opcode, const Inst *Inst,
+                             const Cfg *Func, bool ShiftHack = false);
+
+  static void
+  emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
+                  const Operand *Src,
+                  const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter);
+
 protected:
   InstX8632(Cfg *Func, InstKindX8632 Kind, SizeT Maxsrcs, Variable *Dest)
       : InstTarget(Func, static_cast<InstKind>(Kind), Maxsrcs, Dest) {}
@@ -665,15 +676,6 @@
   static const X8632::AssemblerX8632::XmmEmitterRegOp Emitter;
 };
 
-// See the definition of emitTwoAddress() for a description of
-// ShiftHack.
-void emitTwoAddress(const char *Opcode, const Inst *Inst, const Cfg *Func,
-                    bool ShiftHack = false);
-
-void emitIASGPRShift(const Cfg *Func, Type Ty, const Variable *Var,
-                     const Operand *Src,
-                     const X8632::AssemblerX8632::GPREmitterShiftOp &Emitter);
-
 template <InstX8632::InstKindX8632 K>
 class InstX8632BinopGPRShift : public InstX8632 {
   InstX8632BinopGPRShift() = delete;
diff --git a/src/IceTargetLoweringARM32.cpp b/src/IceTargetLoweringARM32.cpp
index 73eb77c..26f01f9 100644
--- a/src/IceTargetLoweringARM32.cpp
+++ b/src/IceTargetLoweringARM32.cpp
@@ -31,6 +31,7 @@
 namespace Ice {
 
 namespace {
+
 void UnimplementedError(const ClFlags &Flags) {
   if (!Flags.getSkipUnimplemented()) {
     // Use llvm_unreachable instead of report_fatal_error, which gives better
@@ -40,6 +41,85 @@
   }
 }
 
+// The following table summarizes the logic for lowering the icmp instruction
+// for i32 and narrower types.  Each icmp condition maps to the single ARM32
+// condition code that predicates the "result = 1" move in lowerIcmp().
+
+const struct TableIcmp32_ {
+  CondARM32::Cond Mapping; // Predicate under which the icmp result is 1.
+} TableIcmp32[] = {
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)                       \
+  { CondARM32::C_32 }                                                          \
+  ,
+    ICMPARM32_TABLE
+#undef X
+};
+const size_t TableIcmp32Size = llvm::array_lengthof(TableIcmp32);
+
+// The following table summarizes the logic for lowering the icmp instruction
+// for the i64 type. Two conditional moves are needed for setting to 1 or 0.
+// The operands may need to be swapped, and there is a slight difference
+// for signed vs unsigned (comparing hi vs lo first, and using cmp vs sbc).
+const struct TableIcmp64_ {
+  bool IsSigned; // True: cmp lo then sbcs hi. False: cmp hi then cmp.eq lo.
+  bool Swapped;  // True: compare with the two icmp operands exchanged.
+  CondARM32::Cond C1, C2; // C1 guards "mov t, #1"; C2 guards "mov t, #0".
+} TableIcmp64[] = {
+#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)                       \
+  { is_signed, swapped64, CondARM32::C1_64, CondARM32::C2_64 }                 \
+  ,
+    ICMPARM32_TABLE
+#undef X
+};
+const size_t TableIcmp64Size = llvm::array_lengthof(TableIcmp64);
+
+CondARM32::Cond getIcmp32Mapping(InstIcmp::ICond Cond) {
+  const auto Row = static_cast<size_t>(Cond); // Cond indexes the table rows.
+  assert(Row < TableIcmp32Size);
+  return TableIcmp32[Row].Mapping;
+}
+
+// In some cases, there are x-macros tables for both high-level and
+// low-level instructions/operands that use the same enum key value.
+// The tables are kept separate to maintain a proper separation
+// between abstraction layers.  There is a risk that the tables could
+// get out of sync if enum values are reordered or if entries are
+// added or deleted.  The following dummy namespaces use
+// static_asserts to ensure everything is kept in sync.
+
+// Validate that ICMPARM32_TABLE entries line up with the InstIcmp enum values.
+namespace dummy1 {
+// Define a temporary set of enum values that number the low-level table
+// entries in table order.
+enum _tmp_enum {
+#define X(val, signed, swapped64, C_32, C1_64, C2_64) _tmp_##val,
+  ICMPARM32_TABLE
+#undef X
+      _num
+};
+// Define a set of constants based on high-level table entries.
+#define X(tag, str) static const int _table1_##tag = InstIcmp::tag;
+ICEINSTICMP_TABLE
+#undef X
+// Define a set of constants based on low-level table entries, and
+// ensure the table entry keys are consistent.
+#define X(val, signed, swapped64, C_32, C1_64, C2_64)                          \
+  static const int _table2_##val = _tmp_##val;                                 \
+  static_assert(                                                               \
+      _table1_##val == _table2_##val,                                          \
+      "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE");
+ICMPARM32_TABLE
+#undef X
+// Repeat the static asserts with respect to the high-level table
+// entries in case the high-level table has extra entries.
+#define X(tag, str)                                                            \
+  static_assert(                                                               \
+      _table1_##tag == _table2_##tag,                                          \
+      "Inconsistency between ICMPARM32_TABLE and ICEINSTICMP_TABLE");
+ICEINSTICMP_TABLE
+#undef X
+} // end of namespace dummy1
+
 // The maximum number of arguments to pass in GPR registers.
 const uint32_t ARM32_MAX_GPR_ARG = 4;
 
@@ -218,9 +298,9 @@
 }
 
 bool TargetARM32::doBranchOpt(Inst *I, const CfgNode *NextNode) {
-  (void)I;
-  (void)NextNode;
-  UnimplementedError(Func->getContext()->getFlags());
+  // Delegate to the branch itself; any other instruction is left untouched.
+  if (auto *Br = llvm::dyn_cast<InstARM32Br>(I))
+    return Br->optimizeBranch(NextNode);
   return false;
 }
 
@@ -750,13 +830,109 @@
 }
 
 void TargetARM32::lowerBr(const InstBr *Inst) {
-  (void)Inst;
-  UnimplementedError(Func->getContext()->getFlags());
+  if (Inst->isUnconditional()) {
+    _br(Inst->getTargetUnconditional());
+    return;
+  }
+  Operand *Cond = Inst->getCondition();
+  // TODO(jvoung): Handle folding opportunities.
+
+  Variable *Src0R = legalizeToVar(Cond);
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  _cmp(Src0R, Zero);
+  _br(CondARM32::NE, Inst->getTargetTrue(), Inst->getTargetFalse());
 }
 
-void TargetARM32::lowerCall(const InstCall *Inst) {
-  (void)Inst;
-  UnimplementedError(Func->getContext()->getFlags());
+void TargetARM32::lowerCall(const InstCall *Instr) {
+  // TODO(jvoung): assign arguments to registers and stack. Also reserve stack.
+  if (Instr->getNumArgs()) { // Only argument-less calls are lowered so far.
+    UnimplementedError(Func->getContext()->getFlags());
+  }
+
+  // Generate the call instruction.  Assign its result to a temporary
+  // with high register allocation weight.
+  Variable *Dest = Instr->getDest();
+  // ReturnReg doubles as ReturnRegLo as necessary.
+  Variable *ReturnReg = nullptr;
+  Variable *ReturnRegHi = nullptr;
+  if (Dest) {
+    switch (Dest->getType()) {
+    case IceType_NUM:
+      llvm_unreachable("Invalid Call dest type");
+      break;
+    case IceType_void:
+      break;
+    case IceType_i1:
+    case IceType_i8:
+    case IceType_i16:
+    case IceType_i32:
+      ReturnReg = makeReg(Dest->getType(), RegARM32::Reg_r0);
+      break;
+    case IceType_i64:
+      ReturnReg = makeReg(IceType_i32, RegARM32::Reg_r0);
+      ReturnRegHi = makeReg(IceType_i32, RegARM32::Reg_r1);
+      break;
+    case IceType_f32:
+    case IceType_f64:
+      // Use S and D regs.
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    case IceType_v4i1:
+    case IceType_v8i1:
+    case IceType_v16i1:
+    case IceType_v16i8:
+    case IceType_v8i16:
+    case IceType_v4i32:
+    case IceType_v4f32:
+      // Use Q regs.
+      UnimplementedError(Func->getContext()->getFlags());
+      break;
+    }
+  }
+  Operand *CallTarget = Instr->getCallTarget();
+  // Allow ConstantRelocatable to be left alone as a direct call,
+  // but force other constants like ConstantInteger32 to be in
+  // a register and make it an indirect call.
+  if (!llvm::isa<ConstantRelocatable>(CallTarget)) {
+    CallTarget = legalize(CallTarget, Legal_Reg);
+  }
+  Inst *NewCall = InstARM32Call::create(Func, ReturnReg, CallTarget);
+  Context.insert(NewCall);
+  if (ReturnRegHi) // The call names only ReturnReg; model the hi def too.
+    Context.insert(InstFakeDef::create(Func, ReturnRegHi));
+
+  // Insert a register-kill pseudo instruction tied to the call.
+  Context.insert(InstFakeKill::create(Func, NewCall));
+
+  // Generate a FakeUse to keep the call live if necessary.
+  if (Instr->hasSideEffects() && ReturnReg) {
+    Inst *FakeUse = InstFakeUse::create(Func, ReturnReg);
+    Context.insert(FakeUse);
+  }
+
+  if (!Dest)
+    return;
+
+  // Assign the result of the call to Dest (lo/hi halves for i64).
+  if (ReturnReg) {
+    if (ReturnRegHi) {
+      assert(Dest->getType() == IceType_i64);
+      split64(Dest);
+      Variable *DestLo = Dest->getLo();
+      Variable *DestHi = Dest->getHi();
+      _mov(DestLo, ReturnReg);
+      _mov(DestHi, ReturnRegHi);
+    } else {
+      assert(Dest->getType() == IceType_i32 || Dest->getType() == IceType_i16 ||
+             Dest->getType() == IceType_i8 || Dest->getType() == IceType_i1 ||
+             isVectorType(Dest->getType()));
+      if (isFloatingType(Dest->getType()) || isVectorType(Dest->getType())) {
+        UnimplementedError(Func->getContext()->getFlags());
+      } else {
+        _mov(Dest, ReturnReg);
+      }
+    }
+  }
 }
 
 void TargetARM32::lowerCast(const InstCast *Inst) {
@@ -815,8 +991,135 @@
 }
 
 void TargetARM32::lowerIcmp(const InstIcmp *Inst) {
-  (void)Inst;
-  UnimplementedError(Func->getContext()->getFlags());
+  // Lower icmp to a 0/1 value in Dest via predicated moves (vectors are TODO).
+  Variable *Dest = Inst->getDest();
+  Operand *Src0 = Inst->getSrc(0);
+  Operand *Src1 = Inst->getSrc(1);
+
+  if (isVectorType(Dest->getType())) {
+    UnimplementedError(Func->getContext()->getFlags());
+    return;
+  }
+
+  // a=icmp cond, b, c ==>
+  // GCC does:
+  //   cmp      b.hi, c.hi     or  cmp      b.lo, c.lo
+  //   cmp.eq   b.lo, c.lo         sbcs t1, b.hi, c.hi
+  //   mov.<C1> t, #1              mov.<C1> t, #1
+  //   mov.<C2> t, #0              mov.<C2> t, #0
+  //   mov      a, t               mov      a, t
+  // where the "cmp.eq b.lo, c.lo" is used for unsigned and "sbcs t1, hi, hi"
+  // is used for signed compares. In some cases, b and c need to be swapped
+  // as well.
+  //
+  // LLVM does:
+  // for EQ and NE:
+  //   eor  t1, b.hi, c.hi
+  //   eor  t2, b.lo, c.lo
+  //   orrs t, t1, t2
+  //   mov.<C> t, #1
+  //   mov  a, t
+  //
+  // that's nice in that it's just as short but has fewer dependencies
+  // for better ILP at the cost of more registers.
+  //
+  // Otherwise for signed/unsigned <, <=, etc. LLVM uses a sequence with
+  // two unconditional mov #0, two cmps, two conditional mov #1,
+  // and one conditional reg mov. That has few dependencies for good ILP,
+  // but is a longer sequence.
+  //
+  // So, we are going with the GCC version since it's usually better (except
+  // perhaps for eq/ne). We could revisit special-casing eq/ne later.
+  Constant *Zero = Ctx->getConstantZero(IceType_i32);
+  Constant *One = Ctx->getConstantInt32(1);
+  if (Src0->getType() == IceType_i64) {
+    InstIcmp::ICond Condition = Inst->getCondition();
+    size_t Index = static_cast<size_t>(Condition);
+    assert(Index < TableIcmp64Size);
+    Variable *Src0Lo, *Src0Hi;
+    Operand *Src1LoRF, *Src1HiRF;
+    if (TableIcmp64[Index].Swapped) {
+      Src0Lo = legalizeToVar(loOperand(Src1));
+      Src0Hi = legalizeToVar(hiOperand(Src1));
+      Src1LoRF = legalize(loOperand(Src0), Legal_Reg | Legal_Flex);
+      Src1HiRF = legalize(hiOperand(Src0), Legal_Reg | Legal_Flex);
+    } else {
+      Src0Lo = legalizeToVar(loOperand(Src0));
+      Src0Hi = legalizeToVar(hiOperand(Src0));
+      Src1LoRF = legalize(loOperand(Src1), Legal_Reg | Legal_Flex);
+      Src1HiRF = legalize(hiOperand(Src1), Legal_Reg | Legal_Flex);
+    }
+    Variable *T = makeReg(IceType_i32);
+    if (TableIcmp64[Index].IsSigned) {
+      Variable *ScratchReg = makeReg(IceType_i32);
+      _cmp(Src0Lo, Src1LoRF);
+      _sbcs(ScratchReg, Src0Hi, Src1HiRF);
+      // ScratchReg isn't going to be used, but we need the
+      // side-effect of setting flags from this operation.
+      Context.insert(InstFakeUse::create(Func, ScratchReg));
+    } else {
+      _cmp(Src0Hi, Src1HiRF);
+      _cmp(Src0Lo, Src1LoRF, CondARM32::EQ);
+    }
+    _mov(T, One, TableIcmp64[Index].C1);
+    _mov_nonkillable(T, Zero, TableIcmp64[Index].C2);
+    _mov(Dest, T);
+    return;
+  }
+
+  // a=icmp cond b, c ==>
+  // GCC does:
+  //   <u/s>xtb tb, b
+  //   <u/s>xtb tc, c
+  //   cmp      tb, tc
+  //   mov.C1   t, #0
+  //   mov.C2   t, #1
+  //   mov      a, t
+  // where the unsigned/sign extension is not needed for 32-bit.
+  // They also have special cases for EQ and NE. E.g., for NE:
+  //   <extend to tb, tc>
+  //   subs     t, tb, tc
+  //   movne    t, #1
+  //   mov      a, t
+  //
+  // LLVM does:
+  //   lsl     tb, b, #<N>
+  //   mov     t, #0
+  //   cmp     tb, c, lsl #<N>
+  //   mov.<C> t, #1
+  //   mov     a, t
+  //
+  // the left shift is by 0, 16, or 24, which allows the comparison to focus
+  // on the digits that actually matter (for 16-bit or 8-bit signed/unsigned).
+  // For the unsigned case, for some reason it does similar to GCC and does
+  // a uxtb first. It's not clear to me why that special-casing is needed.
+  //
+  // We'll go with the LLVM way for now, since it's shorter and has just as
+  // few dependencies.
+  int32_t ShiftAmount = 32 - getScalarIntBitWidth(Src0->getType());
+  assert(ShiftAmount >= 0);
+  Constant *ShiftConst = nullptr;
+  Variable *Src0R = nullptr;
+  Variable *T = makeReg(IceType_i32);
+  if (ShiftAmount) {
+    ShiftConst = Ctx->getConstantInt32(ShiftAmount);
+    Src0R = makeReg(IceType_i32);
+    _lsl(Src0R, legalizeToVar(Src0), ShiftConst);
+  } else {
+    Src0R = legalizeToVar(Src0);
+  }
+  _mov(T, Zero);
+  if (ShiftAmount) {
+    Variable *Src1R = legalizeToVar(Src1);
+    OperandARM32FlexReg *Src1RShifted = OperandARM32FlexReg::create(
+        Func, IceType_i32, Src1R, OperandARM32::LSL, ShiftConst);
+    _cmp(Src0R, Src1RShifted);
+  } else {
+    Operand *Src1RF = legalize(Src1, Legal_Reg | Legal_Flex);
+    _cmp(Src0R, Src1RF);
+  }
+  _mov_nonkillable(T, One, getIcmp32Mapping(Inst->getCondition()));
+  _mov(Dest, T);
 }
 
 void TargetARM32::lowerInsertElement(const InstInsertElement *Inst) {
@@ -986,7 +1289,7 @@
       UnimplementedError(Func->getContext()->getFlags());
     } else {
       Operand *Src0F = legalize(Src0, Legal_Reg | Legal_Flex);
-      _mov(Reg, Src0F, RegARM32::Reg_r0);
+      _mov(Reg, Src0F, CondARM32::AL, RegARM32::Reg_r0);
     }
   }
   // Add a ret instruction even if sandboxing is enabled, because
diff --git a/src/IceTargetLoweringARM32.def b/src/IceTargetLoweringARM32.def
index baeec2c..a004cb6 100644
--- a/src/IceTargetLoweringARM32.def
+++ b/src/IceTargetLoweringARM32.def
@@ -15,6 +15,19 @@
 #ifndef SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
 #define SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
 
-// TODO(jvoung): Fill out comparison tables, etc. for 32/64-bit compares.
+// Patterns for lowering icmp: one row of ARM32 condition codes per ICond.
+#define ICMPARM32_TABLE                                             \
+  /* val, is_signed, swapped64, C_32, C1_64, C2_64 */               \
+  X(Eq,   false,     false,     EQ,   EQ,    NE)                    \
+  X(Ne,   false,     false,     NE,   NE,    EQ)                    \
+  X(Ugt,  false,     false,     HI,   HI,    LS)                    \
+  X(Uge,  false,     false,     CS,   CS,    CC)                    \
+  X(Ult,  false,     false,     CC,   CC,    CS)                    \
+  X(Ule,  false,     false,     LS,   LS,    HI)                    \
+  X(Sgt,  true,      true,      GT,   LT,    GE)                    \
+  X(Sge,  true,      false,     GE,   GE,    LT)                    \
+  X(Slt,  true,      false,     LT,   LT,    GE)                    \
+  X(Sle,  true,      true,      LE,   GE,    LT)                    \
+//#define X(val, is_signed, swapped64, C_32, C1_64, C2_64)
 
 #endif // SUBZERO_SRC_ICETARGETLOWERINGARM32_DEF
diff --git a/src/IceTargetLoweringARM32.h b/src/IceTargetLoweringARM32.h
index 88a8eb1..04d5984 100644
--- a/src/IceTargetLoweringARM32.h
+++ b/src/IceTargetLoweringARM32.h
@@ -130,71 +130,119 @@
   // with minimal syntactic overhead, so that the lowering code can
   // look as close to assembly as practical.
 
-  void _add(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1));
+  void _add(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _adds(Variable *Dest, Variable *Src0, Operand *Src1) {
+  void _adds(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
     const bool SetFlags = true;
-    Context.insert(InstARM32Add::create(Func, Dest, Src0, Src1, SetFlags));
+    Context.insert(
+        InstARM32Add::create(Func, Dest, Src0, Src1, Pred, SetFlags));
   }
-  void _adc(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Adc::create(Func, Dest, Src0, Src1));
+  void _adc(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Adc::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _and(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32And::create(Func, Dest, Src0, Src1));
+  void _and(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32And::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _eor(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1));
+  void _br(CondARM32::Cond Condition, CfgNode *TargetTrue,
+           CfgNode *TargetFalse) {
+    Context.insert(
+        InstARM32Br::create(Func, TargetTrue, TargetFalse, Condition));
   }
-  void _ldr(Variable *Dest, OperandARM32Mem *Addr) {
-    Context.insert(InstARM32Ldr::create(Func, Dest, Addr));
+  void _br(CfgNode *Target) {
+    Context.insert(InstARM32Br::create(Func, Target));
   }
-  void _mla(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc) {
-    Context.insert(InstARM32Mla::create(Func, Dest, Src0, Src1, Acc));
+  void _cmp(Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Cmp::create(Func, Src0, Src1, Pred));
+  }
+  void _eor(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Eor::create(Func, Dest, Src0, Src1, Pred));
+  }
+  void _ldr(Variable *Dest, OperandARM32Mem *Addr,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Ldr::create(Func, Dest, Addr, Pred));
+  }
+  void _lsl(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Lsl::create(Func, Dest, Src0, Src1, Pred));
+  }
+  void _mla(Variable *Dest, Variable *Src0, Variable *Src1, Variable *Acc,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mla::create(Func, Dest, Src0, Src1, Acc, Pred));
   }
   // If Dest=nullptr is passed in, then a new variable is created,
   // marked as infinite register allocation weight, and returned
   // through the in/out Dest argument.
   void _mov(Variable *&Dest, Operand *Src0,
+            CondARM32::Cond Pred = CondARM32::AL,
             int32_t RegNum = Variable::NoRegister) {
     if (Dest == nullptr)
       Dest = makeReg(Src0->getType(), RegNum);
-    Context.insert(InstARM32Mov::create(Func, Dest, Src0));
+    Context.insert(InstARM32Mov::create(Func, Dest, Src0, Pred));
+  }
+  void _mov_nonkillable(Variable *Dest, Operand *Src0,
+                        CondARM32::Cond Pred = CondARM32::AL) {
+    Inst *NewInst = InstARM32Mov::create(Func, Dest, Src0, Pred);
+    NewInst->setDestNonKillable();
+    Context.insert(NewInst);
   }
   // The Operand can only be a 16-bit immediate or a ConstantRelocatable
   // (with an upper16 relocation).
-  void _movt(Variable *Dest, Operand *Src0) {
-    Context.insert(InstARM32Movt::create(Func, Dest, Src0));
+  void _movt(Variable *Dest, Operand *Src0,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Movt::create(Func, Dest, Src0, Pred));
   }
-  void _movw(Variable *Dest, Operand *Src0) {
-    Context.insert(InstARM32Movw::create(Func, Dest, Src0));
+  void _movw(Variable *Dest, Operand *Src0,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Movw::create(Func, Dest, Src0, Pred));
   }
-  void _mul(Variable *Dest, Variable *Src0, Variable *Src1) {
-    Context.insert(InstARM32Mul::create(Func, Dest, Src0, Src1));
+  void _mul(Variable *Dest, Variable *Src0, Variable *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mul::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _mvn(Variable *Dest, Operand *Src0) {
-    Context.insert(InstARM32Mvn::create(Func, Dest, Src0));
+  void _mvn(Variable *Dest, Operand *Src0,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Mvn::create(Func, Dest, Src0, Pred));
   }
-  void _orr(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1));
+  void _orr(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Orr::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1));
+  void _sbc(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred));
   }
-  void _sub(Variable *Dest, Variable *Src0, Operand *Src1) {
-    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1));
-  }
-  void _subs(Variable *Dest, Variable *Src0, Operand *Src1) {
+  void _sbcs(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
     const bool SetFlags = true;
-    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1, SetFlags));
+    Context.insert(
+        InstARM32Sbc::create(Func, Dest, Src0, Src1, Pred, SetFlags));
+  }
+  void _sub(Variable *Dest, Variable *Src0, Operand *Src1,
+            CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(InstARM32Sub::create(Func, Dest, Src0, Src1, Pred));
+  }
+  void _subs(Variable *Dest, Variable *Src0, Operand *Src1,
+             CondARM32::Cond Pred = CondARM32::AL) {
+    const bool SetFlags = true;
+    Context.insert(
+        InstARM32Sub::create(Func, Dest, Src0, Src1, Pred, SetFlags));
   }
   void _ret(Variable *LR, Variable *Src0 = nullptr) {
     Context.insert(InstARM32Ret::create(Func, LR, Src0));
   }
   void _umull(Variable *DestLo, Variable *DestHi, Variable *Src0,
-              Variable *Src1) {
-    Context.insert(InstARM32Umull::create(Func, DestLo, DestHi, Src0, Src1));
+              Variable *Src1, CondARM32::Cond Pred = CondARM32::AL) {
+    Context.insert(
+        InstARM32Umull::create(Func, DestLo, DestHi, Src0, Src1, Pred));
     // Model the modification to the second dest as a fake def.
+    // Note that the def is not predicated.
     Context.insert(InstFakeDef::create(Func, DestHi, DestLo));
   }