Compiler constant handling rework

In preparation for de-optimization, reworked the constant
handling mechanism.  Also took advantage of knowledge of
constant operands (particularly for long operations).
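
Most of the win comes from recognizing when a constant operand
can be encoded directly as a Thumb-2 modified immediate instead
of being materialized into a register.  As a reference point,
here is a minimal standalone sketch of that encodability test
(modeled on the existing ModifiedImmediate() helper in
utility_arm.cc; this re-derivation is illustrative, not the
production code):

  #include <cstdint>

  // Illustrative sketch, not the production encoder.  Returns the
  // imm12 encoding if value fits a Thumb-2 modified immediate,
  // otherwise -1.
  static int ModImm(uint32_t value) {
    uint32_t b0 = value & 0xff;
    if (value <= 0xff)
      return b0;               // 00000000 00000000 00000000 abcdefgh
    if (value == ((b0 << 16) | b0))
      return (0x1 << 8) | b0;  // 00000000 abcdefgh 00000000 abcdefgh
    if (value == ((b0 << 24) | (b0 << 16) | (b0 << 8) | b0))
      return (0x3 << 8) | b0;  // abcdefgh abcdefgh abcdefgh abcdefgh
    b0 = (value >> 8) & 0xff;
    if (value == ((b0 << 24) | (b0 << 8)))
      return (0x2 << 8) | b0;  // abcdefgh 00000000 abcdefgh 00000000
    // Rotated form: needs a run of eight or fewer significant bits.
    int z_leading = __builtin_clz(value);   // value > 0xff, so nonzero
    int z_trailing = __builtin_ctz(value);
    if ((z_leading + z_trailing) < 24)
      return -1;
    value <<= (z_leading + 1);  // left-justify, drop the known-1 msb
    value >>= 25;               // keep bcdefgh
    return value | ((0x8 + z_leading) << 7);  // rot[11:7]:bcdefgh[6:0]
  }

A 64-bit operation against a constant can then be emitted with
immediate forms whenever both halves pass this test (or its
bitwise-complement form, as InexpensiveConstantInt does),
avoiding a literal-pool load; see GenArithImmOpLong below.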

Significant performance improvements for Mandelbrot
(~60 seconds to ~34 seconds).  Minor improvements in other
benchmarks.

The new constant handling breaks two of the existing
optimization passes: "Skip Large Method" and "Load/Store
Elimination."

I don't intend to update the large method optimization
because it will be superseded by the upcoming interpreter/
fingerprinting mechanism.  The code is left in place for
now in order to compare compile-time improvements with
fingerprinting/interpret.  All related code will be deleted
when that work is complete.

The load/store elimination pass needs some rework to handle
the new multiple-register loads and stores.  It will be
updated and restored in a future CL.
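
To make the breakage concrete: the hazard is that the new paired
memory operations define or use two core registers at once.  A
hypothetical illustration (the code below is made up for
illustration, not taken from this CL):

  // A pass that records a single def per load now misses the second
  // register of a paired op:
  //
  //   ldrd r0, r1, [sp, #16]   // kThumb2LdrdI8: defines BOTH r0 and r1
  //   str  r1, [sp, #20]       // tracking only operands[0] never saw
  //                            // the def of r1, so eliminating or
  //                            // reordering around it is unsound
  //
  // The rework needs to honor the pair semantics that the ENCODING_MAP
  // flags (REG_DEF0 | REG_DEF1, REG_USE0 | REG_USE1) already describe.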

Change-Id: Ia979abaf51b8ae81bbb0428031cbcea854625fac
diff --git a/src/compiler/codegen/arm/arm_lir.h b/src/compiler/codegen/arm/arm_lir.h
index 3fc8792..c41f53b 100644
--- a/src/compiler/codegen/arm/arm_lir.h
+++ b/src/compiler/codegen/arm/arm_lir.h
@@ -371,7 +371,7 @@
   kThumb2StrbRRI12,  // strb rt,[rn,#imm12] [111110001000] rt[15..12] rn[19..16] imm12[11..0].
   kThumb2Pop,        // pop   [1110100010111101] list[15-0]*/
   kThumb2Push,       // push  [1110100100101101] list[15-0]*/
-  kThumb2CmpRI8,     // cmp rn, #<const> [11110] i [011011] rn[19-16] [0] imm3 [1111] imm8[7..0].
+  kThumb2CmpRI12,    // cmp rn, #<const> [11110] i [011011] rn[19-16] [0] imm3 [1111] imm8[7..0].
   kThumb2AdcRRR,     // adc [111010110101] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
   kThumb2AndRRR,     // and [111010100000] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
   kThumb2BicRRR,     // bic [111010100010] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
@@ -445,6 +445,9 @@
   kThumb2Pop1,       // t3 encoding of pop.
   kThumb2RsubRRR,    // rsb [111010111101] rn[19..16] [0000] rd[11..8] [0000] rm[3..0].
   kThumb2Smull,      // smull [111110111000] rn[19-16], rdlo[15-12] rdhi[11-8] [0000] rm[3-0].
+  kThumb2LdrdPcRel8, // ldrd rt, rt2, pc +-/1024.
+  kThumb2LdrdI8,     // ldrd rt, rt2, [rn +-/1024].
+  kThumb2StrdI8,     // strd rt, rt2, [rn +-/1024].
   kArmLast,
 };
 
diff --git a/src/compiler/codegen/arm/assemble_arm.cc b/src/compiler/codegen/arm/assemble_arm.cc
index 91f25d6..455ea67 100644
--- a/src/compiler/codegen/arm/assemble_arm.cc
+++ b/src/compiler/codegen/arm/assemble_arm.cc
@@ -646,7 +646,7 @@
                  kFmtUnused, -1, -1,
                  IS_UNARY_OP | REG_DEF_SP | REG_USE_SP | REG_USE_LIST0
                  | IS_STORE | NEEDS_FIXUP, "push", "<!0R>", 4),
-    ENCODING_MAP(kThumb2CmpRI8, 0xf1b00f00,
+    ENCODING_MAP(kThumb2CmpRI12, 0xf1b00f00,
                  kFmtBitBlt, 19, 16, kFmtModImm, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
                  IS_BINARY_OP | REG_USE0 | SETS_CCODES,
@@ -917,8 +917,8 @@
                  "b", "!0t", 4),
     ENCODING_MAP(kThumb2MovImm16H,       0xf2c00000,
                  kFmtBitBlt, 11, 8, kFmtImm16, -1, -1, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0,
-                 "movh", "!0C, #!1M", 4),
+                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0 | REG_USE0,
+                 "movt", "!0C, #!1M", 4),
     ENCODING_MAP(kThumb2AddPCR,      0x4487,
                  kFmtBitBlt, 6, 3, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
@@ -936,8 +936,8 @@
                  "mov", "!0C, #!1M", 4),
     ENCODING_MAP(kThumb2MovImm16HST,     0xf2c00000,
                  kFmtBitBlt, 11, 8, kFmtImm16, -1, -1, kFmtUnused, -1, -1,
-                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0 | NEEDS_FIXUP,
-                 "movh", "!0C, #!1M", 4),
+                 kFmtUnused, -1, -1, IS_BINARY_OP | REG_DEF0 | REG_USE0 | NEEDS_FIXUP,
+                 "movt", "!0C, #!1M", 4),
     ENCODING_MAP(kThumb2LdmiaWB,         0xe8b00000,
                  kFmtBitBlt, 19, 16, kFmtBitBlt, 15, 0, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1,
@@ -972,7 +972,21 @@
                  kFmtBitBlt, 3, 0,
                  IS_QUAD_OP | REG_DEF0 | REG_DEF1 | REG_USE2 | REG_USE3,
                  "smull", "!0C, !1C, !2C, !3C", 4),
-
+    ENCODING_MAP(kThumb2LdrdPcRel8,  0xe9df0000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 7, 0,
+                 kFmtUnused, -1, -1,
+                 IS_TERTIARY_OP | REG_DEF0 | REG_DEF1 | REG_USE_PC | IS_LOAD | NEEDS_FIXUP,
+                 "ldrd", "!0C, !1C, [pc, #!2E]", 4),
+    ENCODING_MAP(kThumb2LdrdI8, 0xe9d00000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
+                 kFmtBitBlt, 7, 0,
+                 IS_QUAD_OP | REG_DEF0 | REG_DEF1 | REG_USE2 | IS_LOAD,
+                 "ldrd", "!0C, !1C, [!2C, #!3E]", 4),
+    ENCODING_MAP(kThumb2StrdI8, 0xe9c00000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 8, kFmtBitBlt, 19, 16,
+                 kFmtBitBlt, 7, 0,
+                 IS_QUAD_OP | REG_USE0 | REG_USE1 | REG_USE2 | IS_STORE,
+                 "strd", "!0C, !1C, [!2C, #!3E]", 4),
 };
 
 /*
@@ -1023,13 +1037,14 @@
       if (lir->opcode == kThumbLdrPcRel ||
           lir->opcode == kThumb2LdrPcRel12 ||
           lir->opcode == kThumbAddPcRel ||
+          lir->opcode == kThumb2LdrdPcRel8 ||
           ((lir->opcode == kThumb2Vldrd) && (lir->operands[1] == r15pc)) ||
           ((lir->opcode == kThumb2Vldrs) && (lir->operands[1] == r15pc))) {
         /*
          * PC-relative loads are mostly used to load immediates
          * that are too large to materialize directly in one shot.
          * However, if the load displacement exceeds the limit,
-         * we revert to a 2-instruction materialization sequence.
+         * we revert to a multiple-instruction materialization sequence.
          */
         LIR *lir_target = lir->target;
         uintptr_t pc = (lir->offset + 4) & ~3;
@@ -1044,8 +1059,9 @@
           // Shouldn't happen in current codegen.
           LOG(FATAL) << "Unexpected pc-rel offset " << delta;
         }
-        // Now, check for the two difficult cases
+        // Now, check for the difficult cases
         if (((lir->opcode == kThumb2LdrPcRel12) && (delta > 4091)) ||
+            ((lir->opcode == kThumb2LdrdPcRel8) && (delta > 1020)) ||
             ((lir->opcode == kThumb2Vldrs) && (delta > 1020)) ||
             ((lir->opcode == kThumb2Vldrd) && (delta > 1020))) {
           /*
@@ -1053,26 +1069,34 @@
            * vldrs/vldrd we include REG_DEF_LR in the resource
            * masks for these instructions.
            */
-          int base_reg = (lir->opcode == kThumb2LdrPcRel12) ?
-            lir->operands[0] : rARM_LR;
+          int base_reg = ((lir->opcode == kThumb2LdrdPcRel8) || (lir->opcode == kThumb2LdrPcRel12))
+              ?  lir->operands[0] : rARM_LR;
 
-          // Add new Adr to generate the address
+          // Add new Adr to generate the address.
           LIR* new_adr = RawLIR(cu, lir->dalvik_offset, kThumb2Adr,
                      base_reg, 0, 0, 0, 0, lir->target);
           InsertLIRBefore(lir, new_adr);
 
-          // Convert to normal load
+          // Convert to normal load.
           if (lir->opcode == kThumb2LdrPcRel12) {
             lir->opcode = kThumb2LdrRRI12;
+          } else if (lir->opcode == kThumb2LdrdPcRel8) {
+            lir->opcode = kThumb2LdrdI8;
           }
-          // Change the load to be relative to the new Adr base
-          lir->operands[1] = base_reg;
-          lir->operands[2] = 0;
+          // Change the load to be relative to the new Adr base.
+          if (lir->opcode == kThumb2LdrdI8) {
+            lir->operands[3] = 0;
+            lir->operands[2] = base_reg;
+          } else {
+            lir->operands[2] = 0;
+            lir->operands[1] = base_reg;
+          }
           SetupResourceMasks(cu, lir);
           res = kRetryAll;
         } else {
           if ((lir->opcode == kThumb2Vldrs) ||
-              (lir->opcode == kThumb2Vldrd)) {
+              (lir->opcode == kThumb2Vldrd) ||
+              (lir->opcode == kThumb2LdrdPcRel8)) {
             lir->operands[2] = delta >> 2;
           } else {
             lir->operands[1] = (lir->opcode == kThumb2LdrPcRel12) ?  delta :
diff --git a/src/compiler/codegen/arm/codegen_arm.h b/src/compiler/codegen/arm/codegen_arm.h
index ea34ff2..4dadd6c 100644
--- a/src/compiler/codegen/arm/codegen_arm.h
+++ b/src/compiler/codegen/arm/codegen_arm.h
@@ -37,8 +37,7 @@
                                      int displacement, int r_dest, int r_dest_hi, OpSize size,
                                      int s_reg);
     virtual LIR* LoadConstantNoClobber(CompilationUnit* cu, int r_dest, int value);
-    virtual LIR* LoadConstantValueWide(CompilationUnit* cu, int r_dest_lo, int r_dest_hi,
-                                       int val_lo, int val_hi);
+    virtual LIR* LoadConstantWide(CompilationUnit* cu, int r_dest_lo, int r_dest_hi, int64_t value);
     virtual LIR* StoreBaseDisp(CompilationUnit* cu, int rBase, int displacement, int r_src,
                                OpSize size);
     virtual LIR* StoreBaseDispWide(CompilationUnit* cu, int rBase, int displacement, int r_src_lo,
@@ -89,12 +88,18 @@
     virtual bool IsUnconditionalBranch(LIR* lir);
 
     // Required for target - Dalvik-level generators.
+    virtual bool GenArithImmOpLong(CompilationUnit* cu, Instruction::Code opcode, RegLocation rl_dest,
+                                   RegLocation rl_src1, RegLocation rl_src2);
     virtual void GenArrayObjPut(CompilationUnit* cu, int opt_flags, RegLocation rl_array,
                                 RegLocation rl_index, RegLocation rl_src, int scale);
     virtual void GenArrayGet(CompilationUnit* cu, int opt_flags, OpSize size, RegLocation rl_array,
                              RegLocation rl_index, RegLocation rl_dest, int scale);
     virtual void GenArrayPut(CompilationUnit* cu, int opt_flags, OpSize size, RegLocation rl_array,
                              RegLocation rl_index, RegLocation rl_src, int scale);
+    virtual bool GenShiftImmOpLong(CompilationUnit* cu, Instruction::Code opcode,
+                                   RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_shift);
+    virtual void GenMulLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
+                            RegLocation rl_src2);
     virtual bool GenAddLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2);
     virtual bool GenAndLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
@@ -197,7 +202,14 @@
     static int EncodeShift(int code, int amount);
     static int ModifiedImmediate(uint32_t value);
     static ArmConditionCode ArmConditionEncoding(ConditionCode code);
-    bool InexpensiveConstant(int reg, int value);
+    bool InexpensiveConstantInt(int32_t value);
+    bool InexpensiveConstantFloat(int32_t value);
+    bool InexpensiveConstantLong(int64_t value);
+    bool InexpensiveConstantDouble(int64_t value);
+
+  private:
+    void GenFusedLongCmpImmBranch(CompilationUnit* cu, BasicBlock* bb, RegLocation rl_src1,
+                                  int64_t val, ConditionCode ccode);
 };
 
 }  // namespace art
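
For reference, the decomposition that the new
GenFusedLongCmpImmBranch performs (see int_arm.cc below) can be
stated in plain C++: a signed high-word compare decides unless the
high words tie, in which case an unsigned low-word compare decides.
An illustrative restatement, not code from the patch:

  #include <cstdint>

  // Signed 64-bit x < val, split across 32-bit halves (sketch).
  static bool LtLong(int32_t hi, uint32_t lo,
                     int32_t val_hi, uint32_t val_lo) {
    if (hi != val_hi)
      return hi < val_hi;  // signed compare decides (kCondLt / kCondGt)
    return lo < val_lo;    // tie: unsigned low-word compare (kCondCc)
  }

The same pattern explains the other ccode rewrites in the switch:
kCondLe ends in kCondLs, kCondGt in kCondHi, and kCondGe in kCondCs.
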
diff --git a/src/compiler/codegen/arm/int_arm.cc b/src/compiler/codegen/arm/int_arm.cc
index fcf74f1..5a9786c 100644
--- a/src/compiler/codegen/arm/int_arm.cc
+++ b/src/compiler/codegen/arm/int_arm.cc
@@ -121,16 +121,81 @@
   branch3->target = branch1->target;
 }
 
-void ArmCodegen::GenFusedLongCmpBranch(CompilationUnit* cu, BasicBlock* bb, MIR* mir)
+void ArmCodegen::GenFusedLongCmpImmBranch(CompilationUnit* cu, BasicBlock* bb, RegLocation rl_src1,
+                                          int64_t val, ConditionCode ccode)
 {
+  int32_t val_lo = Low32Bits(val);
+  int32_t val_hi = High32Bits(val);
+  DCHECK(ModifiedImmediate(val_lo) >= 0);
+  DCHECK(ModifiedImmediate(val_hi) >= 0);
   LIR* label_list = cu->block_label_list;
   LIR* taken = &label_list[bb->taken->id];
   LIR* not_taken = &label_list[bb->fall_through->id];
+  rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
+  int32_t low_reg = rl_src1.low_reg;
+  int32_t high_reg = rl_src1.high_reg;
+
+  switch(ccode) {
+    case kCondEq:
+      OpCmpImmBranch(cu, kCondNe, high_reg, val_hi, not_taken);
+      break;
+    case kCondNe:
+      OpCmpImmBranch(cu, kCondNe, high_reg, val_hi, taken);
+      break;
+    case kCondLt:
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, not_taken);
+      ccode = kCondCc;
+      break;
+    case kCondLe:
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, not_taken);
+      ccode = kCondLs;
+      break;
+    case kCondGt:
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, not_taken);
+      ccode = kCondHi;
+      break;
+    case kCondGe:
+      OpCmpImmBranch(cu, kCondGt, high_reg, val_hi, taken);
+      OpCmpImmBranch(cu, kCondLt, high_reg, val_hi, not_taken);
+      ccode = kCondCs;
+      break;
+    default:
+      LOG(FATAL) << "Unexpected ccode: " << ccode;
+  }
+  OpCmpImmBranch(cu, ccode, low_reg, val_lo, taken);
+}
+
+
+void ArmCodegen::GenFusedLongCmpBranch(CompilationUnit* cu, BasicBlock* bb, MIR* mir)
+{
   RegLocation rl_src1 = GetSrcWide(cu, mir, 0);
   RegLocation rl_src2 = GetSrcWide(cu, mir, 2);
+  // Normalize such that if either operand is constant, src2 will be constant.
+  ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
+  if (rl_src1.is_const) {
+    RegLocation rl_temp = rl_src1;
+    rl_src1 = rl_src2;
+    rl_src2 = rl_temp;
+    ccode = FlipComparisonOrder(ccode);
+  }
+  if (rl_src2.is_const) {
+    RegLocation rl_temp = UpdateLocWide(cu, rl_src2);
+    // Do special compare/branch against simple const operand if not already in registers.
+    int64_t val = ConstantValueWide(cu, rl_src2);
+    if ((rl_temp.location != kLocPhysReg) &&
+        ((ModifiedImmediate(Low32Bits(val)) >= 0) && (ModifiedImmediate(High32Bits(val)) >= 0))) {
+      GenFusedLongCmpImmBranch(cu, bb, rl_src1, val, ccode);
+      return;
+    }
+  }
+  LIR* label_list = cu->block_label_list;
+  LIR* taken = &label_list[bb->taken->id];
+  LIR* not_taken = &label_list[bb->fall_through->id];
   rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
   rl_src2 = LoadValueWide(cu, rl_src2, kCoreReg);
-  ConditionCode ccode = static_cast<ConditionCode>(mir->dalvikInsn.arg[0]);
   OpRegReg(cu, kOpCmp, rl_src1.high_reg, rl_src2.high_reg);
   switch(ccode) {
     case kCondEq:
@@ -185,7 +250,7 @@
     if (ARM_LOWREG(reg) && ((check_value & 0xff) == check_value)) {
       NewLIR2(cu, kThumbCmpRI8, reg, check_value);
     } else if (mod_imm >= 0) {
-      NewLIR2(cu, kThumb2CmpRI8, reg, mod_imm);
+      NewLIR2(cu, kThumb2CmpRI12, reg, mod_imm);
     } else {
       int t_reg = AllocTemp(cu);
       LoadConstant(cu, t_reg, check_value);
@@ -523,6 +588,93 @@
   return false;
 }
 
+
+ /*
+  * Check to see if a result pair has a misaligned overlap with an operand pair.  This
+  * is not usual for dx to generate, but it is legal (for now).  In a future rev of
+  * dex, we'll want to make this case illegal.
+  */
+static bool BadOverlap(CompilationUnit* cu, RegLocation rl_src, RegLocation rl_dest)
+{
+  DCHECK(rl_src.wide);
+  DCHECK(rl_dest.wide);
+  return (abs(SRegToVReg(cu, rl_src.s_reg_low) - SRegToVReg(cu, rl_dest.s_reg_low)) == 1);
+}
+
+void ArmCodegen::GenMulLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
+                            RegLocation rl_src2)
+{
+    /*
+     * To pull off inline multiply, we have a worst-case requirement of 8 temporary
+     * registers.  Normally for Arm, we get 5.  We can get to 6 by including
+     * lr in the temp set.  The only problematic case is all operands and result are
+     * distinct, and none have been promoted.  In that case, we can succeed by aggressively
+     * freeing operand temp registers after they are no longer needed.  All other cases
+     * can proceed normally.  We'll just punt on the case of the result having a misaligned
+     * overlap with either operand and send that case to a runtime handler.
+     */
+    RegLocation rl_result;
+    if (BadOverlap(cu, rl_src1, rl_dest) || (BadOverlap(cu, rl_src2, rl_dest))) {
+      int func_offset = ENTRYPOINT_OFFSET(pLmul);
+      FlushAllRegs(cu);
+      CallRuntimeHelperRegLocationRegLocation(cu, func_offset, rl_src1, rl_src2, false);
+      rl_result = GetReturnWide(cu, false);
+      StoreValueWide(cu, rl_dest, rl_result);
+      return;
+    }
+    // Temporarily add LR to the temp pool, and assign it to tmp1
+    MarkTemp(cu, rARM_LR);
+    FreeTemp(cu, rARM_LR);
+    int tmp1 = rARM_LR;
+    LockTemp(cu, rARM_LR);
+
+    rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
+    rl_src2 = LoadValueWide(cu, rl_src2, kCoreReg);
+
+    bool special_case = true;
+    // If operands are the same, or any pair has been promoted we're not the special case.
+    if ((rl_src1.s_reg_low == rl_src2.s_reg_low) ||
+        (!IsTemp(cu, rl_src1.low_reg) && !IsTemp(cu, rl_src1.high_reg)) ||
+        (!IsTemp(cu, rl_src2.low_reg) && !IsTemp(cu, rl_src2.high_reg))) {
+      special_case = false;
+    }
+    // Tuning: if rl_dest has been promoted and is *not* either operand, could use directly.
+    int res_lo = AllocTemp(cu);
+    int res_hi;
+    if (rl_src1.low_reg == rl_src2.low_reg) {
+      res_hi = AllocTemp(cu);
+      NewLIR3(cu, kThumb2MulRRR, tmp1, rl_src1.low_reg, rl_src1.high_reg);
+      NewLIR4(cu, kThumb2Umull, res_lo, res_hi, rl_src1.low_reg, rl_src1.low_reg);
+      OpRegRegRegShift(cu, kOpAdd, res_hi, res_hi, tmp1, EncodeShift(kArmLsl, 1));
+    } else {
+      // In the special case, all temps are now allocated
+      NewLIR3(cu, kThumb2MulRRR, tmp1, rl_src2.low_reg, rl_src1.high_reg);
+      if (special_case) {
+        DCHECK_NE(rl_src1.low_reg, rl_src2.low_reg);
+        DCHECK_NE(rl_src1.high_reg, rl_src2.high_reg);
+        FreeTemp(cu, rl_src1.high_reg);
+      }
+      res_hi = AllocTemp(cu);
+
+      NewLIR4(cu, kThumb2Umull, res_lo, res_hi, rl_src2.low_reg, rl_src1.low_reg);
+      NewLIR4(cu, kThumb2Mla, tmp1, rl_src1.low_reg, rl_src2.high_reg, tmp1);
+      NewLIR4(cu, kThumb2AddRRR, res_hi, tmp1, res_hi, 0);
+      if (special_case) {
+        FreeTemp(cu, rl_src1.low_reg);
+        Clobber(cu, rl_src1.low_reg);
+        Clobber(cu, rl_src1.high_reg);
+      }
+    }
+    FreeTemp(cu, tmp1);
+    rl_result = GetReturnWide(cu, false); // Just using as a template.
+    rl_result.low_reg = res_lo;
+    rl_result.high_reg = res_hi;
+    StoreValueWide(cu, rl_dest, rl_result);
+    // Now, restore lr to its non-temp status.
+    Clobber(cu, rARM_LR);
+    UnmarkTemp(cu, rARM_LR);
+}
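
The instruction selection above follows the standard 64x64->64
multiply decomposition; as a sanity check (illustrative note, not
CL code):

  // (2^32*hi1 + lo1) * (2^32*hi2 + lo2) mod 2^64
  //   = lo1*lo2 + 2^32 * (lo1*hi2 + hi1*lo2)
  // mapped onto the emitted sequence:
  //   tmp1            = lo2 * hi1;   // mul
  //   {res_hi,res_lo} = lo1 * lo2;   // umull (full 64-bit product)
  //   tmp1           += lo1 * hi2;   // mla
  //   res_hi         += tmp1;        // add
  // The squaring path folds the two equal cross terms into one
  // multiply plus a shift-by-1 add.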
+
 bool ArmCodegen::GenAddLong(CompilationUnit* cu, RegLocation rl_dest, RegLocation rl_src1,
                             RegLocation rl_src2)
 {
@@ -568,8 +720,11 @@
   int len_offset = mirror::Array::LengthOffset().Int32Value();
   int data_offset;
   RegLocation rl_result;
+  bool constant_index = rl_index.is_const;
   rl_array = LoadValue(cu, rl_array, kCoreReg);
-  rl_index = LoadValue(cu, rl_index, kCoreReg);
+  if (!constant_index) {
+    rl_index = LoadValue(cu, rl_index, kCoreReg);
+  }
 
   if (rl_dest.wide) {
     data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Int32Value();
@@ -577,6 +732,11 @@
     data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Int32Value();
   }
 
+  // If index is constant, just fold it into the data offset
+  if (constant_index) {
+    data_offset += ConstantValue(cu, rl_index) << scale;
+  }
+
   /* null object? */
   GenNullCheck(cu, rl_array.s_reg_low, rl_array.low_reg, opt_flags);
 
@@ -587,27 +747,38 @@
     /* Get len */
     LoadWordDisp(cu, rl_array.low_reg, len_offset, reg_len);
   }
-  if (rl_dest.wide || rl_dest.fp) {
-    // No special indexed operation, lea + load w/ displacement
-    int reg_ptr = AllocTemp(cu);
-    OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
-                     EncodeShift(kArmLsl, scale));
-    FreeTemp(cu, rl_index.low_reg);
+  if (rl_dest.wide || rl_dest.fp || constant_index) {
+    int reg_ptr;
+    if (constant_index) {
+      reg_ptr = rl_array.low_reg;  // NOTE: must not alter reg_ptr in constant case.
+    } else {
+      // No special indexed operation, lea + load w/ displacement
+      reg_ptr = AllocTemp(cu);
+      OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
+                       EncodeShift(kArmLsl, scale));
+      FreeTemp(cu, rl_index.low_reg);
+    }
     rl_result = EvalLoc(cu, rl_dest, reg_class, true);
 
     if (needs_range_check) {
-      // TODO: change kCondCS to a more meaningful name, is the sense of
-      // carry-set/clear flipped?
-      GenRegRegCheck(cu, kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      if (constant_index) {
+        GenImmedCheck(cu, kCondLs, reg_len, ConstantValue(cu, rl_index), kThrowConstantArrayBounds);
+      } else {
+        GenRegRegCheck(cu, kCondLs, reg_len, rl_index.low_reg, kThrowArrayBounds);
+      }
       FreeTemp(cu, reg_len);
     }
     if (rl_dest.wide) {
       LoadBaseDispWide(cu, reg_ptr, data_offset, rl_result.low_reg, rl_result.high_reg, INVALID_SREG);
-      FreeTemp(cu, reg_ptr);
+      if (!constant_index) {
+        FreeTemp(cu, reg_ptr);
+      }
       StoreValueWide(cu, rl_dest, rl_result);
     } else {
       LoadBaseDisp(cu, reg_ptr, data_offset, rl_result.low_reg, size, INVALID_SREG);
-      FreeTemp(cu, reg_ptr);
+      if (!constant_index) {
+        FreeTemp(cu, reg_ptr);
+      }
       StoreValue(cu, rl_dest, rl_result);
     }
   } else {
@@ -639,17 +810,28 @@
   RegisterClass reg_class = oat_reg_class_by_size(size);
   int len_offset = mirror::Array::LengthOffset().Int32Value();
   int data_offset;
+  bool constant_index = rl_index.is_const;
 
-  if (size == kLong || size == kDouble) {
+  if (rl_src.wide) {
     data_offset = mirror::Array::DataOffset(sizeof(int64_t)).Int32Value();
   } else {
     data_offset = mirror::Array::DataOffset(sizeof(int32_t)).Int32Value();
   }
 
+  // If index is constant, just fold it into the data offset.
+  if (constant_index) {
+    data_offset += ConstantValue(cu, rl_index) << scale;
+  }
+
   rl_array = LoadValue(cu, rl_array, kCoreReg);
-  rl_index = LoadValue(cu, rl_index, kCoreReg);
-  int reg_ptr = INVALID_REG;
-  if (IsTemp(cu, rl_array.low_reg)) {
+  if (!constant_index) {
+    rl_index = LoadValue(cu, rl_index, kCoreReg);
+  }
+
+  int reg_ptr;
+  if (constant_index) {
+    reg_ptr = rl_array.low_reg;
+  } else if (IsTemp(cu, rl_array.low_reg)) {
     Clobber(cu, rl_array.low_reg);
     reg_ptr = rl_array.low_reg;
   } else {
@@ -668,18 +850,25 @@
     LoadWordDisp(cu, rl_array.low_reg, len_offset, reg_len);
   }
   /* at this point, reg_ptr points to array, 2 live temps */
-  if (rl_src.wide || rl_src.fp) {
+  if (rl_src.wide || rl_src.fp || constant_index) {
     if (rl_src.wide) {
       rl_src = LoadValueWide(cu, rl_src, reg_class);
     } else {
       rl_src = LoadValue(cu, rl_src, reg_class);
     }
-    OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
-                     EncodeShift(kArmLsl, scale));
+    if (!constant_index) {
+      OpRegRegRegShift(cu, kOpAdd, reg_ptr, rl_array.low_reg, rl_index.low_reg,
+                       EncodeShift(kArmLsl, scale));
+    }
     if (needs_range_check) {
-      GenRegRegCheck(cu, kCondCs, rl_index.low_reg, reg_len, kThrowArrayBounds);
+      if (constant_index) {
+        GenImmedCheck(cu, kCondLs, reg_len, ConstantValue(cu, rl_index), kThrowConstantArrayBounds);
+      } else {
+        GenRegRegCheck(cu, kCondLs, reg_len, rl_index.low_reg, kThrowArrayBounds);
+      }
       FreeTemp(cu, reg_len);
     }
+
     if (rl_src.wide) {
       StoreBaseDispWide(cu, reg_ptr, data_offset, rl_src.low_reg, rl_src.high_reg);
     } else {
@@ -696,7 +885,9 @@
     StoreBaseIndexed(cu, reg_ptr, rl_index.low_reg, rl_src.low_reg,
                      scale, size);
   }
-  FreeTemp(cu, reg_ptr);
+  if (!constant_index) {
+    FreeTemp(cu, reg_ptr);
+  }
 }
 
 /*
@@ -758,4 +949,163 @@
   MarkGCCard(cu, r_value, r_array);
 }
 
+bool ArmCodegen::GenShiftImmOpLong(CompilationUnit* cu, Instruction::Code opcode,
+                                   RegLocation rl_dest, RegLocation rl_src, RegLocation rl_shift)
+{
+  rl_src = LoadValueWide(cu, rl_src, kCoreReg);
+  // Per spec, we only care about low 6 bits of shift amount.
+  int shift_amount = ConstantValue(cu, rl_shift) & 0x3f;
+  if (shift_amount == 0) {
+    StoreValueWide(cu, rl_dest, rl_src);
+    return false; // TODO: remove useless bool return result.
+  }
+  if (BadOverlap(cu, rl_src, rl_dest)) {
+    return GenShiftOpLong(cu, opcode, rl_dest, rl_src, rl_shift);
+  }
+  RegLocation rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
+  switch(opcode) {
+    case Instruction::SHL_LONG:
+    case Instruction::SHL_LONG_2ADDR:
+      if (shift_amount == 1) {
+        OpRegRegReg(cu, kOpAdd, rl_result.low_reg, rl_src.low_reg, rl_src.low_reg);
+        OpRegRegReg(cu, kOpAdc, rl_result.high_reg, rl_src.high_reg, rl_src.high_reg);
+      } else if (shift_amount == 32) {
+        OpRegCopy(cu, rl_result.high_reg, rl_src.low_reg);
+        LoadConstant(cu, rl_result.low_reg, 0);
+      } else if (shift_amount > 31) {
+        OpRegRegImm(cu, kOpLsl, rl_result.high_reg, rl_src.low_reg, shift_amount - 32);
+        LoadConstant(cu, rl_result.low_reg, 0);
+      } else {
+        OpRegRegImm(cu, kOpLsl, rl_result.high_reg, rl_src.high_reg, shift_amount);
+        OpRegRegRegShift(cu, kOpOr, rl_result.high_reg, rl_result.high_reg, rl_src.low_reg,
+                         EncodeShift(kArmLsr, 32 - shift_amount));
+        OpRegRegImm(cu, kOpLsl, rl_result.low_reg, rl_src.low_reg, shift_amount);
+      }
+      break;
+    case Instruction::SHR_LONG:
+    case Instruction::SHR_LONG_2ADDR:
+      if (shift_amount == 32) {
+        OpRegCopy(cu, rl_result.low_reg, rl_src.high_reg);
+        OpRegRegImm(cu, kOpAsr, rl_result.high_reg, rl_src.high_reg, 31);
+      } else if (shift_amount > 31) {
+        OpRegRegImm(cu, kOpAsr, rl_result.low_reg, rl_src.high_reg, shift_amount - 32);
+        OpRegRegImm(cu, kOpAsr, rl_result.high_reg, rl_src.high_reg, 31);
+      } else {
+        int t_reg = AllocTemp(cu);
+        OpRegRegImm(cu, kOpLsr, t_reg, rl_src.low_reg, shift_amount);
+        OpRegRegRegShift(cu, kOpOr, rl_result.low_reg, t_reg, rl_src.high_reg,
+                         EncodeShift(kArmLsl, 32 - shift_amount));
+        FreeTemp(cu, t_reg);
+        OpRegRegImm(cu, kOpAsr, rl_result.high_reg, rl_src.high_reg, shift_amount);
+      }
+      break;
+    case Instruction::USHR_LONG:
+    case Instruction::USHR_LONG_2ADDR:
+      if (shift_amount == 32) {
+        OpRegCopy(cu, rl_result.low_reg, rl_src.high_reg);
+        LoadConstant(cu, rl_result.high_reg, 0);
+      } else if (shift_amount > 31) {
+        OpRegRegImm(cu, kOpLsr, rl_result.low_reg, rl_src.high_reg, shift_amount - 32);
+        LoadConstant(cu, rl_result.high_reg, 0);
+      } else {
+        int t_reg = AllocTemp(cu);
+        OpRegRegImm(cu, kOpLsr, t_reg, rl_src.low_reg, shift_amount);
+        OpRegRegRegShift(cu, kOpOr, rl_result.low_reg, t_reg, rl_src.high_reg,
+                         EncodeShift(kArmLsl, 32 - shift_amount));
+        FreeTemp(cu, t_reg);
+        OpRegRegImm(cu, kOpLsr, rl_result.high_reg, rl_src.high_reg, shift_amount);
+      }
+      break;
+    default:
+      LOG(FATAL) << "Unexpected case";
+      return true;
+  }
+  StoreValueWide(cu, rl_dest, rl_result);
+  return false;
+}
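
The three shift lowerings above match the usual two-register
decomposition; a plain C++ restatement of the SHL_LONG case for
0 < n < 32 (illustrative sketch only):

  #include <cstdint>

  // lsl #n on the pair {hi,lo}:
  static inline void ShlLong(uint32_t lo, uint32_t hi, int n,
                             uint32_t* out_lo, uint32_t* out_hi) {
    *out_hi = (hi << n) | (lo >> (32 - n));  // lsl #n, orr w/ lsr #(32-n)
    *out_lo = lo << n;                       // lsl #n
    // n == 32 copies lo to hi; n in (32,63] uses lsl #(n-32) on lo.
  }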
+
+bool ArmCodegen::GenArithImmOpLong(CompilationUnit* cu, Instruction::Code opcode,
+                                   RegLocation rl_dest, RegLocation rl_src1, RegLocation rl_src2)
+{
+  if ((opcode == Instruction::SUB_LONG_2ADDR) || (opcode == Instruction::SUB_LONG)) {
+    if (!rl_src2.is_const) {
+      // Don't bother with special handling for subtract from immediate.
+      return GenArithOpLong(cu, opcode, rl_dest, rl_src1, rl_src2);
+    }
+  } else {
+    // Normalize
+    if (!rl_src2.is_const) {
+      DCHECK(rl_src1.is_const);
+      RegLocation rl_temp = rl_src1;
+      rl_src1 = rl_src2;
+      rl_src2 = rl_temp;
+    }
+  }
+  if (BadOverlap(cu, rl_src1, rl_dest)) {
+    return GenArithOpLong(cu, opcode, rl_dest, rl_src1, rl_src2);
+  }
+  DCHECK(rl_src2.is_const);
+  int64_t val = ConstantValueWide(cu, rl_src2);
+  uint32_t val_lo = Low32Bits(val);
+  uint32_t val_hi = High32Bits(val);
+  int32_t mod_imm_lo = ModifiedImmediate(val_lo);
+  int32_t mod_imm_hi = ModifiedImmediate(val_hi);
+
+  // Only a subset of add/sub immediate instructions set carry - so bail if we don't fit
+  switch(opcode) {
+    case Instruction::ADD_LONG:
+    case Instruction::ADD_LONG_2ADDR:
+    case Instruction::SUB_LONG:
+    case Instruction::SUB_LONG_2ADDR:
+      if ((mod_imm_lo < 0) || (mod_imm_hi < 0)) {
+        return GenArithOpLong(cu, opcode, rl_dest, rl_src1, rl_src2);
+      }
+      break;
+    default:
+      break;
+  }
+  rl_src1 = LoadValueWide(cu, rl_src1, kCoreReg);
+  RegLocation rl_result = EvalLoc(cu, rl_dest, kCoreReg, true);
+  // NOTE: once we've done the EvalLoc on dest, we can no longer bail.
+  switch (opcode) {
+    case Instruction::ADD_LONG:
+    case Instruction::ADD_LONG_2ADDR:
+      NewLIR3(cu, kThumb2AddRRI8, rl_result.low_reg, rl_src1.low_reg, mod_imm_lo);
+      NewLIR3(cu, kThumb2AdcRRI8, rl_result.high_reg, rl_src1.high_reg, mod_imm_hi);
+      break;
+    case Instruction::OR_LONG:
+    case Instruction::OR_LONG_2ADDR:
+      if ((val_lo != 0) || (rl_result.low_reg != rl_src1.low_reg)) {
+        OpRegRegImm(cu, kOpOr, rl_result.low_reg, rl_src1.low_reg, val_lo);
+      }
+      if ((val_hi != 0) || (rl_result.high_reg != rl_src1.high_reg)) {
+        OpRegRegImm(cu, kOpOr, rl_result.high_reg, rl_src1.high_reg, val_hi);
+      }
+      break;
+    case Instruction::XOR_LONG:
+    case Instruction::XOR_LONG_2ADDR:
+      OpRegRegImm(cu, kOpXor, rl_result.low_reg, rl_src1.low_reg, val_lo);
+      OpRegRegImm(cu, kOpXor, rl_result.high_reg, rl_src1.high_reg, val_hi);
+      break;
+    case Instruction::AND_LONG:
+    case Instruction::AND_LONG_2ADDR:
+      if ((val_lo != 0xffffffff) || (rl_result.low_reg != rl_src1.low_reg)) {
+        OpRegRegImm(cu, kOpAnd, rl_result.low_reg, rl_src1.low_reg, val_lo);
+      }
+      if ((val_hi != 0xffffffff) || (rl_result.high_reg != rl_src1.high_reg)) {
+        OpRegRegImm(cu, kOpAnd, rl_result.high_reg, rl_src1.high_reg, val_hi);
+      }
+      break;
+    case Instruction::SUB_LONG_2ADDR:
+    case Instruction::SUB_LONG:
+      NewLIR3(cu, kThumb2SubRRI8, rl_result.low_reg, rl_src1.low_reg, mod_imm_lo);
+      NewLIR3(cu, kThumb2SbcRRI8, rl_result.high_reg, rl_src1.high_reg, mod_imm_hi);
+      break;
+    default:
+      LOG(FATAL) << "Unexpected opcode " << opcode;
+  }
+  StoreValueWide(cu, rl_dest, rl_result);
+  return false;  // TODO: remove bool return value from all of these Gen routines.
+}
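
The carry chaining that makes the ADD/SUB cases work is the usual
one; as an illustrative identity (not CL code):

  // The adds/adc pair is 64-bit addition split across two registers:
  //   uint32_t lo2   = lo + val_lo;          // adds: sets carry flag
  //   uint32_t carry = (lo2 < lo) ? 1 : 0;   // carry out of low word
  //   uint32_t hi2   = hi + val_hi + carry;  // adc consumes the carry
  // This is why the code bails to GenArithOpLong when either half
  // fails the modified-immediate test: only a subset of the add/sub
  // immediate encodings set the carry flag.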
+
 }  // namespace art
diff --git a/src/compiler/codegen/arm/utility_arm.cc b/src/compiler/codegen/arm/utility_arm.cc
index 433111c..a670199 100644
--- a/src/compiler/codegen/arm/utility_arm.cc
+++ b/src/compiler/codegen/arm/utility_arm.cc
@@ -45,6 +45,32 @@
   return res;
 }
 
+/*
+ * Determine whether value can be encoded as a Thumb2 floating point
+ * immediate.  If not, return -1.  If so return encoded 8-bit value.
+ */
+static int EncodeImmDouble(int64_t value)
+{
+  int res;
+  int bit_a = (value & 0x8000000000000000ll) >> 63;
+  int not_bit_b = (value & 0x4000000000000000ll) >> 62;
+  int bit_b = (value & 0x2000000000000000ll) >> 61;
+  int b_smear = (value & 0x3fc0000000000000ll) >> 54;
+  int slice =  (value & 0x003f000000000000ll) >> 48;
+  uint64_t zeroes = (value & 0x0000ffffffffffffll);
+  if (zeroes != 0)
+    return -1;
+  if (bit_b) {
+    if ((not_bit_b != 0) || (b_smear != 0xff))
+      return -1;
+  } else {
+    if ((not_bit_b != 1) || (b_smear != 0x0))
+      return -1;
+  }
+  res = (bit_a << 7) | (bit_b << 6) | slice;
+  return res;
+}
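
A worked example of the check above (illustrative note):

  // value = 0x4000000000000000 (+2.0)
  //   zeroes    (bits 47..0)  = 0     -> ok
  //   bit_a     (bit 63)      = 0
  //   not_bit_b (bit 62)      = 1
  //   bit_b     (bit 61)      = 0     -> requires not_bit_b == 1
  //   b_smear   (bits 61..54) = 0x00  -> ok, b_smear must be 0
  //   slice     (bits 53..48) = 0
  //   => res = (0 << 7) | (0 << 6) | 0 = 0, i.e. vmov.f64 dX, #2.0,
  //      the constant used below to synthesize +0.0 as 2.0 - 2.0.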
+
 static LIR* LoadFPConstantValue(CompilationUnit* cu, int r_dest, int value)
 {
   DCHECK(ARM_SINGLEREG(r_dest));
@@ -126,19 +152,24 @@
    return value | ((0x8 + z_leading) << 7); /* [01000..11111]:bcdefgh */
 }
 
-bool ArmCodegen::InexpensiveConstant(int reg, int value)
+bool ArmCodegen::InexpensiveConstantInt(int32_t value)
 {
-  bool res = false;
-  if (ARM_FPREG(reg)) {
-    res = (EncodeImmSingle(value) >= 0);
-  } else {
-    if (ARM_LOWREG(reg) && (value >= 0) && (IsUint(8, value))) {
-      res = true;
-    } else {
-      res = (ModifiedImmediate(value) >= 0) || (ModifiedImmediate(~value) >= 0);
-    }
-  }
-  return res;
+  return (ModifiedImmediate(value) >= 0) || (ModifiedImmediate(~value) >= 0);
+}
+
+bool ArmCodegen::InexpensiveConstantFloat(int32_t value)
+{
+  return EncodeImmSingle(value) >= 0;
+}
+
+bool ArmCodegen::InexpensiveConstantLong(int64_t value)
+{
+  return InexpensiveConstantInt(High32Bits(value)) && InexpensiveConstantInt(Low32Bits(value));
+}
+
+bool ArmCodegen::InexpensiveConstantDouble(int64_t value)
+{
+  return EncodeImmDouble(value) >= 0;
 }
 
 /*
@@ -178,25 +209,9 @@
     res = NewLIR2(cu, kThumb2MovImm16, r_dest, value);
     return res;
   }
-  /* No shortcut - go ahead and use literal pool */
-  LIR* data_target = ScanLiteralPool(cu->literal_list, value, 0);
-  if (data_target == NULL) {
-    data_target = AddWordData(cu, &cu->literal_list, value);
-  }
-  LIR* load_pc_rel = RawLIR(cu, cu->current_dalvik_offset,
-                          kThumb2LdrPcRel12, r_dest, 0, 0, 0, 0, data_target);
-  SetMemRefType(cu, load_pc_rel, true, kLiteral);
-  load_pc_rel->alias_info = reinterpret_cast<uintptr_t>(data_target);
-  res = load_pc_rel;
-  AppendLIR(cu, load_pc_rel);
-
-  /*
-   * To save space in the constant pool, we use the ADD_RRI8 instruction to
-   * add up to 255 to an existing constant value.
-   */
-  if (data_target->operands[0] != value) {
-    OpRegImm(cu, kOpAdd, r_dest, value - data_target->operands[0]);
-  }
+  /* Do a low/high pair */
+  res = NewLIR2(cu, kThumb2MovImm16, r_dest, Low16Bits(value));
+  NewLIR2(cu, kThumb2MovImm16H, r_dest, High16Bits(value));
   return res;
 }
 
@@ -514,7 +529,7 @@
       int mod_imm = ModifiedImmediate(value);
       LIR* res;
       if (mod_imm >= 0) {
-        res = NewLIR2(cu, kThumb2CmpRI8, r_src1, mod_imm);
+        res = NewLIR2(cu, kThumb2CmpRI12, r_src1, mod_imm);
       } else {
         int r_tmp = AllocTemp(cu);
         res = LoadConstant(cu, r_tmp, value);
@@ -587,44 +602,11 @@
   }
 }
 
-/*
- * Determine whether value can be encoded as a Thumb2 floating point
- * immediate.  If not, return -1.  If so return encoded 8-bit value.
- */
-static int EncodeImmDoubleHigh(int value)
+LIR* ArmCodegen::LoadConstantWide(CompilationUnit* cu, int r_dest_lo, int r_dest_hi, int64_t value)
 {
-  int res;
-  int bit_a =  (value & 0x80000000) >> 31;
-  int not_bit_b = (value & 0x40000000) >> 30;
-  int bit_b =  (value & 0x20000000) >> 29;
-  int b_smear =  (value & 0x3fc00000) >> 22;
-  int slice =   (value & 0x003f0000) >> 16;
-  int zeroes =  (value & 0x0000ffff);
-  if (zeroes != 0)
-    return -1;
-  if (bit_b) {
-    if ((not_bit_b != 0) || (b_smear != 0xff))
-      return -1;
-  } else {
-    if ((not_bit_b != 1) || (b_smear != 0x0))
-      return -1;
-  }
-  res = (bit_a << 7) | (bit_b << 6) | slice;
-  return res;
-}
-
-static int EncodeImmDouble(int val_lo, int val_hi)
-{
-  int res = -1;
-  if (val_lo == 0)
-    res = EncodeImmDoubleHigh(val_hi);
-  return res;
-}
-
-LIR* ArmCodegen::LoadConstantValueWide(CompilationUnit* cu, int r_dest_lo, int r_dest_hi,
-                                       int val_lo, int val_hi)
-{
-  LIR* res;
+  LIR* res = NULL;
+  int32_t val_lo = Low32Bits(value);
+  int32_t val_hi = High32Bits(value);
   int target_reg = S2d(r_dest_lo, r_dest_hi);
   if (ARM_FPREG(r_dest_lo)) {
     if ((val_lo == 0) && (val_hi == 0)) {
@@ -635,26 +617,33 @@
       // +0.0 = +2.0 - +2.0
       res = NewLIR3(cu, kThumb2Vsubd, target_reg, target_reg, target_reg);
     } else {
-      int encoded_imm = EncodeImmDouble(val_lo, val_hi);
+      int encoded_imm = EncodeImmDouble(value);
       if (encoded_imm >= 0) {
         res = NewLIR2(cu, kThumb2Vmovd_IMM8, target_reg, encoded_imm);
-      } else {
-        LIR* data_target = ScanLiteralPoolWide(cu->literal_list, val_lo, val_hi);
-        if (data_target == NULL) {
-          data_target = AddWideData(cu, &cu->literal_list, val_lo, val_hi);
-        }
-        LIR* load_pc_rel =
-            RawLIR(cu, cu->current_dalvik_offset, kThumb2Vldrd,
-                   target_reg, r15pc, 0, 0, 0, data_target);
-        SetMemRefType(cu, load_pc_rel, true, kLiteral);
-        load_pc_rel->alias_info = reinterpret_cast<uintptr_t>(data_target);
-        AppendLIR(cu, load_pc_rel);
-        res = load_pc_rel;
       }
     }
   } else {
-    res = LoadConstantNoClobber(cu, r_dest_lo, val_lo);
-    LoadConstantNoClobber(cu, r_dest_hi, val_hi);
+    if ((InexpensiveConstantInt(val_lo) && (InexpensiveConstantInt(val_hi)))) {
+      res = LoadConstantNoClobber(cu, r_dest_lo, val_lo);
+      LoadConstantNoClobber(cu, r_dest_hi, val_hi);
+    }
+  }
+  if (res == NULL) {
+    // No short form - load from the literal pool.
+    LIR* data_target = ScanLiteralPoolWide(cu->literal_list, val_lo, val_hi);
+    if (data_target == NULL) {
+      data_target = AddWideData(cu, &cu->literal_list, val_lo, val_hi);
+    }
+    if (ARM_FPREG(r_dest_lo)) {
+      res = RawLIR(cu, cu->current_dalvik_offset, kThumb2Vldrd,
+                   target_reg, r15pc, 0, 0, 0, data_target);
+    } else {
+      res = RawLIR(cu, cu->current_dalvik_offset, kThumb2LdrdPcRel8,
+                   r_dest_lo, r_dest_hi, r15pc, 0, 0, data_target);
+    }
+    SetMemRefType(cu, res, true, kLiteral);
+    res->alias_info = reinterpret_cast<uintptr_t>(data_target);
+    AppendLIR(cu, res);
   }
   return res;
 }
@@ -732,7 +721,7 @@
                                   int scale, OpSize size)
 {
   bool all_low_regs = ARM_LOWREG(rBase) && ARM_LOWREG(r_index) && ARM_LOWREG(r_src);
-  LIR* store;
+  LIR* store = NULL;
   ArmOpcode opcode = kThumbBkpt;
   bool thumb_form = (all_low_regs && (scale == 0));
   int reg_ptr;
@@ -798,14 +787,14 @@
                                   int r_dest_hi, OpSize size, int s_reg)
 {
   Codegen* cg = cu->cg.get();
-  LIR* res;
-  LIR* load;
+  LIR* load = NULL;
   ArmOpcode opcode = kThumbBkpt;
   bool short_form = false;
   bool thumb2Form = (displacement < 4092 && displacement >= 0);
   bool all_low_regs = (ARM_LOWREG(rBase) && ARM_LOWREG(r_dest));
   int encoded_disp = displacement;
   bool is64bit = false;
+  bool already_generated = false;
   switch (size) {
     case kDouble:
     case kLong:
@@ -822,11 +811,15 @@
         }
         break;
       } else {
-        res = LoadBaseDispBody(cu, rBase, displacement, r_dest,
-                               -1, kWord, s_reg);
-        LoadBaseDispBody(cu, rBase, displacement + 4, r_dest_hi,
-                         -1, kWord, INVALID_SREG);
-        return res;
+        if (displacement <= 1020) {
+          load = NewLIR4(cu, kThumb2LdrdI8, r_dest, r_dest_hi, rBase, displacement >> 2);
+        } else {
+          load = LoadBaseDispBody(cu, rBase, displacement, r_dest,
+                                 -1, kWord, s_reg);
+          LoadBaseDispBody(cu, rBase, displacement + 4, r_dest_hi,
+                           -1, kWord, INVALID_SREG);
+        }
+        already_generated = true;
       }
     case kSingle:
     case kWord:
@@ -894,13 +887,15 @@
       LOG(FATAL) << "Bad size: " << size;
   }
 
-  if (short_form) {
-    load = res = NewLIR3(cu, opcode, r_dest, rBase, encoded_disp);
-  } else {
-    int reg_offset = AllocTemp(cu);
-    res = cg->LoadConstant(cu, reg_offset, encoded_disp);
-    load = cg->LoadBaseIndexed(cu, rBase, reg_offset, r_dest, 0, size);
-    FreeTemp(cu, reg_offset);
+  if (!already_generated) {
+    if (short_form) {
+      load = NewLIR3(cu, opcode, r_dest, rBase, encoded_disp);
+    } else {
+      int reg_offset = AllocTemp(cu);
+      cg->LoadConstant(cu, reg_offset, encoded_disp);
+      load = cg->LoadBaseIndexed(cu, rBase, reg_offset, r_dest, 0, size);
+      FreeTemp(cu, reg_offset);
+    }
   }
 
   // TODO: in future may need to differentiate Dalvik accesses w/ spills
@@ -926,30 +921,36 @@
 LIR* ArmCodegen::StoreBaseDispBody(CompilationUnit* cu, int rBase, int displacement,
                                    int r_src, int r_src_hi, OpSize size) {
   Codegen* cg = cu->cg.get();
-  LIR* res, *store;
+  LIR* store = NULL;
   ArmOpcode opcode = kThumbBkpt;
   bool short_form = false;
   bool thumb2Form = (displacement < 4092 && displacement >= 0);
   bool all_low_regs = (ARM_LOWREG(rBase) && ARM_LOWREG(r_src));
   int encoded_disp = displacement;
   bool is64bit = false;
+  bool already_generated = false;
   switch (size) {
     case kLong:
     case kDouble:
       is64bit = true;
       if (!ARM_FPREG(r_src)) {
-        res = StoreBaseDispBody(cu, rBase, displacement, r_src, -1, kWord);
-        StoreBaseDispBody(cu, rBase, displacement + 4, r_src_hi, -1, kWord);
-        return res;
-      }
-      if (ARM_SINGLEREG(r_src)) {
-        DCHECK(ARM_FPREG(r_src_hi));
-        r_src = cg->S2d(r_src, r_src_hi);
-      }
-      opcode = kThumb2Vstrd;
-      if (displacement <= 1020) {
-        short_form = true;
-        encoded_disp >>= 2;
+        if (displacement <= 1020) {
+          store = NewLIR4(cu, kThumb2StrdI8, r_src, r_src_hi, rBase, displacement >> 2);
+        } else {
+          store = StoreBaseDispBody(cu, rBase, displacement, r_src, -1, kWord);
+          StoreBaseDispBody(cu, rBase, displacement + 4, r_src_hi, -1, kWord);
+        }
+        already_generated = true;
+      } else {
+        if (ARM_SINGLEREG(r_src)) {
+          DCHECK(ARM_FPREG(r_src_hi));
+          r_src = cg->S2d(r_src, r_src_hi);
+        }
+        opcode = kThumb2Vstrd;
+        if (displacement <= 1020) {
+          short_form = true;
+          encoded_disp >>= 2;
+        }
       }
       break;
     case kSingle:
@@ -998,20 +999,22 @@
     default:
       LOG(FATAL) << "Bad size: " << size;
   }
-  if (short_form) {
-    store = res = NewLIR3(cu, opcode, r_src, rBase, encoded_disp);
-  } else {
-    int r_scratch = AllocTemp(cu);
-    res = cg->LoadConstant(cu, r_scratch, encoded_disp);
-    store = cg->StoreBaseIndexed(cu, rBase, r_scratch, r_src, 0, size);
-    FreeTemp(cu, r_scratch);
+  if (!already_generated) {
+    if (short_form) {
+      store = NewLIR3(cu, opcode, r_src, rBase, encoded_disp);
+    } else {
+      int r_scratch = AllocTemp(cu);
+      cg->LoadConstant(cu, r_scratch, encoded_disp);
+      store = cg->StoreBaseIndexed(cu, rBase, r_scratch, r_src, 0, size);
+      FreeTemp(cu, r_scratch);
+    }
   }
 
   // TODO: In future, may need to differentiate Dalvik & spill accesses
   if (rBase == rARM_SP) {
     AnnotateDalvikRegAccess(cu, store, displacement >> 2, false /* is_load */, is64bit);
   }
-  return res;
+  return store;
 }
 
 LIR* ArmCodegen::StoreBaseDisp(CompilationUnit* cu, int rBase, int displacement, int r_src,
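
One encoding detail worth noting for the new paired memory ops
(an illustrative note, not text from the CL): kThumb2LdrdI8 and
kThumb2StrdI8 carry the displacement as an 8-bit word count, which
is why the fast paths above are gated on a 1020-byte limit:

  // operands[3] holds displacement >> 2, an 8-bit field (0..255
  // words, so 0..1020 bytes at word granularity); hence:
  //   if (displacement <= 1020) {
  //     store = NewLIR4(cu, kThumb2StrdI8, r_src, r_src_hi, rBase,
  //                     displacement >> 2);
  //   } else {
  //     ...  // fall back to two word-sized accesses
  //   }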