Implemented peephole optimizations including null-check elimination, redundant ld/st elimination, ad-hoc register renaming and store sinking.
diff --git a/vm/compiler/codegen/Optimizer.h b/vm/compiler/codegen/Optimizer.h
new file mode 100644
index 0000000..1a891b1
--- /dev/null
+++ b/vm/compiler/codegen/Optimizer.h
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dalvik.h"
+#include "compiler/CompilerInternals.h"
+
+#ifndef _DALVIK_VM_COMPILER_OPTIMIZATION_H
+#define _DALVIK_VM_COMPILER_OPTIMIZATION_H
+
+/* Forward declarations */
+struct CompilationUnit;
+struct LIR;
+
+/*
+ * Data structure tracking the mapping between a Dalvik register (pair) and a
+ * native register (pair). The idea is to reuse the previously loaded value
+ * if possible, otherwise to keep the value in a native register as long as
+ * possible.
+ */
+typedef struct RegisterScoreboard {
+    BitVector *nullCheckedRegs; // Dalvik registers already null-checked
+    int liveDalvikReg;          // Dalvik register currently cached natively
+    int nativeReg;              // Mapped native register (low word if wide)
+    int nativeRegHi;            // Mapped native register for the high word
+    bool isWide;                // Whether a pair of registers is live
+} RegisterScoreboard;
+
+void dvmCompilerApplyLocalOptimizations(struct CompilationUnit *cUnit,
+                                        struct LIR *head,
+                                        struct LIR *tail);
+
+void dvmCompilerApplyGlobalOptimizations(struct CompilationUnit *cUnit);
+
+#endif /* _DALVIK_VM_COMPILER_OPTIMIZATION_H */
diff --git a/vm/compiler/codegen/armv5te/ArchUtility.c b/vm/compiler/codegen/armv5te/ArchUtility.c
index a64b54f..7d7f119 100644
--- a/vm/compiler/codegen/armv5te/ArchUtility.c
+++ b/vm/compiler/codegen/armv5te/ArchUtility.c
@@ -193,11 +193,15 @@
             LOGD("L%#06x:\n", dest);
             break;
         default:
+            if (lir->isNop) {
+                break;
+            }
             buildInsnString(EncodingMap[lir->opCode].name, lir, opName,
                             baseAddr, 256);
             buildInsnString(EncodingMap[lir->opCode].fmt, lir, buf, baseAddr,
                             256);
-            LOGD("%p (%04x): %-8s%s\n", baseAddr + offset, offset, opName, buf);
+            LOGD("%p (%04x): %-8s%s\n",
+                 baseAddr + offset, offset, opName, buf);
             break;
     }
 }
diff --git a/vm/compiler/codegen/armv5te/Armv5teLIR.h b/vm/compiler/codegen/armv5te/Armv5teLIR.h
index 3341e49..dcf501b 100644
--- a/vm/compiler/codegen/armv5te/Armv5teLIR.h
+++ b/vm/compiler/codegen/armv5te/Armv5teLIR.h
@@ -145,7 +145,18 @@
     ARMV5TE_LAST,
 } Armv5teOpCode;
 
-/* Struct used to define the snippet posotions for each Thumb opcode */
+/* Bit flags (may be OR-ed together) describing each native opcode */
+typedef enum Armv5teOpFeatureFlags {
+    IS_BRANCH =           1 << 1,   // set on b/bl/blx/bx plus bkpt and swi
+    CLOBBER_DEST =        1 << 2,   // set on add/mov/ldr etc., not cmp/str
+    CLOBBER_SRC1 =        1 << 3,   // only on ldmia/stmia; TODO confirm intent
+    NO_OPERAND =          1 << 4,   // checked by the assert in newLIR0
+    IS_UNARY_OP =         1 << 5,   // checked by the assert in newLIR1
+    IS_BINARY_OP =        1 << 6,   // checked by the assert in newLIR2
+    IS_TERTIARY_OP =      1 << 7,   // checked by the assert in newLIR3
+} Armv5teOpFeatureFlags;
+
+/* Struct used to define the snippet positions for each Thumb opcode */
 typedef struct Armv5teEncodingMap {
     short skeleton;
     struct {
@@ -153,7 +164,7 @@
         int start;
     } fieldLoc[3];
     Armv5teOpCode opCode;
-    int operands;
+    int flags;
     char *name;
     char* fmt;
 } Armv5teEncodingMap;
@@ -168,7 +179,9 @@
 typedef struct Armv5teLIR {
     LIR generic;
     Armv5teOpCode opCode;
-    int operands[3]; /* dest, src1, src2 */
+    int operands[3];    // [0..2] = [dest, src1, src2]
+    bool isNop;         // LIR is optimized away
+    int age;            // default is 0, set lazily by the optimizer
 } Armv5teLIR;
 
 /* Utility macros to traverse the LIR/Armv5teLIR list */
diff --git a/vm/compiler/codegen/armv5te/Assemble.c b/vm/compiler/codegen/armv5te/Assemble.c
index f874f48..a59d27f 100644
--- a/vm/compiler/codegen/armv5te/Assemble.c
+++ b/vm/compiler/codegen/armv5te/Assemble.c
@@ -66,151 +66,225 @@
 /* NOTE: must be kept in sync with enum Armv5teOpcode from Armv5teLIR.h */
 Armv5teEncodingMap EncodingMap[ARMV5TE_LAST] = {
     ENCODING_MAP(ARMV5TE_16BIT_DATA,    0x0000, 15, 0, -1, -1, -1, -1,
-                 1, "data", "0x!0h(!0d)"),
+                 IS_UNARY_OP,
+                 "data", "0x!0h(!0d)"),
     ENCODING_MAP(ARMV5TE_ADC,           0x4140, 2, 0, 5, 3, -1, -1,
-                 2, "adc", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "adc", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_ADD_RRI3,      0x1c00, 2, 0, 5, 3, 8, 6,
-                 3, "add", "r!0d, r!1d, #!2d"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, r!1d, #!2d"),
     ENCODING_MAP(ARMV5TE_ADD_RI8,       0x3000, 10, 8, 7, 0, -1, -1,
-                 2, "add", "r!0d, r!0d, #!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, r!0d, #!1d"),
     ENCODING_MAP(ARMV5TE_ADD_RRR,       0x1800, 2, 0, 5, 3, 8, 6,
-                 3, "add", "r!0d, r!1d, r!2d"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, r!1d, r!2d"),
     ENCODING_MAP(ARMV5TE_ADD_RR_LH,     0x4440, 2, 0, 5, 3, -1, -1,
-                 2, "add", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "add",
+                 "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_ADD_RR_HL,     0x4480, 2, 0, 5, 3, -1, -1,
-                 2, "add", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_ADD_RR_HH,     0x44c0, 2, 0, 5, 3, -1, -1,
-                 2, "add", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_ADD_PC_REL,    0xa000, 10, 8, 7, 0, -1, -1,
-                 2, "add", "r!0d, pc, #!1E"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, pc, #!1E"),
     ENCODING_MAP(ARMV5TE_ADD_SP_REL,    0xa800, 10, 8, 7, 0, -1, -1,
-                 2, "add", "r!0d, sp, #!1E"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "add", "r!0d, sp, #!1E"),
     ENCODING_MAP(ARMV5TE_ADD_SPI7,      0xb000, 6, 0, -1, -1, -1, -1,
-                 1, "add", "sp, #!0d*4"),
+                 IS_UNARY_OP | CLOBBER_DEST,
+                 "add", "sp, #!0d*4"),
     ENCODING_MAP(ARMV5TE_AND_RR,        0x4000, 2, 0, 5, 3, -1, -1,
-                 2, "and", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "and", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_ASR,           0x1000, 2, 0, 5, 3, 10, 6,
-                 3, "asr", "r!0d, r!1d, #!2d"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "asr", "r!0d, r!1d, #!2d"),
     ENCODING_MAP(ARMV5TE_ASRV,          0x4100, 2, 0, 5, 3, -1, -1,
-                 2, "asr", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "asr", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_B_COND,        0xd000, 7, 0, 11, 8, -1, -1,
-                 2, "!1c", "!0t"),
+                 IS_BINARY_OP | IS_BRANCH,
+                 "!1c", "!0t"),
     ENCODING_MAP(ARMV5TE_B_UNCOND,      0xe000, 10, 0, -1, -1, -1, -1,
-                 0, "b", "!0t"),
+                 NO_OPERAND | IS_BRANCH,
+                 "b", "!0t"),
     ENCODING_MAP(ARMV5TE_BIC,           0x4380, 2, 0, 5, 3, -1, -1,
-                 2, "bic", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "bic", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_BKPT,          0xbe00, 7, 0, -1, -1, -1, -1,
-                 1, "bkpt", "!0d"),
+                 IS_UNARY_OP | IS_BRANCH,
+                 "bkpt", "!0d"),
     ENCODING_MAP(ARMV5TE_BLX_1,         0xf000, 10, 0, -1, -1, -1, -1,
-                 2, "blx_1", "!0u"),
+                 IS_BINARY_OP | IS_BRANCH,
+                 "blx_1", "!0u"),
     ENCODING_MAP(ARMV5TE_BLX_2,         0xe800, 10, 0, -1, -1, -1, -1,
-                 2, "blx_2", "!0v"),
+                 IS_BINARY_OP | IS_BRANCH,
+                 "blx_2", "!0v"),
     ENCODING_MAP(ARMV5TE_BL_1,          0xf000, 10, 0, -1, -1, -1, -1,
-                 1, "bl_1", "!0u"),
+                 IS_UNARY_OP | IS_BRANCH,
+                 "bl_1", "!0u"),
     ENCODING_MAP(ARMV5TE_BL_2,          0xf800, 10, 0, -1, -1, -1, -1,
-                 1, "bl_2", "!0v"),
+                 IS_UNARY_OP | IS_BRANCH,
+                 "bl_2", "!0v"),
     ENCODING_MAP(ARMV5TE_BLX_R,         0x4780, 6, 3, -1, -1, -1, -1,
-                 1, "blx", "r!0d"),
+                 IS_UNARY_OP | IS_BRANCH,
+                 "blx", "r!0d"),
     ENCODING_MAP(ARMV5TE_BX,            0x4700, 6, 3, -1, -1, -1, -1,
-                 1, "bx", "r!0d"),
+                 IS_UNARY_OP | IS_BRANCH,
+                 "bx", "r!0d"),
     ENCODING_MAP(ARMV5TE_CMN,           0x42c0, 2, 0, 5, 3, -1, -1,
-                 2, "cmn", "r!0d, r!1d"),
+                 IS_BINARY_OP,
+                 "cmn", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_CMP_RI8,       0x2800, 10, 8, 7, 0, -1, -1,
-                 2, "cmp", "r!0d, #!1d"),
+                 IS_BINARY_OP,
+                 "cmp", "r!0d, #!1d"),
     ENCODING_MAP(ARMV5TE_CMP_RR,        0x4280, 2, 0, 5, 3, -1, -1,
-                 2, "cmp", "r!0d, r!1d"),
+                 IS_BINARY_OP,
+                 "cmp", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_CMP_LH,        0x4540, 2, 0, 5, 3, -1, -1,
-                 2, "cmp", "r!0d, r!1D"),
+                 IS_BINARY_OP,
+                 "cmp", "r!0d, r!1D"),
     ENCODING_MAP(ARMV5TE_CMP_HL,        0x4580, 2, 0, 5, 3, -1, -1,
-                 2, "cmp", "r!0D, r!1d"),
+                 IS_BINARY_OP,
+                 "cmp", "r!0D, r!1d"),
     ENCODING_MAP(ARMV5TE_CMP_HH,        0x45c0, 2, 0, 5, 3, -1, -1,
-                 2, "cmp", "r!0D, r!1D"),
+                 IS_BINARY_OP,
+                 "cmp", "r!0D, r!1D"),
     ENCODING_MAP(ARMV5TE_EOR,           0x4040, 2, 0, 5, 3, -1, -1,
-                 2, "eor", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "eor", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_LDMIA,         0xc800, 10, 8, 7, 0, -1, -1,
-                 2, "ldmia", "r!0d!!, <!1R>"),
+                 IS_BINARY_OP | CLOBBER_DEST | CLOBBER_SRC1,
+                 "ldmia", "r!0d!!, <!1R>"),
     ENCODING_MAP(ARMV5TE_LDR_RRI5,      0x6800, 2, 0, 5, 3, 10, 6,
-                 3, "ldr", "r!0d, [r!1d, #!2E]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d, [r!1d, #!2E]"),
     ENCODING_MAP(ARMV5TE_LDR_RRR,       0x5800, 2, 0, 5, 3, 8, 6,
-                 3, "ldr", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_LDR_PC_REL,    0x4800, 10, 8, 7, 0, -1, -1,
-                 2, "ldr", "r!0d, [pc, #!1E]"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d, [pc, #!1E]"),
     ENCODING_MAP(ARMV5TE_LDR_SP_REL,    0x9800, 10, 8, 7, 0, -1, -1,
-                 2, "ldr", "r!0d, [sp, #!1E]"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "ldr", "r!0d, [sp, #!1E]"),
     ENCODING_MAP(ARMV5TE_LDRB_RRI5,     0x7800, 2, 0, 5, 3, 10, 6,
-                 3, "ldrb", "r!0d, [r!1d, #2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldrb", "r!0d, [r!1d, #!2d]"),
     ENCODING_MAP(ARMV5TE_LDRB_RRR,      0x5c00, 2, 0, 5, 3, 8, 6,
-                 3, "ldrb", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldrb", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_LDRH_RRI5,     0x8800, 2, 0, 5, 3, 10, 6,
-                 3, "ldrh", "r!0d, [r!1d, #!2F]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldrh", "r!0d, [r!1d, #!2F]"),
     ENCODING_MAP(ARMV5TE_LDRH_RRR,      0x5a00, 2, 0, 5, 3, 8, 6,
-                 3, "ldrh", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldrh", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_LDRSB_RRR,     0x5600, 2, 0, 5, 3, 8, 6,
-                 3, "ldrsb", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldrsb", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_LDRSH_RRR,     0x5e00, 2, 0, 5, 3, 8, 6,
-                 3, "ldrsh", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "ldrsh", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_LSL,           0x0000, 2, 0, 5, 3, 10, 6,
-                 3, "lsl", "r!0d, r!1d, #!2d"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "lsl", "r!0d, r!1d, #!2d"),
     ENCODING_MAP(ARMV5TE_LSLV,          0x4080, 2, 0, 5, 3, -1, -1,
-                 2, "lsl", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "lsl", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_LSR,           0x0800, 2, 0, 5, 3, 10, 6,
-                 3, "lsr", "r!0d, r!1d, #!2d"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "lsr", "r!0d, r!1d, #!2d"),
     ENCODING_MAP(ARMV5TE_LSRV,          0x40c0, 2, 0, 5, 3, -1, -1,
-                 2, "lsr", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "lsr", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_MOV_IMM,       0x2000, 10, 8, 7, 0, -1, -1,
-                 2, "mov", "r!0d, #!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0d, #!1d"),
     ENCODING_MAP(ARMV5TE_MOV_RR,        0x1c00, 2, 0, 5, 3, -1, -1,
-                 2, "mov", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_MOV_RR_LH,     0x4640, 2, 0, 5, 3, -1, -1,
-                 2, "mov", "r!0D, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0D, r!1d"),
     ENCODING_MAP(ARMV5TE_MOV_RR_HL,     0x4680, 2, 0, 5, 3, -1, -1,
-                 2, "mov", "r!0d, r!1D"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0d, r!1D"),
     ENCODING_MAP(ARMV5TE_MOV_RR_HH,     0x46c0, 2, 0, 5, 3, -1, -1,
-                 2, "mov", "r!0D, r!1D"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mov", "r!0D, r!1D"),
     ENCODING_MAP(ARMV5TE_MUL,           0x4340, 2, 0, 5, 3, -1, -1,
-                 2, "mul", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mul", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_MVN,           0x43c0, 2, 0, 5, 3, -1, -1,
-                 2, "mvn", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "mvn", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_NEG,           0x4240, 2, 0, 5, 3, -1, -1,
-                 2, "neg", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "neg", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_ORR,           0x4300, 2, 0, 5, 3, -1, -1,
-                 2, "orr", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "orr", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_POP,           0xbc00, 8, 0, -1, -1, -1, -1,
-                 1, "pop", "<!0R>"),
+                 IS_UNARY_OP,
+                 "pop", "<!0R>"),
     ENCODING_MAP(ARMV5TE_PUSH,          0xb400, 8, 0, -1, -1, -1, -1,
-                 1, "push", "<!0R>"),
+                 IS_UNARY_OP,
+                 "push", "<!0R>"),
     ENCODING_MAP(ARMV5TE_ROR,           0x41c0, 2, 0, 5, 3, -1, -1,
-                 2, "ror", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "ror", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_SBC,           0x4180, 2, 0, 5, 3, -1, -1,
-                 2, "sbc", "r!0d, r!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "sbc", "r!0d, r!1d"),
     ENCODING_MAP(ARMV5TE_STMIA,         0xc000, 10, 8, 7, 0, -1, -1,
-                 2, "stmia", "r!0d!!, <!1R>"),
+                 IS_BINARY_OP | CLOBBER_SRC1,
+                 "stmia", "r!0d!!, <!1R>"),
     ENCODING_MAP(ARMV5TE_STR_RRI5,      0x6000, 2, 0, 5, 3, 10, 6,
-                 3, "str", "r!0d, [r!1d, #!2E]"),
+                 IS_TERTIARY_OP,
+                 "str", "r!0d, [r!1d, #!2E]"),
     ENCODING_MAP(ARMV5TE_STR_RRR,       0x5000, 2, 0, 5, 3, 8, 6,
-                 3, "str", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP,
+                 "str", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_STR_SP_REL,    0x9000, 10, 8, 7, 0, -1, -1,
-                 2, "str", "r!0d, [sp, #!1E]"),
+                 IS_BINARY_OP,
+                 "str", "r!0d, [sp, #!1E]"),
     ENCODING_MAP(ARMV5TE_STRB_RRI5,     0x7000, 2, 0, 5, 3, 10, 6,
-                 3, "strb", "r!0d, [r!1d, #!2d]"),
+                 IS_TERTIARY_OP,
+                 "strb", "r!0d, [r!1d, #!2d]"),
     ENCODING_MAP(ARMV5TE_STRB_RRR,      0x5400, 2, 0, 5, 3, 8, 6,
-                 3, "strb", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP,
+                 "strb", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_STRH_RRI5,     0x8000, 2, 0, 5, 3, 10, 6,
-                 3, "strh", "r!0d, [r!1d, #!2F]"),
+                 IS_TERTIARY_OP,
+                 "strh", "r!0d, [r!1d, #!2F]"),
     ENCODING_MAP(ARMV5TE_STRH_RRR,      0x5200, 2, 0, 5, 3, 8, 6,
-                 3, "strh", "r!0d, [r!1d, r!2d]"),
+                 IS_TERTIARY_OP,
+                 "strh", "r!0d, [r!1d, r!2d]"),
     ENCODING_MAP(ARMV5TE_SUB_RRI3,      0x1e00, 2, 0, 5, 3, 8, 6,
-                 3, "sub", "r!0d, r!1d, #!2d]"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "sub", "r!0d, r!1d, #!2d"),
     ENCODING_MAP(ARMV5TE_SUB_RI8,       0x3800, 10, 8, 7, 0, -1, -1,
-                 2, "sub", "r!0d, #!1d"),
+                 IS_BINARY_OP | CLOBBER_DEST,
+                 "sub", "r!0d, #!1d"),
     ENCODING_MAP(ARMV5TE_SUB_RRR,       0x1a00, 2, 0, 5, 3, 8, 6,
-                 3, "sub", "r!0d, r!1d, r!2d"),
+                 IS_TERTIARY_OP | CLOBBER_DEST,
+                 "sub", "r!0d, r!1d, r!2d"),
     ENCODING_MAP(ARMV5TE_SUB_SPI7,      0xb080, 6, 0, -1, -1, -1, -1,
-                 1, "sub", "sp, #!0d"),
+                 IS_UNARY_OP | CLOBBER_DEST,
+                 "sub", "sp, #!0d*4"),
     ENCODING_MAP(ARMV5TE_SWI,           0xdf00, 7, 0, -1, -1, -1, -1,
-                 1, "swi", "!0d"),
+                 IS_UNARY_OP | IS_BRANCH,
+                 "swi", "!0d"),
     ENCODING_MAP(ARMV5TE_TST,           0x4200, 2, 0, 5, 3, -1, -1,
-                 1, "tst", "r!0d, r!1d"),
+                 IS_BINARY_OP,
+                 "tst", "r!0d, r!1d"),
 };
 
 #define PADDING_MOV_R0_R0               0x1C00
@@ -253,6 +327,10 @@
             continue;
         }
 
+        if (lir->isNop) {
+            continue;
+        }
+
         if (lir->opCode == ARMV5TE_LDR_PC_REL ||
             lir->opCode == ARMV5TE_ADD_PC_REL) {
             Armv5teLIR *lirTarget = (Armv5teLIR *) lir->generic.target;
@@ -373,7 +451,7 @@
          armLIR;
          armLIR = NEXT_LIR(armLIR)) {
         armLIR->generic.offset = offset;
-        if (armLIR->opCode >= 0) {
+        if (armLIR->opCode >= 0 && !armLIR->isNop) {
             offset += 2;
         } else if (armLIR->opCode == ARMV5TE_PSEUDO_ALIGN4) {
             if (offset & 0x2) {
diff --git a/vm/compiler/codegen/armv5te/Codegen.c b/vm/compiler/codegen/armv5te/Codegen.c
index 448a539..86faa54 100644
--- a/vm/compiler/codegen/armv5te/Codegen.c
+++ b/vm/compiler/codegen/armv5te/Codegen.c
@@ -29,6 +29,12 @@
 /* Track exercised opcodes */
 static int opcodeCoverage[256];
 
+/* non-existent register */
+#define vNone   (-1)
+
+/* get the next register in r0..r3 in a round-robin fashion (wraps r3->r0) */
+#define NEXT_REG(reg) (((reg) + 1) & 3)
+
 /*****************************************************************************/
 
 /*
@@ -38,7 +44,7 @@
 static Armv5teLIR *newLIR0(CompilationUnit *cUnit, Armv5teOpCode opCode)
 {
     Armv5teLIR *insn = dvmCompilerNew(sizeof(Armv5teLIR), true);
-    assert(isPseudoOpCode(opCode) || EncodingMap[opCode].operands == 0);
+    assert(isPseudoOpCode(opCode) || (EncodingMap[opCode].flags & NO_OPERAND));
     insn->opCode = opCode;
     dvmCompilerAppendLIR(cUnit, (LIR *) insn);
     return insn;
@@ -48,7 +54,7 @@
                            int dest)
 {
     Armv5teLIR *insn = dvmCompilerNew(sizeof(Armv5teLIR), true);
-    assert(isPseudoOpCode(opCode) || EncodingMap[opCode].operands == 1);
+    assert(isPseudoOpCode(opCode) || (EncodingMap[opCode].flags & IS_UNARY_OP));
     insn->opCode = opCode;
     insn->operands[0] = dest;
     dvmCompilerAppendLIR(cUnit, (LIR *) insn);
@@ -59,7 +65,8 @@
                            int dest, int src1)
 {
     Armv5teLIR *insn = dvmCompilerNew(sizeof(Armv5teLIR), true);
-    assert(isPseudoOpCode(opCode) || EncodingMap[opCode].operands == 2);
+    assert(isPseudoOpCode(opCode) ||
+           (EncodingMap[opCode].flags & IS_BINARY_OP));
     insn->opCode = opCode;
     insn->operands[0] = dest;
     insn->operands[1] = src1;
@@ -71,7 +78,8 @@
                            int dest, int src1, int src2)
 {
     Armv5teLIR *insn = dvmCompilerNew(sizeof(Armv5teLIR), true);
-    assert(isPseudoOpCode(opCode) || EncodingMap[opCode].operands == 3);
+    assert(isPseudoOpCode(opCode) ||
+           (EncodingMap[opCode].flags & IS_TERTIARY_OP));
     insn->opCode = opCode;
     insn->operands[0] = dest;
     insn->operands[1] = src1;
@@ -84,7 +92,7 @@
                             int srcdest, int src2)
 {
     assert(!isPseudoOpCode(opCode));
-    if (EncodingMap[opCode].operands==2)
+    if (EncodingMap[opCode].flags & IS_BINARY_OP)
         return newLIR2(cUnit, opCode, srcdest, src2);
     else
         return newLIR3(cUnit, opCode, srcdest, srcdest, src2);
@@ -93,6 +101,80 @@
 /*****************************************************************************/
 
 /*
+ * The following are utility routines to help maintain the RegisterScoreboard
+ * state to facilitate register renaming.
+ */
+
+/* Forget every null-check and register-caching fact recorded so far */
+static inline void resetRegisterScoreboard(CompilationUnit *cUnit)
+{
+    RegisterScoreboard *sb = &cUnit->registerScoreboard;
+
+    /* No Dalvik register is considered cached in a native register */
+    sb->liveDalvikReg = sb->nativeReg = sb->nativeRegHi = vNone;
+    /* Note: isWide is not reset here */
+    dvmClearAllBits(sb->nullCheckedRegs);
+}
+
+/* Forget that Dalvik register vReg has been null-checked (clears its bit) */
+static inline void killNullCheckedRegister(CompilationUnit *cUnit, int vReg)
+{
+    dvmClearBit(cUnit->registerScoreboard.nullCheckedRegs, vReg);
+}
+
+/* Record that the wide Dalvik value at vReg now lives in mRegLo/mRegHi */
+static inline void updateLiveRegisterPair(CompilationUnit *cUnit,
+                                          int vReg, int mRegLo, int mRegHi)
+{
+    cUnit->registerScoreboard.isWide = true;
+    cUnit->registerScoreboard.nativeRegHi = mRegHi;
+    cUnit->registerScoreboard.nativeReg = mRegLo;
+    cUnit->registerScoreboard.liveDalvikReg = vReg;
+}
+
+/* Record that Dalvik register vReg now lives in mReg; nativeRegHi is kept */
+static inline void updateLiveRegister(CompilationUnit *cUnit,
+                                      int vReg, int mReg)
+{
+    cUnit->registerScoreboard.isWide = false;
+    cUnit->registerScoreboard.nativeReg = mReg;
+    cUnit->registerScoreboard.liveDalvikReg = vReg;
+}
+
+/*
+ * Pick the native register to hold Dalvik register vSrc (or the low word of
+ * the pair starting at vSrc when isWide), based on the scoreboard state.
+ */
+static inline int selectFirstRegister(CompilationUnit *cUnit, int vSrc,
+                                      bool isWide)
+{
+    const RegisterScoreboard *sb = &cUnit->registerScoreboard;
+    const int liveReg = sb->liveDalvikReg;
+    const int lastNative = sb->nativeReg;
+
+    /* Nothing is cached yet: start from r0 */
+    if (liveReg == vNone) {
+        return r0;
+    }
+
+    if (liveReg != vSrc) {
+        /* Another value is cached: move on round-robin within r0..r3 */
+        if (isWide) {
+            return (lastNative + 2) & 2;    /* first reg of the other pair */
+        }
+        return (lastNative + 1) & 3;
+    }
+
+    /* vSrc is the cached value: suggest the register already holding it */
+    if (isWide == true) {
+        /* Wide request: yields either r0 or r2 */
+        return (lastNative + 1) & 2;
+    }
+    return lastNative;
+}
+/*****************************************************************************/
+
+/*
  * The following are building blocks to insert constants into the pool or
  * instruction streams.
  */
@@ -279,7 +361,7 @@
             loadConstant(cUnit, rDestLo, vSrc*4);
             newLIR3(cUnit, ARMV5TE_ADD_RRR, rDestLo, rFP, rDestLo);
         }
-        assert(rDestLo != rDestHi);
+        assert(rDestLo < rDestHi);
         newLIR2(cUnit, ARMV5TE_LDMIA, rDestLo, (1<<rDestLo) | (1<<(rDestHi)));
     }
 }
@@ -291,6 +373,10 @@
 static void storeValuePair(CompilationUnit *cUnit, int rSrcLo, int rSrcHi,
                            int vDest, int rScratch)
 {
+    killNullCheckedRegister(cUnit, vDest);
+    killNullCheckedRegister(cUnit, vDest+1);
+    updateLiveRegisterPair(cUnit, vDest, rSrcLo, rSrcHi);
+
     /* Use reg + imm5*4 to store the values if possible */
     if (vDest <= 30) {
         newLIR3(cUnit, ARMV5TE_STR_RRI5, rSrcLo, rFP, vDest);
@@ -305,7 +391,7 @@
             loadConstant(cUnit, rScratch, vDest*4);
             newLIR3(cUnit, ARMV5TE_ADD_RRR, rScratch, rFP, rScratch);
         }
-        assert(rSrcLo != rSrcHi);
+        assert(rSrcLo < rSrcHi);
         newLIR2(cUnit, ARMV5TE_STMIA, rScratch, (1<<rSrcLo) | (1 << (rSrcHi)));
     }
 }
@@ -342,6 +428,9 @@
 static void storeValue(CompilationUnit *cUnit, int rSrc, int vDest,
                        int rScratch)
 {
+    killNullCheckedRegister(cUnit, vDest);
+    updateLiveRegister(cUnit, vDest, rSrc);
+
     /* Use reg + imm5*4 to store the value if possible */
     if (vDest <= 31) {
         newLIR3(cUnit, ARMV5TE_STR_RRI5, rSrc, rFP, vDest);
@@ -351,41 +440,32 @@
     }
 }
 
-/* Calculate the address of rFP+vSrc*4 */
-static void calculateValueAddress(CompilationUnit *cUnit, int vSrc, int rDest)
-{
-    /* Use add rd, rs, imm_3 */
-    if (vSrc <= 1) {
-        newLIR3(cUnit, ARMV5TE_ADD_RRI3, rDest, rFP, vSrc*4);
-    } else if (vSrc <= 64) {
-        /* Use add rd, imm_8 */
-        /* Sneak in 4 above rFP to cover one more register offset (ie v64) */
-        newLIR3(cUnit, ARMV5TE_ADD_RRI3, rDest, rFP, 4);
-        newLIR2(cUnit, ARMV5TE_ADD_RI8, rDest, (vSrc-1)*4);
-    } else {
-        /* Load offset from the constant pool */
-        loadConstant(cUnit, rDest, vSrc*4);
-        newLIR3(cUnit, ARMV5TE_ADD_RRR, rDest, rFP, rDest);
-    }
-}
-
 /*
  * Perform a binary operation on 64-bit operands and leave the results in the
  * r0/r1 pair.
  */
 static void genBinaryOpWide(CompilationUnit *cUnit, int vDest,
-                            Armv5teOpCode preinst, Armv5teOpCode inst)
+                            Armv5teOpCode preinst, Armv5teOpCode inst,
+                            int reg0, int reg2)
 {
-    newLIR23(cUnit, preinst, r0, r2);
-    newLIR23(cUnit, inst, r1, r3);
-    storeValuePair(cUnit, r0, r1, vDest, r2);
+    int reg1 = NEXT_REG(reg0);  // high-word register paired with reg0
+    int reg3 = NEXT_REG(reg2);  // high-word register paired with reg2
+    newLIR23(cUnit, preinst, reg0, reg2);   // preinst on the low words
+    newLIR23(cUnit, inst, reg1, reg3);      // inst on the high words
+    storeValuePair(cUnit, reg0, reg1, vDest, reg2); // reg2 reused as scratch
 }
 
 /* Perform a binary operation on 32-bit operands and leave the results in r0. */
-static void genBinaryOp(CompilationUnit *cUnit, int vDest, Armv5teOpCode inst)
+static void genBinaryOp(CompilationUnit *cUnit, int vDest, Armv5teOpCode inst,
+                        int reg0, int reg1, int regDest)
 {
-    newLIR23(cUnit, inst, r0, r1);
-    storeValue(cUnit, r0, vDest, r1);
+    if (EncodingMap[inst].flags & IS_BINARY_OP) {   // 2-operand encoding
+        newLIR2(cUnit, inst, reg0, reg1);           // reg0 is dest and source
+        storeValue(cUnit, reg0, vDest, reg1);       // reg1 reused as scratch
+    } else {                                        // otherwise 3 operands
+        newLIR3(cUnit, inst, regDest, reg0, reg1);  // result goes to regDest
+        storeValue(cUnit, regDest, vDest, reg1);    // reg1 reused as scratch
+    }
 }
 
 /* Create the PC reconstruction slot if not already done */
@@ -436,11 +516,30 @@
     return genCheckCommon(cUnit, dOffset, branch, pcrLabel);
 }
 
-/* Perform null-check on a register */
-static Armv5teLIR *genNullCheck(CompilationUnit *cUnit, int reg, int dOffset,
-                                Armv5teLIR *pcrLabel)
+/*
+ * Null-check Dalvik register vReg, whose value is held in machine register
+ * mReg. If the scoreboard shows vReg was already checked, no code is emitted
+ * and pcrLabel is returned unchanged; otherwise vReg is marked as checked.
+ */
+static Armv5teLIR *genNullCheck(CompilationUnit *cUnit, int vReg, int mReg,
+                                int dOffset, Armv5teLIR *pcrLabel)
 {
-    return genRegImmCheck(cUnit, ARM_COND_EQ, reg, 0, dOffset, pcrLabel);
+    /* Already null-checked: skip the redundant compare-and-branch */
+    if (dvmIsBitSet(cUnit->registerScoreboard.nullCheckedRegs, vReg)) {
+        return pcrLabel;
+    }
+    dvmSetBit(cUnit->registerScoreboard.nullCheckedRegs, vReg);
+    return genRegImmCheck(cUnit, ARM_COND_EQ, mReg, 0, dOffset, pcrLabel);
+}
+
+/*
+ * Zero-check machine register mReg. Unlike genNullCheck, no Dalvik register is
+ * involved, so the check is always emitted and no scoreboard state is updated.
+ */
+static Armv5teLIR *genZeroCheck(CompilationUnit *cUnit, int mReg,
+                                int dOffset, Armv5teLIR *pcrLabel)
+{
+    return genRegImmCheck(cUnit, ARM_COND_EQ, mReg, 0, dOffset, pcrLabel);
 }
 
 /* Perform bound check on two registers */
@@ -463,26 +562,54 @@
 static void genIGetWide(CompilationUnit *cUnit, MIR *mir, int fieldOffset)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
+    int reg0, reg1, reg2, reg3;
 
-    loadValue(cUnit, dInsn->vB, r2);
-    loadConstant(cUnit, r3, fieldOffset);
-    genNullCheck(cUnit, r2, mir->offset, NULL); /* null object? */
-    newLIR3(cUnit, ARMV5TE_ADD_RRR, r2, r2, r3);
-    newLIR2(cUnit, ARMV5TE_LDMIA, r2, (1<<r0 | 1<<r1));
-    storeValuePair(cUnit, r0, r1, dInsn->vA, r3);
+    /* Allocate reg0..reg3 into physical registers r0..r3 */
+
+    /* See if vB is in a native register. If so, reuse it. */
+    reg2 = selectFirstRegister(cUnit, dInsn->vB, false);
+    /* Pin reg3 to the other register of the same pair containing reg2 */
+    reg3 = reg2 ^ 0x1;
+    /*
+     * Pin reg0 to the first register of the alternate register pair
+     */
+    reg0 = (reg2 + 2) & 0x2;
+    reg1 = NEXT_REG(reg0);
+
+    loadValue(cUnit, dInsn->vB, reg2);
+    loadConstant(cUnit, reg3, fieldOffset);
+    genNullCheck(cUnit, dInsn->vB, reg2, mir->offset, NULL); /* null object? */
+    newLIR3(cUnit, ARMV5TE_ADD_RRR, reg2, reg2, reg3);
+    newLIR2(cUnit, ARMV5TE_LDMIA, reg2, (1<<reg0 | 1<<reg1));
+    storeValuePair(cUnit, reg0, reg1, dInsn->vA, reg3);
 }
 
 /* Store a wide field to an object instance */
 static void genIPutWide(CompilationUnit *cUnit, MIR *mir, int fieldOffset)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
+    int reg0, reg1, reg2, reg3;
 
-    loadValue(cUnit, dInsn->vB, r2);
-    loadValuePair(cUnit, dInsn->vA, r0, r1);
-    loadConstant(cUnit, r3, fieldOffset);
-    genNullCheck(cUnit, r2, mir->offset, NULL); /* null object? */
-    newLIR3(cUnit, ARMV5TE_ADD_RRR, r2, r2, r3);
-    newLIR2(cUnit, ARMV5TE_STMIA, r2, (1<<r0 | 1<<r1));
+    /* Allocate reg0..reg3 into physical registers r0..r3 */
+
+    /* See if vB is in a native register. If so, reuse it. */
+    reg2 = selectFirstRegister(cUnit, dInsn->vB, false);
+    /* Pin reg3 to the other register of the same pair containing reg2 */
+    reg3 = reg2 ^ 0x1;
+    /*
+     * Pin reg0 to the first register of the alternate register pair
+     */
+    reg0 = (reg2 + 2) & 0x2;
+    reg1 = NEXT_REG(reg0);
+
+
+    loadValue(cUnit, dInsn->vB, reg2);
+    loadValuePair(cUnit, dInsn->vA, reg0, reg1);
+    updateLiveRegisterPair(cUnit, dInsn->vA, reg0, reg1);
+    loadConstant(cUnit, reg3, fieldOffset);
+    genNullCheck(cUnit, dInsn->vB, reg2, mir->offset, NULL); /* null object? */
+    newLIR3(cUnit, ARMV5TE_ADD_RRR, reg2, reg2, reg3);
+    newLIR2(cUnit, ARMV5TE_STMIA, reg2, (1<<reg0 | 1<<reg1));
 }
 
 /*
@@ -499,13 +626,16 @@
                     int fieldOffset)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
+    int reg0, reg1;
 
+    reg0 = selectFirstRegister(cUnit, dInsn->vB, false);
+    reg1 = NEXT_REG(reg0);
     /* TUNING: write a utility routine to load via base + constant offset */
-    loadValue(cUnit, dInsn->vB, r0);
-    loadConstant(cUnit, r1, fieldOffset);
-    genNullCheck(cUnit, r0, mir->offset, NULL); /* null object? */
-    newLIR3(cUnit, inst, r0, r0, r1);
-    storeValue(cUnit, r0, dInsn->vA, r1);
+    loadValue(cUnit, dInsn->vB, reg0);
+    loadConstant(cUnit, reg1, fieldOffset);
+    genNullCheck(cUnit, dInsn->vB, reg0, mir->offset, NULL); /* null object? */
+    newLIR3(cUnit, inst, reg0, reg0, reg1);
+    storeValue(cUnit, reg0, dInsn->vA, reg1);
 }
 
 /*
@@ -520,13 +650,19 @@
                     int fieldOffset)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
+    int reg0, reg1, reg2;
+
+    reg0 = selectFirstRegister(cUnit, dInsn->vB, false);
+    reg1 = NEXT_REG(reg0);
+    reg2 = NEXT_REG(reg1);
 
     /* TUNING: write a utility routine to load via base + constant offset */
-    loadValue(cUnit, dInsn->vB, r2);
-    loadConstant(cUnit, r1, fieldOffset);
-    loadValue(cUnit, dInsn->vA, r0);
-    genNullCheck(cUnit, r2, mir->offset, NULL); /* null object? */
-    newLIR3(cUnit, inst, r0, r2, r1);
+    loadValue(cUnit, dInsn->vB, reg0);
+    loadConstant(cUnit, reg1, fieldOffset);
+    loadValue(cUnit, dInsn->vA, reg2);
+    updateLiveRegister(cUnit, dInsn->vA, reg2);
+    genNullCheck(cUnit, dInsn->vB, reg0, mir->offset, NULL); /* null object? */
+    newLIR3(cUnit, inst, reg2, reg0, reg1);
 }
 
 
@@ -547,26 +683,33 @@
 {
     int lenOffset = offsetof(ArrayObject, length);
     int dataOffset = offsetof(ArrayObject, contents);
+    int reg0, reg1, reg2, reg3;
 
-    loadValue(cUnit, vArray, r2);
-    loadValue(cUnit, vIndex, r3);
+    reg0 = selectFirstRegister(cUnit, vArray, false);
+    reg1 = NEXT_REG(reg0);
+    reg2 = NEXT_REG(reg1);
+    reg3 = NEXT_REG(reg2);
+
+    loadValue(cUnit, vArray, reg2);
+    loadValue(cUnit, vIndex, reg3);
 
     /* null object? */
-    Armv5teLIR * pcrLabel = genNullCheck(cUnit, r2, mir->offset, NULL);
-    newLIR3(cUnit, ARMV5TE_LDR_RRI5, r0, r2, lenOffset >> 2);  /* Get len */
-    newLIR2(cUnit, ARMV5TE_ADD_RI8, r2, dataOffset); /* r2 -> array data */
-    genBoundsCheck(cUnit, r3, r0, mir->offset, pcrLabel);
+    Armv5teLIR * pcrLabel = genNullCheck(cUnit, vArray, reg2, mir->offset,
+                                         NULL);
+    newLIR3(cUnit, ARMV5TE_LDR_RRI5, reg0, reg2, lenOffset >> 2);  /* Get len */
+    newLIR2(cUnit, ARMV5TE_ADD_RI8, reg2, dataOffset); /* reg2 -> array data */
+    genBoundsCheck(cUnit, reg3, reg0, mir->offset, pcrLabel);
     if (scale) {
-        newLIR3(cUnit, ARMV5TE_LSL, r3, r3, scale);
+        newLIR3(cUnit, ARMV5TE_LSL, reg3, reg3, scale);
     }
     if (scale==3) {
-        newLIR3(cUnit, inst, r0, r2, r3);
-        newLIR2(cUnit, ARMV5TE_ADD_RI8, r2, 4);
-        newLIR3(cUnit, inst, r1, r2, r3);
-        storeValuePair(cUnit, r0, r1, vDest, r3);
+        newLIR3(cUnit, inst, reg0, reg2, reg3);
+        newLIR2(cUnit, ARMV5TE_ADD_RI8, reg2, 4);
+        newLIR3(cUnit, inst, reg1, reg2, reg3);
+        storeValuePair(cUnit, reg0, reg1, vDest, reg3);
     } else {
-        newLIR3(cUnit, inst, r0, r2, r3);
-        storeValue(cUnit, r0, vDest, r3);
+        newLIR3(cUnit, inst, reg0, reg2, reg3);
+        storeValue(cUnit, reg0, vDest, reg3);
     }
 }
 
@@ -585,64 +728,84 @@
 {
     int lenOffset = offsetof(ArrayObject, length);
     int dataOffset = offsetof(ArrayObject, contents);
+    int reg0, reg1, reg2, reg3;
 
-    loadValue(cUnit, vArray, r2);
-    loadValue(cUnit, vIndex, r3);
+    reg0 = selectFirstRegister(cUnit, vArray, false);
+    reg1 = NEXT_REG(reg0);
+    reg2 = NEXT_REG(reg1);
+    reg3 = NEXT_REG(reg2);
+
+    loadValue(cUnit, vArray, reg2);
+    loadValue(cUnit, vIndex, reg3);
+
     /* null object? */
-    Armv5teLIR * pcrLabel = genNullCheck(cUnit, r2, mir->offset, NULL);
-    newLIR3(cUnit, ARMV5TE_LDR_RRI5, r0, r2, lenOffset >> 2);  /* Get len */
-    newLIR2(cUnit, ARMV5TE_ADD_RI8, r2, dataOffset); /* r2 -> array data */
-    genBoundsCheck(cUnit, r3, r0, mir->offset, pcrLabel);
-    /* at this point, r2 points to array, r3 is unscaled index */
+    Armv5teLIR * pcrLabel = genNullCheck(cUnit, vArray, reg2, mir->offset,
+                                         NULL);
+    newLIR3(cUnit, ARMV5TE_LDR_RRI5, reg0, reg2, lenOffset >> 2);  /* Get len */
+    newLIR2(cUnit, ARMV5TE_ADD_RI8, reg2, dataOffset); /* reg2 -> array data */
+    genBoundsCheck(cUnit, reg3, reg0, mir->offset, pcrLabel);
+    /* at this point, reg2 points to array, reg3 is unscaled index */
     if (scale==3) {
-        loadValuePair(cUnit, vSrc, r0, r1);
+        loadValuePair(cUnit, vSrc, reg0, reg1);
+        updateLiveRegisterPair(cUnit, vSrc, reg0, reg1);
     } else {
-        loadValue(cUnit, vSrc, r0);
+        loadValue(cUnit, vSrc, reg0);
+        updateLiveRegister(cUnit, vSrc, reg0);
     }
     if (scale) {
-        newLIR3(cUnit, ARMV5TE_LSL, r3, r3, scale);
+        newLIR3(cUnit, ARMV5TE_LSL, reg3, reg3, scale);
     }
     /*
-     * at this point, r2 points to array, r3 is scaled index, and r0[r1] is
-     * data
+     * at this point, reg2 points to array, reg3 is scaled index, and
+     * reg0[reg1] is data
      */
     if (scale==3) {
-        newLIR3(cUnit, inst, r0, r2, r3);
-        newLIR2(cUnit, ARMV5TE_ADD_RI8, r2, 4);
-        newLIR3(cUnit, inst, r1, r2, r3);
+        newLIR3(cUnit, inst, reg0, reg2, reg3);
+        newLIR2(cUnit, ARMV5TE_ADD_RI8, reg2, 4);
+        newLIR3(cUnit, inst, reg1, reg2, reg3);
     } else {
-        newLIR3(cUnit, inst, r0, r2, r3);
+        newLIR3(cUnit, inst, reg0, reg2, reg3);
     }
 }
 
 static bool genShiftOpLong(CompilationUnit *cUnit, MIR *mir, int vDest,
                            int vSrc1, int vShift)
 {
-     loadValuePair(cUnit, vSrc1, r0, r1);
-     loadValue(cUnit, vShift, r2);
-     switch( mir->dalvikInsn.opCode) {
-         case OP_SHL_LONG:
-         case OP_SHL_LONG_2ADDR:
-             genDispatchToHandler(cUnit, TEMPLATE_SHL_LONG);
-             break;
-         case OP_SHR_LONG:
-         case OP_SHR_LONG_2ADDR:
-             genDispatchToHandler(cUnit, TEMPLATE_SHR_LONG);
-             break;
-         case OP_USHR_LONG:
-         case OP_USHR_LONG_2ADDR:
-             genDispatchToHandler(cUnit, TEMPLATE_USHR_LONG);
-             break;
-         default:
-             return true;
-     }
-     storeValuePair(cUnit, r0, r1, vDest, r2);
-     return false;
+    /*
+     * Keep the fixed registers here (value in r0/r1, shift count in r2):
+     * the out-of-line handler has a particular calling convention.
+     */
+    loadValue(cUnit, vShift, r2);
+    loadValuePair(cUnit, vSrc1, r0, r1);
+    switch( mir->dalvikInsn.opCode) {
+        case OP_SHL_LONG:
+        case OP_SHL_LONG_2ADDR:
+            genDispatchToHandler(cUnit, TEMPLATE_SHL_LONG);
+            break;
+        case OP_SHR_LONG:
+        case OP_SHR_LONG_2ADDR:
+            genDispatchToHandler(cUnit, TEMPLATE_SHR_LONG);
+            break;
+        case OP_USHR_LONG:
+        case OP_USHR_LONG_2ADDR:
+            genDispatchToHandler(cUnit, TEMPLATE_USHR_LONG);
+            break;
+        default:
+            return true;
+    }
+    storeValuePair(cUnit, r0, r1, vDest, r2);
+    return false;
 }
 bool dvmCompilerGenArithOpFloatPortable(CompilationUnit *cUnit, MIR *mir,
                                         int vDest, int vSrc1, int vSrc2)
 {
+    /*
+     * Don't optimize the register usage here as it is governed by the EABI
+     * calling convention.
+     */
     void* funct;
+    int reg0, reg1;
+
     /* TODO: use a proper include file to define these */
     float __aeabi_fadd(float a, float b);
     float __aeabi_fsub(float a, float b);
@@ -650,6 +813,9 @@
     float __aeabi_fmul(float a, float b);
     float fmodf(float a, float b);
 
+    reg0 = selectFirstRegister(cUnit, vSrc2, false);
+    reg1 = NEXT_REG(reg0);
+
     switch (mir->dalvikInsn.opCode) {
         case OP_ADD_FLOAT_2ADDR:
         case OP_ADD_FLOAT:
@@ -672,10 +838,10 @@
             funct = (void*) fmodf;
             break;
         case OP_NEG_FLOAT: {
-            loadValue(cUnit, vSrc2, r0);
-            loadConstant(cUnit, r1, 0x80000000);
-            newLIR3(cUnit, ARMV5TE_ADD_RRR, r0, r0, r1);
-            storeValue(cUnit, r0, vDest, r1);
+            loadValue(cUnit, vSrc2, reg0);
+            loadConstant(cUnit, reg1, 0x80000000);
+            newLIR3(cUnit, ARMV5TE_ADD_RRR, reg0, reg0, reg1);
+            storeValue(cUnit, reg0, vDest, reg1);
             return false;
         }
         default:
@@ -693,6 +859,8 @@
                                          int vDest, int vSrc1, int vSrc2)
 {
     void* funct;
+    int reg0, reg1, reg2;
+
     /* TODO: use a proper include file to define these */
     double __aeabi_dadd(double a, double b);
     double __aeabi_dsub(double a, double b);
@@ -700,6 +868,10 @@
     double __aeabi_dmul(double a, double b);
     double fmod(double a, double b);
 
+    reg0 = selectFirstRegister(cUnit, vSrc2, true);
+    reg1 = NEXT_REG(reg0);
+    reg2 = NEXT_REG(reg1);
+
     switch (mir->dalvikInsn.opCode) {
         case OP_ADD_DOUBLE_2ADDR:
         case OP_ADD_DOUBLE:
@@ -722,15 +894,19 @@
             funct = (void*) fmod;
             break;
         case OP_NEG_DOUBLE: {
-            loadValuePair(cUnit, vSrc2, r0, r1);
-            loadConstant(cUnit, r2, 0x80000000);
-            newLIR3(cUnit, ARMV5TE_ADD_RRR, r1, r1, r2);
-            storeValuePair(cUnit, r0, r1, vDest, r2);
+            loadValuePair(cUnit, vSrc2, reg0, reg1);
+            loadConstant(cUnit, reg2, 0x80000000);
+            newLIR3(cUnit, ARMV5TE_ADD_RRR, reg1, reg1, reg2);
+            storeValuePair(cUnit, reg0, reg1, vDest, reg2);
             return false;
         }
         default:
             return true;
     }
+    /*
+     * Don't optimize the register usage here as it is governed by the EABI
+     * calling convention.
+     */
     loadConstant(cUnit, r4PC, (int)funct);
     loadValuePair(cUnit, vSrc1, r0, r1);
     loadValuePair(cUnit, vSrc2, r2, r3);
@@ -747,6 +923,7 @@
     bool callOut = false;
     void *callTgt;
     int retReg = r0;
+    int reg0, reg1, reg2, reg3;
     /* TODO - find proper .h file to declare these */
     long long __aeabi_ldivmod(long long op1, long long op2);
 
@@ -801,21 +978,36 @@
             firstOp = ARMV5TE_EOR;
             secondOp = ARMV5TE_EOR;
             break;
-        case OP_NEG_LONG:
-            loadValuePair(cUnit, vSrc2, r2, r3);
-            loadConstant(cUnit, r1, 0);
-            newLIR3(cUnit, ARMV5TE_SUB_RRR, r0, r1, r2);
-            newLIR2(cUnit, ARMV5TE_SBC, r1, r3);
-            storeValuePair(cUnit, r0, r1, vDest, r2);
+        case OP_NEG_LONG: {
+            reg0 = selectFirstRegister(cUnit, vSrc2, true);
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+            reg3 = NEXT_REG(reg2);
+            /* Negate as 0 - vSrc2: low word -> reg2, high word -> reg3 */
+            loadValuePair(cUnit, vSrc2, reg0, reg1);
+            loadConstant(cUnit, reg3, 0);
+            newLIR3(cUnit, ARMV5TE_SUB_RRR, reg2, reg3, reg0); /* 0 - lo */
+            newLIR2(cUnit, ARMV5TE_SBC, reg3, reg1); /* 0 - hi - borrow */
+            storeValuePair(cUnit, reg2, reg3, vDest, reg0); /* reg2, not r0 */
+            return false;
+        }
         default:
             LOGE("Invalid long arith op");
             dvmAbort();
     }
     if (!callOut) {
-        loadValuePair(cUnit, vSrc1, r0, r1);
-        loadValuePair(cUnit, vSrc2, r2, r3);
-        genBinaryOpWide(cUnit, vDest, firstOp, secondOp);
+        reg0 = selectFirstRegister(cUnit, vSrc1, true);
+        reg1 = NEXT_REG(reg0);
+        reg2 = NEXT_REG(reg1);
+        reg3 = NEXT_REG(reg2);
+
+        loadValuePair(cUnit, vSrc1, reg0, reg1);
+        loadValuePair(cUnit, vSrc2, reg2, reg3);
+        genBinaryOpWide(cUnit, vDest, firstOp, secondOp, reg0, reg2);
+    /*
+     * Don't optimize the register usage here as it is governed by the EABI
+     * calling convention.
+     */
     } else {
         loadValuePair(cUnit, vSrc2, r2, r3);
         loadConstant(cUnit, r4PC, (int) callTgt);
@@ -834,6 +1026,7 @@
     bool checkZero = false;
     int retReg = r0;
     void *callTgt;
+    int reg0, reg1, regDest;
 
     /* TODO - find proper .h file to declare these */
     int __aeabi_idivmod(int op1, int op2);
@@ -903,15 +1096,46 @@
             dvmAbort();
     }
     if (!callOut) {
-        loadValue(cUnit, vSrc1, r0);
-        loadValue(cUnit, vSrc2, r1);
-        genBinaryOp(cUnit, vDest, armOp);
+         /* Try to allocate reg0 to the currently cached source operand  */
+        if (cUnit->registerScoreboard.liveDalvikReg == vSrc1) {
+            reg0 = selectFirstRegister(cUnit, vSrc1, false);
+            reg1 = NEXT_REG(reg0);
+            regDest = NEXT_REG(reg1);
+
+            loadValue(cUnit, vSrc1, reg0); /* Should be optimized away */
+            loadValue(cUnit, vSrc2, reg1);
+            genBinaryOp(cUnit, vDest, armOp, reg0, reg1, regDest);
+        } else {
+            reg0 = selectFirstRegister(cUnit, vSrc2, false);
+            reg1 = NEXT_REG(reg0);
+            regDest = NEXT_REG(reg1);
+
+            loadValue(cUnit, vSrc1, reg1); /* Load this value first */
+            loadValue(cUnit, vSrc2, reg0); /* May be optimized away */
+            genBinaryOp(cUnit, vDest, armOp, reg1, reg0, regDest);
+        }
     } else {
-        loadValue(cUnit, vSrc2, r1);
+        /*
+         * Load the callout target first since it will never be eliminated
+         * and its value will be used first.
+         */
         loadConstant(cUnit, r2, (int) callTgt);
-        loadValue(cUnit, vSrc1, r0);
+        /*
+         * Load vSrc2 first if it is not cached in a native register or it
+         * is in r0 which will be clobbered if vSrc1 is loaded first.
+         */
+        if (cUnit->registerScoreboard.liveDalvikReg != vSrc2 ||
+            cUnit->registerScoreboard.nativeReg == r0) {
+            /* Cannot be optimized and won't clobber r0 */
+            loadValue(cUnit, vSrc2, r1);
+            /* May be optimized if vSrc1 is cached */
+            loadValue(cUnit, vSrc1, r0);
+        } else {
+            loadValue(cUnit, vSrc1, r0);
+            loadValue(cUnit, vSrc2, r1);
+        }
         if (checkZero) {
-            genNullCheck(cUnit, r1, mir->offset, NULL);
+            genNullCheck(cUnit, vSrc2, r1, mir->offset, NULL);
         }
         newLIR1(cUnit, ARMV5TE_BLX_R, r2);
         storeValue(cUnit, retReg, vDest, r2);
@@ -962,6 +1186,10 @@
 static bool genConversionCall(CompilationUnit *cUnit, MIR *mir, void *funct,
                                      int srcSize, int tgtSize)
 {
+    /*
+     * Don't optimize the register usage since it calls out to template
+     * functions
+     */
     loadConstant(cUnit, r2, (int)funct);
     if (srcSize == 1) {
         loadValue(cUnit, mir->dalvikInsn.vB, r0);
@@ -980,12 +1208,13 @@
 /* Experimental example of completely inlining a native replacement */
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir)
 {
+    /* Don't optimize the register usage */
     int offset = (int) &((InterpState *) NULL)->retval;
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     assert(dInsn->vA == 1);
     loadValue(cUnit, dInsn->arg[0], r0);
     loadConstant(cUnit, r1, gDvm.offJavaLangString_count);
-    genNullCheck(cUnit, r0, mir->offset, NULL);
+    genNullCheck(cUnit, dInsn->arg[0], r0, mir->offset, NULL);
     newLIR3(cUnit, ARMV5TE_LDR_RRR, r0, r0, r1);
     newLIR3(cUnit, ARMV5TE_STR_RRI5, r0, rGLUE, offset >> 2);
     return false;
@@ -1010,7 +1239,8 @@
                 sizeof(StackSaveArea) + (dInsn->vA << 2));
         /* generate null check */
         if (pcrLabel) {
-            *pcrLabel = genNullCheck(cUnit, r0, mir->offset, NULL);
+            *pcrLabel = genNullCheck(cUnit, dInsn->arg[0], r0, mir->offset,
+                                     NULL);
         }
         newLIR2(cUnit, ARMV5TE_STMIA, r7, regMask);
     }
@@ -1048,7 +1278,7 @@
 
     /* generate null check */
     if (pcrLabel) {
-        *pcrLabel = genNullCheck(cUnit, r0, mir->offset, NULL);
+        *pcrLabel = genNullCheck(cUnit, dInsn->vC, r0, mir->offset, NULL);
     }
 
     /*
@@ -1203,17 +1433,28 @@
 
 static bool handleFmt11n_Fmt31i(CompilationUnit *cUnit, MIR *mir)
 {
+    int reg0, reg1, reg2;
+
     switch (mir->dalvikInsn.opCode) {
         case OP_CONST:
-        case OP_CONST_4:
-            loadConstant(cUnit, r0, mir->dalvikInsn.vB);
-            storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
+        case OP_CONST_4: {
+            /* Avoid using the previously used register */
+            reg0 = selectFirstRegister(cUnit, vNone, false);
+            reg1 = NEXT_REG(reg0);
+            loadConstant(cUnit, reg0, mir->dalvikInsn.vB);
+            storeValue(cUnit, reg0, mir->dalvikInsn.vA, reg1);
             break;
-        case OP_CONST_WIDE_32:
-            loadConstant(cUnit, r0, mir->dalvikInsn.vB);
-            newLIR3(cUnit, ARMV5TE_ASR, r1, r0, 31);
-            storeValuePair(cUnit, r0, r1, mir->dalvikInsn.vA, r2);
+        }
+        case OP_CONST_WIDE_32: {
+            /* Avoid using the previously used register */
+            reg0 = selectFirstRegister(cUnit, vNone, true);
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+            loadConstant(cUnit, reg0, mir->dalvikInsn.vB);
+            newLIR3(cUnit, ARMV5TE_ASR, reg1, reg0, 31);
+            storeValuePair(cUnit, reg0, reg1, mir->dalvikInsn.vA, reg2);
             break;
+        }
         default:
             return true;
     }
@@ -1222,16 +1463,26 @@
 
 static bool handleFmt21h(CompilationUnit *cUnit, MIR *mir)
 {
+    int reg0, reg1, reg2;
+
+    /* Avoid using the previously used register */
     switch (mir->dalvikInsn.opCode) {
-        case OP_CONST_HIGH16:
-            loadConstant(cUnit, r0, mir->dalvikInsn.vB << 16);
-            storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
+        case OP_CONST_HIGH16: {
+            reg0 = selectFirstRegister(cUnit, vNone, false);
+            reg1 = NEXT_REG(reg0);
+            loadConstant(cUnit, reg0, mir->dalvikInsn.vB << 16);
+            storeValue(cUnit, reg0, mir->dalvikInsn.vA, reg1);
             break;
-        case OP_CONST_WIDE_HIGH16:
-            loadConstant(cUnit, r1, mir->dalvikInsn.vB << 16);
-            loadConstant(cUnit, r0, 0);
-            storeValuePair(cUnit, r0, r1, mir->dalvikInsn.vA, r2);
+        }
+        case OP_CONST_WIDE_HIGH16: {
+            reg0 = selectFirstRegister(cUnit, vNone, true);
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+            loadConstant(cUnit, reg1, mir->dalvikInsn.vB << 16);
+            loadConstant(cUnit, reg0, 0);
+            storeValuePair(cUnit, reg0, reg1, mir->dalvikInsn.vA, reg2);
             break;
+        }
         default:
             return true;
     }
@@ -1247,6 +1498,15 @@
 
 static bool handleFmt21c_Fmt31c(CompilationUnit *cUnit, MIR *mir)
 {
+    /* Native register to use if the interested value is vA */
+    int regvA = selectFirstRegister(cUnit, mir->dalvikInsn.vA, false);
+    /* Native register to use if source is not from Dalvik registers */
+    int regvNone = selectFirstRegister(cUnit, vNone, false);
+    /* Similar to regvA but for 64-bit values */
+    int regvAWide = selectFirstRegister(cUnit, mir->dalvikInsn.vA, true);
+    /* Similar to regvNone but for 64-bit values */
+    int regvNoneWide = selectFirstRegister(cUnit, vNone, true);
+
     switch (mir->dalvikInsn.opCode) {
         /*
          * TODO: Verify that we can ignore the resolution check here because
@@ -1257,8 +1517,8 @@
             void *strPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResStrings[mir->dalvikInsn.vB]);
             assert(strPtr != NULL);
-            loadConstant(cUnit, r0, (int) strPtr );
-            storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
+            loadConstant(cUnit, regvNone, (int) strPtr );
+            storeValue(cUnit, regvNone, mir->dalvikInsn.vA, NEXT_REG(regvNone));
             break;
         }
         /*
@@ -1269,8 +1529,8 @@
             void *classPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResClasses[mir->dalvikInsn.vB]);
             assert(classPtr != NULL);
-            loadConstant(cUnit, r0, (int) classPtr );
-            storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
+            loadConstant(cUnit, regvNone, (int) classPtr );
+            storeValue(cUnit, regvNone, mir->dalvikInsn.vA, NEXT_REG(regvNone));
             break;
         }
         case OP_SGET_OBJECT:
@@ -1283,19 +1543,24 @@
             void *fieldPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResFields[mir->dalvikInsn.vB]);
             assert(fieldPtr != NULL);
-            loadConstant(cUnit, r0,  (int) fieldPtr + valOffset);
-            newLIR3(cUnit, ARMV5TE_LDR_RRI5, r0, r0, 0);
-            storeValue(cUnit, r0, mir->dalvikInsn.vA, r2);
+            loadConstant(cUnit, regvNone,  (int) fieldPtr + valOffset);
+            newLIR3(cUnit, ARMV5TE_LDR_RRI5, regvNone, regvNone, 0);
+            storeValue(cUnit, regvNone, mir->dalvikInsn.vA, NEXT_REG(regvNone));
             break;
         }
         case OP_SGET_WIDE: {
             int valOffset = (int)&((struct StaticField*)NULL)->value;
             void *fieldPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResFields[mir->dalvikInsn.vB]);
+            int reg0, reg1, reg2;
+
             assert(fieldPtr != NULL);
-            loadConstant(cUnit, r2,  (int) fieldPtr + valOffset);
-            newLIR2(cUnit, ARMV5TE_LDMIA, r2, (1<<r0 | 1<<r1));
-            storeValuePair(cUnit, r0, r1, mir->dalvikInsn.vA, r2);
+            reg0 = regvNoneWide;
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+            loadConstant(cUnit, reg2,  (int) fieldPtr + valOffset);
+            newLIR2(cUnit, ARMV5TE_LDMIA, reg2, (1<<reg0 | 1<<reg1));
+            storeValuePair(cUnit, reg0, reg1, mir->dalvikInsn.vA, reg2);
             break;
         }
         case OP_SPUT_OBJECT:
@@ -1307,23 +1572,35 @@
             int valOffset = (int)&((struct StaticField*)NULL)->value;
             void *fieldPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResFields[mir->dalvikInsn.vB]);
+
             assert(fieldPtr != NULL);
-            loadValue(cUnit, mir->dalvikInsn.vA, r0);
-            loadConstant(cUnit, r1,  (int) fieldPtr + valOffset);
-            newLIR3(cUnit, ARMV5TE_STR_RRI5, r0, r1, 0);
+            loadValue(cUnit, mir->dalvikInsn.vA, regvA);
+            updateLiveRegister(cUnit, mir->dalvikInsn.vA, regvA);
+            loadConstant(cUnit, NEXT_REG(regvA),  (int) fieldPtr + valOffset);
+            newLIR3(cUnit, ARMV5TE_STR_RRI5, regvA, NEXT_REG(regvA), 0);
             break;
         }
         case OP_SPUT_WIDE: {
+            int reg0, reg1, reg2;
             int valOffset = (int)&((struct StaticField*)NULL)->value;
             void *fieldPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResFields[mir->dalvikInsn.vB]);
+
             assert(fieldPtr != NULL);
-            loadValuePair(cUnit, mir->dalvikInsn.vA, r0, r1);
-            loadConstant(cUnit, r2,  (int) fieldPtr + valOffset);
-            newLIR2(cUnit, ARMV5TE_STMIA, r2, (1<<r0 | 1<<r1));
+            reg0 = regvAWide;
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+            loadValuePair(cUnit, mir->dalvikInsn.vA, reg0, reg1);
+            updateLiveRegisterPair(cUnit, mir->dalvikInsn.vA, reg0, reg1);
+            loadConstant(cUnit, reg2,  (int) fieldPtr + valOffset);
+            newLIR2(cUnit, ARMV5TE_STMIA, reg2, (1<<reg0 | 1<<reg1));
             break;
         }
         case OP_NEW_INSTANCE: {
+            /*
+             * Obey the calling convention and don't mess with the register
+             * usage.
+             */
             ClassObject *classPtr = (void*)
               (cUnit->method->clazz->pDvmDex->pResClasses[mir->dalvikInsn.vB]);
             assert(classPtr != NULL);
@@ -1333,8 +1610,8 @@
                 genInterpSingleStep(cUnit, mir);
                 return false;
             }
-            loadConstant(cUnit, r0, (int) classPtr);
             loadConstant(cUnit, r4PC, (int)dvmAllocObject);
+            loadConstant(cUnit, r0, (int) classPtr);
             genExportPC(cUnit, mir, r2, r3 );
             loadConstant(cUnit, r1, ALLOC_DONT_TRACK);
             newLIR1(cUnit, ARMV5TE_BLX_R, r4PC);
@@ -1342,11 +1619,15 @@
              * TODO: As coded, we'll bail and reinterpret on alloc failure.
              * Need a general mechanism to bail to thrown exception code.
              */
-            genNullCheck(cUnit, r0, mir->offset, NULL);
+            genZeroCheck(cUnit, r0, mir->offset, NULL);
             storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
             break;
         }
         case OP_CHECK_CAST: {
+            /*
+             * Obey the calling convention and don't mess with the register
+             * usage.
+             */
             ClassObject *classPtr =
               (cUnit->method->clazz->pDvmDex->pResClasses[mir->dalvikInsn.vB]);
             loadConstant(cUnit, r1, (int) classPtr );
@@ -1356,7 +1637,7 @@
              * instruction made into a trace, but we are seeing NULL at runtime
              * so this check is temporarily used as a workaround.
              */
-            Armv5teLIR * pcrLabel = genNullCheck(cUnit, r1, mir->offset, NULL);
+            Armv5teLIR * pcrLabel = genZeroCheck(cUnit, r1, mir->offset, NULL);
             newLIR2(cUnit, ARMV5TE_CMP_RI8, r0, 0);    /* Null? */
             Armv5teLIR *branch1 =
                 newLIR2(cUnit, ARMV5TE_B_COND, 4, ARM_COND_EQ);
@@ -1369,7 +1650,7 @@
                 newLIR2(cUnit, ARMV5TE_B_COND, 2, ARM_COND_EQ);
             newLIR1(cUnit, ARMV5TE_BLX_R, r4PC);
             /* check cast failed - punt to the interpreter */
-            genNullCheck(cUnit, r0, mir->offset, pcrLabel);
+            genZeroCheck(cUnit, r0, mir->offset, pcrLabel);
             /* check cast passed - branch target here */
             Armv5teLIR *target = newLIR0(cUnit, ARMV5TE_PSEUDO_TARGET_LABEL);
             branch1->generic.target = (LIR *)target;
@@ -1389,9 +1670,9 @@
         case OP_MOVE_EXCEPTION: {
             int offset = offsetof(InterpState, self);
             int exOffset = offsetof(Thread, exception);
-            newLIR3(cUnit, ARMV5TE_LDR_RRI5, r0, rGLUE, offset >> 2);
-            newLIR3(cUnit, ARMV5TE_LDR_RRI5, r1, r0, exOffset >> 2);
-            storeValue(cUnit, r1, mir->dalvikInsn.vA, r0);
+            newLIR3(cUnit, ARMV5TE_LDR_RRI5, r1, rGLUE, offset >> 2);
+            newLIR3(cUnit, ARMV5TE_LDR_RRI5, r0, r1, exOffset >> 2);
+            storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
            break;
         }
         case OP_MOVE_RESULT:
@@ -1445,7 +1726,7 @@
            * Lock/unlock won't throw, and this code does not support
            * DEADLOCK_PREDICTION or MONITOR_TRACKING.  Should it?
            */
-            genNullCheck(cUnit, r1, mir->offset, NULL);
+            genNullCheck(cUnit, mir->dalvikInsn.vA, r1, mir->offset, NULL);
             /* Do the call */
             newLIR1(cUnit, ARMV5TE_BLX_R, r2);
             break;
@@ -1507,6 +1788,7 @@
     OpCode opCode = mir->dalvikInsn.opCode;
     int vSrc1Dest = mir->dalvikInsn.vA;
     int vSrc2 = mir->dalvikInsn.vB;
+    int reg0, reg1, reg2;
 
     /* TODO - find the proper include file to declare these */
 
@@ -1514,6 +1796,14 @@
         return genArithOp( cUnit, mir );
     }
 
+    /*
+     * If data type is 64-bit, re-calculate the register numbers in the
+     * corresponding cases.
+     */
+    reg0 = selectFirstRegister(cUnit, vSrc2, false);
+    reg1 = NEXT_REG(reg0);
+    reg2 = NEXT_REG(reg1);
+
     switch (opCode) {
         case OP_INT_TO_FLOAT:
         case OP_FLOAT_TO_INT:
@@ -1538,45 +1828,55 @@
         case OP_NEG_DOUBLE:
             return dvmCompilerGenArithOpDouble(cUnit, mir, vSrc1Dest,
                                                vSrc1Dest, vSrc2);
-        case OP_MOVE_WIDE:
-            loadValuePair(cUnit, mir->dalvikInsn.vB, r0, r1);
-            storeValuePair(cUnit, r0, r1, mir->dalvikInsn.vA, r2);
+        case OP_MOVE_WIDE: {
+            reg0 = selectFirstRegister(cUnit, vSrc2, true);
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+
+            loadValuePair(cUnit, vSrc2, reg0, reg1);
+            storeValuePair(cUnit, reg0, reg1, vSrc1Dest, reg2);
             break;
-        case OP_INT_TO_LONG:
-            loadValue(cUnit, mir->dalvikInsn.vB, r0);
-            newLIR3(cUnit, ARMV5TE_ASR, r1, r0, 31);
-            storeValuePair(cUnit, r0, r1, mir->dalvikInsn.vA, r2);
+        }
+        case OP_INT_TO_LONG: {
+            reg0 = selectFirstRegister(cUnit, vSrc2, true);
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+
+            loadValue(cUnit, mir->dalvikInsn.vB, reg0);
+            newLIR3(cUnit, ARMV5TE_ASR, reg1, reg0, 31);
+            storeValuePair(cUnit, reg0, reg1, vSrc1Dest, reg2);
             break;
+        }
         case OP_MOVE:
         case OP_MOVE_OBJECT:
         case OP_LONG_TO_INT:
-            loadValue(cUnit, vSrc2, r0);
-            storeValue(cUnit, r0, vSrc1Dest, r1);
+            loadValue(cUnit, vSrc2, reg0);
+            storeValue(cUnit, reg0, vSrc1Dest, reg1);
             break;
         case OP_INT_TO_BYTE:
-            loadValue(cUnit, vSrc2, r0);
-            newLIR3(cUnit, ARMV5TE_LSL, r0, r0, 24);
-            newLIR3(cUnit, ARMV5TE_ASR, r0, r0, 24);
-            storeValue(cUnit, r0, vSrc1Dest, r1);
+            loadValue(cUnit, vSrc2, reg0);
+            newLIR3(cUnit, ARMV5TE_LSL, reg0, reg0, 24);
+            newLIR3(cUnit, ARMV5TE_ASR, reg0, reg0, 24);
+            storeValue(cUnit, reg0, vSrc1Dest, reg1);
             break;
         case OP_INT_TO_SHORT:
-            loadValue(cUnit, vSrc2, r0);
-            newLIR3(cUnit, ARMV5TE_LSL, r0, r0, 16);
-            newLIR3(cUnit, ARMV5TE_ASR, r0, r0, 16);
-            storeValue(cUnit, r0, vSrc1Dest, r1);
+            loadValue(cUnit, vSrc2, reg0);
+            newLIR3(cUnit, ARMV5TE_LSL, reg0, reg0, 16);
+            newLIR3(cUnit, ARMV5TE_ASR, reg0, reg0, 16);
+            storeValue(cUnit, reg0, vSrc1Dest, reg1);
             break;
         case OP_INT_TO_CHAR:
-            loadValue(cUnit, vSrc2, r0);
-            newLIR3(cUnit, ARMV5TE_LSL, r0, r0, 16);
-            newLIR3(cUnit, ARMV5TE_LSR, r0, r0, 16);
-            storeValue(cUnit, r0, vSrc1Dest, r1);
+            loadValue(cUnit, vSrc2, reg0);
+            newLIR3(cUnit, ARMV5TE_LSL, reg0, reg0, 16);
+            newLIR3(cUnit, ARMV5TE_LSR, reg0, reg0, 16);
+            storeValue(cUnit, reg0, vSrc1Dest, reg1);
             break;
         case OP_ARRAY_LENGTH: {
             int lenOffset = offsetof(ArrayObject, length);
-            loadValue(cUnit, vSrc2, r0);
-            genNullCheck(cUnit, r0, mir->offset, NULL);
-            newLIR3(cUnit, ARMV5TE_LDR_RRI5, r0, r0, lenOffset >> 2);
-            storeValue(cUnit, r0, vSrc1Dest, r1);
+            loadValue(cUnit, vSrc2, reg0);
+            genNullCheck(cUnit, vSrc2, reg0, mir->offset, NULL);
+            newLIR3(cUnit, ARMV5TE_LDR_RRI5, reg0, reg0, lenOffset >> 2);
+            storeValue(cUnit, reg0, vSrc1Dest, reg1);
             break;
         }
         default:
@@ -1588,80 +1888,34 @@
 static bool handleFmt21s(CompilationUnit *cUnit, MIR *mir)
 {
     OpCode dalvikOpCode = mir->dalvikInsn.opCode;
+    int reg0, reg1, reg2;
+
     /* It takes few instructions to handle OP_CONST_WIDE_16 inline */
     if (dalvikOpCode == OP_CONST_WIDE_16) {
-        int rDest = mir->dalvikInsn.vA;
+        int vDest = mir->dalvikInsn.vA;
         int BBBB = mir->dalvikInsn.vB;
-        int rLow = r0, rHigh = r1;
-        if (BBBB == 0) {
-            newLIR2(cUnit, ARMV5TE_MOV_IMM, rLow, 0);
-            rHigh = rLow;
-        } else if (BBBB > 0 && BBBB <= 255) {
-            /* rLow = ssssBBBB */
-            newLIR2(cUnit, ARMV5TE_MOV_IMM, rLow, BBBB);
-            /* rHigh = 0 */
-            newLIR2(cUnit, ARMV5TE_MOV_IMM, rHigh, 0);
-        } else {
-            loadConstant(cUnit, rLow, BBBB);
-            /*
-             * arithmetic-shift-right 32 bits to get the high half of long
-             * [63..32]
-             */
-            newLIR3(cUnit, ARMV5TE_ASR, rHigh, rLow, 0);
+
+        reg0 = selectFirstRegister(cUnit, vNone, true);
+        reg1 = NEXT_REG(reg0);
+        reg2 = NEXT_REG(reg1);
+
+        loadConstant(cUnit, reg0, BBBB);
+        loadConstant(cUnit, reg1, 0);      /* high word when BBBB >= 0 */
+        if (BBBB < 0) {                    /* sign-extend: high word = -1 */
+            newLIR2(cUnit, ARMV5TE_SUB_RI8, reg1, 1);  /* 0 - 1 = -1 */
         }
 
         /* Save the long values to the specified Dalvik register pair */
-        /*
-         * If rDest is no greater than 30, use two "str rd, [rFP + immed_5]"
-         * instructions to store the results. Effective address is
-         * rFP + immed_5 << 2.
-         */
-        if (rDest < 31) {
-            newLIR3(cUnit, ARMV5TE_STR_RRI5, rLow, rFP, rDest);
-            newLIR3(cUnit, ARMV5TE_STR_RRI5, rHigh, rFP, rDest+1);
-        } else {
-          /*
-           * Otherwise just load the frame offset from the constant pool and add
-           * it to rFP. Then use stmia to store the results to the specified
-           * register pair.
-           */
-            /* Need to replicate the content in r0 to r1 */
-            if (rLow == rHigh) {
-                newLIR3(cUnit, ARMV5TE_ADD_RRI3, rLow+1, rLow, 0);
-            }
-            /* load the rFP offset into r2 */
-            loadConstant(cUnit, r2, rDest*4);
-            newLIR3(cUnit, ARMV5TE_ADD_RRR, r2, rFP, r2);
-            newLIR2(cUnit, ARMV5TE_STMIA, r2, (1<<r0 | 1 << r1));
-        }
+        storeValuePair(cUnit, reg0, reg1, vDest, reg2);
     } else if (dalvikOpCode == OP_CONST_16) {
-        int rDest = mir->dalvikInsn.vA;
+        int vDest = mir->dalvikInsn.vA;
         int BBBB = mir->dalvikInsn.vB;
-        if (BBBB >= 0 && BBBB <= 255) {
-            /* r0 = BBBB */
-            newLIR2(cUnit, ARMV5TE_MOV_IMM, r0, BBBB);
-        } else {
-            loadConstant(cUnit, r0, BBBB);
-        }
 
-        /* Save the constant to the specified Dalvik register */
-        /*
-         * If rDest is no greater than 31, effective address is
-         * rFP + immed_5 << 2.
-         */
-        if (rDest < 32) {
-            newLIR3(cUnit, ARMV5TE_STR_RRI5, r0, rFP, rDest);
-        } else {
-          /*
-           * Otherwise just load the frame offset from the constant pool and add
-           * it to rFP. Then use stmia to store the results to the specified
-           * register pair.
-           */
-            /* load the rFP offset into r2 */
-            loadConstant(cUnit, r2, rDest*4);
-            newLIR3(cUnit, ARMV5TE_ADD_RRR, r2, rFP, r2);
-            newLIR3(cUnit, ARMV5TE_STR_RRI5, r0, r2, 0);
-        }
+        reg0 = selectFirstRegister(cUnit, vNone, false);
+        reg1 = NEXT_REG(reg0);
+
+        loadConstant(cUnit, reg0, BBBB);
+        storeValue(cUnit, reg0, vDest, reg1);
     } else {
         return true;
     }
@@ -1674,9 +1928,10 @@
 {
     OpCode dalvikOpCode = mir->dalvikInsn.opCode;
     Armv5teConditionCode cond;
+    int reg0 = selectFirstRegister(cUnit, mir->dalvikInsn.vA, false);
 
-    loadValue(cUnit, mir->dalvikInsn.vA, r0);
-    newLIR2(cUnit, ARMV5TE_CMP_RI8, r0, 0);
+    loadValue(cUnit, mir->dalvikInsn.vA, reg0);
+    newLIR2(cUnit, ARMV5TE_CMP_RI8, reg0, 0);
 
     switch (dalvikOpCode) {
         case OP_IF_EQZ:
@@ -1715,6 +1970,11 @@
     int vDest = mir->dalvikInsn.vA;
     int lit = mir->dalvikInsn.vC;
     int armOp;
+    int reg0, reg1, regDest;
+
+    reg0 = selectFirstRegister(cUnit, vSrc, false);
+    reg1 = NEXT_REG(reg0);
+    regDest = NEXT_REG(reg1);
 
     /* TODO: find the proper .h file to declare these */
     int __aeabi_idivmod(int op1, int op2);
@@ -1723,25 +1983,32 @@
     switch (dalvikOpCode) {
         case OP_ADD_INT_LIT8:
         case OP_ADD_INT_LIT16:
-            loadValue(cUnit, vSrc, r0);
-            if (lit <= 255 && lit >= 0) {
-                newLIR2(cUnit, ARMV5TE_ADD_RI8, r0, lit);
-                storeValue(cUnit, r0, vDest, r1);
+            loadValue(cUnit, vSrc, reg0);
+            if (lit <= 7 && lit >= 0) {
+                newLIR3(cUnit, ARMV5TE_ADD_RRI3, regDest, reg0, lit);
+                storeValue(cUnit, regDest, vDest, reg1);
+            } else if (lit <= 255 && lit >= 0) {
+                newLIR2(cUnit, ARMV5TE_ADD_RI8, reg0, lit);
+                storeValue(cUnit, reg0, vDest, reg1);
+            } else if (lit >= -7 && lit <= 0) {
+                /* Convert to a small constant subtraction */
+                newLIR3(cUnit, ARMV5TE_SUB_RRI3, regDest, reg0, -lit);
+                storeValue(cUnit, regDest, vDest, reg1);
             } else if (lit >= -255 && lit <= 0) {
                 /* Convert to a small constant subtraction */
-                newLIR2(cUnit, ARMV5TE_SUB_RI8, r0, -lit);
-                storeValue(cUnit, r0, vDest, r1);
+                newLIR2(cUnit, ARMV5TE_SUB_RI8, reg0, -lit);
+                storeValue(cUnit, reg0, vDest, reg1);
             } else {
-                loadConstant(cUnit, r1, lit);
-                genBinaryOp(cUnit, vDest, ARMV5TE_ADD_RRR);
+                loadConstant(cUnit, reg1, lit);
+                genBinaryOp(cUnit, vDest, ARMV5TE_ADD_RRR, reg0, reg1, regDest);
             }
             break;
 
         case OP_RSUB_INT_LIT8:
         case OP_RSUB_INT:
-            loadValue(cUnit, vSrc, r1);
-            loadConstant(cUnit, r0, lit);
-            genBinaryOp(cUnit, vDest, ARMV5TE_SUB_RRR);
+            loadValue(cUnit, vSrc, reg1);
+            loadConstant(cUnit, reg0, lit);
+            genBinaryOp(cUnit, vDest, ARMV5TE_SUB_RRR, reg0, reg1, regDest);
             break;
 
         case OP_MUL_INT_LIT8:
@@ -1752,8 +2019,8 @@
         case OP_OR_INT_LIT16:
         case OP_XOR_INT_LIT8:
         case OP_XOR_INT_LIT16:
-            loadValue(cUnit, vSrc, r0);
-            loadConstant(cUnit, r1, lit);
+            loadValue(cUnit, vSrc, reg0);
+            loadConstant(cUnit, reg1, lit);
             switch (dalvikOpCode) {
                 case OP_MUL_INT_LIT8:
                 case OP_MUL_INT_LIT16:
@@ -1774,13 +2041,13 @@
                 default:
                     dvmAbort();
             }
-            genBinaryOp(cUnit, vDest, armOp);
+            genBinaryOp(cUnit, vDest, armOp, reg0, reg1, regDest);
             break;
 
         case OP_SHL_INT_LIT8:
         case OP_SHR_INT_LIT8:
         case OP_USHR_INT_LIT8:
-            loadValue(cUnit, vSrc, r0);
+            loadValue(cUnit, vSrc, reg0);
             switch (dalvikOpCode) {
                 case OP_SHL_INT_LIT8:
                     armOp = ARMV5TE_LSL;
@@ -1793,12 +2060,13 @@
                     break;
                 default: dvmAbort();
             }
-            newLIR3(cUnit, armOp, r0, r0, lit);
-            storeValue(cUnit, r0, vDest, r1);
+            newLIR3(cUnit, armOp, reg0, reg0, lit);
+            storeValue(cUnit, reg0, vDest, reg1);
             break;
 
         case OP_DIV_INT_LIT8:
         case OP_DIV_INT_LIT16:
+            /* Register usage based on the calling convention */
             if (lit == 0) {
                 /* Let the interpreter deal with div by 0 */
                 genInterpSingleStep(cUnit, mir);
@@ -1813,6 +2081,7 @@
 
         case OP_REM_INT_LIT8:
         case OP_REM_INT_LIT16:
+            /* Register usage based on the calling convention */
             if (lit == 0) {
                 /* Let the interpreter deal with div by 0 */
                 genInterpSingleStep(cUnit, mir);
@@ -1867,7 +2136,7 @@
              * TODO: As coded, we'll bail and reinterpret on alloc failure.
              * Need a general mechanism to bail to thrown exception code.
              */
-            genNullCheck(cUnit, r0, mir->offset, pcrLabel);
+            genZeroCheck(cUnit, r0, mir->offset, pcrLabel);
             storeValue(cUnit, r0, mir->dalvikInsn.vA, r1);
             break;
         }
@@ -1974,10 +2243,22 @@
 {
     OpCode dalvikOpCode = mir->dalvikInsn.opCode;
     Armv5teConditionCode cond;
+    int reg0, reg1;
 
-    loadValue(cUnit, mir->dalvikInsn.vA, r0);
-    loadValue(cUnit, mir->dalvikInsn.vB, r1);
-    newLIR2(cUnit, ARMV5TE_CMP_RR, r0, r1);
+    if (cUnit->registerScoreboard.liveDalvikReg == (int) mir->dalvikInsn.vA) {
+        reg0 = selectFirstRegister(cUnit, mir->dalvikInsn.vA, false);
+        reg1 = NEXT_REG(reg0);
+        /* Load vB first since vA can be fetched via a move */
+        loadValue(cUnit, mir->dalvikInsn.vB, reg1);
+        loadValue(cUnit, mir->dalvikInsn.vA, reg0);
+    } else {
+        reg0 = selectFirstRegister(cUnit, mir->dalvikInsn.vB, false);
+        reg1 = NEXT_REG(reg0);
+        /* Load vA first since vB can be fetched via a move */
+        loadValue(cUnit, mir->dalvikInsn.vA, reg0);
+        loadValue(cUnit, mir->dalvikInsn.vB, reg1);
+    }
+    newLIR2(cUnit, ARMV5TE_CMP_RR, reg0, reg1);
 
     switch (dalvikOpCode) {
         case OP_IF_EQ:
@@ -2014,20 +2295,28 @@
     OpCode opCode = mir->dalvikInsn.opCode;
     int vSrc1Dest = mir->dalvikInsn.vA;
     int vSrc2 = mir->dalvikInsn.vB;
+    int reg0, reg1, reg2;
 
     switch (opCode) {
         case OP_MOVE_16:
         case OP_MOVE_OBJECT_16:
         case OP_MOVE_FROM16:
-        case OP_MOVE_OBJECT_FROM16:
-            loadValue(cUnit, vSrc2, r0);
-            storeValue(cUnit, r0, vSrc1Dest, r1);
+        case OP_MOVE_OBJECT_FROM16: {
+            reg0 = selectFirstRegister(cUnit, vSrc2, false);
+            reg1 = NEXT_REG(reg0);
+            loadValue(cUnit, vSrc2, reg0);
+            storeValue(cUnit, reg0, vSrc1Dest, reg1);
             break;
+        }
         case OP_MOVE_WIDE_16:
-        case OP_MOVE_WIDE_FROM16:
-            loadValuePair(cUnit, vSrc2, r0, r1);
-            storeValuePair(cUnit, r0, r1, vSrc1Dest, r2);
+        case OP_MOVE_WIDE_FROM16: {
+            reg0 = selectFirstRegister(cUnit, vSrc2, true);
+            reg1 = NEXT_REG(reg0);
+            reg2 = NEXT_REG(reg1);
+            loadValuePair(cUnit, vSrc2, reg0, reg1);
+            storeValuePair(cUnit, reg0, reg1, vSrc1Dest, reg2);
             break;
+        }
         default:
             return true;
     }
@@ -2041,6 +2330,7 @@
     int vB = mir->dalvikInsn.vB;
     int vC = mir->dalvikInsn.vC;
 
+    /* Don't optimize for register usage since out-of-line handlers are used */
     if ( (opCode >= OP_ADD_INT) && (opCode <= OP_REM_DOUBLE)) {
         return genArithOp( cUnit, mir );
     }
@@ -2108,7 +2398,7 @@
                  (int) (cUnit->method->insns + mir->offset));
             genExportPC(cUnit, mir, r2, r3 );
             newLIR1(cUnit, ARMV5TE_BLX_R, r4PC);
-            genNullCheck(cUnit, r0, mir->offset, NULL);
+            genZeroCheck(cUnit, r0, mir->offset, NULL);
             break;
         }
         /*
@@ -2458,8 +2748,9 @@
             newLIR1(cUnit, ARMV5TE_ADD_SPI7, 2);
 
             /* Did we throw? If so, redo under interpreter*/
-            genNullCheck(cUnit, r0, mir->offset, NULL);
+            genZeroCheck(cUnit, r0, mir->offset, NULL);
 
+            resetRegisterScoreboard(cUnit);
             break;
         }
         default:
@@ -2574,6 +2865,8 @@
 
         if (blockList[i]->blockType == DALVIK_BYTECODE) {
             labelList[i].opCode = ARMV5TE_PSEUDO_NORMAL_BLOCK_LABEL;
+            /* Reset the register state */
+            resetRegisterScoreboard(cUnit);
         } else {
             switch (blockList[i]->blockType) {
                 case CHAINING_CELL_NORMAL:
@@ -2620,12 +2913,20 @@
             }
             continue;
         }
+
+        Armv5teLIR *headLIR = NULL;
+
         for (mir = blockList[i]->firstMIRInsn; mir; mir = mir->next) {
             OpCode dalvikOpCode = mir->dalvikInsn.opCode;
             InstructionFormat dalvikFormat =
                 dexGetInstrFormat(gDvm.instrFormat, dalvikOpCode);
-            newLIR2(cUnit, ARMV5TE_PSEUDO_DALVIK_BYTECODE_BOUNDARY,
-                    mir->offset,dalvikOpCode);
+            Armv5teLIR *boundaryLIR =
+                newLIR2(cUnit, ARMV5TE_PSEUDO_DALVIK_BYTECODE_BOUNDARY,
+                        mir->offset,dalvikOpCode);
+            /* Remember the first LIR for this block */
+            if (headLIR == NULL) {
+                headLIR = boundaryLIR;
+            }
             bool notHandled;
             /*
              * Debugging: screen the opcode first to see if it is in the
@@ -2732,6 +3033,10 @@
                 break;
             }
         }
+
+        /* Eliminate redundant loads/stores and delay stores into later slots */
+        dvmCompilerApplyLocalOptimizations(cUnit, (LIR *) headLIR,
+                                           cUnit->lastLIRInsn);
         /*
          * Check if the block is terminated due to trace length constraint -
          * insert an unconditional branch to the chaining cell.
@@ -2743,7 +3048,7 @@
 
     }
 
-    /* Handle the codegen in predefined order */
+    /* Handle the chaining cells in predefined order */
     for (i = 0; i < CHAINING_CELL_LAST; i++) {
         size_t j;
         int *blockIdList = (int *) chainingListByType[i].elemList;
@@ -2786,6 +3091,8 @@
             }
         }
     }
+
+    dvmCompilerApplyGlobalOptimizations(cUnit);
 }
 
 /* Accept the work and start compiling */
@@ -2910,7 +3217,7 @@
  * Exported version of genDispatchToHandler
  * TODO: revisit source file structure
  */
-void dvmCompilerGenDispatchToHandler(CompilationUnit *cUnit, 
+void dvmCompilerGenDispatchToHandler(CompilationUnit *cUnit,
                                      TemplateOpCode opCode)
 {
     genDispatchToHandler(cUnit, opCode);
diff --git a/vm/compiler/codegen/armv5te/FpCodegen-armv5te-vfp.c b/vm/compiler/codegen/armv5te/FpCodegen-armv5te-vfp.c
index 7e483c8..a2dc3bf 100644
--- a/vm/compiler/codegen/armv5te/FpCodegen-armv5te-vfp.c
+++ b/vm/compiler/codegen/armv5te/FpCodegen-armv5te-vfp.c
@@ -23,6 +23,10 @@
 {
     TemplateOpCode opCode;
 
+    /*
+     * Don't attempt to optimize register usage since these opcodes call out to
+     * the handlers.
+     */
     switch (mir->dalvikInsn.opCode) {
         case OP_ADD_FLOAT_2ADDR:
         case OP_ADD_FLOAT:
@@ -60,6 +64,10 @@
 {
     TemplateOpCode opCode;
 
+    /*
+     * Don't attempt to optimize register usage since these opcodes call out to
+     * the handlers.
+     */
     switch (mir->dalvikInsn.opCode) {
         case OP_ADD_DOUBLE_2ADDR:
         case OP_ADD_DOUBLE:
@@ -137,6 +145,11 @@
                                 int vSrc1, int vSrc2)
 {
     TemplateOpCode template;
+
+    /*
+     * Don't attempt to optimize register usage since these opcodes call out to
+     * the handlers.
+     */
     switch(mir->dalvikInsn.opCode) {
         case OP_CMPL_FLOAT:
             template = TEMPLATE_CMPL_FLOAT_VFP;
diff --git a/vm/compiler/codegen/armv5te/FpCodegen-armv5te.c b/vm/compiler/codegen/armv5te/FpCodegen-armv5te.c
index 26a40bc..b69824d 100644
--- a/vm/compiler/codegen/armv5te/FpCodegen-armv5te.c
+++ b/vm/compiler/codegen/armv5te/FpCodegen-armv5te.c
@@ -38,6 +38,10 @@
 bool dvmCompilerGenCmpX(CompilationUnit *cUnit, MIR *mir, int vDest,
                                 int vSrc1, int vSrc2)
 {
+    /*
+     * Don't attempt to optimize register usage since these opcodes call out to
+     * the handlers.
+     */
     switch (mir->dalvikInsn.opCode) {
         case OP_CMPL_FLOAT:
             dvmCompilerLoadValue(cUnit, vSrc1, r0);
diff --git a/vm/compiler/codegen/armv5te/GlobalOptimizations.c b/vm/compiler/codegen/armv5te/GlobalOptimizations.c
new file mode 100644
index 0000000..2b8ec6f
--- /dev/null
+++ b/vm/compiler/codegen/armv5te/GlobalOptimizations.c
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dalvik.h"
+#include "vm/compiler/CompilerInternals.h"
+#include "Armv5teLIR.h"
+
+/*
+ * Mark as nops the unconditional branches whose target is reached by falling
+ * through; only pseudo ops (other than ALIGN4) may separate the two.
+ */
+static void applyRedundantBranchElimination(CompilationUnit *cUnit)
+{
+    Armv5teLIR *thisLIR;
+
+    for (thisLIR = (Armv5teLIR *) cUnit->firstLIRInsn;
+         thisLIR != (Armv5teLIR *) cUnit->lastLIRInsn;
+         thisLIR = NEXT_LIR(thisLIR)) {
+
+        /* Branch to the next instruction */
+        if (thisLIR->opCode == ARMV5TE_B_UNCOND) {
+            Armv5teLIR *nextLIR = thisLIR;
+
+            /* Stop at the end of the list instead of dereferencing NULL */
+            while ((nextLIR = NEXT_LIR(nextLIR)) != NULL) {
+
+                /*
+                 * Is the branch target the next instruction?
+                 */
+                if (nextLIR == (Armv5teLIR *) thisLIR->generic.target) {
+                    thisLIR->isNop = true;
+                    break;
+                }
+
+                /*
+                 * Found real useful stuff between the branch and the target
+                 */
+                if (!isPseudoOpCode(nextLIR->opCode) ||
+                    nextLIR->opCode == ARMV5TE_PSEUDO_ALIGN4)
+                    break;
+            }
+        }
+    }
+}
+
+void dvmCompilerApplyGlobalOptimizations(CompilationUnit *cUnit)
+{
+    applyRedundantBranchElimination(cUnit); /* nop branches to the next LIR */
+}
diff --git a/vm/compiler/codegen/armv5te/LocalOptimizations.c b/vm/compiler/codegen/armv5te/LocalOptimizations.c
new file mode 100644
index 0000000..1ce91af
--- /dev/null
+++ b/vm/compiler/codegen/armv5te/LocalOptimizations.c
@@ -0,0 +1,139 @@
+/*
+ * Copyright (C) 2009 The Android Open Source Project
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "Dalvik.h"
+#include "vm/compiler/CompilerInternals.h"
+#include "Armv5teLIR.h"
+
+/*
+ * Walk [headLIR, tailLIR) top-down and, for each rFP-based STR_RRI5,
+ * 1) Eliminate redundant loads and stores of the same Dalvik register
+ * 2) Sink the store to the latest possible slot
+ */
+static void applyLoadStoreElimination(CompilationUnit *cUnit,
+                                      Armv5teLIR *headLIR,
+                                      Armv5teLIR *tailLIR)
+{
+    Armv5teLIR *thisLIR;
+
+    cUnit->optRound++;      /* new round; sunk stores below get this age */
+    for (thisLIR = headLIR;
+         thisLIR != tailLIR;
+         thisLIR = NEXT_LIR(thisLIR)) {
+        /* Skip stores re-inserted (sunk) during this round */
+        if (thisLIR->age >= cUnit->optRound) {
+            continue;
+        }
+        if (thisLIR->opCode == ARMV5TE_STR_RRI5 &&
+            thisLIR->operands[1] == rFP) {
+            int dRegId = thisLIR->operands[2];       /* offset operand */
+            int nativeRegId = thisLIR->operands[0];  /* register stored */
+            Armv5teLIR *checkLIR;
+            int sinkDistance = 0;    /* non-pseudo LIRs passed so far */
+
+            for (checkLIR = NEXT_LIR(thisLIR);
+                 checkLIR != tailLIR;
+                 checkLIR = NEXT_LIR(checkLIR)) {
+
+                /* Reload of the value just stored: the load is redundant */
+                if (checkLIR->opCode == ARMV5TE_LDR_RRI5 &&
+                    checkLIR->operands[1] == rFP &&
+                    checkLIR->operands[2] == dRegId) {
+                    /* Different target register: replace load with a move */
+                    if (checkLIR->operands[0] != nativeRegId) {
+                        Armv5teLIR *moveLIR =
+                            dvmCompilerNew(sizeof(Armv5teLIR), true);
+                        moveLIR->opCode = ARMV5TE_MOV_RR;
+                        moveLIR->operands[0] = checkLIR->operands[0];
+                        moveLIR->operands[1] = nativeRegId;
+                        /*
+                         * Insertion is guaranteed to succeed since checkLIR
+                         * is never the first LIR on the list
+                         */
+                        dvmCompilerInsertLIRBefore((LIR *) checkLIR,
+                                                   (LIR *) moveLIR);
+                    }
+                    checkLIR->isNop = true;
+                    continue;
+
+                /* Same slot is stored again later - nuke the previous store */
+                } else if (checkLIR->opCode == ARMV5TE_STR_RRI5 &&
+                           checkLIR->operands[1] == rFP &&
+                           checkLIR->operands[2] == dRegId) {
+                    thisLIR->isNop = true;
+                    break;
+                /* Find out the latest slot that the store can be sunk into */
+                } else {
+                    bool stopHere = false;
+
+                    /* End of list; NOTE(review): loop stops before tailLIR */
+                    stopHere |= checkLIR->generic.next == NULL;
+
+                    /* Store data is clobbered (checkLIR writes nativeRegId) */
+                    stopHere |= (EncodingMap[checkLIR->opCode].flags &
+                                 CLOBBER_DEST) != 0 &&
+                                checkLIR->operands[0] == nativeRegId;
+                    /*
+                     * Conservatively assume there is a memory dependency
+                     * for st/ld multiples and reg+reg address mode
+                     */
+                    stopHere |= checkLIR->opCode == ARMV5TE_STMIA ||
+                                checkLIR->opCode == ARMV5TE_LDMIA ||
+                                checkLIR->opCode == ARMV5TE_STR_RRR ||
+                                checkLIR->opCode == ARMV5TE_LDR_RRR;
+
+                    stopHere |= (EncodingMap[checkLIR->opCode].flags &
+                                 IS_BRANCH) != 0;  /* never sink past a branch */
+
+                    /* Found a new place to put the store - move it here */
+                    if (stopHere == true) {
+
+                        /* Only move if at least one real LIR was passed */
+                        if (sinkDistance != 0) {
+                            Armv5teLIR *newStoreLIR =
+                                dvmCompilerNew(sizeof(Armv5teLIR), true);
+                            *newStoreLIR = *thisLIR;
+                            newStoreLIR->age = cUnit->optRound;
+                            /*
+                             * Insertion is guaranteed to succeed since checkLIR
+                             * is never the first LIR on the list
+                             */
+                            dvmCompilerInsertLIRBefore((LIR *) checkLIR,
+                                                       (LIR *) newStoreLIR);
+                            thisLIR->isNop = true;
+                        }
+                        break;
+                    }
+
+                    /*
+                     * Saw a real instruction that the store can be sunk after
+                     */
+                    if (!isPseudoOpCode(checkLIR->opCode)) {
+                        sinkDistance++;
+                    }
+                }
+            }
+        }
+    }
+}
+
+void dvmCompilerApplyLocalOptimizations(CompilationUnit *cUnit, LIR *headLIR,
+                                        LIR *tailLIR)
+{
+    /* headLIR is NULL when the block produced no LIR - nothing to scan */
+    if (headLIR == NULL) return;
+    applyLoadStoreElimination(cUnit, (Armv5teLIR *) headLIR,
+                              (Armv5teLIR *) tailLIR);
+}