Add support to do suspend polling on backward branches in JIT'ed code.

The polling is expensive for now as it is done through three
instructions: ld/ld/branch. As a result, a bunch of bonus stuff has
been worked on to mitigate the extra overhead:

- Cleaned up resource flags for memory disambiguation.
- Rewrote load/store elimination and scheduler routines to hide
  the ld/ld latency for GC flag. Separate the dependency checking into
  memory disambiguation part and resource conflict part.
- Allowed code motion for Dalvik/constant/non-aliasing loads to be
  hoisted above branches for null/range checks.
- Created extended basic blocks following goto instructions so that
  longer instruction streams can be optimized as a whole.

Without the bonus stuff, the performance dropped about ~5-10% on some
benchmarks because of the lack of headroom to hide the polling latency
in tight loops. With the bonus stuff, the performance delta is between
+/-5% with polling code generated. With the bonus stuff but disabling
polling, the new bonus stuff provides consistent performance
improvements:

CaffeineMark  3.6%
Linpack      11.1%
Scimark       9.7%
Sieve        33.0%
Checkers      6.0%

As a result, GC polling is disabled by default but can be turned on
through the -Xjitsuspendpoll flag for experimental purposes.

Change-Id: Ia81fc85de3e2b70e6cc93bc37c2b845892003cdb
diff --git a/vm/compiler/codegen/arm/ArchUtility.c b/vm/compiler/codegen/arm/ArchUtility.c
index c6bcac2..be6d56b 100644
--- a/vm/compiler/codegen/arm/ArchUtility.c
+++ b/vm/compiler/codegen/arm/ArchUtility.c
@@ -263,10 +263,22 @@
         if (mask & ENCODE_FP_STATUS) {
             strcat(buf, "fpcc ");
         }
+
+        /* Memory bits */
         if (armLIR && (mask & ENCODE_DALVIK_REG)) {
             sprintf(buf + strlen(buf), "dr%d%s", armLIR->aliasInfo & 0xffff,
                     (armLIR->aliasInfo & 0x80000000) ? "(+1)" : "");
         }
+        if (mask & ENCODE_LITERAL) {
+            strcat(buf, "lit ");
+        }
+
+        if (mask & ENCODE_HEAP_REF) {
+            strcat(buf, "heap ");
+        }
+        if (mask & ENCODE_MUST_NOT_ALIAS) {
+            strcat(buf, "noalias ");
+        }
     }
     if (buf[0]) {
         LOGD("%s: %s", prefix, buf);
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index d3e145e..5adc1ed 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -129,12 +129,12 @@
     kFPReg0     = 16,
     kRegEnd     = 48,
     kCCode      = kRegEnd,
-    kFPStatus,
-    kDalvikReg,
-    kLiteral,
-    kFrameRef,
-    kHeapRef,
-    kLitPoolRef
+    kFPStatus,          // FP status word
+    // The following four bits are for memory disambiguation
+    kDalvikReg,         // 1 Dalvik Frame (can be fully disambiguated)
+    kLiteral,           // 2 Literal pool (can be fully disambiguated)
+    kHeapRef,           // 3 Somewhere on the heap (alias with any other heap)
+    kMustNotAlias,      // 4 Guaranteed to be non-alias (eg *(r6+x))
 } ResourceEncodingPos;
 
 #define ENCODE_REG_LIST(N)      ((u8) N)
@@ -144,19 +144,15 @@
 #define ENCODE_CCODE            (1ULL << kCCode)
 #define ENCODE_FP_STATUS        (1ULL << kFPStatus)
 
-    /* Must alias */
+/* Abstract memory locations */
 #define ENCODE_DALVIK_REG       (1ULL << kDalvikReg)
 #define ENCODE_LITERAL          (1ULL << kLiteral)
-
-    /* May alias */
-#define ENCODE_FRAME_REF        (1ULL << kFrameRef)
 #define ENCODE_HEAP_REF         (1ULL << kHeapRef)
-#define ENCODE_LITPOOL_REF      (1ULL << kLitPoolRef)
+#define ENCODE_MUST_NOT_ALIAS   (1ULL << kMustNotAlias)
 
 #define ENCODE_ALL              (~0ULL)
-#define ENCODE_MEM_DEF          (ENCODE_FRAME_REF | ENCODE_HEAP_REF)
-#define ENCODE_MEM_USE          (ENCODE_FRAME_REF | ENCODE_HEAP_REF \
-                                 | ENCODE_LITPOOL_REF)
+#define ENCODE_MEM              (ENCODE_DALVIK_REG | ENCODE_LITERAL | \
+                                 ENCODE_HEAP_REF | ENCODE_MUST_NOT_ALIAS)
 
 #define DECODE_ALIAS_INFO_REG(X)        (X & 0xffff)
 #define DECODE_ALIAS_INFO_WIDE(X)       ((X & 0x80000000) ? 1 : 0)
diff --git a/vm/compiler/codegen/arm/CodegenCommon.c b/vm/compiler/codegen/arm/CodegenCommon.c
index f4ca95c..75134bf 100644
--- a/vm/compiler/codegen/arm/CodegenCommon.c
+++ b/vm/compiler/codegen/arm/CodegenCommon.c
@@ -35,14 +35,12 @@
 static void setMemRefType(ArmLIR *lir, bool isLoad, int memType)
 {
     u8 *maskPtr;
-    u8 mask;
-    assert( EncodingMap[lir->opcode].flags & (IS_LOAD | IS_STORE));
+    u8 mask = ENCODE_MEM;
+    assert(EncodingMap[lir->opcode].flags & (IS_LOAD | IS_STORE));
     if (isLoad) {
         maskPtr = &lir->useMask;
-        mask = ENCODE_MEM_USE;
     } else {
         maskPtr = &lir->defMask;
-        mask = ENCODE_MEM_DEF;
     }
     /* Clear out the memref flags */
     *maskPtr &= ~mask;
@@ -50,14 +48,19 @@
     switch(memType) {
         case kLiteral:
             assert(isLoad);
-            *maskPtr |= (ENCODE_LITERAL | ENCODE_LITPOOL_REF);
+            *maskPtr |= ENCODE_LITERAL;
             break;
         case kDalvikReg:
-            *maskPtr |= (ENCODE_DALVIK_REG | ENCODE_FRAME_REF);
+            *maskPtr |= ENCODE_DALVIK_REG;
             break;
         case kHeapRef:
             *maskPtr |= ENCODE_HEAP_REF;
             break;
+        case kMustNotAlias:
+            /* Currently only loads can be marked as kMustNotAlias */
+            assert(!(EncodingMap[lir->opcode].flags & IS_STORE));
+            *maskPtr |= ENCODE_MUST_NOT_ALIAS;
+            break;
         default:
             LOGE("Jit: invalid memref kind - %d", memType);
             assert(0);  // Bail if debug build, set worst-case in the field
@@ -138,9 +141,13 @@
         setMemRefType(lir, flags & IS_LOAD, kHeapRef);
     }
 
+    /*
+     * Conservatively assume the branch here will call out a function that in
+     * turn will trash everything.
+     */
     if (flags & IS_BRANCH) {
-        lir->defMask |= ENCODE_REG_PC;
-        lir->useMask |= ENCODE_REG_PC;
+        lir->defMask = lir->useMask = ENCODE_ALL;
+        return;
     }
 
     if (flags & REG_DEF0) {
@@ -176,11 +183,6 @@
         lir->defMask = ENCODE_ALL;
     }
 
-    /* Set up the mask for resources that are used */
-    if (flags & IS_BRANCH) {
-        lir->useMask |= ENCODE_REG_PC;
-    }
-
     if (flags & (REG_USE0 | REG_USE1 | REG_USE2 | REG_USE3)) {
         int i;
 
@@ -225,6 +227,37 @@
 }
 
 /*
+ * Set up the accurate resource mask for branch instructions
+ */
+static void relaxBranchMasks(ArmLIR *lir)
+{
+    int flags = EncodingMap[lir->opcode].flags;
+
+    /* Make sure only branch instructions are passed here */
+    assert(flags & IS_BRANCH);
+
+    lir->useMask = lir->defMask = ENCODE_REG_PC;
+
+    if (flags & REG_DEF_LR) {
+        lir->defMask |= ENCODE_REG_LR;
+    }
+
+    if (flags & (REG_USE0 | REG_USE1 | REG_USE2 | REG_USE3)) {
+        int i;
+
+        for (i = 0; i < 4; i++) {
+            if (flags & (1 << (kRegUse0 + i))) {
+                setupRegMask(&lir->useMask, lir->operands[i]);
+            }
+        }
+    }
+
+    if (flags & USES_CCODES) {
+        lir->useMask |= ENCODE_CCODE;
+    }
+}
+
+/*
  * The following are building blocks to construct low-level IRs with 0 - 4
  * operands.
  */
@@ -407,5 +440,9 @@
     }
     /* Branch to the PC reconstruction code */
     branch->generic.target = (LIR *) pcrLabel;
+
+    /* Clear the conservative flags for branches that punt to the interpreter */
+    relaxBranchMasks(branch);
+
     return pcrLabel;
 }
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 7049e49..ade4197 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -1368,6 +1368,22 @@
 #endif
 
 /*
+ * Fetch *InterpState->pSelfSuspendCount. If the suspend count is non-zero,
+ * punt to the interpreter.
+ */
+static void genSuspendPoll(CompilationUnit *cUnit, MIR *mir)
+{
+    int rTemp = dvmCompilerAllocTemp(cUnit);
+    ArmLIR *ld;
+    ld = loadWordDisp(cUnit, rGLUE, offsetof(InterpState, pSelfSuspendCount),
+                      rTemp);
+    setMemRefType(ld, true /* isLoad */, kMustNotAlias);
+    ld = loadWordDisp(cUnit, rTemp, 0, rTemp);
+    setMemRefType(ld, true /* isLoad */, kMustNotAlias);
+    genRegImmCheck(cUnit, kArmCondNe, rTemp, 0, mir->offset, NULL);
+}
+
+/*
  * The following are the first-level codegen routines that analyze the format
  * of each bytecode then either dispatch special purpose codegen routines
  * or produce corresponding Thumb instructions directly.
@@ -1376,8 +1392,26 @@
 static bool handleFmt10t_Fmt20t_Fmt30t(CompilationUnit *cUnit, MIR *mir,
                                        BasicBlock *bb, ArmLIR *labelList)
 {
-    /* For OP_GOTO, OP_GOTO_16, and OP_GOTO_32 */
-    genUnconditionalBranch(cUnit, &labelList[bb->taken->id]);
+    /* backward branch? */
+    bool backwardBranch = (bb->taken->startOffset <= mir->offset);
+
+    if (backwardBranch && gDvmJit.genSuspendPoll) {
+        genSuspendPoll(cUnit, mir);
+    }
+
+    int numPredecessors = dvmCountSetBits(bb->taken->predecessors);
+    /*
+     * Things could be hoisted out of the taken block into the predecessor, so
+     * make sure it is dominated by the predecessor.
+     */
+    if (numPredecessors == 1 && bb->taken->visited == false &&
+        bb->taken->blockType == kDalvikByteCode &&
+        cUnit->methodJitMode == false ) {
+        cUnit->nextCodegenBlock = bb->taken;
+    } else {
+        /* For OP_GOTO, OP_GOTO_16, and OP_GOTO_32 */
+        genUnconditionalBranch(cUnit, &labelList[bb->taken->id]);
+    }
     return false;
 }
 
@@ -1995,8 +2029,16 @@
 {
     Opcode dalvikOpcode = mir->dalvikInsn.opcode;
     ArmConditionCode cond;
+    /* backward branch? */
+    bool backwardBranch = (bb->taken->startOffset <= mir->offset);
+
+    if (backwardBranch && gDvmJit.genSuspendPoll) {
+        genSuspendPoll(cUnit, mir);
+    }
+
     RegLocation rlSrc = dvmCompilerGetSrc(cUnit, mir, 0);
     rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+
     opRegImm(cUnit, kOpCmp, rlSrc.lowReg, 0);
 
 //TUNING: break this out to allow use of Thumb2 CB[N]Z
@@ -2509,11 +2551,19 @@
 {
     Opcode dalvikOpcode = mir->dalvikInsn.opcode;
     ArmConditionCode cond;
+    /* backward branch? */
+    bool backwardBranch = (bb->taken->startOffset <= mir->offset);
+
+    if (backwardBranch && gDvmJit.genSuspendPoll) {
+        genSuspendPoll(cUnit, mir);
+    }
+
     RegLocation rlSrc1 = dvmCompilerGetSrc(cUnit, mir, 0);
     RegLocation rlSrc2 = dvmCompilerGetSrc(cUnit, mir, 1);
 
     rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
     rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
+
     opRegReg(cUnit, kOpCmp, rlSrc1.lowReg, rlSrc2.lowReg);
 
     switch (dalvikOpcode) {
@@ -4094,6 +4144,10 @@
         dvmInitGrowableList(&chainingListByType[i], 2);
     }
 
+    /* Clear the visited flag for each block */
+    dvmCompilerDataFlowAnalysisDispatcher(cUnit, dvmCompilerClearVisitedFlag,
+                                          kAllNodes, false /* isIterative */);
+
     GrowableListIterator iterator;
     dvmGrowableListIteratorInit(&cUnit->blockList, &iterator);
 
@@ -4105,6 +4159,7 @@
         MIR *mir;
         BasicBlock *bb = (BasicBlock *) dvmGrowableListIteratorNext(&iterator);
         if (bb == NULL) break;
+        if (bb->visited == true) continue;
 
         labelList[i].operands[0] = bb->startOffset;
 
@@ -4199,171 +4254,189 @@
         }
 
         ArmLIR *headLIR = NULL;
+        BasicBlock *nextBB = bb;
 
-        for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
+        /*
+         * Try to build a longer optimization unit. Currently if the previous
+         * block ends with a goto, we continue adding instructions and don't
+         * reset the register allocation pool.
+         */
+        for (; nextBB != NULL; nextBB = cUnit->nextCodegenBlock) {
+            bb = nextBB;
+            bb->visited = true;
+            cUnit->nextCodegenBlock = NULL;
 
-            dvmCompilerResetRegPool(cUnit);
-            if (gDvmJit.disableOpt & (1 << kTrackLiveTemps)) {
-                dvmCompilerClobberAllRegs(cUnit);
-            }
+            for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
 
-            if (gDvmJit.disableOpt & (1 << kSuppressLoads)) {
-                dvmCompilerResetDefTracking(cUnit);
-            }
-
-            if (mir->dalvikInsn.opcode >= kMirOpFirst) {
-                handleExtendedMIR(cUnit, mir);
-                continue;
-            }
-
-
-            Opcode dalvikOpcode = mir->dalvikInsn.opcode;
-            InstructionFormat dalvikFormat = dexGetFormatFromOpcode(dalvikOpcode);
-            char *note;
-            if (mir->OptimizationFlags & MIR_INLINED) {
-                note = " (I)";
-            } else if (mir->OptimizationFlags & MIR_INLINED_PRED) {
-                note = " (PI)";
-            } else if (mir->OptimizationFlags & MIR_CALLEE) {
-                note = " (C)";
-            } else {
-                note = NULL;
-            }
-
-            ArmLIR *boundaryLIR;
-
-            /*
-             * Don't generate the boundary LIR unless we are debugging this
-             * trace or we need a scheduling barrier.
-             */
-            if (headLIR == NULL || cUnit->printMe == true) {
-                boundaryLIR =
-                    newLIR2(cUnit, kArmPseudoDalvikByteCodeBoundary,
-                            mir->offset,
-                            (int) dvmCompilerGetDalvikDisassembly(
-                                &mir->dalvikInsn, note));
-                /* Remember the first LIR for this block */
-                if (headLIR == NULL) {
-                    headLIR = boundaryLIR;
-                    /* Set the first boundaryLIR as a scheduling barrier */
-                    headLIR->defMask = ENCODE_ALL;
+                dvmCompilerResetRegPool(cUnit);
+                if (gDvmJit.disableOpt & (1 << kTrackLiveTemps)) {
+                    dvmCompilerClobberAllRegs(cUnit);
                 }
-            }
 
-            /* Don't generate the SSA annotation unless verbose mode is on */
-            if (cUnit->printMe && mir->ssaRep) {
-                char *ssaString = dvmCompilerGetSSAString(cUnit, mir->ssaRep);
-                newLIR1(cUnit, kArmPseudoSSARep, (int) ssaString);
-            }
+                if (gDvmJit.disableOpt & (1 << kSuppressLoads)) {
+                    dvmCompilerResetDefTracking(cUnit);
+                }
 
-            bool notHandled;
-            /*
-             * Debugging: screen the opcode first to see if it is in the
-             * do[-not]-compile list
-             */
-            bool singleStepMe = SINGLE_STEP_OP(dalvikOpcode);
+                if (mir->dalvikInsn.opcode >= kMirOpFirst) {
+                    handleExtendedMIR(cUnit, mir);
+                    continue;
+                }
+
+
+                Opcode dalvikOpcode = mir->dalvikInsn.opcode;
+                InstructionFormat dalvikFormat =
+                    dexGetFormatFromOpcode(dalvikOpcode);
+                char *note;
+                if (mir->OptimizationFlags & MIR_INLINED) {
+                    note = " (I)";
+                } else if (mir->OptimizationFlags & MIR_INLINED_PRED) {
+                    note = " (PI)";
+                } else if (mir->OptimizationFlags & MIR_CALLEE) {
+                    note = " (C)";
+                } else {
+                    note = NULL;
+                }
+
+                ArmLIR *boundaryLIR;
+
+                /*
+                 * Don't generate the boundary LIR unless we are debugging this
+                 * trace or we need a scheduling barrier.
+                 */
+                if (headLIR == NULL || cUnit->printMe == true) {
+                    boundaryLIR =
+                        newLIR2(cUnit, kArmPseudoDalvikByteCodeBoundary,
+                                mir->offset,
+                                (int) dvmCompilerGetDalvikDisassembly(
+                                    &mir->dalvikInsn, note));
+                    /* Remember the first LIR for this block */
+                    if (headLIR == NULL) {
+                        headLIR = boundaryLIR;
+                        /* Set the first boundaryLIR as a scheduling barrier */
+                        headLIR->defMask = ENCODE_ALL;
+                    }
+                }
+
+                /*
+                 * Don't generate the SSA annotation unless verbose mode is on
+                 */
+                if (cUnit->printMe && mir->ssaRep) {
+                    char *ssaString = dvmCompilerGetSSAString(cUnit,
+                                                              mir->ssaRep);
+                    newLIR1(cUnit, kArmPseudoSSARep, (int) ssaString);
+                }
+
+                bool notHandled;
+                /*
+                 * Debugging: screen the opcode first to see if it is in the
+                 * do[-not]-compile list
+                 */
+                bool singleStepMe = SINGLE_STEP_OP(dalvikOpcode);
 #if defined(WITH_SELF_VERIFICATION)
-          if (singleStepMe == false) {
-              singleStepMe = selfVerificationPuntOps(mir);
-          }
+              if (singleStepMe == false) {
+                  singleStepMe = selfVerificationPuntOps(mir);
+              }
 #endif
-            if (singleStepMe || cUnit->allSingleStep) {
-                notHandled = false;
-                genInterpSingleStep(cUnit, mir);
-            } else {
-                opcodeCoverage[dalvikOpcode]++;
-                switch (dalvikFormat) {
-                    case kFmt10t:
-                    case kFmt20t:
-                    case kFmt30t:
-                        notHandled = handleFmt10t_Fmt20t_Fmt30t(cUnit,
-                                  mir, bb, labelList);
-                        break;
-                    case kFmt10x:
-                        notHandled = handleFmt10x(cUnit, mir);
-                        break;
-                    case kFmt11n:
-                    case kFmt31i:
-                        notHandled = handleFmt11n_Fmt31i(cUnit, mir);
-                        break;
-                    case kFmt11x:
-                        notHandled = handleFmt11x(cUnit, mir);
-                        break;
-                    case kFmt12x:
-                        notHandled = handleFmt12x(cUnit, mir);
-                        break;
-                    case kFmt20bc:
-                    case kFmt40sc:
-                        notHandled = handleFmt20bc_Fmt40sc(cUnit, mir);
-                        break;
-                    case kFmt21c:
-                    case kFmt31c:
-                    case kFmt41c:
-                        notHandled = handleFmt21c_Fmt31c_Fmt41c(cUnit, mir);
-                        break;
-                    case kFmt21h:
-                        notHandled = handleFmt21h(cUnit, mir);
-                        break;
-                    case kFmt21s:
-                        notHandled = handleFmt21s(cUnit, mir);
-                        break;
-                    case kFmt21t:
-                        notHandled = handleFmt21t(cUnit, mir, bb, labelList);
-                        break;
-                    case kFmt22b:
-                    case kFmt22s:
-                        notHandled = handleFmt22b_Fmt22s(cUnit, mir);
-                        break;
-                    case kFmt22c:
-                    case kFmt52c:
-                        notHandled = handleFmt22c_Fmt52c(cUnit, mir);
-                        break;
-                    case kFmt22cs:
-                        notHandled = handleFmt22cs(cUnit, mir);
-                        break;
-                    case kFmt22t:
-                        notHandled = handleFmt22t(cUnit, mir, bb, labelList);
-                        break;
-                    case kFmt22x:
-                    case kFmt32x:
-                        notHandled = handleFmt22x_Fmt32x(cUnit, mir);
-                        break;
-                    case kFmt23x:
-                        notHandled = handleFmt23x(cUnit, mir);
-                        break;
-                    case kFmt31t:
-                        notHandled = handleFmt31t(cUnit, mir);
-                        break;
-                    case kFmt3rc:
-                    case kFmt35c:
-                    case kFmt5rc:
-                        notHandled = handleFmt35c_3rc_5rc(cUnit, mir, bb,
+                if (singleStepMe || cUnit->allSingleStep) {
+                    notHandled = false;
+                    genInterpSingleStep(cUnit, mir);
+                } else {
+                    opcodeCoverage[dalvikOpcode]++;
+                    switch (dalvikFormat) {
+                        case kFmt10t:
+                        case kFmt20t:
+                        case kFmt30t:
+                            notHandled = handleFmt10t_Fmt20t_Fmt30t(cUnit,
+                                      mir, bb, labelList);
+                            break;
+                        case kFmt10x:
+                            notHandled = handleFmt10x(cUnit, mir);
+                            break;
+                        case kFmt11n:
+                        case kFmt31i:
+                            notHandled = handleFmt11n_Fmt31i(cUnit, mir);
+                            break;
+                        case kFmt11x:
+                            notHandled = handleFmt11x(cUnit, mir);
+                            break;
+                        case kFmt12x:
+                            notHandled = handleFmt12x(cUnit, mir);
+                            break;
+                        case kFmt20bc:
+                        case kFmt40sc:
+                            notHandled = handleFmt20bc_Fmt40sc(cUnit, mir);
+                            break;
+                        case kFmt21c:
+                        case kFmt31c:
+                        case kFmt41c:
+                            notHandled = handleFmt21c_Fmt31c_Fmt41c(cUnit, mir);
+                            break;
+                        case kFmt21h:
+                            notHandled = handleFmt21h(cUnit, mir);
+                            break;
+                        case kFmt21s:
+                            notHandled = handleFmt21s(cUnit, mir);
+                            break;
+                        case kFmt21t:
+                            notHandled = handleFmt21t(cUnit, mir, bb,
                                                       labelList);
-                        break;
-                    case kFmt3rms:
-                    case kFmt35ms:
-                        notHandled = handleFmt35ms_3rms(cUnit, mir, bb,
-                                                        labelList);
-                        break;
-                    case kFmt35mi:
-                    case kFmt3rmi:
-                        notHandled = handleExecuteInline(cUnit, mir);
-                        break;
-                    case kFmt51l:
-                        notHandled = handleFmt51l(cUnit, mir);
-                        break;
-                    default:
-                        notHandled = true;
-                        break;
+                            break;
+                        case kFmt22b:
+                        case kFmt22s:
+                            notHandled = handleFmt22b_Fmt22s(cUnit, mir);
+                            break;
+                        case kFmt22c:
+                        case kFmt52c:
+                            notHandled = handleFmt22c_Fmt52c(cUnit, mir);
+                            break;
+                        case kFmt22cs:
+                            notHandled = handleFmt22cs(cUnit, mir);
+                            break;
+                        case kFmt22t:
+                            notHandled = handleFmt22t(cUnit, mir, bb,
+                                                      labelList);
+                            break;
+                        case kFmt22x:
+                        case kFmt32x:
+                            notHandled = handleFmt22x_Fmt32x(cUnit, mir);
+                            break;
+                        case kFmt23x:
+                            notHandled = handleFmt23x(cUnit, mir);
+                            break;
+                        case kFmt31t:
+                            notHandled = handleFmt31t(cUnit, mir);
+                            break;
+                        case kFmt3rc:
+                        case kFmt35c:
+                        case kFmt5rc:
+                            notHandled = handleFmt35c_3rc_5rc(cUnit, mir, bb,
+                                                          labelList);
+                            break;
+                        case kFmt3rms:
+                        case kFmt35ms:
+                            notHandled = handleFmt35ms_3rms(cUnit, mir, bb,
+                                                            labelList);
+                            break;
+                        case kFmt35mi:
+                        case kFmt3rmi:
+                            notHandled = handleExecuteInline(cUnit, mir);
+                            break;
+                        case kFmt51l:
+                            notHandled = handleFmt51l(cUnit, mir);
+                            break;
+                        default:
+                            notHandled = true;
+                            break;
+                    }
                 }
-            }
-            if (notHandled) {
-                LOGE("%#06x: Opcode 0x%x (%s) / Fmt %d not handled\n",
-                     mir->offset,
-                     dalvikOpcode, dexGetOpcodeName(dalvikOpcode),
-                     dalvikFormat);
-                dvmCompilerAbort(cUnit);
-                break;
+                if (notHandled) {
+                    LOGE("%#06x: Opcode 0x%x (%s) / Fmt %d not handled\n",
+                         mir->offset,
+                         dalvikOpcode, dexGetOpcodeName(dalvikOpcode),
+                         dalvikFormat);
+                    dvmCompilerAbort(cUnit);
+                    break;
+                }
             }
         }
 
@@ -4389,10 +4462,8 @@
          * insert an unconditional branch to the chaining cell.
          */
         if (bb->needFallThroughBranch) {
-            genUnconditionalBranch(cUnit,
-                                   &labelList[bb->fallThrough->id]);
+            genUnconditionalBranch(cUnit, &labelList[bb->fallThrough->id]);
         }
-
     }
 
     /* Handle the chaining cells in predefined order */
diff --git a/vm/compiler/codegen/arm/LocalOptimizations.c b/vm/compiler/codegen/arm/LocalOptimizations.c
index ae98a56..4c0354a 100644
--- a/vm/compiler/codegen/arm/LocalOptimizations.c
+++ b/vm/compiler/codegen/arm/LocalOptimizations.c
@@ -21,33 +21,23 @@
 
 #define DEBUG_OPT(X)
 
-ArmLIR* dvmCompilerGenCopy(CompilationUnit *cUnit, int rDest, int rSrc);
+/* Check RAW, WAR, and WAW dependency on the register operands */
+#define CHECK_REG_DEP(use, def, check) ((def & check->useMask) || \
+                                        ((use | def) & check->defMask))
 
-/* Is this a Dalvik register access? */
-static inline bool isDalvikLoad(ArmLIR *lir)
-{
-    return (lir->useMask != ENCODE_ALL) && (lir->useMask & ENCODE_DALVIK_REG);
-}
-
-/* Is this a load from the literal pool? */
-static inline bool isLiteralLoad(ArmLIR *lir)
-{
-    return (lir->useMask != ENCODE_ALL) && (lir->useMask & ENCODE_LITERAL);
-}
-
-static inline bool isDalvikStore(ArmLIR *lir)
-{
-    return (lir->defMask != ENCODE_ALL) && (lir->defMask & ENCODE_DALVIK_REG);
-}
+/* Scheduler heuristics */
+#define MAX_HOIST_DISTANCE 20
+#define LDLD_DISTANCE 4
+#define LD_LATENCY 2
 
 static inline bool isDalvikRegisterClobbered(ArmLIR *lir1, ArmLIR *lir2)
 {
-  int reg1Lo = DECODE_ALIAS_INFO_REG(lir1->aliasInfo);
-  int reg1Hi = reg1Lo + DECODE_ALIAS_INFO_WIDE(lir1->aliasInfo);
-  int reg2Lo = DECODE_ALIAS_INFO_REG(lir2->aliasInfo);
-  int reg2Hi = reg2Lo + DECODE_ALIAS_INFO_WIDE(lir2->aliasInfo);
+    int reg1Lo = DECODE_ALIAS_INFO_REG(lir1->aliasInfo);
+    int reg1Hi = reg1Lo + DECODE_ALIAS_INFO_WIDE(lir1->aliasInfo);
+    int reg2Lo = DECODE_ALIAS_INFO_REG(lir2->aliasInfo);
+    int reg2Hi = reg2Lo + DECODE_ALIAS_INFO_WIDE(lir2->aliasInfo);
 
-  return (reg1Lo == reg2Lo) || (reg1Lo == reg2Hi) || (reg1Hi == reg2Lo);
+    return (reg1Lo == reg2Lo) || (reg1Lo == reg2Hi) || (reg1Hi == reg2Lo);
 }
 
 #if 0
@@ -61,10 +51,39 @@
 }
 #endif
 
+/* Convert a more expensive instruction (ie load) into a move */
+static void convertMemOpIntoMove(CompilationUnit *cUnit, ArmLIR *origLIR,
+                                 int dest, int src)
+{
+    /* Insert a move to replace the load */
+    ArmLIR *moveLIR;
+    moveLIR = dvmCompilerRegCopyNoInsert(cUnit, dest, src);
+    /*
+     * Insert the converted instruction after the original since the
+     * optimization is scannng in the top-down order and the new instruction
+     * will need to be re-checked (eg the new dest clobbers the src used in
+     * thisLIR).
+     */
+    dvmCompilerInsertLIRAfter((LIR *) origLIR, (LIR *) moveLIR);
+}
+
 /*
- * Perform a pass of top-down walk to
- * 1) Eliminate redundant loads and stores
- * 2) Sink stores to latest possible slot
+ * Perform a pass of top-down walk, from the second-last instruction in the
+ * superblock, to eliminate redundant loads and stores.
+ *
+ * An earlier load can eliminate a later load iff
+ *   1) They are must-aliases
+ *   2) The native register is not clobbered in between
+ *   3) The memory location is not written to in between
+ *
+ * An earlier store can eliminate a later load iff
+ *   1) They are must-aliases
+ *   2) The native register is not clobbered in between
+ *   3) The memory location is not written to in between
+ *
+ * A later store can be eliminated by an earlier store iff
+ *   1) They are must-aliases
+ *   2) The memory location is not written to in between
  */
 static void applyLoadStoreElimination(CompilationUnit *cUnit,
                                       ArmLIR *headLIR,
@@ -72,428 +91,347 @@
 {
     ArmLIR *thisLIR;
 
-    cUnit->optRound++;
-    for (thisLIR = headLIR;
-         thisLIR != tailLIR;
-         thisLIR = NEXT_LIR(thisLIR)) {
-        /* Skip newly added instructions */
-        if (thisLIR->flags.age >= cUnit->optRound) {
+    if (headLIR == tailLIR) return;
+
+    for (thisLIR = PREV_LIR(tailLIR);
+         thisLIR != headLIR;
+         thisLIR = PREV_LIR(thisLIR)) {
+        int sinkDistance = 0;
+
+        /* Skip non-interesting instructions */
+        if ((thisLIR->flags.isNop == true) ||
+            isPseudoOpcode(thisLIR->opcode) ||
+            !(EncodingMap[thisLIR->opcode].flags & (IS_LOAD | IS_STORE))) {
             continue;
         }
-        if (isDalvikStore(thisLIR)) {
-            int nativeRegId = thisLIR->operands[0];
-            ArmLIR *checkLIR;
-            int sinkDistance = 0;
+
+        int nativeRegId = thisLIR->operands[0];
+        bool isThisLIRLoad = EncodingMap[thisLIR->opcode].flags & IS_LOAD;
+        ArmLIR *checkLIR;
+        /* Use the mem mask to determine the rough memory location */
+        u8 thisMemMask = (thisLIR->useMask | thisLIR->defMask) & ENCODE_MEM;
+
+        /*
+         * Currently only eliminate redundant ld/st for constant and Dalvik
+         * register accesses.
+         */
+        if (!(thisMemMask & (ENCODE_LITERAL | ENCODE_DALVIK_REG))) continue;
+
+        /*
+         * Add r15 (pc) to the resource mask to prevent this instruction
+         * from sinking past branch instructions. Also take out the memory
+         * region bits since stopMask is used to check data/control
+         * dependencies.
+         */
+        u8 stopUseRegMask = (ENCODE_REG_PC | thisLIR->useMask) &
+                            ~ENCODE_MEM;
+        u8 stopDefRegMask = thisLIR->defMask & ~ENCODE_MEM;
+
+        for (checkLIR = NEXT_LIR(thisLIR);
+             checkLIR != tailLIR;
+             checkLIR = NEXT_LIR(checkLIR)) {
+
+            u8 checkMemMask = (checkLIR->useMask | checkLIR->defMask) &
+                              ENCODE_MEM;
+            u8 aliasCondition = thisMemMask & checkMemMask;
+            bool stopHere = false;
+
             /*
-             * Add r15 (pc) to the mask to prevent this instruction
-             * from sinking past branch instructions. Unset the Dalvik register
-             * bit when checking with native resource constraints.
+             * Potential aliases seen - check the alias relations
              */
-            u8 stopMask = (ENCODE_REG_PC | thisLIR->useMask) &
-                          ~ENCODE_DALVIK_REG;
-
-            for (checkLIR = NEXT_LIR(thisLIR);
-                 checkLIR != tailLIR;
-                 checkLIR = NEXT_LIR(checkLIR)) {
-
-                /* Check if a Dalvik register load is redundant */
-                if (isDalvikLoad(checkLIR) &&
-                    (checkLIR->aliasInfo == thisLIR->aliasInfo) &&
-                    (REGTYPE(checkLIR->operands[0]) == REGTYPE(nativeRegId))) {
-                    /* Insert a move to replace the load */
-                    if (checkLIR->operands[0] != nativeRegId) {
-                        ArmLIR *moveLIR;
-                        moveLIR = dvmCompilerRegCopyNoInsert(
-                                    cUnit, checkLIR->operands[0], nativeRegId);
-                        /*
-                         * Insert the converted checkLIR instruction after the
-                         * the original checkLIR since the optimization is
-                         * scannng in the top-down order and the new instruction
-                         * will need to be checked.
-                         */
-                        dvmCompilerInsertLIRAfter((LIR *) checkLIR,
-                                                  (LIR *) moveLIR);
-                    }
-                    checkLIR->flags.isNop = true;
-                    continue;
-
-                /*
-                 * Found a true output dependency - nuke the previous store.
-                 * The register type doesn't matter here.
-                 */
-                } else if (isDalvikStore(checkLIR) &&
-                           (checkLIR->aliasInfo == thisLIR->aliasInfo)) {
-                    thisLIR->flags.isNop = true;
-                    break;
-                /* Find out the latest slot that the store can be sunk into */
-                } else {
-                    /* Last instruction reached */
-                    bool stopHere = (NEXT_LIR(checkLIR) == tailLIR);
-
-                    /* Store data is clobbered */
-                    stopHere |= ((stopMask & checkLIR->defMask) != 0);
-
-                    /* Store data partially clobbers the Dalvik register */
-                    if (stopHere == false &&
-                        ((checkLIR->useMask | checkLIR->defMask) &
-                         ENCODE_DALVIK_REG)) {
-                        stopHere = isDalvikRegisterClobbered(thisLIR, checkLIR);
-                    }
-
-                    /* Found a new place to put the store - move it here */
-                    if (stopHere == true) {
-                        DEBUG_OPT(dumpDependentInsnPair(thisLIR, checkLIR,
-                                                        "SINK STORE"));
-                        /* The store can be sunk for at least one cycle */
-                        if (sinkDistance != 0) {
-                            ArmLIR *newStoreLIR =
-                                (ArmLIR *)dvmCompilerNew(sizeof(ArmLIR), true);
-                            *newStoreLIR = *thisLIR;
-                            newStoreLIR->flags.age = cUnit->optRound;
-                            /*
-                             * Stop point found - insert *before* the checkLIR
-                             * since the instruction list is scanned in the
-                             * top-down order.
-                             */
-                            dvmCompilerInsertLIRBefore((LIR *) checkLIR,
-                                                       (LIR *) newStoreLIR);
-                            thisLIR->flags.isNop = true;
-                        }
-                        break;
-                    }
-
+            if (checkMemMask != ENCODE_MEM && aliasCondition != 0) {
+                bool isCheckLIRLoad = EncodingMap[checkLIR->opcode].flags &
+                                      IS_LOAD;
+                if  (aliasCondition == ENCODE_LITERAL) {
                     /*
-                     * Saw a real instruction that the store can be sunk after
+                     * Should only see literal loads in the instruction
+                     * stream.
                      */
-                    if (!isPseudoOpcode(checkLIR->opcode)) {
-                        sinkDistance++;
+                    assert(!(EncodingMap[checkLIR->opcode].flags &
+                             IS_STORE));
+                    /* Same value && same register type */
+                    if (checkLIR->aliasInfo == thisLIR->aliasInfo &&
+                        REGTYPE(checkLIR->operands[0]) == REGTYPE(nativeRegId)){
+                        /*
+                         * Different destination register - insert
+                         * a move
+                         */
+                        if (checkLIR->operands[0] != nativeRegId) {
+                            convertMemOpIntoMove(cUnit, checkLIR,
+                                                 checkLIR->operands[0],
+                                                 nativeRegId);
+                        }
+                        checkLIR->flags.isNop = true;
+                    }
+                } else if (aliasCondition == ENCODE_DALVIK_REG) {
+                    /* Must alias */
+                    if (checkLIR->aliasInfo == thisLIR->aliasInfo) {
+                        /* Only optimize compatible registers */
+                        bool regCompatible =
+                            REGTYPE(checkLIR->operands[0]) ==
+                            REGTYPE(nativeRegId);
+                        if ((isThisLIRLoad && isCheckLIRLoad) ||
+                            (!isThisLIRLoad && isCheckLIRLoad)) {
+                            /* RAR or RAW */
+                            if (regCompatible) {
+                                /*
+                                 * Different destination register -
+                                 * insert a move
+                                 */
+                                if (checkLIR->operands[0] !=
+                                    nativeRegId) {
+                                    convertMemOpIntoMove(cUnit,
+                                                 checkLIR,
+                                                 checkLIR->operands[0],
+                                                 nativeRegId);
+                                }
+                                checkLIR->flags.isNop = true;
+                            } else {
+                                /*
+                                 * Destinations are of different types -
+                                 * something complicated going on so
+                                 * stop looking now.
+                                 */
+                                stopHere = true;
+                            }
+                        } else if (isThisLIRLoad && !isCheckLIRLoad) {
+                            /* WAR - register value is killed */
+                            stopHere = true;
+                        } else if (!isThisLIRLoad && !isCheckLIRLoad) {
+                            /* WAW - nuke the earlier store */
+                            thisLIR->flags.isNop = true;
+                            stopHere = true;
+                        }
+                    /* Partial overlap */
+                    } else if (isDalvikRegisterClobbered(thisLIR, checkLIR)) {
+                        /*
+                         * It is actually ok to continue if checkLIR
+                         * is a read. But it is hard to make a test
+                         * case for this so we just stop here to be
+                         * conservative.
+                         */
+                        stopHere = true;
                     }
                 }
+                /* Memory content may be updated. Stop looking now. */
+                if (stopHere) {
+                    break;
+                /* The checkLIR has been transformed - check the next one */
+                } else if (checkLIR->flags.isNop) {
+                    continue;
+                }
+            }
+
+
+            /*
+             * this and check LIRs have no memory dependency. Now check if
+             * their register operands have any RAW, WAR, and WAW
+             * dependencies. If so, stop looking.
+             */
+            if (stopHere == false) {
+                stopHere = CHECK_REG_DEP(stopUseRegMask, stopDefRegMask,
+                                         checkLIR);
+            }
+
+            if (stopHere == true) {
+                DEBUG_OPT(dumpDependentInsnPair(thisLIR, checkLIR,
+                                                "REG CLOBBERED"));
+                /* Only sink store instructions */
+                if (sinkDistance && !isThisLIRLoad) {
+                    ArmLIR *newStoreLIR =
+                        (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR), true);
+                    *newStoreLIR = *thisLIR;
+                    /*
+                     * Stop point found - insert *before* the checkLIR
+                     * since the instruction list is scanned in the
+                     * top-down order.
+                     */
+                    dvmCompilerInsertLIRBefore((LIR *) checkLIR,
+                                               (LIR *) newStoreLIR);
+                    thisLIR->flags.isNop = true;
+                }
+                break;
+            } else if (!checkLIR->flags.isNop) {
+                sinkDistance++;
             }
         }
     }
 }
 
+/*
+ * Perform a pass of bottom-up walk, from the second instruction in the
+ * superblock, to try to hoist loads to earlier slots.
+ */
 static void applyLoadHoisting(CompilationUnit *cUnit,
                               ArmLIR *headLIR,
                               ArmLIR *tailLIR)
 {
-    ArmLIR *thisLIR;
+    ArmLIR *thisLIR, *checkLIR;
     /*
-     * Don't want to hoist in front of first load following a barrier (or
-     * first instruction of the block.
+     * Store the list of independent instructions that can be hoisted past.
+     * Will decide the best place to insert later.
      */
-    bool firstLoad = true;
-    int maxHoist = dvmCompilerTargetOptHint(kMaxHoistDistance);
+    ArmLIR *prevInstList[MAX_HOIST_DISTANCE];
 
-    cUnit->optRound++;
-    for (thisLIR = headLIR;
+    /* Empty block */
+    if (headLIR == tailLIR) return;
+
+    /* Start from the second instruction */
+    for (thisLIR = NEXT_LIR(headLIR);
          thisLIR != tailLIR;
          thisLIR = NEXT_LIR(thisLIR)) {
-        /* Skip newly added instructions */
-        if (thisLIR->flags.age >= cUnit->optRound ||
-            thisLIR->flags.isNop == true) {
+
+        /* Skip non-interesting instructions */
+        if ((thisLIR->flags.isNop == true) ||
+            isPseudoOpcode(thisLIR->opcode) ||
+            !(EncodingMap[thisLIR->opcode].flags & IS_LOAD)) {
             continue;
         }
 
-        if (firstLoad && (EncodingMap[thisLIR->opcode].flags & IS_LOAD)) {
-            /*
-             * Ensure nothing will be hoisted in front of this load because
-             * it's result will likely be needed soon.
-             */
-            thisLIR->defMask |= ENCODE_MEM_USE;
-            firstLoad = false;
+        u8 stopUseAllMask = thisLIR->useMask;
+
+        /*
+         * Branches for null/range checks are marked with the true resource
+         * bits, and loads to Dalvik registers, constant pools, and non-alias
+         * locations are safe to be hoisted. So only mark the heap references
+         * conservatively here.
+         */
+        if (stopUseAllMask & ENCODE_HEAP_REF) {
+            stopUseAllMask |= ENCODE_REG_PC;
         }
 
-        firstLoad |= (thisLIR->defMask == ENCODE_ALL);
+        /* Similar as above, but just check for pure register dependency */
+        u8 stopUseRegMask = stopUseAllMask & ~ENCODE_MEM;
+        u8 stopDefRegMask = thisLIR->defMask & ~ENCODE_MEM;
 
-        if (isDalvikLoad(thisLIR)) {
-            int dRegId = DECODE_ALIAS_INFO_REG(thisLIR->aliasInfo);
-            int nativeRegId = thisLIR->operands[0];
-            ArmLIR *checkLIR;
-            int hoistDistance = 0;
-            u8 stopUseMask = (ENCODE_REG_PC | thisLIR->useMask);
-            u8 stopDefMask = thisLIR->defMask;
-            u8 checkResult;
+        int nextSlot = 0;
+        bool stopHere = false;
 
-            /* First check if the load can be completely elinimated */
-            for (checkLIR = PREV_LIR(thisLIR);
-                 checkLIR != headLIR;
-                 checkLIR = PREV_LIR(checkLIR)) {
+        /* Try to hoist the load to a good spot */
+        for (checkLIR = PREV_LIR(thisLIR);
+             checkLIR != headLIR;
+             checkLIR = PREV_LIR(checkLIR)) {
 
-                if (checkLIR->flags.isNop) continue;
+            if (checkLIR->flags.isNop) continue;
 
-                /*
-                 * Check if the Dalvik register is previously accessed
-                 * with exactly the same type.
-                 */
-                if ((isDalvikLoad(checkLIR) || isDalvikStore(checkLIR)) &&
-                    (checkLIR->aliasInfo == thisLIR->aliasInfo) &&
-                    (checkLIR->operands[0] == nativeRegId)) {
-                    /*
-                     * If it is previously accessed but with a different type,
-                     * the search will terminate later at the point checking
-                     * for partially overlapping stores.
-                     */
-                    thisLIR->flags.isNop = true;
-                    break;
-                }
+            u8 checkMemMask = checkLIR->defMask & ENCODE_MEM;
+            u8 aliasCondition = stopUseAllMask & checkMemMask;
+            stopHere = false;
 
-                /*
-                 * No earlier use/def can reach this load if:
-                 * 1) Head instruction is reached
-                 */
-                if (checkLIR == headLIR) {
-                    break;
-                }
-
-                checkResult = (stopUseMask | stopDefMask) & checkLIR->defMask;
-
-                /*
-                 * If both instructions are verified Dalvik accesses, clear the
-                 * may- and must-alias bits to detect true resource
-                 * dependencies.
-                 */
-                if (checkResult & ENCODE_DALVIK_REG) {
-                    checkResult &= ~(ENCODE_DALVIK_REG | ENCODE_FRAME_REF);
-                }
-
-                /*
-                 * 2) load target register is clobbered
-                 * 3) A branch is seen (stopUseMask has the PC bit set).
-                 */
-                if (checkResult) {
-                    break;
-                }
-
-                /* Store data partially clobbers the Dalvik register */
-                if (isDalvikStore(checkLIR) &&
-                    isDalvikRegisterClobbered(thisLIR, checkLIR)) {
-                    break;
-                }
-            }
-
-            /* The load has been eliminated */
-            if (thisLIR->flags.isNop) continue;
-
-            /*
-             * The load cannot be eliminated. See if it can be hoisted to an
-             * earlier spot.
-             */
-            for (checkLIR = PREV_LIR(thisLIR);
-                 /* empty by intention */;
-                 checkLIR = PREV_LIR(checkLIR)) {
-
-                if (checkLIR->flags.isNop) continue;
-
-                /*
-                 * Check if the "thisLIR" load is redundant
-                 * NOTE: At one point, we also triggered if the checkLIR
-                 * instruction was a load.  However, that tended to insert
-                 * a load/use dependency because the full scheduler is
-                 * not yet complete.  When it is, we chould also trigger
-                 * on loads.
-                 */
-                if (isDalvikStore(checkLIR) &&
-                    (checkLIR->aliasInfo == thisLIR->aliasInfo) &&
-                    (REGTYPE(checkLIR->operands[0]) == REGTYPE(nativeRegId))) {
-                    /* Insert a move to replace the load */
-                    if (checkLIR->operands[0] != nativeRegId) {
-                        ArmLIR *moveLIR;
-                        moveLIR = dvmCompilerRegCopyNoInsert(
-                                    cUnit, nativeRegId, checkLIR->operands[0]);
-                        /*
-                         * Convert *thisLIR* load into a move
-                         */
-                        dvmCompilerInsertLIRAfter((LIR *) checkLIR,
-                                                  (LIR *) moveLIR);
+            /* Potential WAR alias seen - check the exact relation */
+            if (checkMemMask != ENCODE_MEM && aliasCondition != 0) {
+                /* We can fully disambiguate Dalvik references */
+                if (aliasCondition == ENCODE_DALVIK_REG) {
+                    /* Must alias or partially overlap */
+                    if ((checkLIR->aliasInfo == thisLIR->aliasInfo) ||
+                        isDalvikRegisterClobbered(thisLIR, checkLIR)) {
+                        stopHere = true;
                     }
-                    thisLIR->flags.isNop = true;
-                    break;
-
-                /* Find out if the load can be yanked past the checkLIR */
+                /* Conservatively treat all heap refs as may-alias */
                 } else {
-                    /* Last instruction reached */
-                    bool stopHere = (checkLIR == headLIR);
-
-                    /* Base address is clobbered by checkLIR */
-                    checkResult = stopUseMask & checkLIR->defMask;
-                    if (checkResult & ENCODE_DALVIK_REG) {
-                        checkResult &= ~(ENCODE_DALVIK_REG | ENCODE_FRAME_REF);
-                    }
-                    stopHere |= (checkResult != 0);
-
-                    /* Load target clobbers use/def in checkLIR */
-                    checkResult = stopDefMask &
-                                  (checkLIR->useMask | checkLIR->defMask);
-                    if (checkResult & ENCODE_DALVIK_REG) {
-                        checkResult &= ~(ENCODE_DALVIK_REG | ENCODE_FRAME_REF);
-                    }
-                    stopHere |= (checkResult != 0);
-
-                    /* Store data partially clobbers the Dalvik register */
-                    if (stopHere == false &&
-                        (checkLIR->defMask & ENCODE_DALVIK_REG)) {
-                        stopHere = isDalvikRegisterClobbered(thisLIR, checkLIR);
-                    }
-
-                    /*
-                     * Stop at an earlier Dalvik load if the offset of checkLIR
-                     * is not less than thisLIR
-                     *
-                     * Experiments show that doing
-                     *
-                     * ldr     r1, [r5, #16]
-                     * ldr     r0, [r5, #20]
-                     *
-                     * is much faster than
-                     *
-                     * ldr     r0, [r5, #20]
-                     * ldr     r1, [r5, #16]
-                     */
-                    if (isDalvikLoad(checkLIR)) {
-                        int dRegId2 =
-                            DECODE_ALIAS_INFO_REG(checkLIR->aliasInfo);
-                        if (dRegId2 <= dRegId) {
-                            stopHere = true;
-                        }
-                    }
-
-                    /* Don't go too far */
-                    stopHere |= (hoistDistance >= maxHoist);
-
-                    /* Found a new place to put the load - move it here */
-                    if (stopHere == true) {
-                        DEBUG_OPT(dumpDependentInsnPair(thisLIR, checkLIR,
-                                                        "HOIST LOAD"));
-                        /* The load can be hoisted for at least one cycle */
-                        if (hoistDistance != 0) {
-                            ArmLIR *newLoadLIR =
-                                (ArmLIR *)dvmCompilerNew(sizeof(ArmLIR), true);
-                            *newLoadLIR = *thisLIR;
-                            newLoadLIR->flags.age = cUnit->optRound;
-                            /*
-                             * Stop point found - insert *after* the checkLIR
-                             * since the instruction list is scanned in the
-                             * bottom-up order.
-                             */
-                            dvmCompilerInsertLIRAfter((LIR *) checkLIR,
-                                                      (LIR *) newLoadLIR);
-                            thisLIR->flags.isNop = true;
-                        }
-                        break;
-                    }
-
-                    /*
-                     * Saw a real instruction that hosting the load is
-                     * beneficial
-                     */
-                    if (!isPseudoOpcode(checkLIR->opcode)) {
-                        hoistDistance++;
-                    }
+                    assert(aliasCondition == ENCODE_HEAP_REF);
+                    stopHere = true;
                 }
-            }
-        } else if (isLiteralLoad(thisLIR)) {
-            int litVal = thisLIR->aliasInfo;
-            int nativeRegId = thisLIR->operands[0];
-            ArmLIR *checkLIR;
-            int hoistDistance = 0;
-            u8 stopUseMask = (ENCODE_REG_PC | thisLIR->useMask) &
-                             ~ENCODE_LITPOOL_REF;
-            u8 stopDefMask = thisLIR->defMask & ~ENCODE_LITPOOL_REF;
-
-            /* First check if the load can be completely elinimated */
-            for (checkLIR = PREV_LIR(thisLIR);
-                 checkLIR != headLIR;
-                 checkLIR = PREV_LIR(checkLIR)) {
-
-                if (checkLIR->flags.isNop) continue;
-
-                /* Reloading same literal into same tgt reg? Eliminate if so */
-                if (isLiteralLoad(checkLIR) &&
-                    (checkLIR->aliasInfo == litVal) &&
-                    (checkLIR->operands[0] == nativeRegId)) {
-                    thisLIR->flags.isNop = true;
-                    break;
-                }
-
-                /*
-                 * No earlier use/def can reach this load if:
-                 * 1) Head instruction is reached
-                 * 2) load target register is clobbered
-                 * 3) A branch is seen (stopUseMask has the PC bit set).
-                 */
-                if ((checkLIR == headLIR) ||
-                    (stopUseMask | stopDefMask) & checkLIR->defMask) {
+                /* Memory content may be updated. Stop looking now. */
+                if (stopHere) {
+                    prevInstList[nextSlot++] = checkLIR;
                     break;
                 }
             }
 
-            /* The load has been eliminated */
-            if (thisLIR->flags.isNop) continue;
+            if (stopHere == false) {
+                stopHere = CHECK_REG_DEP(stopUseRegMask, stopDefRegMask,
+                                         checkLIR);
+            }
 
             /*
-             * The load cannot be eliminated. See if it can be hoisted to an
-             * earlier spot.
+             * Store the dependent or non-pseudo/independent instruction to the
+             * list.
              */
-            for (checkLIR = PREV_LIR(thisLIR);
-                 /* empty by intention */;
-                 checkLIR = PREV_LIR(checkLIR)) {
+            if (stopHere || !isPseudoOpcode(checkLIR->opcode)) {
+                prevInstList[nextSlot++] = checkLIR;
+                if (nextSlot == MAX_HOIST_DISTANCE) break;
+            }
 
-                if (checkLIR->flags.isNop) continue;
+            /* Found a new place to put the load - move it here */
+            if (stopHere == true) {
+                DEBUG_OPT(dumpDependentInsnPair(checkLIR, thisLIR,
+                                                "HOIST STOP"));
+                break;
+            }
+        }
+
+        /*
+         * Reached the top - use headLIR as the dependent marker as all labels
+         * are barriers.
+         */
+        if (stopHere == false && nextSlot < MAX_HOIST_DISTANCE) {
+            prevInstList[nextSlot++] = headLIR;
+        }
+
+        /*
+         * At least one independent instruction is found. Scan in the reversed
+         * direction to find a beneficial slot.
+         */
+        if (nextSlot >= 2) {
+            int firstSlot = nextSlot - 2;
+            int slot;
+            ArmLIR *depLIR = prevInstList[nextSlot-1];
+            /* If there is ld-ld dependency, wait LDLD_DISTANCE cycles */
+            if (!isPseudoOpcode(depLIR->opcode) &&
+                (EncodingMap[depLIR->opcode].flags & IS_LOAD)) {
+                firstSlot -= LDLD_DISTANCE;
+            }
+            /*
+             * Make sure we check slot >= 0 since firstSlot may be negative
+             * when the loop is first entered.
+             */
+            for (slot = firstSlot; slot >= 0; slot--) {
+                ArmLIR *curLIR = prevInstList[slot];
+                ArmLIR *prevLIR = prevInstList[slot+1];
+
+                /* Check the highest instruction */
+                if (prevLIR->defMask == ENCODE_ALL) {
+                    /*
+                     * If the first instruction is a load, don't hoist anything
+                     * above it since it is unlikely to be beneficial.
+                     */
+                    if (EncodingMap[curLIR->opcode].flags & IS_LOAD) continue;
+                    /*
+                     * If the remaining number of slots is less than LD_LATENCY,
+                     * insert the hoisted load here.
+                     */
+                    if (slot < LD_LATENCY) break;
+                }
 
                 /*
-                 * TUNING: once a full scheduler exists, check here
-                 * for conversion of a redundant load into a copy similar
-                 * to the way redundant loads are handled above.
+                 * NOTE: now prevLIR is guaranteed to be a non-pseudo
+                 * instruction (ie accessing EncodingMap[prevLIR->opcode] is
+                 * safe).
+                 *
+                 * Try to find two instructions with load/use dependency until
+                 * the remaining instructions are less than LD_LATENCY.
                  */
-
-                /* Find out if the load can be yanked past the checkLIR */
-
-                /* Last instruction reached */
-                bool stopHere = (checkLIR == headLIR);
-
-                /* Base address is clobbered by checkLIR */
-                stopHere |= ((stopUseMask & checkLIR->defMask) != 0);
-
-                /* Load target clobbers use/def in checkLIR */
-                stopHere |= ((stopDefMask &
-                             (checkLIR->useMask | checkLIR->defMask)) != 0);
-
-                /* Avoid re-ordering literal pool loads */
-                stopHere |= isLiteralLoad(checkLIR);
-
-                /* Don't go too far */
-                stopHere |= (hoistDistance >= maxHoist);
-
-                /* Found a new place to put the load - move it here */
-                if (stopHere == true) {
-                    DEBUG_OPT(dumpDependentInsnPair(thisLIR, checkLIR,
-                                                    "HOIST LOAD"));
-                    /* The store can be hoisted for at least one cycle */
-                    if (hoistDistance != 0) {
-                        ArmLIR *newLoadLIR =
-                            (ArmLIR *)dvmCompilerNew(sizeof(ArmLIR), true);
-                        *newLoadLIR = *thisLIR;
-                        newLoadLIR->flags.age = cUnit->optRound;
-                        /*
-                         * Insertion is guaranteed to succeed since checkLIR
-                         * is never the first LIR on the list
-                         */
-                        dvmCompilerInsertLIRAfter((LIR *) checkLIR,
-                                                  (LIR *) newLoadLIR);
-                        thisLIR->flags.isNop = true;
-                    }
+                if (((curLIR->useMask & prevLIR->defMask) &&
+                     (EncodingMap[prevLIR->opcode].flags & IS_LOAD)) ||
+                    (slot < LD_LATENCY)) {
                     break;
                 }
+            }
 
+            /* Found a slot to hoist to */
+            if (slot >= 0) {
+                ArmLIR *curLIR = prevInstList[slot];
+                ArmLIR *newLoadLIR = (ArmLIR *) dvmCompilerNew(sizeof(ArmLIR),
+                                                               true);
+                *newLoadLIR = *thisLIR;
                 /*
-                 * Saw a real instruction that hosting the load is
-                 * beneficial
+                 * Insertion is guaranteed to succeed since curLIR
+                 * is never the first LIR on the list
                  */
-                if (!isPseudoOpcode(checkLIR->opcode)) {
-                    hoistDistance++;
-                }
+                dvmCompilerInsertLIRBefore((LIR *) curLIR,
+                                           (LIR *) newLoadLIR);
+                thisLIR->flags.isNop = true;
             }
         }
     }