Add support to do suspend polling on backward branches in JIT'ed code.

The polling is expensive for now as it is done through three
instructions: ld/ld/branch. As a result, a bunch of bonus stuff has
been worked on to mitigate the extra overhead:

- Cleaned up resource flags for memory disambiguation.
- Rewrote load/store elimination and scheduler routines to hide
  the ld/ld latency for GC flag. Seperate the dependency checking into
  memory disambiguation part and resource conflict part.
- Allowed code motion for Dalvik/constant/non-aliasing loads to be
  hoisted above branches for null/range checks.
- Created extended basic blocks following goto instructions so that
  longer instruction streams can be optimized as a whole.

Without the bonus stuff, the performance dropped about ~5-10% on some
benchmarks because of the lack of headroom to hide the polling latency
in tight loops. With the bonus stuff, the performance delta is between
+/-5% with polling code generated. With the bonus stuff but disabling
polling, the new bonus stuff provides consistent performance
improvements:

CaffeineMark  3.6%
Linpack      11.1%
Scimark       9.7%
Sieve        33.0%
Checkers      6.0%

As a result, GC polling is disabled by default but can be turned on
through the -Xjitsuspendpoll flag for experimental purposes.

Change-Id: Ia81fc85de3e2b70e6cc93bc37c2b845892003cdb
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 7049e49..ade4197 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -1368,6 +1368,22 @@
 #endif
 
 /*
+ * Fetch *InterpState->pSelfSuspendCount. If the suspend count is non-zero,
+ * punt to the interpreter.
+ */
+static void genSuspendPoll(CompilationUnit *cUnit, MIR *mir)
+{
+    int rTemp = dvmCompilerAllocTemp(cUnit);
+    ArmLIR *ld;
+    ld = loadWordDisp(cUnit, rGLUE, offsetof(InterpState, pSelfSuspendCount),
+                      rTemp);
+    setMemRefType(ld, true /* isLoad */, kMustNotAlias);
+    ld = loadWordDisp(cUnit, rTemp, 0, rTemp);
+    setMemRefType(ld, true /* isLoad */, kMustNotAlias);
+    genRegImmCheck(cUnit, kArmCondNe, rTemp, 0, mir->offset, NULL);
+}
+
+/*
  * The following are the first-level codegen routines that analyze the format
  * of each bytecode then either dispatch special purpose codegen routines
  * or produce corresponding Thumb instructions directly.
@@ -1376,8 +1392,26 @@
 static bool handleFmt10t_Fmt20t_Fmt30t(CompilationUnit *cUnit, MIR *mir,
                                        BasicBlock *bb, ArmLIR *labelList)
 {
-    /* For OP_GOTO, OP_GOTO_16, and OP_GOTO_32 */
-    genUnconditionalBranch(cUnit, &labelList[bb->taken->id]);
+    /* backward branch? */
+    bool backwardBranch = (bb->taken->startOffset <= mir->offset);
+
+    if (backwardBranch && gDvmJit.genSuspendPoll) {
+        genSuspendPoll(cUnit, mir);
+    }
+
+    int numPredecessors = dvmCountSetBits(bb->taken->predecessors);
+    /*
+     * Things could be hoisted out of the taken block into the predecessor, so
+     * make sure it is dominated by the predecessor.
+     */
+    if (numPredecessors == 1 && bb->taken->visited == false &&
+        bb->taken->blockType == kDalvikByteCode &&
+        cUnit->methodJitMode == false ) {
+        cUnit->nextCodegenBlock = bb->taken;
+    } else {
+        /* For OP_GOTO, OP_GOTO_16, and OP_GOTO_32 */
+        genUnconditionalBranch(cUnit, &labelList[bb->taken->id]);
+    }
     return false;
 }
 
@@ -1995,8 +2029,16 @@
 {
     Opcode dalvikOpcode = mir->dalvikInsn.opcode;
     ArmConditionCode cond;
+    /* backward branch? */
+    bool backwardBranch = (bb->taken->startOffset <= mir->offset);
+
+    if (backwardBranch && gDvmJit.genSuspendPoll) {
+        genSuspendPoll(cUnit, mir);
+    }
+
     RegLocation rlSrc = dvmCompilerGetSrc(cUnit, mir, 0);
     rlSrc = loadValue(cUnit, rlSrc, kCoreReg);
+
     opRegImm(cUnit, kOpCmp, rlSrc.lowReg, 0);
 
 //TUNING: break this out to allow use of Thumb2 CB[N]Z
@@ -2509,11 +2551,19 @@
 {
     Opcode dalvikOpcode = mir->dalvikInsn.opcode;
     ArmConditionCode cond;
+    /* backward branch? */
+    bool backwardBranch = (bb->taken->startOffset <= mir->offset);
+
+    if (backwardBranch && gDvmJit.genSuspendPoll) {
+        genSuspendPoll(cUnit, mir);
+    }
+
     RegLocation rlSrc1 = dvmCompilerGetSrc(cUnit, mir, 0);
     RegLocation rlSrc2 = dvmCompilerGetSrc(cUnit, mir, 1);
 
     rlSrc1 = loadValue(cUnit, rlSrc1, kCoreReg);
     rlSrc2 = loadValue(cUnit, rlSrc2, kCoreReg);
+
     opRegReg(cUnit, kOpCmp, rlSrc1.lowReg, rlSrc2.lowReg);
 
     switch (dalvikOpcode) {
@@ -4094,6 +4144,10 @@
         dvmInitGrowableList(&chainingListByType[i], 2);
     }
 
+    /* Clear the visited flag for each block */
+    dvmCompilerDataFlowAnalysisDispatcher(cUnit, dvmCompilerClearVisitedFlag,
+                                          kAllNodes, false /* isIterative */);
+
     GrowableListIterator iterator;
     dvmGrowableListIteratorInit(&cUnit->blockList, &iterator);
 
@@ -4105,6 +4159,7 @@
         MIR *mir;
         BasicBlock *bb = (BasicBlock *) dvmGrowableListIteratorNext(&iterator);
         if (bb == NULL) break;
+        if (bb->visited == true) continue;
 
         labelList[i].operands[0] = bb->startOffset;
 
@@ -4199,171 +4254,189 @@
         }
 
         ArmLIR *headLIR = NULL;
+        BasicBlock *nextBB = bb;
 
-        for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
+        /*
+         * Try to build a longer optimization unit. Currently if the previous
+         * block ends with a goto, we continue adding instructions and don't
+         * reset the register allocation pool.
+         */
+        for (; nextBB != NULL; nextBB = cUnit->nextCodegenBlock) {
+            bb = nextBB;
+            bb->visited = true;
+            cUnit->nextCodegenBlock = NULL;
 
-            dvmCompilerResetRegPool(cUnit);
-            if (gDvmJit.disableOpt & (1 << kTrackLiveTemps)) {
-                dvmCompilerClobberAllRegs(cUnit);
-            }
+            for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
 
-            if (gDvmJit.disableOpt & (1 << kSuppressLoads)) {
-                dvmCompilerResetDefTracking(cUnit);
-            }
-
-            if (mir->dalvikInsn.opcode >= kMirOpFirst) {
-                handleExtendedMIR(cUnit, mir);
-                continue;
-            }
-
-
-            Opcode dalvikOpcode = mir->dalvikInsn.opcode;
-            InstructionFormat dalvikFormat = dexGetFormatFromOpcode(dalvikOpcode);
-            char *note;
-            if (mir->OptimizationFlags & MIR_INLINED) {
-                note = " (I)";
-            } else if (mir->OptimizationFlags & MIR_INLINED_PRED) {
-                note = " (PI)";
-            } else if (mir->OptimizationFlags & MIR_CALLEE) {
-                note = " (C)";
-            } else {
-                note = NULL;
-            }
-
-            ArmLIR *boundaryLIR;
-
-            /*
-             * Don't generate the boundary LIR unless we are debugging this
-             * trace or we need a scheduling barrier.
-             */
-            if (headLIR == NULL || cUnit->printMe == true) {
-                boundaryLIR =
-                    newLIR2(cUnit, kArmPseudoDalvikByteCodeBoundary,
-                            mir->offset,
-                            (int) dvmCompilerGetDalvikDisassembly(
-                                &mir->dalvikInsn, note));
-                /* Remember the first LIR for this block */
-                if (headLIR == NULL) {
-                    headLIR = boundaryLIR;
-                    /* Set the first boundaryLIR as a scheduling barrier */
-                    headLIR->defMask = ENCODE_ALL;
+                dvmCompilerResetRegPool(cUnit);
+                if (gDvmJit.disableOpt & (1 << kTrackLiveTemps)) {
+                    dvmCompilerClobberAllRegs(cUnit);
                 }
-            }
 
-            /* Don't generate the SSA annotation unless verbose mode is on */
-            if (cUnit->printMe && mir->ssaRep) {
-                char *ssaString = dvmCompilerGetSSAString(cUnit, mir->ssaRep);
-                newLIR1(cUnit, kArmPseudoSSARep, (int) ssaString);
-            }
+                if (gDvmJit.disableOpt & (1 << kSuppressLoads)) {
+                    dvmCompilerResetDefTracking(cUnit);
+                }
 
-            bool notHandled;
-            /*
-             * Debugging: screen the opcode first to see if it is in the
-             * do[-not]-compile list
-             */
-            bool singleStepMe = SINGLE_STEP_OP(dalvikOpcode);
+                if (mir->dalvikInsn.opcode >= kMirOpFirst) {
+                    handleExtendedMIR(cUnit, mir);
+                    continue;
+                }
+
+
+                Opcode dalvikOpcode = mir->dalvikInsn.opcode;
+                InstructionFormat dalvikFormat =
+                    dexGetFormatFromOpcode(dalvikOpcode);
+                char *note;
+                if (mir->OptimizationFlags & MIR_INLINED) {
+                    note = " (I)";
+                } else if (mir->OptimizationFlags & MIR_INLINED_PRED) {
+                    note = " (PI)";
+                } else if (mir->OptimizationFlags & MIR_CALLEE) {
+                    note = " (C)";
+                } else {
+                    note = NULL;
+                }
+
+                ArmLIR *boundaryLIR;
+
+                /*
+                 * Don't generate the boundary LIR unless we are debugging this
+                 * trace or we need a scheduling barrier.
+                 */
+                if (headLIR == NULL || cUnit->printMe == true) {
+                    boundaryLIR =
+                        newLIR2(cUnit, kArmPseudoDalvikByteCodeBoundary,
+                                mir->offset,
+                                (int) dvmCompilerGetDalvikDisassembly(
+                                    &mir->dalvikInsn, note));
+                    /* Remember the first LIR for this block */
+                    if (headLIR == NULL) {
+                        headLIR = boundaryLIR;
+                        /* Set the first boundaryLIR as a scheduling barrier */
+                        headLIR->defMask = ENCODE_ALL;
+                    }
+                }
+
+                /*
+                 * Don't generate the SSA annotation unless verbose mode is on
+                 */
+                if (cUnit->printMe && mir->ssaRep) {
+                    char *ssaString = dvmCompilerGetSSAString(cUnit,
+                                                              mir->ssaRep);
+                    newLIR1(cUnit, kArmPseudoSSARep, (int) ssaString);
+                }
+
+                bool notHandled;
+                /*
+                 * Debugging: screen the opcode first to see if it is in the
+                 * do[-not]-compile list
+                 */
+                bool singleStepMe = SINGLE_STEP_OP(dalvikOpcode);
 #if defined(WITH_SELF_VERIFICATION)
-          if (singleStepMe == false) {
-              singleStepMe = selfVerificationPuntOps(mir);
-          }
+              if (singleStepMe == false) {
+                  singleStepMe = selfVerificationPuntOps(mir);
+              }
 #endif
-            if (singleStepMe || cUnit->allSingleStep) {
-                notHandled = false;
-                genInterpSingleStep(cUnit, mir);
-            } else {
-                opcodeCoverage[dalvikOpcode]++;
-                switch (dalvikFormat) {
-                    case kFmt10t:
-                    case kFmt20t:
-                    case kFmt30t:
-                        notHandled = handleFmt10t_Fmt20t_Fmt30t(cUnit,
-                                  mir, bb, labelList);
-                        break;
-                    case kFmt10x:
-                        notHandled = handleFmt10x(cUnit, mir);
-                        break;
-                    case kFmt11n:
-                    case kFmt31i:
-                        notHandled = handleFmt11n_Fmt31i(cUnit, mir);
-                        break;
-                    case kFmt11x:
-                        notHandled = handleFmt11x(cUnit, mir);
-                        break;
-                    case kFmt12x:
-                        notHandled = handleFmt12x(cUnit, mir);
-                        break;
-                    case kFmt20bc:
-                    case kFmt40sc:
-                        notHandled = handleFmt20bc_Fmt40sc(cUnit, mir);
-                        break;
-                    case kFmt21c:
-                    case kFmt31c:
-                    case kFmt41c:
-                        notHandled = handleFmt21c_Fmt31c_Fmt41c(cUnit, mir);
-                        break;
-                    case kFmt21h:
-                        notHandled = handleFmt21h(cUnit, mir);
-                        break;
-                    case kFmt21s:
-                        notHandled = handleFmt21s(cUnit, mir);
-                        break;
-                    case kFmt21t:
-                        notHandled = handleFmt21t(cUnit, mir, bb, labelList);
-                        break;
-                    case kFmt22b:
-                    case kFmt22s:
-                        notHandled = handleFmt22b_Fmt22s(cUnit, mir);
-                        break;
-                    case kFmt22c:
-                    case kFmt52c:
-                        notHandled = handleFmt22c_Fmt52c(cUnit, mir);
-                        break;
-                    case kFmt22cs:
-                        notHandled = handleFmt22cs(cUnit, mir);
-                        break;
-                    case kFmt22t:
-                        notHandled = handleFmt22t(cUnit, mir, bb, labelList);
-                        break;
-                    case kFmt22x:
-                    case kFmt32x:
-                        notHandled = handleFmt22x_Fmt32x(cUnit, mir);
-                        break;
-                    case kFmt23x:
-                        notHandled = handleFmt23x(cUnit, mir);
-                        break;
-                    case kFmt31t:
-                        notHandled = handleFmt31t(cUnit, mir);
-                        break;
-                    case kFmt3rc:
-                    case kFmt35c:
-                    case kFmt5rc:
-                        notHandled = handleFmt35c_3rc_5rc(cUnit, mir, bb,
+                if (singleStepMe || cUnit->allSingleStep) {
+                    notHandled = false;
+                    genInterpSingleStep(cUnit, mir);
+                } else {
+                    opcodeCoverage[dalvikOpcode]++;
+                    switch (dalvikFormat) {
+                        case kFmt10t:
+                        case kFmt20t:
+                        case kFmt30t:
+                            notHandled = handleFmt10t_Fmt20t_Fmt30t(cUnit,
+                                      mir, bb, labelList);
+                            break;
+                        case kFmt10x:
+                            notHandled = handleFmt10x(cUnit, mir);
+                            break;
+                        case kFmt11n:
+                        case kFmt31i:
+                            notHandled = handleFmt11n_Fmt31i(cUnit, mir);
+                            break;
+                        case kFmt11x:
+                            notHandled = handleFmt11x(cUnit, mir);
+                            break;
+                        case kFmt12x:
+                            notHandled = handleFmt12x(cUnit, mir);
+                            break;
+                        case kFmt20bc:
+                        case kFmt40sc:
+                            notHandled = handleFmt20bc_Fmt40sc(cUnit, mir);
+                            break;
+                        case kFmt21c:
+                        case kFmt31c:
+                        case kFmt41c:
+                            notHandled = handleFmt21c_Fmt31c_Fmt41c(cUnit, mir);
+                            break;
+                        case kFmt21h:
+                            notHandled = handleFmt21h(cUnit, mir);
+                            break;
+                        case kFmt21s:
+                            notHandled = handleFmt21s(cUnit, mir);
+                            break;
+                        case kFmt21t:
+                            notHandled = handleFmt21t(cUnit, mir, bb,
                                                       labelList);
-                        break;
-                    case kFmt3rms:
-                    case kFmt35ms:
-                        notHandled = handleFmt35ms_3rms(cUnit, mir, bb,
-                                                        labelList);
-                        break;
-                    case kFmt35mi:
-                    case kFmt3rmi:
-                        notHandled = handleExecuteInline(cUnit, mir);
-                        break;
-                    case kFmt51l:
-                        notHandled = handleFmt51l(cUnit, mir);
-                        break;
-                    default:
-                        notHandled = true;
-                        break;
+                            break;
+                        case kFmt22b:
+                        case kFmt22s:
+                            notHandled = handleFmt22b_Fmt22s(cUnit, mir);
+                            break;
+                        case kFmt22c:
+                        case kFmt52c:
+                            notHandled = handleFmt22c_Fmt52c(cUnit, mir);
+                            break;
+                        case kFmt22cs:
+                            notHandled = handleFmt22cs(cUnit, mir);
+                            break;
+                        case kFmt22t:
+                            notHandled = handleFmt22t(cUnit, mir, bb,
+                                                      labelList);
+                            break;
+                        case kFmt22x:
+                        case kFmt32x:
+                            notHandled = handleFmt22x_Fmt32x(cUnit, mir);
+                            break;
+                        case kFmt23x:
+                            notHandled = handleFmt23x(cUnit, mir);
+                            break;
+                        case kFmt31t:
+                            notHandled = handleFmt31t(cUnit, mir);
+                            break;
+                        case kFmt3rc:
+                        case kFmt35c:
+                        case kFmt5rc:
+                            notHandled = handleFmt35c_3rc_5rc(cUnit, mir, bb,
+                                                          labelList);
+                            break;
+                        case kFmt3rms:
+                        case kFmt35ms:
+                            notHandled = handleFmt35ms_3rms(cUnit, mir, bb,
+                                                            labelList);
+                            break;
+                        case kFmt35mi:
+                        case kFmt3rmi:
+                            notHandled = handleExecuteInline(cUnit, mir);
+                            break;
+                        case kFmt51l:
+                            notHandled = handleFmt51l(cUnit, mir);
+                            break;
+                        default:
+                            notHandled = true;
+                            break;
+                    }
                 }
-            }
-            if (notHandled) {
-                LOGE("%#06x: Opcode 0x%x (%s) / Fmt %d not handled\n",
-                     mir->offset,
-                     dalvikOpcode, dexGetOpcodeName(dalvikOpcode),
-                     dalvikFormat);
-                dvmCompilerAbort(cUnit);
-                break;
+                if (notHandled) {
+                    LOGE("%#06x: Opcode 0x%x (%s) / Fmt %d not handled\n",
+                         mir->offset,
+                         dalvikOpcode, dexGetOpcodeName(dalvikOpcode),
+                         dalvikFormat);
+                    dvmCompilerAbort(cUnit);
+                    break;
+                }
             }
         }
 
@@ -4389,10 +4462,8 @@
          * insert an unconditional branch to the chaining cell.
          */
         if (bb->needFallThroughBranch) {
-            genUnconditionalBranch(cUnit,
-                                   &labelList[bb->fallThrough->id]);
+            genUnconditionalBranch(cUnit, &labelList[bb->fallThrough->id]);
         }
-
     }
 
     /* Handle the chaining cells in predefined order */