JIT: Fix for [Issue 2675245] FRF40 monkey crash in jit-cache

The JIT's chaining mechanism suffered from a narrow window that
could result in i-cache inconsistency.  One of the forms of chaining
cell consisted of a two 16-bit thumb instruction sequence.  If a thread were
interrupted between the execution of those two instructions *and*
another thread picked that moment to convert that cell's
chained/unchained state, then bad things happen.

This CL alters the chain/unchain model somewhat to avoid this case.
Chainable chaining cells grow by 4 bytes each, and instead of rewriting
a 32-bit cell to chain/unchain, we switch between chained and unchained
state by [re]writing the first 16-bits of the cell as either a 16-bit
Thumb unconditional branch (unchained mode) or the first half of a
32-bit Thumb branch. The 2nd 16-bits of the cell will never change once
the cell moves from its inital state - thus avoiding the possibility of it
becoming inconsistent.

This adds a trivial execution penalty on the slow path, but will add
about a kByte of memory usage to a typical process.

Change-Id: Id8b99802e11386cfbab23da6abae10e2d9fc4065
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index f7704ad..f580d6a 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -784,4 +784,7 @@
 
 #define CHAIN_CELL_OFFSET_TAG   0xcdab
 
+#define CHAIN_CELL_NORMAL_SIZE 12
+#define CHAIN_CELL_PREDICTED_SIZE 16
+
 #endif /* _DALVIK_VM_COMPILER_CODEGEN_ARM_ARMLIR_H */
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 499e287..05d311b 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -1159,7 +1159,7 @@
  *   |  .                            .
  *   |  |                            |
  *   |  +----------------------------+
- *   |  | Chaining Cells             |  -> 8 bytes each, must be 4 byte aligned
+ *   |  | Chaining Cells             |  -> 12/16 bytes each, must be 4 byte aligned
  *   |  .                            .
  *   |  .                            .
  *   |  |                            |
@@ -1395,10 +1395,18 @@
          * mix Arm & Thumb[2] translations, the following code should be
          * generalized.
          */
-        thumbTarget = (tgtAddr != gDvmJit.interpretTemplate);
+        thumbTarget = (tgtAddr != dvmCompilerGetInterpretTemplate());
 
         newInst = assembleChainingBranch(branchOffset, thumbTarget);
 
+        /*
+         * The second half-word instruction of the chaining cell must
+         * either be a nop (which represents initial state), or is the
+         * same exact branch halfword that we are trying to install.
+         */
+        assert( ((*branchAddr >> 16) == getSkeleton(kThumbOrr)) ||
+                ((*branchAddr >> 16) == (newInst >> 16)));
+
         *branchAddr = newInst;
         cacheflush((long)branchAddr, (long)branchAddr + 4, 0);
         UPDATE_CODE_CACHE_PATCHES();
@@ -1512,7 +1520,8 @@
      * Compilation not made yet for the callee. Reset the counter to a small
      * value and come back to check soon.
      */
-    if ((tgtAddr == 0) || ((void*)tgtAddr == gDvmJit.interpretTemplate)) {
+    if ((tgtAddr == 0) ||
+        ((void*)tgtAddr == dvmCompilerGetInterpretTemplate())) {
         /*
          * Wait for a few invocations (currently set to be 16) before trying
          * to setup the chain again.
@@ -1641,9 +1650,10 @@
     /* Get total count of chain cells */
     for (i = 0, cellSize = 0; i < kChainingCellGap; i++) {
         if (i != kChainingCellInvokePredicted) {
-            cellSize += pChainCellCounts->u.count[i] * 2;
+            cellSize += pChainCellCounts->u.count[i] * (CHAIN_CELL_NORMAL_SIZE >> 2);
         } else {
-            cellSize += pChainCellCounts->u.count[i] * 4;
+            cellSize += pChainCellCounts->u.count[i] *
+                (CHAIN_CELL_PREDICTED_SIZE >> 2);
         }
     }
 
@@ -1656,25 +1666,30 @@
 
     /* The cells are sorted in order - walk through them and reset */
     for (i = 0; i < kChainingCellGap; i++) {
-        int elemSize = 2; /* Most chaining cell has two words */
+        int elemSize = CHAIN_CELL_NORMAL_SIZE >> 2;  /* In 32-bit words */
         if (i == kChainingCellInvokePredicted) {
-            elemSize = 4;
+            elemSize = CHAIN_CELL_PREDICTED_SIZE >> 2;
         }
 
         for (j = 0; j < pChainCellCounts->u.count[i]; j++) {
-            int targetOffset;
             switch(i) {
                 case kChainingCellNormal:
-                    targetOffset = offsetof(InterpState,
-                          jitToInterpEntries.dvmJitToInterpNormal);
-                    break;
                 case kChainingCellHot:
                 case kChainingCellInvokeSingleton:
-                    targetOffset = offsetof(InterpState,
-                          jitToInterpEntries.dvmJitToInterpTraceSelect);
+                case kChainingCellBackwardBranch:
+                    /*
+                     * Replace the 1st half-word of the cell with an
+                     * unconditional branch, leaving the 2nd half-word
+                     * untouched.  This avoids problems with a thread
+                     * that is suspended between the two halves when
+                     * this unchaining takes place.
+                     */
+                    newInst = *pChainCells;
+                    newInst &= 0xFFFF0000;
+                    newInst |= getSkeleton(kThumbBUncond); /* b offset is 0 */
+                    *pChainCells = newInst;
                     break;
                 case kChainingCellInvokePredicted:
-                    targetOffset = 0;
                     predChainCell = (PredictedChainingCell *) pChainCells;
                     /*
                      * There could be a race on another mutator thread to use
@@ -1685,37 +1700,12 @@
                      */
                     predChainCell->clazz = PREDICTED_CHAIN_CLAZZ_INIT;
                     break;
-#if defined(WITH_SELF_VERIFICATION)
-                case kChainingCellBackwardBranch:
-                    targetOffset = offsetof(InterpState,
-                          jitToInterpEntries.dvmJitToInterpBackwardBranch);
-                    break;
-#elif defined(WITH_JIT_TUNING)
-                case kChainingCellBackwardBranch:
-                    targetOffset = offsetof(InterpState,
-                          jitToInterpEntries.dvmJitToInterpNormal);
-                    break;
-#endif
                 default:
-                    targetOffset = 0; // make gcc happy
                     LOGE("Unexpected chaining type: %d", i);
                     dvmAbort();  // dvmAbort OK here - can't safely recover
             }
             COMPILER_TRACE_CHAINING(
                 LOGD("Jit Runtime: unchaining 0x%x", (int)pChainCells));
-            /*
-             * Thumb code sequence for a chaining cell is:
-             *     ldr  r0, rGLUE, #<word offset>
-             *     blx  r0
-             */
-            if (i != kChainingCellInvokePredicted) {
-                targetOffset = targetOffset >> 2;  /* convert to word offset */
-                thumb1 = 0x6800 | (targetOffset << 6) |
-                         (rGLUE << 3) | (r0 << 0);
-                thumb2 = 0x4780 | (r0 << 3);
-                newInst = thumb2<<16 | thumb1;
-                *pChainCells = newInst;
-            }
             pChainCells += elemSize;  /* Advance by a fixed number of words */
         }
     }
@@ -1735,7 +1725,7 @@
             if (gDvmJit.pJitEntryTable[i].dPC &&
                    gDvmJit.pJitEntryTable[i].codeAddress &&
                    (gDvmJit.pJitEntryTable[i].codeAddress !=
-                    gDvmJit.interpretTemplate)) {
+                    dvmCompilerGetInterpretTemplate())) {
                 u4* lastAddress;
                 lastAddress =
                       dvmJitUnchain(gDvmJit.pJitEntryTable[i].codeAddress);
@@ -1797,7 +1787,7 @@
             LOGD("TRACEPROFILE 0x%08x 0 NULL 0 0", (int)traceBase);
         return 0;
     }
-    if (p->codeAddress == gDvmJit.interpretTemplate) {
+    if (p->codeAddress == dvmCompilerGetInterpretTemplate()) {
         if (!silent)
             LOGD("TRACEPROFILE 0x%08x 0 INTERPRET_ONLY  0 0", (int)traceBase);
         return 0;
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 2cd16a1..515e8af 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -2501,10 +2501,10 @@
  * blx &findPackedSwitchIndex
  * mov pc, r0
  * .align4
- * chaining cell for case 0 [8 bytes]
- * chaining cell for case 1 [8 bytes]
+ * chaining cell for case 0 [12 bytes]
+ * chaining cell for case 1 [12 bytes]
  *               :
- * chaining cell for case MIN(size, MAX_CHAINED_SWITCH_CASES)-1 [8 bytes]
+ * chaining cell for case MIN(size, MAX_CHAINED_SWITCH_CASES)-1 [12 bytes]
  * chaining cell for case default [8 bytes]
  * noChain exit
  */
@@ -2555,7 +2555,7 @@
         jumpIndex = index;
     }
 
-    chainingPC += jumpIndex * 8;
+    chainingPC += jumpIndex * CHAIN_CELL_NORMAL_SIZE;
     return (((s8) caseDPCOffset) << 32) | (u8) chainingPC;
 }
 
@@ -2606,13 +2606,14 @@
             /* MAX_CHAINED_SWITCH_CASES + 1 is the start of the overflow case */
             int jumpIndex = (i < MAX_CHAINED_SWITCH_CASES) ?
                            i : MAX_CHAINED_SWITCH_CASES + 1;
-            chainingPC += jumpIndex * 8;
+            chainingPC += jumpIndex * CHAIN_CELL_NORMAL_SIZE;
             return (((s8) entries[i]) << 32) | (u8) chainingPC;
         } else if (k > testVal) {
             break;
         }
     }
-    return chainingPC + MIN(size, MAX_CHAINED_SWITCH_CASES) * 8;
+    return chainingPC + MIN(size, MAX_CHAINED_SWITCH_CASES) *
+           CHAIN_CELL_NORMAL_SIZE;
 }
 
 static bool handleFmt31t(CompilationUnit *cUnit, MIR *mir)
@@ -3317,6 +3318,27 @@
  * Dalvik PC and special-purpose registers are reconstructed here.
  */
 
+/*
+ * Insert a
+ *    b   .+4
+ *    nop
+ * pair at the beginning of a chaining cell.  This serves as the
+ * switch branch that selects between reverting to the interpreter or
+ * not.  Once the cell is chained to a translation, the cell will
+ * contain a 32-bit branch.  Subsequent chain/unchain operations will
+ * then only alter that first 16-bits - the "b .+4" for unchaining,
+ * and the restoration of the first half of the 32-bit branch for
+ * rechaining.
+ */
+static void insertChainingSwitch(CompilationUnit *cUnit)
+{
+    ArmLIR *branch = newLIR0(cUnit, kThumbBUncond);
+    newLIR2(cUnit, kThumbOrr, r0, r0);
+    ArmLIR *target = newLIR0(cUnit, kArmPseudoTargetLabel);
+    target->defMask = ENCODE_ALL;
+    branch->generic.target = (LIR *) target;
+}
+
 /* Chaining cell for code that may need warmup. */
 static void handleNormalChainingCell(CompilationUnit *cUnit,
                                      unsigned int offset)
@@ -3325,6 +3347,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
             offsetof(InterpState,
                      jitToInterpEntries.dvmJitToInterpNormal) >> 2);
@@ -3343,6 +3366,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
             offsetof(InterpState,
                      jitToInterpEntries.dvmJitToInterpTraceSelect) >> 2);
@@ -3359,6 +3383,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
 #if defined(WITH_SELF_VERIFICATION)
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
         offsetof(InterpState,
@@ -3380,6 +3405,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
             offsetof(InterpState,
                      jitToInterpEntries.dvmJitToInterpTraceSelect) >> 2);