JIT: Fix for [Issue 2675245] FRF40 monkey crash in jit-cache

The JIT's chaining mechanism suffered from a narrow window that
could result in i-cache inconsistency.  One of the forms of chaining
cell consisted of a sequence of two 16-bit Thumb instructions.  If a
thread were interrupted between the execution of those two instructions
*and* another thread picked that moment to convert that cell's
chained/unchained state, the interrupted thread would resume in the
middle of a rewritten cell, with unpredictable results.

This CL alters the chain/unchain model somewhat to avoid this case.
Chainable chaining cells grow by 4 bytes each, and instead of rewriting
a 32-bit cell to chain/unchain, we switch between chained and unchained
state by [re]writing the first 16 bits of the cell as either a 16-bit
Thumb unconditional branch (unchained mode) or the first half of a
32-bit Thumb branch (chained mode). The second 16 bits of the cell will
never change once the cell moves from its initial state - thus avoiding
the possibility of it becoming inconsistent.

This adds a trivial execution penalty on the slow path, but will add
about a kByte of memory usage to a typical process.

Change-Id: Id8b99802e11386cfbab23da6abae10e2d9fc4065
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 2cd16a1..515e8af 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -2501,10 +2501,10 @@
  * blx &findPackedSwitchIndex
  * mov pc, r0
  * .align4
- * chaining cell for case 0 [8 bytes]
- * chaining cell for case 1 [8 bytes]
+ * chaining cell for case 0 [12 bytes]
+ * chaining cell for case 1 [12 bytes]
  *               :
- * chaining cell for case MIN(size, MAX_CHAINED_SWITCH_CASES)-1 [8 bytes]
+ * chaining cell for case MIN(size, MAX_CHAINED_SWITCH_CASES)-1 [12 bytes]
  * chaining cell for case default [8 bytes]
  * noChain exit
  */
@@ -2555,7 +2555,7 @@
         jumpIndex = index;
     }
 
-    chainingPC += jumpIndex * 8;
+    chainingPC += jumpIndex * CHAIN_CELL_NORMAL_SIZE;
     return (((s8) caseDPCOffset) << 32) | (u8) chainingPC;
 }
 
@@ -2606,13 +2606,14 @@
             /* MAX_CHAINED_SWITCH_CASES + 1 is the start of the overflow case */
             int jumpIndex = (i < MAX_CHAINED_SWITCH_CASES) ?
                            i : MAX_CHAINED_SWITCH_CASES + 1;
-            chainingPC += jumpIndex * 8;
+            chainingPC += jumpIndex * CHAIN_CELL_NORMAL_SIZE;
             return (((s8) entries[i]) << 32) | (u8) chainingPC;
         } else if (k > testVal) {
             break;
         }
     }
-    return chainingPC + MIN(size, MAX_CHAINED_SWITCH_CASES) * 8;
+    return chainingPC + MIN(size, MAX_CHAINED_SWITCH_CASES) *
+           CHAIN_CELL_NORMAL_SIZE;
 }
 
 static bool handleFmt31t(CompilationUnit *cUnit, MIR *mir)
@@ -3317,6 +3318,27 @@
  * Dalvik PC and special-purpose registers are reconstructed here.
  */
 
+/*
+ * Insert a
+ *    b   .+4
+ *    nop            (emitted below as "orr r0, r0")
+ * pair at the beginning of a chaining cell.  The branch acts as the
+ * switch that selects between reverting to the interpreter and
+ * branching to a chained translation.  Once the cell is chained, it
+ * will contain a 32-bit branch.  Subsequent chain/unchain operations
+ * rewrite only the first 16 bits: "b .+4" is written to unchain, and
+ * the first half of the 32-bit branch is restored to rechain, so the
+ * second 16 bits are left untouched by those operations.
+ */
+static void insertChainingSwitch(CompilationUnit *cUnit)
+{
+    ArmLIR *branch = newLIR0(cUnit, kThumbBUncond);
+    newLIR2(cUnit, kThumbOrr, r0, r0);   /* the "nop" half of the pair */
+    ArmLIR *target = newLIR0(cUnit, kArmPseudoTargetLabel);
+    target->defMask = ENCODE_ALL;
+    branch->generic.target = (LIR *) target;   /* label follows the nop */
+}
+
 /* Chaining cell for code that may need warmup. */
 static void handleNormalChainingCell(CompilationUnit *cUnit,
                                      unsigned int offset)
@@ -3325,6 +3347,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
             offsetof(InterpState,
                      jitToInterpEntries.dvmJitToInterpNormal) >> 2);
@@ -3343,6 +3366,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
             offsetof(InterpState,
                      jitToInterpEntries.dvmJitToInterpTraceSelect) >> 2);
@@ -3359,6 +3383,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
 #if defined(WITH_SELF_VERIFICATION)
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
         offsetof(InterpState,
@@ -3380,6 +3405,7 @@
      * Use raw instruction constructors to guarantee that the generated
      * instructions fit the predefined cell size.
      */
+    insertChainingSwitch(cUnit);
     newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
             offsetof(InterpState,
                      jitToInterpEntries.dvmJitToInterpTraceSelect) >> 2);