Interpreter restructuring: eliminate InterpState

The key data structure for the interpreter is InterpState.
This change eliminates it, merging its data with the Thread structure.

Here's why:

In principio creavit Fadden Thread et InterpState.  And it was good.

Thread holds thread-private state, while InterpState captures data
associated with a Dalvik interpreter activation.  Because JNI calls
can result in nested interpreter invocations, we can have more than one
InterpState for each actual thread.  InterpState was relatively small,
and it all worked well.  It was used enough that in the Arm version
a register (rGLUE) was dedicated to it.

Then, along came the JIT guys, who saw InterpState as a convenient place
to dump all sorts of useful data that they wanted quick access to through
that dedicated register.  InterpState grew and grew.  In terms of
space, this wasn't a big problem - but it did mean that the initialization
cost of each interpreter activation grew as well.  For applications
that do a lot of callbacks from native code into Dalvik, this is
measurable.  It's also mostly useless cost because much of the JIT-related
InterpState initialization was setting up useful constants - things that
don't need to be saved and restored all the time.

The biggest problem, though, deals with thread control.  When something
interesting is happening that needs all threads to be stopped (such as
GC and debugger attach), we have access to all of the Thread structures,
but we don't have access to all of the InterpState structures (which
may be buried/nested on the native stack).  As a result, polling for
thread suspension is done via a one-indirection pointer chase.  InterpState
itself can't hold the stop bits because we can't always find it, so
instead it holds a pointer to the global or thread-specific stop control.

Yuck.

With this change, we eliminate InterpState and merge all needed data
into Thread.  Further, we replace the dedicated rGLUE register with a
pointer to the Thread structure (rSELF).  The small subset of state
data that needs to be saved and restored across nested interpreter
activations is collected into a record that is saved to the interpreter
frame, and restored on exit.  Further, these small records are linked
together to allow tracebacks to show nested activations.  Old InterpState
variables that simply contain useful constants are initialized once at
thread creation time.

This CL is large enough by itself that the new ability to streamline
suspend checks is not done here - that will happen in a future CL.  Here
we just focus on consolidation.

Change-Id: Ide6b2fb85716fea454ac113f5611263a96687356
diff --git a/vm/compiler/codegen/CodegenFactory.c b/vm/compiler/codegen/CodegenFactory.c
index aad6512..61e29d7 100644
--- a/vm/compiler/codegen/CodegenFactory.c
+++ b/vm/compiler/codegen/CodegenFactory.c
@@ -57,7 +57,7 @@
     if (rlSrc.location == kLocPhysReg) {
         genRegCopy(cUnit, reg1, rlSrc.lowReg);
     } else  if (rlSrc.location == kLocRetval) {
-        loadWordDisp(cUnit, rGLUE, offsetof(InterpState, retval), reg1);
+        loadWordDisp(cUnit, rSELF, offsetof(Thread, retval), reg1);
     } else {
         assert(rlSrc.location == kLocDalvikFrame);
         loadWordDisp(cUnit, rFP, dvmCompilerS2VReg(cUnit, rlSrc.sRegLow) << 2,
@@ -90,7 +90,7 @@
     if (rlSrc.location == kLocPhysReg) {
         genRegCopyWide(cUnit, regLo, regHi, rlSrc.lowReg, rlSrc.highReg);
     } else if (rlSrc.location == kLocRetval) {
-        loadBaseDispWide(cUnit, NULL, rGLUE, offsetof(InterpState, retval),
+        loadBaseDispWide(cUnit, NULL, rSELF, offsetof(Thread, retval),
                          regLo, regHi, INVALID_SREG);
     } else {
         assert(rlSrc.location == kLocDalvikFrame);
@@ -124,7 +124,7 @@
         rlSrc.location = kLocPhysReg;
         dvmCompilerMarkLive(cUnit, rlSrc.lowReg, rlSrc.sRegLow);
     } else if (rlSrc.location == kLocRetval) {
-        loadWordDisp(cUnit, rGLUE, offsetof(InterpState, retval), rlSrc.lowReg);
+        loadWordDisp(cUnit, rSELF, offsetof(Thread, retval), rlSrc.lowReg);
         rlSrc.location = kLocPhysReg;
         dvmCompilerClobber(cUnit, rlSrc.lowReg);
     }
@@ -164,7 +164,7 @@
 
 
     if (rlDest.location == kLocRetval) {
-        storeBaseDisp(cUnit, rGLUE, offsetof(InterpState, retval),
+        storeBaseDisp(cUnit, rSELF, offsetof(Thread, retval),
                       rlDest.lowReg, kWord);
         dvmCompilerClobber(cUnit, rlDest.lowReg);
     } else {
@@ -192,7 +192,7 @@
         dvmCompilerMarkLive(cUnit, rlSrc.highReg,
                             dvmCompilerSRegHi(rlSrc.sRegLow));
     } else if (rlSrc.location == kLocRetval) {
-        loadBaseDispWide(cUnit, NULL, rGLUE, offsetof(InterpState, retval),
+        loadBaseDispWide(cUnit, NULL, rSELF, offsetof(Thread, retval),
                          rlSrc.lowReg, rlSrc.highReg, INVALID_SREG);
         rlSrc.location = kLocPhysReg;
         dvmCompilerClobber(cUnit, rlSrc.lowReg);
@@ -242,7 +242,7 @@
 
 
     if (rlDest.location == kLocRetval) {
-        storeBaseDispWide(cUnit, rGLUE, offsetof(InterpState, retval),
+        storeBaseDispWide(cUnit, rSELF, offsetof(Thread, retval),
                           rlDest.lowReg, rlDest.highReg);
         dvmCompilerClobber(cUnit, rlDest.lowReg);
         dvmCompilerClobber(cUnit, rlDest.highReg);
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 5adc1ed..1f8b5d0 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -24,7 +24,7 @@
  * r0, r1, r2, r3 are always scratch
  * r4 (rPC) is scratch for Jit, but most be restored when resuming interp
  * r5 (rFP) is reserved [holds Dalvik frame pointer]
- * r6 (rGLUE) is reserved [holds current &interpState]
+ * r6 (rSELF) is reserved [holds current &Thread]
  * r7 (rINST) is scratch for Jit
  * r8 (rIBASE) is scratch for Jit, but must be restored when resuming interp
  * r9 is reserved
@@ -210,7 +210,7 @@
     r3 = 3,
     r4PC = 4,
     rFP = 5,
-    rGLUE = 6,
+    rSELF = 6,
     r7 = 7,
     r8 = 8,
     r9 = 9,
diff --git a/vm/compiler/codegen/arm/ArmRallocUtil.c b/vm/compiler/codegen/arm/ArmRallocUtil.c
index bc643c1..d6e73a0 100644
--- a/vm/compiler/codegen/arm/ArmRallocUtil.c
+++ b/vm/compiler/codegen/arm/ArmRallocUtil.c
@@ -29,7 +29,7 @@
  * Register usage for 16-bit Thumb systems:
  *     r0-r3: Temp/argument
  *     lr(r14):      Temp for translations, return address for handlers
- *     rGLUE(r6):    Pointer to InterpState
+ *     rSELF(r6):    Pointer to Thread
  *     rFP(r5):      Dalvik frame pointer
  *     r4, r7:       Temp for translations
  *     r8, r9, r10:   Temp preserved across C calls
@@ -38,7 +38,7 @@
  * Register usage for 32-bit Thumb systems:
  *     r0-r3: Temp/argument
  *     lr(r14):      Temp for translations, return address for handlers
- *     rGLUE(r6):    Pointer to InterpState
+ *     rSELF(r6):    Pointer to Thread
  *     rFP(r5):      Dalvik frame pointer
  *     r4, r7:       Temp for translations
  *     r8, r9, r10   Temp preserved across C calls
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 34793ee..79f5ec7 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -1606,7 +1606,7 @@
  *      next safe point.
  */
 const Method *dvmJitToPatchPredictedChain(const Method *method,
-                                          InterpState *interpState,
+                                          Thread *self,
                                           PredictedChainingCell *cell,
                                           const ClassObject *clazz)
 {
@@ -1646,7 +1646,7 @@
     PredictedChainingCell newCell;
 
     if (cell->clazz == NULL) {
-        newRechainCount = interpState->icRechainCount;
+        newRechainCount = self->icRechainCount;
     }
 
     int baseAddr = (int) cell + 4;   // PC is cur_addr + 4
@@ -1667,7 +1667,7 @@
     inlineCachePatchEnqueue(cell, &newCell);
 #endif
 done:
-    interpState->icRechainCount = newRechainCount;
+    self->icRechainCount = newRechainCount;
     return method;
 }
 
diff --git a/vm/compiler/codegen/arm/CalloutHelper.h b/vm/compiler/codegen/arm/CalloutHelper.h
index 414f8c5..931cf0f 100644
--- a/vm/compiler/codegen/arm/CalloutHelper.h
+++ b/vm/compiler/codegen/arm/CalloutHelper.h
@@ -82,7 +82,7 @@
 
 /* Originally declared in compiler/codegen/arm/Assemble.c */
 const Method *dvmJitToPatchPredictedChain(const Method *method,
-                                          InterpState *interpState,
+                                          Thread *self,
                                           PredictedChainingCell *cell,
                                           const ClassObject *clazz);
 
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 2ed17d6..02e6f87 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -32,7 +32,7 @@
     int regCardBase = dvmCompilerAllocTemp(cUnit);
     int regCardNo = dvmCompilerAllocTemp(cUnit);
     ArmLIR *branchOver = genCmpImmBranch(cUnit, kArmCondEq, valReg, 0);
-    loadWordDisp(cUnit, rGLUE, offsetof(InterpState, cardTable),
+    loadWordDisp(cUnit, rSELF, offsetof(Thread, cardTable),
                  regCardBase);
     opRegRegImm(cUnit, kOpLsr, regCardNo, tgtAddrReg, GC_CARD_SHIFT);
     storeBaseIndexed(cUnit, regCardBase, regCardNo, regCardBase, 0,
@@ -973,7 +973,7 @@
         /* Branch to the PC reconstruction code */
         branch->generic.target = (LIR *) pcrLabel;
     }
-    /* TODO: Move result to InterpState for non-void returns */
+    /* TODO: Move result to Thread for non-void returns */
 }
 
 static void genProcessArgsNoRange(CompilationUnit *cUnit, MIR *mir,
@@ -1248,7 +1248,7 @@
 
     LOAD_FUNC_ADDR(cUnit, r7, (int) dvmJitToPatchPredictedChain);
 
-    genRegCopy(cUnit, r1, rGLUE);
+    genRegCopy(cUnit, r1, rSELF);
 
     /*
      * r0 = calleeMethod
@@ -1287,7 +1287,7 @@
     /* r0 = dalvik pc */
     dvmCompilerFlushAllRegs(cUnit);
     loadConstant(cUnit, r0, (int) (cUnit->method->insns + offset));
-    loadWordDisp(cUnit, rGLUE, offsetof(InterpState,
+    loadWordDisp(cUnit, rSELF, offsetof(Thread,
                  jitToInterpEntries.dvmJitToInterpPunt), r1);
     opReg(cUnit, kOpBlx, r1);
 }
@@ -1313,9 +1313,9 @@
        genPuntToInterp(cUnit, mir->offset);
        return;
     }
-    int entryAddr = offsetof(InterpState,
+    int entryAddr = offsetof(Thread,
                              jitToInterpEntries.dvmJitToInterpSingleStep);
-    loadWordDisp(cUnit, rGLUE, entryAddr, r2);
+    loadWordDisp(cUnit, rSELF, entryAddr, r2);
     /* r0 = dalvik pc */
     loadConstant(cUnit, r0, (int) (cUnit->method->insns + mir->offset));
     /* r1 = dalvik pc of following instruction */
@@ -1342,7 +1342,7 @@
     dvmCompilerFlushAllRegs(cUnit);   /* Send everything to home location */
     RegLocation rlSrc = dvmCompilerGetSrc(cUnit, mir, 0);
     loadValueDirectFixed(cUnit, rlSrc, r1);
-    loadWordDisp(cUnit, rGLUE, offsetof(InterpState, self), r0);
+    genRegCopy(cUnit, r0, rSELF);
     genNullCheck(cUnit, rlSrc.sRegLow, r1, mir->offset, NULL);
     if (isEnter) {
         /* Get dPC of next insn */
@@ -1368,18 +1368,16 @@
 #endif
 
 /*
- * Fetch *InterpState->pSelfSuspendCount. If the suspend count is non-zero,
+ * Fetch self->suspendCount. If the suspend count is non-zero,
  * punt to the interpreter.
  */
 static void genSuspendPoll(CompilationUnit *cUnit, MIR *mir)
 {
     int rTemp = dvmCompilerAllocTemp(cUnit);
     ArmLIR *ld;
-    ld = loadWordDisp(cUnit, rGLUE, offsetof(InterpState, pSelfSuspendCount),
+    ld = loadWordDisp(cUnit, rSELF, offsetof(Thread, suspendCount),
                       rTemp);
     setMemRefType(ld, true /* isLoad */, kMustNotAlias);
-    ld = loadWordDisp(cUnit, rTemp, 0, rTemp);
-    setMemRefType(ld, true /* isLoad */, kMustNotAlias);
     genRegImmCheck(cUnit, kArmCondNe, rTemp, 0, mir->offset, NULL);
 }
 
@@ -1834,16 +1832,13 @@
     RegLocation rlResult;
     switch (dalvikOpcode) {
         case OP_MOVE_EXCEPTION: {
-            int offset = offsetof(InterpState, self);
             int exOffset = offsetof(Thread, exception);
-            int selfReg = dvmCompilerAllocTemp(cUnit);
             int resetReg = dvmCompilerAllocTemp(cUnit);
             RegLocation rlDest = dvmCompilerGetDest(cUnit, mir, 0);
             rlResult = dvmCompilerEvalLoc(cUnit, rlDest, kCoreReg, true);
-            loadWordDisp(cUnit, rGLUE, offset, selfReg);
+            loadWordDisp(cUnit, rSELF, exOffset, rlResult.lowReg);
             loadConstant(cUnit, resetReg, 0);
-            loadWordDisp(cUnit, selfReg, exOffset, rlResult.lowReg);
-            storeWordDisp(cUnit, selfReg, exOffset, resetReg);
+            storeWordDisp(cUnit, rSELF, exOffset, resetReg);
             storeValue(cUnit, rlDest, rlResult);
            break;
         }
@@ -3235,7 +3230,7 @@
 
             LOAD_FUNC_ADDR(cUnit, r7, (int) dvmJitToPatchPredictedChain);
 
-            genRegCopy(cUnit, r1, rGLUE);
+            genRegCopy(cUnit, r1, rSELF);
             genRegCopy(cUnit, r2, r9);
             genRegCopy(cUnit, r3, r10);
 
@@ -3584,8 +3579,8 @@
     dvmCompilerClobberCallRegs(cUnit);
     dvmCompilerClobber(cUnit, r4PC);
     dvmCompilerClobber(cUnit, r7);
-    int offset = offsetof(InterpState, retval);
-    opRegRegImm(cUnit, kOpAdd, r4PC, rGLUE, offset);
+    int offset = offsetof(Thread, retval);
+    opRegRegImm(cUnit, kOpAdd, r4PC, rSELF, offset);
     opImm(cUnit, kOpPush, (1<<r4PC) | (1<<r7));
     LOAD_FUNC_ADDR(cUnit, r4PC, fn);
     genExportPC(cUnit, mir);
@@ -3713,8 +3708,8 @@
      * instructions fit the predefined cell size.
      */
     insertChainingSwitch(cUnit);
-    newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
-            offsetof(InterpState,
+    newLIR3(cUnit, kThumbLdrRRI5, r0, rSELF,
+            offsetof(Thread,
                      jitToInterpEntries.dvmJitToInterpNormal) >> 2);
     newLIR1(cUnit, kThumbBlxR, r0);
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
@@ -3732,8 +3727,8 @@
      * instructions fit the predefined cell size.
      */
     insertChainingSwitch(cUnit);
-    newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
-            offsetof(InterpState,
+    newLIR3(cUnit, kThumbLdrRRI5, r0, rSELF,
+            offsetof(Thread,
                      jitToInterpEntries.dvmJitToInterpTraceSelect) >> 2);
     newLIR1(cUnit, kThumbBlxR, r0);
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
@@ -3749,12 +3744,12 @@
      */
     insertChainingSwitch(cUnit);
 #if defined(WITH_SELF_VERIFICATION)
-    newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
-        offsetof(InterpState,
+    newLIR3(cUnit, kThumbLdrRRI5, r0, rSELF,
+        offsetof(Thread,
                  jitToInterpEntries.dvmJitToInterpBackwardBranch) >> 2);
 #else
-    newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
-        offsetof(InterpState, jitToInterpEntries.dvmJitToInterpNormal) >> 2);
+    newLIR3(cUnit, kThumbLdrRRI5, r0, rSELF,
+        offsetof(Thread, jitToInterpEntries.dvmJitToInterpNormal) >> 2);
 #endif
     newLIR1(cUnit, kThumbBlxR, r0);
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
@@ -3769,8 +3764,8 @@
      * instructions fit the predefined cell size.
      */
     insertChainingSwitch(cUnit);
-    newLIR3(cUnit, kThumbLdrRRI5, r0, rGLUE,
-            offsetof(InterpState,
+    newLIR3(cUnit, kThumbLdrRRI5, r0, rSELF,
+            offsetof(Thread,
                      jitToInterpEntries.dvmJitToInterpTraceSelect) >> 2);
     newLIR1(cUnit, kThumbBlxR, r0);
     addWordData(cUnit, (int) (callee->insns), true);
@@ -4231,7 +4226,7 @@
                 case kExceptionHandling:
                     labelList[i].opcode = kArmPseudoEHBlockLabel;
                     if (cUnit->pcReconstructionList.numUsed) {
-                        loadWordDisp(cUnit, rGLUE, offsetof(InterpState,
+                        loadWordDisp(cUnit, rSELF, offsetof(Thread,
                                      jitToInterpEntries.dvmJitToInterpPunt),
                                      r1);
                         opReg(cUnit, kOpBlx, r1);
@@ -4525,7 +4520,7 @@
      */
     if (cUnit->switchOverflowPad) {
         loadConstant(cUnit, r0, (int) cUnit->switchOverflowPad);
-        loadWordDisp(cUnit, rGLUE, offsetof(InterpState,
+        loadWordDisp(cUnit, rSELF, offsetof(Thread,
                      jitToInterpEntries.dvmJitToInterpNoChain), r2);
         opRegReg(cUnit, kOpAdd, r1, r1);
         opRegRegReg(cUnit, kOpAdd, r4PC, r0, r1);
diff --git a/vm/compiler/codegen/arm/Thumb/Gen.c b/vm/compiler/codegen/arm/Thumb/Gen.c
index c5d06de..7205530 100644
--- a/vm/compiler/codegen/arm/Thumb/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb/Gen.c
@@ -214,14 +214,14 @@
 
 static bool genInlinedAbsFloat(CompilationUnit *cUnit, MIR *mir)
 {
-    int offset = offsetof(InterpState, retval);
+    int offset = offsetof(Thread, retval);
     RegLocation rlSrc = dvmCompilerGetSrc(cUnit, mir, 0);
     int reg0 = loadValue(cUnit, rlSrc, kCoreReg).lowReg;
     int signMask = dvmCompilerAllocTemp(cUnit);
     loadConstant(cUnit, signMask, 0x7fffffff);
     newLIR2(cUnit, kThumbAndRR, reg0, signMask);
     dvmCompilerFreeTemp(cUnit, signMask);
-    storeWordDisp(cUnit, rGLUE, offset, reg0);
+    storeWordDisp(cUnit, rSELF, offset, reg0);
     //TUNING: rewrite this to not clobber
     dvmCompilerClobber(cUnit, reg0);
     return false;
@@ -229,17 +229,17 @@
 
 static bool genInlinedAbsDouble(CompilationUnit *cUnit, MIR *mir)
 {
-    int offset = offsetof(InterpState, retval);
+    int offset = offsetof(Thread, retval);
     RegLocation rlSrc = dvmCompilerGetSrcWide(cUnit, mir, 0, 1);
     RegLocation regSrc = loadValueWide(cUnit, rlSrc, kCoreReg);
     int reglo = regSrc.lowReg;
     int reghi = regSrc.highReg;
     int signMask = dvmCompilerAllocTemp(cUnit);
     loadConstant(cUnit, signMask, 0x7fffffff);
-    storeWordDisp(cUnit, rGLUE, offset, reglo);
+    storeWordDisp(cUnit, rSELF, offset, reglo);
     newLIR2(cUnit, kThumbAndRR, reghi, signMask);
     dvmCompilerFreeTemp(cUnit, signMask);
-    storeWordDisp(cUnit, rGLUE, offset + 4, reghi);
+    storeWordDisp(cUnit, rSELF, offset + 4, reghi);
     //TUNING: rewrite this to not clobber
     dvmCompilerClobber(cUnit, reghi);
     return false;
@@ -248,7 +248,7 @@
 /* No select in thumb, so we need to branch.  Thumb2 will do better */
 static bool genInlinedMinMaxInt(CompilationUnit *cUnit, MIR *mir, bool isMin)
 {
-    int offset = offsetof(InterpState, retval);
+    int offset = offsetof(Thread, retval);
     RegLocation rlSrc1 = dvmCompilerGetSrc(cUnit, mir, 0);
     RegLocation rlSrc2 = dvmCompilerGetSrc(cUnit, mir, 1);
     int reg0 = loadValue(cUnit, rlSrc1, kCoreReg).lowReg;
@@ -259,7 +259,7 @@
     newLIR2(cUnit, kThumbMovRR, reg0, reg1);
     ArmLIR *target = newLIR0(cUnit, kArmPseudoTargetLabel);
     target->defMask = ENCODE_ALL;
-    newLIR3(cUnit, kThumbStrRRI5, reg0, rGLUE, offset >> 2);
+    newLIR3(cUnit, kThumbStrRRI5, reg0, rSELF, offset >> 2);
     branch1->generic.target = (LIR *)target;
     //TUNING: rewrite this to not clobber
     dvmCompilerClobber(cUnit,reg0);
diff --git a/vm/compiler/codegen/arm/Thumb2/Gen.c b/vm/compiler/codegen/arm/Thumb2/Gen.c
index a0195bc..864b0b1 100644
--- a/vm/compiler/codegen/arm/Thumb2/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb2/Gen.c
@@ -246,9 +246,8 @@
     loadValueDirectFixed(cUnit, rlSrc, r1);  // Get obj
     dvmCompilerLockAllTemps(cUnit);  // Prepare for explicit register usage
     dvmCompilerFreeTemp(cUnit, r4PC);  // Free up r4 for general use
-    loadWordDisp(cUnit, rGLUE, offsetof(InterpState, self), r0); // Get self
     genNullCheck(cUnit, rlSrc.sRegLow, r1, mir->offset, NULL);
-    loadWordDisp(cUnit, r0, offsetof(Thread, threadId), r3); // Get threadId
+    loadWordDisp(cUnit, rSELF, offsetof(Thread, threadId), r3); // Get threadId
     newLIR3(cUnit, kThumb2Ldrex, r2, r1,
             offsetof(Object, lock) >> 2); // Get object->lock
     opRegImm(cUnit, kOpLsl, r3, LW_LOCK_OWNER_SHIFT); // Align owner
@@ -276,6 +275,7 @@
             sizeof(StackSaveArea) -
             offsetof(StackSaveArea, xtra.currentPc));
     /* Call template, and don't return */
+    genRegCopy(cUnit, r0, rSELF);
     genDispatchToHandler(cUnit, TEMPLATE_MONITOR_ENTER);
     // Resume here
     target = newLIR0(cUnit, kArmPseudoTargetLabel);
@@ -301,10 +301,9 @@
     loadValueDirectFixed(cUnit, rlSrc, r1);  // Get obj
     dvmCompilerLockAllTemps(cUnit);  // Prepare for explicit register usage
     dvmCompilerFreeTemp(cUnit, r4PC);  // Free up r4 for general use
-    loadWordDisp(cUnit, rGLUE, offsetof(InterpState, self), r0); // Get self
     genNullCheck(cUnit, rlSrc.sRegLow, r1, mir->offset, NULL);
     loadWordDisp(cUnit, r1, offsetof(Object, lock), r2); // Get object->lock
-    loadWordDisp(cUnit, r0, offsetof(Thread, threadId), r3); // Get threadId
+    loadWordDisp(cUnit, rSELF, offsetof(Thread, threadId), r3); // Get threadId
     // Is lock unheld on lock or held by us (==threadId) on unlock?
     opRegRegImm(cUnit, kOpAnd, r7, r2,
                 (LW_HASH_STATE_MASK << LW_HASH_STATE_SHIFT));
@@ -325,6 +324,7 @@
     loadConstant(cUnit, r3, (int) (cUnit->method->insns + mir->offset));
 
     LOAD_FUNC_ADDR(cUnit, r7, (int)dvmUnlockObject);
+    genRegCopy(cUnit, r0, rSELF);
     // Export PC (part 2)
     newLIR3(cUnit, kThumb2StrRRI8Predec, r3, rFP,
             sizeof(StackSaveArea) -
diff --git a/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c b/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
index c857fa5..076f5f1 100644
--- a/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
@@ -74,9 +74,9 @@
      * EA is calculated by doing "Rn + imm5 << 2". Make sure that the last
      * offset from the struct is less than 128.
      */
-    if ((offsetof(InterpState, jitToInterpEntries) +
+    if ((offsetof(Thread, jitToInterpEntries) +
          sizeof(struct JitToInterpEntries)) >= 128) {
-        LOGE("InterpState.jitToInterpEntries size overflow");
+        LOGE("Thread.jitToInterpEntries size overflow");
         dvmAbort();
     }
 
diff --git a/vm/compiler/codegen/arm/armv5te/ArchVariant.c b/vm/compiler/codegen/arm/armv5te/ArchVariant.c
index 0f16839..73d27f9 100644
--- a/vm/compiler/codegen/arm/armv5te/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv5te/ArchVariant.c
@@ -74,9 +74,9 @@
      * EA is calculated by doing "Rn + imm5 << 2". Make sure that the last
      * offset from the struct is less than 128.
      */
-    if ((offsetof(InterpState, jitToInterpEntries) +
+    if ((offsetof(Thread, jitToInterpEntries) +
          sizeof(struct JitToInterpEntries)) >= 128) {
-        LOGE("InterpState.jitToInterpEntries size overflow");
+        LOGE("Thread.jitToInterpEntries size overflow");
         dvmAbort();
     }
 
diff --git a/vm/compiler/codegen/arm/armv7-a-neon/ArchVariant.c b/vm/compiler/codegen/arm/armv7-a-neon/ArchVariant.c
index 3df1095..bcd6a46 100644
--- a/vm/compiler/codegen/arm/armv7-a-neon/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv7-a-neon/ArchVariant.c
@@ -69,9 +69,9 @@
      * EA is calculated by doing "Rn + imm5 << 2". Make sure that the last
      * offset from the struct is less than 128.
      */
-    if ((offsetof(InterpState, jitToInterpEntries) +
+    if ((offsetof(Thread, jitToInterpEntries) +
          sizeof(struct JitToInterpEntries)) >= 128) {
-        LOGE("InterpState.jitToInterpEntries size overflow");
+        LOGE("Thread.jitToInterpEntries size overflow");
         dvmAbort();
     }
 
diff --git a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
index 3df1095..bcd6a46 100644
--- a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
@@ -69,9 +69,9 @@
      * EA is calculated by doing "Rn + imm5 << 2". Make sure that the last
      * offset from the struct is less than 128.
      */
-    if ((offsetof(InterpState, jitToInterpEntries) +
+    if ((offsetof(Thread, jitToInterpEntries) +
          sizeof(struct JitToInterpEntries)) >= 128) {
-        LOGE("InterpState.jitToInterpEntries size overflow");
+        LOGE("Thread.jitToInterpEntries size overflow");
         dvmAbort();
     }
 
diff --git a/vm/compiler/codegen/x86/Assemble.c b/vm/compiler/codegen/x86/Assemble.c
index dbb9b02..d583001 100644
--- a/vm/compiler/codegen/x86/Assemble.c
+++ b/vm/compiler/codegen/x86/Assemble.c
@@ -99,7 +99,7 @@
  *      next safe point.
  */
 const Method *dvmJitToPatchPredictedChain(const Method *method,
-                                          InterpState *interpState,
+                                          Thread *self,
                                           PredictedChainingCell *cell,
                                           const ClassObject *clazz)
 {
diff --git a/vm/compiler/codegen/x86/CodegenDriver.c b/vm/compiler/codegen/x86/CodegenDriver.c
index e440e37..a5ef56a 100644
--- a/vm/compiler/codegen/x86/CodegenDriver.c
+++ b/vm/compiler/codegen/x86/CodegenDriver.c
@@ -46,7 +46,7 @@
     loadConstant(cUnit, rPC, (int)(cUnit->method->insns + offset));
     loadWordDisp(cUnit, rEBP, 0, rECX);  // Get glue
     loadWordDisp(cUnit, rECX,
-                 offsetof(InterpState, jitToInterpEntries.dvmJitToInterpPunt),
+                 offsetof(Thread, jitToInterpEntries.dvmJitToInterpPunt),
                  rEAX);
     opReg(cUnit, kOpUncondBr, rEAX);
 }
@@ -68,7 +68,7 @@
        genPuntToInterp(cUnit, mir->offset);
        return;
     }
-    int entryAddr = offsetof(InterpState,
+    int entryAddr = offsetof(Thread,
                              jitToInterpEntries.dvmJitToInterpSingleStep);
     loadWordDisp(cUnit, rEBP, 0, rECX);  // Get glue
     loadWordDisp(cUnit, rECX, entryAddr, rEAX); // rEAX<- entry address
diff --git a/vm/compiler/codegen/x86/ia32/ArchVariant.c b/vm/compiler/codegen/x86/ia32/ArchVariant.c
index 931189f..4ccd56f 100644
--- a/vm/compiler/codegen/x86/ia32/ArchVariant.c
+++ b/vm/compiler/codegen/x86/ia32/ArchVariant.c
@@ -74,7 +74,7 @@
      * EA is calculated by doing "Rn + imm5 << 2", make sure that the last
      * offset from the struct is less than 128.
      */
-    assert((offsetof(InterpState, jitToInterpEntries) +
+    assert((offsetof(Thread, jitToInterpEntries) +
             sizeof(struct JitToInterpEntries)) <= 128);
     return true;
 }