Introduce "just interpret" chainable pseudo-translation.

This is the first step towards enabling translation & self-cosim stress modes.
When trace selection begins, the trace head address is pinned and
remains in a limbo state until the translation is complete.  Previously,
if the trace selection aborted for any reason, the trace head would remain
forever in limbo.  This was not a correctness problem, but caused some
small performance anomalies and made life more difficult for self-cosimulation
mode.

This CL introduces a pseudo-translation that simply routes control to
the interpreter.  When we detect that a trace selection attempt has
failed, the trace head is associated with this fully-chainable
pseudo-translation.  This also has the benefit for self-cosimulation that
we are guaranteed forward progress.
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index bc39479..8e977c1 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -1259,16 +1259,28 @@
         info->codeAddress = (char*)info->codeAddress + 1;
 }
 
-static u4 assembleBXPair(int branchOffset)
+/*
+ * Returns the skeleton bit pattern associated with an opcode.  All
+ * variable fields are zeroed.
+ */
+static u4 getSkeleton(ArmOpCode op)
+{
+    return EncodingMap[op].skeleton;
+}
+
+static u4 assembleChainingBranch(int branchOffset, bool thumbTarget)
 {
     u4 thumb1, thumb2;
 
-    if ((branchOffset < -2048) | (branchOffset > 2046)) {
-        thumb1 =  (0xf000 | ((branchOffset>>12) & 0x7ff));
-        thumb2 =  (0xf800 | ((branchOffset>> 1) & 0x7ff));
+    if (!thumbTarget) {
+        thumb1 =  (getSkeleton(kThumbBlx1) | ((branchOffset>>12) & 0x7ff));
+        thumb2 =  (getSkeleton(kThumbBlx2) | ((branchOffset>> 1) & 0x7ff));
+    } else if ((branchOffset < -2048) | (branchOffset > 2046)) {
+        thumb1 =  (getSkeleton(kThumbBl1) | ((branchOffset>>12) & 0x7ff));
+        thumb2 =  (getSkeleton(kThumbBl2) | ((branchOffset>> 1) & 0x7ff));
     } else {
-        thumb1 =  (0xe000 | ((branchOffset>> 1) & 0x7ff));
-        thumb2 =  0x4300;  /* nop -> or r0, r0 */
+        thumb1 =  (getSkeleton(kThumbBUncond) | ((branchOffset>> 1) & 0x7ff));
+        thumb2 =  getSkeleton(kThumbOrr);  /* nop -> or r0, r0 */
     }
 
     return thumb2<<16 | thumb1;
@@ -1278,7 +1290,8 @@
  * Perform translation chain operation.
  * For ARM, we'll use a pair of thumb instructions to generate
  * an unconditional chaining branch of up to 4MB in distance.
- * Use a BL, though we don't really need the link.  The format is
+ * Use a BL, because the generic "interpret" translation needs
+ * the link register to find the dalvik pc of the target.
  *     111HHooooooooooo
  * Where HH is 10 for the 1st inst, and 11 for the second and
  * the "o" field is each instruction's 11-bit contribution to the
@@ -1291,6 +1304,7 @@
     int baseAddr = (u4) branchAddr + 4;
     int branchOffset = (int) tgtAddr - baseAddr;
     u4 newInst;
+    bool thumbTarget;
 
     if (gDvm.sumThreadSuspendCount == 0) {
         assert((branchOffset >= -(1<<22)) && (branchOffset <= ((1<<22)-2)));
@@ -1301,7 +1315,16 @@
             LOGD("Jit Runtime: chaining 0x%x to 0x%x\n",
                  (int) branchAddr, (int) tgtAddr & -2));
 
-        newInst = assembleBXPair(branchOffset);
+        /*
+         * NOTE: normally, all translations are Thumb[2] mode, with
+         * a single exception: the default TEMPLATE_INTERPRET
+         * pseudo-translation.  If the need ever arises to
+         * mix Arm & Thumb[2] translations, the following code should be
+         * generalized.
+         */
+        thumbTarget = (tgtAddr != gDvmJit.interpretTemplate);
+
+        newInst = assembleChainingBranch(branchOffset, thumbTarget);
 
         *branchAddr = newInst;
         cacheflush((long)branchAddr, (long)branchAddr + 4, 0);
@@ -1354,7 +1377,7 @@
      * Compilation not made yet for the callee. Reset the counter to a small
      * value and come back to check soon.
      */
-    if (tgtAddr == 0) {
+    if ((tgtAddr == 0) || ((void*)tgtAddr == gDvmJit.interpretTemplate)) {
         /*
          * Wait for a few invocations (currently set to be 16) before trying
          * to setup the chain again.
@@ -1388,7 +1411,7 @@
              clazz->descriptor,
              method->name));
 
-    cell->branch = assembleBXPair(branchOffset);
+    cell->branch = assembleChainingBranch(branchOffset, true);
     cell->clazz = clazz;
     cell->method = method;
     /*
@@ -1517,7 +1540,9 @@
         dvmLockMutex(&gDvmJit.tableLock);
         for (i = 0; i < gDvmJit.jitTableSize; i++) {
             if (gDvmJit.pJitEntryTable[i].dPC &&
-                   gDvmJit.pJitEntryTable[i].codeAddress) {
+                   gDvmJit.pJitEntryTable[i].codeAddress &&
+                   (gDvmJit.pJitEntryTable[i].codeAddress !=
+                    gDvmJit.interpretTemplate)) {
                 u4* lastAddress;
                 lastAddress =
                       dvmJitUnchain(gDvmJit.pJitEntryTable[i].codeAddress);
@@ -1573,6 +1598,10 @@
         LOGD("TRACEPROFILE 0x%08x 0 NULL 0 0", (int)traceBase);
         return 0;
     }
+    if (p->codeAddress == gDvmJit.interpretTemplate) {
+        LOGD("TRACEPROFILE 0x%08x 0 INTERPRET_ONLY  0 0", (int)traceBase);
+        return 0;
+    }
 
     pExecutionCount = (u4*) (traceBase);
     pCellOffset = (u2*) (traceBase + 4);
diff --git a/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c b/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
index 8c61322..3a46cac 100644
--- a/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv5te-vfp/ArchVariant.c
@@ -53,6 +53,12 @@
             (int) gDvmJit.codeCache + templateEntryOffsets[opCode]);
 }
 
+void *dvmCompilerGetInterpretTemplate()
+{
+    return (void*) ((int)gDvmJit.codeCache +
+                    templateEntryOffsets[TEMPLATE_INTERPRET]);
+}
+
 /* Architecture-specific initializations and checks go here */
 static bool compilerArchVariantInit(void)
 {
diff --git a/vm/compiler/codegen/arm/armv5te/ArchVariant.c b/vm/compiler/codegen/arm/armv5te/ArchVariant.c
index a4b9ae3..4178c23 100644
--- a/vm/compiler/codegen/arm/armv5te/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv5te/ArchVariant.c
@@ -90,6 +90,12 @@
     return true;
 }
 
+void *dvmCompilerGetInterpretTemplate()
+{
+    return (void*) ((int)gDvmJit.codeCache +
+                    templateEntryOffsets[TEMPLATE_INTERPRET]);
+}
+
 static bool genInlineSqrt(CompilationUnit *cUnit, MIR *mir)
 {
     return false;   /* punt to C handler */
diff --git a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
index 0409135..02b9b79 100644
--- a/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
+++ b/vm/compiler/codegen/arm/armv7-a/ArchVariant.c
@@ -54,6 +54,12 @@
             (int) gDvmJit.codeCache + templateEntryOffsets[opCode]);
 }
 
+void *dvmCompilerGetInterpretTemplate()
+{
+    return (void*) ((int)gDvmJit.codeCache +
+                    templateEntryOffsets[TEMPLATE_INTERPRET]);
+}
+
 /* Architecture-specific initializations and checks go here */
 static bool compilerArchVariantInit(void)
 {
diff --git a/vm/compiler/template/armv5te-vfp/TemplateOpList.h b/vm/compiler/template/armv5te-vfp/TemplateOpList.h
index 1608920..d414e1b 100644
--- a/vm/compiler/template/armv5te-vfp/TemplateOpList.h
+++ b/vm/compiler/template/armv5te-vfp/TemplateOpList.h
@@ -55,3 +55,4 @@
 JIT_TEMPLATE(RESTORE_STATE)
 JIT_TEMPLATE(STRING_COMPARETO)
 JIT_TEMPLATE(STRING_INDEXOF)
+JIT_TEMPLATE(INTERPRET)
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INTERPRET.S b/vm/compiler/template/armv5te/TEMPLATE_INTERPRET.S
new file mode 100644
index 0000000..5484400
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_INTERPRET.S
@@ -0,0 +1,23 @@
+    /*
+     * This handler transfers control to the interpeter without performing
+     * any lookups.  It may be called either as part of a normal chaining
+     * operation, or from the transition code in header.S.  We distinquish
+     * the two cases by looking at the link register.  If called from a
+     * translation chain, it will point to the chaining Dalvik PC + 1.
+     * On entry:
+     *    lr - if NULL:
+     *        r1 - the Dalvik PC to begin interpretation.
+     *    else
+     *        [lr, #-1] contains Dalvik PC to begin interpretation
+     *    rGLUE - pointer to interpState
+     *    rFP - Dalvik frame pointer
+     */
+    cmp     lr, #0
+    ldrne   r1,[lr, #-1]
+    ldr     r2, .LinterpPunt
+    mov     r0, r1                       @ set Dalvik PC
+    bx      r2
+    @ doesn't return
+
+.LinterpPunt:
+    .word   dvmJitToInterpPunt
diff --git a/vm/compiler/template/armv5te/TemplateOpList.h b/vm/compiler/template/armv5te/TemplateOpList.h
index 88cc60a..1b5e6ea 100644
--- a/vm/compiler/template/armv5te/TemplateOpList.h
+++ b/vm/compiler/template/armv5te/TemplateOpList.h
@@ -40,3 +40,4 @@
 JIT_TEMPLATE(RESTORE_STATE)
 JIT_TEMPLATE(STRING_COMPARETO)
 JIT_TEMPLATE(STRING_INDEXOF)
+JIT_TEMPLATE(INTERPRET)
diff --git a/vm/compiler/template/armv5te/header.S b/vm/compiler/template/armv5te/header.S
index 9651032..c257105 100644
--- a/vm/compiler/template/armv5te/header.S
+++ b/vm/compiler/template/armv5te/header.S
@@ -85,6 +85,9 @@
 #define SAVEAREA_FROM_FP(_reg, _fpreg) \
     sub     _reg, _fpreg, #sizeofStackSaveArea
 
+#define EXPORT_PC() \
+    str     rPC, [rFP, #(-sizeofStackSaveArea + offStackSaveArea_currentPc)]
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
diff --git a/vm/compiler/template/armv7-a/TemplateOpList.h b/vm/compiler/template/armv7-a/TemplateOpList.h
index 1608920..d414e1b 100644
--- a/vm/compiler/template/armv7-a/TemplateOpList.h
+++ b/vm/compiler/template/armv7-a/TemplateOpList.h
@@ -55,3 +55,4 @@
 JIT_TEMPLATE(RESTORE_STATE)
 JIT_TEMPLATE(STRING_COMPARETO)
 JIT_TEMPLATE(STRING_INDEXOF)
+JIT_TEMPLATE(INTERPRET)
diff --git a/vm/compiler/template/config-armv5te-vfp b/vm/compiler/template/config-armv5te-vfp
index b5ca397..fc968fe 100644
--- a/vm/compiler/template/config-armv5te-vfp
+++ b/vm/compiler/template/config-armv5te-vfp
@@ -45,6 +45,7 @@
     op TEMPLATE_THROW_EXCEPTION_COMMON armv5te
     op TEMPLATE_STRING_COMPARETO armv5te
     op TEMPLATE_STRING_INDEXOF armv5te
+    op TEMPLATE_INTERPRET armv5te
 
 op-end
 
diff --git a/vm/compiler/template/config-armv7-a b/vm/compiler/template/config-armv7-a
index 1d3d331..7f7b478 100644
--- a/vm/compiler/template/config-armv7-a
+++ b/vm/compiler/template/config-armv7-a
@@ -45,6 +45,7 @@
     op TEMPLATE_THROW_EXCEPTION_COMMON armv5te
     op TEMPLATE_STRING_COMPARETO armv5te
     op TEMPLATE_STRING_INDEXOF armv5te
+    op TEMPLATE_INTERPRET armv5te
 
 op-end
 
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index cc86848..6604773 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -92,6 +92,9 @@
 #define SAVEAREA_FROM_FP(_reg, _fpreg) \
     sub     _reg, _fpreg, #sizeofStackSaveArea
 
+#define EXPORT_PC() \
+    str     rPC, [rFP, #(-sizeofStackSaveArea + offStackSaveArea_currentPc)]
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
@@ -1296,6 +1299,35 @@
     bx    lr
 
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INTERPRET
+dvmCompiler_TEMPLATE_INTERPRET:
+/* File: armv5te/TEMPLATE_INTERPRET.S */
+    /*
+     * This handler transfers control to the interpeter without performing
+     * any lookups.  It may be called either as part of a normal chaining
+     * operation, or from the transition code in header.S.  We distinquish
+     * the two cases by looking at the link register.  If called from a
+     * translation chain, it will point to the chaining Dalvik PC + 1.
+     * On entry:
+     *    lr - if NULL:
+     *        r1 - the Dalvik PC to begin interpretation.
+     *    else
+     *        [lr, #-1] contains Dalvik PC to begin interpretation
+     *    rGLUE - pointer to interpState
+     *    rFP - Dalvik frame pointer
+     */
+    cmp     lr, #0
+    ldrne   r1,[lr, #-1]
+    ldr     r2, .LinterpPunt
+    mov     r0, r1                       @ set Dalvik PC
+    bx      r2
+    @ doesn't return
+
+.LinterpPunt:
+    .word   dvmJitToInterpPunt
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index fbfaf86..cee118b 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -92,6 +92,9 @@
 #define SAVEAREA_FROM_FP(_reg, _fpreg) \
     sub     _reg, _fpreg, #sizeofStackSaveArea
 
+#define EXPORT_PC() \
+    str     rPC, [rFP, #(-sizeofStackSaveArea + offStackSaveArea_currentPc)]
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
@@ -1021,6 +1024,35 @@
     bx    lr
 
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INTERPRET
+dvmCompiler_TEMPLATE_INTERPRET:
+/* File: armv5te/TEMPLATE_INTERPRET.S */
+    /*
+     * This handler transfers control to the interpeter without performing
+     * any lookups.  It may be called either as part of a normal chaining
+     * operation, or from the transition code in header.S.  We distinquish
+     * the two cases by looking at the link register.  If called from a
+     * translation chain, it will point to the chaining Dalvik PC + 1.
+     * On entry:
+     *    lr - if NULL:
+     *        r1 - the Dalvik PC to begin interpretation.
+     *    else
+     *        [lr, #-1] contains Dalvik PC to begin interpretation
+     *    rGLUE - pointer to interpState
+     *    rFP - Dalvik frame pointer
+     */
+    cmp     lr, #0
+    ldrne   r1,[lr, #-1]
+    ldr     r2, .LinterpPunt
+    mov     r0, r1                       @ set Dalvik PC
+    bx      r2
+    @ doesn't return
+
+.LinterpPunt:
+    .word   dvmJitToInterpPunt
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index 4d479da..aab5067 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -92,6 +92,9 @@
 #define SAVEAREA_FROM_FP(_reg, _fpreg) \
     sub     _reg, _fpreg, #sizeofStackSaveArea
 
+#define EXPORT_PC() \
+    str     rPC, [rFP, #(-sizeofStackSaveArea + offStackSaveArea_currentPc)]
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
@@ -1296,6 +1299,35 @@
     bx    lr
 
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INTERPRET
+dvmCompiler_TEMPLATE_INTERPRET:
+/* File: armv5te/TEMPLATE_INTERPRET.S */
+    /*
+     * This handler transfers control to the interpeter without performing
+     * any lookups.  It may be called either as part of a normal chaining
+     * operation, or from the transition code in header.S.  We distinquish
+     * the two cases by looking at the link register.  If called from a
+     * translation chain, it will point to the chaining Dalvik PC + 1.
+     * On entry:
+     *    lr - if NULL:
+     *        r1 - the Dalvik PC to begin interpretation.
+     *    else
+     *        [lr, #-1] contains Dalvik PC to begin interpretation
+     *    rGLUE - pointer to interpState
+     *    rFP - Dalvik frame pointer
+     */
+    cmp     lr, #0
+    ldrne   r1,[lr, #-1]
+    ldr     r2, .LinterpPunt
+    mov     r0, r1                       @ set Dalvik PC
+    bx      r2
+    @ doesn't return
+
+.LinterpPunt:
+    .word   dvmJitToInterpPunt
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*