Experimental x86 JIT trace selection

Experimental support for trace selection for x86 host mode operation.
Not enabled by default; turned on by setting both WITH_HOST_DALVIK and
WITH_JIT to true.  When enabled, the VM profiles execution in the x86
fast interpreter, selects hot traces, and "compiles" each trace into
code that simply jumps back to the interpreter.

First in a series of experimental x86 support checkins.
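
To make the mechanism concrete, here is a minimal sketch (not part of
this change) of what "compiling" a trace currently amounts to on x86:
the trace's JitTable entry is simply pointed at the shared interpret
template, mirroring the #if 0 block added to Compiler.c below.  The
helper name installInterpretStub is hypothetical; the two dvm* calls
are the ones used in the diff.

    /* Hypothetical helper, shown only to illustrate the mechanism. */
    static void installInterpretStub(const u2 *dalvikPC,
                                     JitInstructionSetType instructionSet)
    {
        /* Shared stub that just bounces back into the interpreter. */
        void *stub = dvmCompilerGetInterpretTemplate();

        /* Publish the stub as the "translation" for this Dalvik PC. */
        dvmJitSetCodeAddr(dalvikPC, stub, instructionSet);
    }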

Change-Id: I0e423ec58a7bf01f226cb486f55de2841fab1002
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 8dd8adc..fe42f4c 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -627,6 +627,11 @@
                         compileOK = dvmCompilerDoWork(&work);
                     }
                     if (aborted || !compileOK) {
+#if 0 // for x86 JIT testing
+                        dvmJitSetCodeAddr(work.pc,
+                                          dvmCompilerGetInterpretTemplate(),
+                                          work.result.instructionSet);
+#endif
                         dvmCompilerArenaReset();
                     } else if (!work.result.discardResult &&
                                work.result.codeAddress) {
diff --git a/vm/compiler/codegen/x86/Assemble.c b/vm/compiler/codegen/x86/Assemble.c
index fbf53ca..31264ce 100644
--- a/vm/compiler/codegen/x86/Assemble.c
+++ b/vm/compiler/codegen/x86/Assemble.c
@@ -34,8 +34,6 @@
 #endif
 
 /*
- * FIXME - redo for x86
- *
  * Translation layout in the code cache.  Note that the codeAddress pointer
  * in JitTable will point directly to the code body (field codeAddress).  The
  * chain cell offset codeAddress - 2, and (if present) executionCount is at
@@ -52,7 +50,7 @@
  *   |  .                            .
  *   |  |                            |
  *   |  +----------------------------+
- *   |  | Chaining Cells             |  -> 12/16 bytes each, must be 4 byte aligned
+ *   |  | Chaining Cells             |  -> 16 bytes each, 8-byte aligned
  *   |  .                            .
  *   |  .                            .
  *   |  |                            |
@@ -66,8 +64,8 @@
  *      |                            |
  *      +----------------------------+
  *      | Literal pool               |  -> 4-byte aligned, variable size
- *      .                            .
- *      .                            .
+ *      .                            .     Note: for x86, literals will
+ *      .                            .     generally appear inline.
  *      |                            |
  *      +----------------------------+
  *
diff --git a/vm/compiler/codegen/x86/CodegenDriver.c b/vm/compiler/codegen/x86/CodegenDriver.c
index 69f637e..4a5d481 100644
--- a/vm/compiler/codegen/x86/CodegenDriver.c
+++ b/vm/compiler/codegen/x86/CodegenDriver.c
@@ -24,10 +24,63 @@
  * applicable directory below this one.
  */
 
+extern X86LIR *loadConstant(CompilationUnit *cUnit, int rDest, int value);
+extern X86LIR *loadWordDisp(CompilationUnit *cUnit, int rBase,
+                            int displacement, int rDest);
+extern void dvmCompilerFlushAllRegs(CompilationUnit *cUnit);
+extern void storeWordDisp(CompilationUnit *cUnit, int rBase,
+                          int displacement, int rSrc);
+extern X86LIR *opReg(CompilationUnit *cUnit, OpKind op, int rDestSrc);
+
 static int opcodeCoverage[kNumPackedOpcodes];
 static intptr_t templateEntryOffsets[TEMPLATE_LAST_MARK];
 
 /*
+ * Bail to the interpreter.  Will not return to this trace.
+ * On entry, rPC must be set correctly.
+ */
+static void genPuntToInterp(CompilationUnit *cUnit, unsigned int offset)
+{
+    dvmCompilerFlushAllRegs(cUnit);
+    loadConstant(cUnit, rPC, (int)(cUnit->method->insns + offset));
+    loadWordDisp(cUnit, rEBP, 0, rECX);  // Get glue
+    loadWordDisp(cUnit, rECX,
+                 offsetof(InterpState, jitToInterpEntries.dvmJitToInterpPunt),
+                 rEAX);
+    opReg(cUnit, kOpUncondBr, rEAX);
+}
+
+static void genInterpSingleStep(CompilationUnit *cUnit, MIR *mir)
+{
+    int flags = dexGetFlagsFromOpcode(mir->dalvikInsn.opcode);
+    int flagsToCheck = kInstrCanBranch | kInstrCanSwitch | kInstrCanReturn |
+                       kInstrCanThrow;
+
+    //If already optimized out, just ignore
+    if (mir->dalvikInsn.opcode == OP_NOP)
+        return;
+
+    //Ugly, but necessary.  Flush all Dalvik regs so Interp can find them
+    dvmCompilerFlushAllRegs(cUnit);
+
+    if ((mir->next == NULL) || (flags & flagsToCheck)) {
+       genPuntToInterp(cUnit, mir->offset);
+       return;
+    }
+    int entryAddr = offsetof(InterpState,
+                             jitToInterpEntries.dvmJitToInterpSingleStep);
+    loadWordDisp(cUnit, rEBP, 0, rECX);  // Get glue
+    loadWordDisp(cUnit, rECX, entryAddr, rEAX); // rEAX<- entry address
+    /* rPC = dalvik pc */
+    loadConstant(cUnit, rPC, (int) (cUnit->method->insns + mir->offset));
+    /* rECX = dalvik pc of following instruction */
+    loadConstant(cUnit, rECX, (int) (cUnit->method->insns + mir->next->offset));
+    /* Pass on the stack */
+    storeWordDisp(cUnit, rESP, OUT_ARG0, rECX);
+    opReg(cUnit, kOpCall, rEAX);
+}
+
+/*
  * The following are the first-level codegen routines that analyze the format
  * of each bytecode then either dispatch special purpose codegen routines
  * or produce corresponding Thumb instructions directly.
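
The bailout rule in genInterpSingleStep above can be read as a small
predicate.  The sketch below is illustrative only: the helper name is
hypothetical, and it assumes the Dalvik MIR type and kInstrCan* flag
bits referenced in the code above.

    /* Single-stepping needs a known following instruction to hand
     * control back to, so trace-ending or control-transferring
     * bytecodes are punted to the interpreter instead. */
    static bool mustPuntInsteadOfSingleStep(const MIR *mir)
    {
        int flags = dexGetFlagsFromOpcode(mir->dalvikInsn.opcode);
        int risky = kInstrCanBranch | kInstrCanSwitch |
                    kInstrCanReturn | kInstrCanThrow;
        return (mir->next == NULL) || ((flags & risky) != 0);
    }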
diff --git a/vm/compiler/codegen/x86/X86LIR.h b/vm/compiler/codegen/x86/X86LIR.h
index 62ac447..8acf015 100644
--- a/vm/compiler/codegen/x86/X86LIR.h
+++ b/vm/compiler/codegen/x86/X86LIR.h
@@ -27,7 +27,7 @@
  *     esp is native SP
  *
  * For interpreter:
- *     edx is Dalvik PC (rPC)
+ *     edi is Dalvik PC (rPC)
  *     ebx is rINST
  *
  * For JIT:
@@ -82,8 +82,8 @@
     int nextFPTemp;
     int numCoreRegs;
     RegisterInfo *coreRegs;
-    int numFPRegs;
-    RegisterInfo *FPRegs;
+    int numMMRegs;
+    RegisterInfo *MMRegs;
 } RegisterPool;
 
 typedef enum OpSize {
@@ -99,7 +99,6 @@
 
 typedef enum OpKind {
     kOpMov,
-    kOpMvn,
     kOpCmp,
     kOpLsl,
     kOpLsr,
@@ -114,15 +113,11 @@
     kOpAdc,
     kOpSub,
     kOpSbc,
-    kOpRsub,
     kOpMul,
     kOpDiv,
     kOpRem,
-    kOpBic,
-    kOpCmn,
     kOpTst,
-    kOpBkpt,
-    kOpBlx,
+    kOpCall,
     kOpPush,
     kOpPop,
     kOp2Char,
@@ -132,6 +127,37 @@
     kOpUncondBr,
 } OpKind;
 
+#define FP_REG_OFFSET 8
+
+typedef enum NativeRegisterPool {
+    rEAX = 0,
+    rECX = 1,
+    rEDX = 2,
+    rEBX = 3,
+    rESP = 4,
+    rEBP = 5,
+    rESI = 6,
+    rEDI = 7,
+    rXMM0 = 0 + FP_REG_OFFSET,
+    rXMM1 = 1 + FP_REG_OFFSET,
+    rXMM2 = 2 + FP_REG_OFFSET,
+    rXMM3 = 3 + FP_REG_OFFSET,
+    rXMM4 = 4 + FP_REG_OFFSET,
+    rXMM5 = 5 + FP_REG_OFFSET,
+    rXMM6 = 6 + FP_REG_OFFSET,
+    rXMM7 = 7 + FP_REG_OFFSET,
+} NativeRegisterPool;
+
+#define rPC rEDI
+#define rFP rESI
+#define rINST rEBX
+
+#define OUT_ARG0 0
+#define OUT_ARG1 4
+#define OUT_ARG2 8
+#define OUT_ARG3 12
+#define OUT_ARG4 16
+
 typedef struct X86LIR {
     LIR generic;
     //X86Opcode opcode;
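
Two conventions are implied by the defines above: register numbers 0-7
name the general-purpose registers while FP_REG_OFFSET+0..7 name
XMM0-XMM7, and OUT_ARGn is a byte displacement off rESP used to store
outgoing call arguments (genInterpSingleStep appears to write its
argument to rESP+OUT_ARG0 rather than pushing it).  The helpers below
are hypothetical and merely restate those assumptions.

    /* Hypothetical helpers restating the assumed conventions. */
    static inline bool isXmmReg(int reg)
    {
        return reg >= FP_REG_OFFSET && reg < FP_REG_OFFSET + 8;
    }

    static inline int outArgDisplacement(int n)
    {
        return n * 4;    /* OUT_ARG0 == 0, OUT_ARG1 == 4, ... */
    }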
diff --git a/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S b/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
index 4c98917..68b2d0d 100644
--- a/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
+++ b/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
@@ -1,27 +1,30 @@
     /*
-     * TODO: figure out how best to do this on x86, as we don't have
-     * an lr equivalent and probably don't want to push.
+     * This handler is a bit odd - it may be called via chaining or
+     * from static code and is expected to cause control to flow
+     * to the interpreter.  The problem is where to find the Dalvik
+     * PC of the next instruction.  When called via chaining, the dPC
+     * will be located at *rp.  When called from static code, rPC is
+     * valid and rp is a real return pointer (that should be ignored).
+     * The Arm target deals with this by using the link register as
+     * a flag.  If it is zero, we know we were called from static code.
+     * If non-zero, it points to the chain cell containing dPC.
+     * For x86, we'll infer the source by looking at where rp points.
+     * If it points anywhere within the code cache, we'll assume
+     * we got here via chaining.  Otherwise, we'll assume rPC is valid.
      *
-     * This handler transfers control to the interpeter without performing
-     * any lookups.  It may be called either as part of a normal chaining
-     * operation, or from the transition code in header.S.  We distinquish
-     * the two cases by looking at the link register.  If called from a
-     * translation chain, it will point to the chaining Dalvik PC -3.
      * On entry:
-     *    lr - if NULL:
-     *        r1 - the Dalvik PC to begin interpretation.
-     *    else
-     *        [lr, #3] contains Dalvik PC to begin interpretation
-     *    rGLUE - pointer to interpState
-     *    rFP - Dalvik frame pointer
-     *
-     *cmp     lr, #0
-     *ldrne   r1,[lr, #3]
-     *ldr     r2, .LinterpPunt
-     *mov     r0, r1                       @ set Dalvik PC
-     *bx      r2
-     *@ doesn't return
+     *    (TOS)<- return pointer or pointer to dPC
      */
+     movl   rGLUE,%ecx
+     movl   $$.LinterpPunt,%edx
+     pop    %eax
+     cmpl   %eax,offGlue_jitCacheEnd(%ecx)
+     ja     1f
+     cmpl   %eax,offGlue_jitCacheStart(%ecx)
+     jb     1f
+     movl   %eax,rPC
+1:
+     jmp    *(%edx)
 
 .LinterpPunt:
     .long   dvmJitToInterpPunt
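
The chaining-vs-static check described in the comment above boils down
to a range test on the popped return address.  A sketch in C of the
intended decision, assuming the glue fields behind offGlue_jitCacheStart
and offGlue_jitCacheEnd hold the code cache bounds (the helper name is
hypothetical):

    #include <stdbool.h>
    #include <stdint.h>

    /* A return address inside the code cache means we were reached from
     * a chaining cell, which holds the Dalvik PC to resume at; anything
     * else means rPC was already set up by static code. */
    static bool calledViaChaining(uintptr_t retAddr,
                                  uintptr_t jitCacheStart,
                                  uintptr_t jitCacheEnd)
    {
        return retAddr >= jitCacheStart && retAddr < jitCacheEnd;
    }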
diff --git a/vm/compiler/template/ia32/footer.S b/vm/compiler/template/ia32/footer.S
index 1b1a1ae..d11af69 100644
--- a/vm/compiler/template/ia32/footer.S
+++ b/vm/compiler/template/ia32/footer.S
@@ -7,12 +7,12 @@
     .text
     .align  4
 /*
- * FIXME - need a cacheflush for x86
+ * FIXME - verify that we don't need an explicit cache flush
+ * for x86.
  */
     .global cacheflush
 cacheflush:
-    movl  $$0xdeadf0f0, %eax
-    call *%eax
+    ret
 
 
     .global dmvCompilerTemplateEnd
diff --git a/vm/compiler/template/ia32/header.S b/vm/compiler/template/ia32/header.S
index 57f5a5b..a67ba6e 100644
--- a/vm/compiler/template/ia32/header.S
+++ b/vm/compiler/template/ia32/header.S
@@ -16,6 +16,12 @@
 
 #if defined(WITH_JIT)
 
+/* Subset of defines from mterp/x86/header.S */
+#define rGLUE (%ebp)
+#define rPC   %esi
+#define rFP   %edi
+#define rINST %ebx
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-ia32.S b/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
index 7726e97..1256ee4 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
@@ -23,6 +23,12 @@
 
 #if defined(WITH_JIT)
 
+/* Subset of defines from mterp/x86/header.S */
+#define rGLUE (%ebp)
+#define rPC   %esi
+#define rFP   %edi
+#define rINST %ebx
+
 /*
  * This is a #include, not a %include, because we want the C pre-processor
  * to expand the macros into assembler assignment statements.
@@ -51,29 +57,32 @@
 dvmCompiler_TEMPLATE_INTERPRET:
 /* File: ia32/TEMPLATE_INTERPRET.S */
     /*
-     * TODO: figure out how best to do this on x86, as we don't have
-     * an lr equivalent and probably don't want to push.
+     * This handler is a bit odd - it may be called via chaining or
+     * from static code and is expected to cause control to flow
+     * to the interpreter.  The problem is where to find the Dalvik
+     * PC of the next instruction.  When called via chaining, the dPC
+     * will be located at *rp.  When called from static code, rPC is
+     * valid and rp is a real return pointer (that should be ignored).
+     * The Arm target deals with this by using the link register as
+     * a flag.  If it is zero, we know we were called from static code.
+     * If non-zero, it points to the chain cell containing dPC.
+     * For x86, we'll infer the source by looking at where rp points.
+     * If it points anywhere within the code cache, we'll assume
+     * we got here via chaining.  Otherwise, we'll assume rPC is valid.
      *
-     * This handler transfers control to the interpeter without performing
-     * any lookups.  It may be called either as part of a normal chaining
-     * operation, or from the transition code in header.S.  We distinquish
-     * the two cases by looking at the link register.  If called from a
-     * translation chain, it will point to the chaining Dalvik PC -3.
      * On entry:
-     *    lr - if NULL:
-     *        r1 - the Dalvik PC to begin interpretation.
-     *    else
-     *        [lr, #3] contains Dalvik PC to begin interpretation
-     *    rGLUE - pointer to interpState
-     *    rFP - Dalvik frame pointer
-     *
-     *cmp     lr, #0
-     *ldrne   r1,[lr, #3]
-     *ldr     r2, .LinterpPunt
-     *mov     r0, r1                       @ set Dalvik PC
-     *bx      r2
-     *@ doesn't return
+     *    (TOS)<- return pointer or pointer to dPC
      */
+     movl   rGLUE,%ecx
+     movl   $.LinterpPunt,%edx
+     pop    %eax
+     cmpl   %eax,offGlue_jitCacheEnd(%ecx)
+     ja     1f
+     cmpl   %eax,offGlue_jitCacheStart(%ecx)
+     jb     1f
+     movl   %eax,rPC
+1:
+     jmp    *(%edx)
 
 .LinterpPunt:
     .long   dvmJitToInterpPunt
@@ -89,12 +98,12 @@
     .text
     .align  4
 /*
- * FIXME - need a cacheflush for x86
+ * FIXME - verify that we don't need an explicit cache flush
+ * for x86.
  */
     .global cacheflush
 cacheflush:
-    movl  $0xdeadf0f0, %eax
-    call *%eax
+    ret
 
 
     .global dmvCompilerTemplateEnd