Experimental x86 Jit trace selection

Experimental support for trace selection in x86 host-mode operation.
Not enabled by default; turn it on by setting both WITH_HOST_DALVIK and
WITH_JIT to true.  When enabled, the VM profiles execution in the x86
fast interpreter, selects hot traces, and "compiles" each one into a
trace that consists only of jumps back to the interpreter.

First in a series of experimental x86 support checkins.

Change-Id: I0e423ec58a7bf01f226cb486f55de2841fab1002
diff --git a/vm/compiler/codegen/x86/Assemble.c b/vm/compiler/codegen/x86/Assemble.c
index fbf53ca..31264ce 100644
--- a/vm/compiler/codegen/x86/Assemble.c
+++ b/vm/compiler/codegen/x86/Assemble.c
@@ -34,8 +34,6 @@
 #endif
 
 /*
- * FIXME - redo for x86
- *
  * Translation layout in the code cache.  Note that the codeAddress pointer
  * in JitTable will point directly to the code body (field codeAddress).  The
  * chain cell offset codeAddress - 2, and (if present) executionCount is at
@@ -52,7 +50,7 @@
  *   |  .                            .
  *   |  |                            |
  *   |  +----------------------------+
- *   |  | Chaining Cells             |  -> 12/16 bytes each, must be 4 byte aligned
+ *   |  | Chaining Cells             |  -> 16 bytes each, 8 byte aligned
  *   |  .                            .
  *   |  .                            .
  *   |  |                            |
@@ -66,8 +64,8 @@
  *      |                            |
  *      +----------------------------+
  *      | Literal pool               |  -> 4-byte aligned, variable size
- *      .                            .
- *      .                            .
+ *      .                            .     Note: for x86, literals will
+ *      .                            .     generally appear inline.
  *      |                            |
  *      +----------------------------+
  *
diff --git a/vm/compiler/codegen/x86/CodegenDriver.c b/vm/compiler/codegen/x86/CodegenDriver.c
index 69f637e..4a5d481 100644
--- a/vm/compiler/codegen/x86/CodegenDriver.c
+++ b/vm/compiler/codegen/x86/CodegenDriver.c
@@ -24,10 +24,63 @@
  * applicable directory below this one.
  */
 
+extern X86LIR *loadConstant(CompilationUnit *cUnit, int rDest, int value);
+extern X86LIR *loadWordDisp(CompilationUnit *cUnit, int rBase,
+                            int displacement, int rDest);
+extern void dvmCompilerFlushAllRegs(CompilationUnit *cUnit);
+extern void storeWordDisp(CompilationUnit *cUnit, int rBase,
+                          int displacement, int rSrc);
+extern X86LIR *opReg(CompilationUnit *cUnit, OpKind op, int rDestSrc);
+
 static int opcodeCoverage[kNumPackedOpcodes];
 static intptr_t templateEntryOffsets[TEMPLATE_LAST_MARK];
 
 /*
+ * Bail to the interpreter.  Will not return to this trace.
+ * Sets rPC to the Dalvik PC at "offset", as the punt entry requires.
+ */
+static void genPuntToInterp(CompilationUnit *cUnit, unsigned int offset)
+{
+    dvmCompilerFlushAllRegs(cUnit);
+    loadConstant(cUnit, rPC, (int)(cUnit->method->insns + offset));
+    loadWordDisp(cUnit, rEBP, 0, rECX);  // rECX <- glue, kept at [rEBP + 0]
+    loadWordDisp(cUnit, rECX,
+                 offsetof(InterpState, jitToInterpEntries.dvmJitToInterpPunt),
+                 rEAX);  // rEAX <- punt entry address
+    opReg(cUnit, kOpUncondBr, rEAX);  // Jump rather than call: no return
+}
+
+static void genInterpSingleStep(CompilationUnit *cUnit, MIR *mir)
+{
+    int flags = dexGetFlagsFromOpcode(mir->dalvikInsn.opcode);
+    int flagsToCheck = kInstrCanBranch | kInstrCanSwitch | kInstrCanReturn |
+                       kInstrCanThrow;
+
+    // NOP (e.g. already optimized out): nothing to emit
+    if (mir->dalvikInsn.opcode == OP_NOP)
+        return;
+
+    // Ugly, but necessary.  Flush all Dalvik regs so Interp can find them
+    dvmCompilerFlushAllRegs(cUnit);
+
+    if ((mir->next == NULL) || (flags & flagsToCheck)) {
+       genPuntToInterp(cUnit, mir->offset);
+       return;
+    }
+    int entryAddr = offsetof(InterpState,
+                             jitToInterpEntries.dvmJitToInterpSingleStep);
+    loadWordDisp(cUnit, rEBP, 0, rECX);  // rECX <- glue, kept at [rEBP + 0]
+    loadWordDisp(cUnit, rECX, entryAddr, rEAX); // rEAX <- entry address
+    /* rPC = dalvik pc of this instruction */
+    loadConstant(cUnit, rPC, (int) (cUnit->method->insns + mir->offset));
+    /* rECX = dalvik pc of following instruction (glue no longer needed) */
+    loadConstant(cUnit, rECX, (int) (cUnit->method->insns + mir->next->offset));
+    /* Pass the following pc on the stack as outgoing argument 0 */
+    storeWordDisp(cUnit, rESP, OUT_ARG0, rECX);
+    opReg(cUnit, kOpCall, rEAX);
+}
+
+/*
  * The following are the first-level codegen routines that analyze the format
  * of each bytecode then either dispatch special purpose codegen routines
  * or produce corresponding Thumb instructions directly.
diff --git a/vm/compiler/codegen/x86/X86LIR.h b/vm/compiler/codegen/x86/X86LIR.h
index 62ac447..8acf015 100644
--- a/vm/compiler/codegen/x86/X86LIR.h
+++ b/vm/compiler/codegen/x86/X86LIR.h
@@ -27,7 +27,7 @@
  *     esp is native SP
  *
  * For interpreter:
- *     edx is Dalvik PC (rPC)
+ *     edi is Dalvik PC (rPC)
  *     ebx is rINST
  *
  * For JIT:
@@ -82,8 +82,8 @@
     int nextFPTemp;
     int numCoreRegs;
     RegisterInfo *coreRegs;
-    int numFPRegs;
-    RegisterInfo *FPRegs;
+    int numMMRegs;
+    RegisterInfo *MMRegs;
 } RegisterPool;
 
 typedef enum OpSize {
@@ -99,7 +99,6 @@
 
 typedef enum OpKind {
     kOpMov,
-    kOpMvn,
     kOpCmp,
     kOpLsl,
     kOpLsr,
@@ -114,15 +113,11 @@
     kOpAdc,
     kOpSub,
     kOpSbc,
-    kOpRsub,
     kOpMul,
     kOpDiv,
     kOpRem,
-    kOpBic,
-    kOpCmn,
     kOpTst,
-    kOpBkpt,
-    kOpBlx,
+    kOpCall,
     kOpPush,
     kOpPop,
     kOp2Char,
@@ -132,6 +127,37 @@
     kOpUncondBr,
 } OpKind;
 
+#define FP_REG_OFFSET 8    /* enum value of the first XMM register */
+
+typedef enum NativeRegisterPool {
+    rEAX = 0,    /* general-purpose registers occupy 0..7 */
+    rECX = 1,
+    rEDX = 2,
+    rEBX = 3,
+    rESP = 4,
+    rEBP = 5,
+    rESI = 6,
+    rEDI = 7,
+    rXMM0 = 0 + FP_REG_OFFSET,    /* XMM registers start at FP_REG_OFFSET */
+    rXMM1 = 1 + FP_REG_OFFSET,
+    rXMM2 = 2 + FP_REG_OFFSET,
+    rXMM3 = 3 + FP_REG_OFFSET,
+    rXMM4 = 4 + FP_REG_OFFSET,
+    rXMM5 = 5 + FP_REG_OFFSET,
+    rXMM6 = 6 + FP_REG_OFFSET,
+    rXMM7 = 7 + FP_REG_OFFSET,
+} NativeRegisterPool;
+
+#define rPC rEDI    /* Dalvik PC */
+#define rFP rESI    /* presumably the Dalvik frame pointer -- confirm */
+#define rINST rEBX
+
+#define OUT_ARG0 0    /* displacements from rESP for outgoing stack args */
+#define OUT_ARG1 4
+#define OUT_ARG2 8
+#define OUT_ARG3 12
+#define OUT_ARG4 16
+
 typedef struct X86LIR {
     LIR generic;
     //X86Opcode opcode;