Experimental x86 Jit trace selection
Experimental support for trace selection in x86 host-mode operation.
Not enabled by default; turn it on by setting both WITH_HOST_DALVIK and
WITH_JIT to true. When enabled, the VM profiles during x86 fast-interpreter
operation, selects hot traces, and "compiles" them into traces consisting
of jumps back to the interpreter.
This is the first in a series of experimental x86 support check-ins.
Change-Id: I0e423ec58a7bf01f226cb486f55de2841fab1002
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 8dd8adc..fe42f4c 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -627,6 +627,11 @@
compileOK = dvmCompilerDoWork(&work);
}
if (aborted || !compileOK) {
+#if 0 // for x86 JIT testing
+ dvmJitSetCodeAddr(work.pc,
+ dvmCompilerGetInterpretTemplate(),
+ work.result.instructionSet);
+#endif
dvmCompilerArenaReset();
} else if (!work.result.discardResult &&
work.result.codeAddress) {
diff --git a/vm/compiler/codegen/x86/Assemble.c b/vm/compiler/codegen/x86/Assemble.c
index fbf53ca..31264ce 100644
--- a/vm/compiler/codegen/x86/Assemble.c
+++ b/vm/compiler/codegen/x86/Assemble.c
@@ -34,8 +34,6 @@
#endif
/*
- * FIXME - redo for x86
- *
* Translation layout in the code cache. Note that the codeAddress pointer
* in JitTable will point directly to the code body (field codeAddress). The
* chain cell offset codeAddress - 2, and (if present) executionCount is at
@@ -52,7 +50,7 @@
* | . .
* | | |
* | +----------------------------+
- * | | Chaining Cells | -> 12/16 bytes each, must be 4 byte aligned
+ * | | Chaining Cells | -> 16 bytes each, 8 byte aligned
* | . .
* | . .
* | | |
@@ -66,8 +64,8 @@
* | |
* +----------------------------+
* | Literal pool | -> 4-byte aligned, variable size
- * . .
- * . .
+ * . . Note: for x86 literals will
+ * . . generally appear inline.
* | |
* +----------------------------+
*
diff --git a/vm/compiler/codegen/x86/CodegenDriver.c b/vm/compiler/codegen/x86/CodegenDriver.c
index 69f637e..4a5d481 100644
--- a/vm/compiler/codegen/x86/CodegenDriver.c
+++ b/vm/compiler/codegen/x86/CodegenDriver.c
@@ -24,10 +24,63 @@
* applicable directory below this one.
*/
+extern X86LIR *loadConstant(CompilationUnit *cUnit, int rDest, int value);
+extern X86LIR *loadWordDisp(CompilationUnit *cUnit, int rBase,
+ int displacement, int rDest);
+extern void dvmCompilerFlushAllRegs(CompilationUnit *cUnit);
+extern void storeWordDisp(CompilationUnit *cUnit, int rBase,
+ int displacement, int rSrc);
+extern X86LIR *opReg(CompilationUnit *cUnit, OpKind op, int rDestSrc);
+
static int opcodeCoverage[kNumPackedOpcodes];
static intptr_t templateEntryOffsets[TEMPLATE_LAST_MARK];
/*
+ * Bail to the interpreter at Dalvik offset "offset". Will not return to
+ * this trace. rPC is set here, as the punt entry needs it on entry.
+ */
+static void genPuntToInterp(CompilationUnit *cUnit, unsigned int offset)
+{
+ const int puntEntry =
+ offsetof(InterpState, jitToInterpEntries.dvmJitToInterpPunt);
+ dvmCompilerFlushAllRegs(cUnit);
+ loadConstant(cUnit, rPC, (int)(cUnit->method->insns + offset));
+ loadWordDisp(cUnit, rEBP, 0, rECX); /* rECX <- glue */
+ loadWordDisp(cUnit, rECX, puntEntry, rEAX); /* rEAX <- punt entry */
+ opReg(cUnit, kOpUncondBr, rEAX);
+}
+
+static void genInterpSingleStep(CompilationUnit *cUnit, MIR *mir)
+{
+ /* Opcode properties that force a punt instead of a single step */
+ const int exitFlags = kInstrCanBranch | kInstrCanSwitch |
+ kInstrCanReturn | kInstrCanThrow;
+ const int opFlags = dexGetFlagsFromOpcode(mir->dalvikInsn.opcode);
+
+ /* Nothing to emit for an instruction already optimized to a nop */
+ if (mir->dalvikInsn.opcode == OP_NOP)
+ return;
+
+ /* Flush all Dalvik regs so the interpreter can find them */
+ dvmCompilerFlushAllRegs(cUnit);
+ /* No successor MIR, or opcode can branch/switch/return/throw: punt */
+ if ((mir->next == NULL) || (opFlags & exitFlags) != 0) {
+ genPuntToInterp(cUnit, mir->offset);
+ return;
+ }
+ const int stepEntry = offsetof(InterpState,
+ jitToInterpEntries.dvmJitToInterpSingleStep);
+ loadWordDisp(cUnit, rEBP, 0, rECX); /* rECX <- glue */
+ loadWordDisp(cUnit, rECX, stepEntry, rEAX); /* rEAX <- entry address */
+ /* rPC <- Dalvik pc of the instruction to interpret */
+ loadConstant(cUnit, rPC, (int)(cUnit->method->insns + mir->offset));
+ /* rECX <- Dalvik pc of the following instruction, passed on the stack */
+ loadConstant(cUnit, rECX, (int)(cUnit->method->insns + mir->next->offset));
+ storeWordDisp(cUnit, rESP, OUT_ARG0, rECX);
+ opReg(cUnit, kOpCall, rEAX);
+}
+
+/*
* The following are the first-level codegen routines that analyze the format
* of each bytecode then either dispatch special purpose codegen routines
* or produce corresponding Thumb instructions directly.
diff --git a/vm/compiler/codegen/x86/X86LIR.h b/vm/compiler/codegen/x86/X86LIR.h
index 62ac447..8acf015 100644
--- a/vm/compiler/codegen/x86/X86LIR.h
+++ b/vm/compiler/codegen/x86/X86LIR.h
@@ -27,7 +27,7 @@
* esp is native SP
*
* For interpreter:
- * edx is Dalvik PC (rPC)
+ * edi is Dalvik PC (rPC)
* ebx is rINST
*
* For JIT:
@@ -82,8 +82,8 @@
int nextFPTemp;
int numCoreRegs;
RegisterInfo *coreRegs;
- int numFPRegs;
- RegisterInfo *FPRegs;
+ int numMMRegs;
+ RegisterInfo *MMRegs;
} RegisterPool;
typedef enum OpSize {
@@ -99,7 +99,6 @@
typedef enum OpKind {
kOpMov,
- kOpMvn,
kOpCmp,
kOpLsl,
kOpLsr,
@@ -114,15 +113,11 @@
kOpAdc,
kOpSub,
kOpSbc,
- kOpRsub,
kOpMul,
kOpDiv,
kOpRem,
- kOpBic,
- kOpCmn,
kOpTst,
- kOpBkpt,
- kOpBlx,
+ kOpCall,
kOpPush,
kOpPop,
kOp2Char,
@@ -132,6 +127,37 @@
kOpUncondBr,
} OpKind;
+#define FP_REG_OFFSET 8 /* enum value of rXMM0; XMM regs follow the core regs */
+
+typedef enum NativeRegisterPool {
+ rEAX = 0, /* core registers: 0..7 */
+ rECX = 1,
+ rEDX = 2,
+ rEBX = 3,
+ rESP = 4,
+ rEBP = 5,
+ rESI = 6,
+ rEDI = 7,
+ rXMM0 = 0 + FP_REG_OFFSET, /* XMM registers: FP_REG_OFFSET + XMM number */
+ rXMM1 = 1 + FP_REG_OFFSET,
+ rXMM2 = 2 + FP_REG_OFFSET,
+ rXMM3 = 3 + FP_REG_OFFSET,
+ rXMM4 = 4 + FP_REG_OFFSET,
+ rXMM5 = 5 + FP_REG_OFFSET,
+ rXMM6 = 6 + FP_REG_OFFSET,
+ rXMM7 = 7 + FP_REG_OFFSET,
+} NativeRegisterPool;
+
+#define rPC rEDI /* NOTE(review): ia32 template header.S uses %esi - confirm */
+#define rFP rESI /* NOTE(review): ia32 template header.S uses %edi - confirm */
+#define rINST rEBX
+
+#define OUT_ARG0 0 /* rESP displacement of outgoing argument slot 0 */
+#define OUT_ARG1 4
+#define OUT_ARG2 8
+#define OUT_ARG3 12
+#define OUT_ARG4 16
+
typedef struct X86LIR {
LIR generic;
//X86Opcode opcode;
diff --git a/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S b/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
index 4c98917..68b2d0d 100644
--- a/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
+++ b/vm/compiler/template/ia32/TEMPLATE_INTERPRET.S
@@ -1,27 +1,30 @@
/*
- * TODO: figure out how best to do this on x86, as we don't have
- * an lr equivalent and probably don't want to push.
+ * This handler is a bit odd - it may be called via chaining or
+ * from static code and is expected to cause control to flow
+ * to the interpreter. The problem is where to find the Dalvik
+ * PC of the next instruction. When called via chaining, the dPC
+ * will be located at *rp. When called from static code, rPC is
+ * valid and rp is a real return pointer (that should be ignored).
+ * The Arm target deals with this by using the link register as
+ * a flag. If it is zero, we know we were called from static code.
+ * If non-zero, it points to the chain cell containing dPC.
+ * For x86, we'll infer the source by looking where rp points.
+ * If it points to anywhere within the code cache, we'll assume
+ * we got here via chaining. Otherwise, we'll assume rPC is valid.
*
- * This handler transfers control to the interpeter without performing
- * any lookups. It may be called either as part of a normal chaining
- * operation, or from the transition code in header.S. We distinquish
- * the two cases by looking at the link register. If called from a
- * translation chain, it will point to the chaining Dalvik PC -3.
* On entry:
- * lr - if NULL:
- * r1 - the Dalvik PC to begin interpretation.
- * else
- * [lr, #3] contains Dalvik PC to begin interpretation
- * rGLUE - pointer to interpState
- * rFP - Dalvik frame pointer
- *
- *cmp lr, #0
- *ldrne r1,[lr, #3]
- *ldr r2, .LinterpPunt
- *mov r0, r1 @ set Dalvik PC
- *bx r2
- *@ doesn't return
+ * (TOS)<- return pointer or pointer to dPC
*/
+ movl rGLUE,%ecx # ecx<- glue
+ movl $$.LinterpPunt,%edx # edx<- address of the punt entry literal
+ pop %eax # eax<- return pointer or pointer to dPC
+ cmpl offGlue_jitCacheEnd(%ecx),%eax # AT&T order: flags<- eax - cache end
+ ja 1f # above the code cache: keep rPC
+ cmpl offGlue_jitCacheStart(%ecx),%eax # flags<- eax - cache start
+ jb 1f # below the code cache: keep rPC
+ movl (%eax),rPC # chained: dPC is stored at *rp
+1:
+ jmp *(%edx) # jump through the literal to the punt entry
.LinterpPunt:
.long dvmJitToInterpPunt
diff --git a/vm/compiler/template/ia32/footer.S b/vm/compiler/template/ia32/footer.S
index 1b1a1ae..d11af69 100644
--- a/vm/compiler/template/ia32/footer.S
+++ b/vm/compiler/template/ia32/footer.S
@@ -7,12 +7,12 @@
.text
.align 4
/*
- * FIXME - need a cacheflush for x86
+ * FIXME - verify that we don't need an explicit cache flush
+ * for x86.
*/
.global cacheflush
cacheflush:
- movl $$0xdeadf0f0, %eax
- call *%eax
+ ret
.global dmvCompilerTemplateEnd
diff --git a/vm/compiler/template/ia32/header.S b/vm/compiler/template/ia32/header.S
index 57f5a5b..a67ba6e 100644
--- a/vm/compiler/template/ia32/header.S
+++ b/vm/compiler/template/ia32/header.S
@@ -16,6 +16,12 @@
#if defined(WITH_JIT)
+/* Subset of defines from mterp/x86/header.S */
+#define rGLUE (%ebp)
+#define rPC %esi
+#define rFP %edi
+#define rINST %ebx
+
/*
* This is a #include, not a %include, because we want the C pre-processor
* to expand the macros into assembler assignment statements.
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-ia32.S b/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
index 7726e97..1256ee4 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-ia32.S
@@ -23,6 +23,12 @@
#if defined(WITH_JIT)
+/* Subset of defines from mterp/x86/header.S */
+#define rGLUE (%ebp)
+#define rPC %esi
+#define rFP %edi
+#define rINST %ebx
+
/*
* This is a #include, not a %include, because we want the C pre-processor
* to expand the macros into assembler assignment statements.
@@ -51,29 +57,32 @@
dvmCompiler_TEMPLATE_INTERPRET:
/* File: ia32/TEMPLATE_INTERPRET.S */
/*
- * TODO: figure out how best to do this on x86, as we don't have
- * an lr equivalent and probably don't want to push.
+ * This handler is a bit odd - it may be called via chaining or
+ * from static code and is expected to cause control to flow
+ * to the interpreter. The problem is where to find the Dalvik
+ * PC of the next instruction. When called via chaining, the dPC
+ * will be located at *rp. When called from static code, rPC is
+ * valid and rp is a real return pointer (that should be ignored).
+ * The Arm target deals with this by using the link register as
+ * a flag. If it is zero, we know we were called from static code.
+ * If non-zero, it points to the chain cell containing dPC.
+ * For x86, we'll infer the source by looking where rp points.
+ * If it points to anywhere within the code cache, we'll assume
+ * we got here via chaining. Otherwise, we'll assume rPC is valid.
*
- * This handler transfers control to the interpeter without performing
- * any lookups. It may be called either as part of a normal chaining
- * operation, or from the transition code in header.S. We distinquish
- * the two cases by looking at the link register. If called from a
- * translation chain, it will point to the chaining Dalvik PC -3.
* On entry:
- * lr - if NULL:
- * r1 - the Dalvik PC to begin interpretation.
- * else
- * [lr, #3] contains Dalvik PC to begin interpretation
- * rGLUE - pointer to interpState
- * rFP - Dalvik frame pointer
- *
- *cmp lr, #0
- *ldrne r1,[lr, #3]
- *ldr r2, .LinterpPunt
- *mov r0, r1 @ set Dalvik PC
- *bx r2
- *@ doesn't return
+ * (TOS)<- return pointer or pointer to dPC
*/
+ movl rGLUE,%ecx # ecx<- glue
+ movl $.LinterpPunt,%edx # edx<- address of the punt entry literal
+ pop %eax # eax<- return pointer or pointer to dPC
+ cmpl offGlue_jitCacheEnd(%ecx),%eax # AT&T order: flags<- eax - cache end
+ ja 1f # above the code cache: keep rPC
+ cmpl offGlue_jitCacheStart(%ecx),%eax # flags<- eax - cache start
+ jb 1f # below the code cache: keep rPC
+ movl (%eax),rPC # chained: dPC is stored at *rp
+1:
+ jmp *(%edx) # jump through the literal to the punt entry
.LinterpPunt:
.long dvmJitToInterpPunt
@@ -89,12 +98,12 @@
.text
.align 4
/*
- * FIXME - need a cacheflush for x86
+ * FIXME - verify that we don't need an explicit cache flush
+ * for x86.
*/
.global cacheflush
cacheflush:
- movl $0xdeadf0f0, %eax
- call *%eax
+ ret
.global dmvCompilerTemplateEnd