Improved method invocation performance: 1.5x for virtual and 2.8x for interface.
- Implemented predicted chaining for invoke virtual and interface.
- Eliminated a little bit of fat for invoke native.
- Added 078-polymorphic-virtual for stress tests.
diff --git a/vm/compiler/codegen/armv5te/Assemble.c b/vm/compiler/codegen/armv5te/Assemble.c
index 9b4595d..9642c7d 100644
--- a/vm/compiler/codegen/armv5te/Assemble.c
+++ b/vm/compiler/codegen/armv5te/Assemble.c
@@ -91,7 +91,7 @@
IS_BINARY_OP | CLOBBER_DEST,
"add", "r!0d, r!1d"),
ENCODING_MAP(ARMV5TE_ADD_PC_REL, 0xa000, 10, 8, 7, 0, -1, -1,
- IS_BINARY_OP | CLOBBER_DEST,
+ IS_TERTIARY_OP | CLOBBER_DEST,
"add", "r!0d, pc, #!1E"),
ENCODING_MAP(ARMV5TE_ADD_SP_REL, 0xa800, 10, 8, 7, 0, -1, -1,
IS_BINARY_OP | CLOBBER_DEST,
@@ -169,7 +169,7 @@
IS_TERTIARY_OP | CLOBBER_DEST,
"ldr", "r!0d, [r!1d, r!2d]"),
ENCODING_MAP(ARMV5TE_LDR_PC_REL, 0x4800, 10, 8, 7, 0, -1, -1,
- IS_BINARY_OP | CLOBBER_DEST,
+ IS_TERTIARY_OP | CLOBBER_DEST,
"ldr", "r!0d, [pc, #!1E]"),
ENCODING_MAP(ARMV5TE_LDR_SP_REL, 0x9800, 10, 8, 7, 0, -1, -1,
IS_BINARY_OP | CLOBBER_DEST,
@@ -210,15 +210,15 @@
ENCODING_MAP(ARMV5TE_MOV_RR, 0x1c00, 2, 0, 5, 3, -1, -1,
IS_BINARY_OP | CLOBBER_DEST,
"mov", "r!0d, r!1d"),
- ENCODING_MAP(ARMV5TE_MOV_RR_LH, 0x4640, 2, 0, 5, 3, -1, -1,
- IS_BINARY_OP | CLOBBER_DEST,
- "mov", "r!0D, r!1d"),
- ENCODING_MAP(ARMV5TE_MOV_RR_HL, 0x4680, 2, 0, 5, 3, -1, -1,
- IS_BINARY_OP | CLOBBER_DEST,
- "mov", "r!0d, r!1D"),
- ENCODING_MAP(ARMV5TE_MOV_RR_HH, 0x46c0, 2, 0, 5, 3, -1, -1,
+ ENCODING_MAP(ARMV5TE_MOV_RR_H2H, 0x46c0, 2, 0, 5, 3, -1, -1,
IS_BINARY_OP | CLOBBER_DEST,
"mov", "r!0D, r!1D"),
+ ENCODING_MAP(ARMV5TE_MOV_RR_H2L, 0x4640, 2, 0, 5, 3, -1, -1,
+ IS_BINARY_OP | CLOBBER_DEST,
+ "mov", "r!0d, r!1D"),
+ ENCODING_MAP(ARMV5TE_MOV_RR_L2H, 0x4680, 2, 0, 5, 3, -1, -1,
+ IS_BINARY_OP | CLOBBER_DEST,
+ "mov", "r!0D, r!1d"),
ENCODING_MAP(ARMV5TE_MUL, 0x4340, 2, 0, 5, 3, -1, -1,
IS_BINARY_OP | CLOBBER_DEST,
"mul", "r!0d, r!1d"),
@@ -335,7 +335,12 @@
lir->opCode == ARMV5TE_ADD_PC_REL) {
Armv5teLIR *lirTarget = (Armv5teLIR *) lir->generic.target;
intptr_t pc = (lir->generic.offset + 4) & ~3;
- intptr_t target = lirTarget->generic.offset;
+ /*
+ * Allow an offset (stored in operands[2] to be added to the
+ * PC-relative target. Useful to get to a fixed field inside a
+ * chaining cell.
+ */
+ intptr_t target = lirTarget->generic.offset + lir->operands[2];
int delta = target - pc;
if (delta & 0x3) {
LOGE("PC-rel distance is not multiples of 4: %d\n", delta);
@@ -468,7 +473,7 @@
/* Add space for chain cell counts & trace description */
u4 chainCellOffset = offset;
- Armv5teLIR *chainCellOffsetLIR = cUnit->chainCellOffsetLIR;
+ Armv5teLIR *chainCellOffsetLIR = (Armv5teLIR *) cUnit->chainCellOffsetLIR;
assert(chainCellOffsetLIR);
assert(chainCellOffset < 0x10000);
assert(chainCellOffsetLIR->opCode == ARMV5TE_16BIT_DATA &&
@@ -544,6 +549,21 @@
(long)(cUnit->baseAddr + offset), 0);
}
+static u4 assembleBXPair(int branchOffset)
+{
+ u4 thumb1, thumb2;
+
+ if ((branchOffset < -2048) | (branchOffset > 2046)) {
+ thumb1 = (0xf000 | ((branchOffset>>12) & 0x7ff));
+ thumb2 = (0xf800 | ((branchOffset>> 1) & 0x7ff));
+ } else {
+ thumb1 = (0xe000 | ((branchOffset>> 1) & 0x7ff));
+ thumb2 = 0x4300; /* nop -> or r0, r0 */
+ }
+
+ return thumb2<<16 | thumb1;
+}
+
/*
* Perform translation chain operation.
* For ARM, we'll use a pair of thumb instructions to generate
@@ -560,8 +580,6 @@
{
int baseAddr = (u4) branchAddr + 4;
int branchOffset = (int) tgtAddr - baseAddr;
- u4 thumb1;
- u4 thumb2;
u4 newInst;
if (gDvm.sumThreadSuspendCount == 0) {
@@ -572,15 +590,9 @@
COMPILER_TRACE_CHAINING(
LOGD("Jit Runtime: chaining 0x%x to 0x%x\n",
(int) branchAddr, (int) tgtAddr & -2));
- if ((branchOffset < -2048) | (branchOffset > 2046)) {
- thumb1 = (0xf000 | ((branchOffset>>12) & 0x7ff));
- thumb2 = (0xf800 | ((branchOffset>> 1) & 0x7ff));
- } else {
- thumb1 = (0xe000 | ((branchOffset>> 1) & 0x7ff));
- thumb2 = 0x4300; /* nop -> or r0, r0 */
- }
- newInst = thumb2<<16 | thumb1;
+ newInst = assembleBXPair(branchOffset);
+
*branchAddr = newInst;
cacheflush((long)branchAddr, (long)branchAddr + 4, 0);
}
@@ -589,6 +601,83 @@
}
/*
+ * This method is called from the invoke templates for virtual and interface
+ * methods to speculatively setup a chain to the callee. The templates are
+ * written in assembly and have setup method, cell, and clazz at r0, r2, and
+ * r3 respectively, so there is a unused argument in the list. Upon return one
+ * of the following three results may happen:
+ * 1) Chain is not setup because the callee is native. Reset the rechain
+ * count to a big number so that it will take a long time before the next
+ * rechain attempt to happen.
+ * 2) Chain is not setup because the callee has not been created yet. Reset
+ * the rechain count to a small number and retry in the near future.
+ * 3) Ask all other threads to stop before patching this chaining cell.
+ * This is required because another thread may have passed the class check
+ * but hasn't reached the chaining cell yet to follow the chain. If we
+ * patch the content before halting the other thread, there could be a
+ * small window for race conditions to happen that it may follow the new
+ * but wrong chain to invoke a different method.
+ */
+const Method *dvmJitToPatchPredictedChain(const Method *method,
+ void *unused,
+ PredictedChainingCell *cell,
+ const ClassObject *clazz)
+{
+ /* Don't come back here for a long time if the method is native */
+ if (dvmIsNativeMethod(method)) {
+ cell->counter = PREDICTED_CHAIN_COUNTER_AVOID;
+ cacheflush((long) cell, (long) (cell+1), 0);
+ COMPILER_TRACE_CHAINING(
+ LOGD("Jit Runtime: predicted chain %p to native method %s ignored",
+ cell, method->name));
+ goto done;
+ }
+ int tgtAddr = (int) dvmJitGetCodeAddr(method->insns);
+
+ /*
+ * Compilation not made yet for the callee. Reset the counter to a small
+ * value and come back to check soon.
+ */
+ if (tgtAddr == 0) {
+ /*
+ * Wait for a few invocations (currently set to be 16) before trying
+ * to setup the chain again.
+ */
+ cell->counter = PREDICTED_CHAIN_COUNTER_DELAY;
+ cacheflush((long) cell, (long) (cell+1), 0);
+ COMPILER_TRACE_CHAINING(
+ LOGD("Jit Runtime: predicted chain %p to method %s delayed",
+ cell, method->name));
+ goto done;
+ }
+
+ /* Stop the world */
+ dvmSuspendAllThreads(SUSPEND_FOR_JIT);
+
+ int baseAddr = (int) cell + 4; // PC is cur_addr + 4
+ int branchOffset = tgtAddr - baseAddr;
+
+ COMPILER_TRACE_CHAINING(
+ LOGD("Jit Runtime: predicted chain %p from %s to %s (%s) patched",
+ cell, cell->clazz ? cell->clazz->descriptor : "NULL",
+ clazz->descriptor,
+ method->name));
+
+ cell->branch = assembleBXPair(branchOffset);
+ cell->clazz = clazz;
+ cell->method = method;
+ cell->counter = PREDICTED_CHAIN_COUNTER_RECHAIN;
+
+ cacheflush((long) cell, (long) (cell+1), 0);
+
+ /* All done - resume all other threads */
+ dvmResumeAllThreads(SUSPEND_FOR_JIT);
+
+done:
+ return method;
+}
+
+/*
* Unchain a trace given the starting address of the translation
* in the code cache. Refer to the diagram in dvmCompilerAssembleLIR.
* Returns the address following the last cell unchained. Note that
@@ -601,24 +690,34 @@
u2 chainCellOffset = *pChainCellOffset;
ChainCellCounts *pChainCellCounts =
(ChainCellCounts*)((char*)codeAddr + chainCellOffset -3);
- int cellCount;
+ int cellSize;
u4* pChainCells;
u4* pStart;
u4 thumb1;
u4 thumb2;
u4 newInst;
int i,j;
+ PredictedChainingCell *predChainCell;
/* Get total count of chain cells */
- for (i = 0, cellCount = 0; i < CHAINING_CELL_LAST; i++) {
- cellCount += pChainCellCounts->u.count[i];
+ for (i = 0, cellSize = 0; i < CHAINING_CELL_LAST; i++) {
+ if (i != CHAINING_CELL_INVOKE_PREDICTED) {
+ cellSize += pChainCellCounts->u.count[i] * 2;
+ } else {
+ cellSize += pChainCellCounts->u.count[i] * 4;
+ }
}
/* Locate the beginning of the chain cell region */
- pStart = pChainCells = (u4*)((char*)pChainCellCounts - (cellCount * 8));
+ pStart = pChainCells = ((u4 *) pChainCellCounts) - cellSize;
/* The cells are sorted in order - walk through them and reset */
for (i = 0; i < CHAINING_CELL_LAST; i++) {
+ int elemSize = 2; /* Most chaining cell has two words */
+ if (i == CHAINING_CELL_INVOKE_PREDICTED) {
+ elemSize = 4;
+ }
+
for (j = 0; j < pChainCellCounts->u.count[i]; j++) {
int targetOffset;
switch(i) {
@@ -627,26 +726,38 @@
jitToInterpEntries.dvmJitToInterpNormal);
break;
case CHAINING_CELL_HOT:
- case CHAINING_CELL_INVOKE:
+ case CHAINING_CELL_INVOKE_SINGLETON:
targetOffset = offsetof(InterpState,
jitToInterpEntries.dvmJitToTraceSelect);
break;
+ case CHAINING_CELL_INVOKE_PREDICTED:
+ targetOffset = 0;
+ predChainCell = (PredictedChainingCell *) pChainCells;
+ /* Reset the cell to the init state */
+ predChainCell->branch = PREDICTED_CHAIN_BX_PAIR_INIT;
+ predChainCell->clazz = PREDICTED_CHAIN_CLAZZ_INIT;
+ predChainCell->method = PREDICTED_CHAIN_METHOD_INIT;
+ predChainCell->counter = PREDICTED_CHAIN_COUNTER_INIT;
+ break;
default:
dvmAbort();
}
+ COMPILER_TRACE_CHAINING(
+ LOGD("Jit Runtime: unchaining 0x%x", (int)pChainCells));
/*
- * Arm code sequence for a chaining cell is:
+ * Thumb code sequence for a chaining cell is:
* ldr r0, rGLUE, #<word offset>
* blx r0
*/
- COMPILER_TRACE_CHAINING(
- LOGD("Jit Runtime: unchaining 0x%x", (int)pChainCells));
- targetOffset = targetOffset >> 2; /* convert to word offset */
- thumb1 = 0x6800 | (targetOffset << 6) | (rGLUE << 3) | (r0 << 0);
- thumb2 = 0x4780 | (r0 << 3);
- newInst = thumb2<<16 | thumb1;
- *pChainCells = newInst;
- pChainCells += 2; /* Advance by 2 words */
+ if (i != CHAINING_CELL_INVOKE_PREDICTED) {
+ targetOffset = targetOffset >> 2; /* convert to word offset */
+ thumb1 = 0x6800 | (targetOffset << 6) |
+ (rGLUE << 3) | (r0 << 0);
+ thumb2 = 0x4780 | (r0 << 3);
+ newInst = thumb2<<16 | thumb1;
+ *pChainCells = newInst;
+ }
+ pChainCells += elemSize; /* Advance by a fixed number of words */
}
}
return pChainCells;