Fix performance issues related to chaining and unchaining.
1) Patching requests for predicted chaining cells (used by virtual/interface
methods) are now batched in a queue and processed when the VM is paused for GC.
2) When the code cache is full, the reset operation is also performed at the
end of a GC pause, which completely eliminates the need for the compiler
thread to issue suspend-all requests. A cache reset is a very rare event, and
when it does happen it takes less than 5 ms to finish.
3) Change the initial value of the branch instruction in a predicted chaining
cell from 0 (i.e. lsl r0, r0, #0) to 0xe7fe (i.e. branch to self) so that
initializing a predicted chaining cell no longer requires suspending all
threads (see the sketch after the bug references below). Together with 1),
this yields a 20% speedup on some benchmarks.
4) Add TestCompability.c, where defining "TEST_VM_IN_ECLAIR := true" in
buildspec.mk activates dummy symbols needed to run libdvm.so on older
releases.
Bug: 2397689
Bug: 2396513
Bug: 2331313
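
To make 3) concrete, here is a minimal sketch (in C) of the lock-free
initialization path it enables. The helper name is invented for illustration;
the struct, field names, publication order, and cacheflush call mirror the
inlineCachePatchEnqueue code added in this change. As long as the branch word
still holds 0xe7fe, a mutator executing the cell just branches to itself, so
the writer only has to publish clazz last and flush the cache instead of
suspending all threads:

    /*
     * Sketch only: initialize an uninitialized predicted chaining cell
     * without suspending other threads. Safe because branch == 0xe7fe
     * (branch to self) keeps any executing mutator spinning until clazz
     * goes live.
     */
    static void initPredictedChainingCell(PredictedChainingCell *cellAddr,
                                          const PredictedChainingCell *newContent)
    {
        cellAddr->method  = newContent->method;
        cellAddr->branch  = newContent->branch;
        cellAddr->counter = newContent->counter;
        /* Publish clazz last - a non-NULL clazz marks the cell as live */
        cellAddr->clazz   = newContent->clazz;
        /* Synchronize the I/D cache for the patched cell */
        cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr + 1), 0);
    }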
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 9ed3a05..adcc16e 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -66,11 +66,10 @@
}
/*
- * Return if queue is full.
- * If the code cache is full, we will allow the work order to be added and
- * we use that to trigger code cache reset.
+ * Return if queue or code cache is full.
*/
- if (gDvmJit.compilerQueueLength == COMPILER_WORK_QUEUE_SIZE) {
+ if (gDvmJit.compilerQueueLength == COMPILER_WORK_QUEUE_SIZE ||
+ gDvmJit.codeCacheFull == true) {
result = false;
goto unlockAndExit;
}
@@ -94,8 +93,7 @@
newOrder->info = info;
newOrder->result.codeAddress = NULL;
newOrder->result.discardResult =
- (kind == kWorkOrderTraceDebug || kind == kWorkOrderICPatch) ?
- true : false;
+ (kind == kWorkOrderTraceDebug) ? true : false;
newOrder->result.requestingThread = dvmThreadSelf();
gDvmJit.compilerWorkEnqueueIndex++;
@@ -136,8 +134,8 @@
return false;
}
- // For debugging only
- // LOGD("Code cache starts at %p", gDvmJit.codeCache);
+ // STOPSHIP - for debugging only
+ LOGD("Code cache starts at %p", gDvmJit.codeCache);
/* Copy the template code into the beginning of the code cache */
int templateSize = (intptr_t) dmvCompilerTemplateEnd -
@@ -201,43 +199,38 @@
Thread* thread;
u8 startTime = dvmGetRelativeTimeUsec();
int inJit = 0;
-
- LOGD("Reset the JIT code cache (%d bytes used / %d time(s))",
- gDvmJit.codeCacheByteUsed, ++gDvmJit.numCodeCacheReset);
-
- /* Stop the world */
- dvmSuspendAllThreads(SUSPEND_FOR_CC_RESET);
+ int byteUsed = gDvmJit.codeCacheByteUsed;
/* If any thread is found stuck in the JIT state, don't reset the cache */
for (thread = gDvm.threadList; thread != NULL; thread = thread->next) {
+ /*
+ * Crawl the stack to wipe out the returnAddr field so that
+ * 1) the soon-to-be-deleted code in the JIT cache won't be used, and
+ * 2) any thread stuck in JIT'ed code will soon return to the
+ * interpreter
+ */
+ crawlDalvikStack(thread, false);
if (thread->inJitCodeCache) {
inJit++;
- /*
- * STOPSHIP
- * Change the verbose mode to false after the new code receives
- * more QA love.
- */
- crawlDalvikStack(thread, true);
}
}
if (inJit) {
- /* Wait a while for the busy threads to rest and try again */
- gDvmJit.delayCodeCacheReset = 256;
- goto done;
+ LOGD("JIT code cache reset delayed (%d bytes %d/%d)",
+ gDvmJit.codeCacheByteUsed, gDvmJit.numCodeCacheReset,
+ ++gDvmJit.numCodeCacheResetDelayed);
+ return;
}
- /* Drain the work queue to free the work order */
+ /* Lock the mutex to clean up the work queue */
+ dvmLockMutex(&gDvmJit.compilerLock);
+
+ /* Drain the work queue to free the work orders */
while (workQueueLength()) {
CompilerWorkOrder work = workDequeue();
free(work.info);
}
- /* Wipe out the returnAddr field that soon will point to stale code */
- for (thread = gDvm.threadList; thread != NULL; thread = thread->next) {
- crawlDalvikStack(thread, false);
- }
-
/* Reset the JitEntry table contents to the initial unpopulated state */
dvmJitResetTable();
@@ -261,15 +254,35 @@
gDvmJit.compilerWorkEnqueueIndex = gDvmJit.compilerWorkDequeueIndex = 0;
gDvmJit.compilerQueueLength = 0;
+ /* Reset the IC patch work queue */
+ dvmLockMutex(&gDvmJit.compilerICPatchLock);
+ gDvmJit.compilerICPatchIndex = 0;
+ dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
+
/* All clear now */
gDvmJit.codeCacheFull = false;
- LOGD("Code cache reset takes %lld usec",
- dvmGetRelativeTimeUsec() - startTime);
+ dvmUnlockMutex(&gDvmJit.compilerLock);
-done:
- /* Resume all threads */
- dvmResumeAllThreads(SUSPEND_FOR_CC_RESET);
+ LOGD("JIT code cache reset in %lld ms (%d bytes %d/%d)",
+ (dvmGetRelativeTimeUsec() - startTime) / 1000,
+ byteUsed, ++gDvmJit.numCodeCacheReset,
+ gDvmJit.numCodeCacheResetDelayed);
+}
+
+/*
+ * Perform actions that are only safe when all threads are suspended. Currently
+ * we do:
+ * 1) Check if the code cache is full. If so, reset it and start populating it
+ * from scratch.
+ * 2) Patch predicted chaining cells by consuming recorded work orders.
+ */
+void dvmCompilerPerformSafePointChecks(void)
+{
+ if (gDvmJit.codeCacheFull) {
+ resetCodeCache();
+ }
+ dvmCompilerPatchInlineCache();
}
bool compilerThreadStartup(void)
@@ -410,7 +423,6 @@
continue;
} else {
do {
- bool resizeFail = false;
CompilerWorkOrder work = workDequeue();
dvmUnlockMutex(&gDvmJit.compilerLock);
/*
@@ -421,11 +433,17 @@
/* Is JitTable filling up? */
if (gDvmJit.jitTableEntriesUsed >
(gDvmJit.jitTableSize - gDvmJit.jitTableSize/4)) {
- resizeFail = dvmJitResizeJitTable(gDvmJit.jitTableSize * 2);
+ bool resizeFail =
+ dvmJitResizeJitTable(gDvmJit.jitTableSize * 2);
+ /*
+ * If the JitTable is full, it is also time to reset
+ * the code cache.
+ */
+ gDvmJit.codeCacheFull |= resizeFail;
}
if (gDvmJit.haltCompilerThread) {
LOGD("Compiler shutdown in progress - discarding request");
- } else if (!resizeFail) {
+ } else if (!gDvmJit.codeCacheFull) {
/* If compilation failed, use interpret-template */
if (!dvmCompilerDoWork(&work)) {
work.result.codeAddress = gDvmJit.interpretTemplate;
@@ -437,24 +455,6 @@
}
free(work.info);
dvmLockMutex(&gDvmJit.compilerLock);
-
- /*
- * FIXME - temporarily disable code cache reset until
- * stale code stops leaking.
- */
-#if 0
- if (gDvmJit.codeCacheFull == true || resizeFail) {
- if (gDvmJit.delayCodeCacheReset == 0) {
- resetCodeCache();
- assert(workQueueLength() == 0 ||
- gDvmJit.delayCodeCacheReset != 0);
- } else {
- LOGD("Delay the next %d tries to reset code cache",
- gDvmJit.delayCodeCacheReset);
- gDvmJit.delayCodeCacheReset--;
- }
- }
-#endif
} while (workQueueLength() != 0);
}
}
@@ -477,6 +477,7 @@
{
dvmInitMutex(&gDvmJit.compilerLock);
+ dvmInitMutex(&gDvmJit.compilerICPatchLock);
dvmLockMutex(&gDvmJit.compilerLock);
pthread_cond_init(&gDvmJit.compilerQueueActivity, NULL);
pthread_cond_init(&gDvmJit.compilerQueueEmpty, NULL);
diff --git a/vm/compiler/Compiler.h b/vm/compiler/Compiler.h
index 6b4d414..153e845 100644
--- a/vm/compiler/Compiler.h
+++ b/vm/compiler/Compiler.h
@@ -22,6 +22,7 @@
#define CODE_CACHE_SIZE 1024*1024
#define MAX_JIT_RUN_LEN 64
#define COMPILER_WORK_QUEUE_SIZE 100
+#define COMPILER_IC_PATCH_QUEUE_SIZE 64
#define COMPILER_TRACED(X)
#define COMPILER_TRACEE(X)
@@ -49,7 +50,6 @@
kWorkOrderMethod = 1, // Work is to compile a whole method
kWorkOrderTrace = 2, // Work is to compile code fragment(s)
kWorkOrderTraceDebug = 3, // Work is to compile/debug code fragment(s)
- kWorkOrderICPatch = 4, // Work is to patch a polymorphic callsite
} WorkOrderKind;
typedef struct CompilerWorkOrder {
@@ -59,6 +59,20 @@
JitTranslationInfo result;
} CompilerWorkOrder;
+/* Chain cell for predicted method invocation */
+typedef struct PredictedChainingCell {
+ u4 branch; /* Branch to chained destination */
+ const ClassObject *clazz; /* key #1 for prediction */
+ const Method *method; /* key #2 to lookup native PC from dalvik PC */
+ u4 counter; /* counter to patch the chaining cell */
+} PredictedChainingCell;
+
+/* Work order for inline cache patching */
+typedef struct ICPatchWorkOrder {
+ PredictedChainingCell *cellAddr; /* Address to be patched */
+ PredictedChainingCell cellContent; /* content of the new cell */
+} ICPatchWorkOrder;
+
typedef enum JitState {
kJitOff = 0,
kJitNormal = 1, // Profiling in mterp or running native
diff --git a/vm/compiler/codegen/CompilerCodegen.h b/vm/compiler/codegen/CompilerCodegen.h
index ff39cd4..4a27a67 100644
--- a/vm/compiler/codegen/CompilerCodegen.h
+++ b/vm/compiler/codegen/CompilerCodegen.h
@@ -41,6 +41,7 @@
void* dvmJitChain(void *tgtAddr, u4* branchAddr);
u4* dvmJitUnchain(void *codeAddr);
void dvmJitUnchainAll(void);
+void dvmCompilerPatchInlineCache(void);
/* Implemented in codegen/<target>/Ralloc.c */
void dvmCompilerRegAlloc(CompilationUnit *cUnit);
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 3254ff7..21e2a32 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -730,16 +730,9 @@
u8 defMask; // Resource mask for def
} ArmLIR;
-/* Chain cell for predicted method invocation */
-typedef struct PredictedChainingCell {
- u4 branch; /* Branch to chained destination */
- const ClassObject *clazz; /* key #1 for prediction */
- const Method *method; /* key #2 to lookup native PC from dalvik PC */
- u4 counter; /* counter to patch the chaining cell */
-} PredictedChainingCell;
-
/* Init values when a predicted chain is initially assembled */
-#define PREDICTED_CHAIN_BX_PAIR_INIT 0
+/* 0xE7FE is the Thumb encoding of "branch to self" */
+#define PREDICTED_CHAIN_BX_PAIR_INIT 0xe7fe
#define PREDICTED_CHAIN_CLAZZ_INIT 0
#define PREDICTED_CHAIN_METHOD_INIT 0
#define PREDICTED_CHAIN_COUNTER_INIT 0
@@ -748,7 +741,7 @@
#define PREDICTED_CHAIN_COUNTER_DELAY 512
/* Rechain after this many mis-predictions have happened */
-#define PREDICTED_CHAIN_COUNTER_RECHAIN 8192
+#define PREDICTED_CHAIN_COUNTER_RECHAIN 1024
/* Used if the resolved callee is a native method */
#define PREDICTED_CHAIN_COUNTER_AVOID 0x7fffffff
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 998c955..c3ad957 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -1328,7 +1328,12 @@
u4 newInst;
bool thumbTarget;
- if ((gDvmJit.pProfTable != NULL) && gDvm.sumThreadSuspendCount == 0) {
+ /*
+ * Only chain translations when there is no pending request for all threads
+ * to suspend themselves via the interpreter.
+ */
+ if ((gDvmJit.pProfTable != NULL) && (gDvm.sumThreadSuspendCount == 0) &&
+ (gDvmJit.codeCacheFull == false)) {
assert((branchOffset >= -(1<<22)) && (branchOffset <= ((1<<22)-2)));
gDvmJit.translationChains++;
@@ -1350,12 +1355,48 @@
*branchAddr = newInst;
cacheflush((long)branchAddr, (long)branchAddr + 4, 0);
+ gDvmJit.hasNewChain = true;
}
return tgtAddr;
}
/*
+ * Attempt to enqueue a work order to patch an inline cache for a predicted
+ * chaining cell for virtual/interface calls.
+ */
+bool inlineCachePatchEnqueue(PredictedChainingCell *cellAddr,
+ PredictedChainingCell *newContent)
+{
+ bool result = true;
+
+ dvmLockMutex(&gDvmJit.compilerICPatchLock);
+
+ if (cellAddr->clazz == NULL &&
+ cellAddr->branch == PREDICTED_CHAIN_BX_PAIR_INIT) {
+ /*
+ * The update order matters - make sure clazz is updated last since it
+ * will bring the uninitialized chaining cell to life.
+ */
+ cellAddr->method = newContent->method;
+ cellAddr->branch = newContent->branch;
+ cellAddr->counter = newContent->counter;
+ cellAddr->clazz = newContent->clazz;
+ cacheflush((intptr_t) cellAddr, (intptr_t) (cellAddr+1), 0);
+ }
+ else if (gDvmJit.compilerICPatchIndex < COMPILER_IC_PATCH_QUEUE_SIZE) {
+ int index = gDvmJit.compilerICPatchIndex++;
+ gDvmJit.compilerICPatchQueue[index].cellAddr = cellAddr;
+ gDvmJit.compilerICPatchQueue[index].cellContent = *newContent;
+ } else {
+ result = false;
+ }
+
+ dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
+ return result;
+}
+
+/*
* This method is called from the invoke templates for virtual and interface
* methods to speculatively setup a chain to the callee. The templates are
* written in assembly and have setup method, cell, and clazz at r0, r2, and
@@ -1412,41 +1453,29 @@
goto done;
}
- /*
- * Bump up the counter first just in case other mutator threads are in
- * nearby territory to also attempt to rechain this cell. This is not
- * done in a thread-safe way and doesn't need to be since the consequence
- * of the race condition [rare] is two back-to-back suspend-all attempts,
- * which will be handled correctly.
- */
- cell->counter = PREDICTED_CHAIN_COUNTER_AVOID;
+ PredictedChainingCell newCell;
- PredictedChainingCell *newCell =
- (PredictedChainingCell *) malloc(sizeof(PredictedChainingCell));
+ /* Avoid back-to-back orders to the same cell */
+ cell->counter = PREDICTED_CHAIN_COUNTER_AVOID;
int baseAddr = (int) cell + 4; // PC is cur_addr + 4
int branchOffset = tgtAddr - baseAddr;
- newCell->branch = assembleChainingBranch(branchOffset, true);
- newCell->clazz = clazz;
- newCell->method = method;
+ newCell.branch = assembleChainingBranch(branchOffset, true);
+ newCell.clazz = clazz;
+ newCell.method = method;
+ newCell.counter = PREDICTED_CHAIN_COUNTER_RECHAIN;
/*
- * Reset the counter again in case other mutator threads got invoked
- * between the previous rest and dvmSuspendAllThreads call.
- */
- newCell->counter = PREDICTED_CHAIN_COUNTER_RECHAIN;
-
- /*
- * Enter the work order to the queue for the compiler thread to patch the
- * chaining cell.
+ * Add the work order to the queue; the chaining cell will be patched the
+ * next time a safe point is entered.
*
- * No blocking call is added here because the patched result is not
- * intended to be immediately consumed by the requesting thread. Its
- * execution is simply resumed by chasing the class pointer to resolve the
- * callsite.
+ * If enqueuing fails, reset the rechain count to a normal value so that
+ * rechaining won't be indefinitely delayed.
*/
- dvmCompilerWorkEnqueue((const u2 *) cell, kWorkOrderICPatch, newCell);
+ if (!inlineCachePatchEnqueue(cell, &newCell)) {
+ cell->counter = PREDICTED_CHAIN_COUNTER_RECHAIN;
+ }
#endif
done:
return method;
@@ -1456,31 +1485,61 @@
* Patch the inline cache content based on the content passed from the work
* order.
*/
-bool dvmJitPatchInlineCache(void *cellPtr, void *contentPtr)
+void dvmCompilerPatchInlineCache(void)
{
- PredictedChainingCell *cellDest = (PredictedChainingCell *) cellPtr;
- PredictedChainingCell *newContent = (PredictedChainingCell *) contentPtr;
+ int i;
+ PredictedChainingCell *minAddr, *maxAddr;
- /* Stop the world */
- dvmSuspendAllThreads(SUSPEND_FOR_IC_PATCH);
+ /* Nothing to be done */
+ if (gDvmJit.compilerICPatchIndex == 0) return;
+ /*
+ * Since all threads are already stopped we don't really need to acquire
+ * the lock. But a race condition could easily be introduced here in the
+ * future without anyone noticing, so we still acquire the lock.
+ */
+ dvmLockMutex(&gDvmJit.compilerICPatchLock);
- COMPILER_TRACE_CHAINING(
- LOGD("Jit Runtime: predicted chain %p from %s to %s (%s) patched",
- cellDest, cellDest->clazz ? cellDest->clazz->descriptor : "NULL",
- newContent->clazz->descriptor,
- newContent->method->name));
+ //LOGD("Number of IC patch work orders: %d", gDvmJit.compilerICPatchIndex);
- /* Install the new cell content */
- *cellDest = *newContent;
+ /* Initialize the min/max address range */
+ minAddr = (PredictedChainingCell *)
+ ((char *) gDvmJit.codeCache + CODE_CACHE_SIZE);
+ maxAddr = (PredictedChainingCell *) gDvmJit.codeCache;
- /* Then synchronize the I/D$ */
- cacheflush((long) cellDest, (long) (cellDest+1), 0);
+ for (i = 0; i < gDvmJit.compilerICPatchIndex; i++) {
+ PredictedChainingCell *cellAddr =
+ gDvmJit.compilerICPatchQueue[i].cellAddr;
+ PredictedChainingCell *cellContent =
+ &gDvmJit.compilerICPatchQueue[i].cellContent;
- /* All done - resume all other threads */
- dvmResumeAllThreads(SUSPEND_FOR_IC_PATCH);
+ if (cellAddr->clazz == NULL) {
+ COMPILER_TRACE_CHAINING(
+ LOGD("Jit Runtime: predicted chain %p to %s (%s) initialized",
+ cellAddr,
+ cellContent->clazz->descriptor,
+ cellContent->method->name));
+ } else {
+ COMPILER_TRACE_CHAINING(
+ LOGD("Jit Runtime: predicted chain %p from %s to %s (%s) "
+ "patched",
+ cellAddr,
+ cellAddr->clazz->descriptor,
+ cellContent->clazz->descriptor,
+ cellContent->method->name));
+ }
- return true;
+ /* Patch the chaining cell */
+ *cellAddr = *cellContent;
+ minAddr = (cellAddr < minAddr) ? cellAddr : minAddr;
+ maxAddr = (cellAddr > maxAddr) ? cellAddr : maxAddr;
+ }
+
+ /* Then synchronize the I/D cache */
+ cacheflush((long) minAddr, (long) (maxAddr+1), 0);
+
+ gDvmJit.compilerICPatchIndex = 0;
+ dvmUnlockMutex(&gDvmJit.compilerICPatchLock);
}
/*
@@ -1617,6 +1676,7 @@
dvmUnlockMutex(&gDvmJit.tableLock);
gDvmJit.translationChains = 0;
}
+ gDvmJit.hasNewChain = false;
}
typedef struct jitProfileAddrToLine {
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 5be07aa..b0e16b8 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -4121,8 +4121,7 @@
{
bool res;
- if (gDvmJit.codeCacheFull &&
- (work->kind != kWorkOrderICPatch)) {
+ if (gDvmJit.codeCacheFull) {
return false;
}
@@ -4142,9 +4141,6 @@
gDvmJit.printMe = oldPrintMe;;
break;
}
- case kWorkOrderICPatch:
- res = dvmJitPatchInlineCache((void *) work->pc, work->info);
- break;
default:
res = false;
dvmAbort();