Support traceview-style profiling in all builds

This change builds on an earlier bccheng change that allowed JIT'd code
to avoid reverting to the debug portable interpreter when doing
traceview-style method profiling.  That CL introduced a new traceview
build (libdvm_traceview) because the performance delta was too great to
enable the capability for all builds.

In this CL, we remove the libdvm_traceview build and provide full-speed
method tracing in all builds.  This is done by introducing "_PROF"
versions of the invoke and return templates used by the JIT.  Normally,
these templates are not used, and performance is unaffected.  However, when
method profiling is enabled, all existing translations are purged and new
translations are created using the _PROF templates.  These templates
introduce a smallish performance penalty above and beyond the actual
tracing cost, but again are only used when tracing has been enabled.
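
Concretely, each invoke/return codegen site selects the template with a
check of gDvmJit.methodTraceSupport; a minimal sketch of the pattern
used throughout CodegenDriver.c:

    /* Pick the profiling variant of the handler when tracing is on */
    genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
        TEMPLATE_RETURN_PROF :
        TEMPLATE_RETURN);

On the assembly side, each _PROF template is just the normal template
reassembled with TEMPLATE_INLINE_PROFILING defined, so the two variants
cannot drift apart.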

Strictly speaking, a slight burden is placed on invokes and returns in
the non-tracing case - on the order of an additional 3 or 4 cycles per
invoke/return.  Those operations are already heavyweight enough that I
was unable to measure the added cost in benchmarks.
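
For reference, the purge relies on a monotonically increasing cache
version to invalidate compilations already in flight; a minimal sketch
of the handshake, using the fields added in this CL:

    /* Producer: stamp each work order as it is enqueued */
    newOrder->result.cacheVersion = gDvmJit.cacheVersion;

    /* Enable path: bump the version under compilerLock */
    dvmLockMutex(&gDvmJit.compilerLock);
    gDvmJit.cacheVersion++;             /* invalidate in-flight orders */
    gDvmJit.methodTraceSupport = true;  /* new work uses _PROF templates */
    dvmUnlockMutex(&gDvmJit.compilerLock);

    /* Compiler thread: drop any request that predates the bump */
    if (info->cacheVersion != gDvmJit.cacheVersion)
        return false;   /* stale work order - discard */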

Change-Id: Ic09baf4249f1e716e136a65458f4e06cea35fc18
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 886b1f1..a1e3d0e 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -99,6 +99,7 @@
     newOrder->result.codeAddress = NULL;
     newOrder->result.discardResult =
         (kind == kWorkOrderTraceDebug) ? true : false;
+    newOrder->result.cacheVersion = gDvmJit.cacheVersion;
     newOrder->result.requestingThread = dvmThreadSelf();
 
     gDvmJit.compilerWorkEnqueueIndex++;
@@ -264,6 +265,9 @@
     /* Lock the mutex to clean up the work queue */
     dvmLockMutex(&gDvmJit.compilerLock);
 
+    /* Update the translation cache version */
+    gDvmJit.cacheVersion++;
+
     /* Drain the work queue to free the work orders */
     while (workQueueLength()) {
         CompilerWorkOrder work = workDequeue();
@@ -749,6 +753,33 @@
         return;
     }
 
+    /*
+     * On the first enabling of method tracing, switch the compiler
+     * into a mode that includes trace support for invokes and returns.
+     * If there are any existing translations, flush them.  NOTE:  we
+     * can't blindly flush the translation cache because this code
+     * may be executed before the compiler thread has finished
+     * initialization.
+     */
+    if ((gDvm.interpBreak & kSubModeMethodTrace) &&
+        !gDvmJit.methodTraceSupport) {
+        bool resetRequired;
+        /*
+         * compilerLock will prevent new compilations from being
+         * installed while we are working.
+         */
+        dvmLockMutex(&gDvmJit.compilerLock);
+        gDvmJit.cacheVersion++; // invalidate compilations in flight
+        gDvmJit.methodTraceSupport = true;
+        resetRequired = (gDvmJit.numCompilations != 0);
+        dvmUnlockMutex(&gDvmJit.compilerLock);
+        if (resetRequired) {
+            dvmSuspendAllThreads(SUSPEND_FOR_CC_RESET);
+            resetCodeCache();
+            dvmResumeAllThreads(SUSPEND_FOR_CC_RESET);
+        }
+    }
+
     dvmLockMutex(&gDvmJit.tableLock);
     jitActive = gDvmJit.pProfTable != NULL;
     jitActivate = !dvmDebuggerOrProfilerActive();
diff --git a/vm/compiler/Compiler.h b/vm/compiler/Compiler.h
index cd9d21b..d29520d 100644
--- a/vm/compiler/Compiler.h
+++ b/vm/compiler/Compiler.h
@@ -87,6 +87,7 @@
     bool discardResult;         // Used for debugging divergence and IC patching
     bool methodCompilationAborted;  // Cannot compile the whole method
     Thread *requestingThread;   // For debugging purpose
+    int cacheVersion;           // Used to identify stale trace requests
 } JitTranslationInfo;
 
 typedef enum WorkOrderKind {
diff --git a/vm/compiler/Frontend.c b/vm/compiler/Frontend.c
index b4c48e4..1095225 100644
--- a/vm/compiler/Frontend.c
+++ b/vm/compiler/Frontend.c
@@ -449,6 +449,11 @@
         return true;
     }
 
+    /* If the work order is stale, discard it */
+    if (info->cacheVersion != gDvmJit.cacheVersion) {
+        return false;
+    }
+
     compilationId++;
     memset(&cUnit, 0, sizeof(CompilationUnit));
 
diff --git a/vm/compiler/InlineTransformation.c b/vm/compiler/InlineTransformation.c
index 2cdba18..cab790c 100644
--- a/vm/compiler/InlineTransformation.c
+++ b/vm/compiler/InlineTransformation.c
@@ -304,6 +304,10 @@
         if ((flags & kInstrInvoke) == 0)
             continue;
 
+        /* Disable inlining when doing method tracing */
+        if (gDvmJit.methodTraceSupport)
+            continue;
+
         /*
          * If the invoke itself is selected for single stepping, don't bother
          * to inline it.
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index e52c26c..aa61285 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -1326,6 +1326,22 @@
     /* Don't go all the way if the goal is just to get the verbose output */
     if (info->discardResult) return;
 
+    /*
+     * The cache might disappear - acquire lock and check version
+     * Continue holding lock until translation cache update is complete.
+     * These actions are required here in the compiler thread because
+     * it is unaffected by suspend requests and doesn't know if a
+     * translation cache flush is in progress.
+     */
+    dvmLockMutex(&gDvmJit.compilerLock);
+    if (info->cacheVersion != gDvmJit.cacheVersion) {
+        /* Cache changed - discard current translation */
+        info->discardResult = true;
+        info->codeAddress = NULL;
+        dvmUnlockMutex(&gDvmJit.compilerLock);
+        return;
+    }
+
     cUnit->baseAddr = (char *) gDvmJit.codeCache + gDvmJit.codeCacheByteUsed;
     gDvmJit.codeCacheByteUsed += offset;
 
@@ -1353,6 +1369,7 @@
     /* Write the literals directly into the code cache */
     installDataContent(cUnit);
 
+
     /* Flush dcache and invalidate the icache to maintain coherence */
     dvmCompilerCacheFlush((long)cUnit->baseAddr,
                           (long)((char *) cUnit->baseAddr + offset), 0);
@@ -1360,6 +1377,9 @@
 
     PROTECT_CODE_CACHE(cUnit->baseAddr, offset);
 
+    /* Translation cache update complete - release lock */
+    dvmUnlockMutex(&gDvmJit.compilerLock);
+
     /* Record code entry point and instruction set */
     info->codeAddress = (char*)cUnit->baseAddr + cUnit->headerSize;
     /* If applicable, mark low bit to denote thumb */
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 236482f..7f62816 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -904,7 +904,9 @@
 /* Perform the actual operation for OP_RETURN_* */
 static void genReturnCommon(CompilationUnit *cUnit, MIR *mir)
 {
-    genDispatchToHandler(cUnit, TEMPLATE_RETURN);
+    genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+        TEMPLATE_RETURN_PROF :
+        TEMPLATE_RETURN);
 #if defined(WITH_JIT_TUNING)
     gDvmJit.returnOp++;
 #endif
@@ -1082,14 +1084,18 @@
      * r7 = calleeMethod->registersSize
      */
     if (dvmIsNativeMethod(calleeMethod)) {
-        genDispatchToHandler(cUnit, TEMPLATE_INVOKE_METHOD_NATIVE);
+        genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+            TEMPLATE_INVOKE_METHOD_NATIVE_PROF :
+            TEMPLATE_INVOKE_METHOD_NATIVE);
 #if defined(WITH_JIT_TUNING)
         gDvmJit.invokeNative++;
 #endif
     } else {
         /* For Java callees, set up r2 to be calleeMethod->outsSize */
         loadConstant(cUnit, r2, calleeMethod->outsSize);
-        genDispatchToHandler(cUnit, TEMPLATE_INVOKE_METHOD_CHAIN);
+        genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+            TEMPLATE_INVOKE_METHOD_CHAIN_PROF :
+            TEMPLATE_INVOKE_METHOD_CHAIN);
 #if defined(WITH_JIT_TUNING)
         gDvmJit.invokeMonomorphic++;
 #endif
@@ -1148,7 +1154,9 @@
     ArmLIR *predictedChainingCell = opRegRegImm(cUnit, kOpAdd, r2, rpc, 0);
     predictedChainingCell->generic.target = (LIR *) predChainingCell;
 
-    genDispatchToHandler(cUnit, TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN);
+    genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+        TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF :
+        TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN);
 
     /* return through lr - jump to the chaining cell */
     genUnconditionalBranch(cUnit, predChainingCell);
@@ -1211,7 +1219,9 @@
      * r1 = &ChainingCell,
      * r4PC = callsiteDPC,
      */
-    genDispatchToHandler(cUnit, TEMPLATE_INVOKE_METHOD_NO_OPT);
+    genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+        TEMPLATE_INVOKE_METHOD_NO_OPT_PROF :
+        TEMPLATE_INVOKE_METHOD_NO_OPT);
 #if defined(WITH_JIT_TUNING)
     gDvmJit.invokePolymorphic++;
 #endif
@@ -3053,7 +3063,9 @@
                 opRegRegImm(cUnit, kOpAdd, r2, rpc, 0);
             predictedChainingCell->generic.target = (LIR *) predChainingCell;
 
-            genDispatchToHandler(cUnit, TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN);
+            genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+                TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF :
+                TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN);
 
             /* return through lr - jump to the chaining cell */
             genUnconditionalBranch(cUnit, predChainingCell);
@@ -3156,7 +3168,9 @@
              * r1 = &ChainingCell,
              * r4PC = callsiteDPC,
              */
-            genDispatchToHandler(cUnit, TEMPLATE_INVOKE_METHOD_NO_OPT);
+            genDispatchToHandler(cUnit, gDvmJit.methodTraceSupport ?
+                TEMPLATE_INVOKE_METHOD_NO_OPT_PROF :
+                TEMPLATE_INVOKE_METHOD_NO_OPT);
 #if defined(WITH_JIT_TUNING)
             gDvmJit.invokePolymorphic++;
 #endif
@@ -3166,7 +3180,9 @@
         }
         /* NOP */
         case OP_INVOKE_DIRECT_EMPTY: {
-            return false;
+            if (gDvmJit.methodTraceSupport)
+                genInterpSingleStep(cUnit, mir);
+            break;
         }
         case OP_FILLED_NEW_ARRAY:
         case OP_FILLED_NEW_ARRAY_RANGE:
diff --git a/vm/compiler/template/armv5te-vfp/TemplateOpList.h b/vm/compiler/template/armv5te-vfp/TemplateOpList.h
index 97addfa..0365ba4 100644
--- a/vm/compiler/template/armv5te-vfp/TemplateOpList.h
+++ b/vm/compiler/template/armv5te-vfp/TemplateOpList.h
@@ -58,3 +58,8 @@
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
 JIT_TEMPLATE(PERIODIC_PROFILING)
+JIT_TEMPLATE(RETURN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NO_OPT_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_PREDICTED_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NATIVE_PROF)
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
index f1650d9..b6a8540 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
@@ -1,3 +1,4 @@
+%default { "chaintgt" : ".LinvokeChain" }
     /*
      * For monomorphic callsite, setup the Dalvik frame and return to the
      * Thumb code through the link register to transfer control to the callee
@@ -6,7 +7,7 @@
     @ r0 = methodToCall, r1 = returnCell, r2 = methodToCall->outsSize
     @ rPC = dalvikCallsite, r7 = methodToCall->registersSize
     @ methodToCall is guaranteed to be non-native
-.LinvokeChain:
+$chaintgt:
     ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
     add     r3, r1, #1  @ Thumb addr is odd
@@ -38,7 +39,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S
new file mode 100644
index 0000000..d1be4fd
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S
@@ -0,0 +1,3 @@
+#define TEMPLATE_INLINE_PROFILING
+%include "armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S" { "chaintgt" : ".LinvokeChainProf" }
+#undef TEMPLATE_INLINE_PROFILING
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
index 2a22a22..c3390ed 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
@@ -39,7 +39,7 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     @ r2=methodToCall, r6=rGLUE
     stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
     stmfd   sp!, {r0-r3}                @ preserve r0-r3
@@ -53,7 +53,7 @@
 
     blx     r8                          @ off to the native code
 
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}                @ restore r2 and r6
     @ r0=JNIMethod, r1=rGlue
     mov     lr, pc
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S
new file mode 100644
index 0000000..816277a
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S
@@ -0,0 +1,3 @@
+#define TEMPLATE_INLINE_PROFILING
+%include "armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S"
+#undef TEMPLATE_INLINE_PROFILING
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
index 405065f..72fe910 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
@@ -46,7 +46,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r3}                    @ preserve r0-r3
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S
new file mode 100644
index 0000000..bfea7d9
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S
@@ -0,0 +1,3 @@
+#define TEMPLATE_INLINE_PROFILING
+%include "armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S"
+#undef TEMPLATE_INLINE_PROFILING
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S
index 65b2cc3..6bce7bf 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S
@@ -1,3 +1,4 @@
+%default { "chaintgt" : ".LinvokeChain" }
     /*
      * For polymorphic callsite, check whether the cached class pointer matches
      * the current one. If so setup the Dalvik frame and return to the
@@ -42,7 +43,7 @@
 #endif
     ldreqh  r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
     ldreqh  r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
-    beq     .LinvokeChain   @ predicted chain is valid
+    beq     $chaintgt   @ predicted chain is valid
     ldr     r7, [r3, #offClassObject_vtable] @ r7 <- this->class->vtable
     cmp     r8, #0          @ initialized class or not
     moveq   r1, #0
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S
new file mode 100644
index 0000000..6ca5bdd
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S
@@ -0,0 +1,3 @@
+#define TEMPLATE_INLINE_PROFILING
+%include "armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S" { "chaintgt" : ".LinvokeChainProf" }
+#undef TEMPLATE_INLINE_PROFILING
diff --git a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
index 564b844..d0bbbfc 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
@@ -5,7 +5,7 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve live registers
     mov     r0, r6
     @ r0=rGlue
diff --git a/vm/compiler/template/armv5te/TEMPLATE_RETURN_PROF.S b/vm/compiler/template/armv5te/TEMPLATE_RETURN_PROF.S
new file mode 100644
index 0000000..d7af0bd
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_RETURN_PROF.S
@@ -0,0 +1,3 @@
+#define TEMPLATE_INLINE_PROFILING
+%include "armv5te/TEMPLATE_RETURN.S"
+#undef TEMPLATE_INLINE_PROFILING
diff --git a/vm/compiler/template/armv5te/TemplateOpList.h b/vm/compiler/template/armv5te/TemplateOpList.h
index 663e0df..abfec4b 100644
--- a/vm/compiler/template/armv5te/TemplateOpList.h
+++ b/vm/compiler/template/armv5te/TemplateOpList.h
@@ -43,3 +43,8 @@
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
 JIT_TEMPLATE(PERIODIC_PROFILING)
+JIT_TEMPLATE(RETURN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NO_OPT_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_PREDICTED_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NATIVE_PROF)
diff --git a/vm/compiler/template/armv5te/footer.S b/vm/compiler/template/armv5te/footer.S
index 7b35e8a..0a4e92d 100644
--- a/vm/compiler/template/armv5te/footer.S
+++ b/vm/compiler/template/armv5te/footer.S
@@ -17,12 +17,15 @@
     str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
                                         @ newFp->localRefCookie=top
     mov     r9, r3                      @ r9<- glue->self (preserve)
+    ldr     lr, [rGLUE, #offGlue_pInterpBreak]
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
 
     mov     r2, r0                      @ r2<- methodToCall
+    ldr     lr, [lr]                    @ lr<- set of active profilers
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+    ands    lr, #kSubModeMethodTrace
+    beq     121f                        @ hop if not profiling
     @ r2: methodToCall, r6: rGLUE
     stmfd   sp!, {r2,r6}
     stmfd   sp!, {r0-r3}
@@ -31,16 +34,18 @@
     mov     lr, pc
     ldr     pc, .LdvmFastMethodTraceEnter
     ldmfd   sp!, {r0-r3}
-#endif
 
     mov     lr, pc
     ldr     pc, [r2, #offMethod_nativeFunc]
 
-#if defined(WITH_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}
     mov     lr, pc
     ldr     pc, .LdvmFastNativeMethodTraceExit
-#endif
+    b       212f
+121:
+    mov     lr, pc
+    ldr     pc, [r2, #offMethod_nativeFunc]
+212:
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -114,14 +119,12 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
-#if defined(WITH_INLINE_PROFILING)
 .LdvmFastMethodTraceEnter:
     .word   dvmFastMethodTraceEnter
 .LdvmFastNativeMethodTraceExit:
     .word   dvmFastNativeMethodTraceExit
 .LdvmFastJavaMethodTraceExit:
     .word   dvmFastJavaMethodTraceExit
-#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/armv7-a-neon/TemplateOpList.h b/vm/compiler/template/armv7-a-neon/TemplateOpList.h
index 97addfa..0365ba4 100644
--- a/vm/compiler/template/armv7-a-neon/TemplateOpList.h
+++ b/vm/compiler/template/armv7-a-neon/TemplateOpList.h
@@ -58,3 +58,8 @@
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
 JIT_TEMPLATE(PERIODIC_PROFILING)
+JIT_TEMPLATE(RETURN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NO_OPT_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_PREDICTED_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NATIVE_PROF)
diff --git a/vm/compiler/template/armv7-a/TemplateOpList.h b/vm/compiler/template/armv7-a/TemplateOpList.h
index 97addfa..0365ba4 100644
--- a/vm/compiler/template/armv7-a/TemplateOpList.h
+++ b/vm/compiler/template/armv7-a/TemplateOpList.h
@@ -58,3 +58,8 @@
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
 JIT_TEMPLATE(PERIODIC_PROFILING)
+JIT_TEMPLATE(RETURN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NO_OPT_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_PREDICTED_CHAIN_PROF)
+JIT_TEMPLATE(INVOKE_METHOD_NATIVE_PROF)
diff --git a/vm/compiler/template/config-armv5te-vfp b/vm/compiler/template/config-armv5te-vfp
index 30b9200..774bd96 100644
--- a/vm/compiler/template/config-armv5te-vfp
+++ b/vm/compiler/template/config-armv5te-vfp
@@ -49,6 +49,11 @@
     op TEMPLATE_MONITOR_ENTER armv5te
     op TEMPLATE_MONITOR_ENTER_DEBUG armv5te
     op TEMPLATE_PERIODIC_PROFILING armv5te
+    op TEMPLATE_INVOKE_METHOD_CHAIN_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_NATIVE_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_NO_OPT_PROF armv5te
+    op TEMPLATE_RETURN_PROF armv5te
 
 op-end
 
diff --git a/vm/compiler/template/config-armv7-a b/vm/compiler/template/config-armv7-a
index 1408ca1..9d66e55 100644
--- a/vm/compiler/template/config-armv7-a
+++ b/vm/compiler/template/config-armv7-a
@@ -49,6 +49,11 @@
     op TEMPLATE_MONITOR_ENTER armv5te
     op TEMPLATE_MONITOR_ENTER_DEBUG armv5te
     op TEMPLATE_PERIODIC_PROFILING armv5te
+    op TEMPLATE_INVOKE_METHOD_CHAIN_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_NATIVE_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_NO_OPT_PROF armv5te
+    op TEMPLATE_RETURN_PROF armv5te
 op-end
 
 # "helper" code for C; include if you use any of the C stubs (this generates
diff --git a/vm/compiler/template/config-armv7-a-neon b/vm/compiler/template/config-armv7-a-neon
index 1408ca1..9d66e55 100644
--- a/vm/compiler/template/config-armv7-a-neon
+++ b/vm/compiler/template/config-armv7-a-neon
@@ -49,6 +49,11 @@
     op TEMPLATE_MONITOR_ENTER armv5te
     op TEMPLATE_MONITOR_ENTER_DEBUG armv5te
     op TEMPLATE_PERIODIC_PROFILING armv5te
+    op TEMPLATE_INVOKE_METHOD_CHAIN_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_NATIVE_PROF armv5te
+    op TEMPLATE_INVOKE_METHOD_NO_OPT_PROF armv5te
+    op TEMPLATE_RETURN_PROF armv5te
 op-end
 
 # "helper" code for C; include if you use any of the C stubs (this generates
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index a107b24..cd53096 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -166,7 +166,7 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve live registers
     mov     r0, r6
     @ r0=rGlue
@@ -271,7 +271,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r3}                    @ preserve r0-r3
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -331,7 +331,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -453,7 +453,7 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     @ r2=methodToCall, r6=rGLUE
     stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
     stmfd   sp!, {r0-r3}                @ preserve r0-r3
@@ -467,7 +467,7 @@
 
     blx     r8                          @ off to the native code
 
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}                @ restore r2 and r6
     @ r0=JNIMethod, r1=rGlue
     mov     lr, pc
@@ -1505,6 +1505,370 @@
      blx    r0
      bx     r4
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_RETURN_PROF
+dvmCompiler_TEMPLATE_RETURN_PROF:
+/* File: armv5te/TEMPLATE_RETURN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_RETURN.S */
+    /*
+     * Unwind a frame from the Dalvik stack for compiled OP_RETURN_XXX.
+     * If the stored value in returnAddr
+     * is non-zero, the caller is compiled by the JIT thus return to the
+     * address in the code cache following the invoke instruction. Otherwise
+     * return to the special dvmJitToInterpNoChain entry point.
+     */
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
+    SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
+    ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+#if !defined(WITH_SELF_VERIFICATION)
+    ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
+#else
+    mov     r9, #0                      @ disable chaining
+#endif
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
+                                        @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
+    cmp     r2, #0                      @ break frame?
+#if !defined(WITH_SELF_VERIFICATION)
+    beq     1f                          @ bail to interpreter
+#else
+    blxeq   lr                          @ punt to interpreter and compare state
+#endif
+    ldr     r1, .LdvmJitToInterpNoChainNoProfile @ defined in footer.S
+    mov     rFP, r10                    @ publish new FP
+    ldr     r10, [r2, #offMethod_clazz] @ r10<- method->clazz
+    ldr     r8, [r8]                    @ r8<- suspendCount
+
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r0, [r10, #offClassObject_pDvmDex] @ r0<- method->clazz->pDvmDex
+    str     rFP, [r3, #offThread_curFrame] @ self->curFrame = fp
+    add     rPC, rPC, #6                @ publish new rPC (advance 6 bytes)
+    str     r0, [rGLUE, #offGlue_methodClassDex]
+    cmp     r8, #0                      @ check the suspendCount
+    movne   r9, #0                      @ clear the chaining cell address
+    str     r9, [r3, #offThread_inJitCodeCache] @ in code cache or not
+    cmp     r9, #0                      @ chaining cell exists?
+    blxne   r9                          @ jump to the chaining cell
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1                      @ callsite is interpreted
+1:
+    stmia   rGLUE, {rPC, rFP}           @ SAVE_PC_FP_TO_GLUE()
+    ldr     r2, .LdvmMterpStdBail       @ defined in footer.S
+    mov     r1, #0                      @ changeInterp = false
+    mov     r0, rGLUE                   @ Expecting rGLUE in r0
+    blx     r2                          @ exit the interpreter
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S */
+    /*
+     * For polymorphic callsites - setup the Dalvik frame and load Dalvik PC
+     * into rPC then jump to dvmJitToInterpNoChain to dispatch the
+     * runtime-resolved callee.
+     */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    ldrh    r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldrh    r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    ldr     r10, [r0, #offMethod_accessFlags] @ r10<- methodToCall->accessFlags
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- methodToCall->insns
+
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    lr                          @ bail to the interpreter
+    tst     r10, #ACC_NATIVE
+#if !defined(WITH_SELF_VERIFICATION)
+    bne     .LinvokeNative
+#else
+    bxne    lr                          @ bail to the interpreter
+#endif
+
+    ldr     r10, .LdvmJitToInterpTraceSelectNoChain
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
+
+    @ Start executing the callee
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kInlineCacheMiss
+#endif
+    mov     pc, r10                         @ dvmJitToInterpTraceSelectNoChain
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S */
+    /*
+     * For monomorphic callsite, setup the Dalvik frame and return to the
+     * Thumb code through the link register to transfer control to the callee
+     * method through a dedicated chaining cell.
+     */
+    @ r0 = methodToCall, r1 = returnCell, r2 = methodToCall->outsSize
+    @ rPC = dalvikCallsite, r7 = methodToCall->registersSize
+    @ methodToCall is guaranteed to be non-native
+.LinvokeChainProf:
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    add     r12, lr, #2                 @ setup the punt-to-interp address
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    r12                         @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    r12                         @ bail to the interpreter
+
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
+
+    bx      lr                              @ return to the callee-chaining cell
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
+    /*
+     * For polymorphic callsite, check whether the cached class pointer matches
+     * the current one. If so setup the Dalvik frame and return to the
+     * Thumb code through the link register to transfer control to the callee
+     * method through a dedicated chaining cell.
+     *
+     * The predicted chaining cell is declared in ArmLIR.h with the
+     * following layout:
+     *
+     *  typedef struct PredictedChainingCell {
+     *      u4 branch;
+     *      const ClassObject *clazz;
+     *      const Method *method;
+     *      u4 counter;
+     *  } PredictedChainingCell;
+     *
+     * Upon returning to the callsite:
+     *    - lr  : to branch to the chaining cell
+     *    - lr+2: to punt to the interpreter
+     *    - lr+4: to fully resolve the callee and may rechain.
+     *            r3 <- class
+     *            r9 <- counter
+     */
+    @ r0 = this, r1 = returnCell, r2 = predictedChainCell, rPC = dalvikCallsite
+    ldr     r3, [r0, #offObject_clazz]  @ r3 <- this->class
+    ldr     r8, [r2, #4]    @ r8 <- predictedChainCell->clazz
+    ldr     r0, [r2, #8]    @ r0 <- predictedChainCell->method
+    ldr     r9, [rGLUE, #offGlue_icRechainCount]   @ r1 <- shared rechainCount
+    cmp     r3, r8          @ predicted class == actual class?
+#if defined(WITH_JIT_TUNING)
+    ldr     r7, .LdvmICHitCount
+#if defined(WORKAROUND_CORTEX_A9_745320)
+    /* Don't use conditional loads if the HW defect exists */
+    bne     101f
+    ldr     r10, [r7, #0]
+101:
+#else
+    ldreq   r10, [r7, #0]
+#endif
+    add     r10, r10, #1
+    streq   r10, [r7, #0]
+#endif
+    ldreqh  r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldreqh  r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    beq     .LinvokeChainProf   @ predicted chain is valid
+    ldr     r7, [r3, #offClassObject_vtable] @ r7 <- this->class->vtable
+    cmp     r8, #0          @ initialized class or not
+    moveq   r1, #0
+    subne   r1, r9, #1      @ count--
+    strne   r1, [rGLUE, #offGlue_icRechainCount]   @ write back to InterpState
+    add     lr, lr, #4      @ return to fully-resolve landing pad
+    /*
+     * r1 <- count
+     * r2 <- &predictedChainCell
+     * r3 <- this->class
+     * r4 <- dPC
+     * r7 <- this->class->vtable
+     */
+    bx      lr
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    @ r7 = methodToCall->registersSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    ldr     r8, [r0, #offMethod_nativeFunc] @ r8<- method->nativeFunc
+#if !defined(WITH_SELF_VERIFICATION)
+    bxne    lr                          @ bail to the interpreter
+#else
+    bx      lr                          @ bail to interpreter unconditionally
+#endif
+
+    @ go ahead and transfer control to the native code
+    ldr     r9, [r3, #offThread_jniLocal_topCookie] @ r9<- thread->localRef->...
+    mov     r2, #0
+    str     r1, [r3, #offThread_curFrame]   @ self->curFrame = newFp
+    str     r2, [r3, #offThread_inJitCodeCache] @ not in the jit code cache
+    str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
+                                        @ newFp->localRefCookie=top
+    mov     r9, r3                      @ r9<- glue->self (preserve)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
+
+    mov     r2, r0                      @ r2<- methodToCall
+    mov     r0, r1                      @ r0<- newFP
+    add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(TEMPLATE_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
+
+    blx     r8                          @ off to the native code
+
+#if defined(TEMPLATE_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastNativeMethodTraceExit
+#endif
+    @ native return; r9=self, r10=newSaveArea
+    @ equivalent to dvmPopJniLocals
+    ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
+    ldr     r0, [r10, #offStackSaveArea_localRefCookie] @ r0<- saved->top
+    ldr     r1, [r9, #offThread_exception] @ check for exception
+    str     rFP, [r9, #offThread_curFrame]  @ self->curFrame = fp
+    cmp     r1, #0                      @ null?
+    str     r0, [r9, #offThread_jniLocal_topCookie] @ new top <- old top
+    ldr     r0, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+
+    @ r0 = dalvikCallsitePC
+    bne     .LhandleException           @ no, handle exception
+
+    str     r2, [r9, #offThread_inJitCodeCache] @ set the mode properly
+    cmp     r2, #0                      @ return chaining cell still exists?
+    bxne    r2                          @ yes - go ahead
+
+    @ continue executing the next instruction through the interpreter
+    ldr     r1, .LdvmJitToInterpTraceSelectNoChain @ defined in footer.S
+    add     rPC, r0, #6                 @ reconstruct new rPC (advance 6 bytes)
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1
+
+#undef TEMPLATE_INLINE_PROFILING
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1526,12 +1890,15 @@
     str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
                                         @ newFp->localRefCookie=top
     mov     r9, r3                      @ r9<- glue->self (preserve)
+    ldr     lr, [rGLUE, #offGlue_pInterpBreak]
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
 
     mov     r2, r0                      @ r2<- methodToCall
+    ldr     lr, [lr]                    @ lr<- set of active profilers
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+    ands    lr, #kSubModeMethodTrace
+    beq     121f                        @ hop if not profiling
     @ r2: methodToCall, r6: rGLUE
     stmfd   sp!, {r2,r6}
     stmfd   sp!, {r0-r3}
@@ -1540,16 +1907,18 @@
     mov     lr, pc
     ldr     pc, .LdvmFastMethodTraceEnter
     ldmfd   sp!, {r0-r3}
-#endif
 
     mov     lr, pc
     ldr     pc, [r2, #offMethod_nativeFunc]
 
-#if defined(WITH_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}
     mov     lr, pc
     ldr     pc, .LdvmFastNativeMethodTraceExit
-#endif
+    b       212f
+121:
+    mov     lr, pc
+    ldr     pc, [r2, #offMethod_nativeFunc]
+212:
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1623,14 +1992,12 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
-#if defined(WITH_INLINE_PROFILING)
 .LdvmFastMethodTraceEnter:
     .word   dvmFastMethodTraceEnter
 .LdvmFastNativeMethodTraceExit:
     .word   dvmFastNativeMethodTraceExit
 .LdvmFastJavaMethodTraceExit:
     .word   dvmFastJavaMethodTraceExit
-#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index a6a0e9f..57d0aff 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -166,7 +166,7 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve live registers
     mov     r0, r6
     @ r0=rGlue
@@ -271,7 +271,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r3}                    @ preserve r0-r3
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -331,7 +331,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -453,7 +453,7 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     @ r2=methodToCall, r6=rGLUE
     stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
     stmfd   sp!, {r0-r3}                @ preserve r0-r3
@@ -467,7 +467,7 @@
 
     blx     r8                          @ off to the native code
 
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}                @ restore r2 and r6
     @ r0=JNIMethod, r1=rGlue
     mov     lr, pc
@@ -1236,6 +1236,370 @@
      blx    r0
      bx     r4
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_RETURN_PROF
+dvmCompiler_TEMPLATE_RETURN_PROF:
+/* File: armv5te/TEMPLATE_RETURN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_RETURN.S */
+    /*
+     * Unwind a frame from the Dalvik stack for compiled OP_RETURN_XXX.
+     * If the stored value in returnAddr
+     * is non-zero, the caller is compiled by the JIT thus return to the
+     * address in the code cache following the invoke instruction. Otherwise
+     * return to the special dvmJitToInterpNoChain entry point.
+     */
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
+    SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
+    ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+#if !defined(WITH_SELF_VERIFICATION)
+    ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
+#else
+    mov     r9, #0                      @ disable chaining
+#endif
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
+                                        @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
+    cmp     r2, #0                      @ break frame?
+#if !defined(WITH_SELF_VERIFICATION)
+    beq     1f                          @ bail to interpreter
+#else
+    blxeq   lr                          @ punt to interpreter and compare state
+#endif
+    ldr     r1, .LdvmJitToInterpNoChainNoProfile @ defined in footer.S
+    mov     rFP, r10                    @ publish new FP
+    ldr     r10, [r2, #offMethod_clazz] @ r10<- method->clazz
+    ldr     r8, [r8]                    @ r8<- suspendCount
+
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r0, [r10, #offClassObject_pDvmDex] @ r0<- method->clazz->pDvmDex
+    str     rFP, [r3, #offThread_curFrame] @ self->curFrame = fp
+    add     rPC, rPC, #6                @ publish new rPC (advance 6 bytes)
+    str     r0, [rGLUE, #offGlue_methodClassDex]
+    cmp     r8, #0                      @ check the suspendCount
+    movne   r9, #0                      @ clear the chaining cell address
+    str     r9, [r3, #offThread_inJitCodeCache] @ in code cache or not
+    cmp     r9, #0                      @ chaining cell exists?
+    blxne   r9                          @ jump to the chaining cell
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1                      @ callsite is interpreted
+1:
+    stmia   rGLUE, {rPC, rFP}           @ SAVE_PC_FP_TO_GLUE()
+    ldr     r2, .LdvmMterpStdBail       @ defined in footer.S
+    mov     r1, #0                      @ changeInterp = false
+    mov     r0, rGLUE                   @ Expecting rGLUE in r0
+    blx     r2                          @ exit the interpreter
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S */
+    /*
+     * For polymorphic callsites - setup the Dalvik frame and load Dalvik PC
+     * into rPC then jump to dvmJitToInterpNoChain to dispatch the
+     * runtime-resolved callee.
+     */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    ldrh    r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldrh    r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    ldr     r10, [r0, #offMethod_accessFlags] @ r10<- methodToCall->accessFlags
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- methodToCall->insns
+
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    lr                          @ bail to the interpreter
+    tst     r10, #ACC_NATIVE
+#if !defined(WITH_SELF_VERIFICATION)
+    bne     .LinvokeNative
+#else
+    bxne    lr                          @ bail to the interpreter
+#endif
+
+    ldr     r10, .LdvmJitToInterpTraceSelectNoChain
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
+
+    @ Start executing the callee
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kInlineCacheMiss
+#endif
+    mov     pc, r10                         @ dvmJitToInterpTraceSelectNoChain
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S */
+    /*
+     * For monomorphic callsite, setup the Dalvik frame and return to the
+     * Thumb code through the link register to transfer control to the callee
+     * method through a dedicated chaining cell.
+     */
+    @ r0 = methodToCall, r1 = returnCell, r2 = methodToCall->outsSize
+    @ rPC = dalvikCallsite, r7 = methodToCall->registersSize
+    @ methodToCall is guaranteed to be non-native
+.LinvokeChainProf:
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    add     r12, lr, #2                 @ setup the punt-to-interp address
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    r12                         @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    r12                         @ bail to the interpreter
+
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
+
+    bx      lr                              @ return to the callee-chaining cell
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
+    /*
+     * For a polymorphic callsite, check whether the cached class pointer
+     * matches the current one. If so, set up the Dalvik frame and return to
+     * the Thumb code through the link register, transferring control to the
+     * callee method through a dedicated chaining cell.
+     *
+     * The predicted chaining cell is declared in ArmLIR.h with the
+     * following layout:
+     *
+     *  typedef struct PredictedChainingCell {
+     *      u4 branch;
+     *      const ClassObject *clazz;
+     *      const Method *method;
+     *      u4 counter;
+     *  } PredictedChainingCell;
+     *
+     * Upon returning to the callsite:
+     *    - lr  : to branch to the chaining cell
+     *    - lr+2: to punt to the interpreter
+     *    - lr+4: to fully resolve the callee and possibly rechain.
+     *            r3 <- class
+     *            r9 <- counter
+     */
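+    /*
+     * Illustrative sketch of the callsite layout that the three return
+     * targets above assume (hypothetical, not the literal code the JIT
+     * emits):
+     *
+     *     blx     <this template>
+     *     b       <chaining cell>     @ lr  : predicted class matched
+     *     b       <punt to interp>    @ lr+2: bail to the interpreter
+     *     b       <full resolve>      @ lr+4: resolve callee, maybe rechain
+     */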
+    @ r0 = this, r1 = returnCell, r2 = predictedChainCell, rPC = dalvikCallsite
+    ldr     r3, [r0, #offObject_clazz]  @ r3 <- this->class
+    ldr     r8, [r2, #4]    @ r8 <- predictedChainCell->clazz
+    ldr     r0, [r2, #8]    @ r0 <- predictedChainCell->method
+    ldr     r9, [rGLUE, #offGlue_icRechainCount]   @ r9 <- shared rechainCount
+    cmp     r3, r8          @ predicted class == actual class?
+#if defined(WITH_JIT_TUNING)
+    ldr     r7, .LdvmICHitCount
+#if defined(WORKAROUND_CORTEX_A9_745320)
+    /* Don't use conditional loads if the HW defect exists */
+    bne     101f
+    ldr     r10, [r7, #0]
+101:
+#else
+    ldreq   r10, [r7, #0]
+#endif
+    add     r10, r10, #1
+    streq   r10, [r7, #0]
+#endif
+    ldreqh  r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldreqh  r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    beq     .LinvokeChainProf   @ predicted chain is valid
+    ldr     r7, [r3, #offClassObject_vtable] @ r7 <- this->class->vtable
+    cmp     r8, #0          @ initialized class or not
+    moveq   r1, #0
+    subne   r1, r9, #1      @ count--
+    strne   r1, [rGLUE, #offGlue_icRechainCount]   @ write back to InterpState
+    add     lr, lr, #4      @ return to fully-resolve landing pad
+    /*
+     * r1 <- count
+     * r2 <- &predictedChainCell
+     * r3 <- this->class
+     * r4 <- dPC
+     * r7 <- this->class->vtable
+     */
+    bx      lr
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    @ r7 = methodToCall->registersSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    ldr     r8, [r0, #offMethod_nativeFunc] @ r8<- method->nativeFunc
+#if !defined(WITH_SELF_VERIFICATION)
+    bxne    lr                          @ bail to the interpreter
+#else
+    bx      lr                          @ bail to interpreter unconditionally
+#endif
+
+    @ go ahead and transfer control to the native code
+    ldr     r9, [r3, #offThread_jniLocal_topCookie] @ r9<- thread->localRef->...
+    mov     r2, #0
+    str     r1, [r3, #offThread_curFrame]   @ self->curFrame = newFp
+    str     r2, [r3, #offThread_inJitCodeCache] @ not in the jit code cache
+    str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
+                                        @ newFp->localRefCookie=top
+    mov     r9, r3                      @ r9<- glue->self (preserve)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
+
+    mov     r2, r0                      @ r2<- methodToCall
+    mov     r0, r1                      @ r0<- newFP
+    add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(TEMPLATE_INLINE_PROFILING)
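+    @ Save methodToCall/rGLUE across the JNI call; the {r2,r6} pair pushed
+    @ here is popped into r0/r1 after the blx below as the trace-exit args.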
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
+
+    blx     r8                          @ off to the native code
+
+#if defined(TEMPLATE_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ pop saved {r2,r6} into r0/r1
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastNativeMethodTraceExit
+#endif
+    @ native return; r9=self, r10=newSaveArea
+    @ equivalent to dvmPopJniLocals
+    ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
+    ldr     r0, [r10, #offStackSaveArea_localRefCookie] @ r0<- saved->top
+    ldr     r1, [r9, #offThread_exception] @ check for exception
+    str     rFP, [r9, #offThread_curFrame]  @ self->curFrame = fp
+    cmp     r1, #0                      @ null?
+    str     r0, [r9, #offThread_jniLocal_topCookie] @ new top <- old top
+    ldr     r0, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+
+    @ r0 = dalvikCallsitePC
+    bne     .LhandleException           @ exception pending - handle it
+
+    str     r2, [r9, #offThread_inJitCodeCache] @ set the mode properly
+    cmp     r2, #0                      @ return chaining cell still exists?
+    bxne    r2                          @ yes - go ahead
+
+    @ continue executing the next instruction through the interpreter
+    ldr     r1, .LdvmJitToInterpTraceSelectNoChain @ defined in footer.S
+    add     rPC, r0, #6                 @ reconstruct new rPC (advance 6 bytes)
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1
+
+#undef TEMPLATE_INLINE_PROFILING
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1257,12 +1621,15 @@
     str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
                                         @ newFp->localRefCookie=top
     mov     r9, r3                      @ r9<- glue->self (preserve)
+    ldr     lr, [rGLUE, #offGlue_pInterpBreak]  @ lr<- &interpBreak
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
 
     mov     r2, r0                      @ r2<- methodToCall
+    ldr     lr, [lr]                    @ lr<- set of active profilers
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+    ands    lr, #kSubModeMethodTrace
+    beq     121f                        @ skip trace-entry code if not profiling
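+    @ Method tracing is now checked at run time via interpBreak rather than
+    @ the compile-time WITH_INLINE_PROFILING guard; 121f is the untraced
+    @ native-call path and 212f rejoins the common code.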
     @ r2: methodToCall, r6: rGLUE
     stmfd   sp!, {r2,r6}
     stmfd   sp!, {r0-r3}
@@ -1271,16 +1638,18 @@
     mov     lr, pc
     ldr     pc, .LdvmFastMethodTraceEnter
     ldmfd   sp!, {r0-r3}
-#endif
 
     mov     lr, pc
     ldr     pc, [r2, #offMethod_nativeFunc]
 
-#if defined(WITH_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}
     mov     lr, pc
     ldr     pc, .LdvmFastNativeMethodTraceExit
-#endif
+    b       212f
+121:
+    mov     lr, pc
+    ldr     pc, [r2, #offMethod_nativeFunc]
+212:
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1354,14 +1723,12 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
-#if defined(WITH_INLINE_PROFILING)
 .LdvmFastMethodTraceEnter:
     .word   dvmFastMethodTraceEnter
 .LdvmFastNativeMethodTraceExit:
     .word   dvmFastNativeMethodTraceExit
 .LdvmFastJavaMethodTraceExit:
     .word   dvmFastJavaMethodTraceExit
-#endif
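+@ The trace-helper literals above are now emitted unconditionally: the
+@ _PROF templates and the runtime interpBreak check may reference them
+@ in any build.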
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
index e4ed30b..10541d3 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
@@ -166,7 +166,7 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve live registers
     mov     r0, r6
     @ r0=rGlue
@@ -271,7 +271,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r3}                    @ preserve r0-r3
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -331,7 +331,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -453,7 +453,7 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     @ r2=methodToCall, r6=rGLUE
     stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
     stmfd   sp!, {r0-r3}                @ preserve r0-r3
@@ -467,7 +467,7 @@
 
     blx     r8                          @ off to the native code
 
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}                @ restore r2 and r6
     @ r0=JNIMethod, r1=rGlue
     mov     lr, pc
@@ -1505,6 +1505,370 @@
      blx    r0
      bx     r4
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_RETURN_PROF
+dvmCompiler_TEMPLATE_RETURN_PROF:
+/* File: armv5te/TEMPLATE_RETURN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
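+/*
+ * This _PROF variant re-expands the base template below with
+ * TEMPLATE_INLINE_PROFILING defined, which assembles in the #if-guarded
+ * method-trace enter/exit calls; the matching #undef at the end of the
+ * template restores the plain, non-profiling expansion.
+ */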
+/* File: armv5te/TEMPLATE_RETURN.S */
+    /*
+     * Unwind a frame from the Dalvik stack for compiled OP_RETURN_XXX.
+     * If the stored value in returnAddr is non-zero, the caller was
+     * compiled by the JIT, so return to the address in the code cache
+     * following the invoke instruction. Otherwise
+     * return to the special dvmJitToInterpNoChain entry point.
+     */
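+    @ In this _PROF expansion, record the method exit with the profiler
+    @ (dvmFastJavaMethodTraceExit) before the frame is unwound below.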
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
+    SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
+    ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+#if !defined(WITH_SELF_VERIFICATION)
+    ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
+#else
+    mov     r9, #0                      @ disable chaining
+#endif
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
+                                        @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
+    cmp     r2, #0                      @ break frame?
+#if !defined(WITH_SELF_VERIFICATION)
+    beq     1f                          @ bail to interpreter
+#else
+    blxeq   lr                          @ punt to interpreter and compare state
+#endif
+    ldr     r1, .LdvmJitToInterpNoChainNoProfile @ defined in footer.S
+    mov     rFP, r10                    @ publish new FP
+    ldr     r10, [r2, #offMethod_clazz] @ r10<- method->clazz
+    ldr     r8, [r8]                    @ r8<- suspendCount
+
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r0, [r10, #offClassObject_pDvmDex] @ r0<- method->clazz->pDvmDex
+    str     rFP, [r3, #offThread_curFrame] @ self->curFrame = fp
+    add     rPC, rPC, #6                @ publish new rPC (advance 6 bytes)
+    str     r0, [rGLUE, #offGlue_methodClassDex]
+    cmp     r8, #0                      @ check the suspendCount
+    movne   r9, #0                      @ clear the chaining cell address
+    str     r9, [r3, #offThread_inJitCodeCache] @ in code cache or not
+    cmp     r9, #0                      @ chaining cell exists?
+    blxne   r9                          @ jump to the chaining cell
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1                      @ callsite is interpreted
+1:
+    stmia   rGLUE, {rPC, rFP}           @ SAVE_PC_FP_TO_GLUE()
+    ldr     r2, .LdvmMterpStdBail       @ defined in footer.S
+    mov     r1, #0                      @ changeInterp = false
+    mov     r0, rGLUE                   @ Expecting rGLUE in r0
+    blx     r2                          @ exit the interpreter
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S */
+    /*
+     * For polymorphic callsites, set up the Dalvik frame, load the Dalvik PC
+     * into rPC, then jump to dvmJitToInterpNoChain to dispatch the
+     * runtime-resolved callee.
+     */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    ldrh    r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldrh    r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    ldr     r10, [r0, #offMethod_accessFlags] @ r10<- methodToCall->accessFlags
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- methodToCall->insns
+
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    lr                          @ bail to the interpreter
+    tst     r10, #ACC_NATIVE
+#if !defined(WITH_SELF_VERIFICATION)
+    bne     .LinvokeNative
+#else
+    bxne    lr                          @ bail to the interpreter
+#endif
+
+    ldr     r10, .LdvmJitToInterpTraceSelectNoChain
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
+
+    @ Start executing the callee
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kInlineCacheMiss
+#endif
+    mov     pc, r10                         @ dvmJitToInterpTraceSelectNoChain
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S */
+    /*
+     * For a monomorphic callsite, set up the Dalvik frame and return to the
+     * Thumb code through the link register, transferring control to the
+     * callee method through a dedicated chaining cell.
+     */
+    @ r0 = methodToCall, r1 = returnCell, r2 = methodToCall->outsSize
+    @ rPC = dalvikCallsite, r7 = methodToCall->registersSize
+    @ methodToCall is guaranteed to be non-native
+.LinvokeChainProf:
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    add     r12, lr, #2                 @ set up the punt-to-interp address
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    r12                         @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    r12                         @ bail to the interpreter
+
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
+
+    bx      lr                              @ return to the callee-chaining cell
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
+    /*
+     * For a polymorphic callsite, check whether the cached class pointer
+     * matches the current one. If so, set up the Dalvik frame and return to
+     * the Thumb code through the link register, transferring control to the
+     * callee method through a dedicated chaining cell.
+     *
+     * The predicted chaining cell is declared in ArmLIR.h with the
+     * following layout:
+     *
+     *  typedef struct PredictedChainingCell {
+     *      u4 branch;
+     *      const ClassObject *clazz;
+     *      const Method *method;
+     *      u4 counter;
+     *  } PredictedChainingCell;
+     *
+     * Upon returning to the callsite:
+     *    - lr  : to branch to the chaining cell
+     *    - lr+2: to punt to the interpreter
+     *    - lr+4: to fully resolve the callee and possibly rechain.
+     *            r3 <- class
+     *            r9 <- counter
+     */
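+    /*
+     * Illustrative sketch of the callsite layout that the three return
+     * targets above assume (hypothetical, not the literal code the JIT
+     * emits):
+     *
+     *     blx     <this template>
+     *     b       <chaining cell>     @ lr  : predicted class matched
+     *     b       <punt to interp>    @ lr+2: bail to the interpreter
+     *     b       <full resolve>      @ lr+4: resolve callee, maybe rechain
+     */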
+    @ r0 = this, r1 = returnCell, r2 = predictedChainCell, rPC = dalvikCallsite
+    ldr     r3, [r0, #offObject_clazz]  @ r3 <- this->class
+    ldr     r8, [r2, #4]    @ r8 <- predictedChainCell->clazz
+    ldr     r0, [r2, #8]    @ r0 <- predictedChainCell->method
+    ldr     r9, [rGLUE, #offGlue_icRechainCount]   @ r9 <- shared rechainCount
+    cmp     r3, r8          @ predicted class == actual class?
+#if defined(WITH_JIT_TUNING)
+    ldr     r7, .LdvmICHitCount
+#if defined(WORKAROUND_CORTEX_A9_745320)
+    /* Don't use conditional loads if the HW defect exists */
+    bne     101f
+    ldr     r10, [r7, #0]
+101:
+#else
+    ldreq   r10, [r7, #0]
+#endif
+    add     r10, r10, #1
+    streq   r10, [r7, #0]
+#endif
+    ldreqh  r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldreqh  r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    beq     .LinvokeChainProf   @ predicted chain is valid
+    ldr     r7, [r3, #offClassObject_vtable] @ r7 <- this->class->vtable
+    cmp     r8, #0          @ initialized class or not
+    moveq   r1, #0
+    subne   r1, r9, #1      @ count--
+    strne   r1, [rGLUE, #offGlue_icRechainCount]   @ write back to InterpState
+    add     lr, lr, #4      @ return to fully-resolve landing pad
+    /*
+     * r1 <- count
+     * r2 <- &predictedChainCell
+     * r3 <- this->class
+     * r4 <- dPC
+     * r7 <- this->class->vtable
+     */
+    bx      lr
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    @ r7 = methodToCall->registersSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    ldr     r8, [r0, #offMethod_nativeFunc] @ r8<- method->nativeFunc
+#if !defined(WITH_SELF_VERIFICATION)
+    bxne    lr                          @ bail to the interpreter
+#else
+    bx      lr                          @ bail to interpreter unconditionally
+#endif
+
+    @ go ahead and transfer control to the native code
+    ldr     r9, [r3, #offThread_jniLocal_topCookie] @ r9<- thread->localRef->...
+    mov     r2, #0
+    str     r1, [r3, #offThread_curFrame]   @ self->curFrame = newFp
+    str     r2, [r3, #offThread_inJitCodeCache] @ not in the jit code cache
+    str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
+                                        @ newFp->localRefCookie=top
+    mov     r9, r3                      @ r9<- glue->self (preserve)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
+
+    mov     r2, r0                      @ r2<- methodToCall
+    mov     r0, r1                      @ r0<- newFP
+    add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(TEMPLATE_INLINE_PROFILING)
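+    @ Save methodToCall/rGLUE across the JNI call; the {r2,r6} pair pushed
+    @ here is popped into r0/r1 after the blx below as the trace-exit args.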
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
+
+    blx     r8                          @ off to the native code
+
+#if defined(TEMPLATE_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ pop saved {r2,r6} into r0/r1
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastNativeMethodTraceExit
+#endif
+    @ native return; r9=self, r10=newSaveArea
+    @ equivalent to dvmPopJniLocals
+    ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
+    ldr     r0, [r10, #offStackSaveArea_localRefCookie] @ r0<- saved->top
+    ldr     r1, [r9, #offThread_exception] @ check for exception
+    str     rFP, [r9, #offThread_curFrame]  @ self->curFrame = fp
+    cmp     r1, #0                      @ null?
+    str     r0, [r9, #offThread_jniLocal_topCookie] @ new top <- old top
+    ldr     r0, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+
+    @ r0 = dalvikCallsitePC
+    bne     .LhandleException           @ exception pending - handle it
+
+    str     r2, [r9, #offThread_inJitCodeCache] @ set the mode properly
+    cmp     r2, #0                      @ return chaining cell still exists?
+    bxne    r2                          @ yes - go ahead
+
+    @ continue executing the next instruction through the interpreter
+    ldr     r1, .LdvmJitToInterpTraceSelectNoChain @ defined in footer.S
+    add     rPC, r0, #6                 @ reconstruct new rPC (advance 6 bytes)
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1
+
+#undef TEMPLATE_INLINE_PROFILING
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1526,12 +1890,15 @@
     str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
                                         @ newFp->localRefCookie=top
     mov     r9, r3                      @ r9<- glue->self (preserve)
+    ldr     lr, [rGLUE, #offGlue_pInterpBreak]  @ lr<- &interpBreak
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
 
     mov     r2, r0                      @ r2<- methodToCall
+    ldr     lr, [lr]                    @ lr<- set of active profilers
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+    ands    lr, #kSubModeMethodTrace
+    beq     121f                        @ skip trace-entry code if not profiling
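+    @ Method tracing is now checked at run time via interpBreak rather than
+    @ the compile-time WITH_INLINE_PROFILING guard; 121f is the untraced
+    @ native-call path and 212f rejoins the common code.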
     @ r2: methodToCall, r6: rGLUE
     stmfd   sp!, {r2,r6}
     stmfd   sp!, {r0-r3}
@@ -1540,16 +1907,18 @@
     mov     lr, pc
     ldr     pc, .LdvmFastMethodTraceEnter
     ldmfd   sp!, {r0-r3}
-#endif
 
     mov     lr, pc
     ldr     pc, [r2, #offMethod_nativeFunc]
 
-#if defined(WITH_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}
     mov     lr, pc
     ldr     pc, .LdvmFastNativeMethodTraceExit
-#endif
+    b       212f
+121:
+    mov     lr, pc
+    ldr     pc, [r2, #offMethod_nativeFunc]
+212:
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1623,14 +1992,12 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
-#if defined(WITH_INLINE_PROFILING)
 .LdvmFastMethodTraceEnter:
     .word   dvmFastMethodTraceEnter
 .LdvmFastNativeMethodTraceExit:
     .word   dvmFastNativeMethodTraceExit
 .LdvmFastJavaMethodTraceExit:
     .word   dvmFastJavaMethodTraceExit
-#endif
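+@ The trace-helper literals above are now emitted unconditionally: the
+@ _PROF templates and the runtime interpBreak check may reference them
+@ in any build.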
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index fc26b3a..d584744 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -166,7 +166,7 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve live registers
     mov     r0, r6
     @ r0=rGlue
@@ -271,7 +271,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r3}                    @ preserve r0-r3
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -331,7 +331,7 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
     mov     r1, r6
     @ r0=methodToCall, r1=rGlue
@@ -453,7 +453,7 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     @ r2=methodToCall, r6=rGLUE
     stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
     stmfd   sp!, {r0-r3}                @ preserve r0-r3
@@ -467,7 +467,7 @@
 
     blx     r8                          @ off to the native code
 
-#if defined(WITH_INLINE_PROFILING)
+#if defined(TEMPLATE_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}                @ restore r2 and r6
     @ r0=JNIMethod, r1=rGlue
     mov     lr, pc
@@ -1505,6 +1505,370 @@
      blx    r0
      bx     r4
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_RETURN_PROF
+dvmCompiler_TEMPLATE_RETURN_PROF:
+/* File: armv5te/TEMPLATE_RETURN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
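+/*
+ * This _PROF variant re-expands the base template below with
+ * TEMPLATE_INLINE_PROFILING defined, which assembles in the #if-guarded
+ * method-trace enter/exit calls; the matching #undef at the end of the
+ * template restores the plain, non-profiling expansion.
+ */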
+/* File: armv5te/TEMPLATE_RETURN.S */
+    /*
+     * Unwind a frame from the Dalvik stack for compiled OP_RETURN_XXX.
+     * If the stored value in returnAddr is non-zero, the caller was
+     * compiled by the JIT, so return to the address in the code cache
+     * following the invoke instruction. Otherwise
+     * return to the special dvmJitToInterpNoChain entry point.
+     */
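+    @ In this _PROF expansion, record the method exit with the profiler
+    @ (dvmFastJavaMethodTraceExit) before the frame is unwound below.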
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastJavaMethodTraceExit
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
+    SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
+    ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    ldr     rPC, [r0, #offStackSaveArea_savedPc] @ rPC<- saveArea->savedPc
+#if !defined(WITH_SELF_VERIFICATION)
+    ldr     r9,  [r0, #offStackSaveArea_returnAddr] @ r9<- chaining cell ret
+#else
+    mov     r9, #0                      @ disable chaining
+#endif
+    ldr     r2, [r10, #(offStackSaveArea_method - sizeofStackSaveArea)]
+                                        @ r2<- method we're returning to
+    ldr     r3, [rGLUE, #offGlue_self]  @ r3<- glue->self
+    cmp     r2, #0                      @ break frame?
+#if !defined(WITH_SELF_VERIFICATION)
+    beq     1f                          @ bail to interpreter
+#else
+    blxeq   lr                          @ punt to interpreter and compare state
+#endif
+    ldr     r1, .LdvmJitToInterpNoChainNoProfile @ defined in footer.S
+    mov     rFP, r10                    @ publish new FP
+    ldr     r10, [r2, #offMethod_clazz] @ r10<- method->clazz
+    ldr     r8, [r8]                    @ r8<- suspendCount
+
+    str     r2, [rGLUE, #offGlue_method]@ glue->method = newSave->method
+    ldr     r0, [r10, #offClassObject_pDvmDex] @ r0<- method->clazz->pDvmDex
+    str     rFP, [r3, #offThread_curFrame] @ self->curFrame = fp
+    add     rPC, rPC, #6                @ publish new rPC (advance 6 bytes)
+    str     r0, [rGLUE, #offGlue_methodClassDex]
+    cmp     r8, #0                      @ check the suspendCount
+    movne   r9, #0                      @ clear the chaining cell address
+    str     r9, [r3, #offThread_inJitCodeCache] @ in code cache or not
+    cmp     r9, #0                      @ chaining cell exists?
+    blxne   r9                          @ jump to the chaining cell
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1                      @ callsite is interpreted
+1:
+    stmia   rGLUE, {rPC, rFP}           @ SAVE_PC_FP_TO_GLUE()
+    ldr     r2, .LdvmMterpStdBail       @ defined in footer.S
+    mov     r1, #0                      @ changeInterp = false
+    mov     r0, rGLUE                   @ Expecting rGLUE in r0
+    blx     r2                          @ exit the interpreter
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NO_OPT_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S */
+    /*
+     * For polymorphic callsites, set up the Dalvik frame, load the Dalvik PC
+     * into rPC, then jump to dvmJitToInterpNoChain to dispatch the
+     * runtime-resolved callee.
+     */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    ldrh    r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldrh    r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    ldr     r10, [r0, #offMethod_accessFlags] @ r10<- methodToCall->accessFlags
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+    ldr     rPC, [r0, #offMethod_insns]     @ rPC<- methodToCall->insns
+
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    lr                          @ bail to the interpreter
+    tst     r10, #ACC_NATIVE
+#if !defined(WITH_SELF_VERIFICATION)
+    bne     .LinvokeNative
+#else
+    bxne    lr                          @ bail to the interpreter
+#endif
+
+    ldr     r10, .LdvmJitToInterpTraceSelectNoChain
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
+
+    @ Start executing the callee
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kInlineCacheMiss
+#endif
+    mov     pc, r10                         @ dvmJitToInterpTraceSelectNoChain
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S */
+    /*
+     * For a monomorphic callsite, set up the Dalvik frame and return to the
+     * Thumb code through the link register, transferring control to the
+     * callee method through a dedicated chaining cell.
+     */
+    @ r0 = methodToCall, r1 = returnCell, r2 = methodToCall->outsSize
+    @ rPC = dalvikCallsite, r7 = methodToCall->registersSize
+    @ methodToCall is guaranteed to be non-native
+.LinvokeChainProf:
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    add     r12, lr, #2                 @ set up the punt-to-interp address
+    sub     r10, r10, r2, lsl #2        @ r10<- bottom (newsave - outsSize)
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    r12                         @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    ldr     r9, [r0, #offMethod_clazz]      @ r9<- method->clazz
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    bxne    r12                         @ bail to the interpreter
+
+    ldr     r3, [r9, #offClassObject_pDvmDex] @ r3<- method->clazz->pDvmDex
+    ldr     r2, [rGLUE, #offGlue_self]      @ r2<- glue->self
+
+    @ Update "glue" values for the new method
+    str     r0, [rGLUE, #offGlue_method]    @ glue->method = methodToCall
+    str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
+    mov     rFP, r1                         @ fp = newFp
+    str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(TEMPLATE_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
+
+    bx      lr                              @ return to the callee-chaining cell
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_PREDICTED_CHAIN.S */
+    /*
+     * For a polymorphic callsite, check whether the cached class pointer
+     * matches the current one. If so, set up the Dalvik frame and return to
+     * the Thumb code through the link register, transferring control to the
+     * callee method through a dedicated chaining cell.
+     *
+     * The predicted chaining cell is declared in ArmLIR.h with the
+     * following layout:
+     *
+     *  typedef struct PredictedChainingCell {
+     *      u4 branch;
+     *      const ClassObject *clazz;
+     *      const Method *method;
+     *      u4 counter;
+     *  } PredictedChainingCell;
+     *
+     * Upon returning to the callsite:
+     *    - lr  : to branch to the chaining cell
+     *    - lr+2: to punt to the interpreter
+     *    - lr+4: to fully resolve the callee and possibly rechain.
+     *            r3 <- class
+     *            r9 <- counter
+     */
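+    /*
+     * Illustrative sketch of the callsite layout that the three return
+     * targets above assume (hypothetical, not the literal code the JIT
+     * emits):
+     *
+     *     blx     <this template>
+     *     b       <chaining cell>     @ lr  : predicted class matched
+     *     b       <punt to interp>    @ lr+2: bail to the interpreter
+     *     b       <full resolve>      @ lr+4: resolve callee, maybe rechain
+     */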
+    @ r0 = this, r1 = returnCell, r2 = predictedChainCell, rPC = dalvikCallsite
+    ldr     r3, [r0, #offObject_clazz]  @ r3 <- this->class
+    ldr     r8, [r2, #4]    @ r8 <- predictedChainCell->clazz
+    ldr     r0, [r2, #8]    @ r0 <- predictedChainCell->method
+    ldr     r9, [rGLUE, #offGlue_icRechainCount]   @ r9 <- shared rechainCount
+    cmp     r3, r8          @ predicted class == actual class?
+#if defined(WITH_JIT_TUNING)
+    ldr     r7, .LdvmICHitCount
+#if defined(WORKAROUND_CORTEX_A9_745320)
+    /* Don't use conditional loads if the HW defect exists */
+    bne     101f
+    ldr     r10, [r7, #0]
+101:
+#else
+    ldreq   r10, [r7, #0]
+#endif
+    add     r10, r10, #1
+    streq   r10, [r7, #0]
+#endif
+    ldreqh  r7, [r0, #offMethod_registersSize]  @ r7<- methodToCall->regsSize
+    ldreqh  r2, [r0, #offMethod_outsSize]  @ r2<- methodToCall->outsSize
+    beq     .LinvokeChainProf   @ predicted chain is valid
+    ldr     r7, [r3, #offClassObject_vtable] @ r7 <- this->class->vtable
+    cmp     r8, #0          @ initialized class or not
+    moveq   r1, #0
+    subne   r1, r9, #1      @ count--
+    strne   r1, [rGLUE, #offGlue_icRechainCount]   @ write back to InterpState
+    add     lr, lr, #4      @ return to fully-resolve landing pad
+    /*
+     * r1 <- count
+     * r2 <- &predictedChainCell
+     * r3 <- this->class
+     * r4 <- dPC
+     * r7 <- this->class->vtable
+     */
+    bx      lr
+
+#undef TEMPLATE_INLINE_PROFILING
+
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF
+dvmCompiler_TEMPLATE_INVOKE_METHOD_NATIVE_PROF:
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE_PROF.S */
+#define TEMPLATE_INLINE_PROFILING
+/* File: armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S */
+    @ r0 = methodToCall, r1 = returnCell, rPC = dalvikCallsite
+    @ r7 = methodToCall->registersSize
+    ldr     r9, [rGLUE, #offGlue_interpStackEnd]    @ r9<- interpStackEnd
+    ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
+    add     r3, r1, #1  @ Thumb addr is odd
+    SAVEAREA_FROM_FP(r1, rFP)           @ r1<- stack save area
+    sub     r1, r1, r7, lsl #2          @ r1<- newFp (old savearea - regsSize)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- stack save area
+    ldr     r8, [r8]                    @ r8<- suspendCount (int)
+    cmp     r10, r9                     @ bottom < interpStackEnd?
+    bxlo    lr                          @ return to raise stack overflow excep.
+    @ r1 = newFP, r0 = methodToCall, r3 = returnCell, rPC = dalvikCallsite
+    str     rPC, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+    str     rPC, [r1, #(offStackSaveArea_savedPc - sizeofStackSaveArea)]
+
+    @ set up newSaveArea
+    str     rFP, [r1, #(offStackSaveArea_prevFrame - sizeofStackSaveArea)]
+    str     r3, [r1, #(offStackSaveArea_returnAddr - sizeofStackSaveArea)]
+    ldr     r3, [rGLUE, #offGlue_self]      @ r3<- glue->self
+    str     r0, [r1, #(offStackSaveArea_method - sizeofStackSaveArea)]
+    cmp     r8, #0                      @ suspendCount != 0
+    ldr     r8, [r0, #offMethod_nativeFunc] @ r8<- method->nativeFunc
+#if !defined(WITH_SELF_VERIFICATION)
+    bxne    lr                          @ bail to the interpreter
+#else
+    bx      lr                          @ bail to interpreter unconditionally
+#endif
+
+    @ go ahead and transfer control to the native code
+    ldr     r9, [r3, #offThread_jniLocal_topCookie] @ r9<- thread->localRef->...
+    mov     r2, #0
+    str     r1, [r3, #offThread_curFrame]   @ self->curFrame = newFp
+    str     r2, [r3, #offThread_inJitCodeCache] @ not in the jit code cache
+    str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
+                                        @ newFp->localRefCookie=top
+    mov     r9, r3                      @ r9<- glue->self (preserve)
+    SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
+
+    mov     r2, r0                      @ r2<- methodToCall
+    mov     r0, r1                      @ r0<- newFP
+    add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(TEMPLATE_INLINE_PROFILING)
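+    @ Save methodToCall/rGLUE across the JNI call; the {r2,r6} pair pushed
+    @ here is popped into r0/r1 after the blx below as the trace-exit args.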
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastMethodTraceEnter
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
+
+    blx     r8                          @ off to the native code
+
+#if defined(TEMPLATE_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ pop saved {r2,r6} into r0/r1
+    @ r0=JNIMethod, r1=rGlue
+    mov     lr, pc
+    ldr     pc, .LdvmFastNativeMethodTraceExit
+#endif
+    @ native return; r9=self, r10=newSaveArea
+    @ equivalent to dvmPopJniLocals
+    ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
+    ldr     r0, [r10, #offStackSaveArea_localRefCookie] @ r0<- saved->top
+    ldr     r1, [r9, #offThread_exception] @ check for exception
+    str     rFP, [r9, #offThread_curFrame]  @ self->curFrame = fp
+    cmp     r1, #0                      @ null?
+    str     r0, [r9, #offThread_jniLocal_topCookie] @ new top <- old top
+    ldr     r0, [rFP, #(offStackSaveArea_currentPc - sizeofStackSaveArea)]
+
+    @ r0 = dalvikCallsitePC
+    bne     .LhandleException           @ exception pending - handle it
+
+    str     r2, [r9, #offThread_inJitCodeCache] @ set the mode properly
+    cmp     r2, #0                      @ return chaining cell still exists?
+    bxne    r2                          @ yes - go ahead
+
+    @ continue executing the next instruction through the interpreter
+    ldr     r1, .LdvmJitToInterpTraceSelectNoChain @ defined in footer.S
+    add     rPC, r0, #6                 @ reconstruct new rPC (advance 6 bytes)
+#if defined(WITH_JIT_TUNING)
+    mov     r0, #kCallsiteInterpreted
+#endif
+    mov     pc, r1
+
+#undef TEMPLATE_INLINE_PROFILING
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1526,12 +1890,15 @@
     str     r9, [r1, #(offStackSaveArea_localRefCookie - sizeofStackSaveArea)]
                                         @ newFp->localRefCookie=top
     mov     r9, r3                      @ r9<- glue->self (preserve)
+    ldr     lr, [rGLUE, #offGlue_pInterpBreak]  @ lr<- &interpBreak
     SAVEAREA_FROM_FP(r10, r1)           @ r10<- new stack save area
 
     mov     r2, r0                      @ r2<- methodToCall
+    ldr     lr, [lr]                    @ lr<- set of active profilers
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
-#if defined(WITH_INLINE_PROFILING)
+    ands    lr, #kSubModeMethodTrace
+    beq     121f                        @ skip trace-entry code if not profiling
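+    @ Method tracing is now checked at run time via interpBreak rather than
+    @ the compile-time WITH_INLINE_PROFILING guard; 121f is the untraced
+    @ native-call path and 212f rejoins the common code.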
     @ r2: methodToCall, r6: rGLUE
     stmfd   sp!, {r2,r6}
     stmfd   sp!, {r0-r3}
@@ -1540,16 +1907,18 @@
     mov     lr, pc
     ldr     pc, .LdvmFastMethodTraceEnter
     ldmfd   sp!, {r0-r3}
-#endif
 
     mov     lr, pc
     ldr     pc, [r2, #offMethod_nativeFunc]
 
-#if defined(WITH_INLINE_PROFILING)
     ldmfd   sp!, {r0-r1}
     mov     lr, pc
     ldr     pc, .LdvmFastNativeMethodTraceExit
-#endif
+    b       212f
+121:
+    mov     lr, pc
+    ldr     pc, [r2, #offMethod_nativeFunc]
+212:
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1623,14 +1992,12 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
-#if defined(WITH_INLINE_PROFILING)
 .LdvmFastMethodTraceEnter:
     .word   dvmFastMethodTraceEnter
 .LdvmFastNativeMethodTraceExit:
     .word   dvmFastNativeMethodTraceExit
 .LdvmFastJavaMethodTraceExit:
     .word   dvmFastJavaMethodTraceExit
-#endif
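+@ The trace-helper literals above are now emitted unconditionally: the
+@ _PROF templates and the runtime interpBreak check may reference them
+@ in any build.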
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple: