Collect method traces with the fast interpreter and the JIT'ed code.

Insert inline code instead of switching to the debug interpreter in the hope
that the time stamps collected in traceview are closer to the real-world
behavior with minimal profiling overhead.

Because the inline polling still introduces additional overhead (20% ~ 100%),
it is only enabled in the special VM build called "libdvm_traceview.so".
It won't work on the emulator because it is not implemented to collect the
detailed instruction traces.

Here are some performance numbers using the FibonacciSlow microbenchmark
(i.e. a recursive workload; shorter times are faster):

       time: configuration
  8,162,602: profiling off/libdvm.so/JIT off
  2,801,829: profiling off/libdvm.so/JIT on
  9,952,236: profiling off/libdvm_traceview.so/JIT off
  4,465,701: profiling off/libdvm_traceview.so/JIT on
164,786,585: profiling on/libdvm.so/JIT off
164,664,634: profiling on/libdvm.so/JIT on
 11,231,707: profiling on/libdvm_traceview.so/JIT off
  8,427,846: profiling on/libdvm_traceview.so/JIT on

Comparing the 8,427,846 vs 164,664,634 numbers against the true baseline
performance number of 2,801,829, the new libdvm_traceview.so improves the time
skew from 58x to 3x.

Change-Id: I48611a3a4ff9c4950059249e5503c26abd6b138e
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index 60f060c..8c26989 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -741,7 +741,11 @@
 
     dvmLockMutex(&gDvmJit.tableLock);
     jitActive = gDvmJit.pProfTable != NULL;
-    jitActivate = !(gDvm.debuggerActive || (gDvm.activeProfilers > 0));
+    bool disableJit = gDvm.debuggerActive;
+#if !defined(WITH_INLINE_PROFILING)
+    disableJit = disableJit || (gDvm.activeProfilers > 0);
+#endif
+    jitActivate = !disableJit;
 
     if (jitActivate && !jitActive) {
         gDvmJit.pProfTable = gDvmJit.pProfTableCopy;
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
index a137d22..aaadc00 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_CHAIN.S
@@ -41,5 +41,12 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
index 2557863..eeac2b0 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NATIVE.S
@@ -41,9 +41,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
diff --git a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
index 5be6978..044d0ee 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_INVOKE_METHOD_NO_OPT.S
@@ -46,6 +46,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
diff --git a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
index b7ab971..b2e71ee 100644
--- a/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
+++ b/vm/compiler/template/armv5te/TEMPLATE_RETURN.S
@@ -5,6 +5,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
diff --git a/vm/compiler/template/armv5te/footer.S b/vm/compiler/template/armv5te/footer.S
index 73fc3d7..a391dbe 100644
--- a/vm/compiler/template/armv5te/footer.S
+++ b/vm/compiler/template/armv5te/footer.S
@@ -22,9 +22,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -96,6 +109,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index 60664fa..655bc54 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1458,9 +1494,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1532,6 +1581,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index ccdbcca..ff552bb 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1181,9 +1217,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1255,6 +1304,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
index e520056..34931f8 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1458,9 +1494,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1532,6 +1581,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple:
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index 87a0691..b10beef 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -177,6 +177,13 @@
      * address in the code cache following the invoke instruction. Otherwise
      * return to the special dvmJitToInterpNoChain entry point.
      */
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve live registers
+    mov     r0, r6
+    @ r0=rGlue
+    LDR_PC_LR ".LdvmFastJavaMethodTraceExit"
+    ldmfd   sp!, {r0-r2,lr}             @ restore live registers
+#endif
     SAVEAREA_FROM_FP(r0, rFP)           @ r0<- saveArea (old)
     ldr     r10, [r0, #offStackSaveArea_prevFrame] @ r10<- saveArea->prevFrame
     ldr     r8, [rGLUE, #offGlue_pSelfSuspendCount] @ r8<- &suspendCount
@@ -274,6 +281,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r3}                    @ preserve r0-r3
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                    @ restore r0-r3
+#endif
 
     @ Start executing the callee
 #if defined(WITH_JIT_TUNING)
@@ -329,6 +343,13 @@
     str     r3, [rGLUE, #offGlue_methodClassDex] @ glue->methodClassDex = ...
     mov     rFP, r1                         @ fp = newFp
     str     rFP, [r2, #offThread_curFrame]  @ self->curFrame = newFp
+#if defined(WITH_INLINE_PROFILING)
+    stmfd   sp!, {r0-r2,lr}             @ preserve clobbered live registers
+    mov     r1, r6
+    @ r0=methodToCall, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r2,lr}             @ restore registers
+#endif
 
     bx      lr                              @ return to the callee-chaining cell
 
@@ -436,9 +457,24 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2=methodToCall, r6=rGLUE
+    stmfd   sp!, {r2,r6}                @ to be consumed after JNI return
+    stmfd   sp!, {r0-r3}                @ preserve r0-r3
+    mov     r0, r2
+    mov     r1, r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}                @ restore r0-r3
+#endif
 
     blx     r8                          @ off to the native code
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}                @ restore r2 and r6
+    @ r0=JNIMethod, r1=rGlue
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ native return; r9=self, r10=newSaveArea
     @ equivalent to dvmPopJniLocals
     ldr     r2, [r10, #offStackSaveArea_returnAddr] @ r2 = chaining cell ret
@@ -1458,9 +1494,22 @@
     mov     r2, r0                      @ r2<- methodToCall
     mov     r0, r1                      @ r0<- newFP
     add     r1, rGLUE, #offGlue_retval  @ r1<- &retval
+#if defined(WITH_INLINE_PROFILING)
+    @ r2: methodToCall, r6: rGLUE
+    stmfd   sp!, {r2,r6}
+    stmfd   sp!, {r0-r3}
+    mov     r0, r2
+    mov     r1, r6
+    LDR_PC_LR ".LdvmFastMethodTraceEnter"
+    ldmfd   sp!, {r0-r3}
+#endif
 
     LDR_PC_LR "[r2, #offMethod_nativeFunc]"
 
+#if defined(WITH_INLINE_PROFILING)
+    ldmfd   sp!, {r0-r1}
+    LDR_PC_LR ".LdvmFastNativeMethodTraceExit"
+#endif
     @ Refresh Jit's on/off status
     ldr     r3, [rGLUE, #offGlue_ppJitProfTable]
 
@@ -1532,6 +1581,14 @@
 .LdvmSelfVerificationMemOpDecode:
     .word   dvmSelfVerificationMemOpDecode
 #endif
+#if defined(WITH_INLINE_PROFILING)
+.LdvmFastMethodTraceEnter:
+    .word   dvmFastMethodTraceEnter
+.LdvmFastNativeMethodTraceExit:
+    .word   dvmFastNativeMethodTraceExit
+.LdvmFastJavaMethodTraceExit:
+    .word   dvmFastJavaMethodTraceExit
+#endif
 .L__aeabi_cdcmple:
     .word   __aeabi_cdcmple
 .L__aeabi_cfcmple: