[JIT] Trace profiling support

In preparation for method compilation, this CL causes all traces to
include two entry points: profiling and non-profiling.  For now, the
profiling entry will only be used if dalvik is run with -Xjitprofile,
and largely works like it did before.  The difference is that profiling
support no longer requires the "assert" build - it's always there now.

This will enable us to do a form of sampling profiling of
traces in order to identify hot methods or hot trace groups,
while keeping the overhead low by only switching profiling on periodically.

To turn the periodic profiling on and off, we simply unchain all existing
translations and set the appropriate global profile state.  The underlying
translation lookup and chaining utilties will examine the profile state to
determine which entry point to use (i.e. - profiling or non-profiling) while
the traces naturally rechain during further execution.

Change-Id: I9ee33e69e33869b9fab3a57e88f9bc524175172b
diff --git a/vm/compiler/Compiler.c b/vm/compiler/Compiler.c
index c8ff62e..adb58dd 100644
--- a/vm/compiler/Compiler.c
+++ b/vm/compiler/Compiler.c
@@ -332,6 +332,7 @@
 {
     JitEntry *pJitTable = NULL;
     unsigned char *pJitProfTable = NULL;
+    JitTraceProfCounters *pJitTraceProfCounters = NULL;
     unsigned int i;
 
     if (!dvmCompilerArchInit())
@@ -398,6 +399,15 @@
     /* Is chain field wide enough for termination pattern? */
     assert(pJitTable[0].u.info.chain == gDvmJit.jitTableSize);
 
+    /* Allocate the trace profiling structure */
+    pJitTraceProfCounters = (JitTraceProfCounters*)
+                             calloc(1, sizeof(*pJitTraceProfCounters));
+    if (!pJitTraceProfCounters) {
+        LOGE("jit trace prof counters allocation failed\n");
+        dvmUnlockMutex(&gDvmJit.tableLock);
+        goto fail;
+    }
+
     gDvmJit.pJitEntryTable = pJitTable;
     gDvmJit.jitTableMask = gDvmJit.jitTableSize - 1;
     gDvmJit.jitTableEntriesUsed = 0;
@@ -409,6 +419,7 @@
      */
     gDvmJit.pProfTable = dvmDebuggerOrProfilerActive() ? NULL : pJitProfTable;
     gDvmJit.pProfTableCopy = pJitProfTable;
+    gDvmJit.pJitTraceProfCounters = pJitTraceProfCounters;
     dvmUnlockMutex(&gDvmJit.tableLock);
 
     /* Signal running threads to refresh their cached pJitTable pointers */
@@ -620,27 +631,19 @@
                 if (gDvmJit.haltCompilerThread) {
                     LOGD("Compiler shutdown in progress - discarding request");
                 } else if (!gDvmJit.codeCacheFull) {
-                    bool compileOK = false;
                     jmp_buf jmpBuf;
                     work.bailPtr = &jmpBuf;
                     bool aborted = setjmp(jmpBuf);
                     if (!aborted) {
-                        compileOK = dvmCompilerDoWork(&work);
+                        bool codeCompiled = dvmCompilerDoWork(&work);
+                        if (codeCompiled && !work.result.discardResult &&
+                                work.result.codeAddress) {
+                            dvmJitSetCodeAddr(work.pc, work.result.codeAddress,
+                                              work.result.instructionSet,
+                                              work.result.profileCodeSize);
+                        }
                     }
-                    if (aborted || !compileOK) {
-#if 0 // for x86 JIT testing
-                        dvmJitSetCodeAddr(work.pc,
-                                          dvmCompilerGetInterpretTemplate(),
-                                          work.result.instructionSet);
-#endif
-                        dvmCompilerArenaReset();
-                    } else if (!work.result.discardResult &&
-                               work.result.codeAddress) {
-                        /* Make sure that proper code addr is installed */
-                        assert(work.result.codeAddress != NULL);
-                        dvmJitSetCodeAddr(work.pc, work.result.codeAddress,
-                                          work.result.instructionSet);
-                    }
+                    dvmCompilerArenaReset();
                 }
                 free(work.info);
 #if defined(WITH_JIT_TUNING)
@@ -697,7 +700,8 @@
     gDvmJit.pProfTable = NULL;
     gDvmJit.pProfTableCopy = NULL;
 
-    if (gDvm.verboseShutdown) {
+    if (gDvm.verboseShutdown ||
+            gDvmJit.profileMode == kTraceProfilingContinuous) {
         dvmCompilerDumpStats();
         while (gDvmJit.compilerQueueLength)
           sleep(5);
diff --git a/vm/compiler/Compiler.h b/vm/compiler/Compiler.h
index 0a43df3..cd9d21b 100644
--- a/vm/compiler/Compiler.h
+++ b/vm/compiler/Compiler.h
@@ -45,15 +45,8 @@
 #define COMPILER_TRACE_CHAINING(X)
 
 /* Macro to change the permissions applied to a chunk of the code cache */
-#if !defined(WITH_JIT_TUNING)
 #define PROTECT_CODE_CACHE_ATTRS       (PROT_READ | PROT_EXEC)
 #define UNPROTECT_CODE_CACHE_ATTRS     (PROT_READ | PROT_EXEC | PROT_WRITE)
-#else
-/* When doing JIT profiling always grant the write permission */
-#define PROTECT_CODE_CACHE_ATTRS       (PROT_READ | PROT_EXEC |                \
-                                  (gDvmJit.profile ? PROT_WRITE : 0))
-#define UNPROTECT_CODE_CACHE_ATTRS     (PROT_READ | PROT_EXEC | PROT_WRITE)
-#endif
 
 /* Acquire the lock before removing PROT_WRITE from the specified mem region */
 #define UNPROTECT_CODE_CACHE(addr, size)                                       \
@@ -90,6 +83,7 @@
 typedef struct JitTranslationInfo {
     void *codeAddress;
     JitInstructionSetType instructionSet;
+    int profileCodeSize;
     bool discardResult;         // Used for debugging divergence and IC patching
     bool methodCompilationAborted;  // Cannot compile the whole method
     Thread *requestingThread;   // For debugging purpose
@@ -100,6 +94,7 @@
     kWorkOrderMethod = 1,       // Work is to compile a whole method
     kWorkOrderTrace = 2,        // Work is to compile code fragment(s)
     kWorkOrderTraceDebug = 3,   // Work is to compile/debug code fragment(s)
+    kWorkOrderProfileMode = 4,  // Change profiling mode
 } WorkOrderKind;
 
 typedef struct CompilerWorkOrder {
diff --git a/vm/compiler/CompilerIR.h b/vm/compiler/CompilerIR.h
index caf6fa6..54d41a5 100644
--- a/vm/compiler/CompilerIR.h
+++ b/vm/compiler/CompilerIR.h
@@ -206,11 +206,11 @@
     void *baseAddr;
     bool printMe;
     bool allSingleStep;
-    bool executionCount;                // Add code to count trace executions
     bool hasLoop;                       // Contains a loop
     bool hasInvoke;                     // Contains an invoke instruction
     bool heapMemOp;                     // Mark mem ops for self verification
     bool wholeMethod;
+    int profileCodeSize;                // Size of the profile prefix in bytes
     int numChainingCells[kChainingCellGap];
     LIR *firstChainingLIR[kChainingCellGap];
     LIR *chainingCellBottom;
diff --git a/vm/compiler/Frontend.c b/vm/compiler/Frontend.c
index 95ef026..915e5f3 100644
--- a/vm/compiler/Frontend.c
+++ b/vm/compiler/Frontend.c
@@ -458,9 +458,6 @@
     /* Initialize the printMe flag */
     cUnit.printMe = gDvmJit.printMe;
 
-    /* Initialize the profile flag */
-    cUnit.executionCount = gDvmJit.profile;
-
     /* Setup the method */
     cUnit.method = desc->method;
 
@@ -634,6 +631,7 @@
     for (blockId = 0; blockId < blockList->numUsed; blockId++) {
         curBB = (BasicBlock *) dvmGrowableListGetElement(blockList, blockId);
         MIR *lastInsn = curBB->lastMIRInsn;
+        BasicBlock *backwardCell;
         /* Skip empty blocks */
         if (lastInsn == NULL) {
             continue;
@@ -708,25 +706,11 @@
             exitBB->needFallThroughBranch = true;
 
             loopBranch->taken = exitBB;
-#if defined(WITH_SELF_VERIFICATION)
-            BasicBlock *backwardCell =
+            backwardCell =
                 dvmCompilerNewBB(kChainingCellBackwardBranch, numBlocks++);
             dvmInsertGrowableList(blockList, (intptr_t) backwardCell);
             backwardCell->startOffset = entryCodeBB->startOffset;
             loopBranch->fallThrough = backwardCell;
-#elif defined(WITH_JIT_TUNING)
-            if (gDvmJit.profile) {
-                BasicBlock *backwardCell =
-                    dvmCompilerNewBB(kChainingCellBackwardBranch, numBlocks++);
-                dvmInsertGrowableList(blockList, (intptr_t) backwardCell);
-                backwardCell->startOffset = entryCodeBB->startOffset;
-                loopBranch->fallThrough = backwardCell;
-            } else {
-                loopBranch->fallThrough = entryCodeBB;
-            }
-#else
-            loopBranch->fallThrough = entryCodeBB;
-#endif
 
             /* Create the chaining cell as the fallthrough of the exit block */
             exitChainingCell = dvmCompilerNewBB(kChainingCellNormal,
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 4f3434d..437c2ed 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -619,6 +619,8 @@
     kThumb2Bfc,          /* bfc [11110011011011110] [0] imm3[14-12]
                                   rd[11-8] imm2[7-6] [0] msb[4-0] */
     kThumb2Dmb,          /* dmb [1111001110111111100011110101] option[3-0] */
+    kThumb2LdrPcReln12,  /* ldr rd,[pc,-#imm12] [1111100011011111] rt[15-12]
+                                  imm12[11-0] */
 
     kArmLast,
 } ArmOpcode;
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 4154387..b5c04f5 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -876,6 +876,11 @@
                  kFmtBitBlt, 3, 0, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_UNARY_OP,
                  "dmb","#!0B",2),
+    ENCODING_MAP(kThumb2LdrPcReln12,       0xf85f0000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 0, kFmtUnused, -1, -1,
+                 kFmtUnused, -1, -1,
+                 IS_BINARY_OP | REG_DEF0 | REG_USE_PC | IS_LOAD,
+                 "ldr", "r!0d, [rpc, -#!1d]", 2),
 };
 
 /*
@@ -1163,21 +1168,21 @@
 /*
  * Translation layout in the code cache.  Note that the codeAddress pointer
  * in JitTable will point directly to the code body (field codeAddress).  The
- * chain cell offset codeAddress - 2, and (if present) executionCount is at
- * codeAddress - 6.
+ * chain cell offset codeAddress - 2, and the address of the trace profile
+ * counter is at codeAddress - 6.
  *
  *      +----------------------------+
- *      | Execution count            |  -> [Optional] 4 bytes
+ *      | Trace Profile Counter addr |  -> 4 bytes
  *      +----------------------------+
  *   +--| Offset to chain cell counts|  -> 2 bytes
  *   |  +----------------------------+
- *   |  | Code body                  |  -> Start address for translation
- *   |  |                            |     variable in 2-byte chunks
- *   |  .                            .     (JitTable's codeAddress points here)
+ *   |  | Trace profile code         |  <- entry point when profiling
+ *   |  .  -   -   -   -   -   -   - .
+ *   |  | Code body                  |  <- entry point when not profiling
  *   |  .                            .
  *   |  |                            |
  *   |  +----------------------------+
- *   |  | Chaining Cells             |  -> 12/16 bytes each, must be 4 byte aligned
+ *   |  | Chaining Cells             |  -> 12/16 bytes, 4 byte aligned
  *   |  .                            .
  *   |  .                            .
  *   |  |                            |
@@ -1251,13 +1256,10 @@
            chainCellOffsetLIR->operands[0] == CHAIN_CELL_OFFSET_TAG);
 
     /*
-     * Replace the CHAIN_CELL_OFFSET_TAG with the real value. If trace
-     * profiling is enabled, subtract 4 (occupied by the counter word) from
-     * the absolute offset as the value stored in chainCellOffsetLIR is the
-     * delta from &chainCellOffsetLIR to &ChainCellCounts.
+     * Adjust the CHAIN_CELL_OFFSET_TAG LIR's offset to remove the
+     * space occupied by the pointer to the trace profiling counter.
      */
-    chainCellOffsetLIR->operands[0] =
-        gDvmJit.profile ? (chainCellOffset - 4) : chainCellOffset;
+    chainCellOffsetLIR->operands[0] = chainCellOffset - 4;
 
     offset += sizeof(chainCellCounts) + descSize;
 
@@ -1363,6 +1365,8 @@
     /* If applicable, mark low bit to denote thumb */
     if (info->instructionSet != DALVIK_JIT_ARM)
         info->codeAddress = (char*)info->codeAddress + 1;
+    /* transfer the size of the profiling code */
+    info->profileCodeSize = cUnit->profileCodeSize;
 }
 
 /*
@@ -1836,14 +1840,37 @@
         (6 + (p->u.info.instructionSet == DALVIK_JIT_ARM ? 0 : 1));
 }
 
+/* Handy function to retrieve the profile count */
+static inline JitTraceCounter_t getProfileCount(const JitEntry *entry)
+{
+    if (entry->dPC == 0 || entry->codeAddress == 0 ||
+        entry->codeAddress == dvmCompilerGetInterpretTemplate())
+        return 0;
+
+    JitTraceCounter_t **p = (JitTraceCounter_t **) getTraceBase(entry);
+
+    return **p;
+}
+
+/* Handy function to reset the profile count */
+static inline void resetProfileCount(const JitEntry *entry)
+{
+    if (entry->dPC == 0 || entry->codeAddress == 0 ||
+        entry->codeAddress == dvmCompilerGetInterpretTemplate())
+        return;
+
+    JitTraceCounter_t **p = (JitTraceCounter_t **) getTraceBase(entry);
+
+    **p = 0;
+}
+
 /* Dumps profile info for a single trace */
 static int dumpTraceProfile(JitEntry *p, bool silent, bool reset,
                             unsigned long sum)
 {
     ChainCellCounts* pCellCounts;
     char* traceBase;
-    u4* pExecutionCount;
-    u4 executionCount;
+    JitTraceCounter_t count;
     u2* pCellOffset;
     JitTraceDescription *desc;
     const Method* method;
@@ -1861,14 +1888,12 @@
             LOGD("TRACEPROFILE 0x%08x 0 INTERPRET_ONLY  0 0", (int)traceBase);
         return 0;
     }
-
-    pExecutionCount = (u4*) (traceBase);
-    executionCount = *pExecutionCount;
+    count = getProfileCount(p);
     if (reset) {
-        *pExecutionCount =0;
+        resetProfileCount(p);
     }
     if (silent) {
-        return executionCount;
+        return count;
     }
     pCellOffset = (u2*) (traceBase + 4);
     pCellCounts = (ChainCellCounts*) ((char *)pCellOffset + *pCellOffset);
@@ -1893,8 +1918,8 @@
 
     LOGD("TRACEPROFILE 0x%08x % 10d %5.2f%% [%#x(+%d), %d] %s%s;%s",
          (int)traceBase,
-         executionCount,
-         ((float ) executionCount) / sum * 100.0,
+         count,
+         ((float ) count) / sum * 100.0,
          desc->trace[0].frag.startOffset,
          desc->trace[0].frag.numInsts,
          addrToLine.lineNum,
@@ -1919,7 +1944,7 @@
              methodDesc);
     }
 
-    return executionCount;
+    return count;
 }
 
 /* Create a copy of the trace descriptor of an existing compilation */
@@ -1948,27 +1973,14 @@
     return newCopy;
 }
 
-/* Handy function to retrieve the profile count */
-static inline int getProfileCount(const JitEntry *entry)
-{
-    if (entry->dPC == 0 || entry->codeAddress == 0 ||
-        entry->codeAddress == dvmCompilerGetInterpretTemplate())
-        return 0;
-
-    u4 *pExecutionCount = (u4 *) getTraceBase(entry);
-
-    return *pExecutionCount;
-}
-
-
 /* qsort callback function */
 static int sortTraceProfileCount(const void *entry1, const void *entry2)
 {
     const JitEntry *jitEntry1 = (const JitEntry *)entry1;
     const JitEntry *jitEntry2 = (const JitEntry *)entry2;
 
-    int count1 = getProfileCount(jitEntry1);
-    int count2 = getProfileCount(jitEntry2);
+    JitTraceCounter_t count1 = getProfileCount(jitEntry1);
+    JitTraceCounter_t count2 = getProfileCount(jitEntry2);
     return (count1 == count2) ? 0 : ((count1 > count2) ? -1 : 1);
 }
 
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 6473edb..181a128 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -3536,7 +3536,6 @@
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
 }
 
-#if defined(WITH_SELF_VERIFICATION) || defined(WITH_JIT_TUNING)
 /* Chaining cell for branches that branch back into the same basic block */
 static void handleBackwardBranchChainingCell(CompilationUnit *cUnit,
                                              unsigned int offset)
@@ -3558,7 +3557,6 @@
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
 }
 
-#endif
 /* Chaining cell for monomorphic method invocations. */
 static void handleInvokeSingletonChainingCell(CompilationUnit *cUnit,
                                               const Method *callee)
@@ -3944,39 +3942,8 @@
     GrowableListIterator iterator;
     dvmGrowableListIteratorInit(&cUnit->blockList, &iterator);
 
-    if (cUnit->executionCount) {
-        /*
-         * Reserve 6 bytes at the beginning of the trace
-         *        +----------------------------+
-         *        | execution count (4 bytes)  |
-         *        +----------------------------+
-         *        | chain cell offset (2 bytes)|
-         *        +----------------------------+
-         * ...and then code to increment the execution
-         * count:
-         *       mov   r0, pc       @ move adr of "mov r0,pc" + 4 to r0
-         *       sub   r0, #10      @ back up to addr of executionCount
-         *       ldr   r1, [r0]
-         *       add   r1, #1
-         *       str   r1, [r0]
-         */
-        newLIR1(cUnit, kArm16BitData, 0);
-        newLIR1(cUnit, kArm16BitData, 0);
-        cUnit->chainCellOffsetLIR =
-            (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
-        cUnit->headerSize = 6;
-        /* Thumb instruction used directly here to ensure correct size */
-        newLIR2(cUnit, kThumbMovRR_H2L, r0, rpc);
-        newLIR2(cUnit, kThumbSubRI8, r0, 10);
-        newLIR3(cUnit, kThumbLdrRRI5, r1, r0, 0);
-        newLIR2(cUnit, kThumbAddRI8, r1, 1);
-        newLIR3(cUnit, kThumbStrRRI5, r1, r0, 0);
-    } else {
-         /* Just reserve 2 bytes for the chain cell offset */
-        cUnit->chainCellOffsetLIR =
-            (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
-        cUnit->headerSize = 2;
-    }
+    /* Traces start with a profiling entry point.  Generate it here */
+    cUnit->profileCodeSize = genTraceProfileEntry(cUnit);
 
     /* Handle the content in each basic block */
     for (i = 0; ; i++) {
@@ -4062,7 +4029,6 @@
                         opReg(cUnit, kOpBlx, r1);
                     }
                     break;
-#if defined(WITH_SELF_VERIFICATION) || defined(WITH_JIT_TUNING)
                 case kChainingCellBackwardBranch:
                     labelList[i].opcode =
                         kArmPseudoChainingCellBackwardBranch;
@@ -4071,7 +4037,6 @@
                         &chainingListByType[kChainingCellBackwardBranch],
                         i);
                     break;
-#endif
                 default:
                     break;
             }
@@ -4303,12 +4268,10 @@
                 case kChainingCellHot:
                     handleHotChainingCell(cUnit, chainingBlock->startOffset);
                     break;
-#if defined(WITH_SELF_VERIFICATION) || defined(WITH_JIT_TUNING)
                 case kChainingCellBackwardBranch:
                     handleBackwardBranchChainingCell(cUnit,
                         chainingBlock->startOffset);
                     break;
-#endif
                 default:
                     LOGE("Bad blocktype %d", chainingBlock->blockType);
                     dvmCompilerAbort(cUnit);
@@ -4342,11 +4305,15 @@
 #endif
 }
 
-/* Accept the work and start compiling */
+/*
+ * Accept the work and start compiling.  Returns true if compilation
+ * is attempted.
+ */
 bool dvmCompilerDoWork(CompilerWorkOrder *work)
 {
     JitTraceDescription *desc;
-    bool res;
+    bool isCompile;
+    bool success = true;
 
     if (gDvmJit.codeCacheFull) {
         return false;
@@ -4354,27 +4321,35 @@
 
     switch (work->kind) {
         case kWorkOrderTrace:
+            isCompile = true;
             /* Start compilation with maximally allowed trace length */
             desc = (JitTraceDescription *)work->info;
-            res = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
-                                  work->bailPtr, 0 /* no hints */);
+            success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
+                                        work->bailPtr, 0 /* no hints */);
             break;
         case kWorkOrderTraceDebug: {
             bool oldPrintMe = gDvmJit.printMe;
             gDvmJit.printMe = true;
+            isCompile = true;
             /* Start compilation with maximally allowed trace length */
             desc = (JitTraceDescription *)work->info;
-            res = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
-                                  work->bailPtr, 0 /* no hints */);
+            success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
+                                        work->bailPtr, 0 /* no hints */);
             gDvmJit.printMe = oldPrintMe;
             break;
         }
+        case kWorkOrderProfileMode:
+            dvmJitChangeProfileMode((TraceProfilingModes)work->info);
+            isCompile = false;
+            break;
         default:
-            res = false;
+            isCompile = false;
             LOGE("Jit: unknown work order type");
             assert(0);  // Bail if debug build, discard otherwise
     }
-    return res;
+    if (!success)
+        work->result.codeAddress = NULL;
+    return isCompile;
 }
 
 /* Architectural-specific debugging helpers go here */
diff --git a/vm/compiler/codegen/arm/Thumb/Gen.c b/vm/compiler/codegen/arm/Thumb/Gen.c
index 07f3f09..b806965 100644
--- a/vm/compiler/codegen/arm/Thumb/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb/Gen.c
@@ -23,6 +23,62 @@
  */
 
 /*
+ * Reserve 6 bytes at the beginning of the trace
+ *        +----------------------------+
+ *        | prof count addr (4 bytes)  |
+ *        +----------------------------+
+ *        | chain cell offset (2 bytes)|
+ *        +----------------------------+
+ *
+ * ...and then code to increment the execution
+ *
+ * For continuous profiling (12 bytes):
+ *
+ *       mov   r0, pc       @ move adr of "mov r0,pc" + 4 to r0
+ *       sub   r0, #10      @ back up to addr prof count pointer
+ *       ldr   r0, [r0]     @ get address of counter
+ *       ldr   r1, [r0]
+ *       add   r1, #1
+ *       str   r1, [r0]
+ *
+ * For periodic profiling (4 bytes):
+ *       call  TEMPLATE_PERIODIC_PROFILING
+ *
+ * and return the size (in bytes) of the generated code.
+ */
+
+static int genTraceProfileEntry(CompilationUnit *cUnit)
+{
+    intptr_t addr = (intptr_t)dvmJitNextTraceCounter();
+    assert(__BYTE_ORDER == __LITTLE_ENDIAN);
+    newLIR1(cUnit, kArm16BitData, addr & 0xffff);
+    newLIR1(cUnit, kArm16BitData, (addr >> 16) & 0xffff);
+    cUnit->chainCellOffsetLIR =
+        (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
+    cUnit->headerSize = 6;
+    if ((gDvmJit.profileMode == kTraceProfilingContinuous) ||
+        (gDvmJit.profileMode == kTraceProfilingDisabled)) {
+        /* Thumb instruction used directly here to ensure correct size */
+        newLIR2(cUnit, kThumbMovRR_H2L, r0, rpc);
+        newLIR2(cUnit, kThumbSubRI8, r0, 10);
+        newLIR3(cUnit, kThumbLdrRRI5, r0, r0, 0);
+        newLIR3(cUnit, kThumbLdrRRI5, r1, r0, 0);
+        newLIR2(cUnit, kThumbAddRI8, r1, 1);
+        newLIR3(cUnit, kThumbStrRRI5, r1, r0, 0);
+        return 12;
+    } else {
+        int opcode = TEMPLATE_PERIODIC_PROFILING;
+        newLIR2(cUnit, kThumbBlx1,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        newLIR2(cUnit, kThumbBlx2,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        return 4;
+    }
+}
+
+/*
  * Perform a "reg cmp imm" operation and jump to the PCR region if condition
  * satisfies.
  */
diff --git a/vm/compiler/codegen/arm/Thumb2/Gen.c b/vm/compiler/codegen/arm/Thumb2/Gen.c
index 0891524..f5e1096 100644
--- a/vm/compiler/codegen/arm/Thumb2/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb2/Gen.c
@@ -15,13 +15,64 @@
  */
 
 /*
- * This file contains codegen for the Thumb ISA and is intended to be
+ * This file contains codegen for the Thumb2 ISA and is intended to be
  * includes by:
  *
  *        Codegen-$(TARGET_ARCH_VARIANT).c
  *
  */
 
+/*
+ * Reserve 6 bytes at the beginning of the trace
+ *        +----------------------------+
+ *        | prof count addr (4 bytes)  |
+ *        +----------------------------+
+ *        | chain cell offset (2 bytes)|
+ *        +----------------------------+
+ *
+ * ...and then code to increment the execution
+ *
+ * For continuous profiling (10 bytes)
+ *       ldr   r0, [pc-8]   @ get prof count addr    [4 bytes]
+ *       ldr   r1, [r0]     @ load counter           [2 bytes]
+ *       add   r1, #1       @ increment              [2 bytes]
+ *       str   r1, [r0]     @ store                  [2 bytes]
+ *
+ * For periodic profiling (4 bytes)
+ *       call  TEMPLATE_PERIODIC_PROFILING
+ *
+ * and return the size (in bytes) of the generated code.
+ */
+
+static int genTraceProfileEntry(CompilationUnit *cUnit)
+{
+    intptr_t addr = (intptr_t)dvmJitNextTraceCounter();
+    assert(__BYTE_ORDER == __LITTLE_ENDIAN);
+    newLIR1(cUnit, kArm16BitData, addr & 0xffff);
+    newLIR1(cUnit, kArm16BitData, (addr >> 16) & 0xffff);
+    cUnit->chainCellOffsetLIR =
+        (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
+    cUnit->headerSize = 6;
+    if ((gDvmJit.profileMode == kTraceProfilingContinuous) ||
+        (gDvmJit.profileMode == kTraceProfilingDisabled)) {
+        /* Thumb[2] instruction used directly here to ensure correct size */
+        newLIR2(cUnit, kThumb2LdrPcReln12, r0, 8);
+        newLIR3(cUnit, kThumbLdrRRI5, r1, r0, 0);
+        newLIR2(cUnit, kThumbAddRI8, r1, 1);
+        newLIR3(cUnit, kThumbStrRRI5, r1, r0, 0);
+        return 10;
+    } else {
+        int opcode = TEMPLATE_PERIODIC_PROFILING;
+        newLIR2(cUnit, kThumbBlx1,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        newLIR2(cUnit, kThumbBlx2,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        return 4;
+    }
+}
+
 static void genNegFloat(CompilationUnit *cUnit, RegLocation rlDest,
                         RegLocation rlSrc)
 {
diff --git a/vm/compiler/template/armv5te-vfp/TemplateOpList.h b/vm/compiler/template/armv5te-vfp/TemplateOpList.h
index d991bed..97addfa 100644
--- a/vm/compiler/template/armv5te-vfp/TemplateOpList.h
+++ b/vm/compiler/template/armv5te-vfp/TemplateOpList.h
@@ -57,3 +57,4 @@
 JIT_TEMPLATE(INTERPRET)
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
+JIT_TEMPLATE(PERIODIC_PROFILING)
diff --git a/vm/compiler/template/armv5te/TEMPLATE_PERIODIC_PROFILING.S b/vm/compiler/template/armv5te/TEMPLATE_PERIODIC_PROFILING.S
new file mode 100644
index 0000000..7f71096
--- /dev/null
+++ b/vm/compiler/template/armv5te/TEMPLATE_PERIODIC_PROFILING.S
@@ -0,0 +1,26 @@
+    /*
+     * Increment profile counter for this trace, and decrement
+     * sample counter.  If sample counter goes below zero, turn
+     * off profiling.
+     *
+     * On entry
+     * (lr-11) is address of pointer to counter.  Note: the counter
+     *    actually exists 10 bytes before the return target, but because
+     *    we are arriving from thumb mode, lr will have its low bit set.
+     */
+     ldr    r0, [lr,#-11]
+     ldr    r1, [rGLUE, #offGlue_pProfileCountdown]
+     ldr    r2, [r0]                    @ get counter
+     ldr    r3, [r1]                    @ get countdown timer
+     add    r2, #1
+     subs   r2, #1
+     blt    .L${opcode}_disable_profiling
+     str    r2, [r0]
+     str    r3, [r1]
+     bx     lr
+
+.L${opcode}_disable_profiling:
+     mov    r4, lr                     @ preserve lr
+     ldr    r0, .LdvmJitTraceProfilingOff
+     blx    r0
+     bx     r4
diff --git a/vm/compiler/template/armv5te/TemplateOpList.h b/vm/compiler/template/armv5te/TemplateOpList.h
index e81383c..663e0df 100644
--- a/vm/compiler/template/armv5te/TemplateOpList.h
+++ b/vm/compiler/template/armv5te/TemplateOpList.h
@@ -42,3 +42,4 @@
 JIT_TEMPLATE(INTERPRET)
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
+JIT_TEMPLATE(PERIODIC_PROFILING)
diff --git a/vm/compiler/template/armv5te/footer.S b/vm/compiler/template/armv5te/footer.S
index ba0335b..7b35e8a 100644
--- a/vm/compiler/template/armv5te/footer.S
+++ b/vm/compiler/template/armv5te/footer.S
@@ -104,6 +104,8 @@
     .word   dvmMterpCommonExceptionThrown
 .LdvmLockObject:
     .word   dvmLockObject
+.LdvmJitTraceProfilingOff:
+    .word   dvmJitTraceProfilingOff
 #if defined(WITH_JIT_TUNING)
 .LdvmICHitCount:
     .word   gDvmICHitCount
diff --git a/vm/compiler/template/armv7-a-neon/TemplateOpList.h b/vm/compiler/template/armv7-a-neon/TemplateOpList.h
index d991bed..97addfa 100644
--- a/vm/compiler/template/armv7-a-neon/TemplateOpList.h
+++ b/vm/compiler/template/armv7-a-neon/TemplateOpList.h
@@ -57,3 +57,4 @@
 JIT_TEMPLATE(INTERPRET)
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
+JIT_TEMPLATE(PERIODIC_PROFILING)
diff --git a/vm/compiler/template/armv7-a/TemplateOpList.h b/vm/compiler/template/armv7-a/TemplateOpList.h
index d991bed..97addfa 100644
--- a/vm/compiler/template/armv7-a/TemplateOpList.h
+++ b/vm/compiler/template/armv7-a/TemplateOpList.h
@@ -57,3 +57,4 @@
 JIT_TEMPLATE(INTERPRET)
 JIT_TEMPLATE(MONITOR_ENTER)
 JIT_TEMPLATE(MONITOR_ENTER_DEBUG)
+JIT_TEMPLATE(PERIODIC_PROFILING)
diff --git a/vm/compiler/template/config-armv5te-vfp b/vm/compiler/template/config-armv5te-vfp
index 1b02261..30b9200 100644
--- a/vm/compiler/template/config-armv5te-vfp
+++ b/vm/compiler/template/config-armv5te-vfp
@@ -48,6 +48,7 @@
     op TEMPLATE_INTERPRET armv5te
     op TEMPLATE_MONITOR_ENTER armv5te
     op TEMPLATE_MONITOR_ENTER_DEBUG armv5te
+    op TEMPLATE_PERIODIC_PROFILING armv5te
 
 op-end
 
diff --git a/vm/compiler/template/config-armv7-a b/vm/compiler/template/config-armv7-a
index be7af31..1408ca1 100644
--- a/vm/compiler/template/config-armv7-a
+++ b/vm/compiler/template/config-armv7-a
@@ -48,6 +48,7 @@
     op TEMPLATE_INTERPRET armv5te
     op TEMPLATE_MONITOR_ENTER armv5te
     op TEMPLATE_MONITOR_ENTER_DEBUG armv5te
+    op TEMPLATE_PERIODIC_PROFILING armv5te
 op-end
 
 # "helper" code for C; include if you use any of the C stubs (this generates
diff --git a/vm/compiler/template/config-armv7-a-neon b/vm/compiler/template/config-armv7-a-neon
index be7af31..1408ca1 100644
--- a/vm/compiler/template/config-armv7-a-neon
+++ b/vm/compiler/template/config-armv7-a-neon
@@ -48,6 +48,7 @@
     op TEMPLATE_INTERPRET armv5te
     op TEMPLATE_MONITOR_ENTER armv5te
     op TEMPLATE_MONITOR_ENTER_DEBUG armv5te
+    op TEMPLATE_PERIODIC_PROFILING armv5te
 op-end
 
 # "helper" code for C; include if you use any of the C stubs (this generates
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
index 8efbcaa..a107b24 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te-vfp.S
@@ -1473,6 +1473,38 @@
 #endif
     ldr     pc, .LdvmJitToInterpNoChain
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_PERIODIC_PROFILING
+dvmCompiler_TEMPLATE_PERIODIC_PROFILING:
+/* File: armv5te/TEMPLATE_PERIODIC_PROFILING.S */
+    /*
+     * Increment profile counter for this trace, and decrement
+     * sample counter.  If sample counter goes below zero, turn
+     * off profiling.
+     *
+     * On entry
+     * (lr-11) is address of pointer to counter.  Note: the counter
+     *    actually exists 10 bytes before the return target, but because
+     *    we are arriving from thumb mode, lr will have its low bit set.
+     */
+     ldr    r0, [lr,#-11]
+     ldr    r1, [rGLUE, #offGlue_pProfileCountdown]
+     ldr    r2, [r0]                    @ get counter
+     ldr    r3, [r1]                    @ get countdown timer
+     add    r2, #1
+     subs   r2, #1
+     blt    .LTEMPLATE_PERIODIC_PROFILING_disable_profiling
+     str    r2, [r0]
+     str    r3, [r1]
+     bx     lr
+
+.LTEMPLATE_PERIODIC_PROFILING_disable_profiling:
+     mov    r4, lr                     @ preserve lr
+     ldr    r0, .LdvmJitTraceProfilingOff
+     blx    r0
+     bx     r4
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1581,6 +1613,8 @@
     .word   dvmMterpCommonExceptionThrown
 .LdvmLockObject:
     .word   dvmLockObject
+.LdvmJitTraceProfilingOff:
+    .word   dvmJitTraceProfilingOff
 #if defined(WITH_JIT_TUNING)
 .LdvmICHitCount:
     .word   gDvmICHitCount
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
index 0df3ae6..a6a0e9f 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv5te.S
@@ -1204,6 +1204,38 @@
 #endif
     ldr     pc, .LdvmJitToInterpNoChain
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_PERIODIC_PROFILING
+dvmCompiler_TEMPLATE_PERIODIC_PROFILING:
+/* File: armv5te/TEMPLATE_PERIODIC_PROFILING.S */
+    /*
+     * Increment profile counter for this trace, and decrement
+     * sample counter.  If sample counter goes below zero, turn
+     * off profiling.
+     *
+     * On entry
+     * (lr-11) is address of pointer to counter.  Note: the counter
+     *    actually exists 10 bytes before the return target, but because
+     *    we are arriving from thumb mode, lr will have its low bit set.
+     */
+     ldr    r0, [lr,#-11]
+     ldr    r1, [rGLUE, #offGlue_pProfileCountdown]
+     ldr    r2, [r0]                    @ get counter
+     ldr    r3, [r1]                    @ get countdown timer
+     add    r2, #1
+     subs   r2, #1
+     blt    .LTEMPLATE_PERIODIC_PROFILING_disable_profiling
+     str    r2, [r0]
+     str    r3, [r1]
+     bx     lr
+
+.LTEMPLATE_PERIODIC_PROFILING_disable_profiling:
+     mov    r4, lr                     @ preserve lr
+     ldr    r0, .LdvmJitTraceProfilingOff
+     blx    r0
+     bx     r4
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1312,6 +1344,8 @@
     .word   dvmMterpCommonExceptionThrown
 .LdvmLockObject:
     .word   dvmLockObject
+.LdvmJitTraceProfilingOff:
+    .word   dvmJitTraceProfilingOff
 #if defined(WITH_JIT_TUNING)
 .LdvmICHitCount:
     .word   gDvmICHitCount
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
index ee3f8cb..e4ed30b 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a-neon.S
@@ -1473,6 +1473,38 @@
 #endif
     ldr     pc, .LdvmJitToInterpNoChain
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_PERIODIC_PROFILING
+dvmCompiler_TEMPLATE_PERIODIC_PROFILING:
+/* File: armv5te/TEMPLATE_PERIODIC_PROFILING.S */
+    /*
+     * Increment profile counter for this trace, and decrement
+     * sample counter.  If sample counter goes below zero, turn
+     * off profiling.
+     *
+     * On entry
+     * (lr-11) is address of pointer to counter.  Note: the counter
+     *    actually exists 10 bytes before the return target, but because
+     *    we are arriving from thumb mode, lr will have its low bit set.
+     */
+     ldr    r0, [lr,#-11]
+     ldr    r1, [rGLUE, #offGlue_pProfileCountdown]
+     ldr    r2, [r0]                    @ get counter
+     ldr    r3, [r1]                    @ get countdown timer
+     add    r2, #1
+     subs   r2, #1
+     blt    .LTEMPLATE_PERIODIC_PROFILING_disable_profiling
+     str    r2, [r0]
+     str    r3, [r1]
+     bx     lr
+
+.LTEMPLATE_PERIODIC_PROFILING_disable_profiling:
+     mov    r4, lr                     @ preserve lr
+     ldr    r0, .LdvmJitTraceProfilingOff
+     blx    r0
+     bx     r4
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1581,6 +1613,8 @@
     .word   dvmMterpCommonExceptionThrown
 .LdvmLockObject:
     .word   dvmLockObject
+.LdvmJitTraceProfilingOff:
+    .word   dvmJitTraceProfilingOff
 #if defined(WITH_JIT_TUNING)
 .LdvmICHitCount:
     .word   gDvmICHitCount
diff --git a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
index 3875f5a..fc26b3a 100644
--- a/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
+++ b/vm/compiler/template/out/CompilerTemplateAsm-armv7-a.S
@@ -1473,6 +1473,38 @@
 #endif
     ldr     pc, .LdvmJitToInterpNoChain
 
+/* ------------------------------ */
+    .balign 4
+    .global dvmCompiler_TEMPLATE_PERIODIC_PROFILING
+dvmCompiler_TEMPLATE_PERIODIC_PROFILING:
+/* File: armv5te/TEMPLATE_PERIODIC_PROFILING.S */
+    /*
+     * Increment profile counter for this trace, and decrement
+     * sample counter.  If sample counter goes below zero, turn
+     * off profiling.
+     *
+     * On entry
+     * (lr-11) is address of pointer to counter.  Note: the counter
+     *    actually exists 10 bytes before the return target, but because
+     *    we are arriving from thumb mode, lr will have its low bit set.
+     */
+     ldr    r0, [lr,#-11]
+     ldr    r1, [rGLUE, #offGlue_pProfileCountdown]
+     ldr    r2, [r0]                    @ get counter
+     ldr    r3, [r1]                    @ get countdown timer
+     add    r2, #1
+     subs   r2, #1
+     blt    .LTEMPLATE_PERIODIC_PROFILING_disable_profiling
+     str    r2, [r0]
+     str    r3, [r1]
+     bx     lr
+
+.LTEMPLATE_PERIODIC_PROFILING_disable_profiling:
+     mov    r4, lr                     @ preserve lr
+     ldr    r0, .LdvmJitTraceProfilingOff
+     blx    r0
+     bx     r4
+
     .size   dvmCompilerTemplateStart, .-dvmCompilerTemplateStart
 /* File: armv5te/footer.S */
 /*
@@ -1581,6 +1613,8 @@
     .word   dvmMterpCommonExceptionThrown
 .LdvmLockObject:
     .word   dvmLockObject
+.LdvmJitTraceProfilingOff:
+    .word   dvmJitTraceProfilingOff
 #if defined(WITH_JIT_TUNING)
 .LdvmICHitCount:
     .word   gDvmICHitCount