[JIT] Trace profiling support

In preparation for method compilation, this CL causes all traces to
include two entry points: profiling and non-profiling.  For now, the
profiling entry will only be used if dalvik is run with -Xjitprofile,
and largely works like it did before.  The difference is that profiling
support no longer requires the "assert" build - it's always there now.

This will enable us to do a form of sampling profiling of
traces in order to identify hot methods or hot trace groups,
while keeping the overhead low by only switching profiling on periodically.

To turn the periodic profiling on and off, we simply unchain all existing
translations and set the appropriate global profile state.  The underlying
translation lookup and chaining utilities will examine the profile state to
determine which entry point to use (i.e. - profiling or non-profiling) while
the traces naturally rechain during further execution.

Change-Id: I9ee33e69e33869b9fab3a57e88f9bc524175172b
diff --git a/vm/compiler/codegen/arm/ArmLIR.h b/vm/compiler/codegen/arm/ArmLIR.h
index 4f3434d..437c2ed 100644
--- a/vm/compiler/codegen/arm/ArmLIR.h
+++ b/vm/compiler/codegen/arm/ArmLIR.h
@@ -619,6 +619,8 @@
     kThumb2Bfc,          /* bfc [11110011011011110] [0] imm3[14-12]
                                   rd[11-8] imm2[7-6] [0] msb[4-0] */
     kThumb2Dmb,          /* dmb [1111001110111111100011110101] option[3-0] */
+    kThumb2LdrPcReln12,  /* ldr rd,[pc,-#imm12] [1111100001011111] rt[15-12]
+                                  imm12[11-0] */
 
     kArmLast,
 } ArmOpcode;
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index 4154387..b5c04f5 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -876,6 +876,11 @@
                  kFmtBitBlt, 3, 0, kFmtUnused, -1, -1, kFmtUnused, -1, -1,
                  kFmtUnused, -1, -1, IS_UNARY_OP,
                  "dmb","#!0B",2),
+    ENCODING_MAP(kThumb2LdrPcReln12,       0xf85f0000,
+                 kFmtBitBlt, 15, 12, kFmtBitBlt, 11, 0, kFmtUnused, -1, -1,
+                 kFmtUnused, -1, -1,
+                 IS_BINARY_OP | REG_DEF0 | REG_USE_PC | IS_LOAD,
+                 "ldr", "r!0d, [rpc, -#!1d]", 2),
 };
 
 /*
@@ -1163,21 +1168,21 @@
 /*
  * Translation layout in the code cache.  Note that the codeAddress pointer
  * in JitTable will point directly to the code body (field codeAddress).  The
- * chain cell offset codeAddress - 2, and (if present) executionCount is at
- * codeAddress - 6.
+ * chain cell offset is at codeAddress - 2, and the address of the trace
+ * profile counter is at codeAddress - 6.
  *
  *      +----------------------------+
- *      | Execution count            |  -> [Optional] 4 bytes
+ *      | Trace Profile Counter addr |  -> 4 bytes
  *      +----------------------------+
  *   +--| Offset to chain cell counts|  -> 2 bytes
  *   |  +----------------------------+
- *   |  | Code body                  |  -> Start address for translation
- *   |  |                            |     variable in 2-byte chunks
- *   |  .                            .     (JitTable's codeAddress points here)
+ *   |  | Trace profile code         |  <- entry point when profiling
+ *   |  .  -   -   -   -   -   -   - .
+ *   |  | Code body                  |  <- entry point when not profiling
  *   |  .                            .
  *   |  |                            |
  *   |  +----------------------------+
- *   |  | Chaining Cells             |  -> 12/16 bytes each, must be 4 byte aligned
+ *   |  | Chaining Cells             |  -> 12/16 bytes, 4 byte aligned
  *   |  .                            .
  *   |  .                            .
  *   |  |                            |
@@ -1251,13 +1256,10 @@
            chainCellOffsetLIR->operands[0] == CHAIN_CELL_OFFSET_TAG);
 
     /*
-     * Replace the CHAIN_CELL_OFFSET_TAG with the real value. If trace
-     * profiling is enabled, subtract 4 (occupied by the counter word) from
-     * the absolute offset as the value stored in chainCellOffsetLIR is the
-     * delta from &chainCellOffsetLIR to &ChainCellCounts.
+     * Adjust the CHAIN_CELL_OFFSET_TAG LIR's offset to remove the
+     * space occupied by the pointer to the trace profiling counter.
      */
-    chainCellOffsetLIR->operands[0] =
-        gDvmJit.profile ? (chainCellOffset - 4) : chainCellOffset;
+    chainCellOffsetLIR->operands[0] = chainCellOffset - 4;
 
     offset += sizeof(chainCellCounts) + descSize;
 
@@ -1363,6 +1365,8 @@
     /* If applicable, mark low bit to denote thumb */
     if (info->instructionSet != DALVIK_JIT_ARM)
         info->codeAddress = (char*)info->codeAddress + 1;
+    /* transfer the size of the profiling code */
+    info->profileCodeSize = cUnit->profileCodeSize;
 }
 
 /*
@@ -1836,14 +1840,37 @@
         (6 + (p->u.info.instructionSet == DALVIK_JIT_ARM ? 0 : 1));
 }
 
+/* Handy function to retrieve the profile count */
+static inline JitTraceCounter_t getProfileCount(const JitEntry *entry)
+{
+    if (entry->dPC == 0 || entry->codeAddress == 0 ||
+        entry->codeAddress == dvmCompilerGetInterpretTemplate())
+        return 0;
+
+    JitTraceCounter_t **p = (JitTraceCounter_t **) getTraceBase(entry);
+
+    return **p;
+}
+
+/* Handy function to reset the profile count */
+static inline void resetProfileCount(const JitEntry *entry)
+{
+    if (entry->dPC == 0 || entry->codeAddress == 0 ||
+        entry->codeAddress == dvmCompilerGetInterpretTemplate())
+        return;
+
+    JitTraceCounter_t **p = (JitTraceCounter_t **) getTraceBase(entry);
+
+    **p = 0;
+}
+
 /* Dumps profile info for a single trace */
 static int dumpTraceProfile(JitEntry *p, bool silent, bool reset,
                             unsigned long sum)
 {
     ChainCellCounts* pCellCounts;
     char* traceBase;
-    u4* pExecutionCount;
-    u4 executionCount;
+    JitTraceCounter_t count;
     u2* pCellOffset;
     JitTraceDescription *desc;
     const Method* method;
@@ -1861,14 +1888,12 @@
             LOGD("TRACEPROFILE 0x%08x 0 INTERPRET_ONLY  0 0", (int)traceBase);
         return 0;
     }
-
-    pExecutionCount = (u4*) (traceBase);
-    executionCount = *pExecutionCount;
+    count = getProfileCount(p);
     if (reset) {
-        *pExecutionCount =0;
+        resetProfileCount(p);
     }
     if (silent) {
-        return executionCount;
+        return count;
     }
     pCellOffset = (u2*) (traceBase + 4);
     pCellCounts = (ChainCellCounts*) ((char *)pCellOffset + *pCellOffset);
@@ -1893,8 +1918,8 @@
 
     LOGD("TRACEPROFILE 0x%08x % 10d %5.2f%% [%#x(+%d), %d] %s%s;%s",
          (int)traceBase,
-         executionCount,
-         ((float ) executionCount) / sum * 100.0,
+         count,
+         ((float ) count) / sum * 100.0,
          desc->trace[0].frag.startOffset,
          desc->trace[0].frag.numInsts,
          addrToLine.lineNum,
@@ -1919,7 +1944,7 @@
              methodDesc);
     }
 
-    return executionCount;
+    return count;
 }
 
 /* Create a copy of the trace descriptor of an existing compilation */
@@ -1948,27 +1973,14 @@
     return newCopy;
 }
 
-/* Handy function to retrieve the profile count */
-static inline int getProfileCount(const JitEntry *entry)
-{
-    if (entry->dPC == 0 || entry->codeAddress == 0 ||
-        entry->codeAddress == dvmCompilerGetInterpretTemplate())
-        return 0;
-
-    u4 *pExecutionCount = (u4 *) getTraceBase(entry);
-
-    return *pExecutionCount;
-}
-
-
 /* qsort callback function */
 static int sortTraceProfileCount(const void *entry1, const void *entry2)
 {
     const JitEntry *jitEntry1 = (const JitEntry *)entry1;
     const JitEntry *jitEntry2 = (const JitEntry *)entry2;
 
-    int count1 = getProfileCount(jitEntry1);
-    int count2 = getProfileCount(jitEntry2);
+    JitTraceCounter_t count1 = getProfileCount(jitEntry1);
+    JitTraceCounter_t count2 = getProfileCount(jitEntry2);
     return (count1 == count2) ? 0 : ((count1 > count2) ? -1 : 1);
 }
 
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index 6473edb..181a128 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -3536,7 +3536,6 @@
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
 }
 
-#if defined(WITH_SELF_VERIFICATION) || defined(WITH_JIT_TUNING)
 /* Chaining cell for branches that branch back into the same basic block */
 static void handleBackwardBranchChainingCell(CompilationUnit *cUnit,
                                              unsigned int offset)
@@ -3558,7 +3557,6 @@
     addWordData(cUnit, (int) (cUnit->method->insns + offset), true);
 }
 
-#endif
 /* Chaining cell for monomorphic method invocations. */
 static void handleInvokeSingletonChainingCell(CompilationUnit *cUnit,
                                               const Method *callee)
@@ -3944,39 +3942,8 @@
     GrowableListIterator iterator;
     dvmGrowableListIteratorInit(&cUnit->blockList, &iterator);
 
-    if (cUnit->executionCount) {
-        /*
-         * Reserve 6 bytes at the beginning of the trace
-         *        +----------------------------+
-         *        | execution count (4 bytes)  |
-         *        +----------------------------+
-         *        | chain cell offset (2 bytes)|
-         *        +----------------------------+
-         * ...and then code to increment the execution
-         * count:
-         *       mov   r0, pc       @ move adr of "mov r0,pc" + 4 to r0
-         *       sub   r0, #10      @ back up to addr of executionCount
-         *       ldr   r1, [r0]
-         *       add   r1, #1
-         *       str   r1, [r0]
-         */
-        newLIR1(cUnit, kArm16BitData, 0);
-        newLIR1(cUnit, kArm16BitData, 0);
-        cUnit->chainCellOffsetLIR =
-            (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
-        cUnit->headerSize = 6;
-        /* Thumb instruction used directly here to ensure correct size */
-        newLIR2(cUnit, kThumbMovRR_H2L, r0, rpc);
-        newLIR2(cUnit, kThumbSubRI8, r0, 10);
-        newLIR3(cUnit, kThumbLdrRRI5, r1, r0, 0);
-        newLIR2(cUnit, kThumbAddRI8, r1, 1);
-        newLIR3(cUnit, kThumbStrRRI5, r1, r0, 0);
-    } else {
-         /* Just reserve 2 bytes for the chain cell offset */
-        cUnit->chainCellOffsetLIR =
-            (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
-        cUnit->headerSize = 2;
-    }
+    /* Traces start with a profiling entry point.  Generate it here */
+    cUnit->profileCodeSize = genTraceProfileEntry(cUnit);
 
     /* Handle the content in each basic block */
     for (i = 0; ; i++) {
@@ -4062,7 +4029,6 @@
                         opReg(cUnit, kOpBlx, r1);
                     }
                     break;
-#if defined(WITH_SELF_VERIFICATION) || defined(WITH_JIT_TUNING)
                 case kChainingCellBackwardBranch:
                     labelList[i].opcode =
                         kArmPseudoChainingCellBackwardBranch;
@@ -4071,7 +4037,6 @@
                         &chainingListByType[kChainingCellBackwardBranch],
                         i);
                     break;
-#endif
                 default:
                     break;
             }
@@ -4303,12 +4268,10 @@
                 case kChainingCellHot:
                     handleHotChainingCell(cUnit, chainingBlock->startOffset);
                     break;
-#if defined(WITH_SELF_VERIFICATION) || defined(WITH_JIT_TUNING)
                 case kChainingCellBackwardBranch:
                     handleBackwardBranchChainingCell(cUnit,
                         chainingBlock->startOffset);
                     break;
-#endif
                 default:
                     LOGE("Bad blocktype %d", chainingBlock->blockType);
                     dvmCompilerAbort(cUnit);
@@ -4342,11 +4305,15 @@
 #endif
 }
 
-/* Accept the work and start compiling */
+/*
+ * Accept the work and start compiling.  Returns true if compilation
+ * is attempted.
+ */
 bool dvmCompilerDoWork(CompilerWorkOrder *work)
 {
     JitTraceDescription *desc;
-    bool res;
+    bool isCompile;
+    bool success = true;
 
     if (gDvmJit.codeCacheFull) {
         return false;
@@ -4354,27 +4321,35 @@
 
     switch (work->kind) {
         case kWorkOrderTrace:
+            isCompile = true;
             /* Start compilation with maximally allowed trace length */
             desc = (JitTraceDescription *)work->info;
-            res = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
-                                  work->bailPtr, 0 /* no hints */);
+            success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
+                                        work->bailPtr, 0 /* no hints */);
             break;
         case kWorkOrderTraceDebug: {
             bool oldPrintMe = gDvmJit.printMe;
             gDvmJit.printMe = true;
+            isCompile = true;
             /* Start compilation with maximally allowed trace length */
             desc = (JitTraceDescription *)work->info;
-            res = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
-                                  work->bailPtr, 0 /* no hints */);
+            success = dvmCompileTrace(desc, JIT_MAX_TRACE_LEN, &work->result,
+                                        work->bailPtr, 0 /* no hints */);
             gDvmJit.printMe = oldPrintMe;
             break;
         }
+        case kWorkOrderProfileMode:
+            dvmJitChangeProfileMode((TraceProfilingModes)work->info);
+            isCompile = false;
+            break;
         default:
-            res = false;
+            isCompile = false;
             LOGE("Jit: unknown work order type");
             assert(0);  // Bail if debug build, discard otherwise
     }
-    return res;
+    if (!success)
+        work->result.codeAddress = NULL;
+    return isCompile;
 }
 
 /* Architectural-specific debugging helpers go here */
diff --git a/vm/compiler/codegen/arm/Thumb/Gen.c b/vm/compiler/codegen/arm/Thumb/Gen.c
index 07f3f09..b806965 100644
--- a/vm/compiler/codegen/arm/Thumb/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb/Gen.c
@@ -23,6 +23,62 @@
  */
 
 /*
+ * Reserve 6 bytes at the beginning of the trace
+ *        +----------------------------+
+ *        | prof count addr (4 bytes)  |
+ *        +----------------------------+
+ *        | chain cell offset (2 bytes)|
+ *        +----------------------------+
+ *
+ * ...and then code to increment the execution count.
+ *
+ * For continuous profiling (12 bytes):
+ *
+ *       mov   r0, pc       @ move adr of "mov r0,pc" + 4 to r0
+ *       sub   r0, #10      @ back up to addr of prof count pointer
+ *       ldr   r0, [r0]     @ get address of counter
+ *       ldr   r1, [r0]
+ *       add   r1, #1
+ *       str   r1, [r0]
+ *
+ * For periodic profiling (4 bytes):
+ *       call  TEMPLATE_PERIODIC_PROFILING
+ *
+ * and return the size (in bytes) of the generated code.
+ */
+
+static int genTraceProfileEntry(CompilationUnit *cUnit)
+{
+    intptr_t addr = (intptr_t)dvmJitNextTraceCounter();
+    assert(__BYTE_ORDER == __LITTLE_ENDIAN);
+    newLIR1(cUnit, kArm16BitData, addr & 0xffff);
+    newLIR1(cUnit, kArm16BitData, (addr >> 16) & 0xffff);
+    cUnit->chainCellOffsetLIR =
+        (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
+    cUnit->headerSize = 6;
+    if ((gDvmJit.profileMode == kTraceProfilingContinuous) ||
+        (gDvmJit.profileMode == kTraceProfilingDisabled)) {
+        /* Thumb instruction used directly here to ensure correct size */
+        newLIR2(cUnit, kThumbMovRR_H2L, r0, rpc);
+        newLIR2(cUnit, kThumbSubRI8, r0, 10);
+        newLIR3(cUnit, kThumbLdrRRI5, r0, r0, 0);
+        newLIR3(cUnit, kThumbLdrRRI5, r1, r0, 0);
+        newLIR2(cUnit, kThumbAddRI8, r1, 1);
+        newLIR3(cUnit, kThumbStrRRI5, r1, r0, 0);
+        return 12;
+    } else {
+        int opcode = TEMPLATE_PERIODIC_PROFILING;
+        newLIR2(cUnit, kThumbBlx1,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        newLIR2(cUnit, kThumbBlx2,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        return 4;
+    }
+}
+
+/*
  * Perform a "reg cmp imm" operation and jump to the PCR region if condition
  * satisfies.
  */
diff --git a/vm/compiler/codegen/arm/Thumb2/Gen.c b/vm/compiler/codegen/arm/Thumb2/Gen.c
index 0891524..f5e1096 100644
--- a/vm/compiler/codegen/arm/Thumb2/Gen.c
+++ b/vm/compiler/codegen/arm/Thumb2/Gen.c
@@ -15,13 +15,64 @@
  */
 
 /*
- * This file contains codegen for the Thumb ISA and is intended to be
+ * This file contains codegen for the Thumb2 ISA and is intended to be
  * includes by:
  *
  *        Codegen-$(TARGET_ARCH_VARIANT).c
  *
  */
 
+/*
+ * Reserve 6 bytes at the beginning of the trace
+ *        +----------------------------+
+ *        | prof count addr (4 bytes)  |
+ *        +----------------------------+
+ *        | chain cell offset (2 bytes)|
+ *        +----------------------------+
+ *
+ * ...and then code to increment the execution count.
+ *
+ * For continuous profiling (10 bytes)
+ *       ldr   r0, [pc-8]   @ get prof count addr    [4 bytes]
+ *       ldr   r1, [r0]     @ load counter           [2 bytes]
+ *       add   r1, #1       @ increment              [2 bytes]
+ *       str   r1, [r0]     @ store                  [2 bytes]
+ *
+ * For periodic profiling (4 bytes)
+ *       call  TEMPLATE_PERIODIC_PROFILING
+ *
+ * and return the size (in bytes) of the generated code.
+ */
+
+static int genTraceProfileEntry(CompilationUnit *cUnit)
+{
+    intptr_t addr = (intptr_t)dvmJitNextTraceCounter();
+    assert(__BYTE_ORDER == __LITTLE_ENDIAN);
+    newLIR1(cUnit, kArm16BitData, addr & 0xffff);
+    newLIR1(cUnit, kArm16BitData, (addr >> 16) & 0xffff);
+    cUnit->chainCellOffsetLIR =
+        (LIR *) newLIR1(cUnit, kArm16BitData, CHAIN_CELL_OFFSET_TAG);
+    cUnit->headerSize = 6;
+    if ((gDvmJit.profileMode == kTraceProfilingContinuous) ||
+        (gDvmJit.profileMode == kTraceProfilingDisabled)) {
+        /* Thumb[2] instruction used directly here to ensure correct size */
+        newLIR2(cUnit, kThumb2LdrPcReln12, r0, 8);
+        newLIR3(cUnit, kThumbLdrRRI5, r1, r0, 0);
+        newLIR2(cUnit, kThumbAddRI8, r1, 1);
+        newLIR3(cUnit, kThumbStrRRI5, r1, r0, 0);
+        return 10;
+    } else {
+        int opcode = TEMPLATE_PERIODIC_PROFILING;
+        newLIR2(cUnit, kThumbBlx1,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        newLIR2(cUnit, kThumbBlx2,
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode],
+            (int) gDvmJit.codeCache + templateEntryOffsets[opcode]);
+        return 4;
+    }
+}
+
 static void genNegFloat(CompilationUnit *cUnit, RegLocation rlDest,
                         RegLocation rlSrc)
 {