Add trace profiling support for the JIT
diff --git a/vm/compiler/CompilerIR.h b/vm/compiler/CompilerIR.h
index 1eb7c40..8a2028a 100644
--- a/vm/compiler/CompilerIR.h
+++ b/vm/compiler/CompilerIR.h
@@ -75,6 +75,7 @@
     LIR *firstLIRInsn;
     LIR *lastLIRInsn;
     LIR *wordList;
+    LIR *chainCellOffsetLIR;            // LIR reserving the chain cell offset
     GrowableList pcReconstructionList;
     int headerSize;                     // bytes before the first code ptr
     int dataOffset;                     // starting offset of literal pool
@@ -84,6 +85,7 @@
     bool printMe;
     bool allSingleStep;
     bool halveInstCount;
+    bool executionCount;                // Add code to count trace executions
     int numChainingCells[CHAINING_CELL_LAST];
     LIR *firstChainingLIR[CHAINING_CELL_LAST];
     RegisterScoreboard registerScoreboard;      // Track register dependency
diff --git a/vm/compiler/Frontend.c b/vm/compiler/Frontend.c
index 5e75d15..c8f3abc 100644
--- a/vm/compiler/Frontend.c
+++ b/vm/compiler/Frontend.c
@@ -206,6 +206,9 @@
     /* Initialize the printMe flag */
     cUnit.printMe = gDvmJit.printMe;
 
+    /* Copy the profile flag; when set, traces get execution-count code */
+    cUnit.executionCount = gDvmJit.profile;
+
     /* Identify traces that we don't want to compile */
     if (gDvmJit.methodTable) {
         int len = strlen(desc->method->clazz->descriptor) +
diff --git a/vm/compiler/codegen/armv5te/Armv5teLIR.h b/vm/compiler/codegen/armv5te/Armv5teLIR.h
index dcf501b..f0a3f42 100644
--- a/vm/compiler/codegen/armv5te/Armv5teLIR.h
+++ b/vm/compiler/codegen/armv5te/Armv5teLIR.h
@@ -36,8 +36,19 @@
     rFP = 5,
     rGLUE = 6,
     r7 = 7,
+    r8 = 8,
+    r9 = 9,
+    r10 = 10,
+    r11 = 11,
+    r12 = 12,
+    r13 = 13,
+    rlr = 14,
+    rpc = 15
 } NativeRegisterPool;
 
+/* Thumb: keep the low 3 bits of a register number (high reg -> low) */
+#define THUMB_REG_MASK 0x7
+
 /* Thumb condition encodings */
 typedef enum Armv5teConditionCode {
     ARM_COND_EQ = 0x0,    /* 0000 */
diff --git a/vm/compiler/codegen/armv5te/Assemble.c b/vm/compiler/codegen/armv5te/Assemble.c
index a59d27f..9b4595d 100644
--- a/vm/compiler/codegen/armv5te/Assemble.c
+++ b/vm/compiler/codegen/armv5te/Assemble.c
@@ -436,7 +436,6 @@
  * before sending them off to the assembler. If out-of-range branch distance is
  * seen rearrange the instructions a bit to correct it.
  */
-#define CHAIN_CELL_OFFSET_SIZE 2
 void dvmCompilerAssembleLIR(CompilationUnit *cUnit)
 {
     LIR *lir;
@@ -469,7 +468,8 @@
 
     /* Add space for chain cell counts & trace description */
     u4 chainCellOffset = offset;
-    Armv5teLIR *chainCellOffsetLIR = (Armv5teLIR *) (cUnit->firstLIRInsn);
+    Armv5teLIR *chainCellOffsetLIR = (Armv5teLIR *) cUnit->chainCellOffsetLIR;
+    assert(chainCellOffsetLIR);
     assert(chainCellOffset < 0x10000);
     assert(chainCellOffsetLIR->opCode == ARMV5TE_16BIT_DATA &&
            chainCellOffsetLIR->operands[0] == CHAIN_CELL_OFFSET_TAG);
@@ -517,8 +517,8 @@
         return;
     }
 
+
     cUnit->baseAddr = (char *) gDvmJit.codeCache + gDvmJit.codeCacheByteUsed;
-    cUnit->headerSize = CHAIN_CELL_OFFSET_SIZE;
     gDvmJit.codeCacheByteUsed += offset;
 
     /* Install the code block */
diff --git a/vm/compiler/codegen/armv5te/Codegen.c b/vm/compiler/codegen/armv5te/Codegen.c
index 86faa54..27bdec6 100644
--- a/vm/compiler/codegen/armv5te/Codegen.c
+++ b/vm/compiler/codegen/armv5te/Codegen.c
@@ -2842,11 +2842,36 @@
 
     BasicBlock **blockList = cUnit->blockList;
 
-    /*
-     * Reserve space at the beginning of each translation with fillers
-     * + Chain cell count (2 bytes)
-     */
-    newLIR1(cUnit, ARMV5TE_16BIT_DATA, CHAIN_CELL_OFFSET_TAG);
+    if (cUnit->executionCount) {
+        /*
+         * Reserve 6 bytes at the beginning of the trace
+         *        +----------------------------+
+         *        | execution count (4 bytes)  |
+         *        +----------------------------+
+         *        | chain cell offset (2 bytes)|
+         *        +----------------------------+
+         * ...and then code (using r0/r1 as scratch) to increment the
+         * execution count; NOTE(review): assumes word-aligned trace base:
+         *       mov   r0, pc       @ r0 <- address of this mov + 4
+         *       sub   r0, #10      @ 10 = 6-byte header + 4; r0 -> count
+         *       ldr   r1, [r0]     @ r1 <- current execution count
+         *       add   r1, #1       @ bump it
+         *       str   r1, [r0]     @ write it back
+         */
+        newLIR1(cUnit, ARMV5TE_16BIT_DATA, 0);
+        newLIR1(cUnit, ARMV5TE_16BIT_DATA, 0);
+        cUnit->chainCellOffsetLIR = newLIR1(cUnit, ARMV5TE_16BIT_DATA, CHAIN_CELL_OFFSET_TAG);
+        cUnit->headerSize = 6;
+        newLIR2(cUnit, ARMV5TE_MOV_RR_HL, r0, rpc & THUMB_REG_MASK);
+        newLIR2(cUnit, ARMV5TE_SUB_RI8, r0, 10);
+        newLIR3(cUnit, ARMV5TE_LDR_RRI5, r1, r0, 0);
+        newLIR2(cUnit, ARMV5TE_ADD_RI8, r1, 1);
+        newLIR3(cUnit, ARMV5TE_STR_RRI5, r1, r0, 0);
+    } else {
+        /* Just reserve 2 bytes for the chain cell offset */
+        cUnit->chainCellOffsetLIR = newLIR1(cUnit, ARMV5TE_16BIT_DATA, CHAIN_CELL_OFFSET_TAG);
+        cUnit->headerSize = 2;
+    }
 
     /* Handle the content in each basic block */
     for (i = 0; i < cUnit->numBlocks; i++) {