JIT: Reworked the assembler to be smarter about short instruction forms

Previously, the JIT wasn't generating the short-form compare-and-branch on
zero/not-zero instructions (cbz/cbnz) for Thumb2.  The reason was that these
only allow a short forward displacement (0-126 bytes), and when the target
didn't reach, the assembler would abort the trace, split it in half and try
again.  This change re-enables cbz/cbnz generation and introduces a
relatively lightweight retry mechanism: when a short form doesn't reach, the
assembler rewrites it in place to an equivalent longer sequence and requests
a retry instead of immediately halving the trace.
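
As a sketch of that fallback, the reach check in assembleInstructions() now
looks roughly like this (condensed from the Assemble.c hunk below; resource
mask setup and some field initialization omitted):

    /* cb[n]z can only reach 0..126 bytes forward */
    if (delta > 126 || delta < 0) {
        /* Rewrite as: cmp rx, #0  followed by  b<eq|ne> target */
        ArmLIR *newInst = dvmCompilerNew(sizeof(ArmLIR), true);
        newInst->opCode = kThumbBCond;
        newInst->operands[1] = (lir->opCode == kThumb2Cbz) ? kArmCondEq
                                                           : kArmCondNe;
        newInst->generic.target = lir->generic.target;
        dvmCompilerInsertLIRAfter((LIR *)lir, (LIR *)newInst);
        lir->opCode = kThumbCmpRI8;           /* becomes cmp rx, #0 */
        lir->operands[0] = lir->operands[1];  /* register moves to operand 0 */
        lir->operands[1] = 0;
        return kRetryAll;                     /* rerun the assembly pass */
    } else {
        lir->operands[1] = delta >> 1;        /* short form fits */
    }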

Also changes Thumb2 codegen to always generate the large-displacement forms
of literal loads and conditional branches, to minimize the number of retry
attempts.
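
The resulting control flow is roughly the following (condensed from the
Frontend.c and Assemble.c hunks below):

    /* Frontend: loop while the assembler asks for a full retry */
    do {
        dvmCompilerAssembleLIR(&cUnit, info);
        cUnit.assemblerRetries++;
    } while (cUnit.assemblerStatus == kRetryAll);

    if (cUnit.assemblerStatus == kRetryHalve) {
        /* Rewriting wasn't enough - halve the trace and recompile */
        return dvmCompileTrace(desc, cUnit.numInsts / 2, info, bailPtr,
                               optHints);
    }

    /* kRetryAll is capped at MAX_ASSEMBLER_RETRIES (10) inside
     * dvmCompilerAssembleLIR, after which it is downgraded to kRetryHalve. */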

Change-Id: Icf066836fad203f5c0fcbbb2ae8e1aa73d1cf816
diff --git a/vm/compiler/CompilerIR.h b/vm/compiler/CompilerIR.h
index 21aadec..82b97e5 100644
--- a/vm/compiler/CompilerIR.h
+++ b/vm/compiler/CompilerIR.h
@@ -152,6 +152,12 @@
 struct LoopAnalysis;
 struct RegisterPool;
 
+typedef enum AssemblerStatus {
+    kSuccess,
+    kRetryAll,
+    kRetryHalve
+} AssemblerStatus;
+
 typedef struct CompilationUnit {
     int numInsts;
     int numBlocks;
@@ -166,11 +172,12 @@
     int headerSize;                     // bytes before the first code ptr
     int dataOffset;                     // starting offset of literal pool
     int totalSize;                      // header + code size
+    AssemblerStatus assemblerStatus;    // Success or fix and retry
+    int assemblerRetries;               // How many times tried to fix assembly
     unsigned char *codeBuffer;
     void *baseAddr;
     bool printMe;
     bool allSingleStep;
-    bool halveInstCount;
     bool executionCount;                // Add code to count trace executions
     bool hasLoop;                       // Contains a loop
     bool hasInvoke;                     // Contains an invoke instruction
diff --git a/vm/compiler/Frontend.c b/vm/compiler/Frontend.c
index a828886..8e5a5e2 100644
--- a/vm/compiler/Frontend.c
+++ b/vm/compiler/Frontend.c
@@ -918,15 +918,17 @@
     /* Convert MIR to LIR, etc. */
     dvmCompilerMIR2LIR(&cUnit);
 
-    /* Convert LIR into machine code. */
-    dvmCompilerAssembleLIR(&cUnit, info);
+    /* Convert LIR into machine code. Loop for recoverable retries */
+    do {
+        dvmCompilerAssembleLIR(&cUnit, info);
+        cUnit.assemblerRetries++;
+        if (cUnit.printMe && cUnit.assemblerStatus != kSuccess)
+            LOGD("Assembler abort #%d on %d",cUnit.assemblerRetries,
+                  cUnit.assemblerStatus);
+    } while (cUnit.assemblerStatus == kRetryAll);
 
     if (cUnit.printMe) {
-        if (cUnit.halveInstCount) {
-            LOGD("Assembler aborted");
-        } else {
-            dvmCompilerCodegenDump(&cUnit);
-        }
+        dvmCompilerCodegenDump(&cUnit);
         LOGD("End %s%s, %d Dalvik instructions",
              desc->method->clazz->descriptor, desc->method->name,
              cUnit.numInsts);
@@ -935,18 +937,17 @@
     /* Reset the compiler resource pool */
     dvmCompilerArenaReset();
 
-    /* Success */
-    if (!cUnit.halveInstCount) {
-#if defined(WITH_JIT_TUNING)
-        methodStats->nativeSize += cUnit.totalSize;
-#endif
-        return info->codeAddress != NULL;
-
-    /* Halve the instruction count and retry again */
-    } else {
+    if (cUnit.assemblerStatus == kRetryHalve) {
+        /* Halve the instruction count and start from the top */
         return dvmCompileTrace(desc, cUnit.numInsts / 2, info, bailPtr,
                                optHints);
     }
+
+    assert(cUnit.assemblerStatus == kSuccess);
+#if defined(WITH_JIT_TUNING)
+    methodStats->nativeSize += cUnit.totalSize;
+#endif
+    return info->codeAddress != NULL;
 }
 
 /*
@@ -1290,7 +1291,7 @@
     /* Convert LIR into machine code. */
     dvmCompilerAssembleLIR(cUnit, info);
 
-    if (cUnit->halveInstCount) {
+    if (cUnit->assemblerStatus != kSuccess) {
         return false;
     }
 
diff --git a/vm/compiler/codegen/arm/Assemble.c b/vm/compiler/codegen/arm/Assemble.c
index c1b08a3..4f975b3 100644
--- a/vm/compiler/codegen/arm/Assemble.c
+++ b/vm/compiler/codegen/arm/Assemble.c
@@ -20,9 +20,12 @@
 
 #include "../../CompilerInternals.h"
 #include "ArmLIR.h"
+#include "Codegen.h"
 #include <unistd.h>             /* for cacheflush */
 #include <sys/mman.h>           /* for protection change */
 
+#define MAX_ASSEMBLER_RETRIES 10
+
 /*
  * opcode: ArmOpCode enum
  * skeleton: pre-designated bit-pattern for this opcode
@@ -914,8 +917,14 @@
     return sizeof(JitTraceDescription) + ((runCount+1) * sizeof(JitTraceRun));
 }
 
-/* Return TRUE if error happens */
-static bool assembleInstructions(CompilationUnit *cUnit, intptr_t startAddr)
+/*
+ * Assemble the LIR into binary instruction format.  Note that we may
+ * discover that pc-relative displacements may not fit the selected
+ * instruction.  In those cases we will try to substitute a new code
+ * sequence or request that the trace be shortened and retried.
+ */
+static AssemblerStatus assembleInstructions(CompilationUnit *cUnit,
+                                            intptr_t startAddr)
 {
     short *bufferAddr = (short *) cUnit->codeBuffer;
     ArmLIR *lir;
@@ -952,9 +961,9 @@
                 dvmCompilerAbort(cUnit);
             }
             if ((lir->opCode == kThumb2LdrPcRel12) && (delta > 4091)) {
-                return true;
+                return kRetryHalve;
             } else if (delta > 1020) {
-                return true;
+                return kRetryHalve;
             }
             if (lir->opCode == kThumb2Vldrs) {
                 lir->operands[2] = delta >> 2;
@@ -968,11 +977,23 @@
             intptr_t target = targetLIR->generic.offset;
             int delta = target - pc;
             if (delta > 126 || delta < 0) {
-                /*
-                 * TODO: allow multiple kinds of assembler failure to allow
-                 * change of code patterns when things don't fit.
-                 */
-                return true;
+                /* Convert to cmp rx,#0 / b[eq/ne] tgt pair */
+                ArmLIR *newInst = dvmCompilerNew(sizeof(ArmLIR), true);
+                /* Make new branch instruction and insert after */
+                newInst->opCode = kThumbBCond;
+                newInst->operands[0] = 0;
+                newInst->operands[1] = (lir->opCode == kThumb2Cbz) ?
+                                        kArmCondEq : kArmCondNe;
+                newInst->generic.target = lir->generic.target;
+                dvmCompilerSetupResourceMasks(newInst);
+                dvmCompilerInsertLIRAfter((LIR *)lir, (LIR *)newInst);
+                /* Convert the cb[n]z to a cmp rx, #0 */
+                lir->opCode = kThumbCmpRI8;
+                lir->operands[0] = lir->operands[1];
+                lir->operands[1] = 0;
+                lir->generic.target = 0;
+                dvmCompilerSetupResourceMasks(lir);
+                return kRetryAll;
             } else {
                 lir->operands[1] = delta >> 1;
             }
@@ -983,7 +1004,7 @@
             intptr_t target = targetLIR->generic.offset;
             int delta = target - pc;
             if ((lir->opCode == kThumbBCond) && (delta > 254 || delta < -256)) {
-                return true;
+                return kRetryHalve;
             }
             lir->operands[0] = delta >> 1;
         } else if (lir->opCode == kThumbBUncond) {
@@ -1029,18 +1050,12 @@
                     bits |= value;
                     break;
                 case kFmtBrOffset:
-                    /*
-                     * NOTE: branch offsets are not handled here, but
-                     * in the main assembly loop (where label values
-                     * are known).  For reference, here is what the
-                     * encoder handing would be:
-                         value = ((operand  & 0x80000) >> 19) << 26;
-                         value |= ((operand & 0x40000) >> 18) << 11;
-                         value |= ((operand & 0x20000) >> 17) << 13;
-                         value |= ((operand & 0x1f800) >> 11) << 16;
-                         value |= (operand  & 0x007ff);
-                         bits |= value;
-                     */
+                    value = ((operand  & 0x80000) >> 19) << 26;
+                    value |= ((operand & 0x40000) >> 18) << 11;
+                    value |= ((operand & 0x20000) >> 17) << 13;
+                    value |= ((operand & 0x1f800) >> 11) << 16;
+                    value |= (operand  & 0x007ff);
+                    bits |= value;
                     break;
                 case kFmtShift5:
                     value = ((operand & 0x1c) >> 2) << 12;
@@ -1117,7 +1132,7 @@
         }
         *bufferAddr++ = bits & 0xffff;
     }
-    return false;
+    return kSuccess;
 }
 
 #if defined(SIGNATURE_BREAKPOINT)
@@ -1277,16 +1292,29 @@
         return;
     }
 
-    bool assemblerFailure = assembleInstructions(
-        cUnit, (intptr_t) gDvmJit.codeCache + gDvmJit.codeCacheByteUsed);
-
     /*
-     * Currently the only reason that can cause the assembler to fail is due to
-     * trace length - cut it in half and retry.
+     * Attempt to assemble the trace.  Note that assembleInstructions
+     * may rewrite the code sequence and request a retry.
      */
-    if (assemblerFailure) {
-        cUnit->halveInstCount = true;
-        return;
+    cUnit->assemblerStatus = assembleInstructions(cUnit,
+          (intptr_t) gDvmJit.codeCache + gDvmJit.codeCacheByteUsed);
+
+    switch(cUnit->assemblerStatus) {
+        case kSuccess:
+            break;
+        case kRetryAll:
+            if (cUnit->assemblerRetries < MAX_ASSEMBLER_RETRIES) {
+                return;
+            }
+            /* Too many retries - reset and try cutting the trace in half */
+            cUnit->assemblerRetries = 0;
+            cUnit->assemblerStatus = kRetryHalve;
+            return;
+        case kRetryHalve:
+            return;
+        default:
+             LOGE("Unexpected assembler status: %d", cUnit->assemblerStatus);
+             dvmAbort();
     }
 
 #if defined(SIGNATURE_BREAKPOINT)
diff --git a/vm/compiler/codegen/arm/Codegen.h b/vm/compiler/codegen/arm/Codegen.h
index ca0a843..be74e3f 100644
--- a/vm/compiler/codegen/arm/Codegen.h
+++ b/vm/compiler/codegen/arm/Codegen.h
@@ -78,6 +78,8 @@
 extern void dvmCompilerRegCopyWide(CompilationUnit *cUnit, int destLo,
                                    int destHi, int srcLo, int srcHi);
 
+extern void dvmCompilerSetupResourceMasks(ArmLIR *lir);
+
 extern void dvmCompilerFlushRegImpl(CompilationUnit *cUnit, int rBase,
                                     int displacement, int rSrc, OpSize size);
 
diff --git a/vm/compiler/codegen/arm/CodegenDriver.c b/vm/compiler/codegen/arm/CodegenDriver.c
index dcabd8e..1ff9409 100644
--- a/vm/compiler/codegen/arm/CodegenDriver.c
+++ b/vm/compiler/codegen/arm/CodegenDriver.c
@@ -4426,6 +4426,12 @@
                       templateEntryOffsets[TEMPLATE_INTERPRET]);
 }
 
+/* Needed by the Assembler */
+void dvmCompilerSetupResourceMasks(ArmLIR *lir)
+{
+    setupResourceMasks(lir);
+}
+
 /* Needed by the ld/st optmizatons */
 ArmLIR* dvmCompilerRegCopyNoInsert(CompilationUnit *cUnit, int rDest, int rSrc)
 {
diff --git a/vm/compiler/codegen/arm/Thumb2/Factory.c b/vm/compiler/codegen/arm/Thumb2/Factory.c
index c7b52fd..2bf2940 100644
--- a/vm/compiler/codegen/arm/Thumb2/Factory.c
+++ b/vm/compiler/codegen/arm/Thumb2/Factory.c
@@ -171,7 +171,7 @@
         dataTarget = addWordData(cUnit, value, false);
     }
     ArmLIR *loadPcRel = dvmCompilerNew(sizeof(ArmLIR), true);
-    loadPcRel->opCode = LOWREG(rDest) ? kThumbLdrPcRel : kThumb2LdrPcRel12;
+    loadPcRel->opCode = kThumb2LdrPcRel12;
     loadPcRel->generic.target = (LIR *) dataTarget;
     loadPcRel->operands[0] = rDest;
     setupResourceMasks(loadPcRel);
@@ -227,7 +227,7 @@
 
 static ArmLIR *opCondBranch(CompilationUnit *cUnit, ArmConditionCode cc)
 {
-    return newLIR2(cUnit, kThumbBCond, 0 /* offset to be patched */, cc);
+    return newLIR2(cUnit, kThumb2BCond, 0 /* offset to be patched */, cc);
 }
 
 static ArmLIR *opImm(CompilationUnit *cUnit, OpKind op, int value)
@@ -1100,15 +1100,7 @@
 {
     ArmLIR *branch;
     int modImm;
-    /*
-     * TODO: re-enable usage of kThumb2Cbz & kThumb2Cbnz once assembler is
-     * enhanced to allow us to replace code patterns when instructions don't
-     * reach.  Currently, CB[N]Z is causing too many assembler aborts.
-     * What we want to do is emit the short forms, and then replace them with
-     * longer versions when needed.
-     */
-
-    if (0 && (LOWREG(reg)) && (checkValue == 0) &&
+    if ((LOWREG(reg)) && (checkValue == 0) &&
        ((cond == kArmCondEq) || (cond == kArmCondNe))) {
         branch = newLIR2(cUnit,
                          (cond == kArmCondEq) ? kThumb2Cbz : kThumb2Cbnz,