Improved codegen for inline, continuing codegen restructuring

Added support for Thumb2 IT.  Moved compare-long and floating point
comparisons inline.  Temporarily disabled use of Thumb2 CBZ & CBNZ
because they were causing too many out-of-range assembly restarts.
Bug fix for LIR3 assert.
diff --git a/vm/compiler/codegen/arm/Thumb2Util.c b/vm/compiler/codegen/arm/Thumb2Util.c
index f05a867..8d3ce07 100644
--- a/vm/compiler/codegen/arm/Thumb2Util.c
+++ b/vm/compiler/codegen/arm/Thumb2Util.c
@@ -45,6 +45,7 @@
 static ArmLIR *genBoundsCheck(CompilationUnit *cUnit, int rIndex,
                               int rBound, int dOffset, ArmLIR *pcrLabel);
 static ArmLIR *genRegCopy(CompilationUnit *cUnit, int rDest, int rSrc);
+static int inlinedTarget(MIR *mir);
 
 
 /* Routines which must be supplied here */
@@ -80,6 +81,8 @@
                            int rSrc1, int rSrc2);
 static ArmLIR *loadBaseIndexed(CompilationUnit *cUnit, int rBase,
                                int rIndex, int rDest, int scale, OpSize size);
+static void genCmpLong(CompilationUnit *cUnit, MIR *mir, int vDest, int vSrc1,
+                       int vSrc2);
 
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir);
 static bool genInlinedStringCharAt(CompilationUnit *cUnit, MIR *mir);
@@ -168,6 +171,46 @@
 
 }
 
+/*
+ * Generate a Thumb2 IT instruction, which can nullify up to
+ * four subsequent instructions based on a condition and its
+ * inverse.  The condition applies to the first instruction, which
+ * is executed if the condition is met.  The string "guide" consists
+ * of 0 to 3 chars, and applies to the 2nd through 4th instruction.
+ * A "T" means the instruction is executed if the condition is
+ * met, and an "E" means the instruction is executed if the condition
+ * is not met.
+ */
+static ArmLIR *genIT(CompilationUnit *cUnit, ArmConditionCode code,
+                     char *guide)
+{
+    int mask;
+    int condBit = code & 1;
+    int altBit = condBit ^ 1;
+    int mask3 = 0;
+    int mask2 = 0;
+    int mask1 = 0;
+    //Note: case fallthroughs intentional
+    switch(strlen(guide)) {
+        case 3:
+            mask1 = (guide[2] == 'T') ? condBit : altBit;
+        case 2:
+            mask2 = (guide[1] == 'T') ? condBit : altBit;
+        case 1:
+            mask3 = (guide[0] == 'T') ? condBit : altBit;
+            break;
+        case 0:
+            break;
+        default:
+            assert(0);
+            dvmAbort();
+    }
+    mask = (mask3 << 3) | (mask2 << 2) | (mask1 << 1) |
+           (1 << (3 - strlen(guide)));
+    return newLIR2(cUnit, THUMB2_IT, code, mask);
+}
+
+
 static ArmLIR *fpRegCopy(CompilationUnit *cUnit, int rDest, int rSrc)
 {
     ArmLIR* res = dvmCompilerNew(sizeof(ArmLIR), true);
@@ -279,10 +322,6 @@
     /* See if the value can be constructed cheaply */
     if ((value >= 0) && (value <= 255)) {
         return newLIR2(cUnit, THUMB_MOV_IMM, rDest, value);
-    } else if ((value & 0xFFFFFF00) == 0xFFFFFF00) {
-        res = newLIR2(cUnit, THUMB_MOV_IMM, rDest, ~value);
-        newLIR2(cUnit, THUMB_MVN, rDest, rDest);
-        return res;
     }
     /* Check Modified immediate special cases */
     modImm = modifiedImmediate(value);
@@ -599,7 +638,13 @@
 {
     ArmLIR *branch;
     int modImm;
-    if ((LOWREG(reg)) && (checkValue == 0) &&
+    /*
+     * TODO: re-enable usage of THUMB2_CBZ & THUMB2_CBNZ once assembler is enhanced
+     * to allow us to replace code patterns when instructions don't reach.  Currently,
+     * CB[N]Z is causing too many assembler aborts.  What we want to do is emit
+     * the short forms, and then replace them with longer versions when needed.
+     */
+    if (0 && (LOWREG(reg)) && (checkValue == 0) &&
        ((cond == ARM_COND_EQ) || (cond == ARM_COND_NE))) {
         branch = newLIR2(cUnit,
                          (cond == ARM_COND_EQ) ? THUMB2_CBZ : THUMB2_CBNZ,
@@ -974,12 +1019,10 @@
         case OP_ROR:
             return newLIR3(cUnit, THUMB2_ROR_RRI5, rDest, rSrc1, value);
         case OP_ADD:
-            if (LOWREG(rDest) && (rSrc1 == 13) && (value <= 1020)) { /* sp */
-                assert((value & 0x3) == 0);
+            if (LOWREG(rDest) && (rSrc1 == 13) && (value <= 1020) && ((value & 0x3)==0)) {
                 return newLIR3(cUnit, THUMB_ADD_SP_REL, rDest, rSrc1,
                                value >> 2);
-            } else if (LOWREG(rDest) && (rSrc1 == rpc) && (value <= 1020)) {
-                assert((value & 0x3) == 0);
+            } else if (LOWREG(rDest) && (rSrc1 == rpc) && (value <= 1020) && ((value & 0x3)==0)) {
                 return newLIR3(cUnit, THUMB_ADD_PC_REL, rDest, rSrc1,
                                value >> 2);
             }
@@ -1042,24 +1085,69 @@
         return newLIR3(cUnit, opCode, rDest, rSrc1, modImm);
     } else {
         loadConstant(cUnit, rScratch, value);
-        if (EncodingMap[opCode].flags & IS_QUAD_OP)
+        if (EncodingMap[altOpCode].flags & IS_QUAD_OP)
             return newLIR4(cUnit, altOpCode, rDest, rSrc1, rScratch, 0);
         else
             return newLIR3(cUnit, altOpCode, rDest, rSrc1, rScratch);
     }
 }
 
-//TODO: specialize the inlined routines for Thumb2
+/*
+ * 64-bit 3way compare function.
+ *     mov   r7, #-1
+ *     cmp   op1hi, op2hi
+ *     blt   done
+ *     bgt   flip
+ *     sub   r7, op1lo, op2lo (treat as unsigned)
+ *     beq   done
+ *     ite   hi
+ *     mov(hi)   r7, #-1
+ *     mov(!hi)  r7, #1
+ * flip:
+ *     neg   r7
+ * done:
+ */
+static void genCmpLong(CompilationUnit *cUnit, MIR *mir,
+                               int vDest, int vSrc1, int vSrc2)
+{
+    int op1lo = selectFirstRegister(cUnit, vSrc1, true);
+    int op1hi = NEXT_REG(op1lo);
+    int op2lo = NEXT_REG(op1hi);
+    int op2hi = NEXT_REG(op2lo);
+    loadValuePair(cUnit, vSrc1, op1lo, op1hi);
+    loadValuePair(cUnit, vSrc2, op2lo, op2hi);
+    /* Note: using hardcoded r7 & r4PC for now.  revisit */
+    loadConstant(cUnit, r7, -1);
+    opRegReg(cUnit, OP_CMP, op1hi, op2hi);
+    ArmLIR *branch1 = opImmImm(cUnit, OP_COND_BR, 0, ARM_COND_LT);
+    ArmLIR *branch2 = opImmImm(cUnit, OP_COND_BR, 0, ARM_COND_GT);
+    opRegRegReg(cUnit, OP_SUB, r7, op1lo, op2lo);
+    ArmLIR *branch3 = opImmImm(cUnit, OP_COND_BR, 0, ARM_COND_EQ);
+
+    // TODO: need assert mechanism to verify IT block size
+    branch1->generic.target = (LIR *) genIT(cUnit, ARM_COND_HI, "E");
+    newLIR2(cUnit, THUMB2_MOV_IMM_SHIFT, r7, modifiedImmediate(-1));
+    newLIR2(cUnit, THUMB_MOV_IMM, r7, 1);
+
+    branch2->generic.target = (LIR *) opRegReg(cUnit, OP_NEG, r7, r7);
+    branch1->generic.target = (LIR *) storeValue(cUnit, r7, vDest, r4PC);
+    branch3->generic.target = branch1->generic.target;
+}
+
 static bool genInlinedStringLength(CompilationUnit *cUnit, MIR *mir)
 {
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int offset = offsetof(InterpState, retval);
     int regObj = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int reg1 = NEXT_REG(regObj);
+    int vDest = inlinedTarget(mir);
     loadValue(cUnit, dInsn->arg[0], regObj);
     genNullCheck(cUnit, dInsn->arg[0], regObj, mir->offset, NULL);
     loadWordDisp(cUnit, regObj, gDvm.offJavaLangString_count, reg1);
-    storeWordDisp(cUnit, rGLUE, offset, reg1, regObj);
+    if (vDest >= 0)
+        storeValue(cUnit, reg1, vDest, regObj);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg1, rNone);
     return false;
 }
 
@@ -1072,6 +1160,7 @@
     int regIdx = NEXT_REG(regObj);
     int regMax = NEXT_REG(regIdx);
     int regOff = NEXT_REG(regMax);
+    int vDest = inlinedTarget(mir);
     loadValue(cUnit, dInsn->arg[0], regObj);
     loadValue(cUnit, dInsn->arg[1], regIdx);
     ArmLIR * pcrLabel = genNullCheck(cUnit, dInsn->arg[0], regObj,
@@ -1080,12 +1169,13 @@
     loadWordDisp(cUnit, regObj, gDvm.offJavaLangString_offset, regOff);
     loadWordDisp(cUnit, regObj, gDvm.offJavaLangString_value, regObj);
     genBoundsCheck(cUnit, regIdx, regMax, mir->offset, pcrLabel);
-
-    newLIR2(cUnit, THUMB_ADD_RI8, regObj, contents);
-    newLIR3(cUnit, THUMB_ADD_RRR, regIdx, regIdx, regOff);
-    newLIR3(cUnit, THUMB_ADD_RRR, regIdx, regIdx, regIdx);
-    newLIR3(cUnit, THUMB_LDRH_RRR, regMax, regObj, regIdx);
-    storeWordDisp(cUnit, rGLUE, offset, regMax, regObj);
+    opRegImm(cUnit, OP_ADD, regObj, contents, rNone);
+    opRegReg(cUnit, OP_ADD, regIdx, regOff);
+    loadBaseIndexed(cUnit, regObj, regIdx, regMax, 1, UNSIGNED_HALF);
+    if (vDest >= 0)
+        storeValue(cUnit, regMax, vDest, regObj);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, regMax, rNone);
     return false;
 }
 
@@ -1095,12 +1185,20 @@
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int reg0 = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int sign = NEXT_REG(reg0);
-    /* abs(x) = y<=x>>31, (x+y)^y.  Shorter in ARM/THUMB2, no skip in THUMB */
+    int vDest = inlinedTarget(mir);
+    /* abs(x) = y<=x>>31, (x+y)^y.  */
     loadValue(cUnit, dInsn->arg[0], reg0);
-    newLIR3(cUnit, THUMB_ASR, sign, reg0, 31);
-    newLIR3(cUnit, THUMB_ADD_RRR, reg0, reg0, sign);
-    newLIR2(cUnit, THUMB_EOR, reg0, sign);
-    storeWordDisp(cUnit, rGLUE, offset, reg0, sign);
+    /*
+     * Thumb2's IT block also yields 3 instructions, but imposes
+     * scheduling constraints.
+     */
+    opRegRegImm(cUnit, OP_ASR, sign, reg0, 31, rNone);
+    opRegReg(cUnit, OP_ADD, reg0, sign);
+    opRegReg(cUnit, OP_XOR, reg0, sign);
+    if (vDest >= 0)
+        storeValue(cUnit, reg0, vDest, sign);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg0, rNone);
     return false;
 }
 
@@ -1110,10 +1208,15 @@
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int reg0 = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int signMask = NEXT_REG(reg0);
+    int vDest = inlinedTarget(mir);
+    // TUNING: handle case of src already in FP reg
     loadValue(cUnit, dInsn->arg[0], reg0);
     loadConstant(cUnit, signMask, 0x7fffffff);
     newLIR2(cUnit, THUMB_AND_RR, reg0, signMask);
-    storeWordDisp(cUnit, rGLUE, offset, reg0, signMask);
+    if (vDest >= 0)
+        storeValue(cUnit, reg0, vDest, signMask);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg0, rNone);
     return false;
 }
 
@@ -1124,30 +1227,46 @@
     int oplo = selectFirstRegister(cUnit, dInsn->arg[0], true);
     int ophi = NEXT_REG(oplo);
     int signMask = NEXT_REG(ophi);
-    loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
-    loadConstant(cUnit, signMask, 0x7fffffff);
-    storeWordDisp(cUnit, rGLUE, offset, oplo, ophi);
-    newLIR2(cUnit, THUMB_AND_RR, ophi, signMask);
-    storeWordDisp(cUnit, rGLUE, offset + 4, ophi, oplo);
+    int vSrc = dInsn->arg[0];
+    int vDest = inlinedTarget(mir);
+    // TUNING: handle case of src already in FP reg
+    if (vDest >= 0) {
+        if (vDest == vSrc) {
+            loadValue(cUnit, vSrc+1, ophi);
+            opRegRegImm(cUnit, OP_AND, ophi, ophi, 0x7fffffff, signMask);
+            storeValue(cUnit, ophi, vDest + 1, signMask);
+        } else {
+            loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
+            opRegRegImm(cUnit, OP_AND, ophi, ophi, 0x7fffffff, signMask);
+            storeValuePair(cUnit, oplo, ophi, vDest, signMask);
+        }
+    } else {
+        loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
+        loadConstant(cUnit, signMask, 0x7fffffff);
+        storeWordDisp(cUnit, rGLUE, offset, oplo, rNone);
+        opRegReg(cUnit, OP_AND, ophi, signMask);
+        storeWordDisp(cUnit, rGLUE, offset + 4, ophi, rNone);
+    }
     return false;
 }
 
- /* No select in thumb, so we need to branch.  Thumb2 will do better */
 static bool genInlinedMinMaxInt(CompilationUnit *cUnit, MIR *mir, bool isMin)
 {
     int offset = offsetof(InterpState, retval);
     DecodedInstruction *dInsn = &mir->dalvikInsn;
     int reg0 = selectFirstRegister(cUnit, dInsn->arg[0], false);
     int reg1 = NEXT_REG(reg0);
+    int vDest = inlinedTarget(mir);
     loadValue(cUnit, dInsn->arg[0], reg0);
     loadValue(cUnit, dInsn->arg[1], reg1);
-    newLIR2(cUnit, THUMB_CMP_RR, reg0, reg1);
-    ArmLIR *branch1 = newLIR2(cUnit, THUMB_B_COND, 2,
-           isMin ? ARM_COND_LT : ARM_COND_GT);
-    newLIR2(cUnit, THUMB_MOV_RR, reg0, reg1);
-    ArmLIR *target =
-        newLIR3(cUnit, THUMB_STR_RRI5, reg0, rGLUE, offset >> 2);
-    branch1->generic.target = (LIR *)target;
+    opRegReg(cUnit, OP_CMP, reg0, reg1);
+    //TODO: need assertion mechanism to validate IT region size
+    genIT(cUnit, (isMin) ? ARM_COND_GT : ARM_COND_LT, "");
+    opRegReg(cUnit, OP_MOV, reg0, reg1);
+    if (vDest >= 0)
+        storeValue(cUnit, reg0, vDest, reg1);
+    else
+        storeWordDisp(cUnit, rGLUE, offset, reg0, rNone);
     return false;
 }
 
@@ -1158,14 +1277,24 @@
     int oplo = selectFirstRegister(cUnit, dInsn->arg[0], true);
     int ophi = NEXT_REG(oplo);
     int sign = NEXT_REG(ophi);
-    /* abs(x) = y<=x>>31, (x+y)^y.  Shorter in ARM/THUMB2, no skip in THUMB */
+    int vDest = inlinedTarget(mir);
+    /* abs(x) = y<=x>>31, (x+y)^y. */
     loadValuePair(cUnit, dInsn->arg[0], oplo, ophi);
-    newLIR3(cUnit, THUMB_ASR, sign, ophi, 31);
-    newLIR3(cUnit, THUMB_ADD_RRR, oplo, oplo, sign);
-    newLIR2(cUnit, THUMB_ADC, ophi, sign);
-    newLIR2(cUnit, THUMB_EOR, oplo, sign);
-    newLIR2(cUnit, THUMB_EOR, ophi, sign);
-    storeWordDisp(cUnit, rGLUE, offset, oplo, sign);
-    storeWordDisp(cUnit, rGLUE, offset + 4, ophi, sign);
+    /*
+     * Thumb2 IT block allows slightly shorter sequence,
+     * but introduces a scheduling barrier.  Stick with this
+     * mechanism for now.
+     */
+    opRegRegImm(cUnit, OP_ASR, sign, ophi, 31, rNone);
+    opRegReg(cUnit, OP_ADD, oplo, sign);
+    opRegReg(cUnit, OP_ADC, ophi, sign);
+    opRegReg(cUnit, OP_XOR, oplo, sign);
+    opRegReg(cUnit, OP_XOR, ophi, sign);
+    if (vDest >= 0) {
+        storeValuePair(cUnit, oplo, ophi, vDest, sign);
+    } else {
+        storeWordDisp(cUnit, rGLUE, offset, oplo, rNone);
+        storeWordDisp(cUnit, rGLUE, offset + 4, ophi, rNone);
+    }
     return false;
 }