Loop detection, improved reg allocation

Detect loops and compute loop nesting depth, then use the nesting
depth to weight register use counts. The weighted counts are used
to determine which registers to promote.

Also:

   o Fixed typo (logical && used instead of bitwise & when testing
     disableOpt) that prevented squashing of useless fp reg copies.

   o Rescheduled array access checks to hide latency of limit load.

   o Added a basic-block optimization pass to remove duplicate range
     checks.

   o Fixed bug that prevented recognition of redundant null
     checks following iput-wide and aput-wide.

Change-Id: Icfbae39e89b1d14b8703ad6bbb0b29c0635fed1e
diff --git a/src/compiler/codegen/CodegenUtil.cc b/src/compiler/codegen/CodegenUtil.cc
index 27433ca..cc31b29 100644
--- a/src/compiler/codegen/CodegenUtil.cc
+++ b/src/compiler/codegen/CodegenUtil.cc
@@ -371,6 +371,7 @@
     LOG(INFO) << "Outs                 : " << cUnit->numOuts;
     LOG(INFO) << "CoreSpills           : " << cUnit->numCoreSpills;
     LOG(INFO) << "FPSpills             : " << cUnit->numFPSpills;
+    LOG(INFO) << "CompilerTemps        : " << cUnit->numCompilerTemps;
     LOG(INFO) << "Frame size           : " << cUnit->frameSize;
     LOG(INFO) << "code size is " << cUnit->totalSize <<
         " bytes, Dalvik size is " << insnsSize * 2;
diff --git a/src/compiler/codegen/GenCommon.cc b/src/compiler/codegen/GenCommon.cc
index bfc5639..34d9fb9 100644
--- a/src/compiler/codegen/GenCommon.cc
+++ b/src/compiler/codegen/GenCommon.cc
@@ -1341,22 +1341,23 @@
         opRegCopy(cUnit, regPtr, rlArray.lowReg);
     }
 
-    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
-        int regLen = oatAllocTemp(cUnit);
+    bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    int regLen = INVALID_REG;
+    if (needsRangeCheck) {
+        regLen = oatAllocTemp(cUnit);
         //NOTE: max live temps(4) here.
         /* Get len */
         loadWordDisp(cUnit, rlArray.lowReg, lenOffset, regLen);
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
+    }
+    /* regPtr -> array data */
+    opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
+    /* at this point, regPtr points to array, 2 live temps */
+    rlSrc = loadValue(cUnit, rlSrc, regClass);
+    if (needsRangeCheck) {
         genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
                        kThrowArrayBounds);
         oatFreeTemp(cUnit, regLen);
-    } else {
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
     }
-    /* at this point, regPtr points to array, 2 live temps */
-    rlSrc = loadValue(cUnit, rlSrc, regClass);
     storeBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlSrc.lowReg,
                      scale, kWord);
 }
@@ -1406,21 +1407,15 @@
     }
 #else
     int regPtr = oatAllocTemp(cUnit);
-    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
-        int regLen = oatAllocTemp(cUnit);
+    bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    int regLen = INVALID_REG;
+    if (needsRangeCheck) {
+        regLen = oatAllocTemp(cUnit);
         /* Get len */
         loadWordDisp(cUnit, rlArray.lowReg, lenOffset, regLen);
-        /* regPtr -> array data */
-        opRegRegImm(cUnit, kOpAdd, regPtr, rlArray.lowReg, dataOffset);
-        // TODO: change kCondCS to a more meaningful name, is the sense of
-        // carry-set/clear flipped?
-        genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
-                       kThrowArrayBounds);
-        oatFreeTemp(cUnit, regLen);
-    } else {
-        /* regPtr -> array data */
-        opRegRegImm(cUnit, kOpAdd, regPtr, rlArray.lowReg, dataOffset);
     }
+    /* regPtr -> array data */
+    opRegRegImm(cUnit, kOpAdd, regPtr, rlArray.lowReg, dataOffset);
     oatFreeTemp(cUnit, rlArray.lowReg);
     if ((size == kLong) || (size == kDouble)) {
         if (scale) {
@@ -1434,6 +1429,13 @@
         oatFreeTemp(cUnit, rlIndex.lowReg);
         rlResult = oatEvalLoc(cUnit, rlDest, regClass, true);
 
+        if (needsRangeCheck) {
+            // TODO: change kCondCS to a more meaningful name, is the sense of
+            // carry-set/clear flipped?
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
         loadPair(cUnit, regPtr, rlResult.lowReg, rlResult.highReg);
 
         oatFreeTemp(cUnit, regPtr);
@@ -1441,6 +1443,13 @@
     } else {
         rlResult = oatEvalLoc(cUnit, rlDest, regClass, true);
 
+        if (needsRangeCheck) {
+            // TODO: change kCondCS to a more meaningful name, is the sense of
+            // carry-set/clear flipped?
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
         loadBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlResult.lowReg,
                         scale, size);
 
@@ -1483,20 +1492,16 @@
     /* null object? */
     genNullCheck(cUnit, rlArray.sRegLow, rlArray.lowReg, mir);
 
-    if (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK)) {
-        int regLen = oatAllocTemp(cUnit);
+    bool needsRangeCheck = (!(mir->optimizationFlags & MIR_IGNORE_RANGE_CHECK));
+    int regLen = INVALID_REG;
+    if (needsRangeCheck) {
+        regLen = oatAllocTemp(cUnit);
         //NOTE: max live temps(4) here.
         /* Get len */
         loadWordDisp(cUnit, rlArray.lowReg, lenOffset, regLen);
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
-        genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
-                       kThrowArrayBounds);
-        oatFreeTemp(cUnit, regLen);
-    } else {
-        /* regPtr -> array data */
-        opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
     }
+    /* regPtr -> array data */
+    opRegImm(cUnit, kOpAdd, regPtr, dataOffset);
     /* at this point, regPtr points to array, 2 live temps */
     if ((size == kLong) || (size == kDouble)) {
         //TUNING: specific wide routine that can handle fp regs
@@ -1510,12 +1515,22 @@
         }
         rlSrc = loadValueWide(cUnit, rlSrc, regClass);
 
+        if (needsRangeCheck) {
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
+
         storePair(cUnit, regPtr, rlSrc.lowReg, rlSrc.highReg);
 
         oatFreeTemp(cUnit, regPtr);
     } else {
         rlSrc = loadValue(cUnit, rlSrc, regClass);
-
+        if (needsRangeCheck) {
+            genRegRegCheck(cUnit, kCondCs, rlIndex.lowReg, regLen, mir,
+                           kThrowArrayBounds);
+            oatFreeTemp(cUnit, regLen);
+        }
         storeBaseIndexed(cUnit, regPtr, rlIndex.lowReg, rlSrc.lowReg,
                          scale, size);
     }
diff --git a/src/compiler/codegen/MethodCodegenDriver.cc b/src/compiler/codegen/MethodCodegenDriver.cc
index 64f55c6..45a0c75 100644
--- a/src/compiler/codegen/MethodCodegenDriver.cc
+++ b/src/compiler/codegen/MethodCodegenDriver.cc
@@ -757,9 +757,12 @@
             newLIR1(cUnit, kPseudoSSARep, (int) ssaString);
             break;
         }
-        case kMirOpCopy:
-            UNIMPLEMENTED(FATAL) << "Need kMirOpCopy";
+        case kMirOpCopy: {
+            RegLocation rlSrc = oatGetSrc(cUnit, mir, 0);
+            RegLocation rlDest = oatGetDest(cUnit, mir, 0);
+            storeValue(cUnit, rlDest, rlSrc);
             break;
+        }
         default:
             break;
     }
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index afbefff..c08b2e8 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -1043,52 +1043,31 @@
 void oatCountRefs(CompilationUnit *cUnit, BasicBlock* bb,
                   RefCounts* coreCounts, RefCounts* fpCounts)
 {
-    MIR* mir;
-    if (bb->blockType != kDalvikByteCode && bb->blockType != kEntryBlock &&
-        bb->blockType != kExitBlock)
+    if ((cUnit->disableOpt & (1 << kPromoteRegs)) ||
+        !((bb->blockType == kEntryBlock) || (bb->blockType == kExitBlock) ||
+          (bb->blockType == kDalvikByteCode))) {
         return;
-
-    for (mir = bb->firstMIRInsn; mir; mir = mir->next) {
-        SSARepresentation *ssaRep = mir->ssaRep;
-        if (ssaRep) {
-            for (int i = 0; i < ssaRep->numDefs;) {
-                RegLocation loc = cUnit->regLocation[ssaRep->defs[i]];
-                RefCounts* counts = loc.fp ? fpCounts : coreCounts;
-                int vReg = SRegToVReg(cUnit, ssaRep->defs[i]);
-                if (loc.defined) {
-                    counts[vReg].count++;
-                }
-                if (loc.wide) {
-                    if (loc.defined) {
-                        if (loc.fp) {
-                            counts[vReg].doubleStart = true;
-                        }
-                        counts[vReg+1].count++;
-                    }
-                    i += 2;
-                } else {
-                    i++;
+    }
+    for (int i = 0; i < cUnit->numSSARegs;) {
+        RegLocation loc = cUnit->regLocation[i];
+        RefCounts* counts = loc.fp ? fpCounts : coreCounts;
+        int vReg = SRegToVReg(cUnit, loc.sRegLow);
+        if (vReg < 0) {
+            vReg = cUnit->numDalvikRegisters - (vReg + 1);
+        }
+        if (loc.defined) {
+            counts[vReg].count += cUnit->useCounts.elemList[i];
+        }
+        if (loc.wide) {
+            if (loc.defined) {
+                if (loc.fp) {
+                    counts[vReg].doubleStart = true;
+                counts[vReg+1].count += cUnit->useCounts.elemList[i+1];
                 }
             }
-            for (int i = 0; i < ssaRep->numUses;) {
-                RegLocation loc = cUnit->regLocation[ssaRep->uses[i]];
-                RefCounts* counts = loc.fp ? fpCounts : coreCounts;
-                int vReg = SRegToVReg(cUnit, ssaRep->uses[i]);
-                if (loc.defined) {
-                    counts[vReg].count++;
-                }
-                if (loc.wide) {
-                    if (loc.defined) {
-                        if (loc.fp) {
-                            counts[vReg].doubleStart = true;
-                        }
-                        counts[vReg+1].count++;
-                    }
-                    i += 2;
-                } else {
-                    i++;
-                }
-            }
+            i += 2;
+        } else {
+            i++;
         }
     }
 }
@@ -1115,7 +1094,9 @@
  */
 extern void oatDoPromotion(CompilationUnit* cUnit)
 {
-    int numRegs = cUnit->numDalvikRegisters;
+    int regBias = cUnit->numCompilerTemps + 1;
+    int dalvikRegs = cUnit->numDalvikRegisters;
+    int numRegs = dalvikRegs + regBias;
 
     // Allow target code to add any special registers
     oatAdjustSpillMask(cUnit);
@@ -1135,9 +1116,14 @@
           oatNew(cUnit, sizeof(RefCounts) * numRegs, true, kAllocRegAlloc);
     RefCounts *fpRegs = (RefCounts *)
           oatNew(cUnit, sizeof(RefCounts) * numRegs, true, kAllocRegAlloc);
-    for (int i = 0; i < numRegs; i++) {
+    // Set ssa names for original Dalvik registers
+    for (int i = 0; i < dalvikRegs; i++) {
         coreRegs[i].sReg = fpRegs[i].sReg = i;
     }
+    // Set ssa names for Method* and compiler temps
+    for (int i = 0; i < regBias; i++) {
+        coreRegs[dalvikRegs + i].sReg = fpRegs[dalvikRegs + i].sReg = (-1 - i);
+    }
     GrowableListIterator iterator;
     oatGrowableListIteratorInit(&cUnit->blockList, &iterator);
     while (true) {
diff --git a/src/compiler/codegen/arm/Thumb2/Factory.cc b/src/compiler/codegen/arm/Thumb2/Factory.cc
index fdf0ca2..c79f7c6 100644
--- a/src/compiler/codegen/arm/Thumb2/Factory.cc
+++ b/src/compiler/codegen/arm/Thumb2/Factory.cc
@@ -1034,7 +1034,7 @@
         }
     }
     LIR* res = rawLIR(cUnit, cUnit->currentDalvikOffset, opcode, rDest, rSrc);
-    if (!(cUnit->disableOpt && (1 << kSafeOptimizations)) && rDest == rSrc) {
+    if (!(cUnit->disableOpt & (1 << kSafeOptimizations)) && rDest == rSrc) {
         res->flags.isNop = true;
     }
     return res;
diff --git a/src/compiler/codegen/mips/Mips32/Gen.cc b/src/compiler/codegen/mips/Mips32/Gen.cc
index e86a942..dc98508 100644
--- a/src/compiler/codegen/mips/Mips32/Gen.cc
+++ b/src/compiler/codegen/mips/Mips32/Gen.cc
@@ -450,7 +450,7 @@
 #endif
     LIR* res = rawLIR(cUnit, cUnit->currentDalvikOffset, kMipsMove,
                       rDest, rSrc);
-    if (!(cUnit->disableOpt && (1 << kSafeOptimizations)) && rDest == rSrc) {
+    if (!(cUnit->disableOpt & (1 << kSafeOptimizations)) && rDest == rSrc) {
         res->flags.isNop = true;
     }
     return res;