Quick compiler: restore optimizations

This CL re-enables optimizations on the Quick compile path.
Notes:

  o Although all optimizations are enabled, several are now useless
    because of llvm and bitcode constraints:
      - Large method de-optimization (i.e. - skipping expensive dataflow
        analysis) can't be done because we have to do the analysis to
        produce a CFG that makes the bitcode verifier happy.
      - Small method pattern matching isn't applicable w/ bitcode (though
        I can probably do something similar in the Quick backend, but
        looking for bitcode instead of dex patterns).
      - Branch fusing doesn't translate to bitcode.
      - Bitcode generation has de-optimized code layout.  We'll try to
        repair the damage in a subsequent CL.

  o There is an ugly workaround related to the way we're loading and
    unloading the compiler .so containing llvm. [See comment in compiler.cc]

  o We're still running single-threaded - need to add the magic to allow
    multi-threaded use of llvm.

  o With the CL, the phone boots, all target tests pass and all cts VM
    tests pass (except those being dealt with via a verifier change).

  o Compile time is pretty bad - when flashing it's best to follow
    with an adb sync to avoid on-device compilation of system apps.

Change-Id: I1c98f9e64aefbcbd24b957c71544c28450eb2023
diff --git a/src/compiler.cc b/src/compiler.cc
index b06f718..e31b9b5 100644
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -390,7 +390,20 @@
 #endif
   if (compiler_library_ != NULL) {
     VLOG(compiler) << "dlclose(" << compiler_library_ << ")";
+#if !defined(ART_USE_QUICK_COMPILER)
+    /*
+     * FIXME: Temporary workaround
+     * Apparently, llvm is adding dtors to atexit, but if we unload
+     * the library here the code will no longer be around at exit time
+     * and we die a flaming death in __cxa_finalize().  Apparently, some
+     * dlclose() implementations will scan the atexit list on unload and
+     * handle any associated with the soon-to-be-unloaded library.
+     * However, this is not required by POSIX and we don't do it.
+     * See: http://b/issue?id=4998315
+     * What's the right thing to do here?
+     */
     dlclose(compiler_library_);
+#endif
   }
 }
 
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index 69689f9..a606287 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -41,6 +41,9 @@
 #define EXERCISE_RESOLVE_METHOD (cUnit->enableDebug & \
   (1 << kDebugExerciseResolveMethod))
 
+// Minimum field size to contain Dalvik vReg number
+#define VREG_NUM_WIDTH 16
+
 enum RegisterClass {
   kCoreReg,
   kFPReg,
@@ -455,8 +458,8 @@
   int assemblerRetries;
   std::vector<uint8_t> codeBuffer;
   std::vector<uint32_t> mappingTable;
-  std::vector<uint16_t> coreVmapTable;
-  std::vector<uint16_t> fpVmapTable;
+  std::vector<uint32_t> coreVmapTable;
+  std::vector<uint32_t> fpVmapTable;
   bool genDebugger;                   // Generate code for debugger
   bool printMe;
   bool hasClassLiterals;              // Contains class ptrs used as literals
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index 0925793..8967649 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -1854,6 +1854,12 @@
       case Instruction::CMPG_FLOAT:
       case Instruction::CMPG_DOUBLE:
       case Instruction::CMP_LONG:
+#if defined(ART_USE_QUICK_COMPILER)
+        if (cUnit->genBitcode) {
+          // Bitcode doesn't allow this optimization.
+          break;
+        }
+#endif
         if (mir->next != NULL) {
           MIR* mirNext = mir->next;
           Instruction::Code brOpcode = mirNext->dalvikInsn.opcode;
@@ -2090,10 +2096,8 @@
   if (!(cUnit->disableOpt & (1 << kBBOpt))) {
     oatInitGrowableList(cUnit, &cUnit->compilerTemps, 6, kListMisc);
     DCHECK_EQ(cUnit->numCompilerTemps, 0);
-    if (!(cUnit->disableOpt & (1 << kBBOpt))) {
-      oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
-                                    kAllNodes, false /* isIterative */);
-    }
+    oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
+                                  kAllNodes, false /* isIterative */);
   }
 }
 
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index b893cca..3aedbe9 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -799,8 +799,6 @@
     //cUnit->enableDebug |= (1 << kDebugVerifyBitcode);
     //cUnit->printMe = true;
     //cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile);
-    // Disable non-safe optimizations for now
-    cUnit->disableOpt |= ~(1 << kSafeOptimizations);
   }
 #endif
   /* Are we generating code for the debugger? */
@@ -1127,8 +1125,11 @@
 
   // Combine vmap tables - core regs, then fp regs - into vmapTable
   std::vector<uint16_t> vmapTable;
+  // Core regs may have been inserted out of order - sort first
+  std::sort(cUnit->coreVmapTable.begin(), cUnit->coreVmapTable.end());
   for (size_t i = 0 ; i < cUnit->coreVmapTable.size(); i++) {
-    vmapTable.push_back(cUnit->coreVmapTable[i]);
+    // Copy, stripping out the phys register sort key
+    vmapTable.push_back(~(-1 << VREG_NUM_WIDTH) & cUnit->coreVmapTable[i]);
   }
   // If we have a frame, push a marker to take place of lr
   if (cUnit->frameSize > 0) {
@@ -1137,7 +1138,7 @@
     DCHECK_EQ(__builtin_popcount(cUnit->coreSpillMask), 0);
     DCHECK_EQ(__builtin_popcount(cUnit->fpSpillMask), 0);
   }
-  // Combine vmap tables - core regs, then fp regs
+  // Combine vmap tables - core regs, then fp regs. fp regs already sorted
   for (uint32_t i = 0; i < cUnit->fpVmapTable.size(); i++) {
     vmapTable.push_back(cUnit->fpVmapTable[i]);
   }
diff --git a/src/compiler/Ralloc.cc b/src/compiler/Ralloc.cc
index f4e735a..623d4ea 100644
--- a/src/compiler/Ralloc.cc
+++ b/src/compiler/Ralloc.cc
@@ -369,8 +369,8 @@
 {
   for (int i = 0; i < count; i++) {
     LOG(INFO) << StringPrintf("Loc[%02d] : %s, %c %c %c %c %c %c%d %c%d S%d",
-        i, storageName[table[i].location], table[i].wide ? 'W' : 'N',
-        table[i].defined ? 'D' : 'U',
+        table[i].origSReg, storageName[table[i].location],
+        table[i].wide ? 'W' : 'N', table[i].defined ? 'D' : 'U',
         table[i].fp ? 'F' : table[i].ref ? 'R' :'C',
         table[i].highWord ? 'H' : 'L', table[i].home ? 'h' : 't',
         oatIsFpReg(table[i].lowReg) ? 's' : 'r',
diff --git a/src/compiler/codegen/MethodBitcode.cc b/src/compiler/codegen/MethodBitcode.cc
index e7e4e5a..6b78765 100644
--- a/src/compiler/codegen/MethodBitcode.cc
+++ b/src/compiler/codegen/MethodBitcode.cc
@@ -107,17 +107,63 @@
   loc.wide = ((ty == cUnit->irb->getInt64Ty()) ||
               (ty == cUnit->irb->getDoubleTy()));
   loc.defined = true;
-  if ((ty == cUnit->irb->getFloatTy()) ||
-      (ty == cUnit->irb->getDoubleTy())) {
-    loc.fp = true;
-  } else if (ty == cUnit->irb->GetJObjectTy()) {
-    loc.ref = true;
-  } else {
-    loc.core = true;
-  }
-  loc.home = false;  // Will change during promotion
+  loc.home = false;  // May change during promotion
   loc.sRegLow = baseSReg;
   loc.origSReg = cUnit->locMap.size();
+  PromotionMap pMap = cUnit->promotionMap[baseSReg];
+  if (ty == cUnit->irb->getFloatTy()) {
+    loc.fp = true;
+    if (pMap.fpLocation == kLocPhysReg) {
+      loc.lowReg = pMap.fpReg;
+      loc.location = kLocPhysReg;
+      loc.home = true;
+    }
+  } else if (ty == cUnit->irb->getDoubleTy()) {
+    loc.fp = true;
+    PromotionMap pMapHigh = cUnit->promotionMap[baseSReg + 1];
+    if ((pMap.fpLocation == kLocPhysReg) &&
+        (pMapHigh.fpLocation == kLocPhysReg) &&
+        ((pMap.fpReg & 0x1) == 0) &&
+        (pMap.fpReg + 1 == pMapHigh.fpReg)) {
+      loc.lowReg = pMap.fpReg;
+      loc.highReg = pMapHigh.fpReg;
+      loc.location = kLocPhysReg;
+      loc.home = true;
+    }
+  } else if (ty == cUnit->irb->GetJObjectTy()) {
+    loc.ref = true;
+    if (pMap.coreLocation == kLocPhysReg) {
+      loc.lowReg = pMap.coreReg;
+      loc.location = kLocPhysReg;
+      loc.home = true;
+    }
+  } else if (ty == cUnit->irb->getInt64Ty()) {
+    loc.core = true;
+    PromotionMap pMapHigh = cUnit->promotionMap[baseSReg + 1];
+    if ((pMap.coreLocation == kLocPhysReg) &&
+        (pMapHigh.coreLocation == kLocPhysReg)) {
+      loc.lowReg = pMap.coreReg;
+      loc.highReg = pMapHigh.coreReg;
+      loc.location = kLocPhysReg;
+      loc.home = true;
+    }
+  } else {
+    loc.core = true;
+    if (pMap.coreLocation == kLocPhysReg) {
+      loc.lowReg = pMap.coreReg;
+      loc.location = kLocPhysReg;
+      loc.home = true;
+    }
+  }
+
+  if (cUnit->printMe && loc.home) {
+    if (loc.wide) {
+      LOG(INFO) << "Promoted wide " << s << " to regs " << loc.lowReg
+                << "/" << loc.highReg;
+    } else {
+      LOG(INFO) << "Promoted " << s << " to reg " << loc.lowReg;
+    }
+  }
   cUnit->locMap.Put(val, loc);
 }
 
@@ -2883,12 +2929,19 @@
         oatNew(cUnit, sizeof(RegLocation) * cUnit->numIns, true, kAllocMisc);
     llvm::Function::arg_iterator it(cUnit->func->arg_begin());
     llvm::Function::arg_iterator it_end(cUnit->func->arg_end());
+    // Skip past Method*
+    it++;
     for (unsigned i = 0; it != it_end; ++it) {
       llvm::Value* val = it;
       argLocs[i++] = valToLoc(cUnit, val);
       llvm::Type* ty = val->getType();
       if ((ty == cUnit->irb->getInt64Ty()) || (ty == cUnit->irb->getDoubleTy())) {
-        argLocs[i++].sRegLow = INVALID_SREG;
+        argLocs[i] = argLocs[i-1];
+        argLocs[i].lowReg = argLocs[i].highReg;
+        argLocs[i].origSReg++;
+        argLocs[i].sRegLow = INVALID_SREG;
+        argLocs[i].highWord = true;
+        i++;
       }
     }
     genEntrySequence(cUnit, argLocs, cUnit->methodLoc);
@@ -3365,15 +3418,78 @@
   cUnit->numFPSpills = 0;
   cUnit->coreVmapTable.clear();
   cUnit->fpVmapTable.clear();
-  oatAdjustSpillMask(cUnit);
-  cUnit->frameSize = oatComputeFrameSize(cUnit);
 
   /*
    * At this point, we've lost all knowledge of register promotion.
    * Rebuild that info from the MethodInfo intrinsic (if it
-   * exists - not required for correctness).
+   * exists - not required for correctness).  Normally, this will
+   * be the first instruction we encounter, so we won't have to iterate
+   * through everything.
    */
-  // TODO: find and recover MethodInfo.
+  for (llvm::inst_iterator i = llvm::inst_begin(func),
+       e = llvm::inst_end(func); i != e; ++i) {
+    llvm::CallInst* callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+    if (callInst != NULL) {
+      llvm::Function* callee = callInst->getCalledFunction();
+      greenland::IntrinsicHelper::IntrinsicId id =
+          cUnit->intrinsic_helper->GetIntrinsicId(callee);
+      if (id == greenland::IntrinsicHelper::MethodInfo) {
+        if (cUnit->printMe) {
+          LOG(INFO) << "Found MethodInfo";
+        }
+        llvm::MDNode* regInfoNode = callInst->getMetadata("RegInfo");
+        if (regInfoNode != NULL) {
+          llvm::ConstantInt* numInsValue =
+            static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(0));
+          llvm::ConstantInt* numRegsValue =
+            static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(1));
+          llvm::ConstantInt* numOutsValue =
+            static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(2));
+          llvm::ConstantInt* numCompilerTempsValue =
+            static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(3));
+          llvm::ConstantInt* numSSARegsValue =
+            static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(4));
+          if (cUnit->printMe) {
+             LOG(INFO) << "RegInfo - Ins:" << numInsValue->getZExtValue()
+                       << ", Regs:" << numRegsValue->getZExtValue()
+                       << ", Outs:" << numOutsValue->getZExtValue()
+                       << ", CTemps:" << numCompilerTempsValue->getZExtValue()
+                       << ", SSARegs:" << numSSARegsValue->getZExtValue();
+            }
+          }
+        llvm::MDNode* pmapInfoNode = callInst->getMetadata("PromotionMap");
+        if (pmapInfoNode != NULL) {
+          int elems = pmapInfoNode->getNumOperands();
+          if (cUnit->printMe) {
+            LOG(INFO) << "PMap size: " << elems;
+          }
+          for (int i = 0; i < elems; i++) {
+            llvm::ConstantInt* rawMapData =
+                static_cast<llvm::ConstantInt*>(pmapInfoNode->getOperand(i));
+            uint32_t mapData = rawMapData->getZExtValue();
+            PromotionMap* p = &cUnit->promotionMap[i];
+            p->firstInPair = (mapData >> 24) & 0xff;
+            p->fpReg = (mapData >> 16) & 0xff;
+            p->coreReg = (mapData >> 8) & 0xff;
+            p->fpLocation = static_cast<RegLocationType>((mapData >> 4) & 0xf);
+            if (p->fpLocation == kLocPhysReg) {
+              oatRecordFpPromotion(cUnit, p->fpReg, i);
+            }
+            p->coreLocation = static_cast<RegLocationType>(mapData & 0xf);
+            if (p->coreLocation == kLocPhysReg) {
+              oatRecordCorePromotion(cUnit, p->coreReg, i);
+            }
+          }
+          if (cUnit->printMe) {
+            oatDumpPromotionMap(cUnit);
+          }
+        }
+        break;
+      }
+    }
+  }
+  oatAdjustSpillMask(cUnit);
+  cUnit->frameSize = oatComputeFrameSize(cUnit);
 
   // Create RegLocations for arguments
   llvm::Function::arg_iterator it(cUnit->func->arg_begin());
diff --git a/src/compiler/codegen/Ralloc.h b/src/compiler/codegen/Ralloc.h
index d1518e8..db8fc7d 100644
--- a/src/compiler/codegen/Ralloc.h
+++ b/src/compiler/codegen/Ralloc.h
@@ -198,6 +198,9 @@
 extern void oatCountRefs(CompilationUnit*, BasicBlock*, RefCounts*, RefCounts*);
 extern int oatSortCounts(const void *val1, const void *val2);
 extern void oatDumpCounts(const RefCounts* arr, int size, const char* msg);
+extern void oatRecordCorePromotion(CompilationUnit* cUnit, int reg, int sReg);
+extern void oatRecordFpPromotion(CompilationUnit* cUnit, int reg, int sReg);
+
 
 /*
  * Architecture-dependent register allocation routines implemented in
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index 9d1878a..8fa110a 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -161,6 +161,20 @@
   }
 }
 
+void oatRecordCorePromotion(CompilationUnit* cUnit, int reg, int sReg)
+{
+  int pMapIdx = SRegToPMap(cUnit, sReg);
+  int vReg = SRegToVReg(cUnit, sReg);
+  oatGetRegInfo(cUnit, reg)->inUse = true;
+  cUnit->coreSpillMask |= (1 << reg);
+  // Include reg for later sort
+  cUnit->coreVmapTable.push_back(reg << VREG_NUM_WIDTH |
+                                 (vReg & ((1 << VREG_NUM_WIDTH) - 1)));
+  cUnit->numCoreSpills++;
+  cUnit->promotionMap[pMapIdx].coreLocation = kLocPhysReg;
+  cUnit->promotionMap[pMapIdx].coreReg = reg;
+}
+
 /* Reserve a callee-save register.  Return -1 if none available */
 extern int oatAllocPreservedCoreReg(CompilationUnit* cUnit, int sReg)
 {
@@ -168,21 +182,24 @@
   RegisterInfo* coreRegs = cUnit->regPool->coreRegs;
   for (int i = 0; i < cUnit->regPool->numCoreRegs; i++) {
     if (!coreRegs[i].isTemp && !coreRegs[i].inUse) {
-      int vReg = SRegToVReg(cUnit, sReg);
-      int pMapIdx = SRegToPMap(cUnit, sReg);
       res = coreRegs[i].reg;
-      coreRegs[i].inUse = true;
-      cUnit->coreSpillMask |= (1 << res);
-      cUnit->coreVmapTable.push_back(vReg);
-      cUnit->numCoreSpills++;
-      cUnit->promotionMap[pMapIdx].coreLocation = kLocPhysReg;
-      cUnit->promotionMap[pMapIdx].coreReg = res;
+      oatRecordCorePromotion(cUnit, res, sReg);
       break;
     }
   }
   return res;
 }
 
+void oatRecordFpPromotion(CompilationUnit* cUnit, int reg, int sReg)
+{
+  int pMapIdx = SRegToPMap(cUnit, sReg);
+  int vReg = SRegToVReg(cUnit, sReg);
+  oatGetRegInfo(cUnit, reg)->inUse = true;
+  oatMarkPreservedSingle(cUnit, vReg, reg);
+  cUnit->promotionMap[pMapIdx].fpLocation = kLocPhysReg;
+  cUnit->promotionMap[pMapIdx].fpReg = reg;
+}
+
 /*
  * Reserve a callee-save fp single register.  Try to fullfill request for
  * even/odd  allocation, but go ahead and allocate anything if not
@@ -195,13 +212,8 @@
   for (int i = 0; i < cUnit->regPool->numFPRegs; i++) {
     if (!FPRegs[i].isTemp && !FPRegs[i].inUse &&
       ((FPRegs[i].reg & 0x1) == 0) == even) {
-      int vReg = SRegToVReg(cUnit, sReg);
-      int pMapIdx = SRegToPMap(cUnit, sReg);
       res = FPRegs[i].reg;
-      FPRegs[i].inUse = true;
-      oatMarkPreservedSingle(cUnit, vReg, res);
-      cUnit->promotionMap[pMapIdx].fpLocation = kLocPhysReg;
-      cUnit->promotionMap[pMapIdx].fpReg = res;
+      oatRecordFpPromotion(cUnit, res, sReg);
       break;
     }
   }
@@ -1237,6 +1249,9 @@
       }
     }
   }
+  if (cUnit->printMe) {
+    oatDumpPromotionMap(cUnit);
+  }
 }
 
 /* Returns sp-relative offset in bytes for a VReg */