Quick compiler: restore optimizations
This CL re-enables optimizations on the Quick compile path.
Notes:
o Although all optimizations are enabled, several are now useless
because of llvm and bitcode constraints:
- Large method de-optimization (i.e. - skipping expensive dataflow
analysis) can't be done because we have to do the analysis to
produce a CFG that makes the bitcode verifier happy.
- Small method pattern matching isn't applicable w/ bitcode (though
I can probably do something similar in the Quick backend, but
looking for bitcode instead of dex patterns).
- Branch fusing doesn't translate to bitcode.
- Bitcode generation has de-optimized code layout. We'll try to
repair the damage in a subsequent CL.
o There is an ugly workaround related to the way we're loading and
unloading the compiler .so containing llvm. [See comment in compiler.cc]
o We're still running single-threaded - need to add the magic to allow
multi-threaded use of llvm.
o With the CL, the phone boots, all target tests pass and all cts VM
tests pass (except those being dealt with via a verifier change).
o Compile time is pretty bad - when flashing it's best to follow
with an adb sync to avoid on-device compilation of system apps.
Change-Id: I1c98f9e64aefbcbd24b957c71544c28450eb2023
diff --git a/src/compiler.cc b/src/compiler.cc
index b06f718..e31b9b5 100644
--- a/src/compiler.cc
+++ b/src/compiler.cc
@@ -390,7 +390,20 @@
#endif
if (compiler_library_ != NULL) {
VLOG(compiler) << "dlclose(" << compiler_library_ << ")";
+#if !defined(ART_USE_QUICK_COMPILER)
+ /*
+ * FIXME: Temporary workaround
+ * Apparently, llvm is adding dctors to atexit, but if we unload
+ * the library here the code will no longer be around at exit time
+ * and we die a flaming death in __cxa_finalize(). Apparently, some
+ * dlclose() implementations will scan the atexit list on unload and
+ * handle any associated with the soon-to-be-unloaded library.
+ * However, this is not required by POSIX and we don't do it.
+ * See: http://b/issue?id=4998315
+ * What's the right thing to do here?
+ */
dlclose(compiler_library_);
+#endif
}
}
diff --git a/src/compiler/CompilerIR.h b/src/compiler/CompilerIR.h
index 69689f9..a606287 100644
--- a/src/compiler/CompilerIR.h
+++ b/src/compiler/CompilerIR.h
@@ -41,6 +41,9 @@
#define EXERCISE_RESOLVE_METHOD (cUnit->enableDebug & \
(1 << kDebugExerciseResolveMethod))
+// Minimum field size to contain Dalvik vReg number
+#define VREG_NUM_WIDTH 16
+
enum RegisterClass {
kCoreReg,
kFPReg,
@@ -455,8 +458,8 @@
int assemblerRetries;
std::vector<uint8_t> codeBuffer;
std::vector<uint32_t> mappingTable;
- std::vector<uint16_t> coreVmapTable;
- std::vector<uint16_t> fpVmapTable;
+ std::vector<uint32_t> coreVmapTable;
+ std::vector<uint32_t> fpVmapTable;
bool genDebugger; // Generate code for debugger
bool printMe;
bool hasClassLiterals; // Contains class ptrs used as literals
diff --git a/src/compiler/Dataflow.cc b/src/compiler/Dataflow.cc
index 0925793..8967649 100644
--- a/src/compiler/Dataflow.cc
+++ b/src/compiler/Dataflow.cc
@@ -1854,6 +1854,12 @@
case Instruction::CMPG_FLOAT:
case Instruction::CMPG_DOUBLE:
case Instruction::CMP_LONG:
+#if defined(ART_USE_QUICK_COMPILER)
+ if (cUnit->genBitcode) {
+ // Bitcode doesn't allow this optimization.
+ break;
+ }
+#endif
if (mir->next != NULL) {
MIR* mirNext = mir->next;
Instruction::Code brOpcode = mirNext->dalvikInsn.opcode;
@@ -2090,10 +2096,8 @@
if (!(cUnit->disableOpt & (1 << kBBOpt))) {
oatInitGrowableList(cUnit, &cUnit->compilerTemps, 6, kListMisc);
DCHECK_EQ(cUnit->numCompilerTemps, 0);
- if (!(cUnit->disableOpt & (1 << kBBOpt))) {
- oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
- kAllNodes, false /* isIterative */);
- }
+ oatDataFlowAnalysisDispatcher(cUnit, basicBlockOpt,
+ kAllNodes, false /* isIterative */);
}
}
diff --git a/src/compiler/Frontend.cc b/src/compiler/Frontend.cc
index b893cca..3aedbe9 100644
--- a/src/compiler/Frontend.cc
+++ b/src/compiler/Frontend.cc
@@ -799,8 +799,6 @@
//cUnit->enableDebug |= (1 << kDebugVerifyBitcode);
//cUnit->printMe = true;
//cUnit->enableDebug |= (1 << kDebugDumpBitcodeFile);
- // Disable non-safe optimizations for now
- cUnit->disableOpt |= ~(1 << kSafeOptimizations);
}
#endif
/* Are we generating code for the debugger? */
@@ -1127,8 +1125,11 @@
// Combine vmap tables - core regs, then fp regs - into vmapTable
std::vector<uint16_t> vmapTable;
+ // Core regs may have been inserted out of order - sort first
+ std::sort(cUnit->coreVmapTable.begin(), cUnit->coreVmapTable.end());
for (size_t i = 0 ; i < cUnit->coreVmapTable.size(); i++) {
- vmapTable.push_back(cUnit->coreVmapTable[i]);
+ // Copy, stripping out the phys register sort key
+ vmapTable.push_back(~(-1 << VREG_NUM_WIDTH) & cUnit->coreVmapTable[i]);
}
// If we have a frame, push a marker to take place of lr
if (cUnit->frameSize > 0) {
@@ -1137,7 +1138,7 @@
DCHECK_EQ(__builtin_popcount(cUnit->coreSpillMask), 0);
DCHECK_EQ(__builtin_popcount(cUnit->fpSpillMask), 0);
}
- // Combine vmap tables - core regs, then fp regs
+ // Combine vmap tables - core regs, then fp regs. fp regs already sorted
for (uint32_t i = 0; i < cUnit->fpVmapTable.size(); i++) {
vmapTable.push_back(cUnit->fpVmapTable[i]);
}
diff --git a/src/compiler/Ralloc.cc b/src/compiler/Ralloc.cc
index f4e735a..623d4ea 100644
--- a/src/compiler/Ralloc.cc
+++ b/src/compiler/Ralloc.cc
@@ -369,8 +369,8 @@
{
for (int i = 0; i < count; i++) {
LOG(INFO) << StringPrintf("Loc[%02d] : %s, %c %c %c %c %c %c%d %c%d S%d",
- i, storageName[table[i].location], table[i].wide ? 'W' : 'N',
- table[i].defined ? 'D' : 'U',
+ table[i].origSReg, storageName[table[i].location],
+ table[i].wide ? 'W' : 'N', table[i].defined ? 'D' : 'U',
table[i].fp ? 'F' : table[i].ref ? 'R' :'C',
table[i].highWord ? 'H' : 'L', table[i].home ? 'h' : 't',
oatIsFpReg(table[i].lowReg) ? 's' : 'r',
diff --git a/src/compiler/codegen/MethodBitcode.cc b/src/compiler/codegen/MethodBitcode.cc
index e7e4e5a..6b78765 100644
--- a/src/compiler/codegen/MethodBitcode.cc
+++ b/src/compiler/codegen/MethodBitcode.cc
@@ -107,17 +107,63 @@
loc.wide = ((ty == cUnit->irb->getInt64Ty()) ||
(ty == cUnit->irb->getDoubleTy()));
loc.defined = true;
- if ((ty == cUnit->irb->getFloatTy()) ||
- (ty == cUnit->irb->getDoubleTy())) {
- loc.fp = true;
- } else if (ty == cUnit->irb->GetJObjectTy()) {
- loc.ref = true;
- } else {
- loc.core = true;
- }
- loc.home = false; // Will change during promotion
+ loc.home = false; // May change during promotion
loc.sRegLow = baseSReg;
loc.origSReg = cUnit->locMap.size();
+ PromotionMap pMap = cUnit->promotionMap[baseSReg];
+ if (ty == cUnit->irb->getFloatTy()) {
+ loc.fp = true;
+ if (pMap.fpLocation == kLocPhysReg) {
+ loc.lowReg = pMap.fpReg;
+ loc.location = kLocPhysReg;
+ loc.home = true;
+ }
+ } else if (ty == cUnit->irb->getDoubleTy()) {
+ loc.fp = true;
+ PromotionMap pMapHigh = cUnit->promotionMap[baseSReg + 1];
+ if ((pMap.fpLocation == kLocPhysReg) &&
+ (pMapHigh.fpLocation == kLocPhysReg) &&
+ ((pMap.fpReg & 0x1) == 0) &&
+ (pMap.fpReg + 1 == pMapHigh.fpReg)) {
+ loc.lowReg = pMap.fpReg;
+ loc.highReg = pMapHigh.fpReg;
+ loc.location = kLocPhysReg;
+ loc.home = true;
+ }
+ } else if (ty == cUnit->irb->GetJObjectTy()) {
+ loc.ref = true;
+ if (pMap.coreLocation == kLocPhysReg) {
+ loc.lowReg = pMap.coreReg;
+ loc.location = kLocPhysReg;
+ loc.home = true;
+ }
+ } else if (ty == cUnit->irb->getInt64Ty()) {
+ loc.core = true;
+ PromotionMap pMapHigh = cUnit->promotionMap[baseSReg + 1];
+ if ((pMap.coreLocation == kLocPhysReg) &&
+ (pMapHigh.coreLocation == kLocPhysReg)) {
+ loc.lowReg = pMap.coreReg;
+ loc.highReg = pMapHigh.coreReg;
+ loc.location = kLocPhysReg;
+ loc.home = true;
+ }
+ } else {
+ loc.core = true;
+ if (pMap.coreLocation == kLocPhysReg) {
+ loc.lowReg = pMap.coreReg;
+ loc.location = kLocPhysReg;
+ loc.home = true;
+ }
+ }
+
+ if (cUnit->printMe && loc.home) {
+ if (loc.wide) {
+ LOG(INFO) << "Promoted wide " << s << " to regs " << loc.lowReg
+ << "/" << loc.highReg;
+ } else {
+ LOG(INFO) << "Promoted " << s << " to reg " << loc.lowReg;
+ }
+ }
cUnit->locMap.Put(val, loc);
}
@@ -2883,12 +2929,19 @@
oatNew(cUnit, sizeof(RegLocation) * cUnit->numIns, true, kAllocMisc);
llvm::Function::arg_iterator it(cUnit->func->arg_begin());
llvm::Function::arg_iterator it_end(cUnit->func->arg_end());
+ // Skip past Method*
+ it++;
for (unsigned i = 0; it != it_end; ++it) {
llvm::Value* val = it;
argLocs[i++] = valToLoc(cUnit, val);
llvm::Type* ty = val->getType();
if ((ty == cUnit->irb->getInt64Ty()) || (ty == cUnit->irb->getDoubleTy())) {
- argLocs[i++].sRegLow = INVALID_SREG;
+ argLocs[i] = argLocs[i-1];
+ argLocs[i].lowReg = argLocs[i].highReg;
+ argLocs[i].origSReg++;
+ argLocs[i].sRegLow = INVALID_SREG;
+ argLocs[i].highWord = true;
+ i++;
}
}
genEntrySequence(cUnit, argLocs, cUnit->methodLoc);
@@ -3365,15 +3418,78 @@
cUnit->numFPSpills = 0;
cUnit->coreVmapTable.clear();
cUnit->fpVmapTable.clear();
- oatAdjustSpillMask(cUnit);
- cUnit->frameSize = oatComputeFrameSize(cUnit);
/*
* At this point, we've lost all knowledge of register promotion.
* Rebuild that info from the MethodInfo intrinsic (if it
- * exists - not required for correctness).
+ * exists - not required for correctness). Normally, this will
+ * be the first instruction we encounter, so we won't have to iterate
+ * through everything.
*/
- // TODO: find and recover MethodInfo.
+ for (llvm::inst_iterator i = llvm::inst_begin(func),
+ e = llvm::inst_end(func); i != e; ++i) {
+ llvm::CallInst* callInst = llvm::dyn_cast<llvm::CallInst>(&*i);
+ if (callInst != NULL) {
+ llvm::Function* callee = callInst->getCalledFunction();
+ greenland::IntrinsicHelper::IntrinsicId id =
+ cUnit->intrinsic_helper->GetIntrinsicId(callee);
+ if (id == greenland::IntrinsicHelper::MethodInfo) {
+ if (cUnit->printMe) {
+ LOG(INFO) << "Found MethodInfo";
+ }
+ llvm::MDNode* regInfoNode = callInst->getMetadata("RegInfo");
+ if (regInfoNode != NULL) {
+ llvm::ConstantInt* numInsValue =
+ static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(0));
+ llvm::ConstantInt* numRegsValue =
+ static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(1));
+ llvm::ConstantInt* numOutsValue =
+ static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(2));
+ llvm::ConstantInt* numCompilerTempsValue =
+ static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(3));
+ llvm::ConstantInt* numSSARegsValue =
+ static_cast<llvm::ConstantInt*>(regInfoNode->getOperand(4));
+ if (cUnit->printMe) {
+ LOG(INFO) << "RegInfo - Ins:" << numInsValue->getZExtValue()
+ << ", Regs:" << numRegsValue->getZExtValue()
+ << ", Outs:" << numOutsValue->getZExtValue()
+ << ", CTemps:" << numCompilerTempsValue->getZExtValue()
+ << ", SSARegs:" << numSSARegsValue->getZExtValue();
+ }
+ }
+ llvm::MDNode* pmapInfoNode = callInst->getMetadata("PromotionMap");
+ if (pmapInfoNode != NULL) {
+ int elems = pmapInfoNode->getNumOperands();
+ if (cUnit->printMe) {
+ LOG(INFO) << "PMap size: " << elems;
+ }
+ for (int i = 0; i < elems; i++) {
+ llvm::ConstantInt* rawMapData =
+ static_cast<llvm::ConstantInt*>(pmapInfoNode->getOperand(i));
+ uint32_t mapData = rawMapData->getZExtValue();
+ PromotionMap* p = &cUnit->promotionMap[i];
+ p->firstInPair = (mapData >> 24) & 0xff;
+ p->fpReg = (mapData >> 16) & 0xff;
+ p->coreReg = (mapData >> 8) & 0xff;
+ p->fpLocation = static_cast<RegLocationType>((mapData >> 4) & 0xf);
+ if (p->fpLocation == kLocPhysReg) {
+ oatRecordFpPromotion(cUnit, p->fpReg, i);
+ }
+ p->coreLocation = static_cast<RegLocationType>(mapData & 0xf);
+ if (p->coreLocation == kLocPhysReg) {
+ oatRecordCorePromotion(cUnit, p->coreReg, i);
+ }
+ }
+ if (cUnit->printMe) {
+ oatDumpPromotionMap(cUnit);
+ }
+ }
+ break;
+ }
+ }
+ }
+ oatAdjustSpillMask(cUnit);
+ cUnit->frameSize = oatComputeFrameSize(cUnit);
// Create RegLocations for arguments
llvm::Function::arg_iterator it(cUnit->func->arg_begin());
diff --git a/src/compiler/codegen/Ralloc.h b/src/compiler/codegen/Ralloc.h
index d1518e8..db8fc7d 100644
--- a/src/compiler/codegen/Ralloc.h
+++ b/src/compiler/codegen/Ralloc.h
@@ -198,6 +198,9 @@
extern void oatCountRefs(CompilationUnit*, BasicBlock*, RefCounts*, RefCounts*);
extern int oatSortCounts(const void *val1, const void *val2);
extern void oatDumpCounts(const RefCounts* arr, int size, const char* msg);
+extern void oatRecordCorePromotion(CompilationUnit* cUnit, int reg, int sReg);
+extern void oatRecordFpPromotion(CompilationUnit* cUnit, int reg, int sReg);
+
/*
* Architecture-dependent register allocation routines implemented in
diff --git a/src/compiler/codegen/RallocUtil.cc b/src/compiler/codegen/RallocUtil.cc
index 9d1878a..8fa110a 100644
--- a/src/compiler/codegen/RallocUtil.cc
+++ b/src/compiler/codegen/RallocUtil.cc
@@ -161,6 +161,20 @@
}
}
+void oatRecordCorePromotion(CompilationUnit* cUnit, int reg, int sReg)
+{
+ int pMapIdx = SRegToPMap(cUnit, sReg);
+ int vReg = SRegToVReg(cUnit, sReg);
+ oatGetRegInfo(cUnit, reg)->inUse = true;
+ cUnit->coreSpillMask |= (1 << reg);
+ // Include reg for later sort
+ cUnit->coreVmapTable.push_back(reg << VREG_NUM_WIDTH |
+ (vReg & ((1 << VREG_NUM_WIDTH) - 1)));
+ cUnit->numCoreSpills++;
+ cUnit->promotionMap[pMapIdx].coreLocation = kLocPhysReg;
+ cUnit->promotionMap[pMapIdx].coreReg = reg;
+}
+
/* Reserve a callee-save register. Return -1 if none available */
extern int oatAllocPreservedCoreReg(CompilationUnit* cUnit, int sReg)
{
@@ -168,21 +182,24 @@
RegisterInfo* coreRegs = cUnit->regPool->coreRegs;
for (int i = 0; i < cUnit->regPool->numCoreRegs; i++) {
if (!coreRegs[i].isTemp && !coreRegs[i].inUse) {
- int vReg = SRegToVReg(cUnit, sReg);
- int pMapIdx = SRegToPMap(cUnit, sReg);
res = coreRegs[i].reg;
- coreRegs[i].inUse = true;
- cUnit->coreSpillMask |= (1 << res);
- cUnit->coreVmapTable.push_back(vReg);
- cUnit->numCoreSpills++;
- cUnit->promotionMap[pMapIdx].coreLocation = kLocPhysReg;
- cUnit->promotionMap[pMapIdx].coreReg = res;
+ oatRecordCorePromotion(cUnit, res, sReg);
break;
}
}
return res;
}
+void oatRecordFpPromotion(CompilationUnit* cUnit, int reg, int sReg)
+{
+ int pMapIdx = SRegToPMap(cUnit, sReg);
+ int vReg = SRegToVReg(cUnit, sReg);
+ oatGetRegInfo(cUnit, reg)->inUse = true;
+ oatMarkPreservedSingle(cUnit, vReg, reg);
+ cUnit->promotionMap[pMapIdx].fpLocation = kLocPhysReg;
+ cUnit->promotionMap[pMapIdx].fpReg = reg;
+}
+
/*
* Reserve a callee-save fp single register. Try to fullfill request for
* even/odd allocation, but go ahead and allocate anything if not
@@ -195,13 +212,8 @@
for (int i = 0; i < cUnit->regPool->numFPRegs; i++) {
if (!FPRegs[i].isTemp && !FPRegs[i].inUse &&
((FPRegs[i].reg & 0x1) == 0) == even) {
- int vReg = SRegToVReg(cUnit, sReg);
- int pMapIdx = SRegToPMap(cUnit, sReg);
res = FPRegs[i].reg;
- FPRegs[i].inUse = true;
- oatMarkPreservedSingle(cUnit, vReg, res);
- cUnit->promotionMap[pMapIdx].fpLocation = kLocPhysReg;
- cUnit->promotionMap[pMapIdx].fpReg = res;
+ oatRecordFpPromotion(cUnit, res, sReg);
break;
}
}
@@ -1237,6 +1249,9 @@
}
}
}
+ if (cUnit->printMe) {
+ oatDumpPromotionMap(cUnit);
+ }
}
/* Returns sp-relative offset in bytes for a VReg */