[InlineCost] Add cl::opt to allow full inline cost to be computed for debugging purposes.

Currently, the inline cost model will bail once the inline cost exceeds the
inline threshold in order to avoid unnecessary compile-time. However, when
debugging it is useful to compute the full cost, so this command line option
is added to override the default behavior.

I took over this work from Chad Rosier (mcrosier@codeaurora.org).

Differential Revision: https://reviews.llvm.org/D35850

llvm-svn: 311371
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index 1567a68..c26388a 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -82,6 +82,11 @@
              "entry frequency, for a callsite to be hot in the absence of "
              "profile information."));
 
+static cl::opt<bool> ComputeFullInlineCost(
+    "inline-cost-full", cl::Hidden, cl::init(false),
+    cl::desc("Compute the full inline cost of a call site even when the cost "
+             "exceeds the threshold."));
+
 namespace {
 
 class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
@@ -106,6 +111,9 @@
   // Cache the DataLayout since we use it a lot.
   const DataLayout &DL;
 
+  /// The OptimizationRemarkEmitter available for this compilation.
+  OptimizationRemarkEmitter *ORE;
+
   /// The candidate callsite being analyzed. Please do not use this to do
   /// analysis in the caller function; we want the inline cost query to be
   /// easily cacheable. Instead, use the cover function paramHasAttr.
@@ -243,10 +251,10 @@
   CallAnalyzer(const TargetTransformInfo &TTI,
                std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
                Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
-               ProfileSummaryInfo *PSI, Function &Callee, CallSite CSArg,
-               const InlineParams &Params)
+               ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
+               Function &Callee, CallSite CSArg, const InlineParams &Params)
       : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
-        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()),
+        PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
         CandidateCS(CSArg), Params(Params), Threshold(Params.DefaultThreshold),
         Cost(0), IsCallerRecursive(false), IsRecursiveCall(false),
         ExposesReturnsTwice(false), HasDynamicAlloca(false),
@@ -1138,7 +1146,7 @@
   // out. Pretend to inline the function, with a custom threshold.
   auto IndirectCallParams = Params;
   IndirectCallParams.DefaultThreshold = InlineConstants::IndirectCallThreshold;
-  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, *F, CS,
+  CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, CS,
                   IndirectCallParams);
   if (CA.analyzeCall(CS)) {
     // We were able to inline the indirect call! Subtract the cost from the
@@ -1198,7 +1206,7 @@
       std::min((int64_t)CostUpperBound,
                (int64_t)SI.getNumCases() * InlineConstants::InstrCost + Cost);
 
-  if (CostLowerBound > Threshold) {
+  if (CostLowerBound > Threshold && !ComputeFullInlineCost) {
     Cost = CostLowerBound;
     return false;
   }
@@ -1347,21 +1355,36 @@
     else
       Cost += InlineConstants::InstrCost;
 
+    using namespace ore;
     // If the visit this instruction detected an uninlinable pattern, abort.
     if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca ||
-        HasIndirectBr || HasFrameEscape)
+        HasIndirectBr || HasFrameEscape) {
+      if (ORE)
+        ORE->emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
+                                           CandidateCS.getInstruction())
+                  << NV("Callee", &F)
+                  << " has uninlinable pattern and cost is not fully computed");
       return false;
+    }
 
     // If the caller is a recursive function then we don't want to inline
     // functions which allocate a lot of stack space because it would increase
     // the caller stack usage dramatically.
     if (IsCallerRecursive &&
-        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+        AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
+      if (ORE)
+        ORE->emit(
+            OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
+                                     CandidateCS.getInstruction())
+            << NV("Callee", &F)
+            << " is recursive and allocates too much stack space. Cost is "
+               "not fully computed");
       return false;
+    }
 
     // Check if we've past the maximum possible threshold so we don't spin in
     // huge basic blocks that will never inline.
-    if (Cost > Threshold)
+    if (Cost > Threshold && !ComputeFullInlineCost)
       return false;
   }
 
@@ -1447,7 +1470,7 @@
     Cost += InlineConstants::ColdccPenalty;
 
   // Check if we're done. This can happen due to bonuses and penalties.
-  if (Cost > Threshold)
+  if (Cost > Threshold && !ComputeFullInlineCost)
     return false;
 
   if (F.empty())
@@ -1513,7 +1536,7 @@
   for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) {
     // Bail out the moment we cross the threshold. This means we'll under-count
     // the cost, but only when undercounting doesn't matter.
-    if (Cost > Threshold)
+    if (Cost > Threshold && !ComputeFullInlineCost)
       break;
 
     BasicBlock *BB = BBWorklist[Idx];
@@ -1657,9 +1680,9 @@
     CallSite CS, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
-    ProfileSummaryInfo *PSI) {
+    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
   return getInlineCost(CS, CS.getCalledFunction(), Params, CalleeTTI,
-                       GetAssumptionCache, GetBFI, PSI);
+                       GetAssumptionCache, GetBFI, PSI, ORE);
 }
 
 InlineCost llvm::getInlineCost(
@@ -1667,7 +1690,7 @@
     TargetTransformInfo &CalleeTTI,
     std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
     Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
-    ProfileSummaryInfo *PSI) {
+    ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
 
   // Cannot inline indirect calls.
   if (!Callee)
@@ -1699,10 +1722,13 @@
       CS.isNoInline())
     return llvm::InlineCost::getNever();
 
+  if (ORE)
+    ComputeFullInlineCost = true;
+
   DEBUG(llvm::dbgs() << "      Analyzing call of " << Callee->getName()
                      << "... (caller:" << Caller->getName() << ")\n");
 
-  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, *Callee, CS,
+  CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, CS,
                   Params);
   bool ShouldInline = CA.analyzeCall(CS);
 
diff --git a/llvm/lib/Transforms/IPO/InlineSimple.cpp b/llvm/lib/Transforms/IPO/InlineSimple.cpp
index 50e7cc8..1af40d3 100644
--- a/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -57,12 +57,13 @@
   InlineCost getInlineCost(CallSite CS) override {
     Function *Callee = CS.getCalledFunction();
     TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
+    OptimizationRemarkEmitter ORE(CS.getCaller());
     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
         [&](Function &F) -> AssumptionCache & {
       return ACT->getAssumptionCache(F);
     };
     return llvm::getInlineCost(CS, Params, TTI, GetAssumptionCache,
-                               /*GetBFI=*/None, PSI);
+                               /*GetBFI=*/None, PSI, &ORE);
   }
 
   bool runOnSCC(CallGraphSCC &SCC) override;
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 10a83f4..3c46913 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -845,6 +845,10 @@
     FunctionAnalysisManager &FAM =
         AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG)
             .getManager();
+
+    // Get the remarks emission analysis for the caller.
+    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+
     std::function<AssumptionCache &(Function &)> GetAssumptionCache =
         [&](Function &F) -> AssumptionCache & {
       return FAM.getResult<AssumptionAnalysis>(F);
@@ -857,12 +861,9 @@
       Function &Callee = *CS.getCalledFunction();
       auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
       return getInlineCost(CS, Params, CalleeTTI, GetAssumptionCache, {GetBFI},
-                           PSI);
+                           PSI, &ORE);
     };
 
-    // Get the remarks emission analysis for the caller.
-    auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
     // Now process as many calls as we have within this caller in the sequnece.
     // We bail out as soon as the caller has to change so we can update the
     // call graph and prepare the context of that new caller.
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index 8840435..668a812 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -462,7 +462,7 @@
   Function *Caller = CS.getCaller();
   auto &CalleeTTI = (*GetTTI)(*Callee);
   InlineCost IC = getInlineCost(CS, getInlineParams(), CalleeTTI,
-                                *GetAssumptionCache, GetBFI, PSI);
+                                *GetAssumptionCache, GetBFI, PSI, &ORE);
 
   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)