Re-commit : [PowerPC] Add handling for ColdCC calling convention and a pass to mark
candidates with coldcc attribute.

This recommits r322721 reverted due to sanitizer memory leak build bot failures.

Original commit message:
This patch adds support for the coldcc calling convention for Power.
This changes the set of non-volatile registers. It includes a pass to stress
test the implementation by marking all static directly called functions with
the coldcc attribute through the option -enable-coldcc-stress-test. It also
includes an option, -ppc-enable-coldcc, to add the coldcc attribute to
functions which are cold at all call sites based on BlockFrequencyInfo when
the containing function does not call any non cold functions.

Differential Revision: https://reviews.llvm.org/D38413

llvm-svn: 323778
diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td
index a4f4c86..d7d2cad 100644
--- a/llvm/lib/Target/PowerPC/PPCCallingConv.td
+++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -45,6 +45,29 @@
   CCCustom<"CC_PPC_AnyReg_Error">
 ]>;
 
+// Return-value convention for PowerPC coldcc.
+def RetCC_PPC_Cold : CallingConv<[
+  // Use the same return registers as RetCC_PPC, but limited to only
+  // one return value. The remaining return values will be saved to
+  // the stack.
+  CCIfType<[i32, i1], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+  CCIfType<[i1], CCIfNotSubtarget<"isPPC64()", CCPromoteToType<i32>>>,
+
+  CCIfType<[i32], CCAssignToReg<[R3]>>,
+  CCIfType<[i64], CCAssignToReg<[X3]>>,
+  CCIfType<[i128], CCAssignToReg<[X3]>>,
+
+  CCIfType<[f32], CCAssignToReg<[F1]>>,
+  CCIfType<[f64], CCAssignToReg<[F1]>>,
+
+  CCIfType<[v4f64, v4f32, v4i1],
+           CCIfSubtarget<"hasQPX()", CCAssignToReg<[QF1]>>>,
+
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v1i128, v4f32, v2f64],
+           CCIfSubtarget<"hasAltivec()",
+           CCAssignToReg<[V2]>>>
+]>;
+
 // Return-value convention for PowerPC
 def RetCC_PPC : CallingConv<[
   CCIfCC<"CallingConv::AnyReg", CCDelegateTo<RetCC_PPC64_AnyReg>>,
@@ -271,6 +294,36 @@
 
 def CSR_NoRegs : CalleeSavedRegs<(add)>;
 
+// coldcc calling convection marks most registers as non-volatile.
+// Do not include r1 since the stack pointer is never considered a CSR.
+// Do not include r2, since it is the TOC register and is added depending
+// on wether or not the function uses the TOC and is a non-leaf.
+// Do not include r0,r11,r13 as they are optional in functional linkage
+// and value may be altered by inter-library calls.
+// Do not include r12 as it is used as a scratch register.
+// Do not include return registers r3, f1, v2.
+def CSR_SVR32_ColdCC : CalleeSavedRegs<(add (sequence "R%u", 4, 10),
+                                          (sequence "R%u", 14, 31),
+                                          F0, (sequence "F%u", 2, 31),
+                                          (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR32_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR32_ColdCC,
+                                            (sequence "V%u", 0, 1),
+                                            (sequence "V%u", 3, 31))>;
+
+def CSR_SVR64_ColdCC : CalleeSavedRegs<(add  (sequence "X%u", 4, 10),
+                                             (sequence "X%u", 14, 31),
+                                             F0, (sequence "F%u", 2, 31),
+                                             (sequence "CR%u", 0, 7))>;
+
+def CSR_SVR64_ColdCC_R2: CalleeSavedRegs<(add CSR_SVR64_ColdCC, X2)>;
+
+def CSR_SVR64_ColdCC_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC,
+                                             (sequence "V%u", 0, 1),
+                                             (sequence "V%u", 3, 31))>;
+
+def CSR_SVR64_ColdCC_R2_Altivec : CalleeSavedRegs<(add CSR_SVR64_ColdCC_Altivec, X2)>;
+
 def CSR_64_AllRegs: CalleeSavedRegs<(add X0, (sequence "X%u", 3, 10),
                                              (sequence "X%u", 14, 31),
                                              (sequence "F%u", 0, 31),
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index cdfbfaf..cd19568 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -206,6 +206,8 @@
     return CC_PPC32_SVR4_ByVal;
   else if (Flag == 3)
     return CC_PPC32_SVR4_VarArg;
+  else if (Flag == 4)
+    return RetCC_PPC_Cold;
   else
     return RetCC_PPC;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 7902da2..bdda9d1 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -1950,7 +1950,14 @@
     bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
 
     // Add the callee-saved register as live-in; it's killed at the spill.
-    MBB.addLiveIn(Reg);
+    // Do not do this for callee-saved registers that are live-in to the
+    // function because they will already be marked live-in and this will be
+    // adding it for a second time. It is an error to add the same register
+    // to the set more than once.
+    const MachineRegisterInfo &MRI = MF->getRegInfo();
+    bool IsLiveIn = MRI.isLiveIn(Reg);
+    if (!IsLiveIn)
+       MBB.addLiveIn(Reg);
 
     if (CRSpilled && IsCRField) {
       CRMIB.addReg(Reg, RegState::ImplicitKill);
@@ -1980,7 +1987,10 @@
       }
     } else {
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
-      TII.storeRegToStackSlot(MBB, MI, Reg, true,
+      // Use !IsLiveIn for the kill flag.
+      // We do not want to kill registers that are live in this function
+      // before their use because they will become undefined registers.
+      TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
                               CSI[i].getFrameIdx(), RC, TRI);
     }
   }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 2e99467..00dcb0d 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -4939,7 +4939,11 @@
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCRetInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                     *DAG.getContext());
-  CCRetInfo.AnalyzeCallResult(Ins, RetCC_PPC);
+
+  CCRetInfo.AnalyzeCallResult(
+      Ins, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+               ? RetCC_PPC_Cold
+               : RetCC_PPC);
 
   // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
@@ -5159,6 +5163,7 @@
   // of the 32-bit SVR4 ABI stack frame layout.
 
   assert((CallConv == CallingConv::C ||
+          CallConv == CallingConv::Cold ||
           CallConv == CallingConv::Fast) && "Unknown calling convention!");
 
   unsigned PtrByteSize = 4;
@@ -6420,7 +6425,10 @@
                                   LLVMContext &Context) const {
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
-  return CCInfo.CheckReturn(Outs, RetCC_PPC);
+  return CCInfo.CheckReturn(
+      Outs, (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+                ? RetCC_PPC_Cold
+                : RetCC_PPC);
 }
 
 SDValue
@@ -6432,7 +6440,10 @@
   SmallVector<CCValAssign, 16> RVLocs;
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
-  CCInfo.AnalyzeReturn(Outs, RetCC_PPC);
+  CCInfo.AnalyzeReturn(Outs,
+                       (Subtarget.isSVR4ABI() && CallConv == CallingConv::Cold)
+                           ? RetCC_PPC_Cold
+                           : RetCC_PPC);
 
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps(1, Chain);
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 6b62a82..a938bb9 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -144,6 +144,17 @@
   // On PPC64, we might need to save r2 (but only if it is not reserved).
   bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
 
+  if (MF->getFunction().getCallingConv() == CallingConv::Cold) {
+    return TM.isPPC64()
+               ? (Subtarget.hasAltivec()
+                      ? (SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
+                                : CSR_SVR64_ColdCC_Altivec_SaveList)
+                      : (SaveR2 ? CSR_SVR64_ColdCC_R2_SaveList
+                                : CSR_SVR64_ColdCC_SaveList))
+               : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_SaveList
+                                         : CSR_SVR32_ColdCC_SaveList);
+  }
+
   return TM.isPPC64()
              ? (Subtarget.hasAltivec()
                     ? (SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
@@ -196,6 +207,13 @@
                         : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
                                                   : CSR_Darwin32_RegMask);
 
+  if (CC == CallingConv::Cold) {
+    return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR64_ColdCC_Altivec_RegMask
+                                                  : CSR_SVR64_ColdCC_RegMask)
+                        : (Subtarget.hasAltivec() ? CSR_SVR32_ColdCC_Altivec_RegMask
+                                                  : CSR_SVR32_ColdCC_RegMask);
+  }
+
   return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
                                                 : CSR_SVR464_RegMask)
                       : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index aa4073f..226c75f 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -27,6 +27,11 @@
 CacheLineSize("ppc-loop-prefetch-cache-line", cl::Hidden, cl::init(64),
               cl::desc("The loop prefetch cache line size"));
 
+static cl::opt<bool>
+EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
+                cl::desc("Enable using coldcc calling conv for cold "
+                         "internal functions"));
+
 //===----------------------------------------------------------------------===//
 //
 // PPC cost model.
@@ -215,6 +220,14 @@
   BaseT::getUnrollingPreferences(L, SE, UP);
 }
 
+// This function returns true to allow using coldcc calling convention.
+// Returning true results in coldcc being used for functions which are cold at
+// all call sites when the callers of the functions are not calling any other
+// non coldcc functions.
+bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
+  return EnablePPCColdCC;
+}
+
 bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
   // On the A2, always unroll aggressively. For QPX unaligned loads, we depend
   // on combining the loads generated for consecutive accesses, and failure to
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index b42dae4..2ee2b3e 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -61,7 +61,7 @@
 
   /// \name Vector TTI Implementations
   /// @{
-
+  bool useColdCCForColdCall(Function &F);
   bool enableAggressiveInterleaving(bool LoopHasReductions);
   const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
       bool IsZeroCmp) const;