PTX: Always use registers for return values, but use .param space for device
     parameters if SM >= 2.0

- Update test cases to be more robust against register allocation changes
- Bump up the number of registers to 128 per type
- Include Python script to re-generate register file with any number of
  registers

llvm-svn: 133736
diff --git a/llvm/lib/Target/PTX/PTXISelLowering.cpp b/llvm/lib/Target/PTX/PTXISelLowering.cpp
index cb4a5d3..c821493 100644
--- a/llvm/lib/Target/PTX/PTXISelLowering.cpp
+++ b/llvm/lib/Target/PTX/PTXISelLowering.cpp
@@ -307,49 +307,35 @@
 
   MachineFunction& MF = DAG.getMachineFunction();
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
-  const PTXSubtarget& ST = getTargetMachine().getSubtarget<PTXSubtarget>();
 
   SDValue Flag;
 
-  if (ST.getShaderModel() >= PTXSubtarget::PTX_SM_2_0) {
-    // For SM 2.0+, we return arguments in the param space
-    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-      SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
-      SDValue ParamIndex = DAG.getTargetConstant(i, MVT::i32);
-      SDValue Ops[] = { Chain, ParamIndex, OutVals[i], Flag };
-      Chain = DAG.getNode(PTXISD::STORE_PARAM, dl, VTs, Ops,
-                          Flag.getNode() ? 4 : 3);
-      Flag = Chain.getValue(1);
-      // Instead of storing a physical register in our argument list, we just
-      // store the total size of the parameter, in bits.  The ASM printer
-      // knows how to process this.
-      MFI->addRetReg(Outs[i].VT.getStoreSizeInBits());
-    }
-  } else {
-    // For SM < 2.0, we return arguments in registers
-    SmallVector<CCValAssign, 16> RVLocs;
-    CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-    getTargetMachine(), RVLocs, *DAG.getContext());
+  // Even though the .param space could hold the return value of a device
+  // function when SM >= 2.0 and there is only one return value, we always
+  // return values in registers since this makes the codegen
+  // easier.
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+  getTargetMachine(), RVLocs, *DAG.getContext());
 
-    CCInfo.AnalyzeReturn(Outs, RetCC_PTX);
+  CCInfo.AnalyzeReturn(Outs, RetCC_PTX);
 
-    for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
-      CCValAssign& VA  = RVLocs[i];
+  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
+    CCValAssign& VA  = RVLocs[i];
 
-      assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
+    assert(VA.isRegLoc() && "CCValAssign must be RegLoc");
 
-      unsigned Reg = VA.getLocReg();
+    unsigned Reg = VA.getLocReg();
 
-      DAG.getMachineFunction().getRegInfo().addLiveOut(Reg);
+    DAG.getMachineFunction().getRegInfo().addLiveOut(Reg);
 
-      Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i], Flag);
+    Chain = DAG.getCopyToReg(Chain, dl, Reg, OutVals[i], Flag);
 
-      // Guarantee that all emitted copies are stuck together,
-      // avoiding something bad
-      Flag = Chain.getValue(1);
+    // Thread the glue value through each copy so that all emitted copies
+    // stay stuck together and nothing gets scheduled between them
+    Flag = Chain.getValue(1);
 
-      MFI->addRetReg(Reg);
-    }
+    MFI->addRetReg(Reg);
   }
 
   if (Flag.getNode() == 0) {