Add preliminary support for .f32 in the PTX backend.

- Add appropriate TableGen patterns for fadd, fsub, fmul.
- Add .f32 as the PTX type for the LLVM float type.
- Allow parameters, return values, and global variable declarations
  to accept the float type.
- Add appropriate test cases.

Patch by Justin Holewinski

llvm-svn: 126636
diff --git a/llvm/lib/Target/PTX/PTXAsmPrinter.cpp b/llvm/lib/Target/PTX/PTXAsmPrinter.cpp
index a605997..25f26fa 100644
--- a/llvm/lib/Target/PTX/PTXAsmPrinter.cpp
+++ b/llvm/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -84,6 +84,7 @@
 static const char *getRegisterTypeName(unsigned RegNo) {
 #define TEST_REGCLS(cls, clsstr) \
   if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr;
+  TEST_REGCLS(RRegf32, f32);
   TEST_REGCLS(RRegs32, s32);
   TEST_REGCLS(Preds, pred);
 #undef TEST_REGCLS
@@ -115,6 +116,21 @@
   return NULL;
 }
 
+// Map an LLVM type to its PTX type name, looking through arrays and pointers.
+static const char *getTypeName(const Type* type) {
+  while (true) {
+    switch (type->getTypeID()) {
+      default: llvm_unreachable("Unknown type");
+      case Type::FloatTyID: return ".f32";
+      case Type::IntegerTyID: return ".s32";    // TODO:  Handle 64-bit types.
+      case Type::ArrayTyID:
+      case Type::PointerTyID:  // Type ID proves the kind: use asserting cast<>.
+        type = cast<const SequentialType>(type)->getElementType();
+        break;
+    }
+  }
+}
+
 bool PTXAsmPrinter::doFinalization(Module &M) {
   // XXX Temproarily remove global variables so that doFinalization() will not
   // emit them again (global variables are emitted at beginning).
@@ -218,6 +234,15 @@
     case MachineOperand::MO_Register:
       OS << getRegisterName(MO.getReg());
       break;
+    case MachineOperand::MO_FPImmediate:
+      APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt();
+      if (constFP.getZExtValue() > 0) {
+        OS << "0F" << constFP.toString(16, false);
+      }
+      else {
+        OS << "0F00000000";
+      }
+      break;
   }
 }
 
@@ -265,8 +290,8 @@
     decl += " ";
   }
 
-  // TODO: add types
-  decl += ".s32 ";
+  decl += getTypeName(gv->getType());
+  decl += " ";
 
   decl += gvsym->getName();
 
diff --git a/llvm/lib/Target/PTX/PTXISelLowering.cpp b/llvm/lib/Target/PTX/PTXISelLowering.cpp
index e6d4490..d30c9ec 100644
--- a/llvm/lib/Target/PTX/PTXISelLowering.cpp
+++ b/llvm/lib/Target/PTX/PTXISelLowering.cpp
@@ -28,9 +28,12 @@
   // Set up the register classes.
   addRegisterClass(MVT::i1,  PTX::PredsRegisterClass);
   addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass);
-
+  addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass);
+  
   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
 
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  
   // Customize translation of memory addresses
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 
@@ -87,7 +90,8 @@
   bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; }
 } argmap[] = {
   argmap_entry(MVT::i1,  PTX::PredsRegisterClass),
-  argmap_entry(MVT::i32, PTX::RRegs32RegisterClass)
+  argmap_entry(MVT::i32, PTX::RRegs32RegisterClass),
+  argmap_entry(MVT::f32, PTX::RRegf32RegisterClass)
 };
 } // end anonymous namespace
 
@@ -185,10 +189,18 @@
   if (Outs.size() == 0)
     return DAG.getNode(PTXISD::RET, dl, MVT::Other, Chain);
 
-  assert(Outs[0].VT == MVT::i32 && "Can return only basic types");
-
   SDValue Flag;
-  unsigned reg = PTX::R0;
+  unsigned reg;
+
+  if (Outs[0].VT == MVT::i32) {
+    reg = PTX::R0;
+  }
+  else if (Outs[0].VT == MVT::f32) {
+    reg = PTX::F0;
+  }
+  else {
+    assert(false && "Can return only basic types");
+  }
 
   MachineFunction &MF = DAG.getMachineFunction();
   PTXMachineFunctionInfo *MFI = MF.getInfo<PTXMachineFunctionInfo>();
diff --git a/llvm/lib/Target/PTX/PTXInstrInfo.cpp b/llvm/lib/Target/PTX/PTXInstrInfo.cpp
index 805759b..f2e5e4c 100644
--- a/llvm/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/llvm/lib/Target/PTX/PTXInstrInfo.cpp
@@ -28,6 +28,7 @@
   const int opcode;
 } map[] = {
   { &PTX::RRegs32RegClass, PTX::MOVrr },
+  { &PTX::RRegf32RegClass, PTX::MOVrr },
   { &PTX::PredsRegClass,   PTX::MOVpp }
 };
 
@@ -35,12 +36,13 @@
                                MachineBasicBlock::iterator I, DebugLoc DL,
                                unsigned DstReg, unsigned SrcReg,
                                bool KillSrc) const {
-  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i)
-    if (PTX::RRegs32RegClass.contains(DstReg, SrcReg)) {
+  for (int i = 0, e = sizeof(map)/sizeof(map[0]); i != e; ++ i) {
+    if (map[i].cls->contains(DstReg, SrcReg)) {
       BuildMI(MBB, I, DL,
-              get(PTX::MOVrr), DstReg).addReg(SrcReg, getKillRegState(KillSrc));
+              get(map[i].opcode), DstReg).addReg(SrcReg, getKillRegState(KillSrc));
       return;
     }
+  }
 
   llvm_unreachable("Impossible reg-to-reg copy");
 }
diff --git a/llvm/lib/Target/PTX/PTXInstrInfo.td b/llvm/lib/Target/PTX/PTXInstrInfo.td
index 9a74778..9d962b0 100644
--- a/llvm/lib/Target/PTX/PTXInstrInfo.td
+++ b/llvm/lib/Target/PTX/PTXInstrInfo.td
@@ -143,6 +143,18 @@
 // Instruction Class Templates
 //===----------------------------------------------------------------------===//
 
+// Three-operand f32 instruction template: rr takes reg,reg; ri takes reg,imm.
+multiclass FLOAT3<string opcstr, SDNode opnode> {
+  def rr : InstPTX<(outs RRegf32:$d),
+                   (ins RRegf32:$a, RRegf32:$b),
+                   !strconcat(opcstr, ".%type\t$d, $a, $b"),
+                   [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>;
+  def ri : InstPTX<(outs RRegf32:$d),
+                   (ins RRegf32:$a, f32imm:$b),
+                   !strconcat(opcstr, ".%type\t$d, $a, $b"),
+                   [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>;
+}
+
 multiclass INT3<string opcstr, SDNode opnode> {
   def rr : InstPTX<(outs RRegs32:$d),
                    (ins RRegs32:$a, RRegs32:$b),
@@ -204,6 +216,12 @@
 // Instructions
 //===----------------------------------------------------------------------===//
 
+///===- Floating-Point Arithmetic Instructions ----------------------------===//
+
+defm FADD : FLOAT3<"add", fadd>;  // f32 add: defines FADDrr and FADDri
+defm FSUB : FLOAT3<"sub", fsub>;  // f32 subtract: defines FSUBrr and FSUBri
+defm FMUL : FLOAT3<"mul", fmul>;  // f32 multiply: defines FMULrr and FMULri
+
 ///===- Integer Arithmetic Instructions -----------------------------------===//
 
 defm ADD : INT3<"add", add>;
@@ -223,6 +241,8 @@
     : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>;
   def MOVrr
     : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>;
+  def FMOVrr  // f32 reg-to-reg move. NOTE(review): copy table uses MOVrr.
+    : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>;
 }
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -232,8 +252,12 @@
   def MOVri
     : InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a",
               [(set RRegs32:$d, imm:$a)]>;
+  def FMOVri  // Moves an f32 immediate into a register; selected for fpimm.
+    : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
+              [(set RRegf32:$d, fpimm:$a)]>;
 }
 
+// Integer loads
 defm LDg : PTX_LD<"ld.global", RRegs32, load_global>;
 defm LDc : PTX_LD<"ld.const",  RRegs32, load_constant>;
 defm LDl : PTX_LD<"ld.local",  RRegs32, load_local>;
@@ -243,12 +267,30 @@
 def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a),
                    "ld.param.%type\t$d, [$a]", []>;
 
+// Floating-point loads: the PTX_LD patterns instantiated for RRegf32.
+defm FLDg : PTX_LD<"ld.global", RRegf32, load_global>;
+defm FLDc : PTX_LD<"ld.const",  RRegf32, load_constant>;
+defm FLDl : PTX_LD<"ld.local",  RRegf32, load_local>;
+defm FLDp : PTX_LD<"ld.param",  RRegf32, load_parameter>;
+defm FLDs : PTX_LD<"ld.shared", RRegf32, load_shared>;
+
+def FLDpi : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a),
+                   "ld.param.%type\t$d, [$a]", []>;
+
+// Integer stores
 defm STg : PTX_ST<"st.global", RRegs32, store_global>;
 defm STl : PTX_ST<"st.local",  RRegs32, store_local>;
 // Store to parameter state space requires PTX 2.0 or higher?
 // defm STp : PTX_ST<"st.param",  RRegs32, store_parameter>;
 defm STs : PTX_ST<"st.shared", RRegs32, store_shared>;
 
+// Floating-point stores: the PTX_ST patterns instantiated for RRegf32.
+defm FSTg : PTX_ST<"st.global", RRegf32, store_global>;
+defm FSTl : PTX_ST<"st.local",  RRegf32, store_local>;
+// Store to parameter state space requires PTX 2.0 or higher?
+// defm FSTp : PTX_ST<"st.param",  RRegf32, store_parameter>;
+defm FSTs : PTX_ST<"st.shared", RRegf32, store_shared>;
+
 ///===- Control Flow Instructions -----------------------------------------===//
 
 let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
diff --git a/llvm/lib/Target/PTX/PTXRegisterInfo.td b/llvm/lib/Target/PTX/PTXRegisterInfo.td
index 22e2b34..9158f0d 100644
--- a/llvm/lib/Target/PTX/PTXRegisterInfo.td
+++ b/llvm/lib/Target/PTX/PTXRegisterInfo.td
@@ -85,6 +85,40 @@
 def R30 : PTXReg<"r30">;
 def R31 : PTXReg<"r31">;
 
+def F0  : PTXReg<"f0">;   // F0..F31: the registers listed in class RRegf32.
+def F1  : PTXReg<"f1">;
+def F2  : PTXReg<"f2">;
+def F3  : PTXReg<"f3">;
+def F4  : PTXReg<"f4">;
+def F5  : PTXReg<"f5">;
+def F6  : PTXReg<"f6">;
+def F7  : PTXReg<"f7">;
+def F8  : PTXReg<"f8">;
+def F9  : PTXReg<"f9">;
+def F10 : PTXReg<"f10">;
+def F11 : PTXReg<"f11">;
+def F12 : PTXReg<"f12">;
+def F13 : PTXReg<"f13">;
+def F14 : PTXReg<"f14">;
+def F15 : PTXReg<"f15">;
+def F16 : PTXReg<"f16">;
+def F17 : PTXReg<"f17">;
+def F18 : PTXReg<"f18">;
+def F19 : PTXReg<"f19">;
+def F20 : PTXReg<"f20">;
+def F21 : PTXReg<"f21">;
+def F22 : PTXReg<"f22">;
+def F23 : PTXReg<"f23">;
+def F24 : PTXReg<"f24">;
+def F25 : PTXReg<"f25">;
+def F26 : PTXReg<"f26">;
+def F27 : PTXReg<"f27">;
+def F28 : PTXReg<"f28">;
+def F29 : PTXReg<"f29">;
+def F30 : PTXReg<"f30">;
+def F31 : PTXReg<"f31">;
+
+
 //===----------------------------------------------------------------------===//
 //  Register classes
 //===----------------------------------------------------------------------===//
@@ -100,3 +134,9 @@
                              R8, R9, R10, R11, R12, R13, R14, R15,
                              R16, R17, R18, R19, R20, R21, R22, R23,
                              R24, R25, R26, R27, R28, R29, R30, R31]>;
+
+def RRegf32 : RegisterClass<"PTX", [f32], 32,  // f32 values; 32 = alignment
+                            [F0, F1, F2, F3, F4, F5, F6, F7,
+                             F8, F9, F10, F11, F12, F13, F14, F15,
+                             F16, F17, F18, F19, F20, F21, F22, F23,
+                             F24, F25, F26, F27, F28, F29, F30, F31]>;