diff --git a/lib/Target/PTX/PTX.td b/lib/Target/PTX/PTX.td
index 8b1a1b1..9f62aa1 100644
--- a/lib/Target/PTX/PTX.td
+++ b/lib/Target/PTX/PTX.td
@@ -19,8 +19,34 @@
 // Subtarget Features.
 //===----------------------------------------------------------------------===//
 
-def FeatureSM20 : SubtargetFeature<"sm20", "is_sm20", "true",
-                                   "Enable sm_20 target architecture">;
+//===- Architectural Features ---------------------------------------------===//
+
+def FeatureDouble : SubtargetFeature<"double", "SupportsDouble", "true",
+                                     "Do not demote .f64 to .f32">;
+
+//===- PTX Version --------------------------------------------------------===//
+
+def FeaturePTX14 : SubtargetFeature<"ptx14", "PTXVersion", "PTX_VERSION_1_4",
+                                    "Use PTX Language Version 1.4">;
+
+def FeaturePTX20 : SubtargetFeature<"ptx20", "PTXVersion", "PTX_VERSION_2_0",
+                                    "Use PTX Language Version 2.0",
+                                    [FeaturePTX14]>;
+
+def FeaturePTX21 : SubtargetFeature<"ptx21", "PTXVersion", "PTX_VERSION_2_1",
+                                    "Use PTX Language Version 2.1",
+                                    [FeaturePTX20]>;
+
+//===- PTX Shader Model ---------------------------------------------------===//
+
+def FeatureSM10 : SubtargetFeature<"sm10", "PTXShaderModel", "PTX_SM_1_0",
+                                   "Enable Shader Model 1.0 compliance">;
+def FeatureSM13 : SubtargetFeature<"sm13", "PTXShaderModel", "PTX_SM_1_3",
+                                   "Enable Shader Model 1.3 compliance",
+                                   [FeatureSM10, FeatureDouble]>;
+def FeatureSM20 : SubtargetFeature<"sm20", "PTXShaderModel", "PTX_SM_2_0",
+                                   "Enable Shader Model 2.0 compliance",
+                                   [FeatureSM13]>;
 
 //===----------------------------------------------------------------------===//
 // PTX supported processors.
diff --git a/lib/Target/PTX/PTXAsmPrinter.cpp b/lib/Target/PTX/PTXAsmPrinter.cpp
index 25f26fa..35eeadc 100644
--- a/lib/Target/PTX/PTXAsmPrinter.cpp
+++ b/lib/Target/PTX/PTXAsmPrinter.cpp
@@ -24,6 +24,7 @@
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Target/Mangler.h"
@@ -37,13 +38,6 @@
 
 using namespace llvm;
 
-static cl::opt<std::string>
-OptPTXVersion("ptx-version", cl::desc("Set PTX version"), cl::init("1.4"));
-
-static cl::opt<std::string>
-OptPTXTarget("ptx-target", cl::desc("Set GPU target (comma-separated list)"),
-             cl::init("sm_10"));
-
 namespace {
 class PTXAsmPrinter : public AsmPrinter {
 public:
@@ -82,11 +76,14 @@
 static const char PARAM_PREFIX[] = "__param_";
 
 static const char *getRegisterTypeName(unsigned RegNo) {
-#define TEST_REGCLS(cls, clsstr) \
+#define TEST_REGCLS(cls, clsstr)                \
   if (PTX::cls ## RegisterClass->contains(RegNo)) return # clsstr;
-  TEST_REGCLS(RRegf32, f32);
-  TEST_REGCLS(RRegs32, s32);
   TEST_REGCLS(Preds, pred);
+  TEST_REGCLS(RRegu16, u16);
+  TEST_REGCLS(RRegu32, u32);
+  TEST_REGCLS(RRegu64, u64);
+  TEST_REGCLS(RRegf32, f32);
+  TEST_REGCLS(RRegf64, f64);
 #undef TEST_REGCLS
 
   llvm_unreachable("Not in any register class!");
@@ -121,7 +118,14 @@
     switch (type->getTypeID()) {
       default: llvm_unreachable("Unknown type");
       case Type::FloatTyID: return ".f32";
-      case Type::IntegerTyID: return ".s32";    // TODO:  Handle 64-bit types.
+      case Type::DoubleTyID: return ".f64";
+      case Type::IntegerTyID:
+        switch (type->getPrimitiveSizeInBits()) {
+          default: llvm_unreachable("Unknown integer bit-width");
+          case 16: return ".u16";
+          case 32: return ".u32";
+          case 64: return ".u64";
+        }
       case Type::ArrayTyID:
       case Type::PointerTyID:
         type = dyn_cast<const SequentialType>(type)->getElementType();
@@ -162,8 +166,11 @@
 
 void PTXAsmPrinter::EmitStartOfAsmFile(Module &M)
 {
-  OutStreamer.EmitRawText(Twine("\t.version " + OptPTXVersion));
-  OutStreamer.EmitRawText(Twine("\t.target " + OptPTXTarget));
+  const PTXSubtarget& ST = TM.getSubtarget<PTXSubtarget>();
+
+  OutStreamer.EmitRawText(Twine("\t.version " + ST.getPTXVersionString()));
+  OutStreamer.EmitRawText(Twine("\t.target " + ST.getTargetString() +
+                                (ST.supportsDouble() ? "" : ", map_f64_to_f32")));
   OutStreamer.AddBlankLine();
 
   // declare global variables
@@ -236,11 +243,24 @@
       break;
     case MachineOperand::MO_FPImmediate:
       APInt constFP = MO.getFPImm()->getValueAPF().bitcastToAPInt();
-      if (constFP.getZExtValue() > 0) {
-        OS << "0F" << constFP.toString(16, false);
+      bool  isFloat = MO.getFPImm()->getType()->getTypeID() == Type::FloatTyID;
+      // Emit 0F for 32-bit floats and 0D for 64-bit doubles.
+      if (isFloat) {
+        OS << "0F";
       }
       else {
-        OS << "0F00000000";
+        OS << "0D";
+      }
+      // Emit the encoded floating-point value.
+      if (constFP.getZExtValue() > 0) {
+        OS << constFP.toString(16, false);
+      }
+      else {
+        OS << "00000000";
+        // If We have a double-precision zero, pad to 8-bytes.
+        if (!isFloat) {
+          OS << "00000000";
+        }
       }
       break;
   }
@@ -338,12 +358,18 @@
   if (!MFI->argRegEmpty()) {
     decl += " (";
     if (isKernel) {
-      for (int i = 0, e = MFI->getNumArg(); i != e; ++i) {
-        if (i != 0)
+      unsigned cnt = 0;
+      //for (int i = 0, e = MFI->getNumArg(); i != e; ++i) {
+      for(PTXMachineFunctionInfo::reg_iterator
+          i = MFI->argRegBegin(), e = MFI->argRegEnd(), b = i; i != e; ++i) {
+        reg = *i;
+        assert(reg != PTX::NoRegister && "Not a valid register!");
+        if (i != b)
           decl += ", ";
-        decl += ".param .s32 "; // TODO: add types
+        decl += ".param .u32";  // TODO: Parse type from register map
+        decl += " ";
         decl += PARAM_PREFIX;
-        decl += utostr(i + 1);
+        decl += utostr(++cnt);
       }
     } else {
       for (PTXMachineFunctionInfo::reg_iterator
diff --git a/lib/Target/PTX/PTXISelDAGToDAG.cpp b/lib/Target/PTX/PTXISelDAGToDAG.cpp
index efb0e8b..1e6a53f 100644
--- a/lib/Target/PTX/PTXISelDAGToDAG.cpp
+++ b/lib/Target/PTX/PTXISelDAGToDAG.cpp
@@ -15,6 +15,7 @@
 #include "PTXTargetMachine.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -66,14 +67,34 @@
 }
 
 SDNode *PTXDAGToDAGISel::SelectREAD_PARAM(SDNode *Node) {
-  SDValue index = Node->getOperand(1);
-  DebugLoc dl = Node->getDebugLoc();
+  SDValue  index = Node->getOperand(1);
+  DebugLoc dl    = Node->getDebugLoc();
+  unsigned opcode;
 
   if (index.getOpcode() != ISD::TargetConstant)
     llvm_unreachable("READ_PARAM: index is not ISD::TargetConstant");
 
+  if (Node->getValueType(0) == MVT::i16) {
+    opcode = PTX::LDpiU16;
+  }
+  else if (Node->getValueType(0) == MVT::i32) {
+    opcode = PTX::LDpiU32;
+  }
+  else if (Node->getValueType(0) == MVT::i64) {
+    opcode = PTX::LDpiU64;
+  }
+  else if (Node->getValueType(0) == MVT::f32) {
+    opcode = PTX::LDpiF32;
+  }
+  else if (Node->getValueType(0) == MVT::f64) {
+    opcode = PTX::LDpiF64;
+  }
+  else {
+    llvm_unreachable("Unknown parameter type for ld.param");
+  }
+
   return PTXInstrInfo::
-    GetPTXMachineNode(CurDAG, PTX::LDpi, dl, MVT::i32, index);
+    GetPTXMachineNode(CurDAG, opcode, dl, Node->getValueType(0), index);
 }
 
 // Match memory operand of the form [reg+reg]
diff --git a/lib/Target/PTX/PTXISelLowering.cpp b/lib/Target/PTX/PTXISelLowering.cpp
index d30c9ec..147b2a8 100644
--- a/lib/Target/PTX/PTXISelLowering.cpp
+++ b/lib/Target/PTX/PTXISelLowering.cpp
@@ -20,6 +20,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -27,13 +28,17 @@
   : TargetLowering(TM, new TargetLoweringObjectFileELF()) {
   // Set up the register classes.
   addRegisterClass(MVT::i1,  PTX::PredsRegisterClass);
-  addRegisterClass(MVT::i32, PTX::RRegs32RegisterClass);
+  addRegisterClass(MVT::i16, PTX::RRegu16RegisterClass);
+  addRegisterClass(MVT::i32, PTX::RRegu32RegisterClass);
+  addRegisterClass(MVT::i64, PTX::RRegu64RegisterClass);
   addRegisterClass(MVT::f32, PTX::RRegf32RegisterClass);
-  
+  addRegisterClass(MVT::f64, PTX::RRegf64RegisterClass);
+
   setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand);
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
-  
+  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+
   // Customize translation of memory addresses
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 
@@ -90,10 +95,13 @@
   bool operator==(MVT::SimpleValueType _VT) const { return VT == _VT; }
 } argmap[] = {
   argmap_entry(MVT::i1,  PTX::PredsRegisterClass),
-  argmap_entry(MVT::i32, PTX::RRegs32RegisterClass),
-  argmap_entry(MVT::f32, PTX::RRegf32RegisterClass)
+  argmap_entry(MVT::i16, PTX::RRegu16RegisterClass),
+  argmap_entry(MVT::i32, PTX::RRegu32RegisterClass),
+  argmap_entry(MVT::i64, PTX::RRegu64RegisterClass),
+  argmap_entry(MVT::f32, PTX::RRegf32RegisterClass),
+  argmap_entry(MVT::f64, PTX::RRegf64RegisterClass)
 };
-} // end anonymous namespace
+}                               // end anonymous namespace
 
 SDValue PTXTargetLowering::
   LowerFormalArguments(SDValue Chain,
@@ -192,12 +200,21 @@
   SDValue Flag;
   unsigned reg;
 
-  if (Outs[0].VT == MVT::i32) {
+  if (Outs[0].VT == MVT::i16) {
+    reg = PTX::RH0;
+  }
+  else if (Outs[0].VT == MVT::i32) {
     reg = PTX::R0;
   }
+  else if (Outs[0].VT == MVT::i64) {
+    reg = PTX::RD0;
+  }
   else if (Outs[0].VT == MVT::f32) {
     reg = PTX::F0;
   }
+  else if (Outs[0].VT == MVT::f64) {
+    reg = PTX::FD0;
+  }
   else {
     assert(false && "Can return only basic types");
   }
diff --git a/lib/Target/PTX/PTXInstrInfo.cpp b/lib/Target/PTX/PTXInstrInfo.cpp
index f2e5e4c..7277238 100644
--- a/lib/Target/PTX/PTXInstrInfo.cpp
+++ b/lib/Target/PTX/PTXInstrInfo.cpp
@@ -27,9 +27,12 @@
   const TargetRegisterClass *cls;
   const int opcode;
 } map[] = {
-  { &PTX::RRegs32RegClass, PTX::MOVrr },
-  { &PTX::RRegf32RegClass, PTX::MOVrr },
-  { &PTX::PredsRegClass,   PTX::MOVpp }
+  { &PTX::RRegu16RegClass, PTX::MOVU16rr },
+  { &PTX::RRegu32RegClass, PTX::MOVU32rr },
+  { &PTX::RRegu64RegClass, PTX::MOVU64rr },
+  { &PTX::RRegf32RegClass, PTX::MOVF32rr },
+  { &PTX::RRegf64RegClass, PTX::MOVF64rr },
+  { &PTX::PredsRegClass,   PTX::MOVPREDrr }
 };
 
 void PTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
@@ -76,8 +79,12 @@
   switch (MI.getOpcode()) {
     default:
       return false;
-    case PTX::MOVpp:
-    case PTX::MOVrr:
+    case PTX::MOVU16rr:
+    case PTX::MOVU32rr:
+    case PTX::MOVU64rr:
+    case PTX::MOVF32rr:
+    case PTX::MOVF64rr:
+    case PTX::MOVPREDrr:
       assert(MI.getNumOperands() >= 2 &&
              MI.getOperand(0).isReg() && MI.getOperand(1).isReg() &&
              "Invalid register-register move instruction");
diff --git a/lib/Target/PTX/PTXInstrInfo.td b/lib/Target/PTX/PTXInstrInfo.td
index 9d962b0..fce6da6 100644
--- a/lib/Target/PTX/PTXInstrInfo.td
+++ b/lib/Target/PTX/PTXInstrInfo.td
@@ -114,7 +114,7 @@
 // Address operands
 def MEMri : Operand<i32> {
   let PrintMethod = "printMemOperand";
-  let MIOperandInfo = (ops RRegs32, i32imm);
+  let MIOperandInfo = (ops RRegu32, i32imm);
 }
 def MEMii : Operand<i32> {
   let PrintMethod = "printMemOperand";
@@ -143,75 +143,115 @@
 // Instruction Class Templates
 //===----------------------------------------------------------------------===//
 
-// Three-operand f32 instruction template
+// Three-operand floating-point instruction template
 multiclass FLOAT3<string opcstr, SDNode opnode> {
-  def rr : InstPTX<(outs RRegf32:$d),
-                   (ins RRegf32:$a, RRegf32:$b),
-                   !strconcat(opcstr, ".%type\t$d, $a, $b"),
-                   [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>;
-  def ri : InstPTX<(outs RRegf32:$d),
-                   (ins RRegf32:$a, f32imm:$b),
-                   !strconcat(opcstr, ".%type\t$d, $a, $b"),
-                   [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>;
+  def rr32 : InstPTX<(outs RRegf32:$d),
+                     (ins RRegf32:$a, RRegf32:$b),
+                     !strconcat(opcstr, ".f32\t$d, $a, $b"),
+                     [(set RRegf32:$d, (opnode RRegf32:$a, RRegf32:$b))]>;
+  def ri32 : InstPTX<(outs RRegf32:$d),
+                     (ins RRegf32:$a, f32imm:$b),
+                     !strconcat(opcstr, ".f32\t$d, $a, $b"),
+                     [(set RRegf32:$d, (opnode RRegf32:$a, fpimm:$b))]>;
+  def rr64 : InstPTX<(outs RRegf64:$d),
+                     (ins RRegf64:$a, RRegf64:$b),
+                     !strconcat(opcstr, ".f64\t$d, $a, $b"),
+                     [(set RRegf64:$d, (opnode RRegf64:$a, RRegf64:$b))]>;
+  def ri64 : InstPTX<(outs RRegf64:$d),
+                     (ins RRegf64:$a, f64imm:$b),
+                     !strconcat(opcstr, ".f64\t$d, $a, $b"),
+                     [(set RRegf64:$d, (opnode RRegf64:$a, fpimm:$b))]>;
 }
 
 multiclass INT3<string opcstr, SDNode opnode> {
-  def rr : InstPTX<(outs RRegs32:$d),
-                   (ins RRegs32:$a, RRegs32:$b),
-                   !strconcat(opcstr, ".%type\t$d, $a, $b"),
-                   [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>;
-  def ri : InstPTX<(outs RRegs32:$d),
-                   (ins RRegs32:$a, i32imm:$b),
-                   !strconcat(opcstr, ".%type\t$d, $a, $b"),
-                   [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>;
+  def rr16 : InstPTX<(outs RRegu16:$d),
+                     (ins RRegu16:$a, RRegu16:$b),
+                     !strconcat(opcstr, ".u16\t$d, $a, $b"),
+                     [(set RRegu16:$d, (opnode RRegu16:$a, RRegu16:$b))]>;
+  def ri16 : InstPTX<(outs RRegu16:$d),
+                     (ins RRegu16:$a, i16imm:$b),
+                     !strconcat(opcstr, ".u16\t$d, $a, $b"),
+                     [(set RRegu16:$d, (opnode RRegu16:$a, imm:$b))]>;
+  def rr32 : InstPTX<(outs RRegu32:$d),
+                     (ins RRegu32:$a, RRegu32:$b),
+                     !strconcat(opcstr, ".u32\t$d, $a, $b"),
+                     [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>;
+  def ri32 : InstPTX<(outs RRegu32:$d),
+                     (ins RRegu32:$a, i32imm:$b),
+                     !strconcat(opcstr, ".u32\t$d, $a, $b"),
+                     [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>;
+  def rr64 : InstPTX<(outs RRegu64:$d),
+                     (ins RRegu64:$a, RRegu64:$b),
+                     !strconcat(opcstr, ".u64\t$d, $a, $b"),
+                     [(set RRegu64:$d, (opnode RRegu64:$a, RRegu64:$b))]>;
+  def ri64 : InstPTX<(outs RRegu64:$d),
+                     (ins RRegu64:$a, i64imm:$b),
+                     !strconcat(opcstr, ".u64\t$d, $a, $b"),
+                     [(set RRegu64:$d, (opnode RRegu64:$a, imm:$b))]>;
 }
 
 // no %type directive, non-communtable
 multiclass INT3ntnc<string opcstr, SDNode opnode> {
-  def rr : InstPTX<(outs RRegs32:$d),
-                   (ins RRegs32:$a, RRegs32:$b),
+  def rr : InstPTX<(outs RRegu32:$d),
+                   (ins RRegu32:$a, RRegu32:$b),
                    !strconcat(opcstr, "\t$d, $a, $b"),
-                   [(set RRegs32:$d, (opnode RRegs32:$a, RRegs32:$b))]>;
-  def ri : InstPTX<(outs RRegs32:$d),
-                   (ins RRegs32:$a, i32imm:$b),
+                   [(set RRegu32:$d, (opnode RRegu32:$a, RRegu32:$b))]>;
+  def ri : InstPTX<(outs RRegu32:$d),
+                   (ins RRegu32:$a, i32imm:$b),
                    !strconcat(opcstr, "\t$d, $a, $b"),
-                   [(set RRegs32:$d, (opnode RRegs32:$a, imm:$b))]>;
-  def ir : InstPTX<(outs RRegs32:$d),
-                   (ins i32imm:$a, RRegs32:$b),
+                   [(set RRegu32:$d, (opnode RRegu32:$a, imm:$b))]>;
+  def ir : InstPTX<(outs RRegu32:$d),
+                   (ins i32imm:$a, RRegu32:$b),
                    !strconcat(opcstr, "\t$d, $a, $b"),
-                   [(set RRegs32:$d, (opnode imm:$a, RRegs32:$b))]>;
+                   [(set RRegu32:$d, (opnode imm:$a, RRegu32:$b))]>;
 }
 
-multiclass PTX_LD<string opstr, RegisterClass RC, PatFrag pat_load> {
+multiclass PTX_LD<string opstr, string typestr, RegisterClass RC, PatFrag pat_load> {
   def rr : InstPTX<(outs RC:$d),
                    (ins MEMri:$a),
-                   !strconcat(opstr, ".%type\t$d, [$a]"),
+                   !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
                    [(set RC:$d, (pat_load ADDRrr:$a))]>;
   def ri : InstPTX<(outs RC:$d),
                    (ins MEMri:$a),
-                   !strconcat(opstr, ".%type\t$d, [$a]"),
+                   !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
                    [(set RC:$d, (pat_load ADDRri:$a))]>;
   def ii : InstPTX<(outs RC:$d),
                    (ins MEMii:$a),
-                   !strconcat(opstr, ".%type\t$d, [$a]"),
+                   !strconcat(opstr, !strconcat(typestr, "\t$d, [$a]")),
                    [(set RC:$d, (pat_load ADDRii:$a))]>;
 }
 
-multiclass PTX_ST<string opstr, RegisterClass RC, PatFrag pat_store> {
+multiclass PTX_LD_ALL<string opstr, PatFrag pat_load> {
+  defm u16 : PTX_LD<opstr, ".u16", RRegu16, pat_load>;
+  defm u32 : PTX_LD<opstr, ".u32", RRegu32, pat_load>;
+  defm u64 : PTX_LD<opstr, ".u64", RRegu64, pat_load>;
+  defm f32 : PTX_LD<opstr, ".f32", RRegf32, pat_load>;
+  defm f64 : PTX_LD<opstr, ".f64", RRegf64, pat_load>;
+}
+
+multiclass PTX_ST<string opstr, string typestr, RegisterClass RC, PatFrag pat_store> {
   def rr : InstPTX<(outs),
                    (ins RC:$d, MEMri:$a),
-                   !strconcat(opstr, ".%type\t[$a], $d"),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
                    [(pat_store RC:$d, ADDRrr:$a)]>;
   def ri : InstPTX<(outs),
                    (ins RC:$d, MEMri:$a),
-                   !strconcat(opstr, ".%type\t[$a], $d"),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
                    [(pat_store RC:$d, ADDRri:$a)]>;
   def ii : InstPTX<(outs),
                    (ins RC:$d, MEMii:$a),
-                   !strconcat(opstr, ".%type\t[$a], $d"),
+                   !strconcat(opstr, !strconcat(typestr, "\t[$a], $d")),
                    [(pat_store RC:$d, ADDRii:$a)]>;
 }
 
+multiclass PTX_ST_ALL<string opstr, PatFrag pat_store> {
+  defm u16 : PTX_ST<opstr, ".u16", RRegu16, pat_store>;
+  defm u32 : PTX_ST<opstr, ".u32", RRegu32, pat_store>;
+  defm u64 : PTX_ST<opstr, ".u64", RRegu64, pat_store>;
+  defm f32 : PTX_ST<opstr, ".f32", RRegf32, pat_store>;
+  defm f64 : PTX_ST<opstr, ".f64", RRegf64, pat_store>;
+}
+
 //===----------------------------------------------------------------------===//
 // Instructions
 //===----------------------------------------------------------------------===//
@@ -236,60 +276,67 @@
 ///===- Data Movement and Conversion Instructions -------------------------===//
 
 let neverHasSideEffects = 1 in {
-  // rely on isMoveInstr to separate MOVpp, MOVrr, etc.
-  def MOVpp
+  def MOVPREDrr
     : InstPTX<(outs Preds:$d), (ins Preds:$a), "mov.pred\t$d, $a", []>;
-  def MOVrr
-    : InstPTX<(outs RRegs32:$d), (ins RRegs32:$a), "mov.%type\t$d, $a", []>;
-  def FMOVrr
+  def MOVU16rr
+    : InstPTX<(outs RRegu16:$d), (ins RRegu16:$a), "mov.u16\t$d, $a", []>;
+  def MOVU32rr
+    : InstPTX<(outs RRegu32:$d), (ins RRegu32:$a), "mov.u32\t$d, $a", []>;
+  def MOVU64rr
+    : InstPTX<(outs RRegu64:$d), (ins RRegu64:$a), "mov.u64\t$d, $a", []>;
+  def MOVF32rr
     : InstPTX<(outs RRegf32:$d), (ins RRegf32:$a), "mov.f32\t$d, $a", []>;
+  def MOVF64rr
+    : InstPTX<(outs RRegf64:$d), (ins RRegf64:$a), "mov.f64\t$d, $a", []>;
 }
 
 let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-  def MOVpi
+  def MOVPREDri
     : InstPTX<(outs Preds:$d), (ins i1imm:$a), "mov.pred\t$d, $a",
               [(set Preds:$d, imm:$a)]>;
-  def MOVri
-    : InstPTX<(outs RRegs32:$d), (ins i32imm:$a), "mov.s32\t$d, $a",
-              [(set RRegs32:$d, imm:$a)]>;
-  def FMOVri
+  def MOVU16ri
+    : InstPTX<(outs RRegu16:$d), (ins i16imm:$a), "mov.u16\t$d, $a",
+              [(set RRegu16:$d, imm:$a)]>;
+  def MOVU32ri
+    : InstPTX<(outs RRegu32:$d), (ins i32imm:$a), "mov.u32\t$d, $a",
+              [(set RRegu32:$d, imm:$a)]>;
+  def MOVU164ri
+    : InstPTX<(outs RRegu64:$d), (ins i64imm:$a), "mov.u64\t$d, $a",
+              [(set RRegu64:$d, imm:$a)]>;
+  def MOVF32ri
     : InstPTX<(outs RRegf32:$d), (ins f32imm:$a), "mov.f32\t$d, $a",
               [(set RRegf32:$d, fpimm:$a)]>;
+  def MOVF64ri
+    : InstPTX<(outs RRegf64:$d), (ins f64imm:$a), "mov.f64\t$d, $a",
+              [(set RRegf64:$d, fpimm:$a)]>;
 }
 
-// Integer loads
-defm LDg : PTX_LD<"ld.global", RRegs32, load_global>;
-defm LDc : PTX_LD<"ld.const",  RRegs32, load_constant>;
-defm LDl : PTX_LD<"ld.local",  RRegs32, load_local>;
-defm LDp : PTX_LD<"ld.param",  RRegs32, load_parameter>;
-defm LDs : PTX_LD<"ld.shared", RRegs32, load_shared>;
+// Loads
+defm LDg : PTX_LD_ALL<"ld.global", load_global>;
+defm LDc : PTX_LD_ALL<"ld.const",  load_constant>;
+defm LDl : PTX_LD_ALL<"ld.local",  load_local>;
+defm LDs : PTX_LD_ALL<"ld.shared", load_shared>;
 
-def LDpi : InstPTX<(outs RRegs32:$d), (ins MEMpi:$a),
-                   "ld.param.%type\t$d, [$a]", []>;
+// This is a special instruction that is manually inserted for kernel parameters
+def LDpiU16 : InstPTX<(outs RRegu16:$d), (ins MEMpi:$a),
+                      "ld.param.u16\t$d, [$a]", []>;
+def LDpiU32 : InstPTX<(outs RRegu32:$d), (ins MEMpi:$a),
+                      "ld.param.u32\t$d, [$a]", []>;
+def LDpiU64 : InstPTX<(outs RRegu64:$d), (ins MEMpi:$a),
+                      "ld.param.u64\t$d, [$a]", []>;
+def LDpiF32 : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a),
+                      "ld.param.f32\t$d, [$a]", []>;
+def LDpiF64 : InstPTX<(outs RRegf64:$d), (ins MEMpi:$a),
+                      "ld.param.f64\t$d, [$a]", []>;
 
-// Floating-point loads
-defm FLDg : PTX_LD<"ld.global", RRegf32, load_global>;
-defm FLDc : PTX_LD<"ld.const",  RRegf32, load_constant>;
-defm FLDl : PTX_LD<"ld.local",  RRegf32, load_local>;
-defm FLDp : PTX_LD<"ld.param",  RRegf32, load_parameter>;
-defm FLDs : PTX_LD<"ld.shared", RRegf32, load_shared>;
+// Stores
+defm STg : PTX_ST_ALL<"st.global", store_global>;
+defm STl : PTX_ST_ALL<"st.local",  store_local>;
+defm STs : PTX_ST_ALL<"st.shared", store_shared>;
 
-def FLDpi : InstPTX<(outs RRegf32:$d), (ins MEMpi:$a),
-                   "ld.param.%type\t$d, [$a]", []>;
-
-// Integer stores
-defm STg : PTX_ST<"st.global", RRegs32, store_global>;
-defm STl : PTX_ST<"st.local",  RRegs32, store_local>;
-// Store to parameter state space requires PTX 2.0 or higher?
-// defm STp : PTX_ST<"st.param",  RRegs32, store_parameter>;
-defm STs : PTX_ST<"st.shared", RRegs32, store_shared>;
-
-// Floating-point stores
-defm FSTg : PTX_ST<"st.global", RRegf32, store_global>;
-defm FSTl : PTX_ST<"st.local",  RRegf32, store_local>;
-// Store to parameter state space requires PTX 2.0 or higher?
-// defm FSTp : PTX_ST<"st.param",  RRegf32, store_parameter>;
-defm FSTs : PTX_ST<"st.shared", RRegf32, store_shared>;
+// defm STp : PTX_ST_ALL<"st.param",  store_parameter>;
+// defm LDp : PTX_LD_ALL<"ld.param",  load_parameter>;
+// TODO: Do something with st.param if/when it is needed.
 
 ///===- Control Flow Instructions -----------------------------------------===//
 
diff --git a/lib/Target/PTX/PTXMFInfoExtract.cpp b/lib/Target/PTX/PTXMFInfoExtract.cpp
index b37c740..c5e1910 100644
--- a/lib/Target/PTX/PTXMFInfoExtract.cpp
+++ b/lib/Target/PTX/PTXMFInfoExtract.cpp
@@ -79,12 +79,12 @@
 
   DEBUG(for (PTXMachineFunctionInfo::reg_iterator
              i = MFI->argRegBegin(), e = MFI->argRegEnd();
-	     i != e; ++i)
+             i != e; ++i)
         dbgs() << "Arg Reg: " << *i << "\n";);
 
   DEBUG(for (PTXMachineFunctionInfo::reg_iterator
              i = MFI->localVarRegBegin(), e = MFI->localVarRegEnd();
-	     i != e; ++i)
+             i != e; ++i)
         dbgs() << "Local Var Reg: " << *i << "\n";);
 
   return false;
diff --git a/lib/Target/PTX/PTXRegisterInfo.td b/lib/Target/PTX/PTXRegisterInfo.td
index 9158f0d..548e3bb 100644
--- a/lib/Target/PTX/PTXRegisterInfo.td
+++ b/lib/Target/PTX/PTXRegisterInfo.td
@@ -19,6 +19,8 @@
 //  Registers
 //===----------------------------------------------------------------------===//
 
+///===- Predicate Registers -----------------------------------------------===//
+
 def P0  : PTXReg<"p0">;
 def P1  : PTXReg<"p1">;
 def P2  : PTXReg<"p2">;
@@ -52,6 +54,43 @@
 def P30 : PTXReg<"p30">;
 def P31 : PTXReg<"p31">;
 
+///===- 16-bit Integer Registers ------------------------------------------===//
+
+def RH0  : PTXReg<"rh0">;
+def RH1  : PTXReg<"rh1">;
+def RH2  : PTXReg<"rh2">;
+def RH3  : PTXReg<"rh3">;
+def RH4  : PTXReg<"rh4">;
+def RH5  : PTXReg<"rh5">;
+def RH6  : PTXReg<"rh6">;
+def RH7  : PTXReg<"rh7">;
+def RH8  : PTXReg<"rh8">;
+def RH9  : PTXReg<"rh9">;
+def RH10 : PTXReg<"rh10">;
+def RH11 : PTXReg<"rh11">;
+def RH12 : PTXReg<"rh12">;
+def RH13 : PTXReg<"rh13">;
+def RH14 : PTXReg<"rh14">;
+def RH15 : PTXReg<"rh15">;
+def RH16 : PTXReg<"rh16">;
+def RH17 : PTXReg<"rh17">;
+def RH18 : PTXReg<"rh18">;
+def RH19 : PTXReg<"rh19">;
+def RH20 : PTXReg<"rh20">;
+def RH21 : PTXReg<"rh21">;
+def RH22 : PTXReg<"rh22">;
+def RH23 : PTXReg<"rh23">;
+def RH24 : PTXReg<"rh24">;
+def RH25 : PTXReg<"rh25">;
+def RH26 : PTXReg<"rh26">;
+def RH27 : PTXReg<"rh27">;
+def RH28 : PTXReg<"rh28">;
+def RH29 : PTXReg<"rh29">;
+def RH30 : PTXReg<"rh30">;
+def RH31 : PTXReg<"rh31">;
+
+///===- 32-bit Integer Registers ------------------------------------------===//
+
 def R0  : PTXReg<"r0">;
 def R1  : PTXReg<"r1">;
 def R2  : PTXReg<"r2">;
@@ -85,6 +124,43 @@
 def R30 : PTXReg<"r30">;
 def R31 : PTXReg<"r31">;
 
+///===- 64-bit Integer Registers ------------------------------------------===//
+
+def RD0  : PTXReg<"rd0">;
+def RD1  : PTXReg<"rd1">;
+def RD2  : PTXReg<"rd2">;
+def RD3  : PTXReg<"rd3">;
+def RD4  : PTXReg<"rd4">;
+def RD5  : PTXReg<"rd5">;
+def RD6  : PTXReg<"rd6">;
+def RD7  : PTXReg<"rd7">;
+def RD8  : PTXReg<"rd8">;
+def RD9  : PTXReg<"rd9">;
+def RD10 : PTXReg<"rd10">;
+def RD11 : PTXReg<"rd11">;
+def RD12 : PTXReg<"rd12">;
+def RD13 : PTXReg<"rd13">;
+def RD14 : PTXReg<"rd14">;
+def RD15 : PTXReg<"rd15">;
+def RD16 : PTXReg<"rd16">;
+def RD17 : PTXReg<"rd17">;
+def RD18 : PTXReg<"rd18">;
+def RD19 : PTXReg<"rd19">;
+def RD20 : PTXReg<"rd20">;
+def RD21 : PTXReg<"rd21">;
+def RD22 : PTXReg<"rd22">;
+def RD23 : PTXReg<"rd23">;
+def RD24 : PTXReg<"rd24">;
+def RD25 : PTXReg<"rd25">;
+def RD26 : PTXReg<"rd26">;
+def RD27 : PTXReg<"rd27">;
+def RD28 : PTXReg<"rd28">;
+def RD29 : PTXReg<"rd29">;
+def RD30 : PTXReg<"rd30">;
+def RD31 : PTXReg<"rd31">;
+
+///===- 32-bit Floating-Point Registers -----------------------------------===//
+
 def F0  : PTXReg<"f0">;
 def F1  : PTXReg<"f1">;
 def F2  : PTXReg<"f2">;
@@ -118,6 +194,41 @@
 def F30 : PTXReg<"f30">;
 def F31 : PTXReg<"f31">;
 
+///===- 64-bit Floating-Point Registers -----------------------------------===//
+
+def FD0  : PTXReg<"fd0">;
+def FD1  : PTXReg<"fd1">;
+def FD2  : PTXReg<"fd2">;
+def FD3  : PTXReg<"fd3">;
+def FD4  : PTXReg<"fd4">;
+def FD5  : PTXReg<"fd5">;
+def FD6  : PTXReg<"fd6">;
+def FD7  : PTXReg<"fd7">;
+def FD8  : PTXReg<"fd8">;
+def FD9  : PTXReg<"fd9">;
+def FD10 : PTXReg<"fd10">;
+def FD11 : PTXReg<"fd11">;
+def FD12 : PTXReg<"fd12">;
+def FD13 : PTXReg<"fd13">;
+def FD14 : PTXReg<"fd14">;
+def FD15 : PTXReg<"fd15">;
+def FD16 : PTXReg<"fd16">;
+def FD17 : PTXReg<"fd17">;
+def FD18 : PTXReg<"fd18">;
+def FD19 : PTXReg<"fd19">;
+def FD20 : PTXReg<"fd20">;
+def FD21 : PTXReg<"fd21">;
+def FD22 : PTXReg<"fd22">;
+def FD23 : PTXReg<"fd23">;
+def FD24 : PTXReg<"fd24">;
+def FD25 : PTXReg<"fd25">;
+def FD26 : PTXReg<"fd26">;
+def FD27 : PTXReg<"fd27">;
+def FD28 : PTXReg<"fd28">;
+def FD29 : PTXReg<"fd29">;
+def FD30 : PTXReg<"fd30">;
+def FD31 : PTXReg<"fd31">;
+
 
 //===----------------------------------------------------------------------===//
 //  Register classes
@@ -129,14 +240,32 @@
                            P16, P17, P18, P19, P20, P21, P22, P23,
                            P24, P25, P26, P27, P28, P29, P30, P31]>;
 
-def RRegs32 : RegisterClass<"PTX", [i32], 32,
+def RRegu16 : RegisterClass<"PTX", [i16], 16,
+                            [RH0, RH1, RH2, RH3, RH4, RH5, RH6, RH7,
+                             RH8, RH9, RH10, RH11, RH12, RH13, RH14, RH15,
+                             RH16, RH17, RH18, RH19, RH20, RH21, RH22, RH23,
+                             RH24, RH25, RH26, RH27, RH28, RH29, RH30, RH31]>;
+
+def RRegu32 : RegisterClass<"PTX", [i32], 32,
                             [R0, R1, R2, R3, R4, R5, R6, R7,
                              R8, R9, R10, R11, R12, R13, R14, R15,
                              R16, R17, R18, R19, R20, R21, R22, R23,
                              R24, R25, R26, R27, R28, R29, R30, R31]>;
 
+def RRegu64 : RegisterClass<"PTX", [i64], 64,
+                            [RD0, RD1, RD2, RD3, RD4, RD5, RD6, RD7,
+                             RD8, RD9, RD10, RD11, RD12, RD13, RD14, RD15,
+                             RD16, RD17, RD18, RD19, RD20, RD21, RD22, RD23,
+                             RD24, RD25, RD26, RD27, RD28, RD29, RD30, RD31]>;
+
 def RRegf32 : RegisterClass<"PTX", [f32], 32,
                             [F0, F1, F2, F3, F4, F5, F6, F7,
                              F8, F9, F10, F11, F12, F13, F14, F15,
                              F16, F17, F18, F19, F20, F21, F22, F23,
                              F24, F25, F26, F27, F28, F29, F30, F31]>;
+
+def RRegf64 : RegisterClass<"PTX", [f64], 64,
+                            [FD0, FD1, FD2, FD3, FD4, FD5, FD6, FD7,
+                             FD8, FD9, FD10, FD11, FD12, FD13, FD14, FD15,
+                             FD16, FD17, FD18, FD19, FD20, FD21, FD22, FD23,
+                             FD24, FD25, FD26, FD27, FD28, FD29, FD30, FD31]>;
diff --git a/lib/Target/PTX/PTXSubtarget.cpp b/lib/Target/PTX/PTXSubtarget.cpp
index 00e2c88..18a9305 100644
--- a/lib/Target/PTX/PTXSubtarget.cpp
+++ b/lib/Target/PTX/PTXSubtarget.cpp
@@ -12,12 +12,33 @@
 //===----------------------------------------------------------------------===//
 
 #include "PTXSubtarget.h"
+#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS) {
-  std::string TARGET = "sm_20";
-  // TODO: call ParseSubtargetFeatures(FS, TARGET);
+PTXSubtarget::PTXSubtarget(const std::string &TT, const std::string &FS)
+  : PTXShaderModel(PTX_SM_1_0),
+    PTXVersion(PTX_VERSION_1_4) {
+  std::string TARGET = "generic";
+  ParseSubtargetFeatures(FS, TARGET);
+}
+
+std::string PTXSubtarget::getTargetString() const {
+  switch(PTXShaderModel) {
+    default: llvm_unreachable("Unknown shader model");
+    case PTX_SM_1_0: return "sm_10";
+    case PTX_SM_1_3: return "sm_13";
+    case PTX_SM_2_0: return "sm_20";
+  }
+}
+
+std::string PTXSubtarget::getPTXVersionString() const {
+  switch(PTXVersion) {
+    default: llvm_unreachable("Unknown PTX version");
+    case PTX_VERSION_1_4: return "1.4";
+    case PTX_VERSION_2_0: return "2.0";
+    case PTX_VERSION_2_1: return "2.1";
+  }
 }
 
 #include "PTXGenSubtarget.inc"
diff --git a/lib/Target/PTX/PTXSubtarget.h b/lib/Target/PTX/PTXSubtarget.h
index 7fd85f8..9a9ada2 100644
--- a/lib/Target/PTX/PTXSubtarget.h
+++ b/lib/Target/PTX/PTXSubtarget.h
@@ -19,11 +19,36 @@
 namespace llvm {
   class PTXSubtarget : public TargetSubtarget {
     private:
-      bool is_sm20;
+      enum PTXShaderModelEnum {
+        PTX_SM_1_0,
+        PTX_SM_1_3,
+        PTX_SM_2_0
+      };
+
+      enum PTXVersionEnum {
+        PTX_VERSION_1_4,
+        PTX_VERSION_2_0,
+        PTX_VERSION_2_1
+      };
+
+      /// Shader Model supported on the target GPU.
+      PTXShaderModelEnum PTXShaderModel;
+
+      /// PTX Language Version.
+      PTXVersionEnum PTXVersion;
+
+      // The native .f64 type is supported on the hardware.
+      bool SupportsDouble;
 
     public:
       PTXSubtarget(const std::string &TT, const std::string &FS);
 
+      std::string getTargetString() const;
+
+      std::string getPTXVersionString() const;
+
+      bool supportsDouble() const { return SupportsDouble; }
+
       std::string ParseSubtargetFeatures(const std::string &FS,
                                          const std::string &CPU);
   }; // class PTXSubtarget
diff --git a/test/CodeGen/PTX/add.ll b/test/CodeGen/PTX/add.ll
index 9e777ae..598591c 100644
--- a/test/CodeGen/PTX/add.ll
+++ b/test/CodeGen/PTX/add.ll
@@ -1,29 +1,71 @@
 ; RUN: llc < %s -march=ptx | FileCheck %s
 
-define ptx_device i32 @t1(i32 %x, i32 %y) {
-; CHECK: add.s32 r0, r1, r2;
+define ptx_device i16 @t1_u16(i16 %x, i16 %y) {
+; CHECK: add.u16 rh0, rh1, rh2;
+; CHECK-NEXT: ret;
+	%z = add i16 %x, %y
+	ret i16 %z
+}
+
+define ptx_device i32 @t1_u32(i32 %x, i32 %y) {
+; CHECK: add.u32 r0, r1, r2;
+; CHECK-NEXT: ret;
 	%z = add i32 %x, %y
-; CHECK: ret;
 	ret i32 %z
 }
 
-define ptx_device i32 @t2(i32 %x) {
-; CHECK: add.s32 r0, r1, 1;
-	%z = add i32 %x, 1
-; CHECK: ret;
-	ret i32 %z
+define ptx_device i64 @t1_u64(i64 %x, i64 %y) {
+; CHECK: add.u64 rd0, rd1, rd2;
+; CHECK-NEXT: ret;
+	%z = add i64 %x, %y
+	ret i64 %z
 }
 
-define ptx_device float @t3(float %x, float %y) {
+define ptx_device float @t1_f32(float %x, float %y) {
 ; CHECK: add.f32 f0, f1, f2
 ; CHECK-NEXT: ret;
   %z = fadd float %x, %y
   ret float %z
 }
 
-define ptx_device float @t4(float %x) {
+define ptx_device double @t1_f64(double %x, double %y) {
+; CHECK: add.f64 fd0, fd1, fd2
+; CHECK-NEXT: ret;
+  %z = fadd double %x, %y
+  ret double %z
+}
+
+define ptx_device i16 @t2_u16(i16 %x) {
+; CHECK: add.u16 rh0, rh1, 1;
+; CHECK-NEXT: ret;
+	%z = add i16 %x, 1
+	ret i16 %z
+}
+
+define ptx_device i32 @t2_u32(i32 %x) {
+; CHECK: add.u32 r0, r1, 1;
+; CHECK-NEXT: ret;
+	%z = add i32 %x, 1
+	ret i32 %z
+}
+
+define ptx_device i64 @t2_u64(i64 %x) {
+; CHECK: add.u64 rd0, rd1, 1;
+; CHECK-NEXT: ret;
+	%z = add i64 %x, 1
+	ret i64 %z
+}
+
+define ptx_device float @t2_f32(float %x) {
 ; CHECK: add.f32 f0, f1, 0F3F800000;
 ; CHECK-NEXT: ret;
   %z = fadd float %x, 1.0
   ret float %z
 }
+
+define ptx_device double @t2_f64(double %x) {
+; CHECK: add.f64 fd0, fd1, 0D3FF0000000000000;
+; CHECK-NEXT: ret;
+  %z = fadd double %x, 1.0
+  ret double %z
+}
diff --git a/test/CodeGen/PTX/ld.ll b/test/CodeGen/PTX/ld.ll
index 836c4d4..e7cc92e 100644
--- a/test/CodeGen/PTX/ld.ll
+++ b/test/CodeGen/PTX/ld.ll
@@ -1,78 +1,422 @@
 ; RUN: llc < %s -march=ptx | FileCheck %s
 
-;CHECK: .extern .global .s32 array[];
-@array = external global [10 x i32]
+;CHECK: .extern .global .u16 array_i16[];
+@array_i16 = external global [10 x i16]
 
-;CHECK: .extern .const .s32 array_constant[];
-@array_constant = external addrspace(1) constant [10 x i32]
+;CHECK: .extern .const .u16 array_constant_i16[];
+@array_constant_i16 = external addrspace(1) constant [10 x i16]
 
-;CHECK: .extern .local .s32 array_local[];
-@array_local = external addrspace(2) global [10 x i32]
+;CHECK: .extern .local .u16 array_local_i16[];
+@array_local_i16 = external addrspace(2) global [10 x i16]
 
-;CHECK: .extern .shared .s32 array_shared[];
-@array_shared = external addrspace(4) global [10 x i32]
+;CHECK: .extern .shared .u16 array_shared_i16[];
+@array_shared_i16 = external addrspace(4) global [10 x i16]
 
-define ptx_device i32 @t1(i32* %p) {
+;CHECK: .extern .global .u32 array_i32[];
+@array_i32 = external global [10 x i32]
+
+;CHECK: .extern .const .u32 array_constant_i32[];
+@array_constant_i32 = external addrspace(1) constant [10 x i32]
+
+;CHECK: .extern .local .u32 array_local_i32[];
+@array_local_i32 = external addrspace(2) global [10 x i32]
+
+;CHECK: .extern .shared .u32 array_shared_i32[];
+@array_shared_i32 = external addrspace(4) global [10 x i32]
+
+;CHECK: .extern .global .u64 array_i64[];
+@array_i64 = external global [10 x i64]
+
+;CHECK: .extern .const .u64 array_constant_i64[];
+@array_constant_i64 = external addrspace(1) constant [10 x i64]
+
+;CHECK: .extern .local .u64 array_local_i64[];
+@array_local_i64 = external addrspace(2) global [10 x i64]
+
+;CHECK: .extern .shared .u64 array_shared_i64[];
+@array_shared_i64 = external addrspace(4) global [10 x i64]
+
+;CHECK: .extern .global .f32 array_float[];
+@array_float = external global [10 x float]
+
+;CHECK: .extern .const .f32 array_constant_float[];
+@array_constant_float = external addrspace(1) constant [10 x float]
+
+;CHECK: .extern .local .f32 array_local_float[];
+@array_local_float = external addrspace(2) global [10 x float]
+
+;CHECK: .extern .shared .f32 array_shared_float[];
+@array_shared_float = external addrspace(4) global [10 x float]
+
+;CHECK: .extern .global .f64 array_double[];
+@array_double = external global [10 x double]
+
+;CHECK: .extern .const .f64 array_constant_double[];
+@array_constant_double = external addrspace(1) constant [10 x double]
+
+;CHECK: .extern .local .f64 array_local_double[];
+@array_local_double = external addrspace(2) global [10 x double]
+
+;CHECK: .extern .shared .f64 array_shared_double[];
+@array_shared_double = external addrspace(4) global [10 x double]
+
+
+define ptx_device i16 @t1_u16(i16* %p) {
 entry:
-;CHECK: ld.global.s32 r0, [r1];
+;CHECK: ld.global.u16 rh0, [r1];
+;CHECK-NEXT; ret;
+  %x = load i16* %p
+  ret i16 %x
+}
+
+define ptx_device i32 @t1_u32(i32* %p) {
+entry:
+;CHECK: ld.global.u32 r0, [r1];
+;CHECK-NEXT: ret;
   %x = load i32* %p
   ret i32 %x
 }
 
-define ptx_device i32 @t2(i32* %p) {
+define ptx_device i64 @t1_u64(i64* %p) {
 entry:
-;CHECK: ld.global.s32 r0, [r1+4];
+;CHECK: ld.global.u64 rd0, [r1];
+;CHECK-NEXT: ret;
+  %x = load i64* %p
+  ret i64 %x
+}
+
+define ptx_device float @t1_f32(float* %p) {
+entry:
+;CHECK: ld.global.f32 f0, [r1];
+;CHECK-NEXT: ret;
+  %x = load float* %p
+  ret float %x
+}
+
+define ptx_device double @t1_f64(double* %p) {
+entry:
+;CHECK: ld.global.f64 fd0, [r1];
+;CHECK-NEXT: ret;
+  %x = load double* %p
+  ret double %x
+}
+
+define ptx_device i16 @t2_u16(i16* %p) {
+entry:
+;CHECK: ld.global.u16 rh0, [r1+2];
+;CHECK-NEXT: ret;
+  %i = getelementptr i16* %p, i32 1
+  %x = load i16* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t2_u32(i32* %p) {
+entry:
+;CHECK: ld.global.u32 r0, [r1+4];
+;CHECK-NEXT: ret;
   %i = getelementptr i32* %p, i32 1
   %x = load i32* %i
   ret i32 %x
 }
 
-define ptx_device i32 @t3(i32* %p, i32 %q) {
+define ptx_device i64 @t2_u64(i64* %p) {
+entry:
+;CHECK: ld.global.u64 rd0, [r1+8];
+;CHECK-NEXT: ret;
+  %i = getelementptr i64* %p, i32 1
+  %x = load i64* %i
+  ret i64 %x
+}
+
+define ptx_device float @t2_f32(float* %p) {
+entry:
+;CHECK: ld.global.f32 f0, [r1+4];
+;CHECK-NEXT: ret;
+  %i = getelementptr float* %p, i32 1
+  %x = load float* %i
+  ret float %x
+}
+
+define ptx_device double @t2_f64(double* %p) {
+entry:
+;CHECK: ld.global.f64 fd0, [r1+8];
+;CHECK-NEXT: ret;
+  %i = getelementptr double* %p, i32 1
+  %x = load double* %i
+  ret double %x
+}
+
+define ptx_device i16 @t3_u16(i16* %p, i32 %q) {
+entry:
+;CHECK: shl.b32 r0, r2, 1;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: ld.global.u16 rh0, [r0];
+  %i = getelementptr i16* %p, i32 %q
+  %x = load i16* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t3_u32(i32* %p, i32 %q) {
 entry:
 ;CHECK: shl.b32 r0, r2, 2;
-;CHECK: add.s32 r0, r1, r0;
-;CHECK: ld.global.s32 r0, [r0];
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: ld.global.u32 r0, [r0];
   %i = getelementptr i32* %p, i32 %q
   %x = load i32* %i
   ret i32 %x
 }
 
-define ptx_device i32 @t4_global() {
+define ptx_device i64 @t3_u64(i64* %p, i32 %q) {
 entry:
-;CHECK: ld.global.s32 r0, [array];
-  %i = getelementptr [10 x i32]* @array, i32 0, i32 0
+;CHECK: shl.b32 r0, r2, 3;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: ld.global.u64 rd0, [r0];
+  %i = getelementptr i64* %p, i32 %q
+  %x = load i64* %i
+  ret i64 %x
+}
+
+define ptx_device float @t3_f32(float* %p, i32 %q) {
+entry:
+;CHECK: shl.b32 r0, r2, 2;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: ld.global.f32 f0, [r0];
+  %i = getelementptr float* %p, i32 %q
+  %x = load float* %i
+  ret float %x
+}
+
+define ptx_device double @t3_f64(double* %p, i32 %q) {
+entry:
+;CHECK: shl.b32 r0, r2, 3;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: ld.global.f64 fd0, [r0];
+  %i = getelementptr double* %p, i32 %q
+  %x = load double* %i
+  ret double %x
+}
+
+define ptx_device i16 @t4_global_u16() {
+entry:
+;CHECK: ld.global.u16 rh0, [array_i16];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 0
+  %x = load i16* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t4_global_u32() {
+entry:
+;CHECK: ld.global.u32 r0, [array_i32];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 0
   %x = load i32* %i
   ret i32 %x
 }
 
-define ptx_device i32 @t4_const() {
+define ptx_device i64 @t4_global_u64() {
 entry:
-;CHECK: ld.const.s32 r0, [array_constant];
-  %i = getelementptr [10 x i32] addrspace(1)* @array_constant, i32 0, i32 0
+;CHECK: ld.global.u64 rd0, [array_i64];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 0
+  %x = load i64* %i
+  ret i64 %x
+}
+
+define ptx_device float @t4_global_f32() {
+entry:
+;CHECK: ld.global.f32 f0, [array_float];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float]* @array_float, i32 0, i32 0
+  %x = load float* %i
+  ret float %x
+}
+
+define ptx_device double @t4_global_f64() {
+entry:
+;CHECK: ld.global.f64 fd0, [array_double];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double]* @array_double, i32 0, i32 0
+  %x = load double* %i
+  ret double %x
+}
+
+define ptx_device i16 @t4_const_u16() {
+entry:
+;CHECK: ld.const.u16 rh0, [array_constant_i16];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16] addrspace(1)* @array_constant_i16, i32 0, i32 0
+  %x = load i16 addrspace(1)* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t4_const_u32() {
+entry:
+;CHECK: ld.const.u32 r0, [array_constant_i32];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32] addrspace(1)* @array_constant_i32, i32 0, i32 0
   %x = load i32 addrspace(1)* %i
   ret i32 %x
 }
 
-define ptx_device i32 @t4_local() {
+define ptx_device i64 @t4_const_u64() {
 entry:
-;CHECK: ld.local.s32 r0, [array_local];
-  %i = getelementptr [10 x i32] addrspace(2)* @array_local, i32 0, i32 0
+;CHECK: ld.const.u64 rd0, [array_constant_i64];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64] addrspace(1)* @array_constant_i64, i32 0, i32 0
+  %x = load i64 addrspace(1)* %i
+  ret i64 %x
+}
+
+define ptx_device float @t4_const_f32() {
+entry:
+;CHECK: ld.const.f32 f0, [array_constant_float];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float] addrspace(1)* @array_constant_float, i32 0, i32 0
+  %x = load float addrspace(1)* %i
+  ret float %x
+}
+
+define ptx_device double @t4_const_f64() {
+entry:
+;CHECK: ld.const.f64 fd0, [array_constant_double];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double] addrspace(1)* @array_constant_double, i32 0, i32 0
+  %x = load double addrspace(1)* %i
+  ret double %x
+}
+
+define ptx_device i16 @t4_local_u16() {
+entry:
+;CHECK: ld.local.u16 rh0, [array_local_i16];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16] addrspace(2)* @array_local_i16, i32 0, i32 0
+  %x = load i16 addrspace(2)* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t4_local_u32() {
+entry:
+;CHECK: ld.local.u32 r0, [array_local_i32];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32] addrspace(2)* @array_local_i32, i32 0, i32 0
   %x = load i32 addrspace(2)* %i
   ret i32 %x
 }
 
-define ptx_device i32 @t4_shared() {
+define ptx_device i64 @t4_local_u64() {
 entry:
-;CHECK: ld.shared.s32 r0, [array_shared];
-  %i = getelementptr [10 x i32] addrspace(4)* @array_shared, i32 0, i32 0
+;CHECK: ld.local.u64 rd0, [array_local_i64];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64] addrspace(2)* @array_local_i64, i32 0, i32 0
+  %x = load i64 addrspace(2)* %i
+  ret i64 %x
+}
+
+define ptx_device float @t4_local_f32() {
+entry:
+;CHECK: ld.local.f32 f0, [array_local_float];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float] addrspace(2)* @array_local_float, i32 0, i32 0
+  %x = load float addrspace(2)* %i
+  ret float %x
+}
+
+define ptx_device double @t4_local_f64() {
+entry:
+;CHECK: ld.local.f64 fd0, [array_local_double];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double] addrspace(2)* @array_local_double, i32 0, i32 0
+  %x = load double addrspace(2)* %i
+  ret double %x
+}
+
+define ptx_device i16 @t4_shared_u16() {
+entry:
+;CHECK: ld.shared.u16 rh0, [array_shared_i16];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16] addrspace(4)* @array_shared_i16, i32 0, i32 0
+  %x = load i16 addrspace(4)* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t4_shared_u32() {
+entry:
+;CHECK: ld.shared.u32 r0, [array_shared_i32];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32] addrspace(4)* @array_shared_i32, i32 0, i32 0
   %x = load i32 addrspace(4)* %i
   ret i32 %x
 }
 
-define ptx_device i32 @t5() {
+define ptx_device i64 @t4_shared_u64() {
 entry:
-;CHECK: ld.global.s32 r0, [array+4];
-  %i = getelementptr [10 x i32]* @array, i32 0, i32 1
+;CHECK: ld.shared.u64 rd0, [array_shared_i64];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64] addrspace(4)* @array_shared_i64, i32 0, i32 0
+  %x = load i64 addrspace(4)* %i
+  ret i64 %x
+}
+
+define ptx_device float @t4_shared_f32() {
+entry:
+;CHECK: ld.shared.f32 f0, [array_shared_float];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0
+  %x = load float addrspace(4)* %i
+  ret float %x
+}
+
+define ptx_device double @t4_shared_f64() {
+entry:
+;CHECK: ld.shared.f64 fd0, [array_shared_double];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0
+  %x = load double addrspace(4)* %i
+  ret double %x
+}
+
+define ptx_device i16 @t5_u16() {
+entry:
+;CHECK: ld.global.u16 rh0, [array_i16+2];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 1
+  %x = load i16* %i
+  ret i16 %x
+}
+
+define ptx_device i32 @t5_u32() {
+entry:
+;CHECK: ld.global.u32 r0, [array_i32+4];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 1
   %x = load i32* %i
   ret i32 %x
 }
+
+define ptx_device i64 @t5_u64() {
+entry:
+;CHECK: ld.global.u64 rd0, [array_i64+8];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 1
+  %x = load i64* %i
+  ret i64 %x
+}
+
+define ptx_device float @t5_f32() {
+entry:
+;CHECK: ld.global.f32 f0, [array_float+4];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float]* @array_float, i32 0, i32 1
+  %x = load float* %i
+  ret float %x
+}
+
+define ptx_device double @t5_f64() {
+entry:
+;CHECK: ld.global.f64 fd0, [array_double+8];
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double]* @array_double, i32 0, i32 1
+  %x = load double* %i
+  ret double %x
+}
diff --git a/test/CodeGen/PTX/ld_float.ll b/test/CodeGen/PTX/ld_float.ll
deleted file mode 100644
index 62d2c36..0000000
--- a/test/CodeGen/PTX/ld_float.ll
+++ /dev/null
@@ -1,86 +0,0 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
-
-;CHECK: .extern .global .f32 array[];
-@array = external global [10 x float]
-
-;CHECK: .extern .const .f32 array_constant[];
-@array_constant = external addrspace(1) constant [10 x float]
-
-;CHECK: .extern .local .f32 array_local[];
-@array_local = external addrspace(2) global [10 x float]
-
-;CHECK: .extern .shared .f32 array_shared[];
-@array_shared = external addrspace(4) global [10 x float]
-
-define ptx_device float @t1(float* %p) {
-entry:
-;CHECK: ld.global.f32 f0, [r1];
-;CHECK-NEXT: ret;
-  %x = load float* %p
-  ret float %x
-}
-
-define ptx_device float @t2(float* %p) {
-entry:
-;CHECK: ld.global.f32 f0, [r1+4];
-;CHECK-NEXT: ret;
-  %i = getelementptr float* %p, i32 1
-  %x = load float* %i
-  ret float %x
-}
-
-define ptx_device float @t3(float* %p, i32 %q) {
-entry:
-;CHECK: shl.b32 r0, r2, 2;
-;CHECK-NEXT: add.s32 r0, r1, r0;
-;CHECK-NEXT: ld.global.f32 f0, [r0];
-;CHECK-NEXT: ret;
-  %i = getelementptr float* %p, i32 %q
-  %x = load float* %i
-  ret float %x
-}
-
-define ptx_device float @t4_global() {
-entry:
-;CHECK: ld.global.f32 f0, [array];
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float]* @array, i32 0, i32 0
-  %x = load float* %i
-  ret float %x
-}
-
-define ptx_device float @t4_const() {
-entry:
-;CHECK: ld.const.f32 f0, [array_constant];
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float] addrspace(1)* @array_constant, i32 0, i32 0
-  %x = load float addrspace(1)* %i
-  ret float %x
-}
-
-define ptx_device float @t4_local() {
-entry:
-;CHECK: ld.local.f32 f0, [array_local];
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0
-  %x = load float addrspace(2)* %i
-  ret float %x
-}
-
-define ptx_device float @t4_shared() {
-entry:
-;CHECK: ld.shared.f32 f0, [array_shared];
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0
-  %x = load float addrspace(4)* %i
-  ret float %x
-}
-
-define ptx_device float @t5() {
-entry:
-;CHECK: ld.global.f32 f0, [array+4];
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float]* @array, i32 0, i32 1
-  %x = load float* %i
-  ret float %x
-}
diff --git a/test/CodeGen/PTX/mov.ll b/test/CodeGen/PTX/mov.ll
index d201a78..00dcf19 100644
--- a/test/CodeGen/PTX/mov.ll
+++ b/test/CodeGen/PTX/mov.ll
@@ -1,25 +1,62 @@
 ; RUN: llc < %s -march=ptx | FileCheck %s
 
-define ptx_device i32 @t1() {
-; CHECK: mov.s32 r0, 0;
+define ptx_device i16 @t1_u16() {
+; CHECK: mov.u16 rh0, 0;
+; CHECK: ret;
+	ret i16 0
+}
+
+define ptx_device i32 @t1_u32() {
+; CHECK: mov.u32 r0, 0;
 ; CHECK: ret;
 	ret i32 0
 }
 
-define ptx_device i32 @t2(i32 %x) {
-; CHECK: mov.s32 r0, r1;
+define ptx_device i64 @t1_u64() {
+; CHECK: mov.u64 rd0, 0;
+; CHECK: ret;
+	ret i64 0
+}
+
+define ptx_device float @t1_f32() {
+; CHECK: mov.f32 f0, 0F00000000;
+; CHECK: ret;
+	ret float 0.0
+}
+
+define ptx_device double @t1_f64() {
+; CHECK: mov.f64 fd0, 0D0000000000000000;
+; CHECK: ret;
+	ret double 0.0
+}
+
+define ptx_device i16 @t2_u16(i16 %x) {
+; CHECK: mov.u16 rh0, rh1;
+; CHECK: ret;
+	ret i16 %x
+}
+
+define ptx_device i32 @t2_u32(i32 %x) {
+; CHECK: mov.u32 r0, r1;
 ; CHECK: ret;
 	ret i32 %x
 }
 
-define ptx_device float @t3() {
-; CHECK: mov.f32 f0, 0F00000000;
-; CHECK-NEXT: ret;
-	ret float 0.0
+define ptx_device i64 @t2_u64(i64 %x) {
+; CHECK: mov.u64 rd0, rd1;
+; CHECK: ret;
+	ret i64 %x
 }
 
-define ptx_device float @t4(float %x) {
+define ptx_device float @t3_f32(float %x) {
 ; CHECK: mov.f32 f0, f1;
 ; CHECK-NEXT: ret;
 	ret float %x
 }
+
+define ptx_device double @t3_f64(double %x) {
+; CHECK: mov.f64 fd0, fd1;
+; CHECK-NEXT: ret;
+	ret double %x
+}
+
diff --git a/test/CodeGen/PTX/mul.ll b/test/CodeGen/PTX/mul.ll
index 01871da..fd0788f 100644
--- a/test/CodeGen/PTX/mul.ll
+++ b/test/CodeGen/PTX/mul.ll
@@ -10,16 +10,30 @@
 ;	ret i32 %z
 ;}
 
-define ptx_device float @t3(float %x, float %y) {
+define ptx_device float @t1_f32(float %x, float %y) {
 ; CHECK: mul.f32 f0, f1, f2
 ; CHECK-NEXT: ret;
   %z = fmul float %x, %y
   ret float %z
 }
 
-define ptx_device float @t4(float %x) {
+define ptx_device double @t1_f64(double %x, double %y) {
+; CHECK: mul.f64 fd0, fd1, fd2
+; CHECK-NEXT: ret;
+  %z = fmul double %x, %y
+  ret double %z
+}
+
+define ptx_device float @t2_f32(float %x) {
 ; CHECK: mul.f32 f0, f1, 0F40A00000;
 ; CHECK-NEXT: ret;
   %z = fmul float %x, 5.0
   ret float %z
 }
+
+define ptx_device double @t2_f64(double %x) {
+; CHECK: mul.f64 fd0, fd1, 0D4014000000000000;
+; CHECK-NEXT: ret;
+  %z = fmul double %x, 5.0
+  ret double %z
+}
diff --git a/test/CodeGen/PTX/options.ll b/test/CodeGen/PTX/options.ll
index a14d5c9..1435537 100644
--- a/test/CodeGen/PTX/options.ll
+++ b/test/CodeGen/PTX/options.ll
@@ -1,5 +1,8 @@
-; RUN: llc < %s -march=ptx -ptx-version=2.0 | grep ".version 2.0"
-; RUN: llc < %s -march=ptx -ptx-target=sm_20 | grep ".target sm_20"
+; RUN: llc < %s -march=ptx -mattr=ptx14 | grep ".version 1.4"
+; RUN: llc < %s -march=ptx -mattr=ptx20 | grep ".version 2.0"
+; RUN: llc < %s -march=ptx -mattr=ptx21 | grep ".version 2.1"
+; RUN: llc < %s -march=ptx -mattr=sm20 | grep ".target sm_20"
+; RUN: llc < %s -march=ptx -mattr=sm13 | grep ".target sm_13"
 
 define ptx_device void @t1() {
 	ret void
diff --git a/test/CodeGen/PTX/st.ll b/test/CodeGen/PTX/st.ll
index 2cbacb9..bbe89a1 100644
--- a/test/CodeGen/PTX/st.ll
+++ b/test/CodeGen/PTX/st.ll
@@ -1,71 +1,382 @@
 ; RUN: llc < %s -march=ptx | FileCheck %s
 
-;CHECK: .extern .global .s32 array[];
-@array = external global [10 x i32]
+;CHECK: .extern .global .u16 array_i16[];
+@array_i16 = external global [10 x i16]
 
-;CHECK: .extern .const .s32 array_constant[];
-@array_constant = external addrspace(1) constant [10 x i32]
+;CHECK: .extern .const .u16 array_constant_i16[];
+@array_constant_i16 = external addrspace(1) constant [10 x i16]
 
-;CHECK: .extern .local .s32 array_local[];
-@array_local = external addrspace(2) global [10 x i32]
+;CHECK: .extern .local .u16 array_local_i16[];
+@array_local_i16 = external addrspace(2) global [10 x i16]
 
-;CHECK: .extern .shared .s32 array_shared[];
-@array_shared = external addrspace(4) global [10 x i32]
+;CHECK: .extern .shared .u16 array_shared_i16[];
+@array_shared_i16 = external addrspace(4) global [10 x i16]
 
-define ptx_device void @t1(i32* %p, i32 %x) {
+;CHECK: .extern .global .u32 array_i32[];
+@array_i32 = external global [10 x i32]
+
+;CHECK: .extern .const .u32 array_constant_i32[];
+@array_constant_i32 = external addrspace(1) constant [10 x i32]
+
+;CHECK: .extern .local .u32 array_local_i32[];
+@array_local_i32 = external addrspace(2) global [10 x i32]
+
+;CHECK: .extern .shared .u32 array_shared_i32[];
+@array_shared_i32 = external addrspace(4) global [10 x i32]
+
+;CHECK: .extern .global .u64 array_i64[];
+@array_i64 = external global [10 x i64]
+
+;CHECK: .extern .const .u64 array_constant_i64[];
+@array_constant_i64 = external addrspace(1) constant [10 x i64]
+
+;CHECK: .extern .local .u64 array_local_i64[];
+@array_local_i64 = external addrspace(2) global [10 x i64]
+
+;CHECK: .extern .shared .u64 array_shared_i64[];
+@array_shared_i64 = external addrspace(4) global [10 x i64]
+
+;CHECK: .extern .global .f32 array_float[];
+@array_float = external global [10 x float]
+
+;CHECK: .extern .const .f32 array_constant_float[];
+@array_constant_float = external addrspace(1) constant [10 x float]
+
+;CHECK: .extern .local .f32 array_local_float[];
+@array_local_float = external addrspace(2) global [10 x float]
+
+;CHECK: .extern .shared .f32 array_shared_float[];
+@array_shared_float = external addrspace(4) global [10 x float]
+
+;CHECK: .extern .global .f64 array_double[];
+@array_double = external global [10 x double]
+
+;CHECK: .extern .const .f64 array_constant_double[];
+@array_constant_double = external addrspace(1) constant [10 x double]
+
+;CHECK: .extern .local .f64 array_local_double[];
+@array_local_double = external addrspace(2) global [10 x double]
+
+;CHECK: .extern .shared .f64 array_shared_double[];
+@array_shared_double = external addrspace(4) global [10 x double]
+
+
+define ptx_device void @t1_u16(i16* %p, i16 %x) {
 entry:
-;CHECK: st.global.s32 [r1], r2;
+;CHECK: st.global.u16 [r1], rh1;
+;CHECK-NEXT: ret;
+  store i16 %x, i16* %p
+  ret void
+}
+
+define ptx_device void @t1_u32(i32* %p, i32 %x) {
+entry:
+;CHECK: st.global.u32 [r1], r2;
+;CHECK-NEXT: ret;
   store i32 %x, i32* %p
   ret void
 }
 
-define ptx_device void @t2(i32* %p, i32 %x) {
+define ptx_device void @t1_u64(i64* %p, i64 %x) {
 entry:
-;CHECK: st.global.s32 [r1+4], r2;
+;CHECK: st.global.u64 [r1], rd1;
+;CHECK-NEXT: ret;
+  store i64 %x, i64* %p
+  ret void
+}
+
+define ptx_device void @t1_f32(float* %p, float %x) {
+entry:
+;CHECK: st.global.f32 [r1], f1;
+;CHECK-NEXT: ret;
+  store float %x, float* %p
+  ret void
+}
+
+define ptx_device void @t1_f64(double* %p, double %x) {
+entry:
+;CHECK: st.global.f64 [r1], fd1;
+;CHECK-NEXT: ret;
+  store double %x, double* %p
+  ret void
+}
+
+define ptx_device void @t2_u16(i16* %p, i16 %x) {
+entry:
+;CHECK: st.global.u16 [r1+2], rh1;
+;CHECK-NEXT: ret;
+  %i = getelementptr i16* %p, i32 1
+  store i16 %x, i16* %i
+  ret void
+}
+
+define ptx_device void @t2_u32(i32* %p, i32 %x) {
+entry:
+;CHECK: st.global.u32 [r1+4], r2;
+;CHECK-NEXT: ret;
   %i = getelementptr i32* %p, i32 1
   store i32 %x, i32* %i
   ret void
 }
 
-define ptx_device void @t3(i32* %p, i32 %q, i32 %x) {
-;CHECK: .reg .s32 r0;
+define ptx_device void @t2_u64(i64* %p, i64 %x) {
+entry:
+;CHECK: st.global.u64 [r1+8], rd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr i64* %p, i32 1
+  store i64 %x, i64* %i
+  ret void
+}
+
+define ptx_device void @t2_f32(float* %p, float %x) {
+entry:
+;CHECK: st.global.f32 [r1+4], f1;
+;CHECK-NEXT: ret;
+  %i = getelementptr float* %p, i32 1
+  store float %x, float* %i
+  ret void
+}
+
+define ptx_device void @t2_f64(double* %p, double %x) {
+entry:
+;CHECK: st.global.f64 [r1+8], fd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr double* %p, i32 1
+  store double %x, double* %i
+  ret void
+}
+
+define ptx_device void @t3_u16(i16* %p, i32 %q, i16 %x) {
+entry:
+;CHECK: shl.b32 r0, r2, 1;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: st.global.u16 [r0], rh1;
+;CHECK-NEXT: ret;
+  %i = getelementptr i16* %p, i32 %q
+  store i16 %x, i16* %i
+  ret void
+}
+
+define ptx_device void @t3_u32(i32* %p, i32 %q, i32 %x) {
 entry:
 ;CHECK: shl.b32 r0, r2, 2;
-;CHECK: add.s32 r0, r1, r0;
-;CHECK: st.global.s32 [r0], r3;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: st.global.u32 [r0], r3;
+;CHECK-NEXT: ret;
   %i = getelementptr i32* %p, i32 %q
   store i32 %x, i32* %i
   ret void
 }
 
-define ptx_device void @t4_global(i32 %x) {
+define ptx_device void @t3_u64(i64* %p, i32 %q, i64 %x) {
 entry:
-;CHECK: st.global.s32 [array], r1;
-  %i = getelementptr [10 x i32]* @array, i32 0, i32 0
+;CHECK: shl.b32 r0, r2, 3;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: st.global.u64 [r0], rd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr i64* %p, i32 %q
+  store i64 %x, i64* %i
+  ret void
+}
+
+define ptx_device void @t3_f32(float* %p, i32 %q, float %x) {
+entry:
+;CHECK: shl.b32 r0, r2, 2;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: st.global.f32 [r0], f1;
+;CHECK-NEXT: ret;
+  %i = getelementptr float* %p, i32 %q
+  store float %x, float* %i
+  ret void
+}
+
+define ptx_device void @t3_f64(double* %p, i32 %q, double %x) {
+entry:
+;CHECK: shl.b32 r0, r2, 3;
+;CHECK-NEXT: add.u32 r0, r1, r0;
+;CHECK-NEXT: st.global.f64 [r0], fd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr double* %p, i32 %q
+  store double %x, double* %i
+  ret void
+}
+
+define ptx_device void @t4_global_u16(i16 %x) {
+entry:
+;CHECK: st.global.u16 [array_i16], rh1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16]* @array_i16, i16 0, i16 0
+  store i16 %x, i16* %i
+  ret void
+}
+
+define ptx_device void @t4_global_u32(i32 %x) {
+entry:
+;CHECK: st.global.u32 [array_i32], r1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 0
   store i32 %x, i32* %i
   ret void
 }
 
-define ptx_device void @t4_local(i32 %x) {
+define ptx_device void @t4_global_u64(i64 %x) {
 entry:
-;CHECK: st.local.s32 [array_local], r1;
-  %i = getelementptr [10 x i32] addrspace(2)* @array_local, i32 0, i32 0
+;CHECK: st.global.u64 [array_i64], rd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 0
+  store i64 %x, i64* %i
+  ret void
+}
+
+define ptx_device void @t4_global_f32(float %x) {
+entry:
+;CHECK: st.global.f32 [array_float], f1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float]* @array_float, i32 0, i32 0
+  store float %x, float* %i
+  ret void
+}
+
+define ptx_device void @t4_global_f64(double %x) {
+entry:
+;CHECK: st.global.f64 [array_double], fd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double]* @array_double, i32 0, i32 0
+  store double %x, double* %i
+  ret void
+}
+
+define ptx_device void @t4_local_u16(i16 %x) {
+entry:
+;CHECK: st.local.u16 [array_local_i16], rh1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16] addrspace(2)* @array_local_i16, i32 0, i32 0
+  store i16 %x, i16 addrspace(2)* %i
+  ret void
+}
+
+define ptx_device void @t4_local_u32(i32 %x) {
+entry:
+;CHECK: st.local.u32 [array_local_i32], r1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32] addrspace(2)* @array_local_i32, i32 0, i32 0
   store i32 %x, i32 addrspace(2)* %i
   ret void
 }
 
-define ptx_device void @t4_shared(i32 %x) {
+define ptx_device void @t4_local_u64(i64 %x) {
 entry:
-;CHECK: st.shared.s32 [array_shared], r1;
-  %i = getelementptr [10 x i32] addrspace(4)* @array_shared, i32 0, i32 0
+;CHECK: st.local.u64 [array_local_i64], rd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64] addrspace(2)* @array_local_i64, i32 0, i32 0
+  store i64 %x, i64 addrspace(2)* %i
+  ret void
+}
+
+define ptx_device void @t4_local_f32(float %x) {
+entry:
+;CHECK: st.local.f32 [array_local_float], f1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float] addrspace(2)* @array_local_float, i32 0, i32 0
+  store float %x, float addrspace(2)* %i
+  ret void
+}
+
+define ptx_device void @t4_local_f64(double %x) {
+entry:
+;CHECK: st.local.f64 [array_local_double], fd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double] addrspace(2)* @array_local_double, i32 0, i32 0
+  store double %x, double addrspace(2)* %i
+  ret void
+}
+
+define ptx_device void @t4_shared_u16(i16 %x) {
+entry:
+;CHECK: st.shared.u16 [array_shared_i16], rh1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16] addrspace(4)* @array_shared_i16, i32 0, i32 0
+  store i16 %x, i16 addrspace(4)* %i
+  ret void
+}
+
+define ptx_device void @t4_shared_u32(i32 %x) {
+entry:
+;CHECK: st.shared.u32 [array_shared_i32], r1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32] addrspace(4)* @array_shared_i32, i32 0, i32 0
   store i32 %x, i32 addrspace(4)* %i
   ret void
 }
 
-define ptx_device void @t5(i32 %x) {
+define ptx_device void @t4_shared_u64(i64 %x) {
 entry:
-;CHECK: st.global.s32 [array+4], r1;
-  %i = getelementptr [10 x i32]* @array, i32 0, i32 1
+;CHECK: st.shared.u64 [array_shared_i64], rd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64] addrspace(4)* @array_shared_i64, i32 0, i32 0
+  store i64 %x, i64 addrspace(4)* %i
+  ret void
+}
+
+define ptx_device void @t4_shared_f32(float %x) {
+entry:
+;CHECK: st.shared.f32 [array_shared_float], f1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float] addrspace(4)* @array_shared_float, i32 0, i32 0
+  store float %x, float addrspace(4)* %i
+  ret void
+}
+
+define ptx_device void @t4_shared_f64(double %x) {
+entry:
+;CHECK: st.shared.f64 [array_shared_double], fd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double] addrspace(4)* @array_shared_double, i32 0, i32 0
+  store double %x, double addrspace(4)* %i
+  ret void
+}
+
+define ptx_device void @t5_u16(i16 %x) {
+entry:
+;CHECK: st.global.u16 [array_i16+2], rh1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i16]* @array_i16, i32 0, i32 1
+  store i16 %x, i16* %i
+  ret void
+}
+
+define ptx_device void @t5_u32(i32 %x) {
+entry:
+;CHECK: st.global.u32 [array_i32+4], r1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i32]* @array_i32, i32 0, i32 1
   store i32 %x, i32* %i
   ret void
 }
+
+define ptx_device void @t5_u64(i64 %x) {
+entry:
+;CHECK: st.global.u64 [array_i64+8], rd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x i64]* @array_i64, i32 0, i32 1
+  store i64 %x, i64* %i
+  ret void
+}
+
+define ptx_device void @t5_f32(float %x) {
+entry:
+;CHECK: st.global.f32 [array_float+4], f1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x float]* @array_float, i32 0, i32 1
+  store float %x, float* %i
+  ret void
+}
+
+define ptx_device void @t5_f64(double %x) {
+entry:
+;CHECK: st.global.f64 [array_double+8], fd1;
+;CHECK-NEXT: ret;
+  %i = getelementptr [10 x double]* @array_double, i32 0, i32 1
+  store double %x, double* %i
+  ret void
+}
diff --git a/test/CodeGen/PTX/st_float.ll b/test/CodeGen/PTX/st_float.ll
deleted file mode 100644
index f0e0010..0000000
--- a/test/CodeGen/PTX/st_float.ll
+++ /dev/null
@@ -1,78 +0,0 @@
-; RUN: llc < %s -march=ptx | FileCheck %s
-
-;CHECK: .extern .global .f32 array[];
-@array = external global [10 x float]
-
-;CHECK: .extern .const .f32 array_constant[];
-@array_constant = external addrspace(1) constant [10 x float]
-
-;CHECK: .extern .local .f32 array_local[];
-@array_local = external addrspace(2) global [10 x float]
-
-;CHECK: .extern .shared .f32 array_shared[];
-@array_shared = external addrspace(4) global [10 x float]
-
-define ptx_device void @t1(float* %p, float %x) {
-entry:
-;CHECK: st.global.f32 [r1], f1;
-;CHECK-NEXT: ret;
-  store float %x, float* %p
-  ret void
-}
-
-define ptx_device void @t2(float* %p, float %x) {
-entry:
-;CHECK: st.global.f32 [r1+4], f1;
-;CHECK-NEXT: ret;
-  %i = getelementptr float* %p, i32 1
-  store float %x, float* %i
-  ret void
-}
-
-define ptx_device void @t3(float* %p, i32 %q, float %x) {
-;CHECK: .reg .s32 r0;
-entry:
-;CHECK: shl.b32 r0, r2, 2;
-;CHECK-NEXT: add.s32 r0, r1, r0;
-;CHECK-NEXT: st.global.f32 [r0], f1;
-;CHECK-NEXT: ret;
-  %i = getelementptr float* %p, i32 %q
-  store float %x, float* %i
-  ret void
-}
-
-define ptx_device void @t4_global(float %x) {
-entry:
-;CHECK: st.global.f32 [array], f1;
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float]* @array, i32 0, i32 0
-  store float %x, float* %i
-  ret void
-}
-
-define ptx_device void @t4_local(float %x) {
-entry:
-;CHECK: st.local.f32 [array_local], f1;
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float] addrspace(2)* @array_local, i32 0, i32 0
-  store float %x, float addrspace(2)* %i
-  ret void
-}
-
-define ptx_device void @t4_shared(float %x) {
-entry:
-;CHECK: st.shared.f32 [array_shared], f1;
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float] addrspace(4)* @array_shared, i32 0, i32 0
-  store float %x, float addrspace(4)* %i
-  ret void
-}
-
-define ptx_device void @t5(float %x) {
-entry:
-;CHECK: st.global.f32 [array+4], f1;
-;CHECK-NEXT: ret;
-  %i = getelementptr [10 x float]* @array, i32 0, i32 1
-  store float %x, float* %i
-  ret void
-}
diff --git a/test/CodeGen/PTX/sub.ll b/test/CodeGen/PTX/sub.ll
index e11deca..4810e4f 100644
--- a/test/CodeGen/PTX/sub.ll
+++ b/test/CodeGen/PTX/sub.ll
@@ -1,29 +1,71 @@
 ; RUN: llc < %s -march=ptx | FileCheck %s
 
-define ptx_device i32 @t1(i32 %x, i32 %y) {
-;CHECK: sub.s32 r0, r1, r2;
+define ptx_device i16 @t1_u16(i16 %x, i16 %y) {
+; CHECK: sub.u16 rh0, rh1, rh2;
+; CHECK-NEXT: ret;
+	%z = sub i16 %x, %y
+	ret i16 %z
+}
+
+define ptx_device i32 @t1_u32(i32 %x, i32 %y) {
+; CHECK: sub.u32 r0, r1, r2;
+; CHECK-NEXT: ret;
 	%z = sub i32 %x, %y
-;CHECK: ret;
 	ret i32 %z
 }
 
-define ptx_device i32 @t2(i32 %x) {
-;CHECK: add.s32 r0, r1, -1;
-	%z = sub i32 %x, 1
-;CHECK: ret;
-	ret i32 %z
+define ptx_device i64 @t1_u64(i64 %x, i64 %y) {
+; CHECK: sub.u64 rd0, rd1, rd2;
+; CHECK-NEXT: ret;
+	%z = sub i64 %x, %y
+	ret i64 %z
 }
 
-define ptx_device float @t3(float %x, float %y) {
+define ptx_device float @t1_f32(float %x, float %y) {
 ; CHECK: sub.f32 f0, f1, f2
 ; CHECK-NEXT: ret;
   %z = fsub float %x, %y
   ret float %z
 }
 
-define ptx_device float @t4(float %x) {
+define ptx_device double @t1_f64(double %x, double %y) {
+; CHECK: sub.f64 fd0, fd1, fd2
+; CHECK-NEXT: ret;
+  %z = fsub double %x, %y
+  ret double %z
+}
+
+define ptx_device i16 @t2_u16(i16 %x) {
+; CHECK: add.u16 rh0, rh1, -1;
+; CHECK-NEXT: ret;
+	%z = sub i16 %x, 1
+	ret i16 %z
+}
+
+define ptx_device i32 @t2_u32(i32 %x) {
+; CHECK: add.u32 r0, r1, -1;
+; CHECK-NEXT: ret;
+	%z = sub i32 %x, 1
+	ret i32 %z
+}
+
+define ptx_device i64 @t2_u64(i64 %x) {
+; CHECK: add.u64 rd0, rd1, -1;
+; CHECK-NEXT: ret;
+	%z = sub i64 %x, 1
+	ret i64 %z
+}
+
+define ptx_device float @t2_f32(float %x) {
 ; CHECK: add.f32 f0, f1, 0FBF800000;
 ; CHECK-NEXT: ret;
   %z = fsub float %x, 1.0
   ret float %z
 }
+
+define ptx_device double @t2_f64(double %x) {
+; CHECK: add.f64 fd0, fd1, 0DBFF0000000000000;
+; CHECK-NEXT: ret;
+  %z = fsub double %x, 1.0
+  ret double %z
+}
