AMDGPU: Add VI i16 support

Patch By: Wei Ding

Differential Revision: https://reviews.llvm.org/D18049

llvm-svn: 286464
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index e7d6ef3..7a208d7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -493,6 +493,8 @@
 
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
 
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+
 class PredicateControl {
   Predicate SubtargetPredicate;
   Predicate SIAssemblerPredicate = isSICI;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 2e43d42..5a87148 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -587,19 +587,32 @@
 
 bool AMDGPUTargetLowering::isTruncateFree(EVT Source, EVT Dest) const {
   // Truncate is just accessing a subregister.
-  return Dest.bitsLT(Source) && (Dest.getSizeInBits() % 32 == 0);
+
+  unsigned SrcSize = Source.getSizeInBits();
+  unsigned DestSize = Dest.getSizeInBits();
+
+  return DestSize < SrcSize && DestSize % 32 == 0 ;
 }
 
 bool AMDGPUTargetLowering::isTruncateFree(Type *Source, Type *Dest) const {
   // Truncate is just accessing a subregister.
-  return Dest->getPrimitiveSizeInBits() < Source->getPrimitiveSizeInBits() &&
-         (Dest->getPrimitiveSizeInBits() % 32 == 0);
+
+  unsigned SrcSize = Source->getScalarSizeInBits();
+  unsigned DestSize = Dest->getScalarSizeInBits();
+
+  if (DestSize== 16 && Subtarget->has16BitInsts())
+    return SrcSize >= 32;
+
+  return DestSize < SrcSize && DestSize % 32 == 0;
 }
 
 bool AMDGPUTargetLowering::isZExtFree(Type *Src, Type *Dest) const {
   unsigned SrcSize = Src->getScalarSizeInBits();
   unsigned DestSize = Dest->getScalarSizeInBits();
 
+  if (SrcSize == 16 && Subtarget->has16BitInsts())
+    return DestSize >= 32;
+
   return SrcSize == 32 && DestSize == 64;
 }
 
@@ -608,6 +621,10 @@
   // practical purposes, the extra mov 0 to load a 64-bit is free.  As used,
   // this will enable reducing 64-bit operations the 32-bit, which is always
   // good.
+
+  if (Src == MVT::i16)
+    return Dest == MVT::i32 ||Dest == MVT::i64 ;
+
   return Src == MVT::i32 && Dest == MVT::i64;
 }
 
@@ -2447,6 +2464,10 @@
   if (VT.isVector() || Size > 64)
     return SDValue();
 
+  // Subtargets with 16-bit insts have i16 mul/mad; leave <= 16-bit types alone.
+  if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
+    return SDValue();
+
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index cc9cce5..c2544c2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -529,14 +529,14 @@
 
   def : Pat <
     (fcopysign f32:$src0, f32:$src1),
-    (BFI_INT (LoadImm32 0x7fffffff), $src0, $src1)
+    (BFI_INT (LoadImm32 (i32 0x7fffffff)), $src0, $src1)
   >;
 
   def : Pat <
     (f64 (fcopysign f64:$src0, f64:$src1)),
     (REG_SEQUENCE RC64,
       (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
-      (BFI_INT (LoadImm32 0x7fffffff),
+      (BFI_INT (LoadImm32 (i32 0x7fffffff)),
                (i32 (EXTRACT_SUBREG $src0, sub1)),
                (i32 (EXTRACT_SUBREG $src1, sub1))), sub1)
   >;
@@ -545,7 +545,7 @@
     (f64 (fcopysign f64:$src0, f32:$src1)),
     (REG_SEQUENCE RC64,
       (i32 (EXTRACT_SUBREG $src0, sub0)), sub0,
-      (BFI_INT (LoadImm32 0x7fffffff),
+      (BFI_INT (LoadImm32 (i32 0x7fffffff)),
                (i32 (EXTRACT_SUBREG $src0, sub1)),
                $src1), sub1)
   >;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 42d16a5..928b5d2 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -708,13 +708,13 @@
 // int_SI_vs_load_input
 def : Pat<
   (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
-  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, 0, imm:$attr_offset, 0, 0, 0)
+  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
 >;
 
 // Offset in an 32-bit VGPR
 def : Pat <
   (SIload_constant v4i32:$sbase, i32:$voff),
-  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, 0, 0, 0, 0, 0)
+  (BUFFER_LOAD_DWORD_OFFEN $voff, $sbase, (i32 0), 0, 0, 0, 0)
 >;
 
 
@@ -914,7 +914,7 @@
 >;
 
 
-class MUBUFLoad_Pattern <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
+class MUBUFLoad_PatternADDR64 <MUBUF_Pseudo Instr_ADDR64, ValueType vt,
                               PatFrag constant_ld> : Pat <
      (vt (constant_ld (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset,
                                    i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
@@ -936,15 +936,34 @@
 }
 
 let Predicates = [isSICI] in {
-def : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
-def : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
-def : MUBUFLoad_Pattern <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
-def : MUBUFLoad_Pattern <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SBYTE_ADDR64, i32, sextloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_UBYTE_ADDR64, i32, az_extloadi8_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_SSHORT_ADDR64, i32, sextloadi16_constant>;
+def : MUBUFLoad_PatternADDR64 <BUFFER_LOAD_USHORT_ADDR64, i32, az_extloadi16_constant>;
 
 defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORD_ADDR64, BUFFER_LOAD_DWORD_OFFSET, i32, mubuf_load_atomic>;
 defm : MUBUFLoad_Atomic_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, BUFFER_LOAD_DWORDX2_OFFSET, i64, mubuf_load_atomic>;
 } // End Predicates = [isSICI]
 
+multiclass MUBUFLoad_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+                               PatFrag ld> {
+
+  def : Pat <
+    (vt (ld (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                          i16:$offset, i1:$glc, i1:$slc, i1:$tfe))),
+    (Instr_OFFSET $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+  >;
+}
+
+let Predicates = [Has16BitInsts] in {
+
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, sextloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, az_extloadi8_constant>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_SBYTE_OFFSET, i16, mubuf_sextloadi8>;
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_UBYTE_OFFSET, i16, mubuf_az_extloadi8>;
+
+} // End Predicates = [Has16BitInsts]
+
 class MUBUFScratchLoadPat <MUBUF_Pseudo Instr, ValueType vt, PatFrag ld> : Pat <
   (vt (ld (MUBUFScratch v4i32:$srsrc, i32:$vaddr,
                         i32:$soffset, u16imm:$offset))),
@@ -953,6 +972,8 @@
 
 def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i32, sextloadi8_private>;
 def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i32, extloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_SBYTE_OFFEN, i16, sextloadi8_private>;
+def : MUBUFScratchLoadPat <BUFFER_LOAD_UBYTE_OFFEN, i16, extloadi8_private>;
 def : MUBUFScratchLoadPat <BUFFER_LOAD_SSHORT_OFFEN, i32, sextloadi16_private>;
 def : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, i32, extloadi16_private>;
 def : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, i32, load_private>;
@@ -1025,6 +1046,20 @@
 defm : MUBUFStore_Atomic_Pattern <BUFFER_STORE_DWORDX2_ADDR64, BUFFER_STORE_DWORDX2_OFFSET, i64, global_store_atomic>;
 } // End Predicates = [isSICI]
 
+
+multiclass MUBUFStore_Pattern <MUBUF_Pseudo Instr_OFFSET, ValueType vt,
+                               PatFrag st> {
+
+  def : Pat <
+    (st vt:$vdata, (MUBUFOffset v4i32:$srsrc, i32:$soffset,
+                                      i16:$offset, i1:$glc, i1:$slc, i1:$tfe)),
+    (Instr_OFFSET $vdata, $srsrc, $soffset, $offset, $glc, $slc, $tfe)
+  >;
+}
+
+defm : MUBUFStore_Pattern <BUFFER_STORE_BYTE_OFFSET, i16, truncstorei8_global>;
+defm : MUBUFStore_Pattern <BUFFER_STORE_SHORT_OFFSET, i16, global_store>;
+
 class MUBUFScratchStorePat <MUBUF_Pseudo Instr, ValueType vt, PatFrag st> : Pat <
   (st vt:$value, (MUBUFScratch v4i32:$srsrc, i32:$vaddr, i32:$soffset,
                                u16imm:$offset)),
@@ -1033,6 +1068,8 @@
 
 def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i32, truncstorei8_private>;
 def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i32, truncstorei16_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_BYTE_OFFEN, i16, truncstorei8_private>;
+def : MUBUFScratchStorePat <BUFFER_STORE_SHORT_OFFEN, i16, store_private>;
 def : MUBUFScratchStorePat <BUFFER_STORE_DWORD_OFFEN, i32, store_private>;
 def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX2_OFFEN, v2i32, store_private>;
 def : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, v4i32, store_private>;
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 54935bb..a077001 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -489,8 +489,12 @@
 
 def : DSReadPat <DS_READ_I8,  i32, si_sextload_local_i8>;
 def : DSReadPat <DS_READ_U8,  i32, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I8,  i16, si_sextload_local_i8>;
+def : DSReadPat <DS_READ_U8,  i16, si_az_extload_local_i8>;
+def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
 def : DSReadPat <DS_READ_I16, i32, si_sextload_local_i16>;
 def : DSReadPat <DS_READ_U16, i32, si_az_extload_local_i16>;
+def : DSReadPat <DS_READ_U16, i16, si_load_local>;
 def : DSReadPat <DS_READ_B32, i32, si_load_local>;
 
 let AddedComplexity = 100 in {
@@ -512,6 +516,8 @@
 
 def : DSWritePat <DS_WRITE_B8, i32, si_truncstore_local_i8>;
 def : DSWritePat <DS_WRITE_B16, i32, si_truncstore_local_i16>;
+def : DSWritePat <DS_WRITE_B8, i16, si_truncstore_local_i8>;
+def : DSWritePat <DS_WRITE_B16, i16, si_store_local>;
 def : DSWritePat <DS_WRITE_B32, i32, si_store_local>;
 
 let AddedComplexity = 100 in {
@@ -522,8 +528,8 @@
 def : Pat <
   (si_store_local v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0,
                                                                i8:$offset1)),
-  (DS_WRITE2_B32 $ptr, (EXTRACT_SUBREG $value, sub0),
-                       (EXTRACT_SUBREG $value, sub1), $offset0, $offset1,
+  (DS_WRITE2_B32 $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
+                       (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
                        (i1 0))
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7b54c61..4a86b1e 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -341,6 +341,8 @@
 
 def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i32>;
 def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i32>;
+def : FlatLoadPat <FLAT_LOAD_UBYTE, flat_az_extloadi8, i16>;
+def : FlatLoadPat <FLAT_LOAD_SBYTE, flat_sextloadi8, i16>;
 def : FlatLoadPat <FLAT_LOAD_USHORT, flat_az_extloadi16, i32>;
 def : FlatLoadPat <FLAT_LOAD_SSHORT, flat_sextloadi16, i32>;
 def : FlatLoadPat <FLAT_LOAD_DWORD, flat_load, i32>;
@@ -389,6 +391,10 @@
 
 } // End Predicates = [isCIVI]
 
+let Predicates = [isVI] in {
+  def : FlatStorePat <FLAT_STORE_BYTE, flat_truncstorei8, i16>;
+  def : FlatStorePat <FLAT_STORE_SHORT, flat_store, i16>;
+}
 
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3b84e38..ac13bd2 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -78,6 +78,9 @@
   addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
   addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
 
+  if (Subtarget->has16BitInsts())
+    addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
+
   computeRegisterProperties(STI.getRegisterInfo());
 
   // We need to custom lower vector stores from local memory
@@ -221,6 +224,55 @@
   setOperationAction(ISD::FDIV, MVT::f32, Custom);
   setOperationAction(ISD::FDIV, MVT::f64, Custom);
 
+  if (Subtarget->has16BitInsts()) {
+    setOperationAction(ISD::Constant, MVT::i16, Legal);
+
+    setOperationAction(ISD::SMIN, MVT::i16, Legal);
+    setOperationAction(ISD::SMAX, MVT::i16, Legal);
+
+    setOperationAction(ISD::UMIN, MVT::i16, Legal);
+    setOperationAction(ISD::UMAX, MVT::i16, Legal);
+
+    setOperationAction(ISD::SETCC, MVT::i16, Promote);
+    AddPromotedToType(ISD::SETCC, MVT::i16, MVT::i32);
+
+    setOperationAction(ISD::SIGN_EXTEND, MVT::i16, Promote);
+    AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
+
+    setOperationAction(ISD::ROTR, MVT::i16, Promote);
+    setOperationAction(ISD::ROTL, MVT::i16, Promote);
+
+    setOperationAction(ISD::SDIV, MVT::i16, Promote);
+    setOperationAction(ISD::UDIV, MVT::i16, Promote);
+    setOperationAction(ISD::SREM, MVT::i16, Promote);
+    setOperationAction(ISD::UREM, MVT::i16, Promote);
+
+    setOperationAction(ISD::BSWAP, MVT::i16, Promote);
+    setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
+
+    setOperationAction(ISD::CTTZ, MVT::i16, Promote);
+    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Promote);
+    setOperationAction(ISD::CTLZ, MVT::i16, Promote);
+    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Promote);
+
+    setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
+
+    setOperationAction(ISD::BR_CC, MVT::i16, Expand);
+
+    setOperationAction(ISD::LOAD, MVT::i16, Custom);
+
+    setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+
+    setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+    AddPromotedToType(ISD::UINT_TO_FP, MVT::i16, MVT::i32);
+    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
+    AddPromotedToType(ISD::SINT_TO_FP, MVT::i16, MVT::i32);
+    setOperationAction(ISD::FP16_TO_FP, MVT::i16, Promote);
+    AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
+    setOperationAction(ISD::FP_TO_FP16, MVT::i16, Promote);
+    AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
+  }
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -2558,7 +2610,6 @@
   EVT MemVT = Load->getMemoryVT();
 
   if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
-    assert(MemVT == MVT::i1 && "Only i1 non-extloads expected");
     // FIXME: Copied from PPC
     // First, load into 32 bits, then truncate to 1 bit.
 
@@ -2566,8 +2617,10 @@
     SDValue BasePtr = Load->getBasePtr();
     MachineMemOperand *MMO = Load->getMemOperand();
 
+    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
+
     SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
-                                   BasePtr, MVT::i8, MMO);
+                                   BasePtr, RealMemVT, MMO);
 
     SDValue Ops[] = {
       DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
@@ -3381,8 +3434,23 @@
   }
 
   EVT VT = K0->getValueType(0);
-  return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
-                     Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+
+  MVT NVT = MVT::i32;
+  unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+
+  SDValue Tmp1, Tmp2, Tmp3;
+  Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+  Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+  Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+
+  if (VT == MVT::i16) {
+    Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
+                       Tmp1, Tmp2, Tmp3);
+
+    return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
+  } else
+    return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
+                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
 }
 
 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index f19e99e..d770bd4 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1128,7 +1128,6 @@
 
 include "SIInstructions.td"
 include "CIInstructions.td"
-include "VIInstructions.td"
 
 include "DSInstructions.td"
 include "MIMGInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 4122eb9..b758a57 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -374,7 +374,7 @@
 
 def : Pat <
   (int_AMDGPU_kilp),
-  (SI_KILL 0xbf800000)
+  (SI_KILL (i32 0xbf800000))
 >;
 
 def : Pat <
@@ -555,7 +555,7 @@
 def : Pat <
   (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
                (f32 FP_ZERO), (f32 FP_ONE)),
-  (V_ADD_F32_e64 $src0_modifiers, $src0, 0, 0, 1, $omod)
+  (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
 >;
 
 /********** ================================ **********/
@@ -566,7 +566,7 @@
 
 def : Pat <
   (fneg (fabs f32:$src)),
-  (S_OR_B32 $src, (S_MOV_B32 0x80000000)) // Set sign bit
+  (S_OR_B32 $src, (S_MOV_B32(i32 0x80000000))) // Set sign bit
 >;
 
 // FIXME: Should use S_OR_B32
@@ -575,19 +575,19 @@
   (REG_SEQUENCE VReg_64,
     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
     sub0,
-    (V_OR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
-                  (V_MOV_B32_e32 0x80000000)), // Set sign bit.
+    (V_OR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+                  (V_MOV_B32_e32 (i32 0x80000000))), // Set sign bit.
     sub1)
 >;
 
 def : Pat <
   (fabs f32:$src),
-  (V_AND_B32_e64 $src, (V_MOV_B32_e32 0x7fffffff))
+  (V_AND_B32_e64 $src, (V_MOV_B32_e32 (i32 0x7fffffff)))
 >;
 
 def : Pat <
   (fneg f32:$src),
-  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000))
+  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 (i32 0x80000000)))
 >;
 
 def : Pat <
@@ -595,8 +595,8 @@
   (REG_SEQUENCE VReg_64,
     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
     sub0,
-    (V_AND_B32_e64 (EXTRACT_SUBREG f64:$src, sub1),
-                   (V_MOV_B32_e32 0x7fffffff)), // Set sign bit.
+    (V_AND_B32_e64 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+                   (V_MOV_B32_e32 (i32 0x7fffffff))), // Clear sign bit.
      sub1)
 >;
 
@@ -605,8 +605,8 @@
   (REG_SEQUENCE VReg_64,
     (i32 (EXTRACT_SUBREG f64:$src, sub0)),
     sub0,
-    (V_XOR_B32_e32 (EXTRACT_SUBREG f64:$src, sub1),
-                   (V_MOV_B32_e32 0x80000000)),
+    (V_XOR_B32_e32 (i32 (EXTRACT_SUBREG f64:$src, sub1)),
+                   (i32 (V_MOV_B32_e32 (i32 0x80000000)))),
     sub1)
 >;
 
@@ -666,21 +666,21 @@
 def : Pat <
   (int_AMDGPU_cube v4f32:$src),
   (REG_SEQUENCE VReg_128,
-    (V_CUBETC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
-                  0 /* src1_modifiers */, (EXTRACT_SUBREG $src, sub1),
-                  0 /* src2_modifiers */, (EXTRACT_SUBREG $src, sub2),
+    (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
                   0 /* clamp */, 0 /* omod */), sub0,
-    (V_CUBESC_F32 0 /* src0_modifiers */, (EXTRACT_SUBREG $src, sub0),
-                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
-                  0 /* src2_modifiers */,(EXTRACT_SUBREG $src, sub2),
+    (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
                   0 /* clamp */, 0 /* omod */), sub1,
-    (V_CUBEMA_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0),
-                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
-                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2),
+    (V_CUBEMA_F32 0 /* src0_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
                   0 /* clamp */, 0 /* omod */), sub2,
-    (V_CUBEID_F32 0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub0),
-                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub1),
-                  0 /* src1_modifiers */,(EXTRACT_SUBREG $src, sub2),
+    (V_CUBEID_F32 0 /* src0_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
+                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
+                  0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
                   0 /* clamp */, 0 /* omod */), sub3)
 >;
 
@@ -701,7 +701,7 @@
 def : Pat <
   (AMDGPUurecip i32:$src0),
   (V_CVT_U32_F32_e32
-    (V_MUL_F32_e32 CONST.FP_UINT_MAX_PLUS_1,
+    (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
                    (V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
 >;
 
@@ -767,32 +767,37 @@
 //===----------------------------------------------------------------------===//
 
 def : Pat<(i32 (sext_inreg i32:$src, i1)),
-  (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
+  (S_BFE_I32 i32:$src, (i32 65536))>; // 0 | 1 << 16
 
 // Handle sext_inreg in i64
 def : Pat <
   (i64 (sext_inreg i64:$src, i1)),
-  (S_BFE_I64 i64:$src, 0x10000) // 0 | 1 << 16
+  (S_BFE_I64 i64:$src, (i32 0x10000)) // 0 | 1 << 16
+>;
+
+def : Pat <
+  (i16 (sext_inreg i16:$src, i8)),
+  (S_BFE_I32 $src, (i32 0x80000)) // 0 | 8 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i8)),
-  (S_BFE_I64 i64:$src, 0x80000) // 0 | 8 << 16
+  (S_BFE_I64 i64:$src, (i32 0x80000)) // 0 | 8 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i16)),
-  (S_BFE_I64 i64:$src, 0x100000) // 0 | 16 << 16
+  (S_BFE_I64 i64:$src, (i32 0x100000)) // 0 | 16 << 16
 >;
 
 def : Pat <
   (i64 (sext_inreg i64:$src, i32)),
-  (S_BFE_I64 i64:$src, 0x200000) // 0 | 32 << 16
+  (S_BFE_I64 i64:$src, (i32 0x200000)) // 0 | 32 << 16
 >;
 
 def : Pat <
   (i64 (zext i32:$src)),
-  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 0), sub1)
+  (REG_SEQUENCE SReg_64, $src, sub0, (S_MOV_B32 (i32 0)), sub1)
 >;
 
 def : Pat <
@@ -804,7 +809,7 @@
   (i64 (ext i1:$src)),
     (REG_SEQUENCE VReg_64,
       (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src), sub0,
-      (S_MOV_B32 0), sub1)
+      (S_MOV_B32 (i32 0)), sub1)
 >;
 
 
@@ -816,25 +821,25 @@
 def : Pat <
   (i64 (sext i32:$src)),
     (REG_SEQUENCE SReg_64, $src, sub0,
-    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, 31), SReg_32_XM0)), sub1)
+    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 $src, (i32 31)), SReg_32_XM0)), sub1)
 >;
 
 def : Pat <
   (i64 (sext i1:$src)),
   (REG_SEQUENCE VReg_64,
-    (V_CNDMASK_B32_e64 0, -1, $src), sub0,
-    (V_CNDMASK_B32_e64 0, -1, $src), sub1)
+    (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub0,
+    (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src), sub1)
 >;
 
-class FPToI1Pat<Instruction Inst, int KOne, ValueType vt, SDPatternOperator fp_to_int> : Pat <
+class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat <
   (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))),
-  (i1 (Inst 0, KOne, $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
+  (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE))
 >;
 
-def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, f32, fp_to_uint>;
-def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, f32, fp_to_sint>;
-def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, f64, fp_to_uint>;
-def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, f64, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_NEG_ONE, i32, f32, fp_to_sint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_ONE, i64, f64, fp_to_uint>;
+def : FPToI1Pat<V_CMP_EQ_F64_e64, CONST.FP64_NEG_ONE, i64, f64, fp_to_sint>;
 
 // If we need to perform a logical operation on i1 values, we need to
 // use vector comparisons since there is only one SCC register. Vector
@@ -859,12 +864,12 @@
 
 def : Pat <
   (f32 (sint_to_fp i1:$src)),
-  (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_NEG_ONE, $src)
+  (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_NEG_ONE), $src)
 >;
 
 def : Pat <
   (f32 (uint_to_fp i1:$src)),
-  (V_CNDMASK_B32_e64 (i32 0), CONST.FP32_ONE, $src)
+  (V_CNDMASK_B32_e64 (i32 0), (i32 CONST.FP32_ONE), $src)
 >;
 
 def : Pat <
@@ -888,20 +893,20 @@
 
 def : Pat <
   (i1 (trunc i32:$a)),
-  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), 1)
+  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
 >;
 
 def : Pat <
   (i1 (trunc i64:$a)),
   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
-                    (EXTRACT_SUBREG $a, sub0)), 1)
+                    (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
 >;
 
 def : Pat <
   (i32 (bswap i32:$a)),
-  (V_BFI_B32 (S_MOV_B32 0x00ff00ff),
-             (V_ALIGNBIT_B32 $a, $a, 24),
-             (V_ALIGNBIT_B32 $a, $a, 8))
+  (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+             (V_ALIGNBIT_B32 $a, $a, (i32 24)),
+             (V_ALIGNBIT_B32 $a, $a, (i32 8)))
 >;
 
 def : Pat <
@@ -917,7 +922,7 @@
 
   def : Pat <
     (vt (add (vt (shl 1, vt:$a)), -1)),
-    (BFM $a, (MOV 0))
+    (BFM $a, (MOV (i32 0)))
   >;
 }
 
@@ -928,7 +933,7 @@
 
 def : Pat<
   (fcanonicalize f32:$src),
-  (V_MUL_F32_e64 0, CONST.FP32_ONE, 0, $src, 0, 0)
+  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
 >;
 
 def : Pat<
@@ -963,7 +968,7 @@
              (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
              DSTCLAMP.NONE, DSTOMOD.NONE),
          $x,
-         (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, 3/*NaN*/)),
+         (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
       DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 7d3634e..a5ba0ef 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -123,7 +123,7 @@
 // TODO: Do we need to set DwarfRegAlias on register tuples?
 
 // SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
                             (add (sequence "SGPR%u", 0, 103))> {
   let AllocationPriority = 1;
 }
@@ -190,7 +190,8 @@
                                (add (decimate (shl TTMP_32, 3), 4))]>;
 
 // VGPR 32-bit registers
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+// i16 only on VI+
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
                             (add (sequence "VGPR%u", 0, 255))> {
   let AllocationPriority = 1;
   let Size = 32;
@@ -258,8 +259,8 @@
 }
 
 // Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
-  (add SReg_32_XM0, M0)> {
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32,
+  (add SReg_32_XM0, M0, VCC_LO, VCC_HI, EXEC_LO, EXEC_HI, FLAT_SCR_LO, FLAT_SCR_HI)> {
   let AllocationPriority = 1;
 }
 
@@ -346,7 +347,7 @@
   let Size = 32;
 }
 
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VGPR_32, SReg_32)> {
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16], 32, (add VGPR_32, SReg_32)> {
   let isAllocatable = 0;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index e38a11d..2486fbf 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -879,7 +879,7 @@
   (i64 (ctpop i64:$src)),
     (i64 (REG_SEQUENCE SReg_64,
      (i32 (COPY_TO_REGCLASS (S_BCNT1_I32_B64 $src), SReg_32)), sub0,
-     (S_MOV_B32 0), sub1))
+     (S_MOV_B32 (i32 0)), sub1))
 >;
 
 def : Pat <
@@ -887,6 +887,18 @@
   (S_ABS_I32 $x)
 >;
 
+def : Pat <
+  (i16 imm:$imm),
+  (S_MOV_B32 imm:$imm)
+>;
+
+// Same as a 32-bit inreg
+def : Pat<
+  (i32 (sext i16:$src)),
+  (S_SEXT_I32_I16 $src)
+>;
+
+
 //===----------------------------------------------------------------------===//
 // SOP2 Patterns
 //===----------------------------------------------------------------------===//
@@ -898,6 +910,29 @@
   (S_ADD_U32 $src0, $src1)
 >;
 
+// FIXME: We need to use COPY_TO_REGCLASS to work around the fact that
+// REG_SEQUENCE patterns don't support instructions with multiple
+// outputs.
+def : Pat<
+  (i64 (zext i16:$src)),
+    (REG_SEQUENCE SReg_64,
+      (i32 (COPY_TO_REGCLASS (S_AND_B32 $src, (S_MOV_B32 (i32 0xffff))), SGPR_32)), sub0,
+      (S_MOV_B32 (i32 0)), sub1)
+>;
+
+def : Pat <
+  (i64 (sext i16:$src)),
+    (REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
+    (i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
+>;
+
+def : Pat<
+  (i32 (zext i16:$src)),
+  (S_AND_B32 (S_MOV_B32 (i32 0xffff)), $src)
+>;
+
+
+
 //===----------------------------------------------------------------------===//
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td
index ead90ec..b45c8fc 100644
--- a/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VIInstructions.td
@@ -8,3 +8,7 @@
 //===----------------------------------------------------------------------===//
 // Instruction definitions for VI and newer.
 //===----------------------------------------------------------------------===//
+
+// FIXME: Deleting this file broke buildbots that don't do full rebuilds.  This
+// file is no longer used by the backend, so it can be deleted once all
+// the buildbots update their dependencies.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 6124d4e..b284098 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -301,6 +301,20 @@
 
 }
 
+let Predicates = [isVI] in {
+
+def : Pat<
+    (f32 (f16_to_fp i16:$src)),
+    (V_CVT_F32_F16_e32 $src)
+>;
+
+def : Pat<
+    (i16 (fp_to_f16 f32:$src)),
+    (V_CVT_F16_F32_e32 $src)
+>;
+
+}
+
 //===----------------------------------------------------------------------===//
 // Target
 //===----------------------------------------------------------------------===//
@@ -561,10 +575,39 @@
 let Predicates = [isVI] in {
 
 def : Pat <
-  (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
-                      imm:$bound_ctrl),
+  (i32 (int_amdgcn_mov_dpp i32:$src, imm:$dpp_ctrl, imm:$row_mask, imm:$bank_mask,
+                      imm:$bound_ctrl)),
   (V_MOV_B32_dpp $src, (as_i32imm $dpp_ctrl), (as_i32imm $row_mask),
                        (as_i32imm $bank_mask), (as_i1imm $bound_ctrl))
 >;
 
+
+def : Pat<
+  (i32 (anyext i16:$src)),
+  (COPY $src)
+>;
+
+def : Pat<
+   (i64 (anyext i16:$src)),
+   (REG_SEQUENCE VReg_64,
+     (i32 (COPY $src)), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+def : Pat<
+  (i16 (trunc i32:$src)),
+  (COPY $src)
+>;
+
+def : Pat<
+  (i1 (trunc i16:$src)),
+  (COPY $src)
+>;
+
+
+def : Pat <
+  (i16 (trunc i64:$src)),
+  (EXTRACT_SUBREG $src, sub0)
+>;
+
 } // End Predicates = [isVI]
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index fc13382..570ca05 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -345,6 +345,78 @@
 
 } // End SubtargetPredicate = isVI
 
+// Note: 16-bit instructions produce a 0 result in the high 16 bits.
+multiclass Arithmetic_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat<
+  (op i16:$src0, i16:$src1),
+  (inst $src0, $src1)
+>;
+
+def : Pat<
+  (i32 (zext (op i16:$src0, i16:$src1))),
+  (inst $src0, $src1)
+>;
+
+def : Pat<
+  (i64 (zext (op i16:$src0, i16:$src1))),
+   (REG_SEQUENCE VReg_64,
+     (inst $src0, $src1), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+
+}
+
+multiclass Bits_OpsRev_i16_Pats <SDPatternOperator op, Instruction inst> {
+
+def : Pat<
+  (op i16:$src0, i32:$src1),
+  (inst $src1, $src0)
+>;
+
+def : Pat<
+  (i32 (zext (op i16:$src0, i32:$src1))),
+  (inst $src1, $src0)
+>;
+
+
+def : Pat<
+  (i64 (zext (op i16:$src0, i32:$src1))),
+   (REG_SEQUENCE VReg_64,
+     (inst $src1, $src0), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+class ZExt_i16_i1_Pat <SDNode ext> : Pat <
+  (i16 (ext i1:$src)),
+  (V_CNDMASK_B32_e64 (i32 0), (i32 1), $src)
+>;
+
+let Predicates = [isVI] in {
+
+defm : Arithmetic_i16_Pats<add, V_ADD_U16_e32>;
+defm : Arithmetic_i16_Pats<mul, V_MUL_LO_U16_e32>;
+defm : Arithmetic_i16_Pats<sub, V_SUB_U16_e32>;
+defm : Arithmetic_i16_Pats<smin, V_MIN_I16_e32>;
+defm : Arithmetic_i16_Pats<smax, V_MAX_I16_e32>;
+defm : Arithmetic_i16_Pats<umin, V_MIN_U16_e32>;
+defm : Arithmetic_i16_Pats<umax, V_MAX_U16_e32>;
+
+defm : Arithmetic_i16_Pats<and, V_AND_B32_e32>;
+defm : Arithmetic_i16_Pats<or, V_OR_B32_e32>;
+defm : Arithmetic_i16_Pats<xor, V_XOR_B32_e32>;
+
+defm : Bits_OpsRev_i16_Pats<shl, V_LSHLREV_B16_e32>;
+defm : Bits_OpsRev_i16_Pats<srl, V_LSHRREV_B16_e32>;
+defm : Bits_OpsRev_i16_Pats<sra, V_ASHRREV_B16_e32>;
+
+def : ZExt_i16_i1_Pat<zext>;
+def : ZExt_i16_i1_Pat<sext>;
+def : ZExt_i16_i1_Pat<anyext>;
+
+} // End Predicates = [isVI]
+
 //===----------------------------------------------------------------------===//
 // SI
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 0f06375..73e3315 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -222,6 +222,38 @@
 
 } // End SubtargetPredicate = isVI
 
+def : Pat <
+  (i16 (select i1:$src0, i16:$src1, i16:$src2)),
+  (V_CNDMASK_B32_e64 $src2, $src1, $src0)
+>;
+
+let Predicates = [isVI] in {
+
+multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+                            Instruction inst, SDPatternOperator op3> {
+def : Pat<
+  (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
+  (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat<
+  (i32 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+  (inst i16:$src0, i16:$src1, i16:$src2)
+>;
+
+def : Pat<
+  (i64 (op3 (op2 (op1 i16:$src0, i16:$src1), i16:$src2))),
+   (REG_SEQUENCE VReg_64,
+     (inst i16:$src0, i16:$src1, i16:$src2), sub0,
+     (V_MOV_B32_e32 (i32 0)), sub1)
+>;
+}
+
+defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+
+} // End Predicates = [isVI]
+
 
 //===----------------------------------------------------------------------===//
 // Target