AMDGPU: Match load d16 hi instructions

Also starts selecting global loads for constant addresses
in some cases. Some still end up selecting to MUBUF, which
requires investigation.

We still get sub-optimal register allocation and extra waitcnts
inserted because the liveness of the separate register halves
is not really tracked.

llvm-svn: 313716
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index c7c3e01..e66bf40 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -145,16 +145,22 @@
   let hasPostISelHook = 1;
 }
 
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
 : DS_Pseudo<opName,
   (outs rc:$vdst),
-  (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
+  !if(HasTiedOutput,
+    (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+    (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
   "$vdst, $addr$offset$gds"> {
-
+  let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+  let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
   let has_data0 = 0;
   let has_data1 = 0;
 }
 
+class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
+  DS_1A_RET<opName, rc, 1>;
+
 class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
 : DS_Pseudo<opName,
   (outs rc:$vdst),
@@ -450,7 +456,7 @@
 def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
 
 let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
 }
 
 let mayStore = 0 in {
@@ -468,12 +474,12 @@
 def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
 
 let SubtargetPredicate = HasD16LoadStore in {
-def DS_READ_U8_D16     : DS_1A_RET<"ds_read_u8_d16">;
-def DS_READ_U8_D16_HI  : DS_1A_RET<"ds_read_u8_d16_hi">;
-def DS_READ_I8_D16     : DS_1A_RET<"ds_read_i8_d16">;
-def DS_READ_I8_D16_HI  : DS_1A_RET<"ds_read_i8_d16_hi">;
-def DS_READ_U16_D16    : DS_1A_RET<"ds_read_u16_d16">;
-def DS_READ_U16_D16_HI : DS_1A_RET<"ds_read_u16_d16_hi">;
+def DS_READ_U8_D16     : DS_1A_RET_Tied<"ds_read_u8_d16">;
+def DS_READ_U8_D16_HI  : DS_1A_RET_Tied<"ds_read_u8_d16_hi">;
+def DS_READ_I8_D16     : DS_1A_RET_Tied<"ds_read_i8_d16">;
+def DS_READ_I8_D16_HI  : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
+def DS_READ_U16_D16    : DS_1A_RET_Tied<"ds_read_u16_d16">;
+def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
 }
 
 let SubtargetPredicate = HasDSAddTid in {
@@ -543,6 +549,18 @@
   (inst $ptr, (as_i16imm $offset), (i1 0))
 >;
 
+multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
+  def : Pat <
+    (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
+    (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+  >;
+
+  def : Pat <
+    (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
+    (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+  >;
+}
+
 def : DSReadPat <DS_READ_I8,  i32, sextloadi8_local_m0>;
 def : DSReadPat <DS_READ_U8,  i32, az_extloadi8_local_m0>;
 def : DSReadPat <DS_READ_I8,  i16, sextloadi8_local_m0>;
@@ -565,6 +583,15 @@
   (DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
 >;
 
+
+let Predicates = [HasD16LoadStore] in {
+let AddedComplexity = 100 in {
+defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
+defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
+defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
+}
+}
+
 class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
   (frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
   (inst $ptr, $value, (as_i16imm $offset), (i1 0))