AMDGPU: Match load d16 hi instructions
Also starts selecting global loads for constant addresses in some
cases. Some still end up selecting to mubuf, which requires
investigation.

We still get sub-optimal regalloc and extra waitcnts inserted
because we do not really track the liveness of the separate
register halves.
llvm-svn: 313716
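
As a rough illustration (not taken from this commit's tests; the function
name is made up), the new DSReadPat_Hi16 patterns below are aimed at IR of
this shape, where a 16-bit value loaded from LDS ends up in the high half
of a <2 x i16> and the low half comes from an existing register:

  define <2 x i16> @load_local_hi_v2i16(i16 addrspace(3)* %in, i16 %lo) {
  entry:
    ; 16-bit load from local (LDS) memory
    %hi = load i16, i16 addrspace(3)* %in
    ; low element comes from an existing value, high element from the load
    %v0 = insertelement <2 x i16> undef, i16 %lo, i32 0
    %v1 = insertelement <2 x i16> %v0, i16 %hi, i32 1
    ret <2 x i16> %v1
  }

On subtargets with d16 loads this should select to something like
ds_read_u16_d16_hi, which writes only the high 16 bits of the destination
VGPR; the tied $vdst_in operand added below is what keeps the low 16 bits
live across the load.
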
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index c7c3e01..e66bf40 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -145,16 +145,22 @@
let hasPostISelHook = 1;
}
-class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, Operand ofs = offset>
+class DS_1A_RET<string opName, RegisterClass rc = VGPR_32, bit HasTiedOutput = 0, Operand ofs = offset>
: DS_Pseudo<opName,
(outs rc:$vdst),
- (ins VGPR_32:$addr, ofs:$offset, gds:$gds),
+ !if(HasTiedOutput,
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds, rc:$vdst_in),
+ (ins VGPR_32:$addr, ofs:$offset, gds:$gds)),
"$vdst, $addr$offset$gds"> {
-
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
let has_data0 = 0;
let has_data1 = 0;
}
+class DS_1A_RET_Tied<string opName, RegisterClass rc = VGPR_32> :
+ DS_1A_RET<opName, rc, 1>;
+
class DS_1A_Off8_RET <string opName, RegisterClass rc = VGPR_32>
: DS_Pseudo<opName,
(outs rc:$vdst),
@@ -450,7 +456,7 @@
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
-def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, SwizzleImm>;
+def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
}
let mayStore = 0 in {
@@ -468,12 +474,12 @@
def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
let SubtargetPredicate = HasD16LoadStore in {
-def DS_READ_U8_D16 : DS_1A_RET<"ds_read_u8_d16">;
-def DS_READ_U8_D16_HI : DS_1A_RET<"ds_read_u8_d16_hi">;
-def DS_READ_I8_D16 : DS_1A_RET<"ds_read_i8_d16">;
-def DS_READ_I8_D16_HI : DS_1A_RET<"ds_read_i8_d16_hi">;
-def DS_READ_U16_D16 : DS_1A_RET<"ds_read_u16_d16">;
-def DS_READ_U16_D16_HI : DS_1A_RET<"ds_read_u16_d16_hi">;
+def DS_READ_U8_D16 : DS_1A_RET_Tied<"ds_read_u8_d16">;
+def DS_READ_U8_D16_HI : DS_1A_RET_Tied<"ds_read_u8_d16_hi">;
+def DS_READ_I8_D16 : DS_1A_RET_Tied<"ds_read_i8_d16">;
+def DS_READ_I8_D16_HI : DS_1A_RET_Tied<"ds_read_i8_d16_hi">;
+def DS_READ_U16_D16 : DS_1A_RET_Tied<"ds_read_u16_d16">;
+def DS_READ_U16_D16_HI : DS_1A_RET_Tied<"ds_read_u16_d16_hi">;
}
let SubtargetPredicate = HasDSAddTid in {
@@ -543,6 +549,18 @@
(inst $ptr, (as_i16imm $offset), (i1 0))
>;
+multiclass DSReadPat_Hi16 <DS_Pseudo inst, PatFrag frag, ValueType vt = i16> {
+ def : Pat <
+ (build_vector vt:$lo, (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))),
+ (v2i16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+ >;
+
+ def : Pat <
+ (build_vector f16:$lo, (f16 (bitconvert (vt (frag (DS1Addr1Offset i32:$ptr, i32:$offset)))))),
+ (v2f16 (inst $ptr, (as_i16imm $offset), (i1 0), $lo))
+ >;
+}
+
def : DSReadPat <DS_READ_I8, i32, sextloadi8_local_m0>;
def : DSReadPat <DS_READ_U8, i32, az_extloadi8_local_m0>;
def : DSReadPat <DS_READ_I8, i16, sextloadi8_local_m0>;
@@ -565,6 +583,15 @@
(DS_READ2_B32 $ptr, $offset0, $offset1, (i1 0))
>;
+
+let Predicates = [HasD16LoadStore] in {
+let AddedComplexity = 100 in {
+defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
+defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
+defm : DSReadPat_Hi16<DS_READ_I8_D16_HI, sextloadi8_local>;
+}
+}
+
class DSWritePat <DS_Pseudo inst, ValueType vt, PatFrag frag> : Pat <
(frag vt:$value, (DS1Addr1Offset i32:$ptr, i32:$offset)),
(inst $ptr, $value, (as_i16imm $offset), (i1 0))