AMDGPU: Add D16 instructions preserve unused bits feature
- Predicate D16 patterns on this new feature
- Added this new feature to gfx900/2/4
Differential Revision: https://reviews.llvm.org/D46366
llvm-svn: 331551
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 73490f6..6a028a9 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -322,6 +322,13 @@
"Has deep learning instructions"
>;
+def FeatureD16PreservesUnusedBits : SubtargetFeature<
+ "d16-preserves-unused-bits",
+ "D16PreservesUnusedBits",
+ "true",
+ "D16 memory instructions preserve unused bits rather than zeroing them out"
+>;
+
//===------------------------------------------------------------===//
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
@@ -608,20 +615,23 @@
def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,
[FeatureGFX9,
FeatureMadMixInsts,
- FeatureLDSBankCount32
+ FeatureLDSBankCount32,
+ FeatureD16PreservesUnusedBits
]>;
def FeatureISAVersion9_0_2 : SubtargetFeatureISAVersion <9,0,2,
[FeatureGFX9,
FeatureMadMixInsts,
FeatureLDSBankCount32,
- FeatureXNACK
+ FeatureXNACK,
+ FeatureD16PreservesUnusedBits
]>;
def FeatureISAVersion9_0_4 : SubtargetFeatureISAVersion <9,0,4,
[FeatureGFX9,
FeatureLDSBankCount32,
- FeatureFmaMixInsts]>;
+ FeatureFmaMixInsts,
+ FeatureD16PreservesUnusedBits]>;
def FeatureISAVersion9_0_6 : SubtargetFeatureISAVersion <9,0,6,
[FeatureGFX9,
@@ -769,6 +779,8 @@
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
AssemblerPredicate<"!FeatureUnpackedD16VMem">;
+def D16PreservesUnusedBits : Predicate<"Subtarget->d16PreservesUnusedBits()">,
+ AssemblerPredicate<"FeatureD16PreservesUnusedBits">;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 40c583b..850715f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -162,6 +162,7 @@
HasSDWAOutModsVOPC(false),
HasDPP(false),
HasDLInsts(false),
+ D16PreservesUnusedBits(false),
FlatAddressSpace(false),
FlatInstOffsets(false),
FlatGlobalInsts(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index cce8800..0b070e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -166,6 +166,7 @@
bool HasSDWAOutModsVOPC;
bool HasDPP;
bool HasDLInsts;
+ bool D16PreservesUnusedBits;
bool FlatAddressSpace;
bool FlatInstOffsets;
bool FlatGlobalInsts;
@@ -546,6 +547,10 @@
return HasDLInsts;
}
+ bool d16PreservesUnusedBits() const {
+ return D16PreservesUnusedBits;
+ }
+
/// Returns the offset in bytes from the start of the input buffer
/// of the first explicit kernel argument.
unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index dc528fb..999d553 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1374,7 +1374,7 @@
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX4_OFFEN, BUFFER_LOAD_DWORDX4_OFFSET, v4i32, load_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SHORT_D16_HI_OFFEN, BUFFER_LOAD_SHORT_D16_HI_OFFSET, i16, load_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_UBYTE_D16_HI_OFFEN, BUFFER_LOAD_UBYTE_D16_HI_OFFSET, i16, az_extloadi8_private>;
defm : MUBUFScratchLoadPat_Hi16<BUFFER_LOAD_SBYTE_D16_HI_OFFEN, BUFFER_LOAD_SBYTE_D16_HI_OFFSET, i16, sextloadi8_private>;
@@ -1489,7 +1489,7 @@
defm : MUBUFScratchStorePat <BUFFER_STORE_DWORDX4_OFFEN, BUFFER_STORE_DWORDX4_OFFSET, v4i32, store_private>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
// Hiding the extract high pattern in the PatFrag seems to not
// automatically increase the complexity.
let AddedComplexity = 1 in {
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index c0f6d27..28887ea 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -655,7 +655,7 @@
} // End AddedComplexity = 100
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
let AddedComplexity = 100 in {
defm : DSReadPat_Hi16<DS_READ_U16_D16_HI, load_local>;
defm : DSReadPat_Hi16<DS_READ_U8_D16_HI, az_extloadi8_local>;
@@ -689,7 +689,7 @@
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
defm : DSWritePat_mc <DS_WRITE_B32, i32, "store_local">;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_local_hi16>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_local_hi16>;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 6938691..e6d3544 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -780,7 +780,7 @@
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStorePat <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
def : FlatStorePat <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
@@ -824,7 +824,7 @@
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX2, store_global, v2i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_DWORDX4, store_global, v4i32>;
-let OtherPredicates = [HasD16LoadStore] in {
+let OtherPredicates = [D16PreservesUnusedBits] in {
def : FlatStoreSignedPat <GLOBAL_STORE_SHORT_D16_HI, truncstorei16_hi16_global, i32>;
def : FlatStoreSignedPat <GLOBAL_STORE_BYTE_D16_HI, truncstorei8_hi16_global, i32>;