[AMDGPU] Shrink scalar AND, OR, XOR instructions
This change attempts to shrink scalar AND, OR and XOR instructions whose
immediate operand isn't inlineable. It performs the following transformations:
AND s0, s0, ~(1 << n) -> BITSET0 s0, n
OR s0, s0, (1 << n) -> BITSET1 s0, n
AND s0, s1, x -> ANDN2 s0, s1, ~x
OR s0, s1, x -> ORN2 s0, s1, ~x
XOR s0, s1, x -> XNOR s0, s1, ~x
In particular, this catches clearing the sign bit for fabs and setting
it for fneg (and x, 0x7fffffff -> bitset0 x, 31; or x, 0x80000000 ->
bitset1 x, 31).
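
For reference, the rewrites rest on plain bitwise identities that hold
for any 32-bit x; the pass merely restricts itself to cases where the
replacement operand is inlineable. A minimal standalone C++ check
(illustrative only; the helpers are hypothetical stand-ins whose
comments paraphrase the SALU semantics):

    #include <cassert>
    #include <cstdint>
    #include <initializer_list>

    static uint32_t andn2(uint32_t a, uint32_t b) { return a & ~b; }   // S_ANDN2_B32
    static uint32_t orn2(uint32_t a, uint32_t b)  { return a | ~b; }   // S_ORN2_B32
    static uint32_t xnor2(uint32_t a, uint32_t b) { return ~(a ^ b); } // S_XNOR_B32

    int main() {
      const uint32_t x = 0xFFFFFFCDu; // -51: not inlineable, but ~x == 50 is
      for (uint32_t a : {0u, 1u, 0x7FFFFFFFu, 0xDEADBEEFu}) {
        assert((a & ~(1u << 31)) == (a & 0x7FFFFFFFu)); // BITSET0 pattern (fabs)
        assert((a | (1u << 31))  == (a | 0x80000000u)); // BITSET1 pattern (fneg)
        assert((a & x) == andn2(a, ~x)); // AND -> ANDN2 with ~x
        assert((a | x) == orn2(a, ~x));  // OR  -> ORN2 with ~x
        assert((a ^ x) == xnor2(a, ~x)); // XOR -> XNOR, as a ^ b == ~(a ^ ~b)
      }
      return 0;
    }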
llvm-svn: 348601
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 015773b..6ad7dd0 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -212,6 +212,82 @@
   }
 }
 
+/// Attempt to shrink AND/OR/XOR operations requiring non-inlineable literals.
+/// For AND or OR, try using S_BITSET{0,1} to clear or set bits.
+/// If the inverse of the immediate is legal, use ANDN2, ORN2 or
+/// XNOR (as a ^ b == ~(a ^ ~b)).
+/// \returns true if the caller should continue the machine function iterator
+static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
+                                MachineRegisterInfo &MRI,
+                                const SIInstrInfo *TII,
+                                MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  const MachineOperand *Dest = &MI.getOperand(0);
+  MachineOperand *Src0 = &MI.getOperand(1);
+  MachineOperand *Src1 = &MI.getOperand(2);
+  MachineOperand *SrcReg = Src0;
+  MachineOperand *SrcImm = Src1;
+
+  if (SrcImm->isImm() &&
+      !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
+    uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+    uint32_t NewImm = 0;
+
+    if (Opc == AMDGPU::S_AND_B32) {
+      if (isPowerOf2_32(~Imm)) {
+        NewImm = countTrailingOnes(Imm);
+        Opc = AMDGPU::S_BITSET0_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ANDN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_OR_B32) {
+      if (isPowerOf2_32(Imm)) {
+        NewImm = countTrailingZeros(Imm);
+        Opc = AMDGPU::S_BITSET1_B32;
+      } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_ORN2_B32;
+      }
+    } else if (Opc == AMDGPU::S_XOR_B32) {
+      if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+        NewImm = ~Imm;
+        Opc = AMDGPU::S_XNOR_B32;
+      }
+    } else {
+      llvm_unreachable("unexpected opcode");
+    }
+
+    if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+        SrcImm == Src0) {
+      if (!TII->commuteInstruction(MI, false, 1, 2))
+        NewImm = 0;
+    }
+
+    if (NewImm != 0) {
+      if (TargetRegisterInfo::isVirtualRegister(Dest->getReg()) &&
+          SrcReg->isReg()) {
+        MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+        MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+        return true;
+      }
+
+      if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+        MI.setDesc(TII->get(Opc));
+        if (Opc == AMDGPU::S_BITSET0_B32 ||
+            Opc == AMDGPU::S_BITSET1_B32) {
+          Src0->ChangeToImmediate(NewImm);
+          MI.RemoveOperand(2);
+        } else {
+          SrcImm->setImm(NewImm);
+        }
+      }
+    }
+  }
+
+  return false;
+}
+
 // This is the same as MachineInstr::readsRegister/modifiesRegister except
 // it takes subregs into account.
 static bool instAccessReg(iterator_range<MachineInstr::const_mop_iterator> &&R,
@@ -512,6 +588,14 @@
         continue;
       }
 
+      // Shrink scalar logic operations.
+      if (MI.getOpcode() == AMDGPU::S_AND_B32 ||
+          MI.getOpcode() == AMDGPU::S_OR_B32 ||
+          MI.getOpcode() == AMDGPU::S_XOR_B32) {
+        if (shrinkScalarLogicOp(ST, MRI, TII, MI))
+          continue;
+      }
+
       if (!TII->hasVALU32BitEncoding(MI.getOpcode()))
         continue;
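
Aside: a compact standalone restatement of the immediate analysis above,
with hand-rolled stand-ins for llvm::isPowerOf2_32 and the trailing-bit
counts (countTrailingOnes(Imm) on a single-cleared-bit mask equals
counting the trailing zeros of ~Imm):

    #include <cstdint>
    #include <cstdio>

    static bool isPow2(uint32_t v) { return v != 0 && (v & (v - 1)) == 0; }
    static unsigned ctz(uint32_t v) { // only called on nonzero values here
      unsigned n = 0;
      while ((v & 1) == 0) { v >>= 1; ++n; }
      return n;
    }

    int main() {
      // AND mask with exactly one bit clear: ~Imm is a power of two.
      uint32_t AndImm = 0xFFFFFEFFu; // the s_clear_midbit test below
      if (isPow2(~AndImm))
        printf("s_bitset0_b32 s0, %u\n", ctz(~AndImm)); // prints 8

      // OR mask with exactly one bit set: Imm itself is a power of two.
      uint32_t OrImm = 0x00010000u;  // the LDS-offset tests below
      if (isPow2(OrImm))
        printf("s_bitset1_b32 s0, %u\n", ctz(OrImm));   // prints 16
      return 0;
    }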
diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
new file mode 100644
index 0000000..95bba7d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_clear_msb:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_clear_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 2147483647
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_msb:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_set_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 2147483648
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_lsb:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2
+define amdgpu_kernel void @s_clear_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967294
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_lsb:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @s_set_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 1
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_midbit:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_clear_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967039
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_midbit:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 256
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
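
The two *_lsb functions above are deliberate negative tests: 1 and -2
both lie in the integer inline-constant range [-16, 64], so the
isInlinableLiteral32 guard rejects the transform before any bit
analysis; independently, bit 0 could never be encoded, because
NewImm == 0 doubles as the pass's "no transform" sentinel. A quick
standalone check (isInlineInt is a simplified stand-in that ignores the
inlineable float bit patterns):

    #include <cassert>
    #include <cstdint>

    static bool isInlineInt(int32_t v) { return v >= -16 && v <= 64; }

    int main() {
      assert(isInlineInt(1));     // s_set_lsb: OR with 1 is already free
      assert(isInlineInt(-2));    // s_clear_lsb: AND with -2 is already free
      assert(!isInlineInt(-257)); // s_clear_midbit: 0xfffffeff needs a literal
      return 0;
    }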
diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
new file mode 100644
index 0000000..a16fd53
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_or_to_orn2:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_or_to_orn2_imm0:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2_imm0:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
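
All six tests use -51 because it falls just outside the inline range
while its complement, 50, falls inside it. The *_imm0 variants put the
constant on the left; by the time this pass runs, SelectionDAG should
already have canonicalized the constant to the second operand of these
commutative ops, so the same rewrite fires. A quick check of the literal
math (standalone, illustrative):

    #include <cassert>
    #include <cstdint>

    int main() {
      int32_t imm = -51;  // 0xffffffcd: outside [-16, 64], needs a literal
      assert(~imm == 50); // complement is inlineable, enabling *N2/XNOR forms
      return 0;
    }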
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index ba72969..f96019d 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,8 @@
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @fabs(float %bc)
@@ -23,7 +24,8 @@
; R600-NOT: AND
; R600: |PV.{{[XYZW]}}|
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
%bc= bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
@@ -34,7 +36,8 @@
; FUNC-LABEL: {{^}}s_fabs_f32:
; R600: |{{(PV|T[0-9])\.[XYZW]}}|
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
%fabs = call float @llvm.fabs.f32(float %in)
store float %fabs, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index c72dab0..0ff5d96 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
@@ -35,6 +35,7 @@
; R600: -PV
; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; VI: s_bitset1_b32 s{{[0-9]+}}, 31
define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
%bc = bitcast i32 %in to float
%fabs = call float @llvm.fabs.f32(float %bc)
diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
index 7fb47e0..b2fd9f6 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -14,7 +14,7 @@
; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
; SI, which is why it is being OR'd with the base pointer.
-; SI: s_or_b32
+; SI: s_bitset1_b32
; CI: s_add_i32
; CHECK: ds_write_b32
define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
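
The shrink here stacks on the known-bits trick the comment describes:
any in-range LDS base on SI is below 0x10000, so bit 16 is clear and
OR-ing in the 65536-byte offset equals adding it; since 0x10000 is a
single bit, the OR then shrinks further to s_bitset1_b32 ..., 16 (the
local-64.ll update below is the same pattern). A small standalone check:

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t base = 0; base < 0x10000u; base += 0x111u) {
        // Bit 16 of base is clear, so OR behaves as ADD for this offset,
        // and the single-bit OR is exactly the S_BITSET1 pattern.
        assert((base | 0x10000u) == base + 0x10000u);
        assert((base | 0x10000u) == (base | (1u << 16)));
      }
      return 0;
    }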
diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll
index 87c18a7..f0dca07 100644
--- a/llvm/test/CodeGen/AMDGPU/local-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-64.ll
@@ -48,7 +48,7 @@
; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
; SI, which is why it is being OR'd with the base pointer.
-; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; SI-DAG: s_bitset1_b32 [[ADDR:s[0-9]+]], 16
; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000