[AMDGPU] Shrink scalar AND, OR, XOR instructions

This change shrinks scalar AND, OR, and XOR instructions whose immediate operand cannot be encoded as an inline constant.

It performs:
AND s0, s0, ~(1 << n) -> BITSET0 s0, n
OR s0, s0, (1 << n) -> BITSET1 s0, n
AND s0, s1, x -> ANDN2 s0, s1, ~x
OR s0, s1, x -> ORN2 s0, s1, ~x
XOR s0, s1, x -> XNOR s0, s1, ~x

In particular, this catches clearing and setting the sign bit for fabs and fneg (and x, 0x7fffffff -> bitset0 x, 31; or x, 0x80000000 -> bitset1 x, 31).
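
Concretely, the selection between these forms can be sketched in C++ as follows. This is only an illustration, not the in-tree pass: the helper names are made up, and isInlineConstant is a simplified stand-in that assumes the scalar integer inline-constant range -16..64 and ignores the floating-point inline values.

  #include <cstdint>

  static bool isInlineConstant(int32_t Imm) {
    return Imm >= -16 && Imm <= 64;
  }

  static bool isPowerOf2(uint32_t V) {
    return V != 0 && (V & (V - 1)) == 0;
  }

  enum class Shrink { None, Bitset0, Bitset1, AndN2, OrN2, XNor };

  // Opc is '&', '|' or '^'; DstEqSrc means dst and src are the same SGPR,
  // which the read-modify-write s_bitset forms require.
  static Shrink pickShrink(char Opc, int32_t Imm, bool DstEqSrc) {
    if (isInlineConstant(Imm))
      return Shrink::None;                      // literal is already cheap
    uint32_t U = static_cast<uint32_t>(Imm);
    if (Opc == '&' && DstEqSrc && isPowerOf2(~U))
      return Shrink::Bitset0;                   // and x, ~(1<<n) -> bitset0 x, n
    if (Opc == '|' && DstEqSrc && isPowerOf2(U))
      return Shrink::Bitset1;                   // or  x, (1<<n)  -> bitset1 x, n
    if (isInlineConstant(static_cast<int32_t>(~U)))
      return Opc == '&' ? Shrink::AndN2         // and -> andn2 with ~imm
           : Opc == '|' ? Shrink::OrN2          // or  -> orn2  with ~imm
                        : Shrink::XNor;         // xor -> xnor  with ~imm
    return Shrink::None;
  }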

llvm-svn: 348601
diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
new file mode 100644
index 0000000..95bba7d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_clear_msb:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_clear_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 2147483647
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_msb:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 31
+define amdgpu_kernel void @s_set_msb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 2147483648
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_lsb:
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2
+define amdgpu_kernel void @s_clear_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967294
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_lsb:
+; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @s_set_lsb(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 1
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_clear_midbit:
+; SI: s_bitset0_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_clear_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, 4294967039
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_set_midbit:
+; SI: s_bitset1_b32 s{{[0-9]+}}, 8
+define amdgpu_kernel void @s_set_midbit(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, 256
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
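
As a side note on the constants in these tests, each bitset case uses a mask that is a single bit or its complement; spelled out as C++ static_asserts:

  static_assert(2147483647u == ~(1u << 31), "s_clear_msb    -> s_bitset0 ..., 31");
  static_assert(2147483648u ==  (1u << 31), "s_set_msb      -> s_bitset1 ..., 31");
  static_assert(4294967039u == ~(1u << 8),  "s_clear_midbit -> s_bitset0 ..., 8");
  static_assert(256u        ==  (1u << 8),  "s_set_midbit   -> s_bitset1 ..., 8");

The s_clear_lsb and s_set_lsb constants (-2 and 1) already encode as inline constants, so those two cases stay as plain s_and_b32/s_or_b32.
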
diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
new file mode 100644
index 0000000..a16fd53
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll
@@ -0,0 +1,49 @@
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+
+; SI-LABEL: {{^}}s_or_to_orn2:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_or_to_orn2_imm0:
+; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_or_to_orn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = or i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_and_to_andn2_imm0:
+; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_and_to_andn2_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = and i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 %in, -51
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
+
+; SI-LABEL: {{^}}s_xor_to_xnor_imm0:
+; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50
+define amdgpu_kernel void @s_xor_to_xnor_imm0(i32 addrspace(1)* %out, i32 %in) {
+  %x = xor i32 -51, %in
+  store i32 %x, i32 addrspace(1)* %out
+  ret void
+}
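
The -51 literal in these tests is chosen so that the operand itself is not an inline constant while its bitwise inverse is: assuming the scalar integer inline-constant range -16..64, -51 needs a 32-bit literal but 50 is free, and

  static_assert(~(-51) == 50, "and/or/xor with -51 -> s_andn2/s_orn2/s_xnor with 50");

so every case above, regardless of operand order, folds to the inverted-operand form.
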
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.ll b/llvm/test/CodeGen/AMDGPU/fabs.ll
index ba72969..f96019d 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.ll
@@ -11,7 +11,8 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
@@ -23,7 +24,8 @@
 ; R600-NOT: AND
 ; R600: |PV.{{[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
@@ -34,7 +36,8 @@
 ; FUNC-LABEL: {{^}}s_fabs_f32:
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
-; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff
+; VI: s_bitset0_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @s_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
index c72dab0..0ff5d96 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -1,5 +1,5 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
@@ -35,6 +35,7 @@
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
+; VI: s_bitset1_b32 s{{[0-9]+}}, 31
 define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
diff --git a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
index 7fb47e0..b2fd9f6 100644
--- a/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/llvm/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -14,7 +14,7 @@
 ; CHECK-LABEL: {{^}}use_gep_address_space_large_offset:
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
-; SI: s_or_b32
+; SI: s_bitset1_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
 define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/local-64.ll b/llvm/test/CodeGen/AMDGPU/local-64.ll
index 87c18a7..f0dca07 100644
--- a/llvm/test/CodeGen/AMDGPU/local-64.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-64.ll
@@ -48,7 +48,7 @@
 
 ; The LDS offset will be 65536 bytes, which is larger than the size of LDS on
 ; SI, which is why it is being OR'd with the base pointer.
-; SI-DAG: s_or_b32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
+; SI-DAG: s_bitset1_b32 [[ADDR:s[0-9]+]], 16
 ; CI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; VI-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
 ; GFX9-DAG: s_add_i32 [[ADDR:s[0-9]+]], s{{[0-9]+}}, 0x10000
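
The gep-address-space.ll and local-64.ll updates are the same shrink applied to the 64 KB LDS offset, which is a single bit:

  static_assert(65536 == (1 << 16), "or x, 0x10000 -> s_bitset1 x, 16");

hence the SI checks move from s_or_b32 with 0x10000 to s_bitset1_b32 with bit index 16.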