[DAGCombine] (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
We already have a combine for this pattern when the input to shl is add, so we just need to enable the transformation when the input is or.
Original patch by @tstellar
Differential Revision: https://reviews.llvm.org/D19325
llvm-svn: 313251
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index 0413064..df0dfc6 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -70,13 +70,13 @@
 
 ; FIXME: single bit op
 ; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
-; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CI: v_or_b32_e32 [[OR:v[0-9]+]], [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[OR]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SHL]]
+; CI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 [[OR:v[0-9]+]], v{{[0-9]+}}, [[SHL]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, [[OR]]
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
 ; CIVI: flat_store_dword
 
 ; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
@@ -88,15 +88,14 @@
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; CI: v_or_b32_e32 [[OR00:v[0-9]+]], [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, [[OR00]]
-; CI: v_or_b32_e32 [[OR01:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR01]]
-; CI: v_or_b32_e32 [[OR10:v[0-9]+]], [[MASK]], v{{[0-9]+}}
-; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, [[OR10]]
-; CI: v_or_b32_e32 [[OR11:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
-; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR11]]
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; CI: v_lshlrev_b32_e32 [[SHL0:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 [[OR0:v[0-9]+]], v{{[0-9]+}}, [[SHL0]]
+; CI: v_lshlrev_b32_e32 [[SHL1:v[0-9]+]], 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 [[OR1:v[0-9]+]], v{{[0-9]+}}, [[SHL1]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR0]]
+; CI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]], [[OR1]]
+; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[VMASK:v[0-9]+]], [[MASK]]
 ; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[VMASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 08f286a..5282023 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -27,4 +27,27 @@
   ret void
 }
 
+; CHECK-LABEL: {{^}}ds_bpermute_add_shl:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_add_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+  %index = add i32 %base_index, 1
+  %byte_index = shl i32 %index, 2
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
+  store i32 %bpermute, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; CHECK-LABEL: {{^}}ds_bpermute_or_shl:
+; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
+; CHECK: s_waitcnt lgkmcnt
+define void @ds_bpermute_or_shl(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+  %masked = and i32 %base_index, 62
+  %index = or i32 %masked, 1
+  %byte_index = shl i32 %index, 2
+  %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %byte_index, i32 %src) #0
+  store i32 %bpermute, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }
diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll
index 13ac914..b730ec7 100644
--- a/llvm/test/CodeGen/AMDGPU/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl.ll
@@ -476,4 +476,28 @@
    ret void
 }
 
+; FUNC-LABEL: {{^}}shl_or_k:
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, v{{[0-9]+}}
+; SI: v_or_b32_e32 [[OR:v[0-9]+]], 4, [[SHL]]
+; SI: buffer_store_dword [[OR]]
+define void @shl_or_k(i32 addrspace(1)* %out, i32 %in) {
+  %tmp0 = or i32 %in, 1
+  %tmp2 = shl i32 %tmp0, 2
+  store i32 %tmp2, i32 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}shl_or_k_two_uses:
+; SI: v_or_b32_e32 [[OR:v[0-9]+]], 1, v{{[0-9]+}}
+; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 2, [[OR]]
+; SI-DAG: buffer_store_dword [[OR]]
+; SI-DAG: buffer_store_dword [[SHL]]
+define void @shl_or_k_two_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %in) {
+  %tmp0 = or i32 %in, 1
+  %tmp2 = shl i32 %tmp0, 2
+  store i32 %tmp2, i32 addrspace(1)* %out0
+  store i32 %tmp0, i32 addrspace(1)* %out1
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
diff --git a/llvm/test/CodeGen/X86/combine-shl.ll b/llvm/test/CodeGen/X86/combine-shl.ll
index 560ba48..fbddd3c 100644
--- a/llvm/test/CodeGen/X86/combine-shl.ll
+++ b/llvm/test/CodeGen/X86/combine-shl.ll
@@ -537,19 +537,19 @@
   ret <4 x i32> %2
 }
 
-; FIXME: fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+; fold (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
 define <4 x i32> @combine_vec_shl_or0(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_shl_or0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    por {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pslld $2, %xmm0
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_or0:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [5,5,5,5]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpslld $2, %xmm0, %xmm0
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [20,20,20,20]
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = or  <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
   %2 = shl <4 x i32> %1, <i32 2, i32 2, i32 2, i32 2>
@@ -559,14 +559,14 @@
 define <4 x i32> @combine_vec_shl_or1(<4 x i32> %x) {
 ; SSE-LABEL: combine_vec_shl_or1:
 ; SSE:       # BB#0:
-; SSE-NEXT:    por {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
+; SSE-NEXT:    por {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_or1:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %1 = or  <4 x i32> %x, <i32 5, i32 6, i32 7, i32 8>
   %2 = shl <4 x i32> %1, <i32 1, i32 2, i32 3, i32 4>