[X86][SSE] Add support for combining VZEXT_MOVL target shuffles
This includes more general support for the pattern: VZEXT_MOVL(VZEXT_LOAD(ptr)) -> VZEXT_LOAD(ptr)
This has unearthed a couple of latent poor codegen issues (MINSS/MAXSS scalar load folding and MOVDDUP/BROADCAST load folding patterns), which will be fixed shortly.
It has also reduced a couple of tests so that they no longer reach the instruction threshold necessary for them to be combined to PSHUFB (see PR26183).
llvm-svn: 279646
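
For reference, a minimal sketch of what such a combine could look like in the X86 DAG combiner. The helper name combineVZextMovl and the exact guards are illustrative assumptions, not the committed code:

    // Illustrative only: a VZEXT_LOAD already zeroes the upper vector
    // elements, so a VZEXT_MOVL wrapped around it is redundant.
    static SDValue combineVZextMovl(SDNode *N, SelectionDAG &DAG) {
      SDValue Src = N->getOperand(0);
      if (Src.getOpcode() == X86ISD::VZEXT_LOAD &&
          Src.getValueType() == N->getValueType(0) && Src.hasOneUse())
        return Src; // Reuse the zero-extending load result directly.
      return SDValue();
    }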
diff --git a/llvm/test/CodeGen/X86/insertps-combine.ll b/llvm/test/CodeGen/X86/insertps-combine.ll
index b55a029..e801334 100644
--- a/llvm/test/CodeGen/X86/insertps-combine.ll
+++ b/llvm/test/CodeGen/X86/insertps-combine.ll
@@ -112,14 +112,12 @@
define <4 x float> @shuffle_v4f32_05zz(<4 x float> %a, <4 x float> %b) {
; SSE-LABEL: shuffle_v4f32_05zz:
; SSE: # BB#0:
-; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; SSE-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero
+; SSE-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; SSE-NEXT: retq
;
; AVX-LABEL: shuffle_v4f32_05zz:
; AVX: # BB#0:
-; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
-; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
; AVX-NEXT: retq
%shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 undef, i32 undef>
%shuffle1 = shufflevector <4 x float> %shuffle, <4 x float> <float undef, float undef, float 0.000000e+00, float 0.000000e+00>, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
diff --git a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
index bdc490f..cec0402 100644
--- a/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
+++ b/llvm/test/CodeGen/X86/vec_ss_load_fold.ll
@@ -9,7 +9,8 @@
; X32-NEXT: xorps %xmm1, %xmm1
; X32-NEXT: subss LCPI0_0, %xmm0
; X32-NEXT: mulss LCPI0_1, %xmm0
-; X32-NEXT: minss LCPI0_2, %xmm0
+; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT: minss %xmm2, %xmm0
; X32-NEXT: maxss %xmm1, %xmm0
; X32-NEXT: cvttss2si %xmm0, %eax
; X32-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -21,7 +22,8 @@
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
; X64-NEXT: subss {{.*}}(%rip), %xmm0
; X64-NEXT: mulss {{.*}}(%rip), %xmm0
-; X64-NEXT: minss {{.*}}(%rip), %xmm0
+; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT: minss %xmm2, %xmm0
; X64-NEXT: maxss %xmm1, %xmm0
; X64-NEXT: cvttss2si %xmm0, %eax
; X64-NEXT: ## kill: %AX<def> %AX<kill> %EAX<kill>
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
index e64ca96..f53d30a 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll
@@ -2176,29 +2176,18 @@
}
define <8 x i16> @insert_dup_mem_v8i16_i32(i32* %ptr) {
-; SSE2-LABEL: insert_dup_mem_v8i16_i32:
-; SSE2: # BB#0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: insert_dup_mem_v8i16_i32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: insert_dup_mem_v8i16_i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; SSE41-NEXT: retq
+; SSE-LABEL: insert_dup_mem_v8i16_i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_mem_v8i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_mem_v8i16_i32:
@@ -2257,29 +2246,18 @@
}
define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(i32* %ptr) {
-; SSE2-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSE2: # BB#0:
-; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: insert_dup_elt1_mem_v8i16_i32:
-; SSE41: # BB#0:
-; SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT: pshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
-; SSE41-NEXT: retq
+; SSE-LABEL: insert_dup_elt1_mem_v8i16_i32:
+; SSE: # BB#0:
+; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE-NEXT: retq
;
; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: retq
;
; AVX2-LABEL: insert_dup_elt1_mem_v8i16_i32:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
index c7d9ca8..29715e0 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -3564,7 +3564,9 @@
; AVX1-LABEL: insert_dup_elt3_mem_v16i16_i32:
; AVX1: # BB#0:
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,3,2,3,2,3,2,3,2,3,2,3,2,3,2,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1]
+; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
index 6e8fc5e..12ce9a2 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -2814,31 +2814,12 @@
ret <4 x float> %d
}
-; FIXME: Failed to recognise that the VMOVSD has already zero'd the upper element
define void @combine_scalar_load_with_blend_with_zero(double* %a0, <4 x float>* %a1) {
-; SSE2-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSE2: # BB#0:
-; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT: xorps %xmm1, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSE2-NEXT: movaps %xmm0, (%rsi)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSSE3: # BB#0:
-; SSSE3-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSSE3-NEXT: xorps %xmm1, %xmm1
-; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[3,0]
-; SSSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0,2]
-; SSSE3-NEXT: movaps %xmm0, (%rsi)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: combine_scalar_load_with_blend_with_zero:
-; SSE41: # BB#0:
-; SSE41-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE41-NEXT: movapd %xmm0, (%rsi)
-; SSE41-NEXT: retq
+; SSE-LABEL: combine_scalar_load_with_blend_with_zero:
+; SSE: # BB#0:
+; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movapd %xmm0, (%rsi)
+; SSE-NEXT: retq
;
; AVX-LABEL: combine_scalar_load_with_blend_with_zero:
; AVX: # BB#0:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
index e8d9aa2..bd59328 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -41,7 +41,8 @@
; ALL-NEXT: andq $-32, %rsp
; ALL-NEXT: subq $64, %rsp
; ALL-NEXT: vmovaps %ymm0, (%rsp)
-; ALL-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0]
+; ALL-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; ALL-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
; ALL-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; ALL-NEXT: movq %rbp, %rsp