[DAGCombiner] fold extract_subvector of extract_subvector
This is the sibling fold for the insert-of-insert combine that was added with D56604.
Now that we have x86 shuffle narrowing (D57156), this change shows improvements across
lots of AVX512 reduction code (it's not clear that we would ever see extract-of-extract otherwise).
There's a small regression in some of the partial-permute tests (an extract followed by a splat).
That is tracked by PR40500:
https://bugs.llvm.org/show_bug.cgi?id=40500
Differential Revision: https://reviews.llvm.org/D57336
llvm-svn: 352528
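
The combine itself is small. A minimal sketch of the fold follows; this is not the
verbatim patch, and the guards shown (constant indices on both nodes) are assumptions
for illustration. N is the outer EXTRACT_SUBVECTOR node being visited. The element at
position i of the result is X[C1 + C2 + i], so summing the two element indices
preserves the value exactly:

    // Sketch:  extract_subvector (extract_subvector X, C1), C2
    //            --> extract_subvector X, C1 + C2
    SDValue V = N->getOperand(0);
    if (V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
        isa<ConstantSDNode>(N->getOperand(1)) &&
        isa<ConstantSDNode>(V.getOperand(1))) {
      uint64_t OuterIdx = N->getConstantOperandVal(1);
      uint64_t InnerIdx = V.getConstantOperandVal(1);
      SDLoc DL(N);
      // Re-extract directly from the original wide vector at the summed index.
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, N->getValueType(0),
                         V.getOperand(0),
                         DAG.getIntPtrConstant(InnerIdx + OuterIdx, DL));
    }

In the test diff below, the win shows up as the tail of each reduction using the
xmm forms of vpaddq/vpaddd/vpaddw/vpaddb instead of the zmm forms.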
diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll b/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll
index f908495..a59eabb 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-add-widen.ll
@@ -117,7 +117,7 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
@@ -180,7 +180,7 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovq %xmm0, %rax
@@ -346,9 +346,9 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -417,9 +417,9 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddd %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -657,11 +657,11 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -741,11 +741,11 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddw %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vmovd %xmm0, %eax
@@ -1137,13 +1137,13 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax
@@ -1253,13 +1253,13 @@
; AVX512-NEXT: vextracti64x4 $1, %zmm0, %ymm1
; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
-; AVX512-NEXT: vpaddb %zmm1, %zmm0, %zmm0
+; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX512-NEXT: vpextrb $0, %xmm0, %eax