[AVX-512] Remove patterns that select vmovdqu8/16 for unmasked loads. Prefer vmovdqa64/vmovdqu64 instead.
These were taking priority over the aligned load instructions since there is no vmovdqa8/16. I don't think there is really a difference between aligned and unaligned loads on newer CPUs, so it shouldn't matter which instructions we use.
But with this change we reduce the size of the isel table a little, and we allow the alignment information to pass through to the EVEX->VEX pass, producing the same output as AVX/AVX2 in some cases.
I also generally dislike patterns rooted in a bitcast, which these were.
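
As a rough illustration (hypothetical function name and llc invocation, not a test from this patch), an aligned 512-bit v32i16 load that previously matched the bitcast-rooted vmovdqu16 pattern should now select vmovdqa64:

  define <32 x i16> @aligned_load_v32i16(<32 x i16>* %p) {
    %v = load <32 x i16>, <32 x i16>* %p, align 64
    ret <32 x i16> %v
  }

  ; llc -mtriple=x86_64-unknown-unknown -mattr=+avx512bw
  ; before: vmovdqu16 (%rdi), %zmm0
  ; after:  vmovdqa64 (%rdi), %zmm0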
Differential Revision: https://reviews.llvm.org/D35977
llvm-svn: 309589
diff --git a/llvm/test/CodeGen/X86/vector-rotate-512.ll b/llvm/test/CodeGen/X86/vector-rotate-512.ll
index fa1b5c1..9403ea1 100644
--- a/llvm/test/CodeGen/X86/vector-rotate-512.ll
+++ b/llvm/test/CodeGen/X86/vector-rotate-512.ll
@@ -81,7 +81,7 @@
;
; AVX512BW-LABEL: var_rotate_v32i16:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512BW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
@@ -90,7 +90,7 @@
;
; AVX512VLBW-LABEL: var_rotate_v32i16:
; AVX512VLBW: # BB#0:
-; AVX512VLBW-NEXT: vmovdqu16 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT: vpsubw %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllvw %zmm1, %zmm0, %zmm1
; AVX512VLBW-NEXT: vpsrlvw %zmm2, %zmm0, %zmm0
@@ -226,7 +226,7 @@
;
; AVX512BW-LABEL: var_rotate_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512BW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -261,7 +261,7 @@
;
; AVX512VLBW-LABEL: var_rotate_v64i8:
; AVX512VLBW: # BB#0:
-; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
; AVX512VLBW-NEXT: vpsubb %zmm1, %zmm2, %zmm2
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -483,7 +483,7 @@
;
; AVX512BW-LABEL: constant_rotate_v64i8:
; AVX512BW: # BB#0:
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -496,7 +496,7 @@
; AVX512BW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
-; AVX512BW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
; AVX512BW-NEXT: vpmovb2m %zmm1, %k1
; AVX512BW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512BW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3
@@ -516,7 +516,7 @@
;
; AVX512VLBW-LABEL: constant_rotate_v64i8:
; AVX512VLBW: # BB#0:
-; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsllw $4, %zmm0, %zmm2
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm2, %zmm2
@@ -529,7 +529,7 @@
; AVX512VLBW-NEXT: vpaddb %zmm1, %zmm1, %zmm1
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpaddb %zmm2, %zmm2, %zmm2 {%k1}
-; AVX512VLBW-NEXT: vmovdqu8 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
+; AVX512VLBW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536,57600,41152,24704,8256,8192,24640,41088,57536]
; AVX512VLBW-NEXT: vpmovb2m %zmm1, %k1
; AVX512VLBW-NEXT: vpsrlw $4, %zmm0, %zmm3
; AVX512VLBW-NEXT: vpandq {{.*}}(%rip), %zmm3, %zmm3