AVX-512: Implemented the 256/128-bit VALIGND/VALIGNQ instructions for SKX, and reworked the existing 512-bit form used on KNL.
Implemented DAG lowering for all of these forms.
Added tests for both DAG lowering and encoding.
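
For illustration (abridged from the new avx512-shuffle.ll tests below), a
shuffle that selects a contiguous run of elements from the concatenation of
two vectors now lowers to a single valign, with the vector sources swapped:

    %c = shufflevector <16 x i32> %a, <16 x i32> %b,
           <16 x i32> <i32 3, i32 4, ..., i32 18>
    ; lowers to: valignd $3, %zmm0, %zmm1, %zmm0
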
Differential Revision: http://reviews.llvm.org/D10310
llvm-svn: 239300
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 040779e..229795c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10132,7 +10132,8 @@
else if (Mask[i] - i != AlignVal)
return SDValue();
}
- return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2,
+  // The shuffle mask indexes elements of concat(V1, V2) with V1 lowest, but
+  // VALIGN takes its low elements from the second source, so swap V1 and V2.
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
DAG.getConstant(AlignVal, DL, MVT::i8));
}
@@ -15167,6 +15168,30 @@
Src1,Src2),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_3OP_MASK: {
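+    // Operands: 1..3 are the sources, 4 is the pass-through value, 5 is the
+    // mask; a rounding-mode operand, when present, follows as operand 6.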
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+    // We specify two possible opcodes for intrinsics with rounding modes.
+    // First check whether the intrinsic supports a non-default rounding mode
+    // (IntrData->Opc1 != 0); if so, inspect the rounding-mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
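+    // No rounding-mode operand, or current-direction rounding requested:
+    // lower with the default opcode.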
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -15309,16 +15334,6 @@
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
- case Intrinsic::x86_avx512_mask_valign_q_512:
- case Intrinsic::x86_avx512_mask_valign_d_512:
- // Vector source operands are swapped.
- return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
- Op.getValueType(), Op.getOperand(2),
- Op.getOperand(1),
- Op.getOperand(3)),
- Op.getOperand(5), Op.getOperand(4),
- Subtarget, DAG);
-
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 7d16c22..c1d0aef 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5611,30 +5611,6 @@
(loadv8i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
-multiclass avx512_valign<X86VectorVTInfo _> {
- defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
- "valign"##_.Suffix,
- "$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
- (i8 imm:$src3)))>,
- AVX512AIi8Base, EVEX_4V;
-
- // Also match valign of packed floats.
- def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
- (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
-
- let mayLoad = 1 in
- def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
- !strconcat("valign"##_.Suffix,
- "\t{$src3, $src2, $src1, $dst|"
- "$dst, $src1, $src2, $src3}"),
- []>, EVEX_4V;
-}
-defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
@@ -6121,7 +6097,7 @@
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512;
}
@@ -6133,6 +6109,17 @@
}
}
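+// Instantiates the 512-bit form under HasAVX512 and, with HasVLX, the
+// 128/256-bit forms as well.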
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
@@ -6189,3 +6176,18 @@
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+ let isCodeGenOnly = 1 in {
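+    // The packed-FP variants share the integer encoding; keeping them
+    // codegen-only avoids duplicate assembler matches for the same mnemonic.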
+ defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+ }
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 44068f9..4d61b1f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1696,8 +1696,8 @@
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
- { X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 8bf0d44..0268066 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -21,7 +21,8 @@
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
- INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK,
+ INTR_TYPE_3OP_MASK, FMA_OP_MASK,
INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
EXPAND_FROM_MEM, BLEND
};
@@ -603,6 +604,8 @@
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK,
+                     X86ISD::VALIGN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK,
+                     X86ISD::VALIGN, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
diff --git a/llvm/test/CodeGen/X86/avx512-shuffle.ll b/llvm/test/CodeGen/X86/avx512-shuffle.ll
index 8411fee..2683d6f 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffle.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffle.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK-SKX
; CHECK-LABEL: test1:
; CHECK: vpermps
@@ -250,3 +251,86 @@
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 9, i32 1, i32 10, i32 2, i32 undef, i32 5, i32 15, i32 undef>
ret <8 x double> %c
}
+
+define <16 x i32> @test_align_v16i32_rr(<16 x i32> %a, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignd $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm(<16 x i32>* %a.ptr, <16 x i32> %b) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignd $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = load <16 x i32>, <16 x i32>* %a.ptr
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ ret <16 x i32> %c
+}
+
+define <16 x i32> @test_align_v16i32_rm_mask(<16 x i32>* %a.ptr, <16 x i32> %b, <16 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v16i32_rm_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxbd %xmm1, %zmm1
+; CHECK-NEXT: vpandd {{.*}}(%rip){1to16}, %zmm1, %zmm1
+; CHECK-NEXT: vptestmd %zmm1, %zmm1, %k1
+; CHECK-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-NEXT: retq
+;
+; CHECK-SKX-LABEL: test_align_v16i32_rm_mask:
+; CHECK-SKX: ## BB#0:
+; CHECK-SKX-NEXT: vpmovb2m %xmm1, %k1
+; CHECK-SKX-NEXT: vmovdqa32 (%rdi), %zmm1
+; CHECK-SKX-NEXT: valignd $3, %zmm1, %zmm0, %zmm1 {%k1}
+; CHECK-SKX-NEXT: vmovaps %zmm1, %zmm0
+; CHECK-SKX-NEXT: retq
+ %a = load <16 x i32>, <16 x i32>* %a.ptr
+ %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 3, i32 4, i32 5, i32 undef, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18>
+ %res = select <16 x i1> %mask, <16 x i32> %c, <16 x i32> %a
+ ret <16 x i32> %res
+}
+
+define <8 x double> @test_align_v8f64_rr(<8 x double> %a, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v8f64_rr:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignq $3, %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: retq
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v8f64_rm(<8 x double>* %a.ptr, <8 x double> %b) nounwind {
+; CHECK-LABEL: test_align_v8f64_rm:
+; CHECK: ## BB#0:
+; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0
+; CHECK-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a.ptr
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ ret <8 x double> %c
+}
+
+define <8 x double> @test_align_v8f64_rm_mask(<8 x double>* %a.ptr, <8 x double> %b, <8 x i1> %mask) nounwind {
+; CHECK-LABEL: test_align_v8f64_rm_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vpmovsxwq %xmm1, %zmm1
+; CHECK-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm1, %zmm1
+; CHECK-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT: retq
+;
+; CHECK-SKX-LABEL: test_align_v8f64_rm_mask:
+; CHECK-SKX: ## BB#0:
+; CHECK-SKX-NEXT: vpmovw2m %xmm1, %k1
+; CHECK-SKX-NEXT: valignq $3, (%rdi), %zmm0, %zmm0 {%k1} {z}
+; CHECK-SKX-NEXT: retq
+ %a = load <8 x double>, <8 x double>* %a.ptr
+ %c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10>
+ %res = select <8 x i1> %mask, <8 x double> %c, <8 x double> zeroinitializer
+ ret <8 x double> %res
+}
diff --git a/llvm/test/MC/X86/avx512-encodings.s b/llvm/test/MC/X86/avx512-encodings.s
index ba467af..ca0fccb 100644
--- a/llvm/test/MC/X86/avx512-encodings.s
+++ b/llvm/test/MC/X86/avx512-encodings.s
@@ -6084,6 +6084,66 @@
// CHECK: encoding: [0x62,0xf3,0xfd,0x49,0x03,0xcb,0x03]
valignq $3, %zmm3, %zmm0, %zmm1 {%k1}
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0xab]
+ valignq $0xab, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3}
+// CHECK: encoding: [0x62,0x23,0xdd,0x4b,0x03,0xe7,0xab]
+ valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3}
+
+// CHECK: valignq $171, %zmm23, %zmm4, %zmm28 {%k3} {z}
+// CHECK: encoding: [0x62,0x23,0xdd,0xcb,0x03,0xe7,0xab]
+ valignq $0xab, %zmm23, %zmm4, %zmm28 {%k3} {z}
+
+// CHECK: valignq $123, %zmm23, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xe7,0x7b]
+ valignq $0x7b, %zmm23, %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x21,0x7b]
+ valignq $0x7b, (%rcx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x23,0xdd,0x48,0x03,0xa4,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ valignq $0x7b, 291(%rax,%r14,8), %zmm4, %zmm28
+
+// CHECK: valignq $123, (%rcx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x21,0x7b]
+ valignq $0x7b, (%rcx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 8128(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x7f,0x7b]
+ valignq $0x7b, 8128(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0x00,0x20,0x00,0x00,0x7b]
+ valignq $0x7b, 8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8192(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0x62,0x80,0x7b]
+ valignq $0x7b, -8192(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, -8256(%rdx), %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x48,0x03,0xa2,0xc0,0xdf,0xff,0xff,0x7b]
+ valignq $0x7b, -8256(%rdx), %zmm4, %zmm28
+
+// CHECK: valignq $123, 1016(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x7f,0x7b]
+ valignq $0x7b, 1016(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, 1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0x00,0x04,0x00,0x00,0x7b]
+ valignq $0x7b, 1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1024(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0x62,0x80,0x7b]
+ valignq $0x7b, -1024(%rdx){1to8}, %zmm4, %zmm28
+
+// CHECK: valignq $123, -1032(%rdx){1to8}, %zmm4, %zmm28
+// CHECK: encoding: [0x62,0x63,0xdd,0x58,0x03,0xa2,0xf8,0xfb,0xff,0xff,0x7b]
+ valignq $0x7b, -1032(%rdx){1to8}, %zmm4, %zmm28
+
// CHECK: vextractf32x4 $3
// CHECK: encoding: [0x62,0xf3,0x7d,0x49,0x19,0xd9,0x03]
vextractf32x4 $3, %zmm3, %xmm1 {%k1}
diff --git a/llvm/test/MC/X86/x86-64-avx512f_vl.s b/llvm/test/MC/X86/x86-64-avx512f_vl.s
index 983e879..f521b3e 100644
--- a/llvm/test/MC/X86/x86-64-avx512f_vl.s
+++ b/llvm/test/MC/X86/x86-64-avx512f_vl.s
@@ -11013,3 +11013,122 @@
// CHECK: encoding: [0x62,0x63,0xad,0x30,0x43,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
vshufi64x2 $0x7b, -1032(%rdx){1to4}, %ymm26, %ymm25
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0xab]
+ valignq $0xab, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5}
+// CHECK: encoding: [0x62,0x83,0xed,0x05,0x03,0xd8,0xab]
+ valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5}
+
+// CHECK: valignq $171, %xmm24, %xmm18, %xmm19 {%k5} {z}
+// CHECK: encoding: [0x62,0x83,0xed,0x85,0x03,0xd8,0xab]
+ valignq $0xab, %xmm24, %xmm18, %xmm19 {%k5} {z}
+
+// CHECK: valignq $123, %xmm24, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0x83,0xed,0x00,0x03,0xd8,0x7b]
+ valignq $0x7b, %xmm24, %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x19,0x7b]
+ valignq $0x7b, (%rcx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xa3,0xed,0x00,0x03,0x9c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ valignq $0x7b, 291(%rax,%r14,8), %xmm18, %xmm19
+
+// CHECK: valignq $123, (%rcx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x19,0x7b]
+ valignq $0x7b, (%rcx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 2032(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x7f,0x7b]
+ valignq $0x7b, 2032(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0x00,0x08,0x00,0x00,0x7b]
+ valignq $0x7b, 2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2048(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x5a,0x80,0x7b]
+ valignq $0x7b, -2048(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, -2064(%rdx), %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x00,0x03,0x9a,0xf0,0xf7,0xff,0xff,0x7b]
+ valignq $0x7b, -2064(%rdx), %xmm18, %xmm19
+
+// CHECK: valignq $123, 1016(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x7f,0x7b]
+ valignq $0x7b, 1016(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, 1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0x00,0x04,0x00,0x00,0x7b]
+ valignq $0x7b, 1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1024(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x5a,0x80,0x7b]
+ valignq $0x7b, -1024(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $123, -1032(%rdx){1to2}, %xmm18, %xmm19
+// CHECK: encoding: [0x62,0xe3,0xed,0x10,0x03,0x9a,0xf8,0xfb,0xff,0xff,0x7b]
+ valignq $0x7b, -1032(%rdx){1to2}, %xmm18, %xmm19
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0xab]
+ valignq $0xab, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2}
+// CHECK: encoding: [0x62,0x03,0xbd,0x22,0x03,0xca,0xab]
+ valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2}
+
+// CHECK: valignq $171, %ymm26, %ymm24, %ymm25 {%k2} {z}
+// CHECK: encoding: [0x62,0x03,0xbd,0xa2,0x03,0xca,0xab]
+ valignq $0xab, %ymm26, %ymm24, %ymm25 {%k2} {z}
+
+// CHECK: valignq $123, %ymm26, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x03,0xbd,0x20,0x03,0xca,0x7b]
+ valignq $0x7b, %ymm26, %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x09,0x7b]
+ valignq $0x7b, (%rcx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 291(%rax,%r14,8), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x23,0xbd,0x20,0x03,0x8c,0xf0,0x23,0x01,0x00,0x00,0x7b]
+ valignq $0x7b, 291(%rax,%r14,8), %ymm24, %ymm25
+
+// CHECK: valignq $123, (%rcx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x09,0x7b]
+ valignq $0x7b, (%rcx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 4064(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x7f,0x7b]
+ valignq $0x7b, 4064(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0x00,0x10,0x00,0x00,0x7b]
+ valignq $0x7b, 4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4096(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x4a,0x80,0x7b]
+ valignq $0x7b, -4096(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, -4128(%rdx), %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x20,0x03,0x8a,0xe0,0xef,0xff,0xff,0x7b]
+ valignq $0x7b, -4128(%rdx), %ymm24, %ymm25
+
+// CHECK: valignq $123, 1016(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x7f,0x7b]
+ valignq $0x7b, 1016(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, 1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0x00,0x04,0x00,0x00,0x7b]
+ valignq $0x7b, 1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1024(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x4a,0x80,0x7b]
+ valignq $0x7b, -1024(%rdx){1to4}, %ymm24, %ymm25
+
+// CHECK: valignq $123, -1032(%rdx){1to4}, %ymm24, %ymm25
+// CHECK: encoding: [0x62,0x63,0xbd,0x30,0x03,0x8a,0xf8,0xfb,0xff,0xff,0x7b]
+ valignq $0x7b, -1032(%rdx){1to4}, %ymm24, %ymm25