AVX-512: Implemented 256/128bit VALIGND/Q instructions for SKX and KNL
Implemented DAG lowering for all these forms.
Added tests for DAG lowering and encoding.
Differential Revision: http://reviews.llvm.org/D10310
llvm-svn: 239300
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 040779e..229795c 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -10132,7 +10132,8 @@
else if (Mask[i] - i != AlignVal)
return SDValue();
}
- return DAG.getNode(X86ISD::VALIGN, DL, VT, V1, V2,
+ // Vector source operands should be swapped
+ return DAG.getNode(X86ISD::VALIGN, DL, VT, V2, V1,
DAG.getConstant(AlignVal, DL, MVT::i8));
}
@@ -15167,6 +15168,30 @@
Src1,Src2),
Mask, PassThru, Subtarget, DAG);
}
+ case INTR_TYPE_3OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue PassThru = Op.getOperand(4);
+ SDValue Mask = Op.getOperand(5);
+ // We specify 2 possible opcodes for intrinsics with rounding modes.
+ // First, we check if the intrinsic may have non-default rounding mode,
+ // (IntrData->Opc1 != 0), then we check the rounding mode operand.
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
+ if (Round != X86::STATIC_ROUNDING::CUR_DIRECTION) {
+ return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, Op.getValueType(),
+ Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ }
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
+ Src1, Src2, Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
case FMA_OP_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
@@ -15309,16 +15334,6 @@
return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
Op.getOperand(2), Op.getOperand(1));
- case Intrinsic::x86_avx512_mask_valign_q_512:
- case Intrinsic::x86_avx512_mask_valign_d_512:
- // Vector source operands are swapped.
- return getVectorMaskingNode(DAG.getNode(X86ISD::VALIGN, dl,
- Op.getValueType(), Op.getOperand(2),
- Op.getOperand(1),
- Op.getOperand(3)),
- Op.getOperand(5), Op.getOperand(4),
- Subtarget, DAG);
-
// ptest and testp intrinsics. The intrinsic these come from are designed to
// return an integer value, not just an instruction so lower it to the ptest
// or testp pattern and a setcc for the result.
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 7d16c22..c1d0aef 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -5611,30 +5611,6 @@
(loadv8i64 addr:$src2), (i8 imm:$imm))),
(VSHUFPDZrmi VR512:$src1, addr:$src2, imm:$imm)>;
-multiclass avx512_valign<X86VectorVTInfo _> {
- defm rri : AVX512_maskable<0x03, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, u8imm:$src3),
- "valign"##_.Suffix,
- "$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86VAlign _.RC:$src2, _.RC:$src1,
- (i8 imm:$src3)))>,
- AVX512AIi8Base, EVEX_4V;
-
- // Also match valign of packed floats.
- def : Pat<(_.FloatVT (X86VAlign _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
- (!cast<Instruction>(NAME##rri) _.RC:$src2, _.RC:$src1, imm:$imm)>;
-
- let mayLoad = 1 in
- def rmi : AVX512AIi8<0x03, MRMSrcMem, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2, u8imm:$src3),
- !strconcat("valign"##_.Suffix,
- "\t{$src3, $src2, $src1, $dst|"
- "$dst, $src1, $src2, $src3}"),
- []>, EVEX_4V;
-}
-defm VALIGND : avx512_valign<v16i32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
-defm VALIGNQ : avx512_valign<v8i64_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
-
// Helper fragments to match sext vXi1 to vXiY.
def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
@@ -6121,7 +6097,7 @@
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
EVEX_V512;
}
@@ -6133,6 +6109,17 @@
}
}
+multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ }
+}
+
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
let Predicates = [prd] in {
@@ -6189,3 +6176,18 @@
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+
+multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
+ AVX512VLVectorVTInfo VTInfo_FP>{
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+ let isCodeGenOnly = 1 in {
+ defm NAME#_FP: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0x03, X86VAlign>,
+ AVX512AIi8Base, EVEX_4V;
+ }
+}
+
+defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info, avx512vl_f32_info>,
+ EVEX_CD8<32, CD8VF>;
+defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info, avx512vl_f64_info>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 44068f9..4d61b1f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1696,8 +1696,8 @@
{ X86::VPSUBQZrr, X86::VPSUBQZrm, 0 },
{ X86::VSHUFPDZrri, X86::VSHUFPDZrmi, 0 },
{ X86::VSHUFPSZrri, X86::VSHUFPSZrmi, 0 },
- { X86::VALIGNQrri, X86::VALIGNQrmi, 0 },
- { X86::VALIGNDrri, X86::VALIGNDrmi, 0 },
+ { X86::VALIGNQZrri, X86::VALIGNQZrmi, 0 },
+ { X86::VALIGNDZrri, X86::VALIGNDZrmi, 0 },
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
{ X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 8bf0d44..0268066 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -21,7 +21,8 @@
GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDPMC, RDTSC, XTEST, ADX,
INTR_TYPE_1OP, INTR_TYPE_2OP, INTR_TYPE_3OP,
CMP_MASK, CMP_MASK_CC, VSHIFT, VSHIFT_MASK, COMI,
- INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK, FMA_OP_MASK,
+ INTR_TYPE_1OP_MASK, INTR_TYPE_1OP_MASK_RM, INTR_TYPE_2OP_MASK,
+ INTR_TYPE_3OP_MASK, FMA_OP_MASK,
INTR_TYPE_SCALAR_MASK_RM, COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
EXPAND_FROM_MEM, BLEND
};
@@ -603,6 +604,8 @@
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_128, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_256, CMP_MASK_CC, X86ISD::CMPMU, 0),
X86_INTRINSIC_DATA(avx512_mask_ucmp_w_512, CMP_MASK_CC, X86ISD::CMPMU, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_d_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
+ X86_INTRINSIC_DATA(avx512_mask_valign_q_512, INTR_TYPE_3OP_MASK, X86ISD::VALIGN, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),
X86_INTRINSIC_DATA(avx512_mask_xor_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FXOR, 0),