[X86] Add SchedRW for PMULLD
Summary:
It seems many CPUs don't implement this instruction as well as the other vector multiplies. Often using a multi uop flow. Silvermont in particular has a 7 uop flow with 11 cycle throughput. Sandy Bridge implements it as a single uop with 5 cycle latency and 1 cycle throughput. But Haswell and later use 2 uops with 10 cycle latency and 2 cycle throughput.
This patch adds a new X86SchedWritePair we can use to tag this instruction separately. I've provided correct information for Silvermont, Btver2, and Sandy Bridge. I've removed the InstRWs for SandyBridge. I've left Haswell/Broadwell/Skylake InstRWs in place because I wasn't sure how to account for the different load latency between 128 and 256 bits. I also left Znver1 InstRWs in place because the existing values don't match Agner's spreadsheet.
I also left a FIXME in the SandyBridge model because it being used for the "generic" model is too optimistic for the 256/512-bit versions since those are multiple uops on all known CPUs.
Reviewers: RKSimon, GGanesh, courbet
Reviewed By: RKSimon
Subscribers: gchatelet, gbedwell, andreadb, llvm-commits
Differential Revision: https://reviews.llvm.org/D44972
llvm-svn: 328914
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 663f2d1..188a167 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -4505,7 +4505,7 @@
defm VPSUBUS : avx512_binop_rm_vl_bw<0xD8, 0xD9, "vpsubus", X86subus,
SSE_INTALU_ITINS_P, HasBWI, 0>;
defm VPMULLD : avx512_binop_rm_vl_d<0x40, "vpmulld", mul,
- SSE_INTMUL_ITINS_P, HasAVX512, 1>, T8PD;
+ SSE_PMULLD_ITINS, HasAVX512, 1>, T8PD;
defm VPMULLW : avx512_binop_rm_vl_w<0xD5, "vpmullw", mul,
SSE_INTMUL_ITINS_P, HasBWI, 1>;
defm VPMULLQ : avx512_binop_rm_vl_q<0x40, "vpmullq", mul,
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 57b296a..5001f11 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -195,7 +195,7 @@
IIC_SSE_MPSADBW_RR, IIC_SSE_MPSADBW_RM
>;
-let Sched = WriteVecIMul in
+let Sched = WritePMULLD in
def SSE_PMULLD_ITINS : OpndItins<
IIC_SSE_PMULLD_RR, IIC_SSE_PMULLD_RM
>;
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 2c264e3..b3b2efb 100755
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -167,6 +167,7 @@
defm : BWWriteResPair<WriteVecALU, [BWPort15], 1>; // Vector integer ALU op, no logicals.
defm : BWWriteResPair<WriteVecShift, [BWPort0], 1>; // Vector integer shifts.
defm : BWWriteResPair<WriteVecIMul, [BWPort0], 5>; // Vector integer multiply.
+defm : BWWriteResPair<WritePMULLD, [BWPort0], 10, [2], 2, 5>; // PMULLD
defm : BWWriteResPair<WriteShuffle, [BWPort5], 1>; // Vector shuffles.
defm : BWWriteResPair<WriteBlend, [BWPort15], 1>; // Vector blends.
defm : BWWriteResPair<WriteVarBlend, [BWPort5], 2, [2]>; // Vector variable blends.
@@ -2180,13 +2181,6 @@
def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm",
"LSL(16|32|64)rm")>;
-def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[BWWriteResGroup114], (instregex "(V?)PMULLD(Y?)rr")>;
-
def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> {
let Latency = 10;
let NumMicroOps = 2;
@@ -2462,13 +2456,6 @@
"DIVR_FST0r",
"DIVR_FrST0")>;
-def BWWriteResGroup148 : SchedWriteRes<[BWPort0,BWPort23]> {
- let Latency = 15;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[BWWriteResGroup148], (instregex "(V?)PMULLDrm")>;
-
def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
let Latency = 15;
let NumMicroOps = 10;
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index bd16dc6..b259801 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -163,6 +163,7 @@
defm : HWWriteResPair<WriteVecLogic, [HWPort015], 1>;
defm : HWWriteResPair<WriteVecALU, [HWPort15], 1>;
defm : HWWriteResPair<WriteVecIMul, [HWPort0], 5>;
+defm : HWWriteResPair<WritePMULLD, [HWPort0], 10, [2], 2, 6>;
defm : HWWriteResPair<WriteShuffle, [HWPort5], 1>;
defm : HWWriteResPair<WriteBlend, [HWPort15], 1>;
defm : HWWriteResPair<WriteShuffle256, [HWPort5], 3>;
@@ -2680,20 +2681,6 @@
}
def: InstRW<[HWWriteResGroup117], (instregex "(V?)DPPDrmi")>;
-def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[HWWriteResGroup118], (instregex "(V?)PMULLD(Y?)rr")>;
-
-def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [2,1];
-}
-def: InstRW<[HWWriteResGroup119], (instregex "(V?)PMULLDrm")>;
-
def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> {
let Latency = 17;
let NumMicroOps = 3;
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 7316de6..85d9a89 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -151,6 +151,7 @@
defm : SBWriteResPair<WriteVecLogic, [SBPort5], 1>;
defm : SBWriteResPair<WriteVecALU, [SBPort1], 3>;
defm : SBWriteResPair<WriteVecIMul, [SBPort0], 5>;
+defm : SBWriteResPair<WritePMULLD, [SBPort0], 5, [1], 1, 6>; // TODO this is probably wrong for 256/512-bit for the "generic" model
defm : SBWriteResPair<WriteShuffle, [SBPort5], 1>;
defm : SBWriteResPair<WriteBlend, [SBPort15], 1>;
defm : SBWriteResPair<WriteVarBlend, [SBPort1, SBPort5], 2>;
@@ -672,7 +673,6 @@
"(V?)PMULHRSWrr",
"(V?)PMULHUWrr",
"(V?)PMULHWrr",
- "(V?)PMULLDrr",
"(V?)PMULLWrr",
"(V?)PMULUDQrr",
"(V?)PSADBWrr")>;
@@ -1602,7 +1602,6 @@
"(V?)PMULHRSWrm",
"(V?)PMULHUWrm",
"(V?)PMULHWrm",
- "(V?)PMULLDrm",
"(V?)PMULLWrm",
"(V?)PMULUDQrm",
"(V?)PSADBWrm")>;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 2a6658e..de54837 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -164,6 +164,7 @@
defm : SKLWriteResPair<WriteVecALU, [SKLPort15], 1>; // Vector integer ALU op, no logicals.
defm : SKLWriteResPair<WriteVecShift, [SKLPort0], 1>; // Vector integer shifts.
defm : SKLWriteResPair<WriteVecIMul, [SKLPort0], 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>;
defm : SKLWriteResPair<WriteShuffle, [SKLPort5], 1>; // Vector shuffles.
defm : SKLWriteResPair<WriteBlend, [SKLPort15], 1>; // Vector blends.
defm : SKLWriteResPair<WriteVarBlend, [SKLPort5], 2, [2]>; // Vector variable blends.
@@ -1849,13 +1850,6 @@
"(V?)ROUNDSDr",
"(V?)ROUNDSSr")>;
-def SKLWriteResGroup105_2 : SchedWriteRes<[SKLPort01]> {
- let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
-}
-def: InstRW<[SKLWriteResGroup105_2], (instregex "(V?)PMULLD(Y?)rr")>;
-
def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> {
let Latency = 8;
let NumMicroOps = 2;
@@ -2559,13 +2553,6 @@
def: InstRW<[SKLWriteResGroup168], (instregex "(V?)ROUNDSDm")>;
def: InstRW<[SKLWriteResGroup168], (instregex "(V?)ROUNDSSm")>;
-def SKLWriteResGroup168_2 : SchedWriteRes<[SKLPort23,SKLPort01]> {
- let Latency = 16;
- let NumMicroOps = 3;
- let ResourceCycles = [1,2];
-}
-def: InstRW<[SKLWriteResGroup168_2], (instregex "(V?)PMULLDrm")>;
-
def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
let Latency = 14;
let NumMicroOps = 3;
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 7f336fd..4abc415 100755
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -164,6 +164,7 @@
defm : SKXWriteResPair<WriteVecALU, [SKXPort15], 1>; // Vector integer ALU op, no logicals.
defm : SKXWriteResPair<WriteVecShift, [SKXPort0], 1>; // Vector integer shifts.
defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WritePMULLD, [SKXPort015], 10, [2], 2, 6>; // Vector integer multiply.
defm : SKXWriteResPair<WriteShuffle, [SKXPort5], 1>; // Vector shuffles.
defm : SKXWriteResPair<WriteBlend, [SKXPort15], 1>; // Vector blends.
defm : SKXWriteResPair<WriteVarBlend, [SKXPort5], 2, [2]>; // Vector variable blends.
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index b5cb26c..3fee3a7 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -100,6 +100,7 @@
defm WriteVecALU : X86SchedWritePair; // Vector integer ALU op, no logicals.
defm WriteVecShift : X86SchedWritePair; // Vector integer shifts.
defm WriteVecIMul : X86SchedWritePair; // Vector integer multiply.
+defm WritePMULLD : X86SchedWritePair; // PMULLD
defm WriteShuffle : X86SchedWritePair; // Vector shuffles.
defm WriteBlend : X86SchedWritePair; // Vector blends.
defm WriteVarBlend : X86SchedWritePair; // Vector variable blends.
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index f098fce..86ee44b 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -345,6 +345,7 @@
defm : JWriteResFpuPair<WriteVecALU, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecShift, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteVecIMul, [JFPU0, JVIMUL], 2>;
+defm : JWriteResFpuPair<WritePMULLD, [JFPU0, JFPU01, JVIMUL, JVALU], 4, [2, 1, 2, 1], 3>;
defm : JWriteResFpuPair<WriteMPSAD, [JFPU0, JVIMUL], 3, [1, 2]>;
defm : JWriteResFpuPair<WriteShuffle, [JFPU01, JVALU], 1>;
defm : JWriteResFpuPair<WriteBlend, [JFPU01, JVALU], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 64a2ec1..81f9c37 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -138,6 +138,7 @@
defm : SLMWriteResPair<WriteVecLogic, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecALU, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WriteVecIMul, [SLM_FPC_RSV0], 4>;
+defm : SLMWriteResPair<WritePMULLD, [SLM_FPC_RSV0], 11, [11], 7>;
defm : SLMWriteResPair<WriteShuffle, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteBlend, [SLM_FPC_RSV0], 1>;
defm : SLMWriteResPair<WriteMPSAD, [SLM_FPC_RSV0], 7>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 74f32bc..ccdf23f 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -213,6 +213,7 @@
defm : ZnWriteResFpuPair<WritePHAdd, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecALU, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteVecIMul, [ZnFPU0], 4>;
+defm : ZnWriteResFpuPair<WritePMULLD, [ZnFPU0], 4>; // FIXME
defm : ZnWriteResFpuPair<WriteShuffle, [ZnFPU], 1>;
defm : ZnWriteResFpuPair<WriteBlend, [ZnFPU01], 1>;
defm : ZnWriteResFpuPair<WriteShuffle256, [ZnFPU], 2>;