[MC][X86] Correctly model additional operand latency caused by transfer delays from the integer to the floating point unit.
This patch adds a new ReadAdvance definition named ReadInt2Fpu.
ReadInt2Fpu allows x86 scheduling models to accurately describe delays caused by
data transfers from the integer unit to the floating point unit.
ReadInt2Fpu currently defaults to a delay of zero cycles (i.e. no delay) for all
x86 models excluding BtVer2. That means, this patch is a functional change
for the Jaguar cpu model only.
Tablegen definitions for instructions (V)PINSR* have been updated to account for
the new ReadInt2Fpu. That read is mapped to the GPR input operand.
On Jaguar, int-to-fpu transfers are modeled as a +6cy delay. Before this patch,
that extra delay was added to the opcode latency. In practice, the insert opcode
only executes for 1cy. Most of the actual latency is contributed by the
so-called operand-latency. According to the AMD SOG for family 16h, (V)PINSR*
latency is defined by expression f+1, where f is defined as a forwarding delay
from the integer unit to the fpu.
When printing instruction latency from MCA (see InstructionInfoView.cpp) and LLC
(only when flag -print-schedule is specified), we now need to account for any
extra forwarding delays. We do this by checking if scheduling classes declare
any negative ReadAdvance entries. Quoting a code comment in TargetSchedule.td:
"A negative advance effectively increases latency, which may be used for
cross-domain stalls". When computing the instruction latency for the purpose of
our scheduling tests, we now add any extra delay to the formula. This avoids
regressing existing codegen and mca schedule tests. It comes with the cost of an
extra (but very simple) hook in MCSchedModel.
Differential Revision: https://reviews.llvm.org/D57056
llvm-svn: 351965
diff --git a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
index c9f90b8..e34f9a1 100644
--- a/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
+++ b/llvm/lib/CodeGen/TargetSubtargetInfo.cpp
@@ -88,6 +88,12 @@
   TargetSchedModel TSchedModel;
   TSchedModel.init(this);
   unsigned Latency = TSchedModel.computeInstrLatency(&MI);
+
+  // Add extra latency due to forwarding delays.
+  const MCSchedClassDesc &SCDesc = *TSchedModel.resolveSchedClass(&MI);
+  Latency +=
+      MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+
   double RThroughput = TSchedModel.computeReciprocalThroughput(&MI);
   return createSchedInfoStr(Latency, RThroughput);
 }
@@ -99,9 +105,17 @@
   TargetSchedModel TSchedModel;
   TSchedModel.init(this);
   unsigned Latency;
-  if (TSchedModel.hasInstrSchedModel())
+  if (TSchedModel.hasInstrSchedModel()) {
     Latency = TSchedModel.computeInstrLatency(MCI);
-  else if (TSchedModel.hasInstrItineraries()) {
+    // Add extra latency due to forwarding delays.
+    const MCSchedModel &SM = *TSchedModel.getMCSchedModel();
+    unsigned SClassID = getInstrInfo()->get(MCI.getOpcode()).getSchedClass();
+    while (SM.getSchedClassDesc(SClassID)->isVariant())
+      SClassID = resolveVariantSchedClass(SClassID, &MCI, SM.ProcID);
+    const MCSchedClassDesc &SCDesc = *SM.getSchedClassDesc(SClassID);  
+    Latency +=
+        MCSchedModel::getForwardingDelayCycles(getReadAdvanceEntries(SCDesc));
+  } else if (TSchedModel.hasInstrItineraries()) {
     auto *ItinData = TSchedModel.getInstrItineraries();
     Latency = ItinData->getStageLatency(
         getInstrInfo()->get(MCI.getOpcode()).getSchedClass());
diff --git a/llvm/lib/MC/MCSchedule.cpp b/llvm/lib/MC/MCSchedule.cpp
index 6797a47..1fc5ec5 100644
--- a/llvm/lib/MC/MCSchedule.cpp
+++ b/llvm/lib/MC/MCSchedule.cpp
@@ -149,3 +149,19 @@
   // that it can execute at the maximum default issue width.
   return 1.0 / DefaultIssueWidth;
 }
+
+unsigned
+MCSchedModel::getForwardingDelayCycles(ArrayRef<MCReadAdvanceEntry> Entries,
+                                       unsigned WriteResourceID) {
+  if (Entries.empty())
+    return 0;
+
+  int DelayCycles = 0;
+  for (const MCReadAdvanceEntry &E : Entries) {
+    if (E.WriteResourceID != WriteResourceID)
+      continue;
+    DelayCycles = std::min(DelayCycles, E.Cycles);
+  }
+
+  return std::abs(DelayCycles);
+}
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index 4b0ec32..1e08f89 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -532,6 +532,7 @@
   // Create a new empty descriptor.
   std::unique_ptr<InstrDesc> ID = llvm::make_unique<InstrDesc>();
   ID->NumMicroOps = SCDesc.NumMicroOps;
+  ID->SchedClassID = SchedClassID;
 
   if (MCDesc.isCall() && FirstCallInst) {
     // We don't correctly model calls.
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index c00c4f4..8e2a45b 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -543,7 +543,7 @@
                     "pinsrw\t{$src3, $src2, $dst|$dst, $src2, $src3}",
                     [(set VR64:$dst, (int_x86_mmx_pinsr_w VR64:$src1,
                                       GR32orGR64:$src2, imm:$src3))]>,
-                    Sched<[WriteVecInsert]>;
+                    Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
 
   def MMX_PINSRWrm : MMXIi8<0xC4, MRMSrcMem,
                    (outs VR64:$dst),
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 807af7f..e6427b1 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -4122,7 +4122,7 @@
            "vpinsrw\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
        [(set VR128:$dst,
          (X86pinsrw VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
-       Sched<[WriteVecInsert]>;
+       Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : Ii8<0xC4, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1,
                        i16mem:$src2, u8imm:$src3),
@@ -5577,7 +5577,7 @@
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
         (X86pinsrb VR128:$src1, GR32orGR64:$src2, imm:$src3))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i8mem:$src2, u8imm:$src3),
       !if(Is2Addr,
@@ -5603,7 +5603,7 @@
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
         (v4i32 (insertelt VR128:$src1, GR32:$src2, imm:$src3)))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i32mem:$src2, u8imm:$src3),
       !if(Is2Addr,
@@ -5629,7 +5629,7 @@
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
         (v2i64 (insertelt VR128:$src1, GR64:$src2, imm:$src3)))]>,
-      Sched<[WriteVecInsert]>;
+      Sched<[WriteVecInsert, ReadDefault, ReadInt2Fpu]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, i64mem:$src2, u8imm:$src3),
       !if(Is2Addr,
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 03feff1..821948d 100644
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -81,6 +81,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 5>;
 def : ReadAdvance<ReadAfterVecYLd, 6>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 155d772..dc040e0 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -86,6 +86,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 4f47acd..503b905 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -76,6 +76,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 7d84d81..7104589 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -80,6 +80,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index 0d452f9..8bd8007 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -80,6 +80,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 6>;
 def : ReadAdvance<ReadAfterVecYLd, 7>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 886c3ae..f50cb62 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -17,6 +17,12 @@
 def ReadAfterVecXLd : SchedRead;
 def ReadAfterVecYLd : SchedRead;
 
+// Instructions that move data between general purpose registers and vector
+// registers may be subject to extra latency due to data bypass delays.
+// This SchedRead describes a bypass delay caused by data being moved from the
+// integer unit to the floating point unit.
+def ReadInt2Fpu : SchedRead;
+
 // Instructions with both a load and a store folded are modeled as a folded
 // load + WriteRMW.
 def WriteRMW : SchedWrite;
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index 779692b..bf50aee 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -46,6 +46,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 3>;
 def : ReadAdvance<ReadAfterVecYLd, 3>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when dispatched by the schedulers.
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index ca14ed4..90ca799 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -250,6 +250,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 5>;
 def : ReadAdvance<ReadAfterVecYLd, 5>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // A folded store needs a cycle on the PdStore for the store data.
 def : WriteRes<WriteRMW, [PdStore]>;
 
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 8d8de3e..3a2ed73 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -108,6 +108,11 @@
 def : ReadAdvance<ReadAfterVecXLd, 5>;
 def : ReadAdvance<ReadAfterVecYLd, 5>;
 
+/// "Additional 6 cycle transfer operation which moves a floating point
+/// operation input value from the integer unit to the floating point unit."
+/// Reference: AMDfam16h SOG (Appendix A "Instruction Latencies", Section A.2).
+def : ReadAdvance<ReadInt2Fpu, -6>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when dispatched by the schedulers.
@@ -540,7 +545,7 @@
 // Vector insert/extract operations.
 ////////////////////////////////////////////////////////////////////////////////
 
-defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 7, [1,1], 2>;
+defm : X86WriteRes<WriteVecInsert,      [JFPU01, JVALU], 1, [1,1], 2>;
 defm : X86WriteRes<WriteVecInsertLd,    [JFPU01, JVALU, JLAGU], 4, [1,1,1], 1>;
 defm : X86WriteRes<WriteVecExtract,     [JFPU0, JFPA, JALU0], 3, [1,1,1], 1>;
 defm : X86WriteRes<WriteVecExtractSt,   [JFPU1, JSTC, JSAGU], 3, [1,1,1], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 5dca0ff..fc150fc 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -52,6 +52,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 3>;
 def : ReadAdvance<ReadAfterVecYLd, 3>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // Many SchedWrites are defined in pairs with and without a folded load.
 // Instructions with folded loads are usually micro-fused, so they only appear
 // as two micro-ops when queued in the reservation station.
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 0407afc..1a75281 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -94,6 +94,8 @@
 def : ReadAdvance<ReadAfterVecXLd, 8>;
 def : ReadAdvance<ReadAfterVecYLd, 8>;
 
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
 // The Integer PRF for Zen is 168 entries, and it holds the architectural and
 // speculative version of the 64-bit integer registers.
 // Reference: "Software Optimization Guide for AMD Family 17h Processors"