blob: 95b5a8b2c65ff1e08cfe8d94d139680ce432af80 [file] [log] [blame]
1//===-- PPCScheduleP7.td - PPC P7 Scheduling Definitions ---*- tablegen -*-===//
2//
3// The LLVM Compiler Infrastructure
4//
5// This file is distributed under the University of Illinois Open Source
6// License. See LICENSE.TXT for details.
7//
8//===----------------------------------------------------------------------===//
9//
10// This file defines the itinerary class data for the POWER7 processor.
11//
12//===----------------------------------------------------------------------===//
13
14// Primary reference:
15// IBM POWER7 multicore server processor
16// B. Sinharoy, et al.
17// IBM J. Res. & Dev. (55) 3. May/June 2011.
18
19// Scheduling for the P7 involves tracking two types of resources:
20// 1. The dispatch bundle slots
21// 2. The functional unit resources
22
23// Dispatch units (dispatch bundle slots, modelled as functional units).
// In the itinerary table in this file, P7_DU1-P7_DU4 are the slots reserved by
// the non-branch itinerary classes, and P7_DU5/P7_DU6 appear only in the
// entries that execute on P7_BRU.
24def P7_DU1 : FuncUnit; // Dispatch slot 1
25def P7_DU2 : FuncUnit; // Dispatch slot 2
26def P7_DU3 : FuncUnit; // Dispatch slot 3
27def P7_DU4 : FuncUnit; // Dispatch slot 4
28def P7_DU5 : FuncUnit; // Dispatch slot 5 (branch entries only)
29def P7_DU6 : FuncUnit; // Dispatch slot 6 (branch entries only)
30
// Execution pipelines referenced by the second and later InstrStage records of
// each itinerary entry.
31def P7_LS1 : FuncUnit; // Load/Store pipeline 1
32def P7_LS2 : FuncUnit; // Load/Store pipeline 2
33
34def P7_FX1 : FuncUnit; // FX pipeline 1
35def P7_FX2 : FuncUnit; // FX pipeline 2
36
37// VS pipeline 1 (vector integer ops. always here)
38def P7_VS1 : FuncUnit; // VS pipeline 1
39// VS pipeline 2 (128-bit stores and perms. here)
40def P7_VS2 : FuncUnit; // VS pipeline 2
41
42def P7_CRU : FuncUnit; // CR unit (CR logicals and move-from-SPRs)
43def P7_BRU : FuncUnit; // BR unit
44
45// Notes:
46// Each LSU pipeline can also execute FX add and logical instructions.
47// Each LSU pipeline can complete a load or store in one cycle.
48//
49// Each store is broken into two parts, AGEN goes to the LSU while a
50// "data steering" op. goes to the FXU or VSU.
51//
52// FX loads have a two cycle load-to-use latency (so one "bubble" cycle).
53// VSU loads have a three cycle load-to-use latency (so two "bubble" cycles).
54//
55// Frequent FX ops. take only one cycle and results can be used again in the
56// next cycle (there is a self-bypass). Getting results from the other FX
57// pipeline takes an additional cycle.
58//
59// The VSU XS is similar to the POWER6, but with a pipeline length of 2 cycles
60// (instead of 3 cycles on the POWER6). VSU XS handles vector FX-style ops.
61// Dispatch of an instruction to VS1 that uses four single prec. inputs
62// (either to a float or XC op.) prevents dispatch in that cycle to VS2 of any
63// floating point instruction.
64//
65// The VSU PM is similar to the POWER6, but with a pipeline length of 3 cycles
66// (instead of 4 cycles on the POWER6). vsel is handled by the PM pipeline
67// (unlike on the POWER6).
68//
69// FMA from the VSUs can forward results in 6 cycles. VS1 XS and vector FP
70// share the same write-back, and have a 5-cycle latency difference, so the
71// IFU/IDU will not dispatch an XS instruction 5 cycles after a vector FP
72// op. has been dispatched to VS1.
73//
74// Three cycles after an L1 cache hit, a dependent VSU instruction can issue.
75//
76// Instruction dispatch groups have (at most) four non-branch instructions, and
77// two branches. Unlike on the POWER4/5, a branch does not automatically
78// end the dispatch group, but a second branch must be the last in the group.
79
// P7Itineraries: per-itinerary-class resource usage and operand timing for the
// POWER7. The first list names every functional unit the table references;
// the second list is left empty. Each InstrItinData entry then gives, for one
// IIC_* class:
//   * a list of InstrStage<cycles, [alternative units], timeinc> records.
//     Dispatch-slot stages (P7_DU*) are written with a trailing 0, which
//     presumably lets the following stage start in the same cycle (see the
//     InstrStage definition in LLVM's TargetItinerary.td to confirm);
//   * a list of operand cycles (presumably the result cycle first, followed
//     by the cycles at which source operands are read; confirm against
//     TargetItinerary.td).
// Entries that name P7_DU1 and P7_DU2 (and in some cases P7_DU3/P7_DU4) as
// separate single-unit stages reserve all of those dispatch slots together.
// NOTE(review): IIC_BrMCRX is listed twice in this table (once with the other
// IIC_Br* classes and once, tagged "mtcr", just before IIC_SprMFCR). Confirm
// which entry is intended to take effect.
80def P7Itineraries : ProcessorItineraries<
81 [P7_DU1, P7_DU2, P7_DU3, P7_DU4, P7_DU5, P7_DU6,
82 P7_LS1, P7_LS2, P7_FX1, P7_FX2, P7_VS1, P7_VS2, P7_CRU, P7_BRU], [], [
83 InstrItinData<IIC_IntSimple , [InstrStage<1, [P7_DU1, P7_DU2,
84 P7_DU3, P7_DU4], 0>,
85 InstrStage<1, [P7_FX1, P7_FX2,
86 P7_LS1, P7_LS2]>],
87 [1, 1, 1]>,
88 InstrItinData<IIC_IntGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
89 P7_DU3, P7_DU4], 0>,
90 InstrStage<1, [P7_FX1, P7_FX2]>],
91 [1, 1, 1]>,
92 InstrItinData<IIC_IntCompare , [InstrStage<1, [P7_DU1, P7_DU2,
93 P7_DU3, P7_DU4], 0>,
94 InstrStage<1, [P7_FX1, P7_FX2]>],
95 [1, 1, 1]>,
96 InstrItinData<IIC_IntDivW , [InstrStage<1, [P7_DU1], 0>,
97 InstrStage<1, [P7_DU2], 0>,
98 InstrStage<36, [P7_FX1, P7_FX2]>],
99 [36, 1, 1]>,
100 InstrItinData<IIC_IntDivD , [InstrStage<1, [P7_DU1], 0>,
101 InstrStage<1, [P7_DU2], 0>,
102 InstrStage<68, [P7_FX1, P7_FX2]>],
103 [68, 1, 1]>,
104 InstrItinData<IIC_IntMulHW , [InstrStage<1, [P7_DU1, P7_DU2,
105 P7_DU3, P7_DU4], 0>,
106 InstrStage<1, [P7_FX1, P7_FX2]>],
107 [4, 1, 1]>,
108 InstrItinData<IIC_IntMulHWU , [InstrStage<1, [P7_DU1, P7_DU2,
109 P7_DU3, P7_DU4], 0>,
110 InstrStage<1, [P7_FX1, P7_FX2]>],
111 [4, 1, 1]>,
112 InstrItinData<IIC_IntMulLI , [InstrStage<1, [P7_DU1, P7_DU2,
113 P7_DU3, P7_DU4], 0>,
114 InstrStage<1, [P7_FX1, P7_FX2]>],
115 [4, 1, 1]>,
116 InstrItinData<IIC_IntRotate , [InstrStage<1, [P7_DU1, P7_DU2,
117 P7_DU3, P7_DU4], 0>,
118 InstrStage<1, [P7_FX1, P7_FX2]>],
119 [1, 1, 1]>,
120 InstrItinData<IIC_IntRotateD , [InstrStage<1, [P7_DU1, P7_DU2,
121 P7_DU3, P7_DU4], 0>,
122 InstrStage<1, [P7_FX1, P7_FX2]>],
123 [1, 1, 1]>,
124 InstrItinData<IIC_IntShift , [InstrStage<1, [P7_DU1, P7_DU2,
125 P7_DU3, P7_DU4], 0>,
126 InstrStage<1, [P7_FX1, P7_FX2]>],
127 [1, 1, 1]>,
128 InstrItinData<IIC_IntTrapW , [InstrStage<1, [P7_DU1, P7_DU2,
129 P7_DU3, P7_DU4], 0>,
130 InstrStage<1, [P7_FX1, P7_FX2]>],
131 [1, 1]>,
132 InstrItinData<IIC_IntTrapD , [InstrStage<1, [P7_DU1, P7_DU2,
133 P7_DU3, P7_DU4], 0>,
134 InstrStage<1, [P7_FX1, P7_FX2]>],
135 [1, 1]>,
136 InstrItinData<IIC_BrB , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
137 InstrStage<1, [P7_BRU]>],
138 [3, 1, 1]>,
139 InstrItinData<IIC_BrCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
140 InstrStage<1, [P7_BRU]>],
141 [3, 1, 1]>,
142 InstrItinData<IIC_BrMCR , [InstrStage<1, [P7_DU5, P7_DU6], 0>,
143 InstrStage<1, [P7_BRU]>],
144 [3, 1, 1]>,
145 InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU5, P7_DU6], 0>, // NOTE(review): first of two IIC_BrMCRX entries
146 InstrStage<1, [P7_BRU]>],
147 [3, 1, 1]>,
148 InstrItinData<IIC_LdStLoad , [InstrStage<1, [P7_DU1, P7_DU2,
149 P7_DU3, P7_DU4], 0>,
150 InstrStage<1, [P7_LS1, P7_LS2]>],
151 [2, 1, 1]>,
152 InstrItinData<IIC_LdStLoadUpd , [InstrStage<1, [P7_DU1], 0>,
153 InstrStage<1, [P7_DU2], 0>,
154 InstrStage<1, [P7_LS1, P7_LS2], 0>,
155 InstrStage<1, [P7_FX1, P7_FX2]>],
156 [2, 2, 1, 1]>,
157 InstrItinData<IIC_LdStLoadUpdX, [InstrStage<1, [P7_DU1], 0>,
158 InstrStage<1, [P7_DU2], 0>,
159 InstrStage<1, [P7_DU3], 0>,
160 InstrStage<1, [P7_DU4], 0>,
161 InstrStage<1, [P7_FX1, P7_FX2]>,
162 InstrStage<1, [P7_LS1, P7_LS2], 0>,
163 InstrStage<1, [P7_FX1, P7_FX2]>],
164 [3, 3, 1, 1]>,
165 InstrItinData<IIC_LdStLD , [InstrStage<1, [P7_DU1, P7_DU2,
166 P7_DU3, P7_DU4], 0>,
167 InstrStage<1, [P7_LS1, P7_LS2]>],
168 [2, 1, 1]>,
169 InstrItinData<IIC_LdStLDU , [InstrStage<1, [P7_DU1], 0>,
170 InstrStage<1, [P7_DU2], 0>,
171 InstrStage<1, [P7_LS1, P7_LS2], 0>,
172 InstrStage<1, [P7_FX1, P7_FX2]>],
173 [2, 2, 1, 1]>,
174 InstrItinData<IIC_LdStLDUX , [InstrStage<1, [P7_DU1], 0>,
175 InstrStage<1, [P7_DU2], 0>,
176 InstrStage<1, [P7_DU3], 0>,
177 InstrStage<1, [P7_DU4], 0>,
178 InstrStage<1, [P7_FX1, P7_FX2]>,
179 InstrStage<1, [P7_LS1, P7_LS2], 0>,
180 InstrStage<1, [P7_FX1, P7_FX2]>],
181 [3, 3, 1, 1]>,
182 InstrItinData<IIC_LdStLFD , [InstrStage<1, [P7_DU1, P7_DU2,
183 P7_DU3, P7_DU4], 0>,
184 InstrStage<1, [P7_LS1, P7_LS2]>],
185 [3, 1, 1]>,
186 InstrItinData<IIC_LdStLVecX , [InstrStage<1, [P7_DU1, P7_DU2,
187 P7_DU3, P7_DU4], 0>,
188 InstrStage<1, [P7_LS1, P7_LS2]>],
189 [3, 1, 1]>,
190 InstrItinData<IIC_LdStLFDU , [InstrStage<1, [P7_DU1], 0>,
191 InstrStage<1, [P7_DU2], 0>,
192 InstrStage<1, [P7_LS1, P7_LS2], 0>,
193 InstrStage<1, [P7_FX1, P7_FX2]>],
194 [3, 3, 1, 1]>,
195 InstrItinData<IIC_LdStLFDUX , [InstrStage<1, [P7_DU1], 0>, // NOTE(review): same stages as IIC_LdStLFDU, unlike the other *UX classes (four DU slots) -- confirm
196 InstrStage<1, [P7_DU2], 0>,
197 InstrStage<1, [P7_LS1, P7_LS2], 0>,
198 InstrStage<1, [P7_FX1, P7_FX2]>],
199 [3, 3, 1, 1]>,
200 InstrItinData<IIC_LdStLHA , [InstrStage<1, [P7_DU1], 0>,
201 InstrStage<1, [P7_DU2], 0>,
202 InstrStage<1, [P7_LS1, P7_LS2]>,
203 InstrStage<1, [P7_FX1, P7_FX2]>],
204 [3, 1, 1]>,
205 InstrItinData<IIC_LdStLHAU , [InstrStage<1, [P7_DU1], 0>,
206 InstrStage<1, [P7_DU2], 0>,
207 InstrStage<1, [P7_LS1, P7_LS2], 0>,
208 InstrStage<1, [P7_FX1, P7_FX2]>,
209 InstrStage<1, [P7_FX1, P7_FX2]>],
210 [4, 4, 1, 1]>,
211 InstrItinData<IIC_LdStLHAUX , [InstrStage<1, [P7_DU1], 0>,
212 InstrStage<1, [P7_DU2], 0>,
213 InstrStage<1, [P7_DU3], 0>,
214 InstrStage<1, [P7_DU4], 0>,
215 InstrStage<1, [P7_FX1, P7_FX2]>,
216 InstrStage<1, [P7_LS1, P7_LS2], 0>,
217 InstrStage<1, [P7_FX1, P7_FX2]>,
218 InstrStage<1, [P7_FX1, P7_FX2]>],
219 [4, 4, 1, 1]>,
220 InstrItinData<IIC_LdStLWA , [InstrStage<1, [P7_DU1], 0>,
221 InstrStage<1, [P7_DU2], 0>,
222 InstrStage<1, [P7_LS1, P7_LS2]>,
223 InstrStage<1, [P7_FX1, P7_FX2]>],
224 [3, 1, 1]>,
225 InstrItinData<IIC_LdStLWARX, [InstrStage<1, [P7_DU1], 0>,
226 InstrStage<1, [P7_DU2], 0>,
227 InstrStage<1, [P7_DU3], 0>,
228 InstrStage<1, [P7_DU4], 0>,
229 InstrStage<1, [P7_LS1, P7_LS2]>],
230 [3, 1, 1]>,
231 InstrItinData<IIC_LdStLDARX, [InstrStage<1, [P7_DU1], 0>,
232 InstrStage<1, [P7_DU2], 0>,
233 InstrStage<1, [P7_DU3], 0>,
234 InstrStage<1, [P7_DU4], 0>,
235 InstrStage<1, [P7_LS1, P7_LS2]>],
236 [3, 1, 1]>,
237 InstrItinData<IIC_LdStLMW , [InstrStage<1, [P7_DU1, P7_DU2,
238 P7_DU3, P7_DU4], 0>,
239 InstrStage<1, [P7_LS1, P7_LS2]>],
240 [2, 1, 1]>,
241 InstrItinData<IIC_LdStStore , [InstrStage<1, [P7_DU1, P7_DU2,
242 P7_DU3, P7_DU4], 0>,
243 InstrStage<1, [P7_LS1, P7_LS2], 0>,
244 InstrStage<1, [P7_FX1, P7_FX2]>],
245 [1, 1, 1]>,
246 InstrItinData<IIC_LdStSTD , [InstrStage<1, [P7_DU1, P7_DU2,
247 P7_DU3, P7_DU4], 0>,
248 InstrStage<1, [P7_LS1, P7_LS2], 0>,
249 InstrStage<1, [P7_FX1, P7_FX2]>],
250 [1, 1, 1]>,
251 InstrItinData<IIC_LdStSTDU , [InstrStage<1, [P7_DU1], 0>,
252 InstrStage<1, [P7_DU2], 0>,
253 InstrStage<1, [P7_LS1, P7_LS2], 0>,
254 InstrStage<1, [P7_FX1, P7_FX2]>,
255 InstrStage<1, [P7_FX1, P7_FX2]>],
256 [2, 1, 1, 1]>,
257 InstrItinData<IIC_LdStSTDUX , [InstrStage<1, [P7_DU1], 0>,
258 InstrStage<1, [P7_DU2], 0>,
259 InstrStage<1, [P7_DU3], 0>,
260 InstrStage<1, [P7_DU4], 0>,
261 InstrStage<1, [P7_LS1, P7_LS2], 0>,
262 InstrStage<1, [P7_FX1, P7_FX2]>,
263 InstrStage<1, [P7_FX1, P7_FX2]>],
264 [2, 1, 1, 1]>,
265 InstrItinData<IIC_LdStSTFD , [InstrStage<1, [P7_DU1, P7_DU2,
266 P7_DU3, P7_DU4], 0>,
267 InstrStage<1, [P7_LS1, P7_LS2], 0>,
268 InstrStage<1, [P7_VS1, P7_VS2]>],
269 [1, 1, 1]>,
270 InstrItinData<IIC_LdStSTFDU , [InstrStage<1, [P7_DU1], 0>,
271 InstrStage<1, [P7_DU2], 0>,
272 InstrStage<1, [P7_LS1, P7_LS2], 0>,
273 InstrStage<1, [P7_FX1, P7_FX2], 0>,
274 InstrStage<1, [P7_VS1, P7_VS2]>],
275 [2, 1, 1, 1]>,
276 InstrItinData<IIC_LdStSTVEBX , [InstrStage<1, [P7_DU1, P7_DU2,
277 P7_DU3, P7_DU4], 0>,
278 InstrStage<1, [P7_LS1, P7_LS2], 0>,
279 InstrStage<1, [P7_VS2]>],
280 [1, 1, 1]>,
281 InstrItinData<IIC_LdStSTDCX , [InstrStage<1, [P7_DU1], 0>,
282 InstrStage<1, [P7_DU2], 0>,
283 InstrStage<1, [P7_DU3], 0>,
284 InstrStage<1, [P7_DU4], 0>,
285 InstrStage<1, [P7_LS1, P7_LS2]>],
286 [1, 1, 1]>,
287 InstrItinData<IIC_LdStSTWCX , [InstrStage<1, [P7_DU1], 0>,
288 InstrStage<1, [P7_DU2], 0>,
289 InstrStage<1, [P7_DU3], 0>,
290 InstrStage<1, [P7_DU4], 0>,
291 InstrStage<1, [P7_LS1, P7_LS2]>],
292 [1, 1, 1]>,
293 InstrItinData<IIC_BrMCRX , [InstrStage<1, [P7_DU4], 0>, // NOTE(review): second IIC_BrMCRX entry; duplicates the class listed with the IIC_Br* group
294 InstrStage<1, [P7_CRU]>,
295 InstrStage<1, [P7_FX1, P7_FX2]>],
296 [3, 1]>, // mtcr
297 InstrItinData<IIC_SprMFCR , [InstrStage<1, [P7_DU1], 0>,
298 InstrStage<1, [P7_CRU]>],
299 [6, 1]>,
300 InstrItinData<IIC_SprMFCRF , [InstrStage<1, [P7_DU1], 0>,
301 InstrStage<1, [P7_CRU]>],
302 [3, 1]>,
303 InstrItinData<IIC_FPGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
304 P7_DU3, P7_DU4], 0>,
305 InstrStage<1, [P7_VS1, P7_VS2]>],
306 [5, 1, 1]>,
307 InstrItinData<IIC_FPCompare , [InstrStage<1, [P7_DU1, P7_DU2,
308 P7_DU3, P7_DU4], 0>,
309 InstrStage<1, [P7_VS1, P7_VS2]>],
310 [8, 1, 1]>,
311 InstrItinData<IIC_FPDivD , [InstrStage<1, [P7_DU1, P7_DU2,
312 P7_DU3, P7_DU4], 0>,
313 InstrStage<1, [P7_VS1, P7_VS2]>],
314 [33, 1, 1]>,
315 InstrItinData<IIC_FPDivS , [InstrStage<1, [P7_DU1, P7_DU2,
316 P7_DU3, P7_DU4], 0>,
317 InstrStage<1, [P7_VS1, P7_VS2]>],
318 [27, 1, 1]>,
319 InstrItinData<IIC_FPSqrtD , [InstrStage<1, [P7_DU1, P7_DU2,
320 P7_DU3, P7_DU4], 0>,
321 InstrStage<1, [P7_VS1, P7_VS2]>],
322 [44, 1, 1]>,
323 InstrItinData<IIC_FPSqrtS , [InstrStage<1, [P7_DU1, P7_DU2,
324 P7_DU3, P7_DU4], 0>,
325 InstrStage<1, [P7_VS1, P7_VS2]>],
326 [32, 1, 1]>,
327 InstrItinData<IIC_FPFused , [InstrStage<1, [P7_DU1, P7_DU2,
328 P7_DU3, P7_DU4], 0>,
329 InstrStage<1, [P7_VS1, P7_VS2]>],
330 [5, 1, 1, 1]>,
331 InstrItinData<IIC_FPRes , [InstrStage<1, [P7_DU1, P7_DU2,
332 P7_DU3, P7_DU4], 0>,
333 InstrStage<1, [P7_VS1, P7_VS2]>],
334 [5, 1, 1]>,
335 InstrItinData<IIC_VecGeneral , [InstrStage<1, [P7_DU1, P7_DU2,
336 P7_DU3, P7_DU4], 0>,
337 InstrStage<1, [P7_VS1]>],
338 [2, 1, 1]>,
339 InstrItinData<IIC_VecVSL , [InstrStage<1, [P7_DU1, P7_DU2,
340 P7_DU3, P7_DU4], 0>,
341 InstrStage<1, [P7_VS1]>],
342 [2, 1, 1]>,
343 InstrItinData<IIC_VecVSR , [InstrStage<1, [P7_DU1, P7_DU2,
344 P7_DU3, P7_DU4], 0>,
345 InstrStage<1, [P7_VS1]>],
346 [2, 1, 1]>,
347 InstrItinData<IIC_VecFP , [InstrStage<1, [P7_DU1, P7_DU2,
348 P7_DU3, P7_DU4], 0>,
349 InstrStage<1, [P7_VS1, P7_VS2]>],
350 [6, 1, 1]>,
351 InstrItinData<IIC_VecFPCompare, [InstrStage<1, [P7_DU1, P7_DU2,
352 P7_DU3, P7_DU4], 0>,
353 InstrStage<1, [P7_VS1, P7_VS2]>],
354 [6, 1, 1]>,
355 InstrItinData<IIC_VecFPRound , [InstrStage<1, [P7_DU1, P7_DU2,
356 P7_DU3, P7_DU4], 0>,
357 InstrStage<1, [P7_VS1, P7_VS2]>],
358 [6, 1, 1]>,
359 InstrItinData<IIC_VecComplex , [InstrStage<1, [P7_DU1, P7_DU2,
360 P7_DU3, P7_DU4], 0>,
361 InstrStage<1, [P7_VS1]>],
362 [7, 1, 1]>,
363 InstrItinData<IIC_VecPerm , [InstrStage<1, [P7_DU1, P7_DU2,
364 P7_DU3, P7_DU4], 0>,
365 InstrStage<1, [P7_VS2]>],
366 [2, 1, 1]>
367]>;
368
369// ===---------------------------------------------------------------------===//
370// P7 machine model for scheduling and other instruction cost heuristics.
371
// P7Model: the SchedMachineModel record for the POWER7. It sets the
// machine-wide scheduling parameters below and points at P7Itineraries for
// per-class stage and operand-cycle data.
372def P7Model : SchedMachineModel {
373 let IssueWidth = 6; // 4 (non-branch) instructions are dispatched per cycle.
374 // Note that the dispatch bundle size is 6 (including
375 // branches), but the total internal issue bandwidth per
376 // cycle (from all queues) is 8.
377
378 let MinLatency = 0; // Out-of-order dispatch.
379 let LoadLatency = 3; // Optimistic load latency assuming bypass.
380 // This is overridden by OperandCycles if the
381 // Itineraries are queried instead.
382 let MispredictPenalty = 16; // NOTE(review): presumably in cycles; the source of this value is not stated in this file.
383
384 let Itineraries = P7Itineraries; // Use the P7 itinerary table as this model's itinerary data.
385}
386