//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz) but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Teach the .td file to pattern match PPC::BR_COND to appropriate bc variant, so
we don't have to always run the branch selector for small functions.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
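
A rough sketch of the transformation in C (names are hypothetical; the pass
would pick the struct layout and rewrite all uses):

/* before: two globals, two PIC address computations */
static int a;
static int b;

/* after: one base address, CSE'd across both accesses */
static struct { int a; int b; } _pic_globals;
int sum(void) { return _pic_globals.a + _pic_globals.b; }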

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.
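
For reference, each Newton-Raphson step roughly doubles the number of correct
bits in the estimate.  A sketch in C for a reciprocal estimate (e.g. what fres
produces; how many steps are needed depends on the estimate's starting
accuracy):

double recip(double d, double est) {
  double x = est * (2.0 - d * est);   /* one refinement step */
  x = x * (2.0 - d * x);              /* second step for more bits */
  return x;                           /* a/d becomes a * recip(d, est) */
}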

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int), you get r3, f1, r6, since the 64-bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.
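
To make the shuffle concrete (an illustration based on the description above,
not checked against the ABI document):

  void foo(int X, double Y, int Z);   // X->r3, Y->f1 (eats r4/r5), Z->r6
  void foo(int X, int Z, double Y);   // X->r3, Z->r4, Y->f1 (eats r5/r6)

With only three arguments both orders fit in registers; the shuffle pays off
when enough doubles pile up early that later integer arguments would otherwise
land on the stack.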

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.
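
A tiny example of the problem (hypothetical code, illustrating the ABI
behavior described above):

  struct pt { double x; };
  double getx(struct pt P);   // P.x arrives in r3/r4 and has to round-trip
                              // through memory to reach an FPR

Percolating the double out of the struct would let it travel in f1 directly.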

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
  %tmp = setlt ulong %x, 4294967296
  ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.
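
In C terms, the desired lowering is just a test of the high word (a sketch):

  int test(unsigned long long x) {
    return (unsigned)(x >> 32) == 0;   /* x < 2^32 iff x.high == 0 */
  }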

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

which we currently compile to:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

becomes:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
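
For example, the div and rem can be maintained incrementally across the outer
loop (a sketch, assuming X > 0):

  /* invariant at the top of each iteration: t_div == t/X, t_mod == t%X */
  int t_div = 0, t_mod = 0;
  for (t = 0; t < N; ++t) {
    for (i = 0; i < 4; ++i)
      W[t_div][i][t_mod] = TK[i][t];
    if (++t_mod == X) { t_mod = 0; ++t_div; }
  }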

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the signed i32 to FP conversion code using 64-bit registers
transformation, good for PI.  See PPCISelLowering.cpp, this comment:

     // FIXME: disable this lowered code.  This generates 64-bit register values,
     // and we don't model the fact that the top part is clobbered by calls.  We
     // need to flag these together so that the value isn't live across a call.
     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code.  It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { ubyte, [3 x ubyte] }

void %foo(%struct.B* %b) {
entry:
        %tmp = cast %struct.B* %b to uint*              ; <uint*> [#uses=1]
        %tmp = load uint* %tmp                          ; <uint> [#uses=1]
        %tmp3 = cast %struct.B* %b to uint*             ; <uint*> [#uses=1]
        %tmp4 = load uint* %tmp3                        ; <uint> [#uses=1]
        %tmp8 = cast %struct.B* %b to uint*             ; <uint*> [#uses=2]
        %tmp9 = load uint* %tmp8                        ; <uint> [#uses=1]
        %tmp4.mask17 = shl uint %tmp4, ubyte 1          ; <uint> [#uses=1]
        %tmp1415 = and uint %tmp4.mask17, 2147483648    ; <uint> [#uses=1]
        %tmp.masked = and uint %tmp, 2147483648         ; <uint> [#uses=1]
        %tmp11 = or uint %tmp1415, %tmp.masked          ; <uint> [#uses=1]
        %tmp12 = and uint %tmp9, 2147483647             ; <uint> [#uses=1]
        %tmp13 = or uint %tmp12, %tmp11                 ; <uint> [#uses=1]
        store uint %tmp13, uint* %tmp8
        ret void
}

We emit:

_foo:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr