blob: 514f8407c972d0f773547fe08a29733461f8de6f [file] [log] [blame]
Chris Lattnerb86bd2c2006-03-27 07:04:16 +00001//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
2
Nate Begemanb64af912004-08-10 20:42:36 +00003TODO:
Nate Begemana6ed0aa2008-02-11 04:16:09 +00004* lmw/stmw pass a la arm load store optimizer for prolog/epilog
Nate Begeman50fb3c42005-12-24 01:00:15 +00005
Nate Begemana63fee82006-02-03 05:17:06 +00006===-------------------------------------------------------------------------===
Nate Begeman50fb3c42005-12-24 01:00:15 +00007
Chris Lattnerddac7062010-01-07 17:53:10 +00008On PPC64, this:
9
10long f2 (long x) { return 0xfffffff000000000UL; }
11long f3 (long x) { return 0x1ffffffffUL; }
12
13could compile into:
14
15_f2:
16 li r3,-1
17 rldicr r3,r3,0,27
18 blr
19_f3:
20 li r3,-1
21 rldicl r3,r3,0,31
22 blr
23
24we produce:
25
26_f2:
27 lis r2, 4095
28 ori r2, r2, 65535
29 sldi r3, r2, 36
30 blr
31_f3:
32 li r2, 1
33 sldi r2, r2, 32
34 oris r2, r2, 65535
35 ori r3, r2, 65535
36 blr
37
Chris Lattner702917d2010-09-19 00:34:58 +000038===-------------------------------------------------------------------------===
39
40This code:
41
42unsigned add32carry(unsigned sum, unsigned x) {
43 unsigned z = sum + x;
44 if (sum + x < x)
45 z++;
46 return z;
47}
48
49Should compile to something like:
50
51 addc r3,r3,r4
52 addze r3,r3
53
54instead we get:
55
56 add r3, r4, r3
57 cmplw cr7, r3, r4
58 mfcr r4 ; 1
59 rlwinm r4, r4, 29, 31, 31
60 add r3, r3, r4
61
62Ick.
Chris Lattnerddac7062010-01-07 17:53:10 +000063
64===-------------------------------------------------------------------------===
65
Nate Begemana63fee82006-02-03 05:17:06 +000066Support 'update' load/store instructions. These are cracked on the G5, but are
67still a codesize win.
68
Chris Lattner26ddb502006-11-10 01:33:53 +000069With preinc enabled, this:
70
71long *%test4(long *%X, long *%dest) {
72 %Y = getelementptr long* %X, int 4
73 %A = load long* %Y
74 store long %A, long* %dest
75 ret long* %Y
76}
77
78compiles to:
79
80_test4:
81 mr r2, r3
82 lwzu r5, 32(r2)
83 lwz r3, 36(r3)
84 stw r5, 0(r4)
85 stw r3, 4(r4)
86 mr r3, r2
87 blr
88
89with -sched=list-burr, I get:
90
91_test4:
92 lwz r2, 36(r3)
93 lwzu r5, 32(r3)
94 stw r2, 4(r4)
95 stw r5, 0(r4)
96 blr
97
Nate Begemana63fee82006-02-03 05:17:06 +000098===-------------------------------------------------------------------------===
99
Chris Lattner6e112952006-11-07 18:30:21 +0000100We compile the hottest inner loop of viterbi to:
101
102 li r6, 0
103 b LBB1_84 ;bb432.i
104LBB1_83: ;bb420.i
105 lbzx r8, r5, r7
106 addi r6, r7, 1
107 stbx r8, r4, r7
108LBB1_84: ;bb432.i
109 mr r7, r6
110 cmplwi cr0, r7, 143
111 bne cr0, LBB1_83 ;bb420.i
112
113The CBE manages to produce:
114
115 li r0, 143
116 mtctr r0
117loop:
118 lbzx r2, r2, r11
119 stbx r0, r2, r9
120 addi r2, r2, 1
121 bdz later
122 b loop
123
124This could be much better (bdnz instead of bdz) but it still beats us. If we
125produced this with bdnz, the loop would be a single dispatch group.
126
127===-------------------------------------------------------------------------===
128
Chris Lattnera3c44542005-08-24 18:15:24 +0000129Lump the constant pool for each function into ONE pic object, and reference
130pieces of it as offsets from the start. For functions like this (contrived
131to have lots of constants obviously):
132
133double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
134
135We generate:
136
137_X:
138 lis r2, ha16(.CPI_X_0)
139 lfd f0, lo16(.CPI_X_0)(r2)
140 lis r2, ha16(.CPI_X_1)
141 lfd f2, lo16(.CPI_X_1)(r2)
142 fmadd f0, f1, f0, f2
143 lis r2, ha16(.CPI_X_2)
144 lfd f1, lo16(.CPI_X_2)(r2)
145 lis r2, ha16(.CPI_X_3)
146 lfd f2, lo16(.CPI_X_3)(r2)
147 fmadd f1, f0, f1, f2
148 blr
149
150It would be better to materialize .CPI_X into a register, then use immediates
151off of the register to avoid the lis's. This is even more important in PIC
152mode.
153
Chris Lattner39b248b2006-02-02 23:50:22 +0000154Note that this (and the static variable version) is discussed here for GCC:
155http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
156
Chris Lattneraabd0352007-08-23 15:16:03 +0000157Here's another example (the sgn function):
158double testf(double a) {
159 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
160}
161
162it produces a BB like this:
163LBB1_1: ; cond_true
164 lis r2, ha16(LCPI1_0)
165 lfs f0, lo16(LCPI1_0)(r2)
166 lis r2, ha16(LCPI1_1)
167 lis r3, ha16(LCPI1_2)
168 lfs f2, lo16(LCPI1_2)(r3)
169 lfs f3, lo16(LCPI1_1)(r2)
170 fsub f0, f0, f1
171 fsel f1, f0, f2, f3
172 blr
173
Chris Lattnera3c44542005-08-24 18:15:24 +0000174===-------------------------------------------------------------------------===
Nate Begeman92cce902005-09-06 15:30:48 +0000175
Chris Lattner33c1dab2006-02-03 06:22:11 +0000176PIC Code Gen IPO optimization:
177
178Squish small scalar globals together into a single global struct, allowing the
179address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
180of the GOT on targets with one).
181
182Note that this is discussed here for GCC:
183http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
184
185===-------------------------------------------------------------------------===
186
Chris Lattner62c08dd2005-12-08 07:13:28 +0000187Compile offsets from allocas:
188
189int *%test() {
190 %X = alloca { int, int }
191 %Y = getelementptr {int,int}* %X, int 0, uint 1
192 ret int* %Y
193}
194
195into a single add, not two:
196
197_test:
198 addi r2, r1, -8
199 addi r3, r2, 4
200 blr
201
202--> important for C++.
203
Chris Lattner39706e62005-12-22 17:19:28 +0000204===-------------------------------------------------------------------------===
205
Chris Lattner39706e62005-12-22 17:19:28 +0000206No loads or stores of the constants should be needed:
207
208struct foo { double X, Y; };
209void xxx(struct foo F);
210void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
211
Chris Lattner1db4b4f2006-01-16 17:53:00 +0000212===-------------------------------------------------------------------------===
213
Dale Johannesen7074fea2009-07-01 23:36:02 +0000214Darwin Stub removal:
215
216We still generate calls to foo$stub, and stubs, on Darwin. This is not
Chris Lattnerc4b0b402009-07-02 01:24:34 +0000217necessary when building with the Leopard (10.5) or later linker, as stubs are
218generated by ld when necessary. Parameterizing this based on the deployment
219target (-mmacosx-version-min) is probably enough. x86-32 does this right, see
220its logic.
Dale Johannesen7074fea2009-07-01 23:36:02 +0000221
222===-------------------------------------------------------------------------===
223
Chris Lattner98fbc2f2006-01-16 17:58:54 +0000224Darwin Stub LICM optimization:
225
226Loops like this:
227
228 for (...) bar();
229
230Have to go through an indirect stub if bar is external or linkonce. It would
231be better to compile it as:
232
233 fp = &bar;
234 for (...) fp();
235
236which only computes the address of bar once (instead of each time through the
237stub). This is Darwin specific and would have to be done in the code generator.
238Probably not a win on x86.
239
240===-------------------------------------------------------------------------===
241
Chris Lattner98fbc2f2006-01-16 17:58:54 +0000242Simple IPO for argument passing, change:
243 void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
244
245the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
246of arguments get assigned to r3 through r10. That is, if you have a function
247foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
248argument bytes for r4 and r5. The trick then would be to shuffle the argument
249order for functions we can internalize so that the maximum number of
250integers/pointers get passed in regs before you see any of the fp arguments.
251
252Instead of implementing this, it would actually probably be easier to just
253implement a PPC fastcc, where we could do whatever we wanted to the CC,
254including having this work sanely.
255
256===-------------------------------------------------------------------------===
257
258Fix Darwin FP-In-Integer Registers ABI
259
260Darwin passes doubles in structures in integer registers, which is very very
Wesley Peckbf17cfa2010-11-23 03:31:01 +0000261bad. Add something like a BITCAST to LLVM, then do an i-p transformation that
262percolates these things out of functions.
Chris Lattner98fbc2f2006-01-16 17:58:54 +0000263
264Check out how horrible this is:
265http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
266
267This is an extension of "interprocedural CC unmunging" that can't be done with
268just fastcc.
269
270===-------------------------------------------------------------------------===
271
Chris Lattner56b69642006-01-31 02:55:28 +0000272Compile this:
273
Chris Lattner83e64ba2006-01-31 07:16:34 +0000274int foo(int a) {
275 int b = (a < 8);
276 if (b) {
277 return b * 3; // ignore the fact that this is always 3.
278 } else {
279 return 2;
280 }
281}
282
283into something not this:
284
285_foo:
2861) cmpwi cr7, r3, 8
287 mfcr r2, 1
288 rlwinm r2, r2, 29, 31, 31
2891) cmpwi cr0, r3, 7
290 bgt cr0, LBB1_2 ; UnifiedReturnBlock
291LBB1_1: ; then
292 rlwinm r2, r2, 0, 31, 31
293 mulli r3, r2, 3
294 blr
295LBB1_2: ; UnifiedReturnBlock
296 li r3, 2
297 blr
298
299In particular, the two compares (marked 1) could be shared by reversing one.
300This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
301same operands (but backwards) exists. In this case, this wouldn't save us
302anything though, because the compares still wouldn't be shared.
Chris Lattner0ddc1802006-02-01 00:28:12 +0000303
Chris Lattner5a7efc92006-02-01 17:54:23 +0000304===-------------------------------------------------------------------------===
305
Chris Lattner275b8842006-02-02 07:37:11 +0000306We should custom expand setcc instead of pretending that we have it. That
307would allow us to expose the access of the crbit after the mfcr, allowing
308that access to be trivially folded into other ops. A simple example:
309
310int foo(int a, int b) { return (a < b) << 4; }
311
312compiles into:
313
314_foo:
315 cmpw cr7, r3, r4
316 mfcr r2, 1
317 rlwinm r2, r2, 29, 31, 31
318 slwi r3, r2, 4
319 blr
320
Chris Lattnerd463f7f2006-02-03 01:49:49 +0000321===-------------------------------------------------------------------------===
322
Nate Begemana63fee82006-02-03 05:17:06 +0000323Fold add and sub with constant into non-extern, non-weak addresses so this:
324
325static int a;
326void bar(int b) { a = b; }
327void foo(unsigned char *c) {
328 *c = a;
329}
330
331So that
332
333_foo:
334 lis r2, ha16(_a)
335 la r2, lo16(_a)(r2)
336 lbz r2, 3(r2)
337 stb r2, 0(r3)
338 blr
339
340Becomes
341
342_foo:
343 lis r2, ha16(_a+3)
344 lbz r2, lo16(_a+3)(r2)
345 stb r2, 0(r3)
346 blr
Chris Lattner21384532006-02-05 05:27:35 +0000347
348===-------------------------------------------------------------------------===
349
350We generate really bad code for this:
351
352int f(signed char *a, _Bool b, _Bool c) {
353 signed char t = 0;
354 if (b) t = *a;
355 if (c) *a = t;
356}
357
Chris Lattner00d18f02006-03-01 06:36:20 +0000358===-------------------------------------------------------------------------===
359
360This:
361int test(unsigned *P) { return *P >> 24; }
362
363Should compile to:
364
365_test:
366 lbz r3,0(r3)
367 blr
368
369not:
370
371_test:
372 lwz r2, 0(r3)
373 srwi r3, r2, 24
374 blr
375
Chris Lattner5a63c472006-03-07 04:42:59 +0000376===-------------------------------------------------------------------------===
377
378On the G5, logical CR operations are more expensive in their three
379address form: ops that read/write the same register are half as expensive as
380those that read from two registers that are different from their destination.
381
382We should model this with two separate instructions. The isel should generate
383the "two address" form of the instructions. When the register allocator
384detects that it needs to insert a copy due to the two-addressness of the CR
385logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
386we can convert to the "three address" instruction, to save code space.
387
388This only matters when we start generating cr logical ops.
389
Chris Lattner49f398b2006-03-08 00:25:47 +0000390===-------------------------------------------------------------------------===
391
392We should compile these two functions to the same thing:
393
394#include <stdlib.h>
395void f(int a, int b, int *P) {
396 *P = (a-b)>=0?(a-b):(b-a);
397}
398void g(int a, int b, int *P) {
399 *P = abs(a-b);
400}
401
402Further, they should compile to something better than:
403
404_g:
405 subf r2, r4, r3
406 subfic r3, r2, 0
407 cmpwi cr0, r2, -1
408 bgt cr0, LBB2_2 ; entry
409LBB2_1: ; entry
410 mr r2, r3
411LBB2_2: ; entry
412 stw r2, 0(r5)
413 blr
414
415GCC produces:
416
417_g:
418 subf r4,r4,r3
419 srawi r2,r4,31
420 xor r0,r2,r4
421 subf r0,r2,r0
422 stw r0,0(r5)
423 blr
424
425... which is much nicer.
426
427This theoretically may help improve twolf slightly (used in dimbox.c:142?).
428
429===-------------------------------------------------------------------------===
430
Chris Lattner3f6bfda2010-01-24 02:27:03 +0000431PR5945: This:
432define i32 @clamp0g(i32 %a) {
433entry:
434 %cmp = icmp slt i32 %a, 0
435 %sel = select i1 %cmp, i32 0, i32 %a
436 ret i32 %sel
437}
438
439Is compiled to this with the PowerPC (32-bit) backend:
440
441_clamp0g:
442 cmpwi cr0, r3, 0
443 li r2, 0
444 blt cr0, LBB1_2
445; BB#1: ; %entry
446 mr r2, r3
447LBB1_2: ; %entry
448 mr r3, r2
449 blr
450
451This could be reduced to the much simpler:
452
453_clamp0g:
454 srawi r2, r3, 31
455 andc r3, r3, r2
456 blr
457
458===-------------------------------------------------------------------------===
459
Nate Begeman2df99282006-03-16 18:50:44 +0000460int foo(int N, int ***W, int **TK, int X) {
461 int t, i;
462
463 for (t = 0; t < N; ++t)
464 for (i = 0; i < 4; ++i)
465 W[t / X][i][t % X] = TK[i][t];
466
467 return 5;
468}
469
Chris Lattnered511692006-03-16 22:25:55 +0000470We generate relatively atrocious code for this loop compared to gcc.
471
Chris Lattneref040dd2006-03-21 00:47:09 +0000472We could also strength reduce the rem and the div:
473http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
474
Chris Lattner28b1a0b2006-03-19 05:33:30 +0000475===-------------------------------------------------------------------------===
Chris Lattnered511692006-03-16 22:25:55 +0000476
Nate Begemanc0a8b6d2006-03-21 18:58:20 +0000477float foo(float X) { return (int)(X); }
478
Chris Lattner9d86a9d2006-03-22 05:33:23 +0000479Currently produces:
Nate Begemanc0a8b6d2006-03-21 18:58:20 +0000480
481_foo:
Nate Begemanc0a8b6d2006-03-21 18:58:20 +0000482 fctiwz f0, f1
483 stfd f0, -8(r1)
Chris Lattner9d86a9d2006-03-22 05:33:23 +0000484 lwz r2, -4(r1)
485 extsw r2, r2
486 std r2, -16(r1)
487 lfd f0, -16(r1)
488 fcfid f0, f0
Nate Begemanc0a8b6d2006-03-21 18:58:20 +0000489 frsp f1, f0
490 blr
491
Chris Lattner9d86a9d2006-03-22 05:33:23 +0000492We could use a target dag combine to turn the lwz/extsw into an lwa when the
493lwz has a single use. Since LWA is cracked anyway, this would be a codesize
494win only.
Nate Begemanc0a8b6d2006-03-21 18:58:20 +0000495
Chris Lattner716aefc2006-03-23 21:28:44 +0000496===-------------------------------------------------------------------------===
497
Chris Lattner057f09b2006-03-24 20:04:27 +0000498We generate ugly code for this:
499
500void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
501 unsigned code = 0;
502 if(dx < -dw) code |= 1;
503 if(dx > dw) code |= 2;
504 if(dy < -dw) code |= 4;
505 if(dy > dw) code |= 8;
506 if(dz < -dw) code |= 16;
507 if(dz > dw) code |= 32;
508 *ret = code;
509}
510
Chris Lattner420736d2006-03-25 06:47:10 +0000511===-------------------------------------------------------------------------===
512
Nate Begeman908049b2007-01-29 21:21:22 +0000513%struct.B = type { i8, [3 x i8] }
Nate Begeman75146202006-05-08 20:54:02 +0000514
Nate Begeman908049b2007-01-29 21:21:22 +0000515define void @bar(%struct.B* %b) {
Nate Begeman75146202006-05-08 20:54:02 +0000516entry:
Nate Begeman908049b2007-01-29 21:21:22 +0000517 %tmp = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
518 %tmp = load i32* %tmp ; <uint> [#uses=1]
519 %tmp3 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
520 %tmp4 = load i32* %tmp3 ; <uint> [#uses=1]
521 %tmp8 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=2]
522 %tmp9 = load i32* %tmp8 ; <uint> [#uses=1]
523 %tmp4.mask17 = shl i32 %tmp4, i8 1 ; <uint> [#uses=1]
524 %tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1]
525 %tmp.masked = and i32 %tmp, 2147483648 ; <uint> [#uses=1]
526 %tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1]
527 %tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1]
528 %tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1]
529 store i32 %tmp13, i32* %tmp8
Chris Lattner55c63252006-05-05 05:36:15 +0000530 ret void
531}
532
533We emit:
534
535_foo:
536 lwz r2, 0(r3)
Nate Begeman75146202006-05-08 20:54:02 +0000537 slwi r4, r2, 1
538 or r4, r4, r2
539 rlwimi r2, r4, 0, 0, 0
Nate Begeman4667f2c2006-05-08 17:38:32 +0000540 stw r2, 0(r3)
Chris Lattner55c63252006-05-05 05:36:15 +0000541 blr
542
Nate Begeman75146202006-05-08 20:54:02 +0000543We could collapse a bunch of those ORs and ANDs and generate the following
544equivalent code:
Chris Lattner55c63252006-05-05 05:36:15 +0000545
Nate Begeman4667f2c2006-05-08 17:38:32 +0000546_foo:
547 lwz r2, 0(r3)
Nate Begemand8624ed2006-05-08 19:09:24 +0000548 rlwinm r4, r2, 1, 0, 0
Nate Begeman4667f2c2006-05-08 17:38:32 +0000549 or r2, r2, r4
550 stw r2, 0(r3)
551 blr
Chris Lattner1eeedae2006-07-14 04:07:29 +0000552
553===-------------------------------------------------------------------------===
554
Chris Lattnerf0613e12006-09-14 20:56:30 +0000555We compile:
556
557unsigned test6(unsigned x) {
558 return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
559}
560
561into:
562
563_test6:
564 lis r2, 255
565 rlwinm r3, r3, 16, 0, 31
566 ori r2, r2, 255
567 and r3, r3, r2
568 blr
569
570GCC gets it down to:
571
572_test6:
573 rlwinm r0,r3,16,8,15
574 rlwinm r3,r3,16,24,31
575 or r3,r3,r0
576 blr
577
Chris Lattnerafd7a082007-01-18 07:34:57 +0000578
579===-------------------------------------------------------------------------===
580
581Consider a function like this:
582
583float foo(float X) { return X + 1234.4123f; }
584
585The FP constant ends up in the constant pool, so we need to get the LR register.
586 This ends up producing code like this:
587
588_foo:
589.LBB_foo_0: ; entry
590 mflr r11
591*** stw r11, 8(r1)
592 bl "L00000$pb"
593"L00000$pb":
594 mflr r2
595 addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
596 lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
597 fadds f1, f1, f0
598*** lwz r11, 8(r1)
599 mtlr r11
600 blr
601
602This is functional, but there is no reason to spill the LR register all the way
603to the stack (the two marked instrs): spilling it to a GPR is quite enough.
604
605Implementing this will require some codegen improvements. Nate writes:
606
607"So basically what we need to support the "no stack frame save and restore" is a
608generalization of the LR optimization to "callee-save regs".
609
610Currently, we have LR marked as a callee-save reg. The register allocator sees
611that it's callee save, and spills it directly to the stack.
612
613Ideally, something like this would happen:
614
615LR would be in a separate register class from the GPRs. The class of LR would be
616marked "unspillable". When the register allocator came across an unspillable
617reg, it would ask "what is the best class to copy this into that I *can* spill"
618If it gets a class back, which it will in this case (the gprs), it grabs a free
619register of that class. If it is then later necessary to spill that reg, so be
620it."
621
622===-------------------------------------------------------------------------===
Chris Lattner95b9d6e2007-01-31 19:49:20 +0000623
624We compile this:
625int test(_Bool X) {
626 return X ? 524288 : 0;
627}
628
629to:
630_test:
631 cmplwi cr0, r3, 0
632 lis r2, 8
633 li r3, 0
634 beq cr0, LBB1_2 ;entry
635LBB1_1: ;entry
636 mr r3, r2
637LBB1_2: ;entry
638 blr
639
640instead of:
641_test:
642 addic r2,r3,-1
643 subfe r0,r2,r3
644 slwi r3,r0,19
645 blr
646
647This sort of thing occurs a lot due to globalopt.
648
649===-------------------------------------------------------------------------===
Chris Lattner8abcfe12007-02-09 17:38:01 +0000650
Chris Lattnera9cf5b32010-01-23 18:42:37 +0000651We compile:
652
653define i32 @bar(i32 %x) nounwind readnone ssp {
654entry:
655 %0 = icmp eq i32 %x, 0 ; <i1> [#uses=1]
Chris Lattnerabb992d2010-01-24 00:09:49 +0000656 %neg = sext i1 %0 to i32 ; <i32> [#uses=1]
Chris Lattnera9cf5b32010-01-23 18:42:37 +0000657 ret i32 %neg
658}
659
660to:
661
662_bar:
Chris Lattnerabb992d2010-01-24 00:09:49 +0000663 cntlzw r2, r3
664 slwi r2, r2, 26
665 srawi r3, r2, 31
Chris Lattnera9cf5b32010-01-23 18:42:37 +0000666 blr
667
Chris Lattnerabb992d2010-01-24 00:09:49 +0000668it would be better to produce:
Chris Lattnera9cf5b32010-01-23 18:42:37 +0000669
670_bar:
671 addic r3,r3,-1
672 subfe r3,r3,r3
673 blr
674
675===-------------------------------------------------------------------------===
676
Chris Lattner8abcfe12007-02-09 17:38:01 +0000677We currently compile 32-bit bswap:
678
679declare i32 @llvm.bswap.i32(i32 %A)
680define i32 @test(i32 %A) {
681 %B = call i32 @llvm.bswap.i32(i32 %A)
682 ret i32 %B
683}
684
685to:
686
687_test:
688 rlwinm r2, r3, 24, 16, 23
689 slwi r4, r3, 24
690 rlwimi r2, r3, 8, 24, 31
691 rlwimi r4, r3, 8, 8, 15
692 rlwimi r4, r2, 0, 16, 31
693 mr r3, r4
694 blr
695
696it would be more efficient to produce:
697
698_foo: mr r0,r3
699 rlwinm r3,r3,8,0xffffffff
700 rlwimi r3,r0,24,0,7
701 rlwimi r3,r0,24,16,23
702 blr
703
704===-------------------------------------------------------------------------===
Chris Lattner013e0512007-03-25 04:46:28 +0000705
706test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
707
708__ZNK4llvm5APInt17countLeadingZerosEv:
709 ld r2, 0(r3)
710 cntlzd r2, r2
711 or r2, r2, r2 <<-- silly.
712 addi r3, r2, -64
713 blr
714
715The dead or is a 'truncate' from 64- to 32-bits.
716
717===-------------------------------------------------------------------------===
Chris Lattnerfcb1e612007-03-31 07:06:25 +0000718
719We generate horrible ppc code for this:
720
721#define N 2000000
722double a[N],c[N];
723void simpleloop() {
724 int j;
725 for (j=0; j<N; j++)
726 c[j] = a[j];
727}
728
729LBB1_1: ;bb
730 lfdx f0, r3, r4
731 addi r5, r5, 1 ;; Extra IV for the exit value compare.
732 stfdx f0, r2, r4
733 addi r4, r4, 8
734
735 xoris r6, r5, 30 ;; This is due to a large immediate.
736 cmplwi cr0, r6, 33920
737 bne cr0, LBB1_1
738
Chris Lattnerbf8ae842007-09-10 21:43:18 +0000739//===---------------------------------------------------------------------===//
740
741This:
742 #include <algorithm>
743 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
744 { return std::make_pair(a + b, a + b < a); }
745 bool no_overflow(unsigned a, unsigned b)
746 { return !full_add(a, b).second; }
747
748Should compile to:
749
750__Z11no_overflowjj:
751 add r4,r3,r4
752 subfc r3,r3,r4
753 li r3,0
754 adde r3,r3,r3
755 blr
756
757(or better) not:
758
759__Z11no_overflowjj:
760 add r2, r4, r3
761 cmplw cr7, r2, r3
762 mfcr r2
763 rlwinm r2, r2, 29, 31, 31
764 xori r3, r2, 1
765 blr
766
767//===---------------------------------------------------------------------===//
Chris Lattnerfcb1e612007-03-31 07:06:25 +0000768
Chris Lattnerfe39edd2008-01-08 06:46:30 +0000769We compile some FP comparisons into an mfcr with two rlwinms and an or. For
770example:
771#include <math.h>
772int test(double x, double y) { return islessequal(x, y);}
773int test2(double x, double y) { return islessgreater(x, y);}
774int test3(double x, double y) { return !islessequal(x, y);}
775
776Compiles into (all three are similar, but the bits differ):
777
778_test:
779 fcmpu cr7, f1, f2
780 mfcr r2
781 rlwinm r3, r2, 29, 31, 31
782 rlwinm r2, r2, 31, 31, 31
783 or r3, r2, r3
784 blr
785
786GCC compiles this into:
787
788 _test:
789 fcmpu cr7,f1,f2
790 cror 30,28,30
791 mfcr r3
792 rlwinm r3,r3,31,1
793 blr
794
795which is more efficient and can use mfocr. See PR642 for some more context.
796
797//===---------------------------------------------------------------------===//
Chris Lattner150943c2008-03-02 19:27:34 +0000798
799void foo(float *data, float d) {
800 long i;
801 for (i = 0; i < 8000; i++)
802 data[i] = d;
803}
804void foo2(float *data, float d) {
805 long i;
806 data--;
807 for (i = 0; i < 8000; i++) {
808 data[1] = d;
809 data++;
810 }
811}
812
813These compile to:
814
815_foo:
816 li r2, 0
817LBB1_1: ; bb
818 addi r4, r2, 4
819 stfsx f1, r3, r2
820 cmplwi cr0, r4, 32000
821 mr r2, r4
822 bne cr0, LBB1_1 ; bb
823 blr
824_foo2:
825 li r2, 0
826LBB2_1: ; bb
827 addi r4, r2, 4
828 stfsx f1, r3, r2
829 cmplwi cr0, r4, 32000
830 mr r2, r4
831 bne cr0, LBB2_1 ; bb
832 blr
833
834The 'mr' could be eliminated to folding the add into the cmp better.
835
836//===---------------------------------------------------------------------===//
Dale Johannesena7647e62008-11-17 18:56:34 +0000837Codegen for the following (low-probability) case deteriorated considerably
838when the correctness fixes for unordered comparisons went in (PR 642, 58871).
839It should be possible to recover the code quality described in the comments.
840
841; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
842; This should produce one 'or' or 'cror' instruction per function.
843
844; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
845; PR2964
846
847define i32 @test(double %x, double %y) nounwind {
848entry:
849 %tmp3 = fcmp ole double %x, %y ; <i1> [#uses=1]
850 %tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
851 ret i32 %tmp345
852}
853
854define i32 @test2(double %x, double %y) nounwind {
855entry:
856 %tmp3 = fcmp one double %x, %y ; <i1> [#uses=1]
857 %tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
858 ret i32 %tmp345
859}
860
861define i32 @test3(double %x, double %y) nounwind {
862entry:
863 %tmp3 = fcmp ugt double %x, %y ; <i1> [#uses=1]
864 %tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
865 ret i32 %tmp34
866}
867//===----------------------------------------------------------------------===//
868; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
869
870; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
871; should not be generated except with -enable-finite-only-fp-math or the like).
872; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
873; recognize a more elaborate tree than a simple SETxx.
874
875define double @test_FNEG_sel(double %A, double %B, double %C) {
Dan Gohmana9445e12010-03-02 01:11:08 +0000876 %D = fsub double -0.000000e+00, %A ; <double> [#uses=1]
Dale Johannesena7647e62008-11-17 18:56:34 +0000877 %Cond = fcmp ugt double %D, -0.000000e+00 ; <i1> [#uses=1]
878 %E = select i1 %Cond, double %B, double %C ; <double> [#uses=1]
879 ret double %E
880}
881
Dale Johannesen15ce1d72010-02-12 23:16:24 +0000882//===----------------------------------------------------------------------===//
883The save/restore sequence for CR in prolog/epilog is terrible:
884- Each CR subreg is saved individually, rather than doing one save as a unit.
885- On Darwin, the save is done after the decrement of SP, which means the offset
886from SP of the save slot can be too big for a store instruction, which means we
887need an additional register (currently hacked in 96015+96020; the solution there
888is correct, but poor).
889- On SVR4 the same thing can happen, and I don't think saving before the SP
890decrement is safe on that target, as there is no red zone. This is currently
891broken AFAIK, although it's not a target I can exercise.
892The following demonstrates the problem:
893extern void bar(char *p);
894void foo() {
895 char x[100000];
896 bar(x);
897 __asm__("" ::: "cr2");
898}