//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* lmw/stmw pass a la arm load store optimizer for prolog/epilog
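
For illustration, a hypothetical epilog that restores r27-r31 one register at
a time (registers and offsets made up for the example):

        lwz r27, -20(r1)
        lwz r28, -16(r1)
        lwz r29, -12(r1)
        lwz r30, -8(r1)
        lwz r31, -4(r1)

could become a single load-multiple (lmw rS, D(rA) fills rS through r31; stmw
is the analogous store for the prolog):

        lmw r27, -20(r1)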

===-------------------------------------------------------------------------===

On PPC64, this:

long f2 (long x) { return 0xfffffff000000000UL; }
long f3 (long x) { return 0x1ffffffffUL; }

could compile into:

_f2:
        li r3,-1
        rldicr r3,r3,0,27
        blr
_f3:
        li r3,-1
        rldicl r3,r3,0,31
        blr

we produce:

_f2:
        lis r2, 4095
        ori r2, r2, 65535
        sldi r3, r2, 36
        blr
_f3:
        li r2, 1
        sldi r2, r2, 32
        oris r2, r2, 65535
        ori r3, r2, 65535
        blr

===-------------------------------------------------------------------------===

This code:

unsigned add32carry(unsigned sum, unsigned x) {
  unsigned z = sum + x;
  if (sum + x < x)
    z++;
  return z;
}

Should compile to something like:

        addc r3,r3,r4
        addze r3,r3

instead we get:

        add r3, r4, r3
        cmplw cr7, r3, r4
        mfcr r4 ; 1
        rlwinm r4, r4, 29, 31, 31
        add r3, r3, r4

Ick.

===-------------------------------------------------------------------------===

Support 'update' load/store instructions. These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:                ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:                ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83 ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz) but it still beats us. If we
produced this with bdnz, the loop would be a single dispatch group.
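
A sketch of that bdnz form (register assignments carried over from our output
above and assumed free for this use):

        li r7, 0
        li r6, 143
        mtctr r6
LBB1_83:                ;bb420.i
        lbzx r8, r5, r7
        stbx r8, r4, r7
        addi r7, r7, 1
        bdnz LBB1_83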

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P) *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start. For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's. This is even more important in PIC
mode.
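
A sketch of that (assuming the pool entries are laid out contiguously, so each
one sits at a fixed 8-byte offset from the first):

_X:
        lis r2, ha16(.CPI_X_0)
        la r2, lo16(.CPI_X_0)(r2)
        lfd f0, 0(r2)
        lfd f2, 8(r2)
        fmadd f0, f1, f0, f2
        lfd f1, 16(r2)
        lfd f2, 24(r2)
        fmadd f1, f0, f1, f2
        blr
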
172
Chris Lattner39b248b2006-02-02 23:50:22 +0000173Note that this (and the static variable version) is discussed here for GCC:
174http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
175
Chris Lattneraabd0352007-08-23 15:16:03 +0000176Here's another example (the sgn function):
177double testf(double a) {
178 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
179}
180
181it produces a BB like this:
182LBB1_1: ; cond_true
183 lis r2, ha16(LCPI1_0)
184 lfs f0, lo16(LCPI1_0)(r2)
185 lis r2, ha16(LCPI1_1)
186 lis r3, ha16(LCPI1_2)
187 lfs f2, lo16(LCPI1_2)(r3)
188 lfs f3, lo16(LCPI1_1)(r2)
189 fsub f0, f0, f1
190 fsel f1, f0, f2, f3
191 blr
192
Chris Lattnera3c44542005-08-24 18:15:24 +0000193===-------------------------------------------------------------------------===
Nate Begeman92cce902005-09-06 15:30:48 +0000194
Chris Lattner33c1dab2006-02-03 06:22:11 +0000195PIC Code Gen IPO optimization:
196
197Squish small scalar globals together into a single global struct, allowing the
198address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
199of the GOT on targets with one).
200
201Note that this is discussed here for GCC:
202http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
203
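A small sketch of the transform (all names hypothetical):

/* Before: three globals, three PIC/GOT address computations. */
static int x, y, z;

/* After: one struct, whose single base address can be CSE'd;
   the fields become constant offsets g.x, g.y, g.z from that base. */
static struct { int x, y, z; } g;
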
===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub removal:

We still generate calls to foo$stub, and stubs, on Darwin. This is not
necessary when building with the Leopard (10.5) or later linker, as stubs are
generated by ld when necessary. Parameterizing this based on the deployment
target (-mmacosx-version-min) is probably enough. x86-32 does this right, see
its logic.

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce. It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub). This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.
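
Illustration of the rule (register assignments follow from the ABI text above;
the function names are made up):

void before(int X, double Y, int Z);  /* X -> r3, Y -> f1, Z -> r6 */
void after(int X, int Z, double Y);   /* X -> r3, Z -> r4, Y -> f1 */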

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad. Add something like a BITCAST to LLVM, then do an i-p transformation that
percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;  // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists. In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it. That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops. A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses, so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

compiles from:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

to:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b) t = *a;
  if (c) *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions. The isel should generate
the "two address" form of the instructions. When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.
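
For example (crbit numbers chosen arbitrarily; crbit 0 lives in cr0, crbit 4
in cr1, crbit 8 in cr2):

        cror 0, 0, 4    ; dest field is also a source: the cheap form
        cror 0, 4, 8    ; both sources differ from the dest: twice the cost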

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).

===-------------------------------------------------------------------------===

PR5945: This:
define i32 @clamp0g(i32 %a) {
entry:
        %cmp = icmp slt i32 %a, 0
        %sel = select i1 %cmp, i32 0, i32 %a
        ret i32 %sel
}

is compiled to this with the PowerPC (32-bit) backend:

_clamp0g:
        cmpwi cr0, r3, 0
        li r2, 0
        blt cr0, LBB1_2
; BB#1:         ; %entry
        mr r2, r3
LBB1_2:         ; %entry
        mr r3, r2
        blr

This could be reduced to the much simpler:

_clamp0g:
        srawi r2, r3, 31
        andc r3, r3, r2
        blr

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
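
A sketch of the strength-reduced form (assuming X > 0, as the original use of
t / X and t % X suggests):

int foo(int N, int ***W, int **TK, int X) {
  int t, i, q = 0, r = 0;   /* q tracks t / X, r tracks t % X */

  for (t = 0; t < N; ++t) {
    for (i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) {         /* carry into the quotient instead of dividing */
      r = 0;
      ++q;
    }
  }
  return 5;
}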

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use. Since LWA is cracked anyway, this would be a codesize
win only.
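
That is, roughly (only the marked line changes; the rest is as above):

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwa r2, -4(r1)          ; replaces the lwz/extsw pair
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr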

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*            ; <uint*> [#uses=1]
        %tmp = load i32* %tmp                           ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                         ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                         ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1              ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648     ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648          ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked           ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647              ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11                  ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_bar:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_bar:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR register.
This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements. Nate writes:

"So basically what we need to support the "no stack frame save and restore" is a
generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg. The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs. The class of LR would be
marked "unspillable". When the register allocator came across an unspillable
reg, it would ask "what is the best class to copy this into that I *can* spill?"
If it gets a class back, which it will in this case (the gprs), it grabs a free
register of that class. If it is then later necessary to spill that reg, so be
it."

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:
_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.

===-------------------------------------------------------------------------===

We compile:

define i32 @bar(i32 %x) nounwind readnone ssp {
entry:
  %0 = icmp eq i32 %x, 0                ; <i1> [#uses=1]
  %neg = sext i1 %0 to i32              ; <i32> [#uses=1]
  ret i32 %neg
}

to:

_bar:
        cntlzw r2, r3
        slwi r2, r2, 26
        srawi r3, r2, 31
        blr

it would be better to produce:

_bar:
        addic r3,r3,-1
        subfe r3,r3,r3
        blr

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_foo:   mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64- to 32-bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N 2000000
double a[N],c[N];
void simpleloop() {
  int j;
  for (j=0; j<N; j++)
    c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1          ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30        ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1

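Using the count register would remove both the extra IV and the large-immediate
compare; a sketch (2000000 = 30<<16 | 33920, register choice assumed):

        lis r5, 30
        ori r5, r5, 33920
        mtctr r5
LBB1_1: ;bb
        lfdx f0, r3, r4
        stfdx f0, r2, r4
        addi r4, r4, 8
        bdnz LBB1_1
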
//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
        add r4,r3,r4
        subfc r3,r3,r4
        li r3,0
        adde r3,r3,r3
        blr

(or better) not:

__Z11no_overflowjj:
        add r2, r4, r3
        cmplw cr7, r2, r3
        mfcr r2
        rlwinm r2, r2, 29, 31, 31
        xori r3, r2, 1
        blr

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or. For
example:
#include <math.h>
int test(double x, double y) { return islessequal(x, y); }
int test2(double x, double y) { return islessgreater(x, y); }
int test3(double x, double y) { return !islessequal(x, y); }

Compiles into (all three are similar, but the bits differ):

_test:
        fcmpu cr7, f1, f2
        mfcr r2
        rlwinm r3, r2, 29, 31, 31
        rlwinm r2, r2, 31, 31, 31
        or r3, r2, r3
        blr

GCC compiles this into:

_test:
        fcmpu cr7,f1,f2
        cror 30,28,30
        mfcr r3
        rlwinm r3,r3,31,1
        blr

which is more efficient and can use mfocrf. See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
  long i;
  for (i = 0; i < 8000; i++)
    data[i] = d;
}
void foo2(float *data, float d) {
  long i;
  data--;
  for (i = 0; i < 8000; i++) {
    data[1] = d;
    data++;
  }
}

These compile to:

_foo:
        li r2, 0
LBB1_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB1_1 ; bb
        blr
_foo2:
        li r2, 0
LBB2_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB2_1 ; bb
        blr

The 'mr' could be eliminated by folding the add into the cmp.

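A sketch of the loop without the 'mr' (store first, then update the induction
variable and compare it directly):

_foo:
        li r2, 0
LBB1_1: ; bb
        stfsx f1, r3, r2
        addi r2, r2, 4
        cmplwi cr0, r2, 32000
        bne cr0, LBB1_1 ; bb
        blr
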
//===---------------------------------------------------------------------===//

Codegen for the following (low-probability) case deteriorated considerably
when the correctness fixes for unordered comparisons went in (PR 642, 58871).
It should be possible to recover the code quality described in the comments.

; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
; This should produce one 'or' or 'cror' instruction per function.

; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
; PR2964

define i32 @test(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp ole double %x, %y          ; <i1> [#uses=1]
        %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
        ret i32 %tmp345
}

define i32 @test2(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp one double %x, %y          ; <i1> [#uses=1]
        %tmp345 = zext i1 %tmp3 to i32          ; <i32> [#uses=1]
        ret i32 %tmp345
}

define i32 @test3(double %x, double %y) nounwind {
entry:
        %tmp3 = fcmp ugt double %x, %y          ; <i1> [#uses=1]
        %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
        ret i32 %tmp34
}

//===----------------------------------------------------------------------===//
; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg

; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
; should not be generated except with -enable-finite-only-fp-math or the like).
; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
; recognize a more elaborate tree than a simple SETxx.

define double @test_FNEG_sel(double %A, double %B, double %C) {
        %D = fsub double -0.000000e+00, %A              ; <double> [#uses=1]
        %Cond = fcmp ugt double %D, -0.000000e+00       ; <i1> [#uses=1]
        %E = select i1 %Cond, double %B, double %C      ; <double> [#uses=1]
        ret double %E
}

//===----------------------------------------------------------------------===//

The save/restore sequence for CR in prolog/epilog is terrible:
- Each CR subreg is saved individually, rather than doing one save as a unit.
- On Darwin, the save is done after the decrement of SP, so the offset from SP
  of the save slot can be too big for a store instruction, and we need an
  additional register (currently hacked in 96015+96020; the solution there is
  correct, but poor).
- On SVR4 the same thing can happen, and I don't think saving before the SP
  decrement is safe on that target, as there is no red zone. This is currently
  broken AFAIK, although it's not a target I can exercise.

The following demonstrates the problem:

extern void bar(char *p);
void foo() {
  char x[100000];
  bar(x);
  __asm__("" ::: "cr2");
}
917}