//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* Implement __builtin_trap (ISD::TRAP) as 'tw 31, 0, 0' aka 'trap'.
* lmw/stmw pass a la arm load store optimizer for prolog/epilog

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz) but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.
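
For reference, the bdnz form of the loop would look something like this (a
sketch, not actual compiler output; the register assignments are assumed):

        li r7, 0
        li r0, 143
        mtctr r0
loop:
        lbzx r8, r5, r7
        stbx r8, r4, r7
        addi r7, r7, 1
        bdnz loop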

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.
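
A sketch of what that could look like, assuming the four constants were laid
out contiguously from a single .CPI_X base (the layout is an assumption here):

_X:
        lis r2, ha16(.CPI_X)
        la r2, lo16(.CPI_X)(r2)
        lfd f0, 0(r2)
        lfd f2, 8(r2)
        fmadd f0, f1, f0, f2
        lfd f1, 16(r2)
        lfd f2, 24(r2)
        fmadd f1, f0, f1, f2
        blr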
117
Chris Lattner39b248b2006-02-02 23:50:22 +0000118Note that this (and the static variable version) is discussed here for GCC:
119http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
120
Chris Lattneraabd0352007-08-23 15:16:03 +0000121Here's another example (the sgn function):
122double testf(double a) {
123 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
124}
125
126it produces a BB like this:
127LBB1_1: ; cond_true
128 lis r2, ha16(LCPI1_0)
129 lfs f0, lo16(LCPI1_0)(r2)
130 lis r2, ha16(LCPI1_1)
131 lis r3, ha16(LCPI1_2)
132 lfs f2, lo16(LCPI1_2)(r3)
133 lfs f3, lo16(LCPI1_1)(r2)
134 fsub f0, f0, f1
135 fsel f1, f0, f2, f3
136 blr
137
Chris Lattnera3c44542005-08-24 18:15:24 +0000138===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
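
In C terms the transformation looks like this (a sketch with hypothetical
globals; the real change would be an IPO pass over the IR):

/* before: two globals, two PIC address formations */
static int a;
static int b;

/* after: one global, one CSE'd base address; accesses become merged.a and
   merged.b, i.e. constant offsets from a single pointer */
static struct { int a; int b; } merged;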

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.
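
The refinement step itself is just x1 = x0 * (2 - d * x0); a minimal C sketch,
assuming the initial estimate x0 comes from the hardware estimate instruction
(fres):

float reciprocal(float d, float x0) {
  /* each Newton-Raphson step roughly doubles the number of correct bits */
  float x1 = x0 * (2.0f - d * x0);
  float x2 = x1 * (2.0f - d * x1);
  return x2;
}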

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr
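
That is, the desired output is just the folded form (assuming the alloca lives
at r1-8 as above):

_test:
        addi r3, r1, -4
        blr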

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64-bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr
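
With the crbit access exposed, the shift could fold into the bit extract,
giving something like this (a sketch; the rotate/mask amounts are my own
arithmetic, not verified output):

_foo:
        cmpw cr7, r3, r4
        mfcr r2
        rlwinm r3, r2, 1, 27, 27
        blr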

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses, so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

which currently compiles to:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

instead becomes:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three
address form: ops that read/write the same register are half as expensive as
those that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
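
A sketch of the strength-reduced form, maintaining the quotient and remainder
incrementally instead of computing t / X and t % X every iteration (assumes
X > 0, as the original loop effectively does):

int foo(int N, int ***W, int **TK, int X) {
  int t, i, q = 0, r = 0;

  for (t = 0; t < N; ++t) {
    for (i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) {   /* carry the div/rem across iterations */
      r = 0;
      ++q;
    }
  }
  return 5;
}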

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.
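
The combined form would be something like this (a sketch; lwa requires a
displacement that is a multiple of 4, which -4 satisfies):

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwa r2, -4(r1)
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr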

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the 'signed i32 to FP conversion using 64-bit registers'
transformation; it is good for PI.  See PPCISelLowering.cpp, this comment:

  // FIXME: disable this lowered code.  This generates 64-bit register values,
  // and we don't model the fact that the top part is clobbered by calls.  We
  // need to flag these together so that the value isn't live across a call.
  //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code.  It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*            ; <uint*> [#uses=1]
        %tmp = load i32* %tmp                           ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                         ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                         ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1              ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648     ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648          ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked           ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647              ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11                  ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_foo:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR register.
This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.
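
Concretely, dropping the two starred instructions and letting r11 carry the
saved LR (a sketch; assumes r11 is otherwise free across the function):

_foo:
        mflr r11
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
        mtlr r11
        blr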

Implementing this will require some codegen improvements.  Nate writes:

"So basically what we need to support the "no stack frame save and restore" is a
generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg.  The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs. The class of LR would be
marked "unspillable".  When the register allocator came across an unspillable
reg, it would ask "what is the best class to copy this into that I *can* spill"
If it gets a class back, which it will in this case (the gprs), it grabs a free
register of that class.  If it is then later necessary to spill that reg, so be
it."

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:
_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_foo:   mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64- to 32-bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N  2000000
double   a[N],c[N];
void simpleloop() {
  int j;
  for (j=0; j<N; j++)
    c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1                 ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30               ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1
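
Both problems disappear if the trip count is put in the count register (a
sketch of the desired loop; register assignments assumed):

        lis r5, 30
        ori r5, r5, 33920              ;; 2000000 = 30*65536 + 33920
        mtctr r5
LBB1_1: ;bb
        lfdx f0, r3, r4
        stfdx f0, r2, r4
        addi r4, r4, 8
        bdnz LBB1_1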

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
        add r4,r3,r4
        subfc r3,r3,r4
        li r3,0
        adde r3,r3,r3
        blr

(or better) not:

__Z11no_overflowjj:
        add r2, r4, r3
        cmplw cr7, r2, r3
        mfcr r2
        rlwinm r2, r2, 29, 31, 31
        xori r3, r2, 1
        blr

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or.  For
example:
#include <math.h>
int test(double x, double y) { return islessequal(x, y);}
int test2(double x, double y) { return islessgreater(x, y);}
int test3(double x, double y) { return !islessequal(x, y);}

Compiles into (all three are similar, but the bits differ):

_test:
        fcmpu cr7, f1, f2
        mfcr r2
        rlwinm r3, r2, 29, 31, 31
        rlwinm r2, r2, 31, 31, 31
        or r3, r2, r3
        blr

GCC compiles this into:

_test:
        fcmpu cr7,f1,f2
        cror 30,28,30
        mfcr r3
        rlwinm r3,r3,31,1
        blr

which is more efficient and can use mfocr.  See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
  long i;
  for (i = 0; i < 8000; i++)
    data[i] = d;
}
void foo2(float *data, float d) {
  long i;
  data--;
  for (i = 0; i < 8000; i++) {
    data[1] = d;
    data++;
  }
}

These compile to:

_foo:
        li r2, 0
LBB1_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB1_1 ; bb
        blr
_foo2:
        li r2, 0
LBB2_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB2_1 ; bb
        blr

The 'mr' could be eliminated by folding the add into the cmp better.
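
That is, advance the induction variable first and compare the updated value (a
sketch):

_foo:
        li r2, 0
LBB1_1: ; bb
        stfsx f1, r3, r2
        addi r2, r2, 4
        cmplwi cr0, r2, 32000
        bne cr0, LBB1_1 ; bb
        blr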

//===---------------------------------------------------------------------===//