//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* __builtin_return_address not supported on PPC

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz) but it still beats us.  If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

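A rough sketch of the transformation (hypothetical names, not code from the
tree):

/* Before: each global needs its own PIC address computation. */
static int a, b, c;

/* After: one base address, computed once and CSE'd; members are reached
   with small constant offsets from it. */
static struct { int a, b, c; } merged_globals;
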
===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when the
reciprocal has more than one use.  Itanium will want this too.

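For reference, one Newton-Raphson refinement step for a reciprocal estimate
(e.g. the output of fres) looks like this; refine_recip is just an
illustrative helper, not something in the tree:

/* If e approximates 1/d, then e*(2 - d*e) is correct to roughly twice as
   many bits.  Repeat until the required precision is reached. */
static double refine_recip(double d, double e) {
  return e * (2.0 - d * e);
}
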
===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

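For the example above, the rewrite for an internalized function would look
roughly like this (foo_shuffled is just an illustrative name):

/* Before: X -> r3, Y -> f1 (burning the r4/r5 slots), Z -> r6. */
void foo(int X, double Y, int Z);

/* After: X -> r3, Z -> r4, Y -> f1; the integer args now use the first GPRs. */
void foo_shuffled(int X, int Z, double Y);
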
===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

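A minimal case that hits this (illustrative only):

struct pt { double x; };
double get(struct pt p) { return p.x; }
/* p.x arrives in GPRs (r3/r4 in 32-bit mode) and has to be bounced through
   memory to reach an FPR before it can be returned in f1. */
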
===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

So that

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

Becomes

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three
address form: ops that read/write the same register are half as expensive as
those that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).

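GCC's sequence is the standard branchless abs idiom; in C it corresponds to
roughly:

  int d = a - b;
  int mask = d >> 31;        /* srawi: all ones if d is negative, else zero */
  *P = (d ^ mask) - mask;    /* xor + subf */
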
===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf

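The strength reduction amounts to maintaining the quotient and remainder
incrementally, since t only advances by 1.  A sketch in C of the transformed
source (assuming X > 0; foo2 is just an illustrative name):

int foo2(int N, int ***W, int **TK, int X) {
  int t, i, q = 0, r = 0;            /* invariant: q == t / X, r == t % X */

  for (t = 0; t < N; ++t) {
    for (i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) { r = 0; ++q; }    /* no div/rem in the loop body */
  }
  return 5;
}
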
===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw) code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw) code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw) code |= 32;
  *ret = code;
}

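One branchless formulation of the same computation, as a rough target for what
the generated code ought to be equivalent to (func2 is illustrative only):

void func2(unsigned int *ret, float dx, float dy, float dz, float dw) {
  *ret = (unsigned)(dx < -dw)
       | ((unsigned)(dx >  dw) << 1)
       | ((unsigned)(dy < -dw) << 2)
       | ((unsigned)(dy >  dw) << 3)
       | ((unsigned)(dz < -dw) << 4)
       | ((unsigned)(dz >  dw) << 5);
}
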
===-------------------------------------------------------------------------===

Complete the signed i32 to FP conversion code using 64-bit registers
transformation, good for PI.  See PPCISelLowering.cpp, this comment:

     // FIXME: disable this lowered code.  This generates 64-bit register values,
     // and we don't model the fact that the top part is clobbered by calls.  We
     // need to flag these together so that the value isn't live across a call.
     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved and restored, otherwise we will miscompile the code.
It sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*        ; <uint*> [#uses=1]
        %tmp = load i32* %tmp                       ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*       ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                     ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*       ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                     ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1          ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648      ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked       ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647          ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11              ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_bar:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_bar:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr

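GCC's two rlwinms implement a rotate-and-mask view of the same computation;
written out in C (equivalent source, for illustration; test6_rot is not in the
tree):

unsigned test6_rot(unsigned x) {
  unsigned r = (x << 16) | (x >> 16);   /* 32-bit rotate left by 16 */
  return r & 0x00FF00FFu;
}
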
===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR register.
This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements.  Nate writes:

"So basically what we need to support the "no stack frame save and restore" is a
generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg.  The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs. The class of LR would be
marked "unspillable".  When the register allocator came across an unspillable
reg, it would ask "what is the best class to copy this into that I *can* spill".
If it gets a class back, which it will in this case (the gprs), it grabs a free
register of that class.  If it is then later necessary to spill that reg, so be
it."

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:
_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_foo:   mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64 bits to 32 bits.

===-------------------------------------------------------------------------===

We generate horrible ppc code for this:

#define N  2000000
double   a[N],c[N];
void simpleloop() {
  int j;
  for (j=0; j<N; j++)
    c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1      ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30    ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1

===-------------------------------------------------------------------------===