//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* lmw/stmw pass a la the ARM load/store optimizer for prolog/epilog

===-------------------------------------------------------------------------===

Support 'update' load/store instructions. These are cracked on the G5, but are
still a codesize win.

With preinc enabled, this:

long *%test4(long *%X, long *%dest) {
        %Y = getelementptr long* %X, int 4
        %A = load long* %Y
        store long %A, long* %dest
        ret long* %Y
}

compiles to:

_test4:
        mr r2, r3
        lwzu r5, 32(r2)
        lwz r3, 36(r3)
        stw r5, 0(r4)
        stw r3, 4(r4)
        mr r3, r2
        blr

with -sched=list-burr, I get:

_test4:
        lwz r2, 36(r3)
        lwzu r5, 32(r3)
        stw r2, 4(r4)
        stw r5, 0(r4)
        blr

===-------------------------------------------------------------------------===

We compile the hottest inner loop of viterbi to:

        li r6, 0
        b LBB1_84       ;bb432.i
LBB1_83:        ;bb420.i
        lbzx r8, r5, r7
        addi r6, r7, 1
        stbx r8, r4, r7
LBB1_84:        ;bb432.i
        mr r7, r6
        cmplwi cr0, r7, 143
        bne cr0, LBB1_83        ;bb420.i

The CBE manages to produce:

        li r0, 143
        mtctr r0
loop:
        lbzx r2, r2, r11
        stbx r0, r2, r9
        addi r2, r2, 1
        bdz later
        b loop

This could be much better (bdnz instead of bdz) but it still beats us. If we
produced this with bdnz, the loop would be a single dispatch group.

===-------------------------------------------------------------------------===

Compile:

void foo(int *P) {
  if (P)  *P = 0;
}

into:

_foo:
        cmpwi cr0,r3,0
        beqlr cr0
        li r0,0
        stw r0,0(r3)
        blr

This is effectively a simple form of predication.

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE PIC object, and reference
pieces of it as offsets from the start. For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of that register to avoid the lis's. This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

Here's another example (the sgn function):
double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

it produces a BB like this:
LBB1_1: ; cond_true
        lis r2, ha16(LCPI1_0)
        lfs f0, lo16(LCPI1_0)(r2)
        lis r2, ha16(LCPI1_1)
        lis r3, ha16(LCPI1_2)
        lfs f2, lo16(LCPI1_2)(r3)
        lfs f3, lo16(LCPI1_1)(r2)
        fsub f0, f0, f1
        fsel f1, f0, f2, f3
        blr

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
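
As a source-level sketch of the idea (the names here are hypothetical, not
what the optimizer would actually emit):

/* Before: each of a, b, c needs its own PIC/GOT address computation. */
static int a, b, c;

/* After: one base address is materialized (and CSE'd); the fields are
   reached as constant offsets from that base. */
static struct { int a, b, c; } merged;

int sum(void) {
  return merged.a + merged.b + merged.c;  /* one base, three offsets */
}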

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply-by-reciprocal when it has
more than one use. Itanium will want this too.
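
A minimal sketch of one refinement step for a reciprocal estimate (the
initial estimate would come from an instruction like fres; the function name
is illustrative):

/* One Newton-Raphson step for r ~ 1/a:  r' = r * (2 - a*r).
   Each step roughly doubles the number of correct bits, so a couple of
   steps on a hardware estimate like fres reach full float accuracy. */
float refine_recip(float a, float r) {
  return r * (2.0f - a * r);
}

With the reciprocal refined once, x/d and y/d become x*r and y*r, which is
the "more than one use" payoff described above.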

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce. It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub). This is Darwin specific and would have to be done in the code
generator. Probably not a win on x86.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes
worth of arguments get assigned to r3 through r10. That is, if you have a
function foo(int, double, int) you get r3, f1, r6, since the 64-bit double ate
up the argument bytes for r4 and r5. The trick then would be to shuffle the
argument order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.
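
To make the register assignment concrete (a hedged illustration of the
Darwin rules described above, not compiler output):

void foo(int X, double Y, int Z);   /* X -> r3, Y -> f1, Z -> r6:
                                       Y burns the bytes for r4 and r5 */
void foo2(int X, int Z, double Y);  /* X -> r3, Z -> r4, Y -> f1:
                                       every integer arg lands in a reg
                                       before the double's slots are used */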

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad. Add something like a BIT_CONVERT to LLVM, then do an interprocedural
transformation that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;  // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists. In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it. That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops. A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses, so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

which currently compiles to:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

becomes:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions. The isel should generate
the "two address" form of the instructions. When the register allocator
detects that it needs to insert a copy due to the two-addressness of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).
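
The GCC sequence is the classic branchless abs; as a C sketch of the trick
(relying on arithmetic right shift of a negative int, which is
implementation-defined but what PPC compilers do):

int abs_branchless(int x) {
  int m = x >> 31;       /* srawi: m = 0 if x >= 0, -1 if x < 0 */
  return (x ^ m) - m;    /* xor + subf: flip the bits and add 1 when negative */
}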

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
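
A source-level sketch of the strength reduction (assuming X > 0, as the
original div/rem would need for sensible behavior anyway): since t advances
by 1 each iteration, the quotient and remainder can be maintained
incrementally instead of recomputed.

int foo_sr(int N, int ***W, int **TK, int X) {
  int q = 0, r = 0;              /* q = t / X, r = t % X, kept in lockstep */
  for (int t = 0; t < N; ++t) {
    for (int i = 0; i < 4; ++i)
      W[q][i][r] = TK[i][t];
    if (++r == X) {              /* carry into the quotient; no div/rem */
      r = 0;
      ++q;
    }
  }
  return 5;
}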

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use. Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the signed i32 to FP conversion code using 64-bit registers
transformation, good for PI. See PPCISelLowering.cpp, this comment:

  // FIXME: disable this lowered code.  This generates 64-bit register values,
  // and we don't model the fact that the top part is clobbered by calls.  We
  // need to flag these together so that the value isn't live across a call.
  //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code. It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { i8, [3 x i8] }

define void @bar(%struct.B* %b) {
entry:
        %tmp = bitcast %struct.B* %b to i32*            ; <uint*> [#uses=1]
        %tmp = load i32* %tmp                           ; <uint> [#uses=1]
        %tmp3 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=1]
        %tmp4 = load i32* %tmp3                         ; <uint> [#uses=1]
        %tmp8 = bitcast %struct.B* %b to i32*           ; <uint*> [#uses=2]
        %tmp9 = load i32* %tmp8                         ; <uint> [#uses=1]
        %tmp4.mask17 = shl i32 %tmp4, i8 1              ; <uint> [#uses=1]
        %tmp1415 = and i32 %tmp4.mask17, 2147483648     ; <uint> [#uses=1]
        %tmp.masked = and i32 %tmp, 2147483648          ; <uint> [#uses=1]
        %tmp11 = or i32 %tmp1415, %tmp.masked           ; <uint> [#uses=1]
        %tmp12 = and i32 %tmp9, 2147483647              ; <uint> [#uses=1]
        %tmp13 = or i32 %tmp12, %tmp11                  ; <uint> [#uses=1]
        store i32 %tmp13, i32* %tmp8
        ret void
}

We emit:

_foo:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We compile:

unsigned test6(unsigned x) {
  return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
}

into:

_test6:
        lis r2, 255
        rlwinm r3, r3, 16, 0, 31
        ori r2, r2, 255
        and r3, r3, r2
        blr

GCC gets it down to:

_test6:
        rlwinm r0,r3,16,8,15
        rlwinm r3,r3,16,24,31
        or r3,r3,r0
        blr
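
The whole expression is just a rotate followed by a mask, which is exactly
what rlwinm encodes; because the mask 0x00FF00FF is not one contiguous bit
field, GCC's two rlwinms pick it up one field at a time. A C sketch of the
equivalence (rotl32 is a plain helper, not an intrinsic):

unsigned rotl32(unsigned x, int n) {
  return (x << n) | (x >> (32 - n));   /* n in 1..31 */
}

unsigned test6_rot(unsigned x) {
  /* same value as test6: rotate left 16, keep bytes 1 and 3 */
  return rotl32(x, 16) & 0x00FF00FFu;
}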

===-------------------------------------------------------------------------===

Consider a function like this:

float foo(float X) { return X + 1234.4123f; }

The FP constant ends up in the constant pool, so we need to get the LR
register. This ends up producing code like this:

_foo:
.LBB_foo_0:     ; entry
        mflr r11
***     stw r11, 8(r1)
        bl "L00000$pb"
"L00000$pb":
        mflr r2
        addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
        lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
        fadds f1, f1, f0
***     lwz r11, 8(r1)
        mtlr r11
        blr

This is functional, but there is no reason to spill the LR register all the way
to the stack (the two marked instrs): spilling it to a GPR is quite enough.

Implementing this will require some codegen improvements. Nate writes:

"So basically what we need to support the "no stack frame save and restore" is
a generalization of the LR optimization to "callee-save regs".

Currently, we have LR marked as a callee-save reg. The register allocator sees
that it's callee save, and spills it directly to the stack.

Ideally, something like this would happen:

LR would be in a separate register class from the GPRs. The class of LR would
be marked "unspillable". When the register allocator came across an
unspillable reg, it would ask "what is the best class to copy this into that I
*can* spill?" If it gets a class back, which it will in this case (the gprs),
it grabs a free register of that class. If it is then later necessary to spill
that reg, so be it."

===-------------------------------------------------------------------------===

We compile this:
int test(_Bool X) {
  return X ? 524288 : 0;
}

to:
_test:
        cmplwi cr0, r3, 0
        lis r2, 8
        li r3, 0
        beq cr0, LBB1_2 ;entry
LBB1_1: ;entry
        mr r3, r2
LBB1_2: ;entry
        blr

instead of:
_test:
        addic r2,r3,-1
        subfe r0,r2,r3
        slwi r3,r0,19
        blr

This sort of thing occurs a lot due to globalopt.
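
The branchless form falls out because 524288 is 1 << 19 and a _Bool is
already 0 or 1; a source-level sketch of what the GCC sequence computes:

int test_branchless(_Bool X) {
  /* addic/subfe normalize the value to 0 or 1; slwi shifts it into place.
     Since _Bool is already 0/1, this is just X << 19. */
  return (int)X << 19;
}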

===-------------------------------------------------------------------------===

We currently compile 32-bit bswap:

declare i32 @llvm.bswap.i32(i32 %A)
define i32 @test(i32 %A) {
        %B = call i32 @llvm.bswap.i32(i32 %A)
        ret i32 %B
}

to:

_test:
        rlwinm r2, r3, 24, 16, 23
        slwi r4, r3, 24
        rlwimi r2, r3, 8, 24, 31
        rlwimi r4, r3, 8, 8, 15
        rlwimi r4, r2, 0, 16, 31
        mr r3, r4
        blr

it would be more efficient to produce:

_foo:   mr r0,r3
        rlwinm r3,r3,8,0xffffffff
        rlwimi r3,r0,24,0,7
        rlwimi r3,r0,24,16,23
        blr
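
The efficient sequence is one rotate plus two inserts. In C, a sketch of the
same decomposition (rotl is a plain helper, not an intrinsic):

unsigned rotl(unsigned x, int n) {
  return (x << n) | (x >> (32 - n));
}

unsigned bswap32(unsigned x) {
  /* rotate left 8, then patch in the two bytes the rotate got wrong;
     each masked term maps to one rlwimi insert in the asm above */
  return (rotl(x, 8) & 0x00FF00FFu) | (rotl(x, 24) & 0xFF00FF00u);
}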

===-------------------------------------------------------------------------===

test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:

__ZNK4llvm5APInt17countLeadingZerosEv:
        ld r2, 0(r3)
        cntlzd r2, r2
        or r2, r2, r2     <<-- silly.
        addi r3, r2, -64
        blr

The dead or is a 'truncate' from 64 to 32 bits.

===-------------------------------------------------------------------------===

We generate horrible PPC code for this:

#define N  2000000
double a[N], c[N];
void simpleloop() {
  int j;
  for (j = 0; j < N; j++)
    c[j] = a[j];
}

LBB1_1: ;bb
        lfdx f0, r3, r4
        addi r5, r5, 1          ;; Extra IV for the exit value compare.
        stfdx f0, r2, r4
        addi r4, r4, 8

        xoris r6, r5, 30        ;; This is due to a large immediate.
        cmplwi cr0, r6, 33920
        bne cr0, LBB1_1

//===---------------------------------------------------------------------===//

This:
  #include <algorithm>
  inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
  { return std::make_pair(a + b, a + b < a); }
  bool no_overflow(unsigned a, unsigned b)
  { return !full_add(a, b).second; }

Should compile to:

__Z11no_overflowjj:
        add r4,r3,r4
        subfc r3,r3,r4
        li r3,0
        adde r3,r3,r3
        blr

(or better) not:

__Z11no_overflowjj:
        add r2, r4, r3
        cmplw cr7, r2, r3
        mfcr r2
        rlwinm r2, r2, 29, 31, 31
        xori r3, r2, 1
        blr
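
Stripped of the pair machinery, the whole function is a single unsigned
comparison, which is why the carry-based sequence above suffices. A minimal C
equivalent (hypothetical helper name):

_Bool no_overflow_simple(unsigned a, unsigned b) {
  /* unsigned add wraps, so a + b < a exactly when the add overflowed;
     the desired asm reads this carry directly via subfc/adde */
  return a + b >= a;
}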

//===---------------------------------------------------------------------===//

We compile some FP comparisons into an mfcr with two rlwinms and an or. For
example:
#include <math.h>
int test(double x, double y) { return islessequal(x, y); }
int test2(double x, double y) { return islessgreater(x, y); }
int test3(double x, double y) { return !islessequal(x, y); }

Compiles into (all three are similar, but the bits differ):

_test:
        fcmpu cr7, f1, f2
        mfcr r2
        rlwinm r3, r2, 29, 31, 31
        rlwinm r2, r2, 31, 31, 31
        or r3, r2, r3
        blr

GCC compiles this into:

_test:
        fcmpu cr7,f1,f2
        cror 30,28,30
        mfcr r3
        rlwinm r3,r3,31,1
        blr

which is more efficient and can use mfocrf. See PR642 for some more context.

//===---------------------------------------------------------------------===//

void foo(float *data, float d) {
  long i;
  for (i = 0; i < 8000; i++)
    data[i] = d;
}
void foo2(float *data, float d) {
  long i;
  data--;
  for (i = 0; i < 8000; i++) {
    data[1] = d;
    data++;
  }
}

These compile to:

_foo:
        li r2, 0
LBB1_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB1_1 ; bb
        blr
_foo2:
        li r2, 0
LBB2_1: ; bb
        addi r4, r2, 4
        stfsx f1, r3, r2
        cmplwi cr0, r4, 32000
        mr r2, r4
        bne cr0, LBB2_1 ; bb
        blr

The 'mr' could be eliminated by folding the add into the cmp better.

//===---------------------------------------------------------------------===//