Blame - lib/Target/PowerPC/README.txt - fp2-dev/platform/external/llvm

blob: 1341c66f2668a8bcb3df16fe014db8b1d2b3c45a [file] [log] [blame]

Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	1	//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
				2
				3	TODO:
				4	* gpr0 allocation
				5	* implement do-loop -> bdnz transform
Nate Begeman	10c8575	2008-02-11 04:16:09 +0000	[diff] [blame]	6	* lmw/stmw pass a la arm load store optimizer for prolog/epilog
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	7
				8	===-------------------------------------------------------------------------===
				9
				10	Support 'update' load/store instructions. These are cracked on the G5, but are
				11	still a codesize win.
				12
				13	With preinc enabled, this:
				14
				15	long* %test4(long* %X, long* %dest) {
				16	%Y = getelementptr long* %X, int 4
				17	%A = load long* %Y
				18	store long %A, long* %dest
				19	ret long* %Y
				20	}
				21
				22	compiles to:
				23
				24	_test4:
				25	mr r2, r3
				26	lwzu r5, 32(r2)
				27	lwz r3, 36(r3)
				28	stw r5, 0(r4)
				29	stw r3, 4(r4)
				30	mr r3, r2
				31	blr
				32
				33	with -sched=list-burr, I get:
				34
				35	_test4:
				36	lwz r2, 36(r3)
				37	lwzu r5, 32(r3)
				38	stw r2, 4(r4)
				39	stw r5, 0(r4)
				40	blr
				41
				42	===-------------------------------------------------------------------------===
				43
				44	We compile the hottest inner loop of viterbi to:
				45
				46	li r6, 0
				47	b LBB1_84 ;bb432.i
				48	LBB1_83: ;bb420.i
				49	lbzx r8, r5, r7
				50	addi r6, r7, 1
				51	stbx r8, r4, r7
				52	LBB1_84: ;bb432.i
				53	mr r7, r6
				54	cmplwi cr0, r7, 143
				55	bne cr0, LBB1_83 ;bb420.i
				56
				57	The CBE manages to produce:
				58
				59	li r0, 143
				60	mtctr r0
				61	loop:
				62	lbzx r2, r2, r11
				63	stbx r0, r2, r9
				64	addi r2, r2, 1
				65	bdz later
				66	b loop
				67
				68	This could be much better (bdnz instead of bdz) but it still beats us. If we
				69	produced this with bdnz, the loop would be a single dispatch group.
				70
				71	===-------------------------------------------------------------------------===
				72
				73	Compile:
				74
				75	void foo(int *P) {
				76	if (P) *P = 0;
				77	}
				78
				79	into:
				80
				81	_foo:
				82	cmpwi cr0,r3,0
				83	beqlr cr0
				84	li r0,0
				85	stw r0,0(r3)
				86	blr
				87
				88	This is effectively a simple form of predication.
				89
				90	===-------------------------------------------------------------------------===
				91
				92	Lump the constant pool for each function into ONE pic object, and reference
				93	pieces of it as offsets from the start. For functions like this (contrived
				94	to have lots of constants obviously):
				95
				96	double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
				97
				98	We generate:
				99
				100	_X:
				101	lis r2, ha16(.CPI_X_0)
				102	lfd f0, lo16(.CPI_X_0)(r2)
				103	lis r2, ha16(.CPI_X_1)
				104	lfd f2, lo16(.CPI_X_1)(r2)
				105	fmadd f0, f1, f0, f2
				106	lis r2, ha16(.CPI_X_2)
				107	lfd f1, lo16(.CPI_X_2)(r2)
				108	lis r2, ha16(.CPI_X_3)
				109	lfd f2, lo16(.CPI_X_3)(r2)
				110	fmadd f1, f0, f1, f2
				111	blr
				112
				113	It would be better to materialize .CPI_X into a register, then use immediates
				114	off of the register to avoid the lis's. This is even more important in PIC
				115	mode.
				116
				117	Note that this (and the static variable version) is discussed here for GCC:
				118	http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
				119
Chris Lattner	35d65d7	2007-08-23 15:16:03 +0000	[diff] [blame]	120	Here's another example (the sgn function):
				121	double testf(double a) {
				122	return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
				123	}
				124
				125	it produces a BB like this:
				126	LBB1_1: ; cond_true
				127	lis r2, ha16(LCPI1_0)
				128	lfs f0, lo16(LCPI1_0)(r2)
				129	lis r2, ha16(LCPI1_1)
				130	lis r3, ha16(LCPI1_2)
				131	lfs f2, lo16(LCPI1_2)(r3)
				132	lfs f3, lo16(LCPI1_1)(r2)
				133	fsub f0, f0, f1
				134	fsel f1, f0, f2, f3
				135	blr
				136
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	137	===-------------------------------------------------------------------------===
				138
				139	PIC Code Gen IPO optimization:
				140
				141	Squish small scalar globals together into a single global struct, allowing the
				142	address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
				143	of the GOT on targets with one).
				144
				145	Note that this is discussed here for GCC:
				146	http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
				147
				148	===-------------------------------------------------------------------------===
				149
				150	Implement Newton-Raphson method for improving estimate instructions to the
				151	correct accuracy, and implementing divide as multiply by reciprocal when it has
				152	more than one use. Itanium will want this too.
				153
				154	===-------------------------------------------------------------------------===
				155
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	156	Compile offsets from allocas:
				157
				158	int *%test() {
				159	%X = alloca { int, int }
				160	%Y = getelementptr {int,int}* %X, int 0, uint 1
				161	ret int* %Y
				162	}
				163
				164	into a single add, not two:
				165
				166	_test:
				167	addi r2, r1, -8
				168	addi r3, r2, 4
				169	blr
				170
				171	--> important for C++.
				172
				173	===-------------------------------------------------------------------------===
				174
				175	No loads or stores of the constants should be needed:
				176
				177	struct foo { double X, Y; };
				178	void xxx(struct foo F);
				179	void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
				180
				181	===-------------------------------------------------------------------------===
				182
Dale Johannesen	3960163	2009-07-01 23:36:02 +0000	[diff] [blame]	183	Darwin Stub removal:
				184
				185	We still generate calls to foo$stub, and stubs, on Darwin. This is not
				186	necessary on Leopard (10.5) or later, as stubs are generated by ld when
				187	necessary. The choice should depend on the value of -mmacosx-version-min.
				188	x86-32 does this right, see its logic.
				189
				190	===-------------------------------------------------------------------------===
				191
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	192	Darwin Stub LICM optimization:
				193
				194	Loops like this:
				195
				196	for (...) bar();
				197
				198	Have to go through an indirect stub if bar is external or linkonce. It would
				199	be better to compile it as:
				200
				201	fp = &bar;
				202	for (...) fp();
				203
				204	which only computes the address of bar once (instead of each time through the
				205	stub). This is Darwin specific and would have to be done in the code generator.
				206	Probably not a win on x86.
				207
Dale Johannesen	3960163	2009-07-01 23:36:02 +0000	[diff] [blame]	208	Note that removing stubs altogether, as in the previous item, is better yet.
				209
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	210	===-------------------------------------------------------------------------===
				211
				212	Simple IPO for argument passing, change:
				213	void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
				214
				215	the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
				216	of arguments get assigned to r3 through r10. That is, if you have a function
				217	foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
				218	argument bytes for r4 and r5. The trick then would be to shuffle the argument
				219	order for functions we can internalize so that the maximum number of
				220	integers/pointers get passed in regs before you see any of the fp arguments.
				221
				222	Instead of implementing this, it would actually probably be easier to just
				223	implement a PPC fastcc, where we could do whatever we wanted to the CC,
				224	including having this work sanely.
				225
				226	===-------------------------------------------------------------------------===
				227
				228	Fix Darwin FP-In-Integer Registers ABI
				229
				230	Darwin passes doubles in structures in integer registers, which is very very
				231	bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
				232	that percolates these things out of functions.
				233
				234	Check out how horrible this is:
				235	http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
				236
				237	This is an extension of "interprocedural CC unmunging" that can't be done with
				238	just fastcc.
				239
				240	===-------------------------------------------------------------------------===
				241
				242	Compile this:
				243
				244	int foo(int a) {
				245	int b = (a < 8);
				246	if (b) {
				247	return b * 3; // ignore the fact that this is always 3.
				248	} else {
				249	return 2;
				250	}
				251	}
				252
				253	into something not this:
				254
				255	_foo:
				256	1) cmpwi cr7, r3, 8
				257	mfcr r2, 1
				258	rlwinm r2, r2, 29, 31, 31
				259	1) cmpwi cr0, r3, 7
				260	bgt cr0, LBB1_2 ; UnifiedReturnBlock
				261	LBB1_1: ; then
				262	rlwinm r2, r2, 0, 31, 31
				263	mulli r3, r2, 3
				264	blr
				265	LBB1_2: ; UnifiedReturnBlock
				266	li r3, 2
				267	blr
				268
				269	In particular, the two compares (marked 1) could be shared by reversing one.
				270	This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
				271	same operands (but backwards) exists. In this case, this wouldn't save us
				272	anything though, because the compares still wouldn't be shared.
				273
				274	===-------------------------------------------------------------------------===
				275
				276	We should custom expand setcc instead of pretending that we have it. That
				277	would allow us to expose the access of the crbit after the mfcr, allowing
				278	that access to be trivially folded into other ops. A simple example:
				279
				280	int foo(int a, int b) { return (a < b) << 4; }
				281
				282	compiles into:
				283
				284	_foo:
				285	cmpw cr7, r3, r4
				286	mfcr r2, 1
				287	rlwinm r2, r2, 29, 31, 31
				288	slwi r3, r2, 4
				289	blr
				290
				291	===-------------------------------------------------------------------------===
				292
				293	Fold add and sub with constant into non-extern, non-weak addresses so this:
				294
				295	static int a;
				296	void bar(int b) { a = b; }
				297	void foo(unsigned char *c) {
				298	*c = a;
				299	}
				300
				301	So that
				302
				303	_foo:
				304	lis r2, ha16(_a)
				305	la r2, lo16(_a)(r2)
				306	lbz r2, 3(r2)
				307	stb r2, 0(r3)
				308	blr
				309
				310	Becomes
				311
				312	_foo:
				313	lis r2, ha16(_a+3)
				314	lbz r2, lo16(_a+3)(r2)
				315	stb r2, 0(r3)
				316	blr
				317
				318	===-------------------------------------------------------------------------===
				319
				320	We generate really bad code for this:
				321
				322	int f(signed char *a, _Bool b, _Bool c) {
				323	signed char t = 0;
				324	if (b) t = *a;
				325	if (c) *a = t;
				326	}
				327
				328	===-------------------------------------------------------------------------===
				329
				330	This:
				331	int test(unsigned *P) { return *P >> 24; }
				332
				333	Should compile to:
				334
				335	_test:
				336	lbz r3,0(r3)
				337	blr
				338
				339	not:
				340
				341	_test:
				342	lwz r2, 0(r3)
				343	srwi r3, r2, 24
				344	blr
				345
				346	===-------------------------------------------------------------------------===
				347
				348	On the G5, logical CR operations are more expensive in their three
				349	address form: ops that read/write the same register are half as expensive as
				350	those that read from two registers that are different from their destination.
				351
				352	We should model this with two separate instructions. The isel should generate
				353	the "two address" form of the instructions. When the register allocator
				354	detects that it needs to insert a copy due to the two-addressness of the CR
				355	logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
				356	we can convert to the "three address" instruction, to save code space.
				357
				358	This only matters when we start generating cr logical ops.
				359
				360	===-------------------------------------------------------------------------===
				361
				362	We should compile these two functions to the same thing:
				363
				364	#include <stdlib.h>
				365	void f(int a, int b, int *P) {
				366	*P = (a-b)>=0?(a-b):(b-a);
				367	}
				368	void g(int a, int b, int *P) {
				369	*P = abs(a-b);
				370	}
				371
				372	Further, they should compile to something better than:
				373
				374	_g:
				375	subf r2, r4, r3
				376	subfic r3, r2, 0
				377	cmpwi cr0, r2, -1
				378	bgt cr0, LBB2_2 ; entry
				379	LBB2_1: ; entry
				380	mr r2, r3
				381	LBB2_2: ; entry
				382	stw r2, 0(r5)
				383	blr
				384
				385	GCC produces:
				386
				387	_g:
				388	subf r4,r4,r3
				389	srawi r2,r4,31
				390	xor r0,r2,r4
				391	subf r0,r2,r0
				392	stw r0,0(r5)
				393	blr
				394
				395	... which is much nicer.
				396
				397	This theoretically may help improve twolf slightly (used in dimbox.c:142?).
				398
				399	===-------------------------------------------------------------------------===
				400
				401	int foo(int N, int ***W, int **TK, int X) {
				402	int t, i;
				403
				404	for (t = 0; t < N; ++t)
				405	for (i = 0; i < 4; ++i)
				406	W[t / X][i][t % X] = TK[i][t];
				407
				408	return 5;
				409	}
				410
				411	We generate relatively atrocious code for this loop compared to gcc.
				412
				413	We could also strength reduce the rem and the div:
				414	http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
				415
				416	===-------------------------------------------------------------------------===
				417
				418	float foo(float X) { return (int)(X); }
				419
				420	Currently produces:
				421
				422	_foo:
				423	fctiwz f0, f1
				424	stfd f0, -8(r1)
				425	lwz r2, -4(r1)
				426	extsw r2, r2
				427	std r2, -16(r1)
				428	lfd f0, -16(r1)
				429	fcfid f0, f0
				430	frsp f1, f0
				431	blr
				432
				433	We could use a target dag combine to turn the lwz/extsw into an lwa when the
				434	lwz has a single use. Since LWA is cracked anyway, this would be a codesize
				435	win only.
				436
				437	===-------------------------------------------------------------------------===
				438
				439	We generate ugly code for this:
				440
				441	void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
				442	unsigned code = 0;
				443	if(dx < -dw) code |= 1;
				444	if(dx > dw) code |= 2;
				445	if(dy < -dw) code |= 4;
				446	if(dy > dw) code |= 8;
				447	if(dz < -dw) code |= 16;
				448	if(dz > dw) code |= 32;
				449	*ret = code;
				450	}
				451
				452	===-------------------------------------------------------------------------===
				453
				454	Complete the signed i32 to FP conversion code using 64-bit registers
				455	transformation, good for PI. See PPCISelLowering.cpp, this comment:
				456
				457	// FIXME: disable this lowered code. This generates 64-bit register values,
				458	// and we don't model the fact that the top part is clobbered by calls. We
				459	// need to flag these together so that the value isn't live across a call.
				460	//setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
				461
				462	Also, if the registers are spilled to the stack, we have to ensure that all
				463	64-bits of them are save/restored, otherwise we will miscompile the code. It
				464	sounds like we need to get the 64-bit register classes going.
				465
				466	===-------------------------------------------------------------------------===
				467
				468	%struct.B = type { i8, [3 x i8] }
				469
				470	define void @bar(%struct.B* %b) {
				471	entry:
				472	%tmp = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
				473	%tmp = load i32* %tmp ; <uint> [#uses=1]
				474	%tmp3 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
				475	%tmp4 = load i32* %tmp3 ; <uint> [#uses=1]
				476	%tmp8 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=2]
				477	%tmp9 = load i32* %tmp8 ; <uint> [#uses=1]
				478	%tmp4.mask17 = shl i32 %tmp4, i8 1 ; <uint> [#uses=1]
				479	%tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1]
				480	%tmp.masked = and i32 %tmp, 2147483648 ; <uint> [#uses=1]
				481	%tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1]
				482	%tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1]
				483	%tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1]
				484	store i32 %tmp13, i32* %tmp8
				485	ret void
				486	}
				487
				488	We emit:
				489
				490	_foo:
				491	lwz r2, 0(r3)
				492	slwi r4, r2, 1
				493	or r4, r4, r2
				494	rlwimi r2, r4, 0, 0, 0
				495	stw r2, 0(r3)
				496	blr
				497
				498	We could collapse a bunch of those ORs and ANDs and generate the following
				499	equivalent code:
				500
				501	_foo:
				502	lwz r2, 0(r3)
				503	rlwinm r4, r2, 1, 0, 0
				504	or r2, r2, r4
				505	stw r2, 0(r3)
				506	blr
				507
				508	===-------------------------------------------------------------------------===
				509
				510	We compile:
				511
				512	unsigned test6(unsigned x) {
				513	return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
				514	}
				515
				516	into:
				517
				518	_test6:
				519	lis r2, 255
				520	rlwinm r3, r3, 16, 0, 31
				521	ori r2, r2, 255
				522	and r3, r3, r2
				523	blr
				524
				525	GCC gets it down to:
				526
				527	_test6:
				528	rlwinm r0,r3,16,8,15
				529	rlwinm r3,r3,16,24,31
				530	or r3,r3,r0
				531	blr
				532
				533
				534	===-------------------------------------------------------------------------===
				535
				536	Consider a function like this:
				537
				538	float foo(float X) { return X + 1234.4123f; }
				539
				540	The FP constant ends up in the constant pool, so we need to get the LR register.
				541	This ends up producing code like this:
				542
				543	_foo:
				544	.LBB_foo_0: ; entry
				545	mflr r11
				546	*** stw r11, 8(r1)
				547	bl "L00000$pb"
				548	"L00000$pb":
				549	mflr r2
				550	addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
				551	lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
				552	fadds f1, f1, f0
				553	*** lwz r11, 8(r1)
				554	mtlr r11
				555	blr
				556
				557	This is functional, but there is no reason to spill the LR register all the way
				558	to the stack (the two marked instrs): spilling it to a GPR is quite enough.
				559
				560	Implementing this will require some codegen improvements. Nate writes:
				561
				562	"So basically what we need to support the "no stack frame save and restore" is a
				563	generalization of the LR optimization to "callee-save regs".
				564
				565	Currently, we have LR marked as a callee-save reg. The register allocator sees
				566	that it's callee save, and spills it directly to the stack.
				567
				568	Ideally, something like this would happen:
				569
				570	LR would be in a separate register class from the GPRs. The class of LR would be
				571	marked "unspillable". When the register allocator came across an unspillable
				572	reg, it would ask "what is the best class to copy this into that I can spill"
				573	If it gets a class back, which it will in this case (the gprs), it grabs a free
				574	register of that class. If it is then later necessary to spill that reg, so be
				575	it."
				576
				577	===-------------------------------------------------------------------------===
				578
				579	We compile this:
				580	int test(_Bool X) {
				581	return X ? 524288 : 0;
				582	}
				583
				584	to:
				585	_test:
				586	cmplwi cr0, r3, 0
				587	lis r2, 8
				588	li r3, 0
				589	beq cr0, LBB1_2 ;entry
				590	LBB1_1: ;entry
				591	mr r3, r2
				592	LBB1_2: ;entry
				593	blr
				594
				595	instead of:
				596	_test:
				597	addic r2,r3,-1
				598	subfe r0,r2,r3
				599	slwi r3,r0,19
				600	blr
				601
				602	This sort of thing occurs a lot due to globalopt.
				603
				604	===-------------------------------------------------------------------------===
				605
				606	We currently compile 32-bit bswap:
				607
				608	declare i32 @llvm.bswap.i32(i32 %A)
				609	define i32 @test(i32 %A) {
				610	%B = call i32 @llvm.bswap.i32(i32 %A)
				611	ret i32 %B
				612	}
				613
				614	to:
				615
				616	_test:
				617	rlwinm r2, r3, 24, 16, 23
				618	slwi r4, r3, 24
				619	rlwimi r2, r3, 8, 24, 31
				620	rlwimi r4, r3, 8, 8, 15
				621	rlwimi r4, r2, 0, 16, 31
				622	mr r3, r4
				623	blr
				624
				625	it would be more efficient to produce:
				626
				627	_foo: mr r0,r3
				628	rlwinm r3,r3,8,0xffffffff
				629	rlwimi r3,r0,24,0,7
				630	rlwimi r3,r0,24,16,23
				631	blr
				632
				633	===-------------------------------------------------------------------------===
				634
				635	test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
				636
				637	__ZNK4llvm5APInt17countLeadingZerosEv:
				638	ld r2, 0(r3)
				639	cntlzd r2, r2
				640	or r2, r2, r2 <<-- silly.
				641	addi r3, r2, -64
				642	blr
				643
				644	The dead or is a 'truncate' from 64- to 32-bits.
				645
				646	===-------------------------------------------------------------------------===
				647
				648	We generate horrible ppc code for this:
				649
				650	#define N 2000000
				651	double a[N],c[N];
				652	void simpleloop() {
				653	int j;
				654	for (j=0; j<N; j++)
				655	c[j] = a[j];
				656	}
				657
				658	LBB1_1: ;bb
				659	lfdx f0, r3, r4
				660	addi r5, r5, 1 ;; Extra IV for the exit value compare.
				661	stfdx f0, r2, r4
				662	addi r4, r4, 8
				663
				664	xoris r6, r5, 30 ;; This is due to a large immediate.
				665	cmplwi cr0, r6, 33920
				666	bne cr0, LBB1_1
				667
Chris Lattner	4084d49	2007-09-10 21:43:18 +0000	[diff] [blame]	668	//===---------------------------------------------------------------------===//
				669
				670	This:
				671	#include <algorithm>
				672	inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
				673	{ return std::make_pair(a + b, a + b < a); }
				674	bool no_overflow(unsigned a, unsigned b)
				675	{ return !full_add(a, b).second; }
				676
				677	Should compile to:
				678
				679	__Z11no_overflowjj:
				680	add r4,r3,r4
				681	subfc r3,r3,r4
				682	li r3,0
				683	adde r3,r3,r3
				684	blr
				685
				686	(or better) not:
				687
				688	__Z11no_overflowjj:
				689	add r2, r4, r3
				690	cmplw cr7, r2, r3
				691	mfcr r2
				692	rlwinm r2, r2, 29, 31, 31
				693	xori r3, r2, 1
				694	blr
				695
				696	//===---------------------------------------------------------------------===//
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	697
Chris Lattner	6c36fb5	2008-01-08 06:46:30 +0000	[diff] [blame]	698	We compile some FP comparisons into an mfcr with two rlwinms and an or. For
				699	example:
				700	#include <math.h>
				701	int test(double x, double y) { return islessequal(x, y);}
				702	int test2(double x, double y) { return islessgreater(x, y);}
				703	int test3(double x, double y) { return !islessequal(x, y);}
				704
				705	Compiles into (all three are similar, but the bits differ):
				706
				707	_test:
				708	fcmpu cr7, f1, f2
				709	mfcr r2
				710	rlwinm r3, r2, 29, 31, 31
				711	rlwinm r2, r2, 31, 31, 31
				712	or r3, r2, r3
				713	blr
				714
				715	GCC compiles this into:
				716
				717	_test:
				718	fcmpu cr7,f1,f2
				719	cror 30,28,30
				720	mfcr r3
				721	rlwinm r3,r3,31,1
				722	blr
				723
				724	which is more efficient and can use mfocr. See PR642 for some more context.
				725
				726	//===---------------------------------------------------------------------===//
Chris Lattner	869440b	2008-03-02 19:27:34 +0000	[diff] [blame]	727
				728	void foo(float *data, float d) {
				729	long i;
				730	for (i = 0; i < 8000; i++)
				731	data[i] = d;
				732	}
				733	void foo2(float *data, float d) {
				734	long i;
				735	data--;
				736	for (i = 0; i < 8000; i++) {
				737	data[1] = d;
				738	data++;
				739	}
				740	}
				741
				742	These compile to:
				743
				744	_foo:
				745	li r2, 0
				746	LBB1_1: ; bb
				747	addi r4, r2, 4
				748	stfsx f1, r3, r2
				749	cmplwi cr0, r4, 32000
				750	mr r2, r4
				751	bne cr0, LBB1_1 ; bb
				752	blr
				753	_foo2:
				754	li r2, 0
				755	LBB2_1: ; bb
				756	addi r4, r2, 4
				757	stfsx f1, r3, r2
				758	cmplwi cr0, r4, 32000
				759	mr r2, r4
				760	bne cr0, LBB2_1 ; bb
				761	blr
				762
				763	The 'mr' could be eliminated by folding the add into the cmp better.
				764
				765	//===---------------------------------------------------------------------===//
Dale Johannesen	089c6c0	2008-11-17 18:56:34 +0000	[diff] [blame]	766	Codegen for the following (low-probability) case deteriorated considerably
				767	when the correctness fixes for unordered comparisons went in (PR 642, 58871).
				768	It should be possible to recover the code quality described in the comments.
				769
				770	; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
				771	; This should produce one 'or' or 'cror' instruction per function.
				772
				773	; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
				774	; PR2964
				775
				776	define i32 @test(double %x, double %y) nounwind {
				777	entry:
				778	%tmp3 = fcmp ole double %x, %y ; <i1> [#uses=1]
				779	%tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
				780	ret i32 %tmp345
				781	}
				782
				783	define i32 @test2(double %x, double %y) nounwind {
				784	entry:
				785	%tmp3 = fcmp one double %x, %y ; <i1> [#uses=1]
				786	%tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
				787	ret i32 %tmp345
				788	}
				789
				790	define i32 @test3(double %x, double %y) nounwind {
				791	entry:
				792	%tmp3 = fcmp ugt double %x, %y ; <i1> [#uses=1]
				793	%tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
				794	ret i32 %tmp34
				795	}
				796	//===----------------------------------------------------------------------===//
				797	; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
				798
				799	; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
				800	; should not be generated except with -enable-finite-only-fp-math or the like).
				801	; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
				802	; recognize a more elaborate tree than a simple SETxx.
				803
				804	define double @test_FNEG_sel(double %A, double %B, double %C) {
				805	%D = sub double -0.000000e+00, %A ; <double> [#uses=1]
				806	%Cond = fcmp ugt double %D, -0.000000e+00 ; <i1> [#uses=1]
				807	%E = select i1 %Cond, double %B, double %C ; <double> [#uses=1]
				808	ret double %E
				809	}
				810