Blame - lib/Target/PowerPC/README.txt - fp2-dev/platform/external/llvm

blob: f5e50fc808a8d87ebe3547e0c5d3eaa2a5c20df6 [file] [log] [blame]

Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	1	//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//
				2
				3	TODO:
				4	* gpr0 allocation
				5	* implement do-loop -> bdnz transform
Nate Begeman	10c8575	2008-02-11 04:16:09 +0000	[diff] [blame]	6	* lmw/stmw pass a la arm load store optimizer for prolog/epilog
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	7
				8	===-------------------------------------------------------------------------===
				9
				10	Support 'update' load/store instructions. These are cracked on the G5, but are
				11	still a codesize win.
				12
				13	With preinc enabled, this:
				14
				15	long *%test4(long *%X, long *%dest) {
				16	%Y = getelementptr long* %X, int 4
				17	%A = load long* %Y
				18	store long %A, long* %dest
				19	ret long* %Y
				20	}
				21
				22	compiles to:
				23
				24	_test4:
				25	mr r2, r3
				26	lwzu r5, 32(r2)
				27	lwz r3, 36(r3)
				28	stw r5, 0(r4)
				29	stw r3, 4(r4)
				30	mr r3, r2
				31	blr
				32
				33	with -sched=list-burr, I get:
				34
				35	_test4:
				36	lwz r2, 36(r3)
				37	lwzu r5, 32(r3)
				38	stw r2, 4(r4)
				39	stw r5, 0(r4)
				40	blr
				41
				42	===-------------------------------------------------------------------------===
				43
				44	We compile the hottest inner loop of viterbi to:
				45
				46	li r6, 0
				47	b LBB1_84 ;bb432.i
				48	LBB1_83: ;bb420.i
				49	lbzx r8, r5, r7
				50	addi r6, r7, 1
				51	stbx r8, r4, r7
				52	LBB1_84: ;bb432.i
				53	mr r7, r6
				54	cmplwi cr0, r7, 143
				55	bne cr0, LBB1_83 ;bb420.i
				56
				57	The CBE manages to produce:
				58
				59	li r0, 143
				60	mtctr r0
				61	loop:
				62	lbzx r2, r2, r11
				63	stbx r0, r2, r9
				64	addi r2, r2, 1
				65	bdz later
				66	b loop
				67
				68	This could be much better (bdnz instead of bdz) but it still beats us. If we
				69	produced this with bdnz, the loop would be a single dispatch group.
				70
				71	===-------------------------------------------------------------------------===
				72
				73	Compile:
				74
				75	void foo(int *P) {
				76	if (P) *P = 0;
				77	}
				78
				79	into:
				80
				81	_foo:
				82	cmpwi cr0,r3,0
				83	beqlr cr0
				84	li r0,0
				85	stw r0,0(r3)
				86	blr
				87
				88	This is effectively a simple form of predication.
				89
				90	===-------------------------------------------------------------------------===
				91
				92	Lump the constant pool for each function into ONE pic object, and reference
				93	pieces of it as offsets from the start. For functions like this (contrived
				94	to have lots of constants obviously):
				95
				96	double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
				97
				98	We generate:
				99
				100	_X:
				101	lis r2, ha16(.CPI_X_0)
				102	lfd f0, lo16(.CPI_X_0)(r2)
				103	lis r2, ha16(.CPI_X_1)
				104	lfd f2, lo16(.CPI_X_1)(r2)
				105	fmadd f0, f1, f0, f2
				106	lis r2, ha16(.CPI_X_2)
				107	lfd f1, lo16(.CPI_X_2)(r2)
				108	lis r2, ha16(.CPI_X_3)
				109	lfd f2, lo16(.CPI_X_3)(r2)
				110	fmadd f1, f0, f1, f2
				111	blr
				112
				113	It would be better to materialize .CPI_X into a register, then use immediates
				114	off of the register to avoid the lis's. This is even more important in PIC
				115	mode.
				116
				117	Note that this (and the static variable version) is discussed here for GCC:
				118	http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
				119
Chris Lattner	35d65d7	2007-08-23 15:16:03 +0000	[diff] [blame]	120	Here's another example (the sgn function):
				121	double testf(double a) {
				122	return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
				123	}
				124
				125	it produces a BB like this:
				126	LBB1_1: ; cond_true
				127	lis r2, ha16(LCPI1_0)
				128	lfs f0, lo16(LCPI1_0)(r2)
				129	lis r2, ha16(LCPI1_1)
				130	lis r3, ha16(LCPI1_2)
				131	lfs f2, lo16(LCPI1_2)(r3)
				132	lfs f3, lo16(LCPI1_1)(r2)
				133	fsub f0, f0, f1
				134	fsel f1, f0, f2, f3
				135	blr
				136
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	137	===-------------------------------------------------------------------------===
				138
				139	PIC Code Gen IPO optimization:
				140
				141	Squish small scalar globals together into a single global struct, allowing the
				142	address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
				143	of the GOT on targets with one).
				144
				145	Note that this is discussed here for GCC:
				146	http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html
				147
				148	===-------------------------------------------------------------------------===
				149
				150	Implement Newton-Raphson method for improving estimate instructions to the
				151	correct accuracy, and implement divide as multiply by reciprocal when it has
Dan Gohman	2a5ddf3	2009-07-24 00:30:09 +0000	[diff] [blame]	152	more than one use. Itanium would want this too.
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	153
				154	===-------------------------------------------------------------------------===
				155
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	156	Compile offsets from allocas:
				157
				158	int *%test() {
				159	%X = alloca { int, int }
				160	%Y = getelementptr {int,int}* %X, int 0, uint 1
				161	ret int* %Y
				162	}
				163
				164	into a single add, not two:
				165
				166	_test:
				167	addi r2, r1, -8
				168	addi r3, r2, 4
				169	blr
				170
				171	--> important for C++.
				172
				173	===-------------------------------------------------------------------------===
				174
				175	No loads or stores of the constants should be needed:
				176
				177	struct foo { double X, Y; };
				178	void xxx(struct foo F);
				179	void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
				180
				181	===-------------------------------------------------------------------------===
				182
Dale Johannesen	3960163	2009-07-01 23:36:02 +0000	[diff] [blame]	183	Darwin Stub removal:
				184
				185	We still generate calls to foo$stub, and stubs, on Darwin. This is not
Chris Lattner	763a26f	2009-07-02 01:24:34 +0000	[diff] [blame]	186	necessary when building with the Leopard (10.5) or later linker, as stubs are
				187	generated by ld when necessary. Parameterizing this based on the deployment
				188	target (-mmacosx-version-min) is probably enough. x86-32 does this right, see
				189	its logic.
Dale Johannesen	3960163	2009-07-01 23:36:02 +0000	[diff] [blame]	190
				191	===-------------------------------------------------------------------------===
				192
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	193	Darwin Stub LICM optimization:
				194
				195	Loops like this:
				196
				197	for (...) bar();
				198
				199	Have to go through an indirect stub if bar is external or linkonce. It would
				200	be better to compile it as:
				201
				202	fp = &bar;
				203	for (...) fp();
				204
				205	which only computes the address of bar once (instead of each time through the
				206	stub). This is Darwin specific and would have to be done in the code generator.
				207	Probably not a win on x86.
				208
				209	===-------------------------------------------------------------------------===
				210
				211	Simple IPO for argument passing, change:
				212	void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)
				213
				214	the Darwin ABI specifies that any integer arguments in the first 32 bytes worth
				215	of arguments get assigned to r3 through r10. That is, if you have a function
				216	foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
				217	argument bytes for r4 and r5. The trick then would be to shuffle the argument
				218	order for functions we can internalize so that the maximum number of
				219	integers/pointers get passed in regs before you see any of the fp arguments.
				220
				221	Instead of implementing this, it would actually probably be easier to just
				222	implement a PPC fastcc, where we could do whatever we wanted to the CC,
				223	including having this work sanely.
				224
				225	===-------------------------------------------------------------------------===
				226
				227	Fix Darwin FP-In-Integer Registers ABI
				228
				229	Darwin passes doubles in structures in integer registers, which is very very
				230	bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
				231	that percolates these things out of functions.
				232
				233	Check out how horrible this is:
				234	http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html
				235
				236	This is an extension of "interprocedural CC unmunging" that can't be done with
				237	just fastcc.
				238
				239	===-------------------------------------------------------------------------===
				240
				241	Compile this:
				242
				243	int foo(int a) {
				244	int b = (a < 8);
				245	if (b) {
				246	return b * 3; // ignore the fact that this is always 3.
				247	} else {
				248	return 2;
				249	}
				250	}
				251
				252	into something not this:
				253
				254	_foo:
				255	1) cmpwi cr7, r3, 8
				256	mfcr r2, 1
				257	rlwinm r2, r2, 29, 31, 31
				258	1) cmpwi cr0, r3, 7
				259	bgt cr0, LBB1_2 ; UnifiedReturnBlock
				260	LBB1_1: ; then
				261	rlwinm r2, r2, 0, 31, 31
				262	mulli r3, r2, 3
				263	blr
				264	LBB1_2: ; UnifiedReturnBlock
				265	li r3, 2
				266	blr
				267
				268	In particular, the two compares (marked 1) could be shared by reversing one.
				269	This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
				270	same operands (but backwards) exists. In this case, this wouldn't save us
				271	anything though, because the compares still wouldn't be shared.
				272
				273	===-------------------------------------------------------------------------===
				274
				275	We should custom expand setcc instead of pretending that we have it. That
				276	would allow us to expose the access of the crbit after the mfcr, allowing
				277	that access to be trivially folded into other ops. A simple example:
				278
				279	int foo(int a, int b) { return (a < b) << 4; }
				280
				281	compiles into:
				282
				283	_foo:
				284	cmpw cr7, r3, r4
				285	mfcr r2, 1
				286	rlwinm r2, r2, 29, 31, 31
				287	slwi r3, r2, 4
				288	blr
				289
				290	===-------------------------------------------------------------------------===
				291
				292	Fold add and sub with constant into non-extern, non-weak addresses so this:
				293
				294	static int a;
				295	void bar(int b) { a = b; }
				296	void foo(unsigned char *c) {
				297	*c = a;
				298	}
				299
				300	So that
				301
				302	_foo:
				303	lis r2, ha16(_a)
				304	la r2, lo16(_a)(r2)
				305	lbz r2, 3(r2)
				306	stb r2, 0(r3)
				307	blr
				308
				309	Becomes
				310
				311	_foo:
				312	lis r2, ha16(_a+3)
				313	lbz r2, lo16(_a+3)(r2)
				314	stb r2, 0(r3)
				315	blr
				316
				317	===-------------------------------------------------------------------------===
				318
				319	We generate really bad code for this:
				320
				321	int f(signed char *a, _Bool b, _Bool c) {
				322	signed char t = 0;
				323	if (b) t = *a;
				324	if (c) *a = t;
				325	}
				326
				327	===-------------------------------------------------------------------------===
				328
				329	This:
				330	int test(unsigned P) { return P >> 24; }
				331
				332	Should compile to:
				333
				334	_test:
				335	lbz r3,0(r3)
				336	blr
				337
				338	not:
				339
				340	_test:
				341	lwz r2, 0(r3)
				342	srwi r3, r2, 24
				343	blr
				344
				345	===-------------------------------------------------------------------------===
				346
				347	On the G5, logical CR operations are more expensive in their three
				348	address form: ops that read/write the same register are half as expensive as
				349	those that read from two registers that are different from their destination.
				350
				351	We should model this with two separate instructions. The isel should generate
				352	the "two address" form of the instructions. When the register allocator
				353	detects that it needs to insert a copy due to the two-addressness of the CR
				354	logical op, it will invoke PPCInstrInfo::convertToThreeAddress. At this point
				355	we can convert to the "three address" instruction, to save code space.
				356
				357	This only matters when we start generating cr logical ops.
				358
				359	===-------------------------------------------------------------------------===
				360
				361	We should compile these two functions to the same thing:
				362
				363	#include <stdlib.h>
				364	void f(int a, int b, int *P) {
				365	*P = (a-b)>=0?(a-b):(b-a);
				366	}
				367	void g(int a, int b, int *P) {
				368	*P = abs(a-b);
				369	}
				370
				371	Further, they should compile to something better than:
				372
				373	_g:
				374	subf r2, r4, r3
				375	subfic r3, r2, 0
				376	cmpwi cr0, r2, -1
				377	bgt cr0, LBB2_2 ; entry
				378	LBB2_1: ; entry
				379	mr r2, r3
				380	LBB2_2: ; entry
				381	stw r2, 0(r5)
				382	blr
				383
				384	GCC produces:
				385
				386	_g:
				387	subf r4,r4,r3
				388	srawi r2,r4,31
				389	xor r0,r2,r4
				390	subf r0,r2,r0
				391	stw r0,0(r5)
				392	blr
				393
				394	... which is much nicer.
				395
				396	This theoretically may help improve twolf slightly (used in dimbox.c:142?).
				397
				398	===-------------------------------------------------------------------------===
				399
				400	int foo(int N, int ***W, int **TK, int X) {
				401	int t, i;
				402
				403	for (t = 0; t < N; ++t)
				404	for (i = 0; i < 4; ++i)
				405	W[t / X][i][t % X] = TK[i][t];
				406
				407	return 5;
				408	}
				409
				410	We generate relatively atrocious code for this loop compared to gcc.
				411
				412	We could also strength reduce the rem and the div:
				413	http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf
				414
				415	===-------------------------------------------------------------------------===
				416
				417	float foo(float X) { return (int)(X); }
				418
				419	Currently produces:
				420
				421	_foo:
				422	fctiwz f0, f1
				423	stfd f0, -8(r1)
				424	lwz r2, -4(r1)
				425	extsw r2, r2
				426	std r2, -16(r1)
				427	lfd f0, -16(r1)
				428	fcfid f0, f0
				429	frsp f1, f0
				430	blr
				431
				432	We could use a target dag combine to turn the lwz/extsw into an lwa when the
				433	lwz has a single use. Since LWA is cracked anyway, this would be a codesize
				434	win only.
				435
				436	===-------------------------------------------------------------------------===
				437
				438	We generate ugly code for this:
				439
				440	void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
				441	unsigned code = 0;
				442	if(dx < -dw) code |= 1;
				443	if(dx > dw) code |= 2;
				444	if(dy < -dw) code |= 4;
				445	if(dy > dw) code |= 8;
				446	if(dz < -dw) code |= 16;
				447	if(dz > dw) code |= 32;
				448	*ret = code;
				449	}
				450
				451	===-------------------------------------------------------------------------===
				452
				453	Complete the signed i32 to FP conversion code using 64-bit registers
				454	transformation, good for PI. See PPCISelLowering.cpp, this comment:
				455
				456	// FIXME: disable this lowered code. This generates 64-bit register values,
				457	// and we don't model the fact that the top part is clobbered by calls. We
				458	// need to flag these together so that the value isn't live across a call.
				459	//setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
				460
				461	Also, if the registers are spilled to the stack, we have to ensure that all
				462	64-bits of them are saved/restored, otherwise we will miscompile the code. It
				463	sounds like we need to get the 64-bit register classes going.
				464
				465	===-------------------------------------------------------------------------===
				466
				467	%struct.B = type { i8, [3 x i8] }
				468
				469	define void @bar(%struct.B* %b) {
				470	entry:
				471	%tmp = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
				472	%tmp = load i32* %tmp ; <uint> [#uses=1]
				473	%tmp3 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=1]
				474	%tmp4 = load i32* %tmp3 ; <uint> [#uses=1]
				475	%tmp8 = bitcast %struct.B* %b to i32* ; <uint*> [#uses=2]
				476	%tmp9 = load i32* %tmp8 ; <uint> [#uses=1]
				477	%tmp4.mask17 = shl i32 %tmp4, i8 1 ; <uint> [#uses=1]
				478	%tmp1415 = and i32 %tmp4.mask17, 2147483648 ; <uint> [#uses=1]
				479	%tmp.masked = and i32 %tmp, 2147483648 ; <uint> [#uses=1]
				480	%tmp11 = or i32 %tmp1415, %tmp.masked ; <uint> [#uses=1]
				481	%tmp12 = and i32 %tmp9, 2147483647 ; <uint> [#uses=1]
				482	%tmp13 = or i32 %tmp12, %tmp11 ; <uint> [#uses=1]
				483	store i32 %tmp13, i32* %tmp8
				484	ret void
				485	}
				486
				487	We emit:
				488
				489	_foo:
				490	lwz r2, 0(r3)
				491	slwi r4, r2, 1
				492	or r4, r4, r2
				493	rlwimi r2, r4, 0, 0, 0
				494	stw r2, 0(r3)
				495	blr
				496
				497	We could collapse a bunch of those ORs and ANDs and generate the following
				498	equivalent code:
				499
				500	_foo:
				501	lwz r2, 0(r3)
				502	rlwinm r4, r2, 1, 0, 0
				503	or r2, r2, r4
				504	stw r2, 0(r3)
				505	blr
				506
				507	===-------------------------------------------------------------------------===
				508
				509	We compile:
				510
				511	unsigned test6(unsigned x) {
				512	return ((x & 0x00FF0000) >> 16) | ((x & 0x000000FF) << 16);
				513	}
				514
				515	into:
				516
				517	_test6:
				518	lis r2, 255
				519	rlwinm r3, r3, 16, 0, 31
				520	ori r2, r2, 255
				521	and r3, r3, r2
				522	blr
				523
				524	GCC gets it down to:
				525
				526	_test6:
				527	rlwinm r0,r3,16,8,15
				528	rlwinm r3,r3,16,24,31
				529	or r3,r3,r0
				530	blr
				531
				532
				533	===-------------------------------------------------------------------------===
				534
				535	Consider a function like this:
				536
				537	float foo(float X) { return X + 1234.4123f; }
				538
				539	The FP constant ends up in the constant pool, so we need to get the LR register.
				540	This ends up producing code like this:
				541
				542	_foo:
				543	.LBB_foo_0: ; entry
				544	mflr r11
				545	*** stw r11, 8(r1)
				546	bl "L00000$pb"
				547	"L00000$pb":
				548	mflr r2
				549	addis r2, r2, ha16(.CPI_foo_0-"L00000$pb")
				550	lfs f0, lo16(.CPI_foo_0-"L00000$pb")(r2)
				551	fadds f1, f1, f0
				552	*** lwz r11, 8(r1)
				553	mtlr r11
				554	blr
				555
				556	This is functional, but there is no reason to spill the LR register all the way
				557	to the stack (the two marked instrs): spilling it to a GPR is quite enough.
				558
				559	Implementing this will require some codegen improvements. Nate writes:
				560
				561	"So basically what we need to support the "no stack frame save and restore" is a
				562	generalization of the LR optimization to "callee-save regs".
				563
				564	Currently, we have LR marked as a callee-save reg. The register allocator sees
				565	that it's callee save, and spills it directly to the stack.
				566
				567	Ideally, something like this would happen:
				568
				569	LR would be in a separate register class from the GPRs. The class of LR would be
				570	marked "unspillable". When the register allocator came across an unspillable
				571	reg, it would ask "what is the best class to copy this into that I can spill".
				572	If it gets a class back, which it will in this case (the gprs), it grabs a free
				573	register of that class. If it is then later necessary to spill that reg, so be
				574	it."
				575
				576	===-------------------------------------------------------------------------===
				577
				578	We compile this:
				579	int test(_Bool X) {
				580	return X ? 524288 : 0;
				581	}
				582
				583	to:
				584	_test:
				585	cmplwi cr0, r3, 0
				586	lis r2, 8
				587	li r3, 0
				588	beq cr0, LBB1_2 ;entry
				589	LBB1_1: ;entry
				590	mr r3, r2
				591	LBB1_2: ;entry
				592	blr
				593
				594	instead of:
				595	_test:
				596	addic r2,r3,-1
				597	subfe r0,r2,r3
				598	slwi r3,r0,19
				599	blr
				600
				601	This sort of thing occurs a lot due to globalopt.
				602
				603	===-------------------------------------------------------------------------===
				604
				605	We currently compile 32-bit bswap:
				606
				607	declare i32 @llvm.bswap.i32(i32 %A)
				608	define i32 @test(i32 %A) {
				609	%B = call i32 @llvm.bswap.i32(i32 %A)
				610	ret i32 %B
				611	}
				612
				613	to:
				614
				615	_test:
				616	rlwinm r2, r3, 24, 16, 23
				617	slwi r4, r3, 24
				618	rlwimi r2, r3, 8, 24, 31
				619	rlwimi r4, r3, 8, 8, 15
				620	rlwimi r4, r2, 0, 16, 31
				621	mr r3, r4
				622	blr
				623
				624	it would be more efficient to produce:
				625
				626	_foo: mr r0,r3
				627	rlwinm r3,r3,8,0xffffffff
				628	rlwimi r3,r0,24,0,7
				629	rlwimi r3,r0,24,16,23
				630	blr
				631
				632	===-------------------------------------------------------------------------===
				633
				634	test/CodeGen/PowerPC/2007-03-24-cntlzd.ll compiles to:
				635
				636	__ZNK4llvm5APInt17countLeadingZerosEv:
				637	ld r2, 0(r3)
				638	cntlzd r2, r2
				639	or r2, r2, r2 <<-- silly.
				640	addi r3, r2, -64
				641	blr
				642
				643	The dead or is a 'truncate' from 64- to 32-bits.
				644
				645	===-------------------------------------------------------------------------===
				646
				647	We generate horrible ppc code for this:
				648
				649	#define N 2000000
				650	double a[N],c[N];
				651	void simpleloop() {
				652	int j;
				653	for (j=0; j<N; j++)
				654	c[j] = a[j];
				655	}
				656
				657	LBB1_1: ;bb
				658	lfdx f0, r3, r4
				659	addi r5, r5, 1 ;; Extra IV for the exit value compare.
				660	stfdx f0, r2, r4
				661	addi r4, r4, 8
				662
				663	xoris r6, r5, 30 ;; This is due to a large immediate.
				664	cmplwi cr0, r6, 33920
				665	bne cr0, LBB1_1
				666
Chris Lattner	4084d49	2007-09-10 21:43:18 +0000	[diff] [blame]	667	//===---------------------------------------------------------------------===//
				668
				669	This:
				670	#include <algorithm>
				671	inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
				672	{ return std::make_pair(a + b, a + b < a); }
				673	bool no_overflow(unsigned a, unsigned b)
				674	{ return !full_add(a, b).second; }
				675
				676	Should compile to:
				677
				678	__Z11no_overflowjj:
				679	add r4,r3,r4
				680	subfc r3,r3,r4
				681	li r3,0
				682	adde r3,r3,r3
				683	blr
				684
				685	(or better) not:
				686
				687	__Z11no_overflowjj:
				688	add r2, r4, r3
				689	cmplw cr7, r2, r3
				690	mfcr r2
				691	rlwinm r2, r2, 29, 31, 31
				692	xori r3, r2, 1
				693	blr
				694
				695	//===---------------------------------------------------------------------===//
Dan Gohman	f17a25c	2007-07-18 16:29:46 +0000	[diff] [blame]	696
Chris Lattner	6c36fb5	2008-01-08 06:46:30 +0000	[diff] [blame]	697	We compile some FP comparisons into an mfcr with two rlwinms and an or. For
				698	example:
				699	#include <math.h>
				700	int test(double x, double y) { return islessequal(x, y);}
				701	int test2(double x, double y) { return islessgreater(x, y);}
				702	int test3(double x, double y) { return !islessequal(x, y);}
				703
				704	Compiles into (all three are similar, but the bits differ):
				705
				706	_test:
				707	fcmpu cr7, f1, f2
				708	mfcr r2
				709	rlwinm r3, r2, 29, 31, 31
				710	rlwinm r2, r2, 31, 31, 31
				711	or r3, r2, r3
				712	blr
				713
				714	GCC compiles this into:
				715
				716	_test:
				717	fcmpu cr7,f1,f2
				718	cror 30,28,30
				719	mfcr r3
				720	rlwinm r3,r3,31,1
				721	blr
				722
				723	which is more efficient and can use mfocr. See PR642 for some more context.
				724
				725	//===---------------------------------------------------------------------===//
Chris Lattner	869440b	2008-03-02 19:27:34 +0000	[diff] [blame]	726
				727	void foo(float *data, float d) {
				728	long i;
				729	for (i = 0; i < 8000; i++)
				730	data[i] = d;
				731	}
				732	void foo2(float *data, float d) {
				733	long i;
				734	data--;
				735	for (i = 0; i < 8000; i++) {
				736	data[1] = d;
				737	data++;
				738	}
				739	}
				740
				741	These compile to:
				742
				743	_foo:
				744	li r2, 0
				745	LBB1_1: ; bb
				746	addi r4, r2, 4
				747	stfsx f1, r3, r2
				748	cmplwi cr0, r4, 32000
				749	mr r2, r4
				750	bne cr0, LBB1_1 ; bb
				751	blr
				752	_foo2:
				753	li r2, 0
				754	LBB2_1: ; bb
				755	addi r4, r2, 4
				756	stfsx f1, r3, r2
				757	cmplwi cr0, r4, 32000
				758	mr r2, r4
				759	bne cr0, LBB2_1 ; bb
				760	blr
				761
				762	The 'mr' could be eliminated by folding the add into the cmp better.
				763
				764	//===---------------------------------------------------------------------===//
Dale Johannesen	089c6c0	2008-11-17 18:56:34 +0000	[diff] [blame]	765	Codegen for the following (low-probability) case deteriorated considerably
				766	when the correctness fixes for unordered comparisons went in (PR 642, 58871).
				767	It should be possible to recover the code quality described in the comments.
				768
				769	; RUN: llvm-as < %s | llc -march=ppc32 | grep or | count 3
				770	; This should produce one 'or' or 'cror' instruction per function.
				771
				772	; RUN: llvm-as < %s | llc -march=ppc32 | grep mfcr | count 3
				773	; PR2964
				774
				775	define i32 @test(double %x, double %y) nounwind {
				776	entry:
				777	%tmp3 = fcmp ole double %x, %y ; <i1> [#uses=1]
				778	%tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
				779	ret i32 %tmp345
				780	}
				781
				782	define i32 @test2(double %x, double %y) nounwind {
				783	entry:
				784	%tmp3 = fcmp one double %x, %y ; <i1> [#uses=1]
				785	%tmp345 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
				786	ret i32 %tmp345
				787	}
				788
				789	define i32 @test3(double %x, double %y) nounwind {
				790	entry:
				791	%tmp3 = fcmp ugt double %x, %y ; <i1> [#uses=1]
				792	%tmp34 = zext i1 %tmp3 to i32 ; <i32> [#uses=1]
				793	ret i32 %tmp34
				794	}
				795	//===----------------------------------------------------------------------===//
				796	; RUN: llvm-as < %s | llc -march=ppc32 | not grep fneg
				797
				798	; This could generate FSEL with appropriate flags (FSEL is not IEEE-safe, and
				799	; should not be generated except with -enable-finite-only-fp-math or the like).
				800	; With the correctness fixes for PR642 (58871) LowerSELECT_CC would need to
				801	; recognize a more elaborate tree than a simple SETxx.
				802
				803	define double @test_FNEG_sel(double %A, double %B, double %C) {
				804	%D = sub double -0.000000e+00, %A ; <double> [#uses=1]
				805	%Cond = fcmp ugt double %D, -0.000000e+00 ; <i1> [#uses=1]
				806	%E = select i1 %Cond, double %B, double %C ; <double> [#uses=1]
				807	ret double %E
				808	}
				809