TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
* use stfiwx in float->int

* Fold add and sub with constant into non-extern, non-weak addresses so this:
        lis r2, ha16(l2__ZTV4Cell)
        la r2, lo16(l2__ZTV4Cell)(r2)
        addi r2, r2, 8
becomes:
        lis r2, ha16(l2__ZTV4Cell+8)
        la r2, lo16(l2__ZTV4Cell+8)(r2)

* Teach LLVM how to codegen this:
unsigned short foo(float a) { return a; }
as:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

* Support 'update' load/store instructions.  These are cracked on the G5, but
  are still a codesize win.
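
  For example (a minimal C sketch; the function name is hypothetical), a loop
  that both loads through and bumps a pointer is the shape that maps onto the
  pre-increment update form lwzu:

int sum_from_second(const int *p, int n) {
  int s = 0;
  while (n-- > 0)
    s += *++p;          /* load + address update: candidate for lwzu */
  return s;             /* sums the n elements starting at p[1] */
}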

* should hint to the branch select pass that it doesn't need to print the
  second unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

===-------------------------------------------------------------------------===

* Codegen this:

        void test2(int X) {
          if (X == 0x12345678) bar();
        }

  as:

        xoris r0,r3,0x1234
        cmpwi cr0,r0,0x5678
        beq cr0,L6

  not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

===-------------------------------------------------------------------------===

Implement Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.
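
One refinement step for a reciprocal estimate e ~= 1/d is e' = e*(2 - d*e),
roughly doubling the number of correct bits per step.  A C sketch (the
function name and the choice of exactly two steps are illustrative):

double refined_recip(double d, double e0) {
  /* e0 is a hardware reciprocal estimate of 1/d (fres-style) */
  double e1 = e0 * (2.0 - d * e0);   /* first Newton-Raphson step  */
  double e2 = e1 * (2.0 - d * e1);   /* second Newton-Raphson step */
  return e2;                         /* a/d then becomes a * e2    */
}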

===-------------------------------------------------------------------------===

int foo(int a, int b) { return a == b ? 16 : 0; }
_foo:
        cmpw cr7, r3, r4
        mfcr r2
        rlwinm r2, r2, 31, 31, 31
        slwi r3, r2, 4
        blr

If we exposed the srl & mask ops after the MFCR that we are doing to select
the correct CR bit, then we could fold the slwi into the rlwinm before it.

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
  struct {
#ifndef __ppc__
    unsigned int field0 : 6;
    unsigned int field1 : 6;
    unsigned int field2 : 6;
    unsigned int field3 : 6;
    unsigned int field4 : 3;
    unsigned int field5 : 4;
    unsigned int field6 : 1;
#else
    unsigned int field6 : 1;
    unsigned int field5 : 4;
    unsigned int field4 : 3;
    unsigned int field3 : 6;
    unsigned int field2 : 6;
    unsigned int field1 : 6;
    unsigned int field0 : 6;
#endif
  } bitfields, bits;
  unsigned int u32All;
  signed int i32All;
  float f32All;
};


typedef struct program_t {
  union bitfield array[ARRAY_LENGTH];
  int size;
  int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
  unsigned int shift = 0;
  unsigned int texCount = 0;
  unsigned int i;

  for (i = 0; i < 8; i++)
  {
    prog->array[i].bitfields.field0 = texCount;
    prog->array[i].bitfields.field1 = texCount + 1;
    prog->array[i].bitfields.field2 = texCount + 2;
    prog->array[i].bitfields.field3 = texCount + 3;

    texCount += (fmt1 >> shift) & 0x7;
    shift += 3;
  }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the shift
doesn't get folded into the rlwimi instruction.  We should ideally see through
things like this, rather than forcing llvm to generate the equivalent
(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

176.gcc contains a bunch of code like this (this occurs dozens of times):

int %test(uint %mode.0.i.0) {
        %tmp.79 = cast uint %mode.0.i.0 to sbyte        ; <sbyte> [#uses=1]
        %tmp.80 = cast sbyte %tmp.79 to int             ; <int> [#uses=1]
        %tmp.81 = shl int %tmp.80, ubyte 16             ; <int> [#uses=1]
        %tmp.82 = and int %tmp.81, 16711680
        ret int %tmp.82
}

which we compile to:

_test:
        extsb r2, r3
        rlwinm r3, r2, 16, 8, 15
        blr

The extsb is obviously dead.  This can be handled by a future thing like
MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
the sign bits are never used, so we can fold the sext_inreg to nothing).
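
The demanded-bits reasoning, sketched in C (the helper is hypothetical, not
an existing LLVM interface):

/* In (and (shl X, C), M), the bits demanded of X are just M pulled
   back through the shift. */
unsigned demanded_of_shl_input(unsigned mask, unsigned shamt) {
  return mask >> shamt;
}
/* Here 16711680 >> 16 == 0xFF: only the low byte of the sext result is
   demanded, exactly the byte extsb leaves alone, so the extsb is dead. */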

I'm seeing code like this:

        srwi r3, r3, 16
        extsb r3, r3
        rlwimi r4, r3, 16, 8, 15

in which the extsb is preventing the srwi from being nuked.

===-------------------------------------------------------------------------===

Another example that occurs is:

uint %test(int %specbits.6.1) {
        %tmp.2540 = shr int %specbits.6.1, ubyte 11     ; <int> [#uses=1]
        %tmp.2541 = cast int %tmp.2540 to uint          ; <uint> [#uses=1]
        %tmp.2542 = shl uint %tmp.2541, ubyte 13        ; <uint> [#uses=1]
        %tmp.2543 = and uint %tmp.2542, 8192            ; <uint> [#uses=1]
        ret uint %tmp.2543
}

which we codegen as:

l1_test:
        srawi r2, r3, 11
        rlwinm r3, r2, 13, 18, 18
        blr

the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
dead), which I think can then be folded into the rlwinm.
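
A C sketch of why the logical shift is safe here (function name hypothetical):

unsigned test_logical(unsigned specbits) {
  /* The arithmetic and logical shifts disagree only in the top 11
     sign-copy bits of the shifted value, and the <<13 discards all of
     them, so only bit 11 of the input ever reaches the 8192 mask. */
  return ((specbits >> 11) << 13) & 8192;
}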

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch free code.  LLVM is turning it into < 1 because of the RHS.
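
One branch-free form, as a sketch (assumes the arithmetic right shift of a
signed int that PPC provides; C leaves it implementation-defined):

int test3_branchfree(int a, int b) {
  /* a >> 31 is all ones when a < 0 and zero otherwise, so the and
     yields a for negative a and 0 for everything else. */
  return a & (a >> 31);
}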

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

For this:

int h(int i, int j, int k) {
  return (i==0||j==0||k == 0);
}

We currently emit this:

_h:
        cntlzw r2, r3
        cntlzw r3, r4
        cntlzw r4, r5
        srwi r2, r2, 5
        srwi r3, r3, 5
        srwi r4, r4, 5
        or r2, r3, r2
        or r3, r2, r4
        blr

The ctlz/shift instructions are created by the isel, so the dag combiner doesn't
have a chance to pull the shifts through the or's (eliminating two
instructions).  SETCC nodes should be custom lowered in this case, not expanded
by the isel.
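
What pulling the shifts through the or's buys, in C (clz32 is a hypothetical
stand-in for cntlzw; it yields 0..32, and only the value 32 has bit 5 set):

static unsigned clz32(unsigned x) {
  /* defined for 0 (returns 32), unlike __builtin_clz */
  unsigned n = 0;
  while (n < 32 && !(x & (0x80000000u >> n)))
    n++;
  return n;
}

int h_folded(int i, int j, int k) {
  /* one shift instead of three: bit 5 of the or is set iff some
     operand is 32, i.e. iff some argument is zero */
  return (clz32(i) | clz32(j) | clz32(k)) >> 5;
}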

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4  ; loopexit

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).
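
A sketch of the shape of the transformation (names hypothetical):

int a, b, c;                        /* before: three addresses to form  */
struct merged { int a, b, c; } g;   /* after: one base + three offsets  */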

===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.
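
For instance, a load feeding the usual shift/mask byteswap could become a
single lwbrx (a minimal sketch; the function name is illustrative):

unsigned load_byteswapped(unsigned *p) {
  unsigned x = *p;
  return (x >> 24) | ((x >> 8) & 0x0000FF00) |
         ((x << 8) & 0x00FF0000) | (x << 24);
}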

===-------------------------------------------------------------------------===

Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.

===-------------------------------------------------------------------------===

Compile this:

double %test(double %X) {
        %Y = cast double %X to long
        %Z = cast long %Y to double
        ret double %Z
}

to this:

_test:
        fctidz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        lwz r3, -8(r1)
        stw r2, -12(r1)
        stw r3, -16(r1)
        lfd f0, -16(r1)
        fcfid f1, f0
        blr

without the lwz/stw's.