TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
* use stfiwx in float->int

* Fold add and sub with constant into non-extern, non-weak addresses so this:
        lis r2, ha16(l2__ZTV4Cell)
        la r2, lo16(l2__ZTV4Cell)(r2)
        addi r2, r2, 8
becomes:
        lis r2, ha16(l2__ZTV4Cell+8)
        la r2, lo16(l2__ZTV4Cell+8)(r2)

* Teach LLVM how to codegen this:
unsigned short foo(float a) { return a; }
as:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

* Support 'update' load/store instructions. These are cracked on the G5, but
  are still a codesize win.
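
  For example (a sketch), a pointer-bump load like:

        lwz r3, 4(r2)
        addi r2, r2, 4

  could be a single update-form load that writes the incremented address back:

        lwzu r3, 4(r2)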

* should hint to the branch select pass that it doesn't need to print the
  second unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

===-------------------------------------------------------------------------===

* Codegen this:

   void test2(int X) {
     if (X == 0x12345678) bar();
   }

   as:

        xoris r0,r3,0x1234
        cmpwi cr0,r0,0x5678
        beq cr0,L6

   not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start. For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's. This is even more important in PIC
mode.
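
Something like this (a sketch, assuming the entries are laid out contiguously
starting at .CPI_X_0):

_X:
        lis r2, ha16(.CPI_X_0)
        la r2, lo16(.CPI_X_0)(r2)
        lfd f0, 0(r2)
        lfd f2, 8(r2)
        fmadd f0, f1, f0, f2
        lfd f1, 16(r2)
        lfd f2, 24(r2)
        fmadd f1, f0, f1, f2
        blr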

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for refining estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when the
reciprocal would have more than one use. Itanium will want this too.
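
A sketch of the refinement for reciprocal, where fres_estimate stands in for
the hardware reciprocal-estimate instruction (it is not a real intrinsic):

double recip(double d) {
    double x = fres_estimate(d);   /* hardware estimate, only a few good bits */
    x = x * (2.0 - d * x);         /* each Newton-Raphson step roughly        */
    x = x * (2.0 - d * x);         /* doubles the number of correct bits,     */
    x = x * (2.0 - d * x);         /* so a handful of steps reaches full      */
    x = x * (2.0 - d * x);         /* precision                               */
    return x;                      /* then a / d => a * recip(d)              */
}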

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
    struct {
#ifndef __ppc__
        unsigned int field0 : 6;
        unsigned int field1 : 6;
        unsigned int field2 : 6;
        unsigned int field3 : 6;
        unsigned int field4 : 3;
        unsigned int field5 : 4;
        unsigned int field6 : 1;
#else
        unsigned int field6 : 1;
        unsigned int field5 : 4;
        unsigned int field4 : 3;
        unsigned int field3 : 6;
        unsigned int field2 : 6;
        unsigned int field1 : 6;
        unsigned int field0 : 6;
#endif
    } bitfields, bits;
    unsigned int u32All;
    signed int i32All;
    float f32All;
};


typedef struct program_t {
    union bitfield array[ARRAY_LENGTH];
    int size;
    int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
    unsigned int shift = 0;
    unsigned int texCount = 0;
    unsigned int i;

    for (i = 0; i < 8; i++)
    {
        prog->array[i].bitfields.field0 = texCount;
        prog->array[i].bitfields.field1 = texCount + 1;
        prog->array[i].bitfields.field2 = texCount + 2;
        prog->array[i].bitfields.field3 = texCount + 3;

        texCount += (fmt1 >> shift) & 0x7;
        shift += 3;
    }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the shift
doesn't get folded into the rlwimi instruction. We should ideally see through
things like this, rather than forcing llvm to generate the equivalent

(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy. We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.
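
The copy-free version just commutes the sequence so the result is built
directly in r3 (a sketch):

_f1:
        rlwinm r3, r3, 0, 28, 31
        rlwimi r3, r4, 0, 24, 27
        blr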

===-------------------------------------------------------------------------===

176.gcc contains a bunch of code like this (this occurs dozens of times):

int %test(uint %mode.0.i.0) {
        %tmp.79 = cast uint %mode.0.i.0 to sbyte        ; <sbyte> [#uses=1]
        %tmp.80 = cast sbyte %tmp.79 to int             ; <int> [#uses=1]
        %tmp.81 = shl int %tmp.80, ubyte 16             ; <int> [#uses=1]
        %tmp.82 = and int %tmp.81, 16711680
        ret int %tmp.82
}

which we compile to:

_test:
        extsb r2, r3
        rlwinm r3, r2, 16, 8, 15
        blr

The extsb is obviously dead. This can be handled by a future thing like
MaskedValueIsZero that checks to see if bits are ever demanded (in this case,
the sign bits are never used, so we can fold the sext_inreg to nothing).
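
With the demanded-bits information this should reduce to a single
rotate-and-mask (a sketch):

_test:
        rlwinm r3, r3, 16, 8, 15
        blr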

I'm seeing code like this:

        srwi r3, r3, 16
        extsb r3, r3
        rlwimi r4, r3, 16, 8, 15

in which the extsb is preventing the srwi from being nuked.

===-------------------------------------------------------------------------===

Another example that occurs is:

uint %test(int %specbits.6.1) {
        %tmp.2540 = shr int %specbits.6.1, ubyte 11     ; <int> [#uses=1]
        %tmp.2541 = cast int %tmp.2540 to uint          ; <uint> [#uses=1]
        %tmp.2542 = shl uint %tmp.2541, ubyte 13        ; <uint> [#uses=1]
        %tmp.2543 = and uint %tmp.2542, 8192            ; <uint> [#uses=1]
        ret uint %tmp.2543
}

which we codegen as:

l1_test:
        srawi r2, r3, 11
        rlwinm r3, r2, 13, 18, 18
        blr

the srawi can be nuked by turning the SAR into a logical SHR (the sext bits are
dead), which I think can then be folded into the rlwinm.
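
i.e. a single rotate-and-mask (a sketch):

l1_test:
        rlwinm r3, r3, 2, 18, 18
        blr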

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr
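
i.e. generate something like this instead (a sketch, folding the two offsets):

_test:
        addi r3, r1, -4
        blr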

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch free code. LLVM is turning it into < 1 because of the RHS.
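
e.g. something like (a sketch, using a sign mask):

_test3:
        srawi r2, r3, 31
        and r3, r3, r2
        blr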

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

have to go through an indirect stub if bar is external or linkonce. It would
be better to compile it as:

  fp = &bar;
  for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub). This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof. on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4 ; loopexit

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad. Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.
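
For instance (illustrative):

struct DD { double d; };
void callee(struct DD x);   /* on Darwin, x.d arrives in r3/r4, not in f1 */

so the callee has to store r3/r4 to memory and reload them as a double before
it can do any FP arithmetic on x.d.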

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).
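
For example (illustrative), instead of three separate globals:

int a, b, c;

emit the equivalent of:

static struct { int a, b, c; } MergedGlobals;

so a single address computation (or a single GOT load in PIC mode) can be
CSE'd, with the members accessed at small constant offsets from it.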

===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.
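
For example (a sketch), an explicit byte-swapping load like:

unsigned load_le32(unsigned *p) {
    unsigned x = *p;
    return (x >> 24) | ((x >> 8) & 0xFF00) |
           ((x << 8) & 0xFF0000) | (x << 24);
}

should become a single byte-reversed load:

_load_le32:
        lwbrx r3, 0, r3
        blr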

===-------------------------------------------------------------------------===

Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.
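
For example (illustrative), the v4i32 splat { 1, 1, 1, 1 } is computable with a
single splat-immediate:

        vspltisw v2, 1

and other constants fall out of a splat plus a shift or add or two.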

===-------------------------------------------------------------------------===

Compile this:

double %test(double %X) {
        %Y = cast double %X to long
        %Z = cast long %Y to double
        ret double %Z
}

to this:

_test:
        fctidz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        lwz r3, -8(r1)
        stw r2, -12(r1)
        stw r3, -16(r1)
        lfd f0, -16(r1)
        fcfid f1, f0
        blr

without the lwz/stw's.
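
Ideally (a sketch; fcfid can consume the fctidz result directly from the FPR,
so even the stack round trip disappears):

_test:
        fctidz f0, f1
        fcfid f1, f0
        blr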
393
Chris Lattner83e64ba2006-01-31 07:16:34 +0000394===-------------------------------------------------------------------------===
395
396Compile this:
397
398int foo(int a) {
399 int b = (a < 8);
400 if (b) {
401 return b * 3; // ignore the fact that this is always 3.
402 } else {
403 return 2;
404 }
405}
406
407into something not this:
408
409_foo:
4101) cmpwi cr7, r3, 8
411 mfcr r2, 1
412 rlwinm r2, r2, 29, 31, 31
4131) cmpwi cr0, r3, 7
414 bgt cr0, LBB1_2 ; UnifiedReturnBlock
415LBB1_1: ; then
416 rlwinm r2, r2, 0, 31, 31
417 mulli r3, r2, 3
418 blr
419LBB1_2: ; UnifiedReturnBlock
420 li r3, 2
421 blr
422
423In particular, the two compares (marked 1) could be shared by reversing one.
424This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
425same operands (but backwards) exists. In this case, this wouldn't save us
426anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
        %tmp = setlt ulong %x, 4294967296
        ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it. That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops. A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr
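
Once the crbit access is exposed, the rlwinm/slwi pair should fold into a
single rotate off the mfcr result (a sketch):

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r3, r2, 1, 27, 27
        blr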
472