TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform (see the sketch after this list)
* implement powerpc-64 for darwin
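
A minimal sketch of the kind of loop the do-loop -> bdnz transform targets,
assuming a simple counted loop (the function and codegen notes here are
illustrative, not from an actual testcase):

  void scale(float *a, int n) {
    /* trip count is known on entry, so it could be moved into CTR (mtctr);
       the loop backedge then becomes a single bdnz                        */
    for (int i = 0; i != n; ++i)
      a[i] *= 2.0f;
  }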

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.
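
A hedged sketch of the sort of loop where the update (pre-increment) forms
pay off; the function is hypothetical:

  int sum(int *p, int n) {
    int s = 0;
    for (int i = 0; i < n; ++i)
      s += p[i];     /* lwzu folds the pointer bump into the load itself, */
    return s;        /* saving the separate addi on each iteration        */
  }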

===-------------------------------------------------------------------------===

Should hint to the branch select pass that it doesn't need to print the second
unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

This occurs in SPASS.

===-------------------------------------------------------------------------===

* Codegen this:

   void test2(int X) {
     if (X == 0x12345678) bar();
   }

    as:

       xoris r0,r3,0x1234
       cmplwi cr0,r0,0x5678
       beq cr0,L6

    not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.
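
The addressing pattern being asked for, sketched in C (the pool array and its
layout are made up for illustration, not actual codegen):

  static const double CPI_X[] = { 1.23, 4.512, 2.34, 14.38 };

  double X(double Y) {
    const double *cp = CPI_X;            /* base materialized once        */
    /* each constant is then a cheap d-form load at a fixed offset:
         lfd f0, 0(rBASE) ; lfd f2, 8(rBASE) ; ...                        */
    return (Y * cp[0] + cp[1]) * cp[2] + cp[3];
  }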

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).
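
A hedged before/after sketch of the idea (the struct name is invented):

  /* before: three globals, each use needs its own PIC address computation */
  static int a, b, c;

  /* after: one anchor struct whose address can be computed once and CSE'd */
  static struct { int a, b, c; } GlobalPool;
  /* every use becomes GlobalPool.a, GlobalPool.b, ... at fixed offsets
     from the single base address                                          */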

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when the
reciprocal has more than one use.  Itanium will want this too.
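
The refinement step in question, as a sketch (the standard Newton-Raphson
update for a reciprocal estimate; the helper and its use are hypothetical):

  /* e approximates 1/d (e.g. the result of fres); one iteration roughly
     doubles the number of accurate bits:  e' = e * (2 - d*e)              */
  static double refine_recip(double d, double e) {
    return e * (2.0 - d * e);
  }
  /* a/d, b/d, c/d then become a*e', b*e', c*e': one refined reciprocal,
     several multiplies                                                    */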

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
    struct {
#ifndef __ppc__
        unsigned int field0 : 6;
        unsigned int field1 : 6;
        unsigned int field2 : 6;
        unsigned int field3 : 6;
        unsigned int field4 : 3;
        unsigned int field5 : 4;
        unsigned int field6 : 1;
#else
        unsigned int field6 : 1;
        unsigned int field5 : 4;
        unsigned int field4 : 3;
        unsigned int field3 : 6;
        unsigned int field2 : 6;
        unsigned int field1 : 6;
        unsigned int field0 : 6;
#endif
    } bitfields, bits;
    unsigned int u32All;
    signed int i32All;
    float f32All;
};


typedef struct program_t {
    union bitfield array[ARRAY_LENGTH];
    int size;
    int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
    prog->array[0].bitfields.field0 = fmt1;
    prog->array[0].bitfields.field1 = fmt1 + 1;
}

We currently generate:

_AdjustBitfields:
        lwz r2, 0(r3)
        addi r5, r4, 1
        rlwinm r2, r2, 0, 0, 19
        rlwinm r5, r5, 6, 20, 25
        rlwimi r2, r4, 0, 26, 31
        or r2, r2, r5
        stw r2, 0(r3)
        blr

We should teach the code generator that or (rlwimi, rlwinm) with disjoint
masks can be turned into rlwimi (rlwimi).

The better codegen would be:

_AdjustBitfields:
        lwz r0,0(r3)
        rlwinm r4,r4,0,0xff
        rlwimi r0,r4,0,26,31
        addi r4,r4,1
        rlwimi r0,r4,6,20,25
        stw r0,0(r3)
        blr

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two addr instr.

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch-free code.  LLVM is currently turning it into a compare
against 1 (< 1) because of the constant 0 on the RHS of the select.
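
One branch-free form this could become, as a hedged sketch (a standard
sign-mask identity, not necessarily the exact sequence the backend should
emit; it assumes arithmetic right shift of negative ints, which holds on
PowerPC):

  int test3(int a, int b) {
    return a & (a >> 31);   /* srawi + and: the sign mask keeps a when
                               a < 0 and yields 0 otherwise                */
  }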

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

     fp = &bar;
     for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4  ; loopexit

FreeBench/mason has a basic block that looks like this:

         %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
         %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
         %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
         %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
         %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
         %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
         %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
         %bothcond123 = and bool %bothcond, %tmp.139     ; <bool>
         %bothcond124 = and bool %bothcond123, %tmp.144  ; <bool>
         %bothcond125 = and bool %bothcond124, %tmp.149  ; <bool>
         %bothcond126 = and bool %bothcond125, %tmp.154  ; <bool>
         br bool %bothcond126, label %shortcirc_next.5, label %else.0

This is a particularly important case where handling CRs better will help.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10. That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64 bit double ate up the
argument bytes for r4 and r5. The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.
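
A hedged illustration of the register assignment described above (the mapping
follows the ABI summary in this note; the signatures are just examples):

  void foo(int X, double Y, int Z);   /* X -> r3, Y -> f1, Z -> r6         */
  void foo2(int X, int Z, double Y);  /* X -> r3, Z -> r4, Y -> f1         */
  /* pushing the fp args last means the integer args grab the low GPRs
     before any doubles burn their 8-byte shadow slots                     */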

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an i-p transformation
that percolates these things out of functions.
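
A small example of the problem case (hypothetical declaration):

  struct S { double d; };
  void f(struct S s);    /* on Darwin, s.d arrives in GPRs (r3/r4) and has
                            to be moved/stored before it is usable as an
                            FP value                                       */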

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Generate lwbrx and other byteswapping load/store instructions when reasonable.
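
A hedged sketch of code that should end up as a single lwbrx (the helper is
made up; a store-side analogue would use stwbrx):

  unsigned load_le32(const unsigned char *p) {
    /* byte-reversed 32-bit load; ideally:  lwbrx r3, 0, r3                */
    return (unsigned)p[0] | ((unsigned)p[1] << 8) |
           ((unsigned)p[2] << 16) | ((unsigned)p[3] << 24);
  }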

===-------------------------------------------------------------------------===

Implement TargetConstantVec, and set up PPC to custom lower ConstantVec into
TargetConstantVec's if it's one of the many forms that are algorithmically
computable using the spiffy altivec instructions.
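
For instance (a hedged example using the AltiVec C extensions; the exact
instruction choices are possibilities, not a plan stated in this note):

  vector signed int x = { 3, 3, 3, 3 };      /* vspltisw vX, 3             */
  vector signed int y = { 16, 16, 16, 16 };  /* out of vspltisw's 5-bit
                                                immediate range, but e.g.
                                                vspltisw vY, 8 followed by
                                                vadduwm vY, vY, vY works   */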

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
  %tmp = setlt ulong %x, 4294967296
  ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses so this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

currently compiled as:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

becomes:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr
