Blame - lib/Target/PowerPC/README.txt - fp2-dev/platform/external/llvm

blob: 58805562a87e94776eb3802de9c4953d8cbd094c [file] [log] [blame]

Nate Begeman	b64af91	2004-08-10 20:42:36 +0000	[diff] [blame]	1	TODO:
Nate Begeman	ef9531e	2005-04-11 20:48:57 +0000	[diff] [blame]	2	* gpr0 allocation
Nate Begeman	4a0de07	2004-10-26 04:10:53 +0000	[diff] [blame]	3	* implement do-loop -> bdnz transform
Nate Begeman	ca068e8	2004-08-14 22:16:36 +0000	[diff] [blame]	4	* implement powerpc-64 for darwin
Nate Begeman	d332fd5	2004-08-29 22:02:43 +0000	[diff] [blame]	5	* use stfiwx in float->int
Nate Begeman	4ad870d	2005-07-26 18:59:06 +0000	[diff] [blame]	6	* be able to combine sequences like the following into 2 instructions:
				7	lis r2, ha16(l2__ZTV4Cell)
				8	la r2, lo16(l2__ZTV4Cell)(r2)
				9	addi r2, r2, 8
Chris Lattner	b65975a	2005-07-26 19:07:51 +0000	[diff] [blame]	10
Nate Begeman	5a01481	2005-08-14 01:17:16 +0000	[diff] [blame]	11	* Teach LLVM how to codegen this:
				12	unsigned short foo(float a) { return a; }
				13	as:
				14	_foo:
				15	fctiwz f0,f1
				16	stfd f0,-8(r1)
				17	lhz r3,-2(r1)
				18	blr
				19	not:
				20	_foo:
				21	fctiwz f0, f1
				22	stfd f0, -8(r1)
				23	lwz r2, -4(r1)
				24	rlwinm r3, r2, 0, 16, 31
				25	blr
				26
Chris Lattner	3d8df55	2005-10-18 06:30:51 +0000	[diff] [blame]	27	and:
				28	extern int X, Y; int* test(int C) { return C? &X : &Y; }
				29	as one load when using --enable-pic.
Nate Begeman	5a01481	2005-08-14 01:17:16 +0000	[diff] [blame]	30
Chris Lattner	6281ae4	2005-08-05 19:18:32 +0000	[diff] [blame]	31	* Support 'update' load/store instructions. These are cracked on the G5, but
				32	are still a codesize win.
				33
Misha Brukman	4ce5ce2	2004-07-27 18:43:04 +0000	[diff] [blame]	34	* should hint to the branch select pass that it doesn't need to print the
				35	second unconditional branch, so we don't end up with things like:
Misha Brukman	4ce5ce2	2004-07-27 18:43:04 +0000	[diff] [blame]	36	b .LBBl42__2E_expand_function_8_674 ; loopentry.24
				37	b .LBBl42__2E_expand_function_8_42 ; NewDefault
				38	b .LBBl42__2E_expand_function_8_42 ; NewDefault
Chris Lattner	424dcbd	2005-08-23 06:27:59 +0000	[diff] [blame]	39
Chris Lattner	a3c4454	2005-08-24 18:15:24 +0000	[diff] [blame]	40	===-------------------------------------------------------------------------===
				41
Chris Lattner	424dcbd	2005-08-23 06:27:59 +0000	[diff] [blame]	42	* Codegen this:
				43
				44	void test2(int X) {
				45	if (X == 0x12345678) bar();
				46	}
				47
				48	as:
				49
				50	xoris r0,r3,0x1234
				51	cmpwi cr0,r0,0x5678
				52	beq cr0,L6
				53
				54	not:
				55
				56	lis r2, 4660
				57	ori r2, r2, 22136
				58	cmpw cr0, r3, r2
				59	bne .LBB_test2_2
				60
Chris Lattner	a3c4454	2005-08-24 18:15:24 +0000	[diff] [blame]	61	===-------------------------------------------------------------------------===
				62
				63	Lump the constant pool for each function into ONE pic object, and reference
				64	pieces of it as offsets from the start. For functions like this (contrived
				65	to have lots of constants obviously):
				66
				67	double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }
				68
				69	We generate:
				70
				71	_X:
				72	lis r2, ha16(.CPI_X_0)
				73	lfd f0, lo16(.CPI_X_0)(r2)
				74	lis r2, ha16(.CPI_X_1)
				75	lfd f2, lo16(.CPI_X_1)(r2)
				76	fmadd f0, f1, f0, f2
				77	lis r2, ha16(.CPI_X_2)
				78	lfd f1, lo16(.CPI_X_2)(r2)
				79	lis r2, ha16(.CPI_X_3)
				80	lfd f2, lo16(.CPI_X_3)(r2)
				81	fmadd f1, f0, f1, f2
				82	blr
				83
				84	It would be better to materialize .CPI_X into a register, then use immediates
				85	off of the register to avoid the lis's. This is even more important in PIC
				86	mode.
				87
				88	===-------------------------------------------------------------------------===
Nate Begeman	92cce90	2005-09-06 15:30:48 +0000	[diff] [blame]	89
				90	Implement the Newton-Raphson method for improving estimate instructions to the
				91	correct accuracy, and implement divide as multiply by reciprocal when it has
				92	more than one use. Itanium will want this too.
Nate Begeman	21e463b	2005-10-16 05:39:50 +0000	[diff] [blame]	93
				94	===-------------------------------------------------------------------------===
				95
				96	int foo(int a, int b) { return a == b ? 16 : 0; }
				97	_foo:
				98	cmpw cr7, r3, r4
				99	mfcr r2
				100	rlwinm r2, r2, 31, 31, 31
				101	slwi r3, r2, 4
				102	blr
				103
				104	If we exposed the srl & mask ops after the MFCR that we are doing to select
				105	the correct CR bit, then we could fold the slwi into the rlwinm before it.
Nate Begeman	5cd61ce	2005-10-25 23:50:02 +0000	[diff] [blame]	106
				107	===-------------------------------------------------------------------------===
				108
				109	#define ARRAY_LENGTH 16
				110
				111	union bitfield {
				112	struct {
				113	#ifndef __ppc__
				114	unsigned int field0 : 6;
				115	unsigned int field1 : 6;
				116	unsigned int field2 : 6;
				117	unsigned int field3 : 6;
				118	unsigned int field4 : 3;
				119	unsigned int field5 : 4;
				120	unsigned int field6 : 1;
				121	#else
				122	unsigned int field6 : 1;
				123	unsigned int field5 : 4;
				124	unsigned int field4 : 3;
				125	unsigned int field3 : 6;
				126	unsigned int field2 : 6;
				127	unsigned int field1 : 6;
				128	unsigned int field0 : 6;
				129	#endif
				130	} bitfields, bits;
				131	unsigned int u32All;
				132	signed int i32All;
				133	float f32All;
				134	};
				135
				136
				137	typedef struct program_t {
				138	union bitfield array[ARRAY_LENGTH];
				139	int size;
				140	int loaded;
				141	} program;
				142
				143
				144	void AdjustBitfields(program* prog, unsigned int fmt1)
				145	{
				146	unsigned int shift = 0;
				147	unsigned int texCount = 0;
				148	unsigned int i;
				149
				150	for (i = 0; i < 8; i++)
				151	{
				152	prog->array[i].bitfields.field0 = texCount;
				153	prog->array[i].bitfields.field1 = texCount + 1;
				154	prog->array[i].bitfields.field2 = texCount + 2;
				155	prog->array[i].bitfields.field3 = texCount + 3;
				156
				157	texCount += (fmt1 >> shift) & 0x7;
				158	shift += 3;
				159	}
				160	}
				161
				162	In the loop above, the bitfield adds get generated as
				163	(add (shl bitfield, C1), (shl C2, C1)) where C2 is 1, 2 or 3.
				164
				165	Since the input to the (or and, and) is an (add) rather than a (shl), the shift
				166	doesn't get folded into the rlwimi instruction. We should ideally see through
				167	things like this, rather than forcing llvm to generate the equivalent
				168
				169	(shl (add bitfield, C2), C1) with some kind of mask.
Chris Lattner	0195910	2005-10-28 00:20:45 +0000	[diff] [blame]	170
				171	===-------------------------------------------------------------------------===
				172
				173	Compile this (standard bitfield insert of a constant):
				174	void %test(uint* %tmp1) {
				175	%tmp2 = load uint* %tmp1 ; <uint> [#uses=1]
				176	%tmp5 = or uint %tmp2, 257949696 ; <uint> [#uses=1]
				177	%tmp6 = and uint %tmp5, 4018143231 ; <uint> [#uses=1]
				178	store uint %tmp6, uint* %tmp1
				179	ret void
				180	}
				181
				182	to:
				183
				184	_test:
				185	lwz r0,0(r3)
				186	li r2,123
				187	rlwimi r0,r2,21,3,10
				188	stw r0,0(r3)
				189	blr
				190
				191	instead of:
				192
				193	_test:
				194	lis r2, -4225
				195	lwz r4, 0(r3)
				196	ori r2, r2, 65535
				197	oris r4, r4, 3936
				198	and r2, r4, r2
				199	stw r2, 0(r3)
				200	blr
				201
Chris Lattner	ae4664a	2005-11-05 08:57:56 +0000	[diff] [blame]	202	===-------------------------------------------------------------------------===
				203
				204	Compile this:
				205
				206	int %f1(int %a, int %b) {
				207	%tmp.1 = and int %a, 15 ; <int> [#uses=1]
				208	%tmp.3 = and int %b, 240 ; <int> [#uses=1]
				209	%tmp.4 = or int %tmp.3, %tmp.1 ; <int> [#uses=1]
				210	ret int %tmp.4
				211	}
				212
				213	without a copy. We make this currently:
				214
				215	_f1:
				216	rlwinm r2, r4, 0, 24, 27
				217	rlwimi r2, r3, 0, 28, 31
				218	or r3, r2, r2
				219	blr
				220
				221	The two-addr pass or RA needs to learn when it is profitable to commute an
				222	instruction to avoid a copy AFTER the 2-addr instruction. The 2-addr pass
				223	currently only commutes to avoid inserting a copy BEFORE the two addr instr.
				224
Chris Lattner	62c08dd	2005-12-08 07:13:28 +0000	[diff] [blame]	225	===-------------------------------------------------------------------------===
				226
				227	Compile offsets from allocas:
				228
				229	int *%test() {
				230	%X = alloca { int, int }
				231	%Y = getelementptr {int,int}* %X, int 0, uint 1
				232	ret int* %Y
				233	}
				234
				235	into a single add, not two:
				236
				237	_test:
				238	addi r2, r1, -8
				239	addi r3, r2, 4
				240	blr
				241
				242	--> important for C++.
				243
Chris Lattner	39706e6	2005-12-22 17:19:28 +0000	[diff] [blame^]	244	===-------------------------------------------------------------------------===
				245
				246	int test3(int a, int b) { return (a < 0) ? a : 0; }
				247
				248	should be branch-free code. LLVM is turning the comparison into a < 1 because of the RHS.
				249
				250	===-------------------------------------------------------------------------===
				251
				252	For this testcase:
				253	int f1(int a, int b) { return (a&0xF)|(b&0xF0); }
				254
				255	We currently emit:
				256	_f1:
				257	rlwinm r2, r4, 0, 24, 27
				258	rlwimi r2, r3, 0, 28, 31
				259	or r3, r2, r2
				260	blr
				261
				262	We could emit:
				263	_f1:
				264	rlwinm r4, r4, 0, 24, 27
				265	rlwimi r3, r4, 0, 0, 27
				266	blr
				267
				268	===-------------------------------------------------------------------------===
				269
				270	No loads or stores of the constants should be needed:
				271
				272	struct foo { double X, Y; };
				273	void xxx(struct foo F);
				274	void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }
				275