TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform
* implement powerpc-64 for darwin
* use stfiwx in float->int (see the sketch after this list)
* be able to combine sequences like the following into 2 instructions:
        lis r2, ha16(l2__ZTV4Cell)
        la r2, lo16(l2__ZTV4Cell)(r2)
        addi r2, r2, 8
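
A minimal sketch of the stfiwx item above (the function name is mine, not
from the original notes): when the converted integer is stored to memory,
fctiwz + stfiwx could replace the current fctiwz + stfd + lwz + stw
sequence, avoiding the round-trip through a stack slot.

void ftoi_store(float a, int *p) {
  /* the low 32 bits of the fctiwz result can be stored directly with
     stfiwx, instead of spilling the FPR with stfd and reloading the
     word with lwz before the final stw */
  *p = (int)a;
}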

* Teach LLVM how to codegen this:
unsigned short foo(float a) { return a; }
as:
_foo:
        fctiwz f0,f1
        stfd f0,-8(r1)
        lhz r3,-2(r1)
        blr
not:
_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        rlwinm r3, r2, 0, 16, 31
        blr

and to codegen this:
extern int X, Y; int* test(int C) { return C? &X : &Y; }
as one load when using --enable-pic.

* Support 'update' load/store instructions.  These are cracked on the G5, but
are still a codesize win.
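
A minimal sketch (function and variable names are mine, not from the
original notes) of the kind of loop where the update forms pay off:

int sum(const int *p, const int *e) {
  int s = 0;
  while (p != e)
    s += *p++;   /* lwzu can do the load and write the advanced address
                    back into the base register in one instruction */
  return s;
}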

* Add a custom legalizer for the GlobalAddress node, to move the funky darwin
stub stuff from the instruction selector to the legalizer (exposing low-level
operations to the dag for optimization).  For example, we want to codegen this:

int A = 0;
void B() { A++; }
as:
        lis r9,ha16(_A)
        lwz r2,lo16(_A)(r9)
        addi r2,r2,1
        stw r2,lo16(_A)(r9)
not:
        lis r2, ha16(_A)
        lwz r2, lo16(_A)(r2)
        addi r2, r2, 1
        lis r3, ha16(_A)
        stw r2, lo16(_A)(r3)

* should hint to the branch select pass that it doesn't need to emit the
second unconditional branch, so we don't end up with things like:
        b .LBBl42__2E_expand_function_8_674     ; loopentry.24
        b .LBBl42__2E_expand_function_8_42      ; NewDefault
        b .LBBl42__2E_expand_function_8_42      ; NewDefault

===-------------------------------------------------------------------------===

* Codegen this:

  void test2(int X) {
    if (X == 0x12345678) bar();
  }

as:

        xoris r0,r3,0x1234
        cmpwi cr0,r0,0x5678
        beq cr0,L6

not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2
===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.
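
A hypothetical C-level illustration (the function and parameter are mine,
not from the original notes) of the addressing pattern we want: one base
pointer for the whole pool, with each constant reached through a small
fixed offset.

double X_alt(double Y, const double *cp) {
  /* 'cp' stands in for a single materialized .CPI_X base register;
     the four pool entries sit at known small offsets from it, so each
     lfd needs only a 16-bit displacement and no extra lis. */
  double t = Y * cp[0] + cp[1];
  return t * cp[2] + cp[3];
}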

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for refining the estimate instructions
(fres, frsqrte) to full accuracy, and implement divide as multiply by
reciprocal when the divisor has more than one use.  Itanium will want this
too.
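
For reference, one Newton-Raphson refinement step for a reciprocal estimate
(a minimal sketch; the helper name is mine, not from the original notes):

float refine_recip(float d, float x0) {
  /* x1 = x0 * (2 - d*x0); a/d then becomes a * x1.  Each step roughly
     doubles the number of correct bits in the estimate. */
  return x0 * (2.0f - d * x0);
}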

===-------------------------------------------------------------------------===

int foo(int a, int b) { return a == b ? 16 : 0; }
_foo:
        cmpw cr7, r3, r4
        mfcr r2
        rlwinm r2, r2, 31, 31, 31
        slwi r3, r2, 4
        blr

If we exposed the srl & mask ops that we emit after the MFCR to select the
correct CR bit, then we could fold the slwi into the rlwinm before it.

===-------------------------------------------------------------------------===

#define ARRAY_LENGTH 16

union bitfield {
  struct {
#ifndef __ppc__
    unsigned int field0 : 6;
    unsigned int field1 : 6;
    unsigned int field2 : 6;
    unsigned int field3 : 6;
    unsigned int field4 : 3;
    unsigned int field5 : 4;
    unsigned int field6 : 1;
#else
    unsigned int field6 : 1;
    unsigned int field5 : 4;
    unsigned int field4 : 3;
    unsigned int field3 : 6;
    unsigned int field2 : 6;
    unsigned int field1 : 6;
    unsigned int field0 : 6;
#endif
  } bitfields, bits;
  unsigned int u32All;
  signed int i32All;
  float f32All;
};


typedef struct program_t {
  union bitfield array[ARRAY_LENGTH];
  int size;
  int loaded;
} program;


void AdjustBitfields(program* prog, unsigned int fmt1)
{
  unsigned int shift = 0;
  unsigned int texCount = 0;
  unsigned int i;

  for (i = 0; i < 8; i++)
  {
    prog->array[i].bitfields.field0 = texCount;
    prog->array[i].bitfields.field1 = texCount + 1;
    prog->array[i].bitfields.field2 = texCount + 2;
    prog->array[i].bitfields.field3 = texCount + 3;

    texCount += (fmt1 >> shift) & 0x7;
    shift += 3;
  }
}

In the loop above, the bitfield adds get generated as
(add (shl bitfield, C1), (shl C2, C1)), where C2 is 1, 2 or 3.

Since the input to the (or and, and) is an (add) rather than a (shl), the
shift doesn't get folded into the rlwimi instruction.  We should ideally see
through things like this, rather than forcing llvm to generate the equivalent
(shl (add bitfield, C2), C1) with some kind of mask.

===-------------------------------------------------------------------------===

Compile this (standard bitfield insert of a constant):
void %test(uint* %tmp1) {
        %tmp2 = load uint* %tmp1                ; <uint> [#uses=1]
        %tmp5 = or uint %tmp2, 257949696        ; <uint> [#uses=1]
        %tmp6 = and uint %tmp5, 4018143231      ; <uint> [#uses=1]
        store uint %tmp6, uint* %tmp1
        ret void
}

to:

_test:
        lwz r0,0(r3)
        li r2,123
        rlwimi r0,r2,21,3,10
        stw r0,0(r3)
        blr

instead of:

_test:
        lis r2, -4225
        lwz r4, 0(r3)
        ori r2, r2, 65535
        oris r4, r4, 3936
        and r2, r4, r2
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We currently generate:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the two-addr instruction.  The two-addr
pass currently only commutes to avoid inserting a copy BEFORE the two-addr
instruction.