//===- README.txt - Notes for improving PowerPC-specific code gen ---------===//

TODO:
* gpr0 allocation
* implement do-loop -> bdnz transform

===-------------------------------------------------------------------------===

Support 'update' load/store instructions.  These are cracked on the G5, but are
still a codesize win.

===-------------------------------------------------------------------------===

Teach the .td file to pattern match PPC::BR_COND to the appropriate bc variant,
so we don't have to always run the branch selector for small functions.

===-------------------------------------------------------------------------===

* Codegen this:

   void test2(int X) {
     if (X == 0x12345678) bar();
   }

    as:

       xoris r0,r3,0x1234
       cmplwi cr0,r0,0x5678
       beq cr0,L6

    not:

        lis r2, 4660
        ori r2, r2, 22136
        cmpw cr0, r3, r2
        bne .LBB_test2_2

===-------------------------------------------------------------------------===

Lump the constant pool for each function into ONE pic object, and reference
pieces of it as offsets from the start.  For functions like this (contrived
to have lots of constants obviously):

double X(double Y) { return (Y*1.23 + 4.512)*2.34 + 14.38; }

We generate:

_X:
        lis r2, ha16(.CPI_X_0)
        lfd f0, lo16(.CPI_X_0)(r2)
        lis r2, ha16(.CPI_X_1)
        lfd f2, lo16(.CPI_X_1)(r2)
        fmadd f0, f1, f0, f2
        lis r2, ha16(.CPI_X_2)
        lfd f1, lo16(.CPI_X_2)(r2)
        lis r2, ha16(.CPI_X_3)
        lfd f2, lo16(.CPI_X_3)(r2)
        fmadd f1, f0, f1, f2
        blr

It would be better to materialize .CPI_X into a register, then use immediates
off of the register to avoid the lis's.  This is even more important in PIC
mode.

Note that this (and the static variable version) is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

PIC Code Gen IPO optimization:

Squish small scalar globals together into a single global struct, allowing the
address of the struct to be CSE'd, avoiding PIC accesses (also reduces the size
of the GOT on targets with one).

Note that this is discussed here for GCC:
http://gcc.gnu.org/ml/gcc-patches/2006-02/msg00133.html

===-------------------------------------------------------------------------===

Implement the Newton-Raphson method for improving estimate instructions to the
correct accuracy, and implement divide as multiply by reciprocal when it has
more than one use.  Itanium will want this too.

===-------------------------------------------------------------------------===

Compile this:

int %f1(int %a, int %b) {
        %tmp.1 = and int %a, 15         ; <int> [#uses=1]
        %tmp.3 = and int %b, 240        ; <int> [#uses=1]
        %tmp.4 = or int %tmp.3, %tmp.1  ; <int> [#uses=1]
        ret int %tmp.4
}

without a copy.  We make this currently:

_f1:
        rlwinm r2, r4, 0, 24, 27
        rlwimi r2, r3, 0, 28, 31
        or r3, r2, r2
        blr

The two-addr pass or RA needs to learn when it is profitable to commute an
instruction to avoid a copy AFTER the 2-addr instruction.  The 2-addr pass
currently only commutes to avoid inserting a copy BEFORE the two-addr
instruction.

===-------------------------------------------------------------------------===

Compile offsets from allocas:

int *%test() {
        %X = alloca { int, int }
        %Y = getelementptr {int,int}* %X, int 0, uint 1
        ret int* %Y
}

into a single add, not two:

_test:
        addi r2, r1, -8
        addi r3, r2, 4
        blr

--> important for C++.

===-------------------------------------------------------------------------===

int test3(int a, int b) { return (a < 0) ? a : 0; }

should be branch free code.  LLVM is turning it into < 1 because of the RHS.

===-------------------------------------------------------------------------===

No loads or stores of the constants should be needed:

struct foo { double X, Y; };
void xxx(struct foo F);
void bar() { struct foo R = { 1.0, 2.0 }; xxx(R); }

===-------------------------------------------------------------------------===

Darwin Stub LICM optimization:

Loops like this:

  for (...)  bar();

Have to go through an indirect stub if bar is external or linkonce.  It would
be better to compile it as:

     fp = &bar;
     for (...)  fp();

which only computes the address of bar once (instead of each time through the
stub).  This is Darwin specific and would have to be done in the code generator.
Probably not a win on x86.

===-------------------------------------------------------------------------===

PowerPC i1/setcc stuff (depends on subreg stuff):

Check out the PPC code we get for 'compare' in this testcase:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=19672

oof.  on top of not doing the logical crnand instead of (mfcr, mfcr,
invert, invert, or), we then have to compare it against zero instead of
using the value already in a CR!

that should be something like
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        crnand cr0, cr0, cr7
        bne cr0, LBB_compare_4

instead of
        cmpw cr7, r8, r5
        cmpw cr0, r7, r3
        mfcr r7, 1
        mcrf cr7, cr0
        mfcr r8, 1
        rlwinm r7, r7, 30, 31, 31
        rlwinm r8, r8, 30, 31, 31
        xori r7, r7, 1
        xori r8, r8, 1
        addi r2, r2, 1
        or r7, r8, r7
        cmpwi cr0, r7, 0
        bne cr0, LBB_compare_4  ; loopexit

FreeBench/mason has a basic block that looks like this:

        %tmp.130 = seteq int %p.0__, 5          ; <bool> [#uses=1]
        %tmp.134 = seteq int %p.1__, 6          ; <bool> [#uses=1]
        %tmp.139 = seteq int %p.2__, 12         ; <bool> [#uses=1]
        %tmp.144 = seteq int %p.3__, 13         ; <bool> [#uses=1]
        %tmp.149 = seteq int %p.4__, 14         ; <bool> [#uses=1]
        %tmp.154 = seteq int %p.5__, 15         ; <bool> [#uses=1]
        %bothcond = and bool %tmp.134, %tmp.130         ; <bool> [#uses=1]
        %bothcond123 = and bool %bothcond, %tmp.139     ; <bool>
        %bothcond124 = and bool %bothcond123, %tmp.144  ; <bool>
        %bothcond125 = and bool %bothcond124, %tmp.149  ; <bool>
        %bothcond126 = and bool %bothcond125, %tmp.154  ; <bool>
        br bool %bothcond126, label %shortcirc_next.5, label %else.0

This is a particularly important case where handling CRs better will help.

===-------------------------------------------------------------------------===

Simple IPO for argument passing, change:
  void foo(int X, double Y, int Z) -> void foo(int X, int Z, double Y)

The Darwin ABI specifies that any integer arguments in the first 32 bytes worth
of arguments get assigned to r3 through r10.  That is, if you have a function
foo(int, double, int) you get r3, f1, r6, since the 64-bit double ate up the
argument bytes for r4 and r5.  The trick then would be to shuffle the argument
order for functions we can internalize so that the maximum number of
integers/pointers get passed in regs before you see any of the fp arguments.

Instead of implementing this, it would actually probably be easier to just
implement a PPC fastcc, where we could do whatever we wanted to the CC,
including having this work sanely.

===-------------------------------------------------------------------------===

Fix Darwin FP-In-Integer Registers ABI

Darwin passes doubles in structures in integer registers, which is very very
bad.  Add something like a BIT_CONVERT to LLVM, then do an interprocedural
transformation that percolates these things out of functions.

Check out how horrible this is:
http://gcc.gnu.org/ml/gcc/2005-10/msg01036.html

This is an extension of "interprocedural CC unmunging" that can't be done with
just fastcc.

===-------------------------------------------------------------------------===

Compile this:

int foo(int a) {
  int b = (a < 8);
  if (b) {
    return b * 3;     // ignore the fact that this is always 3.
  } else {
    return 2;
  }
}

into something not this:

_foo:
1)      cmpwi cr7, r3, 8
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
1)      cmpwi cr0, r3, 7
        bgt cr0, LBB1_2 ; UnifiedReturnBlock
LBB1_1: ; then
        rlwinm r2, r2, 0, 31, 31
        mulli r3, r2, 3
        blr
LBB1_2: ; UnifiedReturnBlock
        li r3, 2
        blr

In particular, the two compares (marked 1) could be shared by reversing one.
This could be done in the dag combiner, by swapping a BR_CC when a SETCC of the
same operands (but backwards) exists.  In this case, this wouldn't save us
anything though, because the compares still wouldn't be shared.

===-------------------------------------------------------------------------===

The legalizer should lower this:

bool %test(ulong %x) {
  %tmp = setlt ulong %x, 4294967296
  ret bool %tmp
}

into "if x.high == 0", not:

_test:
        addi r2, r3, -1
        cntlzw r2, r2
        cntlzw r3, r3
        srwi r2, r2, 5
        srwi r4, r3, 5
        li r3, 0
        cmpwi cr0, r2, 0
        bne cr0, LBB1_2 ;
LBB1_1:
        or r3, r4, r4
LBB1_2:
        blr

noticed in 2005-05-11-Popcount-ffs-fls.c.

===-------------------------------------------------------------------------===

We should custom expand setcc instead of pretending that we have it.  That
would allow us to expose the access of the crbit after the mfcr, allowing
that access to be trivially folded into other ops.  A simple example:

int foo(int a, int b) { return (a < b) << 4; }

compiles into:

_foo:
        cmpw cr7, r3, r4
        mfcr r2, 1
        rlwinm r2, r2, 29, 31, 31
        slwi r3, r2, 4
        blr

===-------------------------------------------------------------------------===

Fold add and sub with constant into non-extern, non-weak addresses, so that
this:

static int a;
void bar(int b) { a = b; }
void foo(unsigned char *c) {
  *c = a;
}

which currently compiles to:

_foo:
        lis r2, ha16(_a)
        la r2, lo16(_a)(r2)
        lbz r2, 3(r2)
        stb r2, 0(r3)
        blr

instead compiles to:

_foo:
        lis r2, ha16(_a+3)
        lbz r2, lo16(_a+3)(r2)
        stb r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

We generate really bad code for this:

int f(signed char *a, _Bool b, _Bool c) {
  signed char t = 0;
  if (b)  t = *a;
  if (c)  *a = t;
}

===-------------------------------------------------------------------------===

This:
int test(unsigned *P) { return *P >> 24; }

Should compile to:

_test:
        lbz r3,0(r3)
        blr

not:

_test:
        lwz r2, 0(r3)
        srwi r3, r2, 24
        blr

===-------------------------------------------------------------------------===

On the G5, logical CR operations are more expensive in their three-address
form: ops that read/write the same register are half as expensive as those
that read from two registers that are different from their destination.

We should model this with two separate instructions.  The isel should generate
the "two address" form of the instructions.  When the register allocator
detects that it needs to insert a copy due to the two-address nature of the CR
logical op, it will invoke PPCInstrInfo::convertToThreeAddress.  At this point
we can convert to the "three address" instruction, to save code space.

This only matters when we start generating cr logical ops.

===-------------------------------------------------------------------------===

We should compile these two functions to the same thing:

#include <stdlib.h>
void f(int a, int b, int *P) {
  *P = (a-b)>=0?(a-b):(b-a);
}
void g(int a, int b, int *P) {
  *P = abs(a-b);
}

Further, they should compile to something better than:

_g:
        subf r2, r4, r3
        subfic r3, r2, 0
        cmpwi cr0, r2, -1
        bgt cr0, LBB2_2 ; entry
LBB2_1: ; entry
        mr r2, r3
LBB2_2: ; entry
        stw r2, 0(r5)
        blr

GCC produces:

_g:
        subf r4,r4,r3
        srawi r2,r4,31
        xor r0,r2,r4
        subf r0,r2,r0
        stw r0,0(r5)
        blr

... which is much nicer.

This theoretically may help improve twolf slightly (used in dimbox.c:142?).

===-------------------------------------------------------------------------===

int foo(int N, int ***W, int **TK, int X) {
  int t, i;

  for (t = 0; t < N; ++t)
    for (i = 0; i < 4; ++i)
      W[t / X][i][t % X] = TK[i][t];

  return 5;
}

We generate relatively atrocious code for this loop compared to gcc.

We could also strength reduce the rem and the div:
http://www.lcs.mit.edu/pubs/pdf/MIT-LCS-TM-600.pdf

===-------------------------------------------------------------------------===

float foo(float X) { return (int)(X); }

Currently produces:

_foo:
        fctiwz f0, f1
        stfd f0, -8(r1)
        lwz r2, -4(r1)
        extsw r2, r2
        std r2, -16(r1)
        lfd f0, -16(r1)
        fcfid f0, f0
        frsp f1, f0
        blr

We could use a target dag combine to turn the lwz/extsw into an lwa when the
lwz has a single use.  Since LWA is cracked anyway, this would be a codesize
win only.

===-------------------------------------------------------------------------===

We generate ugly code for this:

void func(unsigned int *ret, float dx, float dy, float dz, float dw) {
  unsigned code = 0;
  if(dx < -dw) code |= 1;
  if(dx > dw)  code |= 2;
  if(dy < -dw) code |= 4;
  if(dy > dw)  code |= 8;
  if(dz < -dw) code |= 16;
  if(dz > dw)  code |= 32;
  *ret = code;
}

===-------------------------------------------------------------------------===

Complete the signed i32 to FP conversion code using 64-bit registers
transformation, good for PI.  See PPCISelLowering.cpp, this comment:

     // FIXME: disable this lowered code.  This generates 64-bit register values,
     // and we don't model the fact that the top part is clobbered by calls.  We
     // need to flag these together so that the value isn't live across a call.
     //setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);

Also, if the registers are spilled to the stack, we have to ensure that all
64 bits of them are saved/restored, otherwise we will miscompile the code.  It
sounds like we need to get the 64-bit register classes going.

===-------------------------------------------------------------------------===

%struct.B = type { ubyte, [3 x ubyte] }

void %foo(%struct.B* %b) {
entry:
        %tmp = cast %struct.B* %b to uint*              ; <uint*> [#uses=1]
        %tmp = load uint* %tmp                          ; <uint> [#uses=1]
        %tmp3 = cast %struct.B* %b to uint*             ; <uint*> [#uses=1]
        %tmp4 = load uint* %tmp3                        ; <uint> [#uses=1]
        %tmp8 = cast %struct.B* %b to uint*             ; <uint*> [#uses=2]
        %tmp9 = load uint* %tmp8                        ; <uint> [#uses=1]
        %tmp4.mask17 = shl uint %tmp4, ubyte 1          ; <uint> [#uses=1]
        %tmp1415 = and uint %tmp4.mask17, 2147483648    ; <uint> [#uses=1]
        %tmp.masked = and uint %tmp, 2147483648         ; <uint> [#uses=1]
        %tmp11 = or uint %tmp1415, %tmp.masked          ; <uint> [#uses=1]
        %tmp12 = and uint %tmp9, 2147483647             ; <uint> [#uses=1]
        %tmp13 = or uint %tmp12, %tmp11                 ; <uint> [#uses=1]
        store uint %tmp13, uint* %tmp8
        ret void
}

We emit:

_foo:
        lwz r2, 0(r3)
        slwi r4, r2, 1
        or r4, r4, r2
        rlwimi r2, r4, 0, 0, 0
        stw r2, 0(r3)
        blr

We could collapse a bunch of those ORs and ANDs and generate the following
equivalent code:

_foo:
        lwz r2, 0(r3)
        rlwinm r4, r2, 1, 0, 0
        or r2, r2, r4
        stw r2, 0(r3)
        blr

===-------------------------------------------------------------------------===

On PPC64, this results in a truncate followed by a truncstore.  These should
be folded together.

unsigned short G;
void foo(unsigned long H) { G = H; }