//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Missing features:
  - Support for SSE4: http://www.intel.com/software/penryn
    http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
  - Support for 3DNow!
  - Weird ABIs?

//===---------------------------------------------------------------------===//

CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case. We should investigate
why this isn't happening; it could have a significant impact on other important
cases for X86 as well.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
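
As an illustration of the kind of decomposition involved (the constant 45 is a
hypothetical example, not one taken from the patch), a multiply by 45 can be
done with two LEAs instead of an imull:

int mul45(int x) {
  /* 45 = 9 * 5, so this can be selected as:
       leal (%eax,%eax,8), %eax    # x*9
       leal (%eax,%eax,4), %eax    # (x*9)*5 = x*45  */
  return x * 45;
}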

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl %eax, %eax
        xorl %edx, %edx
        testb $32, %cl
        sete %al
        setne %dl
        sall %cl, %eax
        sall %cl, %edx

But that requires good 8-bit subreg support.

64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Compile this:
_Bool f(_Bool a) { return a!=1; }

into:
        movzbl %dil, %eax
        xorl $1, %eax
        ret

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?
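
For example (a hypothetical case, not one from a reported bug), code like:

unsigned short halve(unsigned short x) {
  /* May be selected as 16-bit operations that write only %ax; a later read
     of the full %eax can then stall on parts that rename partial registers. */
  return (unsigned short)(x >> 1);
}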

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it, though.

//===---------------------------------------------------------------------===//

Count leading zeros and count trailing zeros:

int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }

$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
clz:
        bsr %eax, DWORD PTR [%esp+4]
        xor %eax, 31
        ret
ctz:
        bsf %eax, DWORD PTR [%esp+4]
        ret

However, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.

Another example (use predsimplify to eliminate a select):

int foo (unsigned long j) {
  if (j)
    return __builtin_ffs (j) - 1;
  else
    return 0;
}

//===---------------------------------------------------------------------===//

It appears ICC uses push for parameter passing. We need to investigate.

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some processor
flags.
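
A trivial illustration (hypothetical, not a benchmarked case):

int bump(int x) {
  /* Can be selected as "incl %eax" or "addl $1, %eax"; on the P4 the incl
     form leaves CF unchanged, creating a false dependence on older flags. */
  return x + 1;
}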

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
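
A minimal case where the load naturally ends up on the LHS (an illustrative
example, not a reported test case):

int lt(int *p, int x) {
  /* The DAG is roughly (setlt (load p), x); commuting the compare (and
     adjusting the condition code) would let the load fold into the cmp. */
  return *p < x;
}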

//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to
        pmuludq (%eax), %xmm0
        movl 8(%esp), %eax
        movdqa (%eax), %xmm1
        pmulhuw %xmm0, %xmm1

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

On some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when code size is important, e.g. for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register. It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
get this:

int %test1(int %X) {
        %Y = div int %X, 8
        ret int %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl 4(%esp), %eax
        cmpl $-1, %eax
        leal 7(%eax), %ecx
        cmovle %ecx, %eax
        sarl $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

The first BB of this code:

declare bool %foo()
int %bar() {
        %V = call bool %foo()
        br bool %V, label %T, label %F
T:
        ret int 1
F:
        call bool %foo()
        ret int 12
}

compiles to:

_bar:
        subl $12, %esp
        call L_foo$stub
        xorb $1, %al
        testb %al, %al
        jne LBB_bar_2   # F

It would be better to emit "cmp %al, 1" than an xor and test.

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)
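
One plausible lowering (a sketch only, ignoring the sign of NaN results): the
two copysign factors can only flip the sign of x, so the product reduces to
xoring the combined sign bits of y and z into x:

#include <stdint.h>
#include <string.h>

double signprod(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  memcpy(&zb, &z, sizeof zb);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;  /* combine the two sign flips */
  memcpy(&x, &xb, sizeof xb);
  return x;  /* == x * copysign(1.0, y) * copysign(1.0, z) for finite x */
}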

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

%X = weak global int 0

void %foo(int %N) {
        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

no_exit:
        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

return:
        ret void
}

compiles into:

        .text
        .align 4
        .globl _foo
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader
        xorl %ecx, %ecx
LBB_foo_2:      # no_exit
        movl L_X$non_lazy_ptr, %edx
        movl %ecx, (%edx)
        incl %ecx
        cmpl %eax, %ecx
        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit
LBB_foo_4:      # return
        ret

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target-
dependent LICM pass or 2) making the SelectionDAG represent the whole function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesort.

//===---------------------------------------------------------------------===//

We are generating far worse code than gcc:

volatile short X, Y;

void foo(int N) {
  int i;
  for (i = 0; i < N; i++) { X = i; Y = i*4; }
}

LBB1_1: # entry.bb_crit_edge
        xorl %ecx, %ecx
        xorw %dx, %dx
LBB1_2: # bb
        movl L_X$non_lazy_ptr, %esi
        movw %cx, (%esi)
        movl L_Y$non_lazy_ptr, %esi
        movw %dx, (%esi)
        addw $4, %dx
        incl %ecx
        cmpl %eax, %ecx
        jne LBB1_2      # bb

vs.

        xorl %edx, %edx
        movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
        movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
L4:
        movw %dx, (%esi)
        leal 0(,%edx,4), %eax
        movw %ax, (%ecx)
        addl $1, %edx
        cmpl %edx, %edi
        jne L4

This is due to the lack of post-regalloc LICM.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes, e.g. FR32 /
FR64 to VR128.

//===---------------------------------------------------------------------===//

mov $reg, 48(%esp)
...
leal 48(%esp), %eax
mov %eax, (%esp)
call _foo

Obviously it would have been better for the first mov (or any op) to store
directly to %esp[0] if there are no other uses.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        movl 4(%esp), %eax
        movaps (%eax), %xmm1
        comiss %xmm0, %xmm1
        setae %al
        movzbl %al, %ecx
        movl $3, %eax
        movl $4, %edx
        cmpl $0, %ecx
        cmove %edx, %eax
        ret

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

If shorter, we should use things like:
movzwl %ax, %eax
instead of:
andl $65535, %EAX

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//

Consider this:

typedef struct pair { float A, B; } pair;
void pairtest(pair P, float *FP) {
        *FP = P.A+P.B;
}

We currently generate this code with llvmgcc4:

_pairtest:
        movl 8(%esp), %eax
        movl 4(%esp), %ecx
        movd %eax, %xmm0
        movd %ecx, %xmm1
        addss %xmm0, %xmm1
        movl 12(%esp), %eax
        movss %xmm1, (%eax)
        ret

we should be able to generate:
_pairtest:
        movss 4(%esp), %xmm0
        movl 12(%esp), %eax
        addss 8(%esp), %xmm0
        movss %xmm0, (%eax)
        ret

The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
integer chunks. It does this so that structs like {short,short} are passed in
a single 32-bit integer stack slot. We should handle the safe cases above much
more nicely, while still handling the hard cases.

While true in general, in this specific case we could do better by promoting
load int + bitcast to float -> load float. This basically needs alignment info;
the code is already implemented (but disabled) in dag combine.

//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

void %bar() {
        %tmp = load int (int)** %foo
        %tmp = tail call int %tmp( int 3 )
        ret void
}

_bar:
        subl $12, %esp
        movl L_foo$non_lazy_ptr, %eax
        movl (%eax), %eax
        call *%eax
        addl $12, %esp
        ret

The current isel scheme will not allow the load to be folded in the call since
the load's chain result is read by the callseq_start.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:
        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimation to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//

Implement CTTZ, CTLZ with bsf and bsr. GCC produces:

int ctz_(unsigned X) { return __builtin_ctz(X); }
int clz_(unsigned X) { return __builtin_clz(X); }
int ffs_(unsigned X) { return __builtin_ffs(X); }

_ctz_:
        bsfl 4(%esp), %eax
        ret
_clz_:
        bsrl 4(%esp), %eax
        xorl $31, %eax
        ret
_ffs_:
        movl $-1, %edx
        bsfl 4(%esp), %eax
        cmove %edx, %eax
        addl $1, %eax
        ret

//===---------------------------------------------------------------------===//

It appears GCC places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

int %foo(int* %a, int %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
        %tmp2 = getelementptr int* %a, int %x.0.0
        %tmp3 = load int* %tmp2                 ; <int> [#uses=1]
        %tmp5 = add int %t_addr.0.0, %x.0.0     ; <int> [#uses=1]
        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
        br bool %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret int %tmp7
}

is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of an argument of type double
on an 8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

Codegen:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b++;
  return b;
}

as:

or eax, 2
cmp eax, 6
jz label

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b". For example, instead of:

int G;
void f(int X, int Y) {
  G = X < 0 ? 14 : 13;
}

compiling to:

_f:
        movl $14, %eax
        movl $13, %ecx
        movl 4(%esp), %edx
        testl %edx, %edx
        cmovl %eax, %ecx
        movl %ecx, _G
        ret

it could be:
_f:
        movl 4(%esp), %eax
        sarl $31, %eax
        notl %eax
        addl $14, %eax
        movl %eax, _G
        ret

etc.

//===---------------------------------------------------------------------===//

Currently we don't have elimination of redundant stack manipulations. Consider
the code:

int %main() {
entry:
        call fastcc void %test1( )
        call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
        ret int 0
}

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)


This currently compiles to:

        subl $16, %esp
        call _test5
        addl $12, %esp
        subl $16, %esp
        movl $_test5, (%esp)
        call _test6
        addl $12, %esp

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

We currently compile sign_extend_inreg into two shifts:

long foo(long X) {
   return (long)(signed char)X;
}

becomes:

_foo:
        movl 4(%esp), %eax
        shll $24, %eax
        sarl $24, %eax
        ret

This could be:

_foo:
        movsbl 4(%esp),%eax
        ret

//===---------------------------------------------------------------------===//

Consider the expansion of:

uint %test3(uint %X) {
        %tmp1 = rem uint %X, 255
        ret uint %tmp1
}

Currently it compiles to:

...
        movl $2155905153, %ecx
        movl 8(%esp), %esi
        movl %esi, %eax
        mull %ecx
...

This could be "reassociated" into:

        movl $2155905153, %eax
        movl 8(%esp), %ecx
        mull %ecx

to avoid the copy. In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction. I guess this has
to be done at isel time based on the #uses to mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary. In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
937  937 0x3d0a incl     %esi
3    3   0x3d0b cmpb     %bl, %dl
27   27  0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
  if ((p[0] == 1) & (p[1] == 2)) return 1;
  return 0;
}

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.
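
For example (an illustrative case, not from a reported bug):

#include <math.h>
long f(float x) { return lrintf(x); }

With SSE available this could be a single cvtss2si instead of a libcall,
assuming the default rounding mode is in effect.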

//===---------------------------------------------------------------------===//

Start using the flags more. For example, compile:

int add_zf(int *x, int y, int a, int b) {
     if ((*x += y) == 0)
          return a;
     else
          return b;
}

to:
        addl %esi, (%rdi)
        movl %edx, %eax
        cmovne %ecx, %eax
        ret
instead of:

_add_zf:
        addl (%rdi), %esi
        movl %esi, (%rdi)
        testl %esi, %esi
        cmove %edx, %ecx
        movl %ecx, %eax
        ret

and:

int add_zf(int *x, int y, int a, int b) {
     if ((*x + y) < 0)
          return a;
     else
          return b;
}

to:

add_zf:
        addl (%rdi), %esi
        movl %edx, %eax
        cmovns %ecx, %eax
        ret

instead of:

_add_zf:
        addl (%rdi), %esi
        testl %esi, %esi
        cmovs %edx, %ecx
        movl %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

This:
#include <math.h>
int foo(double X) { return isnan(X); }

compiles to (-m64):

_foo:
        pxor %xmm1, %xmm1
        ucomisd %xmm1, %xmm0
        setp %al
        movzbl %al, %eax
        ret

The pxor is not needed; we could compare the value against itself.

//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

_f:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        movl 8(%esp), %edx
        cmpl %edx, %ecx
        jne LBB1_2      #UnifiedReturnBlock
LBB1_1: #cond_true
        addl $2, %eax
        ret
LBB1_2: #UnifiedReturnBlock
        movl %ecx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        cmpl 8(%esp), %ecx
        sete %cl
        movzbl %cl, %ecx
        leal 1(%ecx,%eax), %eax
        ret

both of which are inferior to GCC's:

_f:
        movl 4(%esp), %edx
        leal 1(%edx), %eax
        addl $2, %edx
        cmpl 8(%esp), %eax
        cmove %edx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        addl $1, %eax
        xorl %edx, %edx
        cmpl 8(%esp), %eax
        sete %dl
        addl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne LBB1_1
        addl $12, %esp
        ret
LBB1_1:
        call L_abort$stub

It would be better to produce:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne L_abort$stub
        addl $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        subl $12, %esp
        call L_abort$stub

Both are useful in different situations. Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        pop %eax   # realign stack.
        call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec). For example, on X86-64, compile:

int foo(int A, int B) {
  return A+1;
}

to:

_foo:
        leal 1(%edi), %eax
        ret

instead of:

_foo:
        incl %edi
        movl %edi, %eax
        ret

Another example is:

;; X's live range extends beyond the shift, so the register allocator
;; cannot coalesce it with Y. Because of this, a copy needs to be
;; emitted before the shift to save the register value before it is
;; clobbered. However, this copy is not needed if the register
;; allocator turns the shift into an LEA. This also occurs for ADD.

; Check that the shift gets turned into an LEA.
; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
; RUN: not grep {mov E.X, E.X}

%G = external global int

int %test1(int %X, int %Y) {
        %Z = add int %X, %Y
        volatile store int %Y, int* %G
        volatile store int %Z, int* %G
        ret int %X
}

int %test2(int %X) {
        %Z = add int %X, 1  ;; inc
        volatile store int %Z, int* %G
        ret int %X
}

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl $7, %eax
        movsbl 4(%esp), %ecx
        subl %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub. In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl $7, %eax
        subl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

For code like:
phi (undef, x)

We get an implicit def on the undef side. If the phi is spilled, we then get:
implicitdef xmm1
store xmm1 -> stack

It should be possible to teach the x86 backend to "fold" the store into the
implicitdef, which just deletes the implicit def.

These instructions should go away:
#IMPLICIT_DEF %xmm1
movaps %xmm1, 192(%esp)
movaps %xmm1, 224(%esp)
movaps %xmm1, 176(%esp)

//===---------------------------------------------------------------------===//

This is a "commutable two-address" register coalescing deficiency:

define <4 x float> @test1(<4 x float> %V) {
entry:
        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
                              <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
        %add = add <4 x float> %tmp8, %V
        ret <4 x float> %add
}

this codegens to:

_test1:
        pshufd $27, %xmm0, %xmm1
        addps %xmm0, %xmm1
        movaps %xmm1, %xmm0
        ret

instead of:

_test1:
        pshufd $27, %xmm0, %xmm1
        addps %xmm1, %xmm0
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl %esi
        subl $4, %esp
...
and an epilog like this:
        addl $4, %esp
        popl %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp. Just don't pop
into any return registers :)

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches. We generate really poor code for:

double testf(double a) {
       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl $20, %esp
        pxor %xmm0, %xmm0
        movsd 24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp %al
        sete %cl
        testb %cl, %al
        jne LBB1_5      # UnifiedReturnBlock
LBB1_1: # cond_true


it would be better to replace the last four instructions with:

        jp LBB1_1
        je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd LCPI1_0(%rip), %xmm2
        cvtss2sd LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja LBB1_3       # cond_true
LBB1_2: # cond_true
        movapd %xmm3, %xmm2
LBB1_3: # cond_true
        movapd %xmm2, %xmm0
        ret

We should sink the load into xmm3 into the LBB1_2 block. This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
  #include <algorithm>
  inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
  { return std::make_pair(a + b, a + b < a); }
  bool no_overflow(unsigned a, unsigned b)
  { return !full_add(a, b).second; }

Should compile to:

_Z11no_overflowjj:
        addl %edi, %esi
        setae %al
        ret

on x86-64, not:

__Z11no_overflowjj:
        addl %edi, %esi
        cmpl %edi, %esi
        setae %al
        movzbl %al, %eax
        ret

//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. xor reg, reg is shorter than mov reg, #0.

//===---------------------------------------------------------------------===//

We aren't matching RMW instructions aggressively
enough. Here's a reduced testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
  %A = load i32* %huge_ptr              ; <i32> [#uses=1]
  %B = load i32* %target_ptr            ; <i32> [#uses=1]
  %C = or i32 %A, %B                    ; <i32> [#uses=1]
  store i32 %C, i32* %target_ptr
  ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
        movl (%rdi), %eax
        orl (%rsi), %eax
        movl %eax, (%rsi)
        ret

That should be something like:

_test:
        movl (%rdi), %eax
        orl %eax, (%rsi)
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
  %tmp231232 = sext i16 %tmp62 to i32           ; <i32> [#uses=1]
  %tmp233 = sub i32 32, %tmp231232              ; <i32> [#uses=1]
  %tmp245246 = sext i16 %tmp65 to i32           ; <i32> [#uses=1]
  %tmp252253 = sext i16 %tmp68 to i32           ; <i32> [#uses=1]
  %tmp254 = sub i32 32, %tmp252253              ; <i32> [#uses=1]
  %tmp553554 = bitcast i16* %tmp37 to i8*       ; <i8*> [#uses=2]
  %tmp583584 = sext i16 %tmp98 to i32           ; <i32> [#uses=1]
  %tmp585 = sub i32 32, %tmp583584              ; <i32> [#uses=1]
  %tmp614615 = sext i16 %tmp101 to i32          ; <i32> [#uses=1]
  %tmp621622 = sext i16 %tmp104 to i32          ; <i32> [#uses=1]
  %tmp623 = sub i32 32, %tmp621622              ; <i32> [#uses=1]
  br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl -68(%ebp), %eax
        movl $32, %ecx
        movl %ecx, -80(%ebp)
        subl %eax, -80(%ebp)
        movswl -52(%ebp), %eax
        movl %ecx, -84(%ebp)
        subl %eax, -84(%ebp)
        movswl -70(%ebp), %eax
        movl %ecx, -88(%ebp)
        subl %eax, -88(%ebp)
        movswl -50(%ebp), %eax
        subl %eax, %ecx
        movl %ecx, -76(%ebp)
        movswl -42(%ebp), %eax
        movl %eax, -92(%ebp)
        movswl -66(%ebp), %eax
        movl %eax, -96(%ebp)
        movw $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl. The above instructions could be:
        movl $32, -80(%ebp)
...
        movl $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
vice-versa).

//===---------------------------------------------------------------------===//

For this code:

cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
  %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]             ; <i32> [#uses=1]
  %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]          ; <i32> [#uses=2]
  %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]             ; <i32> [#uses=2]
  %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]  ; <i8*> [#uses=9]
  %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]  ; <i32> [#uses=4]
  %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]          ; <i32> [#uses=7]
  %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1        ; <i8*> [#uses=1]
  %tmp607 = load i8* %tmp606, align 1           ; <i8> [#uses=1]

We produce this:

LBB4_70:        # cond_next603
        movl -20(%ebp), %esi
        movl L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi

However, ICC caches this information before the loop and produces this:

        movl 88(%esp), %eax     #481.12

//===---------------------------------------------------------------------===//

This code:

  %tmp659 = icmp slt i16 %tmp654, 0             ; <i1> [#uses=1]
  br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw %cx, %cx
        movswl %cx, %esi
        jns LBB4_109    # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal. It
suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

rdar://5506677 - We compile this:

define i32 @foo(double %x) {
  %x14 = bitcast double %x to i64               ; <i64> [#uses=1]
  %tmp713 = trunc i64 %x14 to i32               ; <i32> [#uses=1]
  %tmp8 = and i32 %tmp713, 2147483647           ; <i32> [#uses=1]
  ret i32 %tmp8
}

to:

_foo:
        subl $12, %esp
        fldl 16(%esp)
        fstpl (%esp)
        movl $2147483647, %eax
        andl (%esp), %eax
        addl $12, %esp
        #FP_REG_KILL
        ret

It would be much better to eliminate the fldl/fstpl by folding the bitcast
into the load SDNode. That would give us:

_foo:
        movl $2147483647, %eax
        andl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

_compare:
        subl $12, %esp
        cmpl $0, 16(%esp)
        setne %al
        movzbw %al, %ax
        cmpl $1, 20(%esp)
        setg %cl
        movzbw %cl, %cx
        cmove %ax, %cx
        movw %cx, %ax
        testb $1, %al
        je LBB1_2       # cond_true

(also really horrible code on ppc). This is due to the expand code for 64-bit
compares. GCC produces multiple branches, which is much nicer:

_compare:
        pushl %ebp
        movl %esp, %ebp
        subl $8, %esp
        movl 8(%ebp), %eax
        movl 12(%ebp), %edx
        subl $1, %edx
        jg L5
L7:
        jl L4
        cmpl $0, %eax
        jbe L4
L5:

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) before moving them to actual stack
slots. This is done to prevent overwriting of parameters (see example
below) that might be used, since the arguments of the callee
overwrite the caller's arguments.

Example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
  int64 local = arg2 * 2;
  return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]      ->  [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Only push those arguments to the top of the stack that are actual
   parameters of the caller function and have no local value in the
   caller.

   In the above example local does not need to be pushed onto the top
   of the stack as it is definitely not a caller's function
   parameter.

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg1,arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg2,arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.


 Code for lowering directly onto the caller's arguments:
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand FramePtr;
+  SDOperand PtrOff;
+  SDOperand FIN;
+  int FI = 0;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+    ....
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      // create frame index
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+      FIN = DAG.getFrameIndex(FI, MVT::i32);
+      // store relative to framepointer
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
+    }
+  }
//===---------------------------------------------------------------------===//
Evan Cheng7f1ad6a2007-10-28 04:01:09 +00001432
1433main ()
1434{
1435 int i = 0;
1436 unsigned long int z = 0;
1437
1438 do {
1439 z -= 0x00004000;
1440 i++;
1441 if (i > 0x00040000)
1442 abort ();
1443 } while (z > 0);
1444 exit (0);
1445}
1446
1447gcc compiles this to:
1448
1449_main:
1450 subl $28, %esp
1451 xorl %eax, %eax
1452 jmp L2
1453L3:
1454 cmpl $262144, %eax
1455 je L10
1456L2:
1457 addl $1, %eax
1458 cmpl $262145, %eax
1459 jne L3
1460 call L_abort$stub
1461L10:
1462 movl $0, (%esp)
1463 call L_exit$stub
1464
1465llvm:
1466
1467_main:
1468 subl $12, %esp
1469 movl $1, %eax
1470 movl $16384, %ecx
1471LBB1_1: # bb
1472 cmpl $262145, %eax
1473 jge LBB1_4 # cond_true
1474LBB1_2: # cond_next
1475 incl %eax
1476 addl $4294950912, %ecx
1477 cmpl $16384, %ecx
1478 jne LBB1_1 # bb
1479LBB1_3: # bb11
1480 xorl %eax, %eax
1481 addl $12, %esp
1482 ret
1483LBB1_4: # cond_true
1484 call L_abort$stub
1485
14861. LSR should rewrite the first cmp with induction variable %ecx.
14872. DAG combiner should fold
1488 leal 1(%eax), %edx
1489 cmpl $262145, %edx
1490 =>
1491 cmpl $262144, %eax
1492
1493//===---------------------------------------------------------------------===//