//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Missing features:
  - Support for SSE4: http://www.intel.com/software/penryn
http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
  - support for 3DNow!
  - weird abis?

//===---------------------------------------------------------------------===//

CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case. We should investigate
why this isn't happening; it could have a significant impact on other important
cases for X86 as well.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

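For the common case where the quotient fits in 32 bits, a minimal sketch of the
sequence we would like is below; the stack offsets and the lack of an overflow
guard are illustrative assumptions, not what the legalizer must emit:

        movl 4(%esp), %eax      # low 32 bits of X
        movl 8(%esp), %edx      # high 32 bits of X
        divl 12(%esp)           # unsigned divide of EDX:EAX by Y; quotient in EAX
        ret

Note that divl raises #DE if the quotient does not fit in 32 bits, which is
exactly the overflow question above.
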
//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

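As a reminder of the kind of strength reduction meant here, small constant
multipliers can often be done with LEA/add instead of imull. A sketch
(illustrative only; the real cutoffs should come from the subtarget's multiply
latency):

        # x * 9, with x in %eax
        leal (%eax,%eax,8), %eax

        # x * 10
        leal (%eax,%eax,4), %eax        # 5*x
        addl %eax, %eax                 # 10*x
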
//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl %eax, %eax
        xorl %edx, %edx
        testb $32, %cl
        sete %al
        setne %dl
        sall %cl, %eax
        sall %cl, %edx

But that requires good 8-bit subreg support.

64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Compile this:
_Bool f(_Bool a) { return a!=1; }

into:
        movzbl %dil, %eax
        xorl $1, %eax
        ret

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

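For reference, a minimal sketch of the stall in question (register choices are
illustrative): a 16-bit write leaves the upper bits of the full register live,
so a later 32-bit read has to merge with them.

        movw 4(%esp), %ax       # partial write: only AX is updated
        addl %eax, %ecx         # 32-bit read must merge with the stale upper 16 bits

        movzwl 4(%esp), %eax    # promoted form: full 32-bit write, no merge
        addl %eax, %ecx
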
//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it though.

//===---------------------------------------------------------------------===//

Count leading zeros and count trailing zeros:

int clz(int X) { return __builtin_clz(X); }
int ctz(int X) { return __builtin_ctz(X); }

$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
clz:
        bsr %eax, DWORD PTR [%esp+4]
        xor %eax, 31
        ret
ctz:
        bsf %eax, DWORD PTR [%esp+4]
        ret

However, check that these are defined for 0 and 32. Our intrinsics are, GCC's
aren't.

Another example (use predsimplify to eliminate a select):

int foo (unsigned long j) {
 if (j)
   return __builtin_ffs (j) - 1;
 else
   return 0;
}

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing. We need to investigate this.

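For context, a sketch of the two calling sequences being compared (purely
illustrative; _foo and the constant argument are made up):

        # push-based argument setup (what icc appears to prefer)
        pushl $42
        call _foo
        addl $4, %esp

        # mov-into-preallocated-slot setup (what we currently emit)
        subl $4, %esp
        movl $42, (%esp)
        call _foo
        addl $4, %esp
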
//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some processor
flags.

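A minimal illustration of the substitution (a sketch; the choice should be
keyed off the subtarget):

        incl %eax               # 1 byte shorter, but a partial EFLAGS update on P4
        addl $1, %eax           # full flags update; preferred when tuning for P4

        decl %ecx
        subl $1, %ecx
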
//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.

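As a hypothetical illustration (not a testcase from the tree), for
int f(int x, int *p) { return x < *p; } with x in %edi and p in %esi, the
unfolded and folded forms would look like:

        movl (%esi), %eax       # load *p, then compare
        cmpl %eax, %edi
        setl %al                # x < *p

        cmpl %edi, (%esi)       # folded: compare reads memory directly
        setg %al                # predicate inverted: *p > x  ==  x < *p
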
//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to
        pmuludq (%eax), %xmm0
        movl 8(%esp), %eax
        movdqa (%eax), %xmm1
        pmulhuw %xmm0, %xmm1

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

On some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:

void setbit(int *target, int bit) {
   *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
   *target &= ~(1 << bit);
}

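A sketch of the sequences we would like for the two functions above (register
choices are illustrative, assuming the usual 32-bit cdecl argument area):

_setbit:
        movl 8(%esp), %ecx      # bit
        movl 4(%esp), %eax      # target
        btsl %ecx, (%eax)
        ret
_clearbit:
        movl 8(%esp), %ecx
        movl 4(%esp), %eax
        btrl %ecx, (%eax)
        ret
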
//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register. It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
get this:

int %test1(int %X) {
        %Y = div int %X, 8
        ret int %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl 4(%esp), %eax
        cmpl $-1, %eax
        leal 7(%eax), %ecx
        cmovle %ecx, %eax
        sarl $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

The first BB of this code:

declare bool %foo()
int %bar() {
        %V = call bool %foo()
        br bool %V, label %T, label %F
T:
        ret int 1
F:
        call bool %foo()
        ret int 12
}

compiles to:

_bar:
        subl $12, %esp
        call L_foo$stub
        xorb $1, %al
        testb %al, %al
        jne LBB_bar_2   # F

It would be better to emit "cmp %al, 1" than an xor and test.

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)

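One plausible target, since multiplying by copysign(1.0, y) only flips the sign
of x when y is negative: xor the sign bits of y and z into x. A sketch for
doubles in %xmm0-%xmm2 on x86-64; LCPI_signbit is a hypothetical constant-pool
entry holding 0x8000000000000000, and NaN-sign subtleties are ignored:

        movsd LCPI_signbit(%rip), %xmm3
        andpd %xmm3, %xmm1      # sign bit of y
        andpd %xmm3, %xmm2      # sign bit of z
        xorpd %xmm2, %xmm1      # combined sign flip
        xorpd %xmm1, %xmm0      # apply it to x
        ret
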
//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

%X = weak global int 0

void %foo(int %N) {
        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

no_exit:
        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

return:
        ret void
}

compiles into:

        .text
        .align 4
        .globl _foo
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader
        xorl %ecx, %ecx
LBB_foo_2:      # no_exit
        movl L_X$non_lazy_ptr, %edx
        movl %ecx, (%edx)
        incl %ecx
        cmpl %eax, %ecx
        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit
LBB_foo_4:      # return
        ret

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectDAG represent the whole function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//

We are generating far worse code than gcc:

volatile short X, Y;

void foo(int N) {
  int i;
  for (i = 0; i < N; i++) { X = i; Y = i*4; }
}

LBB1_1: # entry.bb_crit_edge
        xorl %ecx, %ecx
        xorw %dx, %dx
LBB1_2: # bb
        movl L_X$non_lazy_ptr, %esi
        movw %cx, (%esi)
        movl L_Y$non_lazy_ptr, %esi
        movw %dx, (%esi)
        addw $4, %dx
        incl %ecx
        cmpl %eax, %ecx
        jne LBB1_2      # bb

vs.

        xorl %edx, %edx
        movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
        movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
L4:
        movw %dx, (%esi)
        leal 0(,%edx,4), %eax
        movw %ax, (%ecx)
        addl $1, %edx
        cmpl %edx, %edi
        jne L4

This is due to the lack of post regalloc LICM.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes, e.g. FR32 /
FR64 to VR128.

//===---------------------------------------------------------------------===//

mov $reg, 48(%esp)
...
leal 48(%esp), %eax
mov %eax, (%esp)
call _foo

Obviously it would have been better for the first mov (or any op) to store
directly to (%esp) if there are no other uses.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
    if (_mm_comige_ss(*A, *B))
        return 3;
    else
        return 4;
}

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        movl 4(%esp), %eax
        movaps (%eax), %xmm1
        comiss %xmm0, %xmm1
        setae %al
        movzbl %al, %ecx
        movl $3, %eax
        movl $4, %edx
        cmpl $0, %ecx
        cmove %edx, %eax
        ret

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

If shorter, we should use things like:
movzwl %ax, %eax
instead of:
andl $65535, %EAX

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//

Consider this:

typedef struct pair { float A, B; } pair;
void pairtest(pair P, float *FP) {
        *FP = P.A+P.B;
}

We currently generate this code with llvmgcc4:

_pairtest:
        movl 8(%esp), %eax
        movl 4(%esp), %ecx
        movd %eax, %xmm0
        movd %ecx, %xmm1
        addss %xmm0, %xmm1
        movl 12(%esp), %eax
        movss %xmm1, (%eax)
        ret

we should be able to generate:
_pairtest:
        movss 4(%esp), %xmm0
        movl 12(%esp), %eax
        addss 8(%esp), %xmm0
        movss %xmm0, (%eax)
        ret

The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
integer chunks. It does this so that structs like {short,short} are passed in
a single 32-bit integer stack slot. We should handle the safe cases above much
nicer, while still handling the hard cases.

While true in general, in this specific case we could do better by promoting
load int + bitcast to float -> load float. This basically needs alignment info;
the code is already implemented (but disabled) in the dag combiner.

//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

void %bar() {
        %tmp = load int (int)** %foo
        %tmp = tail call int %tmp( int 3 )
        ret void
}

_bar:
        subl $12, %esp
        movl L_foo$non_lazy_ptr, %eax
        movl (%eax), %eax
        call *%eax
        addl $12, %esp
        ret

The current isel scheme will not allow the load to be folded in the call since
the load's chain result is read by the callseq_start.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:
        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//
Implement CTTZ, CTLZ with bsf and bsr. GCC produces:

int ctz_(unsigned X) { return __builtin_ctz(X); }
int clz_(unsigned X) { return __builtin_clz(X); }
int ffs_(unsigned X) { return __builtin_ffs(X); }

_ctz_:
        bsfl 4(%esp), %eax
        ret
_clz_:
        bsrl 4(%esp), %eax
        xorl $31, %eax
        ret
_ffs_:
        movl $-1, %edx
        bsfl 4(%esp), %eax
        cmove %edx, %eax
        addl $1, %eax
        ret

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

int %foo(int* %a, int %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
        %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
        %tmp2 = getelementptr int* %a, int %x.0.0
        %tmp3 = load int* %tmp2         ; <int> [#uses=1]
        %tmp5 = add int %t_addr.0.0, %x.0.0             ; <int> [#uses=1]
        %tmp7 = add int %tmp5, %tmp3            ; <int> [#uses=2]
        %tmp9 = add int %x.0.0, 1               ; <int> [#uses=2]
        %tmp = setgt int %tmp9, 39              ; <bool> [#uses=1]
        br bool %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret int %tmp7
}

is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

Codegen:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b++;
  return b;
}


as:

or eax, 2
cmp eax, 6
jz label

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b". For example, instead of:

int G;
void f(int X, int Y) {
  G = X < 0 ? 14 : 13;
}

compiling to:

_f:
        movl $14, %eax
        movl $13, %ecx
        movl 4(%esp), %edx
        testl %edx, %edx
        cmovl %eax, %ecx
        movl %ecx, _G
        ret

it could be:
_f:
        movl 4(%esp), %eax
        sarl $31, %eax
        notl %eax
        addl $14, %eax
        movl %eax, _G
        ret

etc.

Another is:
int usesbb(unsigned int a, unsigned int b) {
  return (a < b ? -1 : 0);
}
to:
_usesbb:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        sbbl %eax, %eax
        ret

instead of:
_usesbb:
        xorl %eax, %eax
        movl 8(%esp), %ecx
        cmpl %ecx, 4(%esp)
        movl $4294967295, %ecx
        cmovb %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

Currently we don't have elimination of redundant stack manipulations. Consider
the code:

int %main() {
entry:
        call fastcc void %test1( )
        call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
        ret int 0
}

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)


This currently compiles to:

        subl $16, %esp
        call _test5
        addl $12, %esp
        subl $16, %esp
        movl $_test5, (%esp)
        call _test6
        addl $12, %esp

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

We currently compile sign_extend_inreg into two shifts:

long foo(long X) {
  return (long)(signed char)X;
}

becomes:

_foo:
        movl 4(%esp), %eax
        shll $24, %eax
        sarl $24, %eax
        ret

This could be:

_foo:
        movsbl 4(%esp),%eax
        ret

//===---------------------------------------------------------------------===//

Consider the expansion of:

uint %test3(uint %X) {
        %tmp1 = rem uint %X, 255
        ret uint %tmp1
}

Currently it compiles to:

...
        movl $2155905153, %ecx
        movl 8(%esp), %esi
        movl %esi, %eax
        mull %ecx
...

This could be "reassociated" into:

        movl $2155905153, %eax
        movl 8(%esp), %ecx
        mull %ecx

to avoid the copy. In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction. I guess this has
to be done at isel time based on the #uses to mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary. In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
942  942 0x3d03 movl %dh, (1809(%esp, %esi)
937  937 0x3d0a incl %esi
3    3   0x3d0b cmpb %bl, %dl
27   27  0x3d0d jnz 0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
  if ((p[0] == 1) & (p[1] == 2)) return 1;
  return 0;
}

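A sketch of the combined form, assuming p is in %eax and a little-endian,
unaligned-load-friendly target (so the byte pair (1, 2) is the 16-bit value
0x0201); alignment and endianness are exactly what the combine has to check:

        movzwl (%eax), %ecx     # load p[0] and p[1] together
        xorl %eax, %eax
        cmpw $0x0201, %cx
        sete %al
        ret
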
//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

Start using the flags more. For example, compile:

int add_zf(int *x, int y, int a, int b) {
     if ((*x += y) == 0)
          return a;
     else
          return b;
}

to:
        addl %esi, (%rdi)
        movl %edx, %eax
        cmovne %ecx, %eax
        ret
instead of:

_add_zf:
        addl (%rdi), %esi
        movl %esi, (%rdi)
        testl %esi, %esi
        cmove %edx, %ecx
        movl %ecx, %eax
        ret

and:

int add_zf(int *x, int y, int a, int b) {
     if ((*x + y) < 0)
          return a;
     else
          return b;
}

to:

add_zf:
        addl (%rdi), %esi
        movl %edx, %eax
        cmovns %ecx, %eax
        ret

instead of:

_add_zf:
        addl (%rdi), %esi
        testl %esi, %esi
        cmovs %edx, %ecx
        movl %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

This:
#include <math.h>
int foo(double X) { return isnan(X); }

compiles to (-m64):

_foo:
        pxor %xmm1, %xmm1
        ucomisd %xmm1, %xmm0
        setp %al
        movzbl %al, %eax
        ret

The pxor is not needed; we could compare the value against itself.

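That is, something like this sketch (same calling convention as above):

        ucomisd %xmm0, %xmm0    # unordered only when X is NaN
        setp %al
        movzbl %al, %eax
        ret
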
//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

_f:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        movl 8(%esp), %edx
        cmpl %edx, %ecx
        jne LBB1_2      #UnifiedReturnBlock
LBB1_1: #cond_true
        addl $2, %eax
        ret
LBB1_2: #UnifiedReturnBlock
        movl %ecx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        cmpl 8(%esp), %ecx
        sete %cl
        movzbl %cl, %ecx
        leal 1(%ecx,%eax), %eax
        ret

both of which are inferior to GCC's:

_f:
        movl 4(%esp), %edx
        leal 1(%edx), %eax
        addl $2, %edx
        cmpl 8(%esp), %eax
        cmove %edx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        addl $1, %eax
        xorl %edx, %edx
        cmpl 8(%esp), %eax
        sete %dl
        addl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne LBB1_1
        addl $12, %esp
        ret
LBB1_1:
        call L_abort$stub

It would be better to produce:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne L_abort$stub
        addl $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        subl $12, %esp
        call L_abort$stub

Both are useful in different situations. Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        pop %eax   # realign stack.
        call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec). For example, on X86-64, compile:

int foo(int A, int B) {
  return A+1;
}

to:

_foo:
        leal 1(%edi), %eax
        ret

instead of:

_foo:
        incl %edi
        movl %edi, %eax
        ret

Another example is:

;; X's live range extends beyond the shift, so the register allocator
;; cannot coalesce it with Y. Because of this, a copy needs to be
;; emitted before the shift to save the register value before it is
;; clobbered. However, this copy is not needed if the register
;; allocator turns the shift into an LEA. This also occurs for ADD.

; Check that the shift gets turned into an LEA.
; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
; RUN:   not grep {mov E.X, E.X}

%G = external global int

int %test1(int %X, int %Y) {
        %Z = add int %X, %Y
        volatile store int %Y, int* %G
        volatile store int %Z, int* %G
        ret int %X
}

int %test2(int %X) {
        %Z = add int %X, 1  ;; inc
        volatile store int %Z, int* %G
        ret int %X
}

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl $7, %eax
        movsbl 4(%esp), %ecx
        subl %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub. In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl $7, %eax
        subl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

For code like:
phi (undef, x)

We get an implicit def on the undef side. If the phi is spilled, we then get:
implicitdef xmm1
store xmm1 -> stack

It should be possible to teach the x86 backend to "fold" the store into the
implicitdef, which just deletes the implicit def.

These instructions should go away:
#IMPLICIT_DEF %xmm1
movaps %xmm1, 192(%esp)
movaps %xmm1, 224(%esp)
movaps %xmm1, 176(%esp)

//===---------------------------------------------------------------------===//

This is a "commutable two-address" register coalescing deficiency:

define <4 x float> @test1(<4 x float> %V) {
entry:
        %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
                              <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
        %add = add <4 x float> %tmp8, %V
        ret <4 x float> %add
}

this codegens to:

_test1:
        pshufd $27, %xmm0, %xmm1
        addps %xmm0, %xmm1
        movaps %xmm1, %xmm0
        ret

instead of:

_test1:
        pshufd $27, %xmm0, %xmm1
        addps %xmm1, %xmm0
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl %esi
        subl $4, %esp
...
and an epilog like this:
        addl $4, %esp
        popl %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp. Just don't pop
into any return registers :)

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches. We generate really poor code for:

double testf(double a) {
       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl $20, %esp
        pxor %xmm0, %xmm0
        movsd 24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp %al
        sete %cl
        testb %cl, %al
        jne LBB1_5      # UnifiedReturnBlock
LBB1_1: # cond_true


it would be better to replace the last four instructions with:

        jp LBB1_1
        je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd LCPI1_0(%rip), %xmm2
        cvtss2sd LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja LBB1_3       # cond_true
LBB1_2: # cond_true
        movapd %xmm3, %xmm2
LBB1_3: # cond_true
        movapd %xmm2, %xmm0
        ret

We should sink the load into xmm3 into the LBB1_2 block. This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:

_Z11no_overflowjj:
        addl %edi, %esi
        setae %al
        ret

on x86-64, not:

__Z11no_overflowjj:
        addl %edi, %esi
        cmpl %edi, %esi
        setae %al
        movzbl %al, %eax
        ret

//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. xor reg, reg is shorter than mov reg, #0.

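For example (a sketch; encoding sizes quoted are for 32-bit mode):

        xorl %eax, %eax         # 2 bytes, but clobbers EFLAGS
        movl $0, %eax           # 5 bytes, leaves EFLAGS alone
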
//===---------------------------------------------------------------------===//

We aren't matching RMW instructions aggressively
enough. Here's a reduced testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
        %A = load i32* %huge_ptr                ; <i32> [#uses=1]
        %B = load i32* %target_ptr              ; <i32> [#uses=1]
        %C = or i32 %A, %B              ; <i32> [#uses=1]
        store i32 %C, i32* %target_ptr
        ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
        movl (%rdi), %eax
        orl (%rsi), %eax
        movl %eax, (%rsi)
        ret

That should be something like:

_test:
        movl (%rdi), %eax
        orl %eax, (%rsi)
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
        %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
        %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
        %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
        %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
        %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
        %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
        %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
        %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
        %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
        %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
        %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
        br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl -68(%ebp), %eax
        movl $32, %ecx
        movl %ecx, -80(%ebp)
        subl %eax, -80(%ebp)
        movswl -52(%ebp), %eax
        movl %ecx, -84(%ebp)
        subl %eax, -84(%ebp)
        movswl -70(%ebp), %eax
        movl %ecx, -88(%ebp)
        subl %eax, -88(%ebp)
        movswl -50(%ebp), %eax
        subl %eax, %ecx
        movl %ecx, -76(%ebp)
        movswl -42(%ebp), %eax
        movl %eax, -92(%ebp)
        movswl -66(%ebp), %eax
        movl %eax, -96(%ebp)
        movw $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl. The above instructions could be:
        movl $32, -80(%ebp)
...
        movl $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
vice-versa).

//===---------------------------------------------------------------------===//

For this code:

cond_next603:           ; preds = %bb493, %cond_true336, %cond_next599
        %v.21050.1 = phi i32 [ %v.21050.0, %cond_next599 ], [ %tmp344, %cond_true336 ], [ %v.2, %bb493 ]               ; <i32> [#uses=1]
        %maxz.21051.1 = phi i32 [ %maxz.21051.0, %cond_next599 ], [ 0, %cond_true336 ], [ %maxz.2, %bb493 ]            ; <i32> [#uses=2]
        %cnt.01055.1 = phi i32 [ %cnt.01055.0, %cond_next599 ], [ 0, %cond_true336 ], [ %cnt.0, %bb493 ]               ; <i32> [#uses=2]
        %byteptr.9 = phi i8* [ %byteptr.12, %cond_next599 ], [ %byteptr.0, %cond_true336 ], [ %byteptr.10, %bb493 ]            ; <i8*> [#uses=9]
        %bitptr.6 = phi i32 [ %tmp5571104.1, %cond_next599 ], [ %tmp4921049, %cond_true336 ], [ %bitptr.7, %bb493 ]            ; <i32> [#uses=4]
        %source.5 = phi i32 [ %tmp602, %cond_next599 ], [ %source.0, %cond_true336 ], [ %source.6, %bb493 ]            ; <i32> [#uses=7]
        %tmp606 = getelementptr %struct.const_tables* @tables, i32 0, i32 0, i32 %cnt.01055.1          ; <i8*> [#uses=1]
        %tmp607 = load i8* %tmp606, align 1             ; <i8> [#uses=1]

We produce this:

LBB4_70:        # cond_next603
        movl -20(%ebp), %esi
        movl L_tables$non_lazy_ptr-"L4$pb"(%esi), %esi

However, ICC caches this information before the loop and produces this:

        movl 88(%esp), %eax             #481.12

//===---------------------------------------------------------------------===//

This code:

        %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
        br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw %cx, %cx
        movswl %cx, %esi
        jns LBB4_109    # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal. It
suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

rdar://5506677 - We compile this:

define i32 @foo(double %x) {
        %x14 = bitcast double %x to i64         ; <i64> [#uses=1]
        %tmp713 = trunc i64 %x14 to i32         ; <i32> [#uses=1]
        %tmp8 = and i32 %tmp713, 2147483647             ; <i32> [#uses=1]
        ret i32 %tmp8
}

to:

_foo:
        subl $12, %esp
        fldl 16(%esp)
        fstpl (%esp)
        movl $2147483647, %eax
        andl (%esp), %eax
        addl $12, %esp
        #FP_REG_KILL
        ret

It would be much better to eliminate the fldl/fstpl by folding the bitcast
into the load SDNode. That would give us:

_foo:
        movl $2147483647, %eax
        andl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

_compare:
        subl $12, %esp
        cmpl $0, 16(%esp)
        setne %al
        movzbw %al, %ax
        cmpl $1, 20(%esp)
        setg %cl
        movzbw %cl, %cx
        cmove %ax, %cx
        movw %cx, %ax
        testb $1, %al
        je LBB1_2       # cond_true

(also really horrible code on ppc). This is due to the expand code for 64-bit
compares. GCC produces multiple branches, which is much nicer:

_compare:
        pushl %ebp
        movl %esp, %ebp
        subl $8, %esp
        movl 8(%ebp), %eax
        movl 12(%ebp), %edx
        subl $1, %edx
        jg L5
L7:
        jl L4
        cmpl $0, %eax
        jbe L4
L5:

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) before moving them to actual stack
slots. This is done to prevent overwriting of parameters (see example
below) that might be used, since the callee's arguments overwrite the
caller's arguments.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
  int64 local = arg2 * 2;
  return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]     ->   [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Only push those arguments to the top of the stack that are actual
   parameters of the caller function and have no local value in the
   caller.

   In the above example local does not need to be pushed onto the top
   of the stack as it is definitely not a caller's function
   parameter.

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg1,arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg2,arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.


   Code for lowering directly onto the caller's arguments:
+  SmallVector<std::pair<unsigned, SDOperand>, 8> RegsToPass;
+  SmallVector<SDOperand, 8> MemOpChains;
+
+  SDOperand FramePtr;
+  SDOperand PtrOff;
+  SDOperand FIN;
+  int FI = 0;
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    CCValAssign &VA = ArgLocs[i];
+    SDOperand Arg = Op.getOperand(5+2*VA.getValNo());
+
+    ....
+
+    if (VA.isRegLoc()) {
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
+      // create frame index
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (MVT::getSizeInBits(VA.getLocVT())+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset);
+      FIN = DAG.getFrameIndex(FI, MVT::i32);
+      // store relative to framepointer
+      MemOpChains.push_back(DAG.getStore(Chain, Arg, FIN, NULL, 0));
+    }
+  }
//===---------------------------------------------------------------------===//

main ()
{
  int i = 0;
  unsigned long int z = 0;

  do {
    z -= 0x00004000;
    i++;
    if (i > 0x00040000)
      abort ();
  } while (z > 0);
  exit (0);
}

gcc compiles this to:

_main:
        subl $28, %esp
        xorl %eax, %eax
        jmp L2
L3:
        cmpl $262144, %eax
        je L10
L2:
        addl $1, %eax
        cmpl $262145, %eax
        jne L3
        call L_abort$stub
L10:
        movl $0, (%esp)
        call L_exit$stub

llvm:

_main:
        subl $12, %esp
        movl $1, %eax
        movl $16384, %ecx
LBB1_1: # bb
        cmpl $262145, %eax
        jge LBB1_4      # cond_true
LBB1_2: # cond_next
        incl %eax
        addl $4294950912, %ecx
        cmpl $16384, %ecx
        jne LBB1_1      # bb
LBB1_3: # bb11
        xorl %eax, %eax
        addl $12, %esp
        ret
LBB1_4: # cond_true
        call L_abort$stub

1. LSR should rewrite the first cmp with induction variable %ecx.
2. DAG combiner should fold
        leal 1(%eax), %edx
        cmpl $262145, %edx
   =>
        cmpl $262144, %eax

//===---------------------------------------------------------------------===//
1513//===---------------------------------------------------------------------===//