//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Missing features:
  - Support for SSE4: http://www.intel.com/software/penryn
    http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
  - Support for 3DNow!
  - Weird ABIs?
10
11//===---------------------------------------------------------------------===//
12
Add MUL2U and MUL2S nodes to represent a multiply that returns both the
Hi and Lo parts (a combination of MUL and MULH[SU] in one node). Add these to
X86 and make the DAG combiner produce them when needed. This will eliminate one
imul from the code generated for:
17
18long long test(long long X, long long Y) { return X*Y; }
19
20by using the EAX result from the mul. We should add a similar node for
21DIVREM.
22
Another case is:
24
25long long test(int X, int Y) { return (long long)X*Y; }
26
27... which should only be one imul instruction.
28
29or:
30
31unsigned long long int t2(unsigned int a, unsigned int b) {
32 return (unsigned long long)a * b;
33}
34
35... which should be one mul instruction.
36
37
38This can be done with a custom expander, but it would be nice to move this to
39generic code.
40
41//===---------------------------------------------------------------------===//
42
CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case. We should investigate
why this isn't happening; it could have significant impact on other important
cases for X86 as well.
48
49//===---------------------------------------------------------------------===//
50
51This should be one DIV/IDIV instruction, not a libcall:
52
53unsigned test(unsigned long long X, unsigned Y) {
54 return X/Y;
55}
56
57This can be done trivially with a custom legalizer. What about overflow
58though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
59
60//===---------------------------------------------------------------------===//
61
62Improvements to the multiply -> shift/add algorithm:
63http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
64
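For reference, a minimal C sketch of the kind of decomposition being discussed
(the constant 9 is just illustrative, not taken from the patch):

int mul9(int x) {
  return (x << 3) + x;   /* x * 9 rewritten as shift + add */
}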
65//===---------------------------------------------------------------------===//
66
67Improve code like this (occurs fairly frequently, e.g. in LLVM):
68long long foo(int x) { return 1LL << x; }
69
70http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
71http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
72http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
73
74Another useful one would be ~0ULL >> X and ~0ULL << X.
75
76One better solution for 1LL << x is:
77 xorl %eax, %eax
78 xorl %edx, %edx
79 testb $32, %cl
80 sete %al
81 setne %dl
82 sall %cl, %eax
83 sall %cl, %edx
84
85But that requires good 8-bit subreg support.
86
8764-bit shifts (in general) expand to really bad code. Instead of using
88cmovs, we should expand to a conditional branch like GCC produces.
89
90//===---------------------------------------------------------------------===//
91
92Compile this:
93_Bool f(_Bool a) { return a!=1; }
94
95into:
96 movzbl %dil, %eax
97 xorl $1, %eax
98 ret
99
100//===---------------------------------------------------------------------===//
101
102Some isel ideas:
103
1. Dynamic programming based approach when compile time is not an
   issue.
1062. Code duplication (addressing mode) during isel.
1073. Other ideas from "Register-Sensitive Selection, Duplication, and
108 Sequencing of Instructions".
1094. Scheduling for reduced register pressure. E.g. "Minimum Register
110 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
111 and other related papers.
112 http://citeseer.ist.psu.edu/govindarajan01minimum.html
113
114//===---------------------------------------------------------------------===//
115
116Should we promote i16 to i32 to avoid partial register update stalls?
117
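A hypothetical example of code that would hit this: the 16-bit add writes only
the low half of the register, so a later read of the full register can stall.

unsigned short inc16(unsigned short x) {
  return (unsigned short)(x + 1);   /* candidate for promotion to i32 */
}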
118//===---------------------------------------------------------------------===//
119
Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post-register allocation.
122
123//===---------------------------------------------------------------------===//
124
125Count leading zeros and count trailing zeros:
126
127int clz(int X) { return __builtin_clz(X); }
128int ctz(int X) { return __builtin_ctz(X); }
129
130$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
131clz:
132 bsr %eax, DWORD PTR [%esp+4]
133 xor %eax, 31
134 ret
135ctz:
136 bsf %eax, DWORD PTR [%esp+4]
137 ret
138
However, check that these are defined for 0 and 32. Our intrinsics are; GCC's
aren't.
141
142Another example (use predsimplify to eliminate a select):
143
144int foo (unsigned long j) {
145 if (j)
146 return __builtin_ffs (j) - 1;
147 else
148 return 0;
149}
150
151//===---------------------------------------------------------------------===//
152
It appears ICC uses push for parameter passing. Need to investigate.
154
155//===---------------------------------------------------------------------===//
156
157Only use inc/neg/not instructions on processors where they are faster than
158add/sub/xor. They are slower on the P4 due to only updating some processor
159flags.
160
161//===---------------------------------------------------------------------===//
162
The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The DAG combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
168
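A hypothetical C-level case where the load ends up on the non-folded side;
commuting it into the RHS requires inverting the condition (e.g. setlt ->
setgt):

int cmp_load(const int *p, int x) {
  return *p < x;   /* (setlt (load p), x): the load is on the LHS */
}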
169//===---------------------------------------------------------------------===//
170
171How about intrinsics? An example is:
172 *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
173
174compiles to
175 pmuludq (%eax), %xmm0
176 movl 8(%esp), %eax
177 movdqa (%eax), %xmm1
178 pmulhuw %xmm0, %xmm1
179
The transformation probably requires an X86-specific pass or a target-specific
DAG combiner hook.
182
183//===---------------------------------------------------------------------===//
184
185In many cases, LLVM generates code like this:
186
187_test:
188 movl 8(%esp), %eax
189 cmpl %eax, 4(%esp)
190 setl %al
191 movzbl %al, %eax
192 ret
193
On some processors (which ones?), it is more efficient to do this:
195
196_test:
197 movl 8(%esp), %ebx
198 xor %eax, %eax
199 cmpl %ebx, 4(%esp)
200 setl %al
201 ret
202
203Doing this correctly is tricky though, as the xor clobbers the flags.
204
205//===---------------------------------------------------------------------===//
206
207We should generate bts/btr/etc instructions on targets where they are cheap or
208when codesize is important. e.g., for:
209
210void setbit(int *target, int bit) {
211 *target |= (1 << bit);
212}
213void clearbit(int *target, int bit) {
214 *target &= ~(1 << bit);
215}
216
217//===---------------------------------------------------------------------===//
218
219Instead of the following for memset char*, 1, 10:
220
221 movl $16843009, 4(%edx)
222 movl $16843009, (%edx)
223 movw $257, 8(%edx)
224
225It might be better to generate
226
227 movl $16843009, %eax
228 movl %eax, 4(%edx)
229 movl %eax, (%edx)
        movw %ax, 8(%edx)
231
232when we can spare a register. It reduces code size.
233
234//===---------------------------------------------------------------------===//
235
236Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
237get this:
238
239int %test1(int %X) {
240 %Y = div int %X, 8
241 ret int %Y
242}
243
244_test1:
245 movl 4(%esp), %eax
246 movl %eax, %ecx
247 sarl $31, %ecx
248 shrl $29, %ecx
249 addl %ecx, %eax
250 sarl $3, %eax
251 ret
252
253GCC knows several different ways to codegen it, one of which is this:
254
255_test1:
256 movl 4(%esp), %eax
257 cmpl $-1, %eax
258 leal 7(%eax), %ecx
259 cmovle %ecx, %eax
260 sarl $3, %eax
261 ret
262
263which is probably slower, but it's interesting at least :)
264
265//===---------------------------------------------------------------------===//
266
267The first BB of this code:
268
269declare bool %foo()
270int %bar() {
271 %V = call bool %foo()
272 br bool %V, label %T, label %F
273T:
274 ret int 1
275F:
276 call bool %foo()
277 ret int 12
278}
279
280compiles to:
281
282_bar:
283 subl $12, %esp
284 call L_foo$stub
285 xorb $1, %al
286 testb %al, %al
287 jne LBB_bar_2 # F
288
It would be better to emit "cmp %al, 1" than an xor and test.
290
291//===---------------------------------------------------------------------===//
292
293Enable X86InstrInfo::convertToThreeAddress().
294
295//===---------------------------------------------------------------------===//
296
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)
301
302//===---------------------------------------------------------------------===//
303
304Optimize this into something reasonable:
305 x * copysign(1.0, y) * copysign(1.0, z)
306
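A scalar C sketch of why this is cheap (not the backend's actual lowering;
assumes IEEE-754 doubles): the two multiplies can only flip the sign of x, so
xor'ing the sign bits of y and z into x gives the same result for ordinary
values:

#include <stdint.h>
#include <string.h>

double mul_by_signs(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  memcpy(&zb, &z, sizeof zb);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;   /* xor in sign(y) ^ sign(z) */
  memcpy(&x, &xb, sizeof x);
  return x;
}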
307//===---------------------------------------------------------------------===//
308
309Optimize copysign(x, *y) to use an integer load from y.
310
311//===---------------------------------------------------------------------===//
312
313%X = weak global int 0
314
315void %foo(int %N) {
316 %N = cast int %N to uint
317 %tmp.24 = setgt int %N, 0
318 br bool %tmp.24, label %no_exit, label %return
319
320no_exit:
321 %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
322 %i.0.0 = cast uint %indvar to int
323 volatile store int %i.0.0, int* %X
324 %indvar.next = add uint %indvar, 1
325 %exitcond = seteq uint %indvar.next, %N
326 br bool %exitcond, label %return, label %no_exit
327
328return:
329 ret void
330}
331
332compiles into:
333
334 .text
335 .align 4
336 .globl _foo
337_foo:
338 movl 4(%esp), %eax
339 cmpl $1, %eax
340 jl LBB_foo_4 # return
341LBB_foo_1: # no_exit.preheader
342 xorl %ecx, %ecx
343LBB_foo_2: # no_exit
344 movl L_X$non_lazy_ptr, %edx
345 movl %ecx, (%edx)
346 incl %ecx
347 cmpl %eax, %ecx
348 jne LBB_foo_2 # no_exit
349LBB_foo_3: # return.loopexit
350LBB_foo_4: # return
351 ret
352
We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectionDAG represent the whole function.
356
357//===---------------------------------------------------------------------===//
358
359The following tests perform worse with LSR:
360
361lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
362
363//===---------------------------------------------------------------------===//
364
365We are generating far worse code than gcc:
366
367volatile short X, Y;
368
369void foo(int N) {
370 int i;
371 for (i = 0; i < N; i++) { X = i; Y = i*4; }
372}
373
374LBB1_1: #bb.preheader
375 xorl %ecx, %ecx
376 xorw %dx, %dx
377LBB1_2: #bb
378 movl L_X$non_lazy_ptr, %esi
379 movw %dx, (%esi)
380 movw %dx, %si
381 shlw $2, %si
382 movl L_Y$non_lazy_ptr, %edi
383 movw %si, (%edi)
384 incl %ecx
385 incw %dx
386 cmpl %eax, %ecx
387 jne LBB1_2 #bb
388
389vs.
390
391 xorl %edx, %edx
392 movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
393 movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
394L4:
395 movw %dx, (%esi)
396 leal 0(,%edx,4), %eax
397 movw %ax, (%ecx)
398 addl $1, %edx
399 cmpl %edx, %edi
400 jne L4
401
There are two issues:

1. Lack of post regalloc LICM.
2. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
   the cast would be free.
407
408//===---------------------------------------------------------------------===//
409
410Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
411FR64 to VR128.
412
413//===---------------------------------------------------------------------===//
414
415mov $reg, 48(%esp)
416...
417leal 48(%esp), %eax
418mov %eax, (%esp)
419call _foo
420
Obviously it would have been better for the first mov (or any op) to store
directly to %esp[0] if there are no other uses.
423
424//===---------------------------------------------------------------------===//
425
426Adding to the list of cmp / test poor codegen issues:
427
428int test(__m128 *A, __m128 *B) {
429 if (_mm_comige_ss(*A, *B))
430 return 3;
431 else
432 return 4;
433}
434
435_test:
436 movl 8(%esp), %eax
437 movaps (%eax), %xmm0
438 movl 4(%esp), %eax
439 movaps (%eax), %xmm1
440 comiss %xmm0, %xmm1
441 setae %al
442 movzbl %al, %ecx
443 movl $3, %eax
444 movl $4, %edx
445 cmpl $0, %ecx
446 cmove %edx, %eax
447 ret
448
Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value, so an any_extend (which becomes a zero extend) is added.
453
454We probably need some kind of target DAG combine hook to fix this.
455
456//===---------------------------------------------------------------------===//
457
458We generate significantly worse code for this than GCC:
459http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
460http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
461
462There is also one case we do worse on PPC.
463
464//===---------------------------------------------------------------------===//
465
466If shorter, we should use things like:
467movzwl %ax, %eax
468instead of:
469andl $65535, %EAX
470
471The former can also be used when the two-addressy nature of the 'and' would
472require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
473
474//===---------------------------------------------------------------------===//
475
Consider this:
477
478typedef struct pair { float A, B; } pair;
479void pairtest(pair P, float *FP) {
480 *FP = P.A+P.B;
481}
482
483We currently generate this code with llvmgcc4:
484
485_pairtest:
486 movl 8(%esp), %eax
487 movl 4(%esp), %ecx
488 movd %eax, %xmm0
489 movd %ecx, %xmm1
490 addss %xmm0, %xmm1
491 movl 12(%esp), %eax
492 movss %xmm1, (%eax)
493 ret
494
495we should be able to generate:
496_pairtest:
497 movss 4(%esp), %xmm0
498 movl 12(%esp), %eax
499 addss 8(%esp), %xmm0
500 movss %xmm0, (%eax)
501 ret
502
503The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
504integer chunks. It does this so that structs like {short,short} are passed in
505a single 32-bit integer stack slot. We should handle the safe cases above much
506nicer, while still handling the hard cases.
507
While true in general, in this specific case we could do better by promoting
load int + bitcast to float -> load float. This basically needs alignment info;
the code is already implemented (but disabled) in the DAG combiner.
511
512//===---------------------------------------------------------------------===//
513
514Another instruction selector deficiency:
515
516void %bar() {
517 %tmp = load int (int)** %foo
518 %tmp = tail call int %tmp( int 3 )
519 ret void
520}
521
522_bar:
523 subl $12, %esp
524 movl L_foo$non_lazy_ptr, %eax
525 movl (%eax), %eax
526 call *%eax
527 addl $12, %esp
528 ret
529
The current isel scheme will not allow the load to be folded into the call since
the load's chain result is read by the callseq_start.
532
533//===---------------------------------------------------------------------===//
534
For this:
536
537int test(int a)
538{
539 return a * 3;
540}
541
We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:
547 movl 4(%esp), %eax
548 leal (%eax,%eax,2), %eax
549
550The current instruction priority is based on pattern complexity. The former is
551more "complex" because it folds a load so the latter will not be emitted.
552
553Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
554should always try to match LEA first since the LEA matching code does some
555estimate to determine whether the match is profitable.
556
557However, if we care more about code size, then imull is better. It's two bytes
558shorter than movl + leal.
559
560//===---------------------------------------------------------------------===//
561
Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
563
564int ctz_(unsigned X) { return __builtin_ctz(X); }
565int clz_(unsigned X) { return __builtin_clz(X); }
566int ffs_(unsigned X) { return __builtin_ffs(X); }
567
568_ctz_:
569 bsfl 4(%esp), %eax
570 ret
571_clz_:
572 bsrl 4(%esp), %eax
573 xorl $31, %eax
574 ret
575_ffs_:
576 movl $-1, %edx
577 bsfl 4(%esp), %eax
578 cmove %edx, %eax
579 addl $1, %eax
        ret

//===---------------------------------------------------------------------===//
583
It appears GCC places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.
589
590//===---------------------------------------------------------------------===//
591
592int %foo(int* %a, int %t) {
593entry:
594 br label %cond_true
595
596cond_true: ; preds = %cond_true, %entry
597 %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
598 %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
599 %tmp2 = getelementptr int* %a, int %x.0.0
600 %tmp3 = load int* %tmp2 ; <int> [#uses=1]
601 %tmp5 = add int %t_addr.0.0, %x.0.0 ; <int> [#uses=1]
602 %tmp7 = add int %tmp5, %tmp3 ; <int> [#uses=2]
603 %tmp9 = add int %x.0.0, 1 ; <int> [#uses=2]
604 %tmp = setgt int %tmp9, 39 ; <bool> [#uses=1]
605 br bool %tmp, label %bb12, label %cond_true
606
607bb12: ; preds = %cond_true
608 ret int %tmp7
609}
610
611is pessimized by -loop-reduce and -indvars
612
613//===---------------------------------------------------------------------===//
614
615u32 to float conversion improvement:
616
617float uint32_2_float( unsigned u ) {
618 float fl = (int) (u & 0xffff);
619 float fh = (int) (u >> 16);
620 fh *= 0x1.0p16f;
621 return fh + fl;
622}
623
62400000000 subl $0x04,%esp
62500000003 movl 0x08(%esp,1),%eax
62600000007 movl %eax,%ecx
62700000009 shrl $0x10,%ecx
6280000000c cvtsi2ss %ecx,%xmm0
62900000010 andl $0x0000ffff,%eax
63000000015 cvtsi2ss %eax,%xmm1
63100000019 mulss 0x00000078,%xmm0
63200000021 addss %xmm1,%xmm0
63300000025 movss %xmm0,(%esp,1)
6340000002a flds (%esp,1)
6350000002d addl $0x04,%esp
63600000030 ret
637
638//===---------------------------------------------------------------------===//
639
When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.
642
643//===---------------------------------------------------------------------===//
644
645Codegen:
646
647int f(int a, int b) {
648 if (a == 4 || a == 6)
649 b++;
650 return b;
651}
652
653
654as:
655
656or eax, 2
657cmp eax, 6
658jz label
659
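The source-level identity behind that sequence, as a sketch: 4 (100b) and 6
(110b) differ only in bit 1, so the two compares collapse into one.

int f_or(int a, int b) {
  if ((a | 2) == 6)   /* equivalent to a == 4 || a == 6 */
    b++;
  return b;
}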
660//===---------------------------------------------------------------------===//
661
662GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
663simplifications for integer "x cmp y ? a : b". For example, instead of:
664
665int G;
666void f(int X, int Y) {
667 G = X < 0 ? 14 : 13;
668}
669
670compiling to:
671
672_f:
673 movl $14, %eax
674 movl $13, %ecx
675 movl 4(%esp), %edx
676 testl %edx, %edx
677 cmovl %eax, %ecx
678 movl %ecx, _G
679 ret
680
681it could be:
682_f:
683 movl 4(%esp), %eax
684 sarl $31, %eax
685 notl %eax
686 addl $14, %eax
687 movl %eax, _G
688 ret
689
690etc.
691
692//===---------------------------------------------------------------------===//
693
694Currently we don't have elimination of redundant stack manipulations. Consider
695the code:
696
697int %main() {
698entry:
699 call fastcc void %test1( )
700 call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
701 ret int 0
702}
703
704declare fastcc void %test1()
705
706declare fastcc void %test2(sbyte*)
707
708
709This currently compiles to:
710
711 subl $16, %esp
712 call _test5
713 addl $12, %esp
714 subl $16, %esp
715 movl $_test5, (%esp)
716 call _test6
717 addl $12, %esp
718
The addl/subl pair is really unneeded here.
720
721//===---------------------------------------------------------------------===//
722
723We currently compile sign_extend_inreg into two shifts:
724
725long foo(long X) {
726 return (long)(signed char)X;
727}
728
729becomes:
730
731_foo:
732 movl 4(%esp), %eax
733 shll $24, %eax
734 sarl $24, %eax
735 ret
736
737This could be:
738
739_foo:
740 movsbl 4(%esp),%eax
741 ret
742
743//===---------------------------------------------------------------------===//
744
745Consider the expansion of:
746
747uint %test3(uint %X) {
748 %tmp1 = rem uint %X, 255
749 ret uint %tmp1
750}
751
752Currently it compiles to:
753
754...
755 movl $2155905153, %ecx
756 movl 8(%esp), %esi
757 movl %esi, %eax
758 mull %ecx
759...
760
761This could be "reassociated" into:
762
763 movl $2155905153, %eax
764 movl 8(%esp), %ecx
765 mull %ecx
766
to avoid the copy. In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction. I guess this has
to be done at isel time based on the number of uses of the mul?
770
771//===---------------------------------------------------------------------===//
772
Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example 256.bzip2:
776
777In the new trace, the hot loop has an instruction which crosses a cacheline
778boundary. In addition to potential cache misses, this can't help decoding as I
779imagine there has to be some kind of complicated decoder reset and realignment
780to grab the bytes from the next cacheline.
781
782532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
783942 942 0x3d03 movl %dh, (1809(%esp, %esi)
784937 937 0x3d0a incl %esi
7853 3 0x3d0b cmpb %bl, %dl
78627 27 0x3d0d jnz 0x000062db <main+11707>
787
788//===---------------------------------------------------------------------===//
789
In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
791
792//===---------------------------------------------------------------------===//
793
794This could be a single 16-bit load.
795
796int f(char *p) {
797 if ((p[0] == 1) & (p[1] == 2)) return 1;
798 return 0;
799}
800
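A sketch of the intended result, assuming a little-endian target where the
2-byte access is legal (f16 is just an illustrative name):

#include <string.h>

int f16(const char *p) {
  unsigned short v;
  memcpy(&v, p, sizeof v);   /* single 16-bit load */
  return v == 0x0201;        /* p[0] == 1 && p[1] == 2 */
}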
801//===---------------------------------------------------------------------===//
802
803We should inline lrintf and probably other libc functions.
804
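For example (assuming SSE and the default rounding mode), this call could
lower to a single cvtss2si instead of a libcall:

#include <math.h>

long round_to_long(float x) {
  return lrintf(x);
}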
805//===---------------------------------------------------------------------===//
806
807Start using the flags more. For example, compile:
808
809int add_zf(int *x, int y, int a, int b) {
810 if ((*x += y) == 0)
811 return a;
812 else
813 return b;
814}
815
816to:
817 addl %esi, (%rdi)
818 movl %edx, %eax
819 cmovne %ecx, %eax
820 ret
821instead of:
822
823_add_zf:
824 addl (%rdi), %esi
825 movl %esi, (%rdi)
826 testl %esi, %esi
827 cmove %edx, %ecx
828 movl %ecx, %eax
829 ret
830
831and:
832
833int add_zf(int *x, int y, int a, int b) {
834 if ((*x + y) < 0)
835 return a;
836 else
837 return b;
838}
839
840to:
841
842add_zf:
843 addl (%rdi), %esi
844 movl %edx, %eax
845 cmovns %ecx, %eax
846 ret
847
848instead of:
849
850_add_zf:
851 addl (%rdi), %esi
852 testl %esi, %esi
853 cmovs %edx, %ecx
854 movl %ecx, %eax
855 ret
856
857//===---------------------------------------------------------------------===//
858
859This:
860#include <math.h>
861int foo(double X) { return isnan(X); }
862
863compiles to (-m64):
864
865_foo:
866 pxor %xmm1, %xmm1
867 ucomisd %xmm1, %xmm0
868 setp %al
869 movzbl %al, %eax
870 ret
871
The pxor is not needed; we could compare the value against itself.
873
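The self-compare form, for reference (foo2 is just an illustrative name; NaN
is the only value that compares unequal to itself):

int foo2(double X) { return X != X; }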
874//===---------------------------------------------------------------------===//
875
876These two functions have identical effects:
877
878unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
879unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
880
881We currently compile them to:
882
883_f:
884 movl 4(%esp), %eax
885 movl %eax, %ecx
886 incl %ecx
887 movl 8(%esp), %edx
888 cmpl %edx, %ecx
889 jne LBB1_2 #UnifiedReturnBlock
890LBB1_1: #cond_true
891 addl $2, %eax
892 ret
893LBB1_2: #UnifiedReturnBlock
894 movl %ecx, %eax
895 ret
896_f2:
897 movl 4(%esp), %eax
898 movl %eax, %ecx
899 incl %ecx
900 cmpl 8(%esp), %ecx
901 sete %cl
902 movzbl %cl, %ecx
903 leal 1(%ecx,%eax), %eax
904 ret
905
906both of which are inferior to GCC's:
907
908_f:
909 movl 4(%esp), %edx
910 leal 1(%edx), %eax
911 addl $2, %edx
912 cmpl 8(%esp), %eax
913 cmove %edx, %eax
914 ret
915_f2:
916 movl 4(%esp), %eax
917 addl $1, %eax
918 xorl %edx, %edx
919 cmpl 8(%esp), %eax
920 sete %dl
921 addl %edx, %eax
922 ret
923
924//===---------------------------------------------------------------------===//
925
926This code:
927
928void test(int X) {
929 if (X) abort();
930}
931
932is currently compiled to:
933
934_test:
935 subl $12, %esp
936 cmpl $0, 16(%esp)
937 jne LBB1_1
938 addl $12, %esp
939 ret
940LBB1_1:
941 call L_abort$stub
942
943It would be better to produce:
944
945_test:
946 subl $12, %esp
947 cmpl $0, 16(%esp)
948 jne L_abort$stub
949 addl $12, %esp
950 ret
951
952This can be applied to any no-return function call that takes no arguments etc.
953Alternatively, the stack save/restore logic could be shrink-wrapped, producing
954something like this:
955
956_test:
957 cmpl $0, 4(%esp)
958 jne LBB1_1
959 ret
960LBB1_1:
961 subl $12, %esp
962 call L_abort$stub
963
964Both are useful in different situations. Finally, it could be shrink-wrapped
965and tail called, like this:
966
967_test:
968 cmpl $0, 4(%esp)
969 jne LBB1_1
970 ret
971LBB1_1:
972 pop %eax # realign stack.
973 call L_abort$stub
974
975Though this probably isn't worth it.
976
977//===---------------------------------------------------------------------===//
978
979We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec). For example, on X86-64, compile:

int foo(int A, int B) {
983 return A+1;
984}
985
986to:
987
988_foo:
989 leal 1(%edi), %eax
990 ret
991
992instead of:
993
994_foo:
995 incl %edi
996 movl %edi, %eax
997 ret
998
999Another example is:
1000
1001;; X's live range extends beyond the shift, so the register allocator
1002;; cannot coalesce it with Y. Because of this, a copy needs to be
1003;; emitted before the shift to save the register value before it is
1004;; clobbered. However, this copy is not needed if the register
1005;; allocator turns the shift into an LEA. This also occurs for ADD.
1006
1007; Check that the shift gets turned into an LEA.
1008; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
1009; RUN: not grep {mov E.X, E.X}
1010
1011%G = external global int
1012
1013int %test1(int %X, int %Y) {
1014 %Z = add int %X, %Y
1015 volatile store int %Y, int* %G
1016 volatile store int %Z, int* %G
1017 ret int %X
1018}
1019
1020int %test2(int %X) {
1021 %Z = add int %X, 1 ;; inc
1022 volatile store int %Z, int* %G
1023 ret int %X
1024}
1025
1026//===---------------------------------------------------------------------===//
1027
This:
1029#include <xmmintrin.h>
1030unsigned test(float f) {
1031 return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1032}
1033
1034Compiles to:
1035_test:
1036 movss 4(%esp), %xmm0
1037 movd %xmm0, %eax
1038 ret
1039
1040it should compile to a move from the stack slot directly into eax. DAGCombine
1041has this xform, but it is currently disabled until the alignment fields of
1042the load/store nodes are trustworthy.
1043
1044//===---------------------------------------------------------------------===//
1045
1046Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1047a neg instead of a sub instruction. Consider:
1048
1049int test(char X) { return 7-X; }
1050
1051we currently produce:
1052_test:
1053 movl $7, %eax
1054 movsbl 4(%esp), %ecx
1055 subl %ecx, %eax
1056 ret
1057
1058We would use one fewer register if codegen'd as:
1059
1060 movsbl 4(%esp), %eax
1061 neg %eax
1062 add $7, %eax
1063 ret
1064
1065Note that this isn't beneficial if the load can be folded into the sub. In
1066this case, we want a sub:
1067
1068int test(int X) { return 7-X; }
1069_test:
1070 movl $7, %eax
1071 subl 4(%esp), %eax
1072 ret
1073
1074//===---------------------------------------------------------------------===//
1075
1076For code like:
1077phi (undef, x)
1078
1079We get an implicit def on the undef side. If the phi is spilled, we then get:
1080implicitdef xmm1
1081store xmm1 -> stack
1082
1083It should be possible to teach the x86 backend to "fold" the store into the
1084implicitdef, which just deletes the implicit def.
1085
1086These instructions should go away:
1087#IMPLICIT_DEF %xmm1
1088movaps %xmm1, 192(%esp)
1089movaps %xmm1, 224(%esp)
movaps %xmm1, 176(%esp)

//===---------------------------------------------------------------------===//

This is a "commutable two-address" register coalescing deficiency:
1095
1096define <4 x float> @test1(<4 x float> %V) {
1097entry:
  %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
                        <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
  %add = add <4 x float> %tmp8, %V
  ret <4 x float> %add
1102}
1103
1104this codegens to:
1105
1106_test1:
1107 pshufd $27, %xmm0, %xmm1
1108 addps %xmm0, %xmm1
1109 movaps %xmm1, %xmm0
1110 ret
1111
1112instead of:
1113
1114_test1:
1115 pshufd $27, %xmm0, %xmm1
1116 addps %xmm1, %xmm0
1117 ret
1118
//===---------------------------------------------------------------------===//
1120
1121Leaf functions that require one 4-byte spill slot have a prolog like this:
1122
1123_foo:
1124 pushl %esi
1125 subl $4, %esp
1126...
1127and an epilog like this:
1128 addl $4, %esp
1129 popl %esi
1130 ret
1131
1132It would be smaller, and potentially faster, to push eax on entry and to
1133pop into a dummy register instead of using addl/subl of esp. Just don't pop
1134into any return registers :)
1135
//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches. We generate really poor code for:
1140
1141double testf(double a) {
1142 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
1143}
1144
1145For example, the entry BB is:
1146
1147_testf:
1148 subl $20, %esp
1149 pxor %xmm0, %xmm0
1150 movsd 24(%esp), %xmm1
1151 ucomisd %xmm0, %xmm1
1152 setnp %al
1153 sete %cl
1154 testb %cl, %al
1155 jne LBB1_5 # UnifiedReturnBlock
1156LBB1_1: # cond_true
1157
1158
1159it would be better to replace the last four instructions with:
1160
1161 jp LBB1_1
1162 je LBB1_5
1163LBB1_1:
1164
1165We also codegen the inner ?: into a diamond:
1166
1167 cvtss2sd LCPI1_0(%rip), %xmm2
1168 cvtss2sd LCPI1_1(%rip), %xmm3
1169 ucomisd %xmm1, %xmm0
1170 ja LBB1_3 # cond_true
1171LBB1_2: # cond_true
1172 movapd %xmm3, %xmm2
1173LBB1_3: # cond_true
1174 movapd %xmm2, %xmm0
1175 ret
1176
1177We should sink the load into xmm3 into the LBB1_2 block. This should
1178be pretty easy, and will nuke all the copies.
1179
//===---------------------------------------------------------------------===//

This:
1183 #include <algorithm>
1184 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1185 { return std::make_pair(a + b, a + b < a); }
1186 bool no_overflow(unsigned a, unsigned b)
1187 { return !full_add(a, b).second; }
1188
1189Should compile to:
1190
1191
1192 _Z11no_overflowjj:
1193 addl %edi, %esi
1194 setae %al
1195 ret
1196
1197on x86-64, not:
1198
1199__Z11no_overflowjj:
1200 addl %edi, %esi
1201 cmpl %edi, %esi
1202 setae %al
1203 movzbl %al, %eax
1204 ret
1205
1206
//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. "xor reg, reg" is shorter than "mov reg, 0".

//===---------------------------------------------------------------------===//
1213
1214We aren't matching RMW instructions aggressively
1215enough. Here's a reduced testcase (more in PR1160):
1216
1217define void @test(i32* %huge_ptr, i32* %target_ptr) {
1218 %A = load i32* %huge_ptr ; <i32> [#uses=1]
1219 %B = load i32* %target_ptr ; <i32> [#uses=1]
1220 %C = or i32 %A, %B ; <i32> [#uses=1]
1221 store i32 %C, i32* %target_ptr
1222 ret void
1223}
1224
1225$ llvm-as < t.ll | llc -march=x86-64
1226
1227_test:
1228 movl (%rdi), %eax
1229 orl (%rsi), %eax
1230 movl %eax, (%rsi)
1231 ret
1232
1233That should be something like:
1234
1235_test:
1236 movl (%rdi), %eax
1237 orl %eax, (%rsi)
1238 ret
1239
1240//===---------------------------------------------------------------------===//
1241
The following code:
1243
bb114.preheader: ; preds = %cond_next94
1245 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
1246 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
1247 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
1248 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
1249 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
1250 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
1251 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
1252 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
1253 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
1254 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
1255 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
1256 br label %bb114
1257
1258produces:
1259
LBB3_5: # bb114.preheader
1261 movswl -68(%ebp), %eax
1262 movl $32, %ecx
1263 movl %ecx, -80(%ebp)
1264 subl %eax, -80(%ebp)
1265 movswl -52(%ebp), %eax
1266 movl %ecx, -84(%ebp)
1267 subl %eax, -84(%ebp)
1268 movswl -70(%ebp), %eax
1269 movl %ecx, -88(%ebp)
1270 subl %eax, -88(%ebp)
1271 movswl -50(%ebp), %eax
1272 subl %eax, %ecx
1273 movl %ecx, -76(%ebp)
1274 movswl -42(%ebp), %eax
1275 movl %eax, -92(%ebp)
1276 movswl -66(%ebp), %eax
1277 movl %eax, -96(%ebp)
1278 movw $0, -98(%ebp)
1279
This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1282vice-versa).
1283
1284//===---------------------------------------------------------------------===//
1285