//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//

Missing features:
  - Support for SSE4: http://www.intel.com/software/penryn
    http://softwarecommunity.intel.com/isn/Downloads/Intel%20SSE4%20Programming%20Reference.pdf
  - Support for 3DNow!
  - Weird ABIs?
10
11//===---------------------------------------------------------------------===//
12
Add MUL2U and MUL2S nodes to represent a multiply that returns both the
Hi and Lo parts (a combination of MUL and MULH[SU] in one node). Add these to
X86 and make the DAG combiner produce them when needed. This will eliminate one
imul from the code generated for:
17
18long long test(long long X, long long Y) { return X*Y; }
19
20by using the EAX result from the mul. We should add a similar node for
21DIVREM.
22
Another case is:
24
25long long test(int X, int Y) { return (long long)X*Y; }
26
27... which should only be one imul instruction.
28
29or:
30
31unsigned long long int t2(unsigned int a, unsigned int b) {
32 return (unsigned long long)a * b;
33}
34
35... which should be one mul instruction.
36
37
38This can be done with a custom expander, but it would be nice to move this to
39generic code.
40
41//===---------------------------------------------------------------------===//
42
CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case. We should investigate
why this isn't happening; it could have significant impact on other important
cases for X86 as well.
48
49//===---------------------------------------------------------------------===//
50
51This should be one DIV/IDIV instruction, not a libcall:
52
53unsigned test(unsigned long long X, unsigned Y) {
54 return X/Y;
55}
56
57This can be done trivially with a custom legalizer. What about overflow
58though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
59
60//===---------------------------------------------------------------------===//
61
62Improvements to the multiply -> shift/add algorithm:
63http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
64
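For reference, a minimal C sketch of the kind of decomposition being discussed
(the constant 9 is just illustrative, not taken from the patch):

int mul9(int x) {
  return (x << 3) + x;   /* x * 9 rewritten as shift + add */
}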
65//===---------------------------------------------------------------------===//
66
67Improve code like this (occurs fairly frequently, e.g. in LLVM):
68long long foo(int x) { return 1LL << x; }
69
70http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
71http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
72http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
73
74Another useful one would be ~0ULL >> X and ~0ULL << X.
75
76One better solution for 1LL << x is:
77 xorl %eax, %eax
78 xorl %edx, %edx
79 testb $32, %cl
80 sete %al
81 setne %dl
82 sall %cl, %eax
83 sall %cl, %edx
84
85But that requires good 8-bit subreg support.
86
8764-bit shifts (in general) expand to really bad code. Instead of using
88cmovs, we should expand to a conditional branch like GCC produces.
89
90//===---------------------------------------------------------------------===//
91
92Compile this:
93_Bool f(_Bool a) { return a!=1; }
94
95into:
96 movzbl %dil, %eax
97 xorl $1, %eax
98 ret
99
100//===---------------------------------------------------------------------===//
101
102Some isel ideas:
103
1. Dynamic programming based approach when compile time is not an
   issue.
1062. Code duplication (addressing mode) during isel.
1073. Other ideas from "Register-Sensitive Selection, Duplication, and
108 Sequencing of Instructions".
1094. Scheduling for reduced register pressure. E.g. "Minimum Register
110 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
111 and other related papers.
112 http://citeseer.ist.psu.edu/govindarajan01minimum.html
113
114//===---------------------------------------------------------------------===//
115
116Should we promote i16 to i32 to avoid partial register update stalls?
117
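A hypothetical example of code that would hit this: the 16-bit add writes only
the low half of the register, so a later read of the full register can stall.

unsigned short inc16(unsigned short x) {
  return (unsigned short)(x + 1);   /* candidate for promotion to i32 */
}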
118//===---------------------------------------------------------------------===//
119
Leave any_extend as a pseudo instruction and hint to the register
allocator. Delay codegen until post-register allocation.
122
123//===---------------------------------------------------------------------===//
124
125Count leading zeros and count trailing zeros:
126
127int clz(int X) { return __builtin_clz(X); }
128int ctz(int X) { return __builtin_ctz(X); }
129
130$ gcc t.c -S -o - -O3 -fomit-frame-pointer -masm=intel
131clz:
132 bsr %eax, DWORD PTR [%esp+4]
133 xor %eax, 31
134 ret
135ctz:
136 bsf %eax, DWORD PTR [%esp+4]
137 ret
138
However, check that these are defined for 0 and 32. Our intrinsics are; GCC's
aren't.
141
142Another example (use predsimplify to eliminate a select):
143
144int foo (unsigned long j) {
145 if (j)
146 return __builtin_ffs (j) - 1;
147 else
148 return 0;
149}
150
151//===---------------------------------------------------------------------===//
152
It appears ICC uses push for parameter passing. Need to investigate.
154
155//===---------------------------------------------------------------------===//
156
157Only use inc/neg/not instructions on processors where they are faster than
158add/sub/xor. They are slower on the P4 due to only updating some processor
159flags.
160
161//===---------------------------------------------------------------------===//
162
The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The DAG combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
168
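A hypothetical C-level case where the load ends up on the non-folded side;
commuting it into the RHS requires inverting the condition (e.g. setlt ->
setgt):

int cmp_load(const int *p, int x) {
  return *p < x;   /* (setlt (load p), x): the load is on the LHS */
}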
169//===---------------------------------------------------------------------===//
170
171How about intrinsics? An example is:
172 *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
173
174compiles to
175 pmuludq (%eax), %xmm0
176 movl 8(%esp), %eax
177 movdqa (%eax), %xmm1
178 pmulhuw %xmm0, %xmm1
179
The transformation probably requires an X86-specific pass or a target-specific
DAG combiner hook.
182
183//===---------------------------------------------------------------------===//
184
185In many cases, LLVM generates code like this:
186
187_test:
188 movl 8(%esp), %eax
189 cmpl %eax, 4(%esp)
190 setl %al
191 movzbl %al, %eax
192 ret
193
On some processors (which ones?), it is more efficient to do this:
195
196_test:
197 movl 8(%esp), %ebx
198 xor %eax, %eax
199 cmpl %ebx, 4(%esp)
200 setl %al
201 ret
202
203Doing this correctly is tricky though, as the xor clobbers the flags.
204
205//===---------------------------------------------------------------------===//
206
207We should generate bts/btr/etc instructions on targets where they are cheap or
208when codesize is important. e.g., for:
209
210void setbit(int *target, int bit) {
211 *target |= (1 << bit);
212}
213void clearbit(int *target, int bit) {
214 *target &= ~(1 << bit);
215}
216
217//===---------------------------------------------------------------------===//
218
219Instead of the following for memset char*, 1, 10:
220
221 movl $16843009, 4(%edx)
222 movl $16843009, (%edx)
223 movw $257, 8(%edx)
224
225It might be better to generate
226
227 movl $16843009, %eax
228 movl %eax, 4(%edx)
229 movl %eax, (%edx)
        movw %ax, 8(%edx)
231
232when we can spare a register. It reduces code size.
233
234//===---------------------------------------------------------------------===//
235
236Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
237get this:
238
239int %test1(int %X) {
240 %Y = div int %X, 8
241 ret int %Y
242}
243
244_test1:
245 movl 4(%esp), %eax
246 movl %eax, %ecx
247 sarl $31, %ecx
248 shrl $29, %ecx
249 addl %ecx, %eax
250 sarl $3, %eax
251 ret
252
253GCC knows several different ways to codegen it, one of which is this:
254
255_test1:
256 movl 4(%esp), %eax
257 cmpl $-1, %eax
258 leal 7(%eax), %ecx
259 cmovle %ecx, %eax
260 sarl $3, %eax
261 ret
262
263which is probably slower, but it's interesting at least :)
264
265//===---------------------------------------------------------------------===//
266
267The first BB of this code:
268
269declare bool %foo()
270int %bar() {
271 %V = call bool %foo()
272 br bool %V, label %T, label %F
273T:
274 ret int 1
275F:
276 call bool %foo()
277 ret int 12
278}
279
280compiles to:
281
282_bar:
283 subl $12, %esp
284 call L_foo$stub
285 xorb $1, %al
286 testb %al, %al
287 jne LBB_bar_2 # F
288
It would be better to emit "cmp %al, 1" than an xor and test.
290
291//===---------------------------------------------------------------------===//
292
293Enable X86InstrInfo::convertToThreeAddress().
294
295//===---------------------------------------------------------------------===//
296
We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)
301
302//===---------------------------------------------------------------------===//
303
304Optimize this into something reasonable:
305 x * copysign(1.0, y) * copysign(1.0, z)
306
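A scalar C sketch of why this is cheap (not the backend's actual lowering;
assumes IEEE-754 doubles): the two multiplies can only flip the sign of x, so
xor'ing the sign bits of y and z into x gives the same result for ordinary
values:

#include <stdint.h>
#include <string.h>

double mul_by_signs(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  memcpy(&zb, &z, sizeof zb);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;   /* xor in sign(y) ^ sign(z) */
  memcpy(&x, &xb, sizeof x);
  return x;
}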
307//===---------------------------------------------------------------------===//
308
309Optimize copysign(x, *y) to use an integer load from y.
310
311//===---------------------------------------------------------------------===//
312
313%X = weak global int 0
314
315void %foo(int %N) {
316 %N = cast int %N to uint
317 %tmp.24 = setgt int %N, 0
318 br bool %tmp.24, label %no_exit, label %return
319
320no_exit:
321 %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
322 %i.0.0 = cast uint %indvar to int
323 volatile store int %i.0.0, int* %X
324 %indvar.next = add uint %indvar, 1
325 %exitcond = seteq uint %indvar.next, %N
326 br bool %exitcond, label %return, label %no_exit
327
328return:
329 ret void
330}
331
332compiles into:
333
334 .text
335 .align 4
336 .globl _foo
337_foo:
338 movl 4(%esp), %eax
339 cmpl $1, %eax
340 jl LBB_foo_4 # return
341LBB_foo_1: # no_exit.preheader
342 xorl %ecx, %ecx
343LBB_foo_2: # no_exit
344 movl L_X$non_lazy_ptr, %edx
345 movl %ecx, (%edx)
346 incl %ecx
347 cmpl %eax, %ecx
348 jne LBB_foo_2 # no_exit
349LBB_foo_3: # return.loopexit
350LBB_foo_4: # return
351 ret
352
We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented. This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectionDAG represent the whole function.
356
357//===---------------------------------------------------------------------===//
358
359The following tests perform worse with LSR:
360
361lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
362
363//===---------------------------------------------------------------------===//
364
365We are generating far worse code than gcc:
366
367volatile short X, Y;
368
369void foo(int N) {
370 int i;
371 for (i = 0; i < N; i++) { X = i; Y = i*4; }
372}
373
374LBB1_1: #bb.preheader
375 xorl %ecx, %ecx
376 xorw %dx, %dx
377LBB1_2: #bb
378 movl L_X$non_lazy_ptr, %esi
379 movw %dx, (%esi)
380 movw %dx, %si
381 shlw $2, %si
382 movl L_Y$non_lazy_ptr, %edi
383 movw %si, (%edi)
384 incl %ecx
385 incw %dx
386 cmpl %eax, %ecx
387 jne LBB1_2 #bb
388
389vs.
390
391 xorl %edx, %edx
392 movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
393 movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
394L4:
395 movw %dx, (%esi)
396 leal 0(,%edx,4), %eax
397 movw %ax, (%ecx)
398 addl $1, %edx
399 cmpl %edx, %edi
400 jne L4
401
There are two issues:

1. Lack of post regalloc LICM.
2. LSR is unable to reuse the IV for a different type (i16 vs. i32) even though
   the cast would be free.
407
408//===---------------------------------------------------------------------===//
409
410Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
411FR64 to VR128.
412
413//===---------------------------------------------------------------------===//
414
415mov $reg, 48(%esp)
416...
417leal 48(%esp), %eax
418mov %eax, (%esp)
419call _foo
420
Obviously it would have been better for the first mov (or any op) to store
directly to %esp[0] if there are no other uses.
423
424//===---------------------------------------------------------------------===//
425
426Adding to the list of cmp / test poor codegen issues:
427
428int test(__m128 *A, __m128 *B) {
429 if (_mm_comige_ss(*A, *B))
430 return 3;
431 else
432 return 4;
433}
434
435_test:
436 movl 8(%esp), %eax
437 movaps (%eax), %xmm0
438 movl 4(%esp), %eax
439 movaps (%eax), %xmm1
440 comiss %xmm0, %xmm1
441 setae %al
442 movzbl %al, %ecx
443 movl $3, %eax
444 movl $4, %edx
445 cmpl $0, %ecx
446 cmove %edx, %eax
447 ret
448
Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value, so an any_extend (which becomes a zero extend) is added.
453
454We probably need some kind of target DAG combine hook to fix this.
455
456//===---------------------------------------------------------------------===//
457
458We generate significantly worse code for this than GCC:
459http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
460http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
461
462There is also one case we do worse on PPC.
463
464//===---------------------------------------------------------------------===//
465
466If shorter, we should use things like:
467movzwl %ax, %eax
468instead of:
469andl $65535, %EAX
470
471The former can also be used when the two-addressy nature of the 'and' would
472require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).
473
474//===---------------------------------------------------------------------===//
475
Consider this:
477
478typedef struct pair { float A, B; } pair;
479void pairtest(pair P, float *FP) {
480 *FP = P.A+P.B;
481}
482
483We currently generate this code with llvmgcc4:
484
485_pairtest:
486 movl 8(%esp), %eax
487 movl 4(%esp), %ecx
488 movd %eax, %xmm0
489 movd %ecx, %xmm1
490 addss %xmm0, %xmm1
491 movl 12(%esp), %eax
492 movss %xmm1, (%eax)
493 ret
494
495we should be able to generate:
496_pairtest:
497 movss 4(%esp), %xmm0
498 movl 12(%esp), %eax
499 addss 8(%esp), %xmm0
500 movss %xmm0, (%eax)
501 ret
502
503The issue is that llvmgcc4 is forcing the struct to memory, then passing it as
504integer chunks. It does this so that structs like {short,short} are passed in
505a single 32-bit integer stack slot. We should handle the safe cases above much
506nicer, while still handling the hard cases.
507
While true in general, in this specific case we could do better by promoting
load int + bitcast to float -> load float. This basically needs alignment info;
the code is already implemented (but disabled) in the DAG combiner.
511
512//===---------------------------------------------------------------------===//
513
514Another instruction selector deficiency:
515
516void %bar() {
517 %tmp = load int (int)** %foo
518 %tmp = tail call int %tmp( int 3 )
519 ret void
520}
521
522_bar:
523 subl $12, %esp
524 movl L_foo$non_lazy_ptr, %eax
525 movl (%eax), %eax
526 call *%eax
527 addl $12, %esp
528 ret
529
The current isel scheme will not allow the load to be folded into the call since
the load's chain result is read by the callseq_start.
532
533//===---------------------------------------------------------------------===//
534
For this:
536
537int test(int a)
538{
539 return a * 3;
540}
541
We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:
547 movl 4(%esp), %eax
548 leal (%eax,%eax,2), %eax
549
550The current instruction priority is based on pattern complexity. The former is
551more "complex" because it folds a load so the latter will not be emitted.
552
553Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
554should always try to match LEA first since the LEA matching code does some
555estimate to determine whether the match is profitable.
556
557However, if we care more about code size, then imull is better. It's two bytes
558shorter than movl + leal.
559
560//===---------------------------------------------------------------------===//
561
Implement CTTZ, CTLZ with bsf and bsr. GCC produces:
563
564int ctz_(unsigned X) { return __builtin_ctz(X); }
565int clz_(unsigned X) { return __builtin_clz(X); }
566int ffs_(unsigned X) { return __builtin_ffs(X); }
567
568_ctz_:
569 bsfl 4(%esp), %eax
570 ret
571_clz_:
572 bsrl 4(%esp), %eax
573 xorl $31, %eax
574 ret
575_ffs_:
576 movl $-1, %edx
577 bsfl 4(%esp), %eax
578 cmove %edx, %eax
579 addl $1, %eax
        ret

//===---------------------------------------------------------------------===//
583
It appears GCC places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.
589
590//===---------------------------------------------------------------------===//
591
592int %foo(int* %a, int %t) {
593entry:
594 br label %cond_true
595
596cond_true: ; preds = %cond_true, %entry
597 %x.0.0 = phi int [ 0, %entry ], [ %tmp9, %cond_true ]
598 %t_addr.0.0 = phi int [ %t, %entry ], [ %tmp7, %cond_true ]
599 %tmp2 = getelementptr int* %a, int %x.0.0
600 %tmp3 = load int* %tmp2 ; <int> [#uses=1]
601 %tmp5 = add int %t_addr.0.0, %x.0.0 ; <int> [#uses=1]
602 %tmp7 = add int %tmp5, %tmp3 ; <int> [#uses=2]
603 %tmp9 = add int %x.0.0, 1 ; <int> [#uses=2]
604 %tmp = setgt int %tmp9, 39 ; <bool> [#uses=1]
605 br bool %tmp, label %bb12, label %cond_true
606
607bb12: ; preds = %cond_true
608 ret int %tmp7
609}
610
611is pessimized by -loop-reduce and -indvars
612
613//===---------------------------------------------------------------------===//
614
615u32 to float conversion improvement:
616
617float uint32_2_float( unsigned u ) {
618 float fl = (int) (u & 0xffff);
619 float fh = (int) (u >> 16);
620 fh *= 0x1.0p16f;
621 return fh + fl;
622}
623
62400000000 subl $0x04,%esp
62500000003 movl 0x08(%esp,1),%eax
62600000007 movl %eax,%ecx
62700000009 shrl $0x10,%ecx
6280000000c cvtsi2ss %ecx,%xmm0
62900000010 andl $0x0000ffff,%eax
63000000015 cvtsi2ss %eax,%xmm1
63100000019 mulss 0x00000078,%xmm0
63200000021 addss %xmm1,%xmm0
63300000025 movss %xmm0,(%esp,1)
6340000002a flds (%esp,1)
6350000002d addl $0x04,%esp
63600000030 ret
637
638//===---------------------------------------------------------------------===//
639
When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.
642
643//===---------------------------------------------------------------------===//
644
645Codegen:
646
647int f(int a, int b) {
648 if (a == 4 || a == 6)
649 b++;
650 return b;
651}
652
653
654as:
655
656or eax, 2
657cmp eax, 6
658jz label
659
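The source-level identity behind that sequence, as a sketch: 4 (100b) and 6
(110b) differ only in bit 1, so the two compares collapse into one.

int f_or(int a, int b) {
  if ((a | 2) == 6)   /* equivalent to a == 4 || a == 6 */
    b++;
  return b;
}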
660//===---------------------------------------------------------------------===//
661
662GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
663simplifications for integer "x cmp y ? a : b". For example, instead of:
664
665int G;
666void f(int X, int Y) {
667 G = X < 0 ? 14 : 13;
668}
669
670compiling to:
671
672_f:
673 movl $14, %eax
674 movl $13, %ecx
675 movl 4(%esp), %edx
676 testl %edx, %edx
677 cmovl %eax, %ecx
678 movl %ecx, _G
679 ret
680
681it could be:
682_f:
683 movl 4(%esp), %eax
684 sarl $31, %eax
685 notl %eax
686 addl $14, %eax
687 movl %eax, _G
688 ret
689
690etc.
691
692//===---------------------------------------------------------------------===//
693
694Currently we don't have elimination of redundant stack manipulations. Consider
695the code:
696
697int %main() {
698entry:
699 call fastcc void %test1( )
700 call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
701 ret int 0
702}
703
704declare fastcc void %test1()
705
706declare fastcc void %test2(sbyte*)
707
708
709This currently compiles to:
710
711 subl $16, %esp
712 call _test5
713 addl $12, %esp
714 subl $16, %esp
715 movl $_test5, (%esp)
716 call _test6
717 addl $12, %esp
718
The addl/subl pair is really unneeded here.
720
721//===---------------------------------------------------------------------===//
722
723We currently compile sign_extend_inreg into two shifts:
724
725long foo(long X) {
726 return (long)(signed char)X;
727}
728
729becomes:
730
731_foo:
732 movl 4(%esp), %eax
733 shll $24, %eax
734 sarl $24, %eax
735 ret
736
737This could be:
738
739_foo:
740 movsbl 4(%esp),%eax
741 ret
742
743//===---------------------------------------------------------------------===//
744
745Consider the expansion of:
746
747uint %test3(uint %X) {
748 %tmp1 = rem uint %X, 255
749 ret uint %tmp1
750}
751
752Currently it compiles to:
753
754...
755 movl $2155905153, %ecx
756 movl 8(%esp), %esi
757 movl %esi, %eax
758 mull %ecx
759...
760
761This could be "reassociated" into:
762
763 movl $2155905153, %eax
764 movl 8(%esp), %ecx
765 mull %ecx
766
to avoid the copy. In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction. I guess this has
to be done at isel time based on the number of uses of the mul?
770
771//===---------------------------------------------------------------------===//
772
Make sure the instruction which starts a loop does not cross a cacheline
boundary. This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable. Example 256.bzip2:
776
777In the new trace, the hot loop has an instruction which crosses a cacheline
778boundary. In addition to potential cache misses, this can't help decoding as I
779imagine there has to be some kind of complicated decoder reset and realignment
780to grab the bytes from the next cacheline.
781
782532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
783942 942 0x3d03 movl %dh, (1809(%esp, %esi)
784937 937 0x3d0a incl %esi
7853 3 0x3d0b cmpb %bl, %dl
78627 27 0x3d0d jnz 0x000062db <main+11707>
787
788//===---------------------------------------------------------------------===//
789
In C99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
791
792//===---------------------------------------------------------------------===//
793
794This could be a single 16-bit load.
795
796int f(char *p) {
797 if ((p[0] == 1) & (p[1] == 2)) return 1;
798 return 0;
799}
800
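A sketch of the intended result, assuming a little-endian target where the
2-byte access is legal (f16 is just an illustrative name):

#include <string.h>

int f16(const char *p) {
  unsigned short v;
  memcpy(&v, p, sizeof v);   /* single 16-bit load */
  return v == 0x0201;        /* p[0] == 1 && p[1] == 2 */
}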
801//===---------------------------------------------------------------------===//
802
803We should inline lrintf and probably other libc functions.
804
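For example (assuming SSE and the default rounding mode), this call could
lower to a single cvtss2si instead of a libcall:

#include <math.h>

long round_to_long(float x) {
  return lrintf(x);
}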
805//===---------------------------------------------------------------------===//
806
807Start using the flags more. For example, compile:
808
809int add_zf(int *x, int y, int a, int b) {
810 if ((*x += y) == 0)
811 return a;
812 else
813 return b;
814}
815
816to:
817 addl %esi, (%rdi)
818 movl %edx, %eax
819 cmovne %ecx, %eax
820 ret
821instead of:
822
823_add_zf:
824 addl (%rdi), %esi
825 movl %esi, (%rdi)
826 testl %esi, %esi
827 cmove %edx, %ecx
828 movl %ecx, %eax
829 ret
830
831and:
832
833int add_zf(int *x, int y, int a, int b) {
834 if ((*x + y) < 0)
835 return a;
836 else
837 return b;
838}
839
840to:
841
842add_zf:
843 addl (%rdi), %esi
844 movl %edx, %eax
845 cmovns %ecx, %eax
846 ret
847
848instead of:
849
850_add_zf:
851 addl (%rdi), %esi
852 testl %esi, %esi
853 cmovs %edx, %ecx
854 movl %ecx, %eax
855 ret
856
857//===---------------------------------------------------------------------===//
858
859This:
860#include <math.h>
861int foo(double X) { return isnan(X); }
862
863compiles to (-m64):
864
865_foo:
866 pxor %xmm1, %xmm1
867 ucomisd %xmm1, %xmm0
868 setp %al
869 movzbl %al, %eax
870 ret
871
The pxor is not needed; we could compare the value against itself.
873
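The self-compare form, for reference (foo2 is just an illustrative name; NaN
is the only value that compares unequal to itself):

int foo2(double X) { return X != X; }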
874//===---------------------------------------------------------------------===//
875
876These two functions have identical effects:
877
878unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
879unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}
880
881We currently compile them to:
882
883_f:
884 movl 4(%esp), %eax
885 movl %eax, %ecx
886 incl %ecx
887 movl 8(%esp), %edx
888 cmpl %edx, %ecx
889 jne LBB1_2 #UnifiedReturnBlock
890LBB1_1: #cond_true
891 addl $2, %eax
892 ret
893LBB1_2: #UnifiedReturnBlock
894 movl %ecx, %eax
895 ret
896_f2:
897 movl 4(%esp), %eax
898 movl %eax, %ecx
899 incl %ecx
900 cmpl 8(%esp), %ecx
901 sete %cl
902 movzbl %cl, %ecx
903 leal 1(%ecx,%eax), %eax
904 ret
905
906both of which are inferior to GCC's:
907
908_f:
909 movl 4(%esp), %edx
910 leal 1(%edx), %eax
911 addl $2, %edx
912 cmpl 8(%esp), %eax
913 cmove %edx, %eax
914 ret
915_f2:
916 movl 4(%esp), %eax
917 addl $1, %eax
918 xorl %edx, %edx
919 cmpl 8(%esp), %eax
920 sete %dl
921 addl %edx, %eax
922 ret
923
924//===---------------------------------------------------------------------===//
925
926This code:
927
928void test(int X) {
929 if (X) abort();
930}
931
932is currently compiled to:
933
934_test:
935 subl $12, %esp
936 cmpl $0, 16(%esp)
937 jne LBB1_1
938 addl $12, %esp
939 ret
940LBB1_1:
941 call L_abort$stub
942
943It would be better to produce:
944
945_test:
946 subl $12, %esp
947 cmpl $0, 16(%esp)
948 jne L_abort$stub
949 addl $12, %esp
950 ret
951
952This can be applied to any no-return function call that takes no arguments etc.
953Alternatively, the stack save/restore logic could be shrink-wrapped, producing
954something like this:
955
956_test:
957 cmpl $0, 4(%esp)
958 jne LBB1_1
959 ret
960LBB1_1:
961 subl $12, %esp
962 call L_abort$stub
963
964Both are useful in different situations. Finally, it could be shrink-wrapped
965and tail called, like this:
966
967_test:
968 cmpl $0, 4(%esp)
969 jne LBB1_1
970 ret
971LBB1_1:
972 pop %eax # realign stack.
973 call L_abort$stub
974
975Though this probably isn't worth it.
976
977//===---------------------------------------------------------------------===//
978
979We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec). For example, on X86-64, compile:

int foo(int A, int B) {
983 return A+1;
984}
985
986to:
987
988_foo:
989 leal 1(%edi), %eax
990 ret
991
992instead of:
993
994_foo:
995 incl %edi
996 movl %edi, %eax
997 ret
998
999Another example is:
1000
1001;; X's live range extends beyond the shift, so the register allocator
1002;; cannot coalesce it with Y. Because of this, a copy needs to be
1003;; emitted before the shift to save the register value before it is
1004;; clobbered. However, this copy is not needed if the register
1005;; allocator turns the shift into an LEA. This also occurs for ADD.
1006
1007; Check that the shift gets turned into an LEA.
1008; RUN: llvm-upgrade < %s | llvm-as | llc -march=x86 -x86-asm-syntax=intel | \
1009; RUN: not grep {mov E.X, E.X}
1010
1011%G = external global int
1012
1013int %test1(int %X, int %Y) {
1014 %Z = add int %X, %Y
1015 volatile store int %Y, int* %G
1016 volatile store int %Z, int* %G
1017 ret int %X
1018}
1019
1020int %test2(int %X) {
1021 %Z = add int %X, 1 ;; inc
1022 volatile store int %Z, int* %G
1023 ret int %X
1024}
1025
1026//===---------------------------------------------------------------------===//
1027
This:
1029#include <xmmintrin.h>
1030unsigned test(float f) {
1031 return _mm_cvtsi128_si32( (__m128i) _mm_set_ss( f ));
1032}
1033
1034Compiles to:
1035_test:
1036 movss 4(%esp), %xmm0
1037 movd %xmm0, %eax
1038 ret
1039
1040it should compile to a move from the stack slot directly into eax. DAGCombine
1041has this xform, but it is currently disabled until the alignment fields of
1042the load/store nodes are trustworthy.
1043
1044//===---------------------------------------------------------------------===//
1045
1046Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
1047a neg instead of a sub instruction. Consider:
1048
1049int test(char X) { return 7-X; }
1050
1051we currently produce:
1052_test:
1053 movl $7, %eax
1054 movsbl 4(%esp), %ecx
1055 subl %ecx, %eax
1056 ret
1057
1058We would use one fewer register if codegen'd as:
1059
1060 movsbl 4(%esp), %eax
1061 neg %eax
1062 add $7, %eax
1063 ret
1064
1065Note that this isn't beneficial if the load can be folded into the sub. In
1066this case, we want a sub:
1067
1068int test(int X) { return 7-X; }
1069_test:
1070 movl $7, %eax
1071 subl 4(%esp), %eax
1072 ret
1073
1074//===---------------------------------------------------------------------===//
1075
1076For code like:
1077phi (undef, x)
1078
1079We get an implicit def on the undef side. If the phi is spilled, we then get:
1080implicitdef xmm1
1081store xmm1 -> stack
1082
1083It should be possible to teach the x86 backend to "fold" the store into the
1084implicitdef, which just deletes the implicit def.
1085
1086These instructions should go away:
1087#IMPLICIT_DEF %xmm1
1088movaps %xmm1, 192(%esp)
1089movaps %xmm1, 224(%esp)
movaps %xmm1, 176(%esp)

//===---------------------------------------------------------------------===//

This is a "commutable two-address" register coalescing deficiency:
1095
1096define <4 x float> @test1(<4 x float> %V) {
1097entry:
  %tmp8 = shufflevector <4 x float> %V, <4 x float> undef,
                        <4 x i32> < i32 3, i32 2, i32 1, i32 0 >
  %add = add <4 x float> %tmp8, %V
  ret <4 x float> %add
1102}
1103
1104this codegens to:
1105
1106_test1:
1107 pshufd $27, %xmm0, %xmm1
1108 addps %xmm0, %xmm1
1109 movaps %xmm1, %xmm0
1110 ret
1111
1112instead of:
1113
1114_test1:
1115 pshufd $27, %xmm0, %xmm1
1116 addps %xmm1, %xmm0
1117 ret
1118
//===---------------------------------------------------------------------===//
1120
1121Leaf functions that require one 4-byte spill slot have a prolog like this:
1122
1123_foo:
1124 pushl %esi
1125 subl $4, %esp
1126...
1127and an epilog like this:
1128 addl $4, %esp
1129 popl %esi
1130 ret
1131
1132It would be smaller, and potentially faster, to push eax on entry and to
1133pop into a dummy register instead of using addl/subl of esp. Just don't pop
1134into any return registers :)
1135
//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches. We generate really poor code for:
1140
1141double testf(double a) {
1142 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
1143}
1144
1145For example, the entry BB is:
1146
1147_testf:
1148 subl $20, %esp
1149 pxor %xmm0, %xmm0
1150 movsd 24(%esp), %xmm1
1151 ucomisd %xmm0, %xmm1
1152 setnp %al
1153 sete %cl
1154 testb %cl, %al
1155 jne LBB1_5 # UnifiedReturnBlock
1156LBB1_1: # cond_true
1157
1158
1159it would be better to replace the last four instructions with:
1160
1161 jp LBB1_1
1162 je LBB1_5
1163LBB1_1:
1164
1165We also codegen the inner ?: into a diamond:
1166
1167 cvtss2sd LCPI1_0(%rip), %xmm2
1168 cvtss2sd LCPI1_1(%rip), %xmm3
1169 ucomisd %xmm1, %xmm0
1170 ja LBB1_3 # cond_true
1171LBB1_2: # cond_true
1172 movapd %xmm3, %xmm2
1173LBB1_3: # cond_true
1174 movapd %xmm2, %xmm0
1175 ret
1176
1177We should sink the load into xmm3 into the LBB1_2 block. This should
1178be pretty easy, and will nuke all the copies.
1179
//===---------------------------------------------------------------------===//

This:
1183 #include <algorithm>
1184 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
1185 { return std::make_pair(a + b, a + b < a); }
1186 bool no_overflow(unsigned a, unsigned b)
1187 { return !full_add(a, b).second; }
1188
1189Should compile to:
1190
1191
1192 _Z11no_overflowjj:
1193 addl %edi, %esi
1194 setae %al
1195 ret
1196
1197on x86-64, not:
1198
1199__Z11no_overflowjj:
1200 addl %edi, %esi
1201 cmpl %edi, %esi
1202 setae %al
1203 movzbl %al, %eax
1204 ret
1205
1206
//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. "xor reg, reg" is shorter than "mov reg, 0".

//===---------------------------------------------------------------------===//
1213
1214We aren't matching RMW instructions aggressively
1215enough. Here's a reduced testcase (more in PR1160):
1216
1217define void @test(i32* %huge_ptr, i32* %target_ptr) {
1218 %A = load i32* %huge_ptr ; <i32> [#uses=1]
1219 %B = load i32* %target_ptr ; <i32> [#uses=1]
1220 %C = or i32 %A, %B ; <i32> [#uses=1]
1221 store i32 %C, i32* %target_ptr
1222 ret void
1223}
1224
1225$ llvm-as < t.ll | llc -march=x86-64
1226
1227_test:
1228 movl (%rdi), %eax
1229 orl (%rsi), %eax
1230 movl %eax, (%rsi)
1231 ret
1232
1233That should be something like:
1234
1235_test:
1236 movl (%rdi), %eax
1237 orl %eax, (%rsi)
1238 ret
1239
1240//===---------------------------------------------------------------------===//
1241
The following code:
1243
bb114.preheader: ; preds = %cond_next94
1245 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
1246 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
1247 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
1248 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
1249 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
1250 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
1251 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
1252 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
1253 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
1254 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
1255 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
1256 br label %bb114
1257
1258produces:
1259
LBB3_5: # bb114.preheader
1261 movswl -68(%ebp), %eax
1262 movl $32, %ecx
1263 movl %ecx, -80(%ebp)
1264 subl %eax, -80(%ebp)
1265 movswl -52(%ebp), %eax
1266 movl %ecx, -84(%ebp)
1267 subl %eax, -84(%ebp)
1268 movswl -70(%ebp), %eax
1269 movl %ecx, -88(%ebp)
1270 subl %eax, -88(%ebp)
1271 movswl -50(%ebp), %eax
1272 subl %eax, %ecx
1273 movl %ecx, -76(%ebp)
1274 movswl -42(%ebp), %eax
1275 movl %eax, -92(%ebp)
1276 movswl -66(%ebp), %eax
1277 movl %eax, -96(%ebp)
1278 movw $0, -98(%ebp)
1279
This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
1282vice-versa).
1283
1284//===---------------------------------------------------------------------===//
1285