//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//


//===---------------------------------------------------------------------===//

CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move. The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case. We should investigate
why this isn't happening; it could have significant impact on other important
cases for X86 as well.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
    return X/Y;
}

This can be done trivially with a custom legalizer. What about overflow
though? http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
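
A sketch of the hoped-for x86-32 codegen (cdecl argument layout assumed; note
that divl raises #DE when the quotient does not fit in 32 bits, which is
exactly the overflow question above):

_test:
    movl 4(%esp), %eax      # low half of X
    movl 8(%esp), %edx      # high half of X
    divl 12(%esp)           # edx:eax / Y -> quotient in eax
    ret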

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
    xorl %eax, %eax
    xorl %edx, %edx
    testb $32, %cl
    sete %al
    setne %dl
    sall %cl, %eax
    sall %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better. It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
    movl %ecx, %eax
    shrl $5, %eax
    movl %eax, %edx
    xorl $1, %edx
    sall %cl, %eax
    sall %cl, %edx

64-bit shifts (in general) expand to really bad code. Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Compile this:
_Bool f(_Bool a) { return a!=1; }

into:
    movzbl %dil, %eax
    xorl $1, %eax
    ret

(Although note that this isn't a legal way to express the code that llvm-gcc
currently generates for that function.)

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure. E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

//===---------------------------------------------------------------------===//

Leave any_extend as pseudo instruction and hint to register
allocator. Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it, though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing. We need to investigate this.

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor. They are slower on the P4 due to only updating some processor
flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare. The
pattern is written as (cmp reg, (load p)). Because the compare isn't
commutative, it is not matched with the load on both sides. The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.

//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
    *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to:
    pmuludq (%eax), %xmm0
    movl 8(%esp), %eax
    movdqa (%eax), %xmm1
    pmulhuw %xmm0, %xmm1

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
    movl 8(%esp), %eax
    cmpl %eax, 4(%esp)
    setl %al
    movzbl %al, %eax
    ret

On some processors (which ones?), it is more efficient to do this:

_test:
    movl 8(%esp), %ebx
    xor %eax, %eax
    cmpl %ebx, 4(%esp)
    setl %al
    ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important. e.g., for:

void setbit(int *target, int bit) {
    *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
    *target &= ~(1 << bit);
}
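
A sketch of the bt-based output we would like (assumes bit is in [0, 31], as
it must be for 1 << bit to be well defined on 32-bit int):

_setbit:
    movl 4(%esp), %eax      # target
    movl 8(%esp), %ecx      # bit
    btsl %ecx, (%eax)       # set the bit in memory
    ret
_clearbit:
    movl 4(%esp), %eax
    movl 8(%esp), %ecx
    btrl %ecx, (%eax)       # clear the bit in memory
    ret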

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

    movl $16843009, 4(%edx)
    movl $16843009, (%edx)
    movw $257, 8(%edx)

It might be better to generate

    movl $16843009, %eax
    movl %eax, 4(%edx)
    movl %eax, (%edx)
    movw %ax, 8(%edx)

when we can spare a register. It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
get this:

define i32 @test1(i32 %X) {
    %Y = sdiv i32 %X, 8
    ret i32 %Y
}

_test1:
    movl 4(%esp), %eax
    movl %eax, %ecx
    sarl $31, %ecx
    shrl $29, %ecx
    addl %ecx, %eax
    sarl $3, %eax
    ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
    movl 4(%esp), %eax
    cmpl $-1, %eax
    leal 7(%eax), %ecx
    cmovle %ecx, %eax
    sarl $3, %eax
    ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and
rep/movsl. We should leave these as libcalls for everything over a much lower
threshold, since libc is hand tuned for medium and large mem ops (avoiding RFO
for large stores, TLB preheating, etc.).

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
    x * copysign(1.0, y) * copysign(1.0, z)
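
A sketch of the reasonable form at the C level (assumes IEEE-754 64-bit
doubles; mulsigns is a name made up here for illustration). The two copysign
factors can only flip the sign of x, so the whole expression reduces to
xoring sign bits:

#include <stdint.h>
#include <string.h>

double mulsigns(double x, double y, double z) {
    uint64_t xb, yb, zb;
    memcpy(&xb, &x, 8);
    memcpy(&yb, &y, 8);
    memcpy(&zb, &z, 8);
    /* sign(result) = sign(x) ^ sign(y) ^ sign(z); magnitude of x unchanged */
    xb ^= (yb ^ zb) & 0x8000000000000000ULL;
    memcpy(&x, &xb, 8);
    return x;
}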

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes, e.g. FR32 /
FR64 to VR128.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
    if (_mm_comige_ss(*A, *B))
        return 3;
    else
        return 4;
}

_test:
    movl 8(%esp), %eax
    movaps (%eax), %xmm0
    movl 4(%esp), %eax
    movaps (%eax), %xmm1
    comiss %xmm0, %xmm1
    setae %al
    movzbl %al, %ecx
    movl $3, %eax
    movl $4, %edx
    cmpl $0, %ecx
    cmove %edx, %eax
    ret

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and the select. 2) The intrinsic is expected to produce an i32
value, so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
    return a * 3;
}

we currently emit:
    imull $3, 4(%esp), %eax

Perhaps this is what we really should generate? Is imull three or four
cycles? Note: ICC generates this:
    movl 4(%esp), %eax
    leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity. The former is
more "complex" because it folds a load, so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
should always try to match LEA first since the LEA matching code does some
estimation to determine whether the match is profitable.

However, if we care more about code size, then imull is better. It's two bytes
shorter than movl + leal.

On a Pentium M, both variants have the same characteristics with regard
to throughput; however, the multiplication has a latency of four cycles, as
opposed to two cycles for the movl+lea variant.

//===---------------------------------------------------------------------===//

__builtin_ffs codegen is messy.

int ffs_(unsigned X) { return __builtin_ffs(X); }

llvm produces:
ffs_:
    movl 4(%esp), %ecx
    bsfl %ecx, %eax
    movl $32, %edx
    cmove %edx, %eax
    incl %eax
    xorl %edx, %edx
    testl %ecx, %ecx
    cmove %edx, %eax
    ret

vs gcc:

_ffs_:
    movl $-1, %edx
    bsfl 4(%esp), %eax
    cmove %edx, %eax
    addl $1, %eax
    ret

Another example of __builtin_ffs (use predsimplify to eliminate a select):

int foo (unsigned long j) {
    if (j)
        return __builtin_ffs (j) - 1;
    else
        return 0;
}
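
With the select gone, foo is just a count-trailing-zeros guarded by the zero
test. A plausible codegen sketch (bsf leaves its destination undefined when
the source is zero, hence the cmov):

_foo:
    xorl %eax, %eax
    bsfl 4(%esp), %ecx      # ZF=1 iff j == 0
    cmovne %ecx, %eax       # ctz(j) when j != 0, else 0
    ret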

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
    br label %cond_true

cond_true:              ; preds = %cond_true, %entry
    %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]          ; <i32> [#uses=3]
    %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]    ; <i32> [#uses=1]
    %tmp2 = getelementptr i32* %a, i32 %x.0.0       ; <i32*> [#uses=1]
    %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
    %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
    %tmp7 = add i32 %tmp5, %tmp3    ; <i32> [#uses=2]
    %tmp9 = add i32 %x.0.0, 1       ; <i32> [#uses=2]
    %tmp = icmp sgt i32 %tmp9, 39   ; <i1> [#uses=1]
    br i1 %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
    ret i32 %tmp7
}
is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
    float fl = (int) (u & 0xffff);
    float fh = (int) (u >> 16);
    fh *= 0x1.0p16f;
    return fh + fl;
}

00000000    subl $0x04,%esp
00000003    movl 0x08(%esp,1),%eax
00000007    movl %eax,%ecx
00000009    shrl $0x10,%ecx
0000000c    cvtsi2ss %ecx,%xmm0
00000010    andl $0x0000ffff,%eax
00000015    cvtsi2ss %eax,%xmm1
00000019    mulss 0x00000078,%xmm0
00000021    addss %xmm1,%xmm0
00000025    movss %xmm0,(%esp,1)
0000002a    flds (%esp,1)
0000002d    addl $0x04,%esp
00000030    ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of a double argument on an
8-byte boundary to improve performance.

//===---------------------------------------------------------------------===//

Codegen:

int f(int a, int b) {
    if (a == 4 || a == 6)
        b++;
    return b;
}


as:

or eax, 2
cmp eax, 6
jz label
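
The trick works because 4 and 6 differ in exactly one bit. A C rendering of
what the combine should produce (f_trick is a made-up name):

int f_trick(int a, int b) {
    /* 4 and 6 differ only in bit 1, so (a | 2) == 6 iff a == 4 || a == 6 */
    return b + ((a | 2) == 6);
}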

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b". For example, instead of:

int G;
void f(int X, int Y) {
    G = X < 0 ? 14 : 13;
}

compiling to:

_f:
    movl $14, %eax
    movl $13, %ecx
    movl 4(%esp), %edx
    testl %edx, %edx
    cmovl %eax, %ecx
    movl %ecx, _G
    ret

it could be:
_f:
    movl 4(%esp), %eax
    sarl $31, %eax
    notl %eax
    addl $14, %eax
    movl %eax, _G
    ret

etc.
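
A C rendering of that branch-free form (a sketch; G2 and f_branchfree are
made-up names, and it assumes arithmetic right shift of negative ints, which
the compilers in question provide):

int G2;
void f_branchfree(int X) {
    G2 = 14 + ~(X >> 31);   /* X < 0: 14 + ~(-1) = 14;  X >= 0: 14 + ~0 = 13 */
}
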
460
Chris Lattnere7037c22007-11-02 17:04:20 +0000461Another is:
462int usesbb(unsigned int a, unsigned int b) {
463 return (a < b ? -1 : 0);
464}
465to:
466_usesbb:
467 movl 8(%esp), %eax
468 cmpl %eax, 4(%esp)
469 sbbl %eax, %eax
470 ret
471
472instead of:
473_usesbb:
474 xorl %eax, %eax
475 movl 8(%esp), %ecx
476 cmpl %ecx, 4(%esp)
477 movl $4294967295, %ecx
478 cmovb %ecx, %eax
479 ret
480
Dan Gohmanf17a25c2007-07-18 16:29:46 +0000481//===---------------------------------------------------------------------===//
482
483Currently we don't have elimination of redundant stack manipulations. Consider
484the code:
485
486int %main() {
487entry:
488 call fastcc void %test1( )
489 call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
490 ret int 0
491}
492
493declare fastcc void %test1()
494
495declare fastcc void %test2(sbyte*)
496
497
498This currently compiles to:
499
500 subl $16, %esp
501 call _test5
502 addl $12, %esp
503 subl $16, %esp
504 movl $_test5, (%esp)
505 call _test6
506 addl $12, %esp
507
508The add\sub pair is really unneeded here.
509
510//===---------------------------------------------------------------------===//
511
Dan Gohmanf17a25c2007-07-18 16:29:46 +0000512Consider the expansion of:
513
Chris Lattnerbea5feb2008-02-14 06:19:02 +0000514define i32 @test3(i32 %X) {
515 %tmp1 = urem i32 %X, 255
516 ret i32 %tmp1
Dan Gohmanf17a25c2007-07-18 16:29:46 +0000517}
518
519Currently it compiles to:
520
521...
522 movl $2155905153, %ecx
523 movl 8(%esp), %esi
524 movl %esi, %eax
525 mull %ecx
526...
527
528This could be "reassociated" into:
529
530 movl $2155905153, %eax
531 movl 8(%esp), %ecx
532 mull %ecx
533
534to avoid the copy. In fact, the existing two-address stuff would do this
535except that mul isn't a commutative 2-addr instruction. I guess this has
536to be done at isel time based on the #uses to mul?
537
538//===---------------------------------------------------------------------===//
539
540Make sure the instruction which starts a loop does not cross a cacheline
541boundary. This requires knowning the exact length of each machine instruction.
542That is somewhat complicated, but doable. Example 256.bzip2:
543
544In the new trace, the hot loop has an instruction which crosses a cacheline
545boundary. In addition to potential cache misses, this can't help decoding as I
546imagine there has to be some kind of complicated decoder reset and realignment
547to grab the bytes from the next cacheline.
548
549532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
Eli Friedman9ab1db02008-11-30 07:52:27 +0000550942 942 0x3d03 movl %dh, (1809(%esp, %esi)
551937 937 0x3d0a incl %esi
5523 3 0x3d0b cmpb %bl, %dl
Dan Gohmanf17a25c2007-07-18 16:29:46 +000055327 27 0x3d0d jnz 0x000062db <main+11707>
554
555//===---------------------------------------------------------------------===//
556
557In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
558
559//===---------------------------------------------------------------------===//
560
561This could be a single 16-bit load.
562
563int f(char *p) {
564 if ((p[0] == 1) & (p[1] == 2)) return 1;
565 return 0;
566}
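
A sketch of the combined form (assumes little-endian byte order and that an
unaligned 16-bit load is fine, both true on x86; f16 is a made-up name):

#include <stdint.h>
#include <string.h>

int f16(const char *p) {
    uint16_t v;
    memcpy(&v, p, 2);       /* the single 16-bit load */
    return v == 0x0201;     /* low byte p[0] == 1, high byte p[1] == 2 */
}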

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

Start using the flags more. For example, compile:

int add_zf(int *x, int y, int a, int b) {
    if ((*x += y) == 0)
        return a;
    else
        return b;
}

to:
    addl %esi, (%rdi)
    movl %edx, %eax
    cmovne %ecx, %eax
    ret
instead of:

_add_zf:
    addl (%rdi), %esi
    movl %esi, (%rdi)
    testl %esi, %esi
    cmove %edx, %ecx
    movl %ecx, %eax
    ret

and:

int add_zf(int *x, int y, int a, int b) {
    if ((*x + y) < 0)
        return a;
    else
        return b;
}

to:

add_zf:
    addl (%rdi), %esi
    movl %edx, %eax
    cmovns %ecx, %eax
    ret

instead of:

_add_zf:
    addl (%rdi), %esi
    testl %esi, %esi
    cmovs %edx, %ecx
    movl %ecx, %eax
    ret

//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

_f:
    movl 4(%esp), %eax
    movl %eax, %ecx
    incl %ecx
    movl 8(%esp), %edx
    cmpl %edx, %ecx
    jne LBB1_2  #UnifiedReturnBlock
LBB1_1: #cond_true
    addl $2, %eax
    ret
LBB1_2: #UnifiedReturnBlock
    movl %ecx, %eax
    ret
_f2:
    movl 4(%esp), %eax
    movl %eax, %ecx
    incl %ecx
    cmpl 8(%esp), %ecx
    sete %cl
    movzbl %cl, %ecx
    leal 1(%ecx,%eax), %eax
    ret

both of which are inferior to GCC's:

_f:
    movl 4(%esp), %edx
    leal 1(%edx), %eax
    addl $2, %edx
    cmpl 8(%esp), %eax
    cmove %edx, %eax
    ret
_f2:
    movl 4(%esp), %eax
    addl $1, %eax
    xorl %edx, %edx
    cmpl 8(%esp), %eax
    sete %dl
    addl %edx, %eax
    ret

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
    if (X) abort();
}

is currently compiled to:

_test:
    subl $12, %esp
    cmpl $0, 16(%esp)
    jne LBB1_1
    addl $12, %esp
    ret
LBB1_1:
    call L_abort$stub

It would be better to produce:

_test:
    subl $12, %esp
    cmpl $0, 16(%esp)
    jne L_abort$stub
    addl $12, %esp
    ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
    cmpl $0, 4(%esp)
    jne LBB1_1
    ret
LBB1_1:
    subl $12, %esp
    call L_abort$stub

Both are useful in different situations. Finally, it could be shrink-wrapped
and tail called, like this:

_test:
    cmpl $0, 4(%esp)
    jne LBB1_1
    ret
LBB1_1:
    pop %eax   # realign stack.
    call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec). For example, on X86-64, compile:

int foo(int A, int B) {
    return A+1;
}

to:

_foo:
    leal 1(%edi), %eax
    ret

instead of:

_foo:
    incl %edi
    movl %edi, %eax
    ret

Another example is:

;; X's live range extends beyond the shift, so the register allocator
;; cannot coalesce it with Y. Because of this, a copy needs to be
;; emitted before the shift to save the register value before it is
;; clobbered. However, this copy is not needed if the register
;; allocator turns the shift into an LEA. This also occurs for ADD.

; Check that the shift gets turned into an LEA.
; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
; RUN: not grep {mov E.X, E.X}

@G = external global i32                ; <i32*> [#uses=3]

define i32 @test1(i32 %X, i32 %Y) {
    %Z = add i32 %X, %Y         ; <i32> [#uses=1]
    volatile store i32 %Y, i32* @G
    volatile store i32 %Z, i32* @G
    ret i32 %X
}

define i32 @test2(i32 %X) {
    %Z = add i32 %X, 1          ; <i32> [#uses=1]
    volatile store i32 %Z, i32* @G
    ret i32 %X
}

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction. Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
    movl $7, %eax
    movsbl 4(%esp), %ecx
    subl %ecx, %eax
    ret

We would use one fewer register if codegen'd as:

    movsbl 4(%esp), %eax
    neg %eax
    add $7, %eax
    ret

Note that this isn't beneficial if the load can be folded into the sub. In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
    movl $7, %eax
    subl 4(%esp), %eax
    ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
    pushl %esi
    subl $4, %esp
...
and an epilog like this:
    addl $4, %esp
    popl %esi
    ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp. Just don't pop
into any return registers :)
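
A sketch of the smaller prolog/epilog (push/pop encode in one byte each; %ecx
here is just a scratch register that is not part of the return value):

_foo:
    pushl %esi
    pushl %eax          # reserve the 4-byte slot
...
    popl %ecx           # discard the slot without touching %eax/%edx
    popl %esi
    ret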

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches. We generate really poor code for:

double testf(double a) {
    return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
    subl $20, %esp
    pxor %xmm0, %xmm0
    movsd 24(%esp), %xmm1
    ucomisd %xmm0, %xmm1
    setnp %al
    sete %cl
    testb %cl, %al
    jne LBB1_5  # UnifiedReturnBlock
LBB1_1: # cond_true


it would be better to replace the last four instructions with:

    jp LBB1_1
    je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

    cvtss2sd LCPI1_0(%rip), %xmm2
    cvtss2sd LCPI1_1(%rip), %xmm3
    ucomisd %xmm1, %xmm0
    ja LBB1_3   # cond_true
LBB1_2: # cond_true
    movapd %xmm3, %xmm2
LBB1_3: # cond_true
    movapd %xmm2, %xmm0
    ret

We should sink the load into xmm3 into the LBB1_2 block. This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
    #include <algorithm>
    inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
    { return std::make_pair(a + b, a + b < a); }
    bool no_overflow(unsigned a, unsigned b)
    { return !full_add(a, b).second; }

Should compile to:

    _Z11no_overflowjj:
        addl %edi, %esi
        setae %al
        ret

FIXME: That code looks wrong; bool return is normally defined as zext.

on x86-64, not:

__Z11no_overflowjj:
    addl %edi, %esi
    cmpl %edi, %esi
    setae %al
    movzbl %al, %eax
    ret


//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. xor reg reg is shorter than mov reg, #0.

//===---------------------------------------------------------------------===//

We aren't matching RMW instructions aggressively
enough. Here's a reduced testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
    %A = load i32* %huge_ptr        ; <i32> [#uses=1]
    %B = load i32* %target_ptr      ; <i32> [#uses=1]
    %C = or i32 %A, %B              ; <i32> [#uses=1]
    store i32 %C, i32* %target_ptr
    ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
    movl (%rdi), %eax
    orl (%rsi), %eax
    movl %eax, (%rsi)
    ret

That should be something like:

_test:
    movl (%rdi), %eax
    orl %eax, (%rsi)
    ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
    %tmp231232 = sext i16 %tmp62 to i32     ; <i32> [#uses=1]
    %tmp233 = sub i32 32, %tmp231232        ; <i32> [#uses=1]
    %tmp245246 = sext i16 %tmp65 to i32     ; <i32> [#uses=1]
    %tmp252253 = sext i16 %tmp68 to i32     ; <i32> [#uses=1]
    %tmp254 = sub i32 32, %tmp252253        ; <i32> [#uses=1]
    %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
    %tmp583584 = sext i16 %tmp98 to i32     ; <i32> [#uses=1]
    %tmp585 = sub i32 32, %tmp583584        ; <i32> [#uses=1]
    %tmp614615 = sext i16 %tmp101 to i32    ; <i32> [#uses=1]
    %tmp621622 = sext i16 %tmp104 to i32    ; <i32> [#uses=1]
    %tmp623 = sub i32 32, %tmp621622        ; <i32> [#uses=1]
    br label %bb114

produces:

LBB3_5: # bb114.preheader
    movswl -68(%ebp), %eax
    movl $32, %ecx
    movl %ecx, -80(%ebp)
    subl %eax, -80(%ebp)
    movswl -52(%ebp), %eax
    movl %ecx, -84(%ebp)
    subl %eax, -84(%ebp)
    movswl -70(%ebp), %eax
    movl %ecx, -88(%ebp)
    subl %eax, -88(%ebp)
    movswl -50(%ebp), %eax
    subl %eax, %ecx
    movl %ecx, -76(%ebp)
    movswl -42(%ebp), %eax
    movl %eax, -92(%ebp)
    movswl -66(%ebp), %eax
    movl %eax, -96(%ebp)
    movw $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl. The above instructions could be:
    movl $32, -80(%ebp)
...
    movl $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
vice-versa).

//===---------------------------------------------------------------------===//

This code:

    %tmp659 = icmp slt i16 %tmp654, 0       ; <i1> [#uses=1]
    br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

    testw %cx, %cx
    movswl %cx, %esi
    jns LBB4_109    # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal. It
suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
    if (foo < 4294967297LL)
        abort();
}

to:

compare:
    subl $4, %esp
    cmpl $0, 8(%esp)
    setne %al
    movzbw %al, %ax
    cmpl $1, 12(%esp)
    setg %cl
    movzbw %cl, %cx
    cmove %ax, %cx
    testb $1, %cl
    jne .LBB1_2 # UnifiedReturnBlock
.LBB1_1:    # ifthen
    call abort
.LBB1_2:    # UnifiedReturnBlock
    addl $4, %esp
    ret

(also really horrible code on ppc). This is due to the expand code for 64-bit
compares. GCC produces multiple branches, which is much nicer:

compare:
    subl $12, %esp
    movl 20(%esp), %edx
    movl 16(%esp), %eax
    decl %edx
    jle .L7
.L5:
    addl $12, %esp
    ret
    .p2align 4,,7
.L7:
    jl .L4
    cmpl $0, %eax
    .p2align 4,,8
    ja .L5
.L4:
    .p2align 4,,9
    call abort

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from
the caller's arguments).
This is done to prevent overwriting of parameters (see example
below) that might be used later.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
    int64 local = arg2 * 2;
    return callee(arg2, (int64)local);
}

[arg1]        [!arg2 no longer valid since we moved local onto it]
[arg2]    ->  [(int64)
[RETADDR]      local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg1,arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg2,arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.

//===---------------------------------------------------------------------===//

main ()
{
    int i = 0;
    unsigned long int z = 0;

    do {
        z -= 0x00004000;
        i++;
        if (i > 0x00040000)
            abort ();
    } while (z > 0);
    exit (0);
}

gcc compiles this to:

_main:
    subl $28, %esp
    xorl %eax, %eax
    jmp L2
L3:
    cmpl $262144, %eax
    je L10
L2:
    addl $1, %eax
    cmpl $262145, %eax
    jne L3
    call L_abort$stub
L10:
    movl $0, (%esp)
    call L_exit$stub

llvm:

_main:
    subl $12, %esp
    movl $1, %eax
    movl $16384, %ecx
LBB1_1: # bb
    cmpl $262145, %eax
    jge LBB1_4  # cond_true
LBB1_2: # cond_next
    incl %eax
    addl $4294950912, %ecx
    cmpl $16384, %ecx
    jne LBB1_1  # bb
LBB1_3: # bb11
    xorl %eax, %eax
    addl $12, %esp
    ret
LBB1_4: # cond_true
    call L_abort$stub

1. LSR should rewrite the first cmp with induction variable %ecx.
2. DAG combiner should fold
        leal 1(%eax), %edx
        cmpl $262145, %edx
   =>
        cmpl $262144, %eax

//===---------------------------------------------------------------------===//

define i64 @test(double %X) {
    %Y = fptosi double %X to i64
    ret i64 %Y
}

compiles to:

_test:
    subl $20, %esp
    movsd 24(%esp), %xmm0
    movsd %xmm0, 8(%esp)
    fldl 8(%esp)
    fisttpll (%esp)
    movl 4(%esp), %edx
    movl (%esp), %eax
    addl $20, %esp
    #FP_REG_KILL
    ret

This should just fldl directly from the input stack slot.
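
A sketch of the code we would rather see (still assumes SSE3 for fisttpll;
the xmm round trip through a stack temporary disappears entirely):

_test:
    subl $8, %esp
    fldl 12(%esp)       # load the double argument directly
    fisttpll (%esp)
    movl (%esp), %eax
    movl 4(%esp), %edx
    addl $8, %esp
    ret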

//===---------------------------------------------------------------------===//

This code:
int foo (int x) { return (x & 65535) | 255; }

Should compile into:

_foo:
    movzwl 4(%esp), %eax
    orl $255, %eax
    ret

instead of:
_foo:
    movl $255, %eax
    orl 4(%esp), %eax
    andl $65535, %eax
    ret

//===---------------------------------------------------------------------===//

We're codegen'ing multiply of long longs inefficiently:

unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
    return arg1 * arg2;
}

We compile to (-fomit-frame-pointer):

_LLM:
    pushl %esi
    movl 8(%esp), %ecx
    movl 16(%esp), %esi
    movl %esi, %eax
    mull %ecx
    imull 12(%esp), %esi
    addl %edx, %esi
    imull 20(%esp), %ecx
    movl %esi, %edx
    addl %ecx, %edx
    popl %esi
    ret

This looks like a scheduling deficiency and lack of remat of the load from
the argument area. ICC apparently produces:

    movl 8(%esp), %ecx
    imull 12(%esp), %ecx
    movl 16(%esp), %eax
    imull 4(%esp), %eax
    addl %eax, %ecx
    movl 4(%esp), %eax
    mull 12(%esp)
    addl %ecx, %edx
    ret

Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236

//===---------------------------------------------------------------------===//

We can fold a store into "zeroing a reg". Instead of:

xorl %eax, %eax
movl %eax, 124(%esp)

we should get:

movl $0, 124(%esp)

if the flags of the xor are dead.

Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
be folded into: shl [mem], 1

//===---------------------------------------------------------------------===//

This testcase misses a read/modify/write opportunity (from PR1425):

void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
    int i;
    for(i=0; i<width; i++)
        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
}

We compile it down to:

LBB1_2: # bb
    movl (%esi,%edi,4), %ebx
    addl (%ecx,%edi,4), %ebx
    addl (%edx,%edi,4), %ebx
    movl %ebx, (%ecx,%edi,4)
    incl %edi
    cmpl %eax, %edi
    jne LBB1_2  # bb

the inner loop should add to the memory location (%ecx,%edi,4), saving
a mov. Something like:

    movl (%esi,%edi,4), %ebx
    addl (%edx,%edi,4), %ebx
    addl %ebx, (%ecx,%edi,4)

Here is another interesting example:

void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
    int i;
    for(i=0; i<width; i++)
        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
}

We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:

LBB9_2: # bb
    movl (%ecx,%edi,4), %ebx
    subl (%esi,%edi,4), %ebx
    subl (%edx,%edi,4), %ebx
    movl %ebx, (%ecx,%edi,4)
    incl %edi
    cmpl %eax, %edi
    jne LBB9_2  # bb

Additionally, LSR should rewrite the exit condition of these loops to use
a stride-4 IV, which would allow all the scales in the loop to go away.
This would result in smaller code and more efficient microops.

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:

    xorpd LCPI1_0, %xmm2

However, if xmm2 gets spilled, we end up with really ugly code like this:

    movsd (%esp), %xmm0
    xorpd LCPI1_0, %xmm0
    movsd %xmm0, (%esp)

Since we 'know' that this is a 'neg', we can actually "fold" the spill into
the neg/abs instruction, turning it into an *integer* operation, like this:

    xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)

you could also use xorb, but xorl is less likely to lead to a partial register
stall. Here is a contrived testcase:

void bar(void);
double a, b, c;
void test(double *P) {
    double X = *P;
    a = X;
    bar();
    X = -X;
    b = X;
    bar();
    c = X;
}

//===---------------------------------------------------------------------===//

Handling llvm.memory.barrier on pre-SSE2 CPUs

should generate:
lock ; mov %esp, %esp

//===---------------------------------------------------------------------===//

The generated code on x86 for checking for signed overflow on a multiply the
obvious way is much longer than it needs to be.

int x(int a, int b) {
    long long prod = (long long)a*b;
    return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}

See PR2053 for more details.
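
A sketch of what it could be instead (the one-operand imull sets OF exactly
when the signed 64-bit product does not fit in 32 bits, which is the
condition being tested):

_x:
    movl 4(%esp), %eax
    imull 8(%esp)       # edx:eax = a*b, OF = product doesn't fit in 32 bits
    seto %al
    movzbl %al, %eax
    ret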

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter. Downside is that it puts more
pressure on register allocation because it has fixed operands.

Example:
int abs(int x) {return x < 0 ? -x : x;}

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
    movl 4(%esp), %eax
    cltd
    xorl %edx, %eax
    subl %edx, %eax
    ret

//===---------------------------------------------------------------------===//

Consider:
int test(unsigned long a, unsigned long b) { return -(a < b); }

We currently compile this to:

define i32 @test(i32 %a, i32 %b) nounwind {
    %tmp3 = icmp ult i32 %a, %b     ; <i1> [#uses=1]
    %tmp34 = zext i1 %tmp3 to i32   ; <i32> [#uses=1]
    %tmp5 = sub i32 0, %tmp34       ; <i32> [#uses=1]
    ret i32 %tmp5
}

and

_test:
    movl 8(%esp), %eax
    cmpl %eax, 4(%esp)
    setb %al
    movzbl %al, %eax
    negl %eax
    ret

Several deficiencies here. First, we should instcombine zext+neg into sext:

define i32 @test2(i32 %a, i32 %b) nounwind {
    %tmp3 = icmp ult i32 %a, %b     ; <i1> [#uses=1]
    %tmp34 = sext i1 %tmp3 to i32   ; <i32> [#uses=1]
    ret i32 %tmp34
}

However, before we can do that, we have to fix the bad codegen that we get for
sext from bool:

_test2:
    movl 8(%esp), %eax
    cmpl %eax, 4(%esp)
    setb %al
    movzbl %al, %eax
    shll $31, %eax
    sarl $31, %eax
    ret

This code should be at least as good as the code above. Once this is fixed, we
can optimize this specific case even more to:

    movl 8(%esp), %eax
    xorl %ecx, %ecx
    cmpl %eax, 4(%esp)
    sbbl %ecx, %ecx

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
    if (arg1 >> 48)
        return (first_one[arg1 >> 48]);
    return 0;
}


The following code is currently generated:
FirstOnet:
    movl 8(%esp), %eax
    cmpl $65536, %eax
    movl 4(%esp), %ecx
    jb .LBB1_2  # UnifiedReturnBlock
.LBB1_1:    # ifthen
    shrl $16, %eax
    movzbl first_one(%eax), %eax
    ret
.LBB1_2:    # UnifiedReturnBlock
    xorl %eax, %eax
    ret

There are a few possible improvements here:
1. We should be able to eliminate the dead load into %ecx
2. We could change the "movl 8(%esp), %eax" into
   "movzwl 10(%esp), %eax"; this lets us change the cmpl
   into a testl, which is shorter, and eliminate the shift.

We could also in theory eliminate the branch by using a conditional
for the address of the load, but that seems unlikely to be worthwhile
in general.
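
Putting improvements 1 and 2 together, a sketch of the hoped-for output (the
little-endian layout puts the top 16 bits of arg1 at 10(%esp)):

FirstOnet:
    movzwl 10(%esp), %eax
    testl %eax, %eax
    je .LBB1_2
    movzbl first_one(%eax), %eax
    ret
.LBB1_2:
    xorl %eax, %eax
    ret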

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
entry:
    %tmp2 = icmp eq i8 %d, 0        ; <i1> [#uses=1]
    br i1 %tmp2, label %bb7, label %bb

bb:             ; preds = %entry
    %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
    ret i32 %tmp6

bb7:            ; preds = %entry
    %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
    ret i32 %tmp10
}

to:

_foo:
    cmpb $0, 16(%esp)
    movl 12(%esp), %ecx
    movl 8(%esp), %eax
    movl 4(%esp), %edx
    je LBB1_2   # bb7
LBB1_1: # bb
    addl %edx, %eax
    ret
LBB1_2: # bb7
    movl %edx, %eax
    subl %ecx, %eax
    ret

The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
if it commuted the addl in LBB1_1.

//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15: # bb310
    cvtss2sd LCPI1_0, %xmm1
    addsd %xmm1, %xmm0
    movsd 176(%esp), %xmm2
    mulsd %xmm0, %xmm2
    movapd %xmm2, %xmm3
    mulsd %xmm3, %xmm3
    movapd %xmm3, %xmm4
    mulsd LCPI1_23, %xmm4
    addsd LCPI1_24, %xmm4
    mulsd %xmm3, %xmm4
    addsd LCPI1_25, %xmm4
    mulsd %xmm3, %xmm4
    addsd LCPI1_26, %xmm4
    mulsd %xmm3, %xmm4
    addsd LCPI1_27, %xmm4
    mulsd %xmm3, %xmm4
    addsd LCPI1_28, %xmm4
    mulsd %xmm3, %xmm4
    addsd %xmm1, %xmm4
    mulsd %xmm2, %xmm4
    movsd 152(%esp), %xmm1
    addsd %xmm4, %xmm1
    movsd %xmm1, 152(%esp)
    incl %eax
    cmpl %eax, %esi
    jge LBB1_15 # bb310
LBB1_16: # bb358.loopexit
    movsd 152(%esp), %xmm0
    addsd %xmm0, %xmm0
    addsd LCPI1_22, %xmm0
    movsd %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should
insert a copy to split the interval (one for the duration of the loop, one
extending to the fall through). The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Legalize loses track of the fact that bools are always zero extended when in
memory. This causes us to compile abort_gzip (from 164.gzip) from:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false      ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
entry:
    %tmp.b.i = load i1* @in_exit.4870.b     ; <i1> [#uses=1]
    br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:           ; preds = %entry
    tail call void @exit( i32 1 ) noreturn nounwind
    unreachable
bb4.i:          ; preds = %entry
    store i1 true, i1* @in_exit.4870.b
    tail call void @exit( i32 1 ) noreturn nounwind
    unreachable
}
declare void @exit(i32) noreturn nounwind

into:

_abort_gzip:
    subl $12, %esp
    movb _in_exit.4870.b, %al
    notb %al
    testb $1, %al
    jne LBB1_2  ## bb4.i
LBB1_1: ## bb.i
...

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
    return x-y-1;
}

into (-m64):

_test:
    decl %edi
    movl %edi, %eax
    subl %esi, %eax
    ret

it would be better to codegen as: x+~y (notl+addl)
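
A sketch of the two-instruction form this enables (x - y - 1 == x + ~y; the
lea stands in for the addl+movl pair):

_test:
    notl %esi
    leal (%rdi,%rsi), %eax
    ret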

//===---------------------------------------------------------------------===//

This code:

int foo(const char *str,...)
{
    __builtin_va_list a; int x;
    __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
    return x;
}

gets compiled into this on x86-64:
    subq $200, %rsp
    movaps %xmm7, 160(%rsp)
    movaps %xmm6, 144(%rsp)
    movaps %xmm5, 128(%rsp)
    movaps %xmm4, 112(%rsp)
    movaps %xmm3, 96(%rsp)
    movaps %xmm2, 80(%rsp)
    movaps %xmm1, 64(%rsp)
    movaps %xmm0, 48(%rsp)
    movq %r9, 40(%rsp)
    movq %r8, 32(%rsp)
    movq %rcx, 24(%rsp)
    movq %rdx, 16(%rsp)
    movq %rsi, 8(%rsp)
    leaq (%rsp), %rax
    movq %rax, 192(%rsp)
    leaq 208(%rsp), %rax
    movq %rax, 184(%rsp)
    movl $48, 180(%rsp)
    movl $8, 176(%rsp)
    movl 176(%rsp), %eax
    cmpl $47, %eax
    jbe .LBB1_3 # bb
.LBB1_1:    # bb3
    movq 184(%rsp), %rcx
    leaq 8(%rcx), %rax
    movq %rax, 184(%rsp)
.LBB1_2:    # bb4
    movl (%rcx), %eax
    addq $200, %rsp
    ret
.LBB1_3:    # bb
    movl %eax, %ecx
    addl $8, %eax
    addq 192(%rsp), %rcx
    movl %eax, 176(%rsp)
    jmp .LBB1_2 # bb4

gcc 4.3 generates:
    subq $96, %rsp
.LCFI0:
    leaq 104(%rsp), %rax
    movq %rsi, -80(%rsp)
    movl $8, -120(%rsp)
    movq %rax, -112(%rsp)
    leaq -88(%rsp), %rax
    movq %rax, -104(%rsp)
    movl $8, %eax
    cmpl $48, %eax
    jb .L6
    movq -112(%rsp), %rdx
    movl (%rdx), %eax
    addq $96, %rsp
    ret
    .p2align 4,,10
    .p2align 3
.L6:
    mov %eax, %edx
    addq -104(%rsp), %rdx
    addl $8, %eax
    movl %eax, -120(%rsp)
    movl (%rdx), %eax
    addq $96, %rsp
    ret

and it gets compiled into this on x86:
    pushl %ebp
    movl %esp, %ebp
    subl $4, %esp
    leal 12(%ebp), %eax
    movl %eax, -4(%ebp)
    leal 16(%ebp), %eax
    movl %eax, -4(%ebp)
    movl 12(%ebp), %eax
    addl $4, %esp
    popl %ebp
    ret

gcc 4.3 generates:
    pushl %ebp
    movl %esp, %ebp
    movl 12(%ebp), %eax
    popl %ebp
    ret

//===---------------------------------------------------------------------===//

Teach tblgen not to check bitconvert source type in some cases. This allows us
to consolidate the following patterns in X86InstrMMX.td:

def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                  (iPTR 0))))),
          (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
                                                 (iPTR 0))))),
          (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;

There are other cases in various td files.

//===---------------------------------------------------------------------===//

Take something like the following on x86-32:
unsigned a(unsigned long long x, unsigned y) {return x % y;}

We currently generate a libcall, but we really shouldn't: the expansion is
shorter and likely faster than the libcall. The expected code is something
like the following:

    movl 12(%ebp), %eax
    movl 16(%ebp), %ecx
    xorl %edx, %edx
    divl %ecx
    movl 8(%ebp), %eax
    divl %ecx
    movl %edx, %eax
    ret

A similar code sequence works for division.

//===---------------------------------------------------------------------===//

These should compile to the same code, but the latter codegen's to useless
instructions on X86. This may be a trivial dag combine (GCC PR7061):

struct s1 { unsigned char a, b; };
unsigned long f1(struct s1 x) {
    return x.a + x.b;
}
struct s2 { unsigned a: 8, b: 8; };
unsigned long f2(struct s2 x) {
    return x.a + x.b;
}

//===---------------------------------------------------------------------===//

We currently compile this:

define i32 @func1(i32 %v1, i32 %v2) nounwind {
entry:
    %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
    %sum = extractvalue {i32, i1} %t, 0
    %obit = extractvalue {i32, i1} %t, 1
    br i1 %obit, label %overflow, label %normal
normal:
    ret i32 %sum
overflow:
    call void @llvm.trap()
    unreachable
}
declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
declare void @llvm.trap()

to:

_func1:
    movl 4(%esp), %eax
    addl 8(%esp), %eax
    jo LBB1_2   ## overflow
LBB1_1: ## normal
    ret
LBB1_2: ## overflow
    ud2

it would be nice to produce "into" someday.

//===---------------------------------------------------------------------===//

This code:

void vec_mpys1(int y[], const int x[], int scaler) {
    int i;
    for (i = 0; i < 150; i++)
        y[i] += (((long long)scaler * (long long)x[i]) >> 31);
}

Compiles to this loop with GCC 3.x:

.L5:
    movl %ebx, %eax
    imull (%edi,%ecx,4)
    shrdl $31, %edx, %eax
    addl %eax, (%esi,%ecx,4)
    incl %ecx
    cmpl $149, %ecx
    jle .L5

llvm-gcc compiles it to the much uglier:

LBB1_1: ## bb1
    movl 24(%esp), %eax
    movl (%eax,%edi,4), %ebx
    movl %ebx, %ebp
    imull %esi, %ebp
    movl %ebx, %eax
    mull %ecx
    addl %ebp, %edx
    sarl $31, %ebx
    imull %ecx, %ebx
    addl %edx, %ebx
    shldl $1, %eax, %ebx
    movl 20(%esp), %eax
    addl %ebx, (%eax,%edi,4)
    incl %edi
    cmpl $150, %edi
    jne LBB1_1  ## bb1

//===---------------------------------------------------------------------===//

test/CodeGen/X86/2009-03-07-FPConstSelect.ll compiles to:

_f:
    xorl %eax, %eax
    cmpl $0, 4(%esp)
    movl $4, %ecx
    cmovne %eax, %ecx
    flds LCPI1_0(%ecx)
    ret

we should recognize cmov of 0 and a power of two and compile it into a
setcc+shift. This would give us something like:

_f:
    xorl %eax, %eax
    cmpl $0, 4(%esp)
    sete %al
    flds LCPI1_0(,%eax,4)
    ret

//===---------------------------------------------------------------------===//

memcpy/memmove do not lower to SSE copies when possible. A silly example is:
define <16 x float> @foo(<16 x float> %A) nounwind {
    %tmp = alloca <16 x float>, align 16
    %tmp2 = alloca <16 x float>, align 16
    store <16 x float> %A, <16 x float>* %tmp
    %s = bitcast <16 x float>* %tmp to i8*
    %s2 = bitcast <16 x float>* %tmp2 to i8*
    call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
    %R = load <16 x float>* %tmp2
    ret <16 x float> %R
}

declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind

which compiles to:

_foo:
    subl $140, %esp
    movaps %xmm3, 112(%esp)
    movaps %xmm2, 96(%esp)
    movaps %xmm1, 80(%esp)
    movaps %xmm0, 64(%esp)
    movl 60(%esp), %eax
    movl %eax, 124(%esp)
    movl 56(%esp), %eax
    movl %eax, 120(%esp)
    movl 52(%esp), %eax
    <many many more 32-bit copies>
    movaps (%esp), %xmm0
    movaps 16(%esp), %xmm1
    movaps 32(%esp), %xmm2
    movaps 48(%esp), %xmm3
    addl $140, %esp
    ret

On Nehalem, it may even be cheaper to just use movups when unaligned than to
fall back to lower-granularity chunks.

//===---------------------------------------------------------------------===//