//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//


//===---------------------------------------------------------------------===//

CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case.  We should investigate
why this isn't happening; it could have a significant impact on other important
cases for X86 as well.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer.  What about overflow
though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224
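
When the quotient fits in 32 bits this is a single divl; a rough sketch of the
desired code for the 32-bit ABI (not actual output), with the caveat that divl
raises #DE when the quotient overflows 32 bits, which is exactly the overflow
question above:

        movl 4(%esp), %eax              # low half of X
        movl 8(%esp), %edx              # high half of X
        divl 12(%esp)                   # edx:eax / Y, quotient in %eax
        ret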

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be ~0ULL >> X and ~0ULL << X.

One better solution for 1LL << x is:
        xorl %eax, %eax
        xorl %edx, %edx
        testb $32, %cl
        sete %al
        setne %dl
        sall %cl, %eax
        sall %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better.  It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
        movl %ecx, %eax
        shrl $5, %eax
        movl %eax, %edx
        xorl $1, %edx
        sall %cl, %eax
        sall %cl, %edx

64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Compile this:
_Bool f(_Bool a) { return a!=1; }

into:
        movzbl %dil, %eax
        xorl $1, %eax
        ret

(Although note that this isn't a legal way to express the code that llvm-gcc
currently generates for that function.)

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?
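
The pattern in question looks something like this (a contrived sketch, not a
codegen dump): writing a 16-bit subregister and then reading the full register
stalls on P6-family processors, while the promoted form does not:

        movw (%ecx), %ax                # partial write of %eax
        addl %eax, %edx                 # full-register read -> partial register stall

        movzwl (%ecx), %eax             # promoted: writes all of %eax
        addl %eax, %edx                 # no stall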

//===---------------------------------------------------------------------===//

Leave any_extend as a pseudo instruction and hint to the register
allocator.  Delay codegen until post register allocation.
Note: any_extend is now turned into an INSERT_SUBREG.  We still need to teach
the coalescer how to deal with it though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing.  Need to investigate.

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor.  They are slower on the P4 due to only updating some processor
flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.
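
A made-up example where the load naturally lands on the LHS of the compare:

int test(int *p, int x) {
  return *p < x;   /* forms (setlt (load p), x); swapping the operands and
                      inverting the condition to setgt would put the load on
                      the RHS, where the written pattern can fold it */
}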

//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to
        pmuludq (%eax), %xmm0
        movl 8(%esp), %eax
        movdqa (%eax), %xmm1
        pmulhuw %xmm0, %xmm1

The transformation probably requires an X86-specific pass or a DAG combiner
target-specific hook.

//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

on some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important.  e.g., for:

void setbit(int *target, int bit) {
  *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
  *target &= ~(1 << bit);
}
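
For setbit, something along these lines seems plausible on x86-32 (a sketch,
not verified output; assumes bit is in 0..31, as the C shift already requires):

_setbit:
        movl 4(%esp), %eax              # target
        movl 8(%esp), %ecx              # bit
        btsl %ecx, (%eax)               # *target |= (1 << bit) in one RMW op
        ret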

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register.  It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
get this:

define i32 @test1(i32 %X) {
        %Y = sdiv i32 %X, 8
        ret i32 %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl 4(%esp), %eax
        cmpl $-1, %eax
        leal 7(%eax), %ecx
        cmovle %ecx, %eax
        sarl $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)

//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
  x * copysign(1.0, y) * copysign(1.0, z)
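
One plausible reading of "reasonable" (an assumption, not a committed design):
the two copysigns can only flip the sign of x, so the whole expression reduces
to xoring in the combined sign bit of y and z.  A C sketch, ignoring
signaling-NaN subtleties:

#include <stdint.h>
#include <string.h>

double sketch(double x, double y, double z) {
  uint64_t xb, yb, zb;
  memcpy(&xb, &x, 8); memcpy(&yb, &y, 8); memcpy(&zb, &z, 8);
  xb ^= (yb ^ zb) & 0x8000000000000000ULL;  /* flip x's sign iff y, z disagree */
  memcpy(&x, &xb, 8);
  return x;
}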

//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

//===---------------------------------------------------------------------===//

%X = weak global int 0

void %foo(int %N) {
        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

no_exit:
        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

return:
        ret void
}

compiles into:

        .text
        .align 4
        .globl _foo
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader
        xorl %ecx, %ecx
LBB_foo_2:      # no_exit
        movl L_X$non_lazy_ptr, %edx
        movl %ecx, (%edx)
        incl %ecx
        cmpl %eax, %ecx
        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit
LBB_foo_4:      # return
        ret

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented.  This can be accomplished with 1) a
target-dependent LICM pass or 2) making SelectionDAG represent the whole
function.

//===---------------------------------------------------------------------===//

The following tests perform worse with LSR:

lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.

//===---------------------------------------------------------------------===//

We are generating far worse code than gcc:

volatile short X, Y;

void foo(int N) {
  int i;
  for (i = 0; i < N; i++) { X = i; Y = i*4; }
}

LBB1_1: # entry.bb_crit_edge
        xorl %ecx, %ecx
        xorw %dx, %dx
LBB1_2: # bb
        movl L_X$non_lazy_ptr, %esi
        movw %cx, (%esi)
        movl L_Y$non_lazy_ptr, %esi
        movw %dx, (%esi)
        addw $4, %dx
        incl %ecx
        cmpl %eax, %ecx
        jne LBB1_2      # bb

vs.

        xorl %edx, %edx
        movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
        movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
L4:
        movw %dx, (%esi)
        leal 0(,%edx,4), %eax
        movw %ax, (%ecx)
        addl $1, %edx
        cmpl %edx, %edi
        jne L4

This is due to the lack of post-regalloc LICM.

//===---------------------------------------------------------------------===//

Teach the coalescer to coalesce vregs of different register classes, e.g.
FR32/FR64 to VR128.

//===---------------------------------------------------------------------===//

Adding to the list of cmp / test poor codegen issues:

int test(__m128 *A, __m128 *B) {
  if (_mm_comige_ss(*A, *B))
    return 3;
  else
    return 4;
}

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        movl 4(%esp), %eax
        movaps (%eax), %xmm1
        comiss %xmm0, %xmm1
        setae %al
        movzbl %al, %ecx
        movl $3, %eax
        movl $4, %edx
        cmpl $0, %ecx
        cmove %edx, %eax
        ret

Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae.  There
are a number of issues.  1) We are introducing a setcc between the result of the
intrinsic call and select.  2) The intrinsic is expected to produce an i32 value
so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

If shorter, we should use things like:
movzwl %ax, %eax
instead of:
andl $65535, %EAX

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate?  Is imull three or four
cycles?  Note: ICC generates this:
        movl 4(%esp), %eax
        leal (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity.  The former is
more "complex" because it folds a load so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority?  We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better.  It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//

__builtin_ffs codegen is messy.

int ffs_(unsigned X) { return __builtin_ffs(X); }

llvm produces:
ffs_:
        movl 4(%esp), %ecx
        bsfl %ecx, %eax
        movl $32, %edx
        cmove %edx, %eax
        incl %eax
        xorl %edx, %edx
        testl %ecx, %ecx
        cmove %edx, %eax
        ret

vs gcc:

_ffs_:
        movl $-1, %edx
        bsfl 4(%esp), %eax
        cmove %edx, %eax
        addl $1, %eax
        ret

Another example of __builtin_ffs (use predsimplify to eliminate a select):

int foo (unsigned long j) {
  if (j)
    return __builtin_ffs (j) - 1;
  else
    return 0;
}

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]          ; <i32> [#uses=3]
        %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]    ; <i32> [#uses=1]
        %tmp2 = getelementptr i32* %a, i32 %x.0.0       ; <i32*> [#uses=1]
        %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
        %tmp5 = add i32 %t_addr.0.0, %x.0.0     ; <i32> [#uses=1]
        %tmp7 = add i32 %tmp5, %tmp3    ; <i32> [#uses=2]
        %tmp9 = add i32 %x.0.0, 1       ; <i32> [#uses=2]
        %tmp = icmp sgt i32 %tmp9, 39   ; <i1> [#uses=1]
        br i1 %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret i32 %tmp7
}
is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using the fastcc ABI, align the stack slot of an argument of type double
on an 8 byte boundary to improve performance.

//===---------------------------------------------------------------------===//

Codegen:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b++;
  return b;
}


as:

or eax, 2
cmp eax, 6
jz label

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".  For example, instead of:

int G;
void f(int X, int Y) {
  G = X < 0 ? 14 : 13;
}

compiling to:

_f:
        movl $14, %eax
        movl $13, %ecx
        movl 4(%esp), %edx
        testl %edx, %edx
        cmovl %eax, %ecx
        movl %ecx, _G
        ret

it could be:
_f:
        movl 4(%esp), %eax
        sarl $31, %eax
        notl %eax
        addl $14, %eax
        movl %eax, _G
        ret

etc.

Another is:
int usesbb(unsigned int a, unsigned int b) {
  return (a < b ? -1 : 0);
}
to:
_usesbb:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        sbbl %eax, %eax
        ret

instead of:
_usesbb:
        xorl %eax, %eax
        movl 8(%esp), %ecx
        cmpl %ecx, 4(%esp)
        movl $4294967295, %ecx
        cmovb %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

Currently we don't have elimination of redundant stack manipulations.  Consider
the code:

int %main() {
entry:
        call fastcc void %test1( )
        call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
        ret int 0
}

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)


This currently compiles to:

        subl $16, %esp
        call _test5
        addl $12, %esp
        subl $16, %esp
        movl $_test5, (%esp)
        call _test6
        addl $12, %esp

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

Consider the expansion of:

define i32 @test3(i32 %X) {
        %tmp1 = urem i32 %X, 255
        ret i32 %tmp1
}

Currently it compiles to:

...
        movl $2155905153, %ecx
        movl 8(%esp), %esi
        movl %esi, %eax
        mull %ecx
...

This could be "reassociated" into:

        movl $2155905153, %eax
        movl 8(%esp), %ecx
        mull %ecx

to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses to mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary.  This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable.  Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary.  In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
937  937 0x3d0a incl     %esi
3    3   0x3d0b cmpb     %bl, %dl
27   27  0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
  if ((p[0] == 1) & (p[1] == 2)) return 1;
  return 0;
}
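
Something like this seems possible on a little-endian target (a sketch; both
bytes are unconditionally read by the '&', so the wide load is legal):

        movl 4(%esp), %eax
        xorl %ecx, %ecx
        cmpw $0x0201, (%eax)            # p[1]:p[0] == 0x02:0x01
        sete %cl
        movl %ecx, %eax
        ret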

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

//===---------------------------------------------------------------------===//

Start using the flags more.  For example, compile:

int add_zf(int *x, int y, int a, int b) {
  if ((*x += y) == 0)
    return a;
  else
    return b;
}

to:
        addl %esi, (%rdi)
        movl %edx, %eax
        cmovne %ecx, %eax
        ret
instead of:

_add_zf:
        addl (%rdi), %esi
        movl %esi, (%rdi)
        testl %esi, %esi
        cmove %edx, %ecx
        movl %ecx, %eax
        ret

and:

int add_zf(int *x, int y, int a, int b) {
  if ((*x + y) < 0)
    return a;
  else
    return b;
}

to:

add_zf:
        addl (%rdi), %esi
        movl %edx, %eax
        cmovns %ecx, %eax
        ret

instead of:

_add_zf:
        addl (%rdi), %esi
        testl %esi, %esi
        cmovs %edx, %ecx
        movl %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

_f:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        movl 8(%esp), %edx
        cmpl %edx, %ecx
        jne LBB1_2      #UnifiedReturnBlock
LBB1_1: #cond_true
        addl $2, %eax
        ret
LBB1_2: #UnifiedReturnBlock
        movl %ecx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        cmpl 8(%esp), %ecx
        sete %cl
        movzbl %cl, %ecx
        leal 1(%ecx,%eax), %eax
        ret

both of which are inferior to GCC's:

_f:
        movl 4(%esp), %edx
        leal 1(%edx), %eax
        addl $2, %edx
        cmpl 8(%esp), %eax
        cmove %edx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        addl $1, %eax
        xorl %edx, %edx
        cmpl 8(%esp), %eax
        sete %dl
        addl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne LBB1_1
        addl $12, %esp
        ret
LBB1_1:
        call L_abort$stub

It would be better to produce:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne L_abort$stub
        addl $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        subl $12, %esp
        call L_abort$stub

Both are useful in different situations.  Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        pop %eax   # realign stack.
        call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec).  For example, on X86-64, compile:

int foo(int A, int B) {
  return A+1;
}

to:

_foo:
        leal 1(%edi), %eax
        ret

instead of:

_foo:
        incl %edi
        movl %edi, %eax
        ret

Another example is:

;; X's live range extends beyond the shift, so the register allocator
;; cannot coalesce it with Y.  Because of this, a copy needs to be
;; emitted before the shift to save the register value before it is
;; clobbered.  However, this copy is not needed if the register
;; allocator turns the shift into an LEA.  This also occurs for ADD.

; Check that the shift gets turned into an LEA.
; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
; RUN:   not grep {mov E.X, E.X}

@G = external global i32                ; <i32*> [#uses=3]

define i32 @test1(i32 %X, i32 %Y) {
        %Z = add i32 %X, %Y             ; <i32> [#uses=1]
        volatile store i32 %Y, i32* @G
        volatile store i32 %Z, i32* @G
        ret i32 %X
}

define i32 @test2(i32 %X) {
        %Z = add i32 %X, 1              ; <i32> [#uses=1]
        volatile store i32 %Z, i32* @G
        ret i32 %X
}

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction.  Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl $7, %eax
        movsbl 4(%esp), %ecx
        subl %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub.  In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl $7, %eax
        subl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl %esi
        subl $4, %esp
...
and an epilog like this:
        addl $4, %esp
        popl %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp.  Just don't pop
into any return registers :)
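
That is, roughly (a sketch of the idea, not verified output):

_foo:
        pushl %esi
        pushl %eax              # reserve the 4-byte slot
...
        popl %ecx               # discard the slot into a dummy register
        popl %esi
        ret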

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches.  We generate really poor code for:

double testf(double a) {
  return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl $20, %esp
        pxor %xmm0, %xmm0
        movsd 24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp %al
        sete %cl
        testb %cl, %al
        jne LBB1_5      # UnifiedReturnBlock
LBB1_1: # cond_true


it would be better to replace the last four instructions with:

        jp LBB1_1
        je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd LCPI1_0(%rip), %xmm2
        cvtss2sd LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja LBB1_3       # cond_true
LBB1_2: # cond_true
        movapd %xmm3, %xmm2
LBB1_3: # cond_true
        movapd %xmm2, %xmm0
        ret

We should sink the load into xmm3 into the LBB1_2 block.  This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:


        _Z11no_overflowjj:
                addl %edi, %esi
                setae %al
                ret

FIXME: That code looks wrong; bool return is normally defined as zext.

on x86-64, not:

__Z11no_overflowjj:
        addl %edi, %esi
        cmpl %edi, %esi
        setae %al
        movzbl %al, %eax
        ret


//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead.  xor reg reg is shorter than mov reg, #0.

//===---------------------------------------------------------------------===//

We aren't matching RMW instructions aggressively
enough.  Here's a reduced testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
        %A = load i32* %huge_ptr                ; <i32> [#uses=1]
        %B = load i32* %target_ptr              ; <i32> [#uses=1]
        %C = or i32 %A, %B              ; <i32> [#uses=1]
        store i32 %C, i32* %target_ptr
        ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
        movl (%rdi), %eax
        orl (%rsi), %eax
        movl %eax, (%rsi)
        ret

That should be something like:

_test:
        movl (%rdi), %eax
        orl %eax, (%rsi)
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
        %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
        %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
        %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
        %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
        %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
        %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
        %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
        %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
        %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
        %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
        %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
        br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl -68(%ebp), %eax
        movl $32, %ecx
        movl %ecx, -80(%ebp)
        subl %eax, -80(%ebp)
        movswl -52(%ebp), %eax
        movl %ecx, -84(%ebp)
        subl %eax, -84(%ebp)
        movswl -70(%ebp), %eax
        movl %ecx, -88(%ebp)
        subl %eax, -88(%ebp)
        movswl -50(%ebp), %eax
        subl %eax, %ecx
        movl %ecx, -76(%ebp)
        movswl -42(%ebp), %eax
        movl %eax, -92(%ebp)
        movswl -66(%ebp), %eax
        movl %eax, -96(%ebp)
        movw $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl.  The above instructions could be:
        movl $32, -80(%ebp)
...
        movl $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot.  However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
vice-versa).

//===---------------------------------------------------------------------===//

This code:

        %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
        br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw %cx, %cx
        movswl %cx, %esi
        jns LBB4_109    # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal.  It
suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

compare:
        subl $4, %esp
        cmpl $0, 8(%esp)
        setne %al
        movzbw %al, %ax
        cmpl $1, 12(%esp)
        setg %cl
        movzbw %cl, %cx
        cmove %ax, %cx
        testb $1, %cl
        jne .LBB1_2     # UnifiedReturnBlock
.LBB1_1:        # ifthen
        call abort
.LBB1_2:        # UnifiedReturnBlock
        addl $4, %esp
        ret

(also really horrible code on ppc).  This is due to the expand code for 64-bit
compares.  GCC produces multiple branches, which is much nicer:

compare:
        subl $12, %esp
        movl 20(%esp), %edx
        movl 16(%esp), %eax
        decl %edx
        jle .L7
.L5:
        addl $12, %esp
        ret
        .p2align 4,,7
.L7:
        jl .L4
        cmpl $0, %eax
        .p2align 4,,8
        ja .L5
.L4:
        .p2align 4,,9
        call abort

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from
the caller's arguments).
This is done to prevent overwriting of parameters (see example
below) that might be used later.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
  int64 local = arg2 * 2;
  return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]      ->  [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:

 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg1,arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
     return callee(arg2,arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.

//===---------------------------------------------------------------------===//

main ()
{
  int i = 0;
  unsigned long int z = 0;

  do {
    z -= 0x00004000;
    i++;
    if (i > 0x00040000)
      abort ();
  } while (z > 0);
  exit (0);
}

gcc compiles this to:

_main:
        subl $28, %esp
        xorl %eax, %eax
        jmp L2
L3:
        cmpl $262144, %eax
        je L10
L2:
        addl $1, %eax
        cmpl $262145, %eax
        jne L3
        call L_abort$stub
L10:
        movl $0, (%esp)
        call L_exit$stub

llvm:

_main:
        subl $12, %esp
        movl $1, %eax
        movl $16384, %ecx
LBB1_1: # bb
        cmpl $262145, %eax
        jge LBB1_4      # cond_true
LBB1_2: # cond_next
        incl %eax
        addl $4294950912, %ecx
        cmpl $16384, %ecx
        jne LBB1_1      # bb
LBB1_3: # bb11
        xorl %eax, %eax
        addl $12, %esp
        ret
LBB1_4: # cond_true
        call L_abort$stub

1. LSR should rewrite the first cmp with induction variable %ecx.
2. DAG combiner should fold
        leal 1(%eax), %edx
        cmpl $262145, %edx
   =>
        cmpl $262144, %eax

//===---------------------------------------------------------------------===//

define i64 @test(double %X) {
        %Y = fptosi double %X to i64
        ret i64 %Y
}

compiles to:

_test:
        subl $20, %esp
        movsd 24(%esp), %xmm0
        movsd %xmm0, 8(%esp)
        fldl 8(%esp)
        fisttpll (%esp)
        movl 4(%esp), %edx
        movl (%esp), %eax
        addl $20, %esp
        #FP_REG_KILL
        ret

This should just fldl directly from the input stack slot.

//===---------------------------------------------------------------------===//

This code:
int foo (int x) { return (x & 65535) | 255; }

Should compile into:

_foo:
        movzwl 4(%esp), %eax
        orl $255, %eax
        ret

instead of:
_foo:
        movl $255, %eax
        orl 4(%esp), %eax
        andl $65535, %eax
        ret

//===---------------------------------------------------------------------===//

We're codegen'ing multiply of long longs inefficiently:

unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
  return arg1 * arg2;
}

We compile to (fomit-frame-pointer):

_LLM:
        pushl %esi
        movl 8(%esp), %ecx
        movl 16(%esp), %esi
        movl %esi, %eax
        mull %ecx
        imull 12(%esp), %esi
        addl %edx, %esi
        imull 20(%esp), %ecx
        movl %esi, %edx
        addl %ecx, %edx
        popl %esi
        ret

This looks like a scheduling deficiency and lack of remat of the load from
the argument area.  ICC apparently produces:

        movl 8(%esp), %ecx
        imull 12(%esp), %ecx
        movl 16(%esp), %eax
        imull 4(%esp), %eax
        addl %eax, %ecx
        movl 4(%esp), %eax
        mull 12(%esp)
        addl %ecx, %edx
        ret

Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236

//===---------------------------------------------------------------------===//

We can fold a store into "zeroing a reg".  Instead of:

xorl %eax, %eax
movl %eax, 124(%esp)

we should get:

movl $0, 124(%esp)

if the flags of the xor are dead.

Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
be folded into: shl [mem], 1

//===---------------------------------------------------------------------===//

This testcase misses a read/modify/write opportunity (from PR1425):

void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
  int i;
  for(i=0; i<width; i++)
    b1[i] += (1*(b0[i] + b2[i])+0)>>0;
}

We compile it down to:

LBB1_2: # bb
        movl (%esi,%edi,4), %ebx
        addl (%ecx,%edi,4), %ebx
        addl (%edx,%edi,4), %ebx
        movl %ebx, (%ecx,%edi,4)
        incl %edi
        cmpl %eax, %edi
        jne LBB1_2      # bb

the inner loop should add to the memory location (%ecx,%edi,4), saving
a mov.  Something like:

        movl (%esi,%edi,4), %ebx
        addl (%edx,%edi,4), %ebx
        addl %ebx, (%ecx,%edi,4)

Here is another interesting example:

void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
  int i;
  for(i=0; i<width; i++)
    b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
}

We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:

LBB9_2: # bb
        movl (%ecx,%edi,4), %ebx
        subl (%esi,%edi,4), %ebx
        subl (%edx,%edi,4), %ebx
        movl %ebx, (%ecx,%edi,4)
        incl %edi
        cmpl %eax, %edi
        jne LBB9_2      # bb

Additionally, LSR should rewrite the exit condition of these loops to use
a stride-4 IV, which would allow all the scales in the loop to go away.
This would result in smaller code and more efficient microops.

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:

        xorpd LCPI1_0, %xmm2

However, if xmm2 gets spilled, we end up with really ugly code like this:

        movsd (%esp), %xmm0
        xorpd LCPI1_0, %xmm0
        movsd %xmm0, (%esp)

Since we 'know' that this is a 'neg', we can actually "fold" the spill into
the neg/abs instruction, turning it into an *integer* operation, like this:

        xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)

you could also use xorb, but xorl is less likely to lead to a partial register
stall.  Here is a contrived testcase:

double a, b, c;
void test(double *P) {
  double X = *P;
  a = X;
  bar();
  X = -X;
  b = X;
  bar();
  c = X;
}

//===---------------------------------------------------------------------===//

Handling llvm.memory.barrier on pre-SSE2 CPUs

should generate:
lock ; mov %esp, %esp

//===---------------------------------------------------------------------===//

The code generated on x86 for checking for signed overflow on a multiply done
the obvious way is much longer than it needs to be.

int x(int a, int b) {
  long long prod = (long long)a*b;
  return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}
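
Something much shorter seems achievable with the one-operand imul's overflow
flag (a sketch, not actual output):

_x:
        movl 4(%esp), %eax
        imull 8(%esp)                   # edx:eax = a*b; OF set iff the product
                                        # does not fit in 32 bits
        seto %al
        movzbl %al, %eax
        ret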

See PR2053 for more details.

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter.  Downside is that it puts more
pressure on register allocation because it has fixed operands.

Example:
int abs(int x) {return x < 0 ? -x : x;}

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
        movl 4(%esp), %eax
        cltd
        xorl %edx, %eax
        subl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

Consider:
int test(unsigned long a, unsigned long b) { return -(a < b); }

We currently compile this to:

define i32 @test(i32 %a, i32 %b) nounwind {
        %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
        %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
        %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
        ret i32 %tmp5
}

and

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setb %al
        movzbl %al, %eax
        negl %eax
        ret

Several deficiencies here.  First, we should instcombine zext+neg into sext:

define i32 @test2(i32 %a, i32 %b) nounwind {
        %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
        %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
        ret i32 %tmp34
}

However, before we can do that, we have to fix the bad codegen that we get for
sext from bool:

_test2:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setb %al
        movzbl %al, %eax
        shll $31, %eax
        sarl $31, %eax
        ret

This code should be at least as good as the code above.  Once this is fixed, we
can optimize this specific case even more to:

        movl 8(%esp), %eax
        xorl %ecx, %ecx
        cmpl %eax, 4(%esp)
        sbbl %ecx, %ecx

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}


The following code is currently generated:
FirstOnet:
        movl 8(%esp), %eax
        cmpl $65536, %eax
        movl 4(%esp), %ecx
        jb .LBB1_2      # UnifiedReturnBlock
.LBB1_1:        # ifthen
        shrl $16, %eax
        movzbl first_one(%eax), %eax
        ret
.LBB1_2:        # UnifiedReturnBlock
        xorl %eax, %eax
        ret

There are a few possible improvements here:
1. We should be able to eliminate the dead load into %ecx
2. We could change the "movl 8(%esp), %eax" into
   "movzwl 10(%esp), %eax"; this lets us change the cmpl
   into a testl, which is shorter, and eliminate the shift.

We could also in theory eliminate the branch by using a conditional
for the address of the load, but that seems unlikely to be worthwhile
in general.
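
Putting improvements 1 and 2 together, something along these lines seems
plausible (a sketch, not actual output):

FirstOnet:
        movzwl 10(%esp), %eax           # arg1 >> 48, loaded directly as 16 bits
        testl %eax, %eax
        je .LBB1_2
        movzbl first_one(%eax), %eax
        ret
.LBB1_2:
        xorl %eax, %eax
        ret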

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
entry:
        %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
        br i1 %tmp2, label %bb7, label %bb

bb:             ; preds = %entry
        %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
        ret i32 %tmp6

bb7:            ; preds = %entry
        %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
        ret i32 %tmp10
}

to:

_foo:
        cmpb $0, 16(%esp)
        movl 12(%esp), %ecx
        movl 8(%esp), %eax
        movl 4(%esp), %edx
        je LBB1_2       # bb7
LBB1_1: # bb
        addl %edx, %eax
        ret
LBB1_2: # bb7
        movl %edx, %eax
        subl %ecx, %eax
        ret

The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
if it commuted the addl in LBB1_1.

//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15:        # bb310
        cvtss2sd LCPI1_0, %xmm1
        addsd %xmm1, %xmm0
        movsd 176(%esp), %xmm2
        mulsd %xmm0, %xmm2
        movapd %xmm2, %xmm3
        mulsd %xmm3, %xmm3
        movapd %xmm3, %xmm4
        mulsd LCPI1_23, %xmm4
        addsd LCPI1_24, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_25, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_26, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_27, %xmm4
        mulsd %xmm3, %xmm4
        addsd LCPI1_28, %xmm4
        mulsd %xmm3, %xmm4
        addsd %xmm1, %xmm4
        mulsd %xmm2, %xmm4
        movsd 152(%esp), %xmm1
        addsd %xmm4, %xmm1
        movsd %xmm1, 152(%esp)
        incl %eax
        cmpl %eax, %esi
        jge LBB1_15     # bb310
LBB1_16:        # bb358.loopexit
        movsd 152(%esp), %xmm0
        addsd %xmm0, %xmm0
        addsd LCPI1_22, %xmm0
        movsd %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should insert
a copy to split the interval (one for the duration of the loop, one
extending to the fall through).  The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Legalize loses track of the fact that bools are always zero extended when in
memory.  This causes us to compile abort_gzip (from 164.gzip) from:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind {
entry:
        %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
        br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:           ; preds = %entry
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
bb4.i:          ; preds = %entry
        store i1 true, i1* @in_exit.4870.b
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
}
declare void @exit(i32) noreturn nounwind

into:

_abort_gzip:
        subl $12, %esp
        movb _in_exit.4870.b, %al
        notb %al
        testb $1, %al
        jne LBB1_2      ## bb4.i
LBB1_1: ## bb.i
        ...

//===---------------------------------------------------------------------===//

We compile:

int test(int x, int y) {
  return x-y-1;
}

into (-m64):

_test:
        decl %edi
        movl %edi, %eax
        subl %esi, %eax
        ret

It would be better to codegen as: x+~y  (notl+addl)
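
That is, something like (a sketch; x in %edi, y in %esi under the -m64 ABI):

_test:
        notl %esi                       # ~y
        addl %esi, %edi                 # x + ~y  ==  x - y - 1
        movl %edi, %eax
        ret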