//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend.
//===---------------------------------------------------------------------===//


//===---------------------------------------------------------------------===//

CodeGen/X86/lea-3.ll:test3 should be a single LEA, not a shift/move.  The X86
backend knows how to three-addressify this shift, but it appears the register
allocator isn't even asking it to do so in this case.  We should investigate
why this isn't happening; it could have a significant impact on other important
cases for X86 as well.

//===---------------------------------------------------------------------===//

This should be one DIV/IDIV instruction, not a libcall:

unsigned test(unsigned long long X, unsigned Y) {
        return X/Y;
}

This can be done trivially with a custom legalizer.  What about overflow
though?  http://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224

//===---------------------------------------------------------------------===//

Improvements to the multiply -> shift/add algorithm:
http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html

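For reference, a hypothetical case of the kind of multiply in question (not
taken from the patch above):

int mul9(int x) { return x * 9; }   /* could be a single leal (%eax,%eax,8), %eax */
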
//===---------------------------------------------------------------------===//

Improve code like this (occurs fairly frequently, e.g. in LLVM):
long long foo(int x) { return 1LL << x; }

http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html

Another useful one would be  ~0ULL >> X and ~0ULL << X.

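For example, those two cases spelled out as (hypothetical) functions:

unsigned long long allones_lshr(int X) { return ~0ULL >> X; }
unsigned long long allones_shl(int X)  { return ~0ULL << X; }
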
One better solution for 1LL << x is:
        xorl    %eax, %eax
        xorl    %edx, %edx
        testb   $32, %cl
        sete    %al
        setne   %dl
        sall    %cl, %eax
        sall    %cl, %edx

But that requires good 8-bit subreg support.

Also, this might be better.  It's an extra shift, but it's one instruction
shorter, and doesn't stress 8-bit subreg support.
(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
but without the unnecessary and.)
        movl    %ecx, %eax
        shrl    $5, %eax
        movl    %eax, %edx
        xorl    $1, %edx
        sall    %cl, %eax
        sall    %cl, %edx

64-bit shifts (in general) expand to really bad code.  Instead of using
cmovs, we should expand to a conditional branch like GCC produces.

//===---------------------------------------------------------------------===//

Compile this:
_Bool f(_Bool a) { return a!=1; }

into:
        movzbl  %dil, %eax
        xorl    $1, %eax
        ret

(Although note that this isn't a legal way to express the code that llvm-gcc
currently generates for that function.)

//===---------------------------------------------------------------------===//

Some isel ideas:

1. Dynamic programming based approach when compile time is not an
   issue.
2. Code duplication (addressing mode) during isel.
3. Other ideas from "Register-Sensitive Selection, Duplication, and
   Sequencing of Instructions".
4. Scheduling for reduced register pressure.  E.g. "Minimum Register
   Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
   and other related papers.
   http://citeseer.ist.psu.edu/govindarajan01minimum.html

//===---------------------------------------------------------------------===//

Should we promote i16 to i32 to avoid partial register update stalls?

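A made-up example of the kind of code in question:

short add16(short a, short b) { return a + b; }

The 16-bit operations write only %ax; a later read of the full %eax can stall
on some processors.
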
//===---------------------------------------------------------------------===//

Leave any_extend as pseudo instruction and hint to register
allocator. Delay codegen until post register allocation.
Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
the coalescer how to deal with it though.

//===---------------------------------------------------------------------===//

It appears icc uses push for parameter passing. We need to investigate.

//===---------------------------------------------------------------------===//

Only use inc/neg/not instructions on processors where they are faster than
add/sub/xor.  They are slower on the P4 due to only updating some processor
flags.

//===---------------------------------------------------------------------===//

The instruction selector sometimes misses folding a load into a compare.  The
pattern is written as (cmp reg, (load p)).  Because the compare isn't
commutative, it is not matched with the load on both sides.  The dag combiner
should be made smart enough to canonicalize the load into the RHS of a compare
when it can invert the result of the compare for free.

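A made-up example of the kind of code where this can come up:

int cmp_mem(int x, int *p) { return *p < x; }

Depending on which side the load lands on, folding it into the cmp may require
inverting the compare's result.
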
//===---------------------------------------------------------------------===//

How about intrinsics? An example is:
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));

compiles to
        pmuludq (%eax), %xmm0
        movl    8(%esp), %eax
        movdqa  (%eax), %xmm1
        pmulhuw %xmm0, %xmm1

The transformation probably requires a X86 specific pass or a DAG combiner
target specific hook.

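For context, a self-contained version of that example (a sketch; the pointer
types and function signature are assumptions):

#include <emmintrin.h>
void mulhi_example(__m128i *res, __m128i *A, __m128i *B, __m128i *C) {
  /* widening unsigned multiply of *B and *C, then unsigned high-half multiply with *A */
  *res = _mm_mulhi_epu16(*A, _mm_mul_epu32(*B, *C));
}
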
//===---------------------------------------------------------------------===//

In many cases, LLVM generates code like this:

_test:
        movl 8(%esp), %eax
        cmpl %eax, 4(%esp)
        setl %al
        movzbl %al, %eax
        ret

on some processors (which ones?), it is more efficient to do this:

_test:
        movl 8(%esp), %ebx
        xor  %eax, %eax
        cmpl %ebx, 4(%esp)
        setl %al
        ret

Doing this correctly is tricky though, as the xor clobbers the flags.

//===---------------------------------------------------------------------===//

We should generate bts/btr/etc instructions on targets where they are cheap or
when codesize is important.  e.g., for:

void setbit(int *target, int bit) {
    *target |= (1 << bit);
}
void clearbit(int *target, int bit) {
    *target &= ~(1 << bit);
}

//===---------------------------------------------------------------------===//

Instead of the following for memset char*, 1, 10:

        movl $16843009, 4(%edx)
        movl $16843009, (%edx)
        movw $257, 8(%edx)

It might be better to generate

        movl $16843009, %eax
        movl %eax, 4(%edx)
        movl %eax, (%edx)
        movw %ax, 8(%edx)

when we can spare a register.  It reduces code size.

//===---------------------------------------------------------------------===//

Evaluate what the best way to codegen sdiv X, (2^C) is.  For X/8, we currently
get this:

define i32 @test1(i32 %X) {
        %Y = sdiv i32 %X, 8
        ret i32 %Y
}

_test1:
        movl 4(%esp), %eax
        movl %eax, %ecx
        sarl $31, %ecx
        shrl $29, %ecx
        addl %ecx, %eax
        sarl $3, %eax
        ret

GCC knows several different ways to codegen it, one of which is this:

_test1:
        movl 4(%esp), %eax
        cmpl $-1, %eax
        leal 7(%eax), %ecx
        cmovle %ecx, %eax
        sarl $3, %eax
        ret

which is probably slower, but it's interesting at least :)

//===---------------------------------------------------------------------===//

We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
We should leave these as libcalls for everything over a much lower threshold,
since libc is hand tuned for medium and large mem ops (avoiding RFO for large
stores, TLB preheating, etc.)

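A made-up example of the size range in question:

#include <string.h>
void copy_big(char *dst, const char *src) {
  memcpy(dst, src, 2*1024*1024);   /* 2MB: currently becomes rep/movsl */
}
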
//===---------------------------------------------------------------------===//

Optimize this into something reasonable:
 x * copysign(1.0, y) * copysign(1.0, z)

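A sketch of the surrounding code (hypothetical wrapper; math.h copysign assumed):

#include <math.h>
double scale_by_signs(double x, double y, double z) {
  return x * copysign(1.0, y) * copysign(1.0, z);
}

Presumably the reasonable form is to xor the sign bits of y and z into x.
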
//===---------------------------------------------------------------------===//

Optimize copysign(x, *y) to use an integer load from y.

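A minimal sketch of the idea in C (assumes 64-bit IEEE doubles; names made up):

#include <string.h>
double copysign_intload(double x, const double *y) {
  unsigned long long xi, yi;
  memcpy(&xi, &x, 8);
  memcpy(&yi, y, 8);     /* integer load of *y; only the sign bit is needed */
  xi = (xi & ~(1ULL << 63)) | (yi & (1ULL << 63));
  memcpy(&x, &xi, 8);
  return x;
}
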
//===---------------------------------------------------------------------===//

%X = weak global int 0

void %foo(int %N) {
        %N = cast int %N to uint
        %tmp.24 = setgt int %N, 0
        br bool %tmp.24, label %no_exit, label %return

no_exit:
        %indvar = phi uint [ 0, %entry ], [ %indvar.next, %no_exit ]
        %i.0.0 = cast uint %indvar to int
        volatile store int %i.0.0, int* %X
        %indvar.next = add uint %indvar, 1
        %exitcond = seteq uint %indvar.next, %N
        br bool %exitcond, label %return, label %no_exit

return:
        ret void
}

compiles into:

        .text
        .align  4
        .globl  _foo
_foo:
        movl 4(%esp), %eax
        cmpl $1, %eax
        jl LBB_foo_4    # return
LBB_foo_1:      # no_exit.preheader
        xorl %ecx, %ecx
LBB_foo_2:      # no_exit
        movl L_X$non_lazy_ptr, %edx
        movl %ecx, (%edx)
        incl %ecx
        cmpl %eax, %ecx
        jne LBB_foo_2   # no_exit
LBB_foo_3:      # return.loopexit
LBB_foo_4:      # return
        ret

We should hoist "movl L_X$non_lazy_ptr, %edx" out of the loop after
rematerialization is implemented.  This can be accomplished with 1) a target
dependent LICM pass or 2) making SelectionDAG represent the whole function.
279
280//===---------------------------------------------------------------------===//
281
282The following tests perform worse with LSR:
283
284lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
285
286//===---------------------------------------------------------------------===//
287
288We are generating far worse code than gcc:
289
290volatile short X, Y;
291
292void foo(int N) {
293 int i;
294 for (i = 0; i < N; i++) { X = i; Y = i*4; }
295}
296
Evan Cheng27a820a2007-10-26 01:56:11 +0000297LBB1_1: # entry.bb_crit_edge
298 xorl %ecx, %ecx
299 xorw %dx, %dx
300LBB1_2: # bb
301 movl L_X$non_lazy_ptr, %esi
302 movw %cx, (%esi)
303 movl L_Y$non_lazy_ptr, %esi
304 movw %dx, (%esi)
305 addw $4, %dx
306 incl %ecx
307 cmpl %eax, %ecx
308 jne LBB1_2 # bb
Dan Gohmanf17a25c2007-07-18 16:29:46 +0000309
310vs.
311
312 xorl %edx, %edx
313 movl L_X$non_lazy_ptr-"L00000000001$pb"(%ebx), %esi
314 movl L_Y$non_lazy_ptr-"L00000000001$pb"(%ebx), %ecx
315L4:
316 movw %dx, (%esi)
317 leal 0(,%edx,4), %eax
318 movw %ax, (%ecx)
319 addl $1, %edx
320 cmpl %edx, %edi
321 jne L4
322
Evan Cheng27a820a2007-10-26 01:56:11 +0000323This is due to the lack of post regalloc LICM.
Dan Gohmanf17a25c2007-07-18 16:29:46 +0000324
325//===---------------------------------------------------------------------===//
326
327Teach the coalescer to coalesce vregs of different register classes. e.g. FR32 /
328FR64 to VR128.
329
330//===---------------------------------------------------------------------===//
331
Dan Gohmanf17a25c2007-07-18 16:29:46 +0000332Adding to the list of cmp / test poor codegen issues:
333
334int test(__m128 *A, __m128 *B) {
335 if (_mm_comige_ss(*A, *B))
336 return 3;
337 else
338 return 4;
339}
340
341_test:
342 movl 8(%esp), %eax
343 movaps (%eax), %xmm0
344 movl 4(%esp), %eax
345 movaps (%eax), %xmm1
346 comiss %xmm0, %xmm1
347 setae %al
348 movzbl %al, %ecx
349 movl $3, %eax
350 movl $4, %edx
351 cmpl $0, %ecx
352 cmove %edx, %eax
353 ret
354
355Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
356are a number of issues. 1) We are introducing a setcc between the result of the
are a number of issues. 1) We are introducing a setcc between the result of the
intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
so an any_extend (which becomes a zero extend) is added.

We probably need some kind of target DAG combine hook to fix this.

//===---------------------------------------------------------------------===//

We generate significantly worse code for this than GCC:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701

There is also one case we do worse on PPC.

//===---------------------------------------------------------------------===//

If shorter, we should use things like:
movzwl %ax, %eax
instead of:
andl $65535, %EAX

The former can also be used when the two-addressy nature of the 'and' would
require a copy to be inserted (in X86InstrInfo::convertToThreeAddress).

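A made-up example of source that ends up needing such a 16-bit mask:

unsigned low16(unsigned x) { return x & 65535; }
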
//===---------------------------------------------------------------------===//

Another instruction selector deficiency:

void %bar() {
        %tmp = load int (int)** %foo
        %tmp = tail call int %tmp( int 3 )
        ret void
}

_bar:
        subl $12, %esp
        movl L_foo$non_lazy_ptr, %eax
        movl (%eax), %eax
        call *%eax
        addl $12, %esp
        ret

The current isel scheme will not allow the load to be folded in the call since
the load's chain result is read by the callseq_start.

//===---------------------------------------------------------------------===//

For this:

int test(int a)
{
  return a * 3;
}

We currently emit:
        imull $3, 4(%esp), %eax

Perhaps this is what we really should generate instead?  Is imull three or four
cycles?  Note: ICC generates this:
        movl    4(%esp), %eax
        leal    (%eax,%eax,2), %eax

The current instruction priority is based on pattern complexity.  The former is
more "complex" because it folds a load so the latter will not be emitted.

Perhaps we should use AddedComplexity to give LEA32r a higher priority?  We
should always try to match LEA first since the LEA matching code does some
estimate to determine whether the match is profitable.

However, if we care more about code size, then imull is better.  It's two bytes
shorter than movl + leal.

//===---------------------------------------------------------------------===//

__builtin_ffs codegen is messy.

int ffs_(unsigned X) { return __builtin_ffs(X); }

llvm produces:
ffs_:
        movl    4(%esp), %ecx
        bsfl    %ecx, %eax
        movl    $32, %edx
        cmove   %edx, %eax
        incl    %eax
        xorl    %edx, %edx
        testl   %ecx, %ecx
        cmove   %edx, %eax
        ret

vs gcc:

_ffs_:
        movl    $-1, %edx
        bsfl    4(%esp), %eax
        cmove   %edx, %eax
        addl    $1, %eax
        ret

Another example of __builtin_ffs (use predsimplify to eliminate a select):

int foo (unsigned long j) {
  if (j)
    return __builtin_ffs (j) - 1;
  else
    return 0;
}

//===---------------------------------------------------------------------===//

It appears gcc places string data with linkonce linkage in
.section __TEXT,__const_coal,coalesced instead of
.section __DATA,__const_coal,coalesced.
Take a look at darwin.h; there are other Darwin assembler directives that we
do not make use of.

//===---------------------------------------------------------------------===//

define i32 @foo(i32* %a, i32 %t) {
entry:
        br label %cond_true

cond_true:              ; preds = %cond_true, %entry
        %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ]          ; <i32> [#uses=3]
        %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ]    ; <i32> [#uses=1]
        %tmp2 = getelementptr i32* %a, i32 %x.0.0               ; <i32*> [#uses=1]
        %tmp3 = load i32* %tmp2         ; <i32> [#uses=1]
        %tmp5 = add i32 %t_addr.0.0, %x.0.0             ; <i32> [#uses=1]
        %tmp7 = add i32 %tmp5, %tmp3            ; <i32> [#uses=2]
        %tmp9 = add i32 %x.0.0, 1               ; <i32> [#uses=2]
        %tmp = icmp sgt i32 %tmp9, 39           ; <i1> [#uses=1]
        br i1 %tmp, label %bb12, label %cond_true

bb12:           ; preds = %cond_true
        ret i32 %tmp7
}
is pessimized by -loop-reduce and -indvars

//===---------------------------------------------------------------------===//

u32 to float conversion improvement:

float uint32_2_float( unsigned u ) {
  float fl = (int) (u & 0xffff);
  float fh = (int) (u >> 16);
  fh *= 0x1.0p16f;
  return fh + fl;
}

00000000        subl    $0x04,%esp
00000003        movl    0x08(%esp,1),%eax
00000007        movl    %eax,%ecx
00000009        shrl    $0x10,%ecx
0000000c        cvtsi2ss        %ecx,%xmm0
00000010        andl    $0x0000ffff,%eax
00000015        cvtsi2ss        %eax,%xmm1
00000019        mulss   0x00000078,%xmm0
00000021        addss   %xmm1,%xmm0
00000025        movss   %xmm0,(%esp,1)
0000002a        flds    (%esp,1)
0000002d        addl    $0x04,%esp
00000030        ret

//===---------------------------------------------------------------------===//

When using fastcc abi, align stack slot of argument of type double on 8 byte
boundary to improve performance.

//===---------------------------------------------------------------------===//

Codegen:

int f(int a, int b) {
  if (a == 4 || a == 6)
    b++;
  return b;
}


as:

or eax, 2
cmp eax, 6
jz label

//===---------------------------------------------------------------------===//

GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
simplifications for integer "x cmp y ? a : b".  For example, instead of:

int G;
void f(int X, int Y) {
  G = X < 0 ? 14 : 13;
}

compiling to:

_f:
        movl $14, %eax
        movl $13, %ecx
        movl 4(%esp), %edx
        testl %edx, %edx
        cmovl %eax, %ecx
        movl %ecx, _G
        ret

it could be:
_f:
        movl 4(%esp), %eax
        sarl $31, %eax
        notl %eax
        addl $14, %eax
        movl %eax, _G
        ret

etc.

Another is:
int usesbb(unsigned int a, unsigned int b) {
   return (a < b ? -1 : 0);
}
to:
_usesbb:
        movl    8(%esp), %eax
        cmpl    %eax, 4(%esp)
        sbbl    %eax, %eax
        ret

instead of:
_usesbb:
        xorl    %eax, %eax
        movl    8(%esp), %ecx
        cmpl    %ecx, 4(%esp)
        movl    $4294967295, %ecx
        cmovb   %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

Currently we don't have elimination of redundant stack manipulations. Consider
the code:

int %main() {
entry:
        call fastcc void %test1( )
        call fastcc void %test2( sbyte* cast (void ()* %test1 to sbyte*) )
        ret int 0
}

declare fastcc void %test1()

declare fastcc void %test2(sbyte*)


This currently compiles to:

        subl $16, %esp
        call _test5
        addl $12, %esp
        subl $16, %esp
        movl $_test5, (%esp)
        call _test6
        addl $12, %esp

The add/sub pair is really unneeded here.

//===---------------------------------------------------------------------===//

Consider the expansion of:

define i32 @test3(i32 %X) {
        %tmp1 = urem i32 %X, 255
        ret i32 %tmp1
}

Currently it compiles to:

...
        movl $2155905153, %ecx
        movl 8(%esp), %esi
        movl %esi, %eax
        mull %ecx
...

This could be "reassociated" into:

        movl $2155905153, %eax
        movl 8(%esp), %ecx
        mull %ecx

to avoid the copy.  In fact, the existing two-address stuff would do this
except that mul isn't a commutative 2-addr instruction.  I guess this has
to be done at isel time based on the #uses to mul?

//===---------------------------------------------------------------------===//

Make sure the instruction which starts a loop does not cross a cacheline
boundary.  This requires knowing the exact length of each machine instruction.
That is somewhat complicated, but doable.  Example 256.bzip2:

In the new trace, the hot loop has an instruction which crosses a cacheline
boundary.  In addition to potential cache misses, this can't help decoding as I
imagine there has to be some kind of complicated decoder reset and realignment
to grab the bytes from the next cacheline.

532  532 0x3cfc movb     (1809(%esp, %esi), %bl   <<<--- spans 2 64 byte lines
942  942 0x3d03 movl     %dh, (1809(%esp, %esi)
937  937 0x3d0a incl     %esi
3    3   0x3d0b cmpb     %bl, %dl
27   27  0x3d0d jnz      0x000062db <main+11707>

//===---------------------------------------------------------------------===//

In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.

//===---------------------------------------------------------------------===//

This could be a single 16-bit load.

int f(char *p) {
    if ((p[0] == 1) & (p[1] == 2)) return 1;
    return 0;
}

//===---------------------------------------------------------------------===//

We should inline lrintf and probably other libc functions.

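For example (assuming the default rounding mode; names hypothetical):

#include <math.h>
long round_f(float x) { return lrintf(x); }   /* candidate for a single cvtss2si */
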
//===---------------------------------------------------------------------===//

Start using the flags more.  For example, compile:

int add_zf(int *x, int y, int a, int b) {
     if ((*x += y) == 0)
          return a;
     else
          return b;
}

to:
       addl    %esi, (%rdi)
       movl    %edx, %eax
       cmovne  %ecx, %eax
       ret
instead of:

_add_zf:
        addl (%rdi), %esi
        movl %esi, (%rdi)
        testl %esi, %esi
        cmove %edx, %ecx
        movl %ecx, %eax
        ret

and:

int add_zf(int *x, int y, int a, int b) {
     if ((*x + y) < 0)
          return a;
     else
          return b;
}

to:

add_zf:
        addl    (%rdi), %esi
        movl    %edx, %eax
        cmovns  %ecx, %eax
        ret

instead of:

_add_zf:
        addl    (%rdi), %esi
        testl   %esi, %esi
        cmovs   %edx, %ecx
        movl    %ecx, %eax
        ret

//===---------------------------------------------------------------------===//

These two functions have identical effects:

unsigned int f(unsigned int i, unsigned int n) {++i; if (i == n) ++i; return i;}
unsigned int f2(unsigned int i, unsigned int n) {++i; i += i == n; return i;}

We currently compile them to:

_f:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        movl 8(%esp), %edx
        cmpl %edx, %ecx
        jne LBB1_2      #UnifiedReturnBlock
LBB1_1: #cond_true
        addl $2, %eax
        ret
LBB1_2: #UnifiedReturnBlock
        movl %ecx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        movl %eax, %ecx
        incl %ecx
        cmpl 8(%esp), %ecx
        sete %cl
        movzbl %cl, %ecx
        leal 1(%ecx,%eax), %eax
        ret

both of which are inferior to GCC's:

_f:
        movl 4(%esp), %edx
        leal 1(%edx), %eax
        addl $2, %edx
        cmpl 8(%esp), %eax
        cmove %edx, %eax
        ret
_f2:
        movl 4(%esp), %eax
        addl $1, %eax
        xorl %edx, %edx
        cmpl 8(%esp), %eax
        sete %dl
        addl %edx, %eax
        ret

//===---------------------------------------------------------------------===//

This code:

void test(int X) {
  if (X) abort();
}

is currently compiled to:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne LBB1_1
        addl $12, %esp
        ret
LBB1_1:
        call L_abort$stub

It would be better to produce:

_test:
        subl $12, %esp
        cmpl $0, 16(%esp)
        jne L_abort$stub
        addl $12, %esp
        ret

This can be applied to any no-return function call that takes no arguments etc.
Alternatively, the stack save/restore logic could be shrink-wrapped, producing
something like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        subl $12, %esp
        call L_abort$stub

Both are useful in different situations.  Finally, it could be shrink-wrapped
and tail called, like this:

_test:
        cmpl $0, 4(%esp)
        jne LBB1_1
        ret
LBB1_1:
        pop %eax   # realign stack.
        call L_abort$stub

Though this probably isn't worth it.

//===---------------------------------------------------------------------===//

We need to teach the codegen to convert two-address INC instructions to LEA
when the flags are dead (likewise dec).  For example, on X86-64, compile:

int foo(int A, int B) {
  return A+1;
}

to:

_foo:
        leal    1(%edi), %eax
        ret

instead of:

_foo:
        incl %edi
        movl %edi, %eax
        ret

Another example is:

;; X's live range extends beyond the shift, so the register allocator
;; cannot coalesce it with Y.  Because of this, a copy needs to be
;; emitted before the shift to save the register value before it is
;; clobbered.  However, this copy is not needed if the register
;; allocator turns the shift into an LEA.  This also occurs for ADD.

; Check that the shift gets turned into an LEA.
; RUN: llvm-as < %s | llc -march=x86 -x86-asm-syntax=intel | \
; RUN:   not grep {mov E.X, E.X}

@G = external global i32               ; <i32*> [#uses=3]

define i32 @test1(i32 %X, i32 %Y) {
        %Z = add i32 %X, %Y             ; <i32> [#uses=1]
        volatile store i32 %Y, i32* @G
        volatile store i32 %Z, i32* @G
        ret i32 %X
}

define i32 @test2(i32 %X) {
        %Z = add i32 %X, 1              ; <i32> [#uses=1]
        volatile store i32 %Z, i32* @G
        ret i32 %X
}

//===---------------------------------------------------------------------===//

Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
a neg instead of a sub instruction.  Consider:

int test(char X) { return 7-X; }

we currently produce:
_test:
        movl $7, %eax
        movsbl 4(%esp), %ecx
        subl %ecx, %eax
        ret

We would use one fewer register if codegen'd as:

        movsbl 4(%esp), %eax
        neg %eax
        add $7, %eax
        ret

Note that this isn't beneficial if the load can be folded into the sub.  In
this case, we want a sub:

int test(int X) { return 7-X; }
_test:
        movl $7, %eax
        subl 4(%esp), %eax
        ret

//===---------------------------------------------------------------------===//

Leaf functions that require one 4-byte spill slot have a prolog like this:

_foo:
        pushl   %esi
        subl    $4, %esp
...
and an epilog like this:
        addl    $4, %esp
        popl    %esi
        ret

It would be smaller, and potentially faster, to push eax on entry and to
pop into a dummy register instead of using addl/subl of esp.  Just don't pop
into any return registers :)

//===---------------------------------------------------------------------===//

The X86 backend should fold (branch (or (setcc, setcc))) into multiple
branches.  We generate really poor code for:

double testf(double a) {
       return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
}

For example, the entry BB is:

_testf:
        subl    $20, %esp
        pxor    %xmm0, %xmm0
        movsd   24(%esp), %xmm1
        ucomisd %xmm0, %xmm1
        setnp   %al
        sete    %cl
        testb   %cl, %al
        jne     LBB1_5  # UnifiedReturnBlock
LBB1_1: # cond_true


it would be better to replace the last four instructions with:

        jp LBB1_1
        je LBB1_5
LBB1_1:

We also codegen the inner ?: into a diamond:

        cvtss2sd        LCPI1_0(%rip), %xmm2
        cvtss2sd        LCPI1_1(%rip), %xmm3
        ucomisd %xmm1, %xmm0
        ja      LBB1_3  # cond_true
LBB1_2: # cond_true
        movapd  %xmm3, %xmm2
LBB1_3: # cond_true
        movapd  %xmm2, %xmm0
        ret

We should sink the load into xmm3 into the LBB1_2 block.  This should
be pretty easy, and will nuke all the copies.

//===---------------------------------------------------------------------===//

This:
        #include <algorithm>
        inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
        { return std::make_pair(a + b, a + b < a); }
        bool no_overflow(unsigned a, unsigned b)
        { return !full_add(a, b).second; }

Should compile to:


        _Z11no_overflowjj:
                addl    %edi, %esi
                setae   %al
                ret

FIXME: That code looks wrong; bool return is normally defined as zext.

on x86-64, not:

__Z11no_overflowjj:
        addl    %edi, %esi
        cmpl    %edi, %esi
        setae   %al
        movzbl  %al, %eax
        ret


//===---------------------------------------------------------------------===//

Re-materialize MOV32r0 etc. with xor instead of changing them to moves if the
condition register is dead. xor reg reg is shorter than mov reg, #0.

//===---------------------------------------------------------------------===//

We aren't matching RMW instructions aggressively
enough.  Here's a reduced testcase (more in PR1160):

define void @test(i32* %huge_ptr, i32* %target_ptr) {
        %A = load i32* %huge_ptr                ; <i32> [#uses=1]
        %B = load i32* %target_ptr              ; <i32> [#uses=1]
        %C = or i32 %A, %B              ; <i32> [#uses=1]
        store i32 %C, i32* %target_ptr
        ret void
}

$ llvm-as < t.ll | llc -march=x86-64

_test:
        movl (%rdi), %eax
        orl (%rsi), %eax
        movl %eax, (%rsi)
        ret

That should be something like:

_test:
        movl (%rdi), %eax
        orl %eax, (%rsi)
        ret

//===---------------------------------------------------------------------===//

The following code:

bb114.preheader:                ; preds = %cond_next94
        %tmp231232 = sext i16 %tmp62 to i32             ; <i32> [#uses=1]
        %tmp233 = sub i32 32, %tmp231232                ; <i32> [#uses=1]
        %tmp245246 = sext i16 %tmp65 to i32             ; <i32> [#uses=1]
        %tmp252253 = sext i16 %tmp68 to i32             ; <i32> [#uses=1]
        %tmp254 = sub i32 32, %tmp252253                ; <i32> [#uses=1]
        %tmp553554 = bitcast i16* %tmp37 to i8*         ; <i8*> [#uses=2]
        %tmp583584 = sext i16 %tmp98 to i32             ; <i32> [#uses=1]
        %tmp585 = sub i32 32, %tmp583584                ; <i32> [#uses=1]
        %tmp614615 = sext i16 %tmp101 to i32            ; <i32> [#uses=1]
        %tmp621622 = sext i16 %tmp104 to i32            ; <i32> [#uses=1]
        %tmp623 = sub i32 32, %tmp621622                ; <i32> [#uses=1]
        br label %bb114

produces:

LBB3_5: # bb114.preheader
        movswl  -68(%ebp), %eax
        movl    $32, %ecx
        movl    %ecx, -80(%ebp)
        subl    %eax, -80(%ebp)
        movswl  -52(%ebp), %eax
        movl    %ecx, -84(%ebp)
        subl    %eax, -84(%ebp)
        movswl  -70(%ebp), %eax
        movl    %ecx, -88(%ebp)
        subl    %eax, -88(%ebp)
        movswl  -50(%ebp), %eax
        subl    %eax, %ecx
        movl    %ecx, -76(%ebp)
        movswl  -42(%ebp), %eax
        movl    %eax, -92(%ebp)
        movswl  -66(%ebp), %eax
        movl    %eax, -96(%ebp)
        movw    $0, -98(%ebp)

This appears to be bad because the RA is not folding the store to the stack
slot into the movl.  The above instructions could be:
        movl    $32, -80(%ebp)
...
        movl    $32, -84(%ebp)
...
This seems like a cross between remat and spill folding.

This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
change, so we could simply subtract %eax from %ecx first and then use %ecx (or
vice-versa).

//===---------------------------------------------------------------------===//

This code:

        %tmp659 = icmp slt i16 %tmp654, 0               ; <i1> [#uses=1]
        br i1 %tmp659, label %cond_true662, label %cond_next715

produces this:

        testw   %cx, %cx
        movswl  %cx, %esi
        jns     LBB4_109        # cond_next715

Shark tells us that using %cx in the testw instruction is sub-optimal. It
suggests using the 32-bit register (which is what ICC uses).

//===---------------------------------------------------------------------===//

We compile this:

void compare (long long foo) {
  if (foo < 4294967297LL)
    abort();
}

to:

compare:
        subl    $4, %esp
        cmpl    $0, 8(%esp)
        setne   %al
        movzbw  %al, %ax
        cmpl    $1, 12(%esp)
        setg    %cl
        movzbw  %cl, %cx
        cmove   %ax, %cx
        testb   $1, %cl
        jne     .LBB1_2 # UnifiedReturnBlock
.LBB1_1:        # ifthen
        call    abort
.LBB1_2:        # UnifiedReturnBlock
        addl    $4, %esp
        ret

(also really horrible code on ppc).  This is due to the expand code for 64-bit
compares.  GCC produces multiple branches, which is much nicer:

compare:
        subl    $12, %esp
        movl    20(%esp), %edx
        movl    16(%esp), %eax
        decl    %edx
        jle     .L7
.L5:
        addl    $12, %esp
        ret
        .p2align 4,,7
.L7:
        jl      .L4
        cmpl    $0, %eax
        .p2align 4,,8
        ja      .L5
.L4:
        .p2align 4,,9
        call    abort

//===---------------------------------------------------------------------===//

Tail call optimization improvements: Tail call optimization currently
pushes all arguments on the top of the stack (their normal place for
non-tail call optimized calls) that source from the caller's arguments
or that source from a virtual register (also possibly sourcing from
the caller's arguments).
This is done to prevent overwriting of parameters (see example
below) that might be used later.

example:

int callee(int32, int64);
int caller(int32 arg1, int32 arg2) {
    int64 local = arg2 * 2;
    return callee(arg2, (int64)local);
}

[arg1]          [!arg2 no longer valid since we moved local onto it]
[arg2]      ->  [(int64)
[RETADDR]        local  ]

Moving arg1 onto the stack slot of the callee function would overwrite
arg2 of the caller.

Possible optimizations:


 - Analyse the actual parameters of the callee to see which would
   overwrite a caller parameter which is used by the callee and only
   push them onto the top of the stack.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg1,arg2);
   }

   Here we don't need to write any variables to the top of the stack
   since they don't overwrite each other.

   int callee (int32 arg1, int32 arg2);
   int caller (int32 arg1, int32 arg2) {
       return callee(arg2,arg1);
   }

   Here we need to push the arguments because they overwrite each
   other.

//===---------------------------------------------------------------------===//

main ()
{
  int i = 0;
  unsigned long int z = 0;

  do {
    z -= 0x00004000;
    i++;
    if (i > 0x00040000)
      abort ();
  } while (z > 0);
  exit (0);
}

gcc compiles this to:

_main:
        subl    $28, %esp
        xorl    %eax, %eax
        jmp     L2
L3:
        cmpl    $262144, %eax
        je      L10
L2:
        addl    $1, %eax
        cmpl    $262145, %eax
        jne     L3
        call    L_abort$stub
L10:
        movl    $0, (%esp)
        call    L_exit$stub

llvm:

_main:
        subl    $12, %esp
        movl    $1, %eax
        movl    $16384, %ecx
LBB1_1: # bb
        cmpl    $262145, %eax
        jge     LBB1_4  # cond_true
LBB1_2: # cond_next
        incl    %eax
        addl    $4294950912, %ecx
        cmpl    $16384, %ecx
        jne     LBB1_1  # bb
LBB1_3: # bb11
        xorl    %eax, %eax
        addl    $12, %esp
        ret
LBB1_4: # cond_true
        call    L_abort$stub
1. LSR should rewrite the first cmp with induction variable %ecx.
2. DAG combiner should fold
        leal    1(%eax), %edx
        cmpl    $262145, %edx
   =>
        cmpl    $262144, %eax

//===---------------------------------------------------------------------===//

define i64 @test(double %X) {
        %Y = fptosi double %X to i64
        ret i64 %Y
}

compiles to:

_test:
        subl    $20, %esp
        movsd   24(%esp), %xmm0
        movsd   %xmm0, 8(%esp)
        fldl    8(%esp)
        fisttpll        (%esp)
        movl    4(%esp), %edx
        movl    (%esp), %eax
        addl    $20, %esp
        #FP_REG_KILL
        ret

This should just fldl directly from the input stack slot.

//===---------------------------------------------------------------------===//

This code:
int foo (int x) { return (x & 65535) | 255; }

Should compile into:

_foo:
        movzwl  4(%esp), %eax
        orl     $255, %eax
        ret

instead of:
_foo:
        movl    $255, %eax
        orl     4(%esp), %eax
        andl    $65535, %eax
        ret

//===---------------------------------------------------------------------===//

We're codegen'ing multiply of long longs inefficiently:

unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
  return arg1 *  arg2;
}

We compile to (fomit-frame-pointer):

_LLM:
        pushl   %esi
        movl    8(%esp), %ecx
        movl    16(%esp), %esi
        movl    %esi, %eax
        mull    %ecx
        imull   12(%esp), %esi
        addl    %edx, %esi
        imull   20(%esp), %ecx
        movl    %esi, %edx
        addl    %ecx, %edx
        popl    %esi
        ret

This looks like a scheduling deficiency and lack of remat of the load from
the argument area.  ICC apparently produces:

        movl    8(%esp), %ecx
        imull   12(%esp), %ecx
        movl    16(%esp), %eax
        imull   4(%esp), %eax
        addl    %eax, %ecx
        movl    4(%esp), %eax
        mull    12(%esp)
        addl    %ecx, %edx
        ret

Note that it remat'd loads from 4(esp) and 12(esp).  See this GCC PR:
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236

//===---------------------------------------------------------------------===//

We can fold a store into "zeroing a reg".  Instead of:

xorl %eax, %eax
movl %eax, 124(%esp)

we should get:

movl $0, 124(%esp)

if the flags of the xor are dead.

Likewise, we isel "x<<1" into "add reg,reg".  If reg is spilled, this should
be folded into: shl [mem], 1

//===---------------------------------------------------------------------===//

This testcase misses a read/modify/write opportunity (from PR1425):

void vertical_decompose97iH1(int *b0, int *b1, int *b2, int width){
    int i;
    for(i=0; i<width; i++)
        b1[i] += (1*(b0[i] + b2[i])+0)>>0;
}

We compile it down to:

LBB1_2: # bb
        movl    (%esi,%edi,4), %ebx
        addl    (%ecx,%edi,4), %ebx
        addl    (%edx,%edi,4), %ebx
        movl    %ebx, (%ecx,%edi,4)
        incl    %edi
        cmpl    %eax, %edi
        jne     LBB1_2  # bb

the inner loop should add to the memory location (%ecx,%edi,4), saving
a mov.  Something like:

        movl    (%esi,%edi,4), %ebx
        addl    (%edx,%edi,4), %ebx
        addl    %ebx, (%ecx,%edi,4)

Here is another interesting example:

void vertical_compose97iH1(int *b0, int *b1, int *b2, int width){
    int i;
    for(i=0; i<width; i++)
        b1[i] -= (1*(b0[i] + b2[i])+0)>>0;
}

We miss the r/m/w opportunity here by using 2 subs instead of an add+sub[mem]:

LBB9_2: # bb
        movl    (%ecx,%edi,4), %ebx
        subl    (%esi,%edi,4), %ebx
        subl    (%edx,%edi,4), %ebx
        movl    %ebx, (%ecx,%edi,4)
        incl    %edi
        cmpl    %eax, %edi
        jne     LBB9_2  # bb

Additionally, LSR should rewrite the exit condition of these loops to use
a stride-4 IV, which would allow all the scales in the loop to go away.
This would result in smaller code and more efficient microops.

//===---------------------------------------------------------------------===//

In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
or and instruction, for example:

        xorpd   LCPI1_0, %xmm2

However, if xmm2 gets spilled, we end up with really ugly code like this:

        movsd   (%esp), %xmm0
        xorpd   LCPI1_0, %xmm0
        movsd   %xmm0, (%esp)

Since we 'know' that this is a 'neg', we can actually "fold" the spill into
the neg/abs instruction, turning it into an *integer* operation, like this:

        xorl 2147483648, [mem+4]     ## 2147483648 = (1 << 31)

you could also use xorb, but xorl is less likely to lead to a partial register
stall.  Here is a contrived testcase:

double a, b, c;
void test(double *P) {
  double X = *P;
  a = X;
  bar();
  X = -X;
  b = X;
  bar();
  c = X;
}

//===---------------------------------------------------------------------===//

Handling llvm.memory.barrier on pre-SSE2 CPUs

should generate:
lock ; mov %esp, %esp

//===---------------------------------------------------------------------===//

The code generated on x86 for checking for signed overflow on a multiply done
the obvious way is much longer than it needs to be.

int x(int a, int b) {
  long long prod = (long long)a*b;
  return  prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
}

See PR2053 for more details.

//===---------------------------------------------------------------------===//

We should investigate using cdq/cltd (effect: edx = sar eax, 31)
more aggressively; it should cost the same as a move+shift on any modern
processor, but it's a lot shorter. Downside is that it puts more
pressure on register allocation because it has fixed operands.

Example:
int abs(int x) {return x < 0 ? -x : x;}

gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
abs:
        movl    4(%esp), %eax
        cltd
        xorl    %edx, %eax
        subl    %edx, %eax
        ret

//===---------------------------------------------------------------------===//

Consider:
int test(unsigned long a, unsigned long b) { return -(a < b); }

We currently compile this to:

define i32 @test(i32 %a, i32 %b) nounwind  {
        %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
        %tmp34 = zext i1 %tmp3 to i32           ; <i32> [#uses=1]
        %tmp5 = sub i32 0, %tmp34               ; <i32> [#uses=1]
        ret i32 %tmp5
}

and

_test:
        movl    8(%esp), %eax
        cmpl    %eax, 4(%esp)
        setb    %al
        movzbl  %al, %eax
        negl    %eax
        ret

Several deficiencies here.  First, we should instcombine zext+neg into sext:

define i32 @test2(i32 %a, i32 %b) nounwind {
        %tmp3 = icmp ult i32 %a, %b             ; <i1> [#uses=1]
        %tmp34 = sext i1 %tmp3 to i32           ; <i32> [#uses=1]
        ret i32 %tmp34
}

However, before we can do that, we have to fix the bad codegen that we get for
sext from bool:

_test2:
        movl    8(%esp), %eax
        cmpl    %eax, 4(%esp)
        setb    %al
        movzbl  %al, %eax
        shll    $31, %eax
        sarl    $31, %eax
        ret

This code should be at least as good as the code above.  Once this is fixed, we
can optimize this specific case even more to:

        movl    8(%esp), %eax
        xorl    %ecx, %ecx
        cmpl    %eax, 4(%esp)
        sbbl    %ecx, %ecx

//===---------------------------------------------------------------------===//

Take the following code (from
http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):

extern unsigned char first_one[65536];
int FirstOnet(unsigned long long arg1)
{
  if (arg1 >> 48)
    return (first_one[arg1 >> 48]);
  return 0;
}


The following code is currently generated:
FirstOnet:
        movl    8(%esp), %eax
        cmpl    $65536, %eax
        movl    4(%esp), %ecx
        jb      .LBB1_2 # UnifiedReturnBlock
.LBB1_1:        # ifthen
        shrl    $16, %eax
        movzbl  first_one(%eax), %eax
        ret
.LBB1_2:        # UnifiedReturnBlock
        xorl    %eax, %eax
        ret

There are a few possible improvements here:
1. We should be able to eliminate the dead load into %ecx
2. We could change the "movl 8(%esp), %eax" into
   "movzwl 10(%esp), %eax"; this lets us change the cmpl
   into a testl, which is shorter, and eliminate the shift.

We could also in theory eliminate the branch by using a conditional
for the address of the load, but that seems unlikely to be worthwhile
in general.

//===---------------------------------------------------------------------===//

We compile this function:

define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext  %d) nounwind  {
entry:
        %tmp2 = icmp eq i8 %d, 0                ; <i1> [#uses=1]
        br i1 %tmp2, label %bb7, label %bb

bb:             ; preds = %entry
        %tmp6 = add i32 %b, %a          ; <i32> [#uses=1]
        ret i32 %tmp6

bb7:            ; preds = %entry
        %tmp10 = sub i32 %a, %c         ; <i32> [#uses=1]
        ret i32 %tmp10
}

to:

_foo:
        cmpb    $0, 16(%esp)
        movl    12(%esp), %ecx
        movl    8(%esp), %eax
        movl    4(%esp), %edx
        je      LBB1_2  # bb7
LBB1_1: # bb
        addl    %edx, %eax
        ret
LBB1_2: # bb7
        movl    %edx, %eax
        subl    %ecx, %eax
        ret

The coalescer could coalesce "edx" with "eax" to avoid the movl in LBB1_2
if it commuted the addl in LBB1_1.

//===---------------------------------------------------------------------===//

See rdar://4653682.

From flops:

LBB1_15:        # bb310
        cvtss2sd        LCPI1_0, %xmm1
        addsd   %xmm1, %xmm0
        movsd   176(%esp), %xmm2
        mulsd   %xmm0, %xmm2
        movapd  %xmm2, %xmm3
        mulsd   %xmm3, %xmm3
        movapd  %xmm3, %xmm4
        mulsd   LCPI1_23, %xmm4
        addsd   LCPI1_24, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_25, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_26, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_27, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   LCPI1_28, %xmm4
        mulsd   %xmm3, %xmm4
        addsd   %xmm1, %xmm4
        mulsd   %xmm2, %xmm4
        movsd   152(%esp), %xmm1
        addsd   %xmm4, %xmm1
        movsd   %xmm1, 152(%esp)
        incl    %eax
        cmpl    %eax, %esi
        jge     LBB1_15 # bb310
LBB1_16:        # bb358.loopexit
        movsd   152(%esp), %xmm0
        addsd   %xmm0, %xmm0
        addsd   LCPI1_22, %xmm0
        movsd   %xmm0, 152(%esp)

Rather than spilling the result of the last addsd in the loop, we should have
inserted a copy to split the interval (one for the duration of the loop, one
extending to the fall through). The register pressure in the loop isn't high
enough to warrant the spill.

Also check why xmm7 is not used at all in the function.

//===---------------------------------------------------------------------===//

Legalize loses track of the fact that bools are always zero extended when in
memory.  This causes us to compile abort_gzip (from 164.gzip) from:

target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@in_exit.4870.b = internal global i1 false              ; <i1*> [#uses=2]
define fastcc void @abort_gzip() noreturn nounwind  {
entry:
        %tmp.b.i = load i1* @in_exit.4870.b             ; <i1> [#uses=1]
        br i1 %tmp.b.i, label %bb.i, label %bb4.i
bb.i:           ; preds = %entry
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
bb4.i:          ; preds = %entry
        store i1 true, i1* @in_exit.4870.b
        tail call void @exit( i32 1 ) noreturn nounwind
        unreachable
}
declare void @exit(i32) noreturn nounwind

into:

_abort_gzip:
        subl    $12, %esp
        movb    _in_exit.4870.b, %al
        notb    %al
        testb   $1, %al
        jne     LBB1_2  ## bb4.i
LBB1_1: ## bb.i
        ...

//===---------------------------------------------------------------------===//