blob: c06a7b1ade6d6cce071c1dedeb4af8578ca2c1c4 [file] [log] [blame]
Chris Lattner2e81fba2005-10-23 19:52:42 +00001//===---------------------------------------------------------------------===//
2// Random ideas for the X86 backend.
3//===---------------------------------------------------------------------===//
4
Chris Lattner2e81fba2005-10-23 19:52:42 +00005Improvements to the multiply -> shift/add algorithm:
6http://gcc.gnu.org/ml/gcc-patches/2004-08/msg01590.html
7
8//===---------------------------------------------------------------------===//
9
10Improve code like this (occurs fairly frequently, e.g. in LLVM):
11long long foo(int x) { return 1LL << x; }
12
13http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01109.html
14http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01128.html
15http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01136.html
16
17Another useful one would be ~0ULL >> X and ~0ULL << X.
18
Chris Lattner34967102006-09-13 03:54:54 +000019One better solution for 1LL << x is:
20 xorl %eax, %eax
21 xorl %edx, %edx
22 testb $32, %cl
23 sete %al
24 setne %dl
25 sall %cl, %eax
26 sall %cl, %edx
27
28But that requires good 8-bit subreg support.
29
Eli Friedman5d8fa822008-02-21 21:16:49 +000030Also, this might be better. It's an extra shift, but it's one instruction
31shorter, and doesn't stress 8-bit subreg support.
32(From http://gcc.gnu.org/ml/gcc-patches/2004-09/msg01148.html,
33but without the unnecessary and.)
34 movl %ecx, %eax
35 shrl $5, %eax
36 movl %eax, %edx
37 xorl $1, %edx
38 sall %cl, %eax
39 sall %cl, %edx
40
Chris Lattner523dbc52006-09-18 05:36:54 +00004164-bit shifts (in general) expand to really bad code. Instead of using
42cmovs, we should expand to a conditional branch like GCC produces.
Chris Lattner34967102006-09-13 03:54:54 +000043
Chris Lattnerb5407072005-10-23 21:44:59 +000044//===---------------------------------------------------------------------===//
45
Evan Cheng6b760092005-12-17 01:25:19 +000046Some isel ideas:
47
Craig Topperf2106192012-01-07 20:35:21 +0000481. Dynamic programming based approach when compile time is not an
Evan Cheng6b760092005-12-17 01:25:19 +000049 issue.
502. Code duplication (addressing mode) during isel.
513. Other ideas from "Register-Sensitive Selection, Duplication, and
52 Sequencing of Instructions".
Roman Shirokiyd93998f2016-06-10 13:12:48 +0000534. Scheduling for reduced register pressure. E.g. "Minimum Register
54 Instruction Sequence Problem: Revisiting Optimal Code Generation for DAGs"
Chris Lattnerb7e074a2006-02-08 07:12:07 +000055 and other related papers.
56 http://citeseer.ist.psu.edu/govindarajan01minimum.html
Evan Cheng6b760092005-12-17 01:25:19 +000057
58//===---------------------------------------------------------------------===//
59
60Should we promote i16 to i32 to avoid partial register update stalls?
Evan Cheng7087cd22005-12-17 06:54:43 +000061
62//===---------------------------------------------------------------------===//
63
64Leave any_extend as pseudo instruction and hint to register
65allocator. Delay codegen until post register allocation.
Evan Cheng409fa442007-10-12 18:22:55 +000066Note. any_extend is now turned into an INSERT_SUBREG. We still need to teach
67the coalescer how to deal with it though.
Evan Cheng6305e502006-01-12 22:54:21 +000068
69//===---------------------------------------------------------------------===//
70
Evan Cheng27ba94b2007-07-17 18:39:45 +000071It appears icc uses push for parameter passing. Need to investigate.
Chris Lattnerb2eacf42006-01-16 17:53:00 +000072
73//===---------------------------------------------------------------------===//
74
Chris Lattner5a7a22c2006-01-29 09:08:15 +000075The instruction selector sometimes misses folding a load into a compare. The
Roman Shirokiyd93998f2016-06-10 13:12:48 +000076pattern is written as (cmp reg, (load p)). Because the compare isn't
Chris Lattner5a7a22c2006-01-29 09:08:15 +000077commutative, it is not matched with the load on both sides. The dag combiner
Alp Toker171b0c32013-12-20 00:33:39 +000078should be made smart enough to canonicalize the load into the RHS of a compare
Chris Lattner5a7a22c2006-01-29 09:08:15 +000079when it can invert the result of the compare for free.
80
Evan Cheng9e77d9a2006-09-11 05:25:15 +000081//===---------------------------------------------------------------------===//
82
Chris Lattnerd3f033e2006-02-02 19:16:34 +000083In many cases, LLVM generates code like this:
84
85_test:
86 movl 8(%esp), %eax
87 cmpl %eax, 4(%esp)
88 setl %al
89 movzbl %al, %eax
90 ret
91
92on some processors (which ones?), it is more efficient to do this:
93
94_test:
95 movl 8(%esp), %ebx
Evan Cheng9e77d9a2006-09-11 05:25:15 +000096 xor %eax, %eax
Chris Lattnerd3f033e2006-02-02 19:16:34 +000097 cmpl %ebx, 4(%esp)
98 setl %al
99 ret
100
101Doing this correctly is tricky though, as the xor clobbers the flags.
102
Chris Lattnerd8208c32006-02-02 19:43:28 +0000103//===---------------------------------------------------------------------===//
104
Chris Lattner45bb34b2006-02-08 06:52:06 +0000105We should generate bts/btr/etc instructions on targets where they are cheap or
106when codesize is important. e.g., for:
107
108void setbit(int *target, int bit) {
109 *target |= (1 << bit);
110}
111void clearbit(int *target, int bit) {
112 *target &= ~(1 << bit);
113}
114
Chris Lattnerb4fc0502006-02-08 17:47:22 +0000115//===---------------------------------------------------------------------===//
116
Evan Chengf976d792006-02-14 08:25:32 +0000117Instead of the following for memset char*, 1, 10:
118
119 movl $16843009, 4(%edx)
120 movl $16843009, (%edx)
121 movw $257, 8(%edx)
122
123It might be better to generate
124
125 movl $16843009, %eax
126 movl %eax, 4(%edx)
127 movl %eax, (%edx)
128 movw %ax, 8(%edx)
129
130when we can spare a register. It reduces code size.
Evan Chengb590d3a2006-02-17 00:04:28 +0000131
132//===---------------------------------------------------------------------===//
133
Chris Lattner67c21b62006-02-17 04:20:13 +0000134Evaluate what the best way to codegen sdiv X, (2^C) is. For X/8, we currently
135get this:
136
Eli Friedman93e8b672008-02-28 00:21:43 +0000137define i32 @test1(i32 %X) {
138 %Y = sdiv i32 %X, 8
139 ret i32 %Y
Chris Lattner67c21b62006-02-17 04:20:13 +0000140}
141
142_test1:
143 movl 4(%esp), %eax
144 movl %eax, %ecx
145 sarl $31, %ecx
146 shrl $29, %ecx
147 addl %ecx, %eax
148 sarl $3, %eax
149 ret
150
151GCC knows several different ways to codegen it, one of which is this:
152
153_test1:
154 movl 4(%esp), %eax
155 cmpl $-1, %eax
156 leal 7(%eax), %ecx
157 cmovle %ecx, %eax
158 sarl $3, %eax
159 ret
160
161which is probably slower, but it's interesting at least :)
162
Evan Cheng45474002006-02-20 19:58:27 +0000163//===---------------------------------------------------------------------===//
164
Nate Begeman68cc9d42006-03-26 19:19:27 +0000165We are currently lowering large (1MB+) memmove/memcpy to rep/stosl and rep/movsl.
166We should leave these as libcalls for everything over a much lower threshold,
167since libc is hand tuned for medium and large mem ops (avoiding RFO for large
168stores, TLB preheating, etc)
169
170//===---------------------------------------------------------------------===//
171
Chris Lattner920e6612006-03-09 01:39:46 +0000172Optimize this into something reasonable:
173 x * copysign(1.0, y) * copysign(1.0, z)
174
175//===---------------------------------------------------------------------===//
176
177Optimize copysign(x, *y) to use an integer load from y.
178
179//===---------------------------------------------------------------------===//
180
Evan Cheng66a9c0d2006-03-19 06:08:11 +0000181The following tests perform worse with LSR:
182
183lambda, siod, optimizer-eval, ackermann, hash2, nestedloop, strcat, and Treesor.
Chris Lattnerd16f6fd2006-03-19 22:27:41 +0000184
185//===---------------------------------------------------------------------===//
186
Evan Chengdddb6882006-04-05 23:46:04 +0000187Adding to the list of cmp / test poor codegen issues:
188
189int test(__m128 *A, __m128 *B) {
190 if (_mm_comige_ss(*A, *B))
191 return 3;
192 else
193 return 4;
194}
195
196_test:
197 movl 8(%esp), %eax
198 movaps (%eax), %xmm0
199 movl 4(%esp), %eax
200 movaps (%eax), %xmm1
201 comiss %xmm0, %xmm1
202 setae %al
203 movzbl %al, %ecx
204 movl $3, %eax
205 movl $4, %edx
206 cmpl $0, %ecx
207 cmove %edx, %eax
208 ret
209
210Note the setae, movzbl, cmpl, cmove can be replaced with a single cmovae. There
211are a number of issues. 1) We are introducing a setcc between the result of the
212intrinsic call and select. 2) The intrinsic is expected to produce an i32 value
213so an any_extend (which becomes a zero extend) is added.
214
215We probably need some kind of target DAG combine hook to fix this.
Evan Chengacf8b3c2006-04-06 23:21:24 +0000216
217//===---------------------------------------------------------------------===//
218
Chris Lattnerf1105272006-04-23 19:47:09 +0000219We generate significantly worse code for this than GCC:
220http://gcc.gnu.org/bugzilla/show_bug.cgi?id=21150
221http://gcc.gnu.org/bugzilla/attachment.cgi?id=8701
222
223There is also one case we do worse on PPC.
224
225//===---------------------------------------------------------------------===//
Evan Chengd03631e2006-04-24 23:30:10 +0000226
Evan Cheng02420142006-05-30 07:37:37 +0000227For this:
228
229int test(int a)
230{
231 return a * 3;
232}
233
234We currently emit
235 imull $3, 4(%esp), %eax
236
237Perhaps what we really should generate is this? Is imull three or four
238cycles? Note: ICC generates this:
239 movl 4(%esp), %eax
240 leal (%eax,%eax,2), %eax
241
242The current instruction priority is based on pattern complexity. The former is
243more "complex" because it folds a load so the latter will not be emitted.
244
245Perhaps we should use AddedComplexity to give LEA32r a higher priority? We
246should always try to match LEA first since the LEA matching code does some
247estimate to determine whether the match is profitable.
248
249However, if we care more about code size, then imull is better. It's two bytes
250shorter than movl + leal.
Evan Cheng0f29df92006-06-04 09:08:00 +0000251
Eli Friedmane9ef1702008-11-30 07:52:27 +0000252On a Pentium M, both variants have the same characteristics with regard
253to throughput; however, the multiplication has a latency of four cycles, as
254opposed to two cycles for the movl+lea variant.
255
Evan Cheng0f29df92006-06-04 09:08:00 +0000256//===---------------------------------------------------------------------===//
257
Evan Cheng0f29df92006-06-04 09:08:00 +0000258It appears gcc places string data with linkonce linkage in
259.section __TEXT,__const_coal,coalesced instead of
260.section __DATA,__const_coal,coalesced.
261Take a look at darwin.h, there are other Darwin assembler directives that we
262do not make use of.
263
264//===---------------------------------------------------------------------===//
265
Chris Lattnereb63b092008-02-14 06:19:02 +0000266define i32 @foo(i32* %a, i32 %t) {
Nate Begeman6025c922006-08-02 05:31:20 +0000267entry:
Chris Lattnereb63b092008-02-14 06:19:02 +0000268 br label %cond_true
Nate Begeman6025c922006-08-02 05:31:20 +0000269
Chris Lattnereb63b092008-02-14 06:19:02 +0000270cond_true: ; preds = %cond_true, %entry
271 %x.0.0 = phi i32 [ 0, %entry ], [ %tmp9, %cond_true ] ; <i32> [#uses=3]
272 %t_addr.0.0 = phi i32 [ %t, %entry ], [ %tmp7, %cond_true ] ; <i32> [#uses=1]
273 %tmp2 = getelementptr i32* %a, i32 %x.0.0 ; <i32*> [#uses=1]
274 %tmp3 = load i32* %tmp2 ; <i32> [#uses=1]
275 %tmp5 = add i32 %t_addr.0.0, %x.0.0 ; <i32> [#uses=1]
276 %tmp7 = add i32 %tmp5, %tmp3 ; <i32> [#uses=2]
277 %tmp9 = add i32 %x.0.0, 1 ; <i32> [#uses=2]
278 %tmp = icmp sgt i32 %tmp9, 39 ; <i1> [#uses=1]
279 br i1 %tmp, label %bb12, label %cond_true
Nate Begeman6025c922006-08-02 05:31:20 +0000280
Chris Lattnereb63b092008-02-14 06:19:02 +0000281bb12: ; preds = %cond_true
282 ret i32 %tmp7
Chris Lattnercb295862006-06-15 21:33:31 +0000283}
Nate Begeman6025c922006-08-02 05:31:20 +0000284is pessimized by -loop-reduce and -indvars
Chris Lattnercb295862006-06-15 21:33:31 +0000285
286//===---------------------------------------------------------------------===//
Evan Chenga54b9642006-06-17 00:45:49 +0000287
Evan Cheng8a881f22006-07-19 21:29:30 +0000288u32 to float conversion improvement:
289
290float uint32_2_float( unsigned u ) {
291 float fl = (int) (u & 0xffff);
292 float fh = (int) (u >> 16);
293 fh *= 0x1.0p16f;
294 return fh + fl;
295}
296
29700000000 subl $0x04,%esp
29800000003 movl 0x08(%esp,1),%eax
29900000007 movl %eax,%ecx
30000000009 shrl $0x10,%ecx
3010000000c cvtsi2ss %ecx,%xmm0
30200000010 andl $0x0000ffff,%eax
30300000015 cvtsi2ss %eax,%xmm1
30400000019 mulss 0x00000078,%xmm0
30500000021 addss %xmm1,%xmm0
30600000025 movss %xmm0,(%esp,1)
3070000002a flds (%esp,1)
3080000002d addl $0x04,%esp
30900000030 ret
Evan Cheng23a21c12006-07-26 21:49:52 +0000310
311//===---------------------------------------------------------------------===//
312
313When using fastcc abi, align stack slot of argument of type double on 8 byte
314boundary to improve performance.
Chris Lattner08a5f382006-08-16 02:47:44 +0000315
316//===---------------------------------------------------------------------===//
317
Chris Lattnere413fea2006-09-13 04:19:50 +0000318GCC's ix86_expand_int_movcc function (in i386.c) has a ton of interesting
Benjamin Kramerb37ae332010-12-23 15:07:02 +0000319simplifications for integer "x cmp y ? a : b".
Chris Lattner389d4302007-11-02 17:04:20 +0000320
Chris Lattnere413fea2006-09-13 04:19:50 +0000321//===---------------------------------------------------------------------===//
Chris Lattner14633772006-09-13 23:37:16 +0000322
Chris Lattner03fda132006-10-12 22:01:26 +0000323Consider the expansion of:
324
Chris Lattnereb63b092008-02-14 06:19:02 +0000325define i32 @test3(i32 %X) {
326 %tmp1 = urem i32 %X, 255
327 ret i32 %tmp1
Chris Lattner03fda132006-10-12 22:01:26 +0000328}
329
330Currently it compiles to:
331
332...
333 movl $2155905153, %ecx
334 movl 8(%esp), %esi
335 movl %esi, %eax
336 mull %ecx
337...
338
339This could be "reassociated" into:
340
341 movl $2155905153, %eax
342 movl 8(%esp), %ecx
343 mull %ecx
344
345to avoid the copy. In fact, the existing two-address stuff would do this
346except that mul isn't a commutative 2-addr instruction. I guess this has
347to be done at isel time based on the #uses to mul?
348
Evan Cheng69b18252006-11-28 19:59:25 +0000349//===---------------------------------------------------------------------===//
350
351Make sure the instruction which starts a loop does not cross a cacheline
352boundary. This requires knowing the exact length of each machine instruction.
353That is somewhat complicated, but doable. Example 256.bzip2:
354
355In the new trace, the hot loop has an instruction which crosses a cacheline
356boundary. In addition to potential cache misses, this can't help decoding as I
357imagine there has to be some kind of complicated decoder reset and realignment
358to grab the bytes from the next cacheline.
359
360532 532 0x3cfc movb (1809(%esp, %esi), %bl <<<--- spans 2 64 byte lines
Eli Friedmane9ef1702008-11-30 07:52:27 +0000361942 942 0x3d03 movl %dh, (1809(%esp, %esi)
362937 937 0x3d0a incl %esi
3633 3 0x3d0b cmpb %bl, %dl
Evan Cheng69b18252006-11-28 19:59:25 +000036427 27 0x3d0d jnz 0x000062db <main+11707>
365
366//===---------------------------------------------------------------------===//
367
368In c99 mode, the preprocessor doesn't like assembly comments like #TRUNCATE.
Chris Lattner959113a2006-12-22 01:03:22 +0000369
370//===---------------------------------------------------------------------===//
371
372This could be a single 16-bit load.
Chris Lattnerf9cf0532007-01-03 19:12:31 +0000373
Chris Lattner959113a2006-12-22 01:03:22 +0000374int f(char *p) {
Chris Lattnerf9cf0532007-01-03 19:12:31 +0000375 if ((p[0] == 1) & (p[1] == 2)) return 1;
Chris Lattner959113a2006-12-22 01:03:22 +0000376 return 0;
377}
378
Chris Lattneraf313982007-01-06 01:30:45 +0000379//===---------------------------------------------------------------------===//
380
381We should inline lrintf and probably other libc functions.
382
383//===---------------------------------------------------------------------===//
Chris Lattnere76908b2007-01-15 06:25:39 +0000384
Chris Lattnerb5c89c82007-02-12 20:26:34 +0000385This code:
386
387void test(int X) {
388 if (X) abort();
389}
390
Chris Lattner44e94722007-02-12 21:20:26 +0000391is currently compiled to:
Chris Lattnerb5c89c82007-02-12 20:26:34 +0000392
393_test:
394 subl $12, %esp
395 cmpl $0, 16(%esp)
Chris Lattner44e94722007-02-12 21:20:26 +0000396 jne LBB1_1
Chris Lattnerb5c89c82007-02-12 20:26:34 +0000397 addl $12, %esp
398 ret
Chris Lattner44e94722007-02-12 21:20:26 +0000399LBB1_1:
Chris Lattnerb5c89c82007-02-12 20:26:34 +0000400 call L_abort$stub
401
402It would be better to produce:
403
404_test:
405 subl $12, %esp
406 cmpl $0, 16(%esp)
407 jne L_abort$stub
408 addl $12, %esp
409 ret
410
411This can be applied to any no-return function call that takes no arguments etc.
Chris Lattner44e94722007-02-12 21:20:26 +0000412Alternatively, the stack save/restore logic could be shrink-wrapped, producing
413something like this:
414
415_test:
416 cmpl $0, 4(%esp)
417 jne LBB1_1
418 ret
419LBB1_1:
420 subl $12, %esp
421 call L_abort$stub
422
423Both are useful in different situations. Finally, it could be shrink-wrapped
424and tail called, like this:
425
426_test:
427 cmpl $0, 4(%esp)
428 jne LBB1_1
429 ret
430LBB1_1:
431 pop %eax # realign stack.
432 call L_abort$stub
433
434Though this probably isn't worth it.
Chris Lattnerb5c89c82007-02-12 20:26:34 +0000435
436//===---------------------------------------------------------------------===//
Chris Lattnerfc2f5212007-03-02 05:04:52 +0000437
Chris Lattner623c7382007-05-10 00:08:04 +0000438Sometimes it is better to codegen subtractions from a constant (e.g. 7-x) with
439a neg instead of a sub instruction. Consider:
440
441int test(char X) { return 7-X; }
442
443we currently produce:
444_test:
445 movl $7, %eax
446 movsbl 4(%esp), %ecx
447 subl %ecx, %eax
448 ret
449
450We would use one fewer register if codegen'd as:
451
452 movsbl 4(%esp), %eax
453 neg %eax
454 add $7, %eax
455 ret
456
457Note that this isn't beneficial if the load can be folded into the sub. In
458this case, we want a sub:
459
460int test(int X) { return 7-X; }
461_test:
462 movl $7, %eax
463 subl 4(%esp), %eax
464 ret
Chris Lattnere2754632007-04-14 23:06:09 +0000465
Evan Chengf3140552007-07-18 08:21:49 +0000466//===---------------------------------------------------------------------===//
467
Chris Lattner78846b62007-08-20 02:14:33 +0000468Leaf functions that require one 4-byte spill slot have a prolog like this:
469
470_foo:
471 pushl %esi
472 subl $4, %esp
473...
474and an epilog like this:
475 addl $4, %esp
476 popl %esi
477 ret
478
479It would be smaller, and potentially faster, to push eax on entry and to
480pop into a dummy register instead of using addl/subl of esp. Just don't pop
481into any return registers :)
482
483//===---------------------------------------------------------------------===//
Chris Lattner33800d12007-08-23 15:22:07 +0000484
485The X86 backend should fold (branch (or (setcc, setcc))) into multiple
486branches. We generate really poor code for:
487
488double testf(double a) {
489 return a == 0.0 ? 0.0 : (a > 0.0 ? 1.0 : -1.0);
490}
491
492For example, the entry BB is:
493
494_testf:
495 subl $20, %esp
496 pxor %xmm0, %xmm0
497 movsd 24(%esp), %xmm1
498 ucomisd %xmm0, %xmm1
499 setnp %al
500 sete %cl
501 testb %cl, %al
502 jne LBB1_5 # UnifiedReturnBlock
503LBB1_1: # cond_true
504
505
506it would be better to replace the last four instructions with:
507
508 jp LBB1_1
509 je LBB1_5
510LBB1_1:
511
512We also codegen the inner ?: into a diamond:
513
514 cvtss2sd LCPI1_0(%rip), %xmm2
515 cvtss2sd LCPI1_1(%rip), %xmm3
516 ucomisd %xmm1, %xmm0
517 ja LBB1_3 # cond_true
518LBB1_2: # cond_true
519 movapd %xmm3, %xmm2
520LBB1_3: # cond_true
521 movapd %xmm2, %xmm0
522 ret
523
524We should sink the load into xmm3 into the LBB1_2 block. This should
525be pretty easy, and will nuke all the copies.
526
527//===---------------------------------------------------------------------===//
Chris Lattner6777b722007-09-10 21:43:18 +0000528
529This:
530 #include <algorithm>
531 inline std::pair<unsigned, bool> full_add(unsigned a, unsigned b)
532 { return std::make_pair(a + b, a + b < a); }
533 bool no_overflow(unsigned a, unsigned b)
534 { return !full_add(a, b).second; }
535
536Should compile to:
Eli Friedman78b98512011-02-19 21:54:28 +0000537 addl %esi, %edi
538 setae %al
539 movzbl %al, %eax
540 ret
Chris Lattner6777b722007-09-10 21:43:18 +0000541
Eli Friedman78b98512011-02-19 21:54:28 +0000542on x86-64, instead of the rather stupid-looking:
543 addl %esi, %edi
544 setb %al
545 xorb $1, %al
546 movzbl %al, %eax
547 ret
Chris Lattner6777b722007-09-10 21:43:18 +0000548
549
550//===---------------------------------------------------------------------===//
Evan Cheng8c3c1982007-09-10 22:16:37 +0000551
Bill Wendling9c4d61b2007-10-02 20:42:59 +0000552The following code:
553
Bill Wendling96ed3bb2007-10-02 20:54:32 +0000554bb114.preheader: ; preds = %cond_next94
555 %tmp231232 = sext i16 %tmp62 to i32 ; <i32> [#uses=1]
556 %tmp233 = sub i32 32, %tmp231232 ; <i32> [#uses=1]
557 %tmp245246 = sext i16 %tmp65 to i32 ; <i32> [#uses=1]
558 %tmp252253 = sext i16 %tmp68 to i32 ; <i32> [#uses=1]
559 %tmp254 = sub i32 32, %tmp252253 ; <i32> [#uses=1]
560 %tmp553554 = bitcast i16* %tmp37 to i8* ; <i8*> [#uses=2]
561 %tmp583584 = sext i16 %tmp98 to i32 ; <i32> [#uses=1]
562 %tmp585 = sub i32 32, %tmp583584 ; <i32> [#uses=1]
563 %tmp614615 = sext i16 %tmp101 to i32 ; <i32> [#uses=1]
564 %tmp621622 = sext i16 %tmp104 to i32 ; <i32> [#uses=1]
565 %tmp623 = sub i32 32, %tmp621622 ; <i32> [#uses=1]
566 br label %bb114
567
568produces:
569
Bill Wendling9c4d61b2007-10-02 20:42:59 +0000570LBB3_5: # bb114.preheader
571 movswl -68(%ebp), %eax
572 movl $32, %ecx
573 movl %ecx, -80(%ebp)
574 subl %eax, -80(%ebp)
575 movswl -52(%ebp), %eax
576 movl %ecx, -84(%ebp)
577 subl %eax, -84(%ebp)
578 movswl -70(%ebp), %eax
579 movl %ecx, -88(%ebp)
580 subl %eax, -88(%ebp)
581 movswl -50(%ebp), %eax
582 subl %eax, %ecx
583 movl %ecx, -76(%ebp)
584 movswl -42(%ebp), %eax
585 movl %eax, -92(%ebp)
586 movswl -66(%ebp), %eax
587 movl %eax, -96(%ebp)
588 movw $0, -98(%ebp)
589
Chris Lattner21ba1762007-10-03 03:40:24 +0000590This appears to be bad because the RA is not folding the store to the stack
591slot into the movl. The above instructions could be:
592 movl $32, -80(%ebp)
593...
594 movl $32, -84(%ebp)
595...
596This seems like a cross between remat and spill folding.
597
Bill Wendling96ed3bb2007-10-02 20:54:32 +0000598This has redundant subtractions of %eax from a stack slot. However, %ecx doesn't
Bill Wendling9c4d61b2007-10-02 20:42:59 +0000599change, so we could simply subtract %eax from %ecx first and then use %ecx (or
600vice-versa).
601
602//===---------------------------------------------------------------------===//
603
Bill Wendling3efc0752007-10-02 21:49:31 +0000604This code:
605
606 %tmp659 = icmp slt i16 %tmp654, 0 ; <i1> [#uses=1]
607 br i1 %tmp659, label %cond_true662, label %cond_next715
608
609produces this:
610
611 testw %cx, %cx
612 movswl %cx, %esi
613 jns LBB4_109 # cond_next715
614
615Shark tells us that using %cx in the testw instruction is sub-optimal. It
616suggests using the 32-bit register (which is what ICC uses).
617
618//===---------------------------------------------------------------------===//
Chris Lattner4bdb84f2007-10-03 17:10:03 +0000619
Chris Lattner1f2b5f02007-10-04 15:47:27 +0000620We compile this:
621
622void compare (long long foo) {
623 if (foo < 4294967297LL)
624 abort();
625}
626
627to:
628
Eli Friedman5d8fa822008-02-21 21:16:49 +0000629compare:
630 subl $4, %esp
631 cmpl $0, 8(%esp)
Chris Lattner1f2b5f02007-10-04 15:47:27 +0000632 setne %al
633 movzbw %al, %ax
Eli Friedman5d8fa822008-02-21 21:16:49 +0000634 cmpl $1, 12(%esp)
Chris Lattner1f2b5f02007-10-04 15:47:27 +0000635 setg %cl
636 movzbw %cl, %cx
637 cmove %ax, %cx
Eli Friedman5d8fa822008-02-21 21:16:49 +0000638 testb $1, %cl
639 jne .LBB1_2 # UnifiedReturnBlock
640.LBB1_1: # ifthen
641 call abort
642.LBB1_2: # UnifiedReturnBlock
643 addl $4, %esp
644 ret
Chris Lattner1f2b5f02007-10-04 15:47:27 +0000645
646(also really horrible code on ppc). This is due to the expand code for 64-bit
647compares. GCC produces multiple branches, which is much nicer:
648
Eli Friedman5d8fa822008-02-21 21:16:49 +0000649compare:
650 subl $12, %esp
651 movl 20(%esp), %edx
652 movl 16(%esp), %eax
653 decl %edx
654 jle .L7
655.L5:
656 addl $12, %esp
657 ret
658 .p2align 4,,7
659.L7:
660 jl .L4
Chris Lattner1f2b5f02007-10-04 15:47:27 +0000661 cmpl $0, %eax
Eli Friedman5d8fa822008-02-21 21:16:49 +0000662 .p2align 4,,8
663 ja .L5
664.L4:
665 .p2align 4,,9
666 call abort
Chris Lattner1f2b5f02007-10-04 15:47:27 +0000667
668//===---------------------------------------------------------------------===//
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000669
Arnold Schwaighofer1f0da1f2007-10-12 21:30:57 +0000670Tail call optimization improvements: Tail call optimization currently
671pushes all arguments on the top of the stack (their normal place for
Arnold Schwaighofer6cf72fb2008-01-11 16:49:42 +0000672non-tail call optimized calls) that source from the caller's arguments
673or that source from a virtual register (also possibly sourcing from
674caller's arguments).
675This is done to prevent overwriting of parameters (see example
676below) that might be used later.
Arnold Schwaighofer1f0da1f2007-10-12 21:30:57 +0000677
678example:
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000679
680int callee(int32, int64);
681int caller(int32 arg1, int32 arg2) {
682 int64 local = arg2 * 2;
683 return callee(arg2, (int64)local);
684}
685
686[arg1] [!arg2 no longer valid since we moved local onto it]
687[arg2] -> [(int64)
688[RETADDR] local ]
689
Arnold Schwaighofer1f0da1f2007-10-12 21:30:57 +0000690Moving arg1 onto the stack slot of callee function would overwrite
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000691arg2 of the caller.
692
693Possible optimizations:
694
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000695
Arnold Schwaighofer1f0da1f2007-10-12 21:30:57 +0000696 - Analyse the actual parameters of the callee to see which would
697 overwrite a caller parameter which is used by the callee and only
698 push them onto the top of the stack.
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000699
700 int callee (int32 arg1, int32 arg2);
701 int caller (int32 arg1, int32 arg2) {
702 return callee(arg1,arg2);
703 }
704
Arnold Schwaighofer1f0da1f2007-10-12 21:30:57 +0000705 Here we don't need to write any variables to the top of the stack
706 since they don't overwrite each other.
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000707
708 int callee (int32 arg1, int32 arg2);
709 int caller (int32 arg1, int32 arg2) {
710 return callee(arg2,arg1);
711 }
712
Arnold Schwaighofer1f0da1f2007-10-12 21:30:57 +0000713 Here we need to push the arguments because they overwrite each
714 other.
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000715
Arnold Schwaighofer9ccea992007-10-11 19:40:01 +0000716//===---------------------------------------------------------------------===//
Evan Chengc826ac52007-10-28 04:01:09 +0000717
718main ()
719{
720 int i = 0;
721 unsigned long int z = 0;
722
723 do {
724 z -= 0x00004000;
725 i++;
726 if (i > 0x00040000)
727 abort ();
728 } while (z > 0);
729 exit (0);
730}
731
732gcc compiles this to:
733
734_main:
735 subl $28, %esp
736 xorl %eax, %eax
737 jmp L2
738L3:
739 cmpl $262144, %eax
740 je L10
741L2:
742 addl $1, %eax
743 cmpl $262145, %eax
744 jne L3
745 call L_abort$stub
746L10:
747 movl $0, (%esp)
748 call L_exit$stub
749
750llvm:
751
752_main:
753 subl $12, %esp
754 movl $1, %eax
755 movl $16384, %ecx
756LBB1_1: # bb
757 cmpl $262145, %eax
758 jge LBB1_4 # cond_true
759LBB1_2: # cond_next
760 incl %eax
761 addl $4294950912, %ecx
762 cmpl $16384, %ecx
763 jne LBB1_1 # bb
764LBB1_3: # bb11
765 xorl %eax, %eax
766 addl $12, %esp
767 ret
768LBB1_4: # cond_true
769 call L_abort$stub
770
7711. LSR should rewrite the first cmp with induction variable %ecx.
7722. DAG combiner should fold
773 leal 1(%eax), %edx
774 cmpl $262145, %edx
775 =>
776 cmpl $262144, %eax
777
778//===---------------------------------------------------------------------===//
Chris Lattnerab98c412007-11-24 06:13:33 +0000779
780define i64 @test(double %X) {
781 %Y = fptosi double %X to i64
782 ret i64 %Y
783}
784
785compiles to:
786
787_test:
788 subl $20, %esp
789 movsd 24(%esp), %xmm0
790 movsd %xmm0, 8(%esp)
791 fldl 8(%esp)
792 fisttpll (%esp)
793 movl 4(%esp), %edx
794 movl (%esp), %eax
795 addl $20, %esp
796 #FP_REG_KILL
797 ret
798
799This should just fldl directly from the input stack slot.
Chris Lattnerad05e172007-12-05 22:58:19 +0000800
801//===---------------------------------------------------------------------===//
802
803This code:
804int foo (int x) { return (x & 65535) | 255; }
805
806Should compile into:
807
808_foo:
809 movzwl 4(%esp), %eax
Eli Friedman5d8fa822008-02-21 21:16:49 +0000810 orl $255, %eax
Chris Lattnerad05e172007-12-05 22:58:19 +0000811 ret
812
813instead of:
814_foo:
Eli Friedman78b98512011-02-19 21:54:28 +0000815 movl $65280, %eax
816 andl 4(%esp), %eax
817 orl $255, %eax
818 ret
Chris Lattnerad05e172007-12-05 22:58:19 +0000819
Chris Lattner2583a662007-12-18 16:48:14 +0000820//===---------------------------------------------------------------------===//
821
Chris Lattnere86c91f2008-02-21 06:51:29 +0000822We're codegen'ing multiply of long longs inefficiently:
Chris Lattner2583a662007-12-18 16:48:14 +0000823
Chris Lattnere86c91f2008-02-21 06:51:29 +0000824unsigned long long LLM(unsigned long long arg1, unsigned long long arg2) {
825 return arg1 * arg2;
826}
Chris Lattner2583a662007-12-18 16:48:14 +0000827
Chris Lattnere86c91f2008-02-21 06:51:29 +0000828We compile to (fomit-frame-pointer):
829
830_LLM:
831 pushl %esi
832 movl 8(%esp), %ecx
833 movl 16(%esp), %esi
834 movl %esi, %eax
835 mull %ecx
836 imull 12(%esp), %esi
837 addl %edx, %esi
838 imull 20(%esp), %ecx
839 movl %esi, %edx
840 addl %ecx, %edx
841 popl %esi
842 ret
843
844This looks like a scheduling deficiency and lack of remat of the load from
845the argument area. ICC apparently produces:
846
847 movl 8(%esp), %ecx
848 imull 12(%esp), %ecx
849 movl 16(%esp), %eax
850 imull 4(%esp), %eax
851 addl %eax, %ecx
852 movl 4(%esp), %eax
853 mull 12(%esp)
854 addl %ecx, %edx
Chris Lattner2583a662007-12-18 16:48:14 +0000855 ret
856
Chris Lattnere86c91f2008-02-21 06:51:29 +0000857Note that it remat'd loads from 4(esp) and 12(esp). See this GCC PR:
858http://gcc.gnu.org/bugzilla/show_bug.cgi?id=17236
Chris Lattner2583a662007-12-18 16:48:14 +0000859
860//===---------------------------------------------------------------------===//
861
Chris Lattner6c234bf2007-12-24 19:27:46 +0000862We can fold a store into "zeroing a reg". Instead of:
863
864xorl %eax, %eax
865movl %eax, 124(%esp)
866
867we should get:
868
869movl $0, 124(%esp)
870
871if the flags of the xor are dead.
872
Chris Lattnerff5998e2008-01-11 18:00:13 +0000873Likewise, we isel "x<<1" into "add reg,reg". If reg is spilled, this should
874be folded into: shl [mem], 1
875
Chris Lattner6c234bf2007-12-24 19:27:46 +0000876//===---------------------------------------------------------------------===//
Chris Lattnerd7980022007-12-28 21:50:40 +0000877
Chris Lattneref81aa72008-01-07 21:59:58 +0000878In SSE mode, we turn abs and neg into a load from the constant pool plus a xor
879or and instruction, for example:
880
Chris Lattner9129f512008-01-09 00:37:18 +0000881 xorpd LCPI1_0, %xmm2
Chris Lattneref81aa72008-01-07 21:59:58 +0000882
883However, if xmm2 gets spilled, we end up with really ugly code like this:
884
Chris Lattner9129f512008-01-09 00:37:18 +0000885 movsd (%esp), %xmm0
886 xorpd LCPI1_0, %xmm0
887 movsd %xmm0, (%esp)
Chris Lattneref81aa72008-01-07 21:59:58 +0000888
889Since we 'know' that this is a 'neg', we can actually "fold" the spill into
890the neg/abs instruction, turning it into an *integer* operation, like this:
891
892 xorl 2147483648, [mem+4] ## 2147483648 = (1 << 31)
893
894you could also use xorb, but xorl is less likely to lead to a partial register
Chris Lattner9129f512008-01-09 00:37:18 +0000895stall. Here is a contrived testcase:
896
897double a, b, c;
898void test(double *P) {
899 double X = *P;
900 a = X;
901 bar();
902 X = -X;
903 b = X;
904 bar();
905 c = X;
906}
Chris Lattneref81aa72008-01-07 21:59:58 +0000907
908//===---------------------------------------------------------------------===//
Andrew Lenharth9b254ee2008-02-16 01:24:58 +0000909
Chris Lattner1f652082008-02-17 19:43:57 +0000910The generated code on x86 for checking for signed overflow on a multiply the
911obvious way is much longer than it needs to be.
912
913int x(int a, int b) {
914 long long prod = (long long)a*b;
915 return prod > 0x7FFFFFFF || prod < (-0x7FFFFFFF-1);
916}
917
918See PR2053 for more details.
919
920//===---------------------------------------------------------------------===//
Chris Lattnera8272052008-02-18 18:30:13 +0000921
Eli Friedman5d8fa822008-02-21 21:16:49 +0000922We should investigate using cdq/cltd (effect: edx = sar eax, 31)
923more aggressively; it should cost the same as a move+shift on any modern
924processor, but it's a lot shorter. Downside is that it puts more
925pressure on register allocation because it has fixed operands.
926
927Example:
928int abs(int x) {return x < 0 ? -x : x;}
929
930gcc compiles this to the following when using march/mtune=pentium2/3/4/m/etc.:
931abs:
932 movl 4(%esp), %eax
933 cltd
934 xorl %edx, %eax
935 subl %edx, %eax
936 ret
937
938//===---------------------------------------------------------------------===//
939
Eli Friedman93e8b672008-02-28 00:21:43 +0000940Take the following code (from
941http://gcc.gnu.org/bugzilla/show_bug.cgi?id=16541):
942
943extern unsigned char first_one[65536];
944int FirstOnet(unsigned long long arg1)
945{
946 if (arg1 >> 48)
947 return (first_one[arg1 >> 48]);
948 return 0;
949}
950
951
952The following code is currently generated:
953FirstOnet:
954 movl 8(%esp), %eax
955 cmpl $65536, %eax
956 movl 4(%esp), %ecx
957 jb .LBB1_2 # UnifiedReturnBlock
958.LBB1_1: # ifthen
959 shrl $16, %eax
960 movzbl first_one(%eax), %eax
961 ret
962.LBB1_2: # UnifiedReturnBlock
963 xorl %eax, %eax
964 ret
965
Eli Friedman1f413032010-06-03 01:01:48 +0000966We could change the "movl 8(%esp), %eax" into "movzwl 10(%esp), %eax"; this
967lets us change the cmpl into a testl, which is shorter, and eliminate the shift.
Eli Friedman93e8b672008-02-28 00:21:43 +0000968
969//===---------------------------------------------------------------------===//
970
Chris Lattner83e80cd2008-02-28 04:52:59 +0000971We compile this function:
972
973define i32 @foo(i32 %a, i32 %b, i32 %c, i8 zeroext %d) nounwind {
974entry:
975 %tmp2 = icmp eq i8 %d, 0 ; <i1> [#uses=1]
976 br i1 %tmp2, label %bb7, label %bb
977
978bb: ; preds = %entry
979 %tmp6 = add i32 %b, %a ; <i32> [#uses=1]
980 ret i32 %tmp6
981
982bb7: ; preds = %entry
983 %tmp10 = sub i32 %a, %c ; <i32> [#uses=1]
984 ret i32 %tmp10
985}
986
987to:
988
Eli Friedman1f413032010-06-03 01:01:48 +0000989foo: # @foo
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000990# %bb.0: # %entry
Eli Friedman1f413032010-06-03 01:01:48 +0000991 movl 4(%esp), %ecx
Chris Lattner83e80cd2008-02-28 04:52:59 +0000992 cmpb $0, 16(%esp)
Eli Friedman1f413032010-06-03 01:01:48 +0000993 je .LBB0_2
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +0000994# %bb.1: # %bb
Chris Lattner83e80cd2008-02-28 04:52:59 +0000995 movl 8(%esp), %eax
Eli Friedman1f413032010-06-03 01:01:48 +0000996 addl %ecx, %eax
Chris Lattner83e80cd2008-02-28 04:52:59 +0000997 ret
Eli Friedman1f413032010-06-03 01:01:48 +0000998.LBB0_2: # %bb7
999 movl 12(%esp), %edx
1000 movl %ecx, %eax
1001 subl %edx, %eax
Chris Lattner83e80cd2008-02-28 04:52:59 +00001002 ret
1003
Eli Friedman1f413032010-06-03 01:01:48 +00001004There's an obviously unnecessary movl in .LBB0_2, and we could eliminate a
1005couple more movls by putting 4(%esp) into %eax instead of %ecx.
Chris Lattner83e80cd2008-02-28 04:52:59 +00001006
1007//===---------------------------------------------------------------------===//
Evan Cheng81e0c9a2008-03-28 07:07:06 +00001008
1009See rdar://4653682.
1010
1011From flops:
1012
1013LBB1_15: # bb310
1014 cvtss2sd LCPI1_0, %xmm1
1015 addsd %xmm1, %xmm0
1016 movsd 176(%esp), %xmm2
1017 mulsd %xmm0, %xmm2
1018 movapd %xmm2, %xmm3
1019 mulsd %xmm3, %xmm3
1020 movapd %xmm3, %xmm4
1021 mulsd LCPI1_23, %xmm4
1022 addsd LCPI1_24, %xmm4
1023 mulsd %xmm3, %xmm4
1024 addsd LCPI1_25, %xmm4
1025 mulsd %xmm3, %xmm4
1026 addsd LCPI1_26, %xmm4
1027 mulsd %xmm3, %xmm4
1028 addsd LCPI1_27, %xmm4
1029 mulsd %xmm3, %xmm4
1030 addsd LCPI1_28, %xmm4
1031 mulsd %xmm3, %xmm4
1032 addsd %xmm1, %xmm4
1033 mulsd %xmm2, %xmm4
1034 movsd 152(%esp), %xmm1
1035 addsd %xmm4, %xmm1
1036 movsd %xmm1, 152(%esp)
1037 incl %eax
1038 cmpl %eax, %esi
1039 jge LBB1_15 # bb310
1040LBB1_16: # bb358.loopexit
1041 movsd 152(%esp), %xmm0
1042 addsd %xmm0, %xmm0
1043 addsd LCPI1_22, %xmm0
1044 movsd %xmm0, 152(%esp)
1045
1046Rather than spilling the result of the last addsd in the loop, we should
1047insert a copy to split the interval (one for the duration of the loop, one
1048extending to the fall through). The register pressure in the loop isn't high
1049enough to warrant the spill.
1050
1051Also check why xmm7 is not used at all in the function.
Chris Lattnera89143f2008-04-21 04:46:30 +00001052
1053//===---------------------------------------------------------------------===//
1054
Eli Friedman1f413032010-06-03 01:01:48 +00001055Take the following:
Chris Lattnera89143f2008-04-21 04:46:30 +00001056
Lang Hamesde7ab802011-10-10 23:42:08 +00001057target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-S128"
Chris Lattnera89143f2008-04-21 04:46:30 +00001058target triple = "i386-apple-darwin8"
1059@in_exit.4870.b = internal global i1 false ; <i1*> [#uses=2]
1060define fastcc void @abort_gzip() noreturn nounwind {
1061entry:
1062 %tmp.b.i = load i1* @in_exit.4870.b ; <i1> [#uses=1]
1063 br i1 %tmp.b.i, label %bb.i, label %bb4.i
1064bb.i: ; preds = %entry
1065 tail call void @exit( i32 1 ) noreturn nounwind
1066 unreachable
1067bb4.i: ; preds = %entry
1068 store i1 true, i1* @in_exit.4870.b
1069 tail call void @exit( i32 1 ) noreturn nounwind
1070 unreachable
1071}
1072declare void @exit(i32) noreturn nounwind
1073
Eli Friedman1f413032010-06-03 01:01:48 +00001074This compiles into:
1075_abort_gzip: ## @abort_gzip
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001076## %bb.0: ## %entry
Chris Lattnera89143f2008-04-21 04:46:30 +00001077 subl $12, %esp
1078 movb _in_exit.4870.b, %al
Eli Friedman1f413032010-06-03 01:01:48 +00001079 cmpb $1, %al
1080 jne LBB0_2
1081
1082We somehow miss folding the movb into the cmpb.
Chris Lattnera89143f2008-04-21 04:46:30 +00001083
1084//===---------------------------------------------------------------------===//
Chris Lattner6e2bf7c2008-05-05 23:19:45 +00001085
1086We compile:
1087
1088int test(int x, int y) {
1089 return x-y-1;
1090}
1091
1092into (-m64):
1093
1094_test:
1095 decl %edi
1096 movl %edi, %eax
1097 subl %esi, %eax
1098 ret
1099
1100it would be better to codegen as: x+~y (notl+addl)
Torok Edwin33986d82008-10-24 19:23:07 +00001101
1102//===---------------------------------------------------------------------===//
1103
1104This code:
1105
1106int foo(const char *str,...)
1107{
1108 __builtin_va_list a; int x;
1109 __builtin_va_start(a,str); x = __builtin_va_arg(a,int); __builtin_va_end(a);
1110 return x;
1111}
1112
1113gets compiled into this on x86-64:
1114 subq $200, %rsp
1115 movaps %xmm7, 160(%rsp)
1116 movaps %xmm6, 144(%rsp)
1117 movaps %xmm5, 128(%rsp)
1118 movaps %xmm4, 112(%rsp)
1119 movaps %xmm3, 96(%rsp)
1120 movaps %xmm2, 80(%rsp)
1121 movaps %xmm1, 64(%rsp)
1122 movaps %xmm0, 48(%rsp)
1123 movq %r9, 40(%rsp)
1124 movq %r8, 32(%rsp)
1125 movq %rcx, 24(%rsp)
1126 movq %rdx, 16(%rsp)
1127 movq %rsi, 8(%rsp)
1128 leaq (%rsp), %rax
1129 movq %rax, 192(%rsp)
1130 leaq 208(%rsp), %rax
1131 movq %rax, 184(%rsp)
1132 movl $48, 180(%rsp)
1133 movl $8, 176(%rsp)
1134 movl 176(%rsp), %eax
1135 cmpl $47, %eax
1136 jbe .LBB1_3 # bb
1137.LBB1_1: # bb3
1138 movq 184(%rsp), %rcx
1139 leaq 8(%rcx), %rax
1140 movq %rax, 184(%rsp)
1141.LBB1_2: # bb4
1142 movl (%rcx), %eax
1143 addq $200, %rsp
1144 ret
1145.LBB1_3: # bb
1146 movl %eax, %ecx
1147 addl $8, %eax
1148 addq 192(%rsp), %rcx
1149 movl %eax, 176(%rsp)
1150 jmp .LBB1_2 # bb4
1151
1152gcc 4.3 generates:
1153 subq $96, %rsp
1154.LCFI0:
1155 leaq 104(%rsp), %rax
1156 movq %rsi, -80(%rsp)
1157 movl $8, -120(%rsp)
1158 movq %rax, -112(%rsp)
1159 leaq -88(%rsp), %rax
1160 movq %rax, -104(%rsp)
1161 movl $8, %eax
1162 cmpl $48, %eax
1163 jb .L6
1164 movq -112(%rsp), %rdx
1165 movl (%rdx), %eax
1166 addq $96, %rsp
1167 ret
1168 .p2align 4,,10
1169 .p2align 3
1170.L6:
1171 mov %eax, %edx
1172 addq -104(%rsp), %rdx
1173 addl $8, %eax
1174 movl %eax, -120(%rsp)
1175 movl (%rdx), %eax
1176 addq $96, %rsp
1177 ret
1178
1179and it gets compiled into this on x86:
1180 pushl %ebp
1181 movl %esp, %ebp
1182 subl $4, %esp
1183 leal 12(%ebp), %eax
1184 movl %eax, -4(%ebp)
1185 leal 16(%ebp), %eax
1186 movl %eax, -4(%ebp)
1187 movl 12(%ebp), %eax
1188 addl $4, %esp
1189 popl %ebp
1190 ret
1191
1192gcc 4.3 generates:
1193 pushl %ebp
1194 movl %esp, %ebp
1195 movl 12(%ebp), %eax
1196 popl %ebp
1197 ret
Evan Cheng2d1937ed2008-11-11 17:35:52 +00001198
1199//===---------------------------------------------------------------------===//
1200
1201Teach tblgen not to check bitconvert source type in some cases. This allows us
1202to consolidate the following patterns in X86InstrMMX.td:
1203
1204def : Pat<(v2i32 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1205 (iPTR 0))))),
1206 (v2i32 (MMX_MOVDQ2Qrr VR128:$src))>;
1207def : Pat<(v4i16 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1208 (iPTR 0))))),
1209 (v4i16 (MMX_MOVDQ2Qrr VR128:$src))>;
1210def : Pat<(v8i8 (bitconvert (i64 (vector_extract (v2i64 VR128:$src),
1211 (iPTR 0))))),
1212 (v8i8 (MMX_MOVDQ2Qrr VR128:$src))>;
1213
1214There are other cases in various td files.
Eli Friedmane9ef1702008-11-30 07:52:27 +00001215
1216//===---------------------------------------------------------------------===//
1217
1218Take something like the following on x86-32:
1219unsigned a(unsigned long long x, unsigned y) {return x % y;}
1220
1221We currently generate a libcall, but we really shouldn't: the expansion is
1222shorter and likely faster than the libcall. The expected code is something
1223like the following:
1224
1225 movl 12(%ebp), %eax
1226 movl 16(%ebp), %ecx
1227 xorl %edx, %edx
1228 divl %ecx
1229 movl 8(%ebp), %eax
1230 divl %ecx
1231 movl %edx, %eax
1232 ret
1233
1234A similar code sequence works for division.
1235
1236//===---------------------------------------------------------------------===//
Chris Lattner527dd602008-12-06 22:49:05 +00001237
Chris Lattner1aca40e2009-02-08 20:44:19 +00001238We currently compile this:
1239
1240define i32 @func1(i32 %v1, i32 %v2) nounwind {
1241entry:
1242 %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
1243 %sum = extractvalue {i32, i1} %t, 0
1244 %obit = extractvalue {i32, i1} %t, 1
1245 br i1 %obit, label %overflow, label %normal
1246normal:
1247 ret i32 %sum
1248overflow:
1249 call void @llvm.trap()
1250 unreachable
1251}
1252declare {i32, i1} @llvm.sadd.with.overflow.i32(i32, i32)
1253declare void @llvm.trap()
1254
1255to:
1256
1257_func1:
1258 movl 4(%esp), %eax
1259 addl 8(%esp), %eax
1260 jo LBB1_2 ## overflow
1261LBB1_1: ## normal
1262 ret
1263LBB1_2: ## overflow
1264 ud2
1265
1266it would be nice to produce "into" someday.
1267
1268//===---------------------------------------------------------------------===//
Chris Lattnercba4b6f2009-02-17 01:16:14 +00001269
Dan Gohmand5b35ee2009-03-09 23:47:02 +00001270Test instructions can be eliminated by using EFLAGS values from arithmetic
Dan Gohmanb0d40092009-03-10 00:26:23 +00001271instructions. This is currently not done for mul, and, or, xor, neg, shl,
1272sra, srl, shld, shrd, atomic ops, and others. It is also currently not done
1273for read-modify-write instructions. It is also currently not done if the
1274OF or CF flags are needed.
Dan Gohmand5b35ee2009-03-09 23:47:02 +00001275
1276The shift operators have the complication that when the shift count is
1277zero, EFLAGS is not set, so they can only subsume a test instruction if
Dan Gohmanb0d40092009-03-10 00:26:23 +00001278the shift count is known to be non-zero. Also, using the EFLAGS value
1279from a shift is apparently very slow on some x86 implementations.
Dan Gohmand5b35ee2009-03-09 23:47:02 +00001280
1281In read-modify-write instructions, the root node in the isel match is
1282the store, and isel has no way for the use of the EFLAGS result of the
1283arithmetic to be remapped to the new node.
1284
Dan Gohmanb0d40092009-03-10 00:26:23 +00001285Add and subtract instructions set OF on signed overflow and CF on unsigned
1286overflow, while test instructions always clear OF and CF. In order to
1287replace a test with an add or subtract in a situation where OF or CF is
1288needed, codegen must be able to prove that the operation cannot see
1289signed or unsigned overflow, respectively.
1290
Dan Gohmand5b35ee2009-03-09 23:47:02 +00001291//===---------------------------------------------------------------------===//
1292
Chris Lattner393ac622009-03-08 03:04:26 +00001293memcpy/memmove do not lower to SSE copies when possible. A silly example is:
1294define <16 x float> @foo(<16 x float> %A) nounwind {
1295 %tmp = alloca <16 x float>, align 16
1296 %tmp2 = alloca <16 x float>, align 16
1297 store <16 x float> %A, <16 x float>* %tmp
1298 %s = bitcast <16 x float>* %tmp to i8*
1299 %s2 = bitcast <16 x float>* %tmp2 to i8*
1300 call void @llvm.memcpy.i64(i8* %s, i8* %s2, i64 64, i32 16)
1301 %R = load <16 x float>* %tmp2
1302 ret <16 x float> %R
1303}
1304
1305declare void @llvm.memcpy.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
1306
1307which compiles to:
1308
1309_foo:
1310 subl $140, %esp
1311 movaps %xmm3, 112(%esp)
1312 movaps %xmm2, 96(%esp)
1313 movaps %xmm1, 80(%esp)
1314 movaps %xmm0, 64(%esp)
1315 movl 60(%esp), %eax
1316 movl %eax, 124(%esp)
1317 movl 56(%esp), %eax
1318 movl %eax, 120(%esp)
1319 movl 52(%esp), %eax
1320 <many many more 32-bit copies>
1321 movaps (%esp), %xmm0
1322 movaps 16(%esp), %xmm1
1323 movaps 32(%esp), %xmm2
1324 movaps 48(%esp), %xmm3
1325 addl $140, %esp
1326 ret
1327
1328On Nehalem, it may even be cheaper to just use movups when unaligned than to
1329fall back to lower-granularity chunks.
1330
1331//===---------------------------------------------------------------------===//
Chris Lattner9b650312009-05-25 20:28:19 +00001332
1333Implement processor-specific optimizations for parity with GCC on these
1334processors. GCC does two optimizations:
1335
13361. ix86_pad_returns inserts a noop before ret instructions if immediately
Chris Lattner0ab5e2c2011-04-15 05:18:47 +00001337 preceded by a conditional branch or is the target of a jump.
Chris Lattner9b650312009-05-25 20:28:19 +000013382. ix86_avoid_jump_misspredicts inserts noops in cases where a 16-byte block of
1339 code contains more than 3 branches.
1340
1341The first one is done for all AMDs, Core2, and "Generic"
1342The second one is done for: Atom, Pentium Pro, all AMDs, Pentium 4, Nocona,
1343 Core 2, and "Generic"
1344
1345//===---------------------------------------------------------------------===//
Eli Friedman32ad5e92009-06-11 23:07:04 +00001346Testcase:
1347int x(int a) { return (a&0xf0)>>4; }
1348
1349Current output:
1350 movl 4(%esp), %eax
1351 shrl $4, %eax
1352 andl $15, %eax
1353 ret
1354
1355Ideal output:
1356 movzbl 4(%esp), %eax
1357 shrl $4, %eax
1358 ret
1359
1360//===---------------------------------------------------------------------===//
1361
Evan Cheng92df9c32009-07-30 08:56:19 +00001362Re-implement atomic builtins __sync_add_and_fetch() and __sync_sub_and_fetch
1363properly.
1364
1365When the return value is not used (i.e. only care about the value in the
1366memory), x86 does not have to use add to implement these. Instead, it can use
1367add, sub, inc, dec instructions with the "lock" prefix.
1368
1369This is currently implemented using a bit of instruction selection trick. The
1370issue is the target independent pattern produces one output and a chain and we
1371want to map it into one that just output a chain. The current trick is to select
1372it into a MERGE_VALUES with the first definition being an implicit_def. The
1373proper solution is to add new ISD opcodes for the no-output variant. DAG
1374combiner can then transform the node before it gets to target node selection.
1375
1376Problem #2 is we are adding a whole bunch of x86 atomic instructions when in
1377fact these instructions are identical to the non-lock versions. We need a way to
1378add target specific information to target nodes and have this information
1379carried over to machine instructions. Asm printer (or JIT) can use this
1380information to add the "lock" prefix.
Bill Wendlinga2054022009-10-27 22:34:43 +00001381
1382//===---------------------------------------------------------------------===//
Eli Friedman4d4c6942010-02-10 21:26:04 +00001383
Rafael Espindola7a3b2442011-04-06 17:19:35 +00001384struct B {
1385 unsigned char y0 : 1;
1386};
Eli Friedman4d4c6942010-02-10 21:26:04 +00001387
Rafael Espindola7a3b2442011-04-06 17:19:35 +00001388int bar(struct B* a) { return a->y0; }
1389
1390define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
1391 %1 = getelementptr inbounds %struct.B* %a, i64 0, i32 0
1392 %2 = load i8* %1, align 1
1393 %3 = and i8 %2, 1
1394 %4 = zext i8 %3 to i32
1395 ret i32 %4
Eli Friedman4d4c6942010-02-10 21:26:04 +00001396}
1397
Rafael Espindola7a3b2442011-04-06 17:19:35 +00001398bar: # @bar
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001399# %bb.0:
Rafael Espindola7a3b2442011-04-06 17:19:35 +00001400 movb (%rdi), %al
1401 andb $1, %al
1402 movzbl %al, %eax
1403 ret
Eli Friedman4d4c6942010-02-10 21:26:04 +00001404
1405Missed optimization: should be movl+andl.
1406
1407//===---------------------------------------------------------------------===//
1408
Rafael Espindolab4dd95b2011-04-06 17:35:32 +00001409The x86_64 abi says:
1410
1411Booleans, when stored in a memory object, are stored as single byte objects the
1412value of which is always 0 (false) or 1 (true).
1413
1414We are not using this fact:
1415
1416int bar(_Bool *a) { return *a; }
1417
1418define i32 @bar(i8* nocapture %a) nounwind readonly optsize {
1419 %1 = load i8* %a, align 1, !tbaa !0
1420 %tmp = and i8 %1, 1
1421 %2 = zext i8 %tmp to i32
1422 ret i32 %2
1423}
1424
1425bar:
1426 movb (%rdi), %al
1427 andb $1, %al
1428 movzbl %al, %eax
1429 ret
1430
1431GCC produces
1432
1433bar:
1434 movzbl (%rdi), %eax
1435 ret
1436
1437//===---------------------------------------------------------------------===//
1438
Eli Friedman4d4c6942010-02-10 21:26:04 +00001439Take the following C code:
Eli Friedmanf75de6e2010-08-29 05:07:40 +00001440int f(int a, int b) { return (unsigned char)a == (unsigned char)b; }
1441
1442We generate the following IR with clang:
1443define i32 @f(i32 %a, i32 %b) nounwind readnone {
1444entry:
1445 %tmp = xor i32 %b, %a ; <i32> [#uses=1]
1446 %tmp6 = and i32 %tmp, 255 ; <i32> [#uses=1]
1447 %cmp = icmp eq i32 %tmp6, 0 ; <i1> [#uses=1]
1448 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1449 ret i32 %conv5
1450}
1451
1452And the following x86 code:
1453 xorl %esi, %edi
1454 testb $-1, %dil
1455 sete %al
1456 movzbl %al, %eax
1457 ret
1458
1459A cmpb instead of the xorl+testb would be one instruction shorter.
1460
1461//===---------------------------------------------------------------------===//
1462
1463Given the following C code:
1464int f(int a, int b) { return (signed char)a == (signed char)b; }
1465
1466We generate the following IR with clang:
1467define i32 @f(i32 %a, i32 %b) nounwind readnone {
1468entry:
1469 %sext = shl i32 %a, 24 ; <i32> [#uses=1]
1470 %conv1 = ashr i32 %sext, 24 ; <i32> [#uses=1]
1471 %sext6 = shl i32 %b, 24 ; <i32> [#uses=1]
1472 %conv4 = ashr i32 %sext6, 24 ; <i32> [#uses=1]
1473 %cmp = icmp eq i32 %conv1, %conv4 ; <i1> [#uses=1]
1474 %conv5 = zext i1 %cmp to i32 ; <i32> [#uses=1]
1475 ret i32 %conv5
1476}
1477
1478And the following x86 code:
1479 movsbl %sil, %eax
1480 movsbl %dil, %ecx
1481 cmpl %eax, %ecx
1482 sete %al
1483 movzbl %al, %eax
1484 ret
1485
1486
1487It should be possible to eliminate the sign extensions.
1488
1489//===---------------------------------------------------------------------===//
Dan Gohman3c9b5f32010-09-02 21:18:42 +00001490
1491LLVM misses a load+store narrowing opportunity in this code:
1492
1493%struct.bf = type { i64, i16, i16, i32 }
1494
1495@bfi = external global %struct.bf* ; <%struct.bf**> [#uses=2]
1496
1497define void @t1() nounwind ssp {
1498entry:
1499 %0 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1500 %1 = getelementptr %struct.bf* %0, i64 0, i32 1 ; <i16*> [#uses=1]
1501 %2 = bitcast i16* %1 to i32* ; <i32*> [#uses=2]
1502 %3 = load i32* %2, align 1 ; <i32> [#uses=1]
1503 %4 = and i32 %3, -65537 ; <i32> [#uses=1]
1504 store i32 %4, i32* %2, align 1
1505 %5 = load %struct.bf** @bfi, align 8 ; <%struct.bf*> [#uses=1]
1506 %6 = getelementptr %struct.bf* %5, i64 0, i32 1 ; <i16*> [#uses=1]
1507 %7 = bitcast i16* %6 to i32* ; <i32*> [#uses=2]
1508 %8 = load i32* %7, align 1 ; <i32> [#uses=1]
1509 %9 = and i32 %8, -131073 ; <i32> [#uses=1]
1510 store i32 %9, i32* %7, align 1
1511 ret void
1512}
1513
1514LLVM currently emits this:
1515
1516 movq bfi(%rip), %rax
1517 andl $-65537, 8(%rax)
1518 movq bfi(%rip), %rax
1519 andl $-131073, 8(%rax)
1520 ret
1521
1522It could narrow the loads and stores to emit this:
1523
1524 movq bfi(%rip), %rax
1525 andb $-2, 10(%rax)
1526 movq bfi(%rip), %rax
1527 andb $-3, 10(%rax)
1528 ret
1529
1530The trouble is that there is a TokenFactor between the store and the
1531load, making it non-trivial to determine if there's anything between
1532the load and the store which would prohibit narrowing.
1533
1534//===---------------------------------------------------------------------===//
Benjamin Kramerb37ae332010-12-23 15:07:02 +00001535
1536This code:
1537void foo(unsigned x) {
1538 if (x == 0) bar();
1539 else if (x == 1) qux();
1540}
1541
1542currently compiles into:
1543_foo:
1544 movl 4(%esp), %eax
1545 cmpl $1, %eax
1546 je LBB0_3
1547 testl %eax, %eax
1548 jne LBB0_4
1549
1550the testl could be removed:
1551_foo:
1552 movl 4(%esp), %eax
1553 cmpl $1, %eax
1554 je LBB0_3
1555 jb LBB0_4
1556
15570 is the only unsigned number < 1.
1558
1559//===---------------------------------------------------------------------===//
Chris Lattner51415d22011-01-02 18:31:38 +00001560
1561This code:
1562
1563%0 = type { i32, i1 }
1564
1565define i32 @add32carry(i32 %sum, i32 %x) nounwind readnone ssp {
1566entry:
1567 %uadd = tail call %0 @llvm.uadd.with.overflow.i32(i32 %sum, i32 %x)
1568 %cmp = extractvalue %0 %uadd, 1
1569 %inc = zext i1 %cmp to i32
1570 %add = add i32 %x, %sum
1571 %z.0 = add i32 %add, %inc
1572 ret i32 %z.0
1573}
1574
1575declare %0 @llvm.uadd.with.overflow.i32(i32, i32) nounwind readnone
1576
1577compiles to:
1578
1579_add32carry: ## @add32carry
1580 addl %esi, %edi
1581 sbbl %ecx, %ecx
1582 movl %edi, %eax
1583 subl %ecx, %eax
1584 ret
1585
1586But it could be:
1587
1588_add32carry:
1589 leal (%rsi,%rdi), %eax
1590 cmpl %esi, %eax
1591 adcl $0, %eax
1592 ret
1593
1594//===---------------------------------------------------------------------===//
Chris Lattner5237feb2011-02-21 17:03:47 +00001595
1596The hot loop of 256.bzip2 contains code that looks a bit like this:
1597
1598int foo(char *P, char *Q, int x, int y) {
1599 if (P[0] != Q[0])
1600 return P[0] < Q[0];
1601 if (P[1] != Q[1])
1602 return P[1] < Q[1];
1603 if (P[2] != Q[2])
1604 return P[2] < Q[2];
1605 return P[3] < Q[3];
1606}
1607
1608In the real code, we get a lot more wrong than this. However, even in this
1609code we generate:
1610
1611_foo: ## @foo
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001612## %bb.0: ## %entry
Chris Lattner5237feb2011-02-21 17:03:47 +00001613 movb (%rsi), %al
1614 movb (%rdi), %cl
1615 cmpb %al, %cl
1616 je LBB0_2
1617LBB0_1: ## %if.then
1618 cmpb %al, %cl
1619 jmp LBB0_5
1620LBB0_2: ## %if.end
1621 movb 1(%rsi), %al
1622 movb 1(%rdi), %cl
1623 cmpb %al, %cl
1624 jne LBB0_1
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001625## %bb.3: ## %if.end38
Chris Lattner5237feb2011-02-21 17:03:47 +00001626 movb 2(%rsi), %al
1627 movb 2(%rdi), %cl
1628 cmpb %al, %cl
1629 jne LBB0_1
Francis Visoiu Mistrih25528d62017-12-04 17:18:51 +00001630## %bb.4: ## %if.end60
Chris Lattner5237feb2011-02-21 17:03:47 +00001631 movb 3(%rdi), %al
1632 cmpb 3(%rsi), %al
1633LBB0_5: ## %if.end60
1634 setl %al
1635 movzbl %al, %eax
1636 ret
1637
1638Note that we generate jumps to LBB0_1 which does a redundant compare. The
1639redundant compare also forces the register values to be live, which prevents
1640folding one of the loads into the compare. In contrast, GCC 4.2 produces:
1641
1642_foo:
1643 movzbl (%rsi), %eax
1644 cmpb %al, (%rdi)
1645 jne L10
1646L12:
1647 movzbl 1(%rsi), %eax
1648 cmpb %al, 1(%rdi)
1649 jne L10
1650 movzbl 2(%rsi), %eax
1651 cmpb %al, 2(%rdi)
1652 jne L10
1653 movzbl 3(%rdi), %eax
1654 cmpb 3(%rsi), %al
1655L10:
1656 setl %al
1657 movzbl %al, %eax
1658 ret
1659
1660which is "perfect".
1661
1662//===---------------------------------------------------------------------===//
1663
Eli Friedmane8f2be02011-03-17 01:22:09 +00001664For the branch in the following code:
1665int a();
1666int b(int x, int y) {
1667 if (x & (1<<(y&7)))
1668 return a();
1669 return y;
1670}
1671
1672We currently generate:
1673 movb %sil, %al
1674 andb $7, %al
1675 movzbl %al, %eax
1676 btl %eax, %edi
1677 jae .LBB0_2
1678
1679movl+andl would be shorter than the movb+andb+movzbl sequence.
1680
1681//===---------------------------------------------------------------------===//
1682
1683For the following:
1684struct u1 {
1685 float x, y;
1686};
1687float foo(struct u1 u) {
1688 return u.x + u.y;
1689}
1690
1691We currently generate:
1692 movdqa %xmm0, %xmm1
1693 pshufd $1, %xmm0, %xmm0 # xmm0 = xmm0[1,0,0,0]
1694 addss %xmm1, %xmm0
1695 ret
1696
1697We could save an instruction here by commuting the addss.
1698
1699//===---------------------------------------------------------------------===//
Chris Lattner6f195462011-04-14 18:47:18 +00001700
1701This (from PR9661):
1702
1703float clamp_float(float a) {
1704 if (a > 1.0f)
1705 return 1.0f;
1706 else if (a < 0.0f)
1707 return 0.0f;
1708 else
1709 return a;
1710}
1711
1712Could compile to:
1713
1714clamp_float: # @clamp_float
1715 movss .LCPI0_0(%rip), %xmm1
1716 minss %xmm1, %xmm0
1717 pxor %xmm1, %xmm1
1718 maxss %xmm1, %xmm0
1719 ret
1720
1721with -ffast-math.
1722
1723//===---------------------------------------------------------------------===//
Chris Lattner2a75c722011-04-28 05:33:16 +00001724
1725This function (from PR9803):
1726
1727int clamp2(int a) {
1728 if (a > 5)
1729 a = 5;
1730 if (a < 0)
1731 return 0;
1732 return a;
1733}
1734
1735Compiles to:
1736
1737_clamp2: ## @clamp2
1738 pushq %rbp
1739 movq %rsp, %rbp
1740 cmpl $5, %edi
1741 movl $5, %ecx
1742 cmovlel %edi, %ecx
1743 testl %ecx, %ecx
1744 movl $0, %eax
1745 cmovnsl %ecx, %eax
1746 popq %rbp
1747 ret
1748
1749The move of 0 could be scheduled above the test to make it is xor reg,reg.
1750
1751//===---------------------------------------------------------------------===//
Chris Lattner1e81f572011-05-17 07:22:33 +00001752
1753GCC PR48986. We currently compile this:
1754
1755void bar(void);
1756void yyy(int* p) {
1757 if (__sync_fetch_and_add(p, -1) == 1)
1758 bar();
1759}
1760
1761into:
1762 movl $-1, %eax
1763 lock
1764 xaddl %eax, (%rdi)
1765 cmpl $1, %eax
1766 je LBB0_2
1767
1768Instead we could generate:
1769
1770 lock
1771 dec %rdi
1772 je LBB0_2
1773
1774The trick is to match "fetch_and_add(X, -C) == C".
1775
1776//===---------------------------------------------------------------------===//
Benjamin Kramer88d31b32012-03-30 13:02:58 +00001777
1778unsigned t(unsigned a, unsigned b) {
1779 return a <= b ? 5 : -5;
1780}
1781
1782We generate:
1783 movl $5, %ecx
1784 cmpl %esi, %edi
1785 movl $-5, %eax
1786 cmovbel %ecx, %eax
1787
1788GCC:
1789 cmpl %edi, %esi
1790 sbbl %eax, %eax
1791 andl $-10, %eax
1792 addl $5, %eax
Andrey Churbanov3535e042013-08-30 14:40:24 +00001793
Benjamin Kramer88d31b32012-03-30 13:02:58 +00001794//===---------------------------------------------------------------------===//