//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
             _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32:                                   ## @f32
        pshufd $1, %xmm0, %xmm1         ## xmm1 = xmm0[1,0,0,0]
        addss %xmm0, %xmm1
        pshufd $3, %xmm0, %xmm2         ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0            ## xmm0 = xmm0[1,1]
        movaps %xmm0, %xmm3
        addss %xmm1, %xmm3
        movdqa %xmm2, %xmm0
        addss %xmm3, %xmm0
        ret

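For reference, here is a hand-written sketch of the reduction we would like to
form, using SSE3 horizontal adds (an illustration, not the proposed lowering):

#include <pmmintrin.h>
float f32_hadd(__m128 A) {
  __m128 t = _mm_hadd_ps(A, A);   /* {a0+a1, a2+a3, a0+a1, a2+a3} */
  t = _mm_hadd_ps(t, t);          /* {sum, sum, sum, sum} */
  return _mm_cvtss_f32(t);
}
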
Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32:                                   ## @f32
        movdqa %xmm0, %xmm2
        addss %xmm1, %xmm2
        pshufd $1, %xmm1, %xmm1         ## xmm1 = xmm1[1,0,0,0]
        pshufd $1, %xmm0, %xmm3         ## xmm3 = xmm0[1,0,0,0]
        addss %xmm1, %xmm3
        movaps %xmm2, %xmm0
        unpcklps %xmm3, %xmm0           ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.


//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.

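As an illustration of the modes in question (not the proposed codegen), this is
roughly what "main" could do with the standard MXCSR intrinsics; the helper
name is hypothetical:

#include <xmmintrin.h>   /* _MM_SET_FLUSH_ZERO_MODE */
#include <pmmintrin.h>   /* _MM_SET_DENORMALS_ZERO_MODE */

static void enable_fast_sse_modes(void) {
  _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);          /* FTZ */
  _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON);  /* DAZ */
}
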
//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
        %C = select bool %B, double 123.412, double 523.01123123
        ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'.

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128-bit move instructions when it's
feasible.

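A minimal sketch of the expanded form for a small, known-size copy, using SSE2
intrinsics (the helper name and the multiple-of-16 assumption are ours):

#include <emmintrin.h>
#include <stddef.h>

static void copy128(void *dst, const void *src, size_t n) {
  /* n is assumed to be a multiple of 16 bytes. */
  for (size_t i = 0; i < n; i += 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)((const char *)src + i));
    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
  }
}
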
//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if (x^y & mask)
when using SSE.

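In C terms, the equivalence being exploited is just a sign-bit test; a sketch
(hypothetical helper, scalar code for clarity):

#include <stdint.h>
#include <string.h>

int same_sign(double x, double y) {
  uint64_t xi, yi;
  memcpy(&xi, &x, sizeof xi);   /* bitcast, as the SSE form would do in xmm */
  memcpy(&yi, &y, sizeof yi);
  return ((xi ^ yi) & 0x8000000000000000ULL) == 0;
}
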
//===---------------------------------------------------------------------===//

Use movhps to update the upper 64 bits of a v4sf value, and movlps for the
lower half.

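A sketch of the intrinsics-level idiom this refers to (hypothetical helpers);
movhps/movlps load 64 bits into one half of an XMM register and leave the
other half untouched:

#include <xmmintrin.h>

__m128 update_high(__m128 v, const float *p) {
  return _mm_loadh_pi(v, (const __m64 *)p);   /* movhps */
}
__m128 update_low(__m128 v, const float *p) {
  return _mm_loadl_pi(v, (const __m64 *)p);   /* movlps */
}
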
//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps (%edx), %xmm2            #59.21
        movaps (%edx), %xmm5            #60.21
        movaps (%edx), %xmm4            #61.21
        movaps (%edx), %xmm3            #62.21
        movl 40(%ecx), %ebp             #69.49
        shufps $0, %xmm2, %xmm5         #60.21
        movl 100(%esp), %ebx            #69.20
        movl (%ebx), %edi               #69.20
        imull %ebp, %edi                #69.49
        addl (%eax), %edi               #70.33
        shufps $85, %xmm2, %xmm4        #61.21
        shufps $170, %xmm2, %xmm3       #62.21
        shufps $255, %xmm2, %xmm2       #63.21
        lea (%ebp,%ebp,2), %ebx         #69.49
        negl %ebx                       #69.49
        lea -3(%edi,%ebx), %ebx         #70.33
        shll $4, %ebx                   #68.37
        addl 32(%ecx), %ebx             #68.37
        testb $15, %bl                  #91.13
        jne L_B1.24     # Prob 5%       #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
        %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %eax = MOV32ri -3
        %edx = MOV32rm %stack.3, 1, %noreg, 0
        ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
        %edx = MOV32rm %stack.7, 1, %noreg, 0
        %edx = MOV32rm %edx, 1, %noreg, 40
        IMUL32rr %eax<def&use>, %edx
        %esi = MOV32rm %stack.5, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 0
        MOV32mr %stack.4, 1, %noreg, 0, %esi
        %eax = LEA32r %esi, 1, %eax, -3
        %esi = MOV32rm %stack.7, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 32
        %edi = MOV32rr %eax
        SHL32ri %edi<def&use>, 4
        ADD32rr %edi<def&use>, %esi
        %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
        %xmm1 = MOVAPSrr %xmm0
        SHUFPSrr %xmm1<def&use>, %xmm1, 170
        %xmm2 = MOVAPSrr %xmm0
        SHUFPSrr %xmm2<def&use>, %xmm2, 0
        %xmm3 = MOVAPSrr %xmm0
        SHUFPSrr %xmm3<def&use>, %xmm3, 255
        SHUFPSrr %xmm0<def&use>, %xmm0, 85
        %ebx = MOV32rr %edi
        AND32ri8 %ebx<def&use>, 15
        CMP32ri8 %ebx, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode:
since it appears as operand two in more than one shufps op, the result is a
number of copies. Note that icc suffers from the same problem. Either the
instruction selector should select pshufd, or the register allocator could
perform the two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

movss 4(%esp), %xmm1
mulss %xmm1, %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

movss 4(%esp), %xmm0
mulss %xmm0, %xmm0
ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

movaps c(%esp), %xmm1
xorps %xmm0, %xmm0
movss %xmm1, %xmm0
ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

xorps %xmm0, %xmm0
movaps c2(%esp), %xmm1
movss %xmm1, %xmm0
ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

movaps c(%esp), %xmm1
movaps %xmm1, c2(%esp)
...

movss c2(%esp), %xmm0
ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in three zeros and the one needed element instead of all
four elements. This can be used to simplify a variety of shuffle operations,
where the other elements are fixed zeros.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2 ) {
        %xFloat0.688 = load float* %P
        %tmp = load <4 x float>* %P2
        %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
        store <4 x float> %inFloat3.713, <4 x float>* %P2
        ret void
}

Generates:

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        pxor %xmm1, %xmm1
        movaps %xmm0, %xmm2
        shufps $50, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.
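
For reference, a minimal sketch of the and/andnot/or select idiom (the mask is
assumed to be all-ones or all-zeros per lane, e.g. the result of cmpps):

#include <xmmintrin.h>

static __m128 sse_select(__m128 mask, __m128 a, __m128 b) {
  /* (mask & a) | (~mask & b) */
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}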

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merged four floats into a single 128-bit
load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1      ; xmm1 = all-ones

and:
        cmpeqps xmm1, xmm1      ; xmm1 = all-ones
        psrlq   xmm1, 31        ; xmm1 = all 100000000000...

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):

_x:
        movzwl 4(%esp), %eax
        movd %eax, %xmm0
        movaps _a, %xmm1
        pslld %xmm0, %xmm1
        movaps %xmm1, _a
        ret
_y:
        movd 4(%esp), %xmm0
        movaps _a, %xmm1
        pslld %xmm0, %xmm1
        movaps %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff to route the value through a
GPR. It seems like movd would be sufficient in both cases, as the value is
already zero extended in the 32-bit stack slot IIRC. For signed short it
should also be safe, as a genuinely negative value would be undefined for
pslld anyway.


//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl $12, %esp
        movsd 16(%esp), %xmm0
        movsd %xmm0, (%esp)
        movl 4(%esp), %eax
        shrl $31, %eax
        addl $12, %esp
        ret

We should use movmskp{s|d} instead.

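In C terms, the desired lowering corresponds to something like this sketch
(hypothetical helper, built on the movmskpd intrinsic):

#include <emmintrin.h>

int signbit_via_movmsk(double d) {
  /* movmskpd extracts the sign bits; bit 0 is the low element's sign. */
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;
}
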
//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1         ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0:                # <4 x float>
        .long 2147483648        # float -0
        .long 2147483648        # float -0
        .long 2147483648        # float -0
        .long 2147483648        # float -0
_ccosf:
        subl $12, %esp
        movss 16(%esp), %xmm0
        movss %xmm0, 4(%esp)
        movss 20(%esp), %xmm0
        xorps LCPI1_0, %xmm0
        movss %xmm0, (%esp)
        call L_ccoshf$stub
        addl $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the
destination is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 values simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as Legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be a vector spill or not.
The stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
  return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl %ebp
        movl %esp, %ebp
        andl $-16, %esp
        movaps .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl %ebp, %esp
        popl %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss 4(%esp), %xmm0
        pshufd $81, %xmm0, %xmm0
        ret

In x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps %xmm1, %xmm1
        movss %xmm0, %xmm1
        pshufd $81, %xmm1, %xmm0
        ret

In SSE4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1
        movaps %xmm1, %xmm2
        movlhps %xmm0, %xmm2
        movaps %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//
Anton Korobeynikova38e72d2008-05-11 14:33:15 +0000563
Chris Lattner3546c2b2008-05-23 04:29:53 +0000564We compile vector multiply-by-constant into poor code:
565
566define <4 x i32> @f(<4 x i32> %i) nounwind {
567 %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
568 ret <4 x i32> %A
569}
570
Dan Gohman66eea1b2008-05-23 18:05:39 +0000571On targets without SSE4.1, this compiles into:
Chris Lattner3546c2b2008-05-23 04:29:53 +0000572
573LCPI1_0: ## <4 x i32>
574 .long 10
575 .long 10
576 .long 10
577 .long 10
578 .text
579 .align 4,0x90
580 .globl _f
581_f:
582 pshufd $3, %xmm0, %xmm1
583 movd %xmm1, %eax
584 imull LCPI1_0+12, %eax
585 movd %eax, %xmm1
586 pshufd $1, %xmm0, %xmm2
587 movd %xmm2, %eax
588 imull LCPI1_0+4, %eax
589 movd %eax, %xmm2
590 punpckldq %xmm1, %xmm2
591 movd %xmm0, %eax
592 imull LCPI1_0, %eax
593 movd %eax, %xmm1
594 movhlps %xmm0, %xmm0
595 movd %xmm0, %eax
596 imull LCPI1_0+8, %eax
597 movd %eax, %xmm0
598 punpckldq %xmm0, %xmm1
599 movaps %xmm1, %xmm0
600 punpckldq %xmm2, %xmm0
601 ret
Evan Chengd25cb8e2008-05-23 17:28:11 +0000602
Dan Gohman66eea1b2008-05-23 18:05:39 +0000603It would be better to synthesize integer vector multiplication by constants
604using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
605simple cases such as multiplication by powers of two would be better as
606vector shifts than as multiplications.
607
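A minimal sketch of the shift-and-add expansion for the x*10 case above, as
SSE2 intrinsics (hypothetical helper; 10*x = 8*x + 2*x):

#include <emmintrin.h>

static __m128i mul10_epi32(__m128i x) {
  /* Two shifts and an add instead of four scalar imulls. */
  return _mm_add_epi32(_mm_slli_epi32(x, 3), _mm_slli_epi32(x, 1));
}
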
//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl $1, %eax
        xorps %xmm0, %xmm0
        pinsrw $2, %eax, %xmm0
        movzbl 4(%esp), %eax
        pinsrw $3, %eax, %xmm0
        movl $256, %eax
        pinsrw $7, %eax, %xmm0
        ret


gcc-4.2:
        subl $12, %esp
        movzbl 16(%esp), %eax
        movdqa LC0, %xmm0
        pinsrw $3, %eax, %xmm0
        addl $12, %esp
        ret
        .const
        .align 4
LC0:
        .word 0
        .word 0
        .word 1
        .word 0
        .word 0
        .word 0
        .word 0
        .word 256

With SSE4, it should be:
        movdqa .LC0(%rip), %xmm0
        pinsrb $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Similarly, insertelement of a constant into a vector of
constants should result in a vector of constants, e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align 4
LCPI1_1:                ## float
        .long 1065353216        ## float 1
        .const

        .align 4
LCPI1_0:                ## <4 x float>
        .space 4
        .long 1065353216        ## float 1
        .space 4
        .long 1065353216        ## float 1
        .text
        .align 4,0x90
        .globl _t
_t:
        xorps %xmm0, %xmm0
        movhps LCPI1_0, %xmm0
        movss LCPI1_1, %xmm1
        movaps %xmm0, %xmm2
        shufps $2, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float          ; <float> [#uses=1]
  ret float %tmp12
}

which compiles to:

_foo:
        subl $4, %esp
        movzbl 8(%esp), %eax
        cvtsi2ss %eax, %xmm0
        movss %xmm0, (%esp)
        flds (%esp)
        addl $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2, but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic-programming-based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this,
for example:

double MonteCarlo_num_flops(int Num_samples) {
  return ((double) Num_samples) * 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long 1082130432        ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl $4, %esp
        movl 8(%esp), %eax
        movl %eax, (%esp)
        fildl (%esp)
        fmuls LCPI1_0
        addl $4, %esp
        ret

In SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl $12, %esp
        cvtsi2sd 16(%esp), %xmm0
        mulsd LCPI1_0, %xmm0
        movsd %xmm0, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

There are also other cases in SciMark where using the FP stack is better; for
example, it is cheaper to do fld1 than to load from a constant pool, so
"load, add 1.0, store" is better done on the FP stack, etc.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214); perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0                  ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64           ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128                ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32                   ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32               ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float            ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test:                                  ## @test
        movd %xmm0, %rax
        shrq $32, %rax
        movl %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4 x float and
doing a shuffle from v[1] to v[0], then a float store.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div
is slow and not pipelined. In -ffast-math mode we could compute "1.0/scale"
first and emit 3 mulsd in place of the divs. This can be done as a
target-independent transform.

If we're dealing with floats instead of doubles, we could even replace the
sqrtss and inversion with an rsqrtss instruction, which computes 1/sqrt faster
at the cost of reduced accuracy.

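A sketch of the -ffast-math rewrite described above, written by hand (this is
source-level pseudocode for the transform, not what we currently emit):

void foo(double, double, double);
void norm_fast(double x, double y, double z) {
  double inv = 1.0 / __builtin_sqrt(x*x + y*y + z*z);
  foo(x*inv, y*inv, z*inv);   /* three mulsd instead of three divsd */
}
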
//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.

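A sketch of the two-operand case mentioned above, using the same GCC/Clang
vector extensions as the example (the function name is hypothetical):

#include <x86intrin.h>

__m128d g(__m128d v, __m128d w) {
  /* Ideally a single hsubpd. */
  return (__m128d){ v[0] - v[1], w[0] - w[1] };
}
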
841//===---------------------------------------------------------------------===//
Benjamin Kramer189fc582013-02-17 23:34:14 +0000842
843define <2 x i32> @foo(<2 x double> %in) {
844 %x = fptosi <2 x double> %in to <2 x i32>
845 ret <2 x i32> %x
846}
847
848Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
849
850//===---------------------------------------------------------------------===//