| //===---------------------------------------------------------------------===// |
| // Random ideas for the X86 backend: SSE-specific stuff. |
| //===---------------------------------------------------------------------===// |
| |
| //===---------------------------------------------------------------------===// |
| |
| SSE Variable shift can be custom lowered to something like this, which uses a |
| small table + unaligned load + shuffle instead of going through memory. |
| |
| __m128i_shift_right: |
| .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
| .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 |
| |
| ... |
| __m128i shift_right(__m128i value, unsigned long offset) { |
| return _mm_shuffle_epi8(value, |
| _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset))); |
| } |
| |
| //===---------------------------------------------------------------------===// |
| |
| SSE has instructions for doing operations on complex numbers, we should pattern |
| match them. For example, this should turn into a horizontal add: |
| |
| typedef float __attribute__((vector_size(16))) v4f32; |
| float f32(v4f32 A) { |
| return A[0]+A[1]+A[2]+A[3]; |
| } |
| |
| Instead we get this: |
| |
| _f32: ## @f32 |
| pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0] |
| addss %xmm0, %xmm1 |
| pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0] |
| movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1] |
| movaps %xmm0, %xmm3 |
| addss %xmm1, %xmm3 |
| movdqa %xmm2, %xmm0 |
| addss %xmm3, %xmm0 |
| ret |
| |
| Also, there are cases where some simple local SLP would improve codegen a bit. |
| Compiling this: |
| |
| _Complex float f32(_Complex float A, _Complex float B) { |
| return A+B; |
| } |
| |
| into: |
| |
| _f32: ## @f32 |
| movdqa %xmm0, %xmm2 |
| addss %xmm1, %xmm2 |
| pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0] |
| pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0] |
| addss %xmm1, %xmm3 |
| movaps %xmm2, %xmm0 |
| unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] |
| ret |
| |
| seems silly when it could just be one addps. |
| |
| |
| //===---------------------------------------------------------------------===// |
| |
| Expand libm rounding functions inline: Significant speedups possible. |
| http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html |
| |
| //===---------------------------------------------------------------------===// |
| |
| When compiled with unsafemath enabled, "main" should enable SSE DAZ mode and |
| other fast SSE modes. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Think about doing i64 math in SSE regs on x86-32. |
| |
| //===---------------------------------------------------------------------===// |
| |
| This testcase should have no SSE instructions in it, and only one load from |
| a constant pool: |
| |
| double %test3(bool %B) { |
| %C = select bool %B, double 123.412, double 523.01123123 |
| ret double %C |
| } |
| |
| Currently, the select is being lowered, which prevents the dag combiner from |
| turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)' |
| |
| The pattern isel got this one right. |
| |
| //===---------------------------------------------------------------------===// |
| |
| SSE should implement 'select_cc' using 'emulated conditional moves' that use |
| pcmp/pand/pandn/por to do a selection instead of a conditional branch: |
| |
| double %X(double %Y, double %Z, double %A, double %B) { |
| %C = setlt double %A, %B |
| %z = fadd double %Z, 0.0 ;; select operand is not a load |
| %D = select bool %C, double %Y, double %z |
| ret double %D |
| } |
| |
| We currently emit: |
| |
| _X: |
| subl $12, %esp |
| xorpd %xmm0, %xmm0 |
| addsd 24(%esp), %xmm0 |
| movsd 32(%esp), %xmm1 |
| movsd 16(%esp), %xmm2 |
| ucomisd 40(%esp), %xmm1 |
| jb LBB_X_2 |
| LBB_X_1: |
| movsd %xmm0, %xmm2 |
| LBB_X_2: |
| movsd %xmm2, (%esp) |
| fldl (%esp) |
| addl $12, %esp |
| ret |
| |
| //===---------------------------------------------------------------------===// |
| |
| Lower memcpy / memset to a series of SSE 128 bit move instructions when it's |
| feasible. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Codegen: |
| if (copysign(1.0, x) == copysign(1.0, y)) |
| into: |
| if (x^y & mask) |
| when using SSE. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half |
| of a v4sf value. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Better codegen for vector_shuffles like this { x, 0, 0, 0 } or { x, 0, x, 0}. |
| Perhaps use pxor / xorp* to clear an XMM register first? |
| |
| //===---------------------------------------------------------------------===// |
| |
| External test Nurbs exposed some problems. Look for |
| __ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc |
| emits: |
| |
| movaps (%edx), %xmm2 #59.21 |
| movaps (%edx), %xmm5 #60.21 |
| movaps (%edx), %xmm4 #61.21 |
| movaps (%edx), %xmm3 #62.21 |
| movl 40(%ecx), %ebp #69.49 |
| shufps $0, %xmm2, %xmm5 #60.21 |
| movl 100(%esp), %ebx #69.20 |
| movl (%ebx), %edi #69.20 |
| imull %ebp, %edi #69.49 |
| addl (%eax), %edi #70.33 |
| shufps $85, %xmm2, %xmm4 #61.21 |
| shufps $170, %xmm2, %xmm3 #62.21 |
| shufps $255, %xmm2, %xmm2 #63.21 |
| lea (%ebp,%ebp,2), %ebx #69.49 |
| negl %ebx #69.49 |
| lea -3(%edi,%ebx), %ebx #70.33 |
| shll $4, %ebx #68.37 |
| addl 32(%ecx), %ebx #68.37 |
| testb $15, %bl #91.13 |
| jne L_B1.24 # Prob 5% #91.13 |
| |
| This is the llvm code after instruction scheduling: |
| |
| cond_next140 (0xa910740, LLVM BB @0xa90beb0): |
| %reg1078 = MOV32ri -3 |
| %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 |
| %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 |
| %reg1080 = IMUL32rr %reg1079, %reg1037 |
| %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 |
| %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 |
| %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 |
| %reg1082 = SHL32ri %reg1038, 4 |
| %reg1039 = ADD32rr %reg1036, %reg1082 |
| %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 |
| %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 |
| %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 |
| %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 |
| %reg1033 = SHUFPSrr %reg1083, %reg1083, 85 |
| %reg1040 = MOV32rr %reg1039 |
| %reg1084 = AND32ri8 %reg1039, 15 |
| CMP32ri8 %reg1084, 0 |
| JE mbb<cond_next204,0xa914d30> |
| |
| Still ok. After register allocation: |
| |
| cond_next140 (0xa910740, LLVM BB @0xa90beb0): |
| %EAX = MOV32ri -3 |
| %EDX = MOV32rm <fi#3>, 1, %NOREG, 0 |
| ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0 |
| %EDX = MOV32rm <fi#7>, 1, %NOREG, 0 |
| %EDX = MOV32rm %EDX, 1, %NOREG, 40 |
| IMUL32rr %EAX<def&use>, %EDX |
| %ESI = MOV32rm <fi#5>, 1, %NOREG, 0 |
| %ESI = MOV32rm %ESI, 1, %NOREG, 0 |
| MOV32mr <fi#4>, 1, %NOREG, 0, %ESI |
| %EAX = LEA32r %ESI, 1, %EAX, -3 |
| %ESI = MOV32rm <fi#7>, 1, %NOREG, 0 |
| %ESI = MOV32rm %ESI, 1, %NOREG, 32 |
| %EDI = MOV32rr %EAX |
| SHL32ri %EDI<def&use>, 4 |
| ADD32rr %EDI<def&use>, %ESI |
| %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 |
| %XMM1 = MOVAPSrr %XMM0 |
| SHUFPSrr %XMM1<def&use>, %XMM1, 170 |
| %XMM2 = MOVAPSrr %XMM0 |
| SHUFPSrr %XMM2<def&use>, %XMM2, 0 |
| %XMM3 = MOVAPSrr %XMM0 |
| SHUFPSrr %XMM3<def&use>, %XMM3, 255 |
| SHUFPSrr %XMM0<def&use>, %XMM0, 85 |
| %EBX = MOV32rr %EDI |
| AND32ri8 %EBX<def&use>, 15 |
| CMP32ri8 %EBX, 0 |
| JE mbb<cond_next204,0xa914d30> |
| |
| This looks really bad. The problem is that shufps is a destructive opcode. Since |
| the same register appears as operand two in more than one shufps op, a number |
| of copies result. Note that icc also suffers from the same problem. Either the |
| instruction selector should select pshufd, or the register allocator can make |
| the two-address to three-address transformation. |
| |
| It also exposes some other problems. See MOV32ri -3 and the spills. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Consider: |
| |
| __m128 test(float a) { |
| return _mm_set_ps(0.0, 0.0, 0.0, a*a); |
| } |
| |
| This compiles into: |
| |
| movss 4(%esp), %xmm1 |
| mulss %xmm1, %xmm1 |
| xorps %xmm0, %xmm0 |
| movss %xmm1, %xmm0 |
| ret |
| |
| Because the movss load zeros the top 3 elements and mulss doesn't modify them, |
| the top elements of xmm1 are already zero'd. We could compile this to: |
| |
| movss 4(%esp), %xmm0 |
| mulss %xmm0, %xmm0 |
| ret |
| |
| //===---------------------------------------------------------------------===// |
| |
| Here's a sick and twisted idea. Consider code like this: |
| |
| __m128 test(__m128 a) { |
| float b = *(float*)&a; |
| ... |
| return _mm_set_ps(0.0, 0.0, 0.0, b); |
| } |
| |
| This might compile to this code: |
| |
| movaps c(%esp), %xmm1 |
| xorps %xmm0, %xmm0 |
| movss %xmm1, %xmm0 |
| ret |
| |
| Now consider if the ... code caused xmm1 to get spilled. This might produce |
| this code: |
| |
| movaps c(%esp), %xmm1 |
| movaps %xmm1, c2(%esp) |
| ... |
| |
| xorps %xmm0, %xmm0 |
| movaps c2(%esp), %xmm1 |
| movss %xmm1, %xmm0 |
| ret |
| |
| However, since the reload is only used by these instructions, we could |
| "fold" it into the uses, producing something like this: |
| |
| movaps c(%esp), %xmm1 |
| movaps %xmm1, c2(%esp) |
| ... |
| |
| movss c2(%esp), %xmm0 |
| ret |
| |
| ... saving two instructions. |
| |
| The basic idea is that a reload from a spill slot, can, if only one 4-byte |
| chunk is used, bring in 3 zeros and the one element instead of 4 elements. |
| This can be used to simplify a variety of shuffle operations, where the |
| elements are fixed zeros. |
| |
| //===---------------------------------------------------------------------===// |
| |
| This code generates ugly code, probably due to costs being off or something: |
| |
| define void @test(float* %P, <4 x float>* %P2 ) { |
| %xFloat0.688 = load float* %P |
| %tmp = load <4 x float>* %P2 |
| %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3 |
| store <4 x float> %inFloat3.713, <4 x float>* %P2 |
| ret void |
| } |
| |
| Generates: |
| |
| _test: |
| movl 8(%esp), %eax |
| movaps (%eax), %xmm0 |
| pxor %xmm1, %xmm1 |
| movaps %xmm0, %xmm2 |
| shufps $50, %xmm1, %xmm2 |
| shufps $132, %xmm2, %xmm0 |
| movaps %xmm0, (%eax) |
| ret |
| |
| Would it be better to generate: |
| |
| _test: |
| movl 8(%esp), %ecx |
| movaps (%ecx), %xmm0 |
| xor %eax, %eax |
| pinsrw $6, %eax, %xmm0 |
| pinsrw $7, %eax, %xmm0 |
| movaps %xmm0, (%ecx) |
| ret |
| |
| ? |
| |
| //===---------------------------------------------------------------------===// |
| |
| Some useful information in the Apple Altivec / SSE Migration Guide: |
| |
| http://developer.apple.com/documentation/Performance/Conceptual/ |
| Accelerate_sse_migration/index.html |
| |
| e.g. SSE select using and, andnot, or. Various SSE compare translations. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Add hooks to commute some CMPP operations. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Apply the same transformation that merged four float into a single 128-bit load |
| to loads from constant pool. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Floating point max / min are commutable when -enable-unsafe-fp-math is |
| specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other |
| nodes which are selected to max / min instructions that are marked commutable. |
| |
| //===---------------------------------------------------------------------===// |
| |
| We should materialize vector constants like "all ones" and "signbit" with |
| code like: |
| |
| cmpeqps xmm1, xmm1 ; xmm1 = all-ones |
| |
| and: |
| cmpeqps xmm1, xmm1 ; xmm1 = all-ones |
| psrlq xmm1, 31 ; xmm1 = all 100000000000... |
| |
| instead of using a load from the constant pool. The latter is important for |
| ABS/NEG/copysign etc. |
| |
| //===---------------------------------------------------------------------===// |
| |
| These functions: |
| |
| #include <xmmintrin.h> |
| __m128i a; |
| void x(unsigned short n) { |
| a = _mm_slli_epi32 (a, n); |
| } |
| void y(unsigned n) { |
| a = _mm_slli_epi32 (a, n); |
| } |
| |
| compile to ( -O3 -static -fomit-frame-pointer): |
| _x: |
| movzwl 4(%esp), %eax |
| movd %eax, %xmm0 |
| movaps _a, %xmm1 |
| pslld %xmm0, %xmm1 |
| movaps %xmm1, _a |
| ret |
| _y: |
| movd 4(%esp), %xmm0 |
| movaps _a, %xmm1 |
| pslld %xmm0, %xmm1 |
| movaps %xmm1, _a |
| ret |
| |
| "y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems |
| like movd would be sufficient in both cases as the value is already zero |
| extended in the 32-bit stack slot IIRC. For signed short, it should also be |
| safe, as a really-signed value would be undefined for pslld. |
| |
| |
| //===---------------------------------------------------------------------===// |
| |
| #include <math.h> |
| int t1(double d) { return signbit(d); } |
| |
| This currently compiles to: |
| subl $12, %esp |
| movsd 16(%esp), %xmm0 |
| movsd %xmm0, (%esp) |
| movl 4(%esp), %eax |
| shrl $31, %eax |
| addl $12, %esp |
| ret |
| |
| We should use movmskp{s|d} instead. |
| |
| //===---------------------------------------------------------------------===// |
| |
| CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single |
| (aligned) vector load. This functionality has a couple of problems. |
| |
| 1. The code to infer alignment from loads of globals is in the X86 backend, |
| not the dag combiner. This is because dagcombine2 needs to be able to see |
| through the X86ISD::Wrapper node, which DAGCombine can't really do. |
| 2. The code for turning 4 x load into a single vector load is target |
| independent and should be moved to the dag combiner. |
| 3. The code for turning 4 x load into a vector load can only handle a direct |
| load from a global or a direct load from the stack. It should be generalized |
| to handle any load from P, P+4, P+8, P+12, where P can be anything. |
| 4. The alignment inference code cannot handle loads from globals in non-static |
| mode because it doesn't look through the extra dyld stub load. If you try |
| vec_align.ll without -relocation-model=static, you'll see what I mean. |
| |
| //===---------------------------------------------------------------------===// |
| |
| We should lower store(fneg(load p), q) into an integer load+xor+store, which |
| eliminates a constant pool load. For example, consider: |
| |
| define i64 @ccosf(float %z.0, float %z.1) nounwind readonly { |
| entry: |
| %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1] |
| %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly |
| ret i64 %tmp20 |
| } |
| declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly |
| |
| This currently compiles to: |
| |
| LCPI1_0: # <4 x float> |
| .long 2147483648 # float -0 |
| .long 2147483648 # float -0 |
| .long 2147483648 # float -0 |
| .long 2147483648 # float -0 |
| _ccosf: |
| subl $12, %esp |
| movss 16(%esp), %xmm0 |
| movss %xmm0, 4(%esp) |
| movss 20(%esp), %xmm0 |
| xorps LCPI1_0, %xmm0 |
| movss %xmm0, (%esp) |
| call L_ccoshf$stub |
| addl $12, %esp |
| ret |
| |
| Note the load into xmm0, then xor (to negate), then store. In PIC mode, |
| this code computes the pic base and does two loads to do the constant pool |
| load, so the improvement is much bigger. |
| |
| The tricky part about this xform is that the argument load/store isn't exposed |
| until post-legalize, and at that point, the fneg has been custom expanded into |
| an X86 fxor. This means that we need to handle this case in the x86 backend |
| instead of in target independent code. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Non-SSE4 insert into 16 x i8 is atrociously bad. |
| |
| //===---------------------------------------------------------------------===// |
| |
| <2 x i64> extract is substantially worse than <2 x f64>, even if the destination |
| is memory. |
| |
| //===---------------------------------------------------------------------===// |
| |
| SSE4 extract-to-mem ops aren't being pattern matched because of the AssertZext |
| sitting between the truncate and the extract. |
| |
| //===---------------------------------------------------------------------===// |
| |
| INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert |
| any number of 0.0 simultaneously. Currently we only use it for simple |
| insertions. |
| |
| See comments in LowerINSERT_VECTOR_ELT_SSE4. |
| |
| //===---------------------------------------------------------------------===// |
| |
| On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not |
| Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are |
| legal, it'll just take a few extra patterns written in the .td file. |
| |
| Note: this is not a code quality issue; the custom lowered code happens to be |
| right, but we shouldn't have to custom lower anything. This is probably related |
| to <2 x i64> ops being so bad. |
| |
| //===---------------------------------------------------------------------===// |
| |
| 'select' on vectors and scalars could be a whole lot better. We currently |
| lower them to conditional branches. On x86-64 for example, we compile this: |
| |
| double test(double a, double b, double c, double d) { return a<b ? c : d; } |
| |
| to: |
| |
| _test: |
| ucomisd %xmm0, %xmm1 |
| ja LBB1_2 # entry |
| LBB1_1: # entry |
| movapd %xmm3, %xmm2 |
| LBB1_2: # entry |
| movapd %xmm2, %xmm0 |
| ret |
| |
| instead of: |
| |
| _test: |
| cmpltsd %xmm1, %xmm0 |
| andpd %xmm0, %xmm2 |
| andnpd %xmm3, %xmm0 |
| orpd %xmm2, %xmm0 |
| ret |
| |
| For unpredictable branches, the latter is much more efficient. This should |
| just be a matter of having scalar sse map to SELECT_CC and custom expanding |
| or iseling it. |
| |
| //===---------------------------------------------------------------------===// |
| |
| LLVM currently generates stack realignment code even when it is not actually |
| needed. The problem is that we need to know about stack alignment too early, |
| before RA runs. |
| |
| At that point we don't know whether there will be a vector spill or not. |
| Stack realignment logic is overly conservative here, but otherwise we can |
| produce unaligned loads/stores. |
| |
| Fixing this will require some huge RA changes. |
| |
| Testcase: |
| #include <emmintrin.h> |
| |
| typedef short vSInt16 __attribute__ ((__vector_size__ (16))); |
| |
| static const vSInt16 a = {- 22725, - 12873, - 22725, - 12873, - 22725, - 12873, |
| - 22725, - 12873};; |
| |
| vSInt16 madd(vSInt16 b) |
| { |
| return _mm_madd_epi16(a, b); |
| } |
| |
| Generated code (x86-32, linux): |
| madd: |
| pushl %ebp |
| movl %esp, %ebp |
| andl $-16, %esp |
| movaps .LCPI1_0, %xmm1 |
| pmaddwd %xmm1, %xmm0 |
| movl %ebp, %esp |
| popl %ebp |
| ret |
| |
| //===---------------------------------------------------------------------===// |
| |
| Consider: |
| #include <emmintrin.h> |
| __m128 foo2 (float x) { |
| return _mm_set_ps (0, 0, x, 0); |
| } |
| |
| In x86-32 mode, we generate this spiffy code: |
| |
| _foo2: |
| movss 4(%esp), %xmm0 |
| pshufd $81, %xmm0, %xmm0 |
| ret |
| |
| in x86-64 mode, we generate this code, which could be better: |
| |
| _foo2: |
| xorps %xmm1, %xmm1 |
| movss %xmm0, %xmm1 |
| pshufd $81, %xmm1, %xmm0 |
| ret |
| |
| In sse4 mode, we could use insertps to make both better. |
| |
| Here's another testcase that could use insertps [mem]: |
| |
| #include <xmmintrin.h> |
| extern float x2, x3; |
| __m128 foo1 (float x1, float x4) { |
| return _mm_set_ps (x2, x1, x3, x4); |
| } |
| |
| gcc mainline compiles it to: |
| |
| foo1: |
| insertps $0x10, x2(%rip), %xmm0 |
| insertps $0x10, x3(%rip), %xmm1 |
| movaps %xmm1, %xmm2 |
| movlhps %xmm0, %xmm2 |
| movaps %xmm2, %xmm0 |
| ret |
| |
| //===---------------------------------------------------------------------===// |
| |
| We compile vector multiply-by-constant into poor code: |
| |
| define <4 x i32> @f(<4 x i32> %i) nounwind { |
| %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 > |
| ret <4 x i32> %A |
| } |
| |
| On targets without SSE4.1, this compiles into: |
| |
| LCPI1_0: ## <4 x i32> |
| .long 10 |
| .long 10 |
| .long 10 |
| .long 10 |
| .text |
| .align 4,0x90 |
| .globl _f |
| _f: |
| pshufd $3, %xmm0, %xmm1 |
| movd %xmm1, %eax |
| imull LCPI1_0+12, %eax |
| movd %eax, %xmm1 |
| pshufd $1, %xmm0, %xmm2 |
| movd %xmm2, %eax |
| imull LCPI1_0+4, %eax |
| movd %eax, %xmm2 |
| punpckldq %xmm1, %xmm2 |
| movd %xmm0, %eax |
| imull LCPI1_0, %eax |
| movd %eax, %xmm1 |
| movhlps %xmm0, %xmm0 |
| movd %xmm0, %eax |
| imull LCPI1_0+8, %eax |
| movd %eax, %xmm0 |
| punpckldq %xmm0, %xmm1 |
| movaps %xmm1, %xmm0 |
| punpckldq %xmm2, %xmm0 |
| ret |
| |
| It would be better to synthesize integer vector multiplication by constants |
| using shifts and adds, pslld and paddd here. And even on targets with SSE4.1, |
| simple cases such as multiplication by powers of two would be better as |
| vector shifts than as multiplications. |
| |
| //===---------------------------------------------------------------------===// |
| |
| We compile this: |
| |
| __m128i |
| foo2 (char x) |
| { |
| return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0); |
| } |
| |
| into: |
| movl $1, %eax |
| xorps %xmm0, %xmm0 |
| pinsrw $2, %eax, %xmm0 |
| movzbl 4(%esp), %eax |
| pinsrw $3, %eax, %xmm0 |
| movl $256, %eax |
| pinsrw $7, %eax, %xmm0 |
| ret |
| |
| |
| gcc-4.2: |
| subl $12, %esp |
| movzbl 16(%esp), %eax |
| movdqa LC0, %xmm0 |
| pinsrw $3, %eax, %xmm0 |
| addl $12, %esp |
| ret |
| .const |
| .align 4 |
| LC0: |
| .word 0 |
| .word 0 |
| .word 1 |
| .word 0 |
| .word 0 |
| .word 0 |
| .word 0 |
| .word 256 |
| |
| With SSE4, it should be |
| movdqa .LC0(%rip), %xmm0 |
| pinsrb $6, %edi, %xmm0 |
| |
| //===---------------------------------------------------------------------===// |
| |
| We should transform a shuffle of two vectors of constants into a single vector |
| of constants. Also, insertelement of a constant into a vector of constants |
| should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll. |
| |
| We compiled it to something horrible: |
| |
| .align 4 |
| LCPI1_1: ## float |
| .long 1065353216 ## float 1 |
| .const |
| |
| .align 4 |
| LCPI1_0: ## <4 x float> |
| .space 4 |
| .long 1065353216 ## float 1 |
| .space 4 |
| .long 1065353216 ## float 1 |
| .text |
| .align 4,0x90 |
| .globl _t |
| _t: |
| xorps %xmm0, %xmm0 |
| movhps LCPI1_0, %xmm0 |
| movss LCPI1_1, %xmm1 |
| movaps %xmm0, %xmm2 |
| shufps $2, %xmm1, %xmm2 |
| shufps $132, %xmm2, %xmm0 |
| movaps %xmm0, 0 |
| |
| //===---------------------------------------------------------------------===// |
| rdar://5907648 |
| |
| This function: |
| |
| float foo(unsigned char x) { |
| return x; |
| } |
| |
| compiles to (x86-32): |
| |
| define float @foo(i8 zeroext %x) nounwind { |
| %tmp12 = uitofp i8 %x to float ; <float> [#uses=1] |
| ret float %tmp12 |
| } |
| |
| which in turn compiles to: |
| |
| _foo: |
| subl $4, %esp |
| movzbl 8(%esp), %eax |
| cvtsi2ss %eax, %xmm0 |
| movss %xmm0, (%esp) |
| flds (%esp) |
| addl $4, %esp |
| ret |
| |
| We should be able to use: |
| cvtsi2ss 8(%esp), %xmm0 |
| since we know the stack slot is already zext'd. |
| |
| //===---------------------------------------------------------------------===// |
| |
| Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64)) |
| when code size is critical. movlps is slower than movsd on core2 but it's one |
| byte shorter. |
| |
| //===---------------------------------------------------------------------===// |
| |
| We should use a dynamic programming based approach to tell when using FPStack |
| operations is cheaper than SSE. SciMark montecarlo contains code like this |
| for example: |
| |
| double MonteCarlo_num_flops(int Num_samples) { |
| return ((double) Num_samples)* 4.0; |
| } |
| |
| In fpstack mode, this compiles into: |
| |
| LCPI1_0: |
| .long 1082130432 ## float 4.000000e+00 |
| _MonteCarlo_num_flops: |
| subl $4, %esp |
| movl 8(%esp), %eax |
| movl %eax, (%esp) |
| fildl (%esp) |
| fmuls LCPI1_0 |
| addl $4, %esp |
| ret |
| |
| in SSE mode, it compiles into significantly slower code: |
| |
| _MonteCarlo_num_flops: |
| subl $12, %esp |
| cvtsi2sd 16(%esp), %xmm0 |
| mulsd LCPI1_0, %xmm0 |
| movsd %xmm0, (%esp) |
| fldl (%esp) |
| addl $12, %esp |
| ret |
| |
| There are also other cases in scimark where using fpstack is better, it is |
| cheaper to do fld1 than load from a constant pool for example, so |
| "load, add 1.0, store" is better done in the fp stack, etc. |
| |
| //===---------------------------------------------------------------------===// |
| |
| The X86 backend should be able to if-convert SSE comparisons like "ucomisd" to |
| "cmpsd". For example, this code: |
| |
| double d1(double x) { return x == x ? x : x + x; } |
| |
| Compiles into: |
| |
| _d1: |
| ucomisd %xmm0, %xmm0 |
| jnp LBB1_2 |
| addsd %xmm0, %xmm0 |
| ret |
| LBB1_2: |
| ret |
| |
| Also, the 'ret's should be shared. This is PR6032. |
| |
| //===---------------------------------------------------------------------===// |
| |
| These should compile into the same code (PR6214): Perhaps instcombine should |
| canonicalize the former into the latter? |
| |
| define float @foo(float %x) nounwind { |
| %t = bitcast float %x to i32 |
| %s = and i32 %t, 2147483647 |
| %d = bitcast i32 %s to float |
| ret float %d |
| } |
| |
| declare float @fabsf(float %n) |
| define float @bar(float %x) nounwind { |
| %d = call float @fabsf(float %x) |
| ret float %d |
| } |
| |
| //===---------------------------------------------------------------------===// |
| |
| This IR (from PR6194): |
| |
| target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" |
| target triple = "x86_64-apple-darwin10.0.0" |
| |
| %0 = type { double, double } |
| %struct.float3 = type { float, float, float } |
| |
| define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp { |
| entry: |
| %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1] |
| %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1] |
| %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1] |
| %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1] |
| %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1] |
| %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1] |
| %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1] |
| store float %tmp12, float* %tmp5 |
| ret void |
| } |
| |
| Compiles to: |
| |
| _test: ## @test |
| movd %xmm0, %rax |
| shrq $32, %rax |
| movl %eax, 4(%rdi) |
| ret |
| |
| This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and |
| doing a shuffle from v[1] to v[0] then a float store. |
| |
| //===---------------------------------------------------------------------===// |
| |
| On SSE4 machines, we compile this code: |
| |
| define <2 x float> @test2(<2 x float> %Q, <2 x float> %R, |
| <2 x float> *%P) nounwind { |
| %Z = fadd <2 x float> %Q, %R |
| |
| store <2 x float> %Z, <2 x float> *%P |
| ret <2 x float> %Z |
| } |
| |
| into: |
| |
| _test2: ## @test2 |
| ## BB#0: |
| insertps $0, %xmm2, %xmm2 |
| insertps $16, %xmm3, %xmm2 |
| insertps $0, %xmm0, %xmm3 |
| insertps $16, %xmm1, %xmm3 |
| addps %xmm2, %xmm3 |
| movq %xmm3, (%rdi) |
| movaps %xmm3, %xmm0 |
| pshufd $1, %xmm3, %xmm1 |
| ## kill: XMM1<def> XMM1<kill> |
| ret |
| |
| The insertps's of $0 are pointless complex copies. |
| |
| //===---------------------------------------------------------------------===// |
| |
| If SSE4.1 is available we should inline rounding functions instead of emitting |
| a libcall. |
| |
| floor: roundsd $0x01, %xmm, %xmm |
| ceil: roundsd $0x02, %xmm, %xmm |
| |
| and likewise for the single precision versions. |
| |
| Currently, SelectionDAGBuilder doesn't turn calls to these functions into the |
| corresponding nodes and some targets (including X86) aren't ready for them. |
| |
| //===---------------------------------------------------------------------===// |