//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: SSE-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

SSE Variable shift can be custom lowered to something like this, which uses a
small table + unaligned load + shuffle instead of going through memory.

__m128i_shift_right:
        .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
        .byte -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1

...
__m128i shift_right(__m128i value, unsigned long offset) {
  return _mm_shuffle_epi8(value,
               _mm_loadu_si128((__m128i *) (__m128i_shift_right + offset)));
}

//===---------------------------------------------------------------------===//

SSE has instructions for doing operations on complex numbers; we should pattern
match them. For example, this should turn into a horizontal add:

typedef float __attribute__((vector_size(16))) v4f32;
float f32(v4f32 A) {
  return A[0]+A[1]+A[2]+A[3];
}

Instead we get this:

_f32: ## @f32
        pshufd $1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0,0,0]
        addss %xmm0, %xmm1
        pshufd $3, %xmm0, %xmm2 ## xmm2 = xmm0[3,0,0,0]
        movhlps %xmm0, %xmm0 ## xmm0 = xmm0[1,1]
        movaps %xmm0, %xmm3
        addss %xmm1, %xmm3
        movdqa %xmm2, %xmm0
        addss %xmm3, %xmm0
        ret

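For reference, a minimal sketch (not the backend change itself) of the form the
pattern match could produce, written with the SSE3 intrinsics; hsum_ps is a
hypothetical helper name:

#include <pmmintrin.h>

float hsum_ps(__m128 a) {
  __m128 t = _mm_hadd_ps(a, a);   // t = { a0+a1, a2+a3, a0+a1, a2+a3 }
  t = _mm_hadd_ps(t, t);          // t = { a0+a1+a2+a3, ... }
  return _mm_cvtss_f32(t);        // extract the low lane
}
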
Also, there are cases where some simple local SLP would improve codegen a bit.
Compiling this:

_Complex float f32(_Complex float A, _Complex float B) {
  return A+B;
}

into:

_f32: ## @f32
        movdqa %xmm0, %xmm2
        addss %xmm1, %xmm2
        pshufd $1, %xmm1, %xmm1 ## xmm1 = xmm1[1,0,0,0]
        pshufd $1, %xmm0, %xmm3 ## xmm3 = xmm0[1,0,0,0]
        addss %xmm1, %xmm3
        movaps %xmm2, %xmm0
        unpcklps %xmm3, %xmm0 ## xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
        ret

seems silly when it could just be one addps.

//===---------------------------------------------------------------------===//

Expand libm rounding functions inline: Significant speedups possible.
http://gcc.gnu.org/ml/gcc-patches/2006-10/msg00909.html

//===---------------------------------------------------------------------===//

When compiled with unsafe math enabled, "main" should enable SSE DAZ mode and
other fast SSE modes.
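
A sketch of what such a startup hook could do, assuming the usual MXCSR bit
layout (DAZ is bit 6, FTZ is bit 15); the function name is illustrative only:

#include <xmmintrin.h>

static void enable_fast_sse_modes(void) {
  unsigned int mxcsr = _mm_getcsr();
  mxcsr |= (1u << 15) | (1u << 6);   // set FTZ and DAZ
  _mm_setcsr(mxcsr);
}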

//===---------------------------------------------------------------------===//

Think about doing i64 math in SSE regs on x86-32.

//===---------------------------------------------------------------------===//

This testcase should have no SSE instructions in it, and only one load from
a constant pool:

double %test3(bool %B) {
  %C = select bool %B, double 123.412, double 523.01123123
  ret double %C
}

Currently, the select is being lowered, which prevents the dag combiner from
turning 'select (load CPI1), (load CPI2)' -> 'load (select CPI1, CPI2)'

The pattern isel got this one right.

//===---------------------------------------------------------------------===//

Lower memcpy / memset to a series of SSE 128 bit move instructions when it's
feasible.
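
A rough sketch of the kind of expansion meant here, copying 16 bytes per
iteration with SSE loads/stores (alignment strategy and tail handling omitted;
copy16 is a made-up name):

#include <emmintrin.h>
#include <stddef.h>

static void copy16(void *dst, const void *src, size_t n) {
  for (size_t i = 0; i + 16 <= n; i += 16) {
    __m128i v = _mm_loadu_si128((const __m128i *)((const char *)src + i));
    _mm_storeu_si128((__m128i *)((char *)dst + i), v);
  }
}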

//===---------------------------------------------------------------------===//

Codegen:
  if (copysign(1.0, x) == copysign(1.0, y))
into:
  if ((x ^ y) & mask)
when using SSE.
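
A scalar sketch of the intended transform for doubles, assuming IEEE-754 layout
(the sign bit is bit 63); same_sign is a hypothetical helper:

#include <stdint.h>
#include <string.h>

static int same_sign(double x, double y) {
  uint64_t xb, yb;
  memcpy(&xb, &x, sizeof xb);
  memcpy(&yb, &y, sizeof yb);
  return ((xb ^ yb) & 0x8000000000000000ull) == 0;  // signs differ iff bit 63 of the xor is set
}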

//===---------------------------------------------------------------------===//

Use movhps to update upper 64-bits of a v4sf value. Also movlps on lower half
of a v4sf value.

//===---------------------------------------------------------------------===//

Better codegen for vector_shuffles like { x, 0, 0, 0 } or { x, 0, x, 0 }.
Perhaps use pxor / xorp* to clear an XMM register first?

//===---------------------------------------------------------------------===//

External test Nurbs exposed some problems. Look for
__ZN15Nurbs_SSE_Cubic17TessellateSurfaceE, bb cond_next140. This is what icc
emits:

        movaps (%edx), %xmm2 #59.21
        movaps (%edx), %xmm5 #60.21
        movaps (%edx), %xmm4 #61.21
        movaps (%edx), %xmm3 #62.21
        movl 40(%ecx), %ebp #69.49
        shufps $0, %xmm2, %xmm5 #60.21
        movl 100(%esp), %ebx #69.20
        movl (%ebx), %edi #69.20
        imull %ebp, %edi #69.49
        addl (%eax), %edi #70.33
        shufps $85, %xmm2, %xmm4 #61.21
        shufps $170, %xmm2, %xmm3 #62.21
        shufps $255, %xmm2, %xmm2 #63.21
        lea (%ebp,%ebp,2), %ebx #69.49
        negl %ebx #69.49
        lea -3(%edi,%ebx), %ebx #70.33
        shll $4, %ebx #68.37
        addl 32(%ecx), %ebx #68.37
        testb $15, %bl #91.13
        jne L_B1.24 # Prob 5% #91.13

This is the llvm code after instruction scheduling:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %reg1078 = MOV32ri -3
        %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
        %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
        %reg1080 = IMUL32rr %reg1079, %reg1037
        %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
        %reg1038 = LEA32r %reg1081, 1, %reg1080, -3
        %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
        %reg1082 = SHL32ri %reg1038, 4
        %reg1039 = ADD32rr %reg1036, %reg1082
        %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
        %reg1034 = SHUFPSrr %reg1083, %reg1083, 170
        %reg1032 = SHUFPSrr %reg1083, %reg1083, 0
        %reg1035 = SHUFPSrr %reg1083, %reg1083, 255
        %reg1033 = SHUFPSrr %reg1083, %reg1083, 85
        %reg1040 = MOV32rr %reg1039
        %reg1084 = AND32ri8 %reg1039, 15
        CMP32ri8 %reg1084, 0
        JE mbb<cond_next204,0xa914d30>

Still ok. After register allocation:

cond_next140 (0xa910740, LLVM BB @0xa90beb0):
        %eax = MOV32ri -3
        %edx = MOV32rm %stack.3, 1, %noreg, 0
        ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
        %edx = MOV32rm %stack.7, 1, %noreg, 0
        %edx = MOV32rm %edx, 1, %noreg, 40
        IMUL32rr %eax<def&use>, %edx
        %esi = MOV32rm %stack.5, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 0
        MOV32mr %stack.4, 1, %noreg, 0, %esi
        %eax = LEA32r %esi, 1, %eax, -3
        %esi = MOV32rm %stack.7, 1, %noreg, 0
        %esi = MOV32rm %esi, 1, %noreg, 32
        %edi = MOV32rr %eax
        SHL32ri %edi<def&use>, 4
        ADD32rr %edi<def&use>, %esi
        %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
        %xmm1 = MOVAPSrr %xmm0
        SHUFPSrr %xmm1<def&use>, %xmm1, 170
        %xmm2 = MOVAPSrr %xmm0
        SHUFPSrr %xmm2<def&use>, %xmm2, 0
        %xmm3 = MOVAPSrr %xmm0
        SHUFPSrr %xmm3<def&use>, %xmm3, 255
        SHUFPSrr %xmm0<def&use>, %xmm0, 85
        %ebx = MOV32rr %edi
        AND32ri8 %ebx<def&use>, 15
        CMP32ri8 %ebx, 0
        JE mbb<cond_next204,0xa914d30>

This looks really bad. The problem is that shufps is a destructive opcode: since
it appears as operand two in more than one shufps op, it results in a number of
copies. Note that icc suffers from the same problem. Either the instruction
selector should select pshufd, or the register allocator should perform the
two-address to three-address transformation.

It also exposes some other problems. See MOV32ri -3 and the spills.

//===---------------------------------------------------------------------===//

Consider:

__m128 test(float a) {
  return _mm_set_ps(0.0, 0.0, 0.0, a*a);
}

This compiles into:

        movss 4(%esp), %xmm1
        mulss %xmm1, %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Because mulss doesn't modify the top 3 elements, the top elements of
xmm1 are already zero'd. We could compile this to:

        movss 4(%esp), %xmm0
        mulss %xmm0, %xmm0
        ret

//===---------------------------------------------------------------------===//

Here's a sick and twisted idea. Consider code like this:

__m128 test(__m128 a) {
  float b = *(float*)&a;
  ...
  return _mm_set_ps(0.0, 0.0, 0.0, b);
}

This might compile to this code:

        movaps c(%esp), %xmm1
        xorps %xmm0, %xmm0
        movss %xmm1, %xmm0
        ret

Now consider if the ... code caused xmm1 to get spilled. This might produce
this code:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        xorps %xmm0, %xmm0
        movaps c2(%esp), %xmm1
        movss %xmm1, %xmm0
        ret

However, since the reload is only used by these instructions, we could
"fold" it into the uses, producing something like this:

        movaps c(%esp), %xmm1
        movaps %xmm1, c2(%esp)
        ...

        movss c2(%esp), %xmm0
        ret

... saving two instructions.

The basic idea is that a reload from a spill slot can, if only one 4-byte
chunk is used, bring in 3 zeros and the one element instead of 4 elements.
This can be used to simplify a variety of shuffle operations, where the
elements are fixed zeros.

//===---------------------------------------------------------------------===//

This code generates ugly code, probably due to costs being off or something:

define void @test(float* %P, <4 x float>* %P2) {
  %xFloat0.688 = load float* %P
  %tmp = load <4 x float>* %P2
  %inFloat3.713 = insertelement <4 x float> %tmp, float 0.0, i32 3
  store <4 x float> %inFloat3.713, <4 x float>* %P2
  ret void
}

Generates:

_test:
        movl 8(%esp), %eax
        movaps (%eax), %xmm0
        pxor %xmm1, %xmm1
        movaps %xmm0, %xmm2
        shufps $50, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, (%eax)
        ret

Would it be better to generate:

_test:
        movl 8(%esp), %ecx
        movaps (%ecx), %xmm0
        xor %eax, %eax
        pinsrw $6, %eax, %xmm0
        pinsrw $7, %eax, %xmm0
        movaps %xmm0, (%ecx)
        ret

?

//===---------------------------------------------------------------------===//

Some useful information in the Apple Altivec / SSE Migration Guide:

http://developer.apple.com/documentation/Performance/Conceptual/
Accelerate_sse_migration/index.html

e.g. SSE select using and, andnot, or. Various SSE compare translations.
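
For example, the classic SSE select idiom the guide describes, written as a
hedged intrinsics sketch (sse_select is not a real API name):

#include <xmmintrin.h>

// result = (mask & a) | (~mask & b), i.e. andps/andnps/orps
static __m128 sse_select(__m128 mask, __m128 a, __m128 b) {
  return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}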

//===---------------------------------------------------------------------===//

Add hooks to commute some CMPP operations.

//===---------------------------------------------------------------------===//

Apply the same transformation that merges four float loads into a single
128-bit load to loads from the constant pool.

//===---------------------------------------------------------------------===//

Floating point max / min are commutable when -enable-unsafe-fp-path is
specified. We should turn int_x86_sse_max_ss and X86ISD::FMIN etc. into other
nodes which are selected to max / min instructions that are marked commutable.

//===---------------------------------------------------------------------===//

We should materialize vector constants like "all ones" and "signbit" with
code like:

        cmpeqps xmm1, xmm1    ; xmm1 = all-ones

and:
        cmpeqps xmm1, xmm1    ; xmm1 = all-ones
        pslld   xmm1, 31      ; xmm1 = 0x80000000 in each lane (the sign bits)

instead of using a load from the constant pool. The latter is important for
ABS/NEG/copysign etc.
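
In intrinsics, a sketch of the same materialization without touching the
constant pool (helper names are illustrative):

#include <emmintrin.h>

static __m128i all_ones(void) {
  __m128i z = _mm_setzero_si128();
  return _mm_cmpeq_epi32(z, z);                             // 0xFFFFFFFF in every lane
}

static __m128 signbit_mask(void) {
  return _mm_castsi128_ps(_mm_slli_epi32(all_ones(), 31));  // 0x80000000 in every lane
}

(A compiler may still constant-fold these back into loads; the point is just
the desired instruction pattern.)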

//===---------------------------------------------------------------------===//

These functions:

#include <xmmintrin.h>
__m128i a;
void x(unsigned short n) {
  a = _mm_slli_epi32 (a, n);
}
void y(unsigned n) {
  a = _mm_slli_epi32 (a, n);
}

compile to (-O3 -static -fomit-frame-pointer):
_x:
        movzwl 4(%esp), %eax
        movd %eax, %xmm0
        movaps _a, %xmm1
        pslld %xmm0, %xmm1
        movaps %xmm1, _a
        ret
_y:
        movd 4(%esp), %xmm0
        movaps _a, %xmm1
        pslld %xmm0, %xmm1
        movaps %xmm1, _a
        ret

"y" looks good, but "x" does silly movzwl stuff around into a GPR. It seems
like movd would be sufficient in both cases as the value is already zero
extended in the 32-bit stack slot IIRC. For signed short, it should also be
safe, as a negative value would be undefined for pslld.

//===---------------------------------------------------------------------===//

#include <math.h>
int t1(double d) { return signbit(d); }

This currently compiles to:
        subl $12, %esp
        movsd 16(%esp), %xmm0
        movsd %xmm0, (%esp)
        movl 4(%esp), %eax
        shrl $31, %eax
        addl $12, %esp
        ret

We should use movmskp{s|d} instead.
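
A sketch of the desired lowering expressed with intrinsics (signbit_via_movmsk
is a made-up name):

#include <emmintrin.h>

static int signbit_via_movmsk(double d) {
  return _mm_movemask_pd(_mm_set_sd(d)) & 1;   // movmskpd: bit 0 = sign of the low lane
}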

//===---------------------------------------------------------------------===//

CodeGen/X86/vec_align.ll tests whether we can turn 4 scalar loads into a single
(aligned) vector load. This functionality has a couple of problems.

1. The code to infer alignment from loads of globals is in the X86 backend,
   not the dag combiner. This is because dagcombine2 needs to be able to see
   through the X86ISD::Wrapper node, which DAGCombine can't really do.
2. The code for turning 4 x load into a single vector load is target
   independent and should be moved to the dag combiner.
3. The code for turning 4 x load into a vector load can only handle a direct
   load from a global or a direct load from the stack. It should be generalized
   to handle any load from P, P+4, P+8, P+12, where P can be anything.
4. The alignment inference code cannot handle loads from globals in non-static
   mode because it doesn't look through the extra dyld stub load. If you try
   vec_align.ll without -relocation-model=static, you'll see what I mean.

//===---------------------------------------------------------------------===//

We should lower store(fneg(load p), q) into an integer load+xor+store, which
eliminates a constant pool load. For example, consider:

define i64 @ccosf(float %z.0, float %z.1) nounwind readonly {
entry:
  %tmp6 = fsub float -0.000000e+00, %z.1 ; <float> [#uses=1]
  %tmp20 = tail call i64 @ccoshf( float %tmp6, float %z.0 ) nounwind readonly
  ret i64 %tmp20
}
declare i64 @ccoshf(float %z.0, float %z.1) nounwind readonly

This currently compiles to:

LCPI1_0: # <4 x float>
        .long 2147483648 # float -0
        .long 2147483648 # float -0
        .long 2147483648 # float -0
        .long 2147483648 # float -0
_ccosf:
        subl $12, %esp
        movss 16(%esp), %xmm0
        movss %xmm0, 4(%esp)
        movss 20(%esp), %xmm0
        xorps LCPI1_0, %xmm0
        movss %xmm0, (%esp)
        call L_ccoshf$stub
        addl $12, %esp
        ret

Note the load into xmm0, then xor (to negate), then store. In PIC mode,
this code computes the pic base and does two loads to do the constant pool
load, so the improvement is much bigger.

The tricky part about this xform is that the argument load/store isn't exposed
until post-legalize, and at that point, the fneg has been custom expanded into
an X86 fxor. This means that we need to handle this case in the x86 backend
instead of in target independent code.
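
For reference, the scalar idea behind the transform, sketched in C (store_fneg
is a hypothetical helper): flip the sign bit with integer ops so no -0.0
constant pool entry is needed.

#include <stdint.h>
#include <string.h>

static void store_fneg(const float *p, float *q) {
  uint32_t bits;
  memcpy(&bits, p, sizeof bits);   // integer load
  bits ^= 0x80000000u;             // xor flips the sign bit
  memcpy(q, &bits, sizeof bits);   // integer store
}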

//===---------------------------------------------------------------------===//

Non-SSE4 insert into 16 x i8 is atrociously bad.

//===---------------------------------------------------------------------===//

<2 x i64> extract is substantially worse than <2 x f64>, even if the destination
is memory.

//===---------------------------------------------------------------------===//

INSERTPS can match any insert (extract, imm1), imm2 for 4 x float, and insert
any number of 0.0 simultaneously. Currently we only use it for simple
insertions.

See comments in LowerINSERT_VECTOR_ELT_SSE4.

//===---------------------------------------------------------------------===//

On a random note, SSE2 should declare insert/extract of 2 x f64 as legal, not
Custom. All combinations of insert/extract reg-reg, reg-mem, and mem-reg are
legal; it'll just take a few extra patterns written in the .td file.

Note: this is not a code quality issue; the custom lowered code happens to be
right, but we shouldn't have to custom lower anything. This is probably related
to <2 x i64> ops being so bad.

//===---------------------------------------------------------------------===//

LLVM currently generates stack realignment code when it is not actually
needed. The problem is that we need to know about stack alignment too early,
before RA runs.

At that point we don't know whether there will be a vector spill or not.
Stack realignment logic is overly conservative here, but otherwise we can
produce unaligned loads/stores.

Fixing this will require some huge RA changes.

Testcase:
#include <emmintrin.h>

typedef short vSInt16 __attribute__ ((__vector_size__ (16)));

static const vSInt16 a = {-22725, -12873, -22725, -12873, -22725, -12873,
                          -22725, -12873};

vSInt16 madd(vSInt16 b)
{
  return _mm_madd_epi16(a, b);
}

Generated code (x86-32, linux):
madd:
        pushl %ebp
        movl %esp, %ebp
        andl $-16, %esp
        movaps .LCPI1_0, %xmm1
        pmaddwd %xmm1, %xmm0
        movl %ebp, %esp
        popl %ebp
        ret

//===---------------------------------------------------------------------===//

Consider:
#include <emmintrin.h>
__m128 foo2 (float x) {
  return _mm_set_ps (0, 0, x, 0);
}

In x86-32 mode, we generate this spiffy code:

_foo2:
        movss 4(%esp), %xmm0
        pshufd $81, %xmm0, %xmm0
        ret

in x86-64 mode, we generate this code, which could be better:

_foo2:
        xorps %xmm1, %xmm1
        movss %xmm0, %xmm1
        pshufd $81, %xmm1, %xmm0
        ret

In sse4 mode, we could use insertps to make both better.

Here's another testcase that could use insertps [mem]:

#include <xmmintrin.h>
extern float x2, x3;
__m128 foo1 (float x1, float x4) {
  return _mm_set_ps (x2, x1, x3, x4);
}

gcc mainline compiles it to:

foo1:
        insertps $0x10, x2(%rip), %xmm0
        insertps $0x10, x3(%rip), %xmm1
        movaps %xmm1, %xmm2
        movlhps %xmm0, %xmm2
        movaps %xmm2, %xmm0
        ret

//===---------------------------------------------------------------------===//

We compile vector multiply-by-constant into poor code:

define <4 x i32> @f(<4 x i32> %i) nounwind {
  %A = mul <4 x i32> %i, < i32 10, i32 10, i32 10, i32 10 >
  ret <4 x i32> %A
}

On targets without SSE4.1, this compiles into:

LCPI1_0: ## <4 x i32>
        .long 10
        .long 10
        .long 10
        .long 10
        .text
        .align 4,0x90
        .globl _f
_f:
        pshufd $3, %xmm0, %xmm1
        movd %xmm1, %eax
        imull LCPI1_0+12, %eax
        movd %eax, %xmm1
        pshufd $1, %xmm0, %xmm2
        movd %xmm2, %eax
        imull LCPI1_0+4, %eax
        movd %eax, %xmm2
        punpckldq %xmm1, %xmm2
        movd %xmm0, %eax
        imull LCPI1_0, %eax
        movd %eax, %xmm1
        movhlps %xmm0, %xmm0
        movd %xmm0, %eax
        imull LCPI1_0+8, %eax
        movd %eax, %xmm0
        punpckldq %xmm0, %xmm1
        movaps %xmm1, %xmm0
        punpckldq %xmm2, %xmm0
        ret

It would be better to synthesize integer vector multiplication by constants
using shifts and adds, pslld and paddd here. And even on targets with SSE4.1,
simple cases such as multiplication by powers of two would be better as
vector shifts than as multiplications.
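
A sketch of the shift-and-add synthesis for the multiply-by-10 above
(10*x = 8*x + 2*x); mul10_epi32 is a made-up name:

#include <emmintrin.h>

static __m128i mul10_epi32(__m128i x) {
  return _mm_add_epi32(_mm_slli_epi32(x, 3),   // 8*x
                       _mm_slli_epi32(x, 1));  // 2*x
}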

//===---------------------------------------------------------------------===//

We compile this:

__m128i
foo2 (char x)
{
  return _mm_set_epi8 (1, 0, 0, 0, 0, 0, 0, 0, 0, x, 0, 1, 0, 0, 0, 0);
}

into:
        movl $1, %eax
        xorps %xmm0, %xmm0
        pinsrw $2, %eax, %xmm0
        movzbl 4(%esp), %eax
        pinsrw $3, %eax, %xmm0
        movl $256, %eax
        pinsrw $7, %eax, %xmm0
        ret

gcc-4.2:
        subl $12, %esp
        movzbl 16(%esp), %eax
        movdqa LC0, %xmm0
        pinsrw $3, %eax, %xmm0
        addl $12, %esp
        ret
        .const
        .align 4
LC0:
        .word 0
        .word 0
        .word 1
        .word 0
        .word 0
        .word 0
        .word 0
        .word 256

With SSE4, it should be
        movdqa .LC0(%rip), %xmm0
        pinsrb $6, %edi, %xmm0

//===---------------------------------------------------------------------===//

We should transform a shuffle of two vectors of constants into a single vector
of constants. Also, insertelement of a constant into a vector of constants
should also result in a vector of constants. e.g. 2008-06-25-VecISelBug.ll.

We compiled it to something horrible:

        .align 4
LCPI1_1: ## float
        .long 1065353216 ## float 1
        .const

        .align 4
LCPI1_0: ## <4 x float>
        .space 4
        .long 1065353216 ## float 1
        .space 4
        .long 1065353216 ## float 1
        .text
        .align 4,0x90
        .globl _t
_t:
        xorps %xmm0, %xmm0
        movhps LCPI1_0, %xmm0
        movss LCPI1_1, %xmm1
        movaps %xmm0, %xmm2
        shufps $2, %xmm1, %xmm2
        shufps $132, %xmm2, %xmm0
        movaps %xmm0, 0

//===---------------------------------------------------------------------===//
rdar://5907648

This function:

float foo(unsigned char x) {
  return x;
}

compiles to (x86-32):

define float @foo(i8 zeroext %x) nounwind {
  %tmp12 = uitofp i8 %x to float ; <float> [#uses=1]
  ret float %tmp12
}

compiles to:

_foo:
        subl $4, %esp
        movzbl 8(%esp), %eax
        cvtsi2ss %eax, %xmm0
        movss %xmm0, (%esp)
        flds (%esp)
        addl $4, %esp
        ret

We should be able to use:
        cvtsi2ss 8(%esp), %xmm0
since we know the stack slot is already zext'd.

//===---------------------------------------------------------------------===//

Consider using movlps instead of movsd to implement (scalar_to_vector (loadf64))
when code size is critical. movlps is slower than movsd on core2 but it's one
byte shorter.

//===---------------------------------------------------------------------===//

We should use a dynamic programming based approach to tell when using FPStack
operations is cheaper than SSE. SciMark montecarlo contains code like this
for example:

double MonteCarlo_num_flops(int Num_samples) {
  return ((double) Num_samples)* 4.0;
}

In fpstack mode, this compiles into:

LCPI1_0:
        .long 1082130432 ## float 4.000000e+00
_MonteCarlo_num_flops:
        subl $4, %esp
        movl 8(%esp), %eax
        movl %eax, (%esp)
        fildl (%esp)
        fmuls LCPI1_0
        addl $4, %esp
        ret

in SSE mode, it compiles into significantly slower code:

_MonteCarlo_num_flops:
        subl $12, %esp
        cvtsi2sd 16(%esp), %xmm0
        mulsd LCPI1_0, %xmm0
        movsd %xmm0, (%esp)
        fldl (%esp)
        addl $12, %esp
        ret

There are also other cases in scimark where using fpstack is better; it is
cheaper to do fld1 than to load from a constant pool, for example, so
"load, add 1.0, store" is better done in the fp stack, etc.

//===---------------------------------------------------------------------===//

These should compile into the same code (PR6214): perhaps instcombine should
canonicalize the former into the latter?

define float @foo(float %x) nounwind {
  %t = bitcast float %x to i32
  %s = and i32 %t, 2147483647
  %d = bitcast i32 %s to float
  ret float %d
}

declare float @fabsf(float %n)
define float @bar(float %x) nounwind {
  %d = call float @fabsf(float %x)
  ret float %d
}

//===---------------------------------------------------------------------===//

This IR (from PR6194):

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin10.0.0"

%0 = type { double, double }
%struct.float3 = type { float, float, float }

define void @test(%0, %struct.float3* nocapture %res) nounwind noinline ssp {
entry:
  %tmp18 = extractvalue %0 %0, 0 ; <double> [#uses=1]
  %tmp19 = bitcast double %tmp18 to i64 ; <i64> [#uses=1]
  %tmp20 = zext i64 %tmp19 to i128 ; <i128> [#uses=1]
  %tmp10 = lshr i128 %tmp20, 32 ; <i128> [#uses=1]
  %tmp11 = trunc i128 %tmp10 to i32 ; <i32> [#uses=1]
  %tmp12 = bitcast i32 %tmp11 to float ; <float> [#uses=1]
  %tmp5 = getelementptr inbounds %struct.float3* %res, i64 0, i32 1 ; <float*> [#uses=1]
  store float %tmp12, float* %tmp5
  ret void
}

Compiles to:

_test: ## @test
        movd %xmm0, %rax
        shrq $32, %rax
        movl %eax, 4(%rdi)
        ret

This would be better kept in the SSE unit by treating XMM0 as a 4xfloat and
doing a shuffle from v[1] to v[0] then a float store.

//===---------------------------------------------------------------------===//

[UNSAFE FP]

void foo(double, double, double);
void norm(double x, double y, double z) {
  double scale = __builtin_sqrt(x*x + y*y + z*z);
  foo(x/scale, y/scale, z/scale);
}

We currently generate an sqrtsd and 3 divsd instructions. This is bad; fp div is
slow and not pipelined. In -ffast-math mode we could compute "1.0/scale" first
and emit 3 mulsd in place of the divs. This can be done as a target-independent
transform.
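
A sketch of the rewritten source, assuming -ffast-math makes the reassociation
legal (norm_fast is a made-up name):

#include <math.h>

void foo(double, double, double);

void norm_fast(double x, double y, double z) {
  double inv_scale = 1.0 / sqrt(x*x + y*y + z*z);    // one divide (or rsqrt) ...
  foo(x * inv_scale, y * inv_scale, z * inv_scale);  // ... then three multiplies
}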

If we're dealing with floats instead of doubles we could even replace the sqrtss
and inversion with an rsqrtss instruction, which computes 1/sqrt faster at the
cost of reduced accuracy.

//===---------------------------------------------------------------------===//

This function should be matched to haddpd when the appropriate CPU is enabled:

#include <x86intrin.h>
double f (__m128d p) {
  return p[0] + p[1];
}

Similarly, v[0]-v[1] should match to hsubpd, and {v[0]-v[1], w[0]-w[1]} should
turn into hsubpd also.
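
A hedged sketch of the single-instruction form using the SSE3 intrinsic
(f_hadd is a made-up name):

#include <pmmintrin.h>

double f_hadd(__m128d p) {
  return _mm_cvtsd_f64(_mm_hadd_pd(p, p));   // haddpd: low lane = p[0] + p[1]
}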

//===---------------------------------------------------------------------===//

define <2 x i32> @foo(<2 x double> %in) {
  %x = fptosi <2 x double> %in to <2 x i32>
  ret <2 x i32> %x
}

Should compile into cvttpd2dq instead of being scalarized into 2 cvttsd2si.
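
The packed conversion it should map to, sketched with the SSE2 intrinsic (the
two i32 results land in the low half of the register):

#include <emmintrin.h>

__m128i to_int32(__m128d in) {
  return _mm_cvttpd_epi32(in);   // cvttpd2dq: two truncating double->i32 conversions
}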

//===---------------------------------------------------------------------===//