/*===---- xmmintrin.h - SSE intrinsics -------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __XMMINTRIN_H
#define __XMMINTRIN_H

#ifndef __SSE__
#error "SSE instruction set not enabled"
#else

#include <mmintrin.h>

typedef float __v4sf __attribute__((__vector_size__(16)));
typedef float __m128 __attribute__((__vector_size__(16)));

#include <mm_malloc.h>

static inline __m128 __attribute__((__always_inline__))
_mm_add_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_addss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_add_ps(__m128 a, __m128 b)
{
  return a + b;
}

static inline __m128 __attribute__((__always_inline__))
_mm_sub_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_subss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_sub_ps(__m128 a, __m128 b)
{
  return a - b;
}

static inline __m128 __attribute__((__always_inline__))
_mm_mul_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_mulss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_mul_ps(__m128 a, __m128 b)
{
  return a * b;
}

static inline __m128 __attribute__((__always_inline__))
_mm_div_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_divss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_div_ps(__m128 a, __m128 b)
{
  return a / b;
}

static inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ss(__m128 a)
{
  return __builtin_ia32_sqrtss(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_sqrt_ps(__m128 a)
{
  return __builtin_ia32_sqrtps(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_rcp_ss(__m128 a)
{
  return __builtin_ia32_rcpss(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_rcp_ps(__m128 a)
{
  return __builtin_ia32_rcpps(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ss(__m128 a)
{
  return __builtin_ia32_rsqrtss(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_rsqrt_ps(__m128 a)
{
  return __builtin_ia32_rsqrtps(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_min_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_minss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_min_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_minps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_max_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_maxss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_max_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_maxps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_and_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_andps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_andnot_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_andnps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_or_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_orps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_xor_ps(__m128 a, __m128 b)
{
  return __builtin_ia32_xorps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpeqss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpeq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpeqps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmplt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmple_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpless(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmple_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpleps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltss(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpgt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpltps(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpless(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpleps(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpneqss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpneq_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpneqps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpnlt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnless(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpnle_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnleps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltss(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpngt_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnltps(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnless(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpnge_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpnleps(b, a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpordss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpordps(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ss(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpunordss(a, b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cmpunord_ps(__m128 a, __m128 b)
{
  return (__m128)__builtin_ia32_cmpunordps(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_comieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comieq(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_comilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comilt(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_comile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comile(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_comigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comigt(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_comige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comige(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_comineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_comineq(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_ucomieq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomieq(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_ucomilt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomilt(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_ucomile_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomile(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_ucomigt_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomigt(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_ucomige_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomige(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_ucomineq_ss(__m128 a, __m128 b)
{
  return __builtin_ia32_ucomineq(a, b);
}

static inline int __attribute__((__always_inline__))
_mm_cvtss_si32(__m128 a)
{
  return __builtin_ia32_cvtss2si(a);
}

static inline long long __attribute__((__always_inline__))
_mm_cvtss_si64(__m128 a)
{
  return __builtin_ia32_cvtss2si64(a);
}

static inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvtps2pi(a);
}

static inline int __attribute__((__always_inline__))
_mm_cvttss_si32(__m128 a)
{
  return __builtin_ia32_cvttss2si(a);
}

static inline long long __attribute__((__always_inline__))
_mm_cvttss_si64(__m128 a)
{
  return __builtin_ia32_cvttss2si64(a);
}

static inline __m64 __attribute__((__always_inline__))
_mm_cvttps_pi32(__m128 a)
{
  return (__m64)__builtin_ia32_cvttps2pi(a);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cvtsi32_ss(__m128 a, int b)
{
  return __builtin_ia32_cvtsi2ss(a, b);
}

#ifdef __x86_64__

static inline __m128 __attribute__((__always_inline__))
_mm_cvtsi64_ss(__m128 a, long long b)
{
  return __builtin_ia32_cvtsi642ss(a, b);
}

#endif

static inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32_ps(__m128 a, __m64 b)
{
  return __builtin_ia32_cvtpi2ps(a, (__v2si)b);
}

static inline float __attribute__((__always_inline__))
_mm_cvtss_f32(__m128 a)
{
  return a[0];
}

static inline __m128 __attribute__((__always_inline__))
_mm_loadh_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadhps(a, (__v2si *)p);
}

static inline __m128 __attribute__((__always_inline__))
_mm_loadl_pi(__m128 a, __m64 const *p)
{
  return __builtin_ia32_loadlps(a, (__v2si *)p);
}

static inline __m128 __attribute__((__always_inline__))
_mm_load_ss(float *p)
{
  return (__m128){ *p, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__))
_mm_load1_ps(float *p)
{
  return (__m128){ *p, *p, *p, *p };
}

static inline __m128 __attribute__((__always_inline__))
_mm_load_ps(float *p)
{
  return *(__m128*)p;
}

static inline __m128 __attribute__((__always_inline__))
_mm_loadu_ps(float *p)
{
  return __builtin_ia32_loadups(p);
}

static inline __m128 __attribute__((__always_inline__))
_mm_loadr_ps(float *p)
{
  __m128 a = _mm_load_ps(p);
  return __builtin_shufflevector(a, a, 3, 2, 1, 0);
}

static inline __m128 __attribute__((__always_inline__))
_mm_set_ss(float w)
{
  return (__m128){ w, 0, 0, 0 };
}

static inline __m128 __attribute__((__always_inline__))
_mm_set1_ps(float w)
{
  return (__m128){ w, w, w, w };
}

// Microsoft specific.
static inline __m128 __attribute__((__always_inline__))
_mm_set_ps1(float w)
{
  return _mm_set1_ps(w);
}

static inline __m128 __attribute__((__always_inline__))
_mm_set_ps(float z, float y, float x, float w)
{
  return (__m128){ w, x, y, z };
}

static inline __m128 __attribute__((__always_inline__))
_mm_setr_ps(float z, float y, float x, float w)
{
  return (__m128){ z, y, x, w };
}

static inline __m128 __attribute__((__always_inline__))
_mm_setzero_ps(void)
{
  return (__m128){ 0, 0, 0, 0 };
}

static inline void __attribute__((__always_inline__))
_mm_storeh_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storehps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__))
_mm_storel_pi(__m64 *p, __m128 a)
{
  __builtin_ia32_storelps((__v2si *)p, a);
}

static inline void __attribute__((__always_inline__))
_mm_store_ss(float *p, __m128 a)
{
  *p = a[0];
}

static inline void __attribute__((__always_inline__))
_mm_storeu_ps(float *p, __m128 a)
{
  __builtin_ia32_storeups(p, a);
}

static inline void __attribute__((__always_inline__))
_mm_store1_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 0, 0, 0, 0);
  _mm_storeu_ps(p, a);
}

static inline void __attribute__((__always_inline__))
_mm_store_ps(float *p, __m128 a)
{
  *(__m128 *)p = a;
}

static inline void __attribute__((__always_inline__))
_mm_storer_ps(float *p, __m128 a)
{
  a = __builtin_shufflevector(a, a, 3, 2, 1, 0);
  _mm_store_ps(p, a);
}

#define _MM_HINT_T0 1
#define _MM_HINT_T1 2
#define _MM_HINT_T2 3
#define _MM_HINT_NTA 0

/* FIXME: We have to #define this because "sel" must be a constant integer, and
   Sema doesn't do any form of constant propagation yet. */

#define _mm_prefetch(a, sel) (__builtin_prefetch((void *)a, 0, sel))
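
/* Illustrative usage ("p" below is just an example pointer): the second
   argument must be one of the _MM_HINT_* constants above, e.g.
     _mm_prefetch(p + 16, _MM_HINT_T0);
   hints that the cache line holding p[16] will be needed soon. */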

static inline void __attribute__((__always_inline__))
_mm_stream_pi(__m64 *p, __m64 a)
{
  __builtin_ia32_movntq(p, a);
}

static inline void __attribute__((__always_inline__))
_mm_stream_ps(float *p, __m128 a)
{
  __builtin_ia32_movntps(p, a);
}

static inline void __attribute__((__always_inline__))
_mm_sfence(void)
{
  __builtin_ia32_sfence();
}

static inline int __attribute__((__always_inline__))
_mm_extract_pi16(__m64 a, int n)
{
  /* FIXME:
   * This should force n to be an immediate.
   * This does not use the PEXTRW instruction. From looking at the LLVM source,
   * the instruction doesn't seem to be hooked up.
   * The code could probably be made better :)
   */
  __v4hi b = (__v4hi)a;
  return b[(n == 0) ? 0 : (n == 1 ? 1 : (n == 2 ? 2 : 3))];
}

/* FIXME: Implement this. We could add a __builtin_insertelement function that's similar to
   the already existing __builtin_shufflevector.
*/
/*
static inline __m64 __attribute__((__always_inline__))
_mm_insert_pi16(__m64 a, int d, int n)
{
  return (__m64){ 0LL };
}
*/

static inline __m64 __attribute__((__always_inline__))
_mm_max_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__))
_mm_max_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmaxub((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__))
_mm_min_pi16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminsw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__))
_mm_min_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pminub((__v8qi)a, (__v8qi)b);
}

static inline int __attribute__((__always_inline__))
_mm_movemask_pi8(__m64 a)
{
  return __builtin_ia32_pmovmskb((__v8qi)a);
}

static inline __m64 __attribute__((__always_inline__))
_mm_mulhi_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pmulhuw((__v4hi)a, (__v4hi)b);
}

#define _mm_shuffle_pi16(a, n) ((__m64)__builtin_ia32_pshufw((__v4hi)a, n))

static inline void __attribute__((__always_inline__))
_mm_maskmove_si64(__m64 d, __m64 n, char *p)
{
  __builtin_ia32_maskmovq((__v8qi)d, (__v8qi)n, p);
}

static inline __m64 __attribute__((__always_inline__))
_mm_avg_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgb((__v8qi)a, (__v8qi)b);
}

static inline __m64 __attribute__((__always_inline__))
_mm_avg_pu16(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_pavgw((__v4hi)a, (__v4hi)b);
}

static inline __m64 __attribute__((__always_inline__))
_mm_sad_pu8(__m64 a, __m64 b)
{
  return (__m64)__builtin_ia32_psadbw((__v8qi)a, (__v8qi)b);
}

static inline unsigned int __attribute__((__always_inline__))
_mm_getcsr(void)
{
  return __builtin_ia32_stmxcsr();
}

static inline void __attribute__((__always_inline__))
_mm_setcsr(unsigned int i)
{
  __builtin_ia32_ldmxcsr(i);
}

#define _mm_shuffle_ps(a, b, mask) (__builtin_ia32_shufps(a, b, mask))

static inline __m128 __attribute__((__always_inline__))
_mm_unpackhi_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 2, 6, 3, 7);
}

static inline __m128 __attribute__((__always_inline__))
_mm_unpacklo_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 4, 1, 5);
}

static inline __m128 __attribute__((__always_inline__))
_mm_move_ss(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 4, 1, 2, 3);
}

static inline __m128 __attribute__((__always_inline__))
_mm_movehl_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 6, 7, 2, 3);
}

static inline __m128 __attribute__((__always_inline__))
_mm_movelh_ps(__m128 a, __m128 b)
{
  return __builtin_shufflevector(a, b, 0, 1, 4, 5);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cvtpi16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi16(b, a);
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__))
_mm_cvtpu16_ps(__m64 a)
{
  __m64 b, c;
  __m128 r;

  b = _mm_setzero_si64();
  c = _mm_unpackhi_pi16(a, b);
  r = _mm_setzero_ps();
  r = _mm_cvtpi32_ps(r, c);
  r = _mm_movelh_ps(r, r);
  c = _mm_unpacklo_pi16(a, b);
  r = _mm_cvtpi32_ps(r, c);

  return r;
}

static inline __m128 __attribute__((__always_inline__))
_mm_cvtpi8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_cmpgt_pi8(b, a);
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cvtpu8_ps(__m64 a)
{
  __m64 b;

  b = _mm_setzero_si64();
  b = _mm_unpacklo_pi8(a, b);

  return _mm_cvtpi16_ps(b);
}

static inline __m128 __attribute__((__always_inline__))
_mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
  __m128 c;

  c = _mm_setzero_ps();
  c = _mm_cvtpi32_ps(c, b);
  c = _mm_movelh_ps(c, c);

  return _mm_cvtpi32_ps(c, a);
}

static inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi16(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi32(a);
  a = _mm_movehl_ps(a, a);
  c = _mm_cvtps_pi32(a);

  return _mm_packs_pi32(b, c);
}

static inline __m64 __attribute__((__always_inline__))
_mm_cvtps_pi8(__m128 a)
{
  __m64 b, c;

  b = _mm_cvtps_pi16(a);
  c = _mm_setzero_si64();

  return _mm_packs_pi16(b, c);
}

static inline int __attribute__((__always_inline__))
_mm_movemask_ps(__m128 a)
{
  return __builtin_ia32_movmskps(a);
}

#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
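
/* Illustrative example: _MM_SHUFFLE packs four 2-bit element selectors into
   the immediate expected by _mm_shuffle_ps and _mm_shuffle_pi16.
   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4, so
     _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0))
   yields { a[0], a[1], b[2], b[3] } for any two __m128 values a and b. */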

#define _MM_EXCEPT_INVALID (0x0001)
#define _MM_EXCEPT_DENORM (0x0002)
#define _MM_EXCEPT_DIV_ZERO (0x0004)
#define _MM_EXCEPT_OVERFLOW (0x0008)
#define _MM_EXCEPT_UNDERFLOW (0x0010)
#define _MM_EXCEPT_INEXACT (0x0020)
#define _MM_EXCEPT_MASK (0x003f)

#define _MM_MASK_INVALID (0x0080)
#define _MM_MASK_DENORM (0x0100)
#define _MM_MASK_DIV_ZERO (0x0200)
#define _MM_MASK_OVERFLOW (0x0400)
#define _MM_MASK_UNDERFLOW (0x0800)
#define _MM_MASK_INEXACT (0x1000)
#define _MM_MASK_MASK (0x1f80)

#define _MM_ROUND_NEAREST (0x0000)
#define _MM_ROUND_DOWN (0x2000)
#define _MM_ROUND_UP (0x4000)
#define _MM_ROUND_TOWARD_ZERO (0x6000)
#define _MM_ROUND_MASK (0x6000)

#define _MM_FLUSH_ZERO_MASK (0x8000)
#define _MM_FLUSH_ZERO_ON (0x8000)
#define _MM_FLUSH_ZERO_OFF (0x0000)

#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK)
#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK)
#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK)
#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK)

#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x)))
#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x)))
#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x)))
#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x)))
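
/* Illustrative example: switch SSE rounding to truncation and restore the
   caller's control/status register afterwards ("csr" is just a local name):
     unsigned int csr = _mm_getcsr();
     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
     ...
     _mm_setcsr(csr);  */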

#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
do { \
  __m128 tmp3, tmp2, tmp1, tmp0; \
  tmp0 = _mm_unpacklo_ps((row0), (row1)); \
  tmp2 = _mm_unpacklo_ps((row2), (row3)); \
  tmp1 = _mm_unpackhi_ps((row0), (row1)); \
  tmp3 = _mm_unpackhi_ps((row2), (row3)); \
  (row0) = _mm_movelh_ps(tmp0, tmp2); \
  (row1) = _mm_movehl_ps(tmp2, tmp0); \
  (row2) = _mm_movelh_ps(tmp1, tmp3); \
  (row3) = _mm_movehl_ps(tmp3, tmp1); \
} while (0)
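
/* Illustrative example: with the four rows of a 4x4 float matrix held in
   __m128 variables r0..r3 (one row per register),
     _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
   transposes them in place, so r0 ends up holding the original first column. */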

#include <emmintrin.h>

#endif /* __SSE__ */

#endif /* __XMMINTRIN_H */