/*===---- emmintrin.h - Implementation of SSE2 intrinsics on PowerPC -------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is intended to help port code that uses Intel intrinsics
   explicitly from x86_64 to powerpc64/powerpc64le.

   Since the X86 SSE2 intrinsics mainly handle the __m128i and __m128d
   types, the PowerPC VMX/VSX ISA is a good match for vector floating-point
   SIMD operations.  However, scalar floating-point operations in vector
   (XMM) registers require the POWER8 VSX ISA (2.07) level.  There are
   differences in the data format and placement of floating-point scalars
   in the vector register, which require extra steps to match SSE2 scalar
   floating-point semantics on POWER.

   It should be noted that there are significant differences between the
   x86_64 MXCSR and the PowerISA FPSCR/VSCR registers.  It is recommended
   to use the portable <fenv.h> interface instead of accessing the MXCSR
   directly.

   Most SSE2 scalar floating-point intrinsic operations can be performed
   more efficiently as C language scalar operations or optimized to use
   vector SIMD operations.  We recommend this for new applications.  */
#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
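
/* As an illustration of the <fenv.h> recommendation above, code that
   manipulates the MXCSR rounding-control bits directly can usually be
   rewritten with the portable C99 interface, which maps onto the PowerISA
   FPSCR on this target:

     #include <fenv.h>

     int save = fegetround ();       // remember the caller's rounding mode
     fesetround (FE_TOWARDZERO);     // e.g. what _MM_ROUND_TOWARD_ZERO selects
     // ... floating-point work that depends on the rounding mode ...
     fesetround (save);              // restore it
*/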

#ifndef EMMINTRIN_H_
#define EMMINTRIN_H_

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <xmmintrin.h>

/* SSE2 */
typedef __vector double __v2df;
typedef __vector long long __v2di;
typedef __vector unsigned long long __v2du;
typedef __vector int __v4si;
typedef __vector unsigned int __v4su;
typedef __vector short __v8hi;
typedef __vector unsigned short __v8hu;
typedef __vector signed char __v16qi;
typedef __vector unsigned char __v16qu;

/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components.  */
typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));

/* Unaligned versions of the same types.  */
typedef long long __m128i_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));
typedef double __m128d_u __attribute__ ((__vector_size__ (16), __may_alias__, __aligned__ (1)));

/* Define a two-value permute mask.  */
#define _MM_SHUFFLE2(x,y) (((x) << 1) | (y))

66/* Create a vector with element 0 as F and the rest zero. */
67extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
68_mm_set_sd (double __F)
69{
70 return __extension__ (__m128d){ __F, 0.0 };
71}
72
73/* Create a vector with both elements equal to F. */
74extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75_mm_set1_pd (double __F)
76{
77 return __extension__ (__m128d){ __F, __F };
78}
79
80extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
81_mm_set_pd1 (double __F)
82{
83 return _mm_set1_pd (__F);
84}
85
86/* Create a vector with the lower value X and upper value W. */
87extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
88_mm_set_pd (double __W, double __X)
89{
90 return __extension__ (__m128d){ __X, __W };
91}
92
93/* Create a vector with the lower value W and upper value X. */
94extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
95_mm_setr_pd (double __W, double __X)
96{
97 return __extension__ (__m128d){ __W, __X };
98}
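
/* Note the argument order: _mm_set_pd takes the high element first, while
   _mm_setr_pd takes the elements in memory (element-index) order.  For
   example:

     __m128d a = _mm_set_pd  (9.0, 1.0);   // a[0] == 1.0, a[1] == 9.0
     __m128d b = _mm_setr_pd (1.0, 9.0);   // b[0] == 1.0, b[1] == 9.0

   Both produce the same vector.  */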
99
100/* Create an undefined vector. */
101extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
102_mm_undefined_pd (void)
103{
104 __m128d __Y = __Y;
105 return __Y;
106}
107
108/* Create a vector of zeros. */
109extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
110_mm_setzero_pd (void)
111{
112 return (__m128d) vec_splats (0);
113}
114
115/* Sets the low DPFP value of A from the low value of B. */
116extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
117_mm_move_sd (__m128d __A, __m128d __B)
118{
119 __v2df result = (__v2df) __A;
120 result [0] = ((__v2df) __B)[0];
121 return (__m128d) result;
122}
123
124/* Load two DPFP values from P. The address must be 16-byte aligned. */
125extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
126_mm_load_pd (double const *__P)
127{
128 return ((__m128d)vec_ld(0, (__v16qu*)__P));
129}
130
131/* Load two DPFP values from P. The address need not be 16-byte aligned. */
132extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
133_mm_loadu_pd (double const *__P)
134{
135 return (vec_vsx_ld(0, __P));
136}
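
/* A note on the difference between the two loads: the vec_ld used by
   _mm_load_pd ignores the low-order four bits of the address, so a
   misaligned pointer silently loads the containing 16-byte block instead
   of faulting, while the VSX load used by _mm_loadu_pd handles any
   alignment.  For data not known to be 16-byte aligned, prefer the
   unaligned form:

     double buf[3] = { 1.0, 2.0, 3.0 };
     __m128d v = _mm_loadu_pd (&buf[1]);   // safe for any alignment
*/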
137
/* Create a vector with both elements equal to *P.  */
139extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
140_mm_load1_pd (double const *__P)
141{
142 return (vec_splats (*__P));
143}
144
145/* Create a vector with element 0 as *P and the rest zero. */
146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
147_mm_load_sd (double const *__P)
148{
149 return _mm_set_sd (*__P);
150}
151
152extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153_mm_load_pd1 (double const *__P)
154{
155 return _mm_load1_pd (__P);
156}
157
158/* Load two DPFP values in reverse order. The address must be aligned. */
159extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160_mm_loadr_pd (double const *__P)
161{
162 __v2df __tmp = _mm_load_pd (__P);
163 return (__m128d)vec_xxpermdi (__tmp, __tmp, 2);
164}
165
166/* Store two DPFP values. The address must be 16-byte aligned. */
167extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
168_mm_store_pd (double *__P, __m128d __A)
169{
170 vec_st((__v16qu)__A, 0, (__v16qu*)__P);
171}
172
173/* Store two DPFP values. The address need not be 16-byte aligned. */
174extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
175_mm_storeu_pd (double *__P, __m128d __A)
176{
177 *(__m128d_u *)__P = __A;
178}
179
180/* Stores the lower DPFP value. */
181extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
182_mm_store_sd (double *__P, __m128d __A)
183{
184 *__P = ((__v2df)__A)[0];
185}
186
187extern __inline double __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188_mm_cvtsd_f64 (__m128d __A)
189{
190 return ((__v2df)__A)[0];
191}
192
193extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
194_mm_storel_pd (double *__P, __m128d __A)
195{
196 _mm_store_sd (__P, __A);
197}
198
199/* Stores the upper DPFP value. */
200extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201_mm_storeh_pd (double *__P, __m128d __A)
202{
203 *__P = ((__v2df)__A)[1];
}

/* Store the lower DPFP value across both elements of the vector.
   The address must be 16-byte aligned.  */
207extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
208_mm_store1_pd (double *__P, __m128d __A)
209{
210 _mm_store_pd (__P, vec_splat (__A, 0));
211}
212
213extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
214_mm_store_pd1 (double *__P, __m128d __A)
215{
216 _mm_store1_pd (__P, __A);
217}
218
219/* Store two DPFP values in reverse order. The address must be aligned. */
220extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221_mm_storer_pd (double *__P, __m128d __A)
222{
223 _mm_store_pd (__P, vec_xxpermdi (__A, __A, 2));
224}
225
226/* Intel intrinsic. */
227extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228_mm_cvtsi128_si64 (__m128i __A)
229{
230 return ((__v2di)__A)[0];
231}
232
233/* Microsoft intrinsic. */
234extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
235_mm_cvtsi128_si64x (__m128i __A)
236{
237 return ((__v2di)__A)[0];
238}
239
240extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
241_mm_add_pd (__m128d __A, __m128d __B)
242{
243 return (__m128d) ((__v2df)__A + (__v2df)__B);
244}
245
246/* Add the lower double-precision (64-bit) floating-point element in
247 a and b, store the result in the lower element of dst, and copy
248 the upper element from a to the upper element of dst. */
249extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
250_mm_add_sd (__m128d __A, __m128d __B)
251{
252 __A[0] = __A[0] + __B[0];
253 return (__A);
254}
255
256extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257_mm_sub_pd (__m128d __A, __m128d __B)
258{
259 return (__m128d) ((__v2df)__A - (__v2df)__B);
260}
261
262extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
263_mm_sub_sd (__m128d __A, __m128d __B)
264{
265 __A[0] = __A[0] - __B[0];
266 return (__A);
267}
268
269extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
270_mm_mul_pd (__m128d __A, __m128d __B)
271{
272 return (__m128d) ((__v2df)__A * (__v2df)__B);
273}
274
275extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
276_mm_mul_sd (__m128d __A, __m128d __B)
277{
278 __A[0] = __A[0] * __B[0];
279 return (__A);
280}
281
282extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
283_mm_div_pd (__m128d __A, __m128d __B)
284{
285 return (__m128d) ((__v2df)__A / (__v2df)__B);
286}
287
288extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
289_mm_div_sd (__m128d __A, __m128d __B)
290{
291 __A[0] = __A[0] / __B[0];
292 return (__A);
293}
294
295extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
296_mm_sqrt_pd (__m128d __A)
297{
298 return (vec_sqrt (__A));
299}
300
301/* Return pair {sqrt (B[0]), A[1]}. */
302extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
303_mm_sqrt_sd (__m128d __A, __m128d __B)
304{
305 __v2df c;
306 c = vec_sqrt ((__v2df) _mm_set1_pd (__B[0]));
307 return (__m128d) _mm_setr_pd (c[0], __A[1]);
308}
309
310extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
311_mm_min_pd (__m128d __A, __m128d __B)
312{
313 return (vec_min (__A, __B));
314}
315
316extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
317_mm_min_sd (__m128d __A, __m128d __B)
318{
319 __v2df a, b, c;
320 a = vec_splats (__A[0]);
321 b = vec_splats (__B[0]);
322 c = vec_min (a, b);
323 return (__m128d) _mm_setr_pd (c[0], __A[1]);
324}
325
326extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
327_mm_max_pd (__m128d __A, __m128d __B)
328{
329 return (vec_max (__A, __B));
330}
331
332extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
333_mm_max_sd (__m128d __A, __m128d __B)
334{
335 __v2df a, b, c;
336 a = vec_splats (__A[0]);
337 b = vec_splats (__B[0]);
338 c = vec_max (a, b);
339 return (__m128d) _mm_setr_pd (c[0], __A[1]);
340}
341
342extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
343_mm_cmpeq_pd (__m128d __A, __m128d __B)
344{
345 return ((__m128d)vec_cmpeq ((__v2df) __A, (__v2df) __B));
346}
347
348extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
349_mm_cmplt_pd (__m128d __A, __m128d __B)
350{
351 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
352}
353
354extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
355_mm_cmple_pd (__m128d __A, __m128d __B)
356{
357 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
358}
359
360extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361_mm_cmpgt_pd (__m128d __A, __m128d __B)
362{
363 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
364}
365
366extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
367_mm_cmpge_pd (__m128d __A, __m128d __B)
368{
369 return ((__m128d)vec_cmpge ((__v2df) __A,(__v2df) __B));
370}
371
372extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
373_mm_cmpneq_pd (__m128d __A, __m128d __B)
374{
375 __v2df temp = (__v2df) vec_cmpeq ((__v2df) __A, (__v2df)__B);
376 return ((__m128d)vec_nor (temp, temp));
377}
378
379extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
380_mm_cmpnlt_pd (__m128d __A, __m128d __B)
381{
382 return ((__m128d)vec_cmpge ((__v2df) __A, (__v2df) __B));
383}
384
385extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
386_mm_cmpnle_pd (__m128d __A, __m128d __B)
387{
388 return ((__m128d)vec_cmpgt ((__v2df) __A, (__v2df) __B));
389}
390
391extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
392_mm_cmpngt_pd (__m128d __A, __m128d __B)
393{
394 return ((__m128d)vec_cmple ((__v2df) __A, (__v2df) __B));
395}
396
397extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
398_mm_cmpnge_pd (__m128d __A, __m128d __B)
399{
400 return ((__m128d)vec_cmplt ((__v2df) __A, (__v2df) __B));
401}
402
403extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
404_mm_cmpord_pd (__m128d __A, __m128d __B)
405{
406#if _ARCH_PWR8
407 __v2du c, d;
  /* Comparing a value to itself returns false (0's) if it is a NaN.  */
409 c = (__v2du)vec_cmpeq (__A, __A);
410 d = (__v2du)vec_cmpeq (__B, __B);
411#else
412 __v2du a, b;
413 __v2du c, d;
414 const __v2du double_exp_mask = {0x7ff0000000000000, 0x7ff0000000000000};
415 a = (__v2du)vec_abs ((__v2df)__A);
416 b = (__v2du)vec_abs ((__v2df)__B);
417 c = (__v2du)vec_cmpgt (double_exp_mask, a);
418 d = (__v2du)vec_cmpgt (double_exp_mask, b);
419#endif
420 /* A != NAN and B != NAN. */
421 return ((__m128d)vec_and(c, d));
422}
423
extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpunord_pd (__m128d __A, __m128d __B)
{
#if _ARCH_PWR8
  __v2du c, d;
  /* Comparing a value to itself returns false (0's) if it is a NaN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* A == NaN OR B == NaN converts to:
     NOT(A != NaN) OR NOT(B != NaN).  */
  c = vec_nor (c, c);
  return ((__m128d)vec_orc(c, d));
#else
  __v2du c, d;
  /* Comparing a value to itself returns false (0's) if it is a NaN.  */
  c = (__v2du)vec_cmpeq ((__v2df)__A, (__v2df)__A);
  d = (__v2du)vec_cmpeq ((__v2df)__B, (__v2df)__B);
  /* Invert so that true ('1's) marks the NaN operands.  */
  c = vec_nor (c, c);
  d = vec_nor (d, d);
  return ((__m128d)vec_or(c, d));
#endif
}
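
/* Example use of the ordered/unordered compares: the all-ones/all-zeros
   masks they return can be combined with the logical operations defined
   later in this header, e.g. to clear any lane that is a NaN:

     __m128d clean = _mm_and_pd (x, _mm_cmpord_pd (x, x));

   _mm_cmpord_pd (x, x) is all-ones in lanes where x is not a NaN and
   all-zeros where it is, so the AND keeps the numeric values and zeros
   the NaN lanes.  */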
447
448extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449_mm_cmpeq_sd(__m128d __A, __m128d __B)
450{
  __v2df a, b, c;
  /* PowerISA VSX does not allow partial (for just the lower double)
     results.  So to ensure we don't generate spurious exceptions
     (from the upper double values) we splat the lower double
     before we do the operation.  */
  a = vec_splats (__A[0]);
  b = vec_splats (__B[0]);
  c = (__v2df) vec_cmpeq(a, b);
  /* Then we merge the lower double result with the original upper
     double from __A.  */
  return (__m128d) _mm_setr_pd (c[0], __A[1]);
}
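
/* For example:

     __m128d a = _mm_setr_pd (1.0, 5.0);
     __m128d b = _mm_setr_pd (1.0, 9.0);
     __m128d m = _mm_cmpeq_sd (a, b);
     // m[0] is the all-ones mask (element 0 compared equal),
     // m[1] is 5.0, passed through unchanged from a.

   Only element [0] carries the compare result; element [1] always comes
   from the first operand, matching the SSE2 *_sd semantics.  */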
463
464extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
465_mm_cmplt_sd (__m128d __A, __m128d __B)
466{
467 __v2df a, b, c;
468 a = vec_splats (__A[0]);
469 b = vec_splats (__B[0]);
470 c = (__v2df) vec_cmplt(a, b);
471 return (__m128d) _mm_setr_pd (c[0], __A[1]);
472}
473
474extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
475_mm_cmple_sd (__m128d __A, __m128d __B)
476{
477 __v2df a, b, c;
478 a = vec_splats (__A[0]);
479 b = vec_splats (__B[0]);
480 c = (__v2df) vec_cmple(a, b);
481 return (__m128d) _mm_setr_pd (c[0], __A[1]);
482}
483
484extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
485_mm_cmpgt_sd (__m128d __A, __m128d __B)
486{
487 __v2df a, b, c;
488 a = vec_splats (__A[0]);
489 b = vec_splats (__B[0]);
490 c = (__v2df) vec_cmpgt(a, b);
491 return (__m128d) _mm_setr_pd (c[0], __A[1]);
492}
493
494extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495_mm_cmpge_sd (__m128d __A, __m128d __B)
496{
497 __v2df a, b, c;
498 a = vec_splats (__A[0]);
499 b = vec_splats (__B[0]);
500 c = (__v2df) vec_cmpge(a, b);
501 return (__m128d) _mm_setr_pd (c[0], __A[1]);
502}
503
504extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
505_mm_cmpneq_sd (__m128d __A, __m128d __B)
506{
507 __v2df a, b, c;
508 a = vec_splats (__A[0]);
509 b = vec_splats (__B[0]);
510 c = (__v2df) vec_cmpeq(a, b);
511 c = vec_nor (c, c);
512 return (__m128d) _mm_setr_pd (c[0], __A[1]);
513}
514
515extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
516_mm_cmpnlt_sd (__m128d __A, __m128d __B)
517{
518 __v2df a, b, c;
519 a = vec_splats (__A[0]);
520 b = vec_splats (__B[0]);
521 /* Not less than is just greater than or equal. */
522 c = (__v2df) vec_cmpge(a, b);
523 return (__m128d) _mm_setr_pd (c[0], __A[1]);
524}
525
526extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
527_mm_cmpnle_sd (__m128d __A, __m128d __B)
528{
529 __v2df a, b, c;
530 a = vec_splats (__A[0]);
531 b = vec_splats (__B[0]);
  /* Not less than or equal is just greater than.  */
  c = (__v2df) vec_cmpgt(a, b);
534 return (__m128d) _mm_setr_pd (c[0], __A[1]);
535}
536
537extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538_mm_cmpngt_sd (__m128d __A, __m128d __B)
539{
540 __v2df a, b, c;
541 a = vec_splats (__A[0]);
542 b = vec_splats (__B[0]);
543 /* Not greater than is just less than or equal. */
544 c = (__v2df) vec_cmple(a, b);
545 return (__m128d) _mm_setr_pd (c[0], __A[1]);
546}
547
548extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549_mm_cmpnge_sd (__m128d __A, __m128d __B)
550{
551 __v2df a, b, c;
552 a = vec_splats (__A[0]);
553 b = vec_splats (__B[0]);
554 /* Not greater than or equal is just less than. */
555 c = (__v2df) vec_cmplt(a, b);
556 return (__m128d) _mm_setr_pd (c[0], __A[1]);
557}
558
559extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
560_mm_cmpord_sd (__m128d __A, __m128d __B)
561{
562 __v2df r;
563 r = (__v2df)_mm_cmpord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
564 return (__m128d) _mm_setr_pd (r[0], ((__v2df)__A)[1]);
565}
566
567extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
568_mm_cmpunord_sd (__m128d __A, __m128d __B)
569{
570 __v2df r;
571 r = _mm_cmpunord_pd (vec_splats (__A[0]), vec_splats (__B[0]));
572 return (__m128d) _mm_setr_pd (r[0], __A[1]);
573}
574
/* FIXME
   The _mm_comi??_sd and _mm_ucomi??_sd implementations below are
   exactly the same because GCC for PowerPC only generates unordered
   compares (scalar and vector).
   Technically _mm_comieq_sd et al. should be using the ordered
   compare and signal for QNaNs.  The _mm_ucomieq_sd et al. should
   be OK.  */
582extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
583_mm_comieq_sd (__m128d __A, __m128d __B)
584{
585 return (__A[0] == __B[0]);
586}
587
588extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589_mm_comilt_sd (__m128d __A, __m128d __B)
590{
591 return (__A[0] < __B[0]);
592}
593
594extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
595_mm_comile_sd (__m128d __A, __m128d __B)
596{
597 return (__A[0] <= __B[0]);
598}
599
600extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
601_mm_comigt_sd (__m128d __A, __m128d __B)
602{
603 return (__A[0] > __B[0]);
604}
605
606extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607_mm_comige_sd (__m128d __A, __m128d __B)
608{
609 return (__A[0] >= __B[0]);
610}
611
612extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
613_mm_comineq_sd (__m128d __A, __m128d __B)
614{
615 return (__A[0] != __B[0]);
616}
617
618extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
619_mm_ucomieq_sd (__m128d __A, __m128d __B)
620{
621 return (__A[0] == __B[0]);
622}
623
624extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625_mm_ucomilt_sd (__m128d __A, __m128d __B)
626{
627 return (__A[0] < __B[0]);
628}
629
630extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
631_mm_ucomile_sd (__m128d __A, __m128d __B)
632{
633 return (__A[0] <= __B[0]);
634}
635
636extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
637_mm_ucomigt_sd (__m128d __A, __m128d __B)
638{
639 return (__A[0] > __B[0]);
640}
641
642extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643_mm_ucomige_sd (__m128d __A, __m128d __B)
644{
645 return (__A[0] >= __B[0]);
646}
647
648extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
649_mm_ucomineq_sd (__m128d __A, __m128d __B)
650{
651 return (__A[0] != __B[0]);
652}
653
654/* Create a vector of Qi, where i is the element number. */
655extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
656_mm_set_epi64x (long long __q1, long long __q0)
657{
658 return __extension__ (__m128i)(__v2di){ __q0, __q1 };
659}
660
661extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662_mm_set_epi64 (__m64 __q1, __m64 __q0)
663{
664 return _mm_set_epi64x ((long long)__q1, (long long)__q0);
665}
666
667extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
669{
670 return __extension__ (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
671}
672
673extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
674_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
675 short __q3, short __q2, short __q1, short __q0)
676{
677 return __extension__ (__m128i)(__v8hi){
678 __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
679}
680
681extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
682_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
683 char __q11, char __q10, char __q09, char __q08,
684 char __q07, char __q06, char __q05, char __q04,
685 char __q03, char __q02, char __q01, char __q00)
686{
687 return __extension__ (__m128i)(__v16qi){
688 __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
689 __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
690 };
691}
692
693/* Set all of the elements of the vector to A. */
694extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
695_mm_set1_epi64x (long long __A)
696{
697 return _mm_set_epi64x (__A, __A);
698}
699
700extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701_mm_set1_epi64 (__m64 __A)
702{
703 return _mm_set_epi64 (__A, __A);
704}
705
706extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
707_mm_set1_epi32 (int __A)
708{
709 return _mm_set_epi32 (__A, __A, __A, __A);
710}
711
712extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
713_mm_set1_epi16 (short __A)
714{
715 return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
716}
717
718extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
719_mm_set1_epi8 (char __A)
720{
721 return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
722 __A, __A, __A, __A, __A, __A, __A, __A);
723}
724
725/* Create a vector of Qi, where i is the element number.
726 The parameter order is reversed from the _mm_set_epi* functions. */
727extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
728_mm_setr_epi64 (__m64 __q0, __m64 __q1)
729{
730 return _mm_set_epi64 (__q1, __q0);
731}
732
733extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
735{
736 return _mm_set_epi32 (__q3, __q2, __q1, __q0);
737}
738
739extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
740_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
741 short __q4, short __q5, short __q6, short __q7)
742{
743 return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
744}
745
746extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
747_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
748 char __q04, char __q05, char __q06, char __q07,
749 char __q08, char __q09, char __q10, char __q11,
750 char __q12, char __q13, char __q14, char __q15)
751{
752 return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
753 __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
754}
755
/* Load 128 bits of integer data.  The address must be 16-byte aligned.  */
757extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
758_mm_load_si128 (__m128i const *__P)
759{
760 return *__P;
761}
762
763extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
764_mm_loadu_si128 (__m128i_u const *__P)
765{
766 return (__m128i) (vec_vsx_ld(0, (signed int const *)__P));
767}
768
769extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770_mm_loadl_epi64 (__m128i_u const *__P)
771{
772 return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
773}
774
775extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
776_mm_store_si128 (__m128i *__P, __m128i __B)
777{
778 vec_st ((__v16qu) __B, 0, (__v16qu*)__P);
779}
780
781extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
782_mm_storeu_si128 (__m128i_u *__P, __m128i __B)
783{
784 *__P = __B;
785}
786
787extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
788_mm_storel_epi64 (__m128i_u *__P, __m128i __B)
789{
790 *(long long *)__P = ((__v2di)__B)[0];
791}
792
793extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794_mm_movepi64_pi64 (__m128i_u __B)
795{
796 return (__m64) ((__v2di)__B)[0];
797}
798
799extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800_mm_movpi64_epi64 (__m64 __A)
801{
802 return _mm_set_epi64 ((__m64)0LL, __A);
803}
804
805extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806_mm_move_epi64 (__m128i __A)
807{
808 return _mm_set_epi64 ((__m64)0LL, (__m64)__A[0]);
809}
810
811/* Create an undefined vector. */
812extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
813_mm_undefined_si128 (void)
814{
815 __m128i __Y = __Y;
816 return __Y;
817}
818
819/* Create a vector of zeros. */
820extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
821_mm_setzero_si128 (void)
822{
823 return __extension__ (__m128i)(__v4si){ 0, 0, 0, 0 };
824}
825
826#ifdef _ARCH_PWR8
827extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
828_mm_cvtepi32_pd (__m128i __A)
829{
830 __v2di val;
  /* For LE we need Vector Unpack Low Signed Word, which is
     generated by vec_unpackh here.  */
833 val = (__v2di)vec_unpackh ((__v4si)__A);
834
835 return (__m128d)vec_ctf (val, 0);
836}
837#endif
838
839extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
840_mm_cvtepi32_ps (__m128i __A)
841{
842 return ((__m128)vec_ctf((__v4si)__A, 0));
843}
844
845extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
846_mm_cvtpd_epi32 (__m128d __A)
847{
848 __v2df rounded = vec_rint (__A);
849 __v4si result, temp;
850 const __v4si vzero =
851 { 0, 0, 0, 0 };
852
853 /* VSX Vector truncate Double-Precision to integer and Convert to
854 Signed Integer Word format with Saturate. */
855 __asm__(
856 "xvcvdpsxws %x0,%x1"
857 : "=wa" (temp)
858 : "wa" (rounded)
859 : );
860
861#ifdef _ARCH_PWR8
862 temp = vec_mergeo (temp, temp);
863 result = (__v4si) vec_vpkudum ((__vector long long) temp,
864 (__vector long long) vzero);
865#else
866 {
867 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
868 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
869 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
870 }
871#endif
872 return (__m128i) result;
873}
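
/* As with the SSE2 original, the two converted integers land in the low
   two word elements and the upper two are zeroed.  For example, in the
   default round-to-nearest-even mode:

     __m128i r = _mm_cvtpd_epi32 (_mm_setr_pd (1.5, 2.5));
     // ((__v4si)r)[0] == 2 and ((__v4si)r)[1] == 2 (both round to even),
     // ((__v4si)r)[2] == 0 and ((__v4si)r)[3] == 0
*/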
874
875extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
876_mm_cvtpd_pi32 (__m128d __A)
877{
878 __m128i result = _mm_cvtpd_epi32(__A);
879
880 return (__m64) result[0];
881}
882
883extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
884_mm_cvtpd_ps (__m128d __A)
885{
886 __v4sf result;
887 __v4si temp;
888 const __v4si vzero = { 0, 0, 0, 0 };
889
890 __asm__(
891 "xvcvdpsp %x0,%x1"
892 : "=wa" (temp)
893 : "wa" (__A)
894 : );
895
896#ifdef _ARCH_PWR8
897 temp = vec_mergeo (temp, temp);
898 result = (__v4sf) vec_vpkudum ((__vector long long) temp,
899 (__vector long long) vzero);
900#else
901 {
902 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
903 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
904 result = (__v4sf) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
905 }
906#endif
907 return ((__m128)result);
908}
909
910extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
911_mm_cvttpd_epi32 (__m128d __A)
912{
913 __v4si result;
914 __v4si temp;
915 const __v4si vzero = { 0, 0, 0, 0 };
916
917 /* VSX Vector truncate Double-Precision to integer and Convert to
918 Signed Integer Word format with Saturate. */
919 __asm__(
920 "xvcvdpsxws %x0,%x1"
921 : "=wa" (temp)
922 : "wa" (__A)
923 : );
924
925#ifdef _ARCH_PWR8
926 temp = vec_mergeo (temp, temp);
927 result = (__v4si) vec_vpkudum ((__vector long long) temp,
928 (__vector long long) vzero);
929#else
930 {
931 const __v16qu pkperm = {0x00, 0x01, 0x02, 0x03, 0x08, 0x09, 0x0a, 0x0b,
932 0x14, 0x15, 0x16, 0x17, 0x1c, 0x1d, 0x1e, 0x1f };
933 result = (__v4si) vec_perm ((__v16qu) temp, (__v16qu) vzero, pkperm);
934 }
935#endif
936
937 return ((__m128i) result);
938}
939
940extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
941_mm_cvttpd_pi32 (__m128d __A)
942{
943 __m128i result = _mm_cvttpd_epi32 (__A);
944
945 return (__m64) result[0];
946}
947
948extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
949_mm_cvtsi128_si32 (__m128i __A)
950{
951 return ((__v4si)__A)[0];
952}
953
954#ifdef _ARCH_PWR8
955extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
956_mm_cvtpi32_pd (__m64 __A)
957{
958 __v4si temp;
959 __v2di tmp2;
960 __v2df result;
961
962 temp = (__v4si)vec_splats (__A);
963 tmp2 = (__v2di)vec_unpackl (temp);
964 result = vec_ctf ((__vector signed long long) tmp2, 0);
965 return (__m128d)result;
966}
967#endif
968
969extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
970_mm_cvtps_epi32 (__m128 __A)
971{
972 __v4sf rounded;
973 __v4si result;
974
975 rounded = vec_rint((__v4sf) __A);
976 result = vec_cts (rounded, 0);
977 return (__m128i) result;
978}
979
980extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
981_mm_cvttps_epi32 (__m128 __A)
982{
983 __v4si result;
984
985 result = vec_cts ((__v4sf) __A, 0);
986 return (__m128i) result;
987}
988
989extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990_mm_cvtps_pd (__m128 __A)
991{
992 /* Check if vec_doubleh is defined by <altivec.h>. If so use that. */
993#ifdef vec_doubleh
994 return (__m128d) vec_doubleh ((__v4sf)__A);
995#else
  /* Otherwise the compiler is not current and so we need to generate
     the equivalent code.  */
998 __v4sf a = (__v4sf)__A;
999 __v4sf temp;
1000 __v2df result;
#ifdef __LITTLE_ENDIAN__
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[1], [3]}, so we use two
     shift left double vector word immediates to get the elements
     lined up.  */
  temp = __builtin_vsx_xxsldwi (a, a, 3);
  temp = __builtin_vsx_xxsldwi (a, temp, 2);
#else
  /* The input float values are in elements {[0], [1]} but the convert
     instruction needs them in elements {[0], [2]}, so we use a
     vector merge high to get the elements lined up.  */
  temp = vec_vmrghw (a, a);
#endif
1015 __asm__(
1016 " xvcvspdp %x0,%x1"
1017 : "=wa" (result)
1018 : "wa" (temp)
1019 : );
1020 return (__m128d) result;
1021#endif
1022}
1023
1024extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1025_mm_cvtsd_si32 (__m128d __A)
1026{
1027 __v2df rounded = vec_rint((__v2df) __A);
1028 int result = ((__v2df)rounded)[0];
1029
1030 return result;
}

/* Intel intrinsic.  */
1033extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034_mm_cvtsd_si64 (__m128d __A)
1035{
1036 __v2df rounded = vec_rint ((__v2df) __A );
1037 long long result = ((__v2df) rounded)[0];
1038
1039 return result;
1040}
1041
1042/* Microsoft intrinsic. */
1043extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1044_mm_cvtsd_si64x (__m128d __A)
1045{
1046 return _mm_cvtsd_si64 ((__v2df)__A);
1047}
1048
1049extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1050_mm_cvttsd_si32 (__m128d __A)
1051{
1052 int result = ((__v2df)__A)[0];
1053
1054 return result;
1055}
1056
1057/* Intel intrinsic. */
1058extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1059_mm_cvttsd_si64 (__m128d __A)
1060{
1061 long long result = ((__v2df)__A)[0];
1062
1063 return result;
1064}
1065
1066/* Microsoft intrinsic. */
1067extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1068_mm_cvttsd_si64x (__m128d __A)
1069{
1070 return _mm_cvttsd_si64 (__A);
1071}
1072
1073extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074_mm_cvtsd_ss (__m128 __A, __m128d __B)
1075{
1076 __v4sf result = (__v4sf)__A;
1077
1078#ifdef __LITTLE_ENDIAN__
1079 __v4sf temp_s;
1080 /* Copy double element[0] to element [1] for conversion. */
1081 __v2df temp_b = vec_splat((__v2df)__B, 0);
1082
1083 /* Pre-rotate __A left 3 (logically right 1) elements. */
1084 result = __builtin_vsx_xxsldwi (result, result, 3);
1085 /* Convert double to single float scalar in a vector. */
1086 __asm__(
1087 "xscvdpsp %x0,%x1"
1088 : "=wa" (temp_s)
1089 : "wa" (temp_b)
1090 : );
1091 /* Shift the resulting scalar into vector element [0]. */
1092 result = __builtin_vsx_xxsldwi (result, temp_s, 1);
1093#else
1094 result [0] = ((__v2df)__B)[0];
1095#endif
1096 return (__m128) result;
1097}
1098
1099extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100_mm_cvtsi32_sd (__m128d __A, int __B)
1101{
1102 __v2df result = (__v2df)__A;
1103 double db = __B;
1104 result [0] = db;
1105 return (__m128d)result;
1106}
1107
1108/* Intel intrinsic. */
1109extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1110_mm_cvtsi64_sd (__m128d __A, long long __B)
1111{
1112 __v2df result = (__v2df)__A;
1113 double db = __B;
1114 result [0] = db;
1115 return (__m128d)result;
1116}
1117
1118/* Microsoft intrinsic. */
1119extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1120_mm_cvtsi64x_sd (__m128d __A, long long __B)
1121{
1122 return _mm_cvtsi64_sd (__A, __B);
1123}
1124
1125extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126_mm_cvtss_sd (__m128d __A, __m128 __B)
1127{
1128#ifdef __LITTLE_ENDIAN__
1129 /* Use splat to move element [0] into position for the convert. */
1130 __v4sf temp = vec_splat ((__v4sf)__B, 0);
1131 __v2df res;
1132 /* Convert single float scalar to double in a vector. */
1133 __asm__(
1134 "xscvspdp %x0,%x1"
1135 : "=wa" (res)
1136 : "wa" (temp)
1137 : );
1138 return (__m128d) vec_mergel (res, (__v2df)__A);
1139#else
1140 __v2df res = (__v2df)__A;
1141 res [0] = ((__v4sf)__B) [0];
1142 return (__m128d) res;
1143#endif
1144}
1145
1146extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147_mm_shuffle_pd(__m128d __A, __m128d __B, const int __mask)
1148{
1149 __vector double result;
1150 const int litmsk = __mask & 0x3;
1151
1152 if (litmsk == 0)
1153 result = vec_mergeh (__A, __B);
1154#if __GNUC__ < 6
1155 else if (litmsk == 1)
1156 result = vec_xxpermdi (__B, __A, 2);
1157 else if (litmsk == 2)
1158 result = vec_xxpermdi (__B, __A, 1);
1159#else
1160 else if (litmsk == 1)
1161 result = vec_xxpermdi (__A, __B, 2);
1162 else if (litmsk == 2)
1163 result = vec_xxpermdi (__A, __B, 1);
1164#endif
1165 else
1166 result = vec_mergel (__A, __B);
1167
1168 return result;
1169}
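
/* Per the SSE2 definition, bit 0 of the mask selects element [0] of the
   result from __A and bit 1 selects element [1] from __B, so with the
   _MM_SHUFFLE2(x, y) helper defined above the result is {__A[y], __B[x]}.
   For example:

     __m128d a = _mm_setr_pd (10.0, 11.0);
     __m128d b = _mm_setr_pd (20.0, 21.0);
     __m128d r = _mm_shuffle_pd (a, b, _MM_SHUFFLE2 (0, 1));
     // r[0] == 11.0 (a[1]), r[1] == 20.0 (b[0])
*/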
1170
1171extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1172_mm_unpackhi_pd (__m128d __A, __m128d __B)
1173{
1174 return (__m128d) vec_mergel ((__v2df)__A, (__v2df)__B);
1175}
1176
1177extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1178_mm_unpacklo_pd (__m128d __A, __m128d __B)
1179{
1180 return (__m128d) vec_mergeh ((__v2df)__A, (__v2df)__B);
1181}
1182
1183extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184_mm_loadh_pd (__m128d __A, double const *__B)
1185{
1186 __v2df result = (__v2df)__A;
1187 result [1] = *__B;
1188 return (__m128d)result;
1189}
1190
1191extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1192_mm_loadl_pd (__m128d __A, double const *__B)
1193{
1194 __v2df result = (__v2df)__A;
1195 result [0] = *__B;
1196 return (__m128d)result;
1197}
1198
1199#ifdef _ARCH_PWR8
1200/* Intrinsic functions that require PowerISA 2.07 minimum. */
1201
1202/* Creates a 2-bit mask from the most significant bits of the DPFP values. */
1203extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1204_mm_movemask_pd (__m128d __A)
1205{
1206 __vector unsigned long long result;
1207 static const __vector unsigned int perm_mask =
1208 {
1209#ifdef __LITTLE_ENDIAN__
1210 0x80800040, 0x80808080, 0x80808080, 0x80808080
1211#else
1212 0x80808080, 0x80808080, 0x80808080, 0x80804000
1213#endif
1214 };
1215
1216 result = ((__vector unsigned long long)
1217 vec_vbpermq ((__vector unsigned char) __A,
1218 (__vector unsigned char) perm_mask));
1219
1220#ifdef __LITTLE_ENDIAN__
1221 return result[1];
1222#else
1223 return result[0];
1224#endif
1225}
1226#endif /* _ARCH_PWR8 */
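
/* A typical use of the movemask is to branch on the outcome of a packed
   compare, since bit i of the result is the sign bit of element i:

     __m128d a = _mm_setr_pd (4.0, 1.0);
     __m128d b = _mm_setr_pd (2.0, 3.0);
     int mask = _mm_movemask_pd (_mm_cmpgt_pd (a, b));
     // bit 0 is set (4.0 > 2.0), bit 1 is clear (1.0 < 3.0), so mask == 0x1
*/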
1227
1228extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1229_mm_packs_epi16 (__m128i __A, __m128i __B)
1230{
1231 return (__m128i) vec_packs ((__v8hi) __A, (__v8hi)__B);
1232}
1233
1234extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1235_mm_packs_epi32 (__m128i __A, __m128i __B)
1236{
1237 return (__m128i) vec_packs ((__v4si)__A, (__v4si)__B);
1238}
1239
1240extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1241_mm_packus_epi16 (__m128i __A, __m128i __B)
1242{
1243 return (__m128i) vec_packsu ((__v8hi) __A, (__v8hi)__B);
1244}
1245
1246extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1247_mm_unpackhi_epi8 (__m128i __A, __m128i __B)
1248{
1249 return (__m128i) vec_mergel ((__v16qu)__A, (__v16qu)__B);
1250}
1251
1252extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253_mm_unpackhi_epi16 (__m128i __A, __m128i __B)
1254{
1255 return (__m128i) vec_mergel ((__v8hu)__A, (__v8hu)__B);
1256}
1257
1258extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1259_mm_unpackhi_epi32 (__m128i __A, __m128i __B)
1260{
1261 return (__m128i) vec_mergel ((__v4su)__A, (__v4su)__B);
1262}
1263
1264extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1265_mm_unpackhi_epi64 (__m128i __A, __m128i __B)
1266{
1267 return (__m128i) vec_mergel ((__vector long long) __A,
1268 (__vector long long) __B);
1269}
1270
1271extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1272_mm_unpacklo_epi8 (__m128i __A, __m128i __B)
1273{
1274 return (__m128i) vec_mergeh ((__v16qu)__A, (__v16qu)__B);
1275}
1276
1277extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1278_mm_unpacklo_epi16 (__m128i __A, __m128i __B)
1279{
1280 return (__m128i) vec_mergeh ((__v8hi)__A, (__v8hi)__B);
1281}
1282
1283extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1284_mm_unpacklo_epi32 (__m128i __A, __m128i __B)
1285{
1286 return (__m128i) vec_mergeh ((__v4si)__A, (__v4si)__B);
1287}
1288
1289extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1290_mm_unpacklo_epi64 (__m128i __A, __m128i __B)
1291{
1292 return (__m128i) vec_mergeh ((__vector long long) __A,
1293 (__vector long long) __B);
1294}
1295
1296extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297_mm_add_epi8 (__m128i __A, __m128i __B)
1298{
1299 return (__m128i) ((__v16qu)__A + (__v16qu)__B);
1300}
1301
1302extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1303_mm_add_epi16 (__m128i __A, __m128i __B)
1304{
1305 return (__m128i) ((__v8hu)__A + (__v8hu)__B);
1306}
1307
1308extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1309_mm_add_epi32 (__m128i __A, __m128i __B)
1310{
1311 return (__m128i) ((__v4su)__A + (__v4su)__B);
1312}
1313
1314extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1315_mm_add_epi64 (__m128i __A, __m128i __B)
1316{
1317 return (__m128i) ((__v2du)__A + (__v2du)__B);
1318}
1319
1320extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1321_mm_adds_epi8 (__m128i __A, __m128i __B)
1322{
1323 return (__m128i) vec_adds ((__v16qi)__A, (__v16qi)__B);
1324}
1325
1326extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1327_mm_adds_epi16 (__m128i __A, __m128i __B)
1328{
1329 return (__m128i) vec_adds ((__v8hi)__A, (__v8hi)__B);
1330}
1331
1332extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1333_mm_adds_epu8 (__m128i __A, __m128i __B)
1334{
1335 return (__m128i) vec_adds ((__v16qu)__A, (__v16qu)__B);
1336}
1337
1338extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339_mm_adds_epu16 (__m128i __A, __m128i __B)
1340{
1341 return (__m128i) vec_adds ((__v8hu)__A, (__v8hu)__B);
1342}
1343
1344extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1345_mm_sub_epi8 (__m128i __A, __m128i __B)
1346{
1347 return (__m128i) ((__v16qu)__A - (__v16qu)__B);
1348}
1349
1350extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1351_mm_sub_epi16 (__m128i __A, __m128i __B)
1352{
1353 return (__m128i) ((__v8hu)__A - (__v8hu)__B);
1354}
1355
1356extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1357_mm_sub_epi32 (__m128i __A, __m128i __B)
1358{
1359 return (__m128i) ((__v4su)__A - (__v4su)__B);
1360}
1361
1362extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1363_mm_sub_epi64 (__m128i __A, __m128i __B)
1364{
1365 return (__m128i) ((__v2du)__A - (__v2du)__B);
1366}
1367
1368extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1369_mm_subs_epi8 (__m128i __A, __m128i __B)
1370{
1371 return (__m128i) vec_subs ((__v16qi)__A, (__v16qi)__B);
1372}
1373
1374extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1375_mm_subs_epi16 (__m128i __A, __m128i __B)
1376{
1377 return (__m128i) vec_subs ((__v8hi)__A, (__v8hi)__B);
1378}
1379
1380extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1381_mm_subs_epu8 (__m128i __A, __m128i __B)
1382{
1383 return (__m128i) vec_subs ((__v16qu)__A, (__v16qu)__B);
1384}
1385
1386extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387_mm_subs_epu16 (__m128i __A, __m128i __B)
1388{
1389 return (__m128i) vec_subs ((__v8hu)__A, (__v8hu)__B);
1390}
1391
1392extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1393_mm_madd_epi16 (__m128i __A, __m128i __B)
1394{
1395 __vector signed int zero = {0, 0, 0, 0};
1396
1397 return (__m128i) vec_vmsumshm ((__v8hi)__A, (__v8hi)__B, zero);
1398}
1399
1400extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1401_mm_mulhi_epi16 (__m128i __A, __m128i __B)
1402{
1403 __vector signed int w0, w1;
1404
1405 __vector unsigned char xform1 = {
1406#ifdef __LITTLE_ENDIAN__
1407 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1408 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1409#else
1410 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1411 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
1412#endif
1413 };
1414
1415 w0 = vec_vmulesh ((__v8hi)__A, (__v8hi)__B);
1416 w1 = vec_vmulosh ((__v8hi)__A, (__v8hi)__B);
1417 return (__m128i) vec_perm (w0, w1, xform1);
1418}
1419
1420extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1421_mm_mullo_epi16 (__m128i __A, __m128i __B)
1422{
1423 return (__m128i) ((__v8hi)__A * (__v8hi)__B);
1424}
1425
1426extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1427_mm_mul_su32 (__m64 __A, __m64 __B)
1428{
1429 unsigned int a = __A;
1430 unsigned int b = __B;
1431
1432 return ((__m64)a * (__m64)b);
1433}
1434
1435extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1436_mm_mul_epu32 (__m128i __A, __m128i __B)
1437{
1438#if __GNUC__ < 8
1439 __v2du result;
1440
1441#ifdef __LITTLE_ENDIAN__
1442 /* VMX Vector Multiply Odd Unsigned Word. */
1443 __asm__(
1444 "vmulouw %0,%1,%2"
1445 : "=v" (result)
1446 : "v" (__A), "v" (__B)
1447 : );
1448#else
1449 /* VMX Vector Multiply Even Unsigned Word. */
1450 __asm__(
1451 "vmuleuw %0,%1,%2"
1452 : "=v" (result)
1453 : "v" (__A), "v" (__B)
1454 : );
1455#endif
1456 return (__m128i) result;
1457#else
1458 return (__m128i) vec_mule ((__v4su)__A, (__v4su)__B);
1459#endif
1460}
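
/* Like the x86 PMULUDQ it emulates, _mm_mul_epu32 multiplies the
   even-numbered (0 and 2) unsigned word elements and produces two
   unsigned doubleword products; elements 1 and 3 are ignored:

     __m128i a = _mm_setr_epi32 (5, -1, 3, -1);
     __m128i b = _mm_setr_epi32 (11, -1, 7, -1);
     __m128i p = _mm_mul_epu32 (a, b);
     // ((__v2du)p)[0] == 55, ((__v2du)p)[1] == 21
*/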
1461
1462extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1463_mm_slli_epi16 (__m128i __A, int __B)
1464{
1465 __v8hu lshift;
1466 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1467
1468 if (__B >= 0 && __B < 16)
1469 {
1470 if (__builtin_constant_p(__B))
1471 lshift = (__v8hu) vec_splat_s16(__B);
1472 else
1473 lshift = vec_splats ((unsigned short) __B);
1474
1475 result = vec_sl ((__v8hi) __A, lshift);
1476 }
1477
1478 return (__m128i) result;
1479}
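
/* As in SSE2, a shift count outside the 0..15 range produces a vector of
   zeros rather than shifting modulo 16.  For example:

     __m128i v  = _mm_set1_epi16 (1);
     __m128i r1 = _mm_slli_epi16 (v, 3);    // every element == 8
     __m128i r2 = _mm_slli_epi16 (v, 16);   // every element == 0
*/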
1480
1481extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1482_mm_slli_epi32 (__m128i __A, int __B)
1483{
1484 __v4su lshift;
1485 __v4si result = { 0, 0, 0, 0 };
1486
1487 if (__B >= 0 && __B < 32)
1488 {
1489 if (__builtin_constant_p(__B) && __B < 16)
1490 lshift = (__v4su) vec_splat_s32(__B);
1491 else
1492 lshift = vec_splats ((unsigned int) __B);
1493
1494 result = vec_sl ((__v4si) __A, lshift);
1495 }
1496
1497 return (__m128i) result;
1498}
1499
1500#ifdef _ARCH_PWR8
1501extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1502_mm_slli_epi64 (__m128i __A, int __B)
1503{
1504 __v2du lshift;
1505 __v2di result = { 0, 0 };
1506
1507 if (__B >= 0 && __B < 64)
1508 {
1509 if (__builtin_constant_p(__B) && __B < 16)
1510 lshift = (__v2du) vec_splat_s32(__B);
1511 else
1512 lshift = (__v2du) vec_splats ((unsigned int) __B);
1513
1514 result = vec_sl ((__v2di) __A, lshift);
1515 }
1516
1517 return (__m128i) result;
1518}
1519#endif
1520
1521extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1522_mm_srai_epi16 (__m128i __A, int __B)
1523{
1524 __v8hu rshift = { 15, 15, 15, 15, 15, 15, 15, 15 };
1525 __v8hi result;
1526
1527 if (__B < 16)
1528 {
1529 if (__builtin_constant_p(__B))
1530 rshift = (__v8hu) vec_splat_s16(__B);
1531 else
1532 rshift = vec_splats ((unsigned short) __B);
1533 }
1534 result = vec_sra ((__v8hi) __A, rshift);
1535
1536 return (__m128i) result;
1537}
1538
1539extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1540_mm_srai_epi32 (__m128i __A, int __B)
1541{
1542 __v4su rshift = { 31, 31, 31, 31 };
1543 __v4si result;
1544
1545 if (__B < 32)
1546 {
1547 if (__builtin_constant_p(__B))
1548 {
1549 if (__B < 16)
1550 rshift = (__v4su) vec_splat_s32(__B);
1551 else
1552 rshift = (__v4su) vec_splats((unsigned int)__B);
1553 }
1554 else
1555 rshift = vec_splats ((unsigned int) __B);
1556 }
1557 result = vec_sra ((__v4si) __A, rshift);
1558
1559 return (__m128i) result;
1560}
1561
1562extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1563_mm_bslli_si128 (__m128i __A, const int __N)
1564{
1565 __v16qu result;
1566 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1567
1568 if (__N < 16)
1569 result = vec_sld ((__v16qu) __A, zeros, __N);
1570 else
1571 result = zeros;
1572
1573 return (__m128i) result;
1574}
1575
1576extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1577_mm_bsrli_si128 (__m128i __A, const int __N)
1578{
1579 __v16qu result;
1580 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1581
1582 if (__N < 16)
1583#ifdef __LITTLE_ENDIAN__
1584 if (__builtin_constant_p(__N))
1585 /* Would like to use Vector Shift Left Double by Octet
1586 Immediate here to use the immediate form and avoid
1587 load of __N * 8 value into a separate VR. */
1588 result = vec_sld (zeros, (__v16qu) __A, (16 - __N));
1589 else
1590#endif
1591 {
1592 __v16qu shift = vec_splats((unsigned char)(__N*8));
1593#ifdef __LITTLE_ENDIAN__
1594 result = vec_sro ((__v16qu)__A, shift);
1595#else
1596 result = vec_slo ((__v16qu)__A, shift);
1597#endif
1598 }
1599 else
1600 result = zeros;
1601
1602 return (__m128i) result;
1603}
1604
1605extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1606_mm_srli_si128 (__m128i __A, const int __N)
1607{
1608 return _mm_bsrli_si128 (__A, __N);
1609}
1610
1611extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1612_mm_slli_si128 (__m128i __A, const int _imm5)
1613{
1614 __v16qu result;
1615 const __v16qu zeros = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
1616
1617 if (_imm5 < 16)
1618#ifdef __LITTLE_ENDIAN__
1619 result = vec_sld ((__v16qu) __A, zeros, _imm5);
1620#else
1621 result = vec_sld (zeros, (__v16qu) __A, (16 - _imm5));
1622#endif
1623 else
1624 result = zeros;
1625
1626 return (__m128i) result;
1627}
1628
extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_epi16 (__m128i __A, int __B)
1632{
1633 __v8hu rshift;
1634 __v8hi result = { 0, 0, 0, 0, 0, 0, 0, 0 };
1635
1636 if (__B < 16)
1637 {
1638 if (__builtin_constant_p(__B))
1639 rshift = (__v8hu) vec_splat_s16(__B);
1640 else
1641 rshift = vec_splats ((unsigned short) __B);
1642
1643 result = vec_sr ((__v8hi) __A, rshift);
1644 }
1645
1646 return (__m128i) result;
1647}
1648
1649extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1650_mm_srli_epi32 (__m128i __A, int __B)
1651{
1652 __v4su rshift;
1653 __v4si result = { 0, 0, 0, 0 };
1654
1655 if (__B < 32)
1656 {
1657 if (__builtin_constant_p(__B))
1658 {
1659 if (__B < 16)
1660 rshift = (__v4su) vec_splat_s32(__B);
1661 else
1662 rshift = (__v4su) vec_splats((unsigned int)__B);
1663 }
1664 else
1665 rshift = vec_splats ((unsigned int) __B);
1666
1667 result = vec_sr ((__v4si) __A, rshift);
1668 }
1669
1670 return (__m128i) result;
1671}
1672
1673#ifdef _ARCH_PWR8
1674extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1675_mm_srli_epi64 (__m128i __A, int __B)
1676{
1677 __v2du rshift;
1678 __v2di result = { 0, 0 };
1679
1680 if (__B < 64)
1681 {
1682 if (__builtin_constant_p(__B))
1683 {
1684 if (__B < 16)
1685 rshift = (__v2du) vec_splat_s32(__B);
1686 else
1687 rshift = (__v2du) vec_splats((unsigned long long)__B);
1688 }
1689 else
1690 rshift = (__v2du) vec_splats ((unsigned int) __B);
1691
1692 result = vec_sr ((__v2di) __A, rshift);
1693 }
1694
1695 return (__m128i) result;
1696}
1697#endif
1698
1699extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1700_mm_sll_epi16 (__m128i __A, __m128i __B)
1701{
1702 __v8hu lshift;
1703 __vector __bool short shmask;
1704 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1705 __v8hu result;
1706
1707#ifdef __LITTLE_ENDIAN__
1708 lshift = vec_splat ((__v8hu) __B, 0);
1709#else
1710 lshift = vec_splat ((__v8hu) __B, 3);
1711#endif
1712 shmask = vec_cmple (lshift, shmax);
1713 result = vec_sl ((__v8hu) __A, lshift);
1714 result = vec_sel ((__v8hu) shmask, result, shmask);
1715
1716 return (__m128i) result;
1717}
1718
1719extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1720_mm_sll_epi32 (__m128i __A, __m128i __B)
1721{
1722 __v4su lshift;
1723 __vector __bool int shmask;
1724 const __v4su shmax = { 32, 32, 32, 32 };
1725 __v4su result;
1726#ifdef __LITTLE_ENDIAN__
1727 lshift = vec_splat ((__v4su) __B, 0);
1728#else
1729 lshift = vec_splat ((__v4su) __B, 1);
1730#endif
1731 shmask = vec_cmplt (lshift, shmax);
1732 result = vec_sl ((__v4su) __A, lshift);
1733 result = vec_sel ((__v4su) shmask, result, shmask);
1734
1735 return (__m128i) result;
1736}
1737
1738#ifdef _ARCH_PWR8
1739extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1740_mm_sll_epi64 (__m128i __A, __m128i __B)
1741{
1742 __v2du lshift;
1743 __vector __bool long long shmask;
1744 const __v2du shmax = { 64, 64 };
1745 __v2du result;
1746
1747 lshift = vec_splat ((__v2du) __B, 0);
1748 shmask = vec_cmplt (lshift, shmax);
1749 result = vec_sl ((__v2du) __A, lshift);
1750 result = vec_sel ((__v2du) shmask, result, shmask);
1751
1752 return (__m128i) result;
1753}
1754#endif
1755
1756extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1757_mm_sra_epi16 (__m128i __A, __m128i __B)
1758{
1759 const __v8hu rshmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1760 __v8hu rshift;
1761 __v8hi result;
1762
1763#ifdef __LITTLE_ENDIAN__
1764 rshift = vec_splat ((__v8hu)__B, 0);
1765#else
1766 rshift = vec_splat ((__v8hu)__B, 3);
1767#endif
1768 rshift = vec_min (rshift, rshmax);
1769 result = vec_sra ((__v8hi) __A, rshift);
1770
1771 return (__m128i) result;
1772}
1773
1774extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1775_mm_sra_epi32 (__m128i __A, __m128i __B)
1776{
1777 const __v4su rshmax = { 31, 31, 31, 31 };
1778 __v4su rshift;
1779 __v4si result;
1780
1781#ifdef __LITTLE_ENDIAN__
1782 rshift = vec_splat ((__v4su)__B, 0);
1783#else
1784 rshift = vec_splat ((__v4su)__B, 1);
1785#endif
1786 rshift = vec_min (rshift, rshmax);
1787 result = vec_sra ((__v4si) __A, rshift);
1788
1789 return (__m128i) result;
1790}
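
/* Illustrative note, not part of the original header: unlike the
   logical shifts, the arithmetic shifts clamp an oversized count to 15
   or 31 (the vec_min above), so every lane collapses to its sign fill
   rather than to zero.  A minimal sketch:

     __m128i x   = _mm_set1_epi32 (-16);
     __m128i cnt = _mm_cvtsi32_si128 (40);       count greater than 31
     _mm_sra_epi32 (x, cnt)    every lane becomes -1 (sign fill)
     _mm_srl_epi32 (x, cnt)    every lane becomes  0
*/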
1791
1792extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1793_mm_srl_epi16 (__m128i __A, __m128i __B)
1794{
1795 __v8hu rshift;
1796 __vector __bool short shmask;
1797 const __v8hu shmax = { 15, 15, 15, 15, 15, 15, 15, 15 };
1798 __v8hu result;
1799
1800#ifdef __LITTLE_ENDIAN__
1801 rshift = vec_splat ((__v8hu) __B, 0);
1802#else
1803 rshift = vec_splat ((__v8hu) __B, 3);
1804#endif
1805 shmask = vec_cmple (rshift, shmax);
1806 result = vec_sr ((__v8hu) __A, rshift);
1807 result = vec_sel ((__v8hu) shmask, result, shmask);
1808
1809 return (__m128i) result;
1810}
1811
1812extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1813_mm_srl_epi32 (__m128i __A, __m128i __B)
1814{
1815 __v4su rshift;
1816 __vector __bool int shmask;
1817 const __v4su shmax = { 32, 32, 32, 32 };
1818 __v4su result;
1819
1820#ifdef __LITTLE_ENDIAN__
1821 rshift = vec_splat ((__v4su) __B, 0);
1822#else
1823 rshift = vec_splat ((__v4su) __B, 1);
1824#endif
1825 shmask = vec_cmplt (rshift, shmax);
1826 result = vec_sr ((__v4su) __A, rshift);
1827 result = vec_sel ((__v4su) shmask, result, shmask);
1828
1829 return (__m128i) result;
1830}
1831
1832#ifdef _ARCH_PWR8
1833extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1834_mm_srl_epi64 (__m128i __A, __m128i __B)
1835{
1836 __v2du rshift;
1837 __vector __bool long long shmask;
1838 const __v2du shmax = { 64, 64 };
1839 __v2du result;
1840
1841 rshift = vec_splat ((__v2du) __B, 0);
1842 shmask = vec_cmplt (rshift, shmax);
1843 result = vec_sr ((__v2du) __A, rshift);
1844 result = vec_sel ((__v2du) shmask, result, shmask);
1845
1846 return (__m128i) result;
1847}
1848#endif
1849
1850extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1851_mm_and_pd (__m128d __A, __m128d __B)
1852{
1853 return (vec_and ((__v2df) __A, (__v2df) __B));
1854}
1855
1856extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1857_mm_andnot_pd (__m128d __A, __m128d __B)
1858{
1859 return (vec_andc ((__v2df) __B, (__v2df) __A));
1860}
1861
1862extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1863_mm_or_pd (__m128d __A, __m128d __B)
1864{
1865 return (vec_or ((__v2df) __A, (__v2df) __B));
1866}
1867
1868extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1869_mm_xor_pd (__m128d __A, __m128d __B)
1870{
1871 return (vec_xor ((__v2df) __A, (__v2df) __B));
1872}
1873
1874extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1875_mm_and_si128 (__m128i __A, __m128i __B)
1876{
1877 return (__m128i)vec_and ((__v2di) __A, (__v2di) __B);
1878}
1879
1880extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1881_mm_andnot_si128 (__m128i __A, __m128i __B)
1882{
1883 return (__m128i)vec_andc ((__v2di) __B, (__v2di) __A);
1884}
1885
1886extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1887_mm_or_si128 (__m128i __A, __m128i __B)
1888{
1889 return (__m128i)vec_or ((__v2di) __A, (__v2di) __B);
1890}
1891
1892extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1893_mm_xor_si128 (__m128i __A, __m128i __B)
1894{
1895 return (__m128i)vec_xor ((__v2di) __A, (__v2di) __B);
1896}
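
/* Illustrative note, not part of the original header: like their x86
   counterparts, _mm_andnot_si128 and _mm_andnot_pd complement their
   FIRST operand, i.e. the result is (~__A) & __B, which is why the
   vec_andc arguments above appear swapped.  For example:

     __m128i m = _mm_set1_epi32 (0x000000FF);
     __m128i v = _mm_set1_epi32 (0x12345678);
     _mm_andnot_si128 (m, v)    each lane becomes 0x12345600
*/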
1897
1898extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1899_mm_cmpeq_epi8 (__m128i __A, __m128i __B)
1900{
1901 return (__m128i) vec_cmpeq ((__v16qi) __A, (__v16qi)__B);
1902}
1903
1904extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1905_mm_cmpeq_epi16 (__m128i __A, __m128i __B)
1906{
1907 return (__m128i) vec_cmpeq ((__v8hi) __A, (__v8hi)__B);
1908}
1909
1910extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1911_mm_cmpeq_epi32 (__m128i __A, __m128i __B)
1912{
1913 return (__m128i) vec_cmpeq ((__v4si) __A, (__v4si)__B);
1914}
1915
1916extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1917_mm_cmplt_epi8 (__m128i __A, __m128i __B)
1918{
1919 return (__m128i) vec_cmplt ((__v16qi) __A, (__v16qi)__B);
1920}
1921
1922extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1923_mm_cmplt_epi16 (__m128i __A, __m128i __B)
1924{
1925 return (__m128i) vec_cmplt ((__v8hi) __A, (__v8hi)__B);
1926}
1927
1928extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1929_mm_cmplt_epi32 (__m128i __A, __m128i __B)
1930{
1931 return (__m128i) vec_cmplt ((__v4si) __A, (__v4si)__B);
1932}
1933
1934extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1935_mm_cmpgt_epi8 (__m128i __A, __m128i __B)
1936{
1937 return (__m128i) vec_cmpgt ((__v16qi) __A, (__v16qi)__B);
1938}
1939
1940extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1941_mm_cmpgt_epi16 (__m128i __A, __m128i __B)
1942{
1943 return (__m128i) vec_cmpgt ((__v8hi) __A, (__v8hi)__B);
1944}
1945
1946extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1947_mm_cmpgt_epi32 (__m128i __A, __m128i __B)
1948{
1949 return (__m128i) vec_cmpgt ((__v4si) __A, (__v4si)__B);
1950}
1951
1952extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1953_mm_extract_epi16 (__m128i const __A, int const __N)
1954{
1955 return (unsigned short) ((__v8hi)__A)[__N & 7];
1956}
1957
1958extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1959_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
1960{
1961 __v8hi result = (__v8hi)__A;
1962
1963 result [(__N & 7)] = __D;
1964
1965 return (__m128i) result;
1966}
1967
1968extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1969_mm_max_epi16 (__m128i __A, __m128i __B)
1970{
1971 return (__m128i) vec_max ((__v8hi)__A, (__v8hi)__B);
1972}
1973
1974extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1975_mm_max_epu8 (__m128i __A, __m128i __B)
1976{
1977 return (__m128i) vec_max ((__v16qu) __A, (__v16qu)__B);
1978}
1979
1980extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1981_mm_min_epi16 (__m128i __A, __m128i __B)
1982{
1983 return (__m128i) vec_min ((__v8hi) __A, (__v8hi)__B);
1984}
1985
1986extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1987_mm_min_epu8 (__m128i __A, __m128i __B)
1988{
1989 return (__m128i) vec_min ((__v16qu) __A, (__v16qu)__B);
1990}
1991
1992
1993#ifdef _ARCH_PWR8
1994/* Intrinsic functions that require PowerISA 2.07 minimum. */
1995
1996/* Creates a 16-bit mask from the most significant bit of each of the 16 bytes. */
1997extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1998_mm_movemask_epi8 (__m128i __A)
1999{
2000 __vector unsigned long long result;
2001 static const __vector unsigned char perm_mask =
2002 {
2003 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40,
2004 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00
2005 };
2006
2007 result = ((__vector unsigned long long)
2008 vec_vbpermq ((__vector unsigned char) __A,
2009 (__vector unsigned char) perm_mask));
2010
2011#ifdef __LITTLE_ENDIAN__
2012 return result[1];
2013#else
2014 return result[0];
2015#endif
2016}
2017#endif /* _ARCH_PWR8 */
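
/* Illustrative note, not part of the original header: every selector in
   perm_mask above is a multiple of 8, so vec_vbpermq gathers the most
   significant bit of each byte of __A; the endian-dependent element
   read above then returns them packed so that bit i of the result is
   the sign bit of byte i, matching x86 PMOVMSKB.  Typical use is
   testing a byte-compare result:

     __m128i eq = _mm_cmpeq_epi8 (a, b);         a and b are placeholders
     if (_mm_movemask_epi8 (eq) == 0xFFFF)
       ;                                         all sixteen bytes matched
*/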
2018
2019extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2020_mm_mulhi_epu16 (__m128i __A, __m128i __B)
2021{
2022 __v4su w0, w1;
2023 __v16qu xform1 = {
2024#ifdef __LITTLE_ENDIAN__
2025 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
2026 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
2027#else
2028 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
2029 0x08, 0x09, 0x18, 0x19, 0x0C, 0x0D, 0x1C, 0x1D
2030#endif
2031 };
2032
2033 w0 = vec_vmuleuh ((__v8hu)__A, (__v8hu)__B);
2034 w1 = vec_vmulouh ((__v8hu)__A, (__v8hu)__B);
2035 return (__m128i) vec_perm (w0, w1, xform1);
2036}
2037
2038extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2039_mm_shufflehi_epi16 (__m128i __A, const int __mask)
2040{
2041 unsigned long element_selector_98 = __mask & 0x03;
2042 unsigned long element_selector_BA = (__mask >> 2) & 0x03;
2043 unsigned long element_selector_DC = (__mask >> 4) & 0x03;
2044 unsigned long element_selector_FE = (__mask >> 6) & 0x03;
2045 static const unsigned short permute_selectors[4] =
2046 {
2047#ifdef __LITTLE_ENDIAN__
2048 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
2049#else
2050 0x0809, 0x0A0B, 0x0C0D, 0x0E0F
2051#endif
2052 };
2053 __v2du pmask =
2054#ifdef __LITTLE_ENDIAN__
2055 { 0x1716151413121110UL, 0UL};
2056#else
2057 { 0x1011121314151617UL, 0UL};
2058#endif
2059 __m64_union t;
2060 __v2du a, r;
2061
2062 t.as_short[0] = permute_selectors[element_selector_98];
2063 t.as_short[1] = permute_selectors[element_selector_BA];
2064 t.as_short[2] = permute_selectors[element_selector_DC];
2065 t.as_short[3] = permute_selectors[element_selector_FE];
2066 pmask[1] = t.as_m64;
2067 a = (__v2du)__A;
2068 r = vec_perm (a, a, (__vector unsigned char)pmask);
2069 return (__m128i) r;
2070}
2071
2072extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2073_mm_shufflelo_epi16 (__m128i __A, const int __mask)
2074{
2075 unsigned long element_selector_10 = __mask & 0x03;
2076 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2077 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2078 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2079 static const unsigned short permute_selectors[4] =
2080 {
2081#ifdef __LITTLE_ENDIAN__
2082 0x0100, 0x0302, 0x0504, 0x0706
2083#else
2084 0x0001, 0x0203, 0x0405, 0x0607
2085#endif
2086 };
2087 __v2du pmask =
2088#ifdef __LITTLE_ENDIAN__
2089 { 0UL, 0x1f1e1d1c1b1a1918UL};
2090#else
2091 { 0UL, 0x18191a1b1c1d1e1fUL};
2092#endif
2093 __m64_union t;
2094 __v2du a, r;
2095 t.as_short[0] = permute_selectors[element_selector_10];
2096 t.as_short[1] = permute_selectors[element_selector_32];
2097 t.as_short[2] = permute_selectors[element_selector_54];
2098 t.as_short[3] = permute_selectors[element_selector_76];
2099 pmask[0] = t.as_m64;
2100 a = (__v2du)__A;
2101 r = vec_perm (a, a, (__vector unsigned char)pmask);
2102 return (__m128i) r;
2103}
2104
2105extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2106_mm_shuffle_epi32 (__m128i __A, const int __mask)
2107{
2108 unsigned long element_selector_10 = __mask & 0x03;
2109 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
2110 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
2111 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
2112 static const unsigned int permute_selectors[4] =
2113 {
2114#ifdef __LITTLE_ENDIAN__
2115 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
2116#else
2117 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
2118#endif
2119 };
2120 __v4su t;
2121
2122 t[0] = permute_selectors[element_selector_10];
2123 t[1] = permute_selectors[element_selector_32];
2124 t[2] = permute_selectors[element_selector_54] + 0x10101010;
2125 t[3] = permute_selectors[element_selector_76] + 0x10101010;
2126 return (__m128i)vec_perm ((__v4si) __A, (__v4si)__A, (__vector unsigned char)t);
2127}
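
/* Illustrative note, not part of the original header: each 2-bit field
   of __mask picks a source word, and the table above turns that pick
   into the byte indices vec_perm expects; the 0x10101010 added to the
   upper two selectors points them at the second vec_perm operand,
   which is again __A, so all four words still come from the same
   source.  For example, mask 0x1B (_MM_SHUFFLE (0, 1, 2, 3)) reverses
   the four 32-bit elements:

     __m128i v = _mm_set_epi32 (3, 2, 1, 0);
     _mm_shuffle_epi32 (v, 0x1B)    same result as _mm_set_epi32 (0, 1, 2, 3)

   _mm_shufflelo_epi16 and _mm_shufflehi_epi16 above build their permute
   control the same way, but only for the low or high four halfwords.  */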
2128
2129extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2130_mm_maskmoveu_si128 (__m128i __A, __m128i __B, char *__C)
2131{
2132 __v2du hibit = { 0x7f7f7f7f7f7f7f7fUL, 0x7f7f7f7f7f7f7f7fUL};
2133 __v16qu mask, tmp;
2134 __m128i_u *p = (__m128i_u*)__C;
2135
2136 tmp = (__v16qu)_mm_loadu_si128(p);
2137 mask = (__v16qu)vec_cmpgt ((__v16qu)__B, (__v16qu)hibit);
2138 tmp = vec_sel (tmp, (__v16qu)__A, mask);
2139 _mm_storeu_si128 (p, (__m128i)tmp);
2140}
2141
2142extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2143_mm_avg_epu8 (__m128i __A, __m128i __B)
2144{
2145 return (__m128i) vec_avg ((__v16qu)__A, (__v16qu)__B);
2146}
2147
2148extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2149_mm_avg_epu16 (__m128i __A, __m128i __B)
2150{
2151 return (__m128i) vec_avg ((__v8hu)__A, (__v8hu)__B);
2152}
2153
2154
2155extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2156_mm_sad_epu8 (__m128i __A, __m128i __B)
2157{
2158 __v16qu a, b;
2159 __v16qu vmin, vmax, vabsdiff;
2160 __v4si vsum;
2161 const __v4su zero = { 0, 0, 0, 0 };
2162 __v4si result;
2163
2164 a = (__v16qu) __A;
2165 b = (__v16qu) __B;
2166 vmin = vec_min (a, b);
2167 vmax = vec_max (a, b);
2168 vabsdiff = vec_sub (vmax, vmin);
2169 /* Sum four groups of bytes into integers. */
2170 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
2171 /* Sum across four integers with two integer results. */
2172 result = vec_sum2s (vsum, (__vector signed int) zero);
2173 /* Rotate the sums into the correct position. */
2174#ifdef __LITTLE_ENDIAN__
2175 result = vec_sld (result, result, 4);
2176#else
2177 result = vec_sld (result, result, 6);
2178#endif
2180 return (__m128i) result;
2181}
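
/* Illustrative note, not part of the original header: as with x86
   PSADBW, the absolute differences of the low eight byte pairs are
   summed into bits 15:0 of the result and those of the high eight byte
   pairs into bits 79:64, with the remaining bits zero; the final
   vec_sld rotation above moves the two sums into those positions.  A
   minimal sketch:

     __m128i a = _mm_set1_epi8 (10);
     __m128i b = _mm_set1_epi8 (3);
     _mm_sad_epu8 (a, b)    both 64-bit lanes hold 8 * 7 == 56
*/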
2182
2183extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2184_mm_stream_si32 (int *__A, int __B)
2185{
2186 /* Use the data cache block touch for store transient. */
2187 __asm__ (
2188 "dcbtstt 0,%0"
2189 :
2190 : "b" (__A)
2191 : "memory"
2192 );
2193 *__A = __B;
2194}
2195
2196extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2197_mm_stream_si64 (long long int *__A, long long int __B)
2198{
2199 /* Use the data cache block touch for store transient. */
2200 __asm__ (
2201 "dcbtstt 0,%0"
2202 :
2203 : "b" (__A)
2204 : "memory"
2205 );
2206 *__A = __B;
2207}
2208
2209extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2210_mm_stream_si128 (__m128i *__A, __m128i __B)
2211{
2212 /* Use the data cache block touch for store transient. */
2213 __asm__ (
2214 "dcbtstt 0,%0"
2215 :
2216 : "b" (__A)
2217 : "memory"
2218 );
2219 *__A = __B;
2220}
2221
2222extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2223_mm_stream_pd (double *__A, __m128d __B)
2224{
2225 /* Use the data cache block touch for store transient. */
2226 __asm__ (
2227 "dcbtstt 0,%0"
2228 :
2229 : "b" (__A)
2230 : "memory"
2231 );
2232 *(__m128d*)__A = __B;
2233}
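
/* Illustrative note, not part of the original header: dcbtstt is only a
   cache-management hint marking the target block as transient for
   stores; the assignment that follows is still an ordinary cacheable
   store, so these intrinsics behave like plain stores apart from the
   hint.  A hypothetical streaming copy might look like:

     void copy_stream (__m128i *dst, const __m128i *src, long n)
     {
       for (long i = 0; i < n; i++)
         _mm_stream_si128 (dst + i, src[i]);
     }
*/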
2234
2235extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2236_mm_clflush (void const *__A)
2237{
2238 /* Use the data cache block flush. */
2239 __asm__ (
2240 "dcbf 0,%0"
2241 :
2242 : "b" (__A)
2243 : "memory"
2244 );
2245}
2246
2247extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2248_mm_lfence (void)
2249{
2250 /* Use a lightweight sync (lwsync) for load-to-load ordering. */
2251 __atomic_thread_fence (__ATOMIC_RELEASE);
2252}
2253
2254extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2255_mm_mfence (void)
2256{
2257 /* Use a heavyweight sync (hwsync) for any-to-any ordering. */
2258 __atomic_thread_fence (__ATOMIC_SEQ_CST);
2259}
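
/* Illustrative note, not part of the original header: on POWER these
   fences are compiler atomic fences (a release fence, typically lwsync,
   for _mm_lfence; a sequentially consistent fence, typically hwsync,
   for _mm_mfence) rather than dedicated x86 instructions.  A sketch of
   the usual publish pattern, with placeholder names (production code
   would use C11 atomics for the flag):

     shared_data = compute ();    plain stores to the payload
     _mm_mfence ();               payload ordered before the flag
     shared_flag = 1;

   On the consumer side, an _mm_lfence between loading the flag and
   loading the payload provides the load-to-load ordering noted above. */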
2260
2261extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2262_mm_cvtsi32_si128 (int __A)
2263{
2264 return _mm_set_epi32 (0, 0, 0, __A);
2265}
2266
2267extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2268_mm_cvtsi64_si128 (long long __A)
2269{
2270 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2271}
2272
2273/* Microsoft intrinsic. */
2274extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2275_mm_cvtsi64x_si128 (long long __A)
2276{
2277 return __extension__ (__m128i)(__v2di){ __A, 0LL };
2278}
2279
2280/* Casts between various SP, DP, INT vector types.  These perform no
2281 conversion of values; they simply reinterpret the bits as another type. */
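/* For example (illustrative only, not part of the original header), an
   integer compare mask can be reinterpreted and applied to float data
   without touching the bits; x, y and value are placeholders, and
   _mm_and_ps comes from xmmintrin.h:

     __m128i imask  = _mm_cmpeq_epi32 (x, y);
     __m128  fmask  = _mm_castsi128_ps (imask);
     __m128  masked = _mm_and_ps (fmask, value);
*/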
2282extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2283_mm_castpd_ps(__m128d __A)
2284{
2285 return (__m128) __A;
2286}
2287
2288extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2289_mm_castpd_si128(__m128d __A)
2290{
2291 return (__m128i) __A;
2292}
2293
2294extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2295_mm_castps_pd(__m128 __A)
2296{
2297 return (__m128d) __A;
2298}
2299
2300extern __inline __m128i __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2301_mm_castps_si128(__m128 __A)
2302{
2303 return (__m128i) __A;
2304}
2305
2306extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2307_mm_castsi128_ps(__m128i __A)
2308{
2309 return (__m128) __A;
2310}
2311
2312extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
2313_mm_castsi128_pd(__m128i __A)
2314{
2315 return (__m128d) __A;
2316}
2317
2318#endif /* EMMINTRIN_H_ */