1/*===---- xmmintrin.h - Implementation of SSE intrinsics on PowerPC --------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===-----------------------------------------------------------------------===
8 */
9
10/* Implemented from the specification included in the Intel C++ Compiler
11 User Guide and Reference, version 9.0. */
12
13#ifndef NO_WARN_X86_INTRINSICS
14/* This header file is intended to help port code that uses Intel
15 intrinsics explicitly from x86_64 to powerpc64/powerpc64le.
16
17 Since the X86 SSE intrinsics mainly handle the __m128 type, the PowerPC
18 VMX/VSX ISA is a good match for vector float SIMD operations.
19 However, scalar float operations in vector (XMM) registers require
20 the POWER8 VSX ISA (2.07) level. There are differences in the data
21 format and placement of float scalars in the vector register, which
22 require extra steps to match SSE scalar float semantics on POWER.
23
24 Note that there are significant differences between x86_64's
25 MXCSR and the PowerISA FPSCR/VSCR registers. It is recommended to use
26 the portable <fenv.h> interfaces instead of accessing the MXCSR directly.
27
28 Most SSE scalar float intrinsic operations can be performed more
29 efficiently as C language float scalar operations or optimized to
30 use vector SIMD operations. We recommend this for new applications. */
31#error "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
32#endif
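
/* A minimal porting sketch (the compiler invocation and file name below are
   illustrative, not part of this header): existing SSE sources can usually be
   rebuilt unchanged with something like
       clang -DNO_WARN_X86_INTRINSICS -mcpu=power8 -O2 -c sse_kernel.c
   and code that manipulated the MXCSR directly can use the portable <fenv.h>
   interfaces instead, e.g. fesetround (FE_TOWARDZERO).  */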
33
34#ifndef _XMMINTRIN_H_INCLUDED
35#define _XMMINTRIN_H_INCLUDED
36
37#if defined(__linux__) && defined(__ppc64__)
38
39/* Define four value permute mask */
40#define _MM_SHUFFLE(w,x,y,z) (((w) << 6) | ((x) << 4) | ((y) << 2) | (z))
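/* For example, _MM_SHUFFLE (3, 2, 1, 0) expands to
   (3 << 6) | (2 << 4) | (1 << 2) | 0 == 0xE4, the selector that leaves
   _mm_shuffle_pi16 (and _mm_shuffle_ps of a vector with itself) unchanged. */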
41
42#include <altivec.h>
43
44/* Avoid collisions between altivec.h and strict adherence to C++ and
45 C11 standards. This should eventually be done inside altivec.h itself,
46 but only after testing a full distro build. */
47#if defined(__STRICT_ANSI__) && (defined(__cplusplus) || \
48 (defined(__STDC_VERSION__) && \
49 __STDC_VERSION__ >= 201112L))
50#undef vector
51#undef pixel
52#undef bool
53#endif
54
55/* We need type definitions from the MMX header file. */
56#include <mmintrin.h>
57
58/* Get _mm_malloc () and _mm_free (). */
59#if __STDC_HOSTED__
60#include <mm_malloc.h>
61#endif
62
63/* The Intel API is flexible enough that we must allow aliasing with other
64 vector types, and their scalar components. */
65typedef vector float __m128 __attribute__((__may_alias__));
66
67/* Unaligned version of the same type. */
68typedef vector float __m128_u __attribute__((__may_alias__, __aligned__(1)));
69
70/* Internal data types for implementing the intrinsics. */
71typedef vector float __v4sf;
72
73/* Create an undefined vector. */
74extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
75_mm_undefined_ps (void)
76{
77 __m128 __Y = __Y;
78 return __Y;
79}
80
81/* Create a vector of zeros. */
82extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
83_mm_setzero_ps (void)
84{
85 return __extension__ (__m128){ 0.0f, 0.0f, 0.0f, 0.0f };
86}
87
88/* Load four SPFP values from P. The address must be 16-byte aligned. */
89extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
90_mm_load_ps (float const *__P)
91{
92 return ((__m128)vec_ld(0, (__v4sf*)__P));
93}
94
95/* Load four SPFP values from P. The address need not be 16-byte aligned. */
96extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
97_mm_loadu_ps (float const *__P)
98{
99 return (vec_vsx_ld(0, __P));
100}
101
102/* Load four SPFP values in reverse order. The address must be aligned. */
103extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
104_mm_loadr_ps (float const *__P)
105{
106 __v4sf __tmp;
107 __m128 result;
108 static const __vector unsigned char permute_vector =
109 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
110 0x17, 0x10, 0x11, 0x12, 0x13 };
111
112 __tmp = vec_ld (0, (__v4sf *) __P);
113 result = (__m128) vec_perm (__tmp, __tmp, permute_vector);
114 return result;
115}
116
117/* Create a vector with all four elements equal to F. */
118extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119_mm_set1_ps (float __F)
120{
121 return __extension__ (__m128)(__v4sf){ __F, __F, __F, __F };
122}
123
124extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
125_mm_set_ps1 (float __F)
126{
127 return _mm_set1_ps (__F);
128}
129
130/* Create the vector [Z Y X W]. */
131extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
132_mm_set_ps (const float __Z, const float __Y, const float __X, const float __W)
133{
134 return __extension__ (__m128)(__v4sf){ __W, __X, __Y, __Z };
135}
136
137/* Create the vector [W X Y Z]. */
138extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
139_mm_setr_ps (float __Z, float __Y, float __X, float __W)
140{
141 return __extension__ (__m128)(__v4sf){ __Z, __Y, __X, __W };
142}
143
144/* Store four SPFP values. The address must be 16-byte aligned. */
145extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
146_mm_store_ps (float *__P, __m128 __A)
147{
148 vec_st((__v4sf)__A, 0, (__v4sf*)__P);
149}
150
151/* Store four SPFP values. The address need not be 16-byte aligned. */
152extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
153_mm_storeu_ps (float *__P, __m128 __A)
154{
155 *(__m128_u *)__P = __A;
156}
157
158/* Store four SPFP values in reverse order. The address must be aligned. */
159extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
160_mm_storer_ps (float *__P, __m128 __A)
161{
162 __v4sf __tmp;
163 static const __vector unsigned char permute_vector =
164 { 0x1C, 0x1D, 0x1E, 0x1F, 0x18, 0x19, 0x1A, 0x1B, 0x14, 0x15, 0x16,
165 0x17, 0x10, 0x11, 0x12, 0x13 };
166
167 __tmp = (__m128) vec_perm (__A, __A, permute_vector);
168
169 _mm_store_ps (__P, __tmp);
170}
171
172/* Store the lower SPFP value across four words. */
173extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
174_mm_store1_ps (float *__P, __m128 __A)
175{
176 __v4sf __va = vec_splat((__v4sf)__A, 0);
177 _mm_store_ps (__P, __va);
178}
179
180extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
181_mm_store_ps1 (float *__P, __m128 __A)
182{
183 _mm_store1_ps (__P, __A);
184}
185
186/* Create a vector with element 0 as F and the rest zero. */
187extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
188_mm_set_ss (float __F)
189{
190 return __extension__ (__m128)(__v4sf){ __F, 0.0f, 0.0f, 0.0f };
191}
192
193/* Sets the low SPFP value of A from the low value of B. */
194extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
195_mm_move_ss (__m128 __A, __m128 __B)
196{
197 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
198
199 return (vec_sel ((__v4sf)__A, (__v4sf)__B, mask));
200}
201
202/* Create a vector with element 0 as *P and the rest zero. */
203extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
204_mm_load_ss (float const *__P)
205{
206 return _mm_set_ss (*__P);
207}
208
209/* Stores the lower SPFP value. */
210extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
211_mm_store_ss (float *__P, __m128 __A)
212{
213 *__P = ((__v4sf)__A)[0];
214}
215
216/* Perform the respective operation on the lower SPFP (single-precision
217 floating-point) values of A and B; the upper three SPFP values are
218 passed through from A. */
219
220extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
221_mm_add_ss (__m128 __A, __m128 __B)
222{
223#ifdef _ARCH_PWR7
224 __m128 a, b, c;
225 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
226 /* PowerISA VSX does not allow partial (for just the lower float)
227 results. So to ensure we don't generate spurious exceptions
228 (from the upper float values) we splat the lower float
229 before we do the operation. */
230 a = vec_splat (__A, 0);
231 b = vec_splat (__B, 0);
232 c = a + b;
233 /* Then we merge the lower float result with the original upper
234 float elements from __A. */
235 return (vec_sel (__A, c, mask));
236#else
237 __A[0] = __A[0] + __B[0];
238 return (__A);
239#endif
240}
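/* A worked example of the splat-and-merge pattern above (values are
   illustrative): with __A = {1, 2, 3, 4} and __B = {10, 20, 30, 40}, the
   splatted add computes {11, 11, 11, 11} without ever combining the upper
   elements of the inputs, and vec_sel with mask {~0, 0, 0, 0} then yields
   {11, 2, 3, 4}, matching the x86 ADDSS result.  */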
241
242extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
243_mm_sub_ss (__m128 __A, __m128 __B)
244{
245#ifdef _ARCH_PWR7
246 __m128 a, b, c;
247 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
248 /* PowerISA VSX does not allow partial (for just the lower float)
249 results. So to ensure we don't generate spurious exceptions
250 (from the upper float values) we splat the lower float
251 before we do the operation. */
252 a = vec_splat (__A, 0);
253 b = vec_splat (__B, 0);
254 c = a - b;
255 /* Then we merge the lower float result with the original upper
256 float elements from __A. */
257 return (vec_sel (__A, c, mask));
258#else
259 __A[0] = __A[0] - __B[0];
260 return (__A);
261#endif
262}
263
264extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265_mm_mul_ss (__m128 __A, __m128 __B)
266{
267#ifdef _ARCH_PWR7
268 __m128 a, b, c;
269 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
270 /* PowerISA VSX does not allow partial (for just the lower float)
271 results. So to ensure we don't generate spurious exceptions
272 (from the upper float values) we splat the lower float
273 before we do the operation. */
274 a = vec_splat (__A, 0);
275 b = vec_splat (__B, 0);
276 c = a * b;
277 /* Then we merge the lower float result with the original upper
278 float elements from __A. */
279 return (vec_sel (__A, c, mask));
280#else
281 __A[0] = __A[0] * __B[0];
282 return (__A);
283#endif
284}
285
286extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
287_mm_div_ss (__m128 __A, __m128 __B)
288{
289#ifdef _ARCH_PWR7
290 __m128 a, b, c;
291 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
292 /* PowerISA VSX does not allow partial (for just the lower float)
293 results. So to ensure we don't generate spurious exceptions
294 (from the upper float values) we splat the lower float
295 before we do the operation. */
296 a = vec_splat (__A, 0);
297 b = vec_splat (__B, 0);
298 c = a / b;
299 /* Then we merge the lower float result with the original upper
300 float elements from __A. */
301 return (vec_sel (__A, c, mask));
302#else
303 __A[0] = __A[0] / __B[0];
304 return (__A);
305#endif
306}
307
308extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309_mm_sqrt_ss (__m128 __A)
310{
311 __m128 a, c;
312 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
313 /* PowerISA VSX does not allow partial (for just the lower float)
314 * results. So to ensure we don't generate spurious exceptions
315 * (from the upper float values) we splat the lower float
316 * before we do the operation. */
317 a = vec_splat (__A, 0);
318 c = vec_sqrt (a);
319 /* Then we merge the lower float result with the original upper
320 * float elements from __A. */
321 return (vec_sel (__A, c, mask));
322}
323
324/* Perform the respective operation on the four SPFP values in A and B. */
325extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
326_mm_add_ps (__m128 __A, __m128 __B)
327{
328 return (__m128) ((__v4sf)__A + (__v4sf)__B);
329}
330
331extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
332_mm_sub_ps (__m128 __A, __m128 __B)
333{
334 return (__m128) ((__v4sf)__A - (__v4sf)__B);
335}
336
337extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338_mm_mul_ps (__m128 __A, __m128 __B)
339{
340 return (__m128) ((__v4sf)__A * (__v4sf)__B);
341}
342
343extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
344_mm_div_ps (__m128 __A, __m128 __B)
345{
346 return (__m128) ((__v4sf)__A / (__v4sf)__B);
347}
348
349extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
350_mm_sqrt_ps (__m128 __A)
351{
352 return (vec_sqrt ((__v4sf)__A));
353}
354
355extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
356_mm_rcp_ps (__m128 __A)
357{
358 return (vec_re ((__v4sf)__A));
359}
360
361extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
362_mm_rsqrt_ps (__m128 __A)
363{
364 return (vec_rsqrte (__A));
365}
366
367extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
368_mm_rcp_ss (__m128 __A)
369{
370 __m128 a, c;
371 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
372 /* PowerISA VSX does not allow partial (for just the lower float)
373 * results. So to ensure we don't generate spurious exceptions
374 * (from the upper float values) we splat the lower float
375 * before we do the operation. */
376 a = vec_splat (__A, 0);
377 c = _mm_rcp_ps (a);
378 /* Then we merge the lower float result with the original upper
379 * float elements from __A. */
380 return (vec_sel (__A, c, mask));
381}
382
383extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
384_mm_rsqrt_ss (__m128 __A)
385{
386 __m128 a, c;
387 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
388 /* PowerISA VSX does not allow partial (for just the lower float)
389 * results. So to ensure we don't generate spurious exceptions
390 * (from the upper float values) we splat the lower float
391 * before we do the operation. */
392 a = vec_splat (__A, 0);
393 c = vec_rsqrte (a);
394 /* Then we merge the lower float result with the original upper
395 * float elements from __A. */
396 return (vec_sel (__A, c, mask));
397}
398
399extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
400_mm_min_ss (__m128 __A, __m128 __B)
401{
402 __v4sf a, b, c;
403 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
404 /* PowerISA VSX does not allow partial (for just lower float)
405 * results. So to ensure we don't generate spurious exceptions
406 * (from the upper float values) we splat the lower float
407 * before we do the operation. */
408 a = vec_splat ((__v4sf)__A, 0);
409 b = vec_splat ((__v4sf)__B, 0);
410 c = vec_min (a, b);
411 /* Then we merge the lower float result with the original upper
412 * float elements from __A. */
413 return (vec_sel ((__v4sf)__A, c, mask));
414}
415
416extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
417_mm_max_ss (__m128 __A, __m128 __B)
418{
419 __v4sf a, b, c;
420 static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
421 /* PowerISA VSX does not allow partial (for just lower float)
422 * results. So to ensure we don't generate spurious exceptions
423 * (from the upper float values) we splat the lower float
424 * before we do the operation. */
425 a = vec_splat (__A, 0);
426 b = vec_splat (__B, 0);
427 c = vec_max (a, b);
428 /* Then we merge the lower float result with the original upper
429 * float elements from __A. */
430 return (vec_sel ((__v4sf)__A, c, mask));
431}
432
433extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
434_mm_min_ps (__m128 __A, __m128 __B)
435{
436 __vector __bool int m = vec_cmpgt ((__v4sf) __B, (__v4sf) __A);
437 return vec_sel (__B, __A, m);
438}
439
440extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
441_mm_max_ps (__m128 __A, __m128 __B)
442{
443 __vector __bool int m = vec_cmpgt ((__v4sf) __A, (__v4sf) __B);
444 return vec_sel (__B, __A, m);
445}
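/* Note on the two selects above: as with the x86 MINPS/MAXPS instructions,
   when either element of a pair is a NaN the vec_cmpgt predicate is false
   and the element from __B (the second operand) is returned.  */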
446
447/* Perform logical bit-wise operations on 128-bit values. */
448extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
449_mm_and_ps (__m128 __A, __m128 __B)
450{
451 return ((__m128)vec_and ((__v4sf)__A, (__v4sf)__B));
452// return __builtin_ia32_andps (__A, __B);
453}
454
455extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
456_mm_andnot_ps (__m128 __A, __m128 __B)
457{
458 return ((__m128)vec_andc ((__v4sf)__B, (__v4sf)__A));
459}
460
461extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
462_mm_or_ps (__m128 __A, __m128 __B)
463{
464 return ((__m128)vec_or ((__v4sf)__A, (__v4sf)__B));
465}
466
467extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
468_mm_xor_ps (__m128 __A, __m128 __B)
469{
470 return ((__m128)vec_xor ((__v4sf)__A, (__v4sf)__B));
471}
472
473/* Perform a comparison on the four SPFP values of A and B. For each
474 element, if the comparison is true, place a mask of all ones in the
475 result, otherwise a mask of zeros. */
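/* Because each element of the result is a full-width bit mask, it combines
   naturally with the logical operations above for branch-free selects; for
   example (illustrative only), _mm_and_ps (_mm_cmpgt_ps (__a, __b), __a)
   keeps the elements of __a that are greater than __b and zeroes the rest. */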
476extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
477_mm_cmpeq_ps (__m128 __A, __m128 __B)
478{
479 return ((__m128)vec_cmpeq ((__v4sf)__A,(__v4sf) __B));
480}
481
482extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
483_mm_cmplt_ps (__m128 __A, __m128 __B)
484{
485 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
486}
487
488extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
489_mm_cmple_ps (__m128 __A, __m128 __B)
490{
491 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
492}
493
494extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
495_mm_cmpgt_ps (__m128 __A, __m128 __B)
496{
497 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
498}
499
500extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
501_mm_cmpge_ps (__m128 __A, __m128 __B)
502{
503 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
504}
505
506extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
507_mm_cmpneq_ps (__m128 __A, __m128 __B)
508{
509 __v4sf temp = (__v4sf ) vec_cmpeq ((__v4sf) __A, (__v4sf)__B);
510 return ((__m128)vec_nor (temp, temp));
511}
512
513extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
514_mm_cmpnlt_ps (__m128 __A, __m128 __B)
515{
516 return ((__m128)vec_cmpge ((__v4sf)__A, (__v4sf)__B));
517}
518
519extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
520_mm_cmpnle_ps (__m128 __A, __m128 __B)
521{
522 return ((__m128)vec_cmpgt ((__v4sf)__A, (__v4sf)__B));
523}
524
525extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
526_mm_cmpngt_ps (__m128 __A, __m128 __B)
527{
528 return ((__m128)vec_cmple ((__v4sf)__A, (__v4sf)__B));
529}
530
531extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
532_mm_cmpnge_ps (__m128 __A, __m128 __B)
533{
534 return ((__m128)vec_cmplt ((__v4sf)__A, (__v4sf)__B));
535}
536
537extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
538_mm_cmpord_ps (__m128 __A, __m128 __B)
539{
540 __vector unsigned int a, b;
541 __vector unsigned int c, d;
542 static const __vector unsigned int float_exp_mask =
543 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
544
545 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
546 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
547 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
548 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
549 return ((__m128 ) vec_and (c, d));
550}
551
552extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
553_mm_cmpunord_ps (__m128 __A, __m128 __B)
554{
555 __vector unsigned int a, b;
556 __vector unsigned int c, d;
557 static const __vector unsigned int float_exp_mask =
558 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
559
560 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
561 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
562 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
563 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
564 return ((__m128 ) vec_or (c, d));
565}
566
567/* Perform a comparison on the lower SPFP values of A and B. If the
568 comparison is true, place a mask of all ones in the result, otherwise a
569 mask of zeros. The upper three SPFP values are passed through from A. */
570extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
571_mm_cmpeq_ss (__m128 __A, __m128 __B)
572{
573 static const __vector unsigned int mask =
574 { 0xffffffff, 0, 0, 0 };
575 __v4sf a, b, c;
576 /* PowerISA VMX does not allow partial (for just element 0)
577 * results. So to ensure we don't generate spurious exceptions
578 * (from the upper elements) we splat the lower float
579 * before we do the operation. */
580 a = vec_splat ((__v4sf) __A, 0);
581 b = vec_splat ((__v4sf) __B, 0);
582 c = (__v4sf) vec_cmpeq(a, b);
583 /* Then we merge the lower float result with the original upper
584 * float elements from __A. */
585 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
586}
587
588extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
589_mm_cmplt_ss (__m128 __A, __m128 __B)
590{
591 static const __vector unsigned int mask =
592 { 0xffffffff, 0, 0, 0 };
593 __v4sf a, b, c;
594 /* PowerISA VMX does not allow partial (for just element 0)
595 * results. So to ensure we don't generate spurious exceptions
596 * (from the upper elements) we splat the lower float
597 * before we do the operation. */
598 a = vec_splat ((__v4sf) __A, 0);
599 b = vec_splat ((__v4sf) __B, 0);
600 c = (__v4sf) vec_cmplt(a, b);
601 /* Then we merge the lower float result with the original upper
602 * float elements from __A. */
603 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
604}
605
606extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
607_mm_cmple_ss (__m128 __A, __m128 __B)
608{
609 static const __vector unsigned int mask =
610 { 0xffffffff, 0, 0, 0 };
611 __v4sf a, b, c;
612 /* PowerISA VMX does not allow partial (for just element 0)
613 * results. So to ensure we don't generate spurious exceptions
614 * (from the upper elements) we splat the lower float
615 * before we do the operation. */
616 a = vec_splat ((__v4sf) __A, 0);
617 b = vec_splat ((__v4sf) __B, 0);
618 c = (__v4sf) vec_cmple(a, b);
619 /* Then we merge the lower float result with the original upper
620 * float elements from __A. */
621 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
622}
623
624extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
625_mm_cmpgt_ss (__m128 __A, __m128 __B)
626{
627 static const __vector unsigned int mask =
628 { 0xffffffff, 0, 0, 0 };
629 __v4sf a, b, c;
630 /* PowerISA VMX does not allow partial (for just element 0)
631 * results. So to ensure we don't generate spurious exceptions
632 * (from the upper elements) we splat the lower float
633 * before we do the operation. */
634 a = vec_splat ((__v4sf) __A, 0);
635 b = vec_splat ((__v4sf) __B, 0);
636 c = (__v4sf) vec_cmpgt(a, b);
637 /* Then we merge the lower float result with the original upper
638 * float elements from __A. */
639 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
640}
641
642extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
643_mm_cmpge_ss (__m128 __A, __m128 __B)
644{
645 static const __vector unsigned int mask =
646 { 0xffffffff, 0, 0, 0 };
647 __v4sf a, b, c;
648 /* PowerISA VMX does not allow partial (for just element 0)
649 * results. So to ensure we don't generate spurious exceptions
650 * (from the upper elements) we splat the lower float
651 * before we do the operation. */
652 a = vec_splat ((__v4sf) __A, 0);
653 b = vec_splat ((__v4sf) __B, 0);
654 c = (__v4sf) vec_cmpge(a, b);
655 /* Then we merge the lower float result with the original upper
656 * float elements from __A. */
657 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
658}
659
660extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
661_mm_cmpneq_ss (__m128 __A, __m128 __B)
662{
663 static const __vector unsigned int mask =
664 { 0xffffffff, 0, 0, 0 };
665 __v4sf a, b, c;
666 /* PowerISA VMX does not allow partial (for just element 0)
667 * results. So to ensure we don't generate spurious exceptions
668 * (from the upper elements) we splat the lower float
669 * before we do the operation. */
670 a = vec_splat ((__v4sf) __A, 0);
671 b = vec_splat ((__v4sf) __B, 0);
672 c = (__v4sf) vec_cmpeq(a, b);
673 c = vec_nor (c, c);
674 /* Then we merge the lower float result with the original upper
675 * float elements from __A. */
676 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
677}
678
679extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
680_mm_cmpnlt_ss (__m128 __A, __m128 __B)
681{
682 static const __vector unsigned int mask =
683 { 0xffffffff, 0, 0, 0 };
684 __v4sf a, b, c;
685 /* PowerISA VMX does not allow partial (for just element 0)
686 * results. So to ensure we don't generate spurious exceptions
687 * (from the upper elements) we splat the lower float
688 * before we do the operation. */
689 a = vec_splat ((__v4sf) __A, 0);
690 b = vec_splat ((__v4sf) __B, 0);
691 c = (__v4sf) vec_cmpge(a, b);
692 /* Then we merge the lower float result with the original upper
693 * float elements from __A. */
694 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
695}
696
697extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
698_mm_cmpnle_ss (__m128 __A, __m128 __B)
699{
700 static const __vector unsigned int mask =
701 { 0xffffffff, 0, 0, 0 };
702 __v4sf a, b, c;
703 /* PowerISA VMX does not allow partial (for just element 0)
704 * results. So to ensure we don't generate spurious exceptions
705 * (from the upper elements) we splat the lower float
706 * before we do the operation. */
707 a = vec_splat ((__v4sf) __A, 0);
708 b = vec_splat ((__v4sf) __B, 0);
709 c = (__v4sf) vec_cmpgt(a, b);
710 /* Then we merge the lower float result with the original upper
711 * float elements from __A. */
712 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
713}
714
715extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
716_mm_cmpngt_ss (__m128 __A, __m128 __B)
717{
718 static const __vector unsigned int mask =
719 { 0xffffffff, 0, 0, 0 };
720 __v4sf a, b, c;
721 /* PowerISA VMX does not allow partial (for just element 0)
722 * results. So to ensure we don't generate spurious exceptions
723 * (from the upper elements) we splat the lower float
724 * before we do the operation. */
725 a = vec_splat ((__v4sf) __A, 0);
726 b = vec_splat ((__v4sf) __B, 0);
727 c = (__v4sf) vec_cmple(a, b);
728 /* Then we merge the lower float result with the original upper
729 * float elements from __A. */
730 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
731}
732
733extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
734_mm_cmpnge_ss (__m128 __A, __m128 __B)
735{
736 static const __vector unsigned int mask =
737 { 0xffffffff, 0, 0, 0 };
738 __v4sf a, b, c;
739 /* PowerISA VMX does not allow partial (for just element 0)
740 * results. So to ensure we don't generate spurious exceptions
741 * (from the upper elements) we splat the lower float
742 * before we do the operation. */
743 a = vec_splat ((__v4sf) __A, 0);
744 b = vec_splat ((__v4sf) __B, 0);
745 c = (__v4sf) vec_cmplt(a, b);
746 /* Then we merge the lower float result with the original upper
747 * float elements from __A. */
748 return ((__m128)vec_sel ((__v4sf)__A, c, mask));
749}
750
751extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
752_mm_cmpord_ss (__m128 __A, __m128 __B)
753{
754 __vector unsigned int a, b;
755 __vector unsigned int c, d;
756 static const __vector unsigned int float_exp_mask =
757 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
758 static const __vector unsigned int mask =
759 { 0xffffffff, 0, 0, 0 };
760
761 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
762 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
763 c = (__vector unsigned int) vec_cmpgt (float_exp_mask, a);
764 d = (__vector unsigned int) vec_cmpgt (float_exp_mask, b);
765 c = vec_and (c, d);
766 /* Then we merge the lower float result with the original upper
767 * float elements from __A. */
768 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
769}
770
771extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
772_mm_cmpunord_ss (__m128 __A, __m128 __B)
773{
774 __vector unsigned int a, b;
775 __vector unsigned int c, d;
776 static const __vector unsigned int float_exp_mask =
777 { 0x7f800000, 0x7f800000, 0x7f800000, 0x7f800000 };
778 static const __vector unsigned int mask =
779 { 0xffffffff, 0, 0, 0 };
780
781 a = (__vector unsigned int) vec_abs ((__v4sf)__A);
782 b = (__vector unsigned int) vec_abs ((__v4sf)__B);
783 c = (__vector unsigned int) vec_cmpgt (a, float_exp_mask);
784 d = (__vector unsigned int) vec_cmpgt (b, float_exp_mask);
785 c = vec_or (c, d);
786 /* Then we merge the lower float result with the original upper
787 * float elements from __A. */
788 return ((__m128)vec_sel ((__v4sf)__A, (__v4sf)c, mask));
789}
790
791/* Compare the lower SPFP values of A and B and return 1 if true
792 and 0 if false. */
793extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
794_mm_comieq_ss (__m128 __A, __m128 __B)
795{
796 return (__A[0] == __B[0]);
797}
798
799extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
800_mm_comilt_ss (__m128 __A, __m128 __B)
801{
802 return (__A[0] < __B[0]);
803}
804
805extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
806_mm_comile_ss (__m128 __A, __m128 __B)
807{
808 return (__A[0] <= __B[0]);
809}
810
811extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
812_mm_comigt_ss (__m128 __A, __m128 __B)
813{
814 return (__A[0] > __B[0]);
815}
816
817extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
818_mm_comige_ss (__m128 __A, __m128 __B)
819{
820 return (__A[0] >= __B[0]);
821}
822
823extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
824_mm_comineq_ss (__m128 __A, __m128 __B)
825{
826 return (__A[0] != __B[0]);
827}
828
829/* FIXME
830 * The _mm_ucomi??_ss implementations below are exactly the same as
831 * the _mm_comi??_ss ones because GCC for PowerPC only generates unordered
832 * compares (scalar and vector).
833 * Technically _mm_comieq_ss et al. should be using the ordered
834 * compare and signal for QNaNs.
835 * The _mm_ucomieq_ss et al. should be OK, as is.
836 */
837extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
838_mm_ucomieq_ss (__m128 __A, __m128 __B)
839{
840 return (__A[0] == __B[0]);
841}
842
843extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
844_mm_ucomilt_ss (__m128 __A, __m128 __B)
845{
846 return (__A[0] < __B[0]);
847}
848
849extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
850_mm_ucomile_ss (__m128 __A, __m128 __B)
851{
852 return (__A[0] <= __B[0]);
853}
854
855extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
856_mm_ucomigt_ss (__m128 __A, __m128 __B)
857{
858 return (__A[0] > __B[0]);
859}
860
861extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
862_mm_ucomige_ss (__m128 __A, __m128 __B)
863{
864 return (__A[0] >= __B[0]);
865}
866
867extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
868_mm_ucomineq_ss (__m128 __A, __m128 __B)
869{
870 return (__A[0] != __B[0]);
871}
872
873extern __inline float __attribute__((__gnu_inline__, __always_inline__, __artificial__))
874_mm_cvtss_f32 (__m128 __A)
875{
876 return ((__v4sf)__A)[0];
877}
878
879/* Convert the lower SPFP value to a 32-bit integer according to the current
880 rounding mode. */
881extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
882_mm_cvtss_si32 (__m128 __A)
883{
884 __m64 res = 0;
885#ifdef _ARCH_PWR8
886 double dtmp;
887 __asm__(
888#ifdef __LITTLE_ENDIAN__
889 "xxsldwi %x0,%x0,%x0,3;\n"
890#endif
891 "xscvspdp %x2,%x0;\n"
892 "fctiw %2,%2;\n"
893 "mfvsrd %1,%x2;\n"
894 : "+wa" (__A),
895 "=r" (res),
896 "=f" (dtmp)
897 : );
898#else
899 res = __builtin_rint(__A[0]);
900#endif
901 return (res);
902}
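/* Note on the inline asm above: on little-endian targets xxsldwi first
   rotates the scalar into the word slot expected by the VSX scalar convert,
   xscvspdp widens it to double precision, fctiw converts it to a 32-bit
   integer using the current rounding mode, and mfvsrd moves the result into
   a GPR.  */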
903
904extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
905_mm_cvt_ss2si (__m128 __A)
906{
907 return _mm_cvtss_si32 (__A);
908}
909
910/* Convert the lower SPFP value to a 64-bit integer according to the
911 current rounding mode. */
912
913/* Intel intrinsic. */
914extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
915_mm_cvtss_si64 (__m128 __A)
916{
917 __m64 res = 0;
918#ifdef _ARCH_PWR8
919 double dtmp;
920 __asm__(
921#ifdef __LITTLE_ENDIAN__
922 "xxsldwi %x0,%x0,%x0,3;\n"
923#endif
924 "xscvspdp %x2,%x0;\n"
925 "fctid %2,%2;\n"
926 "mfvsrd %1,%x2;\n"
927 : "+wa" (__A),
928 "=r" (res),
929 "=f" (dtmp)
930 : );
931#else
932 res = __builtin_llrint(__A[0]);
933#endif
934 return (res);
935}
936
937/* Microsoft intrinsic. */
938extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939_mm_cvtss_si64x (__m128 __A)
940{
941 return _mm_cvtss_si64 ((__v4sf) __A);
942}
943
944/* Constants for use with _mm_prefetch. */
945enum _mm_hint
946{
947 /* _MM_HINT_ET is _MM_HINT_T with the 3rd bit set. */
948 _MM_HINT_ET0 = 7,
949 _MM_HINT_ET1 = 6,
950 _MM_HINT_T0 = 3,
951 _MM_HINT_T1 = 2,
952 _MM_HINT_T2 = 1,
953 _MM_HINT_NTA = 0
954};
955
956/* Loads one cache line from address P to a location "closer" to the
957 processor. The selector I specifies the type of prefetch operation. */
958extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
959_mm_prefetch (const void *__P, enum _mm_hint __I)
960{
961 /* This PowerPC implementation currently ignores the hint parameter. */
962 __builtin_prefetch (__P);
963}
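/* Illustrative use (the pointer name is hypothetical): a streaming loop
   might issue _mm_prefetch ((const void *) (ptr + 64), _MM_HINT_T0); the
   hint argument is accepted for source compatibility but, as noted above,
   currently ignored.  */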
964
965/* Convert the two lower SPFP values to 32-bit integers according to the
966 current rounding mode. Return the integers in packed form. */
967extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
968_mm_cvtps_pi32 (__m128 __A)
969{
971 __v4sf temp, rounded;
972 __vector unsigned long long result;
973
974 /* Splat two lower SPFP values to both halves. */
975 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
976 rounded = vec_rint(temp);
977 result = (__vector unsigned long long) vec_cts (rounded, 0);
978
979 return (__m64) ((__vector long long) result)[0];
980}
981
982extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
983_mm_cvt_ps2pi (__m128 __A)
984{
985 return _mm_cvtps_pi32 (__A);
986}
987
988/* Truncate the lower SPFP value to a 32-bit integer. */
989extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
990_mm_cvttss_si32 (__m128 __A)
991{
992 /* Extract the lower float element. */
993 float temp = __A[0];
994 /* truncate to 32-bit integer and return. */
995 return temp;
996}
997
998extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
999_mm_cvtt_ss2si (__m128 __A)
1000{
1001 return _mm_cvttss_si32 (__A);
1002}
1003
1004/* Intel intrinsic. */
1005extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1006_mm_cvttss_si64 (__m128 __A)
1007{
1008 /* Extract the lower float element. */
1009 float temp = __A[0];
1010 /* Truncate to a 64-bit integer and return. */
1011 return temp;
1012}
1013
1014/* Microsoft intrinsic. */
1015extern __inline long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1016_mm_cvttss_si64x (__m128 __A)
1017{
1018 /* Extract the lower float element. */
1019 float temp = __A[0];
1020 /* Truncate to a 64-bit integer and return. */
1021 return temp;
1022}
1023
1024/* Truncate the two lower SPFP values to 32-bit integers. Return the
1025 integers in packed form. */
1026extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1027_mm_cvttps_pi32 (__m128 __A)
1028{
1029 __v4sf temp;
1030 __vector unsigned long long result;
1031
1032 /* Splat two lower SPFP values to both halves. */
1033 temp = (__v4sf) vec_splat ((__vector long long)__A, 0);
1034 result = (__vector unsigned long long) vec_cts (temp, 0);
1035
1036 return (__m64) ((__vector long long) result)[0];
1037}
1038
1039extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1040_mm_cvtt_ps2pi (__m128 __A)
1041{
1042 return _mm_cvttps_pi32 (__A);
1043}
1044
1045/* Convert B to a SPFP value and insert it as element zero in A. */
1046extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1047_mm_cvtsi32_ss (__m128 __A, int __B)
1048{
1049 float temp = __B;
1050 __A[0] = temp;
1051
1052 return __A;
1053}
1054
1055extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1056_mm_cvt_si2ss (__m128 __A, int __B)
1057{
1058 return _mm_cvtsi32_ss (__A, __B);
1059}
1060
1061/* Convert B to a SPFP value and insert it as element zero in A. */
1062/* Intel intrinsic. */
1063extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1064_mm_cvtsi64_ss (__m128 __A, long long __B)
1065{
1066 float temp = __B;
1067 __A[0] = temp;
1068
1069 return __A;
1070}
1071
1072/* Microsoft intrinsic. */
1073extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074_mm_cvtsi64x_ss (__m128 __A, long long __B)
1075{
1076 return _mm_cvtsi64_ss (__A, __B);
1077}
1078
1079/* Convert the two 32-bit values in B to SPFP form and insert them
1080 as the two lower elements in A. */
1081extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1082_mm_cvtpi32_ps (__m128 __A, __m64 __B)
1083{
1084 __vector signed int vm1;
1085 __vector float vf1;
1086
1087 vm1 = (__vector signed int) (__vector unsigned long long) {__B, __B};
1088 vf1 = (__vector float) vec_ctf (vm1, 0);
1089
1090 return ((__m128) (__vector unsigned long long)
1091 { ((__vector unsigned long long)vf1) [0],
1092 ((__vector unsigned long long)__A) [1]});
1093}
1094
1095extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1096_mm_cvt_pi2ps (__m128 __A, __m64 __B)
1097{
1098 return _mm_cvtpi32_ps (__A, __B);
1099}
1100
1101/* Convert the four signed 16-bit values in A to SPFP form. */
1102extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1103_mm_cvtpi16_ps (__m64 __A)
1104{
1105 __vector signed short vs8;
1106 __vector signed int vi4;
1107 __vector float vf1;
1108
1109 vs8 = (__vector signed short) (__vector unsigned long long) { __A, __A };
1110 vi4 = vec_vupklsh (vs8);
1111 vf1 = (__vector float) vec_ctf (vi4, 0);
1112
1113 return (__m128) vf1;
1114}
1115
1116/* Convert the four unsigned 16-bit values in A to SPFP form. */
1117extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1118_mm_cvtpu16_ps (__m64 __A)
1119{
1120 const __vector unsigned short zero =
1121 { 0, 0, 0, 0, 0, 0, 0, 0 };
1122 __vector unsigned short vs8;
1123 __vector unsigned int vi4;
1124 __vector float vf1;
1125
1126 vs8 = (__vector unsigned short) (__vector unsigned long long) { __A, __A };
1127 vi4 = (__vector unsigned int) vec_mergel
1128#ifdef __LITTLE_ENDIAN__
1129 (vs8, zero);
1130#else
1131 (zero, vs8);
1132#endif
1133 vf1 = (__vector float) vec_ctf (vi4, 0);
1134
1135 return (__m128) vf1;
1136}
1137
1138/* Convert the low four signed 8-bit values in A to SPFP form. */
1139extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1140_mm_cvtpi8_ps (__m64 __A)
1141{
1142 __vector signed char vc16;
1143 __vector signed short vs8;
1144 __vector signed int vi4;
1145 __vector float vf1;
1146
1147 vc16 = (__vector signed char) (__vector unsigned long long) { __A, __A };
1148 vs8 = vec_vupkhsb (vc16);
1149 vi4 = vec_vupkhsh (vs8);
1150 vf1 = (__vector float) vec_ctf (vi4, 0);
1151
1152 return (__m128) vf1;
1153}
1154
1155/* Convert the low four unsigned 8-bit values in A to SPFP form. */
1156extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1157
1158_mm_cvtpu8_ps (__m64 __A)
1159{
1160 const __vector unsigned char zero =
1161 { 0, 0, 0, 0, 0, 0, 0, 0 };
1162 __vector unsigned char vc16;
1163 __vector unsigned short vs8;
1164 __vector unsigned int vi4;
1165 __vector float vf1;
1166
1167 vc16 = (__vector unsigned char) (__vector unsigned long long) { __A, __A };
1168#ifdef __LITTLE_ENDIAN__
1169 vs8 = (__vector unsigned short) vec_mergel (vc16, zero);
1170 vi4 = (__vector unsigned int) vec_mergeh (vs8,
1171 (__vector unsigned short) zero);
1172#else
1173 vs8 = (__vector unsigned short) vec_mergel (zero, vc16);
1174 vi4 = (__vector unsigned int) vec_mergeh ((__vector unsigned short) zero,
1175 vs8);
1176#endif
1177 vf1 = (__vector float) vec_ctf (vi4, 0);
1178
1179 return (__m128) vf1;
1180}
1181
1182/* Convert the four signed 32-bit values in A and B to SPFP form. */
1183extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1184_mm_cvtpi32x2_ps (__m64 __A, __m64 __B)
1185{
1186 __vector signed int vi4;
1187 __vector float vf4;
1188
1189 vi4 = (__vector signed int) (__vector unsigned long long) { __A, __B };
1190 vf4 = (__vector float) vec_ctf (vi4, 0);
1191 return (__m128) vf4;
1192}
1193
1194/* Convert the four SPFP values in A to four signed 16-bit integers. */
1195extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1196_mm_cvtps_pi16 (__m128 __A)
1197{
1198 __v4sf rounded;
1199 __vector signed int temp;
1200 __vector unsigned long long result;
1201
1202 rounded = vec_rint(__A);
1203 temp = vec_cts (rounded, 0);
1204 result = (__vector unsigned long long) vec_pack (temp, temp);
1205
1206 return (__m64) ((__vector long long) result)[0];
1207}
1208
1209/* Convert the four SPFP values in A to four signed 8-bit integers. */
1210extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1211_mm_cvtps_pi8 (__m128 __A)
1212{
1213 __v4sf rounded;
1214 __vector signed int tmp_i;
1215 static const __vector signed int zero = {0, 0, 0, 0};
1216 __vector signed short tmp_s;
1217 __vector signed char res_v;
1218
1219 rounded = vec_rint(__A);
1220 tmp_i = vec_cts (rounded, 0);
1221 tmp_s = vec_pack (tmp_i, zero);
1222 res_v = vec_pack (tmp_s, tmp_s);
1223 return (__m64) ((__vector long long) res_v)[0];
1224}
1225
1226/* Selects four specific SPFP values from A and B based on MASK. */
1227extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228
1229_mm_shuffle_ps (__m128 __A, __m128 __B, int const __mask)
1230{
1231 unsigned long element_selector_10 = __mask & 0x03;
1232 unsigned long element_selector_32 = (__mask >> 2) & 0x03;
1233 unsigned long element_selector_54 = (__mask >> 4) & 0x03;
1234 unsigned long element_selector_76 = (__mask >> 6) & 0x03;
1235 static const unsigned int permute_selectors[4] =
1236 {
1237#ifdef __LITTLE_ENDIAN__
1238 0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
1239#else
1240 0x00010203, 0x04050607, 0x08090A0B, 0x0C0D0E0F
1241#endif
1242 };
1243 __vector unsigned int t;
1244
1245 t[0] = permute_selectors[element_selector_10];
1246 t[1] = permute_selectors[element_selector_32];
1247 t[2] = permute_selectors[element_selector_54] + 0x10101010;
1248 t[3] = permute_selectors[element_selector_76] + 0x10101010;
1249 return vec_perm ((__v4sf) __A, (__v4sf)__B, (__vector unsigned char)t);
1250}
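/* A worked example of the selector decoding above (values illustrative):
   _mm_shuffle_ps (__A, __B, _MM_SHUFFLE (1, 0, 3, 2)) encodes the mask 0x4E,
   so the result is { __A[2], __A[3], __B[0], __B[1] }: the low two elements
   are taken from __A and the high two from __B, two mask bits at a time.  */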
1251
1252/* Selects and interleaves the upper two SPFP values from A and B. */
1253extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1254_mm_unpackhi_ps (__m128 __A, __m128 __B)
1255{
1256 return (__m128) vec_vmrglw ((__v4sf) __A, (__v4sf)__B);
1257}
1258
1259/* Selects and interleaves the lower two SPFP values from A and B. */
1260extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1261_mm_unpacklo_ps (__m128 __A, __m128 __B)
1262{
1263 return (__m128) vec_vmrghw ((__v4sf) __A, (__v4sf)__B);
1264}
1265
1266/* Sets the upper two SPFP values with 64 bits of data loaded from P;
1267 the lower two values are passed through from A. */
1268extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1269_mm_loadh_pi (__m128 __A, __m64 const *__P)
1270{
1271 __vector unsigned long long __a = (__vector unsigned long long)__A;
1272 __vector unsigned long long __p = vec_splats(*__P);
1273 __a [1] = __p [1];
1274
1275 return (__m128)__a;
1276}
1277
1278/* Stores the upper two SPFP values of A into P. */
1279extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1280_mm_storeh_pi (__m64 *__P, __m128 __A)
1281{
1282 __vector unsigned long long __a = (__vector unsigned long long) __A;
1283
1284 *__P = __a[1];
1285}
1286
1287/* Moves the upper two values of B into the lower two values of A. */
1288extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1289_mm_movehl_ps (__m128 __A, __m128 __B)
1290{
1291 return (__m128) vec_mergel ((__vector unsigned long long)__B,
1292 (__vector unsigned long long)__A);
1293}
1294
1295/* Moves the lower two values of B into the upper two values of A. */
1296extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1297_mm_movelh_ps (__m128 __A, __m128 __B)
1298{
1299 return (__m128) vec_mergeh ((__vector unsigned long long)__A,
1300 (__vector unsigned long long)__B);
1301}
1302
1303/* Sets the lower two SPFP values with 64 bits of data loaded from P;
1304 the upper two values are passed through from A. */
1305extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1306_mm_loadl_pi (__m128 __A, __m64 const *__P)
1307{
1308 __vector unsigned long long __a = (__vector unsigned long long)__A;
1309 __vector unsigned long long __p = vec_splats(*__P);
1310 __a [0] = __p [0];
1311
1312 return (__m128)__a;
1313}
1314
1315/* Stores the lower two SPFP values of A into P. */
1316extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1317_mm_storel_pi (__m64 *__P, __m128 __A)
1318{
1319 __vector unsigned long long __a = (__vector unsigned long long) __A;
1320
1321 *__P = __a[0];
1322}
1323
1324#ifdef _ARCH_PWR8
1325/* Intrinsic functions that require PowerISA 2.07 minimum. */
1326
1327/* Creates a 4-bit mask from the most significant bits of the SPFP values. */
1328extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1329_mm_movemask_ps (__m128 __A)
1330{
1331 __vector unsigned long long result;
1332 static const __vector unsigned int perm_mask =
1333 {
1334#ifdef __LITTLE_ENDIAN__
1335 0x00204060, 0x80808080, 0x80808080, 0x80808080
1336#else
1337 0x80808080, 0x80808080, 0x80808080, 0x00204060
1338#endif
1339 };
1340
1341 result = ((__vector unsigned long long)
1342 vec_vbpermq ((__vector unsigned char) __A,
1343 (__vector unsigned char) perm_mask));
1344
1345#ifdef __LITTLE_ENDIAN__
1346 return result[1];
1347#else
1348 return result[0];
1349#endif
1350}
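/* For example (values illustrative): with __A = { -1.0f, 2.0f, -3.0f, 4.0f }
   the sign bits gathered from elements 0 through 3 are 1, 0, 1, 0, so the
   function returns 0x5.  */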
1351#endif /* _ARCH_PWR8 */
1352
1353/* Create a vector with all four elements equal to *P. */
1354extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1355_mm_load1_ps (float const *__P)
1356{
1357 return _mm_set1_ps (*__P);
1358}
1359
1360extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1361_mm_load_ps1 (float const *__P)
1362{
1363 return _mm_load1_ps (__P);
1364}
1365
1366/* Extracts one of the four words of A. The selector N must be immediate. */
1367extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1368_mm_extract_pi16 (__m64 const __A, int const __N)
1369{
1370 unsigned int shiftr = __N & 3;
1371#ifdef __BIG_ENDIAN__
1372 shiftr = 3 - shiftr;
1373#endif
1374
1375 return ((__A >> (shiftr * 16)) & 0xffff);
1376}
1377
1378extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1379_m_pextrw (__m64 const __A, int const __N)
1380{
1381 return _mm_extract_pi16 (__A, __N);
1382}
1383
1384/* Inserts word D into one of four words of A. The selector N must be
1385 immediate. */
1386extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1387_mm_insert_pi16 (__m64 const __A, int const __D, int const __N)
1388{
1389 const int shiftl = (__N & 3) * 16;
1390 const __m64 shiftD = (const __m64) __D << shiftl;
1391 const __m64 mask = 0xffffUL << shiftl;
1392 __m64 result = (__A & (~mask)) | (shiftD & mask);
1393
1394 return (result);
1395}
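/* A worked example of the masking arithmetic above (values illustrative):
   inserting __D = 0x1234 at word position __N = 1 into __A = 0 builds
   shiftD = 0x12340000 and mask = 0xffff0000, so the result is
   0x0000000012340000.  */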
1396
1397extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1398_m_pinsrw (__m64 const __A, int const __D, int const __N)
1399{
1400 return _mm_insert_pi16 (__A, __D, __N);
1401}
1402
1403/* Compute the element-wise maximum of signed 16-bit values. */
1404extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405
1406_mm_max_pi16 (__m64 __A, __m64 __B)
1407{
1408#if _ARCH_PWR8
1409 __vector signed short a, b, r;
1410 __vector __bool short c;
1411
1412 a = (__vector signed short)vec_splats (__A);
1413 b = (__vector signed short)vec_splats (__B);
1414 c = (__vector __bool short)vec_cmpgt (a, b);
1415 r = vec_sel (b, a, c);
1416 return (__m64) ((__vector long long) r)[0];
1417#else
1418 __m64_union m1, m2, res;
1419
1420 m1.as_m64 = __A;
1421 m2.as_m64 = __B;
1422
1423 res.as_short[0] =
1424 (m1.as_short[0] > m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1425 res.as_short[1] =
1426 (m1.as_short[1] > m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1427 res.as_short[2] =
1428 (m1.as_short[2] > m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1429 res.as_short[3] =
1430 (m1.as_short[3] > m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1431
1432 return (__m64) res.as_m64;
1433#endif
1434}
1435
1436extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1437_m_pmaxsw (__m64 __A, __m64 __B)
1438{
1439 return _mm_max_pi16 (__A, __B);
1440}
1441
1442/* Compute the element-wise maximum of unsigned 8-bit values. */
1443extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1444_mm_max_pu8 (__m64 __A, __m64 __B)
1445{
1446#if _ARCH_PWR8
1447 __vector unsigned char a, b, r;
1448 __vector __bool char c;
1449
1450 a = (__vector unsigned char)vec_splats (__A);
1451 b = (__vector unsigned char)vec_splats (__B);
1452 c = (__vector __bool char)vec_cmpgt (a, b);
1453 r = vec_sel (b, a, c);
1454 return (__m64) ((__vector long long) r)[0];
1455#else
1456 __m64_union m1, m2, res;
1457 long i;
1458
1459 m1.as_m64 = __A;
1460 m2.as_m64 = __B;
1461
1462
1463 for (i = 0; i < 8; i++)
1464 res.as_char[i] =
1465 ((unsigned char) m1.as_char[i] > (unsigned char) m2.as_char[i]) ?
1466 m1.as_char[i] : m2.as_char[i];
1467
1468 return (__m64) res.as_m64;
1469#endif
1470}
1471
1472extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1473_m_pmaxub (__m64 __A, __m64 __B)
1474{
1475 return _mm_max_pu8 (__A, __B);
1476}
1477
1478/* Compute the element-wise minimum of signed 16-bit values. */
1479extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1480_mm_min_pi16 (__m64 __A, __m64 __B)
1481{
1482#if _ARCH_PWR8
1483 __vector signed short a, b, r;
1484 __vector __bool short c;
1485
1486 a = (__vector signed short)vec_splats (__A);
1487 b = (__vector signed short)vec_splats (__B);
1488 c = (__vector __bool short)vec_cmplt (a, b);
1489 r = vec_sel (b, a, c);
1490 return (__m64) ((__vector long long) r)[0];
1491#else
1492 __m64_union m1, m2, res;
1493
1494 m1.as_m64 = __A;
1495 m2.as_m64 = __B;
1496
1497 res.as_short[0] =
1498 (m1.as_short[0] < m2.as_short[0]) ? m1.as_short[0] : m2.as_short[0];
1499 res.as_short[1] =
1500 (m1.as_short[1] < m2.as_short[1]) ? m1.as_short[1] : m2.as_short[1];
1501 res.as_short[2] =
1502 (m1.as_short[2] < m2.as_short[2]) ? m1.as_short[2] : m2.as_short[2];
1503 res.as_short[3] =
1504 (m1.as_short[3] < m2.as_short[3]) ? m1.as_short[3] : m2.as_short[3];
1505
1506 return (__m64) res.as_m64;
1507#endif
1508}
1509
1510extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1511_m_pminsw (__m64 __A, __m64 __B)
1512{
1513 return _mm_min_pi16 (__A, __B);
1514}
1515
1516/* Compute the element-wise minimum of unsigned 8-bit values. */
1517extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1518_mm_min_pu8 (__m64 __A, __m64 __B)
1519{
1520#if _ARCH_PWR8
1521 __vector unsigned char a, b, r;
1522 __vector __bool char c;
1523
1524 a = (__vector unsigned char)vec_splats (__A);
1525 b = (__vector unsigned char)vec_splats (__B);
1526 c = (__vector __bool char)vec_cmplt (a, b);
1527 r = vec_sel (b, a, c);
1528 return (__m64) ((__vector long long) r)[0];
1529#else
1530 __m64_union m1, m2, res;
1531 long i;
1532
1533 m1.as_m64 = __A;
1534 m2.as_m64 = __B;
1535
1536
1537 for (i = 0; i < 8; i++)
1538 res.as_char[i] =
1539 ((unsigned char) m1.as_char[i] < (unsigned char) m2.as_char[i]) ?
1540 m1.as_char[i] : m2.as_char[i];
1541
1542 return (__m64) res.as_m64;
1543#endif
1544}
1545
1546extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1547_m_pminub (__m64 __A, __m64 __B)
1548{
1549 return _mm_min_pu8 (__A, __B);
1550}
1551
1552/* Create an 8-bit mask of the signs of 8-bit values. */
1553extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1554_mm_movemask_pi8 (__m64 __A)
1555{
1556 unsigned long long p =
1557#ifdef __LITTLE_ENDIAN__
1558 0x0008101820283038UL; // permute control for sign bits
1559#else
1560 0x3830282018100800UL; // permute control for sign bits
1561#endif
1562 return __builtin_bpermd (p, __A);
1563}
1564
1565extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1566_m_pmovmskb (__m64 __A)
1567{
1568 return _mm_movemask_pi8 (__A);
1569}
1570
1571/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
1572 in B and produce the high 16 bits of the 32-bit results. */
1573extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1574_mm_mulhi_pu16 (__m64 __A, __m64 __B)
1575{
1576 __vector unsigned short a, b;
1577 __vector unsigned short c;
1578 __vector unsigned int w0, w1;
1579 __vector unsigned char xform1 = {
1580#ifdef __LITTLE_ENDIAN__
1581 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17,
1582 0x0A, 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1583#else
1584 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15,
1585 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1586#endif
1587 };
1588
1589 a = (__vector unsigned short)vec_splats (__A);
1590 b = (__vector unsigned short)vec_splats (__B);
1591
1592 w0 = vec_vmuleuh (a, b);
1593 w1 = vec_vmulouh (a, b);
1594 c = (__vector unsigned short)vec_perm (w0, w1, xform1);
1595
1596 return (__m64) ((__vector long long) c)[0];
1597}
1598
1599extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1600_m_pmulhuw (__m64 __A, __m64 __B)
1601{
1602 return _mm_mulhi_pu16 (__A, __B);
1603}
1604
1605/* Return a combination of the four 16-bit values in A. The selector
1606 must be an immediate. */
1607extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1608_mm_shuffle_pi16 (__m64 __A, int const __N)
1609{
1610 unsigned long element_selector_10 = __N & 0x03;
1611 unsigned long element_selector_32 = (__N >> 2) & 0x03;
1612 unsigned long element_selector_54 = (__N >> 4) & 0x03;
1613 unsigned long element_selector_76 = (__N >> 6) & 0x03;
1614 static const unsigned short permute_selectors[4] =
1615 {
1616#ifdef __LITTLE_ENDIAN__
1617 0x0908, 0x0B0A, 0x0D0C, 0x0F0E
1618#else
1619 0x0607, 0x0405, 0x0203, 0x0001
1620#endif
1621 };
1622 __m64_union t;
1623 __vector unsigned long long a, p, r;
1624
1625#ifdef __LITTLE_ENDIAN__
1626 t.as_short[0] = permute_selectors[element_selector_10];
1627 t.as_short[1] = permute_selectors[element_selector_32];
1628 t.as_short[2] = permute_selectors[element_selector_54];
1629 t.as_short[3] = permute_selectors[element_selector_76];
1630#else
1631 t.as_short[3] = permute_selectors[element_selector_10];
1632 t.as_short[2] = permute_selectors[element_selector_32];
1633 t.as_short[1] = permute_selectors[element_selector_54];
1634 t.as_short[0] = permute_selectors[element_selector_76];
1635#endif
1636 p = vec_splats (t.as_m64);
1637 a = vec_splats (__A);
1638 r = vec_perm (a, a, (__vector unsigned char)p);
1639 return (__m64) ((__vector long long) r)[0];
1640}
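/* Usage sketch (illustrative, not part of the original header).  Each
   two-bit field of the selector picks which source lane supplies the
   corresponding result lane, so _MM_SHUFFLE (0, 1, 2, 3) reverses the
   four 16-bit lanes:

     __m64 v = _mm_set_pi16 (3, 2, 1, 0);
     __m64 r = _mm_shuffle_pi16 (v, _MM_SHUFFLE (0, 1, 2, 3));
     // r == _mm_set_pi16 (0, 1, 2, 3)
*/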
1641
1642extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1643_m_pshufw (__m64 __A, int const __N)
1644{
1645 return _mm_shuffle_pi16 (__A, __N);
1646}
1647
1648/* Conditionally store byte elements of A into P. The high bit of each
1649 byte in the selector N determines whether the corresponding byte from
1650 A is stored. */
1651extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1652_mm_maskmove_si64 (__m64 __A, __m64 __N, char *__P)
1653{
1654 __m64 hibit = 0x8080808080808080UL;
1655 __m64 mask, tmp;
1656 __m64 *p = (__m64*)__P;
1657
1658 tmp = *p;
1659 mask = _mm_cmpeq_pi8 ((__N & hibit), hibit);
1660 tmp = (tmp & (~mask)) | (__A & mask);
1661 *p = tmp;
1662}
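/* Usage sketch (illustrative, not part of the original header).  Note that
   this implementation reads and rewrites the full 8 bytes at __P, so the
   buffer should be at least 8 bytes and preferably 8-byte aligned:

     char buf[8] __attribute__ ((aligned (8))) = { 0 };
     __m64 data = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 mask = _mm_set_pi8 (-1, 0, -1, 0, -1, 0, -1, 0);
     _mm_maskmove_si64 (data, mask, buf);
     // Only the lanes whose mask byte has the high bit set are stored;
     // reloading buf as an __m64 yields _mm_set_pi8 (8, 0, 6, 0, 4, 0, 2, 0).
*/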
1663
1664extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1665_m_maskmovq (__m64 __A, __m64 __N, char *__P)
1666{
1667 _mm_maskmove_si64 (__A, __N, __P);
1668}
1669
1670/* Compute the rounded averages of the unsigned 8-bit values in A and B. */
1671extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1672_mm_avg_pu8 (__m64 __A, __m64 __B)
1673{
1674 __vector unsigned char a, b, c;
1675
1676 a = (__vector unsigned char)vec_splats (__A);
1677 b = (__vector unsigned char)vec_splats (__B);
1678 c = vec_avg (a, b);
1679 return (__m64) ((__vector long long) c)[0];
1680}
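/* Usage sketch (illustrative, not part of the original header).  The average
   rounds up, i.e. each lane is (a + b + 1) >> 1:

     __m64 a = _mm_set1_pi8 (10);
     __m64 b = _mm_set1_pi8 (13);
     __m64 c = _mm_avg_pu8 (a, b);   // every lane is (10 + 13 + 1) >> 1 == 12
*/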
1681
1682extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1683_m_pavgb (__m64 __A, __m64 __B)
1684{
1685 return _mm_avg_pu8 (__A, __B);
1686}
1687
1688/* Compute the rounded averages of the unsigned 16-bit values in A and B. */
1689extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1690_mm_avg_pu16 (__m64 __A, __m64 __B)
1691{
1692 __vector unsigned short a, b, c;
1693
1694 a = (__vector unsigned short)vec_splats (__A);
1695 b = (__vector unsigned short)vec_splats (__B);
1696 c = vec_avg (a, b);
1697 return (__m64) ((__vector long long) c)[0];
1698}
1699
1700extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1701_m_pavgw (__m64 __A, __m64 __B)
1702{
1703 return _mm_avg_pu16 (__A, __B);
1704}
1705
1706/* Compute the sum of the absolute differences of the unsigned 8-bit
1707 values in A and B. Return the value in the lower 16-bit word; the
1708 upper words are cleared. */
1709extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1710_mm_sad_pu8 (__m64 __A, __m64 __B)
1711{
1712 __vector unsigned char a, b;
1713 __vector unsigned char vmin, vmax, vabsdiff;
1714 __vector signed int vsum;
1715 const __vector unsigned int zero =
1716 { 0, 0, 0, 0 };
1717 __m64_union result = {0};
1718
1719 a = (__vector unsigned char) (__vector unsigned long long) { 0UL, __A };
1720 b = (__vector unsigned char) (__vector unsigned long long) { 0UL, __B };
1721 vmin = vec_min (a, b);
1722 vmax = vec_max (a, b);
1723 vabsdiff = vec_sub (vmax, vmin);
1724 /* Sum four groups of bytes into integers. */
1725 vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
1726 /* Sum across four integers with integer result. */
1727 vsum = vec_sums (vsum, (__vector signed int) zero);
 1728 /* The sum is in the rightmost 32 bits of the vector result.
1729 Transfer to a GPR and truncate to 16 bits. */
1730 result.as_short[0] = vsum[3];
1731 return result.as_m64;
1732}
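/* Usage sketch (illustrative, not part of the original header):

     __m64 a = _mm_set_pi8 (1, 2, 3, 4, 5, 6, 7, 8);
     __m64 b = _mm_set_pi8 (8, 7, 6, 5, 4, 3, 2, 1);
     __m64 s = _mm_sad_pu8 (a, b);
     // |1-8| + |2-7| + ... + |8-1| == 32, returned in the low 16-bit lane;
     // the remaining lanes are zero.
*/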
1733
1734extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1735_m_psadbw (__m64 __A, __m64 __B)
1736{
1737 return _mm_sad_pu8 (__A, __B);
1738}
1739
1740/* Stores the data in A to the address P without polluting the caches. */
1741extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1742_mm_stream_pi (__m64 *__P, __m64 __A)
1743{
1744 /* Use the data cache block touch for store transient. */
1745 __asm__ (
1746 " dcbtstt 0,%0"
1747 :
1748 : "b" (__P)
1749 : "memory"
1750 );
1751 *__P = __A;
1752}
1753
1754/* Likewise. The address must be 16-byte aligned. */
1755extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1756_mm_stream_ps (float *__P, __m128 __A)
1757{
1758 /* Use the data cache block touch for store transient. */
1759 __asm__ (
1760 " dcbtstt 0,%0"
1761 :
1762 : "b" (__P)
1763 : "memory"
1764 );
1765 _mm_store_ps (__P, __A);
1766}
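/* Usage sketch (illustrative, not part of the original header).  On this
   platform the "streaming" store is an ordinary store preceded by a dcbtstt
   cache hint, so it is a performance hint only, not a weaker memory type:

     float dst[4] __attribute__ ((aligned (16)));
     __m128 v = _mm_set_ps (4.0f, 3.0f, 2.0f, 1.0f);
     _mm_stream_ps (dst, v);   // dst[0..3] == { 1.0f, 2.0f, 3.0f, 4.0f }
*/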
1767
1768/* Guarantees that every preceding store is globally visible before
1769 any subsequent store. */
1770extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1771_mm_sfence (void)
1772{
 1773 /* Generate a light-weight sync (lwsync) via a release fence. */
1774 __atomic_thread_fence (__ATOMIC_RELEASE);
1775}
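/* Usage sketch (illustrative, not part of the original header; the function
   and variable names are hypothetical).  A typical producer publishes data
   and then raises a flag, with _mm_sfence ordering the two stores:

     static float shared[4] __attribute__ ((aligned (16)));
     static volatile int ready;

     static void publish (__m128 v)
     {
       _mm_store_ps (shared, v);
       _mm_sfence ();          // release fence: data visible before the flag
       ready = 1;
     }
*/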
1776
 1777 /* The execution of the next instruction is delayed by an implementation-
 1778 specific amount of time. The instruction does not modify the
 1779 architectural state. (In the x86 header this intrinsic follows the
 1780 pop_options pragma because it does not require SSE support; its
 1781 encoding is a nop on processors that do not support it.) */
1782extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1783_mm_pause (void)
1784{
1785 /* There is no exact match with this construct, but the following is
1786 close to the desired effect. */
1787#if _ARCH_PWR8
 1788 /* On POWER8 and later processors we can depend on Program Priority
 1789 (PRI) and the associated "very low" PRI setting. Since we don't know
 1790 what PRI this thread is running at, we: 1) save the current PRI
 1791 from the PPR SPR into a local GPR, 2) set the PRI to "very low"
 1792 via the special "or 31,31,31" encoding, 3) issue an "isync" to
 1793 ensure the PRI change takes effect before we execute any more
1794 instructions.
1795 Now we can execute a lwsync (release barrier) while we execute
1796 this thread at "very low" PRI. Finally we restore the original
1797 PRI and continue execution. */
1798 unsigned long __PPR;
1799
1800 __asm__ volatile (
1801 " mfppr %0;"
1802 " or 31,31,31;"
1803 " isync;"
1804 " lwsync;"
1805 " isync;"
1806 " mtppr %0;"
1807 : "=r" (__PPR)
1808 :
1809 : "memory"
1810 );
1811#else
 1812 /* For older processors, where we may not even have Program Priority
 1813 controls, we can only depend on a heavy-weight sync (hwsync). */
1814 __atomic_thread_fence (__ATOMIC_SEQ_CST);
1815#endif
1816}
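/* Usage sketch (illustrative, not part of the original header; the flag is
   hypothetical).  _mm_pause is intended for the body of a spin-wait loop:

     extern volatile int lock_free;

     while (!lock_free)
       _mm_pause ();   // back off politely while another thread holds the lock
*/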
1817
1818/* Transpose the 4x4 matrix composed of row[0-3]. */
1819#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
1820do { \
1821 __v4sf __r0 = (row0), __r1 = (row1), __r2 = (row2), __r3 = (row3); \
1822 __v4sf __t0 = vec_vmrghw (__r0, __r1); \
1823 __v4sf __t1 = vec_vmrghw (__r2, __r3); \
1824 __v4sf __t2 = vec_vmrglw (__r0, __r1); \
1825 __v4sf __t3 = vec_vmrglw (__r2, __r3); \
1826 (row0) = (__v4sf)vec_mergeh ((__vector long long)__t0, \
1827 (__vector long long)__t1); \
1828 (row1) = (__v4sf)vec_mergel ((__vector long long)__t0, \
1829 (__vector long long)__t1); \
1830 (row2) = (__v4sf)vec_mergeh ((__vector long long)__t2, \
1831 (__vector long long)__t3); \
1832 (row3) = (__v4sf)vec_mergel ((__vector long long)__t2, \
1833 (__vector long long)__t3); \
1834} while (0)
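/* Usage sketch (illustrative, not part of the original header):

     __m128 r0 = _mm_set_ps ( 3.0f,  2.0f,  1.0f,  0.0f);
     __m128 r1 = _mm_set_ps ( 7.0f,  6.0f,  5.0f,  4.0f);
     __m128 r2 = _mm_set_ps (11.0f, 10.0f,  9.0f,  8.0f);
     __m128 r3 = _mm_set_ps (15.0f, 14.0f, 13.0f, 12.0f);
     _MM_TRANSPOSE4_PS (r0, r1, r2, r3);
     // r0 == { 0, 4,  8, 12 }, r1 == { 1, 5,  9, 13 },
     // r2 == { 2, 6, 10, 14 }, r3 == { 3, 7, 11, 15 } (element 0 listed first)
*/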
1835
1836/* For backward source compatibility. */
1837//# include <emmintrin.h>
1838
Logan Chienbedbf4f2020-01-06 19:35:19 -08001839#else
1840#include_next <xmmintrin.h>
1841#endif /* defined(__linux__) && defined(__ppc64__) */
1842
Logan Chiendf4f7662019-09-04 16:45:23 -07001843#endif /* _XMMINTRIN_H_INCLUDED */