/*===---- mmintrin.h - Implementation of MMX intrinsics on PowerPC ---------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0. */

#ifndef NO_WARN_X86_INTRINSICS
/* This header file is intended to help port code that explicitly uses
   Intel intrinsics from x86_64 to powerpc64/powerpc64le.

   Since the PowerPC target doesn't support a native 64-bit vector type,
   we typedef __m64 to 64-bit unsigned long long in the MMX intrinsics,
   which works well for _si64 and some _pi32 operations.

   For _pi16 and _pi8 operations, it's better to transfer the __m64 into
   a 128-bit PowerPC vector first. Power8 introduced direct register
   move instructions, which help make such implementations more efficient.

   It's the user's responsibility to determine whether the results of
   such a port are acceptable or whether further changes are needed.
   Please note that much code using Intel intrinsics CAN BE REWRITTEN in
   more portable and efficient standard C or GNU C extensions with
   64-bit scalar operations, or with 128-bit SSE/Altivec operations,
   which is the recommended approach (a sketch of such a rewrite follows
   this block). */
#error \
    "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this error."
#endif
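/* As a hedged illustration of the rewrite recommended above: the block
   below (guarded out so it is not part of this header) sketches how a
   typical MMX-style 8-bit add could be written with GNU C vector
   extensions instead of __m64 intrinsics, letting the compiler pick
   suitable scalar or Altivec/VSX instructions. The type and function
   names are illustrative only and not part of any API. */
#if 0
typedef unsigned char v8qu __attribute__((vector_size(8)));

static inline v8qu add_bytes_portable(v8qu __a, v8qu __b) {
  /* Element-wise 8-bit add, equivalent in spirit to _mm_add_pi8. */
  return __a + __b;
}
#endif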

#ifndef _MMINTRIN_H_INCLUDED
#define _MMINTRIN_H_INCLUDED

#if defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__))

#include <altivec.h>
/* The Intel API is flexible enough that we must allow aliasing with other
   vector types, and their scalar components. */
typedef __attribute__((__aligned__(8))) unsigned long long __m64;

typedef __attribute__((__aligned__(8))) union {
  __m64 as_m64;
  char as_char[8];
  signed char as_signed_char[8];
  short as_short[4];
  int as_int[2];
  long long as_long_long;
  float as_float[2];
  double as_double;
} __m64_union;

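/* A brief reference sketch (values assumed, little-endian lane order) of
   how __m64_union exposes the lanes of one 64-bit value:

     __m64_union u;
     u.as_m64 = 0x0807060504030201ULL;
     u.as_char[0]  is then 0x01 (the least-significant byte)
     u.as_short[3] is then 0x0807 (the most-significant halfword)

   The scalar fallback paths below use this union in exactly this way. */
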
56/* Empty the multimedia state. */
57extern __inline void
58 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
59 _mm_empty(void) {
60 /* nothing to do on PowerPC. */
61}
62
63extern __inline void
64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
65 _m_empty(void) {
66 /* nothing to do on PowerPC. */
67}
68
/* Convert I to a __m64 object. The integer is zero-extended to 64 bits. */
70extern __inline __m64
71 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
72 _mm_cvtsi32_si64(int __i) {
73 return (__m64)(unsigned int)__i;
74}
75
76extern __inline __m64
77 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
78 _m_from_int(int __i) {
79 return _mm_cvtsi32_si64(__i);
80}
81
82/* Convert the lower 32 bits of the __m64 object into an integer. */
83extern __inline int
84 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
85 _mm_cvtsi64_si32(__m64 __i) {
86 return ((int)__i);
87}
88
89extern __inline int
90 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
91 _m_to_int(__m64 __i) {
92 return _mm_cvtsi64_si32(__i);
93}
94
95/* Convert I to a __m64 object. */
96
97/* Intel intrinsic. */
98extern __inline __m64
99 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
100 _m_from_int64(long long __i) {
101 return (__m64)__i;
102}
103
104extern __inline __m64
105 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
106 _mm_cvtsi64_m64(long long __i) {
107 return (__m64)__i;
108}
109
110/* Microsoft intrinsic. */
111extern __inline __m64
112 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
113 _mm_cvtsi64x_si64(long long __i) {
114 return (__m64)__i;
115}
116
117extern __inline __m64
118 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
119 _mm_set_pi64x(long long __i) {
120 return (__m64)__i;
121}
122
/* Convert the __m64 object to a 64-bit integer. */
124
125/* Intel intrinsic. */
126extern __inline long long
127 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
128 _m_to_int64(__m64 __i) {
129 return (long long)__i;
130}
131
132extern __inline long long
133 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
134 _mm_cvtm64_si64(__m64 __i) {
135 return (long long)__i;
136}
137
138/* Microsoft intrinsic. */
139extern __inline long long
140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
141 _mm_cvtsi64_si64x(__m64 __i) {
142 return (long long)__i;
143}
144
145#ifdef _ARCH_PWR8
146/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
147 the result, and the four 16-bit values from M2 into the upper four 8-bit
148 values of the result, all with signed saturation. */
149extern __inline __m64
150 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
151 _mm_packs_pi16(__m64 __m1, __m64 __m2) {
152 __vector signed short vm1;
153 __vector signed char vresult;
154
155 vm1 = (__vector signed short)(__vector unsigned long long)
156#ifdef __LITTLE_ENDIAN__
157 {__m1, __m2};
158#else
159 {__m2, __m1};
160#endif
161 vresult = vec_packs(vm1, vm1);
162 return (__m64)((__vector long long)vresult)[0];
163}
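/* Worked example (values assumed): packing the 16-bit lanes {300, -200, 5,
   -5} with vec_packs saturates them to the signed 8-bit range, giving
   {127, -128, 5, -5}. The same splat-to-128-bit-vector, operate,
   extract-doubleword pattern recurs throughout the _ARCH_PWR8 paths
   below. */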
164
165extern __inline __m64
166 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
167 _m_packsswb(__m64 __m1, __m64 __m2) {
168 return _mm_packs_pi16(__m1, __m2);
169}
170
/* Pack the two 32-bit values from M1 into the lower two 16-bit values of
   the result, and the two 32-bit values from M2 into the upper two 16-bit
   values of the result, all with signed saturation. */
174extern __inline __m64
175 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
176 _mm_packs_pi32(__m64 __m1, __m64 __m2) {
177 __vector signed int vm1;
178 __vector signed short vresult;
179
180 vm1 = (__vector signed int)(__vector unsigned long long)
181#ifdef __LITTLE_ENDIAN__
182 {__m1, __m2};
183#else
184 {__m2, __m1};
185#endif
186 vresult = vec_packs(vm1, vm1);
187 return (__m64)((__vector long long)vresult)[0];
188}
189
190extern __inline __m64
191 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
192 _m_packssdw(__m64 __m1, __m64 __m2) {
193 return _mm_packs_pi32(__m1, __m2);
194}
195
196/* Pack the four 16-bit values from M1 into the lower four 8-bit values of
197 the result, and the four 16-bit values from M2 into the upper four 8-bit
198 values of the result, all with unsigned saturation. */
199extern __inline __m64
200 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
201 _mm_packs_pu16(__m64 __m1, __m64 __m2) {
202 __vector unsigned char r;
203 __vector signed short vm1 = (__vector signed short)(__vector long long)
204#ifdef __LITTLE_ENDIAN__
205 {__m1, __m2};
206#else
207 {__m2, __m1};
208#endif
209 const __vector signed short __zero = {0};
210 __vector __bool short __select = vec_cmplt(vm1, __zero);
211 r = vec_packs((__vector unsigned short)vm1, (__vector unsigned short)vm1);
212 __vector __bool char packsel = vec_pack(__select, __select);
213 r = vec_sel(r, (const __vector unsigned char)__zero, packsel);
214 return (__m64)((__vector long long)r)[0];
215}
216
217extern __inline __m64
218 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
219 _m_packuswb(__m64 __m1, __m64 __m2) {
220 return _mm_packs_pu16(__m1, __m2);
221}
222#endif /* end ARCH_PWR8 */
223
224/* Interleave the four 8-bit values from the high half of M1 with the four
225 8-bit values from the high half of M2. */
226extern __inline __m64
227 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
228 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2) {
229#if _ARCH_PWR8
230 __vector unsigned char a, b, c;
231
232 a = (__vector unsigned char)vec_splats(__m1);
233 b = (__vector unsigned char)vec_splats(__m2);
234 c = vec_mergel(a, b);
235 return (__m64)((__vector long long)c)[1];
236#else
237 __m64_union m1, m2, res;
238
239 m1.as_m64 = __m1;
240 m2.as_m64 = __m2;
241
242 res.as_char[0] = m1.as_char[4];
243 res.as_char[1] = m2.as_char[4];
244 res.as_char[2] = m1.as_char[5];
245 res.as_char[3] = m2.as_char[5];
246 res.as_char[4] = m1.as_char[6];
247 res.as_char[5] = m2.as_char[6];
248 res.as_char[6] = m1.as_char[7];
249 res.as_char[7] = m2.as_char[7];
250
251 return (__m64)res.as_m64;
252#endif
253}
254
255extern __inline __m64
256 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
257 _m_punpckhbw(__m64 __m1, __m64 __m2) {
258 return _mm_unpackhi_pi8(__m1, __m2);
259}
260
261/* Interleave the two 16-bit values from the high half of M1 with the two
262 16-bit values from the high half of M2. */
263extern __inline __m64
264 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
265 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2) {
266 __m64_union m1, m2, res;
267
268 m1.as_m64 = __m1;
269 m2.as_m64 = __m2;
270
271 res.as_short[0] = m1.as_short[2];
272 res.as_short[1] = m2.as_short[2];
273 res.as_short[2] = m1.as_short[3];
274 res.as_short[3] = m2.as_short[3];
275
276 return (__m64)res.as_m64;
277}
278
279extern __inline __m64
280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
281 _m_punpckhwd(__m64 __m1, __m64 __m2) {
282 return _mm_unpackhi_pi16(__m1, __m2);
283}
284/* Interleave the 32-bit value from the high half of M1 with the 32-bit
285 value from the high half of M2. */
286extern __inline __m64
287 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
288 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2) {
289 __m64_union m1, m2, res;
290
291 m1.as_m64 = __m1;
292 m2.as_m64 = __m2;
293
294 res.as_int[0] = m1.as_int[1];
295 res.as_int[1] = m2.as_int[1];
296
297 return (__m64)res.as_m64;
298}
299
300extern __inline __m64
301 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
302 _m_punpckhdq(__m64 __m1, __m64 __m2) {
303 return _mm_unpackhi_pi32(__m1, __m2);
304}
305/* Interleave the four 8-bit values from the low half of M1 with the four
306 8-bit values from the low half of M2. */
307extern __inline __m64
308 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
309 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2) {
310#if _ARCH_PWR8
311 __vector unsigned char a, b, c;
312
313 a = (__vector unsigned char)vec_splats(__m1);
314 b = (__vector unsigned char)vec_splats(__m2);
315 c = vec_mergel(a, b);
316 return (__m64)((__vector long long)c)[0];
317#else
318 __m64_union m1, m2, res;
319
320 m1.as_m64 = __m1;
321 m2.as_m64 = __m2;
322
323 res.as_char[0] = m1.as_char[0];
324 res.as_char[1] = m2.as_char[0];
325 res.as_char[2] = m1.as_char[1];
326 res.as_char[3] = m2.as_char[1];
327 res.as_char[4] = m1.as_char[2];
328 res.as_char[5] = m2.as_char[2];
329 res.as_char[6] = m1.as_char[3];
330 res.as_char[7] = m2.as_char[3];
331
332 return (__m64)res.as_m64;
333#endif
334}
335
336extern __inline __m64
337 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
338 _m_punpcklbw(__m64 __m1, __m64 __m2) {
339 return _mm_unpacklo_pi8(__m1, __m2);
340}
341/* Interleave the two 16-bit values from the low half of M1 with the two
342 16-bit values from the low half of M2. */
343extern __inline __m64
344 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
345 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2) {
346 __m64_union m1, m2, res;
347
348 m1.as_m64 = __m1;
349 m2.as_m64 = __m2;
350
351 res.as_short[0] = m1.as_short[0];
352 res.as_short[1] = m2.as_short[0];
353 res.as_short[2] = m1.as_short[1];
354 res.as_short[3] = m2.as_short[1];
355
356 return (__m64)res.as_m64;
357}
358
359extern __inline __m64
360 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
361 _m_punpcklwd(__m64 __m1, __m64 __m2) {
362 return _mm_unpacklo_pi16(__m1, __m2);
363}
364
365/* Interleave the 32-bit value from the low half of M1 with the 32-bit
366 value from the low half of M2. */
367extern __inline __m64
368 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
369 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2) {
370 __m64_union m1, m2, res;
371
372 m1.as_m64 = __m1;
373 m2.as_m64 = __m2;
374
375 res.as_int[0] = m1.as_int[0];
376 res.as_int[1] = m2.as_int[0];
377
378 return (__m64)res.as_m64;
379}
380
381extern __inline __m64
382 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
383 _m_punpckldq(__m64 __m1, __m64 __m2) {
384 return _mm_unpacklo_pi32(__m1, __m2);
385}
386
387/* Add the 8-bit values in M1 to the 8-bit values in M2. */
388extern __inline __m64
389 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
390 _mm_add_pi8(__m64 __m1, __m64 __m2) {
391#if _ARCH_PWR8
392 __vector signed char a, b, c;
393
394 a = (__vector signed char)vec_splats(__m1);
395 b = (__vector signed char)vec_splats(__m2);
396 c = vec_add(a, b);
397 return (__m64)((__vector long long)c)[0];
398#else
399 __m64_union m1, m2, res;
400
401 m1.as_m64 = __m1;
402 m2.as_m64 = __m2;
403
404 res.as_char[0] = m1.as_char[0] + m2.as_char[0];
405 res.as_char[1] = m1.as_char[1] + m2.as_char[1];
406 res.as_char[2] = m1.as_char[2] + m2.as_char[2];
407 res.as_char[3] = m1.as_char[3] + m2.as_char[3];
408 res.as_char[4] = m1.as_char[4] + m2.as_char[4];
409 res.as_char[5] = m1.as_char[5] + m2.as_char[5];
410 res.as_char[6] = m1.as_char[6] + m2.as_char[6];
411 res.as_char[7] = m1.as_char[7] + m2.as_char[7];
412
413 return (__m64)res.as_m64;
414#endif
415}
416
417extern __inline __m64
418 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
419 _m_paddb(__m64 __m1, __m64 __m2) {
420 return _mm_add_pi8(__m1, __m2);
421}
422
423/* Add the 16-bit values in M1 to the 16-bit values in M2. */
424extern __inline __m64
425 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
426 _mm_add_pi16(__m64 __m1, __m64 __m2) {
427#if _ARCH_PWR8
428 __vector signed short a, b, c;
429
430 a = (__vector signed short)vec_splats(__m1);
431 b = (__vector signed short)vec_splats(__m2);
432 c = vec_add(a, b);
433 return (__m64)((__vector long long)c)[0];
434#else
435 __m64_union m1, m2, res;
436
437 m1.as_m64 = __m1;
438 m2.as_m64 = __m2;
439
440 res.as_short[0] = m1.as_short[0] + m2.as_short[0];
441 res.as_short[1] = m1.as_short[1] + m2.as_short[1];
442 res.as_short[2] = m1.as_short[2] + m2.as_short[2];
443 res.as_short[3] = m1.as_short[3] + m2.as_short[3];
444
445 return (__m64)res.as_m64;
446#endif
447}
448
449extern __inline __m64
450 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
451 _m_paddw(__m64 __m1, __m64 __m2) {
452 return _mm_add_pi16(__m1, __m2);
453}
454
455/* Add the 32-bit values in M1 to the 32-bit values in M2. */
456extern __inline __m64
457 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
458 _mm_add_pi32(__m64 __m1, __m64 __m2) {
459#if _ARCH_PWR9
460 __vector signed int a, b, c;
461
462 a = (__vector signed int)vec_splats(__m1);
463 b = (__vector signed int)vec_splats(__m2);
464 c = vec_add(a, b);
465 return (__m64)((__vector long long)c)[0];
466#else
467 __m64_union m1, m2, res;
468
469 m1.as_m64 = __m1;
470 m2.as_m64 = __m2;
471
472 res.as_int[0] = m1.as_int[0] + m2.as_int[0];
473 res.as_int[1] = m1.as_int[1] + m2.as_int[1];
474
475 return (__m64)res.as_m64;
476#endif
477}
478
479extern __inline __m64
480 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
481 _m_paddd(__m64 __m1, __m64 __m2) {
482 return _mm_add_pi32(__m1, __m2);
483}
484
485/* Subtract the 8-bit values in M2 from the 8-bit values in M1. */
486extern __inline __m64
487 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
488 _mm_sub_pi8(__m64 __m1, __m64 __m2) {
489#if _ARCH_PWR8
490 __vector signed char a, b, c;
491
492 a = (__vector signed char)vec_splats(__m1);
493 b = (__vector signed char)vec_splats(__m2);
494 c = vec_sub(a, b);
495 return (__m64)((__vector long long)c)[0];
496#else
497 __m64_union m1, m2, res;
498
499 m1.as_m64 = __m1;
500 m2.as_m64 = __m2;
501
502 res.as_char[0] = m1.as_char[0] - m2.as_char[0];
503 res.as_char[1] = m1.as_char[1] - m2.as_char[1];
504 res.as_char[2] = m1.as_char[2] - m2.as_char[2];
505 res.as_char[3] = m1.as_char[3] - m2.as_char[3];
506 res.as_char[4] = m1.as_char[4] - m2.as_char[4];
507 res.as_char[5] = m1.as_char[5] - m2.as_char[5];
508 res.as_char[6] = m1.as_char[6] - m2.as_char[6];
509 res.as_char[7] = m1.as_char[7] - m2.as_char[7];
510
511 return (__m64)res.as_m64;
512#endif
513}
514
515extern __inline __m64
516 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
517 _m_psubb(__m64 __m1, __m64 __m2) {
518 return _mm_sub_pi8(__m1, __m2);
519}
520
521/* Subtract the 16-bit values in M2 from the 16-bit values in M1. */
522extern __inline __m64
523 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
524 _mm_sub_pi16(__m64 __m1, __m64 __m2) {
525#if _ARCH_PWR8
526 __vector signed short a, b, c;
527
528 a = (__vector signed short)vec_splats(__m1);
529 b = (__vector signed short)vec_splats(__m2);
530 c = vec_sub(a, b);
531 return (__m64)((__vector long long)c)[0];
532#else
533 __m64_union m1, m2, res;
534
535 m1.as_m64 = __m1;
536 m2.as_m64 = __m2;
537
538 res.as_short[0] = m1.as_short[0] - m2.as_short[0];
539 res.as_short[1] = m1.as_short[1] - m2.as_short[1];
540 res.as_short[2] = m1.as_short[2] - m2.as_short[2];
541 res.as_short[3] = m1.as_short[3] - m2.as_short[3];
542
543 return (__m64)res.as_m64;
544#endif
545}
546
547extern __inline __m64
548 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
549 _m_psubw(__m64 __m1, __m64 __m2) {
550 return _mm_sub_pi16(__m1, __m2);
551}
552
553/* Subtract the 32-bit values in M2 from the 32-bit values in M1. */
554extern __inline __m64
555 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
556 _mm_sub_pi32(__m64 __m1, __m64 __m2) {
557#if _ARCH_PWR9
558 __vector signed int a, b, c;
559
560 a = (__vector signed int)vec_splats(__m1);
561 b = (__vector signed int)vec_splats(__m2);
562 c = vec_sub(a, b);
563 return (__m64)((__vector long long)c)[0];
564#else
565 __m64_union m1, m2, res;
566
567 m1.as_m64 = __m1;
568 m2.as_m64 = __m2;
569
570 res.as_int[0] = m1.as_int[0] - m2.as_int[0];
571 res.as_int[1] = m1.as_int[1] - m2.as_int[1];
572
573 return (__m64)res.as_m64;
574#endif
575}
576
577extern __inline __m64
578 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
579 _m_psubd(__m64 __m1, __m64 __m2) {
580 return _mm_sub_pi32(__m1, __m2);
581}
582
583extern __inline __m64
584 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
585 _mm_add_si64(__m64 __m1, __m64 __m2) {
586 return (__m1 + __m2);
587}
588
589extern __inline __m64
590 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
591 _mm_sub_si64(__m64 __m1, __m64 __m2) {
592 return (__m1 - __m2);
593}
594
595/* Shift the 64-bit value in M left by COUNT. */
596extern __inline __m64
597 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
598 _mm_sll_si64(__m64 __m, __m64 __count) {
599 return (__m << __count);
600}
601
602extern __inline __m64
603 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
604 _m_psllq(__m64 __m, __m64 __count) {
605 return _mm_sll_si64(__m, __count);
606}
607
608extern __inline __m64
609 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
610 _mm_slli_si64(__m64 __m, const int __count) {
611 return (__m << __count);
612}
613
614extern __inline __m64
615 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
616 _m_psllqi(__m64 __m, const int __count) {
617 return _mm_slli_si64(__m, __count);
618}
619
/* Shift the 64-bit value in M right by COUNT; shift in zeros. */
621extern __inline __m64
622 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
623 _mm_srl_si64(__m64 __m, __m64 __count) {
624 return (__m >> __count);
625}
626
627extern __inline __m64
628 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
629 _m_psrlq(__m64 __m, __m64 __count) {
630 return _mm_srl_si64(__m, __count);
631}
632
633extern __inline __m64
634 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
635 _mm_srli_si64(__m64 __m, const int __count) {
636 return (__m >> __count);
637}
638
639extern __inline __m64
640 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
641 _m_psrlqi(__m64 __m, const int __count) {
642 return _mm_srli_si64(__m, __count);
643}
644
645/* Bit-wise AND the 64-bit values in M1 and M2. */
646extern __inline __m64
647 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
648 _mm_and_si64(__m64 __m1, __m64 __m2) {
649 return (__m1 & __m2);
650}
651
652extern __inline __m64
653 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
654 _m_pand(__m64 __m1, __m64 __m2) {
655 return _mm_and_si64(__m1, __m2);
656}
657
658/* Bit-wise complement the 64-bit value in M1 and bit-wise AND it with the
659 64-bit value in M2. */
660extern __inline __m64
661 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
662 _mm_andnot_si64(__m64 __m1, __m64 __m2) {
663 return (~__m1 & __m2);
664}
665
666extern __inline __m64
667 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
668 _m_pandn(__m64 __m1, __m64 __m2) {
669 return _mm_andnot_si64(__m1, __m2);
670}
671
672/* Bit-wise inclusive OR the 64-bit values in M1 and M2. */
673extern __inline __m64
674 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
675 _mm_or_si64(__m64 __m1, __m64 __m2) {
676 return (__m1 | __m2);
677}
678
679extern __inline __m64
680 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
681 _m_por(__m64 __m1, __m64 __m2) {
682 return _mm_or_si64(__m1, __m2);
683}
684
685/* Bit-wise exclusive OR the 64-bit values in M1 and M2. */
686extern __inline __m64
687 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
688 _mm_xor_si64(__m64 __m1, __m64 __m2) {
689 return (__m1 ^ __m2);
690}
691
692extern __inline __m64
693 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
694 _m_pxor(__m64 __m1, __m64 __m2) {
695 return _mm_xor_si64(__m1, __m2);
696}
697
698/* Creates a 64-bit zero. */
699extern __inline __m64
700 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
701 _mm_setzero_si64(void) {
702 return (__m64)0;
703}
704
705/* Compare eight 8-bit values. The result of the comparison is 0xFF if the
706 test is true and zero if false. */
707extern __inline __m64
708 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
709 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2) {
710#if defined(_ARCH_PWR6) && defined(__powerpc64__)
711 __m64 res;
712 __asm__("cmpb %0,%1,%2;\n" : "=r"(res) : "r"(__m1), "r"(__m2) :);
713 return (res);
714#else
715 __m64_union m1, m2, res;
716
717 m1.as_m64 = __m1;
718 m2.as_m64 = __m2;
719
720 res.as_char[0] = (m1.as_char[0] == m2.as_char[0]) ? -1 : 0;
721 res.as_char[1] = (m1.as_char[1] == m2.as_char[1]) ? -1 : 0;
722 res.as_char[2] = (m1.as_char[2] == m2.as_char[2]) ? -1 : 0;
723 res.as_char[3] = (m1.as_char[3] == m2.as_char[3]) ? -1 : 0;
724 res.as_char[4] = (m1.as_char[4] == m2.as_char[4]) ? -1 : 0;
725 res.as_char[5] = (m1.as_char[5] == m2.as_char[5]) ? -1 : 0;
726 res.as_char[6] = (m1.as_char[6] == m2.as_char[6]) ? -1 : 0;
727 res.as_char[7] = (m1.as_char[7] == m2.as_char[7]) ? -1 : 0;
728
729 return (__m64)res.as_m64;
730#endif
731}
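/* Illustration (values assumed): comparing 0x1122334455667788 with
   0x11AA334455FF7788 byte by byte yields 0xFF00FFFFFF00FFFF: 0xFF where
   the bytes are equal and 0x00 where they differ, which is exactly what
   the POWER6 cmpb instruction in the fast path above computes. */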
732
733extern __inline __m64
734 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
735 _m_pcmpeqb(__m64 __m1, __m64 __m2) {
736 return _mm_cmpeq_pi8(__m1, __m2);
737}
738
739extern __inline __m64
740 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
741 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2) {
742#if _ARCH_PWR8
743 __vector signed char a, b, c;
744
745 a = (__vector signed char)vec_splats(__m1);
746 b = (__vector signed char)vec_splats(__m2);
747 c = (__vector signed char)vec_cmpgt(a, b);
748 return (__m64)((__vector long long)c)[0];
749#else
750 __m64_union m1, m2, res;
751
752 m1.as_m64 = __m1;
753 m2.as_m64 = __m2;
754
755 res.as_char[0] = (m1.as_char[0] > m2.as_char[0]) ? -1 : 0;
756 res.as_char[1] = (m1.as_char[1] > m2.as_char[1]) ? -1 : 0;
757 res.as_char[2] = (m1.as_char[2] > m2.as_char[2]) ? -1 : 0;
758 res.as_char[3] = (m1.as_char[3] > m2.as_char[3]) ? -1 : 0;
759 res.as_char[4] = (m1.as_char[4] > m2.as_char[4]) ? -1 : 0;
760 res.as_char[5] = (m1.as_char[5] > m2.as_char[5]) ? -1 : 0;
761 res.as_char[6] = (m1.as_char[6] > m2.as_char[6]) ? -1 : 0;
762 res.as_char[7] = (m1.as_char[7] > m2.as_char[7]) ? -1 : 0;
763
764 return (__m64)res.as_m64;
765#endif
766}
767
768extern __inline __m64
769 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
770 _m_pcmpgtb(__m64 __m1, __m64 __m2) {
771 return _mm_cmpgt_pi8(__m1, __m2);
772}
773
774/* Compare four 16-bit values. The result of the comparison is 0xFFFF if
775 the test is true and zero if false. */
776extern __inline __m64
777 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
778 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2) {
779#if _ARCH_PWR8
780 __vector signed short a, b, c;
781
782 a = (__vector signed short)vec_splats(__m1);
783 b = (__vector signed short)vec_splats(__m2);
784 c = (__vector signed short)vec_cmpeq(a, b);
785 return (__m64)((__vector long long)c)[0];
786#else
787 __m64_union m1, m2, res;
788
789 m1.as_m64 = __m1;
790 m2.as_m64 = __m2;
791
792 res.as_short[0] = (m1.as_short[0] == m2.as_short[0]) ? -1 : 0;
793 res.as_short[1] = (m1.as_short[1] == m2.as_short[1]) ? -1 : 0;
794 res.as_short[2] = (m1.as_short[2] == m2.as_short[2]) ? -1 : 0;
795 res.as_short[3] = (m1.as_short[3] == m2.as_short[3]) ? -1 : 0;
796
797 return (__m64)res.as_m64;
798#endif
799}
800
801extern __inline __m64
802 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
803 _m_pcmpeqw(__m64 __m1, __m64 __m2) {
804 return _mm_cmpeq_pi16(__m1, __m2);
805}
806
807extern __inline __m64
808 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
809 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2) {
810#if _ARCH_PWR8
811 __vector signed short a, b, c;
812
813 a = (__vector signed short)vec_splats(__m1);
814 b = (__vector signed short)vec_splats(__m2);
815 c = (__vector signed short)vec_cmpgt(a, b);
816 return (__m64)((__vector long long)c)[0];
817#else
818 __m64_union m1, m2, res;
819
820 m1.as_m64 = __m1;
821 m2.as_m64 = __m2;
822
823 res.as_short[0] = (m1.as_short[0] > m2.as_short[0]) ? -1 : 0;
824 res.as_short[1] = (m1.as_short[1] > m2.as_short[1]) ? -1 : 0;
825 res.as_short[2] = (m1.as_short[2] > m2.as_short[2]) ? -1 : 0;
826 res.as_short[3] = (m1.as_short[3] > m2.as_short[3]) ? -1 : 0;
827
828 return (__m64)res.as_m64;
829#endif
830}
831
832extern __inline __m64
833 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
834 _m_pcmpgtw(__m64 __m1, __m64 __m2) {
835 return _mm_cmpgt_pi16(__m1, __m2);
836}
837
838/* Compare two 32-bit values. The result of the comparison is 0xFFFFFFFF if
839 the test is true and zero if false. */
840extern __inline __m64
841 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
842 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2) {
843#if _ARCH_PWR9
844 __vector signed int a, b, c;
845
846 a = (__vector signed int)vec_splats(__m1);
847 b = (__vector signed int)vec_splats(__m2);
848 c = (__vector signed int)vec_cmpeq(a, b);
849 return (__m64)((__vector long long)c)[0];
850#else
851 __m64_union m1, m2, res;
852
853 m1.as_m64 = __m1;
854 m2.as_m64 = __m2;
855
856 res.as_int[0] = (m1.as_int[0] == m2.as_int[0]) ? -1 : 0;
857 res.as_int[1] = (m1.as_int[1] == m2.as_int[1]) ? -1 : 0;
858
859 return (__m64)res.as_m64;
860#endif
861}
862
863extern __inline __m64
864 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
865 _m_pcmpeqd(__m64 __m1, __m64 __m2) {
866 return _mm_cmpeq_pi32(__m1, __m2);
867}
868
869extern __inline __m64
870 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
871 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2) {
872#if _ARCH_PWR9
873 __vector signed int a, b, c;
874
875 a = (__vector signed int)vec_splats(__m1);
876 b = (__vector signed int)vec_splats(__m2);
877 c = (__vector signed int)vec_cmpgt(a, b);
878 return (__m64)((__vector long long)c)[0];
879#else
880 __m64_union m1, m2, res;
881
882 m1.as_m64 = __m1;
883 m2.as_m64 = __m2;
884
885 res.as_int[0] = (m1.as_int[0] > m2.as_int[0]) ? -1 : 0;
886 res.as_int[1] = (m1.as_int[1] > m2.as_int[1]) ? -1 : 0;
887
888 return (__m64)res.as_m64;
889#endif
890}
891
892extern __inline __m64
893 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
894 _m_pcmpgtd(__m64 __m1, __m64 __m2) {
895 return _mm_cmpgt_pi32(__m1, __m2);
896}
897
898#if _ARCH_PWR8
899/* Add the 8-bit values in M1 to the 8-bit values in M2 using signed
900 saturated arithmetic. */
901extern __inline __m64
902 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
903 _mm_adds_pi8(__m64 __m1, __m64 __m2) {
904 __vector signed char a, b, c;
905
906 a = (__vector signed char)vec_splats(__m1);
907 b = (__vector signed char)vec_splats(__m2);
908 c = vec_adds(a, b);
909 return (__m64)((__vector long long)c)[0];
910}
911
912extern __inline __m64
913 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
914 _m_paddsb(__m64 __m1, __m64 __m2) {
915 return _mm_adds_pi8(__m1, __m2);
916}
917/* Add the 16-bit values in M1 to the 16-bit values in M2 using signed
918 saturated arithmetic. */
919extern __inline __m64
920 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
921 _mm_adds_pi16(__m64 __m1, __m64 __m2) {
922 __vector signed short a, b, c;
923
924 a = (__vector signed short)vec_splats(__m1);
925 b = (__vector signed short)vec_splats(__m2);
926 c = vec_adds(a, b);
927 return (__m64)((__vector long long)c)[0];
928}
929
930extern __inline __m64
931 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
932 _m_paddsw(__m64 __m1, __m64 __m2) {
933 return _mm_adds_pi16(__m1, __m2);
934}
935/* Add the 8-bit values in M1 to the 8-bit values in M2 using unsigned
936 saturated arithmetic. */
937extern __inline __m64
938 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
939 _mm_adds_pu8(__m64 __m1, __m64 __m2) {
940 __vector unsigned char a, b, c;
941
942 a = (__vector unsigned char)vec_splats(__m1);
943 b = (__vector unsigned char)vec_splats(__m2);
944 c = vec_adds(a, b);
945 return (__m64)((__vector long long)c)[0];
946}
947
948extern __inline __m64
949 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
950 _m_paddusb(__m64 __m1, __m64 __m2) {
951 return _mm_adds_pu8(__m1, __m2);
952}
953
954/* Add the 16-bit values in M1 to the 16-bit values in M2 using unsigned
955 saturated arithmetic. */
956extern __inline __m64
957 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
958 _mm_adds_pu16(__m64 __m1, __m64 __m2) {
959 __vector unsigned short a, b, c;
960
961 a = (__vector unsigned short)vec_splats(__m1);
962 b = (__vector unsigned short)vec_splats(__m2);
963 c = vec_adds(a, b);
964 return (__m64)((__vector long long)c)[0];
965}
966
967extern __inline __m64
968 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
969 _m_paddusw(__m64 __m1, __m64 __m2) {
970 return _mm_adds_pu16(__m1, __m2);
971}
972
973/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using signed
974 saturating arithmetic. */
975extern __inline __m64
976 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
977 _mm_subs_pi8(__m64 __m1, __m64 __m2) {
978 __vector signed char a, b, c;
979
980 a = (__vector signed char)vec_splats(__m1);
981 b = (__vector signed char)vec_splats(__m2);
982 c = vec_subs(a, b);
983 return (__m64)((__vector long long)c)[0];
984}
985
986extern __inline __m64
987 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
988 _m_psubsb(__m64 __m1, __m64 __m2) {
989 return _mm_subs_pi8(__m1, __m2);
990}
991
992/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
993 signed saturating arithmetic. */
994extern __inline __m64
995 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
996 _mm_subs_pi16(__m64 __m1, __m64 __m2) {
997 __vector signed short a, b, c;
998
999 a = (__vector signed short)vec_splats(__m1);
1000 b = (__vector signed short)vec_splats(__m2);
1001 c = vec_subs(a, b);
1002 return (__m64)((__vector long long)c)[0];
1003}
1004
1005extern __inline __m64
1006 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1007 _m_psubsw(__m64 __m1, __m64 __m2) {
1008 return _mm_subs_pi16(__m1, __m2);
1009}
1010
1011/* Subtract the 8-bit values in M2 from the 8-bit values in M1 using
1012 unsigned saturating arithmetic. */
1013extern __inline __m64
1014 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1015 _mm_subs_pu8(__m64 __m1, __m64 __m2) {
1016 __vector unsigned char a, b, c;
1017
1018 a = (__vector unsigned char)vec_splats(__m1);
1019 b = (__vector unsigned char)vec_splats(__m2);
1020 c = vec_subs(a, b);
1021 return (__m64)((__vector long long)c)[0];
1022}
1023
1024extern __inline __m64
1025 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1026 _m_psubusb(__m64 __m1, __m64 __m2) {
1027 return _mm_subs_pu8(__m1, __m2);
1028}
1029
1030/* Subtract the 16-bit values in M2 from the 16-bit values in M1 using
1031 unsigned saturating arithmetic. */
1032extern __inline __m64
1033 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1034 _mm_subs_pu16(__m64 __m1, __m64 __m2) {
1035 __vector unsigned short a, b, c;
1036
1037 a = (__vector unsigned short)vec_splats(__m1);
1038 b = (__vector unsigned short)vec_splats(__m2);
1039 c = vec_subs(a, b);
1040 return (__m64)((__vector long long)c)[0];
1041}
1042
1043extern __inline __m64
1044 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1045 _m_psubusw(__m64 __m1, __m64 __m2) {
1046 return _mm_subs_pu16(__m1, __m2);
1047}
1048
1049/* Multiply four 16-bit values in M1 by four 16-bit values in M2 producing
1050 four 32-bit intermediate results, which are then summed by pairs to
1051 produce two 32-bit results. */
1052extern __inline __m64
1053 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1054 _mm_madd_pi16(__m64 __m1, __m64 __m2) {
1055 __vector signed short a, b;
1056 __vector signed int c;
1057 __vector signed int zero = {0, 0, 0, 0};
1058
1059 a = (__vector signed short)vec_splats(__m1);
1060 b = (__vector signed short)vec_splats(__m2);
1061 c = vec_vmsumshm(a, b, zero);
1062 return (__m64)((__vector long long)c)[0];
1063}
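/* Worked example (values assumed): with 16-bit lanes __m1 = {1, 2, 3, 4}
   and __m2 = {10, 20, 30, 40}, the intermediate products are {10, 40, 90,
   160}, and summing them in pairs gives the two 32-bit results {50, 250}. */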
1064
1065extern __inline __m64
1066 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1067 _m_pmaddwd(__m64 __m1, __m64 __m2) {
1068 return _mm_madd_pi16(__m1, __m2);
1069}
1070/* Multiply four signed 16-bit values in M1 by four signed 16-bit values in
1071 M2 and produce the high 16 bits of the 32-bit results. */
1072extern __inline __m64
1073 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1074 _mm_mulhi_pi16(__m64 __m1, __m64 __m2) {
1075 __vector signed short a, b;
1076 __vector signed short c;
1077 __vector signed int w0, w1;
1078 __vector unsigned char xform1 = {
1079#ifdef __LITTLE_ENDIAN__
1080 0x02, 0x03, 0x12, 0x13, 0x06, 0x07, 0x16, 0x17, 0x0A,
1081 0x0B, 0x1A, 0x1B, 0x0E, 0x0F, 0x1E, 0x1F
1082#else
1083 0x00, 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15, 0x00,
1084 0x01, 0x10, 0x11, 0x04, 0x05, 0x14, 0x15
1085#endif
1086 };
1087
1088 a = (__vector signed short)vec_splats(__m1);
1089 b = (__vector signed short)vec_splats(__m2);
1090
1091 w0 = vec_vmulesh(a, b);
1092 w1 = vec_vmulosh(a, b);
1093 c = (__vector signed short)vec_perm(w0, w1, xform1);
1094
1095 return (__m64)((__vector long long)c)[0];
1096}
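/* Worked example (values assumed): 0x4000 * 0x4000 = 0x10000000, so the
   corresponding result lane is the high halfword 0x1000; the vec_perm mask
   above merely gathers the high halfwords of the even and odd 32-bit
   products produced by vec_vmulesh/vec_vmulosh. */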
1097
1098extern __inline __m64
1099 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1100 _m_pmulhw(__m64 __m1, __m64 __m2) {
1101 return _mm_mulhi_pi16(__m1, __m2);
1102}
1103
1104/* Multiply four 16-bit values in M1 by four 16-bit values in M2 and produce
1105 the low 16 bits of the results. */
1106extern __inline __m64
1107 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1108 _mm_mullo_pi16(__m64 __m1, __m64 __m2) {
1109 __vector signed short a, b, c;
1110
1111 a = (__vector signed short)vec_splats(__m1);
1112 b = (__vector signed short)vec_splats(__m2);
1113 c = a * b;
1114 return (__m64)((__vector long long)c)[0];
1115}
1116
1117extern __inline __m64
1118 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1119 _m_pmullw(__m64 __m1, __m64 __m2) {
1120 return _mm_mullo_pi16(__m1, __m2);
1121}
1122
1123/* Shift four 16-bit values in M left by COUNT. */
1124extern __inline __m64
1125 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1126 _mm_sll_pi16(__m64 __m, __m64 __count) {
1127 __vector signed short m, r;
1128 __vector unsigned short c;
1129
1130 if (__count <= 15) {
1131 m = (__vector signed short)vec_splats(__m);
1132 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1133 r = vec_sl(m, (__vector unsigned short)c);
1134 return (__m64)((__vector long long)r)[0];
1135 } else
1136 return (0);
1137}
1138
1139extern __inline __m64
1140 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1141 _m_psllw(__m64 __m, __m64 __count) {
1142 return _mm_sll_pi16(__m, __count);
1143}
1144
1145extern __inline __m64
1146 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1147 _mm_slli_pi16(__m64 __m, int __count) {
1148 /* Promote int to long then invoke mm_sll_pi16. */
1149 return _mm_sll_pi16(__m, __count);
1150}
1151
1152extern __inline __m64
1153 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1154 _m_psllwi(__m64 __m, int __count) {
1155 return _mm_slli_pi16(__m, __count);
1156}
1157
1158/* Shift two 32-bit values in M left by COUNT. */
1159extern __inline __m64
1160 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1161 _mm_sll_pi32(__m64 __m, __m64 __count) {
1162 __m64_union m, res;
1163
1164 m.as_m64 = __m;
1165
1166 res.as_int[0] = m.as_int[0] << __count;
1167 res.as_int[1] = m.as_int[1] << __count;
1168 return (res.as_m64);
1169}
1170
1171extern __inline __m64
1172 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1173 _m_pslld(__m64 __m, __m64 __count) {
1174 return _mm_sll_pi32(__m, __count);
1175}
1176
1177extern __inline __m64
1178 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1179 _mm_slli_pi32(__m64 __m, int __count) {
1180 /* Promote int to long then invoke mm_sll_pi32. */
1181 return _mm_sll_pi32(__m, __count);
1182}
1183
1184extern __inline __m64
1185 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1186 _m_pslldi(__m64 __m, int __count) {
1187 return _mm_slli_pi32(__m, __count);
1188}
1189
1190/* Shift four 16-bit values in M right by COUNT; shift in the sign bit. */
1191extern __inline __m64
1192 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1193 _mm_sra_pi16(__m64 __m, __m64 __count) {
1194 __vector signed short m, r;
1195 __vector unsigned short c;
1196
1197 if (__count <= 15) {
1198 m = (__vector signed short)vec_splats(__m);
1199 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1200 r = vec_sra(m, (__vector unsigned short)c);
1201 return (__m64)((__vector long long)r)[0];
1202 } else
1203 return (0);
1204}
1205
1206extern __inline __m64
1207 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1208 _m_psraw(__m64 __m, __m64 __count) {
1209 return _mm_sra_pi16(__m, __count);
1210}
1211
1212extern __inline __m64
1213 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1214 _mm_srai_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_sra_pi16. */
1216 return _mm_sra_pi16(__m, __count);
1217}
1218
1219extern __inline __m64
1220 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1221 _m_psrawi(__m64 __m, int __count) {
1222 return _mm_srai_pi16(__m, __count);
1223}
1224
1225/* Shift two 32-bit values in M right by COUNT; shift in the sign bit. */
1226extern __inline __m64
1227 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1228 _mm_sra_pi32(__m64 __m, __m64 __count) {
1229 __m64_union m, res;
1230
1231 m.as_m64 = __m;
1232
1233 res.as_int[0] = m.as_int[0] >> __count;
1234 res.as_int[1] = m.as_int[1] >> __count;
1235 return (res.as_m64);
1236}
1237
1238extern __inline __m64
1239 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1240 _m_psrad(__m64 __m, __m64 __count) {
1241 return _mm_sra_pi32(__m, __count);
1242}
1243
1244extern __inline __m64
1245 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1246 _mm_srai_pi32(__m64 __m, int __count) {
1247 /* Promote int to long then invoke mm_sra_pi32. */
1248 return _mm_sra_pi32(__m, __count);
1249}
1250
1251extern __inline __m64
1252 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1253 _m_psradi(__m64 __m, int __count) {
1254 return _mm_srai_pi32(__m, __count);
1255}
1256
1257/* Shift four 16-bit values in M right by COUNT; shift in zeros. */
1258extern __inline __m64
1259 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1260 _mm_srl_pi16(__m64 __m, __m64 __count) {
1261 __vector unsigned short m, r;
1262 __vector unsigned short c;
1263
1264 if (__count <= 15) {
1265 m = (__vector unsigned short)vec_splats(__m);
1266 c = (__vector unsigned short)vec_splats((unsigned short)__count);
1267 r = vec_sr(m, (__vector unsigned short)c);
1268 return (__m64)((__vector long long)r)[0];
1269 } else
1270 return (0);
1271}
1272
1273extern __inline __m64
1274 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1275 _m_psrlw(__m64 __m, __m64 __count) {
1276 return _mm_srl_pi16(__m, __count);
1277}
1278
1279extern __inline __m64
1280 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1281 _mm_srli_pi16(__m64 __m, int __count) {
  /* Promote int to long then invoke mm_srl_pi16. */
1283 return _mm_srl_pi16(__m, __count);
1284}
1285
1286extern __inline __m64
1287 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1288 _m_psrlwi(__m64 __m, int __count) {
1289 return _mm_srli_pi16(__m, __count);
1290}
1291
1292/* Shift two 32-bit values in M right by COUNT; shift in zeros. */
1293extern __inline __m64
1294 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1295 _mm_srl_pi32(__m64 __m, __m64 __count) {
1296 __m64_union m, res;
1297
1298 m.as_m64 = __m;
1299
1300 res.as_int[0] = (unsigned int)m.as_int[0] >> __count;
1301 res.as_int[1] = (unsigned int)m.as_int[1] >> __count;
1302 return (res.as_m64);
1303}
1304
1305extern __inline __m64
1306 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1307 _m_psrld(__m64 __m, __m64 __count) {
1308 return _mm_srl_pi32(__m, __count);
1309}
1310
1311extern __inline __m64
1312 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1313 _mm_srli_pi32(__m64 __m, int __count) {
1314 /* Promote int to long then invoke mm_srl_pi32. */
1315 return _mm_srl_pi32(__m, __count);
1316}
1317
1318extern __inline __m64
1319 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1320 _m_psrldi(__m64 __m, int __count) {
1321 return _mm_srli_pi32(__m, __count);
1322}
1323#endif /* _ARCH_PWR8 */
1324
1325/* Creates a vector of two 32-bit values; I0 is least significant. */
1326extern __inline __m64
1327 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1328 _mm_set_pi32(int __i1, int __i0) {
1329 __m64_union res;
1330
1331 res.as_int[0] = __i0;
1332 res.as_int[1] = __i1;
1333 return (res.as_m64);
1334}
1335
1336/* Creates a vector of four 16-bit values; W0 is least significant. */
1337extern __inline __m64
1338 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1339 _mm_set_pi16(short __w3, short __w2, short __w1, short __w0) {
1340 __m64_union res;
1341
1342 res.as_short[0] = __w0;
1343 res.as_short[1] = __w1;
1344 res.as_short[2] = __w2;
1345 res.as_short[3] = __w3;
1346 return (res.as_m64);
1347}
1348
1349/* Creates a vector of eight 8-bit values; B0 is least significant. */
1350extern __inline __m64
1351 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1352 _mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3,
1353 char __b2, char __b1, char __b0) {
1354 __m64_union res;
1355
1356 res.as_char[0] = __b0;
1357 res.as_char[1] = __b1;
1358 res.as_char[2] = __b2;
1359 res.as_char[3] = __b3;
1360 res.as_char[4] = __b4;
1361 res.as_char[5] = __b5;
1362 res.as_char[6] = __b6;
1363 res.as_char[7] = __b7;
1364 return (res.as_m64);
1365}
1366
1367/* Similar, but with the arguments in reverse order. */
1368extern __inline __m64
1369 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1370 _mm_setr_pi32(int __i0, int __i1) {
1371 __m64_union res;
1372
1373 res.as_int[0] = __i0;
1374 res.as_int[1] = __i1;
1375 return (res.as_m64);
1376}
1377
1378extern __inline __m64
1379 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1380 _mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) {
1381 return _mm_set_pi16(__w3, __w2, __w1, __w0);
1382}
1383
1384extern __inline __m64
1385 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1386 _mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4,
1387 char __b5, char __b6, char __b7) {
1388 return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1389}
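/* Note on lane ordering (example values assumed): _mm_set_pi8(7, 6, 5, 4,
   3, 2, 1, 0) and _mm_setr_pi8(0, 1, 2, 3, 4, 5, 6, 7) build the same
   __m64, with byte 0 in the least-significant position. */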
1390
1391/* Creates a vector of two 32-bit values, both elements containing I. */
1392extern __inline __m64
1393 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1394 _mm_set1_pi32(int __i) {
1395 __m64_union res;
1396
1397 res.as_int[0] = __i;
1398 res.as_int[1] = __i;
1399 return (res.as_m64);
1400}
1401
1402/* Creates a vector of four 16-bit values, all elements containing W. */
1403extern __inline __m64
1404 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1405 _mm_set1_pi16(short __w) {
1406#if _ARCH_PWR9
1407 __vector signed short w;
1408
1409 w = (__vector signed short)vec_splats(__w);
1410 return (__m64)((__vector long long)w)[0];
1411#else
1412 __m64_union res;
1413
1414 res.as_short[0] = __w;
1415 res.as_short[1] = __w;
1416 res.as_short[2] = __w;
1417 res.as_short[3] = __w;
1418 return (res.as_m64);
1419#endif
1420}
1421
1422/* Creates a vector of eight 8-bit values, all elements containing B. */
1423extern __inline __m64
1424 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
1425 _mm_set1_pi8(signed char __b) {
1426#if _ARCH_PWR8
1427 __vector signed char b;
1428
1429 b = (__vector signed char)vec_splats(__b);
1430 return (__m64)((__vector long long)b)[0];
1431#else
1432 __m64_union res;
1433
1434 res.as_char[0] = __b;
1435 res.as_char[1] = __b;
1436 res.as_char[2] = __b;
1437 res.as_char[3] = __b;
1438 res.as_char[4] = __b;
1439 res.as_char[5] = __b;
1440 res.as_char[6] = __b;
1441 res.as_char[7] = __b;
1442 return (res.as_m64);
1443#endif
1444}

#else
#include_next <mmintrin.h>
#endif /* defined(__ppc64__) && (defined(__linux__) || defined(__FreeBSD__)) \
        */

#endif /* _MMINTRIN_H_INCLUDED */