/*===---- tmmintrin.h - Implementation of SSSE3 intrinsics on PowerPC ------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

/* Implemented from the specification included in the Intel C++ Compiler
   User Guide and Reference, version 9.0.  */

#ifndef NO_WARN_X86_INTRINSICS
/* This header is distributed to simplify porting x86_64 code that
   makes explicit use of Intel intrinsics to powerpc64le.

   It is the user's responsibility to determine if the results are
   acceptable and make additional changes as necessary.

   Note that much code that uses Intel intrinsics can be rewritten in
   standard C or GNU C extensions, which are more portable and better
   optimized across multiple targets.  */
#endif
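
/* A minimal porting sketch (illustrative only): x86_64 sources that contain
       #include <tmmintrin.h>
   and call SSSE3 intrinsics such as _mm_shuffle_epi8 or _mm_hadd_epi16 can
   be recompiled for powerpc64le with this wrapper directory on the include
   path; each intrinsic then expands into the AltiVec/VSX sequence defined
   below.  The results should still be validated as noted above.  */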

#ifndef TMMINTRIN_H_
#define TMMINTRIN_H_

#if defined(__linux__) && defined(__ppc64__)

#include <altivec.h>

/* We need definitions from the SSE header files.  */
#include <pmmintrin.h>

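/* _mm_abs_epi{8,16,32} and _mm_abs_pi{8,16,32} compute the element-wise
   absolute value and map directly to vec_abs.  The 64-bit (_pi*) variants
   splat the operand into both halves of a 128-bit vector and return the
   low doubleword of the result.  */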
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi16 (__m128i __A)
{
  return (__m128i) vec_abs ((__v8hi) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi32 (__m128i __A)
{
  return (__m128i) vec_abs ((__v4si) __A);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_epi8 (__m128i __A)
{
  return (__m128i) vec_abs ((__v16qi) __A);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi16 (__m64 __A)
{
  __v8hi __B = (__v8hi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi32 (__m64 __A)
{
  __v4si __B = (__v4si) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_abs_pi8 (__m64 __A)
{
  __v16qi __B = (__v16qi) (__v2du) { __A, __A };
  return (__m64) ((__v2du) vec_abs (__B))[0];
}

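/* _mm_alignr_epi8 concatenates __A (high) with __B (low) into a 32-byte
   value, shifts it right by __count bytes and returns the low 16 bytes.
   A compile-time __count below 16 maps to a single vec_sld (with byte
   reversal on little endian); other counts fall back to vec_slo/vec_sro
   octet shifts.  _mm_alignr_pi8 does the same on the 16-byte concatenation
   of its two 64-bit operands.  */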
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_epi8 (__m128i __A, __m128i __B, const unsigned int __count)
{
  if (__builtin_constant_p (__count) && __count < 16)
    {
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
      __B = (__m128i) vec_reve ((__v16qu) __B);
#endif
      __A = (__m128i) vec_sld ((__v16qu) __B, (__v16qu) __A, __count);
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_reve ((__v16qu) __A);
#endif
      return __A;
    }

  if (__count == 0)
    return __B;

  if (__count >= 16)
    {
      if (__count >= 32)
        {
          const __v16qu zero = { 0 };
          return (__m128i) zero;
        }
      else
        {
          const __v16qu __shift =
            vec_splats ((unsigned char) ((__count - 16) * 8));
#ifdef __LITTLE_ENDIAN__
          return (__m128i) vec_sro ((__v16qu) __A, __shift);
#else
          return (__m128i) vec_slo ((__v16qu) __A, __shift);
#endif
        }
    }
  else
    {
      const __v16qu __shiftA =
        vec_splats ((unsigned char) ((16 - __count) * 8));
      const __v16qu __shiftB = vec_splats ((unsigned char) (__count * 8));
#ifdef __LITTLE_ENDIAN__
      __A = (__m128i) vec_slo ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_sro ((__v16qu) __B, __shiftB);
#else
      __A = (__m128i) vec_sro ((__v16qu) __A, __shiftA);
      __B = (__m128i) vec_slo ((__v16qu) __B, __shiftB);
#endif
      return (__m128i) vec_or ((__v16qu) __A, (__v16qu) __B);
    }
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_alignr_pi8 (__m64 __A, __m64 __B, unsigned int __count)
{
  if (__count < 16)
    {
      __v2du __C = { __B, __A };
#ifdef __LITTLE_ENDIAN__
      const __v4su __shift = { __count << 3, 0, 0, 0 };
      __C = (__v2du) vec_sro ((__v16qu) __C, (__v16qu) __shift);
#else
      const __v4su __shift = { 0, 0, 0, __count << 3 };
      __C = (__v2du) vec_slo ((__v16qu) __C, (__v16qu) __shift);
#endif
      return (__m64) __C[0];
    }
  else
    {
      const __m64 __zero = { 0 };
      return __zero;
    }
}

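/* The horizontal-add intrinsics sum adjacent element pairs: the low half
   of the result comes from __A, the high half from __B.  Two vec_perm
   operations gather the even- and odd-indexed elements, which are then
   added with vec_add.  */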
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_add (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadd_pi32 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_add (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

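/* Saturating horizontal add: vec_sum4s adds adjacent halfword pairs into
   32-bit sums, and vec_packs packs them back to halfwords with signed
   saturation.  */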
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = { 0 }, __D = { 0 };
  __C = vec_sum4s ((__v8hi) __A, __C);
  __D = vec_sum4s ((__v8hi) __B, __D);
  __C = (__v4si) vec_packs (__C, __D);
  return (__m128i) __C;
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hadds_pi16 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v4si __D = vec_sum4s (__C, __zero);
  __C = vec_packs (__D, __D);
  return (__m64) ((__v2du) __C)[1];
}

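/* The horizontal-subtract intrinsics compute even-indexed minus
   odd-indexed adjacent elements, using the same gather permutations as
   the horizontal adds followed by vec_sub.  */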
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_epi32 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 };
  __v4si __C = vec_perm ((__v4si) __A, (__v4si) __B, __P);
  __v4si __D = vec_perm ((__v4si) __A, (__v4si) __B, __Q);
  return (__m128i) vec_sub (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsub_pi32 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 8, 9, 10, 11 };
  const __v16qu __Q =
    { 4, 5, 6, 7, 12, 13, 14, 15, 4, 5, 6, 7, 12, 13, 14, 15 };
  __v4si __C = (__v4si) (__v2du) { __A, __B };
  __v4si __D = vec_perm (__C, __C, __Q);
  __C = vec_perm (__C, __C, __P);
  __C = vec_sub (__C, __D);
  return (__m64) ((__v2du) __C)[1];
}

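/* Saturating horizontal subtract: the same gather permutations, with
   vec_subs providing signed saturation.  */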
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_epi16 (__m128i __A, __m128i __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __v8hi __C = vec_perm ((__v8hi) __A, (__v8hi) __B, __P);
  __v8hi __D = vec_perm ((__v8hi) __A, (__v8hi) __B, __Q);
  return (__m128i) vec_subs (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_hsubs_pi16 (__m64 __A, __m64 __B)
{
  const __v16qu __P =
    { 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13 };
  const __v16qu __Q =
    { 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __B };
  __v8hi __D = vec_perm (__C, __C, __P);
  __v8hi __E = vec_perm (__C, __C, __Q);
  __C = vec_subs (__D, __E);
  return (__m64) ((__v2du) __C)[1];
}

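/* _mm_shuffle_epi8 performs a per-byte table lookup into __A controlled
   by __B; control bytes with the sign bit set yield zero.  vec_perm does
   the lookup and vec_sel zeroes the selected lanes.  */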
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __vector __bool char __select = vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __C = vec_perm ((__v16qi) __A, (__v16qi) __A, (__v16qu) __B);
  return (__m128i) vec_sel (__C, __zero, __select);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __vector __bool char __select = vec_cmplt ((__v16qi) __D, __zero);
  __C = vec_perm ((__v16qi) __C, (__v16qi) __C, (__v16qu) __D);
  __C = vec_sel (__C, __zero, __select);
  return (__m64) ((__v2du) (__C))[0];
}

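/* The sign intrinsics negate elements of __A where the corresponding
   element of __B is negative, zero them where __B is zero, and pass them
   through where __B is positive.  A -1/0/+1 multiplier vector is built
   from the two comparisons and applied with vec_mul.  */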
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi8 (__m128i __A, __m128i __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __selectneg = (__v16qi) vec_cmplt ((__v16qi) __B, __zero);
  __v16qi __selectpos =
    (__v16qi) vec_neg ((__v16qi) vec_cmpgt ((__v16qi) __B, __zero));
  __v16qi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v16qi) __A, (__v16qi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi16 (__m128i __A, __m128i __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __selectneg = (__v8hi) vec_cmplt ((__v8hi) __B, __zero);
  __v8hi __selectpos =
    (__v8hi) vec_neg ((__v8hi) vec_cmpgt ((__v8hi) __B, __zero));
  __v8hi __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v8hi) __A, (__v8hi) __conv);
}

extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_epi32 (__m128i __A, __m128i __B)
{
  const __v4si __zero = { 0 };
  __v4si __selectneg = (__v4si) vec_cmplt ((__v4si) __B, __zero);
  __v4si __selectpos =
    (__v4si) vec_neg ((__v4si) vec_cmpgt ((__v4si) __B, __zero));
  __v4si __conv = vec_add (__selectneg, __selectpos);
  return (__m128i) vec_mul ((__v4si) __A, (__v4si) __conv);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi8 (__m64 __A, __m64 __B)
{
  const __v16qi __zero = { 0 };
  __v16qi __C = (__v16qi) (__v2du) { __A, __A };
  __v16qi __D = (__v16qi) (__v2du) { __B, __B };
  __C = (__v16qi) _mm_sign_epi8 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi16 (__m64 __A, __m64 __B)
{
  const __v8hi __zero = { 0 };
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __C = (__v8hi) _mm_sign_epi16 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sign_pi32 (__m64 __A, __m64 __B)
{
  const __v4si __zero = { 0 };
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __C = (__v4si) _mm_sign_epi32 ((__m128i) __C, (__m128i) __D);
  return (__m64) ((__v2du) (__C))[0];
}

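/* _mm_maddubs_epi16 multiplies the unsigned bytes of __A by the signed
   bytes of __B and adds adjacent product pairs with signed saturation.
   The bytes are widened to halfwords (masking __A to keep it unsigned),
   multiplied, and the pair sums are formed with vec_perm and vec_adds.  */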
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_epi16 (__m128i __A, __m128i __B)
{
  __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __v8hi __C = vec_and (vec_unpackh ((__v16qi) __A), __unsigned);
  __v8hi __D = vec_and (vec_unpackl ((__v16qi) __A), __unsigned);
  __v8hi __E = vec_unpackh ((__v16qi) __B);
  __v8hi __F = vec_unpackl ((__v16qi) __B);
  __C = vec_mul (__C, __E);
  __D = vec_mul (__D, __F);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __E = vec_perm (__C, __D, __odds);
  __F = vec_perm (__C, __D, __evens);
  return (__m128i) vec_adds (__E, __F);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_maddubs_pi16 (__m64 __A, __m64 __B)
{
  __v8hi __C = (__v8hi) (__v2du) { __A, __A };
  __C = vec_unpackl ((__v16qi) __C);
  const __v8hi __unsigned = vec_splats ((signed short) 0x00ff);
  __C = vec_and (__C, __unsigned);
  __v8hi __D = (__v8hi) (__v2du) { __B, __B };
  __D = vec_unpackl ((__v16qi) __D);
  __D = vec_mul (__C, __D);
  const __v16qu __odds =
    { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 };
  const __v16qu __evens =
    { 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 };
  __C = vec_perm (__D, __D, __odds);
  __D = vec_perm (__D, __D, __evens);
  __C = vec_adds (__C, __D);
  return (__m64) ((__v2du) (__C))[0];
}

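/* _mm_mulhrs_epi16 computes the rounded high half of each 16-bit product:
   the full product is formed in 32 bits, shifted right by 14, incremented
   and shifted right once more, i.e. ((a * b + 0x4000) >> 15) truncated to
   16 bits.  */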
extern __inline __m128i
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_epi16 (__m128i __A, __m128i __B)
{
  __v4si __C = vec_unpackh ((__v8hi) __A);
  __v4si __D = vec_unpackh ((__v8hi) __B);
  __C = vec_mul (__C, __D);
  __D = vec_unpackl ((__v8hi) __A);
  __v4si __E = vec_unpackl ((__v8hi) __B);
  __D = vec_mul (__D, __E);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  __D = vec_sr (__D, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __D = vec_add (__D, __ones);
  __D = vec_sr (__D, (__v4su) __ones);
  return (__m128i) vec_pack (__C, __D);
}

extern __inline __m64
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhrs_pi16 (__m64 __A, __m64 __B)
{
  __v4si __C = (__v4si) (__v2du) { __A, __A };
  __C = vec_unpackh ((__v8hi) __C);
  __v4si __D = (__v4si) (__v2du) { __B, __B };
  __D = vec_unpackh ((__v8hi) __D);
  __C = vec_mul (__C, __D);
  const __v4su __shift = vec_splats ((unsigned int) 14);
  __C = vec_sr (__C, __shift);
  const __v4si __ones = vec_splats ((signed int) 1);
  __C = vec_add (__C, __ones);
  __C = vec_sr (__C, (__v4su) __ones);
  __v8hi __E = vec_pack (__C, __D);
  return (__m64) ((__v2du) (__E))[0];
}

#else
#include_next <tmmintrin.h>
#endif /* defined(__linux__) && defined(__ppc64__) */

#endif /* TMMINTRIN_H_ */