blob: bcffa8187801c7e65e3131e5ac4b1a8594b015f3 [file] [log] [blame]
/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

10#ifndef __TMMINTRIN_H
11#define __TMMINTRIN_H
12
Pirama Arumuga Nainarec8c89d2022-02-23 09:26:16 -080013#if !defined(__i386__) && !defined(__x86_64__)
14#error "This header is only meant to be used on x86 and x64 architecture"
15#endif
16
Logan Chien2833ffb2018-10-09 10:03:24 +080017#include <pmmintrin.h>
18
/* Define the default attributes for the functions in this file.
 *
 * __DEFAULT_FN_ATTRS is applied to the intrinsics operating on 128-bit
 * (__m128i) vectors; __DEFAULT_FN_ATTRS_MMX is applied to the intrinsics
 * operating on 64-bit (__m64) vectors and additionally enables the "mmx"
 * target feature.
 */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),         \
                 __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"),     \
                 __min_vector_width__(64)))

Logan Chien55afb0a2018-10-15 10:42:14 +080023/// Computes the absolute value of each of the packed 8-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080024/// integers in the source operand and stores the 8-bit unsigned integer
25/// results in the destination.
26///
27/// \headerfile <x86intrin.h>
28///
29/// This intrinsic corresponds to the \c PABSB instruction.
30///
31/// \param __a
32/// A 64-bit vector of [8 x i8].
33/// \returns A 64-bit integer vector containing the absolute values of the
34/// elements in the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +080035static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +080036_mm_abs_pi8(__m64 __a)
37{
38 return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
39}
40
Logan Chien55afb0a2018-10-15 10:42:14 +080041/// Computes the absolute value of each of the packed 8-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080042/// integers in the source operand and stores the 8-bit unsigned integer
43/// results in the destination.
44///
45/// \headerfile <x86intrin.h>
46///
47/// This intrinsic corresponds to the \c VPABSB instruction.
48///
49/// \param __a
50/// A 128-bit vector of [16 x i8].
51/// \returns A 128-bit integer vector containing the absolute values of the
52/// elements in the operand.
53static __inline__ __m128i __DEFAULT_FN_ATTRS
54_mm_abs_epi8(__m128i __a)
55{
56 return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
57}
58
Logan Chien55afb0a2018-10-15 10:42:14 +080059/// Computes the absolute value of each of the packed 16-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080060/// integers in the source operand and stores the 16-bit unsigned integer
61/// results in the destination.
62///
63/// \headerfile <x86intrin.h>
64///
65/// This intrinsic corresponds to the \c PABSW instruction.
66///
67/// \param __a
68/// A 64-bit vector of [4 x i16].
69/// \returns A 64-bit integer vector containing the absolute values of the
70/// elements in the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +080071static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +080072_mm_abs_pi16(__m64 __a)
73{
74 return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
75}
76
Logan Chien55afb0a2018-10-15 10:42:14 +080077/// Computes the absolute value of each of the packed 16-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080078/// integers in the source operand and stores the 16-bit unsigned integer
79/// results in the destination.
80///
81/// \headerfile <x86intrin.h>
82///
83/// This intrinsic corresponds to the \c VPABSW instruction.
84///
85/// \param __a
86/// A 128-bit vector of [8 x i16].
87/// \returns A 128-bit integer vector containing the absolute values of the
88/// elements in the operand.
89static __inline__ __m128i __DEFAULT_FN_ATTRS
90_mm_abs_epi16(__m128i __a)
91{
92 return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
93}
94
Logan Chien55afb0a2018-10-15 10:42:14 +080095/// Computes the absolute value of each of the packed 32-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080096/// integers in the source operand and stores the 32-bit unsigned integer
97/// results in the destination.
98///
99/// \headerfile <x86intrin.h>
100///
101/// This intrinsic corresponds to the \c PABSD instruction.
102///
103/// \param __a
104/// A 64-bit vector of [2 x i32].
105/// \returns A 64-bit integer vector containing the absolute values of the
106/// elements in the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +0800107static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800108_mm_abs_pi32(__m64 __a)
109{
110 return (__m64)__builtin_ia32_pabsd((__v2si)__a);
111}
112
Logan Chien55afb0a2018-10-15 10:42:14 +0800113/// Computes the absolute value of each of the packed 32-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +0800114/// integers in the source operand and stores the 32-bit unsigned integer
115/// results in the destination.
116///
117/// \headerfile <x86intrin.h>
118///
119/// This intrinsic corresponds to the \c VPABSD instruction.
120///
121/// \param __a
122/// A 128-bit vector of [4 x i32].
123/// \returns A 128-bit integer vector containing the absolute values of the
124/// elements in the operand.
125static __inline__ __m128i __DEFAULT_FN_ATTRS
126_mm_abs_epi32(__m128i __a)
127{
128 return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
129}
130
/// Concatenates the two 128-bit integer vector operands, and
///    right-shifts the result by the number of bytes specified in the immediate
///    operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))

/// Concatenates the two 64-bit integer vector operands, and right-shifts
///    the result by the number of bytes specified in the immediate operand.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))

Logan Chien55afb0a2018-10-15 10:42:14 +0800177/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800178/// 128-bit vectors of [8 x i16].
179///
180/// \headerfile <x86intrin.h>
181///
182/// This intrinsic corresponds to the \c VPHADDW instruction.
183///
184/// \param __a
185/// A 128-bit vector of [8 x i16] containing one of the source operands. The
186/// horizontal sums of the values are stored in the lower bits of the
187/// destination.
188/// \param __b
189/// A 128-bit vector of [8 x i16] containing one of the source operands. The
190/// horizontal sums of the values are stored in the upper bits of the
191/// destination.
192/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
193/// both operands.
194static __inline__ __m128i __DEFAULT_FN_ATTRS
195_mm_hadd_epi16(__m128i __a, __m128i __b)
196{
197 return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
198}
199
Logan Chien55afb0a2018-10-15 10:42:14 +0800200/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800201/// 128-bit vectors of [4 x i32].
202///
203/// \headerfile <x86intrin.h>
204///
205/// This intrinsic corresponds to the \c VPHADDD instruction.
206///
207/// \param __a
208/// A 128-bit vector of [4 x i32] containing one of the source operands. The
209/// horizontal sums of the values are stored in the lower bits of the
210/// destination.
211/// \param __b
212/// A 128-bit vector of [4 x i32] containing one of the source operands. The
213/// horizontal sums of the values are stored in the upper bits of the
214/// destination.
215/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
216/// both operands.
217static __inline__ __m128i __DEFAULT_FN_ATTRS
218_mm_hadd_epi32(__m128i __a, __m128i __b)
219{
220 return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
221}
222
Logan Chien55afb0a2018-10-15 10:42:14 +0800223/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800224/// 64-bit vectors of [4 x i16].
225///
226/// \headerfile <x86intrin.h>
227///
228/// This intrinsic corresponds to the \c PHADDW instruction.
229///
230/// \param __a
231/// A 64-bit vector of [4 x i16] containing one of the source operands. The
232/// horizontal sums of the values are stored in the lower bits of the
233/// destination.
234/// \param __b
235/// A 64-bit vector of [4 x i16] containing one of the source operands. The
236/// horizontal sums of the values are stored in the upper bits of the
237/// destination.
238/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
239/// operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800240static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800241_mm_hadd_pi16(__m64 __a, __m64 __b)
242{
243 return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
244}
245
Logan Chien55afb0a2018-10-15 10:42:14 +0800246/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800247/// 64-bit vectors of [2 x i32].
248///
249/// \headerfile <x86intrin.h>
250///
251/// This intrinsic corresponds to the \c PHADDD instruction.
252///
253/// \param __a
254/// A 64-bit vector of [2 x i32] containing one of the source operands. The
255/// horizontal sums of the values are stored in the lower bits of the
256/// destination.
257/// \param __b
258/// A 64-bit vector of [2 x i32] containing one of the source operands. The
259/// horizontal sums of the values are stored in the upper bits of the
260/// destination.
261/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
262/// operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800263static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800264_mm_hadd_pi32(__m64 __a, __m64 __b)
265{
266 return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
267}
268
Logan Chien55afb0a2018-10-15 10:42:14 +0800269/// Horizontally adds the adjacent pairs of values contained in 2 packed
270/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
271/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
272/// 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800273///
274/// \headerfile <x86intrin.h>
275///
276/// This intrinsic corresponds to the \c VPHADDSW instruction.
277///
278/// \param __a
279/// A 128-bit vector of [8 x i16] containing one of the source operands. The
280/// horizontal sums of the values are stored in the lower bits of the
281/// destination.
282/// \param __b
283/// A 128-bit vector of [8 x i16] containing one of the source operands. The
284/// horizontal sums of the values are stored in the upper bits of the
285/// destination.
286/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
287/// sums of both operands.
288static __inline__ __m128i __DEFAULT_FN_ATTRS
289_mm_hadds_epi16(__m128i __a, __m128i __b)
290{
291 return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
292}
293
Logan Chien55afb0a2018-10-15 10:42:14 +0800294/// Horizontally adds the adjacent pairs of values contained in 2 packed
295/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
296/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
297/// 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800298///
299/// \headerfile <x86intrin.h>
300///
301/// This intrinsic corresponds to the \c PHADDSW instruction.
302///
303/// \param __a
304/// A 64-bit vector of [4 x i16] containing one of the source operands. The
305/// horizontal sums of the values are stored in the lower bits of the
306/// destination.
307/// \param __b
308/// A 64-bit vector of [4 x i16] containing one of the source operands. The
309/// horizontal sums of the values are stored in the upper bits of the
310/// destination.
311/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
312/// sums of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800313static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800314_mm_hadds_pi16(__m64 __a, __m64 __b)
315{
316 return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
317}
318
Logan Chien55afb0a2018-10-15 10:42:14 +0800319/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800320/// packed 128-bit vectors of [8 x i16].
321///
322/// \headerfile <x86intrin.h>
323///
324/// This intrinsic corresponds to the \c VPHSUBW instruction.
325///
326/// \param __a
327/// A 128-bit vector of [8 x i16] containing one of the source operands. The
328/// horizontal differences between the values are stored in the lower bits of
329/// the destination.
330/// \param __b
331/// A 128-bit vector of [8 x i16] containing one of the source operands. The
332/// horizontal differences between the values are stored in the upper bits of
333/// the destination.
334/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
335/// of both operands.
336static __inline__ __m128i __DEFAULT_FN_ATTRS
337_mm_hsub_epi16(__m128i __a, __m128i __b)
338{
339 return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
340}
341
Logan Chien55afb0a2018-10-15 10:42:14 +0800342/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800343/// packed 128-bit vectors of [4 x i32].
344///
345/// \headerfile <x86intrin.h>
346///
347/// This intrinsic corresponds to the \c VPHSUBD instruction.
348///
349/// \param __a
350/// A 128-bit vector of [4 x i32] containing one of the source operands. The
351/// horizontal differences between the values are stored in the lower bits of
352/// the destination.
353/// \param __b
354/// A 128-bit vector of [4 x i32] containing one of the source operands. The
355/// horizontal differences between the values are stored in the upper bits of
356/// the destination.
357/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
358/// of both operands.
359static __inline__ __m128i __DEFAULT_FN_ATTRS
360_mm_hsub_epi32(__m128i __a, __m128i __b)
361{
362 return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
363}
364
Logan Chien55afb0a2018-10-15 10:42:14 +0800365/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800366/// packed 64-bit vectors of [4 x i16].
367///
368/// \headerfile <x86intrin.h>
369///
370/// This intrinsic corresponds to the \c PHSUBW instruction.
371///
372/// \param __a
373/// A 64-bit vector of [4 x i16] containing one of the source operands. The
374/// horizontal differences between the values are stored in the lower bits of
375/// the destination.
376/// \param __b
377/// A 64-bit vector of [4 x i16] containing one of the source operands. The
378/// horizontal differences between the values are stored in the upper bits of
379/// the destination.
380/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
381/// of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800382static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800383_mm_hsub_pi16(__m64 __a, __m64 __b)
384{
385 return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
386}
387
Logan Chien55afb0a2018-10-15 10:42:14 +0800388/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800389/// packed 64-bit vectors of [2 x i32].
390///
391/// \headerfile <x86intrin.h>
392///
393/// This intrinsic corresponds to the \c PHSUBD instruction.
394///
395/// \param __a
396/// A 64-bit vector of [2 x i32] containing one of the source operands. The
397/// horizontal differences between the values are stored in the lower bits of
398/// the destination.
399/// \param __b
400/// A 64-bit vector of [2 x i32] containing one of the source operands. The
401/// horizontal differences between the values are stored in the upper bits of
402/// the destination.
403/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
404/// of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800405static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800406_mm_hsub_pi32(__m64 __a, __m64 __b)
407{
408 return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
409}
410
Logan Chien55afb0a2018-10-15 10:42:14 +0800411/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800412/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
Logan Chien55afb0a2018-10-15 10:42:14 +0800413/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
414/// saturated to 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800415///
416/// \headerfile <x86intrin.h>
417///
418/// This intrinsic corresponds to the \c VPHSUBSW instruction.
419///
420/// \param __a
421/// A 128-bit vector of [8 x i16] containing one of the source operands. The
422/// horizontal differences between the values are stored in the lower bits of
423/// the destination.
424/// \param __b
425/// A 128-bit vector of [8 x i16] containing one of the source operands. The
426/// horizontal differences between the values are stored in the upper bits of
427/// the destination.
428/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
429/// differences of both operands.
430static __inline__ __m128i __DEFAULT_FN_ATTRS
431_mm_hsubs_epi16(__m128i __a, __m128i __b)
432{
433 return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
434}
435
Logan Chien55afb0a2018-10-15 10:42:14 +0800436/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800437/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
Logan Chien55afb0a2018-10-15 10:42:14 +0800438/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
439/// saturated to 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800440///
441/// \headerfile <x86intrin.h>
442///
443/// This intrinsic corresponds to the \c PHSUBSW instruction.
444///
445/// \param __a
446/// A 64-bit vector of [4 x i16] containing one of the source operands. The
447/// horizontal differences between the values are stored in the lower bits of
448/// the destination.
449/// \param __b
450/// A 64-bit vector of [4 x i16] containing one of the source operands. The
451/// horizontal differences between the values are stored in the upper bits of
452/// the destination.
453/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
454/// differences of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800455static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800456_mm_hsubs_pi16(__m64 __a, __m64 __b)
457{
458 return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
459}
460
Logan Chien55afb0a2018-10-15 10:42:14 +0800461/// Multiplies corresponding pairs of packed 8-bit unsigned integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800462/// values contained in the first source operand and packed 8-bit signed
463/// integer values contained in the second source operand, adds pairs of
464/// contiguous products with signed saturation, and writes the 16-bit sums to
Logan Chien55afb0a2018-10-15 10:42:14 +0800465/// the corresponding bits in the destination.
466///
467/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
468/// both operands are multiplied, and the sum of both results is written to
469/// bits [15:0] of the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800470///
471/// \headerfile <x86intrin.h>
472///
473/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
474///
475/// \param __a
476/// A 128-bit integer vector containing the first source operand.
477/// \param __b
478/// A 128-bit integer vector containing the second source operand.
479/// \returns A 128-bit integer vector containing the sums of products of both
Logan Chien55afb0a2018-10-15 10:42:14 +0800480/// operands: \n
481/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
482/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
483/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
484/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
485/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
486/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
487/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
488/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
Logan Chien2833ffb2018-10-09 10:03:24 +0800489static __inline__ __m128i __DEFAULT_FN_ATTRS
490_mm_maddubs_epi16(__m128i __a, __m128i __b)
491{
492 return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
493}
494
Logan Chien55afb0a2018-10-15 10:42:14 +0800495/// Multiplies corresponding pairs of packed 8-bit unsigned integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800496/// values contained in the first source operand and packed 8-bit signed
497/// integer values contained in the second source operand, adds pairs of
498/// contiguous products with signed saturation, and writes the 16-bit sums to
Logan Chien55afb0a2018-10-15 10:42:14 +0800499/// the corresponding bits in the destination.
500///
501/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
502/// both operands are multiplied, and the sum of both results is written to
503/// bits [15:0] of the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800504///
505/// \headerfile <x86intrin.h>
506///
507/// This intrinsic corresponds to the \c PMADDUBSW instruction.
508///
509/// \param __a
510/// A 64-bit integer vector containing the first source operand.
511/// \param __b
512/// A 64-bit integer vector containing the second source operand.
513/// \returns A 64-bit integer vector containing the sums of products of both
Logan Chien55afb0a2018-10-15 10:42:14 +0800514/// operands: \n
515/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
516/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
517/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
518/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
519static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800520_mm_maddubs_pi16(__m64 __a, __m64 __b)
521{
522 return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
523}
524
Logan Chien55afb0a2018-10-15 10:42:14 +0800525/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800526/// products to the 18 most significant bits by right-shifting, rounds the
527/// truncated value by adding 1, and writes bits [16:1] to the destination.
528///
529/// \headerfile <x86intrin.h>
530///
531/// This intrinsic corresponds to the \c VPMULHRSW instruction.
532///
533/// \param __a
534/// A 128-bit vector of [8 x i16] containing one of the source operands.
535/// \param __b
536/// A 128-bit vector of [8 x i16] containing one of the source operands.
537/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
538/// products of both operands.
539static __inline__ __m128i __DEFAULT_FN_ATTRS
540_mm_mulhrs_epi16(__m128i __a, __m128i __b)
541{
542 return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
543}
544
Logan Chien55afb0a2018-10-15 10:42:14 +0800545/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800546/// products to the 18 most significant bits by right-shifting, rounds the
547/// truncated value by adding 1, and writes bits [16:1] to the destination.
548///
549/// \headerfile <x86intrin.h>
550///
551/// This intrinsic corresponds to the \c PMULHRSW instruction.
552///
553/// \param __a
554/// A 64-bit vector of [4 x i16] containing one of the source operands.
555/// \param __b
556/// A 64-bit vector of [4 x i16] containing one of the source operands.
557/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
558/// products of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800559static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800560_mm_mulhrs_pi16(__m64 __a, __m64 __b)
561{
562 return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
563}
564
Logan Chien55afb0a2018-10-15 10:42:14 +0800565/// Copies the 8-bit integers from a 128-bit integer vector to the
Logan Chien2833ffb2018-10-09 10:03:24 +0800566/// destination or clears 8-bit values in the destination, as specified by
567/// the second source operand.
568///
569/// \headerfile <x86intrin.h>
570///
571/// This intrinsic corresponds to the \c VPSHUFB instruction.
572///
573/// \param __a
574/// A 128-bit integer vector containing the values to be copied.
575/// \param __b
576/// A 128-bit integer vector containing control bytes corresponding to
577/// positions in the destination:
Logan Chien55afb0a2018-10-15 10:42:14 +0800578/// Bit 7: \n
579/// 1: Clear the corresponding byte in the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800580/// 0: Copy the selected source byte to the corresponding byte in the
Logan Chien55afb0a2018-10-15 10:42:14 +0800581/// destination. \n
582/// Bits [6:4] Reserved. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800583/// Bits [3:0] select the source byte to be copied.
584/// \returns A 128-bit integer vector containing the copied or cleared values.
585static __inline__ __m128i __DEFAULT_FN_ATTRS
586_mm_shuffle_epi8(__m128i __a, __m128i __b)
587{
588 return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
589}
590
Logan Chien55afb0a2018-10-15 10:42:14 +0800591/// Copies the 8-bit integers from a 64-bit integer vector to the
Logan Chien2833ffb2018-10-09 10:03:24 +0800592/// destination or clears 8-bit values in the destination, as specified by
593/// the second source operand.
594///
595/// \headerfile <x86intrin.h>
596///
597/// This intrinsic corresponds to the \c PSHUFB instruction.
598///
599/// \param __a
600/// A 64-bit integer vector containing the values to be copied.
601/// \param __b
602/// A 64-bit integer vector containing control bytes corresponding to
603/// positions in the destination:
Logan Chien55afb0a2018-10-15 10:42:14 +0800604/// Bit 7: \n
605/// 1: Clear the corresponding byte in the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800606/// 0: Copy the selected source byte to the corresponding byte in the
Logan Chien55afb0a2018-10-15 10:42:14 +0800607/// destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800608/// Bits [3:0] select the source byte to be copied.
609/// \returns A 64-bit integer vector containing the copied or cleared values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800610static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800611_mm_shuffle_pi8(__m64 __a, __m64 __b)
612{
613 return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
614}
615
Logan Chien55afb0a2018-10-15 10:42:14 +0800616/// For each 8-bit integer in the first source operand, perform one of
617/// the following actions as specified by the second source operand.
618///
619/// If the byte in the second source is negative, calculate the two's
620/// complement of the corresponding byte in the first source, and write that
621/// value to the destination. If the byte in the second source is positive,
622/// copy the corresponding byte from the first source to the destination. If
623/// the byte in the second source is zero, clear the corresponding byte in
624/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800625///
626/// \headerfile <x86intrin.h>
627///
628/// This intrinsic corresponds to the \c VPSIGNB instruction.
629///
630/// \param __a
631/// A 128-bit integer vector containing the values to be copied.
632/// \param __b
633/// A 128-bit integer vector containing control bytes corresponding to
634/// positions in the destination.
635/// \returns A 128-bit integer vector containing the resultant values.
636static __inline__ __m128i __DEFAULT_FN_ATTRS
637_mm_sign_epi8(__m128i __a, __m128i __b)
638{
639 return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
640}
641
Logan Chien55afb0a2018-10-15 10:42:14 +0800642/// For each 16-bit integer in the first source operand, perform one of
643/// the following actions as specified by the second source operand.
644///
645/// If the word in the second source is negative, calculate the two's
646/// complement of the corresponding word in the first source, and write that
647/// value to the destination. If the word in the second source is positive,
648/// copy the corresponding word from the first source to the destination. If
649/// the word in the second source is zero, clear the corresponding word in
650/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800651///
652/// \headerfile <x86intrin.h>
653///
654/// This intrinsic corresponds to the \c VPSIGNW instruction.
655///
656/// \param __a
657/// A 128-bit integer vector containing the values to be copied.
658/// \param __b
659/// A 128-bit integer vector containing control words corresponding to
660/// positions in the destination.
661/// \returns A 128-bit integer vector containing the resultant values.
662static __inline__ __m128i __DEFAULT_FN_ATTRS
663_mm_sign_epi16(__m128i __a, __m128i __b)
664{
665 return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
666}
667
Logan Chien55afb0a2018-10-15 10:42:14 +0800668/// For each 32-bit integer in the first source operand, perform one of
669/// the following actions as specified by the second source operand.
670///
671/// If the doubleword in the second source is negative, calculate the two's
Logan Chien2833ffb2018-10-09 10:03:24 +0800672/// complement of the corresponding word in the first source, and write that
673/// value to the destination. If the doubleword in the second source is
674/// positive, copy the corresponding word from the first source to the
675/// destination. If the doubleword in the second source is zero, clear the
676/// corresponding word in the destination.
677///
678/// \headerfile <x86intrin.h>
679///
680/// This intrinsic corresponds to the \c VPSIGND instruction.
681///
682/// \param __a
683/// A 128-bit integer vector containing the values to be copied.
684/// \param __b
685/// A 128-bit integer vector containing control doublewords corresponding to
686/// positions in the destination.
687/// \returns A 128-bit integer vector containing the resultant values.
688static __inline__ __m128i __DEFAULT_FN_ATTRS
689_mm_sign_epi32(__m128i __a, __m128i __b)
690{
691 return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
692}
693
Logan Chien55afb0a2018-10-15 10:42:14 +0800694/// For each 8-bit integer in the first source operand, perform one of
695/// the following actions as specified by the second source operand.
696///
697/// If the byte in the second source is negative, calculate the two's
698/// complement of the corresponding byte in the first source, and write that
699/// value to the destination. If the byte in the second source is positive,
700/// copy the corresponding byte from the first source to the destination. If
701/// the byte in the second source is zero, clear the corresponding byte in
702/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800703///
704/// \headerfile <x86intrin.h>
705///
706/// This intrinsic corresponds to the \c PSIGNB instruction.
707///
708/// \param __a
709/// A 64-bit integer vector containing the values to be copied.
710/// \param __b
711/// A 64-bit integer vector containing control bytes corresponding to
712/// positions in the destination.
713/// \returns A 64-bit integer vector containing the resultant values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800714static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800715_mm_sign_pi8(__m64 __a, __m64 __b)
716{
717 return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
718}
719
Logan Chien55afb0a2018-10-15 10:42:14 +0800720/// For each 16-bit integer in the first source operand, perform one of
721/// the following actions as specified by the second source operand.
722///
723/// If the word in the second source is negative, calculate the two's
724/// complement of the corresponding word in the first source, and write that
725/// value to the destination. If the word in the second source is positive,
726/// copy the corresponding word from the first source to the destination. If
727/// the word in the second source is zero, clear the corresponding word in
728/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800729///
730/// \headerfile <x86intrin.h>
731///
732/// This intrinsic corresponds to the \c PSIGNW instruction.
733///
734/// \param __a
735/// A 64-bit integer vector containing the values to be copied.
736/// \param __b
737/// A 64-bit integer vector containing control words corresponding to
738/// positions in the destination.
739/// \returns A 64-bit integer vector containing the resultant values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800740static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800741_mm_sign_pi16(__m64 __a, __m64 __b)
742{
743 return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
744}
745
Logan Chien55afb0a2018-10-15 10:42:14 +0800746/// For each 32-bit integer in the first source operand, perform one of
747/// the following actions as specified by the second source operand.
748///
749/// If the doubleword in the second source is negative, calculate the two's
Logan Chien2833ffb2018-10-09 10:03:24 +0800750/// complement of the corresponding doubleword in the first source, and
751/// write that value to the destination. If the doubleword in the second
752/// source is positive, copy the corresponding doubleword from the first
753/// source to the destination. If the doubleword in the second source is
754/// zero, clear the corresponding doubleword in the destination.
755///
756/// \headerfile <x86intrin.h>
757///
758/// This intrinsic corresponds to the \c PSIGND instruction.
759///
760/// \param __a
761/// A 64-bit integer vector containing the values to be copied.
762/// \param __b
763/// A 64-bit integer vector containing two control doublewords corresponding
764/// to positions in the destination.
765/// \returns A 64-bit integer vector containing the resultant values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800766static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800767_mm_sign_pi32(__m64 __a, __m64 __b)
768{
769 return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
770}
771
/* Drop the file-local attribute helper macros so they do not leak into code
   that includes this header. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
Logan Chien2833ffb2018-10-09 10:03:24 +0800774
775#endif /* __TMMINTRIN_H */