blob: dbd959d0a62cb3d8f65a3786b60e657c0f4fb2a0 [file] [log] [blame]
/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
9
10#ifndef __TMMINTRIN_H
11#define __TMMINTRIN_H
12
13#include <pmmintrin.h>
14
/* Define the default attributes for the functions in this file.
 *
 * Both sets force inlining, suppress debug info for the wrapper itself, and
 * declare a minimum vector width of 64 bits. __DEFAULT_FN_ATTRS enables the
 * "ssse3" target feature; __DEFAULT_FN_ATTRS_MMX additionally enables "mmx"
 * and is used by the intrinsics that operate on 64-bit __m64 vectors. */
#define __DEFAULT_FN_ATTRS                                                     \
  __attribute__((__always_inline__, __nodebug__, __target__("ssse3"),          \
                 __min_vector_width__(64)))
#define __DEFAULT_FN_ATTRS_MMX                                                 \
  __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"),      \
                 __min_vector_width__(64)))
Logan Chien2833ffb2018-10-09 10:03:24 +080018
Logan Chien55afb0a2018-10-15 10:42:14 +080019/// Computes the absolute value of each of the packed 8-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080020/// integers in the source operand and stores the 8-bit unsigned integer
21/// results in the destination.
22///
23/// \headerfile <x86intrin.h>
24///
25/// This intrinsic corresponds to the \c PABSB instruction.
26///
27/// \param __a
28/// A 64-bit vector of [8 x i8].
29/// \returns A 64-bit integer vector containing the absolute values of the
30/// elements in the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +080031static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +080032_mm_abs_pi8(__m64 __a)
33{
34 return (__m64)__builtin_ia32_pabsb((__v8qi)__a);
35}
36
Logan Chien55afb0a2018-10-15 10:42:14 +080037/// Computes the absolute value of each of the packed 8-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080038/// integers in the source operand and stores the 8-bit unsigned integer
39/// results in the destination.
40///
41/// \headerfile <x86intrin.h>
42///
43/// This intrinsic corresponds to the \c VPABSB instruction.
44///
45/// \param __a
46/// A 128-bit vector of [16 x i8].
47/// \returns A 128-bit integer vector containing the absolute values of the
48/// elements in the operand.
49static __inline__ __m128i __DEFAULT_FN_ATTRS
50_mm_abs_epi8(__m128i __a)
51{
52 return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a);
53}
54
Logan Chien55afb0a2018-10-15 10:42:14 +080055/// Computes the absolute value of each of the packed 16-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080056/// integers in the source operand and stores the 16-bit unsigned integer
57/// results in the destination.
58///
59/// \headerfile <x86intrin.h>
60///
61/// This intrinsic corresponds to the \c PABSW instruction.
62///
63/// \param __a
64/// A 64-bit vector of [4 x i16].
65/// \returns A 64-bit integer vector containing the absolute values of the
66/// elements in the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +080067static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +080068_mm_abs_pi16(__m64 __a)
69{
70 return (__m64)__builtin_ia32_pabsw((__v4hi)__a);
71}
72
Logan Chien55afb0a2018-10-15 10:42:14 +080073/// Computes the absolute value of each of the packed 16-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080074/// integers in the source operand and stores the 16-bit unsigned integer
75/// results in the destination.
76///
77/// \headerfile <x86intrin.h>
78///
79/// This intrinsic corresponds to the \c VPABSW instruction.
80///
81/// \param __a
82/// A 128-bit vector of [8 x i16].
83/// \returns A 128-bit integer vector containing the absolute values of the
84/// elements in the operand.
85static __inline__ __m128i __DEFAULT_FN_ATTRS
86_mm_abs_epi16(__m128i __a)
87{
88 return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a);
89}
90
Logan Chien55afb0a2018-10-15 10:42:14 +080091/// Computes the absolute value of each of the packed 32-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +080092/// integers in the source operand and stores the 32-bit unsigned integer
93/// results in the destination.
94///
95/// \headerfile <x86intrin.h>
96///
97/// This intrinsic corresponds to the \c PABSD instruction.
98///
99/// \param __a
100/// A 64-bit vector of [2 x i32].
101/// \returns A 64-bit integer vector containing the absolute values of the
102/// elements in the operand.
Logan Chien55afb0a2018-10-15 10:42:14 +0800103static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800104_mm_abs_pi32(__m64 __a)
105{
106 return (__m64)__builtin_ia32_pabsd((__v2si)__a);
107}
108
Logan Chien55afb0a2018-10-15 10:42:14 +0800109/// Computes the absolute value of each of the packed 32-bit signed
Logan Chien2833ffb2018-10-09 10:03:24 +0800110/// integers in the source operand and stores the 32-bit unsigned integer
111/// results in the destination.
112///
113/// \headerfile <x86intrin.h>
114///
115/// This intrinsic corresponds to the \c VPABSD instruction.
116///
117/// \param __a
118/// A 128-bit vector of [4 x i32].
119/// \returns A 128-bit integer vector containing the absolute values of the
120/// elements in the operand.
121static __inline__ __m128i __DEFAULT_FN_ATTRS
122_mm_abs_epi32(__m128i __a)
123{
124 return (__m128i)__builtin_ia32_pabsd128((__v4si)__a);
125}
126
/// Concatenates the two 128-bit integer vector operands, and
///    right-shifts the result by the number of bytes specified in the immediate
///    operand.
///
///    This is provided as a macro because \a n is an immediate operand of the
///    underlying builtin.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 128-bit vector of [16 x i8] containing one of the source operands. It
///    forms the upper 128 bits of the concatenated value.
/// \param b
///    A 128-bit vector of [16 x i8] containing one of the source operands. It
///    forms the lower 128 bits of the concatenated value.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 128-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_epi8(a, b, n) \
  ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \
                                      (__v16qi)(__m128i)(b), (n)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800150
/// Concatenates the two 64-bit integer vector operands, and right-shifts
///    the result by the number of bytes specified in the immediate operand.
///
///    This is provided as a macro because \a n is an immediate operand of the
///    underlying builtin.
///
/// \headerfile <x86intrin.h>
///
/// \code
/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n);
/// \endcode
///
/// This intrinsic corresponds to the \c PALIGNR instruction.
///
/// \param a
///    A 64-bit vector of [8 x i8] containing one of the source operands. It
///    forms the upper 64 bits of the concatenated value.
/// \param b
///    A 64-bit vector of [8 x i8] containing one of the source operands. It
///    forms the lower 64 bits of the concatenated value.
/// \param n
///    An immediate operand specifying how many bytes to right-shift the result.
/// \returns A 64-bit integer vector containing the concatenated right-shifted
///    value.
#define _mm_alignr_pi8(a, b, n) \
  ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n)))
Logan Chien2833ffb2018-10-09 10:03:24 +0800172
Logan Chien55afb0a2018-10-15 10:42:14 +0800173/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800174/// 128-bit vectors of [8 x i16].
175///
176/// \headerfile <x86intrin.h>
177///
178/// This intrinsic corresponds to the \c VPHADDW instruction.
179///
180/// \param __a
181/// A 128-bit vector of [8 x i16] containing one of the source operands. The
182/// horizontal sums of the values are stored in the lower bits of the
183/// destination.
184/// \param __b
185/// A 128-bit vector of [8 x i16] containing one of the source operands. The
186/// horizontal sums of the values are stored in the upper bits of the
187/// destination.
188/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of
189/// both operands.
190static __inline__ __m128i __DEFAULT_FN_ATTRS
191_mm_hadd_epi16(__m128i __a, __m128i __b)
192{
193 return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b);
194}
195
Logan Chien55afb0a2018-10-15 10:42:14 +0800196/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800197/// 128-bit vectors of [4 x i32].
198///
199/// \headerfile <x86intrin.h>
200///
201/// This intrinsic corresponds to the \c VPHADDD instruction.
202///
203/// \param __a
204/// A 128-bit vector of [4 x i32] containing one of the source operands. The
205/// horizontal sums of the values are stored in the lower bits of the
206/// destination.
207/// \param __b
208/// A 128-bit vector of [4 x i32] containing one of the source operands. The
209/// horizontal sums of the values are stored in the upper bits of the
210/// destination.
211/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of
212/// both operands.
213static __inline__ __m128i __DEFAULT_FN_ATTRS
214_mm_hadd_epi32(__m128i __a, __m128i __b)
215{
216 return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b);
217}
218
Logan Chien55afb0a2018-10-15 10:42:14 +0800219/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800220/// 64-bit vectors of [4 x i16].
221///
222/// \headerfile <x86intrin.h>
223///
224/// This intrinsic corresponds to the \c PHADDW instruction.
225///
226/// \param __a
227/// A 64-bit vector of [4 x i16] containing one of the source operands. The
228/// horizontal sums of the values are stored in the lower bits of the
229/// destination.
230/// \param __b
231/// A 64-bit vector of [4 x i16] containing one of the source operands. The
232/// horizontal sums of the values are stored in the upper bits of the
233/// destination.
234/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both
235/// operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800236static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800237_mm_hadd_pi16(__m64 __a, __m64 __b)
238{
239 return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b);
240}
241
Logan Chien55afb0a2018-10-15 10:42:14 +0800242/// Horizontally adds the adjacent pairs of values contained in 2 packed
Logan Chien2833ffb2018-10-09 10:03:24 +0800243/// 64-bit vectors of [2 x i32].
244///
245/// \headerfile <x86intrin.h>
246///
247/// This intrinsic corresponds to the \c PHADDD instruction.
248///
249/// \param __a
250/// A 64-bit vector of [2 x i32] containing one of the source operands. The
251/// horizontal sums of the values are stored in the lower bits of the
252/// destination.
253/// \param __b
254/// A 64-bit vector of [2 x i32] containing one of the source operands. The
255/// horizontal sums of the values are stored in the upper bits of the
256/// destination.
257/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both
258/// operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800259static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800260_mm_hadd_pi32(__m64 __a, __m64 __b)
261{
262 return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b);
263}
264
Logan Chien55afb0a2018-10-15 10:42:14 +0800265/// Horizontally adds the adjacent pairs of values contained in 2 packed
266/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are
267/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
268/// 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800269///
270/// \headerfile <x86intrin.h>
271///
272/// This intrinsic corresponds to the \c VPHADDSW instruction.
273///
274/// \param __a
275/// A 128-bit vector of [8 x i16] containing one of the source operands. The
276/// horizontal sums of the values are stored in the lower bits of the
277/// destination.
278/// \param __b
279/// A 128-bit vector of [8 x i16] containing one of the source operands. The
280/// horizontal sums of the values are stored in the upper bits of the
281/// destination.
282/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
283/// sums of both operands.
284static __inline__ __m128i __DEFAULT_FN_ATTRS
285_mm_hadds_epi16(__m128i __a, __m128i __b)
286{
287 return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b);
288}
289
Logan Chien55afb0a2018-10-15 10:42:14 +0800290/// Horizontally adds the adjacent pairs of values contained in 2 packed
291/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are
292/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to
293/// 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800294///
295/// \headerfile <x86intrin.h>
296///
297/// This intrinsic corresponds to the \c PHADDSW instruction.
298///
299/// \param __a
300/// A 64-bit vector of [4 x i16] containing one of the source operands. The
301/// horizontal sums of the values are stored in the lower bits of the
302/// destination.
303/// \param __b
304/// A 64-bit vector of [4 x i16] containing one of the source operands. The
305/// horizontal sums of the values are stored in the upper bits of the
306/// destination.
307/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
308/// sums of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800309static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800310_mm_hadds_pi16(__m64 __a, __m64 __b)
311{
312 return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b);
313}
314
Logan Chien55afb0a2018-10-15 10:42:14 +0800315/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800316/// packed 128-bit vectors of [8 x i16].
317///
318/// \headerfile <x86intrin.h>
319///
320/// This intrinsic corresponds to the \c VPHSUBW instruction.
321///
322/// \param __a
323/// A 128-bit vector of [8 x i16] containing one of the source operands. The
324/// horizontal differences between the values are stored in the lower bits of
325/// the destination.
326/// \param __b
327/// A 128-bit vector of [8 x i16] containing one of the source operands. The
328/// horizontal differences between the values are stored in the upper bits of
329/// the destination.
330/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences
331/// of both operands.
332static __inline__ __m128i __DEFAULT_FN_ATTRS
333_mm_hsub_epi16(__m128i __a, __m128i __b)
334{
335 return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b);
336}
337
Logan Chien55afb0a2018-10-15 10:42:14 +0800338/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800339/// packed 128-bit vectors of [4 x i32].
340///
341/// \headerfile <x86intrin.h>
342///
343/// This intrinsic corresponds to the \c VPHSUBD instruction.
344///
345/// \param __a
346/// A 128-bit vector of [4 x i32] containing one of the source operands. The
347/// horizontal differences between the values are stored in the lower bits of
348/// the destination.
349/// \param __b
350/// A 128-bit vector of [4 x i32] containing one of the source operands. The
351/// horizontal differences between the values are stored in the upper bits of
352/// the destination.
353/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences
354/// of both operands.
355static __inline__ __m128i __DEFAULT_FN_ATTRS
356_mm_hsub_epi32(__m128i __a, __m128i __b)
357{
358 return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b);
359}
360
Logan Chien55afb0a2018-10-15 10:42:14 +0800361/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800362/// packed 64-bit vectors of [4 x i16].
363///
364/// \headerfile <x86intrin.h>
365///
366/// This intrinsic corresponds to the \c PHSUBW instruction.
367///
368/// \param __a
369/// A 64-bit vector of [4 x i16] containing one of the source operands. The
370/// horizontal differences between the values are stored in the lower bits of
371/// the destination.
372/// \param __b
373/// A 64-bit vector of [4 x i16] containing one of the source operands. The
374/// horizontal differences between the values are stored in the upper bits of
375/// the destination.
376/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences
377/// of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800378static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800379_mm_hsub_pi16(__m64 __a, __m64 __b)
380{
381 return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b);
382}
383
Logan Chien55afb0a2018-10-15 10:42:14 +0800384/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800385/// packed 64-bit vectors of [2 x i32].
386///
387/// \headerfile <x86intrin.h>
388///
389/// This intrinsic corresponds to the \c PHSUBD instruction.
390///
391/// \param __a
392/// A 64-bit vector of [2 x i32] containing one of the source operands. The
393/// horizontal differences between the values are stored in the lower bits of
394/// the destination.
395/// \param __b
396/// A 64-bit vector of [2 x i32] containing one of the source operands. The
397/// horizontal differences between the values are stored in the upper bits of
398/// the destination.
399/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences
400/// of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800401static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800402_mm_hsub_pi32(__m64 __a, __m64 __b)
403{
404 return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b);
405}
406
Logan Chien55afb0a2018-10-15 10:42:14 +0800407/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800408/// packed 128-bit vectors of [8 x i16]. Positive differences greater than
Logan Chien55afb0a2018-10-15 10:42:14 +0800409/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
410/// saturated to 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800411///
412/// \headerfile <x86intrin.h>
413///
414/// This intrinsic corresponds to the \c VPHSUBSW instruction.
415///
416/// \param __a
417/// A 128-bit vector of [8 x i16] containing one of the source operands. The
418/// horizontal differences between the values are stored in the lower bits of
419/// the destination.
420/// \param __b
421/// A 128-bit vector of [8 x i16] containing one of the source operands. The
422/// horizontal differences between the values are stored in the upper bits of
423/// the destination.
424/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated
425/// differences of both operands.
426static __inline__ __m128i __DEFAULT_FN_ATTRS
427_mm_hsubs_epi16(__m128i __a, __m128i __b)
428{
429 return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b);
430}
431
Logan Chien55afb0a2018-10-15 10:42:14 +0800432/// Horizontally subtracts the adjacent pairs of values contained in 2
Logan Chien2833ffb2018-10-09 10:03:24 +0800433/// packed 64-bit vectors of [4 x i16]. Positive differences greater than
Logan Chien55afb0a2018-10-15 10:42:14 +0800434/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are
435/// saturated to 0x8000.
Logan Chien2833ffb2018-10-09 10:03:24 +0800436///
437/// \headerfile <x86intrin.h>
438///
439/// This intrinsic corresponds to the \c PHSUBSW instruction.
440///
441/// \param __a
442/// A 64-bit vector of [4 x i16] containing one of the source operands. The
443/// horizontal differences between the values are stored in the lower bits of
444/// the destination.
445/// \param __b
446/// A 64-bit vector of [4 x i16] containing one of the source operands. The
447/// horizontal differences between the values are stored in the upper bits of
448/// the destination.
449/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated
450/// differences of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800451static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800452_mm_hsubs_pi16(__m64 __a, __m64 __b)
453{
454 return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b);
455}
456
Logan Chien55afb0a2018-10-15 10:42:14 +0800457/// Multiplies corresponding pairs of packed 8-bit unsigned integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800458/// values contained in the first source operand and packed 8-bit signed
459/// integer values contained in the second source operand, adds pairs of
460/// contiguous products with signed saturation, and writes the 16-bit sums to
Logan Chien55afb0a2018-10-15 10:42:14 +0800461/// the corresponding bits in the destination.
462///
463/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
464/// both operands are multiplied, and the sum of both results is written to
465/// bits [15:0] of the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800466///
467/// \headerfile <x86intrin.h>
468///
469/// This intrinsic corresponds to the \c VPMADDUBSW instruction.
470///
471/// \param __a
472/// A 128-bit integer vector containing the first source operand.
473/// \param __b
474/// A 128-bit integer vector containing the second source operand.
475/// \returns A 128-bit integer vector containing the sums of products of both
Logan Chien55afb0a2018-10-15 10:42:14 +0800476/// operands: \n
477/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
478/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
479/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
480/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n
481/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n
482/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n
483/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n
484/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15)
Logan Chien2833ffb2018-10-09 10:03:24 +0800485static __inline__ __m128i __DEFAULT_FN_ATTRS
486_mm_maddubs_epi16(__m128i __a, __m128i __b)
487{
488 return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b);
489}
490
Logan Chien55afb0a2018-10-15 10:42:14 +0800491/// Multiplies corresponding pairs of packed 8-bit unsigned integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800492/// values contained in the first source operand and packed 8-bit signed
493/// integer values contained in the second source operand, adds pairs of
494/// contiguous products with signed saturation, and writes the 16-bit sums to
Logan Chien55afb0a2018-10-15 10:42:14 +0800495/// the corresponding bits in the destination.
496///
497/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of
498/// both operands are multiplied, and the sum of both results is written to
499/// bits [15:0] of the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800500///
501/// \headerfile <x86intrin.h>
502///
503/// This intrinsic corresponds to the \c PMADDUBSW instruction.
504///
505/// \param __a
506/// A 64-bit integer vector containing the first source operand.
507/// \param __b
508/// A 64-bit integer vector containing the second source operand.
509/// \returns A 64-bit integer vector containing the sums of products of both
Logan Chien55afb0a2018-10-15 10:42:14 +0800510/// operands: \n
511/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n
512/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n
513/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n
514/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7)
515static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800516_mm_maddubs_pi16(__m64 __a, __m64 __b)
517{
518 return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b);
519}
520
Logan Chien55afb0a2018-10-15 10:42:14 +0800521/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800522/// products to the 18 most significant bits by right-shifting, rounds the
523/// truncated value by adding 1, and writes bits [16:1] to the destination.
524///
525/// \headerfile <x86intrin.h>
526///
527/// This intrinsic corresponds to the \c VPMULHRSW instruction.
528///
529/// \param __a
530/// A 128-bit vector of [8 x i16] containing one of the source operands.
531/// \param __b
532/// A 128-bit vector of [8 x i16] containing one of the source operands.
533/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled
534/// products of both operands.
535static __inline__ __m128i __DEFAULT_FN_ATTRS
536_mm_mulhrs_epi16(__m128i __a, __m128i __b)
537{
538 return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b);
539}
540
Logan Chien55afb0a2018-10-15 10:42:14 +0800541/// Multiplies packed 16-bit signed integer values, truncates the 32-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800542/// products to the 18 most significant bits by right-shifting, rounds the
543/// truncated value by adding 1, and writes bits [16:1] to the destination.
544///
545/// \headerfile <x86intrin.h>
546///
547/// This intrinsic corresponds to the \c PMULHRSW instruction.
548///
549/// \param __a
550/// A 64-bit vector of [4 x i16] containing one of the source operands.
551/// \param __b
552/// A 64-bit vector of [4 x i16] containing one of the source operands.
553/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled
554/// products of both operands.
Logan Chien55afb0a2018-10-15 10:42:14 +0800555static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800556_mm_mulhrs_pi16(__m64 __a, __m64 __b)
557{
558 return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b);
559}
560
Logan Chien55afb0a2018-10-15 10:42:14 +0800561/// Copies the 8-bit integers from a 128-bit integer vector to the
Logan Chien2833ffb2018-10-09 10:03:24 +0800562/// destination or clears 8-bit values in the destination, as specified by
563/// the second source operand.
564///
565/// \headerfile <x86intrin.h>
566///
567/// This intrinsic corresponds to the \c VPSHUFB instruction.
568///
569/// \param __a
570/// A 128-bit integer vector containing the values to be copied.
571/// \param __b
572/// A 128-bit integer vector containing control bytes corresponding to
573/// positions in the destination:
Logan Chien55afb0a2018-10-15 10:42:14 +0800574/// Bit 7: \n
575/// 1: Clear the corresponding byte in the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800576/// 0: Copy the selected source byte to the corresponding byte in the
Logan Chien55afb0a2018-10-15 10:42:14 +0800577/// destination. \n
578/// Bits [6:4] Reserved. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800579/// Bits [3:0] select the source byte to be copied.
580/// \returns A 128-bit integer vector containing the copied or cleared values.
581static __inline__ __m128i __DEFAULT_FN_ATTRS
582_mm_shuffle_epi8(__m128i __a, __m128i __b)
583{
584 return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b);
585}
586
Logan Chien55afb0a2018-10-15 10:42:14 +0800587/// Copies the 8-bit integers from a 64-bit integer vector to the
Logan Chien2833ffb2018-10-09 10:03:24 +0800588/// destination or clears 8-bit values in the destination, as specified by
589/// the second source operand.
590///
591/// \headerfile <x86intrin.h>
592///
593/// This intrinsic corresponds to the \c PSHUFB instruction.
594///
595/// \param __a
596/// A 64-bit integer vector containing the values to be copied.
597/// \param __b
598/// A 64-bit integer vector containing control bytes corresponding to
599/// positions in the destination:
Logan Chien55afb0a2018-10-15 10:42:14 +0800600/// Bit 7: \n
601/// 1: Clear the corresponding byte in the destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800602/// 0: Copy the selected source byte to the corresponding byte in the
Logan Chien55afb0a2018-10-15 10:42:14 +0800603/// destination. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800604/// Bits [3:0] select the source byte to be copied.
605/// \returns A 64-bit integer vector containing the copied or cleared values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800606static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800607_mm_shuffle_pi8(__m64 __a, __m64 __b)
608{
609 return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b);
610}
611
Logan Chien55afb0a2018-10-15 10:42:14 +0800612/// For each 8-bit integer in the first source operand, perform one of
613/// the following actions as specified by the second source operand.
614///
615/// If the byte in the second source is negative, calculate the two's
616/// complement of the corresponding byte in the first source, and write that
617/// value to the destination. If the byte in the second source is positive,
618/// copy the corresponding byte from the first source to the destination. If
619/// the byte in the second source is zero, clear the corresponding byte in
620/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800621///
622/// \headerfile <x86intrin.h>
623///
624/// This intrinsic corresponds to the \c VPSIGNB instruction.
625///
626/// \param __a
627/// A 128-bit integer vector containing the values to be copied.
628/// \param __b
629/// A 128-bit integer vector containing control bytes corresponding to
630/// positions in the destination.
631/// \returns A 128-bit integer vector containing the resultant values.
632static __inline__ __m128i __DEFAULT_FN_ATTRS
633_mm_sign_epi8(__m128i __a, __m128i __b)
634{
635 return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b);
636}
637
Logan Chien55afb0a2018-10-15 10:42:14 +0800638/// For each 16-bit integer in the first source operand, perform one of
639/// the following actions as specified by the second source operand.
640///
641/// If the word in the second source is negative, calculate the two's
642/// complement of the corresponding word in the first source, and write that
643/// value to the destination. If the word in the second source is positive,
644/// copy the corresponding word from the first source to the destination. If
645/// the word in the second source is zero, clear the corresponding word in
646/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800647///
648/// \headerfile <x86intrin.h>
649///
650/// This intrinsic corresponds to the \c VPSIGNW instruction.
651///
652/// \param __a
653/// A 128-bit integer vector containing the values to be copied.
654/// \param __b
655/// A 128-bit integer vector containing control words corresponding to
656/// positions in the destination.
657/// \returns A 128-bit integer vector containing the resultant values.
658static __inline__ __m128i __DEFAULT_FN_ATTRS
659_mm_sign_epi16(__m128i __a, __m128i __b)
660{
661 return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b);
662}
663
Logan Chien55afb0a2018-10-15 10:42:14 +0800664/// For each 32-bit integer in the first source operand, perform one of
665/// the following actions as specified by the second source operand.
666///
667/// If the doubleword in the second source is negative, calculate the two's
Logan Chien2833ffb2018-10-09 10:03:24 +0800668/// complement of the corresponding word in the first source, and write that
669/// value to the destination. If the doubleword in the second source is
670/// positive, copy the corresponding word from the first source to the
671/// destination. If the doubleword in the second source is zero, clear the
672/// corresponding word in the destination.
673///
674/// \headerfile <x86intrin.h>
675///
676/// This intrinsic corresponds to the \c VPSIGND instruction.
677///
678/// \param __a
679/// A 128-bit integer vector containing the values to be copied.
680/// \param __b
681/// A 128-bit integer vector containing control doublewords corresponding to
682/// positions in the destination.
683/// \returns A 128-bit integer vector containing the resultant values.
684static __inline__ __m128i __DEFAULT_FN_ATTRS
685_mm_sign_epi32(__m128i __a, __m128i __b)
686{
687 return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b);
688}
689
Logan Chien55afb0a2018-10-15 10:42:14 +0800690/// For each 8-bit integer in the first source operand, perform one of
691/// the following actions as specified by the second source operand.
692///
693/// If the byte in the second source is negative, calculate the two's
694/// complement of the corresponding byte in the first source, and write that
695/// value to the destination. If the byte in the second source is positive,
696/// copy the corresponding byte from the first source to the destination. If
697/// the byte in the second source is zero, clear the corresponding byte in
698/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800699///
700/// \headerfile <x86intrin.h>
701///
702/// This intrinsic corresponds to the \c PSIGNB instruction.
703///
704/// \param __a
705/// A 64-bit integer vector containing the values to be copied.
706/// \param __b
707/// A 64-bit integer vector containing control bytes corresponding to
708/// positions in the destination.
709/// \returns A 64-bit integer vector containing the resultant values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800710static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800711_mm_sign_pi8(__m64 __a, __m64 __b)
712{
713 return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b);
714}
715
Logan Chien55afb0a2018-10-15 10:42:14 +0800716/// For each 16-bit integer in the first source operand, perform one of
717/// the following actions as specified by the second source operand.
718///
719/// If the word in the second source is negative, calculate the two's
720/// complement of the corresponding word in the first source, and write that
721/// value to the destination. If the word in the second source is positive,
722/// copy the corresponding word from the first source to the destination. If
723/// the word in the second source is zero, clear the corresponding word in
724/// the destination.
Logan Chien2833ffb2018-10-09 10:03:24 +0800725///
726/// \headerfile <x86intrin.h>
727///
728/// This intrinsic corresponds to the \c PSIGNW instruction.
729///
730/// \param __a
731/// A 64-bit integer vector containing the values to be copied.
732/// \param __b
733/// A 64-bit integer vector containing control words corresponding to
734/// positions in the destination.
735/// \returns A 64-bit integer vector containing the resultant values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800736static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800737_mm_sign_pi16(__m64 __a, __m64 __b)
738{
739 return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b);
740}
741
Logan Chien55afb0a2018-10-15 10:42:14 +0800742/// For each 32-bit integer in the first source operand, perform one of
743/// the following actions as specified by the second source operand.
744///
745/// If the doubleword in the second source is negative, calculate the two's
Logan Chien2833ffb2018-10-09 10:03:24 +0800746/// complement of the corresponding doubleword in the first source, and
747/// write that value to the destination. If the doubleword in the second
748/// source is positive, copy the corresponding doubleword from the first
749/// source to the destination. If the doubleword in the second source is
750/// zero, clear the corresponding doubleword in the destination.
751///
752/// \headerfile <x86intrin.h>
753///
754/// This intrinsic corresponds to the \c PSIGND instruction.
755///
756/// \param __a
757/// A 64-bit integer vector containing the values to be copied.
758/// \param __b
759/// A 64-bit integer vector containing two control doublewords corresponding
760/// to positions in the destination.
761/// \returns A 64-bit integer vector containing the resultant values.
Logan Chien55afb0a2018-10-15 10:42:14 +0800762static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX
Logan Chien2833ffb2018-10-09 10:03:24 +0800763_mm_sign_pi32(__m64 __a, __m64 __b)
764{
765 return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b);
766}
767
/* Retire the attribute helper macros defined at the top of this file so they
   do not remain visible to code that includes this header. */
#undef __DEFAULT_FN_ATTRS_MMX
#undef __DEFAULT_FN_ATTRS
Logan Chien2833ffb2018-10-09 10:03:24 +0800770
771#endif /* __TMMINTRIN_H */