/*===---- mmintrin.h - MMX intrinsics --------------------------------------===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __MMINTRIN_H
#define __MMINTRIN_H

/* Public MMX vector type: 8 bytes wide and 8-byte aligned. */
typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8)));

/* Internal 8-byte vector types with long long, int, short and char elements.
 * The intrinsics in this file cast an __m64 to one of these before handing it
 * to a builtin. */
typedef long long __v1di __attribute__((__vector_size__(8)));
typedef int __v2si __attribute__((__vector_size__(8)));
typedef short __v4hi __attribute__((__vector_size__(8)));
typedef char __v8qi __attribute__((__vector_size__(8)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64)))

/// Clears the MMX state by setting the state of the x87 stack registers
///    to empty.
///
/// \headerfile <x86intrin.h>
///
/// This intrinsic corresponds to the <c> EMMS </c> instruction.
///
static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx")))
_mm_empty(void)
{
    __builtin_ia32_emms();
}

Logan Chien55afb0a2018-10-15 10:42:14 +080036/// Constructs a 64-bit integer vector, setting the lower 32 bits to the
Logan Chien2833ffb2018-10-09 10:03:24 +080037/// value of the 32-bit integer parameter and setting the upper 32 bits to 0.
38///
39/// \headerfile <x86intrin.h>
40///
Logan Chien55afb0a2018-10-15 10:42:14 +080041/// This intrinsic corresponds to the <c> MOVD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080042///
43/// \param __i
44/// A 32-bit integer value.
45/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the
46/// parameter. The upper 32 bits are set to 0.
47static __inline__ __m64 __DEFAULT_FN_ATTRS
48_mm_cvtsi32_si64(int __i)
49{
50 return (__m64)__builtin_ia32_vec_init_v2si(__i, 0);
51}
52
Logan Chien55afb0a2018-10-15 10:42:14 +080053/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit
Logan Chien2833ffb2018-10-09 10:03:24 +080054/// signed integer.
55///
56/// \headerfile <x86intrin.h>
57///
Logan Chien55afb0a2018-10-15 10:42:14 +080058/// This intrinsic corresponds to the <c> MOVD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080059///
60/// \param __m
61/// A 64-bit integer vector.
62/// \returns A 32-bit signed integer value containing the lower 32 bits of the
63/// parameter.
64static __inline__ int __DEFAULT_FN_ATTRS
65_mm_cvtsi64_si32(__m64 __m)
66{
67 return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0);
68}
69
Logan Chien55afb0a2018-10-15 10:42:14 +080070/// Casts a 64-bit signed integer value into a 64-bit integer vector.
Logan Chien2833ffb2018-10-09 10:03:24 +080071///
72/// \headerfile <x86intrin.h>
73///
Logan Chien55afb0a2018-10-15 10:42:14 +080074/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080075///
76/// \param __i
77/// A 64-bit signed integer.
78/// \returns A 64-bit integer vector containing the same bitwise pattern as the
79/// parameter.
80static __inline__ __m64 __DEFAULT_FN_ATTRS
81_mm_cvtsi64_m64(long long __i)
82{
83 return (__m64)__i;
84}
85
Logan Chien55afb0a2018-10-15 10:42:14 +080086/// Casts a 64-bit integer vector into a 64-bit signed integer value.
Logan Chien2833ffb2018-10-09 10:03:24 +080087///
88/// \headerfile <x86intrin.h>
89///
Logan Chien55afb0a2018-10-15 10:42:14 +080090/// This intrinsic corresponds to the <c> MOVQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +080091///
92/// \param __m
93/// A 64-bit integer vector.
94/// \returns A 64-bit signed integer containing the same bitwise pattern as the
95/// parameter.
96static __inline__ long long __DEFAULT_FN_ATTRS
97_mm_cvtm64_si64(__m64 __m)
98{
99 return (long long)__m;
100}
101
Logan Chien55afb0a2018-10-15 10:42:14 +0800102/// Converts 16-bit signed integers from both 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800103/// parameters of [4 x i16] into 8-bit signed integer values, and constructs
104/// a 64-bit integer vector of [8 x i8] as the result. Positive values
105/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80
106/// are saturated to 0x80.
107///
108/// \headerfile <x86intrin.h>
109///
Logan Chien55afb0a2018-10-15 10:42:14 +0800110/// This intrinsic corresponds to the <c> PACKSSWB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800111///
112/// \param __m1
113/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
114/// 16-bit signed integer and is converted to an 8-bit signed integer with
115/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
116/// Negative values less than 0x80 are saturated to 0x80. The converted
117/// [4 x i8] values are written to the lower 32 bits of the result.
118/// \param __m2
119/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
120/// 16-bit signed integer and is converted to an 8-bit signed integer with
121/// saturation. Positive values greater than 0x7F are saturated to 0x7F.
122/// Negative values less than 0x80 are saturated to 0x80. The converted
123/// [4 x i8] values are written to the upper 32 bits of the result.
124/// \returns A 64-bit integer vector of [8 x i8] containing the converted
125/// values.
126static __inline__ __m64 __DEFAULT_FN_ATTRS
127_mm_packs_pi16(__m64 __m1, __m64 __m2)
128{
129 return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2);
130}
131
Logan Chien55afb0a2018-10-15 10:42:14 +0800132/// Converts 32-bit signed integers from both 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800133/// parameters of [2 x i32] into 16-bit signed integer values, and constructs
134/// a 64-bit integer vector of [4 x i16] as the result. Positive values
135/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than
136/// 0x8000 are saturated to 0x8000.
137///
138/// \headerfile <x86intrin.h>
139///
Logan Chien55afb0a2018-10-15 10:42:14 +0800140/// This intrinsic corresponds to the <c> PACKSSDW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800141///
142/// \param __m1
143/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
144/// 32-bit signed integer and is converted to a 16-bit signed integer with
145/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
146/// Negative values less than 0x8000 are saturated to 0x8000. The converted
147/// [2 x i16] values are written to the lower 32 bits of the result.
148/// \param __m2
149/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a
150/// 32-bit signed integer and is converted to a 16-bit signed integer with
151/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF.
152/// Negative values less than 0x8000 are saturated to 0x8000. The converted
153/// [2 x i16] values are written to the upper 32 bits of the result.
154/// \returns A 64-bit integer vector of [4 x i16] containing the converted
155/// values.
156static __inline__ __m64 __DEFAULT_FN_ATTRS
157_mm_packs_pi32(__m64 __m1, __m64 __m2)
158{
159 return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2);
160}
161
Logan Chien55afb0a2018-10-15 10:42:14 +0800162/// Converts 16-bit signed integers from both 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800163/// parameters of [4 x i16] into 8-bit unsigned integer values, and
164/// constructs a 64-bit integer vector of [8 x i8] as the result. Values
165/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated
166/// to 0.
167///
168/// \headerfile <x86intrin.h>
169///
Logan Chien55afb0a2018-10-15 10:42:14 +0800170/// This intrinsic corresponds to the <c> PACKUSWB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800171///
172/// \param __m1
173/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
174/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
175/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
176/// than 0 are saturated to 0. The converted [4 x i8] values are written to
177/// the lower 32 bits of the result.
178/// \param __m2
179/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a
180/// 16-bit signed integer and is converted to an 8-bit unsigned integer with
181/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less
182/// than 0 are saturated to 0. The converted [4 x i8] values are written to
183/// the upper 32 bits of the result.
184/// \returns A 64-bit integer vector of [8 x i8] containing the converted
185/// values.
186static __inline__ __m64 __DEFAULT_FN_ATTRS
187_mm_packs_pu16(__m64 __m1, __m64 __m2)
188{
189 return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2);
190}
191
Logan Chien55afb0a2018-10-15 10:42:14 +0800192/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8]
Logan Chien2833ffb2018-10-09 10:03:24 +0800193/// and interleaves them into a 64-bit integer vector of [8 x i8].
194///
195/// \headerfile <x86intrin.h>
196///
Logan Chien55afb0a2018-10-15 10:42:14 +0800197/// This intrinsic corresponds to the <c> PUNPCKHBW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800198///
199/// \param __m1
Logan Chien55afb0a2018-10-15 10:42:14 +0800200/// A 64-bit integer vector of [8 x i8]. \n
201/// Bits [39:32] are written to bits [7:0] of the result. \n
202/// Bits [47:40] are written to bits [23:16] of the result. \n
203/// Bits [55:48] are written to bits [39:32] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800204/// Bits [63:56] are written to bits [55:48] of the result.
205/// \param __m2
206/// A 64-bit integer vector of [8 x i8].
Logan Chien55afb0a2018-10-15 10:42:14 +0800207/// Bits [39:32] are written to bits [15:8] of the result. \n
208/// Bits [47:40] are written to bits [31:24] of the result. \n
209/// Bits [55:48] are written to bits [47:40] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800210/// Bits [63:56] are written to bits [63:56] of the result.
211/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
212/// values.
213static __inline__ __m64 __DEFAULT_FN_ATTRS
214_mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
215{
216 return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2);
217}
218
Logan Chien55afb0a2018-10-15 10:42:14 +0800219/// Unpacks the upper 32 bits from two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +0800220/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
221///
222/// \headerfile <x86intrin.h>
223///
Logan Chien55afb0a2018-10-15 10:42:14 +0800224/// This intrinsic corresponds to the <c> PUNPCKHWD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800225///
226/// \param __m1
227/// A 64-bit integer vector of [4 x i16].
Logan Chien55afb0a2018-10-15 10:42:14 +0800228/// Bits [47:32] are written to bits [15:0] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800229/// Bits [63:48] are written to bits [47:32] of the result.
230/// \param __m2
231/// A 64-bit integer vector of [4 x i16].
Logan Chien55afb0a2018-10-15 10:42:14 +0800232/// Bits [47:32] are written to bits [31:16] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800233/// Bits [63:48] are written to bits [63:48] of the result.
234/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
235/// values.
236static __inline__ __m64 __DEFAULT_FN_ATTRS
237_mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
238{
239 return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2);
240}
241
Logan Chien55afb0a2018-10-15 10:42:14 +0800242/// Unpacks the upper 32 bits from two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +0800243/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
244///
245/// \headerfile <x86intrin.h>
246///
Logan Chien55afb0a2018-10-15 10:42:14 +0800247/// This intrinsic corresponds to the <c> PUNPCKHDQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800248///
249/// \param __m1
250/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
251/// the lower 32 bits of the result.
252/// \param __m2
253/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to
254/// the upper 32 bits of the result.
255/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
256/// values.
257static __inline__ __m64 __DEFAULT_FN_ATTRS
258_mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
259{
260 return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2);
261}
262
Logan Chien55afb0a2018-10-15 10:42:14 +0800263/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8]
Logan Chien2833ffb2018-10-09 10:03:24 +0800264/// and interleaves them into a 64-bit integer vector of [8 x i8].
265///
266/// \headerfile <x86intrin.h>
267///
Logan Chien55afb0a2018-10-15 10:42:14 +0800268/// This intrinsic corresponds to the <c> PUNPCKLBW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800269///
270/// \param __m1
271/// A 64-bit integer vector of [8 x i8].
Logan Chien55afb0a2018-10-15 10:42:14 +0800272/// Bits [7:0] are written to bits [7:0] of the result. \n
273/// Bits [15:8] are written to bits [23:16] of the result. \n
274/// Bits [23:16] are written to bits [39:32] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800275/// Bits [31:24] are written to bits [55:48] of the result.
276/// \param __m2
277/// A 64-bit integer vector of [8 x i8].
Logan Chien55afb0a2018-10-15 10:42:14 +0800278/// Bits [7:0] are written to bits [15:8] of the result. \n
279/// Bits [15:8] are written to bits [31:24] of the result. \n
280/// Bits [23:16] are written to bits [47:40] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800281/// Bits [31:24] are written to bits [63:56] of the result.
282/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved
283/// values.
284static __inline__ __m64 __DEFAULT_FN_ATTRS
285_mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
286{
287 return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2);
288}
289
Logan Chien55afb0a2018-10-15 10:42:14 +0800290/// Unpacks the lower 32 bits from two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +0800291/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16].
292///
293/// \headerfile <x86intrin.h>
294///
Logan Chien55afb0a2018-10-15 10:42:14 +0800295/// This intrinsic corresponds to the <c> PUNPCKLWD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800296///
297/// \param __m1
298/// A 64-bit integer vector of [4 x i16].
Logan Chien55afb0a2018-10-15 10:42:14 +0800299/// Bits [15:0] are written to bits [15:0] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800300/// Bits [31:16] are written to bits [47:32] of the result.
301/// \param __m2
302/// A 64-bit integer vector of [4 x i16].
Logan Chien55afb0a2018-10-15 10:42:14 +0800303/// Bits [15:0] are written to bits [31:16] of the result. \n
Logan Chien2833ffb2018-10-09 10:03:24 +0800304/// Bits [31:16] are written to bits [63:48] of the result.
305/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved
306/// values.
307static __inline__ __m64 __DEFAULT_FN_ATTRS
308_mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
309{
310 return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2);
311}
312
Logan Chien55afb0a2018-10-15 10:42:14 +0800313/// Unpacks the lower 32 bits from two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +0800314/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32].
315///
316/// \headerfile <x86intrin.h>
317///
Logan Chien55afb0a2018-10-15 10:42:14 +0800318/// This intrinsic corresponds to the <c> PUNPCKLDQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800319///
320/// \param __m1
321/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
322/// the lower 32 bits of the result.
323/// \param __m2
324/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to
325/// the upper 32 bits of the result.
326/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved
327/// values.
328static __inline__ __m64 __DEFAULT_FN_ATTRS
329_mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
330{
331 return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2);
332}
333
Logan Chien55afb0a2018-10-15 10:42:14 +0800334/// Adds each 8-bit integer element of the first 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800335/// of [8 x i8] to the corresponding 8-bit integer element of the second
336/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are
337/// packed into a 64-bit integer vector of [8 x i8].
338///
339/// \headerfile <x86intrin.h>
340///
Logan Chien55afb0a2018-10-15 10:42:14 +0800341/// This intrinsic corresponds to the <c> PADDB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800342///
343/// \param __m1
344/// A 64-bit integer vector of [8 x i8].
345/// \param __m2
346/// A 64-bit integer vector of [8 x i8].
347/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both
348/// parameters.
349static __inline__ __m64 __DEFAULT_FN_ATTRS
350_mm_add_pi8(__m64 __m1, __m64 __m2)
351{
352 return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2);
353}
354
Logan Chien55afb0a2018-10-15 10:42:14 +0800355/// Adds each 16-bit integer element of the first 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800356/// of [4 x i16] to the corresponding 16-bit integer element of the second
357/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are
358/// packed into a 64-bit integer vector of [4 x i16].
359///
360/// \headerfile <x86intrin.h>
361///
Logan Chien55afb0a2018-10-15 10:42:14 +0800362/// This intrinsic corresponds to the <c> PADDW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800363///
364/// \param __m1
365/// A 64-bit integer vector of [4 x i16].
366/// \param __m2
367/// A 64-bit integer vector of [4 x i16].
368/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both
369/// parameters.
370static __inline__ __m64 __DEFAULT_FN_ATTRS
371_mm_add_pi16(__m64 __m1, __m64 __m2)
372{
373 return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2);
374}
375
Logan Chien55afb0a2018-10-15 10:42:14 +0800376/// Adds each 32-bit integer element of the first 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800377/// of [2 x i32] to the corresponding 32-bit integer element of the second
378/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are
379/// packed into a 64-bit integer vector of [2 x i32].
380///
381/// \headerfile <x86intrin.h>
382///
Logan Chien55afb0a2018-10-15 10:42:14 +0800383/// This intrinsic corresponds to the <c> PADDD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800384///
385/// \param __m1
386/// A 64-bit integer vector of [2 x i32].
387/// \param __m2
388/// A 64-bit integer vector of [2 x i32].
389/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both
390/// parameters.
391static __inline__ __m64 __DEFAULT_FN_ATTRS
392_mm_add_pi32(__m64 __m1, __m64 __m2)
393{
394 return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2);
395}
396
Logan Chien55afb0a2018-10-15 10:42:14 +0800397/// Adds each 8-bit signed integer element of the first 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800398/// vector of [8 x i8] to the corresponding 8-bit signed integer element of
399/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than
400/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to
401/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8].
402///
403/// \headerfile <x86intrin.h>
404///
Logan Chien55afb0a2018-10-15 10:42:14 +0800405/// This intrinsic corresponds to the <c> PADDSB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800406///
407/// \param __m1
408/// A 64-bit integer vector of [8 x i8].
409/// \param __m2
410/// A 64-bit integer vector of [8 x i8].
411/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums
412/// of both parameters.
413static __inline__ __m64 __DEFAULT_FN_ATTRS
414_mm_adds_pi8(__m64 __m1, __m64 __m2)
415{
416 return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2);
417}
418
Logan Chien55afb0a2018-10-15 10:42:14 +0800419/// Adds each 16-bit signed integer element of the first 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800420/// vector of [4 x i16] to the corresponding 16-bit signed integer element of
421/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than
422/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are
423/// saturated to 0x8000. The results are packed into a 64-bit integer vector
424/// of [4 x i16].
425///
426/// \headerfile <x86intrin.h>
427///
Logan Chien55afb0a2018-10-15 10:42:14 +0800428/// This intrinsic corresponds to the <c> PADDSW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800429///
430/// \param __m1
431/// A 64-bit integer vector of [4 x i16].
432/// \param __m2
433/// A 64-bit integer vector of [4 x i16].
434/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums
435/// of both parameters.
436static __inline__ __m64 __DEFAULT_FN_ATTRS
437_mm_adds_pi16(__m64 __m1, __m64 __m2)
438{
439 return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2);
440}
441
Logan Chien55afb0a2018-10-15 10:42:14 +0800442/// Adds each 8-bit unsigned integer element of the first 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800443/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of
444/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are
445/// saturated to 0xFF. The results are packed into a 64-bit integer vector of
446/// [8 x i8].
447///
448/// \headerfile <x86intrin.h>
449///
Logan Chien55afb0a2018-10-15 10:42:14 +0800450/// This intrinsic corresponds to the <c> PADDUSB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800451///
452/// \param __m1
453/// A 64-bit integer vector of [8 x i8].
454/// \param __m2
455/// A 64-bit integer vector of [8 x i8].
456/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
457/// unsigned sums of both parameters.
458static __inline__ __m64 __DEFAULT_FN_ATTRS
459_mm_adds_pu8(__m64 __m1, __m64 __m2)
460{
461 return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2);
462}
463
Logan Chien55afb0a2018-10-15 10:42:14 +0800464/// Adds each 16-bit unsigned integer element of the first 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800465/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element
466/// of the second 64-bit integer vector of [4 x i16]. Sums greater than
467/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit
468/// integer vector of [4 x i16].
469///
470/// \headerfile <x86intrin.h>
471///
Logan Chien55afb0a2018-10-15 10:42:14 +0800472/// This intrinsic corresponds to the <c> PADDUSW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800473///
474/// \param __m1
475/// A 64-bit integer vector of [4 x i16].
476/// \param __m2
477/// A 64-bit integer vector of [4 x i16].
478/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
479/// unsigned sums of both parameters.
480static __inline__ __m64 __DEFAULT_FN_ATTRS
481_mm_adds_pu16(__m64 __m1, __m64 __m2)
482{
483 return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2);
484}
485
Logan Chien55afb0a2018-10-15 10:42:14 +0800486/// Subtracts each 8-bit integer element of the second 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800487/// vector of [8 x i8] from the corresponding 8-bit integer element of the
488/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results
489/// are packed into a 64-bit integer vector of [8 x i8].
490///
491/// \headerfile <x86intrin.h>
492///
Logan Chien55afb0a2018-10-15 10:42:14 +0800493/// This intrinsic corresponds to the <c> PSUBB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800494///
495/// \param __m1
496/// A 64-bit integer vector of [8 x i8] containing the minuends.
497/// \param __m2
498/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
499/// \returns A 64-bit integer vector of [8 x i8] containing the differences of
500/// both parameters.
501static __inline__ __m64 __DEFAULT_FN_ATTRS
502_mm_sub_pi8(__m64 __m1, __m64 __m2)
503{
504 return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2);
505}
506
Logan Chien55afb0a2018-10-15 10:42:14 +0800507/// Subtracts each 16-bit integer element of the second 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800508/// vector of [4 x i16] from the corresponding 16-bit integer element of the
509/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the
510/// results are packed into a 64-bit integer vector of [4 x i16].
511///
512/// \headerfile <x86intrin.h>
513///
Logan Chien55afb0a2018-10-15 10:42:14 +0800514/// This intrinsic corresponds to the <c> PSUBW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800515///
516/// \param __m1
517/// A 64-bit integer vector of [4 x i16] containing the minuends.
518/// \param __m2
519/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
520/// \returns A 64-bit integer vector of [4 x i16] containing the differences of
521/// both parameters.
522static __inline__ __m64 __DEFAULT_FN_ATTRS
523_mm_sub_pi16(__m64 __m1, __m64 __m2)
524{
525 return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2);
526}
527
Logan Chien55afb0a2018-10-15 10:42:14 +0800528/// Subtracts each 32-bit integer element of the second 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800529/// vector of [2 x i32] from the corresponding 32-bit integer element of the
530/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the
531/// results are packed into a 64-bit integer vector of [2 x i32].
532///
533/// \headerfile <x86intrin.h>
534///
Logan Chien55afb0a2018-10-15 10:42:14 +0800535/// This intrinsic corresponds to the <c> PSUBD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800536///
537/// \param __m1
538/// A 64-bit integer vector of [2 x i32] containing the minuends.
539/// \param __m2
540/// A 64-bit integer vector of [2 x i32] containing the subtrahends.
541/// \returns A 64-bit integer vector of [2 x i32] containing the differences of
542/// both parameters.
543static __inline__ __m64 __DEFAULT_FN_ATTRS
544_mm_sub_pi32(__m64 __m1, __m64 __m2)
545{
546 return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2);
547}
548
Logan Chien55afb0a2018-10-15 10:42:14 +0800549/// Subtracts each 8-bit signed integer element of the second 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800550/// integer vector of [8 x i8] from the corresponding 8-bit signed integer
551/// element of the first 64-bit integer vector of [8 x i8]. Positive results
552/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80
553/// are saturated to 0x80. The results are packed into a 64-bit integer
554/// vector of [8 x i8].
555///
556/// \headerfile <x86intrin.h>
557///
Logan Chien55afb0a2018-10-15 10:42:14 +0800558/// This intrinsic corresponds to the <c> PSUBSB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800559///
560/// \param __m1
561/// A 64-bit integer vector of [8 x i8] containing the minuends.
562/// \param __m2
563/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
564/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
565/// differences of both parameters.
566static __inline__ __m64 __DEFAULT_FN_ATTRS
567_mm_subs_pi8(__m64 __m1, __m64 __m2)
568{
569 return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2);
570}
571
Logan Chien55afb0a2018-10-15 10:42:14 +0800572/// Subtracts each 16-bit signed integer element of the second 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800573/// integer vector of [4 x i16] from the corresponding 16-bit signed integer
574/// element of the first 64-bit integer vector of [4 x i16]. Positive results
575/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than
576/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit
577/// integer vector of [4 x i16].
578///
579/// \headerfile <x86intrin.h>
580///
Logan Chien55afb0a2018-10-15 10:42:14 +0800581/// This intrinsic corresponds to the <c> PSUBSW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800582///
583/// \param __m1
584/// A 64-bit integer vector of [4 x i16] containing the minuends.
585/// \param __m2
586/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
587/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
588/// differences of both parameters.
589static __inline__ __m64 __DEFAULT_FN_ATTRS
590_mm_subs_pi16(__m64 __m1, __m64 __m2)
591{
592 return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2);
593}
594
Logan Chien55afb0a2018-10-15 10:42:14 +0800595/// Subtracts each 8-bit unsigned integer element of the second 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800596/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer
Logan Chien55afb0a2018-10-15 10:42:14 +0800597/// element of the first 64-bit integer vector of [8 x i8].
598///
599/// If an element of the first vector is less than the corresponding element
600/// of the second vector, the result is saturated to 0. The results are
601/// packed into a 64-bit integer vector of [8 x i8].
Logan Chien2833ffb2018-10-09 10:03:24 +0800602///
603/// \headerfile <x86intrin.h>
604///
Logan Chien55afb0a2018-10-15 10:42:14 +0800605/// This intrinsic corresponds to the <c> PSUBUSB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800606///
607/// \param __m1
608/// A 64-bit integer vector of [8 x i8] containing the minuends.
609/// \param __m2
610/// A 64-bit integer vector of [8 x i8] containing the subtrahends.
611/// \returns A 64-bit integer vector of [8 x i8] containing the saturated
612/// differences of both parameters.
613static __inline__ __m64 __DEFAULT_FN_ATTRS
614_mm_subs_pu8(__m64 __m1, __m64 __m2)
615{
616 return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2);
617}
618
Logan Chien55afb0a2018-10-15 10:42:14 +0800619/// Subtracts each 16-bit unsigned integer element of the second 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800620/// integer vector of [4 x i16] from the corresponding 16-bit unsigned
Logan Chien55afb0a2018-10-15 10:42:14 +0800621/// integer element of the first 64-bit integer vector of [4 x i16].
622///
623/// If an element of the first vector is less than the corresponding element
624/// of the second vector, the result is saturated to 0. The results are
625/// packed into a 64-bit integer vector of [4 x i16].
Logan Chien2833ffb2018-10-09 10:03:24 +0800626///
627/// \headerfile <x86intrin.h>
628///
Logan Chien55afb0a2018-10-15 10:42:14 +0800629/// This intrinsic corresponds to the <c> PSUBUSW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800630///
631/// \param __m1
632/// A 64-bit integer vector of [4 x i16] containing the minuends.
633/// \param __m2
634/// A 64-bit integer vector of [4 x i16] containing the subtrahends.
635/// \returns A 64-bit integer vector of [4 x i16] containing the saturated
636/// differences of both parameters.
637static __inline__ __m64 __DEFAULT_FN_ATTRS
638_mm_subs_pu16(__m64 __m1, __m64 __m2)
639{
640 return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2);
641}
642
Logan Chien55afb0a2018-10-15 10:42:14 +0800643/// Multiplies each 16-bit signed integer element of the first 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800644/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
645/// element of the second 64-bit integer vector of [4 x i16] and get four
646/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums.
647/// The lower 32 bits of these two sums are packed into a 64-bit integer
Logan Chien55afb0a2018-10-15 10:42:14 +0800648/// vector of [2 x i32].
649///
650/// For example, bits [15:0] of both parameters are multiplied, bits [31:16]
651/// of both parameters are multiplied, and the sum of both results is written
652/// to bits [31:0] of the result.
Logan Chien2833ffb2018-10-09 10:03:24 +0800653///
654/// \headerfile <x86intrin.h>
655///
Logan Chien55afb0a2018-10-15 10:42:14 +0800656/// This intrinsic corresponds to the <c> PMADDWD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800657///
658/// \param __m1
659/// A 64-bit integer vector of [4 x i16].
660/// \param __m2
661/// A 64-bit integer vector of [4 x i16].
662/// \returns A 64-bit integer vector of [2 x i32] containing the sums of
663/// products of both parameters.
664static __inline__ __m64 __DEFAULT_FN_ATTRS
665_mm_madd_pi16(__m64 __m1, __m64 __m2)
666{
667 return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2);
668}
669
Logan Chien55afb0a2018-10-15 10:42:14 +0800670/// Multiplies each 16-bit signed integer element of the first 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800671/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
672/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper
673/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
674///
675/// \headerfile <x86intrin.h>
676///
Logan Chien55afb0a2018-10-15 10:42:14 +0800677/// This intrinsic corresponds to the <c> PMULHW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800678///
679/// \param __m1
680/// A 64-bit integer vector of [4 x i16].
681/// \param __m2
682/// A 64-bit integer vector of [4 x i16].
683/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits
684/// of the products of both parameters.
685static __inline__ __m64 __DEFAULT_FN_ATTRS
686_mm_mulhi_pi16(__m64 __m1, __m64 __m2)
687{
688 return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2);
689}
690
Logan Chien55afb0a2018-10-15 10:42:14 +0800691/// Multiplies each 16-bit signed integer element of the first 64-bit
Logan Chien2833ffb2018-10-09 10:03:24 +0800692/// integer vector of [4 x i16] by the corresponding 16-bit signed integer
693/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower
694/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16].
695///
696/// \headerfile <x86intrin.h>
697///
Logan Chien55afb0a2018-10-15 10:42:14 +0800698/// This intrinsic corresponds to the <c> PMULLW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800699///
700/// \param __m1
701/// A 64-bit integer vector of [4 x i16].
702/// \param __m2
703/// A 64-bit integer vector of [4 x i16].
704/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits
705/// of the products of both parameters.
706static __inline__ __m64 __DEFAULT_FN_ATTRS
707_mm_mullo_pi16(__m64 __m1, __m64 __m2)
708{
709 return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2);
710}
711
Logan Chien55afb0a2018-10-15 10:42:14 +0800712/// Left-shifts each 16-bit signed integer element of the first
Logan Chien2833ffb2018-10-09 10:03:24 +0800713/// parameter, which is a 64-bit integer vector of [4 x i16], by the number
714/// of bits specified by the second parameter, which is a 64-bit integer. The
715/// lower 16 bits of the results are packed into a 64-bit integer vector of
716/// [4 x i16].
717///
718/// \headerfile <x86intrin.h>
719///
Logan Chien55afb0a2018-10-15 10:42:14 +0800720/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800721///
722/// \param __m
723/// A 64-bit integer vector of [4 x i16].
724/// \param __count
725/// A 64-bit integer vector interpreted as a single 64-bit integer.
726/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
Logan Chien55afb0a2018-10-15 10:42:14 +0800727/// values. If \a __count is greater or equal to 16, the result is set to all
728/// 0.
Logan Chien2833ffb2018-10-09 10:03:24 +0800729static __inline__ __m64 __DEFAULT_FN_ATTRS
730_mm_sll_pi16(__m64 __m, __m64 __count)
731{
732 return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count);
733}
734
Logan Chien55afb0a2018-10-15 10:42:14 +0800735/// Left-shifts each 16-bit signed integer element of a 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800736/// vector of [4 x i16] by the number of bits specified by a 32-bit integer.
737/// The lower 16 bits of the results are packed into a 64-bit integer vector
738/// of [4 x i16].
739///
740/// \headerfile <x86intrin.h>
741///
Logan Chien55afb0a2018-10-15 10:42:14 +0800742/// This intrinsic corresponds to the <c> PSLLW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800743///
744/// \param __m
745/// A 64-bit integer vector of [4 x i16].
746/// \param __count
747/// A 32-bit integer value.
748/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted
Logan Chien55afb0a2018-10-15 10:42:14 +0800749/// values. If \a __count is greater or equal to 16, the result is set to all
750/// 0.
Logan Chien2833ffb2018-10-09 10:03:24 +0800751static __inline__ __m64 __DEFAULT_FN_ATTRS
752_mm_slli_pi16(__m64 __m, int __count)
753{
754 return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count);
755}
756
Logan Chien55afb0a2018-10-15 10:42:14 +0800757/// Left-shifts each 32-bit signed integer element of the first
Logan Chien2833ffb2018-10-09 10:03:24 +0800758/// parameter, which is a 64-bit integer vector of [2 x i32], by the number
759/// of bits specified by the second parameter, which is a 64-bit integer. The
760/// lower 32 bits of the results are packed into a 64-bit integer vector of
761/// [2 x i32].
762///
763/// \headerfile <x86intrin.h>
764///
Logan Chien55afb0a2018-10-15 10:42:14 +0800765/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800766///
767/// \param __m
768/// A 64-bit integer vector of [2 x i32].
769/// \param __count
770/// A 64-bit integer vector interpreted as a single 64-bit integer.
771/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
Logan Chien55afb0a2018-10-15 10:42:14 +0800772/// values. If \a __count is greater or equal to 32, the result is set to all
773/// 0.
Logan Chien2833ffb2018-10-09 10:03:24 +0800774static __inline__ __m64 __DEFAULT_FN_ATTRS
775_mm_sll_pi32(__m64 __m, __m64 __count)
776{
777 return (__m64)__builtin_ia32_pslld((__v2si)__m, __count);
778}
779
Logan Chien55afb0a2018-10-15 10:42:14 +0800780/// Left-shifts each 32-bit signed integer element of a 64-bit integer
Logan Chien2833ffb2018-10-09 10:03:24 +0800781/// vector of [2 x i32] by the number of bits specified by a 32-bit integer.
782/// The lower 32 bits of the results are packed into a 64-bit integer vector
783/// of [2 x i32].
784///
785/// \headerfile <x86intrin.h>
786///
Logan Chien55afb0a2018-10-15 10:42:14 +0800787/// This intrinsic corresponds to the <c> PSLLD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800788///
789/// \param __m
790/// A 64-bit integer vector of [2 x i32].
791/// \param __count
792/// A 32-bit integer value.
793/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted
Logan Chien55afb0a2018-10-15 10:42:14 +0800794/// values. If \a __count is greater or equal to 32, the result is set to all
795/// 0.
Logan Chien2833ffb2018-10-09 10:03:24 +0800796static __inline__ __m64 __DEFAULT_FN_ATTRS
797_mm_slli_pi32(__m64 __m, int __count)
798{
799 return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count);
800}
801
Logan Chien55afb0a2018-10-15 10:42:14 +0800802/// Left-shifts the first 64-bit integer parameter by the number of bits
Logan Chien2833ffb2018-10-09 10:03:24 +0800803/// specified by the second 64-bit integer parameter. The lower 64 bits of
804/// result are returned.
805///
806/// \headerfile <x86intrin.h>
807///
Logan Chien55afb0a2018-10-15 10:42:14 +0800808/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800809///
810/// \param __m
811/// A 64-bit integer vector interpreted as a single 64-bit integer.
812/// \param __count
813/// A 64-bit integer vector interpreted as a single 64-bit integer.
814/// \returns A 64-bit integer vector containing the left-shifted value. If
Logan Chien55afb0a2018-10-15 10:42:14 +0800815/// \a __count is greater or equal to 64, the result is set to 0.
Logan Chien2833ffb2018-10-09 10:03:24 +0800816static __inline__ __m64 __DEFAULT_FN_ATTRS
817_mm_sll_si64(__m64 __m, __m64 __count)
818{
819 return (__m64)__builtin_ia32_psllq((__v1di)__m, __count);
820}
821
Logan Chien55afb0a2018-10-15 10:42:14 +0800822/// Left-shifts the first parameter, which is a 64-bit integer, by the
Logan Chien2833ffb2018-10-09 10:03:24 +0800823/// number of bits specified by the second parameter, which is a 32-bit
824/// integer. The lower 64 bits of result are returned.
825///
826/// \headerfile <x86intrin.h>
827///
Logan Chien55afb0a2018-10-15 10:42:14 +0800828/// This intrinsic corresponds to the <c> PSLLQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800829///
830/// \param __m
831/// A 64-bit integer vector interpreted as a single 64-bit integer.
832/// \param __count
833/// A 32-bit integer value.
834/// \returns A 64-bit integer vector containing the left-shifted value. If
Logan Chien55afb0a2018-10-15 10:42:14 +0800835/// \a __count is greater or equal to 64, the result is set to 0.
Logan Chien2833ffb2018-10-09 10:03:24 +0800836static __inline__ __m64 __DEFAULT_FN_ATTRS
837_mm_slli_si64(__m64 __m, int __count)
838{
839 return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count);
840}
841
Logan Chien55afb0a2018-10-15 10:42:14 +0800842/// Right-shifts each 16-bit integer element of the first parameter,
Logan Chien2833ffb2018-10-09 10:03:24 +0800843/// which is a 64-bit integer vector of [4 x i16], by the number of bits
Logan Chien55afb0a2018-10-15 10:42:14 +0800844/// specified by the second parameter, which is a 64-bit integer.
845///
846/// High-order bits are filled with the sign bit of the initial value of each
847/// 16-bit element. The 16-bit results are packed into a 64-bit integer
848/// vector of [4 x i16].
Logan Chien2833ffb2018-10-09 10:03:24 +0800849///
850/// \headerfile <x86intrin.h>
851///
Logan Chien55afb0a2018-10-15 10:42:14 +0800852/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800853///
854/// \param __m
855/// A 64-bit integer vector of [4 x i16].
856/// \param __count
857/// A 64-bit integer vector interpreted as a single 64-bit integer.
858/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
859/// values.
860static __inline__ __m64 __DEFAULT_FN_ATTRS
861_mm_sra_pi16(__m64 __m, __m64 __count)
862{
863 return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count);
864}
865
Logan Chien55afb0a2018-10-15 10:42:14 +0800866/// Right-shifts each 16-bit integer element of a 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800867/// of [4 x i16] by the number of bits specified by a 32-bit integer.
Logan Chien55afb0a2018-10-15 10:42:14 +0800868///
Logan Chien2833ffb2018-10-09 10:03:24 +0800869/// High-order bits are filled with the sign bit of the initial value of each
870/// 16-bit element. The 16-bit results are packed into a 64-bit integer
871/// vector of [4 x i16].
872///
873/// \headerfile <x86intrin.h>
874///
Logan Chien55afb0a2018-10-15 10:42:14 +0800875/// This intrinsic corresponds to the <c> PSRAW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800876///
877/// \param __m
878/// A 64-bit integer vector of [4 x i16].
879/// \param __count
880/// A 32-bit integer value.
881/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
882/// values.
883static __inline__ __m64 __DEFAULT_FN_ATTRS
884_mm_srai_pi16(__m64 __m, int __count)
885{
886 return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count);
887}
888
Logan Chien55afb0a2018-10-15 10:42:14 +0800889/// Right-shifts each 32-bit integer element of the first parameter,
Logan Chien2833ffb2018-10-09 10:03:24 +0800890/// which is a 64-bit integer vector of [2 x i32], by the number of bits
Logan Chien55afb0a2018-10-15 10:42:14 +0800891/// specified by the second parameter, which is a 64-bit integer.
892///
893/// High-order bits are filled with the sign bit of the initial value of each
894/// 32-bit element. The 32-bit results are packed into a 64-bit integer
895/// vector of [2 x i32].
Logan Chien2833ffb2018-10-09 10:03:24 +0800896///
897/// \headerfile <x86intrin.h>
898///
Logan Chien55afb0a2018-10-15 10:42:14 +0800899/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800900///
901/// \param __m
902/// A 64-bit integer vector of [2 x i32].
903/// \param __count
904/// A 64-bit integer vector interpreted as a single 64-bit integer.
905/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
906/// values.
907static __inline__ __m64 __DEFAULT_FN_ATTRS
908_mm_sra_pi32(__m64 __m, __m64 __count)
909{
910 return (__m64)__builtin_ia32_psrad((__v2si)__m, __count);
911}
912
Logan Chien55afb0a2018-10-15 10:42:14 +0800913/// Right-shifts each 32-bit integer element of a 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800914/// of [2 x i32] by the number of bits specified by a 32-bit integer.
Logan Chien55afb0a2018-10-15 10:42:14 +0800915///
Logan Chien2833ffb2018-10-09 10:03:24 +0800916/// High-order bits are filled with the sign bit of the initial value of each
917/// 32-bit element. The 32-bit results are packed into a 64-bit integer
918/// vector of [2 x i32].
919///
920/// \headerfile <x86intrin.h>
921///
Logan Chien55afb0a2018-10-15 10:42:14 +0800922/// This intrinsic corresponds to the <c> PSRAD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800923///
924/// \param __m
925/// A 64-bit integer vector of [2 x i32].
926/// \param __count
927/// A 32-bit integer value.
928/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
929/// values.
930static __inline__ __m64 __DEFAULT_FN_ATTRS
931_mm_srai_pi32(__m64 __m, int __count)
932{
933 return (__m64)__builtin_ia32_psradi((__v2si)__m, __count);
934}
935
Logan Chien55afb0a2018-10-15 10:42:14 +0800936/// Right-shifts each 16-bit integer element of the first parameter,
Logan Chien2833ffb2018-10-09 10:03:24 +0800937/// which is a 64-bit integer vector of [4 x i16], by the number of bits
Logan Chien55afb0a2018-10-15 10:42:14 +0800938/// specified by the second parameter, which is a 64-bit integer.
939///
940/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
941/// integer vector of [4 x i16].
Logan Chien2833ffb2018-10-09 10:03:24 +0800942///
943/// \headerfile <x86intrin.h>
944///
Logan Chien55afb0a2018-10-15 10:42:14 +0800945/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800946///
947/// \param __m
948/// A 64-bit integer vector of [4 x i16].
949/// \param __count
950/// A 64-bit integer vector interpreted as a single 64-bit integer.
951/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
952/// values.
953static __inline__ __m64 __DEFAULT_FN_ATTRS
954_mm_srl_pi16(__m64 __m, __m64 __count)
955{
956 return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count);
957}
958
Logan Chien55afb0a2018-10-15 10:42:14 +0800959/// Right-shifts each 16-bit integer element of a 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +0800960/// of [4 x i16] by the number of bits specified by a 32-bit integer.
Logan Chien55afb0a2018-10-15 10:42:14 +0800961///
Logan Chien2833ffb2018-10-09 10:03:24 +0800962/// High-order bits are cleared. The 16-bit results are packed into a 64-bit
963/// integer vector of [4 x i16].
964///
965/// \headerfile <x86intrin.h>
966///
Logan Chien55afb0a2018-10-15 10:42:14 +0800967/// This intrinsic corresponds to the <c> PSRLW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800968///
969/// \param __m
970/// A 64-bit integer vector of [4 x i16].
971/// \param __count
972/// A 32-bit integer value.
973/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted
974/// values.
975static __inline__ __m64 __DEFAULT_FN_ATTRS
976_mm_srli_pi16(__m64 __m, int __count)
977{
978 return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count);
979}
980
Logan Chien55afb0a2018-10-15 10:42:14 +0800981/// Right-shifts each 32-bit integer element of the first parameter,
Logan Chien2833ffb2018-10-09 10:03:24 +0800982/// which is a 64-bit integer vector of [2 x i32], by the number of bits
Logan Chien55afb0a2018-10-15 10:42:14 +0800983/// specified by the second parameter, which is a 64-bit integer.
984///
985/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
986/// integer vector of [2 x i32].
Logan Chien2833ffb2018-10-09 10:03:24 +0800987///
988/// \headerfile <x86intrin.h>
989///
Logan Chien55afb0a2018-10-15 10:42:14 +0800990/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +0800991///
992/// \param __m
993/// A 64-bit integer vector of [2 x i32].
994/// \param __count
995/// A 64-bit integer vector interpreted as a single 64-bit integer.
996/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
997/// values.
998static __inline__ __m64 __DEFAULT_FN_ATTRS
999_mm_srl_pi32(__m64 __m, __m64 __count)
1000{
1001 return (__m64)__builtin_ia32_psrld((__v2si)__m, __count);
1002}
1003
Logan Chien55afb0a2018-10-15 10:42:14 +08001004/// Right-shifts each 32-bit integer element of a 64-bit integer vector
Logan Chien2833ffb2018-10-09 10:03:24 +08001005/// of [2 x i32] by the number of bits specified by a 32-bit integer.
Logan Chien55afb0a2018-10-15 10:42:14 +08001006///
Logan Chien2833ffb2018-10-09 10:03:24 +08001007/// High-order bits are cleared. The 32-bit results are packed into a 64-bit
1008/// integer vector of [2 x i32].
1009///
1010/// \headerfile <x86intrin.h>
1011///
Logan Chien55afb0a2018-10-15 10:42:14 +08001012/// This intrinsic corresponds to the <c> PSRLD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001013///
1014/// \param __m
1015/// A 64-bit integer vector of [2 x i32].
1016/// \param __count
1017/// A 32-bit integer value.
1018/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted
1019/// values.
1020static __inline__ __m64 __DEFAULT_FN_ATTRS
1021_mm_srli_pi32(__m64 __m, int __count)
1022{
1023 return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count);
1024}
1025
Logan Chien55afb0a2018-10-15 10:42:14 +08001026/// Right-shifts the first 64-bit integer parameter by the number of bits
1027/// specified by the second 64-bit integer parameter.
1028///
1029/// High-order bits are cleared.
Logan Chien2833ffb2018-10-09 10:03:24 +08001030///
1031/// \headerfile <x86intrin.h>
1032///
Logan Chien55afb0a2018-10-15 10:42:14 +08001033/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001034///
1035/// \param __m
1036/// A 64-bit integer vector interpreted as a single 64-bit integer.
1037/// \param __count
1038/// A 64-bit integer vector interpreted as a single 64-bit integer.
1039/// \returns A 64-bit integer vector containing the right-shifted value.
1040static __inline__ __m64 __DEFAULT_FN_ATTRS
1041_mm_srl_si64(__m64 __m, __m64 __count)
1042{
1043 return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count);
1044}
1045
Logan Chien55afb0a2018-10-15 10:42:14 +08001046/// Right-shifts the first parameter, which is a 64-bit integer, by the
Logan Chien2833ffb2018-10-09 10:03:24 +08001047/// number of bits specified by the second parameter, which is a 32-bit
Logan Chien55afb0a2018-10-15 10:42:14 +08001048/// integer.
1049///
1050/// High-order bits are cleared.
Logan Chien2833ffb2018-10-09 10:03:24 +08001051///
1052/// \headerfile <x86intrin.h>
1053///
Logan Chien55afb0a2018-10-15 10:42:14 +08001054/// This intrinsic corresponds to the <c> PSRLQ </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001055///
1056/// \param __m
1057/// A 64-bit integer vector interpreted as a single 64-bit integer.
1058/// \param __count
1059/// A 32-bit integer value.
1060/// \returns A 64-bit integer vector containing the right-shifted value.
1061static __inline__ __m64 __DEFAULT_FN_ATTRS
1062_mm_srli_si64(__m64 __m, int __count)
1063{
1064 return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count);
1065}
1066
Logan Chien55afb0a2018-10-15 10:42:14 +08001067/// Performs a bitwise AND of two 64-bit integer vectors.
Logan Chien2833ffb2018-10-09 10:03:24 +08001068///
1069/// \headerfile <x86intrin.h>
1070///
Logan Chien55afb0a2018-10-15 10:42:14 +08001071/// This intrinsic corresponds to the <c> PAND </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001072///
1073/// \param __m1
1074/// A 64-bit integer vector.
1075/// \param __m2
1076/// A 64-bit integer vector.
1077/// \returns A 64-bit integer vector containing the bitwise AND of both
1078/// parameters.
1079static __inline__ __m64 __DEFAULT_FN_ATTRS
1080_mm_and_si64(__m64 __m1, __m64 __m2)
1081{
1082 return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2);
1083}
1084
Logan Chien55afb0a2018-10-15 10:42:14 +08001085/// Performs a bitwise NOT of the first 64-bit integer vector, and then
Logan Chien2833ffb2018-10-09 10:03:24 +08001086/// performs a bitwise AND of the intermediate result and the second 64-bit
1087/// integer vector.
1088///
1089/// \headerfile <x86intrin.h>
1090///
Logan Chien55afb0a2018-10-15 10:42:14 +08001091/// This intrinsic corresponds to the <c> PANDN </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001092///
1093/// \param __m1
1094/// A 64-bit integer vector. The one's complement of this parameter is used
1095/// in the bitwise AND.
1096/// \param __m2
1097/// A 64-bit integer vector.
1098/// \returns A 64-bit integer vector containing the bitwise AND of the second
1099/// parameter and the one's complement of the first parameter.
1100static __inline__ __m64 __DEFAULT_FN_ATTRS
1101_mm_andnot_si64(__m64 __m1, __m64 __m2)
1102{
1103 return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2);
1104}
1105
Logan Chien55afb0a2018-10-15 10:42:14 +08001106/// Performs a bitwise OR of two 64-bit integer vectors.
Logan Chien2833ffb2018-10-09 10:03:24 +08001107///
1108/// \headerfile <x86intrin.h>
1109///
Logan Chien55afb0a2018-10-15 10:42:14 +08001110/// This intrinsic corresponds to the <c> POR </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001111///
1112/// \param __m1
1113/// A 64-bit integer vector.
1114/// \param __m2
1115/// A 64-bit integer vector.
1116/// \returns A 64-bit integer vector containing the bitwise OR of both
1117/// parameters.
1118static __inline__ __m64 __DEFAULT_FN_ATTRS
1119_mm_or_si64(__m64 __m1, __m64 __m2)
1120{
1121 return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2);
1122}
1123
Logan Chien55afb0a2018-10-15 10:42:14 +08001124/// Performs a bitwise exclusive OR of two 64-bit integer vectors.
Logan Chien2833ffb2018-10-09 10:03:24 +08001125///
1126/// \headerfile <x86intrin.h>
1127///
Logan Chien55afb0a2018-10-15 10:42:14 +08001128/// This intrinsic corresponds to the <c> PXOR </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001129///
1130/// \param __m1
1131/// A 64-bit integer vector.
1132/// \param __m2
1133/// A 64-bit integer vector.
1134/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both
1135/// parameters.
1136static __inline__ __m64 __DEFAULT_FN_ATTRS
1137_mm_xor_si64(__m64 __m1, __m64 __m2)
1138{
1139 return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2);
1140}
1141
Logan Chien55afb0a2018-10-15 10:42:14 +08001142/// Compares the 8-bit integer elements of two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +08001143/// [8 x i8] to determine if the element of the first vector is equal to the
Logan Chien55afb0a2018-10-15 10:42:14 +08001144/// corresponding element of the second vector.
1145///
1146/// The comparison yields 0 for false, 0xFF for true.
Logan Chien2833ffb2018-10-09 10:03:24 +08001147///
1148/// \headerfile <x86intrin.h>
1149///
Logan Chien55afb0a2018-10-15 10:42:14 +08001150/// This intrinsic corresponds to the <c> PCMPEQB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001151///
1152/// \param __m1
1153/// A 64-bit integer vector of [8 x i8].
1154/// \param __m2
1155/// A 64-bit integer vector of [8 x i8].
1156/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1157/// results.
1158static __inline__ __m64 __DEFAULT_FN_ATTRS
1159_mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
1160{
1161 return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2);
1162}
1163
Logan Chien55afb0a2018-10-15 10:42:14 +08001164/// Compares the 16-bit integer elements of two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +08001165/// [4 x i16] to determine if the element of the first vector is equal to the
Logan Chien55afb0a2018-10-15 10:42:14 +08001166/// corresponding element of the second vector.
1167///
1168/// The comparison yields 0 for false, 0xFFFF for true.
Logan Chien2833ffb2018-10-09 10:03:24 +08001169///
1170/// \headerfile <x86intrin.h>
1171///
Logan Chien55afb0a2018-10-15 10:42:14 +08001172/// This intrinsic corresponds to the <c> PCMPEQW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001173///
1174/// \param __m1
1175/// A 64-bit integer vector of [4 x i16].
1176/// \param __m2
1177/// A 64-bit integer vector of [4 x i16].
1178/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1179/// results.
1180static __inline__ __m64 __DEFAULT_FN_ATTRS
1181_mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
1182{
1183 return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2);
1184}
1185
Logan Chien55afb0a2018-10-15 10:42:14 +08001186/// Compares the 32-bit integer elements of two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +08001187/// [2 x i32] to determine if the element of the first vector is equal to the
Logan Chien55afb0a2018-10-15 10:42:14 +08001188/// corresponding element of the second vector.
1189///
1190/// The comparison yields 0 for false, 0xFFFFFFFF for true.
Logan Chien2833ffb2018-10-09 10:03:24 +08001191///
1192/// \headerfile <x86intrin.h>
1193///
Logan Chien55afb0a2018-10-15 10:42:14 +08001194/// This intrinsic corresponds to the <c> PCMPEQD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001195///
1196/// \param __m1
1197/// A 64-bit integer vector of [2 x i32].
1198/// \param __m2
1199/// A 64-bit integer vector of [2 x i32].
1200/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1201/// results.
1202static __inline__ __m64 __DEFAULT_FN_ATTRS
1203_mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
1204{
1205 return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2);
1206}
1207
Logan Chien55afb0a2018-10-15 10:42:14 +08001208/// Compares the 8-bit integer elements of two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +08001209/// [8 x i8] to determine if the element of the first vector is greater than
Logan Chien55afb0a2018-10-15 10:42:14 +08001210/// the corresponding element of the second vector.
1211///
1212/// The comparison yields 0 for false, 0xFF for true.
Logan Chien2833ffb2018-10-09 10:03:24 +08001213///
1214/// \headerfile <x86intrin.h>
1215///
Logan Chien55afb0a2018-10-15 10:42:14 +08001216/// This intrinsic corresponds to the <c> PCMPGTB </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001217///
1218/// \param __m1
1219/// A 64-bit integer vector of [8 x i8].
1220/// \param __m2
1221/// A 64-bit integer vector of [8 x i8].
1222/// \returns A 64-bit integer vector of [8 x i8] containing the comparison
1223/// results.
1224static __inline__ __m64 __DEFAULT_FN_ATTRS
1225_mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
1226{
1227 return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2);
1228}
1229
Logan Chien55afb0a2018-10-15 10:42:14 +08001230/// Compares the 16-bit integer elements of two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +08001231/// [4 x i16] to determine if the element of the first vector is greater than
Logan Chien55afb0a2018-10-15 10:42:14 +08001232/// the corresponding element of the second vector.
1233///
1234/// The comparison yields 0 for false, 0xFFFF for true.
Logan Chien2833ffb2018-10-09 10:03:24 +08001235///
1236/// \headerfile <x86intrin.h>
1237///
Logan Chien55afb0a2018-10-15 10:42:14 +08001238/// This intrinsic corresponds to the <c> PCMPGTW </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001239///
1240/// \param __m1
1241/// A 64-bit integer vector of [4 x i16].
1242/// \param __m2
1243/// A 64-bit integer vector of [4 x i16].
1244/// \returns A 64-bit integer vector of [4 x i16] containing the comparison
1245/// results.
1246static __inline__ __m64 __DEFAULT_FN_ATTRS
1247_mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
1248{
1249 return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2);
1250}
1251
Logan Chien55afb0a2018-10-15 10:42:14 +08001252/// Compares the 32-bit integer elements of two 64-bit integer vectors of
Logan Chien2833ffb2018-10-09 10:03:24 +08001253/// [2 x i32] to determine if the element of the first vector is greater than
Logan Chien55afb0a2018-10-15 10:42:14 +08001254/// the corresponding element of the second vector.
1255///
1256/// The comparison yields 0 for false, 0xFFFFFFFF for true.
Logan Chien2833ffb2018-10-09 10:03:24 +08001257///
1258/// \headerfile <x86intrin.h>
1259///
Logan Chien55afb0a2018-10-15 10:42:14 +08001260/// This intrinsic corresponds to the <c> PCMPGTD </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001261///
1262/// \param __m1
1263/// A 64-bit integer vector of [2 x i32].
1264/// \param __m2
1265/// A 64-bit integer vector of [2 x i32].
1266/// \returns A 64-bit integer vector of [2 x i32] containing the comparison
1267/// results.
1268static __inline__ __m64 __DEFAULT_FN_ATTRS
1269_mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
1270{
1271 return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2);
1272}
1273
Logan Chien55afb0a2018-10-15 10:42:14 +08001274/// Constructs a 64-bit integer vector initialized to zero.
Logan Chien2833ffb2018-10-09 10:03:24 +08001275///
1276/// \headerfile <x86intrin.h>
1277///
Logan Chien55afb0a2018-10-15 10:42:14 +08001278/// This intrinsic corresponds to the <c> PXOR </c> instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001279///
1280/// \returns An initialized 64-bit integer vector with all elements set to zero.
1281static __inline__ __m64 __DEFAULT_FN_ATTRS
1282_mm_setzero_si64(void)
1283{
Logan Chien55afb0a2018-10-15 10:42:14 +08001284 return __extension__ (__m64){ 0LL };
Logan Chien2833ffb2018-10-09 10:03:24 +08001285}
1286
Logan Chien55afb0a2018-10-15 10:42:14 +08001287/// Constructs a 64-bit integer vector initialized with the specified
Logan Chien2833ffb2018-10-09 10:03:24 +08001288/// 32-bit integer values.
1289///
1290/// \headerfile <x86intrin.h>
1291///
1292/// This intrinsic is a utility function and does not correspond to a specific
1293/// instruction.
1294///
1295/// \param __i1
1296/// A 32-bit integer value used to initialize the upper 32 bits of the
1297/// result.
1298/// \param __i0
1299/// A 32-bit integer value used to initialize the lower 32 bits of the
1300/// result.
1301/// \returns An initialized 64-bit integer vector.
1302static __inline__ __m64 __DEFAULT_FN_ATTRS
1303_mm_set_pi32(int __i1, int __i0)
1304{
1305 return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1);
1306}
1307
Logan Chien55afb0a2018-10-15 10:42:14 +08001308/// Constructs a 64-bit integer vector initialized with the specified
Logan Chien2833ffb2018-10-09 10:03:24 +08001309/// 16-bit integer values.
1310///
1311/// \headerfile <x86intrin.h>
1312///
1313/// This intrinsic is a utility function and does not correspond to a specific
1314/// instruction.
1315///
1316/// \param __s3
1317/// A 16-bit integer value used to initialize bits [63:48] of the result.
1318/// \param __s2
1319/// A 16-bit integer value used to initialize bits [47:32] of the result.
1320/// \param __s1
1321/// A 16-bit integer value used to initialize bits [31:16] of the result.
1322/// \param __s0
1323/// A 16-bit integer value used to initialize bits [15:0] of the result.
1324/// \returns An initialized 64-bit integer vector.
1325static __inline__ __m64 __DEFAULT_FN_ATTRS
1326_mm_set_pi16(short __s3, short __s2, short __s1, short __s0)
1327{
1328 return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3);
1329}
1330
Logan Chien55afb0a2018-10-15 10:42:14 +08001331/// Constructs a 64-bit integer vector initialized with the specified
Logan Chien2833ffb2018-10-09 10:03:24 +08001332/// 8-bit integer values.
1333///
1334/// \headerfile <x86intrin.h>
1335///
1336/// This intrinsic is a utility function and does not correspond to a specific
1337/// instruction.
1338///
1339/// \param __b7
1340/// An 8-bit integer value used to initialize bits [63:56] of the result.
1341/// \param __b6
1342/// An 8-bit integer value used to initialize bits [55:48] of the result.
1343/// \param __b5
1344/// An 8-bit integer value used to initialize bits [47:40] of the result.
1345/// \param __b4
1346/// An 8-bit integer value used to initialize bits [39:32] of the result.
1347/// \param __b3
1348/// An 8-bit integer value used to initialize bits [31:24] of the result.
1349/// \param __b2
1350/// An 8-bit integer value used to initialize bits [23:16] of the result.
1351/// \param __b1
1352/// An 8-bit integer value used to initialize bits [15:8] of the result.
1353/// \param __b0
1354/// An 8-bit integer value used to initialize bits [7:0] of the result.
1355/// \returns An initialized 64-bit integer vector.
1356static __inline__ __m64 __DEFAULT_FN_ATTRS
1357_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2,
1358 char __b1, char __b0)
1359{
1360 return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3,
1361 __b4, __b5, __b6, __b7);
1362}
1363
Logan Chien55afb0a2018-10-15 10:42:14 +08001364/// Constructs a 64-bit integer vector of [2 x i32], with each of the
Logan Chien2833ffb2018-10-09 10:03:24 +08001365/// 32-bit integer vector elements set to the specified 32-bit integer
1366/// value.
1367///
1368/// \headerfile <x86intrin.h>
1369///
Logan Chien55afb0a2018-10-15 10:42:14 +08001370/// This intrinsic is a utility function and does not correspond to a specific
1371/// instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001372///
1373/// \param __i
1374/// A 32-bit integer value used to initialize each vector element of the
1375/// result.
1376/// \returns An initialized 64-bit integer vector of [2 x i32].
1377static __inline__ __m64 __DEFAULT_FN_ATTRS
1378_mm_set1_pi32(int __i)
1379{
1380 return _mm_set_pi32(__i, __i);
1381}
1382
Logan Chien55afb0a2018-10-15 10:42:14 +08001383/// Constructs a 64-bit integer vector of [4 x i16], with each of the
Logan Chien2833ffb2018-10-09 10:03:24 +08001384/// 16-bit integer vector elements set to the specified 16-bit integer
1385/// value.
1386///
1387/// \headerfile <x86intrin.h>
1388///
Logan Chien55afb0a2018-10-15 10:42:14 +08001389/// This intrinsic is a utility function and does not correspond to a specific
1390/// instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001391///
1392/// \param __w
1393/// A 16-bit integer value used to initialize each vector element of the
1394/// result.
1395/// \returns An initialized 64-bit integer vector of [4 x i16].
1396static __inline__ __m64 __DEFAULT_FN_ATTRS
1397_mm_set1_pi16(short __w)
1398{
1399 return _mm_set_pi16(__w, __w, __w, __w);
1400}
1401
Logan Chien55afb0a2018-10-15 10:42:14 +08001402/// Constructs a 64-bit integer vector of [8 x i8], with each of the
Logan Chien2833ffb2018-10-09 10:03:24 +08001403/// 8-bit integer vector elements set to the specified 8-bit integer value.
1404///
1405/// \headerfile <x86intrin.h>
1406///
Logan Chien55afb0a2018-10-15 10:42:14 +08001407/// This intrinsic is a utility function and does not correspond to a specific
1408/// instruction.
Logan Chien2833ffb2018-10-09 10:03:24 +08001409///
1410/// \param __b
1411/// An 8-bit integer value used to initialize each vector element of the
1412/// result.
1413/// \returns An initialized 64-bit integer vector of [8 x i8].
1414static __inline__ __m64 __DEFAULT_FN_ATTRS
1415_mm_set1_pi8(char __b)
1416{
1417 return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b);
1418}
1419
Logan Chien55afb0a2018-10-15 10:42:14 +08001420/// Constructs a 64-bit integer vector, initialized in reverse order with
Logan Chien2833ffb2018-10-09 10:03:24 +08001421/// the specified 32-bit integer values.
1422///
1423/// \headerfile <x86intrin.h>
1424///
1425/// This intrinsic is a utility function and does not correspond to a specific
1426/// instruction.
1427///
1428/// \param __i0
1429/// A 32-bit integer value used to initialize the lower 32 bits of the
1430/// result.
1431/// \param __i1
1432/// A 32-bit integer value used to initialize the upper 32 bits of the
1433/// result.
1434/// \returns An initialized 64-bit integer vector.
1435static __inline__ __m64 __DEFAULT_FN_ATTRS
1436_mm_setr_pi32(int __i0, int __i1)
1437{
1438 return _mm_set_pi32(__i1, __i0);
1439}
1440
Logan Chien55afb0a2018-10-15 10:42:14 +08001441/// Constructs a 64-bit integer vector, initialized in reverse order with
Logan Chien2833ffb2018-10-09 10:03:24 +08001442/// the specified 16-bit integer values.
1443///
1444/// \headerfile <x86intrin.h>
1445///
1446/// This intrinsic is a utility function and does not correspond to a specific
1447/// instruction.
1448///
1449/// \param __w0
1450/// A 16-bit integer value used to initialize bits [15:0] of the result.
1451/// \param __w1
1452/// A 16-bit integer value used to initialize bits [31:16] of the result.
1453/// \param __w2
1454/// A 16-bit integer value used to initialize bits [47:32] of the result.
1455/// \param __w3
1456/// A 16-bit integer value used to initialize bits [63:48] of the result.
1457/// \returns An initialized 64-bit integer vector.
1458static __inline__ __m64 __DEFAULT_FN_ATTRS
1459_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3)
1460{
1461 return _mm_set_pi16(__w3, __w2, __w1, __w0);
1462}
1463
Logan Chien55afb0a2018-10-15 10:42:14 +08001464/// Constructs a 64-bit integer vector, initialized in reverse order with
Logan Chien2833ffb2018-10-09 10:03:24 +08001465/// the specified 8-bit integer values.
1466///
1467/// \headerfile <x86intrin.h>
1468///
1469/// This intrinsic is a utility function and does not correspond to a specific
1470/// instruction.
1471///
1472/// \param __b0
1473/// An 8-bit integer value used to initialize bits [7:0] of the result.
1474/// \param __b1
1475/// An 8-bit integer value used to initialize bits [15:8] of the result.
1476/// \param __b2
1477/// An 8-bit integer value used to initialize bits [23:16] of the result.
1478/// \param __b3
1479/// An 8-bit integer value used to initialize bits [31:24] of the result.
1480/// \param __b4
1481/// An 8-bit integer value used to initialize bits [39:32] of the result.
1482/// \param __b5
1483/// An 8-bit integer value used to initialize bits [47:40] of the result.
1484/// \param __b6
1485/// An 8-bit integer value used to initialize bits [55:48] of the result.
1486/// \param __b7
1487/// An 8-bit integer value used to initialize bits [63:56] of the result.
1488/// \returns An initialized 64-bit integer vector.
1489static __inline__ __m64 __DEFAULT_FN_ATTRS
1490_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5,
1491 char __b6, char __b7)
1492{
1493 return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0);
1494}
1495
1496#undef __DEFAULT_FN_ATTRS
1497
1498/* Aliases for compatibility. */
1499#define _m_empty _mm_empty
1500#define _m_from_int _mm_cvtsi32_si64
1501#define _m_from_int64 _mm_cvtsi64_m64
1502#define _m_to_int _mm_cvtsi64_si32
1503#define _m_to_int64 _mm_cvtm64_si64
1504#define _m_packsswb _mm_packs_pi16
1505#define _m_packssdw _mm_packs_pi32
1506#define _m_packuswb _mm_packs_pu16
1507#define _m_punpckhbw _mm_unpackhi_pi8
1508#define _m_punpckhwd _mm_unpackhi_pi16
1509#define _m_punpckhdq _mm_unpackhi_pi32
1510#define _m_punpcklbw _mm_unpacklo_pi8
1511#define _m_punpcklwd _mm_unpacklo_pi16
1512#define _m_punpckldq _mm_unpacklo_pi32
1513#define _m_paddb _mm_add_pi8
1514#define _m_paddw _mm_add_pi16
1515#define _m_paddd _mm_add_pi32
1516#define _m_paddsb _mm_adds_pi8
1517#define _m_paddsw _mm_adds_pi16
1518#define _m_paddusb _mm_adds_pu8
1519#define _m_paddusw _mm_adds_pu16
1520#define _m_psubb _mm_sub_pi8
1521#define _m_psubw _mm_sub_pi16
1522#define _m_psubd _mm_sub_pi32
1523#define _m_psubsb _mm_subs_pi8
1524#define _m_psubsw _mm_subs_pi16
1525#define _m_psubusb _mm_subs_pu8
1526#define _m_psubusw _mm_subs_pu16
1527#define _m_pmaddwd _mm_madd_pi16
1528#define _m_pmulhw _mm_mulhi_pi16
1529#define _m_pmullw _mm_mullo_pi16
1530#define _m_psllw _mm_sll_pi16
1531#define _m_psllwi _mm_slli_pi16
1532#define _m_pslld _mm_sll_pi32
1533#define _m_pslldi _mm_slli_pi32
1534#define _m_psllq _mm_sll_si64
1535#define _m_psllqi _mm_slli_si64
1536#define _m_psraw _mm_sra_pi16
1537#define _m_psrawi _mm_srai_pi16
1538#define _m_psrad _mm_sra_pi32
1539#define _m_psradi _mm_srai_pi32
1540#define _m_psrlw _mm_srl_pi16
1541#define _m_psrlwi _mm_srli_pi16
1542#define _m_psrld _mm_srl_pi32
1543#define _m_psrldi _mm_srli_pi32
1544#define _m_psrlq _mm_srl_si64
1545#define _m_psrlqi _mm_srli_si64
1546#define _m_pand _mm_and_si64
1547#define _m_pandn _mm_andnot_si64
1548#define _m_por _mm_or_si64
1549#define _m_pxor _mm_xor_si64
1550#define _m_pcmpeqb _mm_cmpeq_pi8
1551#define _m_pcmpeqw _mm_cmpeq_pi16
1552#define _m_pcmpeqd _mm_cmpeq_pi32
1553#define _m_pcmpgtb _mm_cmpgt_pi8
1554#define _m_pcmpgtw _mm_cmpgt_pi16
1555#define _m_pcmpgtd _mm_cmpgt_pi32
1556
1557#endif /* __MMINTRIN_H */
1558