blob: 4940666e80836608c3661b761d7dc63996cc8c63 [file] [log] [blame]
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -07001/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------===
2 *
3 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 * See https://llvm.org/LICENSE.txt for license information.
5 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 *
7 *===------------------------------------------------------------------------===
8 */
9
10#ifndef __IMMINTRIN_H
11#error "Never use <amxintrin.h> directly; include <immintrin.h> instead."
12#endif /* __IMMINTRIN_H */
13
14#ifndef __AMXINTRIN_H
15#define __AMXINTRIN_H
16#ifdef __x86_64__
17
18/* Define the default attributes for the functions in this file. */
19#define __DEFAULT_FN_ATTRS_TILE \
20 __attribute__((__always_inline__, __nodebug__, __target__("amx-tile")))
21#define __DEFAULT_FN_ATTRS_INT8 \
22 __attribute__((__always_inline__, __nodebug__, __target__("amx-int8")))
23#define __DEFAULT_FN_ATTRS_BF16 \
24 __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16")))
25
26/// Load tile configuration from a 64-byte memory location specified by
27/// "mem_addr". The tile configuration includes the tile type palette, the
28/// number of bytes per row, and the number of rows. If the specified
29/// palette_id is zero, that signifies the init state for both the tile
30/// config and the tile data, and the tiles are zeroed. Any invalid
31/// configurations will result in #GP fault.
32///
33/// \headerfile <immintrin.h>
34///
35/// This intrinsic corresponds to the <c> LDTILECFG </c> instruction.
36///
37/// \param __config
38/// A pointer to 512-bits configuration
39static __inline__ void __DEFAULT_FN_ATTRS_TILE
40_tile_loadconfig(const void *__config) {
41 __builtin_ia32_tile_loadconfig(__config);
42}
43
44/// Stores the current tile configuration to a 64-byte memory location
45/// specified by "mem_addr". The tile configuration includes the tile type
46/// palette, the number of bytes per row, and the number of rows. If tiles
47/// are not configured, all zeroes will be stored to memory.
48///
49/// \headerfile <immintrin.h>
50///
51/// This intrinsic corresponds to the <c> STTILECFG </c> instruction.
52///
53/// \param __config
54/// A pointer to 512-bits configuration
55static __inline__ void __DEFAULT_FN_ATTRS_TILE
56_tile_storeconfig(void *__config) {
57 __builtin_ia32_tile_storeconfig(__config);
58}
59
60/// Release the tile configuration to return to the init state, which
61/// releases all storage it currently holds.
62///
63/// \headerfile <immintrin.h>
64///
65/// This intrinsic corresponds to the <c> TILERELEASE </c> instruction.
66static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) {
67 __builtin_ia32_tilerelease();
68}
69
70/// Load tile rows from memory specifieid by "base" address and "stride" into
71/// destination tile "dst" using the tile configuration previously configured
72/// via "_tile_loadconfig".
73///
74/// \headerfile <immintrin.h>
75///
76/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
77///
78/// \param dst
79/// A destination tile. Max size is 1024 Bytes.
80/// \param base
81/// A pointer to base address.
82/// \param stride
83/// The stride between the rows' data to be loaded in memory.
84#define _tile_loadd(dst, base, stride) \
85 __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \
86 (__SIZE_TYPE__)(stride))
87
88/// Load tile rows from memory specifieid by "base" address and "stride" into
89/// destination tile "dst" using the tile configuration previously configured
90/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation
91/// that the data will likely not be reused in the near future and the data
92/// caching can be optimized accordingly.
93///
94/// \headerfile <immintrin.h>
95///
96/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
97///
98/// \param dst
99/// A destination tile. Max size is 1024 Bytes.
100/// \param base
101/// A pointer to base address.
102/// \param stride
103/// The stride between the rows' data to be loaded in memory.
104#define _tile_stream_loadd(dst, base, stride) \
105 __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \
106 (__SIZE_TYPE__)(stride))
107
108/// Store the tile specified by "src" to memory specifieid by "base" address and
109/// "stride" using the tile configuration previously configured via
110/// "_tile_loadconfig".
111///
112/// \headerfile <immintrin.h>
113///
114/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
115///
116/// \param dst
117/// A destination tile. Max size is 1024 Bytes.
118/// \param base
119/// A pointer to base address.
120/// \param stride
121/// The stride between the rows' data to be stored in memory.
122#define _tile_stored(dst, base, stride) \
123 __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride))
124
125/// Zero the tile specified by "tdest".
126///
127/// \headerfile <immintrin.h>
128///
129/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
130///
131/// \param tile
132/// The destination tile to be zero. Max size is 1024 Bytes.
133#define _tile_zero(tile) __builtin_ia32_tilezero((tile))
134
135/// Compute dot-product of bytes in tiles with a source/destination accumulator.
136/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
137/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
138/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
139/// and store the 32-bit result back to tile "dst".
140///
141/// \headerfile <immintrin.h>
142///
143/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
144///
145/// \param dst
146/// The destination tile. Max size is 1024 Bytes.
147/// \param src0
148/// The 1st source tile. Max size is 1024 Bytes.
149/// \param src1
150/// The 2nd source tile. Max size is 1024 Bytes.
151#define _tile_dpbssd(dst, src0, src1) \
152 __builtin_ia32_tdpbssd((dst), (src0), (src1))
153
154/// Compute dot-product of bytes in tiles with a source/destination accumulator.
155/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
156/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
157/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
158/// in "dst", and store the 32-bit result back to tile "dst".
159///
160/// \headerfile <immintrin.h>
161///
162/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
163///
164/// \param dst
165/// The destination tile. Max size is 1024 Bytes.
166/// \param src0
167/// The 1st source tile. Max size is 1024 Bytes.
168/// \param src1
169/// The 2nd source tile. Max size is 1024 Bytes.
170#define _tile_dpbsud(dst, src0, src1) \
171 __builtin_ia32_tdpbsud((dst), (src0), (src1))
172
173/// Compute dot-product of bytes in tiles with a source/destination accumulator.
174/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
175/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
176/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
177/// and store the 32-bit result back to tile "dst".
178///
179/// \headerfile <immintrin.h>
180///
181/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
182///
183/// \param dst
184/// The destination tile. Max size is 1024 Bytes.
185/// \param src0
186/// The 1st source tile. Max size is 1024 Bytes.
187/// \param src1
188/// The 2nd source tile. Max size is 1024 Bytes.
189#define _tile_dpbusd(dst, src0, src1) \
190 __builtin_ia32_tdpbusd((dst), (src0), (src1))
191
192/// Compute dot-product of bytes in tiles with a source/destination accumulator.
193/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
194/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
195/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
196/// "dst", and store the 32-bit result back to tile "dst".
197///
198/// \headerfile <immintrin.h>
199///
200/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
201///
202/// \param dst
203/// The destination tile. Max size is 1024 Bytes.
204/// \param src0
205/// The 1st source tile. Max size is 1024 Bytes.
206/// \param src1
207/// The 2nd source tile. Max size is 1024 Bytes.
208#define _tile_dpbuud(dst, src0, src1) \
209 __builtin_ia32_tdpbuud((dst), (src0), (src1))
210
211/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
212/// src1, accumulating the intermediate single-precision (32-bit) floating-point
213/// elements with elements in "dst", and store the 32-bit result back to tile
214/// "dst".
215///
216/// \headerfile <immintrin.h>
217///
218/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
219///
220/// \param dst
221/// The destination tile. Max size is 1024 Bytes.
222/// \param src0
223/// The 1st source tile. Max size is 1024 Bytes.
224/// \param src1
225/// The 2nd source tile. Max size is 1024 Bytes.
226#define _tile_dpbf16ps(dst, src0, src1) \
227 __builtin_ia32_tdpbf16ps((dst), (src0), (src1))
228
229/// AMX tile register size can be configured, the maximum size is 16x64=1024
230/// bytes. Since there is no 2D type in llvm IR, we use vector type to
231/// represent 2D tile and the fixed size is maximum amx tile register size.
232typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64)));
233
234/// This is internal intrinsic. C/C++ user should avoid calling it directly.
235static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
236_tile_loadd_internal(unsigned short m, unsigned short n, const void *base,
237 __SIZE_TYPE__ stride) {
238 return __builtin_ia32_tileloadd64_internal(m, n, base,
239 (__SIZE_TYPE__)(stride));
240}
241
242/// This is internal intrinsic. C/C++ user should avoid calling it directly.
243static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800244_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base,
245 __SIZE_TYPE__ stride) {
246 return __builtin_ia32_tileloaddt164_internal(m, n, base,
247 (__SIZE_TYPE__)(stride));
248}
249
250/// This is internal intrinsic. C/C++ user should avoid calling it directly.
251static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700252_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k,
253 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
254 return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2);
255}
256
257/// This is internal intrinsic. C/C++ user should avoid calling it directly.
258static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
259_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k,
260 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
261 return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2);
262}
263
264/// This is internal intrinsic. C/C++ user should avoid calling it directly.
265static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
266_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k,
267 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
268 return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2);
269}
270
271/// This is internal intrinsic. C/C++ user should avoid calling it directly.
272static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8
273_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k,
274 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
275 return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2);
276}
277
278/// This is internal intrinsic. C/C++ user should avoid calling it directly.
279static __inline__ void __DEFAULT_FN_ATTRS_INT8
280_tile_stored_internal(unsigned short m, unsigned short n, void *base,
281 __SIZE_TYPE__ stride, _tile1024i tile) {
282 return __builtin_ia32_tilestored64_internal(m, n, base,
283 (__SIZE_TYPE__)(stride), tile);
284}
285
286/// This is internal intrinsic. C/C++ user should avoid calling it directly.
287static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16
288_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k,
289 _tile1024i dst, _tile1024i src1, _tile1024i src2) {
290 return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2);
291}
292
293/// This struct pack the shape and tile data together for user. We suggest
294/// initializing the struct as early as possible, because compiler depends
295/// on the shape information to do configure. The constant value is preferred
296/// for optimization by compiler.
297typedef struct __tile1024i_str {
298 const unsigned short row;
299 const unsigned short col;
300 _tile1024i tile;
301} __tile1024i;
302
303/// Load tile rows from memory specifieid by "base" address and "stride" into
304/// destination tile "dst".
305///
306/// \headerfile <immintrin.h>
307///
308/// This intrinsic corresponds to the <c> TILELOADD </c> instruction.
309///
310/// \param dst
311/// A destination tile. Max size is 1024 Bytes.
312/// \param base
313/// A pointer to base address.
314/// \param stride
315/// The stride between the rows' data to be loaded in memory.
316__DEFAULT_FN_ATTRS_TILE
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800317static __inline__ void __tile_loadd(__tile1024i *dst, const void *base,
318 __SIZE_TYPE__ stride) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700319 dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride);
320}
321
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800322/// Load tile rows from memory specifieid by "base" address and "stride" into
323/// destination tile "dst". This intrinsic provides a hint to the implementation
324/// that the data will likely not be reused in the near future and the data
325/// caching can be optimized accordingly.
326///
327/// \headerfile <immintrin.h>
328///
329/// This intrinsic corresponds to the <c> TILELOADDT1 </c> instruction.
330///
331/// \param dst
332/// A destination tile. Max size is 1024 Bytes.
333/// \param base
334/// A pointer to base address.
335/// \param stride
336/// The stride between the rows' data to be loaded in memory.
337__DEFAULT_FN_ATTRS_TILE
338static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base,
339 __SIZE_TYPE__ stride) {
340 dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride);
341}
342
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700343/// Compute dot-product of bytes in tiles with a source/destination accumulator.
344/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
345/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
346/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
347/// and store the 32-bit result back to tile "dst".
348///
349/// \headerfile <immintrin.h>
350///
351/// This intrinsic corresponds to the <c> TDPBSSD </c> instruction.
352///
353/// \param dst
354/// The destination tile. Max size is 1024 Bytes.
355/// \param src0
356/// The 1st source tile. Max size is 1024 Bytes.
357/// \param src1
358/// The 2nd source tile. Max size is 1024 Bytes.
359__DEFAULT_FN_ATTRS_INT8
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800360static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0,
361 __tile1024i src1) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700362 dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile,
363 src0.tile, src1.tile);
364}
365
366/// Compute dot-product of bytes in tiles with a source/destination accumulator.
367/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with
368/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
369/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer
370/// in "dst", and store the 32-bit result back to tile "dst".
371///
372/// \headerfile <immintrin.h>
373///
374/// This intrinsic corresponds to the <c> TDPBSUD </c> instruction.
375///
376/// \param dst
377/// The destination tile. Max size is 1024 Bytes.
378/// \param src0
379/// The 1st source tile. Max size is 1024 Bytes.
380/// \param src1
381/// The 2nd source tile. Max size is 1024 Bytes.
382__DEFAULT_FN_ATTRS_INT8
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800383static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0,
384 __tile1024i src1) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700385 dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile,
386 src0.tile, src1.tile);
387}
388
389/// Compute dot-product of bytes in tiles with a source/destination accumulator.
390/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
391/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit
392/// results. Sum these 4 results with the corresponding 32-bit integer in "dst",
393/// and store the 32-bit result back to tile "dst".
394///
395/// \headerfile <immintrin.h>
396///
397/// This intrinsic corresponds to the <c> TDPBUSD </c> instruction.
398///
399/// \param dst
400/// The destination tile. Max size is 1024 Bytes.
401/// \param src0
402/// The 1st source tile. Max size is 1024 Bytes.
403/// \param src1
404/// The 2nd source tile. Max size is 1024 Bytes.
405__DEFAULT_FN_ATTRS_INT8
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800406static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0,
407 __tile1024i src1) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700408 dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile,
409 src0.tile, src1.tile);
410}
411
412/// Compute dot-product of bytes in tiles with a source/destination accumulator.
413/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with
414/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate
415/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in
416/// "dst", and store the 32-bit result back to tile "dst".
417///
418/// \headerfile <immintrin.h>
419///
420/// This intrinsic corresponds to the <c> TDPBUUD </c> instruction.
421///
422/// \param dst
423/// The destination tile. Max size is 1024 Bytes.
424/// \param src0
425/// The 1st source tile. Max size is 1024 Bytes.
426/// \param src1
427/// The 2nd source tile. Max size is 1024 Bytes.
428__DEFAULT_FN_ATTRS_INT8
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800429static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0,
430 __tile1024i src1) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700431 dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile,
432 src0.tile, src1.tile);
433}
434
435/// Store the tile specified by "src" to memory specifieid by "base" address and
436/// "stride".
437///
438/// \headerfile <immintrin.h>
439///
440/// This intrinsic corresponds to the <c> TILESTORED </c> instruction.
441///
442/// \param dst
443/// A destination tile. Max size is 1024 Bytes.
444/// \param base
445/// A pointer to base address.
446/// \param stride
447/// The stride between the rows' data to be stored in memory.
448__DEFAULT_FN_ATTRS_TILE
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800449static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride,
450 __tile1024i src) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700451 _tile_stored_internal(src.row, src.col, base, stride, src.tile);
452}
453
454/// Zero the tile specified by "dst".
455///
456/// \headerfile <immintrin.h>
457///
458/// This intrinsic corresponds to the <c> TILEZERO </c> instruction.
459///
460/// \param dst
461/// The destination tile to be zero. Max size is 1024 Bytes.
462__DEFAULT_FN_ATTRS_TILE
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800463static __inline__ void __tile_zero(__tile1024i *dst) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700464 dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col);
465}
466
467/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and
468/// src1, accumulating the intermediate single-precision (32-bit) floating-point
469/// elements with elements in "dst", and store the 32-bit result back to tile
470/// "dst".
471///
472/// \headerfile <immintrin.h>
473///
474/// This intrinsic corresponds to the <c> TDPBF16PS </c> instruction.
475///
476/// \param dst
477/// The destination tile. Max size is 1024 Bytes.
478/// \param src0
479/// The 1st source tile. Max size is 1024 Bytes.
480/// \param src1
481/// The 2nd source tile. Max size is 1024 Bytes.
482__DEFAULT_FN_ATTRS_BF16
Pirama Arumuga Nainar494f6452021-12-02 10:42:14 -0800483static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0,
484 __tile1024i src1) {
Pirama Arumuga Nainar7e1f8392021-08-16 17:30:48 -0700485 dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile,
486 src0.tile, src1.tile);
487}
488
489#undef __DEFAULT_FN_ATTRS_TILE
490#undef __DEFAULT_FN_ATTRS_INT8
491#undef __DEFAULT_FN_ATTRS_BF16
492
493#endif /* __x86_64__ */
494#endif /* __AMXINTRIN_H */