blob: e8dfa6bc2d32ed2acee6ca4a28a2839c9a616ab4 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
//   Specification: test/f16-igemm-minmax.yaml
//   Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
22#include "gemm-microkernel-tester.h"
23
24
25#if XNN_ARCH_ARM64
26 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4) {
27 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(4)
36 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
37 }
38
39 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(4)
49 .cn_stride(11)
50 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
51 }
52
53 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
54 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
55 for (uint32_t m = 1; m <= 1; m++) {
56 for (uint32_t n = 1; n <= 8; n++) {
57 GemmMicrokernelTester()
58 .mr(1)
59 .nr(8)
60 .kr(1)
61 .sr(1)
62 .m(m)
63 .n(n)
64 .k(4)
65 .iterations(1)
66 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
67 }
68 }
69 }
70
71 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
72 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
73 for (uint32_t m = 1; m <= 1; m++) {
74 GemmMicrokernelTester()
75 .mr(1)
76 .nr(8)
77 .kr(1)
78 .sr(1)
79 .m(m)
80 .n(8)
81 .k(4)
82 .iterations(1)
83 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
84 }
85 }
86
87 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
88 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
89 for (uint32_t n = 1; n <= 8; n++) {
90 GemmMicrokernelTester()
91 .mr(1)
92 .nr(8)
93 .kr(1)
94 .sr(1)
95 .m(1)
96 .n(n)
97 .k(4)
98 .iterations(1)
99 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
100 }
101 }
102
103 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4) {
104 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
105 for (size_t k = 1; k < 4; k++) {
106 GemmMicrokernelTester()
107 .mr(1)
108 .nr(8)
109 .kr(1)
110 .sr(1)
111 .m(1)
112 .n(8)
113 .k(k)
114 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
115 }
116 }
117
118 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
119 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
120 for (size_t k = 1; k < 4; k++) {
121 for (uint32_t m = 1; m <= 1; m++) {
122 for (uint32_t n = 1; n <= 8; n++) {
123 GemmMicrokernelTester()
124 .mr(1)
125 .nr(8)
126 .kr(1)
127 .sr(1)
128 .m(m)
129 .n(n)
130 .k(k)
131 .iterations(1)
132 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
133 }
134 }
135 }
136 }
137
138 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4) {
139 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
140 for (size_t k = 5; k < 8; k++) {
141 GemmMicrokernelTester()
142 .mr(1)
143 .nr(8)
144 .kr(1)
145 .sr(1)
146 .m(1)
147 .n(8)
148 .k(k)
149 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
150 }
151 }
152
153 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
154 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
155 for (size_t k = 5; k < 8; k++) {
156 for (uint32_t m = 1; m <= 1; m++) {
157 for (uint32_t n = 1; n <= 8; n++) {
158 GemmMicrokernelTester()
159 .mr(1)
160 .nr(8)
161 .kr(1)
162 .sr(1)
163 .m(m)
164 .n(n)
165 .k(k)
166 .iterations(1)
167 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
168 }
169 }
170 }
171 }
172
173 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4) {
174 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
175 for (size_t k = 8; k <= 40; k += 4) {
176 GemmMicrokernelTester()
177 .mr(1)
178 .nr(8)
179 .kr(1)
180 .sr(1)
181 .m(1)
182 .n(8)
183 .k(k)
184 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
185 }
186 }
187
188 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
189 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
190 for (size_t k = 8; k <= 40; k += 4) {
191 for (uint32_t m = 1; m <= 1; m++) {
192 for (uint32_t n = 1; n <= 8; n++) {
193 GemmMicrokernelTester()
194 .mr(1)
195 .nr(8)
196 .kr(1)
197 .sr(1)
198 .m(m)
199 .n(n)
200 .k(k)
201 .iterations(1)
202 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
203 }
204 }
205 }
206 }
207
208 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8) {
209 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
210 for (uint32_t n = 9; n < 16; n++) {
211 for (size_t k = 1; k <= 20; k += 5) {
212 GemmMicrokernelTester()
213 .mr(1)
214 .nr(8)
215 .kr(1)
216 .sr(1)
217 .m(1)
218 .n(8)
219 .k(k)
220 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
221 }
222 }
223 }
224
225 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
226 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
227 for (uint32_t n = 9; n < 16; n++) {
228 for (size_t k = 1; k <= 20; k += 5) {
229 GemmMicrokernelTester()
230 .mr(1)
231 .nr(8)
232 .kr(1)
233 .sr(1)
234 .m(1)
235 .n(8)
236 .k(k)
237 .cn_stride(11)
238 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
239 }
240 }
241 }
242
243 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
244 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
245 for (uint32_t n = 9; n < 16; n++) {
246 for (size_t k = 1; k <= 20; k += 5) {
247 for (uint32_t m = 1; m <= 1; m++) {
248 GemmMicrokernelTester()
249 .mr(1)
250 .nr(8)
251 .kr(1)
252 .sr(1)
253 .m(m)
254 .n(n)
255 .k(k)
256 .iterations(1)
257 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
258 }
259 }
260 }
261 }
262
263 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8) {
264 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
265 for (uint32_t n = 16; n <= 24; n += 8) {
266 for (size_t k = 1; k <= 20; k += 5) {
267 GemmMicrokernelTester()
268 .mr(1)
269 .nr(8)
270 .kr(1)
271 .sr(1)
272 .m(1)
273 .n(8)
274 .k(k)
275 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
276 }
277 }
278 }
279
280 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
281 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
282 for (uint32_t n = 16; n <= 24; n += 8) {
283 for (size_t k = 1; k <= 20; k += 5) {
284 GemmMicrokernelTester()
285 .mr(1)
286 .nr(8)
287 .kr(1)
288 .sr(1)
289 .m(1)
290 .n(n)
291 .k(k)
292 .cn_stride(11)
293 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
294 }
295 }
296 }
297
298 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
299 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
300 for (uint32_t n = 16; n <= 24; n += 8) {
301 for (size_t k = 1; k <= 20; k += 5) {
302 for (uint32_t m = 1; m <= 1; m++) {
303 GemmMicrokernelTester()
304 .mr(1)
305 .nr(8)
306 .kr(1)
307 .sr(1)
308 .m(m)
309 .n(n)
310 .k(k)
311 .iterations(1)
312 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
313 }
314 }
315 }
316 }
317
318 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, small_kernel) {
319 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
320 for (size_t k = 1; k <= 20; k += 5) {
321 GemmMicrokernelTester()
322 .mr(1)
323 .nr(8)
324 .kr(1)
325 .sr(1)
326 .m(1)
327 .n(8)
328 .k(k)
329 .ks(3)
330 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
331 }
332 }
333
334 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, small_kernel_subtile) {
335 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
336 for (size_t k = 1; k <= 20; k += 5) {
337 for (uint32_t m = 1; m <= 1; m++) {
338 for (uint32_t n = 1; n <= 8; n++) {
339 GemmMicrokernelTester()
340 .mr(1)
341 .nr(8)
342 .kr(1)
343 .sr(1)
344 .m(m)
345 .n(n)
346 .k(k)
347 .ks(3)
348 .iterations(1)
349 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
350 }
351 }
352 }
353 }
354
355 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) {
356 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
357 for (uint32_t n = 9; n < 16; n++) {
358 for (size_t k = 1; k <= 20; k += 5) {
359 GemmMicrokernelTester()
360 .mr(1)
361 .nr(8)
362 .kr(1)
363 .sr(1)
364 .m(1)
365 .n(8)
366 .k(k)
367 .ks(3)
368 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
369 }
370 }
371 }
372
373 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) {
374 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
375 for (uint32_t n = 16; n <= 24; n += 8) {
376 for (size_t k = 1; k <= 20; k += 5) {
377 GemmMicrokernelTester()
378 .mr(1)
379 .nr(8)
380 .kr(1)
381 .sr(1)
382 .m(1)
383 .n(8)
384 .k(k)
385 .ks(3)
386 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
387 }
388 }
389 }
390
391 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
392 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
393 for (size_t k = 1; k <= 20; k += 5) {
394 for (uint32_t m = 1; m <= 1; m++) {
395 for (uint32_t n = 1; n <= 8; n++) {
396 GemmMicrokernelTester()
397 .mr(1)
398 .nr(8)
399 .kr(1)
400 .sr(1)
401 .m(m)
402 .n(n)
403 .k(k)
404 .cm_stride(11)
405 .iterations(1)
406 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
407 }
408 }
409 }
410 }
411
412 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, a_offset) {
413 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
414 for (size_t k = 1; k <= 20; k += 5) {
415 GemmMicrokernelTester()
416 .mr(1)
417 .nr(8)
418 .kr(1)
419 .sr(1)
420 .m(1)
421 .n(8)
422 .k(k)
423 .ks(3)
424 .a_offset(23)
425 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
426 }
427 }
428
429 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, zero) {
430 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
431 for (uint32_t mz = 0; mz < 1; mz++) {
432 for (size_t k = 1; k <= 20; k += 5) {
433 GemmMicrokernelTester()
434 .mr(1)
435 .nr(8)
436 .kr(1)
437 .sr(1)
438 .m(1)
439 .n(8)
440 .k(k)
441 .ks(3)
442 .a_offset(23)
443 .zero_index(mz)
444 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
445 }
446 }
447 }
448
449 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmin) {
450 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
451 GemmMicrokernelTester()
452 .mr(1)
453 .nr(8)
454 .kr(1)
455 .sr(1)
456 .m(1)
457 .n(8)
458 .k(4)
459 .qmin(128)
460 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
461 }
462
463 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmax) {
464 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
465 GemmMicrokernelTester()
466 .mr(1)
467 .nr(8)
468 .kr(1)
469 .sr(1)
470 .m(1)
471 .n(8)
472 .k(4)
473 .qmax(128)
474 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
475 }
476
477 TEST(F16_IGEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm) {
478 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
479 GemmMicrokernelTester()
480 .mr(1)
481 .nr(8)
482 .kr(1)
483 .sr(1)
484 .m(1)
485 .n(8)
486 .k(4)
487 .cm_stride(11)
488 .Test(xnn_f16_igemm_minmax_ukernel_1x8__neonfp16arith_ld64);
489 }
490#endif // XNN_ARCH_ARM64
491
492
493#if XNN_ARCH_ARM64
494 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4) {
495 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
496 GemmMicrokernelTester()
497 .mr(4)
498 .nr(8)
499 .kr(1)
500 .sr(1)
501 .m(4)
502 .n(8)
503 .k(4)
504 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
505 }
506
507 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cn) {
508 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
509 GemmMicrokernelTester()
510 .mr(4)
511 .nr(8)
512 .kr(1)
513 .sr(1)
514 .m(4)
515 .n(8)
516 .k(4)
517 .cn_stride(11)
518 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
519 }
520
521 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
522 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
523 for (uint32_t m = 1; m <= 4; m++) {
524 for (uint32_t n = 1; n <= 8; n++) {
525 GemmMicrokernelTester()
526 .mr(4)
527 .nr(8)
528 .kr(1)
529 .sr(1)
530 .m(m)
531 .n(n)
532 .k(4)
533 .iterations(1)
534 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
535 }
536 }
537 }
538
539 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
540 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
541 for (uint32_t m = 1; m <= 4; m++) {
542 GemmMicrokernelTester()
543 .mr(4)
544 .nr(8)
545 .kr(1)
546 .sr(1)
547 .m(m)
548 .n(8)
549 .k(4)
550 .iterations(1)
551 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
552 }
553 }
554
555 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
556 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
557 for (uint32_t n = 1; n <= 8; n++) {
558 GemmMicrokernelTester()
559 .mr(4)
560 .nr(8)
561 .kr(1)
562 .sr(1)
563 .m(4)
564 .n(n)
565 .k(4)
566 .iterations(1)
567 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
568 }
569 }
570
571 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4) {
572 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
573 for (size_t k = 1; k < 4; k++) {
574 GemmMicrokernelTester()
575 .mr(4)
576 .nr(8)
577 .kr(1)
578 .sr(1)
579 .m(4)
580 .n(8)
581 .k(k)
582 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
583 }
584 }
585
586 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
587 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
588 for (size_t k = 1; k < 4; k++) {
589 for (uint32_t m = 1; m <= 4; m++) {
590 for (uint32_t n = 1; n <= 8; n++) {
591 GemmMicrokernelTester()
592 .mr(4)
593 .nr(8)
594 .kr(1)
595 .sr(1)
596 .m(m)
597 .n(n)
598 .k(k)
599 .iterations(1)
600 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
601 }
602 }
603 }
604 }
605
606 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4) {
607 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
608 for (size_t k = 5; k < 8; k++) {
609 GemmMicrokernelTester()
610 .mr(4)
611 .nr(8)
612 .kr(1)
613 .sr(1)
614 .m(4)
615 .n(8)
616 .k(k)
617 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
618 }
619 }
620
621 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
622 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
623 for (size_t k = 5; k < 8; k++) {
624 for (uint32_t m = 1; m <= 4; m++) {
625 for (uint32_t n = 1; n <= 8; n++) {
626 GemmMicrokernelTester()
627 .mr(4)
628 .nr(8)
629 .kr(1)
630 .sr(1)
631 .m(m)
632 .n(n)
633 .k(k)
634 .iterations(1)
635 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
636 }
637 }
638 }
639 }
640
641 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4) {
642 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
643 for (size_t k = 8; k <= 40; k += 4) {
644 GemmMicrokernelTester()
645 .mr(4)
646 .nr(8)
647 .kr(1)
648 .sr(1)
649 .m(4)
650 .n(8)
651 .k(k)
652 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
653 }
654 }
655
656 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
657 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
658 for (size_t k = 8; k <= 40; k += 4) {
659 for (uint32_t m = 1; m <= 4; m++) {
660 for (uint32_t n = 1; n <= 8; n++) {
661 GemmMicrokernelTester()
662 .mr(4)
663 .nr(8)
664 .kr(1)
665 .sr(1)
666 .m(m)
667 .n(n)
668 .k(k)
669 .iterations(1)
670 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
671 }
672 }
673 }
674 }
675
676 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8) {
677 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
678 for (uint32_t n = 9; n < 16; n++) {
679 for (size_t k = 1; k <= 20; k += 5) {
680 GemmMicrokernelTester()
681 .mr(4)
682 .nr(8)
683 .kr(1)
684 .sr(1)
685 .m(4)
686 .n(8)
687 .k(k)
688 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
689 }
690 }
691 }
692
693 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
694 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
695 for (uint32_t n = 9; n < 16; n++) {
696 for (size_t k = 1; k <= 20; k += 5) {
697 GemmMicrokernelTester()
698 .mr(4)
699 .nr(8)
700 .kr(1)
701 .sr(1)
702 .m(4)
703 .n(8)
704 .k(k)
705 .cn_stride(11)
706 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
707 }
708 }
709 }
710
711 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
712 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
713 for (uint32_t n = 9; n < 16; n++) {
714 for (size_t k = 1; k <= 20; k += 5) {
715 for (uint32_t m = 1; m <= 4; m++) {
716 GemmMicrokernelTester()
717 .mr(4)
718 .nr(8)
719 .kr(1)
720 .sr(1)
721 .m(m)
722 .n(n)
723 .k(k)
724 .iterations(1)
725 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
726 }
727 }
728 }
729 }
730
731 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8) {
732 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
733 for (uint32_t n = 16; n <= 24; n += 8) {
734 for (size_t k = 1; k <= 20; k += 5) {
735 GemmMicrokernelTester()
736 .mr(4)
737 .nr(8)
738 .kr(1)
739 .sr(1)
740 .m(4)
741 .n(8)
742 .k(k)
743 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
744 }
745 }
746 }
747
748 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
749 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
750 for (uint32_t n = 16; n <= 24; n += 8) {
751 for (size_t k = 1; k <= 20; k += 5) {
752 GemmMicrokernelTester()
753 .mr(4)
754 .nr(8)
755 .kr(1)
756 .sr(1)
757 .m(4)
758 .n(n)
759 .k(k)
760 .cn_stride(11)
761 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
762 }
763 }
764 }
765
766 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
767 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
768 for (uint32_t n = 16; n <= 24; n += 8) {
769 for (size_t k = 1; k <= 20; k += 5) {
770 for (uint32_t m = 1; m <= 4; m++) {
771 GemmMicrokernelTester()
772 .mr(4)
773 .nr(8)
774 .kr(1)
775 .sr(1)
776 .m(m)
777 .n(n)
778 .k(k)
779 .iterations(1)
780 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
781 }
782 }
783 }
784 }
785
786 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, small_kernel) {
787 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
788 for (size_t k = 1; k <= 20; k += 5) {
789 GemmMicrokernelTester()
790 .mr(4)
791 .nr(8)
792 .kr(1)
793 .sr(1)
794 .m(4)
795 .n(8)
796 .k(k)
797 .ks(3)
798 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
799 }
800 }
801
802 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, small_kernel_subtile) {
803 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
804 for (size_t k = 1; k <= 20; k += 5) {
805 for (uint32_t m = 1; m <= 4; m++) {
806 for (uint32_t n = 1; n <= 8; n++) {
807 GemmMicrokernelTester()
808 .mr(4)
809 .nr(8)
810 .kr(1)
811 .sr(1)
812 .m(m)
813 .n(n)
814 .k(k)
815 .ks(3)
816 .iterations(1)
817 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
818 }
819 }
820 }
821 }
822
823 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) {
824 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
825 for (uint32_t n = 9; n < 16; n++) {
826 for (size_t k = 1; k <= 20; k += 5) {
827 GemmMicrokernelTester()
828 .mr(4)
829 .nr(8)
830 .kr(1)
831 .sr(1)
832 .m(4)
833 .n(8)
834 .k(k)
835 .ks(3)
836 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
837 }
838 }
839 }
840
841 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) {
842 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
843 for (uint32_t n = 16; n <= 24; n += 8) {
844 for (size_t k = 1; k <= 20; k += 5) {
845 GemmMicrokernelTester()
846 .mr(4)
847 .nr(8)
848 .kr(1)
849 .sr(1)
850 .m(4)
851 .n(8)
852 .k(k)
853 .ks(3)
854 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
855 }
856 }
857 }
858
859 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
860 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
861 for (size_t k = 1; k <= 20; k += 5) {
862 for (uint32_t m = 1; m <= 4; m++) {
863 for (uint32_t n = 1; n <= 8; n++) {
864 GemmMicrokernelTester()
865 .mr(4)
866 .nr(8)
867 .kr(1)
868 .sr(1)
869 .m(m)
870 .n(n)
871 .k(k)
872 .cm_stride(11)
873 .iterations(1)
874 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
875 }
876 }
877 }
878 }
879
880 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, a_offset) {
881 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
882 for (size_t k = 1; k <= 20; k += 5) {
883 GemmMicrokernelTester()
884 .mr(4)
885 .nr(8)
886 .kr(1)
887 .sr(1)
888 .m(4)
889 .n(8)
890 .k(k)
891 .ks(3)
892 .a_offset(83)
893 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
894 }
895 }
896
897 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, zero) {
898 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
899 for (uint32_t mz = 0; mz < 4; mz++) {
900 for (size_t k = 1; k <= 20; k += 5) {
901 GemmMicrokernelTester()
902 .mr(4)
903 .nr(8)
904 .kr(1)
905 .sr(1)
906 .m(4)
907 .n(8)
908 .k(k)
909 .ks(3)
910 .a_offset(83)
911 .zero_index(mz)
912 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
913 }
914 }
915 }
916
917 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmin) {
918 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
919 GemmMicrokernelTester()
920 .mr(4)
921 .nr(8)
922 .kr(1)
923 .sr(1)
924 .m(4)
925 .n(8)
926 .k(4)
927 .qmin(128)
928 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
929 }
930
931 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmax) {
932 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
933 GemmMicrokernelTester()
934 .mr(4)
935 .nr(8)
936 .kr(1)
937 .sr(1)
938 .m(4)
939 .n(8)
940 .k(4)
941 .qmax(128)
942 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
943 }
944
945 TEST(F16_IGEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm) {
946 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
947 GemmMicrokernelTester()
948 .mr(4)
949 .nr(8)
950 .kr(1)
951 .sr(1)
952 .m(4)
953 .n(8)
954 .k(4)
955 .cm_stride(11)
956 .Test(xnn_f16_igemm_minmax_ukernel_4x8__neonfp16arith_ld64);
957 }
958#endif // XNN_ARCH_ARM64
959
960
961#if XNN_ARCH_ARM64
962 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4) {
963 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
964 GemmMicrokernelTester()
965 .mr(6)
966 .nr(8)
967 .kr(1)
968 .sr(1)
969 .m(6)
970 .n(8)
971 .k(4)
972 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
973 }
974
975 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cn) {
976 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
977 GemmMicrokernelTester()
978 .mr(6)
979 .nr(8)
980 .kr(1)
981 .sr(1)
982 .m(6)
983 .n(8)
984 .k(4)
985 .cn_stride(11)
986 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
987 }
988
989 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
990 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
991 for (uint32_t m = 1; m <= 6; m++) {
992 for (uint32_t n = 1; n <= 8; n++) {
993 GemmMicrokernelTester()
994 .mr(6)
995 .nr(8)
996 .kr(1)
997 .sr(1)
998 .m(m)
999 .n(n)
1000 .k(4)
1001 .iterations(1)
1002 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1003 }
1004 }
1005 }
1006
1007 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
1008 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1009 for (uint32_t m = 1; m <= 6; m++) {
1010 GemmMicrokernelTester()
1011 .mr(6)
1012 .nr(8)
1013 .kr(1)
1014 .sr(1)
1015 .m(m)
1016 .n(8)
1017 .k(4)
1018 .iterations(1)
1019 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1020 }
1021 }
1022
1023 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
1024 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1025 for (uint32_t n = 1; n <= 8; n++) {
1026 GemmMicrokernelTester()
1027 .mr(6)
1028 .nr(8)
1029 .kr(1)
1030 .sr(1)
1031 .m(6)
1032 .n(n)
1033 .k(4)
1034 .iterations(1)
1035 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1036 }
1037 }
1038
1039 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4) {
1040 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1041 for (size_t k = 1; k < 4; k++) {
1042 GemmMicrokernelTester()
1043 .mr(6)
1044 .nr(8)
1045 .kr(1)
1046 .sr(1)
1047 .m(6)
1048 .n(8)
1049 .k(k)
1050 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1051 }
1052 }
1053
1054 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
1055 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1056 for (size_t k = 1; k < 4; k++) {
1057 for (uint32_t m = 1; m <= 6; m++) {
1058 for (uint32_t n = 1; n <= 8; n++) {
1059 GemmMicrokernelTester()
1060 .mr(6)
1061 .nr(8)
1062 .kr(1)
1063 .sr(1)
1064 .m(m)
1065 .n(n)
1066 .k(k)
1067 .iterations(1)
1068 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1069 }
1070 }
1071 }
1072 }
1073
1074 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4) {
1075 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1076 for (size_t k = 5; k < 8; k++) {
1077 GemmMicrokernelTester()
1078 .mr(6)
1079 .nr(8)
1080 .kr(1)
1081 .sr(1)
1082 .m(6)
1083 .n(8)
1084 .k(k)
1085 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1086 }
1087 }
1088
1089 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
1090 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1091 for (size_t k = 5; k < 8; k++) {
1092 for (uint32_t m = 1; m <= 6; m++) {
1093 for (uint32_t n = 1; n <= 8; n++) {
1094 GemmMicrokernelTester()
1095 .mr(6)
1096 .nr(8)
1097 .kr(1)
1098 .sr(1)
1099 .m(m)
1100 .n(n)
1101 .k(k)
1102 .iterations(1)
1103 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1104 }
1105 }
1106 }
1107 }
1108
1109 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4) {
1110 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1111 for (size_t k = 8; k <= 40; k += 4) {
1112 GemmMicrokernelTester()
1113 .mr(6)
1114 .nr(8)
1115 .kr(1)
1116 .sr(1)
1117 .m(6)
1118 .n(8)
1119 .k(k)
1120 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1121 }
1122 }
1123
1124 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
1125 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1126 for (size_t k = 8; k <= 40; k += 4) {
1127 for (uint32_t m = 1; m <= 6; m++) {
1128 for (uint32_t n = 1; n <= 8; n++) {
1129 GemmMicrokernelTester()
1130 .mr(6)
1131 .nr(8)
1132 .kr(1)
1133 .sr(1)
1134 .m(m)
1135 .n(n)
1136 .k(k)
1137 .iterations(1)
1138 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1139 }
1140 }
1141 }
1142 }
1143
1144 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8) {
1145 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1146 for (uint32_t n = 9; n < 16; n++) {
1147 for (size_t k = 1; k <= 20; k += 5) {
1148 GemmMicrokernelTester()
1149 .mr(6)
1150 .nr(8)
1151 .kr(1)
1152 .sr(1)
1153 .m(6)
1154 .n(8)
1155 .k(k)
1156 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1157 }
1158 }
1159 }
1160
1161 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
1162 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1163 for (uint32_t n = 9; n < 16; n++) {
1164 for (size_t k = 1; k <= 20; k += 5) {
1165 GemmMicrokernelTester()
1166 .mr(6)
1167 .nr(8)
1168 .kr(1)
1169 .sr(1)
1170 .m(6)
1171 .n(8)
1172 .k(k)
1173 .cn_stride(11)
1174 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1175 }
1176 }
1177 }
1178
1179 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
1180 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1181 for (uint32_t n = 9; n < 16; n++) {
1182 for (size_t k = 1; k <= 20; k += 5) {
1183 for (uint32_t m = 1; m <= 6; m++) {
1184 GemmMicrokernelTester()
1185 .mr(6)
1186 .nr(8)
1187 .kr(1)
1188 .sr(1)
1189 .m(m)
1190 .n(n)
1191 .k(k)
1192 .iterations(1)
1193 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1194 }
1195 }
1196 }
1197 }
1198
1199 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8) {
1200 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1201 for (uint32_t n = 16; n <= 24; n += 8) {
1202 for (size_t k = 1; k <= 20; k += 5) {
1203 GemmMicrokernelTester()
1204 .mr(6)
1205 .nr(8)
1206 .kr(1)
1207 .sr(1)
1208 .m(6)
1209 .n(8)
1210 .k(k)
1211 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1212 }
1213 }
1214 }
1215
1216 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
1217 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1218 for (uint32_t n = 16; n <= 24; n += 8) {
1219 for (size_t k = 1; k <= 20; k += 5) {
1220 GemmMicrokernelTester()
1221 .mr(6)
1222 .nr(8)
1223 .kr(1)
1224 .sr(1)
1225 .m(6)
1226 .n(n)
1227 .k(k)
1228 .cn_stride(11)
1229 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1230 }
1231 }
1232 }
1233
1234 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
1235 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1236 for (uint32_t n = 16; n <= 24; n += 8) {
1237 for (size_t k = 1; k <= 20; k += 5) {
1238 for (uint32_t m = 1; m <= 6; m++) {
1239 GemmMicrokernelTester()
1240 .mr(6)
1241 .nr(8)
1242 .kr(1)
1243 .sr(1)
1244 .m(m)
1245 .n(n)
1246 .k(k)
1247 .iterations(1)
1248 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1249 }
1250 }
1251 }
1252 }
1253
1254 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, small_kernel) {
1255 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1256 for (size_t k = 1; k <= 20; k += 5) {
1257 GemmMicrokernelTester()
1258 .mr(6)
1259 .nr(8)
1260 .kr(1)
1261 .sr(1)
1262 .m(6)
1263 .n(8)
1264 .k(k)
1265 .ks(3)
1266 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1267 }
1268 }
1269
1270 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, small_kernel_subtile) {
1271 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1272 for (size_t k = 1; k <= 20; k += 5) {
1273 for (uint32_t m = 1; m <= 6; m++) {
1274 for (uint32_t n = 1; n <= 8; n++) {
1275 GemmMicrokernelTester()
1276 .mr(6)
1277 .nr(8)
1278 .kr(1)
1279 .sr(1)
1280 .m(m)
1281 .n(n)
1282 .k(k)
1283 .ks(3)
1284 .iterations(1)
1285 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1286 }
1287 }
1288 }
1289 }
1290
1291 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) {
1292 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1293 for (uint32_t n = 9; n < 16; n++) {
1294 for (size_t k = 1; k <= 20; k += 5) {
1295 GemmMicrokernelTester()
1296 .mr(6)
1297 .nr(8)
1298 .kr(1)
1299 .sr(1)
1300 .m(6)
1301 .n(8)
1302 .k(k)
1303 .ks(3)
1304 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1305 }
1306 }
1307 }
1308
1309 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) {
1310 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1311 for (uint32_t n = 16; n <= 24; n += 8) {
1312 for (size_t k = 1; k <= 20; k += 5) {
1313 GemmMicrokernelTester()
1314 .mr(6)
1315 .nr(8)
1316 .kr(1)
1317 .sr(1)
1318 .m(6)
1319 .n(8)
1320 .k(k)
1321 .ks(3)
1322 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1323 }
1324 }
1325 }
1326
1327 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
1328 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1329 for (size_t k = 1; k <= 20; k += 5) {
1330 for (uint32_t m = 1; m <= 6; m++) {
1331 for (uint32_t n = 1; n <= 8; n++) {
1332 GemmMicrokernelTester()
1333 .mr(6)
1334 .nr(8)
1335 .kr(1)
1336 .sr(1)
1337 .m(m)
1338 .n(n)
1339 .k(k)
1340 .cm_stride(11)
1341 .iterations(1)
1342 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1343 }
1344 }
1345 }
1346 }
1347
1348 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, a_offset) {
1349 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1350 for (size_t k = 1; k <= 20; k += 5) {
1351 GemmMicrokernelTester()
1352 .mr(6)
1353 .nr(8)
1354 .kr(1)
1355 .sr(1)
1356 .m(6)
1357 .n(8)
1358 .k(k)
1359 .ks(3)
1360 .a_offset(127)
1361 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1362 }
1363 }
1364
1365 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, zero) {
1366 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1367 for (uint32_t mz = 0; mz < 6; mz++) {
1368 for (size_t k = 1; k <= 20; k += 5) {
1369 GemmMicrokernelTester()
1370 .mr(6)
1371 .nr(8)
1372 .kr(1)
1373 .sr(1)
1374 .m(6)
1375 .n(8)
1376 .k(k)
1377 .ks(3)
1378 .a_offset(127)
1379 .zero_index(mz)
1380 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1381 }
1382 }
1383 }
1384
1385 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmin) {
1386 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1387 GemmMicrokernelTester()
1388 .mr(6)
1389 .nr(8)
1390 .kr(1)
1391 .sr(1)
1392 .m(6)
1393 .n(8)
1394 .k(4)
1395 .qmin(128)
1396 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1397 }
1398
1399 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmax) {
1400 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1401 GemmMicrokernelTester()
1402 .mr(6)
1403 .nr(8)
1404 .kr(1)
1405 .sr(1)
1406 .m(6)
1407 .n(8)
1408 .k(4)
1409 .qmax(128)
1410 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1411 }
1412
1413 TEST(F16_IGEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm) {
1414 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1415 GemmMicrokernelTester()
1416 .mr(6)
1417 .nr(8)
1418 .kr(1)
1419 .sr(1)
1420 .m(6)
1421 .n(8)
1422 .k(4)
1423 .cm_stride(11)
1424 .Test(xnn_f16_igemm_minmax_ukernel_6x8__neonfp16arith_ld64);
1425 }
1426#endif // XNN_ARCH_ARM64
1427
1428
1429#if XNN_ARCH_ARM64
1430 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4) {
1431 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1432 GemmMicrokernelTester()
1433 .mr(8)
1434 .nr(8)
1435 .kr(1)
1436 .sr(1)
1437 .m(8)
1438 .n(8)
1439 .k(4)
1440 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1441 }
1442
1443 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cn) {
1444 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1445 GemmMicrokernelTester()
1446 .mr(8)
1447 .nr(8)
1448 .kr(1)
1449 .sr(1)
1450 .m(8)
1451 .n(8)
1452 .k(4)
1453 .cn_stride(11)
1454 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1455 }
1456
1457 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
1458 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1459 for (uint32_t m = 1; m <= 8; m++) {
1460 for (uint32_t n = 1; n <= 8; n++) {
1461 GemmMicrokernelTester()
1462 .mr(8)
1463 .nr(8)
1464 .kr(1)
1465 .sr(1)
1466 .m(m)
1467 .n(n)
1468 .k(4)
1469 .iterations(1)
1470 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1471 }
1472 }
1473 }
1474
1475 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
1476 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1477 for (uint32_t m = 1; m <= 8; m++) {
1478 GemmMicrokernelTester()
1479 .mr(8)
1480 .nr(8)
1481 .kr(1)
1482 .sr(1)
1483 .m(m)
1484 .n(8)
1485 .k(4)
1486 .iterations(1)
1487 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1488 }
1489 }
1490
1491 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
1492 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1493 for (uint32_t n = 1; n <= 8; n++) {
1494 GemmMicrokernelTester()
1495 .mr(8)
1496 .nr(8)
1497 .kr(1)
1498 .sr(1)
1499 .m(8)
1500 .n(n)
1501 .k(4)
1502 .iterations(1)
1503 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1504 }
1505 }
1506
1507 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4) {
1508 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1509 for (size_t k = 1; k < 4; k++) {
1510 GemmMicrokernelTester()
1511 .mr(8)
1512 .nr(8)
1513 .kr(1)
1514 .sr(1)
1515 .m(8)
1516 .n(8)
1517 .k(k)
1518 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1519 }
1520 }
1521
1522 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
1523 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1524 for (size_t k = 1; k < 4; k++) {
1525 for (uint32_t m = 1; m <= 8; m++) {
1526 for (uint32_t n = 1; n <= 8; n++) {
1527 GemmMicrokernelTester()
1528 .mr(8)
1529 .nr(8)
1530 .kr(1)
1531 .sr(1)
1532 .m(m)
1533 .n(n)
1534 .k(k)
1535 .iterations(1)
1536 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1537 }
1538 }
1539 }
1540 }
1541
1542 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4) {
1543 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1544 for (size_t k = 5; k < 8; k++) {
1545 GemmMicrokernelTester()
1546 .mr(8)
1547 .nr(8)
1548 .kr(1)
1549 .sr(1)
1550 .m(8)
1551 .n(8)
1552 .k(k)
1553 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1554 }
1555 }
1556
1557 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
1558 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1559 for (size_t k = 5; k < 8; k++) {
1560 for (uint32_t m = 1; m <= 8; m++) {
1561 for (uint32_t n = 1; n <= 8; n++) {
1562 GemmMicrokernelTester()
1563 .mr(8)
1564 .nr(8)
1565 .kr(1)
1566 .sr(1)
1567 .m(m)
1568 .n(n)
1569 .k(k)
1570 .iterations(1)
1571 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1572 }
1573 }
1574 }
1575 }
1576
1577 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4) {
1578 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1579 for (size_t k = 8; k <= 40; k += 4) {
1580 GemmMicrokernelTester()
1581 .mr(8)
1582 .nr(8)
1583 .kr(1)
1584 .sr(1)
1585 .m(8)
1586 .n(8)
1587 .k(k)
1588 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1589 }
1590 }
1591
1592 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
1593 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1594 for (size_t k = 8; k <= 40; k += 4) {
1595 for (uint32_t m = 1; m <= 8; m++) {
1596 for (uint32_t n = 1; n <= 8; n++) {
1597 GemmMicrokernelTester()
1598 .mr(8)
1599 .nr(8)
1600 .kr(1)
1601 .sr(1)
1602 .m(m)
1603 .n(n)
1604 .k(k)
1605 .iterations(1)
1606 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1607 }
1608 }
1609 }
1610 }
1611
1612 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8) {
1613 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1614 for (uint32_t n = 9; n < 16; n++) {
1615 for (size_t k = 1; k <= 20; k += 5) {
1616 GemmMicrokernelTester()
1617 .mr(8)
1618 .nr(8)
1619 .kr(1)
1620 .sr(1)
1621 .m(8)
1622 .n(8)
1623 .k(k)
1624 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1625 }
1626 }
1627 }
1628
1629 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
1630 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1631 for (uint32_t n = 9; n < 16; n++) {
1632 for (size_t k = 1; k <= 20; k += 5) {
1633 GemmMicrokernelTester()
1634 .mr(8)
1635 .nr(8)
1636 .kr(1)
1637 .sr(1)
1638 .m(8)
1639 .n(8)
1640 .k(k)
1641 .cn_stride(11)
1642 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1643 }
1644 }
1645 }
1646
1647 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
1648 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1649 for (uint32_t n = 9; n < 16; n++) {
1650 for (size_t k = 1; k <= 20; k += 5) {
1651 for (uint32_t m = 1; m <= 8; m++) {
1652 GemmMicrokernelTester()
1653 .mr(8)
1654 .nr(8)
1655 .kr(1)
1656 .sr(1)
1657 .m(m)
1658 .n(n)
1659 .k(k)
1660 .iterations(1)
1661 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1662 }
1663 }
1664 }
1665 }
1666
1667 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8) {
1668 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1669 for (uint32_t n = 16; n <= 24; n += 8) {
1670 for (size_t k = 1; k <= 20; k += 5) {
1671 GemmMicrokernelTester()
1672 .mr(8)
1673 .nr(8)
1674 .kr(1)
1675 .sr(1)
1676 .m(8)
1677 .n(8)
1678 .k(k)
1679 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1680 }
1681 }
1682 }
1683
1684 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
1685 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1686 for (uint32_t n = 16; n <= 24; n += 8) {
1687 for (size_t k = 1; k <= 20; k += 5) {
1688 GemmMicrokernelTester()
1689 .mr(8)
1690 .nr(8)
1691 .kr(1)
1692 .sr(1)
1693 .m(8)
1694 .n(n)
1695 .k(k)
1696 .cn_stride(11)
1697 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1698 }
1699 }
1700 }
1701
1702 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
1703 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1704 for (uint32_t n = 16; n <= 24; n += 8) {
1705 for (size_t k = 1; k <= 20; k += 5) {
1706 for (uint32_t m = 1; m <= 8; m++) {
1707 GemmMicrokernelTester()
1708 .mr(8)
1709 .nr(8)
1710 .kr(1)
1711 .sr(1)
1712 .m(m)
1713 .n(n)
1714 .k(k)
1715 .iterations(1)
1716 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1717 }
1718 }
1719 }
1720 }
1721
1722 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, small_kernel) {
1723 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1724 for (size_t k = 1; k <= 20; k += 5) {
1725 GemmMicrokernelTester()
1726 .mr(8)
1727 .nr(8)
1728 .kr(1)
1729 .sr(1)
1730 .m(8)
1731 .n(8)
1732 .k(k)
1733 .ks(3)
1734 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1735 }
1736 }
1737
1738 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, small_kernel_subtile) {
1739 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1740 for (size_t k = 1; k <= 20; k += 5) {
1741 for (uint32_t m = 1; m <= 8; m++) {
1742 for (uint32_t n = 1; n <= 8; n++) {
1743 GemmMicrokernelTester()
1744 .mr(8)
1745 .nr(8)
1746 .kr(1)
1747 .sr(1)
1748 .m(m)
1749 .n(n)
1750 .k(k)
1751 .ks(3)
1752 .iterations(1)
1753 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1754 }
1755 }
1756 }
1757 }
1758
1759 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_small_kernel) {
1760 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1761 for (uint32_t n = 9; n < 16; n++) {
1762 for (size_t k = 1; k <= 20; k += 5) {
1763 GemmMicrokernelTester()
1764 .mr(8)
1765 .nr(8)
1766 .kr(1)
1767 .sr(1)
1768 .m(8)
1769 .n(8)
1770 .k(k)
1771 .ks(3)
1772 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1773 }
1774 }
1775 }
1776
1777 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_small_kernel) {
1778 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1779 for (uint32_t n = 16; n <= 24; n += 8) {
1780 for (size_t k = 1; k <= 20; k += 5) {
1781 GemmMicrokernelTester()
1782 .mr(8)
1783 .nr(8)
1784 .kr(1)
1785 .sr(1)
1786 .m(8)
1787 .n(8)
1788 .k(k)
1789 .ks(3)
1790 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1791 }
1792 }
1793 }
1794
1795 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
1796 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1797 for (size_t k = 1; k <= 20; k += 5) {
1798 for (uint32_t m = 1; m <= 8; m++) {
1799 for (uint32_t n = 1; n <= 8; n++) {
1800 GemmMicrokernelTester()
1801 .mr(8)
1802 .nr(8)
1803 .kr(1)
1804 .sr(1)
1805 .m(m)
1806 .n(n)
1807 .k(k)
1808 .cm_stride(11)
1809 .iterations(1)
1810 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1811 }
1812 }
1813 }
1814 }
1815
1816 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, a_offset) {
1817 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1818 for (size_t k = 1; k <= 20; k += 5) {
1819 GemmMicrokernelTester()
1820 .mr(8)
1821 .nr(8)
1822 .kr(1)
1823 .sr(1)
1824 .m(8)
1825 .n(8)
1826 .k(k)
1827 .ks(3)
1828 .a_offset(163)
1829 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1830 }
1831 }
1832
1833 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, zero) {
1834 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1835 for (uint32_t mz = 0; mz < 8; mz++) {
1836 for (size_t k = 1; k <= 20; k += 5) {
1837 GemmMicrokernelTester()
1838 .mr(8)
1839 .nr(8)
1840 .kr(1)
1841 .sr(1)
1842 .m(8)
1843 .n(8)
1844 .k(k)
1845 .ks(3)
1846 .a_offset(163)
1847 .zero_index(mz)
1848 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1849 }
1850 }
1851 }
1852
1853 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmin) {
1854 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1855 GemmMicrokernelTester()
1856 .mr(8)
1857 .nr(8)
1858 .kr(1)
1859 .sr(1)
1860 .m(8)
1861 .n(8)
1862 .k(4)
1863 .qmin(128)
1864 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1865 }
1866
1867 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmax) {
1868 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1869 GemmMicrokernelTester()
1870 .mr(8)
1871 .nr(8)
1872 .kr(1)
1873 .sr(1)
1874 .m(8)
1875 .n(8)
1876 .k(4)
1877 .qmax(128)
1878 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1879 }
1880
1881 TEST(F16_IGEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm) {
1882 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1883 GemmMicrokernelTester()
1884 .mr(8)
1885 .nr(8)
1886 .kr(1)
1887 .sr(1)
1888 .m(8)
1889 .n(8)
1890 .k(4)
1891 .cm_stride(11)
1892 .Test(xnn_f16_igemm_minmax_ukernel_8x8__neonfp16arith_ld64);
1893 }
1894#endif // XNN_ARCH_ARM64
Frank Barchard3f9f99f2020-05-06 01:12:04 -07001895
1896
1897#if XNN_ARCH_ARM64
1898 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4) {
1899 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1900 GemmMicrokernelTester()
1901 .mr(1)
1902 .nr(16)
1903 .kr(1)
1904 .sr(1)
1905 .m(1)
1906 .n(16)
1907 .k(4)
1908 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1909 }
1910
1911 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cn) {
1912 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1913 GemmMicrokernelTester()
1914 .mr(1)
1915 .nr(16)
1916 .kr(1)
1917 .sr(1)
1918 .m(1)
1919 .n(16)
1920 .k(4)
1921 .cn_stride(19)
1922 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1923 }
1924
1925 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
1926 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1927 for (uint32_t m = 1; m <= 1; m++) {
1928 for (uint32_t n = 1; n <= 16; n++) {
1929 GemmMicrokernelTester()
1930 .mr(1)
1931 .nr(16)
1932 .kr(1)
1933 .sr(1)
1934 .m(m)
1935 .n(n)
1936 .k(4)
1937 .iterations(1)
1938 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1939 }
1940 }
1941 }
1942
1943 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
1944 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1945 for (uint32_t m = 1; m <= 1; m++) {
1946 GemmMicrokernelTester()
1947 .mr(1)
1948 .nr(16)
1949 .kr(1)
1950 .sr(1)
1951 .m(m)
1952 .n(16)
1953 .k(4)
1954 .iterations(1)
1955 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1956 }
1957 }
1958
1959 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
1960 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1961 for (uint32_t n = 1; n <= 16; n++) {
1962 GemmMicrokernelTester()
1963 .mr(1)
1964 .nr(16)
1965 .kr(1)
1966 .sr(1)
1967 .m(1)
1968 .n(n)
1969 .k(4)
1970 .iterations(1)
1971 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1972 }
1973 }
1974
1975 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4) {
1976 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1977 for (size_t k = 1; k < 4; k++) {
1978 GemmMicrokernelTester()
1979 .mr(1)
1980 .nr(16)
1981 .kr(1)
1982 .sr(1)
1983 .m(1)
1984 .n(16)
1985 .k(k)
1986 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1987 }
1988 }
1989
1990 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
1991 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1992 for (size_t k = 1; k < 4; k++) {
1993 for (uint32_t m = 1; m <= 1; m++) {
1994 for (uint32_t n = 1; n <= 16; n++) {
1995 GemmMicrokernelTester()
1996 .mr(1)
1997 .nr(16)
1998 .kr(1)
1999 .sr(1)
2000 .m(m)
2001 .n(n)
2002 .k(k)
2003 .iterations(1)
2004 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2005 }
2006 }
2007 }
2008 }
2009
2010 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4) {
2011 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2012 for (size_t k = 5; k < 8; k++) {
2013 GemmMicrokernelTester()
2014 .mr(1)
2015 .nr(16)
2016 .kr(1)
2017 .sr(1)
2018 .m(1)
2019 .n(16)
2020 .k(k)
2021 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2022 }
2023 }
2024
2025 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
2026 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2027 for (size_t k = 5; k < 8; k++) {
2028 for (uint32_t m = 1; m <= 1; m++) {
2029 for (uint32_t n = 1; n <= 16; n++) {
2030 GemmMicrokernelTester()
2031 .mr(1)
2032 .nr(16)
2033 .kr(1)
2034 .sr(1)
2035 .m(m)
2036 .n(n)
2037 .k(k)
2038 .iterations(1)
2039 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2040 }
2041 }
2042 }
2043 }
2044
2045 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4) {
2046 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2047 for (size_t k = 8; k <= 40; k += 4) {
2048 GemmMicrokernelTester()
2049 .mr(1)
2050 .nr(16)
2051 .kr(1)
2052 .sr(1)
2053 .m(1)
2054 .n(16)
2055 .k(k)
2056 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2057 }
2058 }
2059
2060 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
2061 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2062 for (size_t k = 8; k <= 40; k += 4) {
2063 for (uint32_t m = 1; m <= 1; m++) {
2064 for (uint32_t n = 1; n <= 16; n++) {
2065 GemmMicrokernelTester()
2066 .mr(1)
2067 .nr(16)
2068 .kr(1)
2069 .sr(1)
2070 .m(m)
2071 .n(n)
2072 .k(k)
2073 .iterations(1)
2074 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2075 }
2076 }
2077 }
2078 }
2079
2080 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16) {
2081 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2082 for (uint32_t n = 17; n < 32; n++) {
2083 for (size_t k = 1; k <= 20; k += 5) {
2084 GemmMicrokernelTester()
2085 .mr(1)
2086 .nr(16)
2087 .kr(1)
2088 .sr(1)
2089 .m(1)
2090 .n(16)
2091 .k(k)
2092 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2093 }
2094 }
2095 }
2096
2097 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
2098 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2099 for (uint32_t n = 17; n < 32; n++) {
2100 for (size_t k = 1; k <= 20; k += 5) {
2101 GemmMicrokernelTester()
2102 .mr(1)
2103 .nr(16)
2104 .kr(1)
2105 .sr(1)
2106 .m(1)
2107 .n(16)
2108 .k(k)
2109 .cn_stride(19)
2110 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2111 }
2112 }
2113 }
2114
2115 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
2116 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2117 for (uint32_t n = 17; n < 32; n++) {
2118 for (size_t k = 1; k <= 20; k += 5) {
2119 for (uint32_t m = 1; m <= 1; m++) {
2120 GemmMicrokernelTester()
2121 .mr(1)
2122 .nr(16)
2123 .kr(1)
2124 .sr(1)
2125 .m(m)
2126 .n(n)
2127 .k(k)
2128 .iterations(1)
2129 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2130 }
2131 }
2132 }
2133 }
2134
2135 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16) {
2136 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2137 for (uint32_t n = 32; n <= 48; n += 16) {
2138 for (size_t k = 1; k <= 20; k += 5) {
2139 GemmMicrokernelTester()
2140 .mr(1)
2141 .nr(16)
2142 .kr(1)
2143 .sr(1)
2144 .m(1)
2145 .n(16)
2146 .k(k)
2147 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2148 }
2149 }
2150 }
2151
2152 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
2153 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2154 for (uint32_t n = 32; n <= 48; n += 16) {
2155 for (size_t k = 1; k <= 20; k += 5) {
2156 GemmMicrokernelTester()
2157 .mr(1)
2158 .nr(16)
2159 .kr(1)
2160 .sr(1)
2161 .m(1)
2162 .n(n)
2163 .k(k)
2164 .cn_stride(19)
2165 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2166 }
2167 }
2168 }
2169
2170 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
2171 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2172 for (uint32_t n = 32; n <= 48; n += 16) {
2173 for (size_t k = 1; k <= 20; k += 5) {
2174 for (uint32_t m = 1; m <= 1; m++) {
2175 GemmMicrokernelTester()
2176 .mr(1)
2177 .nr(16)
2178 .kr(1)
2179 .sr(1)
2180 .m(m)
2181 .n(n)
2182 .k(k)
2183 .iterations(1)
2184 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2185 }
2186 }
2187 }
2188 }
2189
2190 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, small_kernel) {
2191 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2192 for (size_t k = 1; k <= 20; k += 5) {
2193 GemmMicrokernelTester()
2194 .mr(1)
2195 .nr(16)
2196 .kr(1)
2197 .sr(1)
2198 .m(1)
2199 .n(16)
2200 .k(k)
2201 .ks(3)
2202 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2203 }
2204 }
2205
2206 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, small_kernel_subtile) {
2207 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2208 for (size_t k = 1; k <= 20; k += 5) {
2209 for (uint32_t m = 1; m <= 1; m++) {
2210 for (uint32_t n = 1; n <= 16; n++) {
2211 GemmMicrokernelTester()
2212 .mr(1)
2213 .nr(16)
2214 .kr(1)
2215 .sr(1)
2216 .m(m)
2217 .n(n)
2218 .k(k)
2219 .ks(3)
2220 .iterations(1)
2221 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2222 }
2223 }
2224 }
2225 }
2226
2227 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) {
2228 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2229 for (uint32_t n = 17; n < 32; n++) {
2230 for (size_t k = 1; k <= 20; k += 5) {
2231 GemmMicrokernelTester()
2232 .mr(1)
2233 .nr(16)
2234 .kr(1)
2235 .sr(1)
2236 .m(1)
2237 .n(16)
2238 .k(k)
2239 .ks(3)
2240 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2241 }
2242 }
2243 }
2244
2245 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) {
2246 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2247 for (uint32_t n = 32; n <= 48; n += 16) {
2248 for (size_t k = 1; k <= 20; k += 5) {
2249 GemmMicrokernelTester()
2250 .mr(1)
2251 .nr(16)
2252 .kr(1)
2253 .sr(1)
2254 .m(1)
2255 .n(16)
2256 .k(k)
2257 .ks(3)
2258 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2259 }
2260 }
2261 }
2262
2263 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
2264 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2265 for (size_t k = 1; k <= 20; k += 5) {
2266 for (uint32_t m = 1; m <= 1; m++) {
2267 for (uint32_t n = 1; n <= 16; n++) {
2268 GemmMicrokernelTester()
2269 .mr(1)
2270 .nr(16)
2271 .kr(1)
2272 .sr(1)
2273 .m(m)
2274 .n(n)
2275 .k(k)
2276 .cm_stride(19)
2277 .iterations(1)
2278 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2279 }
2280 }
2281 }
2282 }
2283
2284 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, a_offset) {
2285 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2286 for (size_t k = 1; k <= 20; k += 5) {
2287 GemmMicrokernelTester()
2288 .mr(1)
2289 .nr(16)
2290 .kr(1)
2291 .sr(1)
2292 .m(1)
2293 .n(16)
2294 .k(k)
2295 .ks(3)
2296 .a_offset(23)
2297 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2298 }
2299 }
2300
2301 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, zero) {
2302 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2303 for (uint32_t mz = 0; mz < 1; mz++) {
2304 for (size_t k = 1; k <= 20; k += 5) {
2305 GemmMicrokernelTester()
2306 .mr(1)
2307 .nr(16)
2308 .kr(1)
2309 .sr(1)
2310 .m(1)
2311 .n(16)
2312 .k(k)
2313 .ks(3)
2314 .a_offset(23)
2315 .zero_index(mz)
2316 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2317 }
2318 }
2319 }
2320
2321 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmin) {
2322 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2323 GemmMicrokernelTester()
2324 .mr(1)
2325 .nr(16)
2326 .kr(1)
2327 .sr(1)
2328 .m(1)
2329 .n(16)
2330 .k(4)
2331 .qmin(128)
2332 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2333 }
2334
2335 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmax) {
2336 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2337 GemmMicrokernelTester()
2338 .mr(1)
2339 .nr(16)
2340 .kr(1)
2341 .sr(1)
2342 .m(1)
2343 .n(16)
2344 .k(4)
2345 .qmax(128)
2346 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2347 }
2348
2349 TEST(F16_IGEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm) {
2350 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2351 GemmMicrokernelTester()
2352 .mr(1)
2353 .nr(16)
2354 .kr(1)
2355 .sr(1)
2356 .m(1)
2357 .n(16)
2358 .k(4)
2359 .cm_stride(19)
2360 .Test(xnn_f16_igemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2361 }
2362#endif // XNN_ARCH_ARM64
2363
2364
2365#if XNN_ARCH_ARM64
2366 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4) {
2367 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2368 GemmMicrokernelTester()
2369 .mr(4)
2370 .nr(16)
2371 .kr(1)
2372 .sr(1)
2373 .m(4)
2374 .n(16)
2375 .k(4)
2376 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2377 }
2378
2379 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cn) {
2380 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2381 GemmMicrokernelTester()
2382 .mr(4)
2383 .nr(16)
2384 .kr(1)
2385 .sr(1)
2386 .m(4)
2387 .n(16)
2388 .k(4)
2389 .cn_stride(19)
2390 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2391 }
2392
2393 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
2394 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2395 for (uint32_t m = 1; m <= 4; m++) {
2396 for (uint32_t n = 1; n <= 16; n++) {
2397 GemmMicrokernelTester()
2398 .mr(4)
2399 .nr(16)
2400 .kr(1)
2401 .sr(1)
2402 .m(m)
2403 .n(n)
2404 .k(4)
2405 .iterations(1)
2406 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2407 }
2408 }
2409 }
2410
2411 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
2412 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2413 for (uint32_t m = 1; m <= 4; m++) {
2414 GemmMicrokernelTester()
2415 .mr(4)
2416 .nr(16)
2417 .kr(1)
2418 .sr(1)
2419 .m(m)
2420 .n(16)
2421 .k(4)
2422 .iterations(1)
2423 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2424 }
2425 }
2426
2427 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
2428 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2429 for (uint32_t n = 1; n <= 16; n++) {
2430 GemmMicrokernelTester()
2431 .mr(4)
2432 .nr(16)
2433 .kr(1)
2434 .sr(1)
2435 .m(4)
2436 .n(n)
2437 .k(4)
2438 .iterations(1)
2439 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2440 }
2441 }
2442
2443 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4) {
2444 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2445 for (size_t k = 1; k < 4; k++) {
2446 GemmMicrokernelTester()
2447 .mr(4)
2448 .nr(16)
2449 .kr(1)
2450 .sr(1)
2451 .m(4)
2452 .n(16)
2453 .k(k)
2454 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2455 }
2456 }
2457
2458 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
2459 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2460 for (size_t k = 1; k < 4; k++) {
2461 for (uint32_t m = 1; m <= 4; m++) {
2462 for (uint32_t n = 1; n <= 16; n++) {
2463 GemmMicrokernelTester()
2464 .mr(4)
2465 .nr(16)
2466 .kr(1)
2467 .sr(1)
2468 .m(m)
2469 .n(n)
2470 .k(k)
2471 .iterations(1)
2472 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2473 }
2474 }
2475 }
2476 }
2477
2478 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4) {
2479 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2480 for (size_t k = 5; k < 8; k++) {
2481 GemmMicrokernelTester()
2482 .mr(4)
2483 .nr(16)
2484 .kr(1)
2485 .sr(1)
2486 .m(4)
2487 .n(16)
2488 .k(k)
2489 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2490 }
2491 }
2492
2493 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
2494 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2495 for (size_t k = 5; k < 8; k++) {
2496 for (uint32_t m = 1; m <= 4; m++) {
2497 for (uint32_t n = 1; n <= 16; n++) {
2498 GemmMicrokernelTester()
2499 .mr(4)
2500 .nr(16)
2501 .kr(1)
2502 .sr(1)
2503 .m(m)
2504 .n(n)
2505 .k(k)
2506 .iterations(1)
2507 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2508 }
2509 }
2510 }
2511 }
2512
2513 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4) {
2514 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2515 for (size_t k = 8; k <= 40; k += 4) {
2516 GemmMicrokernelTester()
2517 .mr(4)
2518 .nr(16)
2519 .kr(1)
2520 .sr(1)
2521 .m(4)
2522 .n(16)
2523 .k(k)
2524 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2525 }
2526 }
2527
2528 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
2529 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2530 for (size_t k = 8; k <= 40; k += 4) {
2531 for (uint32_t m = 1; m <= 4; m++) {
2532 for (uint32_t n = 1; n <= 16; n++) {
2533 GemmMicrokernelTester()
2534 .mr(4)
2535 .nr(16)
2536 .kr(1)
2537 .sr(1)
2538 .m(m)
2539 .n(n)
2540 .k(k)
2541 .iterations(1)
2542 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2543 }
2544 }
2545 }
2546 }
2547
2548 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16) {
2549 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2550 for (uint32_t n = 17; n < 32; n++) {
2551 for (size_t k = 1; k <= 20; k += 5) {
2552 GemmMicrokernelTester()
2553 .mr(4)
2554 .nr(16)
2555 .kr(1)
2556 .sr(1)
2557 .m(4)
2558 .n(16)
2559 .k(k)
2560 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2561 }
2562 }
2563 }
2564
2565 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
2566 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2567 for (uint32_t n = 17; n < 32; n++) {
2568 for (size_t k = 1; k <= 20; k += 5) {
2569 GemmMicrokernelTester()
2570 .mr(4)
2571 .nr(16)
2572 .kr(1)
2573 .sr(1)
2574 .m(4)
2575 .n(16)
2576 .k(k)
2577 .cn_stride(19)
2578 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2579 }
2580 }
2581 }
2582
2583 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
2584 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2585 for (uint32_t n = 17; n < 32; n++) {
2586 for (size_t k = 1; k <= 20; k += 5) {
2587 for (uint32_t m = 1; m <= 4; m++) {
2588 GemmMicrokernelTester()
2589 .mr(4)
2590 .nr(16)
2591 .kr(1)
2592 .sr(1)
2593 .m(m)
2594 .n(n)
2595 .k(k)
2596 .iterations(1)
2597 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2598 }
2599 }
2600 }
2601 }
2602
2603 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16) {
2604 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2605 for (uint32_t n = 32; n <= 48; n += 16) {
2606 for (size_t k = 1; k <= 20; k += 5) {
2607 GemmMicrokernelTester()
2608 .mr(4)
2609 .nr(16)
2610 .kr(1)
2611 .sr(1)
2612 .m(4)
2613 .n(16)
2614 .k(k)
2615 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2616 }
2617 }
2618 }
2619
2620 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
2621 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2622 for (uint32_t n = 32; n <= 48; n += 16) {
2623 for (size_t k = 1; k <= 20; k += 5) {
2624 GemmMicrokernelTester()
2625 .mr(4)
2626 .nr(16)
2627 .kr(1)
2628 .sr(1)
2629 .m(4)
2630 .n(n)
2631 .k(k)
2632 .cn_stride(19)
2633 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2634 }
2635 }
2636 }
2637
2638 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
2639 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2640 for (uint32_t n = 32; n <= 48; n += 16) {
2641 for (size_t k = 1; k <= 20; k += 5) {
2642 for (uint32_t m = 1; m <= 4; m++) {
2643 GemmMicrokernelTester()
2644 .mr(4)
2645 .nr(16)
2646 .kr(1)
2647 .sr(1)
2648 .m(m)
2649 .n(n)
2650 .k(k)
2651 .iterations(1)
2652 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2653 }
2654 }
2655 }
2656 }
2657
2658 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, small_kernel) {
2659 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2660 for (size_t k = 1; k <= 20; k += 5) {
2661 GemmMicrokernelTester()
2662 .mr(4)
2663 .nr(16)
2664 .kr(1)
2665 .sr(1)
2666 .m(4)
2667 .n(16)
2668 .k(k)
2669 .ks(3)
2670 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2671 }
2672 }
2673
2674 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, small_kernel_subtile) {
2675 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2676 for (size_t k = 1; k <= 20; k += 5) {
2677 for (uint32_t m = 1; m <= 4; m++) {
2678 for (uint32_t n = 1; n <= 16; n++) {
2679 GemmMicrokernelTester()
2680 .mr(4)
2681 .nr(16)
2682 .kr(1)
2683 .sr(1)
2684 .m(m)
2685 .n(n)
2686 .k(k)
2687 .ks(3)
2688 .iterations(1)
2689 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2690 }
2691 }
2692 }
2693 }
2694
2695 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) {
2696 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2697 for (uint32_t n = 17; n < 32; n++) {
2698 for (size_t k = 1; k <= 20; k += 5) {
2699 GemmMicrokernelTester()
2700 .mr(4)
2701 .nr(16)
2702 .kr(1)
2703 .sr(1)
2704 .m(4)
2705 .n(16)
2706 .k(k)
2707 .ks(3)
2708 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2709 }
2710 }
2711 }
2712
2713 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) {
2714 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2715 for (uint32_t n = 32; n <= 48; n += 16) {
2716 for (size_t k = 1; k <= 20; k += 5) {
2717 GemmMicrokernelTester()
2718 .mr(4)
2719 .nr(16)
2720 .kr(1)
2721 .sr(1)
2722 .m(4)
2723 .n(16)
2724 .k(k)
2725 .ks(3)
2726 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2727 }
2728 }
2729 }
2730
2731 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
2732 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2733 for (size_t k = 1; k <= 20; k += 5) {
2734 for (uint32_t m = 1; m <= 4; m++) {
2735 for (uint32_t n = 1; n <= 16; n++) {
2736 GemmMicrokernelTester()
2737 .mr(4)
2738 .nr(16)
2739 .kr(1)
2740 .sr(1)
2741 .m(m)
2742 .n(n)
2743 .k(k)
2744 .cm_stride(19)
2745 .iterations(1)
2746 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2747 }
2748 }
2749 }
2750 }
2751
2752 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, a_offset) {
2753 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2754 for (size_t k = 1; k <= 20; k += 5) {
2755 GemmMicrokernelTester()
2756 .mr(4)
2757 .nr(16)
2758 .kr(1)
2759 .sr(1)
2760 .m(4)
2761 .n(16)
2762 .k(k)
2763 .ks(3)
2764 .a_offset(83)
2765 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2766 }
2767 }
2768
2769 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, zero) {
2770 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2771 for (uint32_t mz = 0; mz < 4; mz++) {
2772 for (size_t k = 1; k <= 20; k += 5) {
2773 GemmMicrokernelTester()
2774 .mr(4)
2775 .nr(16)
2776 .kr(1)
2777 .sr(1)
2778 .m(4)
2779 .n(16)
2780 .k(k)
2781 .ks(3)
2782 .a_offset(83)
2783 .zero_index(mz)
2784 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2785 }
2786 }
2787 }
2788
2789 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmin) {
2790 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2791 GemmMicrokernelTester()
2792 .mr(4)
2793 .nr(16)
2794 .kr(1)
2795 .sr(1)
2796 .m(4)
2797 .n(16)
2798 .k(4)
2799 .qmin(128)
2800 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2801 }
2802
2803 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmax) {
2804 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2805 GemmMicrokernelTester()
2806 .mr(4)
2807 .nr(16)
2808 .kr(1)
2809 .sr(1)
2810 .m(4)
2811 .n(16)
2812 .k(4)
2813 .qmax(128)
2814 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2815 }
2816
2817 TEST(F16_IGEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm) {
2818 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2819 GemmMicrokernelTester()
2820 .mr(4)
2821 .nr(16)
2822 .kr(1)
2823 .sr(1)
2824 .m(4)
2825 .n(16)
2826 .k(4)
2827 .cm_stride(19)
2828 .Test(xnn_f16_igemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2829 }
2830#endif // XNN_ARCH_ARM64
2831
2832
2833#if XNN_ARCH_ARM64
2834 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4) {
2835 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2836 GemmMicrokernelTester()
2837 .mr(6)
2838 .nr(16)
2839 .kr(1)
2840 .sr(1)
2841 .m(6)
2842 .n(16)
2843 .k(4)
2844 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2845 }
2846
2847 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cn) {
2848 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2849 GemmMicrokernelTester()
2850 .mr(6)
2851 .nr(16)
2852 .kr(1)
2853 .sr(1)
2854 .m(6)
2855 .n(16)
2856 .k(4)
2857 .cn_stride(19)
2858 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2859 }
2860
2861 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
2862 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2863 for (uint32_t m = 1; m <= 6; m++) {
2864 for (uint32_t n = 1; n <= 16; n++) {
2865 GemmMicrokernelTester()
2866 .mr(6)
2867 .nr(16)
2868 .kr(1)
2869 .sr(1)
2870 .m(m)
2871 .n(n)
2872 .k(4)
2873 .iterations(1)
2874 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2875 }
2876 }
2877 }
2878
2879 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
2880 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2881 for (uint32_t m = 1; m <= 6; m++) {
2882 GemmMicrokernelTester()
2883 .mr(6)
2884 .nr(16)
2885 .kr(1)
2886 .sr(1)
2887 .m(m)
2888 .n(16)
2889 .k(4)
2890 .iterations(1)
2891 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2892 }
2893 }
2894
2895 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
2896 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2897 for (uint32_t n = 1; n <= 16; n++) {
2898 GemmMicrokernelTester()
2899 .mr(6)
2900 .nr(16)
2901 .kr(1)
2902 .sr(1)
2903 .m(6)
2904 .n(n)
2905 .k(4)
2906 .iterations(1)
2907 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2908 }
2909 }
2910
2911 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4) {
2912 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2913 for (size_t k = 1; k < 4; k++) {
2914 GemmMicrokernelTester()
2915 .mr(6)
2916 .nr(16)
2917 .kr(1)
2918 .sr(1)
2919 .m(6)
2920 .n(16)
2921 .k(k)
2922 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2923 }
2924 }
2925
2926 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
2927 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2928 for (size_t k = 1; k < 4; k++) {
2929 for (uint32_t m = 1; m <= 6; m++) {
2930 for (uint32_t n = 1; n <= 16; n++) {
2931 GemmMicrokernelTester()
2932 .mr(6)
2933 .nr(16)
2934 .kr(1)
2935 .sr(1)
2936 .m(m)
2937 .n(n)
2938 .k(k)
2939 .iterations(1)
2940 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2941 }
2942 }
2943 }
2944 }
2945
2946 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4) {
2947 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2948 for (size_t k = 5; k < 8; k++) {
2949 GemmMicrokernelTester()
2950 .mr(6)
2951 .nr(16)
2952 .kr(1)
2953 .sr(1)
2954 .m(6)
2955 .n(16)
2956 .k(k)
2957 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2958 }
2959 }
2960
2961 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
2962 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2963 for (size_t k = 5; k < 8; k++) {
2964 for (uint32_t m = 1; m <= 6; m++) {
2965 for (uint32_t n = 1; n <= 16; n++) {
2966 GemmMicrokernelTester()
2967 .mr(6)
2968 .nr(16)
2969 .kr(1)
2970 .sr(1)
2971 .m(m)
2972 .n(n)
2973 .k(k)
2974 .iterations(1)
2975 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2976 }
2977 }
2978 }
2979 }
2980
2981 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4) {
2982 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2983 for (size_t k = 8; k <= 40; k += 4) {
2984 GemmMicrokernelTester()
2985 .mr(6)
2986 .nr(16)
2987 .kr(1)
2988 .sr(1)
2989 .m(6)
2990 .n(16)
2991 .k(k)
2992 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2993 }
2994 }
2995
2996 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
2997 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2998 for (size_t k = 8; k <= 40; k += 4) {
2999 for (uint32_t m = 1; m <= 6; m++) {
3000 for (uint32_t n = 1; n <= 16; n++) {
3001 GemmMicrokernelTester()
3002 .mr(6)
3003 .nr(16)
3004 .kr(1)
3005 .sr(1)
3006 .m(m)
3007 .n(n)
3008 .k(k)
3009 .iterations(1)
3010 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3011 }
3012 }
3013 }
3014 }
3015
3016 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16) {
3017 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3018 for (uint32_t n = 17; n < 32; n++) {
3019 for (size_t k = 1; k <= 20; k += 5) {
3020 GemmMicrokernelTester()
3021 .mr(6)
3022 .nr(16)
3023 .kr(1)
3024 .sr(1)
3025 .m(6)
3026 .n(16)
3027 .k(k)
3028 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3029 }
3030 }
3031 }
3032
3033 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
3034 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3035 for (uint32_t n = 17; n < 32; n++) {
3036 for (size_t k = 1; k <= 20; k += 5) {
3037 GemmMicrokernelTester()
3038 .mr(6)
3039 .nr(16)
3040 .kr(1)
3041 .sr(1)
3042 .m(6)
3043 .n(16)
3044 .k(k)
3045 .cn_stride(19)
3046 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3047 }
3048 }
3049 }
3050
3051 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
3052 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3053 for (uint32_t n = 17; n < 32; n++) {
3054 for (size_t k = 1; k <= 20; k += 5) {
3055 for (uint32_t m = 1; m <= 6; m++) {
3056 GemmMicrokernelTester()
3057 .mr(6)
3058 .nr(16)
3059 .kr(1)
3060 .sr(1)
3061 .m(m)
3062 .n(n)
3063 .k(k)
3064 .iterations(1)
3065 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3066 }
3067 }
3068 }
3069 }
3070
3071 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16) {
3072 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3073 for (uint32_t n = 32; n <= 48; n += 16) {
3074 for (size_t k = 1; k <= 20; k += 5) {
3075 GemmMicrokernelTester()
3076 .mr(6)
3077 .nr(16)
3078 .kr(1)
3079 .sr(1)
3080 .m(6)
3081 .n(16)
3082 .k(k)
3083 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3084 }
3085 }
3086 }
3087
3088 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
3089 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3090 for (uint32_t n = 32; n <= 48; n += 16) {
3091 for (size_t k = 1; k <= 20; k += 5) {
3092 GemmMicrokernelTester()
3093 .mr(6)
3094 .nr(16)
3095 .kr(1)
3096 .sr(1)
3097 .m(6)
3098 .n(n)
3099 .k(k)
3100 .cn_stride(19)
3101 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3102 }
3103 }
3104 }
3105
3106 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
3107 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3108 for (uint32_t n = 32; n <= 48; n += 16) {
3109 for (size_t k = 1; k <= 20; k += 5) {
3110 for (uint32_t m = 1; m <= 6; m++) {
3111 GemmMicrokernelTester()
3112 .mr(6)
3113 .nr(16)
3114 .kr(1)
3115 .sr(1)
3116 .m(m)
3117 .n(n)
3118 .k(k)
3119 .iterations(1)
3120 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3121 }
3122 }
3123 }
3124 }
3125
3126 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, small_kernel) {
3127 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3128 for (size_t k = 1; k <= 20; k += 5) {
3129 GemmMicrokernelTester()
3130 .mr(6)
3131 .nr(16)
3132 .kr(1)
3133 .sr(1)
3134 .m(6)
3135 .n(16)
3136 .k(k)
3137 .ks(3)
3138 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3139 }
3140 }
3141
3142 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, small_kernel_subtile) {
3143 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3144 for (size_t k = 1; k <= 20; k += 5) {
3145 for (uint32_t m = 1; m <= 6; m++) {
3146 for (uint32_t n = 1; n <= 16; n++) {
3147 GemmMicrokernelTester()
3148 .mr(6)
3149 .nr(16)
3150 .kr(1)
3151 .sr(1)
3152 .m(m)
3153 .n(n)
3154 .k(k)
3155 .ks(3)
3156 .iterations(1)
3157 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3158 }
3159 }
3160 }
3161 }
3162
3163 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) {
3164 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3165 for (uint32_t n = 17; n < 32; n++) {
3166 for (size_t k = 1; k <= 20; k += 5) {
3167 GemmMicrokernelTester()
3168 .mr(6)
3169 .nr(16)
3170 .kr(1)
3171 .sr(1)
3172 .m(6)
3173 .n(16)
3174 .k(k)
3175 .ks(3)
3176 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3177 }
3178 }
3179 }
3180
3181 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) {
3182 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3183 for (uint32_t n = 32; n <= 48; n += 16) {
3184 for (size_t k = 1; k <= 20; k += 5) {
3185 GemmMicrokernelTester()
3186 .mr(6)
3187 .nr(16)
3188 .kr(1)
3189 .sr(1)
3190 .m(6)
3191 .n(16)
3192 .k(k)
3193 .ks(3)
3194 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3195 }
3196 }
3197 }
3198
3199 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
3200 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3201 for (size_t k = 1; k <= 20; k += 5) {
3202 for (uint32_t m = 1; m <= 6; m++) {
3203 for (uint32_t n = 1; n <= 16; n++) {
3204 GemmMicrokernelTester()
3205 .mr(6)
3206 .nr(16)
3207 .kr(1)
3208 .sr(1)
3209 .m(m)
3210 .n(n)
3211 .k(k)
3212 .cm_stride(19)
3213 .iterations(1)
3214 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3215 }
3216 }
3217 }
3218 }
3219
3220 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, a_offset) {
3221 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3222 for (size_t k = 1; k <= 20; k += 5) {
3223 GemmMicrokernelTester()
3224 .mr(6)
3225 .nr(16)
3226 .kr(1)
3227 .sr(1)
3228 .m(6)
3229 .n(16)
3230 .k(k)
3231 .ks(3)
3232 .a_offset(127)
3233 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3234 }
3235 }
3236
3237 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, zero) {
3238 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3239 for (uint32_t mz = 0; mz < 6; mz++) {
3240 for (size_t k = 1; k <= 20; k += 5) {
3241 GemmMicrokernelTester()
3242 .mr(6)
3243 .nr(16)
3244 .kr(1)
3245 .sr(1)
3246 .m(6)
3247 .n(16)
3248 .k(k)
3249 .ks(3)
3250 .a_offset(127)
3251 .zero_index(mz)
3252 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3253 }
3254 }
3255 }
3256
3257 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmin) {
3258 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3259 GemmMicrokernelTester()
3260 .mr(6)
3261 .nr(16)
3262 .kr(1)
3263 .sr(1)
3264 .m(6)
3265 .n(16)
3266 .k(4)
3267 .qmin(128)
3268 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3269 }
3270
3271 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmax) {
3272 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3273 GemmMicrokernelTester()
3274 .mr(6)
3275 .nr(16)
3276 .kr(1)
3277 .sr(1)
3278 .m(6)
3279 .n(16)
3280 .k(4)
3281 .qmax(128)
3282 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3283 }
3284
3285 TEST(F16_IGEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm) {
3286 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3287 GemmMicrokernelTester()
3288 .mr(6)
3289 .nr(16)
3290 .kr(1)
3291 .sr(1)
3292 .m(6)
3293 .n(16)
3294 .k(4)
3295 .cm_stride(19)
3296 .Test(xnn_f16_igemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3297 }
3298#endif // XNN_ARCH_ARM64
3299
3300
3301#if XNN_ARCH_ARM64
3302 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4) {
3303 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3304 GemmMicrokernelTester()
3305 .mr(8)
3306 .nr(16)
3307 .kr(1)
3308 .sr(1)
3309 .m(8)
3310 .n(16)
3311 .k(4)
3312 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3313 }
3314
3315 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cn) {
3316 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3317 GemmMicrokernelTester()
3318 .mr(8)
3319 .nr(16)
3320 .kr(1)
3321 .sr(1)
3322 .m(8)
3323 .n(16)
3324 .k(4)
3325 .cn_stride(19)
3326 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3327 }
3328
3329 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
3330 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3331 for (uint32_t m = 1; m <= 8; m++) {
3332 for (uint32_t n = 1; n <= 16; n++) {
3333 GemmMicrokernelTester()
3334 .mr(8)
3335 .nr(16)
3336 .kr(1)
3337 .sr(1)
3338 .m(m)
3339 .n(n)
3340 .k(4)
3341 .iterations(1)
3342 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3343 }
3344 }
3345 }
3346
3347 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
3348 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3349 for (uint32_t m = 1; m <= 8; m++) {
3350 GemmMicrokernelTester()
3351 .mr(8)
3352 .nr(16)
3353 .kr(1)
3354 .sr(1)
3355 .m(m)
3356 .n(16)
3357 .k(4)
3358 .iterations(1)
3359 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3360 }
3361 }
3362
3363 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
3364 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3365 for (uint32_t n = 1; n <= 16; n++) {
3366 GemmMicrokernelTester()
3367 .mr(8)
3368 .nr(16)
3369 .kr(1)
3370 .sr(1)
3371 .m(8)
3372 .n(n)
3373 .k(4)
3374 .iterations(1)
3375 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3376 }
3377 }
3378
3379 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4) {
3380 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3381 for (size_t k = 1; k < 4; k++) {
3382 GemmMicrokernelTester()
3383 .mr(8)
3384 .nr(16)
3385 .kr(1)
3386 .sr(1)
3387 .m(8)
3388 .n(16)
3389 .k(k)
3390 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3391 }
3392 }
3393
3394 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
3395 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3396 for (size_t k = 1; k < 4; k++) {
3397 for (uint32_t m = 1; m <= 8; m++) {
3398 for (uint32_t n = 1; n <= 16; n++) {
3399 GemmMicrokernelTester()
3400 .mr(8)
3401 .nr(16)
3402 .kr(1)
3403 .sr(1)
3404 .m(m)
3405 .n(n)
3406 .k(k)
3407 .iterations(1)
3408 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3409 }
3410 }
3411 }
3412 }
3413
3414 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4) {
3415 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3416 for (size_t k = 5; k < 8; k++) {
3417 GemmMicrokernelTester()
3418 .mr(8)
3419 .nr(16)
3420 .kr(1)
3421 .sr(1)
3422 .m(8)
3423 .n(16)
3424 .k(k)
3425 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3426 }
3427 }
3428
3429 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
3430 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3431 for (size_t k = 5; k < 8; k++) {
3432 for (uint32_t m = 1; m <= 8; m++) {
3433 for (uint32_t n = 1; n <= 16; n++) {
3434 GemmMicrokernelTester()
3435 .mr(8)
3436 .nr(16)
3437 .kr(1)
3438 .sr(1)
3439 .m(m)
3440 .n(n)
3441 .k(k)
3442 .iterations(1)
3443 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3444 }
3445 }
3446 }
3447 }
3448
3449 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4) {
3450 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3451 for (size_t k = 8; k <= 40; k += 4) {
3452 GemmMicrokernelTester()
3453 .mr(8)
3454 .nr(16)
3455 .kr(1)
3456 .sr(1)
3457 .m(8)
3458 .n(16)
3459 .k(k)
3460 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3461 }
3462 }
3463
3464 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
3465 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3466 for (size_t k = 8; k <= 40; k += 4) {
3467 for (uint32_t m = 1; m <= 8; m++) {
3468 for (uint32_t n = 1; n <= 16; n++) {
3469 GemmMicrokernelTester()
3470 .mr(8)
3471 .nr(16)
3472 .kr(1)
3473 .sr(1)
3474 .m(m)
3475 .n(n)
3476 .k(k)
3477 .iterations(1)
3478 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3479 }
3480 }
3481 }
3482 }
3483
3484 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16) {
3485 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3486 for (uint32_t n = 17; n < 32; n++) {
3487 for (size_t k = 1; k <= 20; k += 5) {
3488 GemmMicrokernelTester()
3489 .mr(8)
3490 .nr(16)
3491 .kr(1)
3492 .sr(1)
3493 .m(8)
3494 .n(16)
3495 .k(k)
3496 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3497 }
3498 }
3499 }
3500
3501 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
3502 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3503 for (uint32_t n = 17; n < 32; n++) {
3504 for (size_t k = 1; k <= 20; k += 5) {
3505 GemmMicrokernelTester()
3506 .mr(8)
3507 .nr(16)
3508 .kr(1)
3509 .sr(1)
3510 .m(8)
3511 .n(16)
3512 .k(k)
3513 .cn_stride(19)
3514 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3515 }
3516 }
3517 }
3518
3519 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
3520 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3521 for (uint32_t n = 17; n < 32; n++) {
3522 for (size_t k = 1; k <= 20; k += 5) {
3523 for (uint32_t m = 1; m <= 8; m++) {
3524 GemmMicrokernelTester()
3525 .mr(8)
3526 .nr(16)
3527 .kr(1)
3528 .sr(1)
3529 .m(m)
3530 .n(n)
3531 .k(k)
3532 .iterations(1)
3533 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3534 }
3535 }
3536 }
3537 }
3538
3539 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16) {
3540 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3541 for (uint32_t n = 32; n <= 48; n += 16) {
3542 for (size_t k = 1; k <= 20; k += 5) {
3543 GemmMicrokernelTester()
3544 .mr(8)
3545 .nr(16)
3546 .kr(1)
3547 .sr(1)
3548 .m(8)
3549 .n(16)
3550 .k(k)
3551 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3552 }
3553 }
3554 }
3555
3556 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
3557 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3558 for (uint32_t n = 32; n <= 48; n += 16) {
3559 for (size_t k = 1; k <= 20; k += 5) {
3560 GemmMicrokernelTester()
3561 .mr(8)
3562 .nr(16)
3563 .kr(1)
3564 .sr(1)
3565 .m(8)
3566 .n(n)
3567 .k(k)
3568 .cn_stride(19)
3569 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3570 }
3571 }
3572 }
3573
3574 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
3575 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3576 for (uint32_t n = 32; n <= 48; n += 16) {
3577 for (size_t k = 1; k <= 20; k += 5) {
3578 for (uint32_t m = 1; m <= 8; m++) {
3579 GemmMicrokernelTester()
3580 .mr(8)
3581 .nr(16)
3582 .kr(1)
3583 .sr(1)
3584 .m(m)
3585 .n(n)
3586 .k(k)
3587 .iterations(1)
3588 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3589 }
3590 }
3591 }
3592 }
3593
3594 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, small_kernel) {
3595 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3596 for (size_t k = 1; k <= 20; k += 5) {
3597 GemmMicrokernelTester()
3598 .mr(8)
3599 .nr(16)
3600 .kr(1)
3601 .sr(1)
3602 .m(8)
3603 .n(16)
3604 .k(k)
3605 .ks(3)
3606 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3607 }
3608 }
3609
3610 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, small_kernel_subtile) {
3611 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3612 for (size_t k = 1; k <= 20; k += 5) {
3613 for (uint32_t m = 1; m <= 8; m++) {
3614 for (uint32_t n = 1; n <= 16; n++) {
3615 GemmMicrokernelTester()
3616 .mr(8)
3617 .nr(16)
3618 .kr(1)
3619 .sr(1)
3620 .m(m)
3621 .n(n)
3622 .k(k)
3623 .ks(3)
3624 .iterations(1)
3625 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3626 }
3627 }
3628 }
3629 }
3630
3631 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_small_kernel) {
3632 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3633 for (uint32_t n = 17; n < 32; n++) {
3634 for (size_t k = 1; k <= 20; k += 5) {
3635 GemmMicrokernelTester()
3636 .mr(8)
3637 .nr(16)
3638 .kr(1)
3639 .sr(1)
3640 .m(8)
3641 .n(16)
3642 .k(k)
3643 .ks(3)
3644 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3645 }
3646 }
3647 }
3648
3649 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_small_kernel) {
3650 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3651 for (uint32_t n = 32; n <= 48; n += 16) {
3652 for (size_t k = 1; k <= 20; k += 5) {
3653 GemmMicrokernelTester()
3654 .mr(8)
3655 .nr(16)
3656 .kr(1)
3657 .sr(1)
3658 .m(8)
3659 .n(16)
3660 .k(k)
3661 .ks(3)
3662 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3663 }
3664 }
3665 }
3666
3667 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
3668 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3669 for (size_t k = 1; k <= 20; k += 5) {
3670 for (uint32_t m = 1; m <= 8; m++) {
3671 for (uint32_t n = 1; n <= 16; n++) {
3672 GemmMicrokernelTester()
3673 .mr(8)
3674 .nr(16)
3675 .kr(1)
3676 .sr(1)
3677 .m(m)
3678 .n(n)
3679 .k(k)
3680 .cm_stride(19)
3681 .iterations(1)
3682 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3683 }
3684 }
3685 }
3686 }
3687
3688 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, a_offset) {
3689 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3690 for (size_t k = 1; k <= 20; k += 5) {
3691 GemmMicrokernelTester()
3692 .mr(8)
3693 .nr(16)
3694 .kr(1)
3695 .sr(1)
3696 .m(8)
3697 .n(16)
3698 .k(k)
3699 .ks(3)
3700 .a_offset(163)
3701 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3702 }
3703 }
3704
3705 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, zero) {
3706 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3707 for (uint32_t mz = 0; mz < 8; mz++) {
3708 for (size_t k = 1; k <= 20; k += 5) {
3709 GemmMicrokernelTester()
3710 .mr(8)
3711 .nr(16)
3712 .kr(1)
3713 .sr(1)
3714 .m(8)
3715 .n(16)
3716 .k(k)
3717 .ks(3)
3718 .a_offset(163)
3719 .zero_index(mz)
3720 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3721 }
3722 }
3723 }
3724
3725 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmin) {
3726 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3727 GemmMicrokernelTester()
3728 .mr(8)
3729 .nr(16)
3730 .kr(1)
3731 .sr(1)
3732 .m(8)
3733 .n(16)
3734 .k(4)
3735 .qmin(128)
3736 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3737 }
3738
3739 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmax) {
3740 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3741 GemmMicrokernelTester()
3742 .mr(8)
3743 .nr(16)
3744 .kr(1)
3745 .sr(1)
3746 .m(8)
3747 .n(16)
3748 .k(4)
3749 .qmax(128)
3750 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3751 }
3752
3753 TEST(F16_IGEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm) {
3754 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3755 GemmMicrokernelTester()
3756 .mr(8)
3757 .nr(16)
3758 .kr(1)
3759 .sr(1)
3760 .m(8)
3761 .n(16)
3762 .k(4)
3763 .cm_stride(19)
3764 .Test(xnn_f16_igemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3765 }
3766#endif // XNN_ARCH_ARM64