blob: 3aa323b0e90f5de5d9d2d91b349a10ed89f3cbd7 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/f16-gemm-minmax.yaml
// Generator: tools/generate-gemm-test.py
12
13
XNNPACK Teamb455b122019-09-27 18:10:33 -070014#include <gtest/gtest.h>
15
Marat Dukhan1dadbf72019-10-01 10:46:20 -070016#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
XNNPACK Teamb455b122019-09-27 18:10:33 -070019#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070022#include "gemm-microkernel-tester.h"
23
24
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025#if XNN_ARCH_ARM64
Frank Barchard1f4e4612020-04-13 18:24:54 -070026 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4) {
27 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(4)
36 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
37 }
38
39 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(4)
49 .cn_stride(11)
50 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
51 }
52
53 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
54 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
55 GemmMicrokernelTester()
56 .mr(1)
57 .nr(8)
58 .kr(1)
59 .sr(1)
60 .m(1)
61 .n(8)
62 .k(4)
63 .a_stride(7)
64 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
65 }
66
67 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
68 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
69 for (uint32_t m = 1; m <= 1; m++) {
70 for (uint32_t n = 1; n <= 8; n++) {
71 GemmMicrokernelTester()
72 .mr(1)
73 .nr(8)
74 .kr(1)
75 .sr(1)
76 .m(m)
77 .n(n)
78 .k(4)
79 .iterations(1)
80 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
81 }
82 }
83 }
84
85 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
86 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
87 for (uint32_t m = 1; m <= 1; m++) {
88 GemmMicrokernelTester()
89 .mr(1)
90 .nr(8)
91 .kr(1)
92 .sr(1)
93 .m(m)
94 .n(8)
95 .k(4)
96 .iterations(1)
97 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
98 }
99 }
100
101 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
102 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
103 for (uint32_t n = 1; n <= 8; n++) {
104 GemmMicrokernelTester()
105 .mr(1)
106 .nr(8)
107 .kr(1)
108 .sr(1)
109 .m(1)
110 .n(n)
111 .k(4)
112 .iterations(1)
113 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
114 }
115 }
116
117 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4) {
118 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
119 for (size_t k = 1; k < 4; k++) {
120 GemmMicrokernelTester()
121 .mr(1)
122 .nr(8)
123 .kr(1)
124 .sr(1)
125 .m(1)
126 .n(8)
127 .k(k)
128 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
129 }
130 }
131
132 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
133 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
134 for (size_t k = 1; k < 4; k++) {
135 GemmMicrokernelTester()
136 .mr(1)
137 .nr(8)
138 .kr(1)
139 .sr(1)
140 .m(1)
141 .n(8)
142 .k(k)
143 .a_stride(7)
144 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
145 }
146 }
147
148 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
149 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
150 for (size_t k = 1; k < 4; k++) {
151 for (uint32_t m = 1; m <= 1; m++) {
152 for (uint32_t n = 1; n <= 8; n++) {
153 GemmMicrokernelTester()
154 .mr(1)
155 .nr(8)
156 .kr(1)
157 .sr(1)
158 .m(m)
159 .n(n)
160 .k(k)
161 .iterations(1)
162 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
163 }
164 }
165 }
166 }
167
168 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4) {
169 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
170 for (size_t k = 5; k < 8; k++) {
171 GemmMicrokernelTester()
172 .mr(1)
173 .nr(8)
174 .kr(1)
175 .sr(1)
176 .m(1)
177 .n(8)
178 .k(k)
179 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
180 }
181 }
182
183 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
184 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
185 for (size_t k = 5; k < 8; k++) {
186 GemmMicrokernelTester()
187 .mr(1)
188 .nr(8)
189 .kr(1)
190 .sr(1)
191 .m(1)
192 .n(8)
193 .k(k)
194 .a_stride(11)
195 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
196 }
197 }
198
199 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
200 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
201 for (size_t k = 5; k < 8; k++) {
202 for (uint32_t m = 1; m <= 1; m++) {
203 for (uint32_t n = 1; n <= 8; n++) {
204 GemmMicrokernelTester()
205 .mr(1)
206 .nr(8)
207 .kr(1)
208 .sr(1)
209 .m(m)
210 .n(n)
211 .k(k)
212 .iterations(1)
213 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
214 }
215 }
216 }
217 }
218
219 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4) {
220 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
221 for (size_t k = 8; k <= 40; k += 4) {
222 GemmMicrokernelTester()
223 .mr(1)
224 .nr(8)
225 .kr(1)
226 .sr(1)
227 .m(1)
228 .n(8)
229 .k(k)
230 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
231 }
232 }
233
234 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_strided_a) {
235 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
236 for (size_t k = 8; k <= 40; k += 4) {
237 GemmMicrokernelTester()
238 .mr(1)
239 .nr(8)
240 .kr(1)
241 .sr(1)
242 .m(1)
243 .n(8)
244 .k(k)
245 .a_stride(43)
246 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
247 }
248 }
249
250 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
251 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
252 for (size_t k = 8; k <= 40; k += 4) {
253 for (uint32_t m = 1; m <= 1; m++) {
254 for (uint32_t n = 1; n <= 8; n++) {
255 GemmMicrokernelTester()
256 .mr(1)
257 .nr(8)
258 .kr(1)
259 .sr(1)
260 .m(m)
261 .n(n)
262 .k(k)
263 .iterations(1)
264 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
265 }
266 }
267 }
268 }
269
270 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8) {
271 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
272 for (uint32_t n = 9; n < 16; n++) {
273 for (size_t k = 1; k <= 20; k += 5) {
274 GemmMicrokernelTester()
275 .mr(1)
276 .nr(8)
277 .kr(1)
278 .sr(1)
279 .m(1)
280 .n(8)
281 .k(k)
282 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
283 }
284 }
285 }
286
287 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
288 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
289 for (uint32_t n = 9; n < 16; n++) {
290 for (size_t k = 1; k <= 20; k += 5) {
291 GemmMicrokernelTester()
292 .mr(1)
293 .nr(8)
294 .kr(1)
295 .sr(1)
296 .m(1)
297 .n(8)
298 .k(k)
299 .cn_stride(11)
300 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
301 }
302 }
303 }
304
305 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) {
306 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
307 for (uint32_t n = 9; n < 16; n++) {
308 for (size_t k = 1; k <= 20; k += 5) {
309 GemmMicrokernelTester()
310 .mr(1)
311 .nr(8)
312 .kr(1)
313 .sr(1)
314 .m(1)
315 .n(n)
316 .k(k)
317 .a_stride(23)
318 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
319 }
320 }
321 }
322
323 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
324 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
325 for (uint32_t n = 9; n < 16; n++) {
326 for (size_t k = 1; k <= 20; k += 5) {
327 for (uint32_t m = 1; m <= 1; m++) {
328 GemmMicrokernelTester()
329 .mr(1)
330 .nr(8)
331 .kr(1)
332 .sr(1)
333 .m(m)
334 .n(n)
335 .k(k)
336 .iterations(1)
337 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
338 }
339 }
340 }
341 }
342
343 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8) {
344 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
345 for (uint32_t n = 16; n <= 24; n += 8) {
346 for (size_t k = 1; k <= 20; k += 5) {
347 GemmMicrokernelTester()
348 .mr(1)
349 .nr(8)
350 .kr(1)
351 .sr(1)
352 .m(1)
353 .n(8)
354 .k(k)
355 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
356 }
357 }
358 }
359
360 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
361 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
362 for (uint32_t n = 16; n <= 24; n += 8) {
363 for (size_t k = 1; k <= 20; k += 5) {
364 GemmMicrokernelTester()
365 .mr(1)
366 .nr(8)
367 .kr(1)
368 .sr(1)
369 .m(1)
370 .n(n)
371 .k(k)
372 .cn_stride(11)
373 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
374 }
375 }
376 }
377
378 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_strided_a) {
379 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
380 for (uint32_t n = 16; n <= 24; n += 8) {
381 for (size_t k = 1; k <= 20; k += 5) {
382 GemmMicrokernelTester()
383 .mr(1)
384 .nr(8)
385 .kr(1)
386 .sr(1)
387 .m(1)
388 .n(n)
389 .k(k)
390 .a_stride(23)
391 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
392 }
393 }
394 }
395
396 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
397 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
398 for (uint32_t n = 16; n <= 24; n += 8) {
399 for (size_t k = 1; k <= 20; k += 5) {
400 for (uint32_t m = 1; m <= 1; m++) {
401 GemmMicrokernelTester()
402 .mr(1)
403 .nr(8)
404 .kr(1)
405 .sr(1)
406 .m(m)
407 .n(n)
408 .k(k)
409 .iterations(1)
410 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
411 }
412 }
413 }
414 }
415
416 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
417 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
418 for (size_t k = 1; k <= 20; k += 5) {
419 for (uint32_t m = 1; m <= 1; m++) {
420 for (uint32_t n = 1; n <= 8; n++) {
421 GemmMicrokernelTester()
422 .mr(1)
423 .nr(8)
424 .kr(1)
425 .sr(1)
426 .m(m)
427 .n(n)
428 .k(k)
429 .cm_stride(11)
430 .iterations(1)
431 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
432 }
433 }
434 }
435 }
436
437 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmin) {
438 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
439 GemmMicrokernelTester()
440 .mr(1)
441 .nr(8)
442 .kr(1)
443 .sr(1)
444 .m(1)
445 .n(8)
446 .k(4)
447 .qmin(128)
448 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
449 }
450
451 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, qmax) {
452 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
453 GemmMicrokernelTester()
454 .mr(1)
455 .nr(8)
456 .kr(1)
457 .sr(1)
458 .m(1)
459 .n(8)
460 .k(4)
461 .qmax(128)
462 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
463 }
464
465 TEST(F16_GEMM_MINMAX_1X8__NEONFP16ARITH_LD64, strided_cm) {
466 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
467 GemmMicrokernelTester()
468 .mr(1)
469 .nr(8)
470 .kr(1)
471 .sr(1)
472 .m(1)
473 .n(8)
474 .k(4)
475 .cm_stride(11)
476 .Test(xnn_f16_gemm_minmax_ukernel_1x8__neonfp16arith_ld64);
477 }
478#endif // XNN_ARCH_ARM64
479
480
481#if XNN_ARCH_ARM64
Marat Dukhande06f492020-04-09 00:19:31 -0700482 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700483 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
484 GemmMicrokernelTester()
485 .mr(4)
486 .nr(8)
487 .kr(1)
488 .sr(1)
489 .m(4)
490 .n(8)
491 .k(4)
Marat Dukhande06f492020-04-09 00:19:31 -0700492 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700493 }
494
Marat Dukhande06f492020-04-09 00:19:31 -0700495 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700496 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
497 GemmMicrokernelTester()
498 .mr(4)
499 .nr(8)
500 .kr(1)
501 .sr(1)
502 .m(4)
503 .n(8)
504 .k(4)
505 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -0700506 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700507 }
508
Marat Dukhande06f492020-04-09 00:19:31 -0700509 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700510 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
511 GemmMicrokernelTester()
512 .mr(4)
513 .nr(8)
514 .kr(1)
515 .sr(1)
516 .m(4)
517 .n(8)
518 .k(4)
519 .a_stride(7)
Marat Dukhande06f492020-04-09 00:19:31 -0700520 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700521 }
522
Marat Dukhande06f492020-04-09 00:19:31 -0700523 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700524 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
525 for (uint32_t m = 1; m <= 4; m++) {
526 for (uint32_t n = 1; n <= 8; n++) {
527 GemmMicrokernelTester()
528 .mr(4)
529 .nr(8)
530 .kr(1)
531 .sr(1)
532 .m(m)
533 .n(n)
534 .k(4)
535 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700536 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700537 }
538 }
539 }
540
Marat Dukhande06f492020-04-09 00:19:31 -0700541 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700542 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
543 for (uint32_t m = 1; m <= 4; m++) {
544 GemmMicrokernelTester()
545 .mr(4)
546 .nr(8)
547 .kr(1)
548 .sr(1)
549 .m(m)
550 .n(8)
551 .k(4)
552 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700553 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700554 }
555 }
556
Marat Dukhande06f492020-04-09 00:19:31 -0700557 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700558 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
559 for (uint32_t n = 1; n <= 8; n++) {
560 GemmMicrokernelTester()
561 .mr(4)
562 .nr(8)
563 .kr(1)
564 .sr(1)
565 .m(4)
566 .n(n)
567 .k(4)
568 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700569 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700570 }
571 }
572
Marat Dukhande06f492020-04-09 00:19:31 -0700573 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700574 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
575 for (size_t k = 1; k < 4; k++) {
576 GemmMicrokernelTester()
577 .mr(4)
578 .nr(8)
579 .kr(1)
580 .sr(1)
581 .m(4)
582 .n(8)
583 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -0700584 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700585 }
586 }
587
Marat Dukhande06f492020-04-09 00:19:31 -0700588 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700589 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
590 for (size_t k = 1; k < 4; k++) {
591 GemmMicrokernelTester()
592 .mr(4)
593 .nr(8)
594 .kr(1)
595 .sr(1)
596 .m(4)
597 .n(8)
598 .k(k)
599 .a_stride(7)
Marat Dukhande06f492020-04-09 00:19:31 -0700600 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700601 }
602 }
603
Marat Dukhande06f492020-04-09 00:19:31 -0700604 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700605 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
606 for (size_t k = 1; k < 4; k++) {
607 for (uint32_t m = 1; m <= 4; m++) {
608 for (uint32_t n = 1; n <= 8; n++) {
609 GemmMicrokernelTester()
610 .mr(4)
611 .nr(8)
612 .kr(1)
613 .sr(1)
614 .m(m)
615 .n(n)
616 .k(k)
617 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700618 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700619 }
620 }
621 }
622 }
623
Marat Dukhande06f492020-04-09 00:19:31 -0700624 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700625 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
626 for (size_t k = 5; k < 8; k++) {
627 GemmMicrokernelTester()
628 .mr(4)
629 .nr(8)
630 .kr(1)
631 .sr(1)
632 .m(4)
633 .n(8)
634 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -0700635 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700636 }
637 }
638
Marat Dukhande06f492020-04-09 00:19:31 -0700639 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700640 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
641 for (size_t k = 5; k < 8; k++) {
642 GemmMicrokernelTester()
643 .mr(4)
644 .nr(8)
645 .kr(1)
646 .sr(1)
647 .m(4)
648 .n(8)
649 .k(k)
650 .a_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -0700651 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700652 }
653 }
654
Marat Dukhande06f492020-04-09 00:19:31 -0700655 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700656 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
657 for (size_t k = 5; k < 8; k++) {
658 for (uint32_t m = 1; m <= 4; m++) {
659 for (uint32_t n = 1; n <= 8; n++) {
660 GemmMicrokernelTester()
661 .mr(4)
662 .nr(8)
663 .kr(1)
664 .sr(1)
665 .m(m)
666 .n(n)
667 .k(k)
668 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700669 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700670 }
671 }
672 }
673 }
674
Marat Dukhande06f492020-04-09 00:19:31 -0700675 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700676 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
677 for (size_t k = 8; k <= 40; k += 4) {
678 GemmMicrokernelTester()
679 .mr(4)
680 .nr(8)
681 .kr(1)
682 .sr(1)
683 .m(4)
684 .n(8)
685 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -0700686 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700687 }
688 }
689
Marat Dukhande06f492020-04-09 00:19:31 -0700690 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700691 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
692 for (size_t k = 8; k <= 40; k += 4) {
693 GemmMicrokernelTester()
694 .mr(4)
695 .nr(8)
696 .kr(1)
697 .sr(1)
698 .m(4)
699 .n(8)
700 .k(k)
701 .a_stride(43)
Marat Dukhande06f492020-04-09 00:19:31 -0700702 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700703 }
704 }
705
Marat Dukhande06f492020-04-09 00:19:31 -0700706 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700707 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
708 for (size_t k = 8; k <= 40; k += 4) {
709 for (uint32_t m = 1; m <= 4; m++) {
710 for (uint32_t n = 1; n <= 8; n++) {
711 GemmMicrokernelTester()
712 .mr(4)
713 .nr(8)
714 .kr(1)
715 .sr(1)
716 .m(m)
717 .n(n)
718 .k(k)
719 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700720 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700721 }
722 }
723 }
724 }
725
Marat Dukhande06f492020-04-09 00:19:31 -0700726 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700727 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
728 for (uint32_t n = 9; n < 16; n++) {
729 for (size_t k = 1; k <= 20; k += 5) {
730 GemmMicrokernelTester()
731 .mr(4)
732 .nr(8)
733 .kr(1)
734 .sr(1)
735 .m(4)
736 .n(8)
737 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -0700738 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700739 }
740 }
741 }
742
Marat Dukhande06f492020-04-09 00:19:31 -0700743 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700744 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
745 for (uint32_t n = 9; n < 16; n++) {
746 for (size_t k = 1; k <= 20; k += 5) {
747 GemmMicrokernelTester()
748 .mr(4)
749 .nr(8)
750 .kr(1)
751 .sr(1)
752 .m(4)
753 .n(8)
754 .k(k)
755 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -0700756 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700757 }
758 }
759 }
760
Marat Dukhande06f492020-04-09 00:19:31 -0700761 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700762 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
763 for (uint32_t n = 9; n < 16; n++) {
764 for (size_t k = 1; k <= 20; k += 5) {
765 GemmMicrokernelTester()
766 .mr(4)
767 .nr(8)
768 .kr(1)
769 .sr(1)
770 .m(4)
771 .n(n)
772 .k(k)
773 .a_stride(23)
Marat Dukhande06f492020-04-09 00:19:31 -0700774 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700775 }
776 }
777 }
778
Marat Dukhande06f492020-04-09 00:19:31 -0700779 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700780 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
781 for (uint32_t n = 9; n < 16; n++) {
782 for (size_t k = 1; k <= 20; k += 5) {
783 for (uint32_t m = 1; m <= 4; m++) {
784 GemmMicrokernelTester()
785 .mr(4)
786 .nr(8)
787 .kr(1)
788 .sr(1)
789 .m(m)
790 .n(n)
791 .k(k)
792 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700793 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700794 }
795 }
796 }
797 }
798
Marat Dukhande06f492020-04-09 00:19:31 -0700799 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700800 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
801 for (uint32_t n = 16; n <= 24; n += 8) {
802 for (size_t k = 1; k <= 20; k += 5) {
803 GemmMicrokernelTester()
804 .mr(4)
805 .nr(8)
806 .kr(1)
807 .sr(1)
808 .m(4)
809 .n(8)
810 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -0700811 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700812 }
813 }
814 }
815
Marat Dukhande06f492020-04-09 00:19:31 -0700816 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700817 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
818 for (uint32_t n = 16; n <= 24; n += 8) {
819 for (size_t k = 1; k <= 20; k += 5) {
820 GemmMicrokernelTester()
821 .mr(4)
822 .nr(8)
823 .kr(1)
824 .sr(1)
825 .m(4)
826 .n(n)
827 .k(k)
828 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -0700829 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700830 }
831 }
832 }
833
Marat Dukhande06f492020-04-09 00:19:31 -0700834 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700835 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
836 for (uint32_t n = 16; n <= 24; n += 8) {
837 for (size_t k = 1; k <= 20; k += 5) {
838 GemmMicrokernelTester()
839 .mr(4)
840 .nr(8)
841 .kr(1)
842 .sr(1)
843 .m(4)
844 .n(n)
845 .k(k)
846 .a_stride(23)
Marat Dukhande06f492020-04-09 00:19:31 -0700847 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700848 }
849 }
850 }
851
Marat Dukhande06f492020-04-09 00:19:31 -0700852 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700853 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
854 for (uint32_t n = 16; n <= 24; n += 8) {
855 for (size_t k = 1; k <= 20; k += 5) {
856 for (uint32_t m = 1; m <= 4; m++) {
857 GemmMicrokernelTester()
858 .mr(4)
859 .nr(8)
860 .kr(1)
861 .sr(1)
862 .m(m)
863 .n(n)
864 .k(k)
865 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700866 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700867 }
868 }
869 }
870 }
871
Marat Dukhande06f492020-04-09 00:19:31 -0700872 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700873 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
874 for (size_t k = 1; k <= 20; k += 5) {
875 for (uint32_t m = 1; m <= 4; m++) {
876 for (uint32_t n = 1; n <= 8; n++) {
877 GemmMicrokernelTester()
878 .mr(4)
879 .nr(8)
880 .kr(1)
881 .sr(1)
882 .m(m)
883 .n(n)
884 .k(k)
885 .cm_stride(11)
886 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700887 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700888 }
889 }
890 }
891 }
892
Marat Dukhande06f492020-04-09 00:19:31 -0700893 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700894 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
895 GemmMicrokernelTester()
896 .mr(4)
897 .nr(8)
898 .kr(1)
899 .sr(1)
900 .m(4)
901 .n(8)
902 .k(4)
903 .qmin(128)
Marat Dukhande06f492020-04-09 00:19:31 -0700904 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700905 }
906
Marat Dukhande06f492020-04-09 00:19:31 -0700907 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700908 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
909 GemmMicrokernelTester()
910 .mr(4)
911 .nr(8)
912 .kr(1)
913 .sr(1)
914 .m(4)
915 .n(8)
916 .k(4)
917 .qmax(128)
Marat Dukhande06f492020-04-09 00:19:31 -0700918 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700919 }
920
Marat Dukhande06f492020-04-09 00:19:31 -0700921 TEST(F16_GEMM_MINMAX_4X8__NEONFP16ARITH_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700922 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
923 GemmMicrokernelTester()
924 .mr(4)
925 .nr(8)
926 .kr(1)
927 .sr(1)
928 .m(4)
929 .n(8)
930 .k(4)
931 .cm_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -0700932 .Test(xnn_f16_gemm_minmax_ukernel_4x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700933 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -0700934#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -0700935
936
Marat Dukhan1dadbf72019-10-01 10:46:20 -0700937#if XNN_ARCH_ARM64
Marat Dukhande06f492020-04-09 00:19:31 -0700938 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700939 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
940 GemmMicrokernelTester()
941 .mr(6)
942 .nr(8)
943 .kr(1)
944 .sr(1)
945 .m(6)
946 .n(8)
947 .k(4)
Marat Dukhande06f492020-04-09 00:19:31 -0700948 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700949 }
950
Marat Dukhande06f492020-04-09 00:19:31 -0700951 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700952 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
953 GemmMicrokernelTester()
954 .mr(6)
955 .nr(8)
956 .kr(1)
957 .sr(1)
958 .m(6)
959 .n(8)
960 .k(4)
961 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -0700962 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700963 }
964
Marat Dukhande06f492020-04-09 00:19:31 -0700965 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700966 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
967 GemmMicrokernelTester()
968 .mr(6)
969 .nr(8)
970 .kr(1)
971 .sr(1)
972 .m(6)
973 .n(8)
974 .k(4)
975 .a_stride(7)
Marat Dukhande06f492020-04-09 00:19:31 -0700976 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700977 }
978
Marat Dukhande06f492020-04-09 00:19:31 -0700979 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700980 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
981 for (uint32_t m = 1; m <= 6; m++) {
982 for (uint32_t n = 1; n <= 8; n++) {
983 GemmMicrokernelTester()
984 .mr(6)
985 .nr(8)
986 .kr(1)
987 .sr(1)
988 .m(m)
989 .n(n)
990 .k(4)
991 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -0700992 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -0700993 }
994 }
995 }
996
Marat Dukhande06f492020-04-09 00:19:31 -0700997 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -0700998 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
999 for (uint32_t m = 1; m <= 6; m++) {
1000 GemmMicrokernelTester()
1001 .mr(6)
1002 .nr(8)
1003 .kr(1)
1004 .sr(1)
1005 .m(m)
1006 .n(8)
1007 .k(4)
1008 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001009 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001010 }
1011 }
1012
Marat Dukhande06f492020-04-09 00:19:31 -07001013 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001014 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1015 for (uint32_t n = 1; n <= 8; n++) {
1016 GemmMicrokernelTester()
1017 .mr(6)
1018 .nr(8)
1019 .kr(1)
1020 .sr(1)
1021 .m(6)
1022 .n(n)
1023 .k(4)
1024 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001025 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001026 }
1027 }
1028
Marat Dukhande06f492020-04-09 00:19:31 -07001029 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001030 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1031 for (size_t k = 1; k < 4; k++) {
1032 GemmMicrokernelTester()
1033 .mr(6)
1034 .nr(8)
1035 .kr(1)
1036 .sr(1)
1037 .m(6)
1038 .n(8)
1039 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001040 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001041 }
1042 }
1043
Marat Dukhande06f492020-04-09 00:19:31 -07001044 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001045 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1046 for (size_t k = 1; k < 4; k++) {
1047 GemmMicrokernelTester()
1048 .mr(6)
1049 .nr(8)
1050 .kr(1)
1051 .sr(1)
1052 .m(6)
1053 .n(8)
1054 .k(k)
1055 .a_stride(7)
Marat Dukhande06f492020-04-09 00:19:31 -07001056 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001057 }
1058 }
1059
Marat Dukhande06f492020-04-09 00:19:31 -07001060 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001061 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1062 for (size_t k = 1; k < 4; k++) {
1063 for (uint32_t m = 1; m <= 6; m++) {
1064 for (uint32_t n = 1; n <= 8; n++) {
1065 GemmMicrokernelTester()
1066 .mr(6)
1067 .nr(8)
1068 .kr(1)
1069 .sr(1)
1070 .m(m)
1071 .n(n)
1072 .k(k)
1073 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001074 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001075 }
1076 }
1077 }
1078 }
1079
Marat Dukhande06f492020-04-09 00:19:31 -07001080 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001081 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1082 for (size_t k = 5; k < 8; k++) {
1083 GemmMicrokernelTester()
1084 .mr(6)
1085 .nr(8)
1086 .kr(1)
1087 .sr(1)
1088 .m(6)
1089 .n(8)
1090 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001091 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001092 }
1093 }
1094
Marat Dukhande06f492020-04-09 00:19:31 -07001095 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001096 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1097 for (size_t k = 5; k < 8; k++) {
1098 GemmMicrokernelTester()
1099 .mr(6)
1100 .nr(8)
1101 .kr(1)
1102 .sr(1)
1103 .m(6)
1104 .n(8)
1105 .k(k)
1106 .a_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001107 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001108 }
1109 }
1110
Marat Dukhande06f492020-04-09 00:19:31 -07001111 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001112 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1113 for (size_t k = 5; k < 8; k++) {
1114 for (uint32_t m = 1; m <= 6; m++) {
1115 for (uint32_t n = 1; n <= 8; n++) {
1116 GemmMicrokernelTester()
1117 .mr(6)
1118 .nr(8)
1119 .kr(1)
1120 .sr(1)
1121 .m(m)
1122 .n(n)
1123 .k(k)
1124 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001125 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001126 }
1127 }
1128 }
1129 }
1130
Marat Dukhande06f492020-04-09 00:19:31 -07001131 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001132 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1133 for (size_t k = 8; k <= 40; k += 4) {
1134 GemmMicrokernelTester()
1135 .mr(6)
1136 .nr(8)
1137 .kr(1)
1138 .sr(1)
1139 .m(6)
1140 .n(8)
1141 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001142 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001143 }
1144 }
1145
Marat Dukhande06f492020-04-09 00:19:31 -07001146 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001147 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1148 for (size_t k = 8; k <= 40; k += 4) {
1149 GemmMicrokernelTester()
1150 .mr(6)
1151 .nr(8)
1152 .kr(1)
1153 .sr(1)
1154 .m(6)
1155 .n(8)
1156 .k(k)
1157 .a_stride(43)
Marat Dukhande06f492020-04-09 00:19:31 -07001158 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001159 }
1160 }
1161
Marat Dukhande06f492020-04-09 00:19:31 -07001162 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001163 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1164 for (size_t k = 8; k <= 40; k += 4) {
1165 for (uint32_t m = 1; m <= 6; m++) {
1166 for (uint32_t n = 1; n <= 8; n++) {
1167 GemmMicrokernelTester()
1168 .mr(6)
1169 .nr(8)
1170 .kr(1)
1171 .sr(1)
1172 .m(m)
1173 .n(n)
1174 .k(k)
1175 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001176 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001177 }
1178 }
1179 }
1180 }
1181
Marat Dukhande06f492020-04-09 00:19:31 -07001182 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001183 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1184 for (uint32_t n = 9; n < 16; n++) {
1185 for (size_t k = 1; k <= 20; k += 5) {
1186 GemmMicrokernelTester()
1187 .mr(6)
1188 .nr(8)
1189 .kr(1)
1190 .sr(1)
1191 .m(6)
1192 .n(8)
1193 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001194 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001195 }
1196 }
1197 }
1198
Marat Dukhande06f492020-04-09 00:19:31 -07001199 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001200 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1201 for (uint32_t n = 9; n < 16; n++) {
1202 for (size_t k = 1; k <= 20; k += 5) {
1203 GemmMicrokernelTester()
1204 .mr(6)
1205 .nr(8)
1206 .kr(1)
1207 .sr(1)
1208 .m(6)
1209 .n(8)
1210 .k(k)
1211 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001212 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001213 }
1214 }
1215 }
1216
Marat Dukhande06f492020-04-09 00:19:31 -07001217 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001218 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1219 for (uint32_t n = 9; n < 16; n++) {
1220 for (size_t k = 1; k <= 20; k += 5) {
1221 GemmMicrokernelTester()
1222 .mr(6)
1223 .nr(8)
1224 .kr(1)
1225 .sr(1)
1226 .m(6)
1227 .n(n)
1228 .k(k)
1229 .a_stride(23)
Marat Dukhande06f492020-04-09 00:19:31 -07001230 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001231 }
1232 }
1233 }
1234
Marat Dukhande06f492020-04-09 00:19:31 -07001235 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001236 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1237 for (uint32_t n = 9; n < 16; n++) {
1238 for (size_t k = 1; k <= 20; k += 5) {
1239 for (uint32_t m = 1; m <= 6; m++) {
1240 GemmMicrokernelTester()
1241 .mr(6)
1242 .nr(8)
1243 .kr(1)
1244 .sr(1)
1245 .m(m)
1246 .n(n)
1247 .k(k)
1248 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001249 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001250 }
1251 }
1252 }
1253 }
1254
Marat Dukhande06f492020-04-09 00:19:31 -07001255 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001256 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1257 for (uint32_t n = 16; n <= 24; n += 8) {
1258 for (size_t k = 1; k <= 20; k += 5) {
1259 GemmMicrokernelTester()
1260 .mr(6)
1261 .nr(8)
1262 .kr(1)
1263 .sr(1)
1264 .m(6)
1265 .n(8)
1266 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001267 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001268 }
1269 }
1270 }
1271
Marat Dukhande06f492020-04-09 00:19:31 -07001272 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001273 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1274 for (uint32_t n = 16; n <= 24; n += 8) {
1275 for (size_t k = 1; k <= 20; k += 5) {
1276 GemmMicrokernelTester()
1277 .mr(6)
1278 .nr(8)
1279 .kr(1)
1280 .sr(1)
1281 .m(6)
1282 .n(n)
1283 .k(k)
1284 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001285 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001286 }
1287 }
1288 }
1289
Marat Dukhande06f492020-04-09 00:19:31 -07001290 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001291 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1292 for (uint32_t n = 16; n <= 24; n += 8) {
1293 for (size_t k = 1; k <= 20; k += 5) {
1294 GemmMicrokernelTester()
1295 .mr(6)
1296 .nr(8)
1297 .kr(1)
1298 .sr(1)
1299 .m(6)
1300 .n(n)
1301 .k(k)
1302 .a_stride(23)
Marat Dukhande06f492020-04-09 00:19:31 -07001303 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001304 }
1305 }
1306 }
1307
Marat Dukhande06f492020-04-09 00:19:31 -07001308 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001309 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1310 for (uint32_t n = 16; n <= 24; n += 8) {
1311 for (size_t k = 1; k <= 20; k += 5) {
1312 for (uint32_t m = 1; m <= 6; m++) {
1313 GemmMicrokernelTester()
1314 .mr(6)
1315 .nr(8)
1316 .kr(1)
1317 .sr(1)
1318 .m(m)
1319 .n(n)
1320 .k(k)
1321 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001322 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001323 }
1324 }
1325 }
1326 }
1327
Marat Dukhande06f492020-04-09 00:19:31 -07001328 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001329 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1330 for (size_t k = 1; k <= 20; k += 5) {
1331 for (uint32_t m = 1; m <= 6; m++) {
1332 for (uint32_t n = 1; n <= 8; n++) {
1333 GemmMicrokernelTester()
1334 .mr(6)
1335 .nr(8)
1336 .kr(1)
1337 .sr(1)
1338 .m(m)
1339 .n(n)
1340 .k(k)
1341 .cm_stride(11)
1342 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001343 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001344 }
1345 }
1346 }
1347 }
1348
Marat Dukhande06f492020-04-09 00:19:31 -07001349 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001350 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1351 GemmMicrokernelTester()
1352 .mr(6)
1353 .nr(8)
1354 .kr(1)
1355 .sr(1)
1356 .m(6)
1357 .n(8)
1358 .k(4)
1359 .qmin(128)
Marat Dukhande06f492020-04-09 00:19:31 -07001360 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001361 }
1362
Marat Dukhande06f492020-04-09 00:19:31 -07001363 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001364 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1365 GemmMicrokernelTester()
1366 .mr(6)
1367 .nr(8)
1368 .kr(1)
1369 .sr(1)
1370 .m(6)
1371 .n(8)
1372 .k(4)
1373 .qmax(128)
Marat Dukhande06f492020-04-09 00:19:31 -07001374 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001375 }
1376
Marat Dukhande06f492020-04-09 00:19:31 -07001377 TEST(F16_GEMM_MINMAX_6X8__NEONFP16ARITH_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001378 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1379 GemmMicrokernelTester()
1380 .mr(6)
1381 .nr(8)
1382 .kr(1)
1383 .sr(1)
1384 .m(6)
1385 .n(8)
1386 .k(4)
1387 .cm_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001388 .Test(xnn_f16_gemm_minmax_ukernel_6x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001389 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001390#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07001391
1392
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001393#if XNN_ARCH_ARM64
Marat Dukhande06f492020-04-09 00:19:31 -07001394 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001395 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1396 GemmMicrokernelTester()
1397 .mr(8)
1398 .nr(8)
1399 .kr(1)
1400 .sr(1)
1401 .m(8)
1402 .n(8)
1403 .k(4)
Marat Dukhande06f492020-04-09 00:19:31 -07001404 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001405 }
1406
Marat Dukhande06f492020-04-09 00:19:31 -07001407 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001408 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1409 GemmMicrokernelTester()
1410 .mr(8)
1411 .nr(8)
1412 .kr(1)
1413 .sr(1)
1414 .m(8)
1415 .n(8)
1416 .k(4)
1417 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001418 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001419 }
1420
Marat Dukhande06f492020-04-09 00:19:31 -07001421 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001422 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1423 GemmMicrokernelTester()
1424 .mr(8)
1425 .nr(8)
1426 .kr(1)
1427 .sr(1)
1428 .m(8)
1429 .n(8)
1430 .k(4)
1431 .a_stride(7)
Marat Dukhande06f492020-04-09 00:19:31 -07001432 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001433 }
1434
Marat Dukhande06f492020-04-09 00:19:31 -07001435 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001436 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1437 for (uint32_t m = 1; m <= 8; m++) {
1438 for (uint32_t n = 1; n <= 8; n++) {
1439 GemmMicrokernelTester()
1440 .mr(8)
1441 .nr(8)
1442 .kr(1)
1443 .sr(1)
1444 .m(m)
1445 .n(n)
1446 .k(4)
1447 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001448 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001449 }
1450 }
1451 }
1452
Marat Dukhande06f492020-04-09 00:19:31 -07001453 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001454 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1455 for (uint32_t m = 1; m <= 8; m++) {
1456 GemmMicrokernelTester()
1457 .mr(8)
1458 .nr(8)
1459 .kr(1)
1460 .sr(1)
1461 .m(m)
1462 .n(8)
1463 .k(4)
1464 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001465 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001466 }
1467 }
1468
Marat Dukhande06f492020-04-09 00:19:31 -07001469 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001470 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1471 for (uint32_t n = 1; n <= 8; n++) {
1472 GemmMicrokernelTester()
1473 .mr(8)
1474 .nr(8)
1475 .kr(1)
1476 .sr(1)
1477 .m(8)
1478 .n(n)
1479 .k(4)
1480 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001481 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001482 }
1483 }
1484
Marat Dukhande06f492020-04-09 00:19:31 -07001485 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001486 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1487 for (size_t k = 1; k < 4; k++) {
1488 GemmMicrokernelTester()
1489 .mr(8)
1490 .nr(8)
1491 .kr(1)
1492 .sr(1)
1493 .m(8)
1494 .n(8)
1495 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001496 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001497 }
1498 }
1499
Marat Dukhande06f492020-04-09 00:19:31 -07001500 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001501 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1502 for (size_t k = 1; k < 4; k++) {
1503 GemmMicrokernelTester()
1504 .mr(8)
1505 .nr(8)
1506 .kr(1)
1507 .sr(1)
1508 .m(8)
1509 .n(8)
1510 .k(k)
1511 .a_stride(7)
Marat Dukhande06f492020-04-09 00:19:31 -07001512 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001513 }
1514 }
1515
Marat Dukhande06f492020-04-09 00:19:31 -07001516 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001517 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1518 for (size_t k = 1; k < 4; k++) {
1519 for (uint32_t m = 1; m <= 8; m++) {
1520 for (uint32_t n = 1; n <= 8; n++) {
1521 GemmMicrokernelTester()
1522 .mr(8)
1523 .nr(8)
1524 .kr(1)
1525 .sr(1)
1526 .m(m)
1527 .n(n)
1528 .k(k)
1529 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001530 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001531 }
1532 }
1533 }
1534 }
1535
Marat Dukhande06f492020-04-09 00:19:31 -07001536 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001537 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1538 for (size_t k = 5; k < 8; k++) {
1539 GemmMicrokernelTester()
1540 .mr(8)
1541 .nr(8)
1542 .kr(1)
1543 .sr(1)
1544 .m(8)
1545 .n(8)
1546 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001547 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001548 }
1549 }
1550
Marat Dukhande06f492020-04-09 00:19:31 -07001551 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001552 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1553 for (size_t k = 5; k < 8; k++) {
1554 GemmMicrokernelTester()
1555 .mr(8)
1556 .nr(8)
1557 .kr(1)
1558 .sr(1)
1559 .m(8)
1560 .n(8)
1561 .k(k)
1562 .a_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001563 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001564 }
1565 }
1566
Marat Dukhande06f492020-04-09 00:19:31 -07001567 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001568 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1569 for (size_t k = 5; k < 8; k++) {
1570 for (uint32_t m = 1; m <= 8; m++) {
1571 for (uint32_t n = 1; n <= 8; n++) {
1572 GemmMicrokernelTester()
1573 .mr(8)
1574 .nr(8)
1575 .kr(1)
1576 .sr(1)
1577 .m(m)
1578 .n(n)
1579 .k(k)
1580 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001581 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001582 }
1583 }
1584 }
1585 }
1586
Marat Dukhande06f492020-04-09 00:19:31 -07001587 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001588 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1589 for (size_t k = 8; k <= 40; k += 4) {
1590 GemmMicrokernelTester()
1591 .mr(8)
1592 .nr(8)
1593 .kr(1)
1594 .sr(1)
1595 .m(8)
1596 .n(8)
1597 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001598 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001599 }
1600 }
1601
Marat Dukhande06f492020-04-09 00:19:31 -07001602 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001603 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1604 for (size_t k = 8; k <= 40; k += 4) {
1605 GemmMicrokernelTester()
1606 .mr(8)
1607 .nr(8)
1608 .kr(1)
1609 .sr(1)
1610 .m(8)
1611 .n(8)
1612 .k(k)
1613 .a_stride(43)
Marat Dukhande06f492020-04-09 00:19:31 -07001614 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001615 }
1616 }
1617
Marat Dukhande06f492020-04-09 00:19:31 -07001618 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001619 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1620 for (size_t k = 8; k <= 40; k += 4) {
1621 for (uint32_t m = 1; m <= 8; m++) {
1622 for (uint32_t n = 1; n <= 8; n++) {
1623 GemmMicrokernelTester()
1624 .mr(8)
1625 .nr(8)
1626 .kr(1)
1627 .sr(1)
1628 .m(m)
1629 .n(n)
1630 .k(k)
1631 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001632 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001633 }
1634 }
1635 }
1636 }
1637
Marat Dukhande06f492020-04-09 00:19:31 -07001638 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001639 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1640 for (uint32_t n = 9; n < 16; n++) {
1641 for (size_t k = 1; k <= 20; k += 5) {
1642 GemmMicrokernelTester()
1643 .mr(8)
1644 .nr(8)
1645 .kr(1)
1646 .sr(1)
1647 .m(8)
1648 .n(8)
1649 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001650 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001651 }
1652 }
1653 }
1654
Marat Dukhande06f492020-04-09 00:19:31 -07001655 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001656 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1657 for (uint32_t n = 9; n < 16; n++) {
1658 for (size_t k = 1; k <= 20; k += 5) {
1659 GemmMicrokernelTester()
1660 .mr(8)
1661 .nr(8)
1662 .kr(1)
1663 .sr(1)
1664 .m(8)
1665 .n(8)
1666 .k(k)
1667 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001668 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001669 }
1670 }
1671 }
1672
Marat Dukhande06f492020-04-09 00:19:31 -07001673 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001674 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1675 for (uint32_t n = 9; n < 16; n++) {
1676 for (size_t k = 1; k <= 20; k += 5) {
1677 GemmMicrokernelTester()
1678 .mr(8)
1679 .nr(8)
1680 .kr(1)
1681 .sr(1)
1682 .m(8)
1683 .n(n)
1684 .k(k)
1685 .a_stride(23)
Marat Dukhande06f492020-04-09 00:19:31 -07001686 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001687 }
1688 }
1689 }
1690
Marat Dukhande06f492020-04-09 00:19:31 -07001691 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001692 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1693 for (uint32_t n = 9; n < 16; n++) {
1694 for (size_t k = 1; k <= 20; k += 5) {
1695 for (uint32_t m = 1; m <= 8; m++) {
1696 GemmMicrokernelTester()
1697 .mr(8)
1698 .nr(8)
1699 .kr(1)
1700 .sr(1)
1701 .m(m)
1702 .n(n)
1703 .k(k)
1704 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001705 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001706 }
1707 }
1708 }
1709 }
1710
Marat Dukhande06f492020-04-09 00:19:31 -07001711 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001712 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1713 for (uint32_t n = 16; n <= 24; n += 8) {
1714 for (size_t k = 1; k <= 20; k += 5) {
1715 GemmMicrokernelTester()
1716 .mr(8)
1717 .nr(8)
1718 .kr(1)
1719 .sr(1)
1720 .m(8)
1721 .n(8)
1722 .k(k)
Marat Dukhande06f492020-04-09 00:19:31 -07001723 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001724 }
1725 }
1726 }
1727
Marat Dukhande06f492020-04-09 00:19:31 -07001728 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001729 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1730 for (uint32_t n = 16; n <= 24; n += 8) {
1731 for (size_t k = 1; k <= 20; k += 5) {
1732 GemmMicrokernelTester()
1733 .mr(8)
1734 .nr(8)
1735 .kr(1)
1736 .sr(1)
1737 .m(8)
1738 .n(n)
1739 .k(k)
1740 .cn_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001741 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001742 }
1743 }
1744 }
1745
Marat Dukhande06f492020-04-09 00:19:31 -07001746 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001747 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1748 for (uint32_t n = 16; n <= 24; n += 8) {
1749 for (size_t k = 1; k <= 20; k += 5) {
1750 GemmMicrokernelTester()
1751 .mr(8)
1752 .nr(8)
1753 .kr(1)
1754 .sr(1)
1755 .m(8)
1756 .n(n)
1757 .k(k)
1758 .a_stride(23)
Marat Dukhande06f492020-04-09 00:19:31 -07001759 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001760 }
1761 }
1762 }
1763
Marat Dukhande06f492020-04-09 00:19:31 -07001764 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001765 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1766 for (uint32_t n = 16; n <= 24; n += 8) {
1767 for (size_t k = 1; k <= 20; k += 5) {
1768 for (uint32_t m = 1; m <= 8; m++) {
1769 GemmMicrokernelTester()
1770 .mr(8)
1771 .nr(8)
1772 .kr(1)
1773 .sr(1)
1774 .m(m)
1775 .n(n)
1776 .k(k)
1777 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001778 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001779 }
1780 }
1781 }
1782 }
1783
Marat Dukhande06f492020-04-09 00:19:31 -07001784 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001785 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1786 for (size_t k = 1; k <= 20; k += 5) {
1787 for (uint32_t m = 1; m <= 8; m++) {
1788 for (uint32_t n = 1; n <= 8; n++) {
1789 GemmMicrokernelTester()
1790 .mr(8)
1791 .nr(8)
1792 .kr(1)
1793 .sr(1)
1794 .m(m)
1795 .n(n)
1796 .k(k)
1797 .cm_stride(11)
1798 .iterations(1)
Marat Dukhande06f492020-04-09 00:19:31 -07001799 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001800 }
1801 }
1802 }
1803 }
1804
Marat Dukhande06f492020-04-09 00:19:31 -07001805 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001806 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1807 GemmMicrokernelTester()
1808 .mr(8)
1809 .nr(8)
1810 .kr(1)
1811 .sr(1)
1812 .m(8)
1813 .n(8)
1814 .k(4)
1815 .qmin(128)
Marat Dukhande06f492020-04-09 00:19:31 -07001816 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001817 }
1818
Marat Dukhande06f492020-04-09 00:19:31 -07001819 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001820 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1821 GemmMicrokernelTester()
1822 .mr(8)
1823 .nr(8)
1824 .kr(1)
1825 .sr(1)
1826 .m(8)
1827 .n(8)
1828 .k(4)
1829 .qmax(128)
Marat Dukhande06f492020-04-09 00:19:31 -07001830 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001831 }
1832
Marat Dukhande06f492020-04-09 00:19:31 -07001833 TEST(F16_GEMM_MINMAX_8X8__NEONFP16ARITH_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07001834 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1835 GemmMicrokernelTester()
1836 .mr(8)
1837 .nr(8)
1838 .kr(1)
1839 .sr(1)
1840 .m(8)
1841 .n(8)
1842 .k(4)
1843 .cm_stride(11)
Marat Dukhande06f492020-04-09 00:19:31 -07001844 .Test(xnn_f16_gemm_minmax_ukernel_8x8__neonfp16arith_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07001845 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07001846#endif // XNN_ARCH_ARM64
Frank Barchard683f5592020-04-10 00:48:26 -07001847
1848
1849#if XNN_ARCH_ARM64
Frank Barchard3f9f99f2020-05-06 01:12:04 -07001850 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4) {
1851 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1852 GemmMicrokernelTester()
1853 .mr(1)
1854 .nr(16)
1855 .kr(1)
1856 .sr(1)
1857 .m(1)
1858 .n(16)
1859 .k(4)
1860 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1861 }
1862
1863 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cn) {
1864 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1865 GemmMicrokernelTester()
1866 .mr(1)
1867 .nr(16)
1868 .kr(1)
1869 .sr(1)
1870 .m(1)
1871 .n(16)
1872 .k(4)
1873 .cn_stride(19)
1874 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1875 }
1876
1877 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
1878 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1879 GemmMicrokernelTester()
1880 .mr(1)
1881 .nr(16)
1882 .kr(1)
1883 .sr(1)
1884 .m(1)
1885 .n(16)
1886 .k(4)
1887 .a_stride(7)
1888 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1889 }
1890
1891 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
1892 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1893 for (uint32_t m = 1; m <= 1; m++) {
1894 for (uint32_t n = 1; n <= 16; n++) {
1895 GemmMicrokernelTester()
1896 .mr(1)
1897 .nr(16)
1898 .kr(1)
1899 .sr(1)
1900 .m(m)
1901 .n(n)
1902 .k(4)
1903 .iterations(1)
1904 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1905 }
1906 }
1907 }
1908
1909 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
1910 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1911 for (uint32_t m = 1; m <= 1; m++) {
1912 GemmMicrokernelTester()
1913 .mr(1)
1914 .nr(16)
1915 .kr(1)
1916 .sr(1)
1917 .m(m)
1918 .n(16)
1919 .k(4)
1920 .iterations(1)
1921 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1922 }
1923 }
1924
1925 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
1926 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1927 for (uint32_t n = 1; n <= 16; n++) {
1928 GemmMicrokernelTester()
1929 .mr(1)
1930 .nr(16)
1931 .kr(1)
1932 .sr(1)
1933 .m(1)
1934 .n(n)
1935 .k(4)
1936 .iterations(1)
1937 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1938 }
1939 }
1940
1941 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4) {
1942 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1943 for (size_t k = 1; k < 4; k++) {
1944 GemmMicrokernelTester()
1945 .mr(1)
1946 .nr(16)
1947 .kr(1)
1948 .sr(1)
1949 .m(1)
1950 .n(16)
1951 .k(k)
1952 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1953 }
1954 }
1955
1956 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
1957 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1958 for (size_t k = 1; k < 4; k++) {
1959 GemmMicrokernelTester()
1960 .mr(1)
1961 .nr(16)
1962 .kr(1)
1963 .sr(1)
1964 .m(1)
1965 .n(16)
1966 .k(k)
1967 .a_stride(7)
1968 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1969 }
1970 }
1971
1972 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
1973 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1974 for (size_t k = 1; k < 4; k++) {
1975 for (uint32_t m = 1; m <= 1; m++) {
1976 for (uint32_t n = 1; n <= 16; n++) {
1977 GemmMicrokernelTester()
1978 .mr(1)
1979 .nr(16)
1980 .kr(1)
1981 .sr(1)
1982 .m(m)
1983 .n(n)
1984 .k(k)
1985 .iterations(1)
1986 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
1987 }
1988 }
1989 }
1990 }
1991
1992 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4) {
1993 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
1994 for (size_t k = 5; k < 8; k++) {
1995 GemmMicrokernelTester()
1996 .mr(1)
1997 .nr(16)
1998 .kr(1)
1999 .sr(1)
2000 .m(1)
2001 .n(16)
2002 .k(k)
2003 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2004 }
2005 }
2006
2007 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
2008 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2009 for (size_t k = 5; k < 8; k++) {
2010 GemmMicrokernelTester()
2011 .mr(1)
2012 .nr(16)
2013 .kr(1)
2014 .sr(1)
2015 .m(1)
2016 .n(16)
2017 .k(k)
2018 .a_stride(11)
2019 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2020 }
2021 }
2022
2023 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
2024 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2025 for (size_t k = 5; k < 8; k++) {
2026 for (uint32_t m = 1; m <= 1; m++) {
2027 for (uint32_t n = 1; n <= 16; n++) {
2028 GemmMicrokernelTester()
2029 .mr(1)
2030 .nr(16)
2031 .kr(1)
2032 .sr(1)
2033 .m(m)
2034 .n(n)
2035 .k(k)
2036 .iterations(1)
2037 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2038 }
2039 }
2040 }
2041 }
2042
2043 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4) {
2044 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2045 for (size_t k = 8; k <= 40; k += 4) {
2046 GemmMicrokernelTester()
2047 .mr(1)
2048 .nr(16)
2049 .kr(1)
2050 .sr(1)
2051 .m(1)
2052 .n(16)
2053 .k(k)
2054 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2055 }
2056 }
2057
2058 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
2059 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2060 for (size_t k = 8; k <= 40; k += 4) {
2061 GemmMicrokernelTester()
2062 .mr(1)
2063 .nr(16)
2064 .kr(1)
2065 .sr(1)
2066 .m(1)
2067 .n(16)
2068 .k(k)
2069 .a_stride(43)
2070 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2071 }
2072 }
2073
2074 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
2075 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2076 for (size_t k = 8; k <= 40; k += 4) {
2077 for (uint32_t m = 1; m <= 1; m++) {
2078 for (uint32_t n = 1; n <= 16; n++) {
2079 GemmMicrokernelTester()
2080 .mr(1)
2081 .nr(16)
2082 .kr(1)
2083 .sr(1)
2084 .m(m)
2085 .n(n)
2086 .k(k)
2087 .iterations(1)
2088 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2089 }
2090 }
2091 }
2092 }
2093
2094 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16) {
2095 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2096 for (uint32_t n = 17; n < 32; n++) {
2097 for (size_t k = 1; k <= 20; k += 5) {
2098 GemmMicrokernelTester()
2099 .mr(1)
2100 .nr(16)
2101 .kr(1)
2102 .sr(1)
2103 .m(1)
2104 .n(16)
2105 .k(k)
2106 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2107 }
2108 }
2109 }
2110
2111 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
2112 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2113 for (uint32_t n = 17; n < 32; n++) {
2114 for (size_t k = 1; k <= 20; k += 5) {
2115 GemmMicrokernelTester()
2116 .mr(1)
2117 .nr(16)
2118 .kr(1)
2119 .sr(1)
2120 .m(1)
2121 .n(16)
2122 .k(k)
2123 .cn_stride(19)
2124 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2125 }
2126 }
2127 }
2128
2129 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) {
2130 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2131 for (uint32_t n = 17; n < 32; n++) {
2132 for (size_t k = 1; k <= 20; k += 5) {
2133 GemmMicrokernelTester()
2134 .mr(1)
2135 .nr(16)
2136 .kr(1)
2137 .sr(1)
2138 .m(1)
2139 .n(n)
2140 .k(k)
2141 .a_stride(23)
2142 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2143 }
2144 }
2145 }
2146
2147 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
2148 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2149 for (uint32_t n = 17; n < 32; n++) {
2150 for (size_t k = 1; k <= 20; k += 5) {
2151 for (uint32_t m = 1; m <= 1; m++) {
2152 GemmMicrokernelTester()
2153 .mr(1)
2154 .nr(16)
2155 .kr(1)
2156 .sr(1)
2157 .m(m)
2158 .n(n)
2159 .k(k)
2160 .iterations(1)
2161 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2162 }
2163 }
2164 }
2165 }
2166
2167 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16) {
2168 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2169 for (uint32_t n = 32; n <= 48; n += 16) {
2170 for (size_t k = 1; k <= 20; k += 5) {
2171 GemmMicrokernelTester()
2172 .mr(1)
2173 .nr(16)
2174 .kr(1)
2175 .sr(1)
2176 .m(1)
2177 .n(16)
2178 .k(k)
2179 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2180 }
2181 }
2182 }
2183
2184 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
2185 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2186 for (uint32_t n = 32; n <= 48; n += 16) {
2187 for (size_t k = 1; k <= 20; k += 5) {
2188 GemmMicrokernelTester()
2189 .mr(1)
2190 .nr(16)
2191 .kr(1)
2192 .sr(1)
2193 .m(1)
2194 .n(n)
2195 .k(k)
2196 .cn_stride(19)
2197 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2198 }
2199 }
2200 }
2201
2202 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_strided_a) {
2203 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2204 for (uint32_t n = 32; n <= 48; n += 16) {
2205 for (size_t k = 1; k <= 20; k += 5) {
2206 GemmMicrokernelTester()
2207 .mr(1)
2208 .nr(16)
2209 .kr(1)
2210 .sr(1)
2211 .m(1)
2212 .n(n)
2213 .k(k)
2214 .a_stride(23)
2215 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2216 }
2217 }
2218 }
2219
2220 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
2221 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2222 for (uint32_t n = 32; n <= 48; n += 16) {
2223 for (size_t k = 1; k <= 20; k += 5) {
2224 for (uint32_t m = 1; m <= 1; m++) {
2225 GemmMicrokernelTester()
2226 .mr(1)
2227 .nr(16)
2228 .kr(1)
2229 .sr(1)
2230 .m(m)
2231 .n(n)
2232 .k(k)
2233 .iterations(1)
2234 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2235 }
2236 }
2237 }
2238 }
2239
2240 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
2241 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2242 for (size_t k = 1; k <= 20; k += 5) {
2243 for (uint32_t m = 1; m <= 1; m++) {
2244 for (uint32_t n = 1; n <= 16; n++) {
2245 GemmMicrokernelTester()
2246 .mr(1)
2247 .nr(16)
2248 .kr(1)
2249 .sr(1)
2250 .m(m)
2251 .n(n)
2252 .k(k)
2253 .cm_stride(19)
2254 .iterations(1)
2255 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2256 }
2257 }
2258 }
2259 }
2260
2261 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmin) {
2262 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2263 GemmMicrokernelTester()
2264 .mr(1)
2265 .nr(16)
2266 .kr(1)
2267 .sr(1)
2268 .m(1)
2269 .n(16)
2270 .k(4)
2271 .qmin(128)
2272 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2273 }
2274
2275 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, qmax) {
2276 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2277 GemmMicrokernelTester()
2278 .mr(1)
2279 .nr(16)
2280 .kr(1)
2281 .sr(1)
2282 .m(1)
2283 .n(16)
2284 .k(4)
2285 .qmax(128)
2286 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2287 }
2288
2289 TEST(F16_GEMM_MINMAX_1X16__NEONFP16ARITH_LD64, strided_cm) {
2290 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2291 GemmMicrokernelTester()
2292 .mr(1)
2293 .nr(16)
2294 .kr(1)
2295 .sr(1)
2296 .m(1)
2297 .n(16)
2298 .k(4)
2299 .cm_stride(19)
2300 .Test(xnn_f16_gemm_minmax_ukernel_1x16__neonfp16arith_ld64);
2301 }
2302#endif // XNN_ARCH_ARM64
2303
2304
2305#if XNN_ARCH_ARM64
2306 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4) {
2307 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2308 GemmMicrokernelTester()
2309 .mr(4)
2310 .nr(16)
2311 .kr(1)
2312 .sr(1)
2313 .m(4)
2314 .n(16)
2315 .k(4)
2316 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2317 }
2318
2319 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cn) {
2320 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2321 GemmMicrokernelTester()
2322 .mr(4)
2323 .nr(16)
2324 .kr(1)
2325 .sr(1)
2326 .m(4)
2327 .n(16)
2328 .k(4)
2329 .cn_stride(19)
2330 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2331 }
2332
2333 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
2334 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2335 GemmMicrokernelTester()
2336 .mr(4)
2337 .nr(16)
2338 .kr(1)
2339 .sr(1)
2340 .m(4)
2341 .n(16)
2342 .k(4)
2343 .a_stride(7)
2344 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2345 }
2346
2347 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
2348 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2349 for (uint32_t m = 1; m <= 4; m++) {
2350 for (uint32_t n = 1; n <= 16; n++) {
2351 GemmMicrokernelTester()
2352 .mr(4)
2353 .nr(16)
2354 .kr(1)
2355 .sr(1)
2356 .m(m)
2357 .n(n)
2358 .k(4)
2359 .iterations(1)
2360 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2361 }
2362 }
2363 }
2364
2365 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
2366 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2367 for (uint32_t m = 1; m <= 4; m++) {
2368 GemmMicrokernelTester()
2369 .mr(4)
2370 .nr(16)
2371 .kr(1)
2372 .sr(1)
2373 .m(m)
2374 .n(16)
2375 .k(4)
2376 .iterations(1)
2377 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2378 }
2379 }
2380
2381 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
2382 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2383 for (uint32_t n = 1; n <= 16; n++) {
2384 GemmMicrokernelTester()
2385 .mr(4)
2386 .nr(16)
2387 .kr(1)
2388 .sr(1)
2389 .m(4)
2390 .n(n)
2391 .k(4)
2392 .iterations(1)
2393 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2394 }
2395 }
2396
2397 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4) {
2398 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2399 for (size_t k = 1; k < 4; k++) {
2400 GemmMicrokernelTester()
2401 .mr(4)
2402 .nr(16)
2403 .kr(1)
2404 .sr(1)
2405 .m(4)
2406 .n(16)
2407 .k(k)
2408 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2409 }
2410 }
2411
2412 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
2413 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2414 for (size_t k = 1; k < 4; k++) {
2415 GemmMicrokernelTester()
2416 .mr(4)
2417 .nr(16)
2418 .kr(1)
2419 .sr(1)
2420 .m(4)
2421 .n(16)
2422 .k(k)
2423 .a_stride(7)
2424 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2425 }
2426 }
2427
2428 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
2429 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2430 for (size_t k = 1; k < 4; k++) {
2431 for (uint32_t m = 1; m <= 4; m++) {
2432 for (uint32_t n = 1; n <= 16; n++) {
2433 GemmMicrokernelTester()
2434 .mr(4)
2435 .nr(16)
2436 .kr(1)
2437 .sr(1)
2438 .m(m)
2439 .n(n)
2440 .k(k)
2441 .iterations(1)
2442 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2443 }
2444 }
2445 }
2446 }
2447
2448 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4) {
2449 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2450 for (size_t k = 5; k < 8; k++) {
2451 GemmMicrokernelTester()
2452 .mr(4)
2453 .nr(16)
2454 .kr(1)
2455 .sr(1)
2456 .m(4)
2457 .n(16)
2458 .k(k)
2459 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2460 }
2461 }
2462
2463 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
2464 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2465 for (size_t k = 5; k < 8; k++) {
2466 GemmMicrokernelTester()
2467 .mr(4)
2468 .nr(16)
2469 .kr(1)
2470 .sr(1)
2471 .m(4)
2472 .n(16)
2473 .k(k)
2474 .a_stride(11)
2475 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2476 }
2477 }
2478
2479 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
2480 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2481 for (size_t k = 5; k < 8; k++) {
2482 for (uint32_t m = 1; m <= 4; m++) {
2483 for (uint32_t n = 1; n <= 16; n++) {
2484 GemmMicrokernelTester()
2485 .mr(4)
2486 .nr(16)
2487 .kr(1)
2488 .sr(1)
2489 .m(m)
2490 .n(n)
2491 .k(k)
2492 .iterations(1)
2493 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2494 }
2495 }
2496 }
2497 }
2498
2499 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4) {
2500 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2501 for (size_t k = 8; k <= 40; k += 4) {
2502 GemmMicrokernelTester()
2503 .mr(4)
2504 .nr(16)
2505 .kr(1)
2506 .sr(1)
2507 .m(4)
2508 .n(16)
2509 .k(k)
2510 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2511 }
2512 }
2513
2514 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
2515 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2516 for (size_t k = 8; k <= 40; k += 4) {
2517 GemmMicrokernelTester()
2518 .mr(4)
2519 .nr(16)
2520 .kr(1)
2521 .sr(1)
2522 .m(4)
2523 .n(16)
2524 .k(k)
2525 .a_stride(43)
2526 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2527 }
2528 }
2529
2530 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
2531 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2532 for (size_t k = 8; k <= 40; k += 4) {
2533 for (uint32_t m = 1; m <= 4; m++) {
2534 for (uint32_t n = 1; n <= 16; n++) {
2535 GemmMicrokernelTester()
2536 .mr(4)
2537 .nr(16)
2538 .kr(1)
2539 .sr(1)
2540 .m(m)
2541 .n(n)
2542 .k(k)
2543 .iterations(1)
2544 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2545 }
2546 }
2547 }
2548 }
2549
2550 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16) {
2551 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2552 for (uint32_t n = 17; n < 32; n++) {
2553 for (size_t k = 1; k <= 20; k += 5) {
2554 GemmMicrokernelTester()
2555 .mr(4)
2556 .nr(16)
2557 .kr(1)
2558 .sr(1)
2559 .m(4)
2560 .n(16)
2561 .k(k)
2562 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2563 }
2564 }
2565 }
2566
2567 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
2568 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2569 for (uint32_t n = 17; n < 32; n++) {
2570 for (size_t k = 1; k <= 20; k += 5) {
2571 GemmMicrokernelTester()
2572 .mr(4)
2573 .nr(16)
2574 .kr(1)
2575 .sr(1)
2576 .m(4)
2577 .n(16)
2578 .k(k)
2579 .cn_stride(19)
2580 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2581 }
2582 }
2583 }
2584
2585 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) {
2586 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2587 for (uint32_t n = 17; n < 32; n++) {
2588 for (size_t k = 1; k <= 20; k += 5) {
2589 GemmMicrokernelTester()
2590 .mr(4)
2591 .nr(16)
2592 .kr(1)
2593 .sr(1)
2594 .m(4)
2595 .n(n)
2596 .k(k)
2597 .a_stride(23)
2598 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2599 }
2600 }
2601 }
2602
2603 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
2604 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2605 for (uint32_t n = 17; n < 32; n++) {
2606 for (size_t k = 1; k <= 20; k += 5) {
2607 for (uint32_t m = 1; m <= 4; m++) {
2608 GemmMicrokernelTester()
2609 .mr(4)
2610 .nr(16)
2611 .kr(1)
2612 .sr(1)
2613 .m(m)
2614 .n(n)
2615 .k(k)
2616 .iterations(1)
2617 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2618 }
2619 }
2620 }
2621 }
2622
2623 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16) {
2624 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2625 for (uint32_t n = 32; n <= 48; n += 16) {
2626 for (size_t k = 1; k <= 20; k += 5) {
2627 GemmMicrokernelTester()
2628 .mr(4)
2629 .nr(16)
2630 .kr(1)
2631 .sr(1)
2632 .m(4)
2633 .n(16)
2634 .k(k)
2635 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2636 }
2637 }
2638 }
2639
2640 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
2641 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2642 for (uint32_t n = 32; n <= 48; n += 16) {
2643 for (size_t k = 1; k <= 20; k += 5) {
2644 GemmMicrokernelTester()
2645 .mr(4)
2646 .nr(16)
2647 .kr(1)
2648 .sr(1)
2649 .m(4)
2650 .n(n)
2651 .k(k)
2652 .cn_stride(19)
2653 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2654 }
2655 }
2656 }
2657
2658 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_strided_a) {
2659 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2660 for (uint32_t n = 32; n <= 48; n += 16) {
2661 for (size_t k = 1; k <= 20; k += 5) {
2662 GemmMicrokernelTester()
2663 .mr(4)
2664 .nr(16)
2665 .kr(1)
2666 .sr(1)
2667 .m(4)
2668 .n(n)
2669 .k(k)
2670 .a_stride(23)
2671 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2672 }
2673 }
2674 }
2675
2676 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
2677 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2678 for (uint32_t n = 32; n <= 48; n += 16) {
2679 for (size_t k = 1; k <= 20; k += 5) {
2680 for (uint32_t m = 1; m <= 4; m++) {
2681 GemmMicrokernelTester()
2682 .mr(4)
2683 .nr(16)
2684 .kr(1)
2685 .sr(1)
2686 .m(m)
2687 .n(n)
2688 .k(k)
2689 .iterations(1)
2690 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2691 }
2692 }
2693 }
2694 }
2695
2696 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
2697 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2698 for (size_t k = 1; k <= 20; k += 5) {
2699 for (uint32_t m = 1; m <= 4; m++) {
2700 for (uint32_t n = 1; n <= 16; n++) {
2701 GemmMicrokernelTester()
2702 .mr(4)
2703 .nr(16)
2704 .kr(1)
2705 .sr(1)
2706 .m(m)
2707 .n(n)
2708 .k(k)
2709 .cm_stride(19)
2710 .iterations(1)
2711 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2712 }
2713 }
2714 }
2715 }
2716
2717 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmin) {
2718 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2719 GemmMicrokernelTester()
2720 .mr(4)
2721 .nr(16)
2722 .kr(1)
2723 .sr(1)
2724 .m(4)
2725 .n(16)
2726 .k(4)
2727 .qmin(128)
2728 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2729 }
2730
2731 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, qmax) {
2732 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2733 GemmMicrokernelTester()
2734 .mr(4)
2735 .nr(16)
2736 .kr(1)
2737 .sr(1)
2738 .m(4)
2739 .n(16)
2740 .k(4)
2741 .qmax(128)
2742 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2743 }
2744
2745 TEST(F16_GEMM_MINMAX_4X16__NEONFP16ARITH_LD64, strided_cm) {
2746 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2747 GemmMicrokernelTester()
2748 .mr(4)
2749 .nr(16)
2750 .kr(1)
2751 .sr(1)
2752 .m(4)
2753 .n(16)
2754 .k(4)
2755 .cm_stride(19)
2756 .Test(xnn_f16_gemm_minmax_ukernel_4x16__neonfp16arith_ld64);
2757 }
2758#endif // XNN_ARCH_ARM64
2759
2760
2761#if XNN_ARCH_ARM64
2762 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4) {
2763 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2764 GemmMicrokernelTester()
2765 .mr(6)
2766 .nr(16)
2767 .kr(1)
2768 .sr(1)
2769 .m(6)
2770 .n(16)
2771 .k(4)
2772 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2773 }
2774
2775 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cn) {
2776 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2777 GemmMicrokernelTester()
2778 .mr(6)
2779 .nr(16)
2780 .kr(1)
2781 .sr(1)
2782 .m(6)
2783 .n(16)
2784 .k(4)
2785 .cn_stride(19)
2786 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2787 }
2788
2789 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
2790 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2791 GemmMicrokernelTester()
2792 .mr(6)
2793 .nr(16)
2794 .kr(1)
2795 .sr(1)
2796 .m(6)
2797 .n(16)
2798 .k(4)
2799 .a_stride(7)
2800 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2801 }
2802
2803 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
2804 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2805 for (uint32_t m = 1; m <= 6; m++) {
2806 for (uint32_t n = 1; n <= 16; n++) {
2807 GemmMicrokernelTester()
2808 .mr(6)
2809 .nr(16)
2810 .kr(1)
2811 .sr(1)
2812 .m(m)
2813 .n(n)
2814 .k(4)
2815 .iterations(1)
2816 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2817 }
2818 }
2819 }
2820
2821 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
2822 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2823 for (uint32_t m = 1; m <= 6; m++) {
2824 GemmMicrokernelTester()
2825 .mr(6)
2826 .nr(16)
2827 .kr(1)
2828 .sr(1)
2829 .m(m)
2830 .n(16)
2831 .k(4)
2832 .iterations(1)
2833 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2834 }
2835 }
2836
2837 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
2838 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2839 for (uint32_t n = 1; n <= 16; n++) {
2840 GemmMicrokernelTester()
2841 .mr(6)
2842 .nr(16)
2843 .kr(1)
2844 .sr(1)
2845 .m(6)
2846 .n(n)
2847 .k(4)
2848 .iterations(1)
2849 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2850 }
2851 }
2852
2853 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4) {
2854 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2855 for (size_t k = 1; k < 4; k++) {
2856 GemmMicrokernelTester()
2857 .mr(6)
2858 .nr(16)
2859 .kr(1)
2860 .sr(1)
2861 .m(6)
2862 .n(16)
2863 .k(k)
2864 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2865 }
2866 }
2867
2868 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
2869 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2870 for (size_t k = 1; k < 4; k++) {
2871 GemmMicrokernelTester()
2872 .mr(6)
2873 .nr(16)
2874 .kr(1)
2875 .sr(1)
2876 .m(6)
2877 .n(16)
2878 .k(k)
2879 .a_stride(7)
2880 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2881 }
2882 }
2883
2884 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
2885 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2886 for (size_t k = 1; k < 4; k++) {
2887 for (uint32_t m = 1; m <= 6; m++) {
2888 for (uint32_t n = 1; n <= 16; n++) {
2889 GemmMicrokernelTester()
2890 .mr(6)
2891 .nr(16)
2892 .kr(1)
2893 .sr(1)
2894 .m(m)
2895 .n(n)
2896 .k(k)
2897 .iterations(1)
2898 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2899 }
2900 }
2901 }
2902 }
2903
2904 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4) {
2905 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2906 for (size_t k = 5; k < 8; k++) {
2907 GemmMicrokernelTester()
2908 .mr(6)
2909 .nr(16)
2910 .kr(1)
2911 .sr(1)
2912 .m(6)
2913 .n(16)
2914 .k(k)
2915 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2916 }
2917 }
2918
2919 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
2920 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2921 for (size_t k = 5; k < 8; k++) {
2922 GemmMicrokernelTester()
2923 .mr(6)
2924 .nr(16)
2925 .kr(1)
2926 .sr(1)
2927 .m(6)
2928 .n(16)
2929 .k(k)
2930 .a_stride(11)
2931 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2932 }
2933 }
2934
2935 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
2936 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2937 for (size_t k = 5; k < 8; k++) {
2938 for (uint32_t m = 1; m <= 6; m++) {
2939 for (uint32_t n = 1; n <= 16; n++) {
2940 GemmMicrokernelTester()
2941 .mr(6)
2942 .nr(16)
2943 .kr(1)
2944 .sr(1)
2945 .m(m)
2946 .n(n)
2947 .k(k)
2948 .iterations(1)
2949 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2950 }
2951 }
2952 }
2953 }
2954
2955 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4) {
2956 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2957 for (size_t k = 8; k <= 40; k += 4) {
2958 GemmMicrokernelTester()
2959 .mr(6)
2960 .nr(16)
2961 .kr(1)
2962 .sr(1)
2963 .m(6)
2964 .n(16)
2965 .k(k)
2966 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2967 }
2968 }
2969
2970 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
2971 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2972 for (size_t k = 8; k <= 40; k += 4) {
2973 GemmMicrokernelTester()
2974 .mr(6)
2975 .nr(16)
2976 .kr(1)
2977 .sr(1)
2978 .m(6)
2979 .n(16)
2980 .k(k)
2981 .a_stride(43)
2982 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
2983 }
2984 }
2985
2986 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
2987 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
2988 for (size_t k = 8; k <= 40; k += 4) {
2989 for (uint32_t m = 1; m <= 6; m++) {
2990 for (uint32_t n = 1; n <= 16; n++) {
2991 GemmMicrokernelTester()
2992 .mr(6)
2993 .nr(16)
2994 .kr(1)
2995 .sr(1)
2996 .m(m)
2997 .n(n)
2998 .k(k)
2999 .iterations(1)
3000 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3001 }
3002 }
3003 }
3004 }
3005
3006 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16) {
3007 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3008 for (uint32_t n = 17; n < 32; n++) {
3009 for (size_t k = 1; k <= 20; k += 5) {
3010 GemmMicrokernelTester()
3011 .mr(6)
3012 .nr(16)
3013 .kr(1)
3014 .sr(1)
3015 .m(6)
3016 .n(16)
3017 .k(k)
3018 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3019 }
3020 }
3021 }
3022
3023 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
3024 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3025 for (uint32_t n = 17; n < 32; n++) {
3026 for (size_t k = 1; k <= 20; k += 5) {
3027 GemmMicrokernelTester()
3028 .mr(6)
3029 .nr(16)
3030 .kr(1)
3031 .sr(1)
3032 .m(6)
3033 .n(16)
3034 .k(k)
3035 .cn_stride(19)
3036 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3037 }
3038 }
3039 }
3040
3041 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) {
3042 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3043 for (uint32_t n = 17; n < 32; n++) {
3044 for (size_t k = 1; k <= 20; k += 5) {
3045 GemmMicrokernelTester()
3046 .mr(6)
3047 .nr(16)
3048 .kr(1)
3049 .sr(1)
3050 .m(6)
3051 .n(n)
3052 .k(k)
3053 .a_stride(23)
3054 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3055 }
3056 }
3057 }
3058
3059 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
3060 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3061 for (uint32_t n = 17; n < 32; n++) {
3062 for (size_t k = 1; k <= 20; k += 5) {
3063 for (uint32_t m = 1; m <= 6; m++) {
3064 GemmMicrokernelTester()
3065 .mr(6)
3066 .nr(16)
3067 .kr(1)
3068 .sr(1)
3069 .m(m)
3070 .n(n)
3071 .k(k)
3072 .iterations(1)
3073 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3074 }
3075 }
3076 }
3077 }
3078
3079 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16) {
3080 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3081 for (uint32_t n = 32; n <= 48; n += 16) {
3082 for (size_t k = 1; k <= 20; k += 5) {
3083 GemmMicrokernelTester()
3084 .mr(6)
3085 .nr(16)
3086 .kr(1)
3087 .sr(1)
3088 .m(6)
3089 .n(16)
3090 .k(k)
3091 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3092 }
3093 }
3094 }
3095
3096 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
3097 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3098 for (uint32_t n = 32; n <= 48; n += 16) {
3099 for (size_t k = 1; k <= 20; k += 5) {
3100 GemmMicrokernelTester()
3101 .mr(6)
3102 .nr(16)
3103 .kr(1)
3104 .sr(1)
3105 .m(6)
3106 .n(n)
3107 .k(k)
3108 .cn_stride(19)
3109 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3110 }
3111 }
3112 }
3113
3114 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_strided_a) {
3115 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3116 for (uint32_t n = 32; n <= 48; n += 16) {
3117 for (size_t k = 1; k <= 20; k += 5) {
3118 GemmMicrokernelTester()
3119 .mr(6)
3120 .nr(16)
3121 .kr(1)
3122 .sr(1)
3123 .m(6)
3124 .n(n)
3125 .k(k)
3126 .a_stride(23)
3127 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3128 }
3129 }
3130 }
3131
3132 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
3133 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3134 for (uint32_t n = 32; n <= 48; n += 16) {
3135 for (size_t k = 1; k <= 20; k += 5) {
3136 for (uint32_t m = 1; m <= 6; m++) {
3137 GemmMicrokernelTester()
3138 .mr(6)
3139 .nr(16)
3140 .kr(1)
3141 .sr(1)
3142 .m(m)
3143 .n(n)
3144 .k(k)
3145 .iterations(1)
3146 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3147 }
3148 }
3149 }
3150 }
3151
3152 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
3153 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3154 for (size_t k = 1; k <= 20; k += 5) {
3155 for (uint32_t m = 1; m <= 6; m++) {
3156 for (uint32_t n = 1; n <= 16; n++) {
3157 GemmMicrokernelTester()
3158 .mr(6)
3159 .nr(16)
3160 .kr(1)
3161 .sr(1)
3162 .m(m)
3163 .n(n)
3164 .k(k)
3165 .cm_stride(19)
3166 .iterations(1)
3167 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3168 }
3169 }
3170 }
3171 }
3172
3173 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmin) {
3174 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3175 GemmMicrokernelTester()
3176 .mr(6)
3177 .nr(16)
3178 .kr(1)
3179 .sr(1)
3180 .m(6)
3181 .n(16)
3182 .k(4)
3183 .qmin(128)
3184 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3185 }
3186
3187 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, qmax) {
3188 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3189 GemmMicrokernelTester()
3190 .mr(6)
3191 .nr(16)
3192 .kr(1)
3193 .sr(1)
3194 .m(6)
3195 .n(16)
3196 .k(4)
3197 .qmax(128)
3198 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3199 }
3200
3201 TEST(F16_GEMM_MINMAX_6X16__NEONFP16ARITH_LD64, strided_cm) {
3202 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3203 GemmMicrokernelTester()
3204 .mr(6)
3205 .nr(16)
3206 .kr(1)
3207 .sr(1)
3208 .m(6)
3209 .n(16)
3210 .k(4)
3211 .cm_stride(19)
3212 .Test(xnn_f16_gemm_minmax_ukernel_6x16__neonfp16arith_ld64);
3213 }
3214#endif // XNN_ARCH_ARM64
3215
3216
3217#if XNN_ARCH_ARM64
3218 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4) {
3219 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3220 GemmMicrokernelTester()
3221 .mr(8)
3222 .nr(16)
3223 .kr(1)
3224 .sr(1)
3225 .m(8)
3226 .n(16)
3227 .k(4)
3228 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3229 }
3230
3231 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cn) {
3232 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3233 GemmMicrokernelTester()
3234 .mr(8)
3235 .nr(16)
3236 .kr(1)
3237 .sr(1)
3238 .m(8)
3239 .n(16)
3240 .k(4)
3241 .cn_stride(19)
3242 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3243 }
3244
3245 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_strided_a) {
3246 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3247 GemmMicrokernelTester()
3248 .mr(8)
3249 .nr(16)
3250 .kr(1)
3251 .sr(1)
3252 .m(8)
3253 .n(16)
3254 .k(4)
3255 .a_stride(7)
3256 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3257 }
3258
3259 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile) {
3260 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3261 for (uint32_t m = 1; m <= 8; m++) {
3262 for (uint32_t n = 1; n <= 16; n++) {
3263 GemmMicrokernelTester()
3264 .mr(8)
3265 .nr(16)
3266 .kr(1)
3267 .sr(1)
3268 .m(m)
3269 .n(n)
3270 .k(4)
3271 .iterations(1)
3272 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3273 }
3274 }
3275 }
3276
3277 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
3278 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3279 for (uint32_t m = 1; m <= 8; m++) {
3280 GemmMicrokernelTester()
3281 .mr(8)
3282 .nr(16)
3283 .kr(1)
3284 .sr(1)
3285 .m(m)
3286 .n(16)
3287 .k(4)
3288 .iterations(1)
3289 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3290 }
3291 }
3292
3293 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
3294 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3295 for (uint32_t n = 1; n <= 16; n++) {
3296 GemmMicrokernelTester()
3297 .mr(8)
3298 .nr(16)
3299 .kr(1)
3300 .sr(1)
3301 .m(8)
3302 .n(n)
3303 .k(4)
3304 .iterations(1)
3305 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3306 }
3307 }
3308
3309 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4) {
3310 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3311 for (size_t k = 1; k < 4; k++) {
3312 GemmMicrokernelTester()
3313 .mr(8)
3314 .nr(16)
3315 .kr(1)
3316 .sr(1)
3317 .m(8)
3318 .n(16)
3319 .k(k)
3320 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3321 }
3322 }
3323
3324 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_strided_a) {
3325 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3326 for (size_t k = 1; k < 4; k++) {
3327 GemmMicrokernelTester()
3328 .mr(8)
3329 .nr(16)
3330 .kr(1)
3331 .sr(1)
3332 .m(8)
3333 .n(16)
3334 .k(k)
3335 .a_stride(7)
3336 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3337 }
3338 }
3339
3340 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_lt_4_subtile) {
3341 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3342 for (size_t k = 1; k < 4; k++) {
3343 for (uint32_t m = 1; m <= 8; m++) {
3344 for (uint32_t n = 1; n <= 16; n++) {
3345 GemmMicrokernelTester()
3346 .mr(8)
3347 .nr(16)
3348 .kr(1)
3349 .sr(1)
3350 .m(m)
3351 .n(n)
3352 .k(k)
3353 .iterations(1)
3354 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3355 }
3356 }
3357 }
3358 }
3359
3360 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4) {
3361 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3362 for (size_t k = 5; k < 8; k++) {
3363 GemmMicrokernelTester()
3364 .mr(8)
3365 .nr(16)
3366 .kr(1)
3367 .sr(1)
3368 .m(8)
3369 .n(16)
3370 .k(k)
3371 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3372 }
3373 }
3374
3375 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_strided_a) {
3376 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3377 for (size_t k = 5; k < 8; k++) {
3378 GemmMicrokernelTester()
3379 .mr(8)
3380 .nr(16)
3381 .kr(1)
3382 .sr(1)
3383 .m(8)
3384 .n(16)
3385 .k(k)
3386 .a_stride(11)
3387 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3388 }
3389 }
3390
3391 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_gt_4_subtile) {
3392 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3393 for (size_t k = 5; k < 8; k++) {
3394 for (uint32_t m = 1; m <= 8; m++) {
3395 for (uint32_t n = 1; n <= 16; n++) {
3396 GemmMicrokernelTester()
3397 .mr(8)
3398 .nr(16)
3399 .kr(1)
3400 .sr(1)
3401 .m(m)
3402 .n(n)
3403 .k(k)
3404 .iterations(1)
3405 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3406 }
3407 }
3408 }
3409 }
3410
3411 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4) {
3412 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3413 for (size_t k = 8; k <= 40; k += 4) {
3414 GemmMicrokernelTester()
3415 .mr(8)
3416 .nr(16)
3417 .kr(1)
3418 .sr(1)
3419 .m(8)
3420 .n(16)
3421 .k(k)
3422 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3423 }
3424 }
3425
3426 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_strided_a) {
3427 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3428 for (size_t k = 8; k <= 40; k += 4) {
3429 GemmMicrokernelTester()
3430 .mr(8)
3431 .nr(16)
3432 .kr(1)
3433 .sr(1)
3434 .m(8)
3435 .n(16)
3436 .k(k)
3437 .a_stride(43)
3438 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3439 }
3440 }
3441
3442 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, k_div_4_subtile) {
3443 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3444 for (size_t k = 8; k <= 40; k += 4) {
3445 for (uint32_t m = 1; m <= 8; m++) {
3446 for (uint32_t n = 1; n <= 16; n++) {
3447 GemmMicrokernelTester()
3448 .mr(8)
3449 .nr(16)
3450 .kr(1)
3451 .sr(1)
3452 .m(m)
3453 .n(n)
3454 .k(k)
3455 .iterations(1)
3456 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3457 }
3458 }
3459 }
3460 }
3461
3462 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16) {
3463 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3464 for (uint32_t n = 17; n < 32; n++) {
3465 for (size_t k = 1; k <= 20; k += 5) {
3466 GemmMicrokernelTester()
3467 .mr(8)
3468 .nr(16)
3469 .kr(1)
3470 .sr(1)
3471 .m(8)
3472 .n(16)
3473 .k(k)
3474 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3475 }
3476 }
3477 }
3478
3479 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_cn) {
3480 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3481 for (uint32_t n = 17; n < 32; n++) {
3482 for (size_t k = 1; k <= 20; k += 5) {
3483 GemmMicrokernelTester()
3484 .mr(8)
3485 .nr(16)
3486 .kr(1)
3487 .sr(1)
3488 .m(8)
3489 .n(16)
3490 .k(k)
3491 .cn_stride(19)
3492 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3493 }
3494 }
3495 }
3496
3497 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_strided_a) {
3498 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3499 for (uint32_t n = 17; n < 32; n++) {
3500 for (size_t k = 1; k <= 20; k += 5) {
3501 GemmMicrokernelTester()
3502 .mr(8)
3503 .nr(16)
3504 .kr(1)
3505 .sr(1)
3506 .m(8)
3507 .n(n)
3508 .k(k)
3509 .a_stride(23)
3510 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3511 }
3512 }
3513 }
3514
3515 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_gt_16_subtile) {
3516 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3517 for (uint32_t n = 17; n < 32; n++) {
3518 for (size_t k = 1; k <= 20; k += 5) {
3519 for (uint32_t m = 1; m <= 8; m++) {
3520 GemmMicrokernelTester()
3521 .mr(8)
3522 .nr(16)
3523 .kr(1)
3524 .sr(1)
3525 .m(m)
3526 .n(n)
3527 .k(k)
3528 .iterations(1)
3529 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3530 }
3531 }
3532 }
3533 }
3534
3535 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16) {
3536 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3537 for (uint32_t n = 32; n <= 48; n += 16) {
3538 for (size_t k = 1; k <= 20; k += 5) {
3539 GemmMicrokernelTester()
3540 .mr(8)
3541 .nr(16)
3542 .kr(1)
3543 .sr(1)
3544 .m(8)
3545 .n(16)
3546 .k(k)
3547 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3548 }
3549 }
3550 }
3551
3552 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_cn) {
3553 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3554 for (uint32_t n = 32; n <= 48; n += 16) {
3555 for (size_t k = 1; k <= 20; k += 5) {
3556 GemmMicrokernelTester()
3557 .mr(8)
3558 .nr(16)
3559 .kr(1)
3560 .sr(1)
3561 .m(8)
3562 .n(n)
3563 .k(k)
3564 .cn_stride(19)
3565 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3566 }
3567 }
3568 }
3569
3570 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_strided_a) {
3571 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3572 for (uint32_t n = 32; n <= 48; n += 16) {
3573 for (size_t k = 1; k <= 20; k += 5) {
3574 GemmMicrokernelTester()
3575 .mr(8)
3576 .nr(16)
3577 .kr(1)
3578 .sr(1)
3579 .m(8)
3580 .n(n)
3581 .k(k)
3582 .a_stride(23)
3583 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3584 }
3585 }
3586 }
3587
3588 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, n_div_16_subtile) {
3589 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3590 for (uint32_t n = 32; n <= 48; n += 16) {
3591 for (size_t k = 1; k <= 20; k += 5) {
3592 for (uint32_t m = 1; m <= 8; m++) {
3593 GemmMicrokernelTester()
3594 .mr(8)
3595 .nr(16)
3596 .kr(1)
3597 .sr(1)
3598 .m(m)
3599 .n(n)
3600 .k(k)
3601 .iterations(1)
3602 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3603 }
3604 }
3605 }
3606 }
3607
3608 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm_subtile) {
3609 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3610 for (size_t k = 1; k <= 20; k += 5) {
3611 for (uint32_t m = 1; m <= 8; m++) {
3612 for (uint32_t n = 1; n <= 16; n++) {
3613 GemmMicrokernelTester()
3614 .mr(8)
3615 .nr(16)
3616 .kr(1)
3617 .sr(1)
3618 .m(m)
3619 .n(n)
3620 .k(k)
3621 .cm_stride(19)
3622 .iterations(1)
3623 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3624 }
3625 }
3626 }
3627 }
3628
3629 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmin) {
3630 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3631 GemmMicrokernelTester()
3632 .mr(8)
3633 .nr(16)
3634 .kr(1)
3635 .sr(1)
3636 .m(8)
3637 .n(16)
3638 .k(4)
3639 .qmin(128)
3640 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3641 }
3642
3643 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, qmax) {
3644 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3645 GemmMicrokernelTester()
3646 .mr(8)
3647 .nr(16)
3648 .kr(1)
3649 .sr(1)
3650 .m(8)
3651 .n(16)
3652 .k(4)
3653 .qmax(128)
3654 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3655 }
3656
3657 TEST(F16_GEMM_MINMAX_8X16__NEONFP16ARITH_LD64, strided_cm) {
3658 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3659 GemmMicrokernelTester()
3660 .mr(8)
3661 .nr(16)
3662 .kr(1)
3663 .sr(1)
3664 .m(8)
3665 .n(16)
3666 .k(4)
3667 .cm_stride(19)
3668 .Test(xnn_f16_gemm_minmax_ukernel_8x16__neonfp16arith_ld64);
3669 }
3670#endif // XNN_ARCH_ARM64
3671
3672
3673#if XNN_ARCH_ARM64
Frank Barchard36b76b62020-04-10 12:39:17 -07003674 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) {
3675 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3676 GemmMicrokernelTester()
3677 .mr(1)
3678 .nr(16)
3679 .kr(1)
3680 .sr(1)
3681 .m(1)
3682 .n(16)
3683 .k(2)
3684 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3685 }
3686
3687 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) {
3688 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3689 GemmMicrokernelTester()
3690 .mr(1)
3691 .nr(16)
3692 .kr(1)
3693 .sr(1)
3694 .m(1)
3695 .n(16)
3696 .k(2)
3697 .cn_stride(19)
3698 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3699 }
3700
3701 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) {
3702 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3703 GemmMicrokernelTester()
3704 .mr(1)
3705 .nr(16)
3706 .kr(1)
3707 .sr(1)
3708 .m(1)
3709 .n(16)
3710 .k(2)
3711 .a_stride(5)
3712 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3713 }
3714
3715 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) {
3716 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3717 for (uint32_t m = 1; m <= 1; m++) {
3718 for (uint32_t n = 1; n <= 16; n++) {
3719 GemmMicrokernelTester()
3720 .mr(1)
3721 .nr(16)
3722 .kr(1)
3723 .sr(1)
3724 .m(m)
3725 .n(n)
3726 .k(2)
3727 .iterations(1)
3728 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3729 }
3730 }
3731 }
3732
3733 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) {
3734 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3735 for (uint32_t m = 1; m <= 1; m++) {
3736 GemmMicrokernelTester()
3737 .mr(1)
3738 .nr(16)
3739 .kr(1)
3740 .sr(1)
3741 .m(m)
3742 .n(16)
3743 .k(2)
3744 .iterations(1)
3745 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3746 }
3747 }
3748
3749 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) {
3750 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3751 for (uint32_t n = 1; n <= 16; n++) {
3752 GemmMicrokernelTester()
3753 .mr(1)
3754 .nr(16)
3755 .kr(1)
3756 .sr(1)
3757 .m(1)
3758 .n(n)
3759 .k(2)
3760 .iterations(1)
3761 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3762 }
3763 }
3764
3765 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) {
3766 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3767 for (size_t k = 1; k < 2; k++) {
3768 GemmMicrokernelTester()
3769 .mr(1)
3770 .nr(16)
3771 .kr(1)
3772 .sr(1)
3773 .m(1)
3774 .n(16)
3775 .k(k)
3776 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3777 }
3778 }
3779
3780 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) {
3781 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3782 for (size_t k = 1; k < 2; k++) {
3783 GemmMicrokernelTester()
3784 .mr(1)
3785 .nr(16)
3786 .kr(1)
3787 .sr(1)
3788 .m(1)
3789 .n(16)
3790 .k(k)
3791 .a_stride(5)
3792 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3793 }
3794 }
3795
3796 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) {
3797 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3798 for (size_t k = 1; k < 2; k++) {
3799 for (uint32_t m = 1; m <= 1; m++) {
3800 for (uint32_t n = 1; n <= 16; n++) {
3801 GemmMicrokernelTester()
3802 .mr(1)
3803 .nr(16)
3804 .kr(1)
3805 .sr(1)
3806 .m(m)
3807 .n(n)
3808 .k(k)
3809 .iterations(1)
3810 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3811 }
3812 }
3813 }
3814 }
3815
3816 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) {
3817 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3818 for (size_t k = 3; k < 4; k++) {
3819 GemmMicrokernelTester()
3820 .mr(1)
3821 .nr(16)
3822 .kr(1)
3823 .sr(1)
3824 .m(1)
3825 .n(16)
3826 .k(k)
3827 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3828 }
3829 }
3830
3831 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) {
3832 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3833 for (size_t k = 3; k < 4; k++) {
3834 GemmMicrokernelTester()
3835 .mr(1)
3836 .nr(16)
3837 .kr(1)
3838 .sr(1)
3839 .m(1)
3840 .n(16)
3841 .k(k)
3842 .a_stride(7)
3843 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3844 }
3845 }
3846
3847 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) {
3848 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3849 for (size_t k = 3; k < 4; k++) {
3850 for (uint32_t m = 1; m <= 1; m++) {
3851 for (uint32_t n = 1; n <= 16; n++) {
3852 GemmMicrokernelTester()
3853 .mr(1)
3854 .nr(16)
3855 .kr(1)
3856 .sr(1)
3857 .m(m)
3858 .n(n)
3859 .k(k)
3860 .iterations(1)
3861 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3862 }
3863 }
3864 }
3865 }
3866
3867 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) {
3868 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3869 for (size_t k = 4; k <= 20; k += 2) {
3870 GemmMicrokernelTester()
3871 .mr(1)
3872 .nr(16)
3873 .kr(1)
3874 .sr(1)
3875 .m(1)
3876 .n(16)
3877 .k(k)
3878 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3879 }
3880 }
3881
3882 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) {
3883 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3884 for (size_t k = 4; k <= 20; k += 2) {
3885 GemmMicrokernelTester()
3886 .mr(1)
3887 .nr(16)
3888 .kr(1)
3889 .sr(1)
3890 .m(1)
3891 .n(16)
3892 .k(k)
3893 .a_stride(23)
3894 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3895 }
3896 }
3897
3898 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) {
3899 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3900 for (size_t k = 4; k <= 20; k += 2) {
3901 for (uint32_t m = 1; m <= 1; m++) {
3902 for (uint32_t n = 1; n <= 16; n++) {
3903 GemmMicrokernelTester()
3904 .mr(1)
3905 .nr(16)
3906 .kr(1)
3907 .sr(1)
3908 .m(m)
3909 .n(n)
3910 .k(k)
3911 .iterations(1)
3912 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3913 }
3914 }
3915 }
3916 }
3917
3918 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) {
3919 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3920 for (uint32_t n = 17; n < 32; n++) {
3921 for (size_t k = 1; k <= 10; k += 3) {
3922 GemmMicrokernelTester()
3923 .mr(1)
3924 .nr(16)
3925 .kr(1)
3926 .sr(1)
3927 .m(1)
3928 .n(16)
3929 .k(k)
3930 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3931 }
3932 }
3933 }
3934
3935 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) {
3936 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3937 for (uint32_t n = 17; n < 32; n++) {
3938 for (size_t k = 1; k <= 10; k += 3) {
3939 GemmMicrokernelTester()
3940 .mr(1)
3941 .nr(16)
3942 .kr(1)
3943 .sr(1)
3944 .m(1)
3945 .n(16)
3946 .k(k)
3947 .cn_stride(19)
3948 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3949 }
3950 }
3951 }
3952
3953 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) {
3954 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3955 for (uint32_t n = 17; n < 32; n++) {
3956 for (size_t k = 1; k <= 10; k += 3) {
3957 GemmMicrokernelTester()
3958 .mr(1)
3959 .nr(16)
3960 .kr(1)
3961 .sr(1)
3962 .m(1)
3963 .n(n)
3964 .k(k)
3965 .a_stride(13)
3966 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3967 }
3968 }
3969 }
3970
3971 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) {
3972 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3973 for (uint32_t n = 17; n < 32; n++) {
3974 for (size_t k = 1; k <= 10; k += 3) {
3975 for (uint32_t m = 1; m <= 1; m++) {
3976 GemmMicrokernelTester()
3977 .mr(1)
3978 .nr(16)
3979 .kr(1)
3980 .sr(1)
3981 .m(m)
3982 .n(n)
3983 .k(k)
3984 .iterations(1)
3985 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
3986 }
3987 }
3988 }
3989 }
3990
3991 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) {
3992 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
3993 for (uint32_t n = 32; n <= 48; n += 16) {
3994 for (size_t k = 1; k <= 10; k += 3) {
3995 GemmMicrokernelTester()
3996 .mr(1)
3997 .nr(16)
3998 .kr(1)
3999 .sr(1)
4000 .m(1)
4001 .n(16)
4002 .k(k)
4003 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4004 }
4005 }
4006 }
4007
4008 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) {
4009 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4010 for (uint32_t n = 32; n <= 48; n += 16) {
4011 for (size_t k = 1; k <= 10; k += 3) {
4012 GemmMicrokernelTester()
4013 .mr(1)
4014 .nr(16)
4015 .kr(1)
4016 .sr(1)
4017 .m(1)
4018 .n(n)
4019 .k(k)
4020 .cn_stride(19)
4021 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4022 }
4023 }
4024 }
4025
4026 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) {
4027 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4028 for (uint32_t n = 32; n <= 48; n += 16) {
4029 for (size_t k = 1; k <= 10; k += 3) {
4030 GemmMicrokernelTester()
4031 .mr(1)
4032 .nr(16)
4033 .kr(1)
4034 .sr(1)
4035 .m(1)
4036 .n(n)
4037 .k(k)
4038 .a_stride(13)
4039 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4040 }
4041 }
4042 }
4043
4044 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) {
4045 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4046 for (uint32_t n = 32; n <= 48; n += 16) {
4047 for (size_t k = 1; k <= 10; k += 3) {
4048 for (uint32_t m = 1; m <= 1; m++) {
4049 GemmMicrokernelTester()
4050 .mr(1)
4051 .nr(16)
4052 .kr(1)
4053 .sr(1)
4054 .m(m)
4055 .n(n)
4056 .k(k)
4057 .iterations(1)
4058 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4059 }
4060 }
4061 }
4062 }
4063
4064 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) {
4065 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4066 for (size_t k = 1; k <= 10; k += 3) {
4067 for (uint32_t m = 1; m <= 1; m++) {
4068 for (uint32_t n = 1; n <= 16; n++) {
4069 GemmMicrokernelTester()
4070 .mr(1)
4071 .nr(16)
4072 .kr(1)
4073 .sr(1)
4074 .m(m)
4075 .n(n)
4076 .k(k)
4077 .cm_stride(19)
4078 .iterations(1)
4079 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4080 }
4081 }
4082 }
4083 }
4084
4085 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmin) {
4086 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4087 GemmMicrokernelTester()
4088 .mr(1)
4089 .nr(16)
4090 .kr(1)
4091 .sr(1)
4092 .m(1)
4093 .n(16)
4094 .k(2)
4095 .qmin(128)
4096 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4097 }
4098
4099 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, qmax) {
4100 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4101 GemmMicrokernelTester()
4102 .mr(1)
4103 .nr(16)
4104 .kr(1)
4105 .sr(1)
4106 .m(1)
4107 .n(16)
4108 .k(2)
4109 .qmax(128)
4110 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4111 }
4112
4113 TEST(F16_GEMM_MINMAX_1X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) {
4114 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4115 GemmMicrokernelTester()
4116 .mr(1)
4117 .nr(16)
4118 .kr(1)
4119 .sr(1)
4120 .m(1)
4121 .n(16)
4122 .k(2)
4123 .cm_stride(19)
4124 .Test(xnn_f16_gemm_minmax_ukernel_1x16__aarch64_neonfp16arith_ld32);
4125 }
4126#endif // XNN_ARCH_ARM64
4127
4128
4129#if XNN_ARCH_ARM64
Frank Barchard683f5592020-04-10 00:48:26 -07004130 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) {
4131 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4132 GemmMicrokernelTester()
4133 .mr(4)
4134 .nr(16)
4135 .kr(1)
4136 .sr(1)
4137 .m(4)
4138 .n(16)
4139 .k(2)
4140 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4141 }
4142
4143 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) {
4144 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4145 GemmMicrokernelTester()
4146 .mr(4)
4147 .nr(16)
4148 .kr(1)
4149 .sr(1)
4150 .m(4)
4151 .n(16)
4152 .k(2)
4153 .cn_stride(19)
4154 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4155 }
4156
4157 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) {
4158 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4159 GemmMicrokernelTester()
4160 .mr(4)
4161 .nr(16)
4162 .kr(1)
4163 .sr(1)
4164 .m(4)
4165 .n(16)
4166 .k(2)
4167 .a_stride(5)
4168 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4169 }
4170
4171 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) {
4172 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4173 for (uint32_t m = 1; m <= 4; m++) {
4174 for (uint32_t n = 1; n <= 16; n++) {
4175 GemmMicrokernelTester()
4176 .mr(4)
4177 .nr(16)
4178 .kr(1)
4179 .sr(1)
4180 .m(m)
4181 .n(n)
4182 .k(2)
4183 .iterations(1)
4184 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4185 }
4186 }
4187 }
4188
4189 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) {
4190 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4191 for (uint32_t m = 1; m <= 4; m++) {
4192 GemmMicrokernelTester()
4193 .mr(4)
4194 .nr(16)
4195 .kr(1)
4196 .sr(1)
4197 .m(m)
4198 .n(16)
4199 .k(2)
4200 .iterations(1)
4201 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4202 }
4203 }
4204
4205 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) {
4206 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4207 for (uint32_t n = 1; n <= 16; n++) {
4208 GemmMicrokernelTester()
4209 .mr(4)
4210 .nr(16)
4211 .kr(1)
4212 .sr(1)
4213 .m(4)
4214 .n(n)
4215 .k(2)
4216 .iterations(1)
4217 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4218 }
4219 }
4220
4221 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) {
4222 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4223 for (size_t k = 1; k < 2; k++) {
4224 GemmMicrokernelTester()
4225 .mr(4)
4226 .nr(16)
4227 .kr(1)
4228 .sr(1)
4229 .m(4)
4230 .n(16)
4231 .k(k)
4232 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4233 }
4234 }
4235
4236 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) {
4237 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4238 for (size_t k = 1; k < 2; k++) {
4239 GemmMicrokernelTester()
4240 .mr(4)
4241 .nr(16)
4242 .kr(1)
4243 .sr(1)
4244 .m(4)
4245 .n(16)
4246 .k(k)
4247 .a_stride(5)
4248 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4249 }
4250 }
4251
4252 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) {
4253 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4254 for (size_t k = 1; k < 2; k++) {
4255 for (uint32_t m = 1; m <= 4; m++) {
4256 for (uint32_t n = 1; n <= 16; n++) {
4257 GemmMicrokernelTester()
4258 .mr(4)
4259 .nr(16)
4260 .kr(1)
4261 .sr(1)
4262 .m(m)
4263 .n(n)
4264 .k(k)
4265 .iterations(1)
4266 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4267 }
4268 }
4269 }
4270 }
4271
4272 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) {
4273 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4274 for (size_t k = 3; k < 4; k++) {
4275 GemmMicrokernelTester()
4276 .mr(4)
4277 .nr(16)
4278 .kr(1)
4279 .sr(1)
4280 .m(4)
4281 .n(16)
4282 .k(k)
4283 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4284 }
4285 }
4286
4287 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) {
4288 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4289 for (size_t k = 3; k < 4; k++) {
4290 GemmMicrokernelTester()
4291 .mr(4)
4292 .nr(16)
4293 .kr(1)
4294 .sr(1)
4295 .m(4)
4296 .n(16)
4297 .k(k)
4298 .a_stride(7)
4299 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4300 }
4301 }
4302
4303 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) {
4304 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4305 for (size_t k = 3; k < 4; k++) {
4306 for (uint32_t m = 1; m <= 4; m++) {
4307 for (uint32_t n = 1; n <= 16; n++) {
4308 GemmMicrokernelTester()
4309 .mr(4)
4310 .nr(16)
4311 .kr(1)
4312 .sr(1)
4313 .m(m)
4314 .n(n)
4315 .k(k)
4316 .iterations(1)
4317 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4318 }
4319 }
4320 }
4321 }
4322
4323 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) {
4324 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4325 for (size_t k = 4; k <= 20; k += 2) {
4326 GemmMicrokernelTester()
4327 .mr(4)
4328 .nr(16)
4329 .kr(1)
4330 .sr(1)
4331 .m(4)
4332 .n(16)
4333 .k(k)
4334 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4335 }
4336 }
4337
4338 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) {
4339 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4340 for (size_t k = 4; k <= 20; k += 2) {
4341 GemmMicrokernelTester()
4342 .mr(4)
4343 .nr(16)
4344 .kr(1)
4345 .sr(1)
4346 .m(4)
4347 .n(16)
4348 .k(k)
4349 .a_stride(23)
4350 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4351 }
4352 }
4353
4354 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) {
4355 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4356 for (size_t k = 4; k <= 20; k += 2) {
4357 for (uint32_t m = 1; m <= 4; m++) {
4358 for (uint32_t n = 1; n <= 16; n++) {
4359 GemmMicrokernelTester()
4360 .mr(4)
4361 .nr(16)
4362 .kr(1)
4363 .sr(1)
4364 .m(m)
4365 .n(n)
4366 .k(k)
4367 .iterations(1)
4368 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4369 }
4370 }
4371 }
4372 }
4373
4374 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) {
4375 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4376 for (uint32_t n = 17; n < 32; n++) {
4377 for (size_t k = 1; k <= 10; k += 3) {
4378 GemmMicrokernelTester()
4379 .mr(4)
4380 .nr(16)
4381 .kr(1)
4382 .sr(1)
4383 .m(4)
4384 .n(16)
4385 .k(k)
4386 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4387 }
4388 }
4389 }
4390
4391 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) {
4392 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4393 for (uint32_t n = 17; n < 32; n++) {
4394 for (size_t k = 1; k <= 10; k += 3) {
4395 GemmMicrokernelTester()
4396 .mr(4)
4397 .nr(16)
4398 .kr(1)
4399 .sr(1)
4400 .m(4)
4401 .n(16)
4402 .k(k)
4403 .cn_stride(19)
4404 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4405 }
4406 }
4407 }
4408
4409 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) {
4410 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4411 for (uint32_t n = 17; n < 32; n++) {
4412 for (size_t k = 1; k <= 10; k += 3) {
4413 GemmMicrokernelTester()
4414 .mr(4)
4415 .nr(16)
4416 .kr(1)
4417 .sr(1)
4418 .m(4)
4419 .n(n)
4420 .k(k)
4421 .a_stride(13)
4422 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4423 }
4424 }
4425 }
4426
4427 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) {
4428 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4429 for (uint32_t n = 17; n < 32; n++) {
4430 for (size_t k = 1; k <= 10; k += 3) {
4431 for (uint32_t m = 1; m <= 4; m++) {
4432 GemmMicrokernelTester()
4433 .mr(4)
4434 .nr(16)
4435 .kr(1)
4436 .sr(1)
4437 .m(m)
4438 .n(n)
4439 .k(k)
4440 .iterations(1)
4441 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4442 }
4443 }
4444 }
4445 }
4446
4447 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) {
4448 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4449 for (uint32_t n = 32; n <= 48; n += 16) {
4450 for (size_t k = 1; k <= 10; k += 3) {
4451 GemmMicrokernelTester()
4452 .mr(4)
4453 .nr(16)
4454 .kr(1)
4455 .sr(1)
4456 .m(4)
4457 .n(16)
4458 .k(k)
4459 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4460 }
4461 }
4462 }
4463
4464 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) {
4465 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4466 for (uint32_t n = 32; n <= 48; n += 16) {
4467 for (size_t k = 1; k <= 10; k += 3) {
4468 GemmMicrokernelTester()
4469 .mr(4)
4470 .nr(16)
4471 .kr(1)
4472 .sr(1)
4473 .m(4)
4474 .n(n)
4475 .k(k)
4476 .cn_stride(19)
4477 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4478 }
4479 }
4480 }
4481
4482 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) {
4483 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4484 for (uint32_t n = 32; n <= 48; n += 16) {
4485 for (size_t k = 1; k <= 10; k += 3) {
4486 GemmMicrokernelTester()
4487 .mr(4)
4488 .nr(16)
4489 .kr(1)
4490 .sr(1)
4491 .m(4)
4492 .n(n)
4493 .k(k)
4494 .a_stride(13)
4495 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4496 }
4497 }
4498 }
4499
4500 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) {
4501 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4502 for (uint32_t n = 32; n <= 48; n += 16) {
4503 for (size_t k = 1; k <= 10; k += 3) {
4504 for (uint32_t m = 1; m <= 4; m++) {
4505 GemmMicrokernelTester()
4506 .mr(4)
4507 .nr(16)
4508 .kr(1)
4509 .sr(1)
4510 .m(m)
4511 .n(n)
4512 .k(k)
4513 .iterations(1)
4514 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4515 }
4516 }
4517 }
4518 }
4519
4520 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) {
4521 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4522 for (size_t k = 1; k <= 10; k += 3) {
4523 for (uint32_t m = 1; m <= 4; m++) {
4524 for (uint32_t n = 1; n <= 16; n++) {
4525 GemmMicrokernelTester()
4526 .mr(4)
4527 .nr(16)
4528 .kr(1)
4529 .sr(1)
4530 .m(m)
4531 .n(n)
4532 .k(k)
4533 .cm_stride(19)
4534 .iterations(1)
4535 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4536 }
4537 }
4538 }
4539 }
4540
4541 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmin) {
4542 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4543 GemmMicrokernelTester()
4544 .mr(4)
4545 .nr(16)
4546 .kr(1)
4547 .sr(1)
4548 .m(4)
4549 .n(16)
4550 .k(2)
4551 .qmin(128)
4552 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4553 }
4554
4555 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, qmax) {
4556 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4557 GemmMicrokernelTester()
4558 .mr(4)
4559 .nr(16)
4560 .kr(1)
4561 .sr(1)
4562 .m(4)
4563 .n(16)
4564 .k(2)
4565 .qmax(128)
4566 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4567 }
4568
4569 TEST(F16_GEMM_MINMAX_4X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) {
4570 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4571 GemmMicrokernelTester()
4572 .mr(4)
4573 .nr(16)
4574 .kr(1)
4575 .sr(1)
4576 .m(4)
4577 .n(16)
4578 .k(2)
4579 .cm_stride(19)
4580 .Test(xnn_f16_gemm_minmax_ukernel_4x16__aarch64_neonfp16arith_ld32);
4581 }
4582#endif // XNN_ARCH_ARM64
4583
4584
4585#if XNN_ARCH_ARM64
4586 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2) {
4587 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4588 GemmMicrokernelTester()
4589 .mr(6)
4590 .nr(16)
4591 .kr(1)
4592 .sr(1)
4593 .m(6)
4594 .n(16)
4595 .k(2)
4596 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4597 }
4598
4599 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cn) {
4600 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4601 GemmMicrokernelTester()
4602 .mr(6)
4603 .nr(16)
4604 .kr(1)
4605 .sr(1)
4606 .m(6)
4607 .n(16)
4608 .k(2)
4609 .cn_stride(19)
4610 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4611 }
4612
4613 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_strided_a) {
4614 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4615 GemmMicrokernelTester()
4616 .mr(6)
4617 .nr(16)
4618 .kr(1)
4619 .sr(1)
4620 .m(6)
4621 .n(16)
4622 .k(2)
4623 .a_stride(5)
4624 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4625 }
4626
4627 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile) {
4628 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4629 for (uint32_t m = 1; m <= 6; m++) {
4630 for (uint32_t n = 1; n <= 16; n++) {
4631 GemmMicrokernelTester()
4632 .mr(6)
4633 .nr(16)
4634 .kr(1)
4635 .sr(1)
4636 .m(m)
4637 .n(n)
4638 .k(2)
4639 .iterations(1)
4640 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4641 }
4642 }
4643 }
4644
4645 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_m) {
4646 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4647 for (uint32_t m = 1; m <= 6; m++) {
4648 GemmMicrokernelTester()
4649 .mr(6)
4650 .nr(16)
4651 .kr(1)
4652 .sr(1)
4653 .m(m)
4654 .n(16)
4655 .k(2)
4656 .iterations(1)
4657 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4658 }
4659 }
4660
4661 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_eq_2_subtile_n) {
4662 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4663 for (uint32_t n = 1; n <= 16; n++) {
4664 GemmMicrokernelTester()
4665 .mr(6)
4666 .nr(16)
4667 .kr(1)
4668 .sr(1)
4669 .m(6)
4670 .n(n)
4671 .k(2)
4672 .iterations(1)
4673 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4674 }
4675 }
4676
4677 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2) {
4678 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4679 for (size_t k = 1; k < 2; k++) {
4680 GemmMicrokernelTester()
4681 .mr(6)
4682 .nr(16)
4683 .kr(1)
4684 .sr(1)
4685 .m(6)
4686 .n(16)
4687 .k(k)
4688 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4689 }
4690 }
4691
4692 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_strided_a) {
4693 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4694 for (size_t k = 1; k < 2; k++) {
4695 GemmMicrokernelTester()
4696 .mr(6)
4697 .nr(16)
4698 .kr(1)
4699 .sr(1)
4700 .m(6)
4701 .n(16)
4702 .k(k)
4703 .a_stride(5)
4704 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4705 }
4706 }
4707
4708 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_lt_2_subtile) {
4709 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4710 for (size_t k = 1; k < 2; k++) {
4711 for (uint32_t m = 1; m <= 6; m++) {
4712 for (uint32_t n = 1; n <= 16; n++) {
4713 GemmMicrokernelTester()
4714 .mr(6)
4715 .nr(16)
4716 .kr(1)
4717 .sr(1)
4718 .m(m)
4719 .n(n)
4720 .k(k)
4721 .iterations(1)
4722 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4723 }
4724 }
4725 }
4726 }
4727
4728 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2) {
4729 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4730 for (size_t k = 3; k < 4; k++) {
4731 GemmMicrokernelTester()
4732 .mr(6)
4733 .nr(16)
4734 .kr(1)
4735 .sr(1)
4736 .m(6)
4737 .n(16)
4738 .k(k)
4739 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4740 }
4741 }
4742
4743 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_strided_a) {
4744 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4745 for (size_t k = 3; k < 4; k++) {
4746 GemmMicrokernelTester()
4747 .mr(6)
4748 .nr(16)
4749 .kr(1)
4750 .sr(1)
4751 .m(6)
4752 .n(16)
4753 .k(k)
4754 .a_stride(7)
4755 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4756 }
4757 }
4758
4759 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_gt_2_subtile) {
4760 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4761 for (size_t k = 3; k < 4; k++) {
4762 for (uint32_t m = 1; m <= 6; m++) {
4763 for (uint32_t n = 1; n <= 16; n++) {
4764 GemmMicrokernelTester()
4765 .mr(6)
4766 .nr(16)
4767 .kr(1)
4768 .sr(1)
4769 .m(m)
4770 .n(n)
4771 .k(k)
4772 .iterations(1)
4773 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4774 }
4775 }
4776 }
4777 }
4778
4779 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2) {
4780 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4781 for (size_t k = 4; k <= 20; k += 2) {
4782 GemmMicrokernelTester()
4783 .mr(6)
4784 .nr(16)
4785 .kr(1)
4786 .sr(1)
4787 .m(6)
4788 .n(16)
4789 .k(k)
4790 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4791 }
4792 }
4793
4794 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_strided_a) {
4795 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4796 for (size_t k = 4; k <= 20; k += 2) {
4797 GemmMicrokernelTester()
4798 .mr(6)
4799 .nr(16)
4800 .kr(1)
4801 .sr(1)
4802 .m(6)
4803 .n(16)
4804 .k(k)
4805 .a_stride(23)
4806 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4807 }
4808 }
4809
4810 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, k_div_2_subtile) {
4811 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4812 for (size_t k = 4; k <= 20; k += 2) {
4813 for (uint32_t m = 1; m <= 6; m++) {
4814 for (uint32_t n = 1; n <= 16; n++) {
4815 GemmMicrokernelTester()
4816 .mr(6)
4817 .nr(16)
4818 .kr(1)
4819 .sr(1)
4820 .m(m)
4821 .n(n)
4822 .k(k)
4823 .iterations(1)
4824 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4825 }
4826 }
4827 }
4828 }
4829
4830 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16) {
4831 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4832 for (uint32_t n = 17; n < 32; n++) {
4833 for (size_t k = 1; k <= 10; k += 3) {
4834 GemmMicrokernelTester()
4835 .mr(6)
4836 .nr(16)
4837 .kr(1)
4838 .sr(1)
4839 .m(6)
4840 .n(16)
4841 .k(k)
4842 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4843 }
4844 }
4845 }
4846
4847 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_cn) {
4848 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4849 for (uint32_t n = 17; n < 32; n++) {
4850 for (size_t k = 1; k <= 10; k += 3) {
4851 GemmMicrokernelTester()
4852 .mr(6)
4853 .nr(16)
4854 .kr(1)
4855 .sr(1)
4856 .m(6)
4857 .n(16)
4858 .k(k)
4859 .cn_stride(19)
4860 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4861 }
4862 }
4863 }
4864
4865 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_strided_a) {
4866 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4867 for (uint32_t n = 17; n < 32; n++) {
4868 for (size_t k = 1; k <= 10; k += 3) {
4869 GemmMicrokernelTester()
4870 .mr(6)
4871 .nr(16)
4872 .kr(1)
4873 .sr(1)
4874 .m(6)
4875 .n(n)
4876 .k(k)
4877 .a_stride(13)
4878 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4879 }
4880 }
4881 }
4882
4883 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_gt_16_subtile) {
4884 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4885 for (uint32_t n = 17; n < 32; n++) {
4886 for (size_t k = 1; k <= 10; k += 3) {
4887 for (uint32_t m = 1; m <= 6; m++) {
4888 GemmMicrokernelTester()
4889 .mr(6)
4890 .nr(16)
4891 .kr(1)
4892 .sr(1)
4893 .m(m)
4894 .n(n)
4895 .k(k)
4896 .iterations(1)
4897 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4898 }
4899 }
4900 }
4901 }
4902
4903 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16) {
4904 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4905 for (uint32_t n = 32; n <= 48; n += 16) {
4906 for (size_t k = 1; k <= 10; k += 3) {
4907 GemmMicrokernelTester()
4908 .mr(6)
4909 .nr(16)
4910 .kr(1)
4911 .sr(1)
4912 .m(6)
4913 .n(16)
4914 .k(k)
4915 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4916 }
4917 }
4918 }
4919
4920 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_cn) {
4921 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4922 for (uint32_t n = 32; n <= 48; n += 16) {
4923 for (size_t k = 1; k <= 10; k += 3) {
4924 GemmMicrokernelTester()
4925 .mr(6)
4926 .nr(16)
4927 .kr(1)
4928 .sr(1)
4929 .m(6)
4930 .n(n)
4931 .k(k)
4932 .cn_stride(19)
4933 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4934 }
4935 }
4936 }
4937
4938 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_strided_a) {
4939 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4940 for (uint32_t n = 32; n <= 48; n += 16) {
4941 for (size_t k = 1; k <= 10; k += 3) {
4942 GemmMicrokernelTester()
4943 .mr(6)
4944 .nr(16)
4945 .kr(1)
4946 .sr(1)
4947 .m(6)
4948 .n(n)
4949 .k(k)
4950 .a_stride(13)
4951 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4952 }
4953 }
4954 }
4955
4956 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, n_div_16_subtile) {
4957 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4958 for (uint32_t n = 32; n <= 48; n += 16) {
4959 for (size_t k = 1; k <= 10; k += 3) {
4960 for (uint32_t m = 1; m <= 6; m++) {
4961 GemmMicrokernelTester()
4962 .mr(6)
4963 .nr(16)
4964 .kr(1)
4965 .sr(1)
4966 .m(m)
4967 .n(n)
4968 .k(k)
4969 .iterations(1)
4970 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4971 }
4972 }
4973 }
4974 }
4975
4976 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm_subtile) {
4977 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4978 for (size_t k = 1; k <= 10; k += 3) {
4979 for (uint32_t m = 1; m <= 6; m++) {
4980 for (uint32_t n = 1; n <= 16; n++) {
4981 GemmMicrokernelTester()
4982 .mr(6)
4983 .nr(16)
4984 .kr(1)
4985 .sr(1)
4986 .m(m)
4987 .n(n)
4988 .k(k)
4989 .cm_stride(19)
4990 .iterations(1)
4991 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
4992 }
4993 }
4994 }
4995 }
4996
4997 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmin) {
4998 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
4999 GemmMicrokernelTester()
5000 .mr(6)
5001 .nr(16)
5002 .kr(1)
5003 .sr(1)
5004 .m(6)
5005 .n(16)
5006 .k(2)
5007 .qmin(128)
5008 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
5009 }
5010
5011 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, qmax) {
5012 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5013 GemmMicrokernelTester()
5014 .mr(6)
5015 .nr(16)
5016 .kr(1)
5017 .sr(1)
5018 .m(6)
5019 .n(16)
5020 .k(2)
5021 .qmax(128)
5022 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
5023 }
5024
5025 TEST(F16_GEMM_MINMAX_6X16__AARCH64_NEONFP16ARITH_LD32, strided_cm) {
5026 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5027 GemmMicrokernelTester()
5028 .mr(6)
5029 .nr(16)
5030 .kr(1)
5031 .sr(1)
5032 .m(6)
5033 .n(16)
5034 .k(2)
5035 .cm_stride(19)
5036 .Test(xnn_f16_gemm_minmax_ukernel_6x16__aarch64_neonfp16arith_ld32);
5037 }
5038#endif // XNN_ARCH_ARM64
Frank Barchardbddfbcd2020-04-15 12:32:41 -07005039
5040
5041#if XNN_ARCH_ARM64
5042 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) {
5043 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5044 GemmMicrokernelTester()
5045 .mr(1)
5046 .nr(8)
5047 .kr(1)
5048 .sr(1)
5049 .m(1)
5050 .n(8)
5051 .k(4)
5052 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5053 }
5054
5055 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) {
5056 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5057 GemmMicrokernelTester()
5058 .mr(1)
5059 .nr(8)
5060 .kr(1)
5061 .sr(1)
5062 .m(1)
5063 .n(8)
5064 .k(4)
5065 .cn_stride(11)
5066 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5067 }
5068
5069 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) {
5070 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5071 GemmMicrokernelTester()
5072 .mr(1)
5073 .nr(8)
5074 .kr(1)
5075 .sr(1)
5076 .m(1)
5077 .n(8)
5078 .k(4)
5079 .a_stride(7)
5080 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5081 }
5082
5083 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) {
5084 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5085 for (uint32_t m = 1; m <= 1; m++) {
5086 for (uint32_t n = 1; n <= 8; n++) {
5087 GemmMicrokernelTester()
5088 .mr(1)
5089 .nr(8)
5090 .kr(1)
5091 .sr(1)
5092 .m(m)
5093 .n(n)
5094 .k(4)
5095 .iterations(1)
5096 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5097 }
5098 }
5099 }
5100
5101 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
5102 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5103 for (uint32_t m = 1; m <= 1; m++) {
5104 GemmMicrokernelTester()
5105 .mr(1)
5106 .nr(8)
5107 .kr(1)
5108 .sr(1)
5109 .m(m)
5110 .n(8)
5111 .k(4)
5112 .iterations(1)
5113 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5114 }
5115 }
5116
5117 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
5118 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5119 for (uint32_t n = 1; n <= 8; n++) {
5120 GemmMicrokernelTester()
5121 .mr(1)
5122 .nr(8)
5123 .kr(1)
5124 .sr(1)
5125 .m(1)
5126 .n(n)
5127 .k(4)
5128 .iterations(1)
5129 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5130 }
5131 }
5132
5133 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
5134 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5135 for (size_t k = 1; k < 4; k++) {
5136 GemmMicrokernelTester()
5137 .mr(1)
5138 .nr(8)
5139 .kr(1)
5140 .sr(1)
5141 .m(1)
5142 .n(8)
5143 .k(k)
5144 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5145 }
5146 }
5147
5148 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) {
5149 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5150 for (size_t k = 1; k < 4; k++) {
5151 GemmMicrokernelTester()
5152 .mr(1)
5153 .nr(8)
5154 .kr(1)
5155 .sr(1)
5156 .m(1)
5157 .n(8)
5158 .k(k)
5159 .a_stride(7)
5160 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5161 }
5162 }
5163
5164 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
5165 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5166 for (size_t k = 1; k < 4; k++) {
5167 for (uint32_t m = 1; m <= 1; m++) {
5168 for (uint32_t n = 1; n <= 8; n++) {
5169 GemmMicrokernelTester()
5170 .mr(1)
5171 .nr(8)
5172 .kr(1)
5173 .sr(1)
5174 .m(m)
5175 .n(n)
5176 .k(k)
5177 .iterations(1)
5178 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5179 }
5180 }
5181 }
5182 }
5183
5184 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
5185 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5186 for (size_t k = 5; k < 8; k++) {
5187 GemmMicrokernelTester()
5188 .mr(1)
5189 .nr(8)
5190 .kr(1)
5191 .sr(1)
5192 .m(1)
5193 .n(8)
5194 .k(k)
5195 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5196 }
5197 }
5198
5199 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
5200 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5201 for (size_t k = 5; k < 8; k++) {
5202 GemmMicrokernelTester()
5203 .mr(1)
5204 .nr(8)
5205 .kr(1)
5206 .sr(1)
5207 .m(1)
5208 .n(8)
5209 .k(k)
5210 .a_stride(11)
5211 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5212 }
5213 }
5214
5215 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
5216 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5217 for (size_t k = 5; k < 8; k++) {
5218 for (uint32_t m = 1; m <= 1; m++) {
5219 for (uint32_t n = 1; n <= 8; n++) {
5220 GemmMicrokernelTester()
5221 .mr(1)
5222 .nr(8)
5223 .kr(1)
5224 .sr(1)
5225 .m(m)
5226 .n(n)
5227 .k(k)
5228 .iterations(1)
5229 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5230 }
5231 }
5232 }
5233 }
5234
5235 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
5236 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5237 for (size_t k = 8; k <= 40; k += 4) {
5238 GemmMicrokernelTester()
5239 .mr(1)
5240 .nr(8)
5241 .kr(1)
5242 .sr(1)
5243 .m(1)
5244 .n(8)
5245 .k(k)
5246 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5247 }
5248 }
5249
5250 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
5251 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5252 for (size_t k = 8; k <= 40; k += 4) {
5253 GemmMicrokernelTester()
5254 .mr(1)
5255 .nr(8)
5256 .kr(1)
5257 .sr(1)
5258 .m(1)
5259 .n(8)
5260 .k(k)
5261 .a_stride(43)
5262 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5263 }
5264 }
5265
5266 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
5267 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5268 for (size_t k = 8; k <= 40; k += 4) {
5269 for (uint32_t m = 1; m <= 1; m++) {
5270 for (uint32_t n = 1; n <= 8; n++) {
5271 GemmMicrokernelTester()
5272 .mr(1)
5273 .nr(8)
5274 .kr(1)
5275 .sr(1)
5276 .m(m)
5277 .n(n)
5278 .k(k)
5279 .iterations(1)
5280 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5281 }
5282 }
5283 }
5284 }
5285
5286 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
5287 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5288 for (uint32_t n = 9; n < 16; n++) {
5289 for (size_t k = 1; k <= 20; k += 5) {
5290 GemmMicrokernelTester()
5291 .mr(1)
5292 .nr(8)
5293 .kr(1)
5294 .sr(1)
5295 .m(1)
5296 .n(8)
5297 .k(k)
5298 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5299 }
5300 }
5301 }
5302
5303 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
5304 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5305 for (uint32_t n = 9; n < 16; n++) {
5306 for (size_t k = 1; k <= 20; k += 5) {
5307 GemmMicrokernelTester()
5308 .mr(1)
5309 .nr(8)
5310 .kr(1)
5311 .sr(1)
5312 .m(1)
5313 .n(8)
5314 .k(k)
5315 .cn_stride(11)
5316 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5317 }
5318 }
5319 }
5320
5321 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
5322 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5323 for (uint32_t n = 9; n < 16; n++) {
5324 for (size_t k = 1; k <= 20; k += 5) {
5325 GemmMicrokernelTester()
5326 .mr(1)
5327 .nr(8)
5328 .kr(1)
5329 .sr(1)
5330 .m(1)
5331 .n(n)
5332 .k(k)
5333 .a_stride(23)
5334 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5335 }
5336 }
5337 }
5338
5339 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
5340 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5341 for (uint32_t n = 9; n < 16; n++) {
5342 for (size_t k = 1; k <= 20; k += 5) {
5343 for (uint32_t m = 1; m <= 1; m++) {
5344 GemmMicrokernelTester()
5345 .mr(1)
5346 .nr(8)
5347 .kr(1)
5348 .sr(1)
5349 .m(m)
5350 .n(n)
5351 .k(k)
5352 .iterations(1)
5353 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5354 }
5355 }
5356 }
5357 }
5358
5359 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
5360 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5361 for (uint32_t n = 16; n <= 24; n += 8) {
5362 for (size_t k = 1; k <= 20; k += 5) {
5363 GemmMicrokernelTester()
5364 .mr(1)
5365 .nr(8)
5366 .kr(1)
5367 .sr(1)
5368 .m(1)
5369 .n(8)
5370 .k(k)
5371 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5372 }
5373 }
5374 }
5375
5376 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
5377 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5378 for (uint32_t n = 16; n <= 24; n += 8) {
5379 for (size_t k = 1; k <= 20; k += 5) {
5380 GemmMicrokernelTester()
5381 .mr(1)
5382 .nr(8)
5383 .kr(1)
5384 .sr(1)
5385 .m(1)
5386 .n(n)
5387 .k(k)
5388 .cn_stride(11)
5389 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5390 }
5391 }
5392 }
5393
5394 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
5395 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5396 for (uint32_t n = 16; n <= 24; n += 8) {
5397 for (size_t k = 1; k <= 20; k += 5) {
5398 GemmMicrokernelTester()
5399 .mr(1)
5400 .nr(8)
5401 .kr(1)
5402 .sr(1)
5403 .m(1)
5404 .n(n)
5405 .k(k)
5406 .a_stride(23)
5407 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5408 }
5409 }
5410 }
5411
5412 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
5413 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5414 for (uint32_t n = 16; n <= 24; n += 8) {
5415 for (size_t k = 1; k <= 20; k += 5) {
5416 for (uint32_t m = 1; m <= 1; m++) {
5417 GemmMicrokernelTester()
5418 .mr(1)
5419 .nr(8)
5420 .kr(1)
5421 .sr(1)
5422 .m(m)
5423 .n(n)
5424 .k(k)
5425 .iterations(1)
5426 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5427 }
5428 }
5429 }
5430 }
5431
5432 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
5433 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5434 for (size_t k = 1; k <= 20; k += 5) {
5435 for (uint32_t m = 1; m <= 1; m++) {
5436 for (uint32_t n = 1; n <= 8; n++) {
5437 GemmMicrokernelTester()
5438 .mr(1)
5439 .nr(8)
5440 .kr(1)
5441 .sr(1)
5442 .m(m)
5443 .n(n)
5444 .k(k)
5445 .cm_stride(11)
5446 .iterations(1)
5447 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5448 }
5449 }
5450 }
5451 }
5452
5453 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, qmin) {
5454 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5455 GemmMicrokernelTester()
5456 .mr(1)
5457 .nr(8)
5458 .kr(1)
5459 .sr(1)
5460 .m(1)
5461 .n(8)
5462 .k(4)
5463 .qmin(128)
5464 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5465 }
5466
5467 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, qmax) {
5468 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5469 GemmMicrokernelTester()
5470 .mr(1)
5471 .nr(8)
5472 .kr(1)
5473 .sr(1)
5474 .m(1)
5475 .n(8)
5476 .k(4)
5477 .qmax(128)
5478 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5479 }
5480
5481 TEST(F16_GEMM_MINMAX_1X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) {
5482 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5483 GemmMicrokernelTester()
5484 .mr(1)
5485 .nr(8)
5486 .kr(1)
5487 .sr(1)
5488 .m(1)
5489 .n(8)
5490 .k(4)
5491 .cm_stride(11)
5492 .Test(xnn_f16_gemm_minmax_ukernel_1x8__aarch64_neonfp16arith_ld64);
5493 }
5494#endif // XNN_ARCH_ARM64
5495
5496
5497#if XNN_ARCH_ARM64
5498 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) {
5499 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5500 GemmMicrokernelTester()
5501 .mr(4)
5502 .nr(8)
5503 .kr(1)
5504 .sr(1)
5505 .m(4)
5506 .n(8)
5507 .k(4)
5508 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5509 }
5510
5511 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) {
5512 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5513 GemmMicrokernelTester()
5514 .mr(4)
5515 .nr(8)
5516 .kr(1)
5517 .sr(1)
5518 .m(4)
5519 .n(8)
5520 .k(4)
5521 .cn_stride(11)
5522 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5523 }
5524
5525 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) {
5526 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5527 GemmMicrokernelTester()
5528 .mr(4)
5529 .nr(8)
5530 .kr(1)
5531 .sr(1)
5532 .m(4)
5533 .n(8)
5534 .k(4)
5535 .a_stride(7)
5536 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5537 }
5538
5539 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) {
5540 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5541 for (uint32_t m = 1; m <= 4; m++) {
5542 for (uint32_t n = 1; n <= 8; n++) {
5543 GemmMicrokernelTester()
5544 .mr(4)
5545 .nr(8)
5546 .kr(1)
5547 .sr(1)
5548 .m(m)
5549 .n(n)
5550 .k(4)
5551 .iterations(1)
5552 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5553 }
5554 }
5555 }
5556
5557 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
5558 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5559 for (uint32_t m = 1; m <= 4; m++) {
5560 GemmMicrokernelTester()
5561 .mr(4)
5562 .nr(8)
5563 .kr(1)
5564 .sr(1)
5565 .m(m)
5566 .n(8)
5567 .k(4)
5568 .iterations(1)
5569 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5570 }
5571 }
5572
5573 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
5574 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5575 for (uint32_t n = 1; n <= 8; n++) {
5576 GemmMicrokernelTester()
5577 .mr(4)
5578 .nr(8)
5579 .kr(1)
5580 .sr(1)
5581 .m(4)
5582 .n(n)
5583 .k(4)
5584 .iterations(1)
5585 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5586 }
5587 }
5588
5589 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
5590 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5591 for (size_t k = 1; k < 4; k++) {
5592 GemmMicrokernelTester()
5593 .mr(4)
5594 .nr(8)
5595 .kr(1)
5596 .sr(1)
5597 .m(4)
5598 .n(8)
5599 .k(k)
5600 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5601 }
5602 }
5603
5604 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) {
5605 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5606 for (size_t k = 1; k < 4; k++) {
5607 GemmMicrokernelTester()
5608 .mr(4)
5609 .nr(8)
5610 .kr(1)
5611 .sr(1)
5612 .m(4)
5613 .n(8)
5614 .k(k)
5615 .a_stride(7)
5616 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5617 }
5618 }
5619
5620 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
5621 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5622 for (size_t k = 1; k < 4; k++) {
5623 for (uint32_t m = 1; m <= 4; m++) {
5624 for (uint32_t n = 1; n <= 8; n++) {
5625 GemmMicrokernelTester()
5626 .mr(4)
5627 .nr(8)
5628 .kr(1)
5629 .sr(1)
5630 .m(m)
5631 .n(n)
5632 .k(k)
5633 .iterations(1)
5634 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5635 }
5636 }
5637 }
5638 }
5639
5640 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
5641 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5642 for (size_t k = 5; k < 8; k++) {
5643 GemmMicrokernelTester()
5644 .mr(4)
5645 .nr(8)
5646 .kr(1)
5647 .sr(1)
5648 .m(4)
5649 .n(8)
5650 .k(k)
5651 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5652 }
5653 }
5654
5655 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
5656 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5657 for (size_t k = 5; k < 8; k++) {
5658 GemmMicrokernelTester()
5659 .mr(4)
5660 .nr(8)
5661 .kr(1)
5662 .sr(1)
5663 .m(4)
5664 .n(8)
5665 .k(k)
5666 .a_stride(11)
5667 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5668 }
5669 }
5670
5671 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
5672 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5673 for (size_t k = 5; k < 8; k++) {
5674 for (uint32_t m = 1; m <= 4; m++) {
5675 for (uint32_t n = 1; n <= 8; n++) {
5676 GemmMicrokernelTester()
5677 .mr(4)
5678 .nr(8)
5679 .kr(1)
5680 .sr(1)
5681 .m(m)
5682 .n(n)
5683 .k(k)
5684 .iterations(1)
5685 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5686 }
5687 }
5688 }
5689 }
5690
5691 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
5692 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5693 for (size_t k = 8; k <= 40; k += 4) {
5694 GemmMicrokernelTester()
5695 .mr(4)
5696 .nr(8)
5697 .kr(1)
5698 .sr(1)
5699 .m(4)
5700 .n(8)
5701 .k(k)
5702 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5703 }
5704 }
5705
5706 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
5707 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5708 for (size_t k = 8; k <= 40; k += 4) {
5709 GemmMicrokernelTester()
5710 .mr(4)
5711 .nr(8)
5712 .kr(1)
5713 .sr(1)
5714 .m(4)
5715 .n(8)
5716 .k(k)
5717 .a_stride(43)
5718 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5719 }
5720 }
5721
5722 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
5723 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5724 for (size_t k = 8; k <= 40; k += 4) {
5725 for (uint32_t m = 1; m <= 4; m++) {
5726 for (uint32_t n = 1; n <= 8; n++) {
5727 GemmMicrokernelTester()
5728 .mr(4)
5729 .nr(8)
5730 .kr(1)
5731 .sr(1)
5732 .m(m)
5733 .n(n)
5734 .k(k)
5735 .iterations(1)
5736 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5737 }
5738 }
5739 }
5740 }
5741
5742 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
5743 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5744 for (uint32_t n = 9; n < 16; n++) {
5745 for (size_t k = 1; k <= 20; k += 5) {
5746 GemmMicrokernelTester()
5747 .mr(4)
5748 .nr(8)
5749 .kr(1)
5750 .sr(1)
5751 .m(4)
5752 .n(8)
5753 .k(k)
5754 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5755 }
5756 }
5757 }
5758
5759 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
5760 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5761 for (uint32_t n = 9; n < 16; n++) {
5762 for (size_t k = 1; k <= 20; k += 5) {
5763 GemmMicrokernelTester()
5764 .mr(4)
5765 .nr(8)
5766 .kr(1)
5767 .sr(1)
5768 .m(4)
5769 .n(8)
5770 .k(k)
5771 .cn_stride(11)
5772 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5773 }
5774 }
5775 }
5776
5777 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
5778 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5779 for (uint32_t n = 9; n < 16; n++) {
5780 for (size_t k = 1; k <= 20; k += 5) {
5781 GemmMicrokernelTester()
5782 .mr(4)
5783 .nr(8)
5784 .kr(1)
5785 .sr(1)
5786 .m(4)
5787 .n(n)
5788 .k(k)
5789 .a_stride(23)
5790 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5791 }
5792 }
5793 }
5794
5795 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
5796 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5797 for (uint32_t n = 9; n < 16; n++) {
5798 for (size_t k = 1; k <= 20; k += 5) {
5799 for (uint32_t m = 1; m <= 4; m++) {
5800 GemmMicrokernelTester()
5801 .mr(4)
5802 .nr(8)
5803 .kr(1)
5804 .sr(1)
5805 .m(m)
5806 .n(n)
5807 .k(k)
5808 .iterations(1)
5809 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5810 }
5811 }
5812 }
5813 }
5814
5815 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
5816 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5817 for (uint32_t n = 16; n <= 24; n += 8) {
5818 for (size_t k = 1; k <= 20; k += 5) {
5819 GemmMicrokernelTester()
5820 .mr(4)
5821 .nr(8)
5822 .kr(1)
5823 .sr(1)
5824 .m(4)
5825 .n(8)
5826 .k(k)
5827 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5828 }
5829 }
5830 }
5831
5832 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
5833 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5834 for (uint32_t n = 16; n <= 24; n += 8) {
5835 for (size_t k = 1; k <= 20; k += 5) {
5836 GemmMicrokernelTester()
5837 .mr(4)
5838 .nr(8)
5839 .kr(1)
5840 .sr(1)
5841 .m(4)
5842 .n(n)
5843 .k(k)
5844 .cn_stride(11)
5845 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5846 }
5847 }
5848 }
5849
5850 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
5851 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5852 for (uint32_t n = 16; n <= 24; n += 8) {
5853 for (size_t k = 1; k <= 20; k += 5) {
5854 GemmMicrokernelTester()
5855 .mr(4)
5856 .nr(8)
5857 .kr(1)
5858 .sr(1)
5859 .m(4)
5860 .n(n)
5861 .k(k)
5862 .a_stride(23)
5863 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5864 }
5865 }
5866 }
5867
5868 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
5869 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5870 for (uint32_t n = 16; n <= 24; n += 8) {
5871 for (size_t k = 1; k <= 20; k += 5) {
5872 for (uint32_t m = 1; m <= 4; m++) {
5873 GemmMicrokernelTester()
5874 .mr(4)
5875 .nr(8)
5876 .kr(1)
5877 .sr(1)
5878 .m(m)
5879 .n(n)
5880 .k(k)
5881 .iterations(1)
5882 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5883 }
5884 }
5885 }
5886 }
5887
5888 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
5889 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5890 for (size_t k = 1; k <= 20; k += 5) {
5891 for (uint32_t m = 1; m <= 4; m++) {
5892 for (uint32_t n = 1; n <= 8; n++) {
5893 GemmMicrokernelTester()
5894 .mr(4)
5895 .nr(8)
5896 .kr(1)
5897 .sr(1)
5898 .m(m)
5899 .n(n)
5900 .k(k)
5901 .cm_stride(11)
5902 .iterations(1)
5903 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5904 }
5905 }
5906 }
5907 }
5908
5909 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, qmin) {
5910 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5911 GemmMicrokernelTester()
5912 .mr(4)
5913 .nr(8)
5914 .kr(1)
5915 .sr(1)
5916 .m(4)
5917 .n(8)
5918 .k(4)
5919 .qmin(128)
5920 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5921 }
5922
5923 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, qmax) {
5924 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5925 GemmMicrokernelTester()
5926 .mr(4)
5927 .nr(8)
5928 .kr(1)
5929 .sr(1)
5930 .m(4)
5931 .n(8)
5932 .k(4)
5933 .qmax(128)
5934 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5935 }
5936
5937 TEST(F16_GEMM_MINMAX_4X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) {
5938 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5939 GemmMicrokernelTester()
5940 .mr(4)
5941 .nr(8)
5942 .kr(1)
5943 .sr(1)
5944 .m(4)
5945 .n(8)
5946 .k(4)
5947 .cm_stride(11)
5948 .Test(xnn_f16_gemm_minmax_ukernel_4x8__aarch64_neonfp16arith_ld64);
5949 }
5950#endif // XNN_ARCH_ARM64
5951
5952
5953#if XNN_ARCH_ARM64
5954 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) {
5955 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5956 GemmMicrokernelTester()
5957 .mr(6)
5958 .nr(8)
5959 .kr(1)
5960 .sr(1)
5961 .m(6)
5962 .n(8)
5963 .k(4)
5964 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
5965 }
5966
5967 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) {
5968 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5969 GemmMicrokernelTester()
5970 .mr(6)
5971 .nr(8)
5972 .kr(1)
5973 .sr(1)
5974 .m(6)
5975 .n(8)
5976 .k(4)
5977 .cn_stride(11)
5978 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
5979 }
5980
5981 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) {
5982 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5983 GemmMicrokernelTester()
5984 .mr(6)
5985 .nr(8)
5986 .kr(1)
5987 .sr(1)
5988 .m(6)
5989 .n(8)
5990 .k(4)
5991 .a_stride(7)
5992 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
5993 }
5994
5995 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) {
5996 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
5997 for (uint32_t m = 1; m <= 6; m++) {
5998 for (uint32_t n = 1; n <= 8; n++) {
5999 GemmMicrokernelTester()
6000 .mr(6)
6001 .nr(8)
6002 .kr(1)
6003 .sr(1)
6004 .m(m)
6005 .n(n)
6006 .k(4)
6007 .iterations(1)
6008 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6009 }
6010 }
6011 }
6012
6013 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
6014 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6015 for (uint32_t m = 1; m <= 6; m++) {
6016 GemmMicrokernelTester()
6017 .mr(6)
6018 .nr(8)
6019 .kr(1)
6020 .sr(1)
6021 .m(m)
6022 .n(8)
6023 .k(4)
6024 .iterations(1)
6025 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6026 }
6027 }
6028
6029 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
6030 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6031 for (uint32_t n = 1; n <= 8; n++) {
6032 GemmMicrokernelTester()
6033 .mr(6)
6034 .nr(8)
6035 .kr(1)
6036 .sr(1)
6037 .m(6)
6038 .n(n)
6039 .k(4)
6040 .iterations(1)
6041 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6042 }
6043 }
6044
6045 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
6046 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6047 for (size_t k = 1; k < 4; k++) {
6048 GemmMicrokernelTester()
6049 .mr(6)
6050 .nr(8)
6051 .kr(1)
6052 .sr(1)
6053 .m(6)
6054 .n(8)
6055 .k(k)
6056 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6057 }
6058 }
6059
6060 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) {
6061 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6062 for (size_t k = 1; k < 4; k++) {
6063 GemmMicrokernelTester()
6064 .mr(6)
6065 .nr(8)
6066 .kr(1)
6067 .sr(1)
6068 .m(6)
6069 .n(8)
6070 .k(k)
6071 .a_stride(7)
6072 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6073 }
6074 }
6075
6076 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
6077 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6078 for (size_t k = 1; k < 4; k++) {
6079 for (uint32_t m = 1; m <= 6; m++) {
6080 for (uint32_t n = 1; n <= 8; n++) {
6081 GemmMicrokernelTester()
6082 .mr(6)
6083 .nr(8)
6084 .kr(1)
6085 .sr(1)
6086 .m(m)
6087 .n(n)
6088 .k(k)
6089 .iterations(1)
6090 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6091 }
6092 }
6093 }
6094 }
6095
6096 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
6097 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6098 for (size_t k = 5; k < 8; k++) {
6099 GemmMicrokernelTester()
6100 .mr(6)
6101 .nr(8)
6102 .kr(1)
6103 .sr(1)
6104 .m(6)
6105 .n(8)
6106 .k(k)
6107 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6108 }
6109 }
6110
6111 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
6112 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6113 for (size_t k = 5; k < 8; k++) {
6114 GemmMicrokernelTester()
6115 .mr(6)
6116 .nr(8)
6117 .kr(1)
6118 .sr(1)
6119 .m(6)
6120 .n(8)
6121 .k(k)
6122 .a_stride(11)
6123 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6124 }
6125 }
6126
6127 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
6128 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6129 for (size_t k = 5; k < 8; k++) {
6130 for (uint32_t m = 1; m <= 6; m++) {
6131 for (uint32_t n = 1; n <= 8; n++) {
6132 GemmMicrokernelTester()
6133 .mr(6)
6134 .nr(8)
6135 .kr(1)
6136 .sr(1)
6137 .m(m)
6138 .n(n)
6139 .k(k)
6140 .iterations(1)
6141 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6142 }
6143 }
6144 }
6145 }
6146
6147 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
6148 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6149 for (size_t k = 8; k <= 40; k += 4) {
6150 GemmMicrokernelTester()
6151 .mr(6)
6152 .nr(8)
6153 .kr(1)
6154 .sr(1)
6155 .m(6)
6156 .n(8)
6157 .k(k)
6158 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6159 }
6160 }
6161
6162 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
6163 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6164 for (size_t k = 8; k <= 40; k += 4) {
6165 GemmMicrokernelTester()
6166 .mr(6)
6167 .nr(8)
6168 .kr(1)
6169 .sr(1)
6170 .m(6)
6171 .n(8)
6172 .k(k)
6173 .a_stride(43)
6174 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6175 }
6176 }
6177
6178 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
6179 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6180 for (size_t k = 8; k <= 40; k += 4) {
6181 for (uint32_t m = 1; m <= 6; m++) {
6182 for (uint32_t n = 1; n <= 8; n++) {
6183 GemmMicrokernelTester()
6184 .mr(6)
6185 .nr(8)
6186 .kr(1)
6187 .sr(1)
6188 .m(m)
6189 .n(n)
6190 .k(k)
6191 .iterations(1)
6192 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6193 }
6194 }
6195 }
6196 }
6197
6198 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
6199 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6200 for (uint32_t n = 9; n < 16; n++) {
6201 for (size_t k = 1; k <= 20; k += 5) {
6202 GemmMicrokernelTester()
6203 .mr(6)
6204 .nr(8)
6205 .kr(1)
6206 .sr(1)
6207 .m(6)
6208 .n(8)
6209 .k(k)
6210 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6211 }
6212 }
6213 }
6214
6215 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
6216 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6217 for (uint32_t n = 9; n < 16; n++) {
6218 for (size_t k = 1; k <= 20; k += 5) {
6219 GemmMicrokernelTester()
6220 .mr(6)
6221 .nr(8)
6222 .kr(1)
6223 .sr(1)
6224 .m(6)
6225 .n(8)
6226 .k(k)
6227 .cn_stride(11)
6228 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6229 }
6230 }
6231 }
6232
6233 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
6234 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6235 for (uint32_t n = 9; n < 16; n++) {
6236 for (size_t k = 1; k <= 20; k += 5) {
6237 GemmMicrokernelTester()
6238 .mr(6)
6239 .nr(8)
6240 .kr(1)
6241 .sr(1)
6242 .m(6)
6243 .n(n)
6244 .k(k)
6245 .a_stride(23)
6246 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6247 }
6248 }
6249 }
6250
6251 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
6252 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6253 for (uint32_t n = 9; n < 16; n++) {
6254 for (size_t k = 1; k <= 20; k += 5) {
6255 for (uint32_t m = 1; m <= 6; m++) {
6256 GemmMicrokernelTester()
6257 .mr(6)
6258 .nr(8)
6259 .kr(1)
6260 .sr(1)
6261 .m(m)
6262 .n(n)
6263 .k(k)
6264 .iterations(1)
6265 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6266 }
6267 }
6268 }
6269 }
6270
6271 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
6272 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6273 for (uint32_t n = 16; n <= 24; n += 8) {
6274 for (size_t k = 1; k <= 20; k += 5) {
6275 GemmMicrokernelTester()
6276 .mr(6)
6277 .nr(8)
6278 .kr(1)
6279 .sr(1)
6280 .m(6)
6281 .n(8)
6282 .k(k)
6283 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6284 }
6285 }
6286 }
6287
6288 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
6289 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6290 for (uint32_t n = 16; n <= 24; n += 8) {
6291 for (size_t k = 1; k <= 20; k += 5) {
6292 GemmMicrokernelTester()
6293 .mr(6)
6294 .nr(8)
6295 .kr(1)
6296 .sr(1)
6297 .m(6)
6298 .n(n)
6299 .k(k)
6300 .cn_stride(11)
6301 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6302 }
6303 }
6304 }
6305
6306 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
6307 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6308 for (uint32_t n = 16; n <= 24; n += 8) {
6309 for (size_t k = 1; k <= 20; k += 5) {
6310 GemmMicrokernelTester()
6311 .mr(6)
6312 .nr(8)
6313 .kr(1)
6314 .sr(1)
6315 .m(6)
6316 .n(n)
6317 .k(k)
6318 .a_stride(23)
6319 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6320 }
6321 }
6322 }
6323
6324 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
6325 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6326 for (uint32_t n = 16; n <= 24; n += 8) {
6327 for (size_t k = 1; k <= 20; k += 5) {
6328 for (uint32_t m = 1; m <= 6; m++) {
6329 GemmMicrokernelTester()
6330 .mr(6)
6331 .nr(8)
6332 .kr(1)
6333 .sr(1)
6334 .m(m)
6335 .n(n)
6336 .k(k)
6337 .iterations(1)
6338 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6339 }
6340 }
6341 }
6342 }
6343
6344 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
6345 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6346 for (size_t k = 1; k <= 20; k += 5) {
6347 for (uint32_t m = 1; m <= 6; m++) {
6348 for (uint32_t n = 1; n <= 8; n++) {
6349 GemmMicrokernelTester()
6350 .mr(6)
6351 .nr(8)
6352 .kr(1)
6353 .sr(1)
6354 .m(m)
6355 .n(n)
6356 .k(k)
6357 .cm_stride(11)
6358 .iterations(1)
6359 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6360 }
6361 }
6362 }
6363 }
6364
6365 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, qmin) {
6366 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6367 GemmMicrokernelTester()
6368 .mr(6)
6369 .nr(8)
6370 .kr(1)
6371 .sr(1)
6372 .m(6)
6373 .n(8)
6374 .k(4)
6375 .qmin(128)
6376 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6377 }
6378
6379 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, qmax) {
6380 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6381 GemmMicrokernelTester()
6382 .mr(6)
6383 .nr(8)
6384 .kr(1)
6385 .sr(1)
6386 .m(6)
6387 .n(8)
6388 .k(4)
6389 .qmax(128)
6390 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6391 }
6392
6393 TEST(F16_GEMM_MINMAX_6X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) {
6394 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6395 GemmMicrokernelTester()
6396 .mr(6)
6397 .nr(8)
6398 .kr(1)
6399 .sr(1)
6400 .m(6)
6401 .n(8)
6402 .k(4)
6403 .cm_stride(11)
6404 .Test(xnn_f16_gemm_minmax_ukernel_6x8__aarch64_neonfp16arith_ld64);
6405 }
6406#endif // XNN_ARCH_ARM64
Frank Barchard3b8e5662020-04-20 12:12:53 -07006407
6408
6409#if XNN_ARCH_ARM64
6410 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4) {
6411 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6412 GemmMicrokernelTester()
6413 .mr(8)
6414 .nr(8)
6415 .kr(1)
6416 .sr(1)
6417 .m(8)
6418 .n(8)
6419 .k(4)
6420 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6421 }
6422
6423 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cn) {
6424 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6425 GemmMicrokernelTester()
6426 .mr(8)
6427 .nr(8)
6428 .kr(1)
6429 .sr(1)
6430 .m(8)
6431 .n(8)
6432 .k(4)
6433 .cn_stride(11)
6434 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6435 }
6436
6437 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_strided_a) {
6438 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6439 GemmMicrokernelTester()
6440 .mr(8)
6441 .nr(8)
6442 .kr(1)
6443 .sr(1)
6444 .m(8)
6445 .n(8)
6446 .k(4)
6447 .a_stride(7)
6448 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6449 }
6450
6451 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile) {
6452 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6453 for (uint32_t m = 1; m <= 8; m++) {
6454 for (uint32_t n = 1; n <= 8; n++) {
6455 GemmMicrokernelTester()
6456 .mr(8)
6457 .nr(8)
6458 .kr(1)
6459 .sr(1)
6460 .m(m)
6461 .n(n)
6462 .k(4)
6463 .iterations(1)
6464 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6465 }
6466 }
6467 }
6468
6469 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_m) {
6470 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6471 for (uint32_t m = 1; m <= 8; m++) {
6472 GemmMicrokernelTester()
6473 .mr(8)
6474 .nr(8)
6475 .kr(1)
6476 .sr(1)
6477 .m(m)
6478 .n(8)
6479 .k(4)
6480 .iterations(1)
6481 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6482 }
6483 }
6484
6485 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_eq_4_subtile_n) {
6486 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6487 for (uint32_t n = 1; n <= 8; n++) {
6488 GemmMicrokernelTester()
6489 .mr(8)
6490 .nr(8)
6491 .kr(1)
6492 .sr(1)
6493 .m(8)
6494 .n(n)
6495 .k(4)
6496 .iterations(1)
6497 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6498 }
6499 }
6500
6501 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4) {
6502 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6503 for (size_t k = 1; k < 4; k++) {
6504 GemmMicrokernelTester()
6505 .mr(8)
6506 .nr(8)
6507 .kr(1)
6508 .sr(1)
6509 .m(8)
6510 .n(8)
6511 .k(k)
6512 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6513 }
6514 }
6515
6516 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_strided_a) {
6517 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6518 for (size_t k = 1; k < 4; k++) {
6519 GemmMicrokernelTester()
6520 .mr(8)
6521 .nr(8)
6522 .kr(1)
6523 .sr(1)
6524 .m(8)
6525 .n(8)
6526 .k(k)
6527 .a_stride(7)
6528 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6529 }
6530 }
6531
6532 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_lt_4_subtile) {
6533 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6534 for (size_t k = 1; k < 4; k++) {
6535 for (uint32_t m = 1; m <= 8; m++) {
6536 for (uint32_t n = 1; n <= 8; n++) {
6537 GemmMicrokernelTester()
6538 .mr(8)
6539 .nr(8)
6540 .kr(1)
6541 .sr(1)
6542 .m(m)
6543 .n(n)
6544 .k(k)
6545 .iterations(1)
6546 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6547 }
6548 }
6549 }
6550 }
6551
6552 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4) {
6553 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6554 for (size_t k = 5; k < 8; k++) {
6555 GemmMicrokernelTester()
6556 .mr(8)
6557 .nr(8)
6558 .kr(1)
6559 .sr(1)
6560 .m(8)
6561 .n(8)
6562 .k(k)
6563 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6564 }
6565 }
6566
6567 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_strided_a) {
6568 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6569 for (size_t k = 5; k < 8; k++) {
6570 GemmMicrokernelTester()
6571 .mr(8)
6572 .nr(8)
6573 .kr(1)
6574 .sr(1)
6575 .m(8)
6576 .n(8)
6577 .k(k)
6578 .a_stride(11)
6579 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6580 }
6581 }
6582
6583 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_gt_4_subtile) {
6584 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6585 for (size_t k = 5; k < 8; k++) {
6586 for (uint32_t m = 1; m <= 8; m++) {
6587 for (uint32_t n = 1; n <= 8; n++) {
6588 GemmMicrokernelTester()
6589 .mr(8)
6590 .nr(8)
6591 .kr(1)
6592 .sr(1)
6593 .m(m)
6594 .n(n)
6595 .k(k)
6596 .iterations(1)
6597 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6598 }
6599 }
6600 }
6601 }
6602
6603 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4) {
6604 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6605 for (size_t k = 8; k <= 40; k += 4) {
6606 GemmMicrokernelTester()
6607 .mr(8)
6608 .nr(8)
6609 .kr(1)
6610 .sr(1)
6611 .m(8)
6612 .n(8)
6613 .k(k)
6614 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6615 }
6616 }
6617
6618 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_strided_a) {
6619 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6620 for (size_t k = 8; k <= 40; k += 4) {
6621 GemmMicrokernelTester()
6622 .mr(8)
6623 .nr(8)
6624 .kr(1)
6625 .sr(1)
6626 .m(8)
6627 .n(8)
6628 .k(k)
6629 .a_stride(43)
6630 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6631 }
6632 }
6633
6634 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, k_div_4_subtile) {
6635 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6636 for (size_t k = 8; k <= 40; k += 4) {
6637 for (uint32_t m = 1; m <= 8; m++) {
6638 for (uint32_t n = 1; n <= 8; n++) {
6639 GemmMicrokernelTester()
6640 .mr(8)
6641 .nr(8)
6642 .kr(1)
6643 .sr(1)
6644 .m(m)
6645 .n(n)
6646 .k(k)
6647 .iterations(1)
6648 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6649 }
6650 }
6651 }
6652 }
6653
6654 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8) {
6655 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6656 for (uint32_t n = 9; n < 16; n++) {
6657 for (size_t k = 1; k <= 20; k += 5) {
6658 GemmMicrokernelTester()
6659 .mr(8)
6660 .nr(8)
6661 .kr(1)
6662 .sr(1)
6663 .m(8)
6664 .n(8)
6665 .k(k)
6666 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6667 }
6668 }
6669 }
6670
6671 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_cn) {
6672 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6673 for (uint32_t n = 9; n < 16; n++) {
6674 for (size_t k = 1; k <= 20; k += 5) {
6675 GemmMicrokernelTester()
6676 .mr(8)
6677 .nr(8)
6678 .kr(1)
6679 .sr(1)
6680 .m(8)
6681 .n(8)
6682 .k(k)
6683 .cn_stride(11)
6684 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6685 }
6686 }
6687 }
6688
6689 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_strided_a) {
6690 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6691 for (uint32_t n = 9; n < 16; n++) {
6692 for (size_t k = 1; k <= 20; k += 5) {
6693 GemmMicrokernelTester()
6694 .mr(8)
6695 .nr(8)
6696 .kr(1)
6697 .sr(1)
6698 .m(8)
6699 .n(n)
6700 .k(k)
6701 .a_stride(23)
6702 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6703 }
6704 }
6705 }
6706
6707 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_gt_8_subtile) {
6708 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6709 for (uint32_t n = 9; n < 16; n++) {
6710 for (size_t k = 1; k <= 20; k += 5) {
6711 for (uint32_t m = 1; m <= 8; m++) {
6712 GemmMicrokernelTester()
6713 .mr(8)
6714 .nr(8)
6715 .kr(1)
6716 .sr(1)
6717 .m(m)
6718 .n(n)
6719 .k(k)
6720 .iterations(1)
6721 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6722 }
6723 }
6724 }
6725 }
6726
6727 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8) {
6728 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6729 for (uint32_t n = 16; n <= 24; n += 8) {
6730 for (size_t k = 1; k <= 20; k += 5) {
6731 GemmMicrokernelTester()
6732 .mr(8)
6733 .nr(8)
6734 .kr(1)
6735 .sr(1)
6736 .m(8)
6737 .n(8)
6738 .k(k)
6739 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6740 }
6741 }
6742 }
6743
6744 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_cn) {
6745 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6746 for (uint32_t n = 16; n <= 24; n += 8) {
6747 for (size_t k = 1; k <= 20; k += 5) {
6748 GemmMicrokernelTester()
6749 .mr(8)
6750 .nr(8)
6751 .kr(1)
6752 .sr(1)
6753 .m(8)
6754 .n(n)
6755 .k(k)
6756 .cn_stride(11)
6757 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6758 }
6759 }
6760 }
6761
6762 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_strided_a) {
6763 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6764 for (uint32_t n = 16; n <= 24; n += 8) {
6765 for (size_t k = 1; k <= 20; k += 5) {
6766 GemmMicrokernelTester()
6767 .mr(8)
6768 .nr(8)
6769 .kr(1)
6770 .sr(1)
6771 .m(8)
6772 .n(n)
6773 .k(k)
6774 .a_stride(23)
6775 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6776 }
6777 }
6778 }
6779
6780 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, n_div_8_subtile) {
6781 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6782 for (uint32_t n = 16; n <= 24; n += 8) {
6783 for (size_t k = 1; k <= 20; k += 5) {
6784 for (uint32_t m = 1; m <= 8; m++) {
6785 GemmMicrokernelTester()
6786 .mr(8)
6787 .nr(8)
6788 .kr(1)
6789 .sr(1)
6790 .m(m)
6791 .n(n)
6792 .k(k)
6793 .iterations(1)
6794 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6795 }
6796 }
6797 }
6798 }
6799
6800 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cm_subtile) {
6801 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6802 for (size_t k = 1; k <= 20; k += 5) {
6803 for (uint32_t m = 1; m <= 8; m++) {
6804 for (uint32_t n = 1; n <= 8; n++) {
6805 GemmMicrokernelTester()
6806 .mr(8)
6807 .nr(8)
6808 .kr(1)
6809 .sr(1)
6810 .m(m)
6811 .n(n)
6812 .k(k)
6813 .cm_stride(11)
6814 .iterations(1)
6815 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6816 }
6817 }
6818 }
6819 }
6820
6821 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, qmin) {
6822 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6823 GemmMicrokernelTester()
6824 .mr(8)
6825 .nr(8)
6826 .kr(1)
6827 .sr(1)
6828 .m(8)
6829 .n(8)
6830 .k(4)
6831 .qmin(128)
6832 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6833 }
6834
6835 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, qmax) {
6836 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6837 GemmMicrokernelTester()
6838 .mr(8)
6839 .nr(8)
6840 .kr(1)
6841 .sr(1)
6842 .m(8)
6843 .n(8)
6844 .k(4)
6845 .qmax(128)
6846 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6847 }
6848
6849 TEST(F16_GEMM_MINMAX_8X8__AARCH64_NEONFP16ARITH_LD64, strided_cm) {
6850 TEST_REQUIRES_ARM_NEON_FP16_ARITH;
6851 GemmMicrokernelTester()
6852 .mr(8)
6853 .nr(8)
6854 .kr(1)
6855 .sr(1)
6856 .m(8)
6857 .n(8)
6858 .k(4)
6859 .cm_stride(11)
6860 .Test(xnn_f16_gemm_minmax_ukernel_8x8__aarch64_neonfp16arith_ld64);
6861 }
6862#endif // XNN_ARCH_ARM64