blob: a06f46351d8f8e47b87b4aa50dd76a24616da24c [file] [log] [blame]
Marat Dukhan1c587112020-04-08 20:04:28 -07001// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8//
9// Auto-generated file. Do not edit!
10// Specification: test/f32-gemm-minmax.yaml
11// Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
22#include "gemm-microkernel-tester.h"
23
24
25#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
26 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
27 TEST_REQUIRES_ARM_NEON_FMA;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(8)
36 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
37 }
38
39 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FMA;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(8)
49 .cn_stride(11)
50 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
51 }
52
53 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
54 TEST_REQUIRES_ARM_NEON_FMA;
55 GemmMicrokernelTester()
56 .mr(1)
57 .nr(8)
58 .kr(1)
59 .sr(1)
60 .m(1)
61 .n(8)
62 .k(8)
63 .a_stride(11)
64 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
65 }
66
67 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
68 TEST_REQUIRES_ARM_NEON_FMA;
69 for (uint32_t m = 1; m <= 1; m++) {
70 for (uint32_t n = 1; n <= 8; n++) {
71 GemmMicrokernelTester()
72 .mr(1)
73 .nr(8)
74 .kr(1)
75 .sr(1)
76 .m(m)
77 .n(n)
78 .k(8)
79 .iterations(1)
80 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
81 }
82 }
83 }
84
85 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
86 TEST_REQUIRES_ARM_NEON_FMA;
87 for (uint32_t m = 1; m <= 1; m++) {
88 GemmMicrokernelTester()
89 .mr(1)
90 .nr(8)
91 .kr(1)
92 .sr(1)
93 .m(m)
94 .n(8)
95 .k(8)
96 .iterations(1)
97 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
98 }
99 }
100
101 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
102 TEST_REQUIRES_ARM_NEON_FMA;
103 for (uint32_t n = 1; n <= 8; n++) {
104 GemmMicrokernelTester()
105 .mr(1)
106 .nr(8)
107 .kr(1)
108 .sr(1)
109 .m(1)
110 .n(n)
111 .k(8)
112 .iterations(1)
113 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
114 }
115 }
116
117 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
118 TEST_REQUIRES_ARM_NEON_FMA;
119 GemmMicrokernelTester()
120 .mr(1)
121 .nr(8)
122 .kr(1)
123 .sr(1)
124 .m(1)
125 .n(8)
126 .k(16)
127 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
128 }
129
130 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_strided_a) {
131 TEST_REQUIRES_ARM_NEON_FMA;
132 GemmMicrokernelTester()
133 .mr(1)
134 .nr(8)
135 .kr(1)
136 .sr(1)
137 .m(1)
138 .n(8)
139 .k(16)
140 .a_stride(19)
141 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
142 }
143
144 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
145 TEST_REQUIRES_ARM_NEON_FMA;
146 for (uint32_t m = 1; m <= 1; m++) {
147 for (uint32_t n = 1; n <= 8; n++) {
148 GemmMicrokernelTester()
149 .mr(1)
150 .nr(8)
151 .kr(1)
152 .sr(1)
153 .m(m)
154 .n(n)
155 .k(16)
156 .iterations(1)
157 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
158 }
159 }
160 }
161
162 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
163 TEST_REQUIRES_ARM_NEON_FMA;
164 for (size_t k = 1; k < 16; k++) {
165 GemmMicrokernelTester()
166 .mr(1)
167 .nr(8)
168 .kr(1)
169 .sr(1)
170 .m(1)
171 .n(8)
172 .k(k)
173 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
174 }
175 }
176
177 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_strided_a) {
178 TEST_REQUIRES_ARM_NEON_FMA;
179 for (size_t k = 1; k < 16; k++) {
180 GemmMicrokernelTester()
181 .mr(1)
182 .nr(8)
183 .kr(1)
184 .sr(1)
185 .m(1)
186 .n(8)
187 .k(k)
188 .a_stride(19)
189 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
190 }
191 }
192
193 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
194 TEST_REQUIRES_ARM_NEON_FMA;
195 for (size_t k = 1; k < 16; k++) {
196 for (uint32_t m = 1; m <= 1; m++) {
197 for (uint32_t n = 1; n <= 8; n++) {
198 GemmMicrokernelTester()
199 .mr(1)
200 .nr(8)
201 .kr(1)
202 .sr(1)
203 .m(m)
204 .n(n)
205 .k(k)
206 .iterations(1)
207 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
208 }
209 }
210 }
211 }
212
213 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
214 TEST_REQUIRES_ARM_NEON_FMA;
215 for (size_t k = 17; k < 16; k++) {
216 GemmMicrokernelTester()
217 .mr(1)
218 .nr(8)
219 .kr(1)
220 .sr(1)
221 .m(1)
222 .n(8)
223 .k(k)
224 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
225 }
226 }
227
228 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
229 TEST_REQUIRES_ARM_NEON_FMA;
230 for (size_t k = 17; k < 16; k++) {
231 GemmMicrokernelTester()
232 .mr(1)
233 .nr(8)
234 .kr(1)
235 .sr(1)
236 .m(1)
237 .n(8)
238 .k(k)
239 .a_stride(19)
240 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
241 }
242 }
243
244 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
245 TEST_REQUIRES_ARM_NEON_FMA;
246 for (size_t k = 17; k < 16; k++) {
247 for (uint32_t m = 1; m <= 1; m++) {
248 for (uint32_t n = 1; n <= 8; n++) {
249 GemmMicrokernelTester()
250 .mr(1)
251 .nr(8)
252 .kr(1)
253 .sr(1)
254 .m(m)
255 .n(n)
256 .k(k)
257 .iterations(1)
258 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
259 }
260 }
261 }
262 }
263
264 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
265 TEST_REQUIRES_ARM_NEON_FMA;
266 for (size_t k = 24; k <= 80; k += 8) {
267 GemmMicrokernelTester()
268 .mr(1)
269 .nr(8)
270 .kr(1)
271 .sr(1)
272 .m(1)
273 .n(8)
274 .k(k)
275 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
276 }
277 }
278
279 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_strided_a) {
280 TEST_REQUIRES_ARM_NEON_FMA;
281 for (size_t k = 24; k <= 80; k += 8) {
282 GemmMicrokernelTester()
283 .mr(1)
284 .nr(8)
285 .kr(1)
286 .sr(1)
287 .m(1)
288 .n(8)
289 .k(k)
290 .a_stride(83)
291 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
292 }
293 }
294
295 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
296 TEST_REQUIRES_ARM_NEON_FMA;
297 for (size_t k = 24; k <= 80; k += 8) {
298 for (uint32_t m = 1; m <= 1; m++) {
299 for (uint32_t n = 1; n <= 8; n++) {
300 GemmMicrokernelTester()
301 .mr(1)
302 .nr(8)
303 .kr(1)
304 .sr(1)
305 .m(m)
306 .n(n)
307 .k(k)
308 .iterations(1)
309 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
310 }
311 }
312 }
313 }
314
315 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
316 TEST_REQUIRES_ARM_NEON_FMA;
317 for (uint32_t n = 9; n < 16; n++) {
318 for (size_t k = 1; k <= 40; k += 9) {
319 GemmMicrokernelTester()
320 .mr(1)
321 .nr(8)
322 .kr(1)
323 .sr(1)
324 .m(1)
325 .n(8)
326 .k(k)
327 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
328 }
329 }
330 }
331
332 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
333 TEST_REQUIRES_ARM_NEON_FMA;
334 for (uint32_t n = 9; n < 16; n++) {
335 for (size_t k = 1; k <= 40; k += 9) {
336 GemmMicrokernelTester()
337 .mr(1)
338 .nr(8)
339 .kr(1)
340 .sr(1)
341 .m(1)
342 .n(8)
343 .k(k)
344 .cn_stride(11)
345 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
346 }
347 }
348 }
349
350 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
351 TEST_REQUIRES_ARM_NEON_FMA;
352 for (uint32_t n = 9; n < 16; n++) {
353 for (size_t k = 1; k <= 40; k += 9) {
354 GemmMicrokernelTester()
355 .mr(1)
356 .nr(8)
357 .kr(1)
358 .sr(1)
359 .m(1)
360 .n(n)
361 .k(k)
362 .a_stride(43)
363 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
364 }
365 }
366 }
367
368 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
369 TEST_REQUIRES_ARM_NEON_FMA;
370 for (uint32_t n = 9; n < 16; n++) {
371 for (size_t k = 1; k <= 40; k += 9) {
372 for (uint32_t m = 1; m <= 1; m++) {
373 GemmMicrokernelTester()
374 .mr(1)
375 .nr(8)
376 .kr(1)
377 .sr(1)
378 .m(m)
379 .n(n)
380 .k(k)
381 .iterations(1)
382 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
383 }
384 }
385 }
386 }
387
388 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
389 TEST_REQUIRES_ARM_NEON_FMA;
390 for (uint32_t n = 16; n <= 24; n += 8) {
391 for (size_t k = 1; k <= 40; k += 9) {
392 GemmMicrokernelTester()
393 .mr(1)
394 .nr(8)
395 .kr(1)
396 .sr(1)
397 .m(1)
398 .n(8)
399 .k(k)
400 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
401 }
402 }
403 }
404
405 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
406 TEST_REQUIRES_ARM_NEON_FMA;
407 for (uint32_t n = 16; n <= 24; n += 8) {
408 for (size_t k = 1; k <= 40; k += 9) {
409 GemmMicrokernelTester()
410 .mr(1)
411 .nr(8)
412 .kr(1)
413 .sr(1)
414 .m(1)
415 .n(n)
416 .k(k)
417 .cn_stride(11)
418 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
419 }
420 }
421 }
422
423 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
424 TEST_REQUIRES_ARM_NEON_FMA;
425 for (uint32_t n = 16; n <= 24; n += 8) {
426 for (size_t k = 1; k <= 40; k += 9) {
427 GemmMicrokernelTester()
428 .mr(1)
429 .nr(8)
430 .kr(1)
431 .sr(1)
432 .m(1)
433 .n(n)
434 .k(k)
435 .a_stride(43)
436 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
437 }
438 }
439 }
440
441 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
442 TEST_REQUIRES_ARM_NEON_FMA;
443 for (uint32_t n = 16; n <= 24; n += 8) {
444 for (size_t k = 1; k <= 40; k += 9) {
445 for (uint32_t m = 1; m <= 1; m++) {
446 GemmMicrokernelTester()
447 .mr(1)
448 .nr(8)
449 .kr(1)
450 .sr(1)
451 .m(m)
452 .n(n)
453 .k(k)
454 .iterations(1)
455 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
456 }
457 }
458 }
459 }
460
461 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
462 TEST_REQUIRES_ARM_NEON_FMA;
463 for (size_t k = 1; k <= 40; k += 9) {
464 for (uint32_t m = 1; m <= 1; m++) {
465 for (uint32_t n = 1; n <= 8; n++) {
466 GemmMicrokernelTester()
467 .mr(1)
468 .nr(8)
469 .kr(1)
470 .sr(1)
471 .m(m)
472 .n(n)
473 .k(k)
474 .cm_stride(11)
475 .iterations(1)
476 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
477 }
478 }
479 }
480 }
481
482 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
483 TEST_REQUIRES_ARM_NEON_FMA;
484 GemmMicrokernelTester()
485 .mr(1)
486 .nr(8)
487 .kr(1)
488 .sr(1)
489 .m(1)
490 .n(8)
491 .k(8)
492 .qmin(128)
493 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
494 }
495
496 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
497 TEST_REQUIRES_ARM_NEON_FMA;
498 GemmMicrokernelTester()
499 .mr(1)
500 .nr(8)
501 .kr(1)
502 .sr(1)
503 .m(1)
504 .n(8)
505 .k(8)
506 .qmax(128)
507 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
508 }
509
510 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
511 TEST_REQUIRES_ARM_NEON_FMA;
512 GemmMicrokernelTester()
513 .mr(1)
514 .nr(8)
515 .kr(1)
516 .sr(1)
517 .m(1)
518 .n(8)
519 .k(8)
520 .cm_stride(11)
521 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
522 }
523#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
524
525
526#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
527 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
528 TEST_REQUIRES_ARM_NEON_FMA;
529 GemmMicrokernelTester()
530 .mr(1)
531 .nr(8)
532 .kr(1)
533 .sr(1)
534 .m(1)
535 .n(8)
536 .k(8)
537 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
538 }
539
540 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
541 TEST_REQUIRES_ARM_NEON_FMA;
542 GemmMicrokernelTester()
543 .mr(1)
544 .nr(8)
545 .kr(1)
546 .sr(1)
547 .m(1)
548 .n(8)
549 .k(8)
550 .cn_stride(11)
551 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
552 }
553
554 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
555 TEST_REQUIRES_ARM_NEON_FMA;
556 GemmMicrokernelTester()
557 .mr(1)
558 .nr(8)
559 .kr(1)
560 .sr(1)
561 .m(1)
562 .n(8)
563 .k(8)
564 .a_stride(11)
565 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
566 }
567
568 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
569 TEST_REQUIRES_ARM_NEON_FMA;
570 for (uint32_t m = 1; m <= 1; m++) {
571 for (uint32_t n = 1; n <= 8; n++) {
572 GemmMicrokernelTester()
573 .mr(1)
574 .nr(8)
575 .kr(1)
576 .sr(1)
577 .m(m)
578 .n(n)
579 .k(8)
580 .iterations(1)
581 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
582 }
583 }
584 }
585
586 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
587 TEST_REQUIRES_ARM_NEON_FMA;
588 for (uint32_t m = 1; m <= 1; m++) {
589 GemmMicrokernelTester()
590 .mr(1)
591 .nr(8)
592 .kr(1)
593 .sr(1)
594 .m(m)
595 .n(8)
596 .k(8)
597 .iterations(1)
598 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
599 }
600 }
601
602 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
603 TEST_REQUIRES_ARM_NEON_FMA;
604 for (uint32_t n = 1; n <= 8; n++) {
605 GemmMicrokernelTester()
606 .mr(1)
607 .nr(8)
608 .kr(1)
609 .sr(1)
610 .m(1)
611 .n(n)
612 .k(8)
613 .iterations(1)
614 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
615 }
616 }
617
618 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
619 TEST_REQUIRES_ARM_NEON_FMA;
620 GemmMicrokernelTester()
621 .mr(1)
622 .nr(8)
623 .kr(1)
624 .sr(1)
625 .m(1)
626 .n(8)
627 .k(16)
628 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
629 }
630
631 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
632 TEST_REQUIRES_ARM_NEON_FMA;
633 GemmMicrokernelTester()
634 .mr(1)
635 .nr(8)
636 .kr(1)
637 .sr(1)
638 .m(1)
639 .n(8)
640 .k(16)
641 .a_stride(19)
642 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
643 }
644
645 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
646 TEST_REQUIRES_ARM_NEON_FMA;
647 for (uint32_t m = 1; m <= 1; m++) {
648 for (uint32_t n = 1; n <= 8; n++) {
649 GemmMicrokernelTester()
650 .mr(1)
651 .nr(8)
652 .kr(1)
653 .sr(1)
654 .m(m)
655 .n(n)
656 .k(16)
657 .iterations(1)
658 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
659 }
660 }
661 }
662
663 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
664 TEST_REQUIRES_ARM_NEON_FMA;
665 for (size_t k = 1; k < 16; k++) {
666 GemmMicrokernelTester()
667 .mr(1)
668 .nr(8)
669 .kr(1)
670 .sr(1)
671 .m(1)
672 .n(8)
673 .k(k)
674 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
675 }
676 }
677
678 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
679 TEST_REQUIRES_ARM_NEON_FMA;
680 for (size_t k = 1; k < 16; k++) {
681 GemmMicrokernelTester()
682 .mr(1)
683 .nr(8)
684 .kr(1)
685 .sr(1)
686 .m(1)
687 .n(8)
688 .k(k)
689 .a_stride(19)
690 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
691 }
692 }
693
694 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
695 TEST_REQUIRES_ARM_NEON_FMA;
696 for (size_t k = 1; k < 16; k++) {
697 for (uint32_t m = 1; m <= 1; m++) {
698 for (uint32_t n = 1; n <= 8; n++) {
699 GemmMicrokernelTester()
700 .mr(1)
701 .nr(8)
702 .kr(1)
703 .sr(1)
704 .m(m)
705 .n(n)
706 .k(k)
707 .iterations(1)
708 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
709 }
710 }
711 }
712 }
713
714 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
715 TEST_REQUIRES_ARM_NEON_FMA;
716 for (size_t k = 17; k < 16; k++) {
717 GemmMicrokernelTester()
718 .mr(1)
719 .nr(8)
720 .kr(1)
721 .sr(1)
722 .m(1)
723 .n(8)
724 .k(k)
725 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
726 }
727 }
728
729 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
730 TEST_REQUIRES_ARM_NEON_FMA;
731 for (size_t k = 17; k < 16; k++) {
732 GemmMicrokernelTester()
733 .mr(1)
734 .nr(8)
735 .kr(1)
736 .sr(1)
737 .m(1)
738 .n(8)
739 .k(k)
740 .a_stride(19)
741 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
742 }
743 }
744
745 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
746 TEST_REQUIRES_ARM_NEON_FMA;
747 for (size_t k = 17; k < 16; k++) {
748 for (uint32_t m = 1; m <= 1; m++) {
749 for (uint32_t n = 1; n <= 8; n++) {
750 GemmMicrokernelTester()
751 .mr(1)
752 .nr(8)
753 .kr(1)
754 .sr(1)
755 .m(m)
756 .n(n)
757 .k(k)
758 .iterations(1)
759 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
760 }
761 }
762 }
763 }
764
765 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
766 TEST_REQUIRES_ARM_NEON_FMA;
767 for (size_t k = 24; k <= 80; k += 8) {
768 GemmMicrokernelTester()
769 .mr(1)
770 .nr(8)
771 .kr(1)
772 .sr(1)
773 .m(1)
774 .n(8)
775 .k(k)
776 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
777 }
778 }
779
780 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
781 TEST_REQUIRES_ARM_NEON_FMA;
782 for (size_t k = 24; k <= 80; k += 8) {
783 GemmMicrokernelTester()
784 .mr(1)
785 .nr(8)
786 .kr(1)
787 .sr(1)
788 .m(1)
789 .n(8)
790 .k(k)
791 .a_stride(83)
792 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
793 }
794 }
795
796 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
797 TEST_REQUIRES_ARM_NEON_FMA;
798 for (size_t k = 24; k <= 80; k += 8) {
799 for (uint32_t m = 1; m <= 1; m++) {
800 for (uint32_t n = 1; n <= 8; n++) {
801 GemmMicrokernelTester()
802 .mr(1)
803 .nr(8)
804 .kr(1)
805 .sr(1)
806 .m(m)
807 .n(n)
808 .k(k)
809 .iterations(1)
810 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
811 }
812 }
813 }
814 }
815
816 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
817 TEST_REQUIRES_ARM_NEON_FMA;
818 for (uint32_t n = 9; n < 16; n++) {
819 for (size_t k = 1; k <= 40; k += 9) {
820 GemmMicrokernelTester()
821 .mr(1)
822 .nr(8)
823 .kr(1)
824 .sr(1)
825 .m(1)
826 .n(8)
827 .k(k)
828 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
829 }
830 }
831 }
832
833 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
834 TEST_REQUIRES_ARM_NEON_FMA;
835 for (uint32_t n = 9; n < 16; n++) {
836 for (size_t k = 1; k <= 40; k += 9) {
837 GemmMicrokernelTester()
838 .mr(1)
839 .nr(8)
840 .kr(1)
841 .sr(1)
842 .m(1)
843 .n(8)
844 .k(k)
845 .cn_stride(11)
846 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
847 }
848 }
849 }
850
851 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
852 TEST_REQUIRES_ARM_NEON_FMA;
853 for (uint32_t n = 9; n < 16; n++) {
854 for (size_t k = 1; k <= 40; k += 9) {
855 GemmMicrokernelTester()
856 .mr(1)
857 .nr(8)
858 .kr(1)
859 .sr(1)
860 .m(1)
861 .n(n)
862 .k(k)
863 .a_stride(43)
864 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
865 }
866 }
867 }
868
869 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
870 TEST_REQUIRES_ARM_NEON_FMA;
871 for (uint32_t n = 9; n < 16; n++) {
872 for (size_t k = 1; k <= 40; k += 9) {
873 for (uint32_t m = 1; m <= 1; m++) {
874 GemmMicrokernelTester()
875 .mr(1)
876 .nr(8)
877 .kr(1)
878 .sr(1)
879 .m(m)
880 .n(n)
881 .k(k)
882 .iterations(1)
883 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
884 }
885 }
886 }
887 }
888
889 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
890 TEST_REQUIRES_ARM_NEON_FMA;
891 for (uint32_t n = 16; n <= 24; n += 8) {
892 for (size_t k = 1; k <= 40; k += 9) {
893 GemmMicrokernelTester()
894 .mr(1)
895 .nr(8)
896 .kr(1)
897 .sr(1)
898 .m(1)
899 .n(8)
900 .k(k)
901 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
902 }
903 }
904 }
905
906 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
907 TEST_REQUIRES_ARM_NEON_FMA;
908 for (uint32_t n = 16; n <= 24; n += 8) {
909 for (size_t k = 1; k <= 40; k += 9) {
910 GemmMicrokernelTester()
911 .mr(1)
912 .nr(8)
913 .kr(1)
914 .sr(1)
915 .m(1)
916 .n(n)
917 .k(k)
918 .cn_stride(11)
919 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
920 }
921 }
922 }
923
924 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
925 TEST_REQUIRES_ARM_NEON_FMA;
926 for (uint32_t n = 16; n <= 24; n += 8) {
927 for (size_t k = 1; k <= 40; k += 9) {
928 GemmMicrokernelTester()
929 .mr(1)
930 .nr(8)
931 .kr(1)
932 .sr(1)
933 .m(1)
934 .n(n)
935 .k(k)
936 .a_stride(43)
937 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
938 }
939 }
940 }
941
942 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
943 TEST_REQUIRES_ARM_NEON_FMA;
944 for (uint32_t n = 16; n <= 24; n += 8) {
945 for (size_t k = 1; k <= 40; k += 9) {
946 for (uint32_t m = 1; m <= 1; m++) {
947 GemmMicrokernelTester()
948 .mr(1)
949 .nr(8)
950 .kr(1)
951 .sr(1)
952 .m(m)
953 .n(n)
954 .k(k)
955 .iterations(1)
956 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
957 }
958 }
959 }
960 }
961
962 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
963 TEST_REQUIRES_ARM_NEON_FMA;
964 for (size_t k = 1; k <= 40; k += 9) {
965 for (uint32_t m = 1; m <= 1; m++) {
966 for (uint32_t n = 1; n <= 8; n++) {
967 GemmMicrokernelTester()
968 .mr(1)
969 .nr(8)
970 .kr(1)
971 .sr(1)
972 .m(m)
973 .n(n)
974 .k(k)
975 .cm_stride(11)
976 .iterations(1)
977 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
978 }
979 }
980 }
981 }
982
983 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
984 TEST_REQUIRES_ARM_NEON_FMA;
985 GemmMicrokernelTester()
986 .mr(1)
987 .nr(8)
988 .kr(1)
989 .sr(1)
990 .m(1)
991 .n(8)
992 .k(8)
993 .qmin(128)
994 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
995 }
996
997 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
998 TEST_REQUIRES_ARM_NEON_FMA;
999 GemmMicrokernelTester()
1000 .mr(1)
1001 .nr(8)
1002 .kr(1)
1003 .sr(1)
1004 .m(1)
1005 .n(8)
1006 .k(8)
1007 .qmax(128)
1008 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1009 }
1010
1011 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
1012 TEST_REQUIRES_ARM_NEON_FMA;
1013 GemmMicrokernelTester()
1014 .mr(1)
1015 .nr(8)
1016 .kr(1)
1017 .sr(1)
1018 .m(1)
1019 .n(8)
1020 .k(8)
1021 .cm_stride(11)
1022 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1023 }
1024#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1025
1026
1027#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1028 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
1029 TEST_REQUIRES_ARM_NEON_FMA;
1030 GemmMicrokernelTester()
1031 .mr(1)
1032 .nr(8)
1033 .kr(1)
1034 .sr(1)
1035 .m(1)
1036 .n(8)
1037 .k(8)
1038 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1039 }
1040
1041 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
1042 TEST_REQUIRES_ARM_NEON_FMA;
1043 GemmMicrokernelTester()
1044 .mr(1)
1045 .nr(8)
1046 .kr(1)
1047 .sr(1)
1048 .m(1)
1049 .n(8)
1050 .k(8)
1051 .cn_stride(11)
1052 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1053 }
1054
1055 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
1056 TEST_REQUIRES_ARM_NEON_FMA;
1057 GemmMicrokernelTester()
1058 .mr(1)
1059 .nr(8)
1060 .kr(1)
1061 .sr(1)
1062 .m(1)
1063 .n(8)
1064 .k(8)
1065 .a_stride(11)
1066 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1067 }
1068
1069 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
1070 TEST_REQUIRES_ARM_NEON_FMA;
1071 for (uint32_t m = 1; m <= 1; m++) {
1072 for (uint32_t n = 1; n <= 8; n++) {
1073 GemmMicrokernelTester()
1074 .mr(1)
1075 .nr(8)
1076 .kr(1)
1077 .sr(1)
1078 .m(m)
1079 .n(n)
1080 .k(8)
1081 .iterations(1)
1082 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1083 }
1084 }
1085 }
1086
1087 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
1088 TEST_REQUIRES_ARM_NEON_FMA;
1089 for (uint32_t m = 1; m <= 1; m++) {
1090 GemmMicrokernelTester()
1091 .mr(1)
1092 .nr(8)
1093 .kr(1)
1094 .sr(1)
1095 .m(m)
1096 .n(8)
1097 .k(8)
1098 .iterations(1)
1099 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1100 }
1101 }
1102
1103 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
1104 TEST_REQUIRES_ARM_NEON_FMA;
1105 for (uint32_t n = 1; n <= 8; n++) {
1106 GemmMicrokernelTester()
1107 .mr(1)
1108 .nr(8)
1109 .kr(1)
1110 .sr(1)
1111 .m(1)
1112 .n(n)
1113 .k(8)
1114 .iterations(1)
1115 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1116 }
1117 }
1118
1119 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
1120 TEST_REQUIRES_ARM_NEON_FMA;
1121 GemmMicrokernelTester()
1122 .mr(1)
1123 .nr(8)
1124 .kr(1)
1125 .sr(1)
1126 .m(1)
1127 .n(8)
1128 .k(16)
1129 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1130 }
1131
1132 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
1133 TEST_REQUIRES_ARM_NEON_FMA;
1134 GemmMicrokernelTester()
1135 .mr(1)
1136 .nr(8)
1137 .kr(1)
1138 .sr(1)
1139 .m(1)
1140 .n(8)
1141 .k(16)
1142 .a_stride(19)
1143 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1144 }
1145
1146 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
1147 TEST_REQUIRES_ARM_NEON_FMA;
1148 for (uint32_t m = 1; m <= 1; m++) {
1149 for (uint32_t n = 1; n <= 8; n++) {
1150 GemmMicrokernelTester()
1151 .mr(1)
1152 .nr(8)
1153 .kr(1)
1154 .sr(1)
1155 .m(m)
1156 .n(n)
1157 .k(16)
1158 .iterations(1)
1159 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1160 }
1161 }
1162 }
1163
1164 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
1165 TEST_REQUIRES_ARM_NEON_FMA;
1166 for (size_t k = 1; k < 16; k++) {
1167 GemmMicrokernelTester()
1168 .mr(1)
1169 .nr(8)
1170 .kr(1)
1171 .sr(1)
1172 .m(1)
1173 .n(8)
1174 .k(k)
1175 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1176 }
1177 }
1178
1179 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
1180 TEST_REQUIRES_ARM_NEON_FMA;
1181 for (size_t k = 1; k < 16; k++) {
1182 GemmMicrokernelTester()
1183 .mr(1)
1184 .nr(8)
1185 .kr(1)
1186 .sr(1)
1187 .m(1)
1188 .n(8)
1189 .k(k)
1190 .a_stride(19)
1191 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1192 }
1193 }
1194
1195 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
1196 TEST_REQUIRES_ARM_NEON_FMA;
1197 for (size_t k = 1; k < 16; k++) {
1198 for (uint32_t m = 1; m <= 1; m++) {
1199 for (uint32_t n = 1; n <= 8; n++) {
1200 GemmMicrokernelTester()
1201 .mr(1)
1202 .nr(8)
1203 .kr(1)
1204 .sr(1)
1205 .m(m)
1206 .n(n)
1207 .k(k)
1208 .iterations(1)
1209 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1210 }
1211 }
1212 }
1213 }
1214
1215 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
1216 TEST_REQUIRES_ARM_NEON_FMA;
1217 for (size_t k = 17; k < 16; k++) {
1218 GemmMicrokernelTester()
1219 .mr(1)
1220 .nr(8)
1221 .kr(1)
1222 .sr(1)
1223 .m(1)
1224 .n(8)
1225 .k(k)
1226 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1227 }
1228 }
1229
1230 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
1231 TEST_REQUIRES_ARM_NEON_FMA;
1232 for (size_t k = 17; k < 16; k++) {
1233 GemmMicrokernelTester()
1234 .mr(1)
1235 .nr(8)
1236 .kr(1)
1237 .sr(1)
1238 .m(1)
1239 .n(8)
1240 .k(k)
1241 .a_stride(19)
1242 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1243 }
1244 }
1245
1246 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
1247 TEST_REQUIRES_ARM_NEON_FMA;
1248 for (size_t k = 17; k < 16; k++) {
1249 for (uint32_t m = 1; m <= 1; m++) {
1250 for (uint32_t n = 1; n <= 8; n++) {
1251 GemmMicrokernelTester()
1252 .mr(1)
1253 .nr(8)
1254 .kr(1)
1255 .sr(1)
1256 .m(m)
1257 .n(n)
1258 .k(k)
1259 .iterations(1)
1260 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1261 }
1262 }
1263 }
1264 }
1265
1266 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
1267 TEST_REQUIRES_ARM_NEON_FMA;
1268 for (size_t k = 24; k <= 80; k += 8) {
1269 GemmMicrokernelTester()
1270 .mr(1)
1271 .nr(8)
1272 .kr(1)
1273 .sr(1)
1274 .m(1)
1275 .n(8)
1276 .k(k)
1277 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1278 }
1279 }
1280
1281 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
1282 TEST_REQUIRES_ARM_NEON_FMA;
1283 for (size_t k = 24; k <= 80; k += 8) {
1284 GemmMicrokernelTester()
1285 .mr(1)
1286 .nr(8)
1287 .kr(1)
1288 .sr(1)
1289 .m(1)
1290 .n(8)
1291 .k(k)
1292 .a_stride(83)
1293 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1294 }
1295 }
1296
1297 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
1298 TEST_REQUIRES_ARM_NEON_FMA;
1299 for (size_t k = 24; k <= 80; k += 8) {
1300 for (uint32_t m = 1; m <= 1; m++) {
1301 for (uint32_t n = 1; n <= 8; n++) {
1302 GemmMicrokernelTester()
1303 .mr(1)
1304 .nr(8)
1305 .kr(1)
1306 .sr(1)
1307 .m(m)
1308 .n(n)
1309 .k(k)
1310 .iterations(1)
1311 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1312 }
1313 }
1314 }
1315 }
1316
1317 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
1318 TEST_REQUIRES_ARM_NEON_FMA;
1319 for (uint32_t n = 9; n < 16; n++) {
1320 for (size_t k = 1; k <= 40; k += 9) {
1321 GemmMicrokernelTester()
1322 .mr(1)
1323 .nr(8)
1324 .kr(1)
1325 .sr(1)
1326 .m(1)
1327 .n(8)
1328 .k(k)
1329 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1330 }
1331 }
1332 }
1333
1334 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
1335 TEST_REQUIRES_ARM_NEON_FMA;
1336 for (uint32_t n = 9; n < 16; n++) {
1337 for (size_t k = 1; k <= 40; k += 9) {
1338 GemmMicrokernelTester()
1339 .mr(1)
1340 .nr(8)
1341 .kr(1)
1342 .sr(1)
1343 .m(1)
1344 .n(8)
1345 .k(k)
1346 .cn_stride(11)
1347 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1348 }
1349 }
1350 }
1351
1352 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
1353 TEST_REQUIRES_ARM_NEON_FMA;
1354 for (uint32_t n = 9; n < 16; n++) {
1355 for (size_t k = 1; k <= 40; k += 9) {
1356 GemmMicrokernelTester()
1357 .mr(1)
1358 .nr(8)
1359 .kr(1)
1360 .sr(1)
1361 .m(1)
1362 .n(n)
1363 .k(k)
1364 .a_stride(43)
1365 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1366 }
1367 }
1368 }
1369
1370 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
1371 TEST_REQUIRES_ARM_NEON_FMA;
1372 for (uint32_t n = 9; n < 16; n++) {
1373 for (size_t k = 1; k <= 40; k += 9) {
1374 for (uint32_t m = 1; m <= 1; m++) {
1375 GemmMicrokernelTester()
1376 .mr(1)
1377 .nr(8)
1378 .kr(1)
1379 .sr(1)
1380 .m(m)
1381 .n(n)
1382 .k(k)
1383 .iterations(1)
1384 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1385 }
1386 }
1387 }
1388 }
1389
1390 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
1391 TEST_REQUIRES_ARM_NEON_FMA;
1392 for (uint32_t n = 16; n <= 24; n += 8) {
1393 for (size_t k = 1; k <= 40; k += 9) {
1394 GemmMicrokernelTester()
1395 .mr(1)
1396 .nr(8)
1397 .kr(1)
1398 .sr(1)
1399 .m(1)
1400 .n(8)
1401 .k(k)
1402 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1403 }
1404 }
1405 }
1406
1407 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
1408 TEST_REQUIRES_ARM_NEON_FMA;
1409 for (uint32_t n = 16; n <= 24; n += 8) {
1410 for (size_t k = 1; k <= 40; k += 9) {
1411 GemmMicrokernelTester()
1412 .mr(1)
1413 .nr(8)
1414 .kr(1)
1415 .sr(1)
1416 .m(1)
1417 .n(n)
1418 .k(k)
1419 .cn_stride(11)
1420 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1421 }
1422 }
1423 }
1424
1425 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
1426 TEST_REQUIRES_ARM_NEON_FMA;
1427 for (uint32_t n = 16; n <= 24; n += 8) {
1428 for (size_t k = 1; k <= 40; k += 9) {
1429 GemmMicrokernelTester()
1430 .mr(1)
1431 .nr(8)
1432 .kr(1)
1433 .sr(1)
1434 .m(1)
1435 .n(n)
1436 .k(k)
1437 .a_stride(43)
1438 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1439 }
1440 }
1441 }
1442
1443 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
1444 TEST_REQUIRES_ARM_NEON_FMA;
1445 for (uint32_t n = 16; n <= 24; n += 8) {
1446 for (size_t k = 1; k <= 40; k += 9) {
1447 for (uint32_t m = 1; m <= 1; m++) {
1448 GemmMicrokernelTester()
1449 .mr(1)
1450 .nr(8)
1451 .kr(1)
1452 .sr(1)
1453 .m(m)
1454 .n(n)
1455 .k(k)
1456 .iterations(1)
1457 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1458 }
1459 }
1460 }
1461 }
1462
1463 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
1464 TEST_REQUIRES_ARM_NEON_FMA;
1465 for (size_t k = 1; k <= 40; k += 9) {
1466 for (uint32_t m = 1; m <= 1; m++) {
1467 for (uint32_t n = 1; n <= 8; n++) {
1468 GemmMicrokernelTester()
1469 .mr(1)
1470 .nr(8)
1471 .kr(1)
1472 .sr(1)
1473 .m(m)
1474 .n(n)
1475 .k(k)
1476 .cm_stride(11)
1477 .iterations(1)
1478 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1479 }
1480 }
1481 }
1482 }
1483
1484 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
1485 TEST_REQUIRES_ARM_NEON_FMA;
1486 GemmMicrokernelTester()
1487 .mr(1)
1488 .nr(8)
1489 .kr(1)
1490 .sr(1)
1491 .m(1)
1492 .n(8)
1493 .k(8)
1494 .qmin(128)
1495 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1496 }
1497
1498 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
1499 TEST_REQUIRES_ARM_NEON_FMA;
1500 GemmMicrokernelTester()
1501 .mr(1)
1502 .nr(8)
1503 .kr(1)
1504 .sr(1)
1505 .m(1)
1506 .n(8)
1507 .k(8)
1508 .qmax(128)
1509 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1510 }
1511
1512 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
1513 TEST_REQUIRES_ARM_NEON_FMA;
1514 GemmMicrokernelTester()
1515 .mr(1)
1516 .nr(8)
1517 .kr(1)
1518 .sr(1)
1519 .m(1)
1520 .n(8)
1521 .k(8)
1522 .cm_stride(11)
1523 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1524 }
1525#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1526
1527
1528#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1529 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
1530 TEST_REQUIRES_ARM_NEON_FMA;
1531 GemmMicrokernelTester()
1532 .mr(4)
1533 .nr(8)
1534 .kr(1)
1535 .sr(1)
1536 .m(4)
1537 .n(8)
1538 .k(4)
1539 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1540 }
1541
1542 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
1543 TEST_REQUIRES_ARM_NEON_FMA;
1544 GemmMicrokernelTester()
1545 .mr(4)
1546 .nr(8)
1547 .kr(1)
1548 .sr(1)
1549 .m(4)
1550 .n(8)
1551 .k(4)
1552 .cn_stride(11)
1553 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1554 }
1555
1556 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
1557 TEST_REQUIRES_ARM_NEON_FMA;
1558 GemmMicrokernelTester()
1559 .mr(4)
1560 .nr(8)
1561 .kr(1)
1562 .sr(1)
1563 .m(4)
1564 .n(8)
1565 .k(4)
1566 .a_stride(7)
1567 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1568 }
1569
1570 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
1571 TEST_REQUIRES_ARM_NEON_FMA;
1572 for (uint32_t m = 1; m <= 4; m++) {
1573 for (uint32_t n = 1; n <= 8; n++) {
1574 GemmMicrokernelTester()
1575 .mr(4)
1576 .nr(8)
1577 .kr(1)
1578 .sr(1)
1579 .m(m)
1580 .n(n)
1581 .k(4)
1582 .iterations(1)
1583 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1584 }
1585 }
1586 }
1587
1588 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
1589 TEST_REQUIRES_ARM_NEON_FMA;
1590 for (uint32_t m = 1; m <= 4; m++) {
1591 GemmMicrokernelTester()
1592 .mr(4)
1593 .nr(8)
1594 .kr(1)
1595 .sr(1)
1596 .m(m)
1597 .n(8)
1598 .k(4)
1599 .iterations(1)
1600 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1601 }
1602 }
1603
1604 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
1605 TEST_REQUIRES_ARM_NEON_FMA;
1606 for (uint32_t n = 1; n <= 8; n++) {
1607 GemmMicrokernelTester()
1608 .mr(4)
1609 .nr(8)
1610 .kr(1)
1611 .sr(1)
1612 .m(4)
1613 .n(n)
1614 .k(4)
1615 .iterations(1)
1616 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1617 }
1618 }
1619
1620 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
1621 TEST_REQUIRES_ARM_NEON_FMA;
1622 GemmMicrokernelTester()
1623 .mr(4)
1624 .nr(8)
1625 .kr(1)
1626 .sr(1)
1627 .m(4)
1628 .n(8)
1629 .k(8)
1630 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1631 }
1632
1633 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
1634 TEST_REQUIRES_ARM_NEON_FMA;
1635 GemmMicrokernelTester()
1636 .mr(4)
1637 .nr(8)
1638 .kr(1)
1639 .sr(1)
1640 .m(4)
1641 .n(8)
1642 .k(8)
1643 .a_stride(11)
1644 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1645 }
1646
1647 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
1648 TEST_REQUIRES_ARM_NEON_FMA;
1649 for (uint32_t m = 1; m <= 4; m++) {
1650 for (uint32_t n = 1; n <= 8; n++) {
1651 GemmMicrokernelTester()
1652 .mr(4)
1653 .nr(8)
1654 .kr(1)
1655 .sr(1)
1656 .m(m)
1657 .n(n)
1658 .k(8)
1659 .iterations(1)
1660 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1661 }
1662 }
1663 }
1664
1665 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
1666 TEST_REQUIRES_ARM_NEON_FMA;
1667 for (size_t k = 1; k < 8; k++) {
1668 GemmMicrokernelTester()
1669 .mr(4)
1670 .nr(8)
1671 .kr(1)
1672 .sr(1)
1673 .m(4)
1674 .n(8)
1675 .k(k)
1676 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1677 }
1678 }
1679
1680 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
1681 TEST_REQUIRES_ARM_NEON_FMA;
1682 for (size_t k = 1; k < 8; k++) {
1683 GemmMicrokernelTester()
1684 .mr(4)
1685 .nr(8)
1686 .kr(1)
1687 .sr(1)
1688 .m(4)
1689 .n(8)
1690 .k(k)
1691 .a_stride(11)
1692 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1693 }
1694 }
1695
1696 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
1697 TEST_REQUIRES_ARM_NEON_FMA;
1698 for (size_t k = 1; k < 8; k++) {
1699 for (uint32_t m = 1; m <= 4; m++) {
1700 for (uint32_t n = 1; n <= 8; n++) {
1701 GemmMicrokernelTester()
1702 .mr(4)
1703 .nr(8)
1704 .kr(1)
1705 .sr(1)
1706 .m(m)
1707 .n(n)
1708 .k(k)
1709 .iterations(1)
1710 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1711 }
1712 }
1713 }
1714 }
1715
1716 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
1717 TEST_REQUIRES_ARM_NEON_FMA;
1718 for (size_t k = 9; k < 8; k++) {
1719 GemmMicrokernelTester()
1720 .mr(4)
1721 .nr(8)
1722 .kr(1)
1723 .sr(1)
1724 .m(4)
1725 .n(8)
1726 .k(k)
1727 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1728 }
1729 }
1730
1731 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
1732 TEST_REQUIRES_ARM_NEON_FMA;
1733 for (size_t k = 9; k < 8; k++) {
1734 GemmMicrokernelTester()
1735 .mr(4)
1736 .nr(8)
1737 .kr(1)
1738 .sr(1)
1739 .m(4)
1740 .n(8)
1741 .k(k)
1742 .a_stride(11)
1743 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1744 }
1745 }
1746
1747 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
1748 TEST_REQUIRES_ARM_NEON_FMA;
1749 for (size_t k = 9; k < 8; k++) {
1750 for (uint32_t m = 1; m <= 4; m++) {
1751 for (uint32_t n = 1; n <= 8; n++) {
1752 GemmMicrokernelTester()
1753 .mr(4)
1754 .nr(8)
1755 .kr(1)
1756 .sr(1)
1757 .m(m)
1758 .n(n)
1759 .k(k)
1760 .iterations(1)
1761 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1762 }
1763 }
1764 }
1765 }
1766
1767 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
1768 TEST_REQUIRES_ARM_NEON_FMA;
1769 for (size_t k = 12; k <= 40; k += 4) {
1770 GemmMicrokernelTester()
1771 .mr(4)
1772 .nr(8)
1773 .kr(1)
1774 .sr(1)
1775 .m(4)
1776 .n(8)
1777 .k(k)
1778 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1779 }
1780 }
1781
1782 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
1783 TEST_REQUIRES_ARM_NEON_FMA;
1784 for (size_t k = 12; k <= 40; k += 4) {
1785 GemmMicrokernelTester()
1786 .mr(4)
1787 .nr(8)
1788 .kr(1)
1789 .sr(1)
1790 .m(4)
1791 .n(8)
1792 .k(k)
1793 .a_stride(43)
1794 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1795 }
1796 }
1797
1798 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
1799 TEST_REQUIRES_ARM_NEON_FMA;
1800 for (size_t k = 12; k <= 40; k += 4) {
1801 for (uint32_t m = 1; m <= 4; m++) {
1802 for (uint32_t n = 1; n <= 8; n++) {
1803 GemmMicrokernelTester()
1804 .mr(4)
1805 .nr(8)
1806 .kr(1)
1807 .sr(1)
1808 .m(m)
1809 .n(n)
1810 .k(k)
1811 .iterations(1)
1812 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1813 }
1814 }
1815 }
1816 }
1817
1818 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
1819 TEST_REQUIRES_ARM_NEON_FMA;
1820 for (uint32_t n = 9; n < 16; n++) {
1821 for (size_t k = 1; k <= 20; k += 5) {
1822 GemmMicrokernelTester()
1823 .mr(4)
1824 .nr(8)
1825 .kr(1)
1826 .sr(1)
1827 .m(4)
1828 .n(8)
1829 .k(k)
1830 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1831 }
1832 }
1833 }
1834
1835 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
1836 TEST_REQUIRES_ARM_NEON_FMA;
1837 for (uint32_t n = 9; n < 16; n++) {
1838 for (size_t k = 1; k <= 20; k += 5) {
1839 GemmMicrokernelTester()
1840 .mr(4)
1841 .nr(8)
1842 .kr(1)
1843 .sr(1)
1844 .m(4)
1845 .n(8)
1846 .k(k)
1847 .cn_stride(11)
1848 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1849 }
1850 }
1851 }
1852
1853 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
1854 TEST_REQUIRES_ARM_NEON_FMA;
1855 for (uint32_t n = 9; n < 16; n++) {
1856 for (size_t k = 1; k <= 20; k += 5) {
1857 GemmMicrokernelTester()
1858 .mr(4)
1859 .nr(8)
1860 .kr(1)
1861 .sr(1)
1862 .m(4)
1863 .n(n)
1864 .k(k)
1865 .a_stride(23)
1866 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1867 }
1868 }
1869 }
1870
1871 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
1872 TEST_REQUIRES_ARM_NEON_FMA;
1873 for (uint32_t n = 9; n < 16; n++) {
1874 for (size_t k = 1; k <= 20; k += 5) {
1875 for (uint32_t m = 1; m <= 4; m++) {
1876 GemmMicrokernelTester()
1877 .mr(4)
1878 .nr(8)
1879 .kr(1)
1880 .sr(1)
1881 .m(m)
1882 .n(n)
1883 .k(k)
1884 .iterations(1)
1885 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1886 }
1887 }
1888 }
1889 }
1890
1891 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
1892 TEST_REQUIRES_ARM_NEON_FMA;
1893 for (uint32_t n = 16; n <= 24; n += 8) {
1894 for (size_t k = 1; k <= 20; k += 5) {
1895 GemmMicrokernelTester()
1896 .mr(4)
1897 .nr(8)
1898 .kr(1)
1899 .sr(1)
1900 .m(4)
1901 .n(8)
1902 .k(k)
1903 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1904 }
1905 }
1906 }
1907
1908 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
1909 TEST_REQUIRES_ARM_NEON_FMA;
1910 for (uint32_t n = 16; n <= 24; n += 8) {
1911 for (size_t k = 1; k <= 20; k += 5) {
1912 GemmMicrokernelTester()
1913 .mr(4)
1914 .nr(8)
1915 .kr(1)
1916 .sr(1)
1917 .m(4)
1918 .n(n)
1919 .k(k)
1920 .cn_stride(11)
1921 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1922 }
1923 }
1924 }
1925
1926 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
1927 TEST_REQUIRES_ARM_NEON_FMA;
1928 for (uint32_t n = 16; n <= 24; n += 8) {
1929 for (size_t k = 1; k <= 20; k += 5) {
1930 GemmMicrokernelTester()
1931 .mr(4)
1932 .nr(8)
1933 .kr(1)
1934 .sr(1)
1935 .m(4)
1936 .n(n)
1937 .k(k)
1938 .a_stride(23)
1939 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1940 }
1941 }
1942 }
1943
1944 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
1945 TEST_REQUIRES_ARM_NEON_FMA;
1946 for (uint32_t n = 16; n <= 24; n += 8) {
1947 for (size_t k = 1; k <= 20; k += 5) {
1948 for (uint32_t m = 1; m <= 4; m++) {
1949 GemmMicrokernelTester()
1950 .mr(4)
1951 .nr(8)
1952 .kr(1)
1953 .sr(1)
1954 .m(m)
1955 .n(n)
1956 .k(k)
1957 .iterations(1)
1958 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1959 }
1960 }
1961 }
1962 }
1963
1964 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
1965 TEST_REQUIRES_ARM_NEON_FMA;
1966 for (size_t k = 1; k <= 20; k += 5) {
1967 for (uint32_t m = 1; m <= 4; m++) {
1968 for (uint32_t n = 1; n <= 8; n++) {
1969 GemmMicrokernelTester()
1970 .mr(4)
1971 .nr(8)
1972 .kr(1)
1973 .sr(1)
1974 .m(m)
1975 .n(n)
1976 .k(k)
1977 .cm_stride(11)
1978 .iterations(1)
1979 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1980 }
1981 }
1982 }
1983 }
1984
1985 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
1986 TEST_REQUIRES_ARM_NEON_FMA;
1987 GemmMicrokernelTester()
1988 .mr(4)
1989 .nr(8)
1990 .kr(1)
1991 .sr(1)
1992 .m(4)
1993 .n(8)
1994 .k(4)
1995 .qmin(128)
1996 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1997 }
1998
1999 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
2000 TEST_REQUIRES_ARM_NEON_FMA;
2001 GemmMicrokernelTester()
2002 .mr(4)
2003 .nr(8)
2004 .kr(1)
2005 .sr(1)
2006 .m(4)
2007 .n(8)
2008 .k(4)
2009 .qmax(128)
2010 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2011 }
2012
2013 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
2014 TEST_REQUIRES_ARM_NEON_FMA;
2015 GemmMicrokernelTester()
2016 .mr(4)
2017 .nr(8)
2018 .kr(1)
2019 .sr(1)
2020 .m(4)
2021 .n(8)
2022 .k(4)
2023 .cm_stride(11)
2024 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2025 }
2026#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2027
2028
2029#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2030 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
2031 TEST_REQUIRES_ARM_NEON_FMA;
2032 GemmMicrokernelTester()
2033 .mr(4)
2034 .nr(8)
2035 .kr(1)
2036 .sr(1)
2037 .m(4)
2038 .n(8)
2039 .k(4)
2040 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2041 }
2042
2043 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
2044 TEST_REQUIRES_ARM_NEON_FMA;
2045 GemmMicrokernelTester()
2046 .mr(4)
2047 .nr(8)
2048 .kr(1)
2049 .sr(1)
2050 .m(4)
2051 .n(8)
2052 .k(4)
2053 .cn_stride(11)
2054 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2055 }
2056
2057 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_strided_a) {
2058 TEST_REQUIRES_ARM_NEON_FMA;
2059 GemmMicrokernelTester()
2060 .mr(4)
2061 .nr(8)
2062 .kr(1)
2063 .sr(1)
2064 .m(4)
2065 .n(8)
2066 .k(4)
2067 .a_stride(7)
2068 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2069 }
2070
2071 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
2072 TEST_REQUIRES_ARM_NEON_FMA;
2073 for (uint32_t m = 1; m <= 4; m++) {
2074 for (uint32_t n = 1; n <= 8; n++) {
2075 GemmMicrokernelTester()
2076 .mr(4)
2077 .nr(8)
2078 .kr(1)
2079 .sr(1)
2080 .m(m)
2081 .n(n)
2082 .k(4)
2083 .iterations(1)
2084 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2085 }
2086 }
2087 }
2088
2089 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
2090 TEST_REQUIRES_ARM_NEON_FMA;
2091 for (uint32_t m = 1; m <= 4; m++) {
2092 GemmMicrokernelTester()
2093 .mr(4)
2094 .nr(8)
2095 .kr(1)
2096 .sr(1)
2097 .m(m)
2098 .n(8)
2099 .k(4)
2100 .iterations(1)
2101 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2102 }
2103 }
2104
2105 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
2106 TEST_REQUIRES_ARM_NEON_FMA;
2107 for (uint32_t n = 1; n <= 8; n++) {
2108 GemmMicrokernelTester()
2109 .mr(4)
2110 .nr(8)
2111 .kr(1)
2112 .sr(1)
2113 .m(4)
2114 .n(n)
2115 .k(4)
2116 .iterations(1)
2117 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2118 }
2119 }
2120
2121 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
2122 TEST_REQUIRES_ARM_NEON_FMA;
2123 GemmMicrokernelTester()
2124 .mr(4)
2125 .nr(8)
2126 .kr(1)
2127 .sr(1)
2128 .m(4)
2129 .n(8)
2130 .k(8)
2131 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2132 }
2133
2134 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_strided_a) {
2135 TEST_REQUIRES_ARM_NEON_FMA;
2136 GemmMicrokernelTester()
2137 .mr(4)
2138 .nr(8)
2139 .kr(1)
2140 .sr(1)
2141 .m(4)
2142 .n(8)
2143 .k(8)
2144 .a_stride(11)
2145 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2146 }
2147
2148 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
2149 TEST_REQUIRES_ARM_NEON_FMA;
2150 for (uint32_t m = 1; m <= 4; m++) {
2151 for (uint32_t n = 1; n <= 8; n++) {
2152 GemmMicrokernelTester()
2153 .mr(4)
2154 .nr(8)
2155 .kr(1)
2156 .sr(1)
2157 .m(m)
2158 .n(n)
2159 .k(8)
2160 .iterations(1)
2161 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2162 }
2163 }
2164 }
2165
2166 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
2167 TEST_REQUIRES_ARM_NEON_FMA;
2168 for (size_t k = 1; k < 8; k++) {
2169 GemmMicrokernelTester()
2170 .mr(4)
2171 .nr(8)
2172 .kr(1)
2173 .sr(1)
2174 .m(4)
2175 .n(8)
2176 .k(k)
2177 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2178 }
2179 }
2180
2181 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_strided_a) {
2182 TEST_REQUIRES_ARM_NEON_FMA;
2183 for (size_t k = 1; k < 8; k++) {
2184 GemmMicrokernelTester()
2185 .mr(4)
2186 .nr(8)
2187 .kr(1)
2188 .sr(1)
2189 .m(4)
2190 .n(8)
2191 .k(k)
2192 .a_stride(11)
2193 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2194 }
2195 }
2196
2197 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
2198 TEST_REQUIRES_ARM_NEON_FMA;
2199 for (size_t k = 1; k < 8; k++) {
2200 for (uint32_t m = 1; m <= 4; m++) {
2201 for (uint32_t n = 1; n <= 8; n++) {
2202 GemmMicrokernelTester()
2203 .mr(4)
2204 .nr(8)
2205 .kr(1)
2206 .sr(1)
2207 .m(m)
2208 .n(n)
2209 .k(k)
2210 .iterations(1)
2211 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2212 }
2213 }
2214 }
2215 }
2216
2217 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
2218 TEST_REQUIRES_ARM_NEON_FMA;
2219 for (size_t k = 9; k < 8; k++) {
2220 GemmMicrokernelTester()
2221 .mr(4)
2222 .nr(8)
2223 .kr(1)
2224 .sr(1)
2225 .m(4)
2226 .n(8)
2227 .k(k)
2228 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2229 }
2230 }
2231
2232 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_strided_a) {
2233 TEST_REQUIRES_ARM_NEON_FMA;
2234 for (size_t k = 9; k < 8; k++) {
2235 GemmMicrokernelTester()
2236 .mr(4)
2237 .nr(8)
2238 .kr(1)
2239 .sr(1)
2240 .m(4)
2241 .n(8)
2242 .k(k)
2243 .a_stride(11)
2244 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2245 }
2246 }
2247
2248 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_subtile) {
2249 TEST_REQUIRES_ARM_NEON_FMA;
2250 for (size_t k = 9; k < 8; k++) {
2251 for (uint32_t m = 1; m <= 4; m++) {
2252 for (uint32_t n = 1; n <= 8; n++) {
2253 GemmMicrokernelTester()
2254 .mr(4)
2255 .nr(8)
2256 .kr(1)
2257 .sr(1)
2258 .m(m)
2259 .n(n)
2260 .k(k)
2261 .iterations(1)
2262 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2263 }
2264 }
2265 }
2266 }
2267
2268 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
2269 TEST_REQUIRES_ARM_NEON_FMA;
2270 for (size_t k = 12; k <= 40; k += 4) {
2271 GemmMicrokernelTester()
2272 .mr(4)
2273 .nr(8)
2274 .kr(1)
2275 .sr(1)
2276 .m(4)
2277 .n(8)
2278 .k(k)
2279 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2280 }
2281 }
2282
2283 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_strided_a) {
2284 TEST_REQUIRES_ARM_NEON_FMA;
2285 for (size_t k = 12; k <= 40; k += 4) {
2286 GemmMicrokernelTester()
2287 .mr(4)
2288 .nr(8)
2289 .kr(1)
2290 .sr(1)
2291 .m(4)
2292 .n(8)
2293 .k(k)
2294 .a_stride(43)
2295 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2296 }
2297 }
2298
2299 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
2300 TEST_REQUIRES_ARM_NEON_FMA;
2301 for (size_t k = 12; k <= 40; k += 4) {
2302 for (uint32_t m = 1; m <= 4; m++) {
2303 for (uint32_t n = 1; n <= 8; n++) {
2304 GemmMicrokernelTester()
2305 .mr(4)
2306 .nr(8)
2307 .kr(1)
2308 .sr(1)
2309 .m(m)
2310 .n(n)
2311 .k(k)
2312 .iterations(1)
2313 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2314 }
2315 }
2316 }
2317 }
2318
2319 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
2320 TEST_REQUIRES_ARM_NEON_FMA;
2321 for (uint32_t n = 9; n < 16; n++) {
2322 for (size_t k = 1; k <= 20; k += 5) {
2323 GemmMicrokernelTester()
2324 .mr(4)
2325 .nr(8)
2326 .kr(1)
2327 .sr(1)
2328 .m(4)
2329 .n(8)
2330 .k(k)
2331 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2332 }
2333 }
2334 }
2335
2336 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
2337 TEST_REQUIRES_ARM_NEON_FMA;
2338 for (uint32_t n = 9; n < 16; n++) {
2339 for (size_t k = 1; k <= 20; k += 5) {
2340 GemmMicrokernelTester()
2341 .mr(4)
2342 .nr(8)
2343 .kr(1)
2344 .sr(1)
2345 .m(4)
2346 .n(8)
2347 .k(k)
2348 .cn_stride(11)
2349 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2350 }
2351 }
2352 }
2353
2354 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_a) {
2355 TEST_REQUIRES_ARM_NEON_FMA;
2356 for (uint32_t n = 9; n < 16; n++) {
2357 for (size_t k = 1; k <= 20; k += 5) {
2358 GemmMicrokernelTester()
2359 .mr(4)
2360 .nr(8)
2361 .kr(1)
2362 .sr(1)
2363 .m(4)
2364 .n(n)
2365 .k(k)
2366 .a_stride(23)
2367 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2368 }
2369 }
2370 }
2371
2372 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
2373 TEST_REQUIRES_ARM_NEON_FMA;
2374 for (uint32_t n = 9; n < 16; n++) {
2375 for (size_t k = 1; k <= 20; k += 5) {
2376 for (uint32_t m = 1; m <= 4; m++) {
2377 GemmMicrokernelTester()
2378 .mr(4)
2379 .nr(8)
2380 .kr(1)
2381 .sr(1)
2382 .m(m)
2383 .n(n)
2384 .k(k)
2385 .iterations(1)
2386 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2387 }
2388 }
2389 }
2390 }
2391
2392 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
2393 TEST_REQUIRES_ARM_NEON_FMA;
2394 for (uint32_t n = 16; n <= 24; n += 8) {
2395 for (size_t k = 1; k <= 20; k += 5) {
2396 GemmMicrokernelTester()
2397 .mr(4)
2398 .nr(8)
2399 .kr(1)
2400 .sr(1)
2401 .m(4)
2402 .n(8)
2403 .k(k)
2404 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2405 }
2406 }
2407 }
2408
2409 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
2410 TEST_REQUIRES_ARM_NEON_FMA;
2411 for (uint32_t n = 16; n <= 24; n += 8) {
2412 for (size_t k = 1; k <= 20; k += 5) {
2413 GemmMicrokernelTester()
2414 .mr(4)
2415 .nr(8)
2416 .kr(1)
2417 .sr(1)
2418 .m(4)
2419 .n(n)
2420 .k(k)
2421 .cn_stride(11)
2422 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2423 }
2424 }
2425 }
2426
2427 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_a) {
2428 TEST_REQUIRES_ARM_NEON_FMA;
2429 for (uint32_t n = 16; n <= 24; n += 8) {
2430 for (size_t k = 1; k <= 20; k += 5) {
2431 GemmMicrokernelTester()
2432 .mr(4)
2433 .nr(8)
2434 .kr(1)
2435 .sr(1)
2436 .m(4)
2437 .n(n)
2438 .k(k)
2439 .a_stride(23)
2440 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2441 }
2442 }
2443 }
2444
2445 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
2446 TEST_REQUIRES_ARM_NEON_FMA;
2447 for (uint32_t n = 16; n <= 24; n += 8) {
2448 for (size_t k = 1; k <= 20; k += 5) {
2449 for (uint32_t m = 1; m <= 4; m++) {
2450 GemmMicrokernelTester()
2451 .mr(4)
2452 .nr(8)
2453 .kr(1)
2454 .sr(1)
2455 .m(m)
2456 .n(n)
2457 .k(k)
2458 .iterations(1)
2459 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2460 }
2461 }
2462 }
2463 }
2464
2465 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
2466 TEST_REQUIRES_ARM_NEON_FMA;
2467 for (size_t k = 1; k <= 20; k += 5) {
2468 for (uint32_t m = 1; m <= 4; m++) {
2469 for (uint32_t n = 1; n <= 8; n++) {
2470 GemmMicrokernelTester()
2471 .mr(4)
2472 .nr(8)
2473 .kr(1)
2474 .sr(1)
2475 .m(m)
2476 .n(n)
2477 .k(k)
2478 .cm_stride(11)
2479 .iterations(1)
2480 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2481 }
2482 }
2483 }
2484 }
2485
2486 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
2487 TEST_REQUIRES_ARM_NEON_FMA;
2488 GemmMicrokernelTester()
2489 .mr(4)
2490 .nr(8)
2491 .kr(1)
2492 .sr(1)
2493 .m(4)
2494 .n(8)
2495 .k(4)
2496 .qmin(128)
2497 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2498 }
2499
2500 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
2501 TEST_REQUIRES_ARM_NEON_FMA;
2502 GemmMicrokernelTester()
2503 .mr(4)
2504 .nr(8)
2505 .kr(1)
2506 .sr(1)
2507 .m(4)
2508 .n(8)
2509 .k(4)
2510 .qmax(128)
2511 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2512 }
2513
2514 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
2515 TEST_REQUIRES_ARM_NEON_FMA;
2516 GemmMicrokernelTester()
2517 .mr(4)
2518 .nr(8)
2519 .kr(1)
2520 .sr(1)
2521 .m(4)
2522 .n(8)
2523 .k(4)
2524 .cm_stride(11)
2525 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2526 }
2527#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2528
2529
2530#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2531 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
2532 TEST_REQUIRES_ARM_NEON_FMA;
2533 GemmMicrokernelTester()
2534 .mr(4)
2535 .nr(8)
2536 .kr(1)
2537 .sr(1)
2538 .m(4)
2539 .n(8)
2540 .k(8)
2541 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2542 }
2543
2544 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
2545 TEST_REQUIRES_ARM_NEON_FMA;
2546 GemmMicrokernelTester()
2547 .mr(4)
2548 .nr(8)
2549 .kr(1)
2550 .sr(1)
2551 .m(4)
2552 .n(8)
2553 .k(8)
2554 .cn_stride(11)
2555 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2556 }
2557
2558 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
2559 TEST_REQUIRES_ARM_NEON_FMA;
2560 GemmMicrokernelTester()
2561 .mr(4)
2562 .nr(8)
2563 .kr(1)
2564 .sr(1)
2565 .m(4)
2566 .n(8)
2567 .k(8)
2568 .a_stride(11)
2569 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2570 }
2571
2572 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
2573 TEST_REQUIRES_ARM_NEON_FMA;
2574 for (uint32_t m = 1; m <= 4; m++) {
2575 for (uint32_t n = 1; n <= 8; n++) {
2576 GemmMicrokernelTester()
2577 .mr(4)
2578 .nr(8)
2579 .kr(1)
2580 .sr(1)
2581 .m(m)
2582 .n(n)
2583 .k(8)
2584 .iterations(1)
2585 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2586 }
2587 }
2588 }
2589
2590 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
2591 TEST_REQUIRES_ARM_NEON_FMA;
2592 for (uint32_t m = 1; m <= 4; m++) {
2593 GemmMicrokernelTester()
2594 .mr(4)
2595 .nr(8)
2596 .kr(1)
2597 .sr(1)
2598 .m(m)
2599 .n(8)
2600 .k(8)
2601 .iterations(1)
2602 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2603 }
2604 }
2605
2606 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
2607 TEST_REQUIRES_ARM_NEON_FMA;
2608 for (uint32_t n = 1; n <= 8; n++) {
2609 GemmMicrokernelTester()
2610 .mr(4)
2611 .nr(8)
2612 .kr(1)
2613 .sr(1)
2614 .m(4)
2615 .n(n)
2616 .k(8)
2617 .iterations(1)
2618 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2619 }
2620 }
2621
2622 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
2623 TEST_REQUIRES_ARM_NEON_FMA;
2624 GemmMicrokernelTester()
2625 .mr(4)
2626 .nr(8)
2627 .kr(1)
2628 .sr(1)
2629 .m(4)
2630 .n(8)
2631 .k(16)
2632 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2633 }
2634
2635 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
2636 TEST_REQUIRES_ARM_NEON_FMA;
2637 GemmMicrokernelTester()
2638 .mr(4)
2639 .nr(8)
2640 .kr(1)
2641 .sr(1)
2642 .m(4)
2643 .n(8)
2644 .k(16)
2645 .a_stride(19)
2646 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2647 }
2648
2649 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
2650 TEST_REQUIRES_ARM_NEON_FMA;
2651 for (uint32_t m = 1; m <= 4; m++) {
2652 for (uint32_t n = 1; n <= 8; n++) {
2653 GemmMicrokernelTester()
2654 .mr(4)
2655 .nr(8)
2656 .kr(1)
2657 .sr(1)
2658 .m(m)
2659 .n(n)
2660 .k(16)
2661 .iterations(1)
2662 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2663 }
2664 }
2665 }
2666
2667 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
2668 TEST_REQUIRES_ARM_NEON_FMA;
2669 for (size_t k = 1; k < 16; k++) {
2670 GemmMicrokernelTester()
2671 .mr(4)
2672 .nr(8)
2673 .kr(1)
2674 .sr(1)
2675 .m(4)
2676 .n(8)
2677 .k(k)
2678 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2679 }
2680 }
2681
2682 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
2683 TEST_REQUIRES_ARM_NEON_FMA;
2684 for (size_t k = 1; k < 16; k++) {
2685 GemmMicrokernelTester()
2686 .mr(4)
2687 .nr(8)
2688 .kr(1)
2689 .sr(1)
2690 .m(4)
2691 .n(8)
2692 .k(k)
2693 .a_stride(19)
2694 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2695 }
2696 }
2697
2698 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
2699 TEST_REQUIRES_ARM_NEON_FMA;
2700 for (size_t k = 1; k < 16; k++) {
2701 for (uint32_t m = 1; m <= 4; m++) {
2702 for (uint32_t n = 1; n <= 8; n++) {
2703 GemmMicrokernelTester()
2704 .mr(4)
2705 .nr(8)
2706 .kr(1)
2707 .sr(1)
2708 .m(m)
2709 .n(n)
2710 .k(k)
2711 .iterations(1)
2712 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2713 }
2714 }
2715 }
2716 }
2717
2718 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
2719 TEST_REQUIRES_ARM_NEON_FMA;
2720 for (size_t k = 17; k < 16; k++) {
2721 GemmMicrokernelTester()
2722 .mr(4)
2723 .nr(8)
2724 .kr(1)
2725 .sr(1)
2726 .m(4)
2727 .n(8)
2728 .k(k)
2729 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2730 }
2731 }
2732
2733 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
2734 TEST_REQUIRES_ARM_NEON_FMA;
2735 for (size_t k = 17; k < 16; k++) {
2736 GemmMicrokernelTester()
2737 .mr(4)
2738 .nr(8)
2739 .kr(1)
2740 .sr(1)
2741 .m(4)
2742 .n(8)
2743 .k(k)
2744 .a_stride(19)
2745 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2746 }
2747 }
2748
2749 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
2750 TEST_REQUIRES_ARM_NEON_FMA;
2751 for (size_t k = 17; k < 16; k++) {
2752 for (uint32_t m = 1; m <= 4; m++) {
2753 for (uint32_t n = 1; n <= 8; n++) {
2754 GemmMicrokernelTester()
2755 .mr(4)
2756 .nr(8)
2757 .kr(1)
2758 .sr(1)
2759 .m(m)
2760 .n(n)
2761 .k(k)
2762 .iterations(1)
2763 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2764 }
2765 }
2766 }
2767 }
2768
2769 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
2770 TEST_REQUIRES_ARM_NEON_FMA;
2771 for (size_t k = 24; k <= 80; k += 8) {
2772 GemmMicrokernelTester()
2773 .mr(4)
2774 .nr(8)
2775 .kr(1)
2776 .sr(1)
2777 .m(4)
2778 .n(8)
2779 .k(k)
2780 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2781 }
2782 }
2783
2784 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
2785 TEST_REQUIRES_ARM_NEON_FMA;
2786 for (size_t k = 24; k <= 80; k += 8) {
2787 GemmMicrokernelTester()
2788 .mr(4)
2789 .nr(8)
2790 .kr(1)
2791 .sr(1)
2792 .m(4)
2793 .n(8)
2794 .k(k)
2795 .a_stride(83)
2796 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2797 }
2798 }
2799
2800 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
2801 TEST_REQUIRES_ARM_NEON_FMA;
2802 for (size_t k = 24; k <= 80; k += 8) {
2803 for (uint32_t m = 1; m <= 4; m++) {
2804 for (uint32_t n = 1; n <= 8; n++) {
2805 GemmMicrokernelTester()
2806 .mr(4)
2807 .nr(8)
2808 .kr(1)
2809 .sr(1)
2810 .m(m)
2811 .n(n)
2812 .k(k)
2813 .iterations(1)
2814 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2815 }
2816 }
2817 }
2818 }
2819
2820 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
2821 TEST_REQUIRES_ARM_NEON_FMA;
2822 for (uint32_t n = 9; n < 16; n++) {
2823 for (size_t k = 1; k <= 40; k += 9) {
2824 GemmMicrokernelTester()
2825 .mr(4)
2826 .nr(8)
2827 .kr(1)
2828 .sr(1)
2829 .m(4)
2830 .n(8)
2831 .k(k)
2832 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2833 }
2834 }
2835 }
2836
2837 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
2838 TEST_REQUIRES_ARM_NEON_FMA;
2839 for (uint32_t n = 9; n < 16; n++) {
2840 for (size_t k = 1; k <= 40; k += 9) {
2841 GemmMicrokernelTester()
2842 .mr(4)
2843 .nr(8)
2844 .kr(1)
2845 .sr(1)
2846 .m(4)
2847 .n(8)
2848 .k(k)
2849 .cn_stride(11)
2850 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2851 }
2852 }
2853 }
2854
2855 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
2856 TEST_REQUIRES_ARM_NEON_FMA;
2857 for (uint32_t n = 9; n < 16; n++) {
2858 for (size_t k = 1; k <= 40; k += 9) {
2859 GemmMicrokernelTester()
2860 .mr(4)
2861 .nr(8)
2862 .kr(1)
2863 .sr(1)
2864 .m(4)
2865 .n(n)
2866 .k(k)
2867 .a_stride(43)
2868 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2869 }
2870 }
2871 }
2872
2873 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
2874 TEST_REQUIRES_ARM_NEON_FMA;
2875 for (uint32_t n = 9; n < 16; n++) {
2876 for (size_t k = 1; k <= 40; k += 9) {
2877 for (uint32_t m = 1; m <= 4; m++) {
2878 GemmMicrokernelTester()
2879 .mr(4)
2880 .nr(8)
2881 .kr(1)
2882 .sr(1)
2883 .m(m)
2884 .n(n)
2885 .k(k)
2886 .iterations(1)
2887 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2888 }
2889 }
2890 }
2891 }
2892
2893 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
2894 TEST_REQUIRES_ARM_NEON_FMA;
2895 for (uint32_t n = 16; n <= 24; n += 8) {
2896 for (size_t k = 1; k <= 40; k += 9) {
2897 GemmMicrokernelTester()
2898 .mr(4)
2899 .nr(8)
2900 .kr(1)
2901 .sr(1)
2902 .m(4)
2903 .n(8)
2904 .k(k)
2905 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2906 }
2907 }
2908 }
2909
2910 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
2911 TEST_REQUIRES_ARM_NEON_FMA;
2912 for (uint32_t n = 16; n <= 24; n += 8) {
2913 for (size_t k = 1; k <= 40; k += 9) {
2914 GemmMicrokernelTester()
2915 .mr(4)
2916 .nr(8)
2917 .kr(1)
2918 .sr(1)
2919 .m(4)
2920 .n(n)
2921 .k(k)
2922 .cn_stride(11)
2923 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2924 }
2925 }
2926 }
2927
2928 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
2929 TEST_REQUIRES_ARM_NEON_FMA;
2930 for (uint32_t n = 16; n <= 24; n += 8) {
2931 for (size_t k = 1; k <= 40; k += 9) {
2932 GemmMicrokernelTester()
2933 .mr(4)
2934 .nr(8)
2935 .kr(1)
2936 .sr(1)
2937 .m(4)
2938 .n(n)
2939 .k(k)
2940 .a_stride(43)
2941 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2942 }
2943 }
2944 }
2945
2946 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
2947 TEST_REQUIRES_ARM_NEON_FMA;
2948 for (uint32_t n = 16; n <= 24; n += 8) {
2949 for (size_t k = 1; k <= 40; k += 9) {
2950 for (uint32_t m = 1; m <= 4; m++) {
2951 GemmMicrokernelTester()
2952 .mr(4)
2953 .nr(8)
2954 .kr(1)
2955 .sr(1)
2956 .m(m)
2957 .n(n)
2958 .k(k)
2959 .iterations(1)
2960 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2961 }
2962 }
2963 }
2964 }
2965
2966 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
2967 TEST_REQUIRES_ARM_NEON_FMA;
2968 for (size_t k = 1; k <= 40; k += 9) {
2969 for (uint32_t m = 1; m <= 4; m++) {
2970 for (uint32_t n = 1; n <= 8; n++) {
2971 GemmMicrokernelTester()
2972 .mr(4)
2973 .nr(8)
2974 .kr(1)
2975 .sr(1)
2976 .m(m)
2977 .n(n)
2978 .k(k)
2979 .cm_stride(11)
2980 .iterations(1)
2981 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2982 }
2983 }
2984 }
2985 }
2986
2987 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
2988 TEST_REQUIRES_ARM_NEON_FMA;
2989 GemmMicrokernelTester()
2990 .mr(4)
2991 .nr(8)
2992 .kr(1)
2993 .sr(1)
2994 .m(4)
2995 .n(8)
2996 .k(8)
2997 .qmin(128)
2998 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2999 }
3000
3001 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
3002 TEST_REQUIRES_ARM_NEON_FMA;
3003 GemmMicrokernelTester()
3004 .mr(4)
3005 .nr(8)
3006 .kr(1)
3007 .sr(1)
3008 .m(4)
3009 .n(8)
3010 .k(8)
3011 .qmax(128)
3012 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
3013 }
3014
3015 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
3016 TEST_REQUIRES_ARM_NEON_FMA;
3017 GemmMicrokernelTester()
3018 .mr(4)
3019 .nr(8)
3020 .kr(1)
3021 .sr(1)
3022 .m(4)
3023 .n(8)
3024 .k(8)
3025 .cm_stride(11)
3026 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
3027 }
3028#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3029
3030
3031#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3032 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
3033 TEST_REQUIRES_ARM_NEON_FMA;
3034 GemmMicrokernelTester()
3035 .mr(4)
3036 .nr(8)
3037 .kr(1)
3038 .sr(1)
3039 .m(4)
3040 .n(8)
3041 .k(8)
3042 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3043 }
3044
3045 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
3046 TEST_REQUIRES_ARM_NEON_FMA;
3047 GemmMicrokernelTester()
3048 .mr(4)
3049 .nr(8)
3050 .kr(1)
3051 .sr(1)
3052 .m(4)
3053 .n(8)
3054 .k(8)
3055 .cn_stride(11)
3056 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3057 }
3058
3059 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
3060 TEST_REQUIRES_ARM_NEON_FMA;
3061 GemmMicrokernelTester()
3062 .mr(4)
3063 .nr(8)
3064 .kr(1)
3065 .sr(1)
3066 .m(4)
3067 .n(8)
3068 .k(8)
3069 .a_stride(11)
3070 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3071 }
3072
3073 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
3074 TEST_REQUIRES_ARM_NEON_FMA;
3075 for (uint32_t m = 1; m <= 4; m++) {
3076 for (uint32_t n = 1; n <= 8; n++) {
3077 GemmMicrokernelTester()
3078 .mr(4)
3079 .nr(8)
3080 .kr(1)
3081 .sr(1)
3082 .m(m)
3083 .n(n)
3084 .k(8)
3085 .iterations(1)
3086 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3087 }
3088 }
3089 }
3090
3091 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
3092 TEST_REQUIRES_ARM_NEON_FMA;
3093 for (uint32_t m = 1; m <= 4; m++) {
3094 GemmMicrokernelTester()
3095 .mr(4)
3096 .nr(8)
3097 .kr(1)
3098 .sr(1)
3099 .m(m)
3100 .n(8)
3101 .k(8)
3102 .iterations(1)
3103 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3104 }
3105 }
3106
3107 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
3108 TEST_REQUIRES_ARM_NEON_FMA;
3109 for (uint32_t n = 1; n <= 8; n++) {
3110 GemmMicrokernelTester()
3111 .mr(4)
3112 .nr(8)
3113 .kr(1)
3114 .sr(1)
3115 .m(4)
3116 .n(n)
3117 .k(8)
3118 .iterations(1)
3119 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3120 }
3121 }
3122
3123 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
3124 TEST_REQUIRES_ARM_NEON_FMA;
3125 GemmMicrokernelTester()
3126 .mr(4)
3127 .nr(8)
3128 .kr(1)
3129 .sr(1)
3130 .m(4)
3131 .n(8)
3132 .k(16)
3133 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3134 }
3135
3136 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
3137 TEST_REQUIRES_ARM_NEON_FMA;
3138 GemmMicrokernelTester()
3139 .mr(4)
3140 .nr(8)
3141 .kr(1)
3142 .sr(1)
3143 .m(4)
3144 .n(8)
3145 .k(16)
3146 .a_stride(19)
3147 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3148 }
3149
3150 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
3151 TEST_REQUIRES_ARM_NEON_FMA;
3152 for (uint32_t m = 1; m <= 4; m++) {
3153 for (uint32_t n = 1; n <= 8; n++) {
3154 GemmMicrokernelTester()
3155 .mr(4)
3156 .nr(8)
3157 .kr(1)
3158 .sr(1)
3159 .m(m)
3160 .n(n)
3161 .k(16)
3162 .iterations(1)
3163 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3164 }
3165 }
3166 }
3167
3168 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
3169 TEST_REQUIRES_ARM_NEON_FMA;
3170 for (size_t k = 1; k < 16; k++) {
3171 GemmMicrokernelTester()
3172 .mr(4)
3173 .nr(8)
3174 .kr(1)
3175 .sr(1)
3176 .m(4)
3177 .n(8)
3178 .k(k)
3179 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3180 }
3181 }
3182
3183 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
3184 TEST_REQUIRES_ARM_NEON_FMA;
3185 for (size_t k = 1; k < 16; k++) {
3186 GemmMicrokernelTester()
3187 .mr(4)
3188 .nr(8)
3189 .kr(1)
3190 .sr(1)
3191 .m(4)
3192 .n(8)
3193 .k(k)
3194 .a_stride(19)
3195 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3196 }
3197 }
3198
3199 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
3200 TEST_REQUIRES_ARM_NEON_FMA;
3201 for (size_t k = 1; k < 16; k++) {
3202 for (uint32_t m = 1; m <= 4; m++) {
3203 for (uint32_t n = 1; n <= 8; n++) {
3204 GemmMicrokernelTester()
3205 .mr(4)
3206 .nr(8)
3207 .kr(1)
3208 .sr(1)
3209 .m(m)
3210 .n(n)
3211 .k(k)
3212 .iterations(1)
3213 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3214 }
3215 }
3216 }
3217 }
3218
3219 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
3220 TEST_REQUIRES_ARM_NEON_FMA;
3221 for (size_t k = 17; k < 16; k++) {
3222 GemmMicrokernelTester()
3223 .mr(4)
3224 .nr(8)
3225 .kr(1)
3226 .sr(1)
3227 .m(4)
3228 .n(8)
3229 .k(k)
3230 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3231 }
3232 }
3233
3234 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
3235 TEST_REQUIRES_ARM_NEON_FMA;
3236 for (size_t k = 17; k < 16; k++) {
3237 GemmMicrokernelTester()
3238 .mr(4)
3239 .nr(8)
3240 .kr(1)
3241 .sr(1)
3242 .m(4)
3243 .n(8)
3244 .k(k)
3245 .a_stride(19)
3246 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3247 }
3248 }
3249
3250 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
3251 TEST_REQUIRES_ARM_NEON_FMA;
3252 for (size_t k = 17; k < 16; k++) {
3253 for (uint32_t m = 1; m <= 4; m++) {
3254 for (uint32_t n = 1; n <= 8; n++) {
3255 GemmMicrokernelTester()
3256 .mr(4)
3257 .nr(8)
3258 .kr(1)
3259 .sr(1)
3260 .m(m)
3261 .n(n)
3262 .k(k)
3263 .iterations(1)
3264 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3265 }
3266 }
3267 }
3268 }
3269
3270 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
3271 TEST_REQUIRES_ARM_NEON_FMA;
3272 for (size_t k = 24; k <= 80; k += 8) {
3273 GemmMicrokernelTester()
3274 .mr(4)
3275 .nr(8)
3276 .kr(1)
3277 .sr(1)
3278 .m(4)
3279 .n(8)
3280 .k(k)
3281 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3282 }
3283 }
3284
3285 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
3286 TEST_REQUIRES_ARM_NEON_FMA;
3287 for (size_t k = 24; k <= 80; k += 8) {
3288 GemmMicrokernelTester()
3289 .mr(4)
3290 .nr(8)
3291 .kr(1)
3292 .sr(1)
3293 .m(4)
3294 .n(8)
3295 .k(k)
3296 .a_stride(83)
3297 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3298 }
3299 }
3300
3301 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
3302 TEST_REQUIRES_ARM_NEON_FMA;
3303 for (size_t k = 24; k <= 80; k += 8) {
3304 for (uint32_t m = 1; m <= 4; m++) {
3305 for (uint32_t n = 1; n <= 8; n++) {
3306 GemmMicrokernelTester()
3307 .mr(4)
3308 .nr(8)
3309 .kr(1)
3310 .sr(1)
3311 .m(m)
3312 .n(n)
3313 .k(k)
3314 .iterations(1)
3315 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3316 }
3317 }
3318 }
3319 }
3320
3321 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
3322 TEST_REQUIRES_ARM_NEON_FMA;
3323 for (uint32_t n = 9; n < 16; n++) {
3324 for (size_t k = 1; k <= 40; k += 9) {
3325 GemmMicrokernelTester()
3326 .mr(4)
3327 .nr(8)
3328 .kr(1)
3329 .sr(1)
3330 .m(4)
3331 .n(8)
3332 .k(k)
3333 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3334 }
3335 }
3336 }
3337
3338 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
3339 TEST_REQUIRES_ARM_NEON_FMA;
3340 for (uint32_t n = 9; n < 16; n++) {
3341 for (size_t k = 1; k <= 40; k += 9) {
3342 GemmMicrokernelTester()
3343 .mr(4)
3344 .nr(8)
3345 .kr(1)
3346 .sr(1)
3347 .m(4)
3348 .n(8)
3349 .k(k)
3350 .cn_stride(11)
3351 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3352 }
3353 }
3354 }
3355
3356 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
3357 TEST_REQUIRES_ARM_NEON_FMA;
3358 for (uint32_t n = 9; n < 16; n++) {
3359 for (size_t k = 1; k <= 40; k += 9) {
3360 GemmMicrokernelTester()
3361 .mr(4)
3362 .nr(8)
3363 .kr(1)
3364 .sr(1)
3365 .m(4)
3366 .n(n)
3367 .k(k)
3368 .a_stride(43)
3369 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3370 }
3371 }
3372 }
3373
3374 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
3375 TEST_REQUIRES_ARM_NEON_FMA;
3376 for (uint32_t n = 9; n < 16; n++) {
3377 for (size_t k = 1; k <= 40; k += 9) {
3378 for (uint32_t m = 1; m <= 4; m++) {
3379 GemmMicrokernelTester()
3380 .mr(4)
3381 .nr(8)
3382 .kr(1)
3383 .sr(1)
3384 .m(m)
3385 .n(n)
3386 .k(k)
3387 .iterations(1)
3388 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3389 }
3390 }
3391 }
3392 }
3393
3394 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
3395 TEST_REQUIRES_ARM_NEON_FMA;
3396 for (uint32_t n = 16; n <= 24; n += 8) {
3397 for (size_t k = 1; k <= 40; k += 9) {
3398 GemmMicrokernelTester()
3399 .mr(4)
3400 .nr(8)
3401 .kr(1)
3402 .sr(1)
3403 .m(4)
3404 .n(8)
3405 .k(k)
3406 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3407 }
3408 }
3409 }
3410
3411 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
3412 TEST_REQUIRES_ARM_NEON_FMA;
3413 for (uint32_t n = 16; n <= 24; n += 8) {
3414 for (size_t k = 1; k <= 40; k += 9) {
3415 GemmMicrokernelTester()
3416 .mr(4)
3417 .nr(8)
3418 .kr(1)
3419 .sr(1)
3420 .m(4)
3421 .n(n)
3422 .k(k)
3423 .cn_stride(11)
3424 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3425 }
3426 }
3427 }
3428
3429 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
3430 TEST_REQUIRES_ARM_NEON_FMA;
3431 for (uint32_t n = 16; n <= 24; n += 8) {
3432 for (size_t k = 1; k <= 40; k += 9) {
3433 GemmMicrokernelTester()
3434 .mr(4)
3435 .nr(8)
3436 .kr(1)
3437 .sr(1)
3438 .m(4)
3439 .n(n)
3440 .k(k)
3441 .a_stride(43)
3442 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3443 }
3444 }
3445 }
3446
3447 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
3448 TEST_REQUIRES_ARM_NEON_FMA;
3449 for (uint32_t n = 16; n <= 24; n += 8) {
3450 for (size_t k = 1; k <= 40; k += 9) {
3451 for (uint32_t m = 1; m <= 4; m++) {
3452 GemmMicrokernelTester()
3453 .mr(4)
3454 .nr(8)
3455 .kr(1)
3456 .sr(1)
3457 .m(m)
3458 .n(n)
3459 .k(k)
3460 .iterations(1)
3461 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3462 }
3463 }
3464 }
3465 }
3466
3467 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
3468 TEST_REQUIRES_ARM_NEON_FMA;
3469 for (size_t k = 1; k <= 40; k += 9) {
3470 for (uint32_t m = 1; m <= 4; m++) {
3471 for (uint32_t n = 1; n <= 8; n++) {
3472 GemmMicrokernelTester()
3473 .mr(4)
3474 .nr(8)
3475 .kr(1)
3476 .sr(1)
3477 .m(m)
3478 .n(n)
3479 .k(k)
3480 .cm_stride(11)
3481 .iterations(1)
3482 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3483 }
3484 }
3485 }
3486 }
3487
3488 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
3489 TEST_REQUIRES_ARM_NEON_FMA;
3490 GemmMicrokernelTester()
3491 .mr(4)
3492 .nr(8)
3493 .kr(1)
3494 .sr(1)
3495 .m(4)
3496 .n(8)
3497 .k(8)
3498 .qmin(128)
3499 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3500 }
3501
3502 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
3503 TEST_REQUIRES_ARM_NEON_FMA;
3504 GemmMicrokernelTester()
3505 .mr(4)
3506 .nr(8)
3507 .kr(1)
3508 .sr(1)
3509 .m(4)
3510 .n(8)
3511 .k(8)
3512 .qmax(128)
3513 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3514 }
3515
3516 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3517 TEST_REQUIRES_ARM_NEON_FMA;
3518 GemmMicrokernelTester()
3519 .mr(4)
3520 .nr(8)
3521 .kr(1)
3522 .sr(1)
3523 .m(4)
3524 .n(8)
3525 .k(8)
3526 .cm_stride(11)
3527 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3528 }
3529#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3530
3531
3532#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3533 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
3534 TEST_REQUIRES_ARM_NEON_FMA;
3535 GemmMicrokernelTester()
3536 .mr(5)
3537 .nr(8)
3538 .kr(1)
3539 .sr(1)
3540 .m(5)
3541 .n(8)
3542 .k(8)
3543 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3544 }
3545
3546 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
3547 TEST_REQUIRES_ARM_NEON_FMA;
3548 GemmMicrokernelTester()
3549 .mr(5)
3550 .nr(8)
3551 .kr(1)
3552 .sr(1)
3553 .m(5)
3554 .n(8)
3555 .k(8)
3556 .cn_stride(11)
3557 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3558 }
3559
3560 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
3561 TEST_REQUIRES_ARM_NEON_FMA;
3562 GemmMicrokernelTester()
3563 .mr(5)
3564 .nr(8)
3565 .kr(1)
3566 .sr(1)
3567 .m(5)
3568 .n(8)
3569 .k(8)
3570 .a_stride(11)
3571 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3572 }
3573
3574 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
3575 TEST_REQUIRES_ARM_NEON_FMA;
3576 for (uint32_t m = 1; m <= 5; m++) {
3577 for (uint32_t n = 1; n <= 8; n++) {
3578 GemmMicrokernelTester()
3579 .mr(5)
3580 .nr(8)
3581 .kr(1)
3582 .sr(1)
3583 .m(m)
3584 .n(n)
3585 .k(8)
3586 .iterations(1)
3587 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3588 }
3589 }
3590 }
3591
3592 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
3593 TEST_REQUIRES_ARM_NEON_FMA;
3594 for (uint32_t m = 1; m <= 5; m++) {
3595 GemmMicrokernelTester()
3596 .mr(5)
3597 .nr(8)
3598 .kr(1)
3599 .sr(1)
3600 .m(m)
3601 .n(8)
3602 .k(8)
3603 .iterations(1)
3604 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3605 }
3606 }
3607
3608 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
3609 TEST_REQUIRES_ARM_NEON_FMA;
3610 for (uint32_t n = 1; n <= 8; n++) {
3611 GemmMicrokernelTester()
3612 .mr(5)
3613 .nr(8)
3614 .kr(1)
3615 .sr(1)
3616 .m(5)
3617 .n(n)
3618 .k(8)
3619 .iterations(1)
3620 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3621 }
3622 }
3623
3624 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
3625 TEST_REQUIRES_ARM_NEON_FMA;
3626 GemmMicrokernelTester()
3627 .mr(5)
3628 .nr(8)
3629 .kr(1)
3630 .sr(1)
3631 .m(5)
3632 .n(8)
3633 .k(16)
3634 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3635 }
3636
3637 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
3638 TEST_REQUIRES_ARM_NEON_FMA;
3639 GemmMicrokernelTester()
3640 .mr(5)
3641 .nr(8)
3642 .kr(1)
3643 .sr(1)
3644 .m(5)
3645 .n(8)
3646 .k(16)
3647 .a_stride(19)
3648 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3649 }
3650
3651 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
3652 TEST_REQUIRES_ARM_NEON_FMA;
3653 for (uint32_t m = 1; m <= 5; m++) {
3654 for (uint32_t n = 1; n <= 8; n++) {
3655 GemmMicrokernelTester()
3656 .mr(5)
3657 .nr(8)
3658 .kr(1)
3659 .sr(1)
3660 .m(m)
3661 .n(n)
3662 .k(16)
3663 .iterations(1)
3664 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3665 }
3666 }
3667 }
3668
3669 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
3670 TEST_REQUIRES_ARM_NEON_FMA;
3671 for (size_t k = 1; k < 16; k++) {
3672 GemmMicrokernelTester()
3673 .mr(5)
3674 .nr(8)
3675 .kr(1)
3676 .sr(1)
3677 .m(5)
3678 .n(8)
3679 .k(k)
3680 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3681 }
3682 }
3683
3684 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
3685 TEST_REQUIRES_ARM_NEON_FMA;
3686 for (size_t k = 1; k < 16; k++) {
3687 GemmMicrokernelTester()
3688 .mr(5)
3689 .nr(8)
3690 .kr(1)
3691 .sr(1)
3692 .m(5)
3693 .n(8)
3694 .k(k)
3695 .a_stride(19)
3696 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3697 }
3698 }
3699
3700 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
3701 TEST_REQUIRES_ARM_NEON_FMA;
3702 for (size_t k = 1; k < 16; k++) {
3703 for (uint32_t m = 1; m <= 5; m++) {
3704 for (uint32_t n = 1; n <= 8; n++) {
3705 GemmMicrokernelTester()
3706 .mr(5)
3707 .nr(8)
3708 .kr(1)
3709 .sr(1)
3710 .m(m)
3711 .n(n)
3712 .k(k)
3713 .iterations(1)
3714 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3715 }
3716 }
3717 }
3718 }
3719
3720 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
3721 TEST_REQUIRES_ARM_NEON_FMA;
3722 for (size_t k = 17; k < 16; k++) {
3723 GemmMicrokernelTester()
3724 .mr(5)
3725 .nr(8)
3726 .kr(1)
3727 .sr(1)
3728 .m(5)
3729 .n(8)
3730 .k(k)
3731 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3732 }
3733 }
3734
3735 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
3736 TEST_REQUIRES_ARM_NEON_FMA;
3737 for (size_t k = 17; k < 16; k++) {
3738 GemmMicrokernelTester()
3739 .mr(5)
3740 .nr(8)
3741 .kr(1)
3742 .sr(1)
3743 .m(5)
3744 .n(8)
3745 .k(k)
3746 .a_stride(19)
3747 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3748 }
3749 }
3750
3751 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
3752 TEST_REQUIRES_ARM_NEON_FMA;
3753 for (size_t k = 17; k < 16; k++) {
3754 for (uint32_t m = 1; m <= 5; m++) {
3755 for (uint32_t n = 1; n <= 8; n++) {
3756 GemmMicrokernelTester()
3757 .mr(5)
3758 .nr(8)
3759 .kr(1)
3760 .sr(1)
3761 .m(m)
3762 .n(n)
3763 .k(k)
3764 .iterations(1)
3765 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3766 }
3767 }
3768 }
3769 }
3770
3771 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
3772 TEST_REQUIRES_ARM_NEON_FMA;
3773 for (size_t k = 24; k <= 80; k += 8) {
3774 GemmMicrokernelTester()
3775 .mr(5)
3776 .nr(8)
3777 .kr(1)
3778 .sr(1)
3779 .m(5)
3780 .n(8)
3781 .k(k)
3782 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3783 }
3784 }
3785
3786 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
3787 TEST_REQUIRES_ARM_NEON_FMA;
3788 for (size_t k = 24; k <= 80; k += 8) {
3789 GemmMicrokernelTester()
3790 .mr(5)
3791 .nr(8)
3792 .kr(1)
3793 .sr(1)
3794 .m(5)
3795 .n(8)
3796 .k(k)
3797 .a_stride(83)
3798 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3799 }
3800 }
3801
3802 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
3803 TEST_REQUIRES_ARM_NEON_FMA;
3804 for (size_t k = 24; k <= 80; k += 8) {
3805 for (uint32_t m = 1; m <= 5; m++) {
3806 for (uint32_t n = 1; n <= 8; n++) {
3807 GemmMicrokernelTester()
3808 .mr(5)
3809 .nr(8)
3810 .kr(1)
3811 .sr(1)
3812 .m(m)
3813 .n(n)
3814 .k(k)
3815 .iterations(1)
3816 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3817 }
3818 }
3819 }
3820 }
3821
3822 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
3823 TEST_REQUIRES_ARM_NEON_FMA;
3824 for (uint32_t n = 9; n < 16; n++) {
3825 for (size_t k = 1; k <= 40; k += 9) {
3826 GemmMicrokernelTester()
3827 .mr(5)
3828 .nr(8)
3829 .kr(1)
3830 .sr(1)
3831 .m(5)
3832 .n(8)
3833 .k(k)
3834 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3835 }
3836 }
3837 }
3838
3839 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
3840 TEST_REQUIRES_ARM_NEON_FMA;
3841 for (uint32_t n = 9; n < 16; n++) {
3842 for (size_t k = 1; k <= 40; k += 9) {
3843 GemmMicrokernelTester()
3844 .mr(5)
3845 .nr(8)
3846 .kr(1)
3847 .sr(1)
3848 .m(5)
3849 .n(8)
3850 .k(k)
3851 .cn_stride(11)
3852 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3853 }
3854 }
3855 }
3856
3857 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
3858 TEST_REQUIRES_ARM_NEON_FMA;
3859 for (uint32_t n = 9; n < 16; n++) {
3860 for (size_t k = 1; k <= 40; k += 9) {
3861 GemmMicrokernelTester()
3862 .mr(5)
3863 .nr(8)
3864 .kr(1)
3865 .sr(1)
3866 .m(5)
3867 .n(n)
3868 .k(k)
3869 .a_stride(43)
3870 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3871 }
3872 }
3873 }
3874
3875 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
3876 TEST_REQUIRES_ARM_NEON_FMA;
3877 for (uint32_t n = 9; n < 16; n++) {
3878 for (size_t k = 1; k <= 40; k += 9) {
3879 for (uint32_t m = 1; m <= 5; m++) {
3880 GemmMicrokernelTester()
3881 .mr(5)
3882 .nr(8)
3883 .kr(1)
3884 .sr(1)
3885 .m(m)
3886 .n(n)
3887 .k(k)
3888 .iterations(1)
3889 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3890 }
3891 }
3892 }
3893 }
3894
3895 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
3896 TEST_REQUIRES_ARM_NEON_FMA;
3897 for (uint32_t n = 16; n <= 24; n += 8) {
3898 for (size_t k = 1; k <= 40; k += 9) {
3899 GemmMicrokernelTester()
3900 .mr(5)
3901 .nr(8)
3902 .kr(1)
3903 .sr(1)
3904 .m(5)
3905 .n(8)
3906 .k(k)
3907 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3908 }
3909 }
3910 }
3911
3912 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
3913 TEST_REQUIRES_ARM_NEON_FMA;
3914 for (uint32_t n = 16; n <= 24; n += 8) {
3915 for (size_t k = 1; k <= 40; k += 9) {
3916 GemmMicrokernelTester()
3917 .mr(5)
3918 .nr(8)
3919 .kr(1)
3920 .sr(1)
3921 .m(5)
3922 .n(n)
3923 .k(k)
3924 .cn_stride(11)
3925 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3926 }
3927 }
3928 }
3929
3930 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
3931 TEST_REQUIRES_ARM_NEON_FMA;
3932 for (uint32_t n = 16; n <= 24; n += 8) {
3933 for (size_t k = 1; k <= 40; k += 9) {
3934 GemmMicrokernelTester()
3935 .mr(5)
3936 .nr(8)
3937 .kr(1)
3938 .sr(1)
3939 .m(5)
3940 .n(n)
3941 .k(k)
3942 .a_stride(43)
3943 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3944 }
3945 }
3946 }
3947
3948 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
3949 TEST_REQUIRES_ARM_NEON_FMA;
3950 for (uint32_t n = 16; n <= 24; n += 8) {
3951 for (size_t k = 1; k <= 40; k += 9) {
3952 for (uint32_t m = 1; m <= 5; m++) {
3953 GemmMicrokernelTester()
3954 .mr(5)
3955 .nr(8)
3956 .kr(1)
3957 .sr(1)
3958 .m(m)
3959 .n(n)
3960 .k(k)
3961 .iterations(1)
3962 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3963 }
3964 }
3965 }
3966 }
3967
3968 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
3969 TEST_REQUIRES_ARM_NEON_FMA;
3970 for (size_t k = 1; k <= 40; k += 9) {
3971 for (uint32_t m = 1; m <= 5; m++) {
3972 for (uint32_t n = 1; n <= 8; n++) {
3973 GemmMicrokernelTester()
3974 .mr(5)
3975 .nr(8)
3976 .kr(1)
3977 .sr(1)
3978 .m(m)
3979 .n(n)
3980 .k(k)
3981 .cm_stride(11)
3982 .iterations(1)
3983 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3984 }
3985 }
3986 }
3987 }
3988
3989 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
3990 TEST_REQUIRES_ARM_NEON_FMA;
3991 GemmMicrokernelTester()
3992 .mr(5)
3993 .nr(8)
3994 .kr(1)
3995 .sr(1)
3996 .m(5)
3997 .n(8)
3998 .k(8)
3999 .qmin(128)
4000 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4001 }
4002
4003 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
4004 TEST_REQUIRES_ARM_NEON_FMA;
4005 GemmMicrokernelTester()
4006 .mr(5)
4007 .nr(8)
4008 .kr(1)
4009 .sr(1)
4010 .m(5)
4011 .n(8)
4012 .k(8)
4013 .qmax(128)
4014 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4015 }
4016
4017 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
4018 TEST_REQUIRES_ARM_NEON_FMA;
4019 GemmMicrokernelTester()
4020 .mr(5)
4021 .nr(8)
4022 .kr(1)
4023 .sr(1)
4024 .m(5)
4025 .n(8)
4026 .k(8)
4027 .cm_stride(11)
4028 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
4029 }
4030#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4031
4032
4033#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4034 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
4035 TEST_REQUIRES_ARM_NEON_FMA;
4036 GemmMicrokernelTester()
4037 .mr(5)
4038 .nr(8)
4039 .kr(1)
4040 .sr(1)
4041 .m(5)
4042 .n(8)
4043 .k(8)
4044 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4045 }
4046
4047 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
4048 TEST_REQUIRES_ARM_NEON_FMA;
4049 GemmMicrokernelTester()
4050 .mr(5)
4051 .nr(8)
4052 .kr(1)
4053 .sr(1)
4054 .m(5)
4055 .n(8)
4056 .k(8)
4057 .cn_stride(11)
4058 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4059 }
4060
4061 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
4062 TEST_REQUIRES_ARM_NEON_FMA;
4063 GemmMicrokernelTester()
4064 .mr(5)
4065 .nr(8)
4066 .kr(1)
4067 .sr(1)
4068 .m(5)
4069 .n(8)
4070 .k(8)
4071 .a_stride(11)
4072 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4073 }
4074
4075 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
4076 TEST_REQUIRES_ARM_NEON_FMA;
4077 for (uint32_t m = 1; m <= 5; m++) {
4078 for (uint32_t n = 1; n <= 8; n++) {
4079 GemmMicrokernelTester()
4080 .mr(5)
4081 .nr(8)
4082 .kr(1)
4083 .sr(1)
4084 .m(m)
4085 .n(n)
4086 .k(8)
4087 .iterations(1)
4088 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4089 }
4090 }
4091 }
4092
4093 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
4094 TEST_REQUIRES_ARM_NEON_FMA;
4095 for (uint32_t m = 1; m <= 5; m++) {
4096 GemmMicrokernelTester()
4097 .mr(5)
4098 .nr(8)
4099 .kr(1)
4100 .sr(1)
4101 .m(m)
4102 .n(8)
4103 .k(8)
4104 .iterations(1)
4105 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4106 }
4107 }
4108
4109 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
4110 TEST_REQUIRES_ARM_NEON_FMA;
4111 for (uint32_t n = 1; n <= 8; n++) {
4112 GemmMicrokernelTester()
4113 .mr(5)
4114 .nr(8)
4115 .kr(1)
4116 .sr(1)
4117 .m(5)
4118 .n(n)
4119 .k(8)
4120 .iterations(1)
4121 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4122 }
4123 }
4124
4125 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
4126 TEST_REQUIRES_ARM_NEON_FMA;
4127 GemmMicrokernelTester()
4128 .mr(5)
4129 .nr(8)
4130 .kr(1)
4131 .sr(1)
4132 .m(5)
4133 .n(8)
4134 .k(16)
4135 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4136 }
4137
4138 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
4139 TEST_REQUIRES_ARM_NEON_FMA;
4140 GemmMicrokernelTester()
4141 .mr(5)
4142 .nr(8)
4143 .kr(1)
4144 .sr(1)
4145 .m(5)
4146 .n(8)
4147 .k(16)
4148 .a_stride(19)
4149 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4150 }
4151
4152 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
4153 TEST_REQUIRES_ARM_NEON_FMA;
4154 for (uint32_t m = 1; m <= 5; m++) {
4155 for (uint32_t n = 1; n <= 8; n++) {
4156 GemmMicrokernelTester()
4157 .mr(5)
4158 .nr(8)
4159 .kr(1)
4160 .sr(1)
4161 .m(m)
4162 .n(n)
4163 .k(16)
4164 .iterations(1)
4165 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4166 }
4167 }
4168 }
4169
4170 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
4171 TEST_REQUIRES_ARM_NEON_FMA;
4172 for (size_t k = 1; k < 16; k++) {
4173 GemmMicrokernelTester()
4174 .mr(5)
4175 .nr(8)
4176 .kr(1)
4177 .sr(1)
4178 .m(5)
4179 .n(8)
4180 .k(k)
4181 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4182 }
4183 }
4184
4185 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
4186 TEST_REQUIRES_ARM_NEON_FMA;
4187 for (size_t k = 1; k < 16; k++) {
4188 GemmMicrokernelTester()
4189 .mr(5)
4190 .nr(8)
4191 .kr(1)
4192 .sr(1)
4193 .m(5)
4194 .n(8)
4195 .k(k)
4196 .a_stride(19)
4197 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4198 }
4199 }
4200
4201 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
4202 TEST_REQUIRES_ARM_NEON_FMA;
4203 for (size_t k = 1; k < 16; k++) {
4204 for (uint32_t m = 1; m <= 5; m++) {
4205 for (uint32_t n = 1; n <= 8; n++) {
4206 GemmMicrokernelTester()
4207 .mr(5)
4208 .nr(8)
4209 .kr(1)
4210 .sr(1)
4211 .m(m)
4212 .n(n)
4213 .k(k)
4214 .iterations(1)
4215 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4216 }
4217 }
4218 }
4219 }
4220
4221 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
4222 TEST_REQUIRES_ARM_NEON_FMA;
4223 for (size_t k = 17; k < 16; k++) {
4224 GemmMicrokernelTester()
4225 .mr(5)
4226 .nr(8)
4227 .kr(1)
4228 .sr(1)
4229 .m(5)
4230 .n(8)
4231 .k(k)
4232 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4233 }
4234 }
4235
4236 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
4237 TEST_REQUIRES_ARM_NEON_FMA;
4238 for (size_t k = 17; k < 16; k++) {
4239 GemmMicrokernelTester()
4240 .mr(5)
4241 .nr(8)
4242 .kr(1)
4243 .sr(1)
4244 .m(5)
4245 .n(8)
4246 .k(k)
4247 .a_stride(19)
4248 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4249 }
4250 }
4251
4252 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
4253 TEST_REQUIRES_ARM_NEON_FMA;
4254 for (size_t k = 17; k < 16; k++) {
4255 for (uint32_t m = 1; m <= 5; m++) {
4256 for (uint32_t n = 1; n <= 8; n++) {
4257 GemmMicrokernelTester()
4258 .mr(5)
4259 .nr(8)
4260 .kr(1)
4261 .sr(1)
4262 .m(m)
4263 .n(n)
4264 .k(k)
4265 .iterations(1)
4266 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4267 }
4268 }
4269 }
4270 }
4271
4272 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
4273 TEST_REQUIRES_ARM_NEON_FMA;
4274 for (size_t k = 24; k <= 80; k += 8) {
4275 GemmMicrokernelTester()
4276 .mr(5)
4277 .nr(8)
4278 .kr(1)
4279 .sr(1)
4280 .m(5)
4281 .n(8)
4282 .k(k)
4283 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4284 }
4285 }
4286
4287 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
4288 TEST_REQUIRES_ARM_NEON_FMA;
4289 for (size_t k = 24; k <= 80; k += 8) {
4290 GemmMicrokernelTester()
4291 .mr(5)
4292 .nr(8)
4293 .kr(1)
4294 .sr(1)
4295 .m(5)
4296 .n(8)
4297 .k(k)
4298 .a_stride(83)
4299 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4300 }
4301 }
4302
4303 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
4304 TEST_REQUIRES_ARM_NEON_FMA;
4305 for (size_t k = 24; k <= 80; k += 8) {
4306 for (uint32_t m = 1; m <= 5; m++) {
4307 for (uint32_t n = 1; n <= 8; n++) {
4308 GemmMicrokernelTester()
4309 .mr(5)
4310 .nr(8)
4311 .kr(1)
4312 .sr(1)
4313 .m(m)
4314 .n(n)
4315 .k(k)
4316 .iterations(1)
4317 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4318 }
4319 }
4320 }
4321 }
4322
4323 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
4324 TEST_REQUIRES_ARM_NEON_FMA;
4325 for (uint32_t n = 9; n < 16; n++) {
4326 for (size_t k = 1; k <= 40; k += 9) {
4327 GemmMicrokernelTester()
4328 .mr(5)
4329 .nr(8)
4330 .kr(1)
4331 .sr(1)
4332 .m(5)
4333 .n(8)
4334 .k(k)
4335 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4336 }
4337 }
4338 }
4339
4340 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
4341 TEST_REQUIRES_ARM_NEON_FMA;
4342 for (uint32_t n = 9; n < 16; n++) {
4343 for (size_t k = 1; k <= 40; k += 9) {
4344 GemmMicrokernelTester()
4345 .mr(5)
4346 .nr(8)
4347 .kr(1)
4348 .sr(1)
4349 .m(5)
4350 .n(8)
4351 .k(k)
4352 .cn_stride(11)
4353 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4354 }
4355 }
4356 }
4357
4358 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
4359 TEST_REQUIRES_ARM_NEON_FMA;
4360 for (uint32_t n = 9; n < 16; n++) {
4361 for (size_t k = 1; k <= 40; k += 9) {
4362 GemmMicrokernelTester()
4363 .mr(5)
4364 .nr(8)
4365 .kr(1)
4366 .sr(1)
4367 .m(5)
4368 .n(n)
4369 .k(k)
4370 .a_stride(43)
4371 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4372 }
4373 }
4374 }
4375
4376 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
4377 TEST_REQUIRES_ARM_NEON_FMA;
4378 for (uint32_t n = 9; n < 16; n++) {
4379 for (size_t k = 1; k <= 40; k += 9) {
4380 for (uint32_t m = 1; m <= 5; m++) {
4381 GemmMicrokernelTester()
4382 .mr(5)
4383 .nr(8)
4384 .kr(1)
4385 .sr(1)
4386 .m(m)
4387 .n(n)
4388 .k(k)
4389 .iterations(1)
4390 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4391 }
4392 }
4393 }
4394 }
4395
4396 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
4397 TEST_REQUIRES_ARM_NEON_FMA;
4398 for (uint32_t n = 16; n <= 24; n += 8) {
4399 for (size_t k = 1; k <= 40; k += 9) {
4400 GemmMicrokernelTester()
4401 .mr(5)
4402 .nr(8)
4403 .kr(1)
4404 .sr(1)
4405 .m(5)
4406 .n(8)
4407 .k(k)
4408 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4409 }
4410 }
4411 }
4412
4413 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
4414 TEST_REQUIRES_ARM_NEON_FMA;
4415 for (uint32_t n = 16; n <= 24; n += 8) {
4416 for (size_t k = 1; k <= 40; k += 9) {
4417 GemmMicrokernelTester()
4418 .mr(5)
4419 .nr(8)
4420 .kr(1)
4421 .sr(1)
4422 .m(5)
4423 .n(n)
4424 .k(k)
4425 .cn_stride(11)
4426 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4427 }
4428 }
4429 }
4430
4431 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
4432 TEST_REQUIRES_ARM_NEON_FMA;
4433 for (uint32_t n = 16; n <= 24; n += 8) {
4434 for (size_t k = 1; k <= 40; k += 9) {
4435 GemmMicrokernelTester()
4436 .mr(5)
4437 .nr(8)
4438 .kr(1)
4439 .sr(1)
4440 .m(5)
4441 .n(n)
4442 .k(k)
4443 .a_stride(43)
4444 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4445 }
4446 }
4447 }
4448
4449 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
4450 TEST_REQUIRES_ARM_NEON_FMA;
4451 for (uint32_t n = 16; n <= 24; n += 8) {
4452 for (size_t k = 1; k <= 40; k += 9) {
4453 for (uint32_t m = 1; m <= 5; m++) {
4454 GemmMicrokernelTester()
4455 .mr(5)
4456 .nr(8)
4457 .kr(1)
4458 .sr(1)
4459 .m(m)
4460 .n(n)
4461 .k(k)
4462 .iterations(1)
4463 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4464 }
4465 }
4466 }
4467 }
4468
4469 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
4470 TEST_REQUIRES_ARM_NEON_FMA;
4471 for (size_t k = 1; k <= 40; k += 9) {
4472 for (uint32_t m = 1; m <= 5; m++) {
4473 for (uint32_t n = 1; n <= 8; n++) {
4474 GemmMicrokernelTester()
4475 .mr(5)
4476 .nr(8)
4477 .kr(1)
4478 .sr(1)
4479 .m(m)
4480 .n(n)
4481 .k(k)
4482 .cm_stride(11)
4483 .iterations(1)
4484 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4485 }
4486 }
4487 }
4488 }
4489
4490 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
4491 TEST_REQUIRES_ARM_NEON_FMA;
4492 GemmMicrokernelTester()
4493 .mr(5)
4494 .nr(8)
4495 .kr(1)
4496 .sr(1)
4497 .m(5)
4498 .n(8)
4499 .k(8)
4500 .qmin(128)
4501 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4502 }
4503
4504 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
4505 TEST_REQUIRES_ARM_NEON_FMA;
4506 GemmMicrokernelTester()
4507 .mr(5)
4508 .nr(8)
4509 .kr(1)
4510 .sr(1)
4511 .m(5)
4512 .n(8)
4513 .k(8)
4514 .qmax(128)
4515 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4516 }
4517
4518 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
4519 TEST_REQUIRES_ARM_NEON_FMA;
4520 GemmMicrokernelTester()
4521 .mr(5)
4522 .nr(8)
4523 .kr(1)
4524 .sr(1)
4525 .m(5)
4526 .n(8)
4527 .k(8)
4528 .cm_stride(11)
4529 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4530 }
4531#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4532
4533
4534#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4535 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
4536 TEST_REQUIRES_ARM_NEON_FMA;
4537 GemmMicrokernelTester()
4538 .mr(6)
4539 .nr(8)
4540 .kr(1)
4541 .sr(1)
4542 .m(6)
4543 .n(8)
4544 .k(4)
4545 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4546 }
4547
4548 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
4549 TEST_REQUIRES_ARM_NEON_FMA;
4550 GemmMicrokernelTester()
4551 .mr(6)
4552 .nr(8)
4553 .kr(1)
4554 .sr(1)
4555 .m(6)
4556 .n(8)
4557 .k(4)
4558 .cn_stride(11)
4559 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4560 }
4561
4562 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
4563 TEST_REQUIRES_ARM_NEON_FMA;
4564 GemmMicrokernelTester()
4565 .mr(6)
4566 .nr(8)
4567 .kr(1)
4568 .sr(1)
4569 .m(6)
4570 .n(8)
4571 .k(4)
4572 .a_stride(7)
4573 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4574 }
4575
4576 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
4577 TEST_REQUIRES_ARM_NEON_FMA;
4578 for (uint32_t m = 1; m <= 6; m++) {
4579 for (uint32_t n = 1; n <= 8; n++) {
4580 GemmMicrokernelTester()
4581 .mr(6)
4582 .nr(8)
4583 .kr(1)
4584 .sr(1)
4585 .m(m)
4586 .n(n)
4587 .k(4)
4588 .iterations(1)
4589 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4590 }
4591 }
4592 }
4593
4594 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
4595 TEST_REQUIRES_ARM_NEON_FMA;
4596 for (uint32_t m = 1; m <= 6; m++) {
4597 GemmMicrokernelTester()
4598 .mr(6)
4599 .nr(8)
4600 .kr(1)
4601 .sr(1)
4602 .m(m)
4603 .n(8)
4604 .k(4)
4605 .iterations(1)
4606 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4607 }
4608 }
4609
4610 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
4611 TEST_REQUIRES_ARM_NEON_FMA;
4612 for (uint32_t n = 1; n <= 8; n++) {
4613 GemmMicrokernelTester()
4614 .mr(6)
4615 .nr(8)
4616 .kr(1)
4617 .sr(1)
4618 .m(6)
4619 .n(n)
4620 .k(4)
4621 .iterations(1)
4622 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4623 }
4624 }
4625
4626 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
4627 TEST_REQUIRES_ARM_NEON_FMA;
4628 GemmMicrokernelTester()
4629 .mr(6)
4630 .nr(8)
4631 .kr(1)
4632 .sr(1)
4633 .m(6)
4634 .n(8)
4635 .k(8)
4636 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4637 }
4638
4639 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
4640 TEST_REQUIRES_ARM_NEON_FMA;
4641 GemmMicrokernelTester()
4642 .mr(6)
4643 .nr(8)
4644 .kr(1)
4645 .sr(1)
4646 .m(6)
4647 .n(8)
4648 .k(8)
4649 .a_stride(11)
4650 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4651 }
4652
4653 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
4654 TEST_REQUIRES_ARM_NEON_FMA;
4655 for (uint32_t m = 1; m <= 6; m++) {
4656 for (uint32_t n = 1; n <= 8; n++) {
4657 GemmMicrokernelTester()
4658 .mr(6)
4659 .nr(8)
4660 .kr(1)
4661 .sr(1)
4662 .m(m)
4663 .n(n)
4664 .k(8)
4665 .iterations(1)
4666 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4667 }
4668 }
4669 }
4670
4671 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
4672 TEST_REQUIRES_ARM_NEON_FMA;
4673 for (size_t k = 1; k < 8; k++) {
4674 GemmMicrokernelTester()
4675 .mr(6)
4676 .nr(8)
4677 .kr(1)
4678 .sr(1)
4679 .m(6)
4680 .n(8)
4681 .k(k)
4682 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4683 }
4684 }
4685
4686 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
4687 TEST_REQUIRES_ARM_NEON_FMA;
4688 for (size_t k = 1; k < 8; k++) {
4689 GemmMicrokernelTester()
4690 .mr(6)
4691 .nr(8)
4692 .kr(1)
4693 .sr(1)
4694 .m(6)
4695 .n(8)
4696 .k(k)
4697 .a_stride(11)
4698 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4699 }
4700 }
4701
4702 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
4703 TEST_REQUIRES_ARM_NEON_FMA;
4704 for (size_t k = 1; k < 8; k++) {
4705 for (uint32_t m = 1; m <= 6; m++) {
4706 for (uint32_t n = 1; n <= 8; n++) {
4707 GemmMicrokernelTester()
4708 .mr(6)
4709 .nr(8)
4710 .kr(1)
4711 .sr(1)
4712 .m(m)
4713 .n(n)
4714 .k(k)
4715 .iterations(1)
4716 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4717 }
4718 }
4719 }
4720 }
4721
4722 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
4723 TEST_REQUIRES_ARM_NEON_FMA;
4724 for (size_t k = 9; k < 8; k++) {
4725 GemmMicrokernelTester()
4726 .mr(6)
4727 .nr(8)
4728 .kr(1)
4729 .sr(1)
4730 .m(6)
4731 .n(8)
4732 .k(k)
4733 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4734 }
4735 }
4736
4737 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
4738 TEST_REQUIRES_ARM_NEON_FMA;
4739 for (size_t k = 9; k < 8; k++) {
4740 GemmMicrokernelTester()
4741 .mr(6)
4742 .nr(8)
4743 .kr(1)
4744 .sr(1)
4745 .m(6)
4746 .n(8)
4747 .k(k)
4748 .a_stride(11)
4749 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4750 }
4751 }
4752
4753 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
4754 TEST_REQUIRES_ARM_NEON_FMA;
4755 for (size_t k = 9; k < 8; k++) {
4756 for (uint32_t m = 1; m <= 6; m++) {
4757 for (uint32_t n = 1; n <= 8; n++) {
4758 GemmMicrokernelTester()
4759 .mr(6)
4760 .nr(8)
4761 .kr(1)
4762 .sr(1)
4763 .m(m)
4764 .n(n)
4765 .k(k)
4766 .iterations(1)
4767 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4768 }
4769 }
4770 }
4771 }
4772
4773 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
4774 TEST_REQUIRES_ARM_NEON_FMA;
4775 for (size_t k = 12; k <= 40; k += 4) {
4776 GemmMicrokernelTester()
4777 .mr(6)
4778 .nr(8)
4779 .kr(1)
4780 .sr(1)
4781 .m(6)
4782 .n(8)
4783 .k(k)
4784 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4785 }
4786 }
4787
4788 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
4789 TEST_REQUIRES_ARM_NEON_FMA;
4790 for (size_t k = 12; k <= 40; k += 4) {
4791 GemmMicrokernelTester()
4792 .mr(6)
4793 .nr(8)
4794 .kr(1)
4795 .sr(1)
4796 .m(6)
4797 .n(8)
4798 .k(k)
4799 .a_stride(43)
4800 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4801 }
4802 }
4803
4804 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
4805 TEST_REQUIRES_ARM_NEON_FMA;
4806 for (size_t k = 12; k <= 40; k += 4) {
4807 for (uint32_t m = 1; m <= 6; m++) {
4808 for (uint32_t n = 1; n <= 8; n++) {
4809 GemmMicrokernelTester()
4810 .mr(6)
4811 .nr(8)
4812 .kr(1)
4813 .sr(1)
4814 .m(m)
4815 .n(n)
4816 .k(k)
4817 .iterations(1)
4818 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4819 }
4820 }
4821 }
4822 }
4823
4824 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
4825 TEST_REQUIRES_ARM_NEON_FMA;
4826 for (uint32_t n = 9; n < 16; n++) {
4827 for (size_t k = 1; k <= 20; k += 5) {
4828 GemmMicrokernelTester()
4829 .mr(6)
4830 .nr(8)
4831 .kr(1)
4832 .sr(1)
4833 .m(6)
4834 .n(8)
4835 .k(k)
4836 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4837 }
4838 }
4839 }
4840
4841 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
4842 TEST_REQUIRES_ARM_NEON_FMA;
4843 for (uint32_t n = 9; n < 16; n++) {
4844 for (size_t k = 1; k <= 20; k += 5) {
4845 GemmMicrokernelTester()
4846 .mr(6)
4847 .nr(8)
4848 .kr(1)
4849 .sr(1)
4850 .m(6)
4851 .n(8)
4852 .k(k)
4853 .cn_stride(11)
4854 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4855 }
4856 }
4857 }
4858
4859 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
4860 TEST_REQUIRES_ARM_NEON_FMA;
4861 for (uint32_t n = 9; n < 16; n++) {
4862 for (size_t k = 1; k <= 20; k += 5) {
4863 GemmMicrokernelTester()
4864 .mr(6)
4865 .nr(8)
4866 .kr(1)
4867 .sr(1)
4868 .m(6)
4869 .n(n)
4870 .k(k)
4871 .a_stride(23)
4872 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4873 }
4874 }
4875 }
4876
4877 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
4878 TEST_REQUIRES_ARM_NEON_FMA;
4879 for (uint32_t n = 9; n < 16; n++) {
4880 for (size_t k = 1; k <= 20; k += 5) {
4881 for (uint32_t m = 1; m <= 6; m++) {
4882 GemmMicrokernelTester()
4883 .mr(6)
4884 .nr(8)
4885 .kr(1)
4886 .sr(1)
4887 .m(m)
4888 .n(n)
4889 .k(k)
4890 .iterations(1)
4891 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4892 }
4893 }
4894 }
4895 }
4896
4897 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
4898 TEST_REQUIRES_ARM_NEON_FMA;
4899 for (uint32_t n = 16; n <= 24; n += 8) {
4900 for (size_t k = 1; k <= 20; k += 5) {
4901 GemmMicrokernelTester()
4902 .mr(6)
4903 .nr(8)
4904 .kr(1)
4905 .sr(1)
4906 .m(6)
4907 .n(8)
4908 .k(k)
4909 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4910 }
4911 }
4912 }
4913
4914 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
4915 TEST_REQUIRES_ARM_NEON_FMA;
4916 for (uint32_t n = 16; n <= 24; n += 8) {
4917 for (size_t k = 1; k <= 20; k += 5) {
4918 GemmMicrokernelTester()
4919 .mr(6)
4920 .nr(8)
4921 .kr(1)
4922 .sr(1)
4923 .m(6)
4924 .n(n)
4925 .k(k)
4926 .cn_stride(11)
4927 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4928 }
4929 }
4930 }
4931
4932 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
4933 TEST_REQUIRES_ARM_NEON_FMA;
4934 for (uint32_t n = 16; n <= 24; n += 8) {
4935 for (size_t k = 1; k <= 20; k += 5) {
4936 GemmMicrokernelTester()
4937 .mr(6)
4938 .nr(8)
4939 .kr(1)
4940 .sr(1)
4941 .m(6)
4942 .n(n)
4943 .k(k)
4944 .a_stride(23)
4945 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4946 }
4947 }
4948 }
4949
4950 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
4951 TEST_REQUIRES_ARM_NEON_FMA;
4952 for (uint32_t n = 16; n <= 24; n += 8) {
4953 for (size_t k = 1; k <= 20; k += 5) {
4954 for (uint32_t m = 1; m <= 6; m++) {
4955 GemmMicrokernelTester()
4956 .mr(6)
4957 .nr(8)
4958 .kr(1)
4959 .sr(1)
4960 .m(m)
4961 .n(n)
4962 .k(k)
4963 .iterations(1)
4964 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4965 }
4966 }
4967 }
4968 }
4969
4970 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
4971 TEST_REQUIRES_ARM_NEON_FMA;
4972 for (size_t k = 1; k <= 20; k += 5) {
4973 for (uint32_t m = 1; m <= 6; m++) {
4974 for (uint32_t n = 1; n <= 8; n++) {
4975 GemmMicrokernelTester()
4976 .mr(6)
4977 .nr(8)
4978 .kr(1)
4979 .sr(1)
4980 .m(m)
4981 .n(n)
4982 .k(k)
4983 .cm_stride(11)
4984 .iterations(1)
4985 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4986 }
4987 }
4988 }
4989 }
4990
4991 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
4992 TEST_REQUIRES_ARM_NEON_FMA;
4993 GemmMicrokernelTester()
4994 .mr(6)
4995 .nr(8)
4996 .kr(1)
4997 .sr(1)
4998 .m(6)
4999 .n(8)
5000 .k(4)
5001 .qmin(128)
5002 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
5003 }
5004
5005 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
5006 TEST_REQUIRES_ARM_NEON_FMA;
5007 GemmMicrokernelTester()
5008 .mr(6)
5009 .nr(8)
5010 .kr(1)
5011 .sr(1)
5012 .m(6)
5013 .n(8)
5014 .k(4)
5015 .qmax(128)
5016 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
5017 }
5018
5019 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
5020 TEST_REQUIRES_ARM_NEON_FMA;
5021 GemmMicrokernelTester()
5022 .mr(6)
5023 .nr(8)
5024 .kr(1)
5025 .sr(1)
5026 .m(6)
5027 .n(8)
5028 .k(4)
5029 .cm_stride(11)
5030 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
5031 }
5032#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5033
5034
5035#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5036 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
5037 TEST_REQUIRES_ARM_NEON_FMA;
5038 GemmMicrokernelTester()
5039 .mr(6)
5040 .nr(8)
5041 .kr(1)
5042 .sr(1)
5043 .m(6)
5044 .n(8)
5045 .k(4)
5046 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5047 }
5048
5049 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
5050 TEST_REQUIRES_ARM_NEON_FMA;
5051 GemmMicrokernelTester()
5052 .mr(6)
5053 .nr(8)
5054 .kr(1)
5055 .sr(1)
5056 .m(6)
5057 .n(8)
5058 .k(4)
5059 .cn_stride(11)
5060 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5061 }
5062
5063 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_strided_a) {
5064 TEST_REQUIRES_ARM_NEON_FMA;
5065 GemmMicrokernelTester()
5066 .mr(6)
5067 .nr(8)
5068 .kr(1)
5069 .sr(1)
5070 .m(6)
5071 .n(8)
5072 .k(4)
5073 .a_stride(7)
5074 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5075 }
5076
5077 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
5078 TEST_REQUIRES_ARM_NEON_FMA;
5079 for (uint32_t m = 1; m <= 6; m++) {
5080 for (uint32_t n = 1; n <= 8; n++) {
5081 GemmMicrokernelTester()
5082 .mr(6)
5083 .nr(8)
5084 .kr(1)
5085 .sr(1)
5086 .m(m)
5087 .n(n)
5088 .k(4)
5089 .iterations(1)
5090 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5091 }
5092 }
5093 }
5094
5095 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
5096 TEST_REQUIRES_ARM_NEON_FMA;
5097 for (uint32_t m = 1; m <= 6; m++) {
5098 GemmMicrokernelTester()
5099 .mr(6)
5100 .nr(8)
5101 .kr(1)
5102 .sr(1)
5103 .m(m)
5104 .n(8)
5105 .k(4)
5106 .iterations(1)
5107 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5108 }
5109 }
5110
5111 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
5112 TEST_REQUIRES_ARM_NEON_FMA;
5113 for (uint32_t n = 1; n <= 8; n++) {
5114 GemmMicrokernelTester()
5115 .mr(6)
5116 .nr(8)
5117 .kr(1)
5118 .sr(1)
5119 .m(6)
5120 .n(n)
5121 .k(4)
5122 .iterations(1)
5123 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5124 }
5125 }
5126
5127 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
5128 TEST_REQUIRES_ARM_NEON_FMA;
5129 GemmMicrokernelTester()
5130 .mr(6)
5131 .nr(8)
5132 .kr(1)
5133 .sr(1)
5134 .m(6)
5135 .n(8)
5136 .k(8)
5137 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5138 }
5139
5140 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_strided_a) {
5141 TEST_REQUIRES_ARM_NEON_FMA;
5142 GemmMicrokernelTester()
5143 .mr(6)
5144 .nr(8)
5145 .kr(1)
5146 .sr(1)
5147 .m(6)
5148 .n(8)
5149 .k(8)
5150 .a_stride(11)
5151 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5152 }
5153
5154 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
5155 TEST_REQUIRES_ARM_NEON_FMA;
5156 for (uint32_t m = 1; m <= 6; m++) {
5157 for (uint32_t n = 1; n <= 8; n++) {
5158 GemmMicrokernelTester()
5159 .mr(6)
5160 .nr(8)
5161 .kr(1)
5162 .sr(1)
5163 .m(m)
5164 .n(n)
5165 .k(8)
5166 .iterations(1)
5167 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5168 }
5169 }
5170 }
5171
5172 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
5173 TEST_REQUIRES_ARM_NEON_FMA;
5174 for (size_t k = 1; k < 8; k++) {
5175 GemmMicrokernelTester()
5176 .mr(6)
5177 .nr(8)
5178 .kr(1)
5179 .sr(1)
5180 .m(6)
5181 .n(8)
5182 .k(k)
5183 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5184 }
5185 }
5186
5187 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_strided_a) {
5188 TEST_REQUIRES_ARM_NEON_FMA;
5189 for (size_t k = 1; k < 8; k++) {
5190 GemmMicrokernelTester()
5191 .mr(6)
5192 .nr(8)
5193 .kr(1)
5194 .sr(1)
5195 .m(6)
5196 .n(8)
5197 .k(k)
5198 .a_stride(11)
5199 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5200 }
5201 }
5202
5203 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
5204 TEST_REQUIRES_ARM_NEON_FMA;
5205 for (size_t k = 1; k < 8; k++) {
5206 for (uint32_t m = 1; m <= 6; m++) {
5207 for (uint32_t n = 1; n <= 8; n++) {
5208 GemmMicrokernelTester()
5209 .mr(6)
5210 .nr(8)
5211 .kr(1)
5212 .sr(1)
5213 .m(m)
5214 .n(n)
5215 .k(k)
5216 .iterations(1)
5217 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5218 }
5219 }
5220 }
5221 }
5222
5223 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
5224 TEST_REQUIRES_ARM_NEON_FMA;
5225 for (size_t k = 9; k < 8; k++) {
5226 GemmMicrokernelTester()
5227 .mr(6)
5228 .nr(8)
5229 .kr(1)
5230 .sr(1)
5231 .m(6)
5232 .n(8)
5233 .k(k)
5234 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5235 }
5236 }
5237
5238 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_strided_a) {
5239 TEST_REQUIRES_ARM_NEON_FMA;
5240 for (size_t k = 9; k < 8; k++) {
5241 GemmMicrokernelTester()
5242 .mr(6)
5243 .nr(8)
5244 .kr(1)
5245 .sr(1)
5246 .m(6)
5247 .n(8)
5248 .k(k)
5249 .a_stride(11)
5250 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5251 }
5252 }
5253
5254 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_subtile) {
5255 TEST_REQUIRES_ARM_NEON_FMA;
5256 for (size_t k = 9; k < 8; k++) {
5257 for (uint32_t m = 1; m <= 6; m++) {
5258 for (uint32_t n = 1; n <= 8; n++) {
5259 GemmMicrokernelTester()
5260 .mr(6)
5261 .nr(8)
5262 .kr(1)
5263 .sr(1)
5264 .m(m)
5265 .n(n)
5266 .k(k)
5267 .iterations(1)
5268 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5269 }
5270 }
5271 }
5272 }
5273
5274 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
5275 TEST_REQUIRES_ARM_NEON_FMA;
5276 for (size_t k = 12; k <= 40; k += 4) {
5277 GemmMicrokernelTester()
5278 .mr(6)
5279 .nr(8)
5280 .kr(1)
5281 .sr(1)
5282 .m(6)
5283 .n(8)
5284 .k(k)
5285 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5286 }
5287 }
5288
5289 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_strided_a) {
5290 TEST_REQUIRES_ARM_NEON_FMA;
5291 for (size_t k = 12; k <= 40; k += 4) {
5292 GemmMicrokernelTester()
5293 .mr(6)
5294 .nr(8)
5295 .kr(1)
5296 .sr(1)
5297 .m(6)
5298 .n(8)
5299 .k(k)
5300 .a_stride(43)
5301 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5302 }
5303 }
5304
5305 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
5306 TEST_REQUIRES_ARM_NEON_FMA;
5307 for (size_t k = 12; k <= 40; k += 4) {
5308 for (uint32_t m = 1; m <= 6; m++) {
5309 for (uint32_t n = 1; n <= 8; n++) {
5310 GemmMicrokernelTester()
5311 .mr(6)
5312 .nr(8)
5313 .kr(1)
5314 .sr(1)
5315 .m(m)
5316 .n(n)
5317 .k(k)
5318 .iterations(1)
5319 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5320 }
5321 }
5322 }
5323 }
5324
5325 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
5326 TEST_REQUIRES_ARM_NEON_FMA;
5327 for (uint32_t n = 9; n < 16; n++) {
5328 for (size_t k = 1; k <= 20; k += 5) {
5329 GemmMicrokernelTester()
5330 .mr(6)
5331 .nr(8)
5332 .kr(1)
5333 .sr(1)
5334 .m(6)
5335 .n(8)
5336 .k(k)
5337 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5338 }
5339 }
5340 }
5341
5342 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
5343 TEST_REQUIRES_ARM_NEON_FMA;
5344 for (uint32_t n = 9; n < 16; n++) {
5345 for (size_t k = 1; k <= 20; k += 5) {
5346 GemmMicrokernelTester()
5347 .mr(6)
5348 .nr(8)
5349 .kr(1)
5350 .sr(1)
5351 .m(6)
5352 .n(8)
5353 .k(k)
5354 .cn_stride(11)
5355 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5356 }
5357 }
5358 }
5359
5360 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_a) {
5361 TEST_REQUIRES_ARM_NEON_FMA;
5362 for (uint32_t n = 9; n < 16; n++) {
5363 for (size_t k = 1; k <= 20; k += 5) {
5364 GemmMicrokernelTester()
5365 .mr(6)
5366 .nr(8)
5367 .kr(1)
5368 .sr(1)
5369 .m(6)
5370 .n(n)
5371 .k(k)
5372 .a_stride(23)
5373 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5374 }
5375 }
5376 }
5377
5378 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
5379 TEST_REQUIRES_ARM_NEON_FMA;
5380 for (uint32_t n = 9; n < 16; n++) {
5381 for (size_t k = 1; k <= 20; k += 5) {
5382 for (uint32_t m = 1; m <= 6; m++) {
5383 GemmMicrokernelTester()
5384 .mr(6)
5385 .nr(8)
5386 .kr(1)
5387 .sr(1)
5388 .m(m)
5389 .n(n)
5390 .k(k)
5391 .iterations(1)
5392 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5393 }
5394 }
5395 }
5396 }
5397
5398 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
5399 TEST_REQUIRES_ARM_NEON_FMA;
5400 for (uint32_t n = 16; n <= 24; n += 8) {
5401 for (size_t k = 1; k <= 20; k += 5) {
5402 GemmMicrokernelTester()
5403 .mr(6)
5404 .nr(8)
5405 .kr(1)
5406 .sr(1)
5407 .m(6)
5408 .n(8)
5409 .k(k)
5410 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5411 }
5412 }
5413 }
5414
5415 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
5416 TEST_REQUIRES_ARM_NEON_FMA;
5417 for (uint32_t n = 16; n <= 24; n += 8) {
5418 for (size_t k = 1; k <= 20; k += 5) {
5419 GemmMicrokernelTester()
5420 .mr(6)
5421 .nr(8)
5422 .kr(1)
5423 .sr(1)
5424 .m(6)
5425 .n(n)
5426 .k(k)
5427 .cn_stride(11)
5428 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5429 }
5430 }
5431 }
5432
5433 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_a) {
5434 TEST_REQUIRES_ARM_NEON_FMA;
5435 for (uint32_t n = 16; n <= 24; n += 8) {
5436 for (size_t k = 1; k <= 20; k += 5) {
5437 GemmMicrokernelTester()
5438 .mr(6)
5439 .nr(8)
5440 .kr(1)
5441 .sr(1)
5442 .m(6)
5443 .n(n)
5444 .k(k)
5445 .a_stride(23)
5446 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5447 }
5448 }
5449 }
5450
5451 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
5452 TEST_REQUIRES_ARM_NEON_FMA;
5453 for (uint32_t n = 16; n <= 24; n += 8) {
5454 for (size_t k = 1; k <= 20; k += 5) {
5455 for (uint32_t m = 1; m <= 6; m++) {
5456 GemmMicrokernelTester()
5457 .mr(6)
5458 .nr(8)
5459 .kr(1)
5460 .sr(1)
5461 .m(m)
5462 .n(n)
5463 .k(k)
5464 .iterations(1)
5465 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5466 }
5467 }
5468 }
5469 }
5470
5471 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
5472 TEST_REQUIRES_ARM_NEON_FMA;
5473 for (size_t k = 1; k <= 20; k += 5) {
5474 for (uint32_t m = 1; m <= 6; m++) {
5475 for (uint32_t n = 1; n <= 8; n++) {
5476 GemmMicrokernelTester()
5477 .mr(6)
5478 .nr(8)
5479 .kr(1)
5480 .sr(1)
5481 .m(m)
5482 .n(n)
5483 .k(k)
5484 .cm_stride(11)
5485 .iterations(1)
5486 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5487 }
5488 }
5489 }
5490 }
5491
5492 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
5493 TEST_REQUIRES_ARM_NEON_FMA;
5494 GemmMicrokernelTester()
5495 .mr(6)
5496 .nr(8)
5497 .kr(1)
5498 .sr(1)
5499 .m(6)
5500 .n(8)
5501 .k(4)
5502 .qmin(128)
5503 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5504 }
5505
5506 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
5507 TEST_REQUIRES_ARM_NEON_FMA;
5508 GemmMicrokernelTester()
5509 .mr(6)
5510 .nr(8)
5511 .kr(1)
5512 .sr(1)
5513 .m(6)
5514 .n(8)
5515 .k(4)
5516 .qmax(128)
5517 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5518 }
5519
5520 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
5521 TEST_REQUIRES_ARM_NEON_FMA;
5522 GemmMicrokernelTester()
5523 .mr(6)
5524 .nr(8)
5525 .kr(1)
5526 .sr(1)
5527 .m(6)
5528 .n(8)
5529 .k(4)
5530 .cm_stride(11)
5531 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
5532 }
5533#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5534
5535
5536#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5537 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
5538 TEST_REQUIRES_ARM_NEON_FMA;
5539 GemmMicrokernelTester()
5540 .mr(6)
5541 .nr(8)
5542 .kr(1)
5543 .sr(1)
5544 .m(6)
5545 .n(8)
5546 .k(8)
5547 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5548 }
5549
5550 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cn) {
5551 TEST_REQUIRES_ARM_NEON_FMA;
5552 GemmMicrokernelTester()
5553 .mr(6)
5554 .nr(8)
5555 .kr(1)
5556 .sr(1)
5557 .m(6)
5558 .n(8)
5559 .k(8)
5560 .cn_stride(11)
5561 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5562 }
5563
5564 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_strided_a) {
5565 TEST_REQUIRES_ARM_NEON_FMA;
5566 GemmMicrokernelTester()
5567 .mr(6)
5568 .nr(8)
5569 .kr(1)
5570 .sr(1)
5571 .m(6)
5572 .n(8)
5573 .k(8)
5574 .a_stride(11)
5575 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5576 }
5577
5578 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile) {
5579 TEST_REQUIRES_ARM_NEON_FMA;
5580 for (uint32_t m = 1; m <= 6; m++) {
5581 for (uint32_t n = 1; n <= 8; n++) {
5582 GemmMicrokernelTester()
5583 .mr(6)
5584 .nr(8)
5585 .kr(1)
5586 .sr(1)
5587 .m(m)
5588 .n(n)
5589 .k(8)
5590 .iterations(1)
5591 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5592 }
5593 }
5594 }
5595
5596 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_m) {
5597 TEST_REQUIRES_ARM_NEON_FMA;
5598 for (uint32_t m = 1; m <= 6; m++) {
5599 GemmMicrokernelTester()
5600 .mr(6)
5601 .nr(8)
5602 .kr(1)
5603 .sr(1)
5604 .m(m)
5605 .n(8)
5606 .k(8)
5607 .iterations(1)
5608 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5609 }
5610 }
5611
5612 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_n) {
5613 TEST_REQUIRES_ARM_NEON_FMA;
5614 for (uint32_t n = 1; n <= 8; n++) {
5615 GemmMicrokernelTester()
5616 .mr(6)
5617 .nr(8)
5618 .kr(1)
5619 .sr(1)
5620 .m(6)
5621 .n(n)
5622 .k(8)
5623 .iterations(1)
5624 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5625 }
5626 }
5627
5628 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16) {
5629 TEST_REQUIRES_ARM_NEON_FMA;
5630 GemmMicrokernelTester()
5631 .mr(6)
5632 .nr(8)
5633 .kr(1)
5634 .sr(1)
5635 .m(6)
5636 .n(8)
5637 .k(16)
5638 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5639 }
5640
5641 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_strided_a) {
5642 TEST_REQUIRES_ARM_NEON_FMA;
5643 GemmMicrokernelTester()
5644 .mr(6)
5645 .nr(8)
5646 .kr(1)
5647 .sr(1)
5648 .m(6)
5649 .n(8)
5650 .k(16)
5651 .a_stride(19)
5652 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5653 }
5654
5655 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_subtile) {
5656 TEST_REQUIRES_ARM_NEON_FMA;
5657 for (uint32_t m = 1; m <= 6; m++) {
5658 for (uint32_t n = 1; n <= 8; n++) {
5659 GemmMicrokernelTester()
5660 .mr(6)
5661 .nr(8)
5662 .kr(1)
5663 .sr(1)
5664 .m(m)
5665 .n(n)
5666 .k(16)
5667 .iterations(1)
5668 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5669 }
5670 }
5671 }
5672
5673 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16) {
5674 TEST_REQUIRES_ARM_NEON_FMA;
5675 for (size_t k = 1; k < 16; k++) {
5676 GemmMicrokernelTester()
5677 .mr(6)
5678 .nr(8)
5679 .kr(1)
5680 .sr(1)
5681 .m(6)
5682 .n(8)
5683 .k(k)
5684 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5685 }
5686 }
5687
5688 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_strided_a) {
5689 TEST_REQUIRES_ARM_NEON_FMA;
5690 for (size_t k = 1; k < 16; k++) {
5691 GemmMicrokernelTester()
5692 .mr(6)
5693 .nr(8)
5694 .kr(1)
5695 .sr(1)
5696 .m(6)
5697 .n(8)
5698 .k(k)
5699 .a_stride(19)
5700 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5701 }
5702 }
5703
5704 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_subtile) {
5705 TEST_REQUIRES_ARM_NEON_FMA;
5706 for (size_t k = 1; k < 16; k++) {
5707 for (uint32_t m = 1; m <= 6; m++) {
5708 for (uint32_t n = 1; n <= 8; n++) {
5709 GemmMicrokernelTester()
5710 .mr(6)
5711 .nr(8)
5712 .kr(1)
5713 .sr(1)
5714 .m(m)
5715 .n(n)
5716 .k(k)
5717 .iterations(1)
5718 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5719 }
5720 }
5721 }
5722 }
5723
5724 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_16) {
5725 TEST_REQUIRES_ARM_NEON_FMA;
5726 for (size_t k = 17; k < 16; k++) {
5727 GemmMicrokernelTester()
5728 .mr(6)
5729 .nr(8)
5730 .kr(1)
5731 .sr(1)
5732 .m(6)
5733 .n(8)
5734 .k(k)
5735 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5736 }
5737 }
5738
5739 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_strided_a) {
5740 TEST_REQUIRES_ARM_NEON_FMA;
5741 for (size_t k = 17; k < 16; k++) {
5742 GemmMicrokernelTester()
5743 .mr(6)
5744 .nr(8)
5745 .kr(1)
5746 .sr(1)
5747 .m(6)
5748 .n(8)
5749 .k(k)
5750 .a_stride(19)
5751 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5752 }
5753 }
5754
5755 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_subtile) {
5756 TEST_REQUIRES_ARM_NEON_FMA;
5757 for (size_t k = 17; k < 16; k++) {
5758 for (uint32_t m = 1; m <= 6; m++) {
5759 for (uint32_t n = 1; n <= 8; n++) {
5760 GemmMicrokernelTester()
5761 .mr(6)
5762 .nr(8)
5763 .kr(1)
5764 .sr(1)
5765 .m(m)
5766 .n(n)
5767 .k(k)
5768 .iterations(1)
5769 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5770 }
5771 }
5772 }
5773 }
5774
5775 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8) {
5776 TEST_REQUIRES_ARM_NEON_FMA;
5777 for (size_t k = 24; k <= 80; k += 8) {
5778 GemmMicrokernelTester()
5779 .mr(6)
5780 .nr(8)
5781 .kr(1)
5782 .sr(1)
5783 .m(6)
5784 .n(8)
5785 .k(k)
5786 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5787 }
5788 }
5789
5790 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_strided_a) {
5791 TEST_REQUIRES_ARM_NEON_FMA;
5792 for (size_t k = 24; k <= 80; k += 8) {
5793 GemmMicrokernelTester()
5794 .mr(6)
5795 .nr(8)
5796 .kr(1)
5797 .sr(1)
5798 .m(6)
5799 .n(8)
5800 .k(k)
5801 .a_stride(83)
5802 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5803 }
5804 }
5805
5806 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_subtile) {
5807 TEST_REQUIRES_ARM_NEON_FMA;
5808 for (size_t k = 24; k <= 80; k += 8) {
5809 for (uint32_t m = 1; m <= 6; m++) {
5810 for (uint32_t n = 1; n <= 8; n++) {
5811 GemmMicrokernelTester()
5812 .mr(6)
5813 .nr(8)
5814 .kr(1)
5815 .sr(1)
5816 .m(m)
5817 .n(n)
5818 .k(k)
5819 .iterations(1)
5820 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5821 }
5822 }
5823 }
5824 }
5825
5826 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8) {
5827 TEST_REQUIRES_ARM_NEON_FMA;
5828 for (uint32_t n = 9; n < 16; n++) {
5829 for (size_t k = 1; k <= 40; k += 9) {
5830 GemmMicrokernelTester()
5831 .mr(6)
5832 .nr(8)
5833 .kr(1)
5834 .sr(1)
5835 .m(6)
5836 .n(8)
5837 .k(k)
5838 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5839 }
5840 }
5841 }
5842
5843 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_cn) {
5844 TEST_REQUIRES_ARM_NEON_FMA;
5845 for (uint32_t n = 9; n < 16; n++) {
5846 for (size_t k = 1; k <= 40; k += 9) {
5847 GemmMicrokernelTester()
5848 .mr(6)
5849 .nr(8)
5850 .kr(1)
5851 .sr(1)
5852 .m(6)
5853 .n(8)
5854 .k(k)
5855 .cn_stride(11)
5856 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5857 }
5858 }
5859 }
5860
5861 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_a) {
5862 TEST_REQUIRES_ARM_NEON_FMA;
5863 for (uint32_t n = 9; n < 16; n++) {
5864 for (size_t k = 1; k <= 40; k += 9) {
5865 GemmMicrokernelTester()
5866 .mr(6)
5867 .nr(8)
5868 .kr(1)
5869 .sr(1)
5870 .m(6)
5871 .n(n)
5872 .k(k)
5873 .a_stride(43)
5874 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5875 }
5876 }
5877 }
5878
5879 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_subtile) {
5880 TEST_REQUIRES_ARM_NEON_FMA;
5881 for (uint32_t n = 9; n < 16; n++) {
5882 for (size_t k = 1; k <= 40; k += 9) {
5883 for (uint32_t m = 1; m <= 6; m++) {
5884 GemmMicrokernelTester()
5885 .mr(6)
5886 .nr(8)
5887 .kr(1)
5888 .sr(1)
5889 .m(m)
5890 .n(n)
5891 .k(k)
5892 .iterations(1)
5893 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5894 }
5895 }
5896 }
5897 }
5898
5899 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8) {
5900 TEST_REQUIRES_ARM_NEON_FMA;
5901 for (uint32_t n = 16; n <= 24; n += 8) {
5902 for (size_t k = 1; k <= 40; k += 9) {
5903 GemmMicrokernelTester()
5904 .mr(6)
5905 .nr(8)
5906 .kr(1)
5907 .sr(1)
5908 .m(6)
5909 .n(8)
5910 .k(k)
5911 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5912 }
5913 }
5914 }
5915
5916 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_cn) {
5917 TEST_REQUIRES_ARM_NEON_FMA;
5918 for (uint32_t n = 16; n <= 24; n += 8) {
5919 for (size_t k = 1; k <= 40; k += 9) {
5920 GemmMicrokernelTester()
5921 .mr(6)
5922 .nr(8)
5923 .kr(1)
5924 .sr(1)
5925 .m(6)
5926 .n(n)
5927 .k(k)
5928 .cn_stride(11)
5929 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5930 }
5931 }
5932 }
5933
5934 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_a) {
5935 TEST_REQUIRES_ARM_NEON_FMA;
5936 for (uint32_t n = 16; n <= 24; n += 8) {
5937 for (size_t k = 1; k <= 40; k += 9) {
5938 GemmMicrokernelTester()
5939 .mr(6)
5940 .nr(8)
5941 .kr(1)
5942 .sr(1)
5943 .m(6)
5944 .n(n)
5945 .k(k)
5946 .a_stride(43)
5947 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5948 }
5949 }
5950 }
5951
5952 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_subtile) {
5953 TEST_REQUIRES_ARM_NEON_FMA;
5954 for (uint32_t n = 16; n <= 24; n += 8) {
5955 for (size_t k = 1; k <= 40; k += 9) {
5956 for (uint32_t m = 1; m <= 6; m++) {
5957 GemmMicrokernelTester()
5958 .mr(6)
5959 .nr(8)
5960 .kr(1)
5961 .sr(1)
5962 .m(m)
5963 .n(n)
5964 .k(k)
5965 .iterations(1)
5966 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5967 }
5968 }
5969 }
5970 }
5971
5972 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm_subtile) {
5973 TEST_REQUIRES_ARM_NEON_FMA;
5974 for (size_t k = 1; k <= 40; k += 9) {
5975 for (uint32_t m = 1; m <= 6; m++) {
5976 for (uint32_t n = 1; n <= 8; n++) {
5977 GemmMicrokernelTester()
5978 .mr(6)
5979 .nr(8)
5980 .kr(1)
5981 .sr(1)
5982 .m(m)
5983 .n(n)
5984 .k(k)
5985 .cm_stride(11)
5986 .iterations(1)
5987 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5988 }
5989 }
5990 }
5991 }
5992
5993 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmin) {
5994 TEST_REQUIRES_ARM_NEON_FMA;
5995 GemmMicrokernelTester()
5996 .mr(6)
5997 .nr(8)
5998 .kr(1)
5999 .sr(1)
6000 .m(6)
6001 .n(8)
6002 .k(8)
6003 .qmin(128)
6004 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6005 }
6006
6007 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmax) {
6008 TEST_REQUIRES_ARM_NEON_FMA;
6009 GemmMicrokernelTester()
6010 .mr(6)
6011 .nr(8)
6012 .kr(1)
6013 .sr(1)
6014 .m(6)
6015 .n(8)
6016 .k(8)
6017 .qmax(128)
6018 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6019 }
6020
6021 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm) {
6022 TEST_REQUIRES_ARM_NEON_FMA;
6023 GemmMicrokernelTester()
6024 .mr(6)
6025 .nr(8)
6026 .kr(1)
6027 .sr(1)
6028 .m(6)
6029 .n(8)
6030 .k(8)
6031 .cm_stride(11)
6032 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
6033 }
6034#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6035
6036
6037#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6038 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
6039 TEST_REQUIRES_ARM_NEON_FMA;
6040 GemmMicrokernelTester()
6041 .mr(6)
6042 .nr(8)
6043 .kr(1)
6044 .sr(1)
6045 .m(6)
6046 .n(8)
6047 .k(8)
6048 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6049 }
6050
6051 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
6052 TEST_REQUIRES_ARM_NEON_FMA;
6053 GemmMicrokernelTester()
6054 .mr(6)
6055 .nr(8)
6056 .kr(1)
6057 .sr(1)
6058 .m(6)
6059 .n(8)
6060 .k(8)
6061 .cn_stride(11)
6062 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6063 }
6064
6065 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
6066 TEST_REQUIRES_ARM_NEON_FMA;
6067 GemmMicrokernelTester()
6068 .mr(6)
6069 .nr(8)
6070 .kr(1)
6071 .sr(1)
6072 .m(6)
6073 .n(8)
6074 .k(8)
6075 .a_stride(11)
6076 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6077 }
6078
6079 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
6080 TEST_REQUIRES_ARM_NEON_FMA;
6081 for (uint32_t m = 1; m <= 6; m++) {
6082 for (uint32_t n = 1; n <= 8; n++) {
6083 GemmMicrokernelTester()
6084 .mr(6)
6085 .nr(8)
6086 .kr(1)
6087 .sr(1)
6088 .m(m)
6089 .n(n)
6090 .k(8)
6091 .iterations(1)
6092 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6093 }
6094 }
6095 }
6096
6097 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
6098 TEST_REQUIRES_ARM_NEON_FMA;
6099 for (uint32_t m = 1; m <= 6; m++) {
6100 GemmMicrokernelTester()
6101 .mr(6)
6102 .nr(8)
6103 .kr(1)
6104 .sr(1)
6105 .m(m)
6106 .n(8)
6107 .k(8)
6108 .iterations(1)
6109 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6110 }
6111 }
6112
6113 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
6114 TEST_REQUIRES_ARM_NEON_FMA;
6115 for (uint32_t n = 1; n <= 8; n++) {
6116 GemmMicrokernelTester()
6117 .mr(6)
6118 .nr(8)
6119 .kr(1)
6120 .sr(1)
6121 .m(6)
6122 .n(n)
6123 .k(8)
6124 .iterations(1)
6125 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6126 }
6127 }
6128
6129 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
6130 TEST_REQUIRES_ARM_NEON_FMA;
6131 GemmMicrokernelTester()
6132 .mr(6)
6133 .nr(8)
6134 .kr(1)
6135 .sr(1)
6136 .m(6)
6137 .n(8)
6138 .k(16)
6139 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6140 }
6141
6142 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
6143 TEST_REQUIRES_ARM_NEON_FMA;
6144 GemmMicrokernelTester()
6145 .mr(6)
6146 .nr(8)
6147 .kr(1)
6148 .sr(1)
6149 .m(6)
6150 .n(8)
6151 .k(16)
6152 .a_stride(19)
6153 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6154 }
6155
6156 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
6157 TEST_REQUIRES_ARM_NEON_FMA;
6158 for (uint32_t m = 1; m <= 6; m++) {
6159 for (uint32_t n = 1; n <= 8; n++) {
6160 GemmMicrokernelTester()
6161 .mr(6)
6162 .nr(8)
6163 .kr(1)
6164 .sr(1)
6165 .m(m)
6166 .n(n)
6167 .k(16)
6168 .iterations(1)
6169 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6170 }
6171 }
6172 }
6173
6174 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
6175 TEST_REQUIRES_ARM_NEON_FMA;
6176 for (size_t k = 1; k < 16; k++) {
6177 GemmMicrokernelTester()
6178 .mr(6)
6179 .nr(8)
6180 .kr(1)
6181 .sr(1)
6182 .m(6)
6183 .n(8)
6184 .k(k)
6185 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6186 }
6187 }
6188
6189 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
6190 TEST_REQUIRES_ARM_NEON_FMA;
6191 for (size_t k = 1; k < 16; k++) {
6192 GemmMicrokernelTester()
6193 .mr(6)
6194 .nr(8)
6195 .kr(1)
6196 .sr(1)
6197 .m(6)
6198 .n(8)
6199 .k(k)
6200 .a_stride(19)
6201 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6202 }
6203 }
6204
6205 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
6206 TEST_REQUIRES_ARM_NEON_FMA;
6207 for (size_t k = 1; k < 16; k++) {
6208 for (uint32_t m = 1; m <= 6; m++) {
6209 for (uint32_t n = 1; n <= 8; n++) {
6210 GemmMicrokernelTester()
6211 .mr(6)
6212 .nr(8)
6213 .kr(1)
6214 .sr(1)
6215 .m(m)
6216 .n(n)
6217 .k(k)
6218 .iterations(1)
6219 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6220 }
6221 }
6222 }
6223 }
6224
6225 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
6226 TEST_REQUIRES_ARM_NEON_FMA;
6227 for (size_t k = 17; k < 16; k++) {
6228 GemmMicrokernelTester()
6229 .mr(6)
6230 .nr(8)
6231 .kr(1)
6232 .sr(1)
6233 .m(6)
6234 .n(8)
6235 .k(k)
6236 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6237 }
6238 }
6239
6240 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
6241 TEST_REQUIRES_ARM_NEON_FMA;
6242 for (size_t k = 17; k < 16; k++) {
6243 GemmMicrokernelTester()
6244 .mr(6)
6245 .nr(8)
6246 .kr(1)
6247 .sr(1)
6248 .m(6)
6249 .n(8)
6250 .k(k)
6251 .a_stride(19)
6252 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6253 }
6254 }
6255
6256 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
6257 TEST_REQUIRES_ARM_NEON_FMA;
6258 for (size_t k = 17; k < 16; k++) {
6259 for (uint32_t m = 1; m <= 6; m++) {
6260 for (uint32_t n = 1; n <= 8; n++) {
6261 GemmMicrokernelTester()
6262 .mr(6)
6263 .nr(8)
6264 .kr(1)
6265 .sr(1)
6266 .m(m)
6267 .n(n)
6268 .k(k)
6269 .iterations(1)
6270 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6271 }
6272 }
6273 }
6274 }
6275
6276 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
6277 TEST_REQUIRES_ARM_NEON_FMA;
6278 for (size_t k = 24; k <= 80; k += 8) {
6279 GemmMicrokernelTester()
6280 .mr(6)
6281 .nr(8)
6282 .kr(1)
6283 .sr(1)
6284 .m(6)
6285 .n(8)
6286 .k(k)
6287 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6288 }
6289 }
6290
6291 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
6292 TEST_REQUIRES_ARM_NEON_FMA;
6293 for (size_t k = 24; k <= 80; k += 8) {
6294 GemmMicrokernelTester()
6295 .mr(6)
6296 .nr(8)
6297 .kr(1)
6298 .sr(1)
6299 .m(6)
6300 .n(8)
6301 .k(k)
6302 .a_stride(83)
6303 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6304 }
6305 }
6306
6307 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
6308 TEST_REQUIRES_ARM_NEON_FMA;
6309 for (size_t k = 24; k <= 80; k += 8) {
6310 for (uint32_t m = 1; m <= 6; m++) {
6311 for (uint32_t n = 1; n <= 8; n++) {
6312 GemmMicrokernelTester()
6313 .mr(6)
6314 .nr(8)
6315 .kr(1)
6316 .sr(1)
6317 .m(m)
6318 .n(n)
6319 .k(k)
6320 .iterations(1)
6321 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6322 }
6323 }
6324 }
6325 }
6326
6327 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
6328 TEST_REQUIRES_ARM_NEON_FMA;
6329 for (uint32_t n = 9; n < 16; n++) {
6330 for (size_t k = 1; k <= 40; k += 9) {
6331 GemmMicrokernelTester()
6332 .mr(6)
6333 .nr(8)
6334 .kr(1)
6335 .sr(1)
6336 .m(6)
6337 .n(8)
6338 .k(k)
6339 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6340 }
6341 }
6342 }
6343
6344 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
6345 TEST_REQUIRES_ARM_NEON_FMA;
6346 for (uint32_t n = 9; n < 16; n++) {
6347 for (size_t k = 1; k <= 40; k += 9) {
6348 GemmMicrokernelTester()
6349 .mr(6)
6350 .nr(8)
6351 .kr(1)
6352 .sr(1)
6353 .m(6)
6354 .n(8)
6355 .k(k)
6356 .cn_stride(11)
6357 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6358 }
6359 }
6360 }
6361
6362 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
6363 TEST_REQUIRES_ARM_NEON_FMA;
6364 for (uint32_t n = 9; n < 16; n++) {
6365 for (size_t k = 1; k <= 40; k += 9) {
6366 GemmMicrokernelTester()
6367 .mr(6)
6368 .nr(8)
6369 .kr(1)
6370 .sr(1)
6371 .m(6)
6372 .n(n)
6373 .k(k)
6374 .a_stride(43)
6375 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6376 }
6377 }
6378 }
6379
6380 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
6381 TEST_REQUIRES_ARM_NEON_FMA;
6382 for (uint32_t n = 9; n < 16; n++) {
6383 for (size_t k = 1; k <= 40; k += 9) {
6384 for (uint32_t m = 1; m <= 6; m++) {
6385 GemmMicrokernelTester()
6386 .mr(6)
6387 .nr(8)
6388 .kr(1)
6389 .sr(1)
6390 .m(m)
6391 .n(n)
6392 .k(k)
6393 .iterations(1)
6394 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6395 }
6396 }
6397 }
6398 }
6399
6400 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
6401 TEST_REQUIRES_ARM_NEON_FMA;
6402 for (uint32_t n = 16; n <= 24; n += 8) {
6403 for (size_t k = 1; k <= 40; k += 9) {
6404 GemmMicrokernelTester()
6405 .mr(6)
6406 .nr(8)
6407 .kr(1)
6408 .sr(1)
6409 .m(6)
6410 .n(8)
6411 .k(k)
6412 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6413 }
6414 }
6415 }
6416
6417 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
6418 TEST_REQUIRES_ARM_NEON_FMA;
6419 for (uint32_t n = 16; n <= 24; n += 8) {
6420 for (size_t k = 1; k <= 40; k += 9) {
6421 GemmMicrokernelTester()
6422 .mr(6)
6423 .nr(8)
6424 .kr(1)
6425 .sr(1)
6426 .m(6)
6427 .n(n)
6428 .k(k)
6429 .cn_stride(11)
6430 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6431 }
6432 }
6433 }
6434
6435 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
6436 TEST_REQUIRES_ARM_NEON_FMA;
6437 for (uint32_t n = 16; n <= 24; n += 8) {
6438 for (size_t k = 1; k <= 40; k += 9) {
6439 GemmMicrokernelTester()
6440 .mr(6)
6441 .nr(8)
6442 .kr(1)
6443 .sr(1)
6444 .m(6)
6445 .n(n)
6446 .k(k)
6447 .a_stride(43)
6448 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6449 }
6450 }
6451 }
6452
6453 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
6454 TEST_REQUIRES_ARM_NEON_FMA;
6455 for (uint32_t n = 16; n <= 24; n += 8) {
6456 for (size_t k = 1; k <= 40; k += 9) {
6457 for (uint32_t m = 1; m <= 6; m++) {
6458 GemmMicrokernelTester()
6459 .mr(6)
6460 .nr(8)
6461 .kr(1)
6462 .sr(1)
6463 .m(m)
6464 .n(n)
6465 .k(k)
6466 .iterations(1)
6467 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6468 }
6469 }
6470 }
6471 }
6472
6473 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
6474 TEST_REQUIRES_ARM_NEON_FMA;
6475 for (size_t k = 1; k <= 40; k += 9) {
6476 for (uint32_t m = 1; m <= 6; m++) {
6477 for (uint32_t n = 1; n <= 8; n++) {
6478 GemmMicrokernelTester()
6479 .mr(6)
6480 .nr(8)
6481 .kr(1)
6482 .sr(1)
6483 .m(m)
6484 .n(n)
6485 .k(k)
6486 .cm_stride(11)
6487 .iterations(1)
6488 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6489 }
6490 }
6491 }
6492 }
6493
6494 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
6495 TEST_REQUIRES_ARM_NEON_FMA;
6496 GemmMicrokernelTester()
6497 .mr(6)
6498 .nr(8)
6499 .kr(1)
6500 .sr(1)
6501 .m(6)
6502 .n(8)
6503 .k(8)
6504 .qmin(128)
6505 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6506 }
6507
6508 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
6509 TEST_REQUIRES_ARM_NEON_FMA;
6510 GemmMicrokernelTester()
6511 .mr(6)
6512 .nr(8)
6513 .kr(1)
6514 .sr(1)
6515 .m(6)
6516 .n(8)
6517 .k(8)
6518 .qmax(128)
6519 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6520 }
6521
6522 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
6523 TEST_REQUIRES_ARM_NEON_FMA;
6524 GemmMicrokernelTester()
6525 .mr(6)
6526 .nr(8)
6527 .kr(1)
6528 .sr(1)
6529 .m(6)
6530 .n(8)
6531 .k(8)
6532 .cm_stride(11)
6533 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
6534 }
6535#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6536
6537
6538#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6539 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
6540 TEST_REQUIRES_ARM_NEON_FMA;
6541 GemmMicrokernelTester()
6542 .mr(6)
6543 .nr(8)
6544 .kr(1)
6545 .sr(1)
6546 .m(6)
6547 .n(8)
6548 .k(8)
6549 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6550 }
6551
6552 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
6553 TEST_REQUIRES_ARM_NEON_FMA;
6554 GemmMicrokernelTester()
6555 .mr(6)
6556 .nr(8)
6557 .kr(1)
6558 .sr(1)
6559 .m(6)
6560 .n(8)
6561 .k(8)
6562 .cn_stride(11)
6563 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6564 }
6565
6566 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
6567 TEST_REQUIRES_ARM_NEON_FMA;
6568 GemmMicrokernelTester()
6569 .mr(6)
6570 .nr(8)
6571 .kr(1)
6572 .sr(1)
6573 .m(6)
6574 .n(8)
6575 .k(8)
6576 .a_stride(11)
6577 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6578 }
6579
6580 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
6581 TEST_REQUIRES_ARM_NEON_FMA;
6582 for (uint32_t m = 1; m <= 6; m++) {
6583 for (uint32_t n = 1; n <= 8; n++) {
6584 GemmMicrokernelTester()
6585 .mr(6)
6586 .nr(8)
6587 .kr(1)
6588 .sr(1)
6589 .m(m)
6590 .n(n)
6591 .k(8)
6592 .iterations(1)
6593 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6594 }
6595 }
6596 }
6597
6598 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
6599 TEST_REQUIRES_ARM_NEON_FMA;
6600 for (uint32_t m = 1; m <= 6; m++) {
6601 GemmMicrokernelTester()
6602 .mr(6)
6603 .nr(8)
6604 .kr(1)
6605 .sr(1)
6606 .m(m)
6607 .n(8)
6608 .k(8)
6609 .iterations(1)
6610 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6611 }
6612 }
6613
6614 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
6615 TEST_REQUIRES_ARM_NEON_FMA;
6616 for (uint32_t n = 1; n <= 8; n++) {
6617 GemmMicrokernelTester()
6618 .mr(6)
6619 .nr(8)
6620 .kr(1)
6621 .sr(1)
6622 .m(6)
6623 .n(n)
6624 .k(8)
6625 .iterations(1)
6626 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6627 }
6628 }
6629
6630 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
6631 TEST_REQUIRES_ARM_NEON_FMA;
6632 GemmMicrokernelTester()
6633 .mr(6)
6634 .nr(8)
6635 .kr(1)
6636 .sr(1)
6637 .m(6)
6638 .n(8)
6639 .k(16)
6640 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6641 }
6642
6643 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
6644 TEST_REQUIRES_ARM_NEON_FMA;
6645 GemmMicrokernelTester()
6646 .mr(6)
6647 .nr(8)
6648 .kr(1)
6649 .sr(1)
6650 .m(6)
6651 .n(8)
6652 .k(16)
6653 .a_stride(19)
6654 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6655 }
6656
6657 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
6658 TEST_REQUIRES_ARM_NEON_FMA;
6659 for (uint32_t m = 1; m <= 6; m++) {
6660 for (uint32_t n = 1; n <= 8; n++) {
6661 GemmMicrokernelTester()
6662 .mr(6)
6663 .nr(8)
6664 .kr(1)
6665 .sr(1)
6666 .m(m)
6667 .n(n)
6668 .k(16)
6669 .iterations(1)
6670 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6671 }
6672 }
6673 }
6674
6675 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
6676 TEST_REQUIRES_ARM_NEON_FMA;
6677 for (size_t k = 1; k < 16; k++) {
6678 GemmMicrokernelTester()
6679 .mr(6)
6680 .nr(8)
6681 .kr(1)
6682 .sr(1)
6683 .m(6)
6684 .n(8)
6685 .k(k)
6686 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6687 }
6688 }
6689
6690 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
6691 TEST_REQUIRES_ARM_NEON_FMA;
6692 for (size_t k = 1; k < 16; k++) {
6693 GemmMicrokernelTester()
6694 .mr(6)
6695 .nr(8)
6696 .kr(1)
6697 .sr(1)
6698 .m(6)
6699 .n(8)
6700 .k(k)
6701 .a_stride(19)
6702 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6703 }
6704 }
6705
6706 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
6707 TEST_REQUIRES_ARM_NEON_FMA;
6708 for (size_t k = 1; k < 16; k++) {
6709 for (uint32_t m = 1; m <= 6; m++) {
6710 for (uint32_t n = 1; n <= 8; n++) {
6711 GemmMicrokernelTester()
6712 .mr(6)
6713 .nr(8)
6714 .kr(1)
6715 .sr(1)
6716 .m(m)
6717 .n(n)
6718 .k(k)
6719 .iterations(1)
6720 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6721 }
6722 }
6723 }
6724 }
6725
6726 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
6727 TEST_REQUIRES_ARM_NEON_FMA;
6728 for (size_t k = 17; k < 16; k++) {
6729 GemmMicrokernelTester()
6730 .mr(6)
6731 .nr(8)
6732 .kr(1)
6733 .sr(1)
6734 .m(6)
6735 .n(8)
6736 .k(k)
6737 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6738 }
6739 }
6740
6741 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
6742 TEST_REQUIRES_ARM_NEON_FMA;
6743 for (size_t k = 17; k < 16; k++) {
6744 GemmMicrokernelTester()
6745 .mr(6)
6746 .nr(8)
6747 .kr(1)
6748 .sr(1)
6749 .m(6)
6750 .n(8)
6751 .k(k)
6752 .a_stride(19)
6753 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6754 }
6755 }
6756
6757 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
6758 TEST_REQUIRES_ARM_NEON_FMA;
6759 for (size_t k = 17; k < 16; k++) {
6760 for (uint32_t m = 1; m <= 6; m++) {
6761 for (uint32_t n = 1; n <= 8; n++) {
6762 GemmMicrokernelTester()
6763 .mr(6)
6764 .nr(8)
6765 .kr(1)
6766 .sr(1)
6767 .m(m)
6768 .n(n)
6769 .k(k)
6770 .iterations(1)
6771 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6772 }
6773 }
6774 }
6775 }
6776
6777 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
6778 TEST_REQUIRES_ARM_NEON_FMA;
6779 for (size_t k = 24; k <= 80; k += 8) {
6780 GemmMicrokernelTester()
6781 .mr(6)
6782 .nr(8)
6783 .kr(1)
6784 .sr(1)
6785 .m(6)
6786 .n(8)
6787 .k(k)
6788 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6789 }
6790 }
6791
6792 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
6793 TEST_REQUIRES_ARM_NEON_FMA;
6794 for (size_t k = 24; k <= 80; k += 8) {
6795 GemmMicrokernelTester()
6796 .mr(6)
6797 .nr(8)
6798 .kr(1)
6799 .sr(1)
6800 .m(6)
6801 .n(8)
6802 .k(k)
6803 .a_stride(83)
6804 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6805 }
6806 }
6807
6808 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
6809 TEST_REQUIRES_ARM_NEON_FMA;
6810 for (size_t k = 24; k <= 80; k += 8) {
6811 for (uint32_t m = 1; m <= 6; m++) {
6812 for (uint32_t n = 1; n <= 8; n++) {
6813 GemmMicrokernelTester()
6814 .mr(6)
6815 .nr(8)
6816 .kr(1)
6817 .sr(1)
6818 .m(m)
6819 .n(n)
6820 .k(k)
6821 .iterations(1)
6822 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6823 }
6824 }
6825 }
6826 }
6827
6828 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
6829 TEST_REQUIRES_ARM_NEON_FMA;
6830 for (uint32_t n = 9; n < 16; n++) {
6831 for (size_t k = 1; k <= 40; k += 9) {
6832 GemmMicrokernelTester()
6833 .mr(6)
6834 .nr(8)
6835 .kr(1)
6836 .sr(1)
6837 .m(6)
6838 .n(8)
6839 .k(k)
6840 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6841 }
6842 }
6843 }
6844
6845 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
6846 TEST_REQUIRES_ARM_NEON_FMA;
6847 for (uint32_t n = 9; n < 16; n++) {
6848 for (size_t k = 1; k <= 40; k += 9) {
6849 GemmMicrokernelTester()
6850 .mr(6)
6851 .nr(8)
6852 .kr(1)
6853 .sr(1)
6854 .m(6)
6855 .n(8)
6856 .k(k)
6857 .cn_stride(11)
6858 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6859 }
6860 }
6861 }
6862
6863 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
6864 TEST_REQUIRES_ARM_NEON_FMA;
6865 for (uint32_t n = 9; n < 16; n++) {
6866 for (size_t k = 1; k <= 40; k += 9) {
6867 GemmMicrokernelTester()
6868 .mr(6)
6869 .nr(8)
6870 .kr(1)
6871 .sr(1)
6872 .m(6)
6873 .n(n)
6874 .k(k)
6875 .a_stride(43)
6876 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6877 }
6878 }
6879 }
6880
6881 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
6882 TEST_REQUIRES_ARM_NEON_FMA;
6883 for (uint32_t n = 9; n < 16; n++) {
6884 for (size_t k = 1; k <= 40; k += 9) {
6885 for (uint32_t m = 1; m <= 6; m++) {
6886 GemmMicrokernelTester()
6887 .mr(6)
6888 .nr(8)
6889 .kr(1)
6890 .sr(1)
6891 .m(m)
6892 .n(n)
6893 .k(k)
6894 .iterations(1)
6895 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6896 }
6897 }
6898 }
6899 }
6900
6901 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
6902 TEST_REQUIRES_ARM_NEON_FMA;
6903 for (uint32_t n = 16; n <= 24; n += 8) {
6904 for (size_t k = 1; k <= 40; k += 9) {
6905 GemmMicrokernelTester()
6906 .mr(6)
6907 .nr(8)
6908 .kr(1)
6909 .sr(1)
6910 .m(6)
6911 .n(8)
6912 .k(k)
6913 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6914 }
6915 }
6916 }
6917
6918 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
6919 TEST_REQUIRES_ARM_NEON_FMA;
6920 for (uint32_t n = 16; n <= 24; n += 8) {
6921 for (size_t k = 1; k <= 40; k += 9) {
6922 GemmMicrokernelTester()
6923 .mr(6)
6924 .nr(8)
6925 .kr(1)
6926 .sr(1)
6927 .m(6)
6928 .n(n)
6929 .k(k)
6930 .cn_stride(11)
6931 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6932 }
6933 }
6934 }
6935
6936 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
6937 TEST_REQUIRES_ARM_NEON_FMA;
6938 for (uint32_t n = 16; n <= 24; n += 8) {
6939 for (size_t k = 1; k <= 40; k += 9) {
6940 GemmMicrokernelTester()
6941 .mr(6)
6942 .nr(8)
6943 .kr(1)
6944 .sr(1)
6945 .m(6)
6946 .n(n)
6947 .k(k)
6948 .a_stride(43)
6949 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6950 }
6951 }
6952 }
6953
6954 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
6955 TEST_REQUIRES_ARM_NEON_FMA;
6956 for (uint32_t n = 16; n <= 24; n += 8) {
6957 for (size_t k = 1; k <= 40; k += 9) {
6958 for (uint32_t m = 1; m <= 6; m++) {
6959 GemmMicrokernelTester()
6960 .mr(6)
6961 .nr(8)
6962 .kr(1)
6963 .sr(1)
6964 .m(m)
6965 .n(n)
6966 .k(k)
6967 .iterations(1)
6968 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6969 }
6970 }
6971 }
6972 }
6973
6974 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
6975 TEST_REQUIRES_ARM_NEON_FMA;
6976 for (size_t k = 1; k <= 40; k += 9) {
6977 for (uint32_t m = 1; m <= 6; m++) {
6978 for (uint32_t n = 1; n <= 8; n++) {
6979 GemmMicrokernelTester()
6980 .mr(6)
6981 .nr(8)
6982 .kr(1)
6983 .sr(1)
6984 .m(m)
6985 .n(n)
6986 .k(k)
6987 .cm_stride(11)
6988 .iterations(1)
6989 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6990 }
6991 }
6992 }
6993 }
6994
6995 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
6996 TEST_REQUIRES_ARM_NEON_FMA;
6997 GemmMicrokernelTester()
6998 .mr(6)
6999 .nr(8)
7000 .kr(1)
7001 .sr(1)
7002 .m(6)
7003 .n(8)
7004 .k(8)
7005 .qmin(128)
7006 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7007 }
7008
7009 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
7010 TEST_REQUIRES_ARM_NEON_FMA;
7011 GemmMicrokernelTester()
7012 .mr(6)
7013 .nr(8)
7014 .kr(1)
7015 .sr(1)
7016 .m(6)
7017 .n(8)
7018 .k(8)
7019 .qmax(128)
7020 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7021 }
7022
7023 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
7024 TEST_REQUIRES_ARM_NEON_FMA;
7025 GemmMicrokernelTester()
7026 .mr(6)
7027 .nr(8)
7028 .kr(1)
7029 .sr(1)
7030 .m(6)
7031 .n(8)
7032 .k(8)
7033 .cm_stride(11)
7034 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
7035 }
7036#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7037
7038
7039#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7040 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8) {
7041 TEST_REQUIRES_ARM_NEON_FMA;
7042 GemmMicrokernelTester()
7043 .mr(6)
7044 .nr(8)
7045 .kr(1)
7046 .sr(1)
7047 .m(6)
7048 .n(8)
7049 .k(8)
7050 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7051 }
7052
7053 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, strided_cn) {
7054 TEST_REQUIRES_ARM_NEON_FMA;
7055 GemmMicrokernelTester()
7056 .mr(6)
7057 .nr(8)
7058 .kr(1)
7059 .sr(1)
7060 .m(6)
7061 .n(8)
7062 .k(8)
7063 .cn_stride(11)
7064 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7065 }
7066
7067 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_strided_a) {
7068 TEST_REQUIRES_ARM_NEON_FMA;
7069 GemmMicrokernelTester()
7070 .mr(6)
7071 .nr(8)
7072 .kr(1)
7073 .sr(1)
7074 .m(6)
7075 .n(8)
7076 .k(8)
7077 .a_stride(11)
7078 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7079 }
7080
7081 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile) {
7082 TEST_REQUIRES_ARM_NEON_FMA;
7083 for (uint32_t m = 1; m <= 6; m++) {
7084 for (uint32_t n = 1; n <= 8; n++) {
7085 GemmMicrokernelTester()
7086 .mr(6)
7087 .nr(8)
7088 .kr(1)
7089 .sr(1)
7090 .m(m)
7091 .n(n)
7092 .k(8)
7093 .iterations(1)
7094 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7095 }
7096 }
7097 }
7098
7099 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_m) {
7100 TEST_REQUIRES_ARM_NEON_FMA;
7101 for (uint32_t m = 1; m <= 6; m++) {
7102 GemmMicrokernelTester()
7103 .mr(6)
7104 .nr(8)
7105 .kr(1)
7106 .sr(1)
7107 .m(m)
7108 .n(8)
7109 .k(8)
7110 .iterations(1)
7111 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7112 }
7113 }
7114
7115 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_n) {
7116 TEST_REQUIRES_ARM_NEON_FMA;
7117 for (uint32_t n = 1; n <= 8; n++) {
7118 GemmMicrokernelTester()
7119 .mr(6)
7120 .nr(8)
7121 .kr(1)
7122 .sr(1)
7123 .m(6)
7124 .n(n)
7125 .k(8)
7126 .iterations(1)
7127 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7128 }
7129 }
7130
7131 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16) {
7132 TEST_REQUIRES_ARM_NEON_FMA;
7133 GemmMicrokernelTester()
7134 .mr(6)
7135 .nr(8)
7136 .kr(1)
7137 .sr(1)
7138 .m(6)
7139 .n(8)
7140 .k(16)
7141 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7142 }
7143
7144 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16_strided_a) {
7145 TEST_REQUIRES_ARM_NEON_FMA;
7146 GemmMicrokernelTester()
7147 .mr(6)
7148 .nr(8)
7149 .kr(1)
7150 .sr(1)
7151 .m(6)
7152 .n(8)
7153 .k(16)
7154 .a_stride(19)
7155 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7156 }
7157
7158 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16_subtile) {
7159 TEST_REQUIRES_ARM_NEON_FMA;
7160 for (uint32_t m = 1; m <= 6; m++) {
7161 for (uint32_t n = 1; n <= 8; n++) {
7162 GemmMicrokernelTester()
7163 .mr(6)
7164 .nr(8)
7165 .kr(1)
7166 .sr(1)
7167 .m(m)
7168 .n(n)
7169 .k(16)
7170 .iterations(1)
7171 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7172 }
7173 }
7174 }
7175
7176 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16) {
7177 TEST_REQUIRES_ARM_NEON_FMA;
7178 for (size_t k = 1; k < 16; k++) {
7179 GemmMicrokernelTester()
7180 .mr(6)
7181 .nr(8)
7182 .kr(1)
7183 .sr(1)
7184 .m(6)
7185 .n(8)
7186 .k(k)
7187 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7188 }
7189 }
7190
7191 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16_strided_a) {
7192 TEST_REQUIRES_ARM_NEON_FMA;
7193 for (size_t k = 1; k < 16; k++) {
7194 GemmMicrokernelTester()
7195 .mr(6)
7196 .nr(8)
7197 .kr(1)
7198 .sr(1)
7199 .m(6)
7200 .n(8)
7201 .k(k)
7202 .a_stride(19)
7203 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7204 }
7205 }
7206
7207 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16_subtile) {
7208 TEST_REQUIRES_ARM_NEON_FMA;
7209 for (size_t k = 1; k < 16; k++) {
7210 for (uint32_t m = 1; m <= 6; m++) {
7211 for (uint32_t n = 1; n <= 8; n++) {
7212 GemmMicrokernelTester()
7213 .mr(6)
7214 .nr(8)
7215 .kr(1)
7216 .sr(1)
7217 .m(m)
7218 .n(n)
7219 .k(k)
7220 .iterations(1)
7221 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7222 }
7223 }
7224 }
7225 }
7226
7227 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_16) {
7228 TEST_REQUIRES_ARM_NEON_FMA;
7229 for (size_t k = 17; k < 16; k++) {
7230 GemmMicrokernelTester()
7231 .mr(6)
7232 .nr(8)
7233 .kr(1)
7234 .sr(1)
7235 .m(6)
7236 .n(8)
7237 .k(k)
7238 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7239 }
7240 }
7241
7242 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_8_strided_a) {
7243 TEST_REQUIRES_ARM_NEON_FMA;
7244 for (size_t k = 17; k < 16; k++) {
7245 GemmMicrokernelTester()
7246 .mr(6)
7247 .nr(8)
7248 .kr(1)
7249 .sr(1)
7250 .m(6)
7251 .n(8)
7252 .k(k)
7253 .a_stride(19)
7254 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7255 }
7256 }
7257
7258 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_8_subtile) {
7259 TEST_REQUIRES_ARM_NEON_FMA;
7260 for (size_t k = 17; k < 16; k++) {
7261 for (uint32_t m = 1; m <= 6; m++) {
7262 for (uint32_t n = 1; n <= 8; n++) {
7263 GemmMicrokernelTester()
7264 .mr(6)
7265 .nr(8)
7266 .kr(1)
7267 .sr(1)
7268 .m(m)
7269 .n(n)
7270 .k(k)
7271 .iterations(1)
7272 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7273 }
7274 }
7275 }
7276 }
7277
7278 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8) {
7279 TEST_REQUIRES_ARM_NEON_FMA;
7280 for (size_t k = 24; k <= 80; k += 8) {
7281 GemmMicrokernelTester()
7282 .mr(6)
7283 .nr(8)
7284 .kr(1)
7285 .sr(1)
7286 .m(6)
7287 .n(8)
7288 .k(k)
7289 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7290 }
7291 }
7292
7293 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8_strided_a) {
7294 TEST_REQUIRES_ARM_NEON_FMA;
7295 for (size_t k = 24; k <= 80; k += 8) {
7296 GemmMicrokernelTester()
7297 .mr(6)
7298 .nr(8)
7299 .kr(1)
7300 .sr(1)
7301 .m(6)
7302 .n(8)
7303 .k(k)
7304 .a_stride(83)
7305 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7306 }
7307 }
7308
7309 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8_subtile) {
7310 TEST_REQUIRES_ARM_NEON_FMA;
7311 for (size_t k = 24; k <= 80; k += 8) {
7312 for (uint32_t m = 1; m <= 6; m++) {
7313 for (uint32_t n = 1; n <= 8; n++) {
7314 GemmMicrokernelTester()
7315 .mr(6)
7316 .nr(8)
7317 .kr(1)
7318 .sr(1)
7319 .m(m)
7320 .n(n)
7321 .k(k)
7322 .iterations(1)
7323 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7324 }
7325 }
7326 }
7327 }
7328
7329 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8) {
7330 TEST_REQUIRES_ARM_NEON_FMA;
7331 for (uint32_t n = 9; n < 16; n++) {
7332 for (size_t k = 1; k <= 40; k += 9) {
7333 GemmMicrokernelTester()
7334 .mr(6)
7335 .nr(8)
7336 .kr(1)
7337 .sr(1)
7338 .m(6)
7339 .n(8)
7340 .k(k)
7341 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7342 }
7343 }
7344 }
7345
7346 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_cn) {
7347 TEST_REQUIRES_ARM_NEON_FMA;
7348 for (uint32_t n = 9; n < 16; n++) {
7349 for (size_t k = 1; k <= 40; k += 9) {
7350 GemmMicrokernelTester()
7351 .mr(6)
7352 .nr(8)
7353 .kr(1)
7354 .sr(1)
7355 .m(6)
7356 .n(8)
7357 .k(k)
7358 .cn_stride(11)
7359 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7360 }
7361 }
7362 }
7363
7364 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_a) {
7365 TEST_REQUIRES_ARM_NEON_FMA;
7366 for (uint32_t n = 9; n < 16; n++) {
7367 for (size_t k = 1; k <= 40; k += 9) {
7368 GemmMicrokernelTester()
7369 .mr(6)
7370 .nr(8)
7371 .kr(1)
7372 .sr(1)
7373 .m(6)
7374 .n(n)
7375 .k(k)
7376 .a_stride(43)
7377 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7378 }
7379 }
7380 }
7381
7382 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_subtile) {
7383 TEST_REQUIRES_ARM_NEON_FMA;
7384 for (uint32_t n = 9; n < 16; n++) {
7385 for (size_t k = 1; k <= 40; k += 9) {
7386 for (uint32_t m = 1; m <= 6; m++) {
7387 GemmMicrokernelTester()
7388 .mr(6)
7389 .nr(8)
7390 .kr(1)
7391 .sr(1)
7392 .m(m)
7393 .n(n)
7394 .k(k)
7395 .iterations(1)
7396 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7397 }
7398 }
7399 }
7400 }
7401
7402 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8) {
7403 TEST_REQUIRES_ARM_NEON_FMA;
7404 for (uint32_t n = 16; n <= 24; n += 8) {
7405 for (size_t k = 1; k <= 40; k += 9) {
7406 GemmMicrokernelTester()
7407 .mr(6)
7408 .nr(8)
7409 .kr(1)
7410 .sr(1)
7411 .m(6)
7412 .n(8)
7413 .k(k)
7414 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7415 }
7416 }
7417 }
7418
7419 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_cn) {
7420 TEST_REQUIRES_ARM_NEON_FMA;
7421 for (uint32_t n = 16; n <= 24; n += 8) {
7422 for (size_t k = 1; k <= 40; k += 9) {
7423 GemmMicrokernelTester()
7424 .mr(6)
7425 .nr(8)
7426 .kr(1)
7427 .sr(1)
7428 .m(6)
7429 .n(n)
7430 .k(k)
7431 .cn_stride(11)
7432 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7433 }
7434 }
7435 }
7436
7437 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_a) {
7438 TEST_REQUIRES_ARM_NEON_FMA;
7439 for (uint32_t n = 16; n <= 24; n += 8) {
7440 for (size_t k = 1; k <= 40; k += 9) {
7441 GemmMicrokernelTester()
7442 .mr(6)
7443 .nr(8)
7444 .kr(1)
7445 .sr(1)
7446 .m(6)
7447 .n(n)
7448 .k(k)
7449 .a_stride(43)
7450 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7451 }
7452 }
7453 }
7454
7455 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_subtile) {
7456 TEST_REQUIRES_ARM_NEON_FMA;
7457 for (uint32_t n = 16; n <= 24; n += 8) {
7458 for (size_t k = 1; k <= 40; k += 9) {
7459 for (uint32_t m = 1; m <= 6; m++) {
7460 GemmMicrokernelTester()
7461 .mr(6)
7462 .nr(8)
7463 .kr(1)
7464 .sr(1)
7465 .m(m)
7466 .n(n)
7467 .k(k)
7468 .iterations(1)
7469 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7470 }
7471 }
7472 }
7473 }
7474
7475 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm_subtile) {
7476 TEST_REQUIRES_ARM_NEON_FMA;
7477 for (size_t k = 1; k <= 40; k += 9) {
7478 for (uint32_t m = 1; m <= 6; m++) {
7479 for (uint32_t n = 1; n <= 8; n++) {
7480 GemmMicrokernelTester()
7481 .mr(6)
7482 .nr(8)
7483 .kr(1)
7484 .sr(1)
7485 .m(m)
7486 .n(n)
7487 .k(k)
7488 .cm_stride(11)
7489 .iterations(1)
7490 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7491 }
7492 }
7493 }
7494 }
7495
7496 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, qmin) {
7497 TEST_REQUIRES_ARM_NEON_FMA;
7498 GemmMicrokernelTester()
7499 .mr(6)
7500 .nr(8)
7501 .kr(1)
7502 .sr(1)
7503 .m(6)
7504 .n(8)
7505 .k(8)
7506 .qmin(128)
7507 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7508 }
7509
7510 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, qmax) {
7511 TEST_REQUIRES_ARM_NEON_FMA;
7512 GemmMicrokernelTester()
7513 .mr(6)
7514 .nr(8)
7515 .kr(1)
7516 .sr(1)
7517 .m(6)
7518 .n(8)
7519 .k(8)
7520 .qmax(128)
7521 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7522 }
7523
7524 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm) {
7525 TEST_REQUIRES_ARM_NEON_FMA;
7526 GemmMicrokernelTester()
7527 .mr(6)
7528 .nr(8)
7529 .kr(1)
7530 .sr(1)
7531 .m(6)
7532 .n(8)
7533 .k(8)
7534 .cm_stride(11)
7535 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ios);
7536 }
7537#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7538
7539
7540#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7541 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
7542 TEST_REQUIRES_ARM_NEON_FMA;
7543 GemmMicrokernelTester()
7544 .mr(1)
7545 .nr(12)
7546 .kr(1)
7547 .sr(1)
7548 .m(1)
7549 .n(12)
7550 .k(4)
7551 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7552 }
7553
7554 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
7555 TEST_REQUIRES_ARM_NEON_FMA;
7556 GemmMicrokernelTester()
7557 .mr(1)
7558 .nr(12)
7559 .kr(1)
7560 .sr(1)
7561 .m(1)
7562 .n(12)
7563 .k(4)
7564 .cn_stride(17)
7565 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7566 }
7567
7568 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
7569 TEST_REQUIRES_ARM_NEON_FMA;
7570 GemmMicrokernelTester()
7571 .mr(1)
7572 .nr(12)
7573 .kr(1)
7574 .sr(1)
7575 .m(1)
7576 .n(12)
7577 .k(4)
7578 .a_stride(7)
7579 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7580 }
7581
7582 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
7583 TEST_REQUIRES_ARM_NEON_FMA;
7584 for (uint32_t m = 1; m <= 1; m++) {
7585 for (uint32_t n = 1; n <= 12; n++) {
7586 GemmMicrokernelTester()
7587 .mr(1)
7588 .nr(12)
7589 .kr(1)
7590 .sr(1)
7591 .m(m)
7592 .n(n)
7593 .k(4)
7594 .iterations(1)
7595 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7596 }
7597 }
7598 }
7599
7600 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
7601 TEST_REQUIRES_ARM_NEON_FMA;
7602 for (uint32_t m = 1; m <= 1; m++) {
7603 GemmMicrokernelTester()
7604 .mr(1)
7605 .nr(12)
7606 .kr(1)
7607 .sr(1)
7608 .m(m)
7609 .n(12)
7610 .k(4)
7611 .iterations(1)
7612 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7613 }
7614 }
7615
7616 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
7617 TEST_REQUIRES_ARM_NEON_FMA;
7618 for (uint32_t n = 1; n <= 12; n++) {
7619 GemmMicrokernelTester()
7620 .mr(1)
7621 .nr(12)
7622 .kr(1)
7623 .sr(1)
7624 .m(1)
7625 .n(n)
7626 .k(4)
7627 .iterations(1)
7628 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7629 }
7630 }
7631
7632 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
7633 TEST_REQUIRES_ARM_NEON_FMA;
7634 GemmMicrokernelTester()
7635 .mr(1)
7636 .nr(12)
7637 .kr(1)
7638 .sr(1)
7639 .m(1)
7640 .n(12)
7641 .k(8)
7642 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7643 }
7644
7645 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
7646 TEST_REQUIRES_ARM_NEON_FMA;
7647 GemmMicrokernelTester()
7648 .mr(1)
7649 .nr(12)
7650 .kr(1)
7651 .sr(1)
7652 .m(1)
7653 .n(12)
7654 .k(8)
7655 .a_stride(11)
7656 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7657 }
7658
7659 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
7660 TEST_REQUIRES_ARM_NEON_FMA;
7661 for (uint32_t m = 1; m <= 1; m++) {
7662 for (uint32_t n = 1; n <= 12; n++) {
7663 GemmMicrokernelTester()
7664 .mr(1)
7665 .nr(12)
7666 .kr(1)
7667 .sr(1)
7668 .m(m)
7669 .n(n)
7670 .k(8)
7671 .iterations(1)
7672 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7673 }
7674 }
7675 }
7676
7677 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
7678 TEST_REQUIRES_ARM_NEON_FMA;
7679 for (size_t k = 1; k < 8; k++) {
7680 GemmMicrokernelTester()
7681 .mr(1)
7682 .nr(12)
7683 .kr(1)
7684 .sr(1)
7685 .m(1)
7686 .n(12)
7687 .k(k)
7688 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7689 }
7690 }
7691
7692 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
7693 TEST_REQUIRES_ARM_NEON_FMA;
7694 for (size_t k = 1; k < 8; k++) {
7695 GemmMicrokernelTester()
7696 .mr(1)
7697 .nr(12)
7698 .kr(1)
7699 .sr(1)
7700 .m(1)
7701 .n(12)
7702 .k(k)
7703 .a_stride(11)
7704 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7705 }
7706 }
7707
7708 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
7709 TEST_REQUIRES_ARM_NEON_FMA;
7710 for (size_t k = 1; k < 8; k++) {
7711 for (uint32_t m = 1; m <= 1; m++) {
7712 for (uint32_t n = 1; n <= 12; n++) {
7713 GemmMicrokernelTester()
7714 .mr(1)
7715 .nr(12)
7716 .kr(1)
7717 .sr(1)
7718 .m(m)
7719 .n(n)
7720 .k(k)
7721 .iterations(1)
7722 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7723 }
7724 }
7725 }
7726 }
7727
7728 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
7729 TEST_REQUIRES_ARM_NEON_FMA;
7730 for (size_t k = 9; k < 8; k++) {
7731 GemmMicrokernelTester()
7732 .mr(1)
7733 .nr(12)
7734 .kr(1)
7735 .sr(1)
7736 .m(1)
7737 .n(12)
7738 .k(k)
7739 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7740 }
7741 }
7742
7743 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
7744 TEST_REQUIRES_ARM_NEON_FMA;
7745 for (size_t k = 9; k < 8; k++) {
7746 GemmMicrokernelTester()
7747 .mr(1)
7748 .nr(12)
7749 .kr(1)
7750 .sr(1)
7751 .m(1)
7752 .n(12)
7753 .k(k)
7754 .a_stride(11)
7755 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7756 }
7757 }
7758
7759 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
7760 TEST_REQUIRES_ARM_NEON_FMA;
7761 for (size_t k = 9; k < 8; k++) {
7762 for (uint32_t m = 1; m <= 1; m++) {
7763 for (uint32_t n = 1; n <= 12; n++) {
7764 GemmMicrokernelTester()
7765 .mr(1)
7766 .nr(12)
7767 .kr(1)
7768 .sr(1)
7769 .m(m)
7770 .n(n)
7771 .k(k)
7772 .iterations(1)
7773 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7774 }
7775 }
7776 }
7777 }
7778
7779 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
7780 TEST_REQUIRES_ARM_NEON_FMA;
7781 for (size_t k = 12; k <= 40; k += 4) {
7782 GemmMicrokernelTester()
7783 .mr(1)
7784 .nr(12)
7785 .kr(1)
7786 .sr(1)
7787 .m(1)
7788 .n(12)
7789 .k(k)
7790 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7791 }
7792 }
7793
7794 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
7795 TEST_REQUIRES_ARM_NEON_FMA;
7796 for (size_t k = 12; k <= 40; k += 4) {
7797 GemmMicrokernelTester()
7798 .mr(1)
7799 .nr(12)
7800 .kr(1)
7801 .sr(1)
7802 .m(1)
7803 .n(12)
7804 .k(k)
7805 .a_stride(43)
7806 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7807 }
7808 }
7809
7810 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
7811 TEST_REQUIRES_ARM_NEON_FMA;
7812 for (size_t k = 12; k <= 40; k += 4) {
7813 for (uint32_t m = 1; m <= 1; m++) {
7814 for (uint32_t n = 1; n <= 12; n++) {
7815 GemmMicrokernelTester()
7816 .mr(1)
7817 .nr(12)
7818 .kr(1)
7819 .sr(1)
7820 .m(m)
7821 .n(n)
7822 .k(k)
7823 .iterations(1)
7824 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7825 }
7826 }
7827 }
7828 }
7829
7830 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
7831 TEST_REQUIRES_ARM_NEON_FMA;
7832 for (uint32_t n = 13; n < 24; n++) {
7833 for (size_t k = 1; k <= 20; k += 5) {
7834 GemmMicrokernelTester()
7835 .mr(1)
7836 .nr(12)
7837 .kr(1)
7838 .sr(1)
7839 .m(1)
7840 .n(12)
7841 .k(k)
7842 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7843 }
7844 }
7845 }
7846
7847 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
7848 TEST_REQUIRES_ARM_NEON_FMA;
7849 for (uint32_t n = 13; n < 24; n++) {
7850 for (size_t k = 1; k <= 20; k += 5) {
7851 GemmMicrokernelTester()
7852 .mr(1)
7853 .nr(12)
7854 .kr(1)
7855 .sr(1)
7856 .m(1)
7857 .n(12)
7858 .k(k)
7859 .cn_stride(17)
7860 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7861 }
7862 }
7863 }
7864
7865 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
7866 TEST_REQUIRES_ARM_NEON_FMA;
7867 for (uint32_t n = 13; n < 24; n++) {
7868 for (size_t k = 1; k <= 20; k += 5) {
7869 GemmMicrokernelTester()
7870 .mr(1)
7871 .nr(12)
7872 .kr(1)
7873 .sr(1)
7874 .m(1)
7875 .n(n)
7876 .k(k)
7877 .a_stride(23)
7878 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7879 }
7880 }
7881 }
7882
7883 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
7884 TEST_REQUIRES_ARM_NEON_FMA;
7885 for (uint32_t n = 13; n < 24; n++) {
7886 for (size_t k = 1; k <= 20; k += 5) {
7887 for (uint32_t m = 1; m <= 1; m++) {
7888 GemmMicrokernelTester()
7889 .mr(1)
7890 .nr(12)
7891 .kr(1)
7892 .sr(1)
7893 .m(m)
7894 .n(n)
7895 .k(k)
7896 .iterations(1)
7897 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7898 }
7899 }
7900 }
7901 }
7902
7903 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
7904 TEST_REQUIRES_ARM_NEON_FMA;
7905 for (uint32_t n = 24; n <= 36; n += 12) {
7906 for (size_t k = 1; k <= 20; k += 5) {
7907 GemmMicrokernelTester()
7908 .mr(1)
7909 .nr(12)
7910 .kr(1)
7911 .sr(1)
7912 .m(1)
7913 .n(12)
7914 .k(k)
7915 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7916 }
7917 }
7918 }
7919
7920 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
7921 TEST_REQUIRES_ARM_NEON_FMA;
7922 for (uint32_t n = 24; n <= 36; n += 12) {
7923 for (size_t k = 1; k <= 20; k += 5) {
7924 GemmMicrokernelTester()
7925 .mr(1)
7926 .nr(12)
7927 .kr(1)
7928 .sr(1)
7929 .m(1)
7930 .n(n)
7931 .k(k)
7932 .cn_stride(17)
7933 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7934 }
7935 }
7936 }
7937
7938 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
7939 TEST_REQUIRES_ARM_NEON_FMA;
7940 for (uint32_t n = 24; n <= 36; n += 12) {
7941 for (size_t k = 1; k <= 20; k += 5) {
7942 GemmMicrokernelTester()
7943 .mr(1)
7944 .nr(12)
7945 .kr(1)
7946 .sr(1)
7947 .m(1)
7948 .n(n)
7949 .k(k)
7950 .a_stride(23)
7951 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7952 }
7953 }
7954 }
7955
7956 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
7957 TEST_REQUIRES_ARM_NEON_FMA;
7958 for (uint32_t n = 24; n <= 36; n += 12) {
7959 for (size_t k = 1; k <= 20; k += 5) {
7960 for (uint32_t m = 1; m <= 1; m++) {
7961 GemmMicrokernelTester()
7962 .mr(1)
7963 .nr(12)
7964 .kr(1)
7965 .sr(1)
7966 .m(m)
7967 .n(n)
7968 .k(k)
7969 .iterations(1)
7970 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7971 }
7972 }
7973 }
7974 }
7975
7976 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
7977 TEST_REQUIRES_ARM_NEON_FMA;
7978 for (size_t k = 1; k <= 20; k += 5) {
7979 for (uint32_t m = 1; m <= 1; m++) {
7980 for (uint32_t n = 1; n <= 12; n++) {
7981 GemmMicrokernelTester()
7982 .mr(1)
7983 .nr(12)
7984 .kr(1)
7985 .sr(1)
7986 .m(m)
7987 .n(n)
7988 .k(k)
7989 .cm_stride(17)
7990 .iterations(1)
7991 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
7992 }
7993 }
7994 }
7995 }
7996
7997 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
7998 TEST_REQUIRES_ARM_NEON_FMA;
7999 GemmMicrokernelTester()
8000 .mr(1)
8001 .nr(12)
8002 .kr(1)
8003 .sr(1)
8004 .m(1)
8005 .n(12)
8006 .k(4)
8007 .qmin(128)
8008 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
8009 }
8010
8011 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
8012 TEST_REQUIRES_ARM_NEON_FMA;
8013 GemmMicrokernelTester()
8014 .mr(1)
8015 .nr(12)
8016 .kr(1)
8017 .sr(1)
8018 .m(1)
8019 .n(12)
8020 .k(4)
8021 .qmax(128)
8022 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
8023 }
8024
8025 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
8026 TEST_REQUIRES_ARM_NEON_FMA;
8027 GemmMicrokernelTester()
8028 .mr(1)
8029 .nr(12)
8030 .kr(1)
8031 .sr(1)
8032 .m(1)
8033 .n(12)
8034 .k(4)
8035 .cm_stride(17)
8036 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
8037 }
8038#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8039
8040
8041#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8042 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
8043 TEST_REQUIRES_ARM_NEON_FMA;
8044 GemmMicrokernelTester()
8045 .mr(4)
8046 .nr(12)
8047 .kr(1)
8048 .sr(1)
8049 .m(4)
8050 .n(12)
8051 .k(4)
8052 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8053 }
8054
8055 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
8056 TEST_REQUIRES_ARM_NEON_FMA;
8057 GemmMicrokernelTester()
8058 .mr(4)
8059 .nr(12)
8060 .kr(1)
8061 .sr(1)
8062 .m(4)
8063 .n(12)
8064 .k(4)
8065 .cn_stride(17)
8066 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8067 }
8068
8069 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
8070 TEST_REQUIRES_ARM_NEON_FMA;
8071 GemmMicrokernelTester()
8072 .mr(4)
8073 .nr(12)
8074 .kr(1)
8075 .sr(1)
8076 .m(4)
8077 .n(12)
8078 .k(4)
8079 .a_stride(7)
8080 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8081 }
8082
8083 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
8084 TEST_REQUIRES_ARM_NEON_FMA;
8085 for (uint32_t m = 1; m <= 4; m++) {
8086 for (uint32_t n = 1; n <= 12; n++) {
8087 GemmMicrokernelTester()
8088 .mr(4)
8089 .nr(12)
8090 .kr(1)
8091 .sr(1)
8092 .m(m)
8093 .n(n)
8094 .k(4)
8095 .iterations(1)
8096 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8097 }
8098 }
8099 }
8100
8101 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
8102 TEST_REQUIRES_ARM_NEON_FMA;
8103 for (uint32_t m = 1; m <= 4; m++) {
8104 GemmMicrokernelTester()
8105 .mr(4)
8106 .nr(12)
8107 .kr(1)
8108 .sr(1)
8109 .m(m)
8110 .n(12)
8111 .k(4)
8112 .iterations(1)
8113 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8114 }
8115 }
8116
8117 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
8118 TEST_REQUIRES_ARM_NEON_FMA;
8119 for (uint32_t n = 1; n <= 12; n++) {
8120 GemmMicrokernelTester()
8121 .mr(4)
8122 .nr(12)
8123 .kr(1)
8124 .sr(1)
8125 .m(4)
8126 .n(n)
8127 .k(4)
8128 .iterations(1)
8129 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8130 }
8131 }
8132
8133 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
8134 TEST_REQUIRES_ARM_NEON_FMA;
8135 GemmMicrokernelTester()
8136 .mr(4)
8137 .nr(12)
8138 .kr(1)
8139 .sr(1)
8140 .m(4)
8141 .n(12)
8142 .k(8)
8143 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8144 }
8145
8146 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
8147 TEST_REQUIRES_ARM_NEON_FMA;
8148 GemmMicrokernelTester()
8149 .mr(4)
8150 .nr(12)
8151 .kr(1)
8152 .sr(1)
8153 .m(4)
8154 .n(12)
8155 .k(8)
8156 .a_stride(11)
8157 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8158 }
8159
8160 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
8161 TEST_REQUIRES_ARM_NEON_FMA;
8162 for (uint32_t m = 1; m <= 4; m++) {
8163 for (uint32_t n = 1; n <= 12; n++) {
8164 GemmMicrokernelTester()
8165 .mr(4)
8166 .nr(12)
8167 .kr(1)
8168 .sr(1)
8169 .m(m)
8170 .n(n)
8171 .k(8)
8172 .iterations(1)
8173 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8174 }
8175 }
8176 }
8177
8178 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
8179 TEST_REQUIRES_ARM_NEON_FMA;
8180 for (size_t k = 1; k < 8; k++) {
8181 GemmMicrokernelTester()
8182 .mr(4)
8183 .nr(12)
8184 .kr(1)
8185 .sr(1)
8186 .m(4)
8187 .n(12)
8188 .k(k)
8189 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8190 }
8191 }
8192
8193 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
8194 TEST_REQUIRES_ARM_NEON_FMA;
8195 for (size_t k = 1; k < 8; k++) {
8196 GemmMicrokernelTester()
8197 .mr(4)
8198 .nr(12)
8199 .kr(1)
8200 .sr(1)
8201 .m(4)
8202 .n(12)
8203 .k(k)
8204 .a_stride(11)
8205 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8206 }
8207 }
8208
8209 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
8210 TEST_REQUIRES_ARM_NEON_FMA;
8211 for (size_t k = 1; k < 8; k++) {
8212 for (uint32_t m = 1; m <= 4; m++) {
8213 for (uint32_t n = 1; n <= 12; n++) {
8214 GemmMicrokernelTester()
8215 .mr(4)
8216 .nr(12)
8217 .kr(1)
8218 .sr(1)
8219 .m(m)
8220 .n(n)
8221 .k(k)
8222 .iterations(1)
8223 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8224 }
8225 }
8226 }
8227 }
8228
8229 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
8230 TEST_REQUIRES_ARM_NEON_FMA;
8231 for (size_t k = 9; k < 8; k++) {
8232 GemmMicrokernelTester()
8233 .mr(4)
8234 .nr(12)
8235 .kr(1)
8236 .sr(1)
8237 .m(4)
8238 .n(12)
8239 .k(k)
8240 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8241 }
8242 }
8243
8244 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
8245 TEST_REQUIRES_ARM_NEON_FMA;
8246 for (size_t k = 9; k < 8; k++) {
8247 GemmMicrokernelTester()
8248 .mr(4)
8249 .nr(12)
8250 .kr(1)
8251 .sr(1)
8252 .m(4)
8253 .n(12)
8254 .k(k)
8255 .a_stride(11)
8256 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8257 }
8258 }
8259
8260 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
8261 TEST_REQUIRES_ARM_NEON_FMA;
8262 for (size_t k = 9; k < 8; k++) {
8263 for (uint32_t m = 1; m <= 4; m++) {
8264 for (uint32_t n = 1; n <= 12; n++) {
8265 GemmMicrokernelTester()
8266 .mr(4)
8267 .nr(12)
8268 .kr(1)
8269 .sr(1)
8270 .m(m)
8271 .n(n)
8272 .k(k)
8273 .iterations(1)
8274 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8275 }
8276 }
8277 }
8278 }
8279
8280 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
8281 TEST_REQUIRES_ARM_NEON_FMA;
8282 for (size_t k = 12; k <= 40; k += 4) {
8283 GemmMicrokernelTester()
8284 .mr(4)
8285 .nr(12)
8286 .kr(1)
8287 .sr(1)
8288 .m(4)
8289 .n(12)
8290 .k(k)
8291 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8292 }
8293 }
8294
8295 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
8296 TEST_REQUIRES_ARM_NEON_FMA;
8297 for (size_t k = 12; k <= 40; k += 4) {
8298 GemmMicrokernelTester()
8299 .mr(4)
8300 .nr(12)
8301 .kr(1)
8302 .sr(1)
8303 .m(4)
8304 .n(12)
8305 .k(k)
8306 .a_stride(43)
8307 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8308 }
8309 }
8310
8311 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
8312 TEST_REQUIRES_ARM_NEON_FMA;
8313 for (size_t k = 12; k <= 40; k += 4) {
8314 for (uint32_t m = 1; m <= 4; m++) {
8315 for (uint32_t n = 1; n <= 12; n++) {
8316 GemmMicrokernelTester()
8317 .mr(4)
8318 .nr(12)
8319 .kr(1)
8320 .sr(1)
8321 .m(m)
8322 .n(n)
8323 .k(k)
8324 .iterations(1)
8325 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8326 }
8327 }
8328 }
8329 }
8330
8331 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
8332 TEST_REQUIRES_ARM_NEON_FMA;
8333 for (uint32_t n = 13; n < 24; n++) {
8334 for (size_t k = 1; k <= 20; k += 5) {
8335 GemmMicrokernelTester()
8336 .mr(4)
8337 .nr(12)
8338 .kr(1)
8339 .sr(1)
8340 .m(4)
8341 .n(12)
8342 .k(k)
8343 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8344 }
8345 }
8346 }
8347
8348 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
8349 TEST_REQUIRES_ARM_NEON_FMA;
8350 for (uint32_t n = 13; n < 24; n++) {
8351 for (size_t k = 1; k <= 20; k += 5) {
8352 GemmMicrokernelTester()
8353 .mr(4)
8354 .nr(12)
8355 .kr(1)
8356 .sr(1)
8357 .m(4)
8358 .n(12)
8359 .k(k)
8360 .cn_stride(17)
8361 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8362 }
8363 }
8364 }
8365
8366 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
8367 TEST_REQUIRES_ARM_NEON_FMA;
8368 for (uint32_t n = 13; n < 24; n++) {
8369 for (size_t k = 1; k <= 20; k += 5) {
8370 GemmMicrokernelTester()
8371 .mr(4)
8372 .nr(12)
8373 .kr(1)
8374 .sr(1)
8375 .m(4)
8376 .n(n)
8377 .k(k)
8378 .a_stride(23)
8379 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8380 }
8381 }
8382 }
8383
8384 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
8385 TEST_REQUIRES_ARM_NEON_FMA;
8386 for (uint32_t n = 13; n < 24; n++) {
8387 for (size_t k = 1; k <= 20; k += 5) {
8388 for (uint32_t m = 1; m <= 4; m++) {
8389 GemmMicrokernelTester()
8390 .mr(4)
8391 .nr(12)
8392 .kr(1)
8393 .sr(1)
8394 .m(m)
8395 .n(n)
8396 .k(k)
8397 .iterations(1)
8398 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8399 }
8400 }
8401 }
8402 }
8403
8404 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
8405 TEST_REQUIRES_ARM_NEON_FMA;
8406 for (uint32_t n = 24; n <= 36; n += 12) {
8407 for (size_t k = 1; k <= 20; k += 5) {
8408 GemmMicrokernelTester()
8409 .mr(4)
8410 .nr(12)
8411 .kr(1)
8412 .sr(1)
8413 .m(4)
8414 .n(12)
8415 .k(k)
8416 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8417 }
8418 }
8419 }
8420
8421 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
8422 TEST_REQUIRES_ARM_NEON_FMA;
8423 for (uint32_t n = 24; n <= 36; n += 12) {
8424 for (size_t k = 1; k <= 20; k += 5) {
8425 GemmMicrokernelTester()
8426 .mr(4)
8427 .nr(12)
8428 .kr(1)
8429 .sr(1)
8430 .m(4)
8431 .n(n)
8432 .k(k)
8433 .cn_stride(17)
8434 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8435 }
8436 }
8437 }
8438
8439 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
8440 TEST_REQUIRES_ARM_NEON_FMA;
8441 for (uint32_t n = 24; n <= 36; n += 12) {
8442 for (size_t k = 1; k <= 20; k += 5) {
8443 GemmMicrokernelTester()
8444 .mr(4)
8445 .nr(12)
8446 .kr(1)
8447 .sr(1)
8448 .m(4)
8449 .n(n)
8450 .k(k)
8451 .a_stride(23)
8452 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8453 }
8454 }
8455 }
8456
8457 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
8458 TEST_REQUIRES_ARM_NEON_FMA;
8459 for (uint32_t n = 24; n <= 36; n += 12) {
8460 for (size_t k = 1; k <= 20; k += 5) {
8461 for (uint32_t m = 1; m <= 4; m++) {
8462 GemmMicrokernelTester()
8463 .mr(4)
8464 .nr(12)
8465 .kr(1)
8466 .sr(1)
8467 .m(m)
8468 .n(n)
8469 .k(k)
8470 .iterations(1)
8471 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8472 }
8473 }
8474 }
8475 }
8476
8477 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
8478 TEST_REQUIRES_ARM_NEON_FMA;
8479 for (size_t k = 1; k <= 20; k += 5) {
8480 for (uint32_t m = 1; m <= 4; m++) {
8481 for (uint32_t n = 1; n <= 12; n++) {
8482 GemmMicrokernelTester()
8483 .mr(4)
8484 .nr(12)
8485 .kr(1)
8486 .sr(1)
8487 .m(m)
8488 .n(n)
8489 .k(k)
8490 .cm_stride(17)
8491 .iterations(1)
8492 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8493 }
8494 }
8495 }
8496 }
8497
8498 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
8499 TEST_REQUIRES_ARM_NEON_FMA;
8500 GemmMicrokernelTester()
8501 .mr(4)
8502 .nr(12)
8503 .kr(1)
8504 .sr(1)
8505 .m(4)
8506 .n(12)
8507 .k(4)
8508 .qmin(128)
8509 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8510 }
8511
8512 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
8513 TEST_REQUIRES_ARM_NEON_FMA;
8514 GemmMicrokernelTester()
8515 .mr(4)
8516 .nr(12)
8517 .kr(1)
8518 .sr(1)
8519 .m(4)
8520 .n(12)
8521 .k(4)
8522 .qmax(128)
8523 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8524 }
8525
8526 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
8527 TEST_REQUIRES_ARM_NEON_FMA;
8528 GemmMicrokernelTester()
8529 .mr(4)
8530 .nr(12)
8531 .kr(1)
8532 .sr(1)
8533 .m(4)
8534 .n(12)
8535 .k(4)
8536 .cm_stride(17)
8537 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
8538 }
8539#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8540
8541
8542#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8543 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2) {
8544 TEST_REQUIRES_ARM_NEON_FMA;
8545 GemmMicrokernelTester()
8546 .mr(4)
8547 .nr(8)
8548 .kr(1)
8549 .sr(1)
8550 .m(4)
8551 .n(8)
8552 .k(2)
8553 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8554 }
8555
8556 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, strided_cn) {
8557 TEST_REQUIRES_ARM_NEON_FMA;
8558 GemmMicrokernelTester()
8559 .mr(4)
8560 .nr(8)
8561 .kr(1)
8562 .sr(1)
8563 .m(4)
8564 .n(8)
8565 .k(2)
8566 .cn_stride(11)
8567 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8568 }
8569
8570 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
8571 TEST_REQUIRES_ARM_NEON_FMA;
8572 GemmMicrokernelTester()
8573 .mr(4)
8574 .nr(8)
8575 .kr(1)
8576 .sr(1)
8577 .m(4)
8578 .n(8)
8579 .k(2)
8580 .a_stride(5)
8581 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8582 }
8583
8584 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
8585 TEST_REQUIRES_ARM_NEON_FMA;
8586 for (uint32_t m = 1; m <= 4; m++) {
8587 for (uint32_t n = 1; n <= 8; n++) {
8588 GemmMicrokernelTester()
8589 .mr(4)
8590 .nr(8)
8591 .kr(1)
8592 .sr(1)
8593 .m(m)
8594 .n(n)
8595 .k(2)
8596 .iterations(1)
8597 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8598 }
8599 }
8600 }
8601
8602 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
8603 TEST_REQUIRES_ARM_NEON_FMA;
8604 for (uint32_t m = 1; m <= 4; m++) {
8605 GemmMicrokernelTester()
8606 .mr(4)
8607 .nr(8)
8608 .kr(1)
8609 .sr(1)
8610 .m(m)
8611 .n(8)
8612 .k(2)
8613 .iterations(1)
8614 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8615 }
8616 }
8617
8618 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
8619 TEST_REQUIRES_ARM_NEON_FMA;
8620 for (uint32_t n = 1; n <= 8; n++) {
8621 GemmMicrokernelTester()
8622 .mr(4)
8623 .nr(8)
8624 .kr(1)
8625 .sr(1)
8626 .m(4)
8627 .n(n)
8628 .k(2)
8629 .iterations(1)
8630 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8631 }
8632 }
8633
8634 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_lt_2) {
8635 TEST_REQUIRES_ARM_NEON_FMA;
8636 for (size_t k = 1; k < 2; k++) {
8637 GemmMicrokernelTester()
8638 .mr(4)
8639 .nr(8)
8640 .kr(1)
8641 .sr(1)
8642 .m(4)
8643 .n(8)
8644 .k(k)
8645 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8646 }
8647 }
8648
8649 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
8650 TEST_REQUIRES_ARM_NEON_FMA;
8651 for (size_t k = 1; k < 2; k++) {
8652 GemmMicrokernelTester()
8653 .mr(4)
8654 .nr(8)
8655 .kr(1)
8656 .sr(1)
8657 .m(4)
8658 .n(8)
8659 .k(k)
8660 .a_stride(5)
8661 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8662 }
8663 }
8664
8665 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
8666 TEST_REQUIRES_ARM_NEON_FMA;
8667 for (size_t k = 1; k < 2; k++) {
8668 for (uint32_t m = 1; m <= 4; m++) {
8669 for (uint32_t n = 1; n <= 8; n++) {
8670 GemmMicrokernelTester()
8671 .mr(4)
8672 .nr(8)
8673 .kr(1)
8674 .sr(1)
8675 .m(m)
8676 .n(n)
8677 .k(k)
8678 .iterations(1)
8679 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8680 }
8681 }
8682 }
8683 }
8684
8685 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_gt_2) {
8686 TEST_REQUIRES_ARM_NEON_FMA;
8687 for (size_t k = 3; k < 4; k++) {
8688 GemmMicrokernelTester()
8689 .mr(4)
8690 .nr(8)
8691 .kr(1)
8692 .sr(1)
8693 .m(4)
8694 .n(8)
8695 .k(k)
8696 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8697 }
8698 }
8699
8700 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
8701 TEST_REQUIRES_ARM_NEON_FMA;
8702 for (size_t k = 3; k < 4; k++) {
8703 GemmMicrokernelTester()
8704 .mr(4)
8705 .nr(8)
8706 .kr(1)
8707 .sr(1)
8708 .m(4)
8709 .n(8)
8710 .k(k)
8711 .a_stride(7)
8712 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8713 }
8714 }
8715
8716 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
8717 TEST_REQUIRES_ARM_NEON_FMA;
8718 for (size_t k = 3; k < 4; k++) {
8719 for (uint32_t m = 1; m <= 4; m++) {
8720 for (uint32_t n = 1; n <= 8; n++) {
8721 GemmMicrokernelTester()
8722 .mr(4)
8723 .nr(8)
8724 .kr(1)
8725 .sr(1)
8726 .m(m)
8727 .n(n)
8728 .k(k)
8729 .iterations(1)
8730 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8731 }
8732 }
8733 }
8734 }
8735
8736 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_div_2) {
8737 TEST_REQUIRES_ARM_NEON_FMA;
8738 for (size_t k = 4; k <= 20; k += 2) {
8739 GemmMicrokernelTester()
8740 .mr(4)
8741 .nr(8)
8742 .kr(1)
8743 .sr(1)
8744 .m(4)
8745 .n(8)
8746 .k(k)
8747 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8748 }
8749 }
8750
8751 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
8752 TEST_REQUIRES_ARM_NEON_FMA;
8753 for (size_t k = 4; k <= 20; k += 2) {
8754 GemmMicrokernelTester()
8755 .mr(4)
8756 .nr(8)
8757 .kr(1)
8758 .sr(1)
8759 .m(4)
8760 .n(8)
8761 .k(k)
8762 .a_stride(23)
8763 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8764 }
8765 }
8766
8767 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
8768 TEST_REQUIRES_ARM_NEON_FMA;
8769 for (size_t k = 4; k <= 20; k += 2) {
8770 for (uint32_t m = 1; m <= 4; m++) {
8771 for (uint32_t n = 1; n <= 8; n++) {
8772 GemmMicrokernelTester()
8773 .mr(4)
8774 .nr(8)
8775 .kr(1)
8776 .sr(1)
8777 .m(m)
8778 .n(n)
8779 .k(k)
8780 .iterations(1)
8781 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8782 }
8783 }
8784 }
8785 }
8786
8787 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8) {
8788 TEST_REQUIRES_ARM_NEON_FMA;
8789 for (uint32_t n = 9; n < 16; n++) {
8790 for (size_t k = 1; k <= 10; k += 3) {
8791 GemmMicrokernelTester()
8792 .mr(4)
8793 .nr(8)
8794 .kr(1)
8795 .sr(1)
8796 .m(4)
8797 .n(8)
8798 .k(k)
8799 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8800 }
8801 }
8802 }
8803
8804 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
8805 TEST_REQUIRES_ARM_NEON_FMA;
8806 for (uint32_t n = 9; n < 16; n++) {
8807 for (size_t k = 1; k <= 10; k += 3) {
8808 GemmMicrokernelTester()
8809 .mr(4)
8810 .nr(8)
8811 .kr(1)
8812 .sr(1)
8813 .m(4)
8814 .n(8)
8815 .k(k)
8816 .cn_stride(11)
8817 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8818 }
8819 }
8820 }
8821
8822 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
8823 TEST_REQUIRES_ARM_NEON_FMA;
8824 for (uint32_t n = 9; n < 16; n++) {
8825 for (size_t k = 1; k <= 10; k += 3) {
8826 GemmMicrokernelTester()
8827 .mr(4)
8828 .nr(8)
8829 .kr(1)
8830 .sr(1)
8831 .m(4)
8832 .n(n)
8833 .k(k)
8834 .a_stride(13)
8835 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8836 }
8837 }
8838 }
8839
8840 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
8841 TEST_REQUIRES_ARM_NEON_FMA;
8842 for (uint32_t n = 9; n < 16; n++) {
8843 for (size_t k = 1; k <= 10; k += 3) {
8844 for (uint32_t m = 1; m <= 4; m++) {
8845 GemmMicrokernelTester()
8846 .mr(4)
8847 .nr(8)
8848 .kr(1)
8849 .sr(1)
8850 .m(m)
8851 .n(n)
8852 .k(k)
8853 .iterations(1)
8854 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8855 }
8856 }
8857 }
8858 }
8859
8860 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8) {
8861 TEST_REQUIRES_ARM_NEON_FMA;
8862 for (uint32_t n = 16; n <= 24; n += 8) {
8863 for (size_t k = 1; k <= 10; k += 3) {
8864 GemmMicrokernelTester()
8865 .mr(4)
8866 .nr(8)
8867 .kr(1)
8868 .sr(1)
8869 .m(4)
8870 .n(8)
8871 .k(k)
8872 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8873 }
8874 }
8875 }
8876
8877 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
8878 TEST_REQUIRES_ARM_NEON_FMA;
8879 for (uint32_t n = 16; n <= 24; n += 8) {
8880 for (size_t k = 1; k <= 10; k += 3) {
8881 GemmMicrokernelTester()
8882 .mr(4)
8883 .nr(8)
8884 .kr(1)
8885 .sr(1)
8886 .m(4)
8887 .n(n)
8888 .k(k)
8889 .cn_stride(11)
8890 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8891 }
8892 }
8893 }
8894
8895 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
8896 TEST_REQUIRES_ARM_NEON_FMA;
8897 for (uint32_t n = 16; n <= 24; n += 8) {
8898 for (size_t k = 1; k <= 10; k += 3) {
8899 GemmMicrokernelTester()
8900 .mr(4)
8901 .nr(8)
8902 .kr(1)
8903 .sr(1)
8904 .m(4)
8905 .n(n)
8906 .k(k)
8907 .a_stride(13)
8908 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8909 }
8910 }
8911 }
8912
8913 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
8914 TEST_REQUIRES_ARM_NEON_FMA;
8915 for (uint32_t n = 16; n <= 24; n += 8) {
8916 for (size_t k = 1; k <= 10; k += 3) {
8917 for (uint32_t m = 1; m <= 4; m++) {
8918 GemmMicrokernelTester()
8919 .mr(4)
8920 .nr(8)
8921 .kr(1)
8922 .sr(1)
8923 .m(m)
8924 .n(n)
8925 .k(k)
8926 .iterations(1)
8927 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8928 }
8929 }
8930 }
8931 }
8932
8933 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
8934 TEST_REQUIRES_ARM_NEON_FMA;
8935 for (size_t k = 1; k <= 10; k += 3) {
8936 for (uint32_t m = 1; m <= 4; m++) {
8937 for (uint32_t n = 1; n <= 8; n++) {
8938 GemmMicrokernelTester()
8939 .mr(4)
8940 .nr(8)
8941 .kr(1)
8942 .sr(1)
8943 .m(m)
8944 .n(n)
8945 .k(k)
8946 .cm_stride(11)
8947 .iterations(1)
8948 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8949 }
8950 }
8951 }
8952 }
8953
8954 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, qmin) {
8955 TEST_REQUIRES_ARM_NEON_FMA;
8956 GemmMicrokernelTester()
8957 .mr(4)
8958 .nr(8)
8959 .kr(1)
8960 .sr(1)
8961 .m(4)
8962 .n(8)
8963 .k(2)
8964 .qmin(128)
8965 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8966 }
8967
8968 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, qmax) {
8969 TEST_REQUIRES_ARM_NEON_FMA;
8970 GemmMicrokernelTester()
8971 .mr(4)
8972 .nr(8)
8973 .kr(1)
8974 .sr(1)
8975 .m(4)
8976 .n(8)
8977 .k(2)
8978 .qmax(128)
8979 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8980 }
8981
8982 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, strided_cm) {
8983 TEST_REQUIRES_ARM_NEON_FMA;
8984 GemmMicrokernelTester()
8985 .mr(4)
8986 .nr(8)
8987 .kr(1)
8988 .sr(1)
8989 .m(4)
8990 .n(8)
8991 .k(2)
8992 .cm_stride(11)
8993 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
8994 }
8995#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8996
8997
8998#if XNN_ARCH_ARM
8999 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4) {
9000 TEST_REQUIRES_ARM_NEON;
9001 GemmMicrokernelTester()
9002 .mr(4)
9003 .nr(8)
9004 .kr(1)
9005 .sr(1)
9006 .m(4)
9007 .n(8)
9008 .k(4)
9009 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9010 }
9011
9012 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cn) {
9013 TEST_REQUIRES_ARM_NEON;
9014 GemmMicrokernelTester()
9015 .mr(4)
9016 .nr(8)
9017 .kr(1)
9018 .sr(1)
9019 .m(4)
9020 .n(8)
9021 .k(4)
9022 .cn_stride(11)
9023 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9024 }
9025
9026 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_strided_a) {
9027 TEST_REQUIRES_ARM_NEON;
9028 GemmMicrokernelTester()
9029 .mr(4)
9030 .nr(8)
9031 .kr(1)
9032 .sr(1)
9033 .m(4)
9034 .n(8)
9035 .k(4)
9036 .a_stride(7)
9037 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9038 }
9039
9040 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile) {
9041 TEST_REQUIRES_ARM_NEON;
9042 for (uint32_t m = 1; m <= 4; m++) {
9043 for (uint32_t n = 1; n <= 8; n++) {
9044 GemmMicrokernelTester()
9045 .mr(4)
9046 .nr(8)
9047 .kr(1)
9048 .sr(1)
9049 .m(m)
9050 .n(n)
9051 .k(4)
9052 .iterations(1)
9053 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9054 }
9055 }
9056 }
9057
9058 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile_m) {
9059 TEST_REQUIRES_ARM_NEON;
9060 for (uint32_t m = 1; m <= 4; m++) {
9061 GemmMicrokernelTester()
9062 .mr(4)
9063 .nr(8)
9064 .kr(1)
9065 .sr(1)
9066 .m(m)
9067 .n(8)
9068 .k(4)
9069 .iterations(1)
9070 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9071 }
9072 }
9073
9074 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile_n) {
9075 TEST_REQUIRES_ARM_NEON;
9076 for (uint32_t n = 1; n <= 8; n++) {
9077 GemmMicrokernelTester()
9078 .mr(4)
9079 .nr(8)
9080 .kr(1)
9081 .sr(1)
9082 .m(4)
9083 .n(n)
9084 .k(4)
9085 .iterations(1)
9086 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9087 }
9088 }
9089
9090 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8) {
9091 TEST_REQUIRES_ARM_NEON;
9092 GemmMicrokernelTester()
9093 .mr(4)
9094 .nr(8)
9095 .kr(1)
9096 .sr(1)
9097 .m(4)
9098 .n(8)
9099 .k(8)
9100 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9101 }
9102
9103 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8_strided_a) {
9104 TEST_REQUIRES_ARM_NEON;
9105 GemmMicrokernelTester()
9106 .mr(4)
9107 .nr(8)
9108 .kr(1)
9109 .sr(1)
9110 .m(4)
9111 .n(8)
9112 .k(8)
9113 .a_stride(11)
9114 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9115 }
9116
9117 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8_subtile) {
9118 TEST_REQUIRES_ARM_NEON;
9119 for (uint32_t m = 1; m <= 4; m++) {
9120 for (uint32_t n = 1; n <= 8; n++) {
9121 GemmMicrokernelTester()
9122 .mr(4)
9123 .nr(8)
9124 .kr(1)
9125 .sr(1)
9126 .m(m)
9127 .n(n)
9128 .k(8)
9129 .iterations(1)
9130 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9131 }
9132 }
9133 }
9134
9135 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8) {
9136 TEST_REQUIRES_ARM_NEON;
9137 for (size_t k = 1; k < 8; k++) {
9138 GemmMicrokernelTester()
9139 .mr(4)
9140 .nr(8)
9141 .kr(1)
9142 .sr(1)
9143 .m(4)
9144 .n(8)
9145 .k(k)
9146 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9147 }
9148 }
9149
9150 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8_strided_a) {
9151 TEST_REQUIRES_ARM_NEON;
9152 for (size_t k = 1; k < 8; k++) {
9153 GemmMicrokernelTester()
9154 .mr(4)
9155 .nr(8)
9156 .kr(1)
9157 .sr(1)
9158 .m(4)
9159 .n(8)
9160 .k(k)
9161 .a_stride(11)
9162 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9163 }
9164 }
9165
9166 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8_subtile) {
9167 TEST_REQUIRES_ARM_NEON;
9168 for (size_t k = 1; k < 8; k++) {
9169 for (uint32_t m = 1; m <= 4; m++) {
9170 for (uint32_t n = 1; n <= 8; n++) {
9171 GemmMicrokernelTester()
9172 .mr(4)
9173 .nr(8)
9174 .kr(1)
9175 .sr(1)
9176 .m(m)
9177 .n(n)
9178 .k(k)
9179 .iterations(1)
9180 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9181 }
9182 }
9183 }
9184 }
9185
9186 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_8) {
9187 TEST_REQUIRES_ARM_NEON;
9188 for (size_t k = 9; k < 8; k++) {
9189 GemmMicrokernelTester()
9190 .mr(4)
9191 .nr(8)
9192 .kr(1)
9193 .sr(1)
9194 .m(4)
9195 .n(8)
9196 .k(k)
9197 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9198 }
9199 }
9200
9201 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_4_strided_a) {
9202 TEST_REQUIRES_ARM_NEON;
9203 for (size_t k = 9; k < 8; k++) {
9204 GemmMicrokernelTester()
9205 .mr(4)
9206 .nr(8)
9207 .kr(1)
9208 .sr(1)
9209 .m(4)
9210 .n(8)
9211 .k(k)
9212 .a_stride(11)
9213 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9214 }
9215 }
9216
9217 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_4_subtile) {
9218 TEST_REQUIRES_ARM_NEON;
9219 for (size_t k = 9; k < 8; k++) {
9220 for (uint32_t m = 1; m <= 4; m++) {
9221 for (uint32_t n = 1; n <= 8; n++) {
9222 GemmMicrokernelTester()
9223 .mr(4)
9224 .nr(8)
9225 .kr(1)
9226 .sr(1)
9227 .m(m)
9228 .n(n)
9229 .k(k)
9230 .iterations(1)
9231 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9232 }
9233 }
9234 }
9235 }
9236
9237 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4) {
9238 TEST_REQUIRES_ARM_NEON;
9239 for (size_t k = 12; k <= 40; k += 4) {
9240 GemmMicrokernelTester()
9241 .mr(4)
9242 .nr(8)
9243 .kr(1)
9244 .sr(1)
9245 .m(4)
9246 .n(8)
9247 .k(k)
9248 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9249 }
9250 }
9251
9252 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4_strided_a) {
9253 TEST_REQUIRES_ARM_NEON;
9254 for (size_t k = 12; k <= 40; k += 4) {
9255 GemmMicrokernelTester()
9256 .mr(4)
9257 .nr(8)
9258 .kr(1)
9259 .sr(1)
9260 .m(4)
9261 .n(8)
9262 .k(k)
9263 .a_stride(43)
9264 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9265 }
9266 }
9267
9268 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4_subtile) {
9269 TEST_REQUIRES_ARM_NEON;
9270 for (size_t k = 12; k <= 40; k += 4) {
9271 for (uint32_t m = 1; m <= 4; m++) {
9272 for (uint32_t n = 1; n <= 8; n++) {
9273 GemmMicrokernelTester()
9274 .mr(4)
9275 .nr(8)
9276 .kr(1)
9277 .sr(1)
9278 .m(m)
9279 .n(n)
9280 .k(k)
9281 .iterations(1)
9282 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9283 }
9284 }
9285 }
9286 }
9287
9288 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8) {
9289 TEST_REQUIRES_ARM_NEON;
9290 for (uint32_t n = 9; n < 16; n++) {
9291 for (size_t k = 1; k <= 20; k += 5) {
9292 GemmMicrokernelTester()
9293 .mr(4)
9294 .nr(8)
9295 .kr(1)
9296 .sr(1)
9297 .m(4)
9298 .n(8)
9299 .k(k)
9300 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9301 }
9302 }
9303 }
9304
9305 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_strided_cn) {
9306 TEST_REQUIRES_ARM_NEON;
9307 for (uint32_t n = 9; n < 16; n++) {
9308 for (size_t k = 1; k <= 20; k += 5) {
9309 GemmMicrokernelTester()
9310 .mr(4)
9311 .nr(8)
9312 .kr(1)
9313 .sr(1)
9314 .m(4)
9315 .n(8)
9316 .k(k)
9317 .cn_stride(11)
9318 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9319 }
9320 }
9321 }
9322
9323 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_strided_a) {
9324 TEST_REQUIRES_ARM_NEON;
9325 for (uint32_t n = 9; n < 16; n++) {
9326 for (size_t k = 1; k <= 20; k += 5) {
9327 GemmMicrokernelTester()
9328 .mr(4)
9329 .nr(8)
9330 .kr(1)
9331 .sr(1)
9332 .m(4)
9333 .n(n)
9334 .k(k)
9335 .a_stride(23)
9336 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9337 }
9338 }
9339 }
9340
9341 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_subtile) {
9342 TEST_REQUIRES_ARM_NEON;
9343 for (uint32_t n = 9; n < 16; n++) {
9344 for (size_t k = 1; k <= 20; k += 5) {
9345 for (uint32_t m = 1; m <= 4; m++) {
9346 GemmMicrokernelTester()
9347 .mr(4)
9348 .nr(8)
9349 .kr(1)
9350 .sr(1)
9351 .m(m)
9352 .n(n)
9353 .k(k)
9354 .iterations(1)
9355 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9356 }
9357 }
9358 }
9359 }
9360
9361 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8) {
9362 TEST_REQUIRES_ARM_NEON;
9363 for (uint32_t n = 16; n <= 24; n += 8) {
9364 for (size_t k = 1; k <= 20; k += 5) {
9365 GemmMicrokernelTester()
9366 .mr(4)
9367 .nr(8)
9368 .kr(1)
9369 .sr(1)
9370 .m(4)
9371 .n(8)
9372 .k(k)
9373 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9374 }
9375 }
9376 }
9377
9378 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_strided_cn) {
9379 TEST_REQUIRES_ARM_NEON;
9380 for (uint32_t n = 16; n <= 24; n += 8) {
9381 for (size_t k = 1; k <= 20; k += 5) {
9382 GemmMicrokernelTester()
9383 .mr(4)
9384 .nr(8)
9385 .kr(1)
9386 .sr(1)
9387 .m(4)
9388 .n(n)
9389 .k(k)
9390 .cn_stride(11)
9391 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9392 }
9393 }
9394 }
9395
9396 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_strided_a) {
9397 TEST_REQUIRES_ARM_NEON;
9398 for (uint32_t n = 16; n <= 24; n += 8) {
9399 for (size_t k = 1; k <= 20; k += 5) {
9400 GemmMicrokernelTester()
9401 .mr(4)
9402 .nr(8)
9403 .kr(1)
9404 .sr(1)
9405 .m(4)
9406 .n(n)
9407 .k(k)
9408 .a_stride(23)
9409 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9410 }
9411 }
9412 }
9413
9414 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_subtile) {
9415 TEST_REQUIRES_ARM_NEON;
9416 for (uint32_t n = 16; n <= 24; n += 8) {
9417 for (size_t k = 1; k <= 20; k += 5) {
9418 for (uint32_t m = 1; m <= 4; m++) {
9419 GemmMicrokernelTester()
9420 .mr(4)
9421 .nr(8)
9422 .kr(1)
9423 .sr(1)
9424 .m(m)
9425 .n(n)
9426 .k(k)
9427 .iterations(1)
9428 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9429 }
9430 }
9431 }
9432 }
9433
9434 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cm_subtile) {
9435 TEST_REQUIRES_ARM_NEON;
9436 for (size_t k = 1; k <= 20; k += 5) {
9437 for (uint32_t m = 1; m <= 4; m++) {
9438 for (uint32_t n = 1; n <= 8; n++) {
9439 GemmMicrokernelTester()
9440 .mr(4)
9441 .nr(8)
9442 .kr(1)
9443 .sr(1)
9444 .m(m)
9445 .n(n)
9446 .k(k)
9447 .cm_stride(11)
9448 .iterations(1)
9449 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9450 }
9451 }
9452 }
9453 }
9454
9455 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, qmin) {
9456 TEST_REQUIRES_ARM_NEON;
9457 GemmMicrokernelTester()
9458 .mr(4)
9459 .nr(8)
9460 .kr(1)
9461 .sr(1)
9462 .m(4)
9463 .n(8)
9464 .k(4)
9465 .qmin(128)
9466 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9467 }
9468
9469 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, qmax) {
9470 TEST_REQUIRES_ARM_NEON;
9471 GemmMicrokernelTester()
9472 .mr(4)
9473 .nr(8)
9474 .kr(1)
9475 .sr(1)
9476 .m(4)
9477 .n(8)
9478 .k(4)
9479 .qmax(128)
9480 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9481 }
9482
9483 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cm) {
9484 TEST_REQUIRES_ARM_NEON;
9485 GemmMicrokernelTester()
9486 .mr(4)
9487 .nr(8)
9488 .kr(1)
9489 .sr(1)
9490 .m(4)
9491 .n(8)
9492 .k(4)
9493 .cm_stride(11)
9494 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
9495 }
9496#endif // XNN_ARCH_ARM
9497
9498
9499#if XNN_ARCH_ARM
9500 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4) {
9501 TEST_REQUIRES_ARM_NEON;
9502 GemmMicrokernelTester()
9503 .mr(4)
9504 .nr(8)
9505 .kr(1)
9506 .sr(1)
9507 .m(4)
9508 .n(8)
9509 .k(4)
9510 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9511 }
9512
9513 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cn) {
9514 TEST_REQUIRES_ARM_NEON;
9515 GemmMicrokernelTester()
9516 .mr(4)
9517 .nr(8)
9518 .kr(1)
9519 .sr(1)
9520 .m(4)
9521 .n(8)
9522 .k(4)
9523 .cn_stride(11)
9524 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9525 }
9526
9527 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_strided_a) {
9528 TEST_REQUIRES_ARM_NEON;
9529 GemmMicrokernelTester()
9530 .mr(4)
9531 .nr(8)
9532 .kr(1)
9533 .sr(1)
9534 .m(4)
9535 .n(8)
9536 .k(4)
9537 .a_stride(7)
9538 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9539 }
9540
9541 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile) {
9542 TEST_REQUIRES_ARM_NEON;
9543 for (uint32_t m = 1; m <= 4; m++) {
9544 for (uint32_t n = 1; n <= 8; n++) {
9545 GemmMicrokernelTester()
9546 .mr(4)
9547 .nr(8)
9548 .kr(1)
9549 .sr(1)
9550 .m(m)
9551 .n(n)
9552 .k(4)
9553 .iterations(1)
9554 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9555 }
9556 }
9557 }
9558
9559 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_m) {
9560 TEST_REQUIRES_ARM_NEON;
9561 for (uint32_t m = 1; m <= 4; m++) {
9562 GemmMicrokernelTester()
9563 .mr(4)
9564 .nr(8)
9565 .kr(1)
9566 .sr(1)
9567 .m(m)
9568 .n(8)
9569 .k(4)
9570 .iterations(1)
9571 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9572 }
9573 }
9574
9575 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_n) {
9576 TEST_REQUIRES_ARM_NEON;
9577 for (uint32_t n = 1; n <= 8; n++) {
9578 GemmMicrokernelTester()
9579 .mr(4)
9580 .nr(8)
9581 .kr(1)
9582 .sr(1)
9583 .m(4)
9584 .n(n)
9585 .k(4)
9586 .iterations(1)
9587 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9588 }
9589 }
9590
9591 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8) {
9592 TEST_REQUIRES_ARM_NEON;
9593 GemmMicrokernelTester()
9594 .mr(4)
9595 .nr(8)
9596 .kr(1)
9597 .sr(1)
9598 .m(4)
9599 .n(8)
9600 .k(8)
9601 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9602 }
9603
9604 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_strided_a) {
9605 TEST_REQUIRES_ARM_NEON;
9606 GemmMicrokernelTester()
9607 .mr(4)
9608 .nr(8)
9609 .kr(1)
9610 .sr(1)
9611 .m(4)
9612 .n(8)
9613 .k(8)
9614 .a_stride(11)
9615 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9616 }
9617
9618 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_subtile) {
9619 TEST_REQUIRES_ARM_NEON;
9620 for (uint32_t m = 1; m <= 4; m++) {
9621 for (uint32_t n = 1; n <= 8; n++) {
9622 GemmMicrokernelTester()
9623 .mr(4)
9624 .nr(8)
9625 .kr(1)
9626 .sr(1)
9627 .m(m)
9628 .n(n)
9629 .k(8)
9630 .iterations(1)
9631 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9632 }
9633 }
9634 }
9635
9636 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8) {
9637 TEST_REQUIRES_ARM_NEON;
9638 for (size_t k = 1; k < 8; k++) {
9639 GemmMicrokernelTester()
9640 .mr(4)
9641 .nr(8)
9642 .kr(1)
9643 .sr(1)
9644 .m(4)
9645 .n(8)
9646 .k(k)
9647 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9648 }
9649 }
9650
9651 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_strided_a) {
9652 TEST_REQUIRES_ARM_NEON;
9653 for (size_t k = 1; k < 8; k++) {
9654 GemmMicrokernelTester()
9655 .mr(4)
9656 .nr(8)
9657 .kr(1)
9658 .sr(1)
9659 .m(4)
9660 .n(8)
9661 .k(k)
9662 .a_stride(11)
9663 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9664 }
9665 }
9666
9667 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_subtile) {
9668 TEST_REQUIRES_ARM_NEON;
9669 for (size_t k = 1; k < 8; k++) {
9670 for (uint32_t m = 1; m <= 4; m++) {
9671 for (uint32_t n = 1; n <= 8; n++) {
9672 GemmMicrokernelTester()
9673 .mr(4)
9674 .nr(8)
9675 .kr(1)
9676 .sr(1)
9677 .m(m)
9678 .n(n)
9679 .k(k)
9680 .iterations(1)
9681 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9682 }
9683 }
9684 }
9685 }
9686
9687 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8) {
9688 TEST_REQUIRES_ARM_NEON;
9689 for (size_t k = 9; k < 8; k++) {
9690 GemmMicrokernelTester()
9691 .mr(4)
9692 .nr(8)
9693 .kr(1)
9694 .sr(1)
9695 .m(4)
9696 .n(8)
9697 .k(k)
9698 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9699 }
9700 }
9701
9702 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_4_strided_a) {
9703 TEST_REQUIRES_ARM_NEON;
9704 for (size_t k = 9; k < 8; k++) {
9705 GemmMicrokernelTester()
9706 .mr(4)
9707 .nr(8)
9708 .kr(1)
9709 .sr(1)
9710 .m(4)
9711 .n(8)
9712 .k(k)
9713 .a_stride(11)
9714 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9715 }
9716 }
9717
9718 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_4_subtile) {
9719 TEST_REQUIRES_ARM_NEON;
9720 for (size_t k = 9; k < 8; k++) {
9721 for (uint32_t m = 1; m <= 4; m++) {
9722 for (uint32_t n = 1; n <= 8; n++) {
9723 GemmMicrokernelTester()
9724 .mr(4)
9725 .nr(8)
9726 .kr(1)
9727 .sr(1)
9728 .m(m)
9729 .n(n)
9730 .k(k)
9731 .iterations(1)
9732 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9733 }
9734 }
9735 }
9736 }
9737
9738 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4) {
9739 TEST_REQUIRES_ARM_NEON;
9740 for (size_t k = 12; k <= 40; k += 4) {
9741 GemmMicrokernelTester()
9742 .mr(4)
9743 .nr(8)
9744 .kr(1)
9745 .sr(1)
9746 .m(4)
9747 .n(8)
9748 .k(k)
9749 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9750 }
9751 }
9752
9753 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_strided_a) {
9754 TEST_REQUIRES_ARM_NEON;
9755 for (size_t k = 12; k <= 40; k += 4) {
9756 GemmMicrokernelTester()
9757 .mr(4)
9758 .nr(8)
9759 .kr(1)
9760 .sr(1)
9761 .m(4)
9762 .n(8)
9763 .k(k)
9764 .a_stride(43)
9765 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9766 }
9767 }
9768
9769 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_subtile) {
9770 TEST_REQUIRES_ARM_NEON;
9771 for (size_t k = 12; k <= 40; k += 4) {
9772 for (uint32_t m = 1; m <= 4; m++) {
9773 for (uint32_t n = 1; n <= 8; n++) {
9774 GemmMicrokernelTester()
9775 .mr(4)
9776 .nr(8)
9777 .kr(1)
9778 .sr(1)
9779 .m(m)
9780 .n(n)
9781 .k(k)
9782 .iterations(1)
9783 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9784 }
9785 }
9786 }
9787 }
9788
9789 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8) {
9790 TEST_REQUIRES_ARM_NEON;
9791 for (uint32_t n = 9; n < 16; n++) {
9792 for (size_t k = 1; k <= 20; k += 5) {
9793 GemmMicrokernelTester()
9794 .mr(4)
9795 .nr(8)
9796 .kr(1)
9797 .sr(1)
9798 .m(4)
9799 .n(8)
9800 .k(k)
9801 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9802 }
9803 }
9804 }
9805
9806 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_cn) {
9807 TEST_REQUIRES_ARM_NEON;
9808 for (uint32_t n = 9; n < 16; n++) {
9809 for (size_t k = 1; k <= 20; k += 5) {
9810 GemmMicrokernelTester()
9811 .mr(4)
9812 .nr(8)
9813 .kr(1)
9814 .sr(1)
9815 .m(4)
9816 .n(8)
9817 .k(k)
9818 .cn_stride(11)
9819 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9820 }
9821 }
9822 }
9823
9824 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_a) {
9825 TEST_REQUIRES_ARM_NEON;
9826 for (uint32_t n = 9; n < 16; n++) {
9827 for (size_t k = 1; k <= 20; k += 5) {
9828 GemmMicrokernelTester()
9829 .mr(4)
9830 .nr(8)
9831 .kr(1)
9832 .sr(1)
9833 .m(4)
9834 .n(n)
9835 .k(k)
9836 .a_stride(23)
9837 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9838 }
9839 }
9840 }
9841
9842 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_subtile) {
9843 TEST_REQUIRES_ARM_NEON;
9844 for (uint32_t n = 9; n < 16; n++) {
9845 for (size_t k = 1; k <= 20; k += 5) {
9846 for (uint32_t m = 1; m <= 4; m++) {
9847 GemmMicrokernelTester()
9848 .mr(4)
9849 .nr(8)
9850 .kr(1)
9851 .sr(1)
9852 .m(m)
9853 .n(n)
9854 .k(k)
9855 .iterations(1)
9856 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9857 }
9858 }
9859 }
9860 }
9861
9862 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8) {
9863 TEST_REQUIRES_ARM_NEON;
9864 for (uint32_t n = 16; n <= 24; n += 8) {
9865 for (size_t k = 1; k <= 20; k += 5) {
9866 GemmMicrokernelTester()
9867 .mr(4)
9868 .nr(8)
9869 .kr(1)
9870 .sr(1)
9871 .m(4)
9872 .n(8)
9873 .k(k)
9874 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9875 }
9876 }
9877 }
9878
9879 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_cn) {
9880 TEST_REQUIRES_ARM_NEON;
9881 for (uint32_t n = 16; n <= 24; n += 8) {
9882 for (size_t k = 1; k <= 20; k += 5) {
9883 GemmMicrokernelTester()
9884 .mr(4)
9885 .nr(8)
9886 .kr(1)
9887 .sr(1)
9888 .m(4)
9889 .n(n)
9890 .k(k)
9891 .cn_stride(11)
9892 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9893 }
9894 }
9895 }
9896
9897 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_a) {
9898 TEST_REQUIRES_ARM_NEON;
9899 for (uint32_t n = 16; n <= 24; n += 8) {
9900 for (size_t k = 1; k <= 20; k += 5) {
9901 GemmMicrokernelTester()
9902 .mr(4)
9903 .nr(8)
9904 .kr(1)
9905 .sr(1)
9906 .m(4)
9907 .n(n)
9908 .k(k)
9909 .a_stride(23)
9910 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9911 }
9912 }
9913 }
9914
9915 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_subtile) {
9916 TEST_REQUIRES_ARM_NEON;
9917 for (uint32_t n = 16; n <= 24; n += 8) {
9918 for (size_t k = 1; k <= 20; k += 5) {
9919 for (uint32_t m = 1; m <= 4; m++) {
9920 GemmMicrokernelTester()
9921 .mr(4)
9922 .nr(8)
9923 .kr(1)
9924 .sr(1)
9925 .m(m)
9926 .n(n)
9927 .k(k)
9928 .iterations(1)
9929 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9930 }
9931 }
9932 }
9933 }
9934
9935 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm_subtile) {
9936 TEST_REQUIRES_ARM_NEON;
9937 for (size_t k = 1; k <= 20; k += 5) {
9938 for (uint32_t m = 1; m <= 4; m++) {
9939 for (uint32_t n = 1; n <= 8; n++) {
9940 GemmMicrokernelTester()
9941 .mr(4)
9942 .nr(8)
9943 .kr(1)
9944 .sr(1)
9945 .m(m)
9946 .n(n)
9947 .k(k)
9948 .cm_stride(11)
9949 .iterations(1)
9950 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9951 }
9952 }
9953 }
9954 }
9955
9956 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, qmin) {
9957 TEST_REQUIRES_ARM_NEON;
9958 GemmMicrokernelTester()
9959 .mr(4)
9960 .nr(8)
9961 .kr(1)
9962 .sr(1)
9963 .m(4)
9964 .n(8)
9965 .k(4)
9966 .qmin(128)
9967 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9968 }
9969
9970 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, qmax) {
9971 TEST_REQUIRES_ARM_NEON;
9972 GemmMicrokernelTester()
9973 .mr(4)
9974 .nr(8)
9975 .kr(1)
9976 .sr(1)
9977 .m(4)
9978 .n(8)
9979 .k(4)
9980 .qmax(128)
9981 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9982 }
9983
9984 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm) {
9985 TEST_REQUIRES_ARM_NEON;
9986 GemmMicrokernelTester()
9987 .mr(4)
9988 .nr(8)
9989 .kr(1)
9990 .sr(1)
9991 .m(4)
9992 .n(8)
9993 .k(4)
9994 .cm_stride(11)
9995 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a55);
9996 }
9997#endif // XNN_ARCH_ARM
9998
9999
10000#if XNN_ARCH_ARM
10001 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
10002 TEST_REQUIRES_ARM_NEON;
10003 GemmMicrokernelTester()
10004 .mr(4)
10005 .nr(8)
10006 .kr(1)
10007 .sr(1)
10008 .m(4)
10009 .n(8)
10010 .k(4)
10011 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10012 }
10013
10014 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cn) {
10015 TEST_REQUIRES_ARM_NEON;
10016 GemmMicrokernelTester()
10017 .mr(4)
10018 .nr(8)
10019 .kr(1)
10020 .sr(1)
10021 .m(4)
10022 .n(8)
10023 .k(4)
10024 .cn_stride(11)
10025 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10026 }
10027
10028 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_strided_a) {
10029 TEST_REQUIRES_ARM_NEON;
10030 GemmMicrokernelTester()
10031 .mr(4)
10032 .nr(8)
10033 .kr(1)
10034 .sr(1)
10035 .m(4)
10036 .n(8)
10037 .k(4)
10038 .a_stride(7)
10039 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10040 }
10041
10042 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile) {
10043 TEST_REQUIRES_ARM_NEON;
10044 for (uint32_t m = 1; m <= 4; m++) {
10045 for (uint32_t n = 1; n <= 8; n++) {
10046 GemmMicrokernelTester()
10047 .mr(4)
10048 .nr(8)
10049 .kr(1)
10050 .sr(1)
10051 .m(m)
10052 .n(n)
10053 .k(4)
10054 .iterations(1)
10055 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10056 }
10057 }
10058 }
10059
10060 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_m) {
10061 TEST_REQUIRES_ARM_NEON;
10062 for (uint32_t m = 1; m <= 4; m++) {
10063 GemmMicrokernelTester()
10064 .mr(4)
10065 .nr(8)
10066 .kr(1)
10067 .sr(1)
10068 .m(m)
10069 .n(8)
10070 .k(4)
10071 .iterations(1)
10072 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10073 }
10074 }
10075
10076 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_n) {
10077 TEST_REQUIRES_ARM_NEON;
10078 for (uint32_t n = 1; n <= 8; n++) {
10079 GemmMicrokernelTester()
10080 .mr(4)
10081 .nr(8)
10082 .kr(1)
10083 .sr(1)
10084 .m(4)
10085 .n(n)
10086 .k(4)
10087 .iterations(1)
10088 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10089 }
10090 }
10091
10092 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8) {
10093 TEST_REQUIRES_ARM_NEON;
10094 GemmMicrokernelTester()
10095 .mr(4)
10096 .nr(8)
10097 .kr(1)
10098 .sr(1)
10099 .m(4)
10100 .n(8)
10101 .k(8)
10102 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10103 }
10104
10105 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_strided_a) {
10106 TEST_REQUIRES_ARM_NEON;
10107 GemmMicrokernelTester()
10108 .mr(4)
10109 .nr(8)
10110 .kr(1)
10111 .sr(1)
10112 .m(4)
10113 .n(8)
10114 .k(8)
10115 .a_stride(11)
10116 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10117 }
10118
10119 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_subtile) {
10120 TEST_REQUIRES_ARM_NEON;
10121 for (uint32_t m = 1; m <= 4; m++) {
10122 for (uint32_t n = 1; n <= 8; n++) {
10123 GemmMicrokernelTester()
10124 .mr(4)
10125 .nr(8)
10126 .kr(1)
10127 .sr(1)
10128 .m(m)
10129 .n(n)
10130 .k(8)
10131 .iterations(1)
10132 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10133 }
10134 }
10135 }
10136
10137 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8) {
10138 TEST_REQUIRES_ARM_NEON;
10139 for (size_t k = 1; k < 8; k++) {
10140 GemmMicrokernelTester()
10141 .mr(4)
10142 .nr(8)
10143 .kr(1)
10144 .sr(1)
10145 .m(4)
10146 .n(8)
10147 .k(k)
10148 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10149 }
10150 }
10151
10152 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_strided_a) {
10153 TEST_REQUIRES_ARM_NEON;
10154 for (size_t k = 1; k < 8; k++) {
10155 GemmMicrokernelTester()
10156 .mr(4)
10157 .nr(8)
10158 .kr(1)
10159 .sr(1)
10160 .m(4)
10161 .n(8)
10162 .k(k)
10163 .a_stride(11)
10164 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10165 }
10166 }
10167
10168 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_subtile) {
10169 TEST_REQUIRES_ARM_NEON;
10170 for (size_t k = 1; k < 8; k++) {
10171 for (uint32_t m = 1; m <= 4; m++) {
10172 for (uint32_t n = 1; n <= 8; n++) {
10173 GemmMicrokernelTester()
10174 .mr(4)
10175 .nr(8)
10176 .kr(1)
10177 .sr(1)
10178 .m(m)
10179 .n(n)
10180 .k(k)
10181 .iterations(1)
10182 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10183 }
10184 }
10185 }
10186 }
10187
10188 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8) {
10189 TEST_REQUIRES_ARM_NEON;
10190 for (size_t k = 9; k < 8; k++) {
10191 GemmMicrokernelTester()
10192 .mr(4)
10193 .nr(8)
10194 .kr(1)
10195 .sr(1)
10196 .m(4)
10197 .n(8)
10198 .k(k)
10199 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10200 }
10201 }
10202
10203 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_4_strided_a) {
10204 TEST_REQUIRES_ARM_NEON;
10205 for (size_t k = 9; k < 8; k++) {
10206 GemmMicrokernelTester()
10207 .mr(4)
10208 .nr(8)
10209 .kr(1)
10210 .sr(1)
10211 .m(4)
10212 .n(8)
10213 .k(k)
10214 .a_stride(11)
10215 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10216 }
10217 }
10218
10219 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_4_subtile) {
10220 TEST_REQUIRES_ARM_NEON;
10221 for (size_t k = 9; k < 8; k++) {
10222 for (uint32_t m = 1; m <= 4; m++) {
10223 for (uint32_t n = 1; n <= 8; n++) {
10224 GemmMicrokernelTester()
10225 .mr(4)
10226 .nr(8)
10227 .kr(1)
10228 .sr(1)
10229 .m(m)
10230 .n(n)
10231 .k(k)
10232 .iterations(1)
10233 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10234 }
10235 }
10236 }
10237 }
10238
10239 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4) {
10240 TEST_REQUIRES_ARM_NEON;
10241 for (size_t k = 12; k <= 40; k += 4) {
10242 GemmMicrokernelTester()
10243 .mr(4)
10244 .nr(8)
10245 .kr(1)
10246 .sr(1)
10247 .m(4)
10248 .n(8)
10249 .k(k)
10250 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10251 }
10252 }
10253
10254 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_strided_a) {
10255 TEST_REQUIRES_ARM_NEON;
10256 for (size_t k = 12; k <= 40; k += 4) {
10257 GemmMicrokernelTester()
10258 .mr(4)
10259 .nr(8)
10260 .kr(1)
10261 .sr(1)
10262 .m(4)
10263 .n(8)
10264 .k(k)
10265 .a_stride(43)
10266 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10267 }
10268 }
10269
10270 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_subtile) {
10271 TEST_REQUIRES_ARM_NEON;
10272 for (size_t k = 12; k <= 40; k += 4) {
10273 for (uint32_t m = 1; m <= 4; m++) {
10274 for (uint32_t n = 1; n <= 8; n++) {
10275 GemmMicrokernelTester()
10276 .mr(4)
10277 .nr(8)
10278 .kr(1)
10279 .sr(1)
10280 .m(m)
10281 .n(n)
10282 .k(k)
10283 .iterations(1)
10284 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10285 }
10286 }
10287 }
10288 }
10289
10290 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8) {
10291 TEST_REQUIRES_ARM_NEON;
10292 for (uint32_t n = 9; n < 16; n++) {
10293 for (size_t k = 1; k <= 20; k += 5) {
10294 GemmMicrokernelTester()
10295 .mr(4)
10296 .nr(8)
10297 .kr(1)
10298 .sr(1)
10299 .m(4)
10300 .n(8)
10301 .k(k)
10302 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10303 }
10304 }
10305 }
10306
10307 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_cn) {
10308 TEST_REQUIRES_ARM_NEON;
10309 for (uint32_t n = 9; n < 16; n++) {
10310 for (size_t k = 1; k <= 20; k += 5) {
10311 GemmMicrokernelTester()
10312 .mr(4)
10313 .nr(8)
10314 .kr(1)
10315 .sr(1)
10316 .m(4)
10317 .n(8)
10318 .k(k)
10319 .cn_stride(11)
10320 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10321 }
10322 }
10323 }
10324
10325 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_a) {
10326 TEST_REQUIRES_ARM_NEON;
10327 for (uint32_t n = 9; n < 16; n++) {
10328 for (size_t k = 1; k <= 20; k += 5) {
10329 GemmMicrokernelTester()
10330 .mr(4)
10331 .nr(8)
10332 .kr(1)
10333 .sr(1)
10334 .m(4)
10335 .n(n)
10336 .k(k)
10337 .a_stride(23)
10338 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10339 }
10340 }
10341 }
10342
10343 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_subtile) {
10344 TEST_REQUIRES_ARM_NEON;
10345 for (uint32_t n = 9; n < 16; n++) {
10346 for (size_t k = 1; k <= 20; k += 5) {
10347 for (uint32_t m = 1; m <= 4; m++) {
10348 GemmMicrokernelTester()
10349 .mr(4)
10350 .nr(8)
10351 .kr(1)
10352 .sr(1)
10353 .m(m)
10354 .n(n)
10355 .k(k)
10356 .iterations(1)
10357 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10358 }
10359 }
10360 }
10361 }
10362
10363 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8) {
10364 TEST_REQUIRES_ARM_NEON;
10365 for (uint32_t n = 16; n <= 24; n += 8) {
10366 for (size_t k = 1; k <= 20; k += 5) {
10367 GemmMicrokernelTester()
10368 .mr(4)
10369 .nr(8)
10370 .kr(1)
10371 .sr(1)
10372 .m(4)
10373 .n(8)
10374 .k(k)
10375 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10376 }
10377 }
10378 }
10379
10380 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_cn) {
10381 TEST_REQUIRES_ARM_NEON;
10382 for (uint32_t n = 16; n <= 24; n += 8) {
10383 for (size_t k = 1; k <= 20; k += 5) {
10384 GemmMicrokernelTester()
10385 .mr(4)
10386 .nr(8)
10387 .kr(1)
10388 .sr(1)
10389 .m(4)
10390 .n(n)
10391 .k(k)
10392 .cn_stride(11)
10393 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10394 }
10395 }
10396 }
10397
10398 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_a) {
10399 TEST_REQUIRES_ARM_NEON;
10400 for (uint32_t n = 16; n <= 24; n += 8) {
10401 for (size_t k = 1; k <= 20; k += 5) {
10402 GemmMicrokernelTester()
10403 .mr(4)
10404 .nr(8)
10405 .kr(1)
10406 .sr(1)
10407 .m(4)
10408 .n(n)
10409 .k(k)
10410 .a_stride(23)
10411 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10412 }
10413 }
10414 }
10415
10416 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_subtile) {
10417 TEST_REQUIRES_ARM_NEON;
10418 for (uint32_t n = 16; n <= 24; n += 8) {
10419 for (size_t k = 1; k <= 20; k += 5) {
10420 for (uint32_t m = 1; m <= 4; m++) {
10421 GemmMicrokernelTester()
10422 .mr(4)
10423 .nr(8)
10424 .kr(1)
10425 .sr(1)
10426 .m(m)
10427 .n(n)
10428 .k(k)
10429 .iterations(1)
10430 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10431 }
10432 }
10433 }
10434 }
10435
10436 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm_subtile) {
10437 TEST_REQUIRES_ARM_NEON;
10438 for (size_t k = 1; k <= 20; k += 5) {
10439 for (uint32_t m = 1; m <= 4; m++) {
10440 for (uint32_t n = 1; n <= 8; n++) {
10441 GemmMicrokernelTester()
10442 .mr(4)
10443 .nr(8)
10444 .kr(1)
10445 .sr(1)
10446 .m(m)
10447 .n(n)
10448 .k(k)
10449 .cm_stride(11)
10450 .iterations(1)
10451 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10452 }
10453 }
10454 }
10455 }
10456
10457 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, qmin) {
10458 TEST_REQUIRES_ARM_NEON;
10459 GemmMicrokernelTester()
10460 .mr(4)
10461 .nr(8)
10462 .kr(1)
10463 .sr(1)
10464 .m(4)
10465 .n(8)
10466 .k(4)
10467 .qmin(128)
10468 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10469 }
10470
10471 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, qmax) {
10472 TEST_REQUIRES_ARM_NEON;
10473 GemmMicrokernelTester()
10474 .mr(4)
10475 .nr(8)
10476 .kr(1)
10477 .sr(1)
10478 .m(4)
10479 .n(8)
10480 .k(4)
10481 .qmax(128)
10482 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10483 }
10484
10485 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm) {
10486 TEST_REQUIRES_ARM_NEON;
10487 GemmMicrokernelTester()
10488 .mr(4)
10489 .nr(8)
10490 .kr(1)
10491 .sr(1)
10492 .m(4)
10493 .n(8)
10494 .k(4)
10495 .cm_stride(11)
10496 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
10497 }
10498#endif // XNN_ARCH_ARM
10499
10500
10501#if XNN_ARCH_ARM
10502 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4) {
10503 TEST_REQUIRES_ARM_NEON;
10504 GemmMicrokernelTester()
10505 .mr(4)
10506 .nr(8)
10507 .kr(1)
10508 .sr(1)
10509 .m(4)
10510 .n(8)
10511 .k(4)
10512 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10513 }
10514
10515 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cn) {
10516 TEST_REQUIRES_ARM_NEON;
10517 GemmMicrokernelTester()
10518 .mr(4)
10519 .nr(8)
10520 .kr(1)
10521 .sr(1)
10522 .m(4)
10523 .n(8)
10524 .k(4)
10525 .cn_stride(11)
10526 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10527 }
10528
10529 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_strided_a) {
10530 TEST_REQUIRES_ARM_NEON;
10531 GemmMicrokernelTester()
10532 .mr(4)
10533 .nr(8)
10534 .kr(1)
10535 .sr(1)
10536 .m(4)
10537 .n(8)
10538 .k(4)
10539 .a_stride(7)
10540 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10541 }
10542
10543 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile) {
10544 TEST_REQUIRES_ARM_NEON;
10545 for (uint32_t m = 1; m <= 4; m++) {
10546 for (uint32_t n = 1; n <= 8; n++) {
10547 GemmMicrokernelTester()
10548 .mr(4)
10549 .nr(8)
10550 .kr(1)
10551 .sr(1)
10552 .m(m)
10553 .n(n)
10554 .k(4)
10555 .iterations(1)
10556 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10557 }
10558 }
10559 }
10560
10561 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_m) {
10562 TEST_REQUIRES_ARM_NEON;
10563 for (uint32_t m = 1; m <= 4; m++) {
10564 GemmMicrokernelTester()
10565 .mr(4)
10566 .nr(8)
10567 .kr(1)
10568 .sr(1)
10569 .m(m)
10570 .n(8)
10571 .k(4)
10572 .iterations(1)
10573 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10574 }
10575 }
10576
10577 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_n) {
10578 TEST_REQUIRES_ARM_NEON;
10579 for (uint32_t n = 1; n <= 8; n++) {
10580 GemmMicrokernelTester()
10581 .mr(4)
10582 .nr(8)
10583 .kr(1)
10584 .sr(1)
10585 .m(4)
10586 .n(n)
10587 .k(4)
10588 .iterations(1)
10589 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10590 }
10591 }
10592
10593 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8) {
10594 TEST_REQUIRES_ARM_NEON;
10595 GemmMicrokernelTester()
10596 .mr(4)
10597 .nr(8)
10598 .kr(1)
10599 .sr(1)
10600 .m(4)
10601 .n(8)
10602 .k(8)
10603 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10604 }
10605
10606 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8_strided_a) {
10607 TEST_REQUIRES_ARM_NEON;
10608 GemmMicrokernelTester()
10609 .mr(4)
10610 .nr(8)
10611 .kr(1)
10612 .sr(1)
10613 .m(4)
10614 .n(8)
10615 .k(8)
10616 .a_stride(11)
10617 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10618 }
10619
10620 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8_subtile) {
10621 TEST_REQUIRES_ARM_NEON;
10622 for (uint32_t m = 1; m <= 4; m++) {
10623 for (uint32_t n = 1; n <= 8; n++) {
10624 GemmMicrokernelTester()
10625 .mr(4)
10626 .nr(8)
10627 .kr(1)
10628 .sr(1)
10629 .m(m)
10630 .n(n)
10631 .k(8)
10632 .iterations(1)
10633 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10634 }
10635 }
10636 }
10637
10638 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8) {
10639 TEST_REQUIRES_ARM_NEON;
10640 for (size_t k = 1; k < 8; k++) {
10641 GemmMicrokernelTester()
10642 .mr(4)
10643 .nr(8)
10644 .kr(1)
10645 .sr(1)
10646 .m(4)
10647 .n(8)
10648 .k(k)
10649 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10650 }
10651 }
10652
10653 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8_strided_a) {
10654 TEST_REQUIRES_ARM_NEON;
10655 for (size_t k = 1; k < 8; k++) {
10656 GemmMicrokernelTester()
10657 .mr(4)
10658 .nr(8)
10659 .kr(1)
10660 .sr(1)
10661 .m(4)
10662 .n(8)
10663 .k(k)
10664 .a_stride(11)
10665 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10666 }
10667 }
10668
10669 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8_subtile) {
10670 TEST_REQUIRES_ARM_NEON;
10671 for (size_t k = 1; k < 8; k++) {
10672 for (uint32_t m = 1; m <= 4; m++) {
10673 for (uint32_t n = 1; n <= 8; n++) {
10674 GemmMicrokernelTester()
10675 .mr(4)
10676 .nr(8)
10677 .kr(1)
10678 .sr(1)
10679 .m(m)
10680 .n(n)
10681 .k(k)
10682 .iterations(1)
10683 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10684 }
10685 }
10686 }
10687 }
10688
10689 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_8) {
10690 TEST_REQUIRES_ARM_NEON;
10691 for (size_t k = 9; k < 8; k++) {
10692 GemmMicrokernelTester()
10693 .mr(4)
10694 .nr(8)
10695 .kr(1)
10696 .sr(1)
10697 .m(4)
10698 .n(8)
10699 .k(k)
10700 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10701 }
10702 }
10703
10704 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_4_strided_a) {
10705 TEST_REQUIRES_ARM_NEON;
10706 for (size_t k = 9; k < 8; k++) {
10707 GemmMicrokernelTester()
10708 .mr(4)
10709 .nr(8)
10710 .kr(1)
10711 .sr(1)
10712 .m(4)
10713 .n(8)
10714 .k(k)
10715 .a_stride(11)
10716 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10717 }
10718 }
10719
10720 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_4_subtile) {
10721 TEST_REQUIRES_ARM_NEON;
10722 for (size_t k = 9; k < 8; k++) {
10723 for (uint32_t m = 1; m <= 4; m++) {
10724 for (uint32_t n = 1; n <= 8; n++) {
10725 GemmMicrokernelTester()
10726 .mr(4)
10727 .nr(8)
10728 .kr(1)
10729 .sr(1)
10730 .m(m)
10731 .n(n)
10732 .k(k)
10733 .iterations(1)
10734 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10735 }
10736 }
10737 }
10738 }
10739
10740 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4) {
10741 TEST_REQUIRES_ARM_NEON;
10742 for (size_t k = 12; k <= 40; k += 4) {
10743 GemmMicrokernelTester()
10744 .mr(4)
10745 .nr(8)
10746 .kr(1)
10747 .sr(1)
10748 .m(4)
10749 .n(8)
10750 .k(k)
10751 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10752 }
10753 }
10754
10755 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4_strided_a) {
10756 TEST_REQUIRES_ARM_NEON;
10757 for (size_t k = 12; k <= 40; k += 4) {
10758 GemmMicrokernelTester()
10759 .mr(4)
10760 .nr(8)
10761 .kr(1)
10762 .sr(1)
10763 .m(4)
10764 .n(8)
10765 .k(k)
10766 .a_stride(43)
10767 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10768 }
10769 }
10770
10771 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4_subtile) {
10772 TEST_REQUIRES_ARM_NEON;
10773 for (size_t k = 12; k <= 40; k += 4) {
10774 for (uint32_t m = 1; m <= 4; m++) {
10775 for (uint32_t n = 1; n <= 8; n++) {
10776 GemmMicrokernelTester()
10777 .mr(4)
10778 .nr(8)
10779 .kr(1)
10780 .sr(1)
10781 .m(m)
10782 .n(n)
10783 .k(k)
10784 .iterations(1)
10785 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10786 }
10787 }
10788 }
10789 }
10790
10791 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8) {
10792 TEST_REQUIRES_ARM_NEON;
10793 for (uint32_t n = 9; n < 16; n++) {
10794 for (size_t k = 1; k <= 20; k += 5) {
10795 GemmMicrokernelTester()
10796 .mr(4)
10797 .nr(8)
10798 .kr(1)
10799 .sr(1)
10800 .m(4)
10801 .n(8)
10802 .k(k)
10803 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10804 }
10805 }
10806 }
10807
10808 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_strided_cn) {
10809 TEST_REQUIRES_ARM_NEON;
10810 for (uint32_t n = 9; n < 16; n++) {
10811 for (size_t k = 1; k <= 20; k += 5) {
10812 GemmMicrokernelTester()
10813 .mr(4)
10814 .nr(8)
10815 .kr(1)
10816 .sr(1)
10817 .m(4)
10818 .n(8)
10819 .k(k)
10820 .cn_stride(11)
10821 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10822 }
10823 }
10824 }
10825
10826 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_strided_a) {
10827 TEST_REQUIRES_ARM_NEON;
10828 for (uint32_t n = 9; n < 16; n++) {
10829 for (size_t k = 1; k <= 20; k += 5) {
10830 GemmMicrokernelTester()
10831 .mr(4)
10832 .nr(8)
10833 .kr(1)
10834 .sr(1)
10835 .m(4)
10836 .n(n)
10837 .k(k)
10838 .a_stride(23)
10839 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10840 }
10841 }
10842 }
10843
10844 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_subtile) {
10845 TEST_REQUIRES_ARM_NEON;
10846 for (uint32_t n = 9; n < 16; n++) {
10847 for (size_t k = 1; k <= 20; k += 5) {
10848 for (uint32_t m = 1; m <= 4; m++) {
10849 GemmMicrokernelTester()
10850 .mr(4)
10851 .nr(8)
10852 .kr(1)
10853 .sr(1)
10854 .m(m)
10855 .n(n)
10856 .k(k)
10857 .iterations(1)
10858 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10859 }
10860 }
10861 }
10862 }
10863
10864 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8) {
10865 TEST_REQUIRES_ARM_NEON;
10866 for (uint32_t n = 16; n <= 24; n += 8) {
10867 for (size_t k = 1; k <= 20; k += 5) {
10868 GemmMicrokernelTester()
10869 .mr(4)
10870 .nr(8)
10871 .kr(1)
10872 .sr(1)
10873 .m(4)
10874 .n(8)
10875 .k(k)
10876 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10877 }
10878 }
10879 }
10880
10881 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_strided_cn) {
10882 TEST_REQUIRES_ARM_NEON;
10883 for (uint32_t n = 16; n <= 24; n += 8) {
10884 for (size_t k = 1; k <= 20; k += 5) {
10885 GemmMicrokernelTester()
10886 .mr(4)
10887 .nr(8)
10888 .kr(1)
10889 .sr(1)
10890 .m(4)
10891 .n(n)
10892 .k(k)
10893 .cn_stride(11)
10894 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10895 }
10896 }
10897 }
10898
10899 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_strided_a) {
10900 TEST_REQUIRES_ARM_NEON;
10901 for (uint32_t n = 16; n <= 24; n += 8) {
10902 for (size_t k = 1; k <= 20; k += 5) {
10903 GemmMicrokernelTester()
10904 .mr(4)
10905 .nr(8)
10906 .kr(1)
10907 .sr(1)
10908 .m(4)
10909 .n(n)
10910 .k(k)
10911 .a_stride(23)
10912 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10913 }
10914 }
10915 }
10916
10917 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_subtile) {
10918 TEST_REQUIRES_ARM_NEON;
10919 for (uint32_t n = 16; n <= 24; n += 8) {
10920 for (size_t k = 1; k <= 20; k += 5) {
10921 for (uint32_t m = 1; m <= 4; m++) {
10922 GemmMicrokernelTester()
10923 .mr(4)
10924 .nr(8)
10925 .kr(1)
10926 .sr(1)
10927 .m(m)
10928 .n(n)
10929 .k(k)
10930 .iterations(1)
10931 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10932 }
10933 }
10934 }
10935 }
10936
10937 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm_subtile) {
10938 TEST_REQUIRES_ARM_NEON;
10939 for (size_t k = 1; k <= 20; k += 5) {
10940 for (uint32_t m = 1; m <= 4; m++) {
10941 for (uint32_t n = 1; n <= 8; n++) {
10942 GemmMicrokernelTester()
10943 .mr(4)
10944 .nr(8)
10945 .kr(1)
10946 .sr(1)
10947 .m(m)
10948 .n(n)
10949 .k(k)
10950 .cm_stride(11)
10951 .iterations(1)
10952 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10953 }
10954 }
10955 }
10956 }
10957
10958 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmin) {
10959 TEST_REQUIRES_ARM_NEON;
10960 GemmMicrokernelTester()
10961 .mr(4)
10962 .nr(8)
10963 .kr(1)
10964 .sr(1)
10965 .m(4)
10966 .n(8)
10967 .k(4)
10968 .qmin(128)
10969 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10970 }
10971
10972 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmax) {
10973 TEST_REQUIRES_ARM_NEON;
10974 GemmMicrokernelTester()
10975 .mr(4)
10976 .nr(8)
10977 .kr(1)
10978 .sr(1)
10979 .m(4)
10980 .n(8)
10981 .k(4)
10982 .qmax(128)
10983 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10984 }
10985
10986 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm) {
10987 TEST_REQUIRES_ARM_NEON;
10988 GemmMicrokernelTester()
10989 .mr(4)
10990 .nr(8)
10991 .kr(1)
10992 .sr(1)
10993 .m(4)
10994 .n(8)
10995 .k(4)
10996 .cm_stride(11)
10997 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
10998 }
10999#endif // XNN_ARCH_ARM
11000
11001
11002#if XNN_ARCH_ARM
11003 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2) {
11004 TEST_REQUIRES_ARM_NEON;
11005 GemmMicrokernelTester()
11006 .mr(4)
11007 .nr(8)
11008 .kr(1)
11009 .sr(1)
11010 .m(4)
11011 .n(8)
11012 .k(2)
11013 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11014 }
11015
11016 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, strided_cn) {
11017 TEST_REQUIRES_ARM_NEON;
11018 GemmMicrokernelTester()
11019 .mr(4)
11020 .nr(8)
11021 .kr(1)
11022 .sr(1)
11023 .m(4)
11024 .n(8)
11025 .k(2)
11026 .cn_stride(11)
11027 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11028 }
11029
11030 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_strided_a) {
11031 TEST_REQUIRES_ARM_NEON;
11032 GemmMicrokernelTester()
11033 .mr(4)
11034 .nr(8)
11035 .kr(1)
11036 .sr(1)
11037 .m(4)
11038 .n(8)
11039 .k(2)
11040 .a_stride(5)
11041 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11042 }
11043
11044 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile) {
11045 TEST_REQUIRES_ARM_NEON;
11046 for (uint32_t m = 1; m <= 4; m++) {
11047 for (uint32_t n = 1; n <= 8; n++) {
11048 GemmMicrokernelTester()
11049 .mr(4)
11050 .nr(8)
11051 .kr(1)
11052 .sr(1)
11053 .m(m)
11054 .n(n)
11055 .k(2)
11056 .iterations(1)
11057 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11058 }
11059 }
11060 }
11061
11062 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_m) {
11063 TEST_REQUIRES_ARM_NEON;
11064 for (uint32_t m = 1; m <= 4; m++) {
11065 GemmMicrokernelTester()
11066 .mr(4)
11067 .nr(8)
11068 .kr(1)
11069 .sr(1)
11070 .m(m)
11071 .n(8)
11072 .k(2)
11073 .iterations(1)
11074 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11075 }
11076 }
11077
11078 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_n) {
11079 TEST_REQUIRES_ARM_NEON;
11080 for (uint32_t n = 1; n <= 8; n++) {
11081 GemmMicrokernelTester()
11082 .mr(4)
11083 .nr(8)
11084 .kr(1)
11085 .sr(1)
11086 .m(4)
11087 .n(n)
11088 .k(2)
11089 .iterations(1)
11090 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11091 }
11092 }
11093
11094 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_lt_2) {
11095 TEST_REQUIRES_ARM_NEON;
11096 for (size_t k = 1; k < 2; k++) {
11097 GemmMicrokernelTester()
11098 .mr(4)
11099 .nr(8)
11100 .kr(1)
11101 .sr(1)
11102 .m(4)
11103 .n(8)
11104 .k(k)
11105 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11106 }
11107 }
11108
11109 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_lt_2_strided_a) {
11110 TEST_REQUIRES_ARM_NEON;
11111 for (size_t k = 1; k < 2; k++) {
11112 GemmMicrokernelTester()
11113 .mr(4)
11114 .nr(8)
11115 .kr(1)
11116 .sr(1)
11117 .m(4)
11118 .n(8)
11119 .k(k)
11120 .a_stride(5)
11121 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11122 }
11123 }
11124
11125 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_lt_2_subtile) {
11126 TEST_REQUIRES_ARM_NEON;
11127 for (size_t k = 1; k < 2; k++) {
11128 for (uint32_t m = 1; m <= 4; m++) {
11129 for (uint32_t n = 1; n <= 8; n++) {
11130 GemmMicrokernelTester()
11131 .mr(4)
11132 .nr(8)
11133 .kr(1)
11134 .sr(1)
11135 .m(m)
11136 .n(n)
11137 .k(k)
11138 .iterations(1)
11139 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11140 }
11141 }
11142 }
11143 }
11144
11145 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_gt_2) {
11146 TEST_REQUIRES_ARM_NEON;
11147 for (size_t k = 3; k < 4; k++) {
11148 GemmMicrokernelTester()
11149 .mr(4)
11150 .nr(8)
11151 .kr(1)
11152 .sr(1)
11153 .m(4)
11154 .n(8)
11155 .k(k)
11156 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11157 }
11158 }
11159
11160 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_gt_2_strided_a) {
11161 TEST_REQUIRES_ARM_NEON;
11162 for (size_t k = 3; k < 4; k++) {
11163 GemmMicrokernelTester()
11164 .mr(4)
11165 .nr(8)
11166 .kr(1)
11167 .sr(1)
11168 .m(4)
11169 .n(8)
11170 .k(k)
11171 .a_stride(7)
11172 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11173 }
11174 }
11175
11176 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_gt_2_subtile) {
11177 TEST_REQUIRES_ARM_NEON;
11178 for (size_t k = 3; k < 4; k++) {
11179 for (uint32_t m = 1; m <= 4; m++) {
11180 for (uint32_t n = 1; n <= 8; n++) {
11181 GemmMicrokernelTester()
11182 .mr(4)
11183 .nr(8)
11184 .kr(1)
11185 .sr(1)
11186 .m(m)
11187 .n(n)
11188 .k(k)
11189 .iterations(1)
11190 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11191 }
11192 }
11193 }
11194 }
11195
11196 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_div_2) {
11197 TEST_REQUIRES_ARM_NEON;
11198 for (size_t k = 4; k <= 20; k += 2) {
11199 GemmMicrokernelTester()
11200 .mr(4)
11201 .nr(8)
11202 .kr(1)
11203 .sr(1)
11204 .m(4)
11205 .n(8)
11206 .k(k)
11207 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11208 }
11209 }
11210
11211 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_div_2_strided_a) {
11212 TEST_REQUIRES_ARM_NEON;
11213 for (size_t k = 4; k <= 20; k += 2) {
11214 GemmMicrokernelTester()
11215 .mr(4)
11216 .nr(8)
11217 .kr(1)
11218 .sr(1)
11219 .m(4)
11220 .n(8)
11221 .k(k)
11222 .a_stride(23)
11223 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11224 }
11225 }
11226
11227 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_div_2_subtile) {
11228 TEST_REQUIRES_ARM_NEON;
11229 for (size_t k = 4; k <= 20; k += 2) {
11230 for (uint32_t m = 1; m <= 4; m++) {
11231 for (uint32_t n = 1; n <= 8; n++) {
11232 GemmMicrokernelTester()
11233 .mr(4)
11234 .nr(8)
11235 .kr(1)
11236 .sr(1)
11237 .m(m)
11238 .n(n)
11239 .k(k)
11240 .iterations(1)
11241 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11242 }
11243 }
11244 }
11245 }
11246
11247 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8) {
11248 TEST_REQUIRES_ARM_NEON;
11249 for (uint32_t n = 9; n < 16; n++) {
11250 for (size_t k = 1; k <= 10; k += 3) {
11251 GemmMicrokernelTester()
11252 .mr(4)
11253 .nr(8)
11254 .kr(1)
11255 .sr(1)
11256 .m(4)
11257 .n(8)
11258 .k(k)
11259 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11260 }
11261 }
11262 }
11263
11264 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8_strided_cn) {
11265 TEST_REQUIRES_ARM_NEON;
11266 for (uint32_t n = 9; n < 16; n++) {
11267 for (size_t k = 1; k <= 10; k += 3) {
11268 GemmMicrokernelTester()
11269 .mr(4)
11270 .nr(8)
11271 .kr(1)
11272 .sr(1)
11273 .m(4)
11274 .n(8)
11275 .k(k)
11276 .cn_stride(11)
11277 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11278 }
11279 }
11280 }
11281
11282 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8_strided_a) {
11283 TEST_REQUIRES_ARM_NEON;
11284 for (uint32_t n = 9; n < 16; n++) {
11285 for (size_t k = 1; k <= 10; k += 3) {
11286 GemmMicrokernelTester()
11287 .mr(4)
11288 .nr(8)
11289 .kr(1)
11290 .sr(1)
11291 .m(4)
11292 .n(n)
11293 .k(k)
11294 .a_stride(13)
11295 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11296 }
11297 }
11298 }
11299
11300 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8_subtile) {
11301 TEST_REQUIRES_ARM_NEON;
11302 for (uint32_t n = 9; n < 16; n++) {
11303 for (size_t k = 1; k <= 10; k += 3) {
11304 for (uint32_t m = 1; m <= 4; m++) {
11305 GemmMicrokernelTester()
11306 .mr(4)
11307 .nr(8)
11308 .kr(1)
11309 .sr(1)
11310 .m(m)
11311 .n(n)
11312 .k(k)
11313 .iterations(1)
11314 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11315 }
11316 }
11317 }
11318 }
11319
11320 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8) {
11321 TEST_REQUIRES_ARM_NEON;
11322 for (uint32_t n = 16; n <= 24; n += 8) {
11323 for (size_t k = 1; k <= 10; k += 3) {
11324 GemmMicrokernelTester()
11325 .mr(4)
11326 .nr(8)
11327 .kr(1)
11328 .sr(1)
11329 .m(4)
11330 .n(8)
11331 .k(k)
11332 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11333 }
11334 }
11335 }
11336
11337 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8_strided_cn) {
11338 TEST_REQUIRES_ARM_NEON;
11339 for (uint32_t n = 16; n <= 24; n += 8) {
11340 for (size_t k = 1; k <= 10; k += 3) {
11341 GemmMicrokernelTester()
11342 .mr(4)
11343 .nr(8)
11344 .kr(1)
11345 .sr(1)
11346 .m(4)
11347 .n(n)
11348 .k(k)
11349 .cn_stride(11)
11350 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11351 }
11352 }
11353 }
11354
11355 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8_strided_a) {
11356 TEST_REQUIRES_ARM_NEON;
11357 for (uint32_t n = 16; n <= 24; n += 8) {
11358 for (size_t k = 1; k <= 10; k += 3) {
11359 GemmMicrokernelTester()
11360 .mr(4)
11361 .nr(8)
11362 .kr(1)
11363 .sr(1)
11364 .m(4)
11365 .n(n)
11366 .k(k)
11367 .a_stride(13)
11368 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11369 }
11370 }
11371 }
11372
11373 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8_subtile) {
11374 TEST_REQUIRES_ARM_NEON;
11375 for (uint32_t n = 16; n <= 24; n += 8) {
11376 for (size_t k = 1; k <= 10; k += 3) {
11377 for (uint32_t m = 1; m <= 4; m++) {
11378 GemmMicrokernelTester()
11379 .mr(4)
11380 .nr(8)
11381 .kr(1)
11382 .sr(1)
11383 .m(m)
11384 .n(n)
11385 .k(k)
11386 .iterations(1)
11387 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11388 }
11389 }
11390 }
11391 }
11392
11393 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, strided_cm_subtile) {
11394 TEST_REQUIRES_ARM_NEON;
11395 for (size_t k = 1; k <= 10; k += 3) {
11396 for (uint32_t m = 1; m <= 4; m++) {
11397 for (uint32_t n = 1; n <= 8; n++) {
11398 GemmMicrokernelTester()
11399 .mr(4)
11400 .nr(8)
11401 .kr(1)
11402 .sr(1)
11403 .m(m)
11404 .n(n)
11405 .k(k)
11406 .cm_stride(11)
11407 .iterations(1)
11408 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11409 }
11410 }
11411 }
11412 }
11413
11414 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, qmin) {
11415 TEST_REQUIRES_ARM_NEON;
11416 GemmMicrokernelTester()
11417 .mr(4)
11418 .nr(8)
11419 .kr(1)
11420 .sr(1)
11421 .m(4)
11422 .n(8)
11423 .k(2)
11424 .qmin(128)
11425 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11426 }
11427
11428 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, qmax) {
11429 TEST_REQUIRES_ARM_NEON;
11430 GemmMicrokernelTester()
11431 .mr(4)
11432 .nr(8)
11433 .kr(1)
11434 .sr(1)
11435 .m(4)
11436 .n(8)
11437 .k(2)
11438 .qmax(128)
11439 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11440 }
11441
11442 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, strided_cm) {
11443 TEST_REQUIRES_ARM_NEON;
11444 GemmMicrokernelTester()
11445 .mr(4)
11446 .nr(8)
11447 .kr(1)
11448 .sr(1)
11449 .m(4)
11450 .n(8)
11451 .k(2)
11452 .cm_stride(11)
11453 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
11454 }
11455#endif // XNN_ARCH_ARM
11456
11457
11458#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
11459 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
11460 TEST_REQUIRES_ARM_NEON_FMA;
11461 GemmMicrokernelTester()
11462 .mr(4)
11463 .nr(8)
11464 .kr(1)
11465 .sr(1)
11466 .m(4)
11467 .n(8)
11468 .k(4)
11469 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11470 }
11471
11472 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, strided_cn) {
11473 TEST_REQUIRES_ARM_NEON_FMA;
11474 GemmMicrokernelTester()
11475 .mr(4)
11476 .nr(8)
11477 .kr(1)
11478 .sr(1)
11479 .m(4)
11480 .n(8)
11481 .k(4)
11482 .cn_stride(11)
11483 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11484 }
11485
11486 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
11487 TEST_REQUIRES_ARM_NEON_FMA;
11488 GemmMicrokernelTester()
11489 .mr(4)
11490 .nr(8)
11491 .kr(1)
11492 .sr(1)
11493 .m(4)
11494 .n(8)
11495 .k(4)
11496 .a_stride(7)
11497 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11498 }
11499
11500 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
11501 TEST_REQUIRES_ARM_NEON_FMA;
11502 for (uint32_t m = 1; m <= 4; m++) {
11503 for (uint32_t n = 1; n <= 8; n++) {
11504 GemmMicrokernelTester()
11505 .mr(4)
11506 .nr(8)
11507 .kr(1)
11508 .sr(1)
11509 .m(m)
11510 .n(n)
11511 .k(4)
11512 .iterations(1)
11513 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11514 }
11515 }
11516 }
11517
11518 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
11519 TEST_REQUIRES_ARM_NEON_FMA;
11520 for (uint32_t m = 1; m <= 4; m++) {
11521 GemmMicrokernelTester()
11522 .mr(4)
11523 .nr(8)
11524 .kr(1)
11525 .sr(1)
11526 .m(m)
11527 .n(8)
11528 .k(4)
11529 .iterations(1)
11530 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11531 }
11532 }
11533
11534 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
11535 TEST_REQUIRES_ARM_NEON_FMA;
11536 for (uint32_t n = 1; n <= 8; n++) {
11537 GemmMicrokernelTester()
11538 .mr(4)
11539 .nr(8)
11540 .kr(1)
11541 .sr(1)
11542 .m(4)
11543 .n(n)
11544 .k(4)
11545 .iterations(1)
11546 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11547 }
11548 }
11549
11550 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_lt_4) {
11551 TEST_REQUIRES_ARM_NEON_FMA;
11552 for (size_t k = 1; k < 4; k++) {
11553 GemmMicrokernelTester()
11554 .mr(4)
11555 .nr(8)
11556 .kr(1)
11557 .sr(1)
11558 .m(4)
11559 .n(8)
11560 .k(k)
11561 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11562 }
11563 }
11564
11565 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
11566 TEST_REQUIRES_ARM_NEON_FMA;
11567 for (size_t k = 1; k < 4; k++) {
11568 GemmMicrokernelTester()
11569 .mr(4)
11570 .nr(8)
11571 .kr(1)
11572 .sr(1)
11573 .m(4)
11574 .n(8)
11575 .k(k)
11576 .a_stride(7)
11577 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11578 }
11579 }
11580
11581 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
11582 TEST_REQUIRES_ARM_NEON_FMA;
11583 for (size_t k = 1; k < 4; k++) {
11584 for (uint32_t m = 1; m <= 4; m++) {
11585 for (uint32_t n = 1; n <= 8; n++) {
11586 GemmMicrokernelTester()
11587 .mr(4)
11588 .nr(8)
11589 .kr(1)
11590 .sr(1)
11591 .m(m)
11592 .n(n)
11593 .k(k)
11594 .iterations(1)
11595 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11596 }
11597 }
11598 }
11599 }
11600
11601 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_gt_4) {
11602 TEST_REQUIRES_ARM_NEON_FMA;
11603 for (size_t k = 5; k < 8; k++) {
11604 GemmMicrokernelTester()
11605 .mr(4)
11606 .nr(8)
11607 .kr(1)
11608 .sr(1)
11609 .m(4)
11610 .n(8)
11611 .k(k)
11612 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11613 }
11614 }
11615
11616 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
11617 TEST_REQUIRES_ARM_NEON_FMA;
11618 for (size_t k = 5; k < 8; k++) {
11619 GemmMicrokernelTester()
11620 .mr(4)
11621 .nr(8)
11622 .kr(1)
11623 .sr(1)
11624 .m(4)
11625 .n(8)
11626 .k(k)
11627 .a_stride(11)
11628 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11629 }
11630 }
11631
11632 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
11633 TEST_REQUIRES_ARM_NEON_FMA;
11634 for (size_t k = 5; k < 8; k++) {
11635 for (uint32_t m = 1; m <= 4; m++) {
11636 for (uint32_t n = 1; n <= 8; n++) {
11637 GemmMicrokernelTester()
11638 .mr(4)
11639 .nr(8)
11640 .kr(1)
11641 .sr(1)
11642 .m(m)
11643 .n(n)
11644 .k(k)
11645 .iterations(1)
11646 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11647 }
11648 }
11649 }
11650 }
11651
11652 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_div_4) {
11653 TEST_REQUIRES_ARM_NEON_FMA;
11654 for (size_t k = 8; k <= 40; k += 4) {
11655 GemmMicrokernelTester()
11656 .mr(4)
11657 .nr(8)
11658 .kr(1)
11659 .sr(1)
11660 .m(4)
11661 .n(8)
11662 .k(k)
11663 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11664 }
11665 }
11666
11667 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
11668 TEST_REQUIRES_ARM_NEON_FMA;
11669 for (size_t k = 8; k <= 40; k += 4) {
11670 GemmMicrokernelTester()
11671 .mr(4)
11672 .nr(8)
11673 .kr(1)
11674 .sr(1)
11675 .m(4)
11676 .n(8)
11677 .k(k)
11678 .a_stride(43)
11679 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11680 }
11681 }
11682
11683 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
11684 TEST_REQUIRES_ARM_NEON_FMA;
11685 for (size_t k = 8; k <= 40; k += 4) {
11686 for (uint32_t m = 1; m <= 4; m++) {
11687 for (uint32_t n = 1; n <= 8; n++) {
11688 GemmMicrokernelTester()
11689 .mr(4)
11690 .nr(8)
11691 .kr(1)
11692 .sr(1)
11693 .m(m)
11694 .n(n)
11695 .k(k)
11696 .iterations(1)
11697 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11698 }
11699 }
11700 }
11701 }
11702
11703 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8) {
11704 TEST_REQUIRES_ARM_NEON_FMA;
11705 for (uint32_t n = 9; n < 16; n++) {
11706 for (size_t k = 1; k <= 20; k += 5) {
11707 GemmMicrokernelTester()
11708 .mr(4)
11709 .nr(8)
11710 .kr(1)
11711 .sr(1)
11712 .m(4)
11713 .n(8)
11714 .k(k)
11715 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11716 }
11717 }
11718 }
11719
11720 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
11721 TEST_REQUIRES_ARM_NEON_FMA;
11722 for (uint32_t n = 9; n < 16; n++) {
11723 for (size_t k = 1; k <= 20; k += 5) {
11724 GemmMicrokernelTester()
11725 .mr(4)
11726 .nr(8)
11727 .kr(1)
11728 .sr(1)
11729 .m(4)
11730 .n(8)
11731 .k(k)
11732 .cn_stride(11)
11733 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11734 }
11735 }
11736 }
11737
11738 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
11739 TEST_REQUIRES_ARM_NEON_FMA;
11740 for (uint32_t n = 9; n < 16; n++) {
11741 for (size_t k = 1; k <= 20; k += 5) {
11742 GemmMicrokernelTester()
11743 .mr(4)
11744 .nr(8)
11745 .kr(1)
11746 .sr(1)
11747 .m(4)
11748 .n(n)
11749 .k(k)
11750 .a_stride(23)
11751 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11752 }
11753 }
11754 }
11755
11756 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
11757 TEST_REQUIRES_ARM_NEON_FMA;
11758 for (uint32_t n = 9; n < 16; n++) {
11759 for (size_t k = 1; k <= 20; k += 5) {
11760 for (uint32_t m = 1; m <= 4; m++) {
11761 GemmMicrokernelTester()
11762 .mr(4)
11763 .nr(8)
11764 .kr(1)
11765 .sr(1)
11766 .m(m)
11767 .n(n)
11768 .k(k)
11769 .iterations(1)
11770 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11771 }
11772 }
11773 }
11774 }
11775
11776 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8) {
11777 TEST_REQUIRES_ARM_NEON_FMA;
11778 for (uint32_t n = 16; n <= 24; n += 8) {
11779 for (size_t k = 1; k <= 20; k += 5) {
11780 GemmMicrokernelTester()
11781 .mr(4)
11782 .nr(8)
11783 .kr(1)
11784 .sr(1)
11785 .m(4)
11786 .n(8)
11787 .k(k)
11788 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11789 }
11790 }
11791 }
11792
11793 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
11794 TEST_REQUIRES_ARM_NEON_FMA;
11795 for (uint32_t n = 16; n <= 24; n += 8) {
11796 for (size_t k = 1; k <= 20; k += 5) {
11797 GemmMicrokernelTester()
11798 .mr(4)
11799 .nr(8)
11800 .kr(1)
11801 .sr(1)
11802 .m(4)
11803 .n(n)
11804 .k(k)
11805 .cn_stride(11)
11806 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11807 }
11808 }
11809 }
11810
11811 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
11812 TEST_REQUIRES_ARM_NEON_FMA;
11813 for (uint32_t n = 16; n <= 24; n += 8) {
11814 for (size_t k = 1; k <= 20; k += 5) {
11815 GemmMicrokernelTester()
11816 .mr(4)
11817 .nr(8)
11818 .kr(1)
11819 .sr(1)
11820 .m(4)
11821 .n(n)
11822 .k(k)
11823 .a_stride(23)
11824 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11825 }
11826 }
11827 }
11828
11829 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
11830 TEST_REQUIRES_ARM_NEON_FMA;
11831 for (uint32_t n = 16; n <= 24; n += 8) {
11832 for (size_t k = 1; k <= 20; k += 5) {
11833 for (uint32_t m = 1; m <= 4; m++) {
11834 GemmMicrokernelTester()
11835 .mr(4)
11836 .nr(8)
11837 .kr(1)
11838 .sr(1)
11839 .m(m)
11840 .n(n)
11841 .k(k)
11842 .iterations(1)
11843 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11844 }
11845 }
11846 }
11847 }
11848
11849 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
11850 TEST_REQUIRES_ARM_NEON_FMA;
11851 for (size_t k = 1; k <= 20; k += 5) {
11852 for (uint32_t m = 1; m <= 4; m++) {
11853 for (uint32_t n = 1; n <= 8; n++) {
11854 GemmMicrokernelTester()
11855 .mr(4)
11856 .nr(8)
11857 .kr(1)
11858 .sr(1)
11859 .m(m)
11860 .n(n)
11861 .k(k)
11862 .cm_stride(11)
11863 .iterations(1)
11864 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11865 }
11866 }
11867 }
11868 }
11869
11870 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, qmin) {
11871 TEST_REQUIRES_ARM_NEON_FMA;
11872 GemmMicrokernelTester()
11873 .mr(4)
11874 .nr(8)
11875 .kr(1)
11876 .sr(1)
11877 .m(4)
11878 .n(8)
11879 .k(4)
11880 .qmin(128)
11881 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11882 }
11883
11884 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, qmax) {
11885 TEST_REQUIRES_ARM_NEON_FMA;
11886 GemmMicrokernelTester()
11887 .mr(4)
11888 .nr(8)
11889 .kr(1)
11890 .sr(1)
11891 .m(4)
11892 .n(8)
11893 .k(4)
11894 .qmax(128)
11895 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11896 }
11897
11898 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, strided_cm) {
11899 TEST_REQUIRES_ARM_NEON_FMA;
11900 GemmMicrokernelTester()
11901 .mr(4)
11902 .nr(8)
11903 .kr(1)
11904 .sr(1)
11905 .m(4)
11906 .n(8)
11907 .k(4)
11908 .cm_stride(11)
11909 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
11910 }
11911#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
11912
11913
11914#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
11915 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
11916 TEST_REQUIRES_ARM_NEON_FMA;
11917 GemmMicrokernelTester()
11918 .mr(6)
11919 .nr(8)
11920 .kr(1)
11921 .sr(1)
11922 .m(6)
11923 .n(8)
11924 .k(2)
11925 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
11926 }
11927
11928 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, strided_cn) {
11929 TEST_REQUIRES_ARM_NEON_FMA;
11930 GemmMicrokernelTester()
11931 .mr(6)
11932 .nr(8)
11933 .kr(1)
11934 .sr(1)
11935 .m(6)
11936 .n(8)
11937 .k(2)
11938 .cn_stride(11)
11939 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
11940 }
11941
11942 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
11943 TEST_REQUIRES_ARM_NEON_FMA;
11944 GemmMicrokernelTester()
11945 .mr(6)
11946 .nr(8)
11947 .kr(1)
11948 .sr(1)
11949 .m(6)
11950 .n(8)
11951 .k(2)
11952 .a_stride(5)
11953 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
11954 }
11955
11956 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
11957 TEST_REQUIRES_ARM_NEON_FMA;
11958 for (uint32_t m = 1; m <= 6; m++) {
11959 for (uint32_t n = 1; n <= 8; n++) {
11960 GemmMicrokernelTester()
11961 .mr(6)
11962 .nr(8)
11963 .kr(1)
11964 .sr(1)
11965 .m(m)
11966 .n(n)
11967 .k(2)
11968 .iterations(1)
11969 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
11970 }
11971 }
11972 }
11973
11974 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
11975 TEST_REQUIRES_ARM_NEON_FMA;
11976 for (uint32_t m = 1; m <= 6; m++) {
11977 GemmMicrokernelTester()
11978 .mr(6)
11979 .nr(8)
11980 .kr(1)
11981 .sr(1)
11982 .m(m)
11983 .n(8)
11984 .k(2)
11985 .iterations(1)
11986 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
11987 }
11988 }
11989
11990 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
11991 TEST_REQUIRES_ARM_NEON_FMA;
11992 for (uint32_t n = 1; n <= 8; n++) {
11993 GemmMicrokernelTester()
11994 .mr(6)
11995 .nr(8)
11996 .kr(1)
11997 .sr(1)
11998 .m(6)
11999 .n(n)
12000 .k(2)
12001 .iterations(1)
12002 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12003 }
12004 }
12005
12006 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_lt_2) {
12007 TEST_REQUIRES_ARM_NEON_FMA;
12008 for (size_t k = 1; k < 2; k++) {
12009 GemmMicrokernelTester()
12010 .mr(6)
12011 .nr(8)
12012 .kr(1)
12013 .sr(1)
12014 .m(6)
12015 .n(8)
12016 .k(k)
12017 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12018 }
12019 }
12020
12021 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
12022 TEST_REQUIRES_ARM_NEON_FMA;
12023 for (size_t k = 1; k < 2; k++) {
12024 GemmMicrokernelTester()
12025 .mr(6)
12026 .nr(8)
12027 .kr(1)
12028 .sr(1)
12029 .m(6)
12030 .n(8)
12031 .k(k)
12032 .a_stride(5)
12033 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12034 }
12035 }
12036
12037 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
12038 TEST_REQUIRES_ARM_NEON_FMA;
12039 for (size_t k = 1; k < 2; k++) {
12040 for (uint32_t m = 1; m <= 6; m++) {
12041 for (uint32_t n = 1; n <= 8; n++) {
12042 GemmMicrokernelTester()
12043 .mr(6)
12044 .nr(8)
12045 .kr(1)
12046 .sr(1)
12047 .m(m)
12048 .n(n)
12049 .k(k)
12050 .iterations(1)
12051 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12052 }
12053 }
12054 }
12055 }
12056
12057 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_gt_2) {
12058 TEST_REQUIRES_ARM_NEON_FMA;
12059 for (size_t k = 3; k < 4; k++) {
12060 GemmMicrokernelTester()
12061 .mr(6)
12062 .nr(8)
12063 .kr(1)
12064 .sr(1)
12065 .m(6)
12066 .n(8)
12067 .k(k)
12068 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12069 }
12070 }
12071
12072 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
12073 TEST_REQUIRES_ARM_NEON_FMA;
12074 for (size_t k = 3; k < 4; k++) {
12075 GemmMicrokernelTester()
12076 .mr(6)
12077 .nr(8)
12078 .kr(1)
12079 .sr(1)
12080 .m(6)
12081 .n(8)
12082 .k(k)
12083 .a_stride(7)
12084 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12085 }
12086 }
12087
12088 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
12089 TEST_REQUIRES_ARM_NEON_FMA;
12090 for (size_t k = 3; k < 4; k++) {
12091 for (uint32_t m = 1; m <= 6; m++) {
12092 for (uint32_t n = 1; n <= 8; n++) {
12093 GemmMicrokernelTester()
12094 .mr(6)
12095 .nr(8)
12096 .kr(1)
12097 .sr(1)
12098 .m(m)
12099 .n(n)
12100 .k(k)
12101 .iterations(1)
12102 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12103 }
12104 }
12105 }
12106 }
12107
12108 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_div_2) {
12109 TEST_REQUIRES_ARM_NEON_FMA;
12110 for (size_t k = 4; k <= 20; k += 2) {
12111 GemmMicrokernelTester()
12112 .mr(6)
12113 .nr(8)
12114 .kr(1)
12115 .sr(1)
12116 .m(6)
12117 .n(8)
12118 .k(k)
12119 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12120 }
12121 }
12122
12123 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
12124 TEST_REQUIRES_ARM_NEON_FMA;
12125 for (size_t k = 4; k <= 20; k += 2) {
12126 GemmMicrokernelTester()
12127 .mr(6)
12128 .nr(8)
12129 .kr(1)
12130 .sr(1)
12131 .m(6)
12132 .n(8)
12133 .k(k)
12134 .a_stride(23)
12135 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12136 }
12137 }
12138
12139 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
12140 TEST_REQUIRES_ARM_NEON_FMA;
12141 for (size_t k = 4; k <= 20; k += 2) {
12142 for (uint32_t m = 1; m <= 6; m++) {
12143 for (uint32_t n = 1; n <= 8; n++) {
12144 GemmMicrokernelTester()
12145 .mr(6)
12146 .nr(8)
12147 .kr(1)
12148 .sr(1)
12149 .m(m)
12150 .n(n)
12151 .k(k)
12152 .iterations(1)
12153 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12154 }
12155 }
12156 }
12157 }
12158
12159 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8) {
12160 TEST_REQUIRES_ARM_NEON_FMA;
12161 for (uint32_t n = 9; n < 16; n++) {
12162 for (size_t k = 1; k <= 10; k += 3) {
12163 GemmMicrokernelTester()
12164 .mr(6)
12165 .nr(8)
12166 .kr(1)
12167 .sr(1)
12168 .m(6)
12169 .n(8)
12170 .k(k)
12171 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12172 }
12173 }
12174 }
12175
12176 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
12177 TEST_REQUIRES_ARM_NEON_FMA;
12178 for (uint32_t n = 9; n < 16; n++) {
12179 for (size_t k = 1; k <= 10; k += 3) {
12180 GemmMicrokernelTester()
12181 .mr(6)
12182 .nr(8)
12183 .kr(1)
12184 .sr(1)
12185 .m(6)
12186 .n(8)
12187 .k(k)
12188 .cn_stride(11)
12189 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12190 }
12191 }
12192 }
12193
12194 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
12195 TEST_REQUIRES_ARM_NEON_FMA;
12196 for (uint32_t n = 9; n < 16; n++) {
12197 for (size_t k = 1; k <= 10; k += 3) {
12198 GemmMicrokernelTester()
12199 .mr(6)
12200 .nr(8)
12201 .kr(1)
12202 .sr(1)
12203 .m(6)
12204 .n(n)
12205 .k(k)
12206 .a_stride(13)
12207 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12208 }
12209 }
12210 }
12211
12212 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
12213 TEST_REQUIRES_ARM_NEON_FMA;
12214 for (uint32_t n = 9; n < 16; n++) {
12215 for (size_t k = 1; k <= 10; k += 3) {
12216 for (uint32_t m = 1; m <= 6; m++) {
12217 GemmMicrokernelTester()
12218 .mr(6)
12219 .nr(8)
12220 .kr(1)
12221 .sr(1)
12222 .m(m)
12223 .n(n)
12224 .k(k)
12225 .iterations(1)
12226 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12227 }
12228 }
12229 }
12230 }
12231
12232 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8) {
12233 TEST_REQUIRES_ARM_NEON_FMA;
12234 for (uint32_t n = 16; n <= 24; n += 8) {
12235 for (size_t k = 1; k <= 10; k += 3) {
12236 GemmMicrokernelTester()
12237 .mr(6)
12238 .nr(8)
12239 .kr(1)
12240 .sr(1)
12241 .m(6)
12242 .n(8)
12243 .k(k)
12244 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12245 }
12246 }
12247 }
12248
12249 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
12250 TEST_REQUIRES_ARM_NEON_FMA;
12251 for (uint32_t n = 16; n <= 24; n += 8) {
12252 for (size_t k = 1; k <= 10; k += 3) {
12253 GemmMicrokernelTester()
12254 .mr(6)
12255 .nr(8)
12256 .kr(1)
12257 .sr(1)
12258 .m(6)
12259 .n(n)
12260 .k(k)
12261 .cn_stride(11)
12262 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12263 }
12264 }
12265 }
12266
12267 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
12268 TEST_REQUIRES_ARM_NEON_FMA;
12269 for (uint32_t n = 16; n <= 24; n += 8) {
12270 for (size_t k = 1; k <= 10; k += 3) {
12271 GemmMicrokernelTester()
12272 .mr(6)
12273 .nr(8)
12274 .kr(1)
12275 .sr(1)
12276 .m(6)
12277 .n(n)
12278 .k(k)
12279 .a_stride(13)
12280 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12281 }
12282 }
12283 }
12284
12285 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
12286 TEST_REQUIRES_ARM_NEON_FMA;
12287 for (uint32_t n = 16; n <= 24; n += 8) {
12288 for (size_t k = 1; k <= 10; k += 3) {
12289 for (uint32_t m = 1; m <= 6; m++) {
12290 GemmMicrokernelTester()
12291 .mr(6)
12292 .nr(8)
12293 .kr(1)
12294 .sr(1)
12295 .m(m)
12296 .n(n)
12297 .k(k)
12298 .iterations(1)
12299 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12300 }
12301 }
12302 }
12303 }
12304
12305 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
12306 TEST_REQUIRES_ARM_NEON_FMA;
12307 for (size_t k = 1; k <= 10; k += 3) {
12308 for (uint32_t m = 1; m <= 6; m++) {
12309 for (uint32_t n = 1; n <= 8; n++) {
12310 GemmMicrokernelTester()
12311 .mr(6)
12312 .nr(8)
12313 .kr(1)
12314 .sr(1)
12315 .m(m)
12316 .n(n)
12317 .k(k)
12318 .cm_stride(11)
12319 .iterations(1)
12320 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12321 }
12322 }
12323 }
12324 }
12325
12326 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, qmin) {
12327 TEST_REQUIRES_ARM_NEON_FMA;
12328 GemmMicrokernelTester()
12329 .mr(6)
12330 .nr(8)
12331 .kr(1)
12332 .sr(1)
12333 .m(6)
12334 .n(8)
12335 .k(2)
12336 .qmin(128)
12337 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12338 }
12339
12340 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, qmax) {
12341 TEST_REQUIRES_ARM_NEON_FMA;
12342 GemmMicrokernelTester()
12343 .mr(6)
12344 .nr(8)
12345 .kr(1)
12346 .sr(1)
12347 .m(6)
12348 .n(8)
12349 .k(2)
12350 .qmax(128)
12351 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12352 }
12353
12354 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, strided_cm) {
12355 TEST_REQUIRES_ARM_NEON_FMA;
12356 GemmMicrokernelTester()
12357 .mr(6)
12358 .nr(8)
12359 .kr(1)
12360 .sr(1)
12361 .m(6)
12362 .n(8)
12363 .k(2)
12364 .cm_stride(11)
12365 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
12366 }
12367#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12368
12369
12370#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12371 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
12372 TEST_REQUIRES_ARM_NEON_FMA;
12373 GemmMicrokernelTester()
12374 .mr(6)
12375 .nr(8)
12376 .kr(1)
12377 .sr(1)
12378 .m(6)
12379 .n(8)
12380 .k(4)
12381 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12382 }
12383
12384 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cn) {
12385 TEST_REQUIRES_ARM_NEON_FMA;
12386 GemmMicrokernelTester()
12387 .mr(6)
12388 .nr(8)
12389 .kr(1)
12390 .sr(1)
12391 .m(6)
12392 .n(8)
12393 .k(4)
12394 .cn_stride(11)
12395 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12396 }
12397
12398 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
12399 TEST_REQUIRES_ARM_NEON_FMA;
12400 GemmMicrokernelTester()
12401 .mr(6)
12402 .nr(8)
12403 .kr(1)
12404 .sr(1)
12405 .m(6)
12406 .n(8)
12407 .k(4)
12408 .a_stride(7)
12409 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12410 }
12411
12412 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
12413 TEST_REQUIRES_ARM_NEON_FMA;
12414 for (uint32_t m = 1; m <= 6; m++) {
12415 for (uint32_t n = 1; n <= 8; n++) {
12416 GemmMicrokernelTester()
12417 .mr(6)
12418 .nr(8)
12419 .kr(1)
12420 .sr(1)
12421 .m(m)
12422 .n(n)
12423 .k(4)
12424 .iterations(1)
12425 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12426 }
12427 }
12428 }
12429
12430 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
12431 TEST_REQUIRES_ARM_NEON_FMA;
12432 for (uint32_t m = 1; m <= 6; m++) {
12433 GemmMicrokernelTester()
12434 .mr(6)
12435 .nr(8)
12436 .kr(1)
12437 .sr(1)
12438 .m(m)
12439 .n(8)
12440 .k(4)
12441 .iterations(1)
12442 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12443 }
12444 }
12445
12446 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
12447 TEST_REQUIRES_ARM_NEON_FMA;
12448 for (uint32_t n = 1; n <= 8; n++) {
12449 GemmMicrokernelTester()
12450 .mr(6)
12451 .nr(8)
12452 .kr(1)
12453 .sr(1)
12454 .m(6)
12455 .n(n)
12456 .k(4)
12457 .iterations(1)
12458 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12459 }
12460 }
12461
12462 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4) {
12463 TEST_REQUIRES_ARM_NEON_FMA;
12464 for (size_t k = 1; k < 4; k++) {
12465 GemmMicrokernelTester()
12466 .mr(6)
12467 .nr(8)
12468 .kr(1)
12469 .sr(1)
12470 .m(6)
12471 .n(8)
12472 .k(k)
12473 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12474 }
12475 }
12476
12477 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
12478 TEST_REQUIRES_ARM_NEON_FMA;
12479 for (size_t k = 1; k < 4; k++) {
12480 GemmMicrokernelTester()
12481 .mr(6)
12482 .nr(8)
12483 .kr(1)
12484 .sr(1)
12485 .m(6)
12486 .n(8)
12487 .k(k)
12488 .a_stride(7)
12489 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12490 }
12491 }
12492
12493 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
12494 TEST_REQUIRES_ARM_NEON_FMA;
12495 for (size_t k = 1; k < 4; k++) {
12496 for (uint32_t m = 1; m <= 6; m++) {
12497 for (uint32_t n = 1; n <= 8; n++) {
12498 GemmMicrokernelTester()
12499 .mr(6)
12500 .nr(8)
12501 .kr(1)
12502 .sr(1)
12503 .m(m)
12504 .n(n)
12505 .k(k)
12506 .iterations(1)
12507 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12508 }
12509 }
12510 }
12511 }
12512
12513 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4) {
12514 TEST_REQUIRES_ARM_NEON_FMA;
12515 for (size_t k = 5; k < 8; k++) {
12516 GemmMicrokernelTester()
12517 .mr(6)
12518 .nr(8)
12519 .kr(1)
12520 .sr(1)
12521 .m(6)
12522 .n(8)
12523 .k(k)
12524 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12525 }
12526 }
12527
12528 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
12529 TEST_REQUIRES_ARM_NEON_FMA;
12530 for (size_t k = 5; k < 8; k++) {
12531 GemmMicrokernelTester()
12532 .mr(6)
12533 .nr(8)
12534 .kr(1)
12535 .sr(1)
12536 .m(6)
12537 .n(8)
12538 .k(k)
12539 .a_stride(11)
12540 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12541 }
12542 }
12543
12544 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
12545 TEST_REQUIRES_ARM_NEON_FMA;
12546 for (size_t k = 5; k < 8; k++) {
12547 for (uint32_t m = 1; m <= 6; m++) {
12548 for (uint32_t n = 1; n <= 8; n++) {
12549 GemmMicrokernelTester()
12550 .mr(6)
12551 .nr(8)
12552 .kr(1)
12553 .sr(1)
12554 .m(m)
12555 .n(n)
12556 .k(k)
12557 .iterations(1)
12558 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12559 }
12560 }
12561 }
12562 }
12563
12564 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4) {
12565 TEST_REQUIRES_ARM_NEON_FMA;
12566 for (size_t k = 8; k <= 40; k += 4) {
12567 GemmMicrokernelTester()
12568 .mr(6)
12569 .nr(8)
12570 .kr(1)
12571 .sr(1)
12572 .m(6)
12573 .n(8)
12574 .k(k)
12575 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12576 }
12577 }
12578
12579 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
12580 TEST_REQUIRES_ARM_NEON_FMA;
12581 for (size_t k = 8; k <= 40; k += 4) {
12582 GemmMicrokernelTester()
12583 .mr(6)
12584 .nr(8)
12585 .kr(1)
12586 .sr(1)
12587 .m(6)
12588 .n(8)
12589 .k(k)
12590 .a_stride(43)
12591 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12592 }
12593 }
12594
12595 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
12596 TEST_REQUIRES_ARM_NEON_FMA;
12597 for (size_t k = 8; k <= 40; k += 4) {
12598 for (uint32_t m = 1; m <= 6; m++) {
12599 for (uint32_t n = 1; n <= 8; n++) {
12600 GemmMicrokernelTester()
12601 .mr(6)
12602 .nr(8)
12603 .kr(1)
12604 .sr(1)
12605 .m(m)
12606 .n(n)
12607 .k(k)
12608 .iterations(1)
12609 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12610 }
12611 }
12612 }
12613 }
12614
12615 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8) {
12616 TEST_REQUIRES_ARM_NEON_FMA;
12617 for (uint32_t n = 9; n < 16; n++) {
12618 for (size_t k = 1; k <= 20; k += 5) {
12619 GemmMicrokernelTester()
12620 .mr(6)
12621 .nr(8)
12622 .kr(1)
12623 .sr(1)
12624 .m(6)
12625 .n(8)
12626 .k(k)
12627 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12628 }
12629 }
12630 }
12631
12632 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
12633 TEST_REQUIRES_ARM_NEON_FMA;
12634 for (uint32_t n = 9; n < 16; n++) {
12635 for (size_t k = 1; k <= 20; k += 5) {
12636 GemmMicrokernelTester()
12637 .mr(6)
12638 .nr(8)
12639 .kr(1)
12640 .sr(1)
12641 .m(6)
12642 .n(8)
12643 .k(k)
12644 .cn_stride(11)
12645 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12646 }
12647 }
12648 }
12649
12650 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
12651 TEST_REQUIRES_ARM_NEON_FMA;
12652 for (uint32_t n = 9; n < 16; n++) {
12653 for (size_t k = 1; k <= 20; k += 5) {
12654 GemmMicrokernelTester()
12655 .mr(6)
12656 .nr(8)
12657 .kr(1)
12658 .sr(1)
12659 .m(6)
12660 .n(n)
12661 .k(k)
12662 .a_stride(23)
12663 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12664 }
12665 }
12666 }
12667
12668 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
12669 TEST_REQUIRES_ARM_NEON_FMA;
12670 for (uint32_t n = 9; n < 16; n++) {
12671 for (size_t k = 1; k <= 20; k += 5) {
12672 for (uint32_t m = 1; m <= 6; m++) {
12673 GemmMicrokernelTester()
12674 .mr(6)
12675 .nr(8)
12676 .kr(1)
12677 .sr(1)
12678 .m(m)
12679 .n(n)
12680 .k(k)
12681 .iterations(1)
12682 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12683 }
12684 }
12685 }
12686 }
12687
12688 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8) {
12689 TEST_REQUIRES_ARM_NEON_FMA;
12690 for (uint32_t n = 16; n <= 24; n += 8) {
12691 for (size_t k = 1; k <= 20; k += 5) {
12692 GemmMicrokernelTester()
12693 .mr(6)
12694 .nr(8)
12695 .kr(1)
12696 .sr(1)
12697 .m(6)
12698 .n(8)
12699 .k(k)
12700 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12701 }
12702 }
12703 }
12704
12705 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
12706 TEST_REQUIRES_ARM_NEON_FMA;
12707 for (uint32_t n = 16; n <= 24; n += 8) {
12708 for (size_t k = 1; k <= 20; k += 5) {
12709 GemmMicrokernelTester()
12710 .mr(6)
12711 .nr(8)
12712 .kr(1)
12713 .sr(1)
12714 .m(6)
12715 .n(n)
12716 .k(k)
12717 .cn_stride(11)
12718 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12719 }
12720 }
12721 }
12722
12723 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
12724 TEST_REQUIRES_ARM_NEON_FMA;
12725 for (uint32_t n = 16; n <= 24; n += 8) {
12726 for (size_t k = 1; k <= 20; k += 5) {
12727 GemmMicrokernelTester()
12728 .mr(6)
12729 .nr(8)
12730 .kr(1)
12731 .sr(1)
12732 .m(6)
12733 .n(n)
12734 .k(k)
12735 .a_stride(23)
12736 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12737 }
12738 }
12739 }
12740
12741 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
12742 TEST_REQUIRES_ARM_NEON_FMA;
12743 for (uint32_t n = 16; n <= 24; n += 8) {
12744 for (size_t k = 1; k <= 20; k += 5) {
12745 for (uint32_t m = 1; m <= 6; m++) {
12746 GemmMicrokernelTester()
12747 .mr(6)
12748 .nr(8)
12749 .kr(1)
12750 .sr(1)
12751 .m(m)
12752 .n(n)
12753 .k(k)
12754 .iterations(1)
12755 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12756 }
12757 }
12758 }
12759 }
12760
12761 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
12762 TEST_REQUIRES_ARM_NEON_FMA;
12763 for (size_t k = 1; k <= 20; k += 5) {
12764 for (uint32_t m = 1; m <= 6; m++) {
12765 for (uint32_t n = 1; n <= 8; n++) {
12766 GemmMicrokernelTester()
12767 .mr(6)
12768 .nr(8)
12769 .kr(1)
12770 .sr(1)
12771 .m(m)
12772 .n(n)
12773 .k(k)
12774 .cm_stride(11)
12775 .iterations(1)
12776 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12777 }
12778 }
12779 }
12780 }
12781
12782 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, qmin) {
12783 TEST_REQUIRES_ARM_NEON_FMA;
12784 GemmMicrokernelTester()
12785 .mr(6)
12786 .nr(8)
12787 .kr(1)
12788 .sr(1)
12789 .m(6)
12790 .n(8)
12791 .k(4)
12792 .qmin(128)
12793 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12794 }
12795
12796 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, qmax) {
12797 TEST_REQUIRES_ARM_NEON_FMA;
12798 GemmMicrokernelTester()
12799 .mr(6)
12800 .nr(8)
12801 .kr(1)
12802 .sr(1)
12803 .m(6)
12804 .n(8)
12805 .k(4)
12806 .qmax(128)
12807 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12808 }
12809
12810 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cm) {
12811 TEST_REQUIRES_ARM_NEON_FMA;
12812 GemmMicrokernelTester()
12813 .mr(6)
12814 .nr(8)
12815 .kr(1)
12816 .sr(1)
12817 .m(6)
12818 .n(8)
12819 .k(4)
12820 .cm_stride(11)
12821 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
12822 }
12823#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
12824
12825
12826#if XNN_ARCH_ARM || XNN_ARCH_ARM64
12827 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2) {
12828 TEST_REQUIRES_ARM_NEON;
12829 GemmMicrokernelTester()
12830 .mr(1)
12831 .nr(8)
12832 .kr(1)
12833 .sr(1)
12834 .m(1)
12835 .n(8)
12836 .k(2)
12837 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12838 }
12839
12840 TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cn) {
12841 TEST_REQUIRES_ARM_NEON;
12842 GemmMicrokernelTester()
12843 .mr(1)
12844 .nr(8)
12845 .kr(1)
12846 .sr(1)
12847 .m(1)
12848 .n(8)
12849 .k(2)
12850 .cn_stride(11)
12851 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12852 }
12853
12854 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
12855 TEST_REQUIRES_ARM_NEON;
12856 GemmMicrokernelTester()
12857 .mr(1)
12858 .nr(8)
12859 .kr(1)
12860 .sr(1)
12861 .m(1)
12862 .n(8)
12863 .k(2)
12864 .a_stride(5)
12865 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12866 }
12867
12868 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
12869 TEST_REQUIRES_ARM_NEON;
12870 for (uint32_t m = 1; m <= 1; m++) {
12871 for (uint32_t n = 1; n <= 8; n++) {
12872 GemmMicrokernelTester()
12873 .mr(1)
12874 .nr(8)
12875 .kr(1)
12876 .sr(1)
12877 .m(m)
12878 .n(n)
12879 .k(2)
12880 .iterations(1)
12881 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12882 }
12883 }
12884 }
12885
12886 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
12887 TEST_REQUIRES_ARM_NEON;
12888 for (uint32_t m = 1; m <= 1; m++) {
12889 GemmMicrokernelTester()
12890 .mr(1)
12891 .nr(8)
12892 .kr(1)
12893 .sr(1)
12894 .m(m)
12895 .n(8)
12896 .k(2)
12897 .iterations(1)
12898 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12899 }
12900 }
12901
12902 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
12903 TEST_REQUIRES_ARM_NEON;
12904 for (uint32_t n = 1; n <= 8; n++) {
12905 GemmMicrokernelTester()
12906 .mr(1)
12907 .nr(8)
12908 .kr(1)
12909 .sr(1)
12910 .m(1)
12911 .n(n)
12912 .k(2)
12913 .iterations(1)
12914 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12915 }
12916 }
12917
12918 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2) {
12919 TEST_REQUIRES_ARM_NEON;
12920 for (size_t k = 1; k < 2; k++) {
12921 GemmMicrokernelTester()
12922 .mr(1)
12923 .nr(8)
12924 .kr(1)
12925 .sr(1)
12926 .m(1)
12927 .n(8)
12928 .k(k)
12929 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12930 }
12931 }
12932
12933 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
12934 TEST_REQUIRES_ARM_NEON;
12935 for (size_t k = 1; k < 2; k++) {
12936 GemmMicrokernelTester()
12937 .mr(1)
12938 .nr(8)
12939 .kr(1)
12940 .sr(1)
12941 .m(1)
12942 .n(8)
12943 .k(k)
12944 .a_stride(5)
12945 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12946 }
12947 }
12948
12949 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
12950 TEST_REQUIRES_ARM_NEON;
12951 for (size_t k = 1; k < 2; k++) {
12952 for (uint32_t m = 1; m <= 1; m++) {
12953 for (uint32_t n = 1; n <= 8; n++) {
12954 GemmMicrokernelTester()
12955 .mr(1)
12956 .nr(8)
12957 .kr(1)
12958 .sr(1)
12959 .m(m)
12960 .n(n)
12961 .k(k)
12962 .iterations(1)
12963 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12964 }
12965 }
12966 }
12967 }
12968
12969 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2) {
12970 TEST_REQUIRES_ARM_NEON;
12971 for (size_t k = 3; k < 4; k++) {
12972 GemmMicrokernelTester()
12973 .mr(1)
12974 .nr(8)
12975 .kr(1)
12976 .sr(1)
12977 .m(1)
12978 .n(8)
12979 .k(k)
12980 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12981 }
12982 }
12983
12984 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
12985 TEST_REQUIRES_ARM_NEON;
12986 for (size_t k = 3; k < 4; k++) {
12987 GemmMicrokernelTester()
12988 .mr(1)
12989 .nr(8)
12990 .kr(1)
12991 .sr(1)
12992 .m(1)
12993 .n(8)
12994 .k(k)
12995 .a_stride(7)
12996 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
12997 }
12998 }
12999
13000 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
13001 TEST_REQUIRES_ARM_NEON;
13002 for (size_t k = 3; k < 4; k++) {
13003 for (uint32_t m = 1; m <= 1; m++) {
13004 for (uint32_t n = 1; n <= 8; n++) {
13005 GemmMicrokernelTester()
13006 .mr(1)
13007 .nr(8)
13008 .kr(1)
13009 .sr(1)
13010 .m(m)
13011 .n(n)
13012 .k(k)
13013 .iterations(1)
13014 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13015 }
13016 }
13017 }
13018 }
13019
13020 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2) {
13021 TEST_REQUIRES_ARM_NEON;
13022 for (size_t k = 4; k <= 20; k += 2) {
13023 GemmMicrokernelTester()
13024 .mr(1)
13025 .nr(8)
13026 .kr(1)
13027 .sr(1)
13028 .m(1)
13029 .n(8)
13030 .k(k)
13031 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13032 }
13033 }
13034
13035 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
13036 TEST_REQUIRES_ARM_NEON;
13037 for (size_t k = 4; k <= 20; k += 2) {
13038 GemmMicrokernelTester()
13039 .mr(1)
13040 .nr(8)
13041 .kr(1)
13042 .sr(1)
13043 .m(1)
13044 .n(8)
13045 .k(k)
13046 .a_stride(23)
13047 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13048 }
13049 }
13050
13051 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2_subtile) {
13052 TEST_REQUIRES_ARM_NEON;
13053 for (size_t k = 4; k <= 20; k += 2) {
13054 for (uint32_t m = 1; m <= 1; m++) {
13055 for (uint32_t n = 1; n <= 8; n++) {
13056 GemmMicrokernelTester()
13057 .mr(1)
13058 .nr(8)
13059 .kr(1)
13060 .sr(1)
13061 .m(m)
13062 .n(n)
13063 .k(k)
13064 .iterations(1)
13065 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13066 }
13067 }
13068 }
13069 }
13070
13071 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8) {
13072 TEST_REQUIRES_ARM_NEON;
13073 for (uint32_t n = 9; n < 16; n++) {
13074 for (size_t k = 1; k <= 10; k += 3) {
13075 GemmMicrokernelTester()
13076 .mr(1)
13077 .nr(8)
13078 .kr(1)
13079 .sr(1)
13080 .m(1)
13081 .n(8)
13082 .k(k)
13083 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13084 }
13085 }
13086 }
13087
13088 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
13089 TEST_REQUIRES_ARM_NEON;
13090 for (uint32_t n = 9; n < 16; n++) {
13091 for (size_t k = 1; k <= 10; k += 3) {
13092 GemmMicrokernelTester()
13093 .mr(1)
13094 .nr(8)
13095 .kr(1)
13096 .sr(1)
13097 .m(1)
13098 .n(8)
13099 .k(k)
13100 .cn_stride(11)
13101 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13102 }
13103 }
13104 }
13105
13106 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
13107 TEST_REQUIRES_ARM_NEON;
13108 for (uint32_t n = 9; n < 16; n++) {
13109 for (size_t k = 1; k <= 10; k += 3) {
13110 GemmMicrokernelTester()
13111 .mr(1)
13112 .nr(8)
13113 .kr(1)
13114 .sr(1)
13115 .m(1)
13116 .n(n)
13117 .k(k)
13118 .a_stride(13)
13119 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13120 }
13121 }
13122 }
13123
13124 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
13125 TEST_REQUIRES_ARM_NEON;
13126 for (uint32_t n = 9; n < 16; n++) {
13127 for (size_t k = 1; k <= 10; k += 3) {
13128 for (uint32_t m = 1; m <= 1; m++) {
13129 GemmMicrokernelTester()
13130 .mr(1)
13131 .nr(8)
13132 .kr(1)
13133 .sr(1)
13134 .m(m)
13135 .n(n)
13136 .k(k)
13137 .iterations(1)
13138 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13139 }
13140 }
13141 }
13142 }
13143
13144 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8) {
13145 TEST_REQUIRES_ARM_NEON;
13146 for (uint32_t n = 16; n <= 24; n += 8) {
13147 for (size_t k = 1; k <= 10; k += 3) {
13148 GemmMicrokernelTester()
13149 .mr(1)
13150 .nr(8)
13151 .kr(1)
13152 .sr(1)
13153 .m(1)
13154 .n(8)
13155 .k(k)
13156 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13157 }
13158 }
13159 }
13160
13161 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
13162 TEST_REQUIRES_ARM_NEON;
13163 for (uint32_t n = 16; n <= 24; n += 8) {
13164 for (size_t k = 1; k <= 10; k += 3) {
13165 GemmMicrokernelTester()
13166 .mr(1)
13167 .nr(8)
13168 .kr(1)
13169 .sr(1)
13170 .m(1)
13171 .n(n)
13172 .k(k)
13173 .cn_stride(11)
13174 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13175 }
13176 }
13177 }
13178
13179 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
13180 TEST_REQUIRES_ARM_NEON;
13181 for (uint32_t n = 16; n <= 24; n += 8) {
13182 for (size_t k = 1; k <= 10; k += 3) {
13183 GemmMicrokernelTester()
13184 .mr(1)
13185 .nr(8)
13186 .kr(1)
13187 .sr(1)
13188 .m(1)
13189 .n(n)
13190 .k(k)
13191 .a_stride(13)
13192 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13193 }
13194 }
13195 }
13196
13197 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_subtile) {
13198 TEST_REQUIRES_ARM_NEON;
13199 for (uint32_t n = 16; n <= 24; n += 8) {
13200 for (size_t k = 1; k <= 10; k += 3) {
13201 for (uint32_t m = 1; m <= 1; m++) {
13202 GemmMicrokernelTester()
13203 .mr(1)
13204 .nr(8)
13205 .kr(1)
13206 .sr(1)
13207 .m(m)
13208 .n(n)
13209 .k(k)
13210 .iterations(1)
13211 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13212 }
13213 }
13214 }
13215 }
13216
13217 TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cm_subtile) {
13218 TEST_REQUIRES_ARM_NEON;
13219 for (size_t k = 1; k <= 10; k += 3) {
13220 for (uint32_t m = 1; m <= 1; m++) {
13221 for (uint32_t n = 1; n <= 8; n++) {
13222 GemmMicrokernelTester()
13223 .mr(1)
13224 .nr(8)
13225 .kr(1)
13226 .sr(1)
13227 .m(m)
13228 .n(n)
13229 .k(k)
13230 .cm_stride(11)
13231 .iterations(1)
13232 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13233 }
13234 }
13235 }
13236 }
13237
13238 TEST(F32_GEMM_1X8__NEON_LANE_LD64, qmin) {
13239 TEST_REQUIRES_ARM_NEON;
13240 GemmMicrokernelTester()
13241 .mr(1)
13242 .nr(8)
13243 .kr(1)
13244 .sr(1)
13245 .m(1)
13246 .n(8)
13247 .k(2)
13248 .qmin(128)
13249 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13250 }
13251
13252 TEST(F32_GEMM_1X8__NEON_LANE_LD64, qmax) {
13253 TEST_REQUIRES_ARM_NEON;
13254 GemmMicrokernelTester()
13255 .mr(1)
13256 .nr(8)
13257 .kr(1)
13258 .sr(1)
13259 .m(1)
13260 .n(8)
13261 .k(2)
13262 .qmax(128)
13263 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13264 }
13265
13266 TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cm) {
13267 TEST_REQUIRES_ARM_NEON;
13268 GemmMicrokernelTester()
13269 .mr(1)
13270 .nr(8)
13271 .kr(1)
13272 .sr(1)
13273 .m(1)
13274 .n(8)
13275 .k(2)
13276 .cm_stride(11)
13277 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
13278 }
13279#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13280
13281
13282#if XNN_ARCH_ARM || XNN_ARCH_ARM64
13283 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2) {
13284 TEST_REQUIRES_ARM_NEON;
13285 GemmMicrokernelTester()
13286 .mr(4)
13287 .nr(2)
13288 .kr(1)
13289 .sr(1)
13290 .m(4)
13291 .n(2)
13292 .k(2)
13293 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13294 }
13295
13296 TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cn) {
13297 TEST_REQUIRES_ARM_NEON;
13298 GemmMicrokernelTester()
13299 .mr(4)
13300 .nr(2)
13301 .kr(1)
13302 .sr(1)
13303 .m(4)
13304 .n(2)
13305 .k(2)
13306 .cn_stride(5)
13307 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13308 }
13309
13310 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_strided_a) {
13311 TEST_REQUIRES_ARM_NEON;
13312 GemmMicrokernelTester()
13313 .mr(4)
13314 .nr(2)
13315 .kr(1)
13316 .sr(1)
13317 .m(4)
13318 .n(2)
13319 .k(2)
13320 .a_stride(5)
13321 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13322 }
13323
13324 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile) {
13325 TEST_REQUIRES_ARM_NEON;
13326 for (uint32_t m = 1; m <= 4; m++) {
13327 for (uint32_t n = 1; n <= 2; n++) {
13328 GemmMicrokernelTester()
13329 .mr(4)
13330 .nr(2)
13331 .kr(1)
13332 .sr(1)
13333 .m(m)
13334 .n(n)
13335 .k(2)
13336 .iterations(1)
13337 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13338 }
13339 }
13340 }
13341
13342 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
13343 TEST_REQUIRES_ARM_NEON;
13344 for (uint32_t m = 1; m <= 4; m++) {
13345 GemmMicrokernelTester()
13346 .mr(4)
13347 .nr(2)
13348 .kr(1)
13349 .sr(1)
13350 .m(m)
13351 .n(2)
13352 .k(2)
13353 .iterations(1)
13354 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13355 }
13356 }
13357
13358 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
13359 TEST_REQUIRES_ARM_NEON;
13360 for (uint32_t n = 1; n <= 2; n++) {
13361 GemmMicrokernelTester()
13362 .mr(4)
13363 .nr(2)
13364 .kr(1)
13365 .sr(1)
13366 .m(4)
13367 .n(n)
13368 .k(2)
13369 .iterations(1)
13370 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13371 }
13372 }
13373
13374 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2) {
13375 TEST_REQUIRES_ARM_NEON;
13376 for (size_t k = 1; k < 2; k++) {
13377 GemmMicrokernelTester()
13378 .mr(4)
13379 .nr(2)
13380 .kr(1)
13381 .sr(1)
13382 .m(4)
13383 .n(2)
13384 .k(k)
13385 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13386 }
13387 }
13388
13389 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2_strided_a) {
13390 TEST_REQUIRES_ARM_NEON;
13391 for (size_t k = 1; k < 2; k++) {
13392 GemmMicrokernelTester()
13393 .mr(4)
13394 .nr(2)
13395 .kr(1)
13396 .sr(1)
13397 .m(4)
13398 .n(2)
13399 .k(k)
13400 .a_stride(5)
13401 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13402 }
13403 }
13404
13405 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2_subtile) {
13406 TEST_REQUIRES_ARM_NEON;
13407 for (size_t k = 1; k < 2; k++) {
13408 for (uint32_t m = 1; m <= 4; m++) {
13409 for (uint32_t n = 1; n <= 2; n++) {
13410 GemmMicrokernelTester()
13411 .mr(4)
13412 .nr(2)
13413 .kr(1)
13414 .sr(1)
13415 .m(m)
13416 .n(n)
13417 .k(k)
13418 .iterations(1)
13419 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13420 }
13421 }
13422 }
13423 }
13424
13425 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2) {
13426 TEST_REQUIRES_ARM_NEON;
13427 for (size_t k = 3; k < 4; k++) {
13428 GemmMicrokernelTester()
13429 .mr(4)
13430 .nr(2)
13431 .kr(1)
13432 .sr(1)
13433 .m(4)
13434 .n(2)
13435 .k(k)
13436 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13437 }
13438 }
13439
13440 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2_strided_a) {
13441 TEST_REQUIRES_ARM_NEON;
13442 for (size_t k = 3; k < 4; k++) {
13443 GemmMicrokernelTester()
13444 .mr(4)
13445 .nr(2)
13446 .kr(1)
13447 .sr(1)
13448 .m(4)
13449 .n(2)
13450 .k(k)
13451 .a_stride(7)
13452 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13453 }
13454 }
13455
13456 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2_subtile) {
13457 TEST_REQUIRES_ARM_NEON;
13458 for (size_t k = 3; k < 4; k++) {
13459 for (uint32_t m = 1; m <= 4; m++) {
13460 for (uint32_t n = 1; n <= 2; n++) {
13461 GemmMicrokernelTester()
13462 .mr(4)
13463 .nr(2)
13464 .kr(1)
13465 .sr(1)
13466 .m(m)
13467 .n(n)
13468 .k(k)
13469 .iterations(1)
13470 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13471 }
13472 }
13473 }
13474 }
13475
13476 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2) {
13477 TEST_REQUIRES_ARM_NEON;
13478 for (size_t k = 4; k <= 20; k += 2) {
13479 GemmMicrokernelTester()
13480 .mr(4)
13481 .nr(2)
13482 .kr(1)
13483 .sr(1)
13484 .m(4)
13485 .n(2)
13486 .k(k)
13487 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13488 }
13489 }
13490
13491 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2_strided_a) {
13492 TEST_REQUIRES_ARM_NEON;
13493 for (size_t k = 4; k <= 20; k += 2) {
13494 GemmMicrokernelTester()
13495 .mr(4)
13496 .nr(2)
13497 .kr(1)
13498 .sr(1)
13499 .m(4)
13500 .n(2)
13501 .k(k)
13502 .a_stride(23)
13503 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13504 }
13505 }
13506
13507 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2_subtile) {
13508 TEST_REQUIRES_ARM_NEON;
13509 for (size_t k = 4; k <= 20; k += 2) {
13510 for (uint32_t m = 1; m <= 4; m++) {
13511 for (uint32_t n = 1; n <= 2; n++) {
13512 GemmMicrokernelTester()
13513 .mr(4)
13514 .nr(2)
13515 .kr(1)
13516 .sr(1)
13517 .m(m)
13518 .n(n)
13519 .k(k)
13520 .iterations(1)
13521 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13522 }
13523 }
13524 }
13525 }
13526
13527 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2) {
13528 TEST_REQUIRES_ARM_NEON;
13529 for (uint32_t n = 3; n < 4; n++) {
13530 for (size_t k = 1; k <= 10; k += 3) {
13531 GemmMicrokernelTester()
13532 .mr(4)
13533 .nr(2)
13534 .kr(1)
13535 .sr(1)
13536 .m(4)
13537 .n(2)
13538 .k(k)
13539 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13540 }
13541 }
13542 }
13543
13544 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
13545 TEST_REQUIRES_ARM_NEON;
13546 for (uint32_t n = 3; n < 4; n++) {
13547 for (size_t k = 1; k <= 10; k += 3) {
13548 GemmMicrokernelTester()
13549 .mr(4)
13550 .nr(2)
13551 .kr(1)
13552 .sr(1)
13553 .m(4)
13554 .n(2)
13555 .k(k)
13556 .cn_stride(5)
13557 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13558 }
13559 }
13560 }
13561
13562 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_a) {
13563 TEST_REQUIRES_ARM_NEON;
13564 for (uint32_t n = 3; n < 4; n++) {
13565 for (size_t k = 1; k <= 10; k += 3) {
13566 GemmMicrokernelTester()
13567 .mr(4)
13568 .nr(2)
13569 .kr(1)
13570 .sr(1)
13571 .m(4)
13572 .n(n)
13573 .k(k)
13574 .a_stride(13)
13575 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13576 }
13577 }
13578 }
13579
13580 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_subtile) {
13581 TEST_REQUIRES_ARM_NEON;
13582 for (uint32_t n = 3; n < 4; n++) {
13583 for (size_t k = 1; k <= 10; k += 3) {
13584 for (uint32_t m = 1; m <= 4; m++) {
13585 GemmMicrokernelTester()
13586 .mr(4)
13587 .nr(2)
13588 .kr(1)
13589 .sr(1)
13590 .m(m)
13591 .n(n)
13592 .k(k)
13593 .iterations(1)
13594 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13595 }
13596 }
13597 }
13598 }
13599
13600 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2) {
13601 TEST_REQUIRES_ARM_NEON;
13602 for (uint32_t n = 4; n <= 6; n += 2) {
13603 for (size_t k = 1; k <= 10; k += 3) {
13604 GemmMicrokernelTester()
13605 .mr(4)
13606 .nr(2)
13607 .kr(1)
13608 .sr(1)
13609 .m(4)
13610 .n(2)
13611 .k(k)
13612 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13613 }
13614 }
13615 }
13616
13617 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_strided_cn) {
13618 TEST_REQUIRES_ARM_NEON;
13619 for (uint32_t n = 4; n <= 6; n += 2) {
13620 for (size_t k = 1; k <= 10; k += 3) {
13621 GemmMicrokernelTester()
13622 .mr(4)
13623 .nr(2)
13624 .kr(1)
13625 .sr(1)
13626 .m(4)
13627 .n(n)
13628 .k(k)
13629 .cn_stride(5)
13630 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13631 }
13632 }
13633 }
13634
13635 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_strided_a) {
13636 TEST_REQUIRES_ARM_NEON;
13637 for (uint32_t n = 4; n <= 6; n += 2) {
13638 for (size_t k = 1; k <= 10; k += 3) {
13639 GemmMicrokernelTester()
13640 .mr(4)
13641 .nr(2)
13642 .kr(1)
13643 .sr(1)
13644 .m(4)
13645 .n(n)
13646 .k(k)
13647 .a_stride(13)
13648 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13649 }
13650 }
13651 }
13652
13653 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_subtile) {
13654 TEST_REQUIRES_ARM_NEON;
13655 for (uint32_t n = 4; n <= 6; n += 2) {
13656 for (size_t k = 1; k <= 10; k += 3) {
13657 for (uint32_t m = 1; m <= 4; m++) {
13658 GemmMicrokernelTester()
13659 .mr(4)
13660 .nr(2)
13661 .kr(1)
13662 .sr(1)
13663 .m(m)
13664 .n(n)
13665 .k(k)
13666 .iterations(1)
13667 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13668 }
13669 }
13670 }
13671 }
13672
13673 TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cm_subtile) {
13674 TEST_REQUIRES_ARM_NEON;
13675 for (size_t k = 1; k <= 10; k += 3) {
13676 for (uint32_t m = 1; m <= 4; m++) {
13677 for (uint32_t n = 1; n <= 2; n++) {
13678 GemmMicrokernelTester()
13679 .mr(4)
13680 .nr(2)
13681 .kr(1)
13682 .sr(1)
13683 .m(m)
13684 .n(n)
13685 .k(k)
13686 .cm_stride(5)
13687 .iterations(1)
13688 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13689 }
13690 }
13691 }
13692 }
13693
13694 TEST(F32_GEMM_4X2__NEON_LANE_LD64, qmin) {
13695 TEST_REQUIRES_ARM_NEON;
13696 GemmMicrokernelTester()
13697 .mr(4)
13698 .nr(2)
13699 .kr(1)
13700 .sr(1)
13701 .m(4)
13702 .n(2)
13703 .k(2)
13704 .qmin(128)
13705 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13706 }
13707
13708 TEST(F32_GEMM_4X2__NEON_LANE_LD64, qmax) {
13709 TEST_REQUIRES_ARM_NEON;
13710 GemmMicrokernelTester()
13711 .mr(4)
13712 .nr(2)
13713 .kr(1)
13714 .sr(1)
13715 .m(4)
13716 .n(2)
13717 .k(2)
13718 .qmax(128)
13719 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13720 }
13721
13722 TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cm) {
13723 TEST_REQUIRES_ARM_NEON;
13724 GemmMicrokernelTester()
13725 .mr(4)
13726 .nr(2)
13727 .kr(1)
13728 .sr(1)
13729 .m(4)
13730 .n(2)
13731 .k(2)
13732 .cm_stride(5)
13733 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
13734 }
13735#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13736
13737
13738#if XNN_ARCH_ARM || XNN_ARCH_ARM64
13739 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2) {
13740 TEST_REQUIRES_ARM_NEON;
13741 GemmMicrokernelTester()
13742 .mr(4)
13743 .nr(8)
13744 .kr(1)
13745 .sr(1)
13746 .m(4)
13747 .n(8)
13748 .k(2)
13749 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13750 }
13751
13752 TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cn) {
13753 TEST_REQUIRES_ARM_NEON;
13754 GemmMicrokernelTester()
13755 .mr(4)
13756 .nr(8)
13757 .kr(1)
13758 .sr(1)
13759 .m(4)
13760 .n(8)
13761 .k(2)
13762 .cn_stride(11)
13763 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13764 }
13765
13766 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
13767 TEST_REQUIRES_ARM_NEON;
13768 GemmMicrokernelTester()
13769 .mr(4)
13770 .nr(8)
13771 .kr(1)
13772 .sr(1)
13773 .m(4)
13774 .n(8)
13775 .k(2)
13776 .a_stride(5)
13777 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13778 }
13779
13780 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
13781 TEST_REQUIRES_ARM_NEON;
13782 for (uint32_t m = 1; m <= 4; m++) {
13783 for (uint32_t n = 1; n <= 8; n++) {
13784 GemmMicrokernelTester()
13785 .mr(4)
13786 .nr(8)
13787 .kr(1)
13788 .sr(1)
13789 .m(m)
13790 .n(n)
13791 .k(2)
13792 .iterations(1)
13793 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13794 }
13795 }
13796 }
13797
13798 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
13799 TEST_REQUIRES_ARM_NEON;
13800 for (uint32_t m = 1; m <= 4; m++) {
13801 GemmMicrokernelTester()
13802 .mr(4)
13803 .nr(8)
13804 .kr(1)
13805 .sr(1)
13806 .m(m)
13807 .n(8)
13808 .k(2)
13809 .iterations(1)
13810 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13811 }
13812 }
13813
13814 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
13815 TEST_REQUIRES_ARM_NEON;
13816 for (uint32_t n = 1; n <= 8; n++) {
13817 GemmMicrokernelTester()
13818 .mr(4)
13819 .nr(8)
13820 .kr(1)
13821 .sr(1)
13822 .m(4)
13823 .n(n)
13824 .k(2)
13825 .iterations(1)
13826 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13827 }
13828 }
13829
13830 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2) {
13831 TEST_REQUIRES_ARM_NEON;
13832 for (size_t k = 1; k < 2; k++) {
13833 GemmMicrokernelTester()
13834 .mr(4)
13835 .nr(8)
13836 .kr(1)
13837 .sr(1)
13838 .m(4)
13839 .n(8)
13840 .k(k)
13841 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13842 }
13843 }
13844
13845 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
13846 TEST_REQUIRES_ARM_NEON;
13847 for (size_t k = 1; k < 2; k++) {
13848 GemmMicrokernelTester()
13849 .mr(4)
13850 .nr(8)
13851 .kr(1)
13852 .sr(1)
13853 .m(4)
13854 .n(8)
13855 .k(k)
13856 .a_stride(5)
13857 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13858 }
13859 }
13860
13861 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
13862 TEST_REQUIRES_ARM_NEON;
13863 for (size_t k = 1; k < 2; k++) {
13864 for (uint32_t m = 1; m <= 4; m++) {
13865 for (uint32_t n = 1; n <= 8; n++) {
13866 GemmMicrokernelTester()
13867 .mr(4)
13868 .nr(8)
13869 .kr(1)
13870 .sr(1)
13871 .m(m)
13872 .n(n)
13873 .k(k)
13874 .iterations(1)
13875 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13876 }
13877 }
13878 }
13879 }
13880
13881 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2) {
13882 TEST_REQUIRES_ARM_NEON;
13883 for (size_t k = 3; k < 4; k++) {
13884 GemmMicrokernelTester()
13885 .mr(4)
13886 .nr(8)
13887 .kr(1)
13888 .sr(1)
13889 .m(4)
13890 .n(8)
13891 .k(k)
13892 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13893 }
13894 }
13895
13896 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
13897 TEST_REQUIRES_ARM_NEON;
13898 for (size_t k = 3; k < 4; k++) {
13899 GemmMicrokernelTester()
13900 .mr(4)
13901 .nr(8)
13902 .kr(1)
13903 .sr(1)
13904 .m(4)
13905 .n(8)
13906 .k(k)
13907 .a_stride(7)
13908 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13909 }
13910 }
13911
13912 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
13913 TEST_REQUIRES_ARM_NEON;
13914 for (size_t k = 3; k < 4; k++) {
13915 for (uint32_t m = 1; m <= 4; m++) {
13916 for (uint32_t n = 1; n <= 8; n++) {
13917 GemmMicrokernelTester()
13918 .mr(4)
13919 .nr(8)
13920 .kr(1)
13921 .sr(1)
13922 .m(m)
13923 .n(n)
13924 .k(k)
13925 .iterations(1)
13926 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13927 }
13928 }
13929 }
13930 }
13931
13932 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2) {
13933 TEST_REQUIRES_ARM_NEON;
13934 for (size_t k = 4; k <= 20; k += 2) {
13935 GemmMicrokernelTester()
13936 .mr(4)
13937 .nr(8)
13938 .kr(1)
13939 .sr(1)
13940 .m(4)
13941 .n(8)
13942 .k(k)
13943 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13944 }
13945 }
13946
13947 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
13948 TEST_REQUIRES_ARM_NEON;
13949 for (size_t k = 4; k <= 20; k += 2) {
13950 GemmMicrokernelTester()
13951 .mr(4)
13952 .nr(8)
13953 .kr(1)
13954 .sr(1)
13955 .m(4)
13956 .n(8)
13957 .k(k)
13958 .a_stride(23)
13959 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13960 }
13961 }
13962
13963 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2_subtile) {
13964 TEST_REQUIRES_ARM_NEON;
13965 for (size_t k = 4; k <= 20; k += 2) {
13966 for (uint32_t m = 1; m <= 4; m++) {
13967 for (uint32_t n = 1; n <= 8; n++) {
13968 GemmMicrokernelTester()
13969 .mr(4)
13970 .nr(8)
13971 .kr(1)
13972 .sr(1)
13973 .m(m)
13974 .n(n)
13975 .k(k)
13976 .iterations(1)
13977 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13978 }
13979 }
13980 }
13981 }
13982
13983 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8) {
13984 TEST_REQUIRES_ARM_NEON;
13985 for (uint32_t n = 9; n < 16; n++) {
13986 for (size_t k = 1; k <= 10; k += 3) {
13987 GemmMicrokernelTester()
13988 .mr(4)
13989 .nr(8)
13990 .kr(1)
13991 .sr(1)
13992 .m(4)
13993 .n(8)
13994 .k(k)
13995 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
13996 }
13997 }
13998 }
13999
14000 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
14001 TEST_REQUIRES_ARM_NEON;
14002 for (uint32_t n = 9; n < 16; n++) {
14003 for (size_t k = 1; k <= 10; k += 3) {
14004 GemmMicrokernelTester()
14005 .mr(4)
14006 .nr(8)
14007 .kr(1)
14008 .sr(1)
14009 .m(4)
14010 .n(8)
14011 .k(k)
14012 .cn_stride(11)
14013 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14014 }
14015 }
14016 }
14017
14018 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
14019 TEST_REQUIRES_ARM_NEON;
14020 for (uint32_t n = 9; n < 16; n++) {
14021 for (size_t k = 1; k <= 10; k += 3) {
14022 GemmMicrokernelTester()
14023 .mr(4)
14024 .nr(8)
14025 .kr(1)
14026 .sr(1)
14027 .m(4)
14028 .n(n)
14029 .k(k)
14030 .a_stride(13)
14031 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14032 }
14033 }
14034 }
14035
14036 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
14037 TEST_REQUIRES_ARM_NEON;
14038 for (uint32_t n = 9; n < 16; n++) {
14039 for (size_t k = 1; k <= 10; k += 3) {
14040 for (uint32_t m = 1; m <= 4; m++) {
14041 GemmMicrokernelTester()
14042 .mr(4)
14043 .nr(8)
14044 .kr(1)
14045 .sr(1)
14046 .m(m)
14047 .n(n)
14048 .k(k)
14049 .iterations(1)
14050 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14051 }
14052 }
14053 }
14054 }
14055
14056 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8) {
14057 TEST_REQUIRES_ARM_NEON;
14058 for (uint32_t n = 16; n <= 24; n += 8) {
14059 for (size_t k = 1; k <= 10; k += 3) {
14060 GemmMicrokernelTester()
14061 .mr(4)
14062 .nr(8)
14063 .kr(1)
14064 .sr(1)
14065 .m(4)
14066 .n(8)
14067 .k(k)
14068 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14069 }
14070 }
14071 }
14072
14073 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
14074 TEST_REQUIRES_ARM_NEON;
14075 for (uint32_t n = 16; n <= 24; n += 8) {
14076 for (size_t k = 1; k <= 10; k += 3) {
14077 GemmMicrokernelTester()
14078 .mr(4)
14079 .nr(8)
14080 .kr(1)
14081 .sr(1)
14082 .m(4)
14083 .n(n)
14084 .k(k)
14085 .cn_stride(11)
14086 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14087 }
14088 }
14089 }
14090
14091 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
14092 TEST_REQUIRES_ARM_NEON;
14093 for (uint32_t n = 16; n <= 24; n += 8) {
14094 for (size_t k = 1; k <= 10; k += 3) {
14095 GemmMicrokernelTester()
14096 .mr(4)
14097 .nr(8)
14098 .kr(1)
14099 .sr(1)
14100 .m(4)
14101 .n(n)
14102 .k(k)
14103 .a_stride(13)
14104 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14105 }
14106 }
14107 }
14108
14109 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_subtile) {
14110 TEST_REQUIRES_ARM_NEON;
14111 for (uint32_t n = 16; n <= 24; n += 8) {
14112 for (size_t k = 1; k <= 10; k += 3) {
14113 for (uint32_t m = 1; m <= 4; m++) {
14114 GemmMicrokernelTester()
14115 .mr(4)
14116 .nr(8)
14117 .kr(1)
14118 .sr(1)
14119 .m(m)
14120 .n(n)
14121 .k(k)
14122 .iterations(1)
14123 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14124 }
14125 }
14126 }
14127 }
14128
14129 TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cm_subtile) {
14130 TEST_REQUIRES_ARM_NEON;
14131 for (size_t k = 1; k <= 10; k += 3) {
14132 for (uint32_t m = 1; m <= 4; m++) {
14133 for (uint32_t n = 1; n <= 8; n++) {
14134 GemmMicrokernelTester()
14135 .mr(4)
14136 .nr(8)
14137 .kr(1)
14138 .sr(1)
14139 .m(m)
14140 .n(n)
14141 .k(k)
14142 .cm_stride(11)
14143 .iterations(1)
14144 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14145 }
14146 }
14147 }
14148 }
14149
14150 TEST(F32_GEMM_4X8__NEON_LANE_LD64, qmin) {
14151 TEST_REQUIRES_ARM_NEON;
14152 GemmMicrokernelTester()
14153 .mr(4)
14154 .nr(8)
14155 .kr(1)
14156 .sr(1)
14157 .m(4)
14158 .n(8)
14159 .k(2)
14160 .qmin(128)
14161 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14162 }
14163
14164 TEST(F32_GEMM_4X8__NEON_LANE_LD64, qmax) {
14165 TEST_REQUIRES_ARM_NEON;
14166 GemmMicrokernelTester()
14167 .mr(4)
14168 .nr(8)
14169 .kr(1)
14170 .sr(1)
14171 .m(4)
14172 .n(8)
14173 .k(2)
14174 .qmax(128)
14175 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14176 }
14177
14178 TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cm) {
14179 TEST_REQUIRES_ARM_NEON;
14180 GemmMicrokernelTester()
14181 .mr(4)
14182 .nr(8)
14183 .kr(1)
14184 .sr(1)
14185 .m(4)
14186 .n(8)
14187 .k(2)
14188 .cm_stride(11)
14189 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
14190 }
14191#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14192
14193
14194#if XNN_ARCH_ARM || XNN_ARCH_ARM64
14195 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4) {
14196 TEST_REQUIRES_ARM_NEON;
14197 GemmMicrokernelTester()
14198 .mr(4)
14199 .nr(8)
14200 .kr(1)
14201 .sr(1)
14202 .m(4)
14203 .n(8)
14204 .k(4)
14205 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14206 }
14207
14208 TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cn) {
14209 TEST_REQUIRES_ARM_NEON;
14210 GemmMicrokernelTester()
14211 .mr(4)
14212 .nr(8)
14213 .kr(1)
14214 .sr(1)
14215 .m(4)
14216 .n(8)
14217 .k(4)
14218 .cn_stride(11)
14219 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14220 }
14221
14222 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
14223 TEST_REQUIRES_ARM_NEON;
14224 GemmMicrokernelTester()
14225 .mr(4)
14226 .nr(8)
14227 .kr(1)
14228 .sr(1)
14229 .m(4)
14230 .n(8)
14231 .k(4)
14232 .a_stride(7)
14233 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14234 }
14235
14236 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
14237 TEST_REQUIRES_ARM_NEON;
14238 for (uint32_t m = 1; m <= 4; m++) {
14239 for (uint32_t n = 1; n <= 8; n++) {
14240 GemmMicrokernelTester()
14241 .mr(4)
14242 .nr(8)
14243 .kr(1)
14244 .sr(1)
14245 .m(m)
14246 .n(n)
14247 .k(4)
14248 .iterations(1)
14249 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14250 }
14251 }
14252 }
14253
14254 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
14255 TEST_REQUIRES_ARM_NEON;
14256 for (uint32_t m = 1; m <= 4; m++) {
14257 GemmMicrokernelTester()
14258 .mr(4)
14259 .nr(8)
14260 .kr(1)
14261 .sr(1)
14262 .m(m)
14263 .n(8)
14264 .k(4)
14265 .iterations(1)
14266 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14267 }
14268 }
14269
14270 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
14271 TEST_REQUIRES_ARM_NEON;
14272 for (uint32_t n = 1; n <= 8; n++) {
14273 GemmMicrokernelTester()
14274 .mr(4)
14275 .nr(8)
14276 .kr(1)
14277 .sr(1)
14278 .m(4)
14279 .n(n)
14280 .k(4)
14281 .iterations(1)
14282 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14283 }
14284 }
14285
14286 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4) {
14287 TEST_REQUIRES_ARM_NEON;
14288 for (size_t k = 1; k < 4; k++) {
14289 GemmMicrokernelTester()
14290 .mr(4)
14291 .nr(8)
14292 .kr(1)
14293 .sr(1)
14294 .m(4)
14295 .n(8)
14296 .k(k)
14297 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14298 }
14299 }
14300
14301 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
14302 TEST_REQUIRES_ARM_NEON;
14303 for (size_t k = 1; k < 4; k++) {
14304 GemmMicrokernelTester()
14305 .mr(4)
14306 .nr(8)
14307 .kr(1)
14308 .sr(1)
14309 .m(4)
14310 .n(8)
14311 .k(k)
14312 .a_stride(7)
14313 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14314 }
14315 }
14316
14317 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
14318 TEST_REQUIRES_ARM_NEON;
14319 for (size_t k = 1; k < 4; k++) {
14320 for (uint32_t m = 1; m <= 4; m++) {
14321 for (uint32_t n = 1; n <= 8; n++) {
14322 GemmMicrokernelTester()
14323 .mr(4)
14324 .nr(8)
14325 .kr(1)
14326 .sr(1)
14327 .m(m)
14328 .n(n)
14329 .k(k)
14330 .iterations(1)
14331 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14332 }
14333 }
14334 }
14335 }
14336
14337 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4) {
14338 TEST_REQUIRES_ARM_NEON;
14339 for (size_t k = 5; k < 8; k++) {
14340 GemmMicrokernelTester()
14341 .mr(4)
14342 .nr(8)
14343 .kr(1)
14344 .sr(1)
14345 .m(4)
14346 .n(8)
14347 .k(k)
14348 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14349 }
14350 }
14351
14352 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
14353 TEST_REQUIRES_ARM_NEON;
14354 for (size_t k = 5; k < 8; k++) {
14355 GemmMicrokernelTester()
14356 .mr(4)
14357 .nr(8)
14358 .kr(1)
14359 .sr(1)
14360 .m(4)
14361 .n(8)
14362 .k(k)
14363 .a_stride(11)
14364 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14365 }
14366 }
14367
14368 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
14369 TEST_REQUIRES_ARM_NEON;
14370 for (size_t k = 5; k < 8; k++) {
14371 for (uint32_t m = 1; m <= 4; m++) {
14372 for (uint32_t n = 1; n <= 8; n++) {
14373 GemmMicrokernelTester()
14374 .mr(4)
14375 .nr(8)
14376 .kr(1)
14377 .sr(1)
14378 .m(m)
14379 .n(n)
14380 .k(k)
14381 .iterations(1)
14382 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14383 }
14384 }
14385 }
14386 }
14387
14388 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4) {
14389 TEST_REQUIRES_ARM_NEON;
14390 for (size_t k = 8; k <= 40; k += 4) {
14391 GemmMicrokernelTester()
14392 .mr(4)
14393 .nr(8)
14394 .kr(1)
14395 .sr(1)
14396 .m(4)
14397 .n(8)
14398 .k(k)
14399 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14400 }
14401 }
14402
14403 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
14404 TEST_REQUIRES_ARM_NEON;
14405 for (size_t k = 8; k <= 40; k += 4) {
14406 GemmMicrokernelTester()
14407 .mr(4)
14408 .nr(8)
14409 .kr(1)
14410 .sr(1)
14411 .m(4)
14412 .n(8)
14413 .k(k)
14414 .a_stride(43)
14415 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14416 }
14417 }
14418
14419 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4_subtile) {
14420 TEST_REQUIRES_ARM_NEON;
14421 for (size_t k = 8; k <= 40; k += 4) {
14422 for (uint32_t m = 1; m <= 4; m++) {
14423 for (uint32_t n = 1; n <= 8; n++) {
14424 GemmMicrokernelTester()
14425 .mr(4)
14426 .nr(8)
14427 .kr(1)
14428 .sr(1)
14429 .m(m)
14430 .n(n)
14431 .k(k)
14432 .iterations(1)
14433 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14434 }
14435 }
14436 }
14437 }
14438
14439 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8) {
14440 TEST_REQUIRES_ARM_NEON;
14441 for (uint32_t n = 9; n < 16; n++) {
14442 for (size_t k = 1; k <= 20; k += 5) {
14443 GemmMicrokernelTester()
14444 .mr(4)
14445 .nr(8)
14446 .kr(1)
14447 .sr(1)
14448 .m(4)
14449 .n(8)
14450 .k(k)
14451 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14452 }
14453 }
14454 }
14455
14456 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
14457 TEST_REQUIRES_ARM_NEON;
14458 for (uint32_t n = 9; n < 16; n++) {
14459 for (size_t k = 1; k <= 20; k += 5) {
14460 GemmMicrokernelTester()
14461 .mr(4)
14462 .nr(8)
14463 .kr(1)
14464 .sr(1)
14465 .m(4)
14466 .n(8)
14467 .k(k)
14468 .cn_stride(11)
14469 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14470 }
14471 }
14472 }
14473
14474 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
14475 TEST_REQUIRES_ARM_NEON;
14476 for (uint32_t n = 9; n < 16; n++) {
14477 for (size_t k = 1; k <= 20; k += 5) {
14478 GemmMicrokernelTester()
14479 .mr(4)
14480 .nr(8)
14481 .kr(1)
14482 .sr(1)
14483 .m(4)
14484 .n(n)
14485 .k(k)
14486 .a_stride(23)
14487 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14488 }
14489 }
14490 }
14491
14492 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
14493 TEST_REQUIRES_ARM_NEON;
14494 for (uint32_t n = 9; n < 16; n++) {
14495 for (size_t k = 1; k <= 20; k += 5) {
14496 for (uint32_t m = 1; m <= 4; m++) {
14497 GemmMicrokernelTester()
14498 .mr(4)
14499 .nr(8)
14500 .kr(1)
14501 .sr(1)
14502 .m(m)
14503 .n(n)
14504 .k(k)
14505 .iterations(1)
14506 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14507 }
14508 }
14509 }
14510 }
14511
14512 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8) {
14513 TEST_REQUIRES_ARM_NEON;
14514 for (uint32_t n = 16; n <= 24; n += 8) {
14515 for (size_t k = 1; k <= 20; k += 5) {
14516 GemmMicrokernelTester()
14517 .mr(4)
14518 .nr(8)
14519 .kr(1)
14520 .sr(1)
14521 .m(4)
14522 .n(8)
14523 .k(k)
14524 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14525 }
14526 }
14527 }
14528
14529 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
14530 TEST_REQUIRES_ARM_NEON;
14531 for (uint32_t n = 16; n <= 24; n += 8) {
14532 for (size_t k = 1; k <= 20; k += 5) {
14533 GemmMicrokernelTester()
14534 .mr(4)
14535 .nr(8)
14536 .kr(1)
14537 .sr(1)
14538 .m(4)
14539 .n(n)
14540 .k(k)
14541 .cn_stride(11)
14542 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14543 }
14544 }
14545 }
14546
14547 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
14548 TEST_REQUIRES_ARM_NEON;
14549 for (uint32_t n = 16; n <= 24; n += 8) {
14550 for (size_t k = 1; k <= 20; k += 5) {
14551 GemmMicrokernelTester()
14552 .mr(4)
14553 .nr(8)
14554 .kr(1)
14555 .sr(1)
14556 .m(4)
14557 .n(n)
14558 .k(k)
14559 .a_stride(23)
14560 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14561 }
14562 }
14563 }
14564
14565 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_subtile) {
14566 TEST_REQUIRES_ARM_NEON;
14567 for (uint32_t n = 16; n <= 24; n += 8) {
14568 for (size_t k = 1; k <= 20; k += 5) {
14569 for (uint32_t m = 1; m <= 4; m++) {
14570 GemmMicrokernelTester()
14571 .mr(4)
14572 .nr(8)
14573 .kr(1)
14574 .sr(1)
14575 .m(m)
14576 .n(n)
14577 .k(k)
14578 .iterations(1)
14579 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14580 }
14581 }
14582 }
14583 }
14584
14585 TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cm_subtile) {
14586 TEST_REQUIRES_ARM_NEON;
14587 for (size_t k = 1; k <= 20; k += 5) {
14588 for (uint32_t m = 1; m <= 4; m++) {
14589 for (uint32_t n = 1; n <= 8; n++) {
14590 GemmMicrokernelTester()
14591 .mr(4)
14592 .nr(8)
14593 .kr(1)
14594 .sr(1)
14595 .m(m)
14596 .n(n)
14597 .k(k)
14598 .cm_stride(11)
14599 .iterations(1)
14600 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14601 }
14602 }
14603 }
14604 }
14605
14606 TEST(F32_GEMM_4X8__NEON_LANE_LD128, qmin) {
14607 TEST_REQUIRES_ARM_NEON;
14608 GemmMicrokernelTester()
14609 .mr(4)
14610 .nr(8)
14611 .kr(1)
14612 .sr(1)
14613 .m(4)
14614 .n(8)
14615 .k(4)
14616 .qmin(128)
14617 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14618 }
14619
14620 TEST(F32_GEMM_4X8__NEON_LANE_LD128, qmax) {
14621 TEST_REQUIRES_ARM_NEON;
14622 GemmMicrokernelTester()
14623 .mr(4)
14624 .nr(8)
14625 .kr(1)
14626 .sr(1)
14627 .m(4)
14628 .n(8)
14629 .k(4)
14630 .qmax(128)
14631 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14632 }
14633
14634 TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cm) {
14635 TEST_REQUIRES_ARM_NEON;
14636 GemmMicrokernelTester()
14637 .mr(4)
14638 .nr(8)
14639 .kr(1)
14640 .sr(1)
14641 .m(4)
14642 .n(8)
14643 .k(4)
14644 .cm_stride(11)
14645 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
14646 }
14647#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14648
14649
14650#if XNN_ARCH_ARM || XNN_ARCH_ARM64
14651 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2) {
14652 TEST_REQUIRES_ARM_NEON;
14653 GemmMicrokernelTester()
14654 .mr(5)
14655 .nr(8)
14656 .kr(1)
14657 .sr(1)
14658 .m(5)
14659 .n(8)
14660 .k(2)
14661 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14662 }
14663
14664 TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cn) {
14665 TEST_REQUIRES_ARM_NEON;
14666 GemmMicrokernelTester()
14667 .mr(5)
14668 .nr(8)
14669 .kr(1)
14670 .sr(1)
14671 .m(5)
14672 .n(8)
14673 .k(2)
14674 .cn_stride(11)
14675 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14676 }
14677
14678 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
14679 TEST_REQUIRES_ARM_NEON;
14680 GemmMicrokernelTester()
14681 .mr(5)
14682 .nr(8)
14683 .kr(1)
14684 .sr(1)
14685 .m(5)
14686 .n(8)
14687 .k(2)
14688 .a_stride(5)
14689 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14690 }
14691
14692 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
14693 TEST_REQUIRES_ARM_NEON;
14694 for (uint32_t m = 1; m <= 5; m++) {
14695 for (uint32_t n = 1; n <= 8; n++) {
14696 GemmMicrokernelTester()
14697 .mr(5)
14698 .nr(8)
14699 .kr(1)
14700 .sr(1)
14701 .m(m)
14702 .n(n)
14703 .k(2)
14704 .iterations(1)
14705 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14706 }
14707 }
14708 }
14709
14710 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
14711 TEST_REQUIRES_ARM_NEON;
14712 for (uint32_t m = 1; m <= 5; m++) {
14713 GemmMicrokernelTester()
14714 .mr(5)
14715 .nr(8)
14716 .kr(1)
14717 .sr(1)
14718 .m(m)
14719 .n(8)
14720 .k(2)
14721 .iterations(1)
14722 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14723 }
14724 }
14725
14726 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
14727 TEST_REQUIRES_ARM_NEON;
14728 for (uint32_t n = 1; n <= 8; n++) {
14729 GemmMicrokernelTester()
14730 .mr(5)
14731 .nr(8)
14732 .kr(1)
14733 .sr(1)
14734 .m(5)
14735 .n(n)
14736 .k(2)
14737 .iterations(1)
14738 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14739 }
14740 }
14741
14742 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2) {
14743 TEST_REQUIRES_ARM_NEON;
14744 for (size_t k = 1; k < 2; k++) {
14745 GemmMicrokernelTester()
14746 .mr(5)
14747 .nr(8)
14748 .kr(1)
14749 .sr(1)
14750 .m(5)
14751 .n(8)
14752 .k(k)
14753 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14754 }
14755 }
14756
14757 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
14758 TEST_REQUIRES_ARM_NEON;
14759 for (size_t k = 1; k < 2; k++) {
14760 GemmMicrokernelTester()
14761 .mr(5)
14762 .nr(8)
14763 .kr(1)
14764 .sr(1)
14765 .m(5)
14766 .n(8)
14767 .k(k)
14768 .a_stride(5)
14769 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14770 }
14771 }
14772
14773 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
14774 TEST_REQUIRES_ARM_NEON;
14775 for (size_t k = 1; k < 2; k++) {
14776 for (uint32_t m = 1; m <= 5; m++) {
14777 for (uint32_t n = 1; n <= 8; n++) {
14778 GemmMicrokernelTester()
14779 .mr(5)
14780 .nr(8)
14781 .kr(1)
14782 .sr(1)
14783 .m(m)
14784 .n(n)
14785 .k(k)
14786 .iterations(1)
14787 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14788 }
14789 }
14790 }
14791 }
14792
14793 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2) {
14794 TEST_REQUIRES_ARM_NEON;
14795 for (size_t k = 3; k < 4; k++) {
14796 GemmMicrokernelTester()
14797 .mr(5)
14798 .nr(8)
14799 .kr(1)
14800 .sr(1)
14801 .m(5)
14802 .n(8)
14803 .k(k)
14804 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14805 }
14806 }
14807
14808 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
14809 TEST_REQUIRES_ARM_NEON;
14810 for (size_t k = 3; k < 4; k++) {
14811 GemmMicrokernelTester()
14812 .mr(5)
14813 .nr(8)
14814 .kr(1)
14815 .sr(1)
14816 .m(5)
14817 .n(8)
14818 .k(k)
14819 .a_stride(7)
14820 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14821 }
14822 }
14823
14824 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
14825 TEST_REQUIRES_ARM_NEON;
14826 for (size_t k = 3; k < 4; k++) {
14827 for (uint32_t m = 1; m <= 5; m++) {
14828 for (uint32_t n = 1; n <= 8; n++) {
14829 GemmMicrokernelTester()
14830 .mr(5)
14831 .nr(8)
14832 .kr(1)
14833 .sr(1)
14834 .m(m)
14835 .n(n)
14836 .k(k)
14837 .iterations(1)
14838 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14839 }
14840 }
14841 }
14842 }
14843
14844 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2) {
14845 TEST_REQUIRES_ARM_NEON;
14846 for (size_t k = 4; k <= 20; k += 2) {
14847 GemmMicrokernelTester()
14848 .mr(5)
14849 .nr(8)
14850 .kr(1)
14851 .sr(1)
14852 .m(5)
14853 .n(8)
14854 .k(k)
14855 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14856 }
14857 }
14858
14859 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
14860 TEST_REQUIRES_ARM_NEON;
14861 for (size_t k = 4; k <= 20; k += 2) {
14862 GemmMicrokernelTester()
14863 .mr(5)
14864 .nr(8)
14865 .kr(1)
14866 .sr(1)
14867 .m(5)
14868 .n(8)
14869 .k(k)
14870 .a_stride(23)
14871 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14872 }
14873 }
14874
14875 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2_subtile) {
14876 TEST_REQUIRES_ARM_NEON;
14877 for (size_t k = 4; k <= 20; k += 2) {
14878 for (uint32_t m = 1; m <= 5; m++) {
14879 for (uint32_t n = 1; n <= 8; n++) {
14880 GemmMicrokernelTester()
14881 .mr(5)
14882 .nr(8)
14883 .kr(1)
14884 .sr(1)
14885 .m(m)
14886 .n(n)
14887 .k(k)
14888 .iterations(1)
14889 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14890 }
14891 }
14892 }
14893 }
14894
14895 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8) {
14896 TEST_REQUIRES_ARM_NEON;
14897 for (uint32_t n = 9; n < 16; n++) {
14898 for (size_t k = 1; k <= 10; k += 3) {
14899 GemmMicrokernelTester()
14900 .mr(5)
14901 .nr(8)
14902 .kr(1)
14903 .sr(1)
14904 .m(5)
14905 .n(8)
14906 .k(k)
14907 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14908 }
14909 }
14910 }
14911
14912 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
14913 TEST_REQUIRES_ARM_NEON;
14914 for (uint32_t n = 9; n < 16; n++) {
14915 for (size_t k = 1; k <= 10; k += 3) {
14916 GemmMicrokernelTester()
14917 .mr(5)
14918 .nr(8)
14919 .kr(1)
14920 .sr(1)
14921 .m(5)
14922 .n(8)
14923 .k(k)
14924 .cn_stride(11)
14925 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14926 }
14927 }
14928 }
14929
14930 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
14931 TEST_REQUIRES_ARM_NEON;
14932 for (uint32_t n = 9; n < 16; n++) {
14933 for (size_t k = 1; k <= 10; k += 3) {
14934 GemmMicrokernelTester()
14935 .mr(5)
14936 .nr(8)
14937 .kr(1)
14938 .sr(1)
14939 .m(5)
14940 .n(n)
14941 .k(k)
14942 .a_stride(13)
14943 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14944 }
14945 }
14946 }
14947
14948 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
14949 TEST_REQUIRES_ARM_NEON;
14950 for (uint32_t n = 9; n < 16; n++) {
14951 for (size_t k = 1; k <= 10; k += 3) {
14952 for (uint32_t m = 1; m <= 5; m++) {
14953 GemmMicrokernelTester()
14954 .mr(5)
14955 .nr(8)
14956 .kr(1)
14957 .sr(1)
14958 .m(m)
14959 .n(n)
14960 .k(k)
14961 .iterations(1)
14962 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14963 }
14964 }
14965 }
14966 }
14967
14968 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8) {
14969 TEST_REQUIRES_ARM_NEON;
14970 for (uint32_t n = 16; n <= 24; n += 8) {
14971 for (size_t k = 1; k <= 10; k += 3) {
14972 GemmMicrokernelTester()
14973 .mr(5)
14974 .nr(8)
14975 .kr(1)
14976 .sr(1)
14977 .m(5)
14978 .n(8)
14979 .k(k)
14980 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14981 }
14982 }
14983 }
14984
14985 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
14986 TEST_REQUIRES_ARM_NEON;
14987 for (uint32_t n = 16; n <= 24; n += 8) {
14988 for (size_t k = 1; k <= 10; k += 3) {
14989 GemmMicrokernelTester()
14990 .mr(5)
14991 .nr(8)
14992 .kr(1)
14993 .sr(1)
14994 .m(5)
14995 .n(n)
14996 .k(k)
14997 .cn_stride(11)
14998 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
14999 }
15000 }
15001 }
15002
15003 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
15004 TEST_REQUIRES_ARM_NEON;
15005 for (uint32_t n = 16; n <= 24; n += 8) {
15006 for (size_t k = 1; k <= 10; k += 3) {
15007 GemmMicrokernelTester()
15008 .mr(5)
15009 .nr(8)
15010 .kr(1)
15011 .sr(1)
15012 .m(5)
15013 .n(n)
15014 .k(k)
15015 .a_stride(13)
15016 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
15017 }
15018 }
15019 }
15020
15021 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_subtile) {
15022 TEST_REQUIRES_ARM_NEON;
15023 for (uint32_t n = 16; n <= 24; n += 8) {
15024 for (size_t k = 1; k <= 10; k += 3) {
15025 for (uint32_t m = 1; m <= 5; m++) {
15026 GemmMicrokernelTester()
15027 .mr(5)
15028 .nr(8)
15029 .kr(1)
15030 .sr(1)
15031 .m(m)
15032 .n(n)
15033 .k(k)
15034 .iterations(1)
15035 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
15036 }
15037 }
15038 }
15039 }
15040
15041 TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cm_subtile) {
15042 TEST_REQUIRES_ARM_NEON;
15043 for (size_t k = 1; k <= 10; k += 3) {
15044 for (uint32_t m = 1; m <= 5; m++) {
15045 for (uint32_t n = 1; n <= 8; n++) {
15046 GemmMicrokernelTester()
15047 .mr(5)
15048 .nr(8)
15049 .kr(1)
15050 .sr(1)
15051 .m(m)
15052 .n(n)
15053 .k(k)
15054 .cm_stride(11)
15055 .iterations(1)
15056 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
15057 }
15058 }
15059 }
15060 }
15061
15062 TEST(F32_GEMM_5X8__NEON_LANE_LD64, qmin) {
15063 TEST_REQUIRES_ARM_NEON;
15064 GemmMicrokernelTester()
15065 .mr(5)
15066 .nr(8)
15067 .kr(1)
15068 .sr(1)
15069 .m(5)
15070 .n(8)
15071 .k(2)
15072 .qmin(128)
15073 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
15074 }
15075
15076 TEST(F32_GEMM_5X8__NEON_LANE_LD64, qmax) {
15077 TEST_REQUIRES_ARM_NEON;
15078 GemmMicrokernelTester()
15079 .mr(5)
15080 .nr(8)
15081 .kr(1)
15082 .sr(1)
15083 .m(5)
15084 .n(8)
15085 .k(2)
15086 .qmax(128)
15087 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
15088 }
15089
15090 TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cm) {
15091 TEST_REQUIRES_ARM_NEON;
15092 GemmMicrokernelTester()
15093 .mr(5)
15094 .nr(8)
15095 .kr(1)
15096 .sr(1)
15097 .m(5)
15098 .n(8)
15099 .k(2)
15100 .cm_stride(11)
15101 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
15102 }
15103#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15104
15105
15106#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15107 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2) {
15108 TEST_REQUIRES_ARM_NEON;
15109 GemmMicrokernelTester()
15110 .mr(6)
15111 .nr(8)
15112 .kr(1)
15113 .sr(1)
15114 .m(6)
15115 .n(8)
15116 .k(2)
15117 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15118 }
15119
15120 TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cn) {
15121 TEST_REQUIRES_ARM_NEON;
15122 GemmMicrokernelTester()
15123 .mr(6)
15124 .nr(8)
15125 .kr(1)
15126 .sr(1)
15127 .m(6)
15128 .n(8)
15129 .k(2)
15130 .cn_stride(11)
15131 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15132 }
15133
15134 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_strided_a) {
15135 TEST_REQUIRES_ARM_NEON;
15136 GemmMicrokernelTester()
15137 .mr(6)
15138 .nr(8)
15139 .kr(1)
15140 .sr(1)
15141 .m(6)
15142 .n(8)
15143 .k(2)
15144 .a_stride(5)
15145 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15146 }
15147
15148 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
15149 TEST_REQUIRES_ARM_NEON;
15150 for (uint32_t m = 1; m <= 6; m++) {
15151 for (uint32_t n = 1; n <= 8; n++) {
15152 GemmMicrokernelTester()
15153 .mr(6)
15154 .nr(8)
15155 .kr(1)
15156 .sr(1)
15157 .m(m)
15158 .n(n)
15159 .k(2)
15160 .iterations(1)
15161 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15162 }
15163 }
15164 }
15165
15166 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
15167 TEST_REQUIRES_ARM_NEON;
15168 for (uint32_t m = 1; m <= 6; m++) {
15169 GemmMicrokernelTester()
15170 .mr(6)
15171 .nr(8)
15172 .kr(1)
15173 .sr(1)
15174 .m(m)
15175 .n(8)
15176 .k(2)
15177 .iterations(1)
15178 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15179 }
15180 }
15181
15182 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
15183 TEST_REQUIRES_ARM_NEON;
15184 for (uint32_t n = 1; n <= 8; n++) {
15185 GemmMicrokernelTester()
15186 .mr(6)
15187 .nr(8)
15188 .kr(1)
15189 .sr(1)
15190 .m(6)
15191 .n(n)
15192 .k(2)
15193 .iterations(1)
15194 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15195 }
15196 }
15197
15198 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2) {
15199 TEST_REQUIRES_ARM_NEON;
15200 for (size_t k = 1; k < 2; k++) {
15201 GemmMicrokernelTester()
15202 .mr(6)
15203 .nr(8)
15204 .kr(1)
15205 .sr(1)
15206 .m(6)
15207 .n(8)
15208 .k(k)
15209 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15210 }
15211 }
15212
15213 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2_strided_a) {
15214 TEST_REQUIRES_ARM_NEON;
15215 for (size_t k = 1; k < 2; k++) {
15216 GemmMicrokernelTester()
15217 .mr(6)
15218 .nr(8)
15219 .kr(1)
15220 .sr(1)
15221 .m(6)
15222 .n(8)
15223 .k(k)
15224 .a_stride(5)
15225 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15226 }
15227 }
15228
15229 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
15230 TEST_REQUIRES_ARM_NEON;
15231 for (size_t k = 1; k < 2; k++) {
15232 for (uint32_t m = 1; m <= 6; m++) {
15233 for (uint32_t n = 1; n <= 8; n++) {
15234 GemmMicrokernelTester()
15235 .mr(6)
15236 .nr(8)
15237 .kr(1)
15238 .sr(1)
15239 .m(m)
15240 .n(n)
15241 .k(k)
15242 .iterations(1)
15243 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15244 }
15245 }
15246 }
15247 }
15248
15249 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2) {
15250 TEST_REQUIRES_ARM_NEON;
15251 for (size_t k = 3; k < 4; k++) {
15252 GemmMicrokernelTester()
15253 .mr(6)
15254 .nr(8)
15255 .kr(1)
15256 .sr(1)
15257 .m(6)
15258 .n(8)
15259 .k(k)
15260 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15261 }
15262 }
15263
15264 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2_strided_a) {
15265 TEST_REQUIRES_ARM_NEON;
15266 for (size_t k = 3; k < 4; k++) {
15267 GemmMicrokernelTester()
15268 .mr(6)
15269 .nr(8)
15270 .kr(1)
15271 .sr(1)
15272 .m(6)
15273 .n(8)
15274 .k(k)
15275 .a_stride(7)
15276 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15277 }
15278 }
15279
15280 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
15281 TEST_REQUIRES_ARM_NEON;
15282 for (size_t k = 3; k < 4; k++) {
15283 for (uint32_t m = 1; m <= 6; m++) {
15284 for (uint32_t n = 1; n <= 8; n++) {
15285 GemmMicrokernelTester()
15286 .mr(6)
15287 .nr(8)
15288 .kr(1)
15289 .sr(1)
15290 .m(m)
15291 .n(n)
15292 .k(k)
15293 .iterations(1)
15294 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15295 }
15296 }
15297 }
15298 }
15299
15300 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2) {
15301 TEST_REQUIRES_ARM_NEON;
15302 for (size_t k = 4; k <= 20; k += 2) {
15303 GemmMicrokernelTester()
15304 .mr(6)
15305 .nr(8)
15306 .kr(1)
15307 .sr(1)
15308 .m(6)
15309 .n(8)
15310 .k(k)
15311 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15312 }
15313 }
15314
15315 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2_strided_a) {
15316 TEST_REQUIRES_ARM_NEON;
15317 for (size_t k = 4; k <= 20; k += 2) {
15318 GemmMicrokernelTester()
15319 .mr(6)
15320 .nr(8)
15321 .kr(1)
15322 .sr(1)
15323 .m(6)
15324 .n(8)
15325 .k(k)
15326 .a_stride(23)
15327 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15328 }
15329 }
15330
15331 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2_subtile) {
15332 TEST_REQUIRES_ARM_NEON;
15333 for (size_t k = 4; k <= 20; k += 2) {
15334 for (uint32_t m = 1; m <= 6; m++) {
15335 for (uint32_t n = 1; n <= 8; n++) {
15336 GemmMicrokernelTester()
15337 .mr(6)
15338 .nr(8)
15339 .kr(1)
15340 .sr(1)
15341 .m(m)
15342 .n(n)
15343 .k(k)
15344 .iterations(1)
15345 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15346 }
15347 }
15348 }
15349 }
15350
15351 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8) {
15352 TEST_REQUIRES_ARM_NEON;
15353 for (uint32_t n = 9; n < 16; n++) {
15354 for (size_t k = 1; k <= 10; k += 3) {
15355 GemmMicrokernelTester()
15356 .mr(6)
15357 .nr(8)
15358 .kr(1)
15359 .sr(1)
15360 .m(6)
15361 .n(8)
15362 .k(k)
15363 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15364 }
15365 }
15366 }
15367
15368 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
15369 TEST_REQUIRES_ARM_NEON;
15370 for (uint32_t n = 9; n < 16; n++) {
15371 for (size_t k = 1; k <= 10; k += 3) {
15372 GemmMicrokernelTester()
15373 .mr(6)
15374 .nr(8)
15375 .kr(1)
15376 .sr(1)
15377 .m(6)
15378 .n(8)
15379 .k(k)
15380 .cn_stride(11)
15381 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15382 }
15383 }
15384 }
15385
15386 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_a) {
15387 TEST_REQUIRES_ARM_NEON;
15388 for (uint32_t n = 9; n < 16; n++) {
15389 for (size_t k = 1; k <= 10; k += 3) {
15390 GemmMicrokernelTester()
15391 .mr(6)
15392 .nr(8)
15393 .kr(1)
15394 .sr(1)
15395 .m(6)
15396 .n(n)
15397 .k(k)
15398 .a_stride(13)
15399 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15400 }
15401 }
15402 }
15403
15404 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
15405 TEST_REQUIRES_ARM_NEON;
15406 for (uint32_t n = 9; n < 16; n++) {
15407 for (size_t k = 1; k <= 10; k += 3) {
15408 for (uint32_t m = 1; m <= 6; m++) {
15409 GemmMicrokernelTester()
15410 .mr(6)
15411 .nr(8)
15412 .kr(1)
15413 .sr(1)
15414 .m(m)
15415 .n(n)
15416 .k(k)
15417 .iterations(1)
15418 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15419 }
15420 }
15421 }
15422 }
15423
15424 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8) {
15425 TEST_REQUIRES_ARM_NEON;
15426 for (uint32_t n = 16; n <= 24; n += 8) {
15427 for (size_t k = 1; k <= 10; k += 3) {
15428 GemmMicrokernelTester()
15429 .mr(6)
15430 .nr(8)
15431 .kr(1)
15432 .sr(1)
15433 .m(6)
15434 .n(8)
15435 .k(k)
15436 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15437 }
15438 }
15439 }
15440
15441 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
15442 TEST_REQUIRES_ARM_NEON;
15443 for (uint32_t n = 16; n <= 24; n += 8) {
15444 for (size_t k = 1; k <= 10; k += 3) {
15445 GemmMicrokernelTester()
15446 .mr(6)
15447 .nr(8)
15448 .kr(1)
15449 .sr(1)
15450 .m(6)
15451 .n(n)
15452 .k(k)
15453 .cn_stride(11)
15454 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15455 }
15456 }
15457 }
15458
15459 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_strided_a) {
15460 TEST_REQUIRES_ARM_NEON;
15461 for (uint32_t n = 16; n <= 24; n += 8) {
15462 for (size_t k = 1; k <= 10; k += 3) {
15463 GemmMicrokernelTester()
15464 .mr(6)
15465 .nr(8)
15466 .kr(1)
15467 .sr(1)
15468 .m(6)
15469 .n(n)
15470 .k(k)
15471 .a_stride(13)
15472 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15473 }
15474 }
15475 }
15476
15477 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_subtile) {
15478 TEST_REQUIRES_ARM_NEON;
15479 for (uint32_t n = 16; n <= 24; n += 8) {
15480 for (size_t k = 1; k <= 10; k += 3) {
15481 for (uint32_t m = 1; m <= 6; m++) {
15482 GemmMicrokernelTester()
15483 .mr(6)
15484 .nr(8)
15485 .kr(1)
15486 .sr(1)
15487 .m(m)
15488 .n(n)
15489 .k(k)
15490 .iterations(1)
15491 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15492 }
15493 }
15494 }
15495 }
15496
15497 TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cm_subtile) {
15498 TEST_REQUIRES_ARM_NEON;
15499 for (size_t k = 1; k <= 10; k += 3) {
15500 for (uint32_t m = 1; m <= 6; m++) {
15501 for (uint32_t n = 1; n <= 8; n++) {
15502 GemmMicrokernelTester()
15503 .mr(6)
15504 .nr(8)
15505 .kr(1)
15506 .sr(1)
15507 .m(m)
15508 .n(n)
15509 .k(k)
15510 .cm_stride(11)
15511 .iterations(1)
15512 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15513 }
15514 }
15515 }
15516 }
15517
15518 TEST(F32_GEMM_6X8__NEON_LANE_LD64, qmin) {
15519 TEST_REQUIRES_ARM_NEON;
15520 GemmMicrokernelTester()
15521 .mr(6)
15522 .nr(8)
15523 .kr(1)
15524 .sr(1)
15525 .m(6)
15526 .n(8)
15527 .k(2)
15528 .qmin(128)
15529 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15530 }
15531
15532 TEST(F32_GEMM_6X8__NEON_LANE_LD64, qmax) {
15533 TEST_REQUIRES_ARM_NEON;
15534 GemmMicrokernelTester()
15535 .mr(6)
15536 .nr(8)
15537 .kr(1)
15538 .sr(1)
15539 .m(6)
15540 .n(8)
15541 .k(2)
15542 .qmax(128)
15543 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15544 }
15545
15546 TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cm) {
15547 TEST_REQUIRES_ARM_NEON;
15548 GemmMicrokernelTester()
15549 .mr(6)
15550 .nr(8)
15551 .kr(1)
15552 .sr(1)
15553 .m(6)
15554 .n(8)
15555 .k(2)
15556 .cm_stride(11)
15557 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
15558 }
15559#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15560
15561
15562#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15563 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4) {
15564 TEST_REQUIRES_ARM_NEON;
15565 GemmMicrokernelTester()
15566 .mr(6)
15567 .nr(8)
15568 .kr(1)
15569 .sr(1)
15570 .m(6)
15571 .n(8)
15572 .k(4)
15573 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15574 }
15575
15576 TEST(F32_GEMM_6X8__NEON_LANE_LD128, strided_cn) {
15577 TEST_REQUIRES_ARM_NEON;
15578 GemmMicrokernelTester()
15579 .mr(6)
15580 .nr(8)
15581 .kr(1)
15582 .sr(1)
15583 .m(6)
15584 .n(8)
15585 .k(4)
15586 .cn_stride(11)
15587 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15588 }
15589
15590 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_strided_a) {
15591 TEST_REQUIRES_ARM_NEON;
15592 GemmMicrokernelTester()
15593 .mr(6)
15594 .nr(8)
15595 .kr(1)
15596 .sr(1)
15597 .m(6)
15598 .n(8)
15599 .k(4)
15600 .a_stride(7)
15601 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15602 }
15603
15604 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
15605 TEST_REQUIRES_ARM_NEON;
15606 for (uint32_t m = 1; m <= 6; m++) {
15607 for (uint32_t n = 1; n <= 8; n++) {
15608 GemmMicrokernelTester()
15609 .mr(6)
15610 .nr(8)
15611 .kr(1)
15612 .sr(1)
15613 .m(m)
15614 .n(n)
15615 .k(4)
15616 .iterations(1)
15617 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15618 }
15619 }
15620 }
15621
15622 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
15623 TEST_REQUIRES_ARM_NEON;
15624 for (uint32_t m = 1; m <= 6; m++) {
15625 GemmMicrokernelTester()
15626 .mr(6)
15627 .nr(8)
15628 .kr(1)
15629 .sr(1)
15630 .m(m)
15631 .n(8)
15632 .k(4)
15633 .iterations(1)
15634 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15635 }
15636 }
15637
15638 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
15639 TEST_REQUIRES_ARM_NEON;
15640 for (uint32_t n = 1; n <= 8; n++) {
15641 GemmMicrokernelTester()
15642 .mr(6)
15643 .nr(8)
15644 .kr(1)
15645 .sr(1)
15646 .m(6)
15647 .n(n)
15648 .k(4)
15649 .iterations(1)
15650 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15651 }
15652 }
15653
15654 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_lt_4) {
15655 TEST_REQUIRES_ARM_NEON;
15656 for (size_t k = 1; k < 4; k++) {
15657 GemmMicrokernelTester()
15658 .mr(6)
15659 .nr(8)
15660 .kr(1)
15661 .sr(1)
15662 .m(6)
15663 .n(8)
15664 .k(k)
15665 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15666 }
15667 }
15668
15669 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_lt_4_strided_a) {
15670 TEST_REQUIRES_ARM_NEON;
15671 for (size_t k = 1; k < 4; k++) {
15672 GemmMicrokernelTester()
15673 .mr(6)
15674 .nr(8)
15675 .kr(1)
15676 .sr(1)
15677 .m(6)
15678 .n(8)
15679 .k(k)
15680 .a_stride(7)
15681 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15682 }
15683 }
15684
15685 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
15686 TEST_REQUIRES_ARM_NEON;
15687 for (size_t k = 1; k < 4; k++) {
15688 for (uint32_t m = 1; m <= 6; m++) {
15689 for (uint32_t n = 1; n <= 8; n++) {
15690 GemmMicrokernelTester()
15691 .mr(6)
15692 .nr(8)
15693 .kr(1)
15694 .sr(1)
15695 .m(m)
15696 .n(n)
15697 .k(k)
15698 .iterations(1)
15699 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15700 }
15701 }
15702 }
15703 }
15704
15705 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_gt_4) {
15706 TEST_REQUIRES_ARM_NEON;
15707 for (size_t k = 5; k < 8; k++) {
15708 GemmMicrokernelTester()
15709 .mr(6)
15710 .nr(8)
15711 .kr(1)
15712 .sr(1)
15713 .m(6)
15714 .n(8)
15715 .k(k)
15716 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15717 }
15718 }
15719
15720 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_gt_4_strided_a) {
15721 TEST_REQUIRES_ARM_NEON;
15722 for (size_t k = 5; k < 8; k++) {
15723 GemmMicrokernelTester()
15724 .mr(6)
15725 .nr(8)
15726 .kr(1)
15727 .sr(1)
15728 .m(6)
15729 .n(8)
15730 .k(k)
15731 .a_stride(11)
15732 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15733 }
15734 }
15735
15736 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
15737 TEST_REQUIRES_ARM_NEON;
15738 for (size_t k = 5; k < 8; k++) {
15739 for (uint32_t m = 1; m <= 6; m++) {
15740 for (uint32_t n = 1; n <= 8; n++) {
15741 GemmMicrokernelTester()
15742 .mr(6)
15743 .nr(8)
15744 .kr(1)
15745 .sr(1)
15746 .m(m)
15747 .n(n)
15748 .k(k)
15749 .iterations(1)
15750 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15751 }
15752 }
15753 }
15754 }
15755
15756 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_div_4) {
15757 TEST_REQUIRES_ARM_NEON;
15758 for (size_t k = 8; k <= 40; k += 4) {
15759 GemmMicrokernelTester()
15760 .mr(6)
15761 .nr(8)
15762 .kr(1)
15763 .sr(1)
15764 .m(6)
15765 .n(8)
15766 .k(k)
15767 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15768 }
15769 }
15770
15771 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_div_4_strided_a) {
15772 TEST_REQUIRES_ARM_NEON;
15773 for (size_t k = 8; k <= 40; k += 4) {
15774 GemmMicrokernelTester()
15775 .mr(6)
15776 .nr(8)
15777 .kr(1)
15778 .sr(1)
15779 .m(6)
15780 .n(8)
15781 .k(k)
15782 .a_stride(43)
15783 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15784 }
15785 }
15786
15787 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_div_4_subtile) {
15788 TEST_REQUIRES_ARM_NEON;
15789 for (size_t k = 8; k <= 40; k += 4) {
15790 for (uint32_t m = 1; m <= 6; m++) {
15791 for (uint32_t n = 1; n <= 8; n++) {
15792 GemmMicrokernelTester()
15793 .mr(6)
15794 .nr(8)
15795 .kr(1)
15796 .sr(1)
15797 .m(m)
15798 .n(n)
15799 .k(k)
15800 .iterations(1)
15801 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15802 }
15803 }
15804 }
15805 }
15806
15807 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8) {
15808 TEST_REQUIRES_ARM_NEON;
15809 for (uint32_t n = 9; n < 16; n++) {
15810 for (size_t k = 1; k <= 20; k += 5) {
15811 GemmMicrokernelTester()
15812 .mr(6)
15813 .nr(8)
15814 .kr(1)
15815 .sr(1)
15816 .m(6)
15817 .n(8)
15818 .k(k)
15819 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15820 }
15821 }
15822 }
15823
15824 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
15825 TEST_REQUIRES_ARM_NEON;
15826 for (uint32_t n = 9; n < 16; n++) {
15827 for (size_t k = 1; k <= 20; k += 5) {
15828 GemmMicrokernelTester()
15829 .mr(6)
15830 .nr(8)
15831 .kr(1)
15832 .sr(1)
15833 .m(6)
15834 .n(8)
15835 .k(k)
15836 .cn_stride(11)
15837 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15838 }
15839 }
15840 }
15841
15842 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8_strided_a) {
15843 TEST_REQUIRES_ARM_NEON;
15844 for (uint32_t n = 9; n < 16; n++) {
15845 for (size_t k = 1; k <= 20; k += 5) {
15846 GemmMicrokernelTester()
15847 .mr(6)
15848 .nr(8)
15849 .kr(1)
15850 .sr(1)
15851 .m(6)
15852 .n(n)
15853 .k(k)
15854 .a_stride(23)
15855 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15856 }
15857 }
15858 }
15859
15860 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
15861 TEST_REQUIRES_ARM_NEON;
15862 for (uint32_t n = 9; n < 16; n++) {
15863 for (size_t k = 1; k <= 20; k += 5) {
15864 for (uint32_t m = 1; m <= 6; m++) {
15865 GemmMicrokernelTester()
15866 .mr(6)
15867 .nr(8)
15868 .kr(1)
15869 .sr(1)
15870 .m(m)
15871 .n(n)
15872 .k(k)
15873 .iterations(1)
15874 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15875 }
15876 }
15877 }
15878 }
15879
15880 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8) {
15881 TEST_REQUIRES_ARM_NEON;
15882 for (uint32_t n = 16; n <= 24; n += 8) {
15883 for (size_t k = 1; k <= 20; k += 5) {
15884 GemmMicrokernelTester()
15885 .mr(6)
15886 .nr(8)
15887 .kr(1)
15888 .sr(1)
15889 .m(6)
15890 .n(8)
15891 .k(k)
15892 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15893 }
15894 }
15895 }
15896
15897 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
15898 TEST_REQUIRES_ARM_NEON;
15899 for (uint32_t n = 16; n <= 24; n += 8) {
15900 for (size_t k = 1; k <= 20; k += 5) {
15901 GemmMicrokernelTester()
15902 .mr(6)
15903 .nr(8)
15904 .kr(1)
15905 .sr(1)
15906 .m(6)
15907 .n(n)
15908 .k(k)
15909 .cn_stride(11)
15910 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15911 }
15912 }
15913 }
15914
15915 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8_strided_a) {
15916 TEST_REQUIRES_ARM_NEON;
15917 for (uint32_t n = 16; n <= 24; n += 8) {
15918 for (size_t k = 1; k <= 20; k += 5) {
15919 GemmMicrokernelTester()
15920 .mr(6)
15921 .nr(8)
15922 .kr(1)
15923 .sr(1)
15924 .m(6)
15925 .n(n)
15926 .k(k)
15927 .a_stride(23)
15928 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15929 }
15930 }
15931 }
15932
15933 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8_subtile) {
15934 TEST_REQUIRES_ARM_NEON;
15935 for (uint32_t n = 16; n <= 24; n += 8) {
15936 for (size_t k = 1; k <= 20; k += 5) {
15937 for (uint32_t m = 1; m <= 6; m++) {
15938 GemmMicrokernelTester()
15939 .mr(6)
15940 .nr(8)
15941 .kr(1)
15942 .sr(1)
15943 .m(m)
15944 .n(n)
15945 .k(k)
15946 .iterations(1)
15947 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15948 }
15949 }
15950 }
15951 }
15952
15953 TEST(F32_GEMM_6X8__NEON_LANE_LD128, strided_cm_subtile) {
15954 TEST_REQUIRES_ARM_NEON;
15955 for (size_t k = 1; k <= 20; k += 5) {
15956 for (uint32_t m = 1; m <= 6; m++) {
15957 for (uint32_t n = 1; n <= 8; n++) {
15958 GemmMicrokernelTester()
15959 .mr(6)
15960 .nr(8)
15961 .kr(1)
15962 .sr(1)
15963 .m(m)
15964 .n(n)
15965 .k(k)
15966 .cm_stride(11)
15967 .iterations(1)
15968 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15969 }
15970 }
15971 }
15972 }
15973
15974 TEST(F32_GEMM_6X8__NEON_LANE_LD128, qmin) {
15975 TEST_REQUIRES_ARM_NEON;
15976 GemmMicrokernelTester()
15977 .mr(6)
15978 .nr(8)
15979 .kr(1)
15980 .sr(1)
15981 .m(6)
15982 .n(8)
15983 .k(4)
15984 .qmin(128)
15985 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
15986 }
15987
15988 TEST(F32_GEMM_6X8__NEON_LANE_LD128, qmax) {
15989 TEST_REQUIRES_ARM_NEON;
15990 GemmMicrokernelTester()
15991 .mr(6)
15992 .nr(8)
15993 .kr(1)
15994 .sr(1)
15995 .m(6)
15996 .n(8)
15997 .k(4)
15998 .qmax(128)
15999 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
16000 }
16001
16002 TEST(F32_GEMM_6X8__NEON_LANE_LD128, strided_cm) {
16003 TEST_REQUIRES_ARM_NEON;
16004 GemmMicrokernelTester()
16005 .mr(6)
16006 .nr(8)
16007 .kr(1)
16008 .sr(1)
16009 .m(6)
16010 .n(8)
16011 .k(4)
16012 .cm_stride(11)
16013 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
16014 }
16015#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16016
16017
16018#if XNN_ARCH_ARM64
16019 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2) {
16020 TEST_REQUIRES_ARM_NEON_FMA;
16021 GemmMicrokernelTester()
16022 .mr(1)
16023 .nr(8)
16024 .kr(1)
16025 .sr(1)
16026 .m(1)
16027 .n(8)
16028 .k(2)
16029 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16030 }
16031
16032 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cn) {
16033 TEST_REQUIRES_ARM_NEON_FMA;
16034 GemmMicrokernelTester()
16035 .mr(1)
16036 .nr(8)
16037 .kr(1)
16038 .sr(1)
16039 .m(1)
16040 .n(8)
16041 .k(2)
16042 .cn_stride(11)
16043 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16044 }
16045
16046 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
16047 TEST_REQUIRES_ARM_NEON_FMA;
16048 GemmMicrokernelTester()
16049 .mr(1)
16050 .nr(8)
16051 .kr(1)
16052 .sr(1)
16053 .m(1)
16054 .n(8)
16055 .k(2)
16056 .a_stride(5)
16057 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16058 }
16059
16060 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
16061 TEST_REQUIRES_ARM_NEON_FMA;
16062 for (uint32_t m = 1; m <= 1; m++) {
16063 for (uint32_t n = 1; n <= 8; n++) {
16064 GemmMicrokernelTester()
16065 .mr(1)
16066 .nr(8)
16067 .kr(1)
16068 .sr(1)
16069 .m(m)
16070 .n(n)
16071 .k(2)
16072 .iterations(1)
16073 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16074 }
16075 }
16076 }
16077
16078 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
16079 TEST_REQUIRES_ARM_NEON_FMA;
16080 for (uint32_t m = 1; m <= 1; m++) {
16081 GemmMicrokernelTester()
16082 .mr(1)
16083 .nr(8)
16084 .kr(1)
16085 .sr(1)
16086 .m(m)
16087 .n(8)
16088 .k(2)
16089 .iterations(1)
16090 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16091 }
16092 }
16093
16094 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
16095 TEST_REQUIRES_ARM_NEON_FMA;
16096 for (uint32_t n = 1; n <= 8; n++) {
16097 GemmMicrokernelTester()
16098 .mr(1)
16099 .nr(8)
16100 .kr(1)
16101 .sr(1)
16102 .m(1)
16103 .n(n)
16104 .k(2)
16105 .iterations(1)
16106 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16107 }
16108 }
16109
16110 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2) {
16111 TEST_REQUIRES_ARM_NEON_FMA;
16112 for (size_t k = 1; k < 2; k++) {
16113 GemmMicrokernelTester()
16114 .mr(1)
16115 .nr(8)
16116 .kr(1)
16117 .sr(1)
16118 .m(1)
16119 .n(8)
16120 .k(k)
16121 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16122 }
16123 }
16124
16125 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
16126 TEST_REQUIRES_ARM_NEON_FMA;
16127 for (size_t k = 1; k < 2; k++) {
16128 GemmMicrokernelTester()
16129 .mr(1)
16130 .nr(8)
16131 .kr(1)
16132 .sr(1)
16133 .m(1)
16134 .n(8)
16135 .k(k)
16136 .a_stride(5)
16137 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16138 }
16139 }
16140
16141 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
16142 TEST_REQUIRES_ARM_NEON_FMA;
16143 for (size_t k = 1; k < 2; k++) {
16144 for (uint32_t m = 1; m <= 1; m++) {
16145 for (uint32_t n = 1; n <= 8; n++) {
16146 GemmMicrokernelTester()
16147 .mr(1)
16148 .nr(8)
16149 .kr(1)
16150 .sr(1)
16151 .m(m)
16152 .n(n)
16153 .k(k)
16154 .iterations(1)
16155 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16156 }
16157 }
16158 }
16159 }
16160
16161 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2) {
16162 TEST_REQUIRES_ARM_NEON_FMA;
16163 for (size_t k = 3; k < 4; k++) {
16164 GemmMicrokernelTester()
16165 .mr(1)
16166 .nr(8)
16167 .kr(1)
16168 .sr(1)
16169 .m(1)
16170 .n(8)
16171 .k(k)
16172 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16173 }
16174 }
16175
16176 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
16177 TEST_REQUIRES_ARM_NEON_FMA;
16178 for (size_t k = 3; k < 4; k++) {
16179 GemmMicrokernelTester()
16180 .mr(1)
16181 .nr(8)
16182 .kr(1)
16183 .sr(1)
16184 .m(1)
16185 .n(8)
16186 .k(k)
16187 .a_stride(7)
16188 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16189 }
16190 }
16191
16192 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
16193 TEST_REQUIRES_ARM_NEON_FMA;
16194 for (size_t k = 3; k < 4; k++) {
16195 for (uint32_t m = 1; m <= 1; m++) {
16196 for (uint32_t n = 1; n <= 8; n++) {
16197 GemmMicrokernelTester()
16198 .mr(1)
16199 .nr(8)
16200 .kr(1)
16201 .sr(1)
16202 .m(m)
16203 .n(n)
16204 .k(k)
16205 .iterations(1)
16206 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16207 }
16208 }
16209 }
16210 }
16211
16212 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2) {
16213 TEST_REQUIRES_ARM_NEON_FMA;
16214 for (size_t k = 4; k <= 20; k += 2) {
16215 GemmMicrokernelTester()
16216 .mr(1)
16217 .nr(8)
16218 .kr(1)
16219 .sr(1)
16220 .m(1)
16221 .n(8)
16222 .k(k)
16223 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16224 }
16225 }
16226
16227 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
16228 TEST_REQUIRES_ARM_NEON_FMA;
16229 for (size_t k = 4; k <= 20; k += 2) {
16230 GemmMicrokernelTester()
16231 .mr(1)
16232 .nr(8)
16233 .kr(1)
16234 .sr(1)
16235 .m(1)
16236 .n(8)
16237 .k(k)
16238 .a_stride(23)
16239 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16240 }
16241 }
16242
16243 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
16244 TEST_REQUIRES_ARM_NEON_FMA;
16245 for (size_t k = 4; k <= 20; k += 2) {
16246 for (uint32_t m = 1; m <= 1; m++) {
16247 for (uint32_t n = 1; n <= 8; n++) {
16248 GemmMicrokernelTester()
16249 .mr(1)
16250 .nr(8)
16251 .kr(1)
16252 .sr(1)
16253 .m(m)
16254 .n(n)
16255 .k(k)
16256 .iterations(1)
16257 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16258 }
16259 }
16260 }
16261 }
16262
16263 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8) {
16264 TEST_REQUIRES_ARM_NEON_FMA;
16265 for (uint32_t n = 9; n < 16; n++) {
16266 for (size_t k = 1; k <= 10; k += 3) {
16267 GemmMicrokernelTester()
16268 .mr(1)
16269 .nr(8)
16270 .kr(1)
16271 .sr(1)
16272 .m(1)
16273 .n(8)
16274 .k(k)
16275 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16276 }
16277 }
16278 }
16279
16280 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
16281 TEST_REQUIRES_ARM_NEON_FMA;
16282 for (uint32_t n = 9; n < 16; n++) {
16283 for (size_t k = 1; k <= 10; k += 3) {
16284 GemmMicrokernelTester()
16285 .mr(1)
16286 .nr(8)
16287 .kr(1)
16288 .sr(1)
16289 .m(1)
16290 .n(8)
16291 .k(k)
16292 .cn_stride(11)
16293 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16294 }
16295 }
16296 }
16297
16298 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
16299 TEST_REQUIRES_ARM_NEON_FMA;
16300 for (uint32_t n = 9; n < 16; n++) {
16301 for (size_t k = 1; k <= 10; k += 3) {
16302 GemmMicrokernelTester()
16303 .mr(1)
16304 .nr(8)
16305 .kr(1)
16306 .sr(1)
16307 .m(1)
16308 .n(n)
16309 .k(k)
16310 .a_stride(13)
16311 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16312 }
16313 }
16314 }
16315
16316 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
16317 TEST_REQUIRES_ARM_NEON_FMA;
16318 for (uint32_t n = 9; n < 16; n++) {
16319 for (size_t k = 1; k <= 10; k += 3) {
16320 for (uint32_t m = 1; m <= 1; m++) {
16321 GemmMicrokernelTester()
16322 .mr(1)
16323 .nr(8)
16324 .kr(1)
16325 .sr(1)
16326 .m(m)
16327 .n(n)
16328 .k(k)
16329 .iterations(1)
16330 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16331 }
16332 }
16333 }
16334 }
16335
16336 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8) {
16337 TEST_REQUIRES_ARM_NEON_FMA;
16338 for (uint32_t n = 16; n <= 24; n += 8) {
16339 for (size_t k = 1; k <= 10; k += 3) {
16340 GemmMicrokernelTester()
16341 .mr(1)
16342 .nr(8)
16343 .kr(1)
16344 .sr(1)
16345 .m(1)
16346 .n(8)
16347 .k(k)
16348 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16349 }
16350 }
16351 }
16352
16353 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
16354 TEST_REQUIRES_ARM_NEON_FMA;
16355 for (uint32_t n = 16; n <= 24; n += 8) {
16356 for (size_t k = 1; k <= 10; k += 3) {
16357 GemmMicrokernelTester()
16358 .mr(1)
16359 .nr(8)
16360 .kr(1)
16361 .sr(1)
16362 .m(1)
16363 .n(n)
16364 .k(k)
16365 .cn_stride(11)
16366 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16367 }
16368 }
16369 }
16370
16371 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
16372 TEST_REQUIRES_ARM_NEON_FMA;
16373 for (uint32_t n = 16; n <= 24; n += 8) {
16374 for (size_t k = 1; k <= 10; k += 3) {
16375 GemmMicrokernelTester()
16376 .mr(1)
16377 .nr(8)
16378 .kr(1)
16379 .sr(1)
16380 .m(1)
16381 .n(n)
16382 .k(k)
16383 .a_stride(13)
16384 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16385 }
16386 }
16387 }
16388
16389 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
16390 TEST_REQUIRES_ARM_NEON_FMA;
16391 for (uint32_t n = 16; n <= 24; n += 8) {
16392 for (size_t k = 1; k <= 10; k += 3) {
16393 for (uint32_t m = 1; m <= 1; m++) {
16394 GemmMicrokernelTester()
16395 .mr(1)
16396 .nr(8)
16397 .kr(1)
16398 .sr(1)
16399 .m(m)
16400 .n(n)
16401 .k(k)
16402 .iterations(1)
16403 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16404 }
16405 }
16406 }
16407 }
16408
16409 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
16410 TEST_REQUIRES_ARM_NEON_FMA;
16411 for (size_t k = 1; k <= 10; k += 3) {
16412 for (uint32_t m = 1; m <= 1; m++) {
16413 for (uint32_t n = 1; n <= 8; n++) {
16414 GemmMicrokernelTester()
16415 .mr(1)
16416 .nr(8)
16417 .kr(1)
16418 .sr(1)
16419 .m(m)
16420 .n(n)
16421 .k(k)
16422 .cm_stride(11)
16423 .iterations(1)
16424 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16425 }
16426 }
16427 }
16428 }
16429
16430 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, qmin) {
16431 TEST_REQUIRES_ARM_NEON_FMA;
16432 GemmMicrokernelTester()
16433 .mr(1)
16434 .nr(8)
16435 .kr(1)
16436 .sr(1)
16437 .m(1)
16438 .n(8)
16439 .k(2)
16440 .qmin(128)
16441 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16442 }
16443
16444 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, qmax) {
16445 TEST_REQUIRES_ARM_NEON_FMA;
16446 GemmMicrokernelTester()
16447 .mr(1)
16448 .nr(8)
16449 .kr(1)
16450 .sr(1)
16451 .m(1)
16452 .n(8)
16453 .k(2)
16454 .qmax(128)
16455 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16456 }
16457
16458 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cm) {
16459 TEST_REQUIRES_ARM_NEON_FMA;
16460 GemmMicrokernelTester()
16461 .mr(1)
16462 .nr(8)
16463 .kr(1)
16464 .sr(1)
16465 .m(1)
16466 .n(8)
16467 .k(2)
16468 .cm_stride(11)
16469 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
16470 }
16471#endif // XNN_ARCH_ARM64
16472
16473
16474#if XNN_ARCH_ARM64
16475 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2) {
16476 TEST_REQUIRES_ARM_NEON_FMA;
16477 GemmMicrokernelTester()
16478 .mr(4)
16479 .nr(8)
16480 .kr(1)
16481 .sr(1)
16482 .m(4)
16483 .n(8)
16484 .k(2)
16485 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16486 }
16487
16488 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cn) {
16489 TEST_REQUIRES_ARM_NEON_FMA;
16490 GemmMicrokernelTester()
16491 .mr(4)
16492 .nr(8)
16493 .kr(1)
16494 .sr(1)
16495 .m(4)
16496 .n(8)
16497 .k(2)
16498 .cn_stride(11)
16499 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16500 }
16501
16502 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
16503 TEST_REQUIRES_ARM_NEON_FMA;
16504 GemmMicrokernelTester()
16505 .mr(4)
16506 .nr(8)
16507 .kr(1)
16508 .sr(1)
16509 .m(4)
16510 .n(8)
16511 .k(2)
16512 .a_stride(5)
16513 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16514 }
16515
16516 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
16517 TEST_REQUIRES_ARM_NEON_FMA;
16518 for (uint32_t m = 1; m <= 4; m++) {
16519 for (uint32_t n = 1; n <= 8; n++) {
16520 GemmMicrokernelTester()
16521 .mr(4)
16522 .nr(8)
16523 .kr(1)
16524 .sr(1)
16525 .m(m)
16526 .n(n)
16527 .k(2)
16528 .iterations(1)
16529 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16530 }
16531 }
16532 }
16533
16534 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
16535 TEST_REQUIRES_ARM_NEON_FMA;
16536 for (uint32_t m = 1; m <= 4; m++) {
16537 GemmMicrokernelTester()
16538 .mr(4)
16539 .nr(8)
16540 .kr(1)
16541 .sr(1)
16542 .m(m)
16543 .n(8)
16544 .k(2)
16545 .iterations(1)
16546 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16547 }
16548 }
16549
16550 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
16551 TEST_REQUIRES_ARM_NEON_FMA;
16552 for (uint32_t n = 1; n <= 8; n++) {
16553 GemmMicrokernelTester()
16554 .mr(4)
16555 .nr(8)
16556 .kr(1)
16557 .sr(1)
16558 .m(4)
16559 .n(n)
16560 .k(2)
16561 .iterations(1)
16562 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16563 }
16564 }
16565
16566 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2) {
16567 TEST_REQUIRES_ARM_NEON_FMA;
16568 for (size_t k = 1; k < 2; k++) {
16569 GemmMicrokernelTester()
16570 .mr(4)
16571 .nr(8)
16572 .kr(1)
16573 .sr(1)
16574 .m(4)
16575 .n(8)
16576 .k(k)
16577 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16578 }
16579 }
16580
16581 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
16582 TEST_REQUIRES_ARM_NEON_FMA;
16583 for (size_t k = 1; k < 2; k++) {
16584 GemmMicrokernelTester()
16585 .mr(4)
16586 .nr(8)
16587 .kr(1)
16588 .sr(1)
16589 .m(4)
16590 .n(8)
16591 .k(k)
16592 .a_stride(5)
16593 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16594 }
16595 }
16596
16597 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
16598 TEST_REQUIRES_ARM_NEON_FMA;
16599 for (size_t k = 1; k < 2; k++) {
16600 for (uint32_t m = 1; m <= 4; m++) {
16601 for (uint32_t n = 1; n <= 8; n++) {
16602 GemmMicrokernelTester()
16603 .mr(4)
16604 .nr(8)
16605 .kr(1)
16606 .sr(1)
16607 .m(m)
16608 .n(n)
16609 .k(k)
16610 .iterations(1)
16611 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16612 }
16613 }
16614 }
16615 }
16616
16617 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2) {
16618 TEST_REQUIRES_ARM_NEON_FMA;
16619 for (size_t k = 3; k < 4; k++) {
16620 GemmMicrokernelTester()
16621 .mr(4)
16622 .nr(8)
16623 .kr(1)
16624 .sr(1)
16625 .m(4)
16626 .n(8)
16627 .k(k)
16628 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16629 }
16630 }
16631
16632 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
16633 TEST_REQUIRES_ARM_NEON_FMA;
16634 for (size_t k = 3; k < 4; k++) {
16635 GemmMicrokernelTester()
16636 .mr(4)
16637 .nr(8)
16638 .kr(1)
16639 .sr(1)
16640 .m(4)
16641 .n(8)
16642 .k(k)
16643 .a_stride(7)
16644 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16645 }
16646 }
16647
16648 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
16649 TEST_REQUIRES_ARM_NEON_FMA;
16650 for (size_t k = 3; k < 4; k++) {
16651 for (uint32_t m = 1; m <= 4; m++) {
16652 for (uint32_t n = 1; n <= 8; n++) {
16653 GemmMicrokernelTester()
16654 .mr(4)
16655 .nr(8)
16656 .kr(1)
16657 .sr(1)
16658 .m(m)
16659 .n(n)
16660 .k(k)
16661 .iterations(1)
16662 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16663 }
16664 }
16665 }
16666 }
16667
16668 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2) {
16669 TEST_REQUIRES_ARM_NEON_FMA;
16670 for (size_t k = 4; k <= 20; k += 2) {
16671 GemmMicrokernelTester()
16672 .mr(4)
16673 .nr(8)
16674 .kr(1)
16675 .sr(1)
16676 .m(4)
16677 .n(8)
16678 .k(k)
16679 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16680 }
16681 }
16682
16683 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
16684 TEST_REQUIRES_ARM_NEON_FMA;
16685 for (size_t k = 4; k <= 20; k += 2) {
16686 GemmMicrokernelTester()
16687 .mr(4)
16688 .nr(8)
16689 .kr(1)
16690 .sr(1)
16691 .m(4)
16692 .n(8)
16693 .k(k)
16694 .a_stride(23)
16695 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16696 }
16697 }
16698
16699 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
16700 TEST_REQUIRES_ARM_NEON_FMA;
16701 for (size_t k = 4; k <= 20; k += 2) {
16702 for (uint32_t m = 1; m <= 4; m++) {
16703 for (uint32_t n = 1; n <= 8; n++) {
16704 GemmMicrokernelTester()
16705 .mr(4)
16706 .nr(8)
16707 .kr(1)
16708 .sr(1)
16709 .m(m)
16710 .n(n)
16711 .k(k)
16712 .iterations(1)
16713 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16714 }
16715 }
16716 }
16717 }
16718
16719 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8) {
16720 TEST_REQUIRES_ARM_NEON_FMA;
16721 for (uint32_t n = 9; n < 16; n++) {
16722 for (size_t k = 1; k <= 10; k += 3) {
16723 GemmMicrokernelTester()
16724 .mr(4)
16725 .nr(8)
16726 .kr(1)
16727 .sr(1)
16728 .m(4)
16729 .n(8)
16730 .k(k)
16731 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16732 }
16733 }
16734 }
16735
16736 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
16737 TEST_REQUIRES_ARM_NEON_FMA;
16738 for (uint32_t n = 9; n < 16; n++) {
16739 for (size_t k = 1; k <= 10; k += 3) {
16740 GemmMicrokernelTester()
16741 .mr(4)
16742 .nr(8)
16743 .kr(1)
16744 .sr(1)
16745 .m(4)
16746 .n(8)
16747 .k(k)
16748 .cn_stride(11)
16749 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16750 }
16751 }
16752 }
16753
16754 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
16755 TEST_REQUIRES_ARM_NEON_FMA;
16756 for (uint32_t n = 9; n < 16; n++) {
16757 for (size_t k = 1; k <= 10; k += 3) {
16758 GemmMicrokernelTester()
16759 .mr(4)
16760 .nr(8)
16761 .kr(1)
16762 .sr(1)
16763 .m(4)
16764 .n(n)
16765 .k(k)
16766 .a_stride(13)
16767 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16768 }
16769 }
16770 }
16771
16772 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
16773 TEST_REQUIRES_ARM_NEON_FMA;
16774 for (uint32_t n = 9; n < 16; n++) {
16775 for (size_t k = 1; k <= 10; k += 3) {
16776 for (uint32_t m = 1; m <= 4; m++) {
16777 GemmMicrokernelTester()
16778 .mr(4)
16779 .nr(8)
16780 .kr(1)
16781 .sr(1)
16782 .m(m)
16783 .n(n)
16784 .k(k)
16785 .iterations(1)
16786 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16787 }
16788 }
16789 }
16790 }
16791
16792 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8) {
16793 TEST_REQUIRES_ARM_NEON_FMA;
16794 for (uint32_t n = 16; n <= 24; n += 8) {
16795 for (size_t k = 1; k <= 10; k += 3) {
16796 GemmMicrokernelTester()
16797 .mr(4)
16798 .nr(8)
16799 .kr(1)
16800 .sr(1)
16801 .m(4)
16802 .n(8)
16803 .k(k)
16804 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16805 }
16806 }
16807 }
16808
16809 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
16810 TEST_REQUIRES_ARM_NEON_FMA;
16811 for (uint32_t n = 16; n <= 24; n += 8) {
16812 for (size_t k = 1; k <= 10; k += 3) {
16813 GemmMicrokernelTester()
16814 .mr(4)
16815 .nr(8)
16816 .kr(1)
16817 .sr(1)
16818 .m(4)
16819 .n(n)
16820 .k(k)
16821 .cn_stride(11)
16822 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16823 }
16824 }
16825 }
16826
16827 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
16828 TEST_REQUIRES_ARM_NEON_FMA;
16829 for (uint32_t n = 16; n <= 24; n += 8) {
16830 for (size_t k = 1; k <= 10; k += 3) {
16831 GemmMicrokernelTester()
16832 .mr(4)
16833 .nr(8)
16834 .kr(1)
16835 .sr(1)
16836 .m(4)
16837 .n(n)
16838 .k(k)
16839 .a_stride(13)
16840 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16841 }
16842 }
16843 }
16844
16845 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
16846 TEST_REQUIRES_ARM_NEON_FMA;
16847 for (uint32_t n = 16; n <= 24; n += 8) {
16848 for (size_t k = 1; k <= 10; k += 3) {
16849 for (uint32_t m = 1; m <= 4; m++) {
16850 GemmMicrokernelTester()
16851 .mr(4)
16852 .nr(8)
16853 .kr(1)
16854 .sr(1)
16855 .m(m)
16856 .n(n)
16857 .k(k)
16858 .iterations(1)
16859 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16860 }
16861 }
16862 }
16863 }
16864
16865 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
16866 TEST_REQUIRES_ARM_NEON_FMA;
16867 for (size_t k = 1; k <= 10; k += 3) {
16868 for (uint32_t m = 1; m <= 4; m++) {
16869 for (uint32_t n = 1; n <= 8; n++) {
16870 GemmMicrokernelTester()
16871 .mr(4)
16872 .nr(8)
16873 .kr(1)
16874 .sr(1)
16875 .m(m)
16876 .n(n)
16877 .k(k)
16878 .cm_stride(11)
16879 .iterations(1)
16880 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16881 }
16882 }
16883 }
16884 }
16885
16886 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, qmin) {
16887 TEST_REQUIRES_ARM_NEON_FMA;
16888 GemmMicrokernelTester()
16889 .mr(4)
16890 .nr(8)
16891 .kr(1)
16892 .sr(1)
16893 .m(4)
16894 .n(8)
16895 .k(2)
16896 .qmin(128)
16897 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16898 }
16899
16900 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, qmax) {
16901 TEST_REQUIRES_ARM_NEON_FMA;
16902 GemmMicrokernelTester()
16903 .mr(4)
16904 .nr(8)
16905 .kr(1)
16906 .sr(1)
16907 .m(4)
16908 .n(8)
16909 .k(2)
16910 .qmax(128)
16911 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16912 }
16913
16914 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cm) {
16915 TEST_REQUIRES_ARM_NEON_FMA;
16916 GemmMicrokernelTester()
16917 .mr(4)
16918 .nr(8)
16919 .kr(1)
16920 .sr(1)
16921 .m(4)
16922 .n(8)
16923 .k(2)
16924 .cm_stride(11)
16925 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
16926 }
16927#endif // XNN_ARCH_ARM64
16928
16929
16930#if XNN_ARCH_ARM64
16931 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4) {
16932 TEST_REQUIRES_ARM_NEON_FMA;
16933 GemmMicrokernelTester()
16934 .mr(4)
16935 .nr(8)
16936 .kr(1)
16937 .sr(1)
16938 .m(4)
16939 .n(8)
16940 .k(4)
16941 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
16942 }
16943
16944 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cn) {
16945 TEST_REQUIRES_ARM_NEON_FMA;
16946 GemmMicrokernelTester()
16947 .mr(4)
16948 .nr(8)
16949 .kr(1)
16950 .sr(1)
16951 .m(4)
16952 .n(8)
16953 .k(4)
16954 .cn_stride(11)
16955 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
16956 }
16957
16958 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
16959 TEST_REQUIRES_ARM_NEON_FMA;
16960 GemmMicrokernelTester()
16961 .mr(4)
16962 .nr(8)
16963 .kr(1)
16964 .sr(1)
16965 .m(4)
16966 .n(8)
16967 .k(4)
16968 .a_stride(7)
16969 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
16970 }
16971
16972 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
16973 TEST_REQUIRES_ARM_NEON_FMA;
16974 for (uint32_t m = 1; m <= 4; m++) {
16975 for (uint32_t n = 1; n <= 8; n++) {
16976 GemmMicrokernelTester()
16977 .mr(4)
16978 .nr(8)
16979 .kr(1)
16980 .sr(1)
16981 .m(m)
16982 .n(n)
16983 .k(4)
16984 .iterations(1)
16985 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
16986 }
16987 }
16988 }
16989
16990 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
16991 TEST_REQUIRES_ARM_NEON_FMA;
16992 for (uint32_t m = 1; m <= 4; m++) {
16993 GemmMicrokernelTester()
16994 .mr(4)
16995 .nr(8)
16996 .kr(1)
16997 .sr(1)
16998 .m(m)
16999 .n(8)
17000 .k(4)
17001 .iterations(1)
17002 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17003 }
17004 }
17005
17006 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
17007 TEST_REQUIRES_ARM_NEON_FMA;
17008 for (uint32_t n = 1; n <= 8; n++) {
17009 GemmMicrokernelTester()
17010 .mr(4)
17011 .nr(8)
17012 .kr(1)
17013 .sr(1)
17014 .m(4)
17015 .n(n)
17016 .k(4)
17017 .iterations(1)
17018 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17019 }
17020 }
17021
17022 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4) {
17023 TEST_REQUIRES_ARM_NEON_FMA;
17024 for (size_t k = 1; k < 4; k++) {
17025 GemmMicrokernelTester()
17026 .mr(4)
17027 .nr(8)
17028 .kr(1)
17029 .sr(1)
17030 .m(4)
17031 .n(8)
17032 .k(k)
17033 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17034 }
17035 }
17036
17037 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
17038 TEST_REQUIRES_ARM_NEON_FMA;
17039 for (size_t k = 1; k < 4; k++) {
17040 GemmMicrokernelTester()
17041 .mr(4)
17042 .nr(8)
17043 .kr(1)
17044 .sr(1)
17045 .m(4)
17046 .n(8)
17047 .k(k)
17048 .a_stride(7)
17049 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17050 }
17051 }
17052
17053 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
17054 TEST_REQUIRES_ARM_NEON_FMA;
17055 for (size_t k = 1; k < 4; k++) {
17056 for (uint32_t m = 1; m <= 4; m++) {
17057 for (uint32_t n = 1; n <= 8; n++) {
17058 GemmMicrokernelTester()
17059 .mr(4)
17060 .nr(8)
17061 .kr(1)
17062 .sr(1)
17063 .m(m)
17064 .n(n)
17065 .k(k)
17066 .iterations(1)
17067 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17068 }
17069 }
17070 }
17071 }
17072
17073 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4) {
17074 TEST_REQUIRES_ARM_NEON_FMA;
17075 for (size_t k = 5; k < 8; k++) {
17076 GemmMicrokernelTester()
17077 .mr(4)
17078 .nr(8)
17079 .kr(1)
17080 .sr(1)
17081 .m(4)
17082 .n(8)
17083 .k(k)
17084 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17085 }
17086 }
17087
17088 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
17089 TEST_REQUIRES_ARM_NEON_FMA;
17090 for (size_t k = 5; k < 8; k++) {
17091 GemmMicrokernelTester()
17092 .mr(4)
17093 .nr(8)
17094 .kr(1)
17095 .sr(1)
17096 .m(4)
17097 .n(8)
17098 .k(k)
17099 .a_stride(11)
17100 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17101 }
17102 }
17103
17104 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
17105 TEST_REQUIRES_ARM_NEON_FMA;
17106 for (size_t k = 5; k < 8; k++) {
17107 for (uint32_t m = 1; m <= 4; m++) {
17108 for (uint32_t n = 1; n <= 8; n++) {
17109 GemmMicrokernelTester()
17110 .mr(4)
17111 .nr(8)
17112 .kr(1)
17113 .sr(1)
17114 .m(m)
17115 .n(n)
17116 .k(k)
17117 .iterations(1)
17118 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17119 }
17120 }
17121 }
17122 }
17123
17124 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4) {
17125 TEST_REQUIRES_ARM_NEON_FMA;
17126 for (size_t k = 8; k <= 40; k += 4) {
17127 GemmMicrokernelTester()
17128 .mr(4)
17129 .nr(8)
17130 .kr(1)
17131 .sr(1)
17132 .m(4)
17133 .n(8)
17134 .k(k)
17135 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17136 }
17137 }
17138
17139 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
17140 TEST_REQUIRES_ARM_NEON_FMA;
17141 for (size_t k = 8; k <= 40; k += 4) {
17142 GemmMicrokernelTester()
17143 .mr(4)
17144 .nr(8)
17145 .kr(1)
17146 .sr(1)
17147 .m(4)
17148 .n(8)
17149 .k(k)
17150 .a_stride(43)
17151 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17152 }
17153 }
17154
17155 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
17156 TEST_REQUIRES_ARM_NEON_FMA;
17157 for (size_t k = 8; k <= 40; k += 4) {
17158 for (uint32_t m = 1; m <= 4; m++) {
17159 for (uint32_t n = 1; n <= 8; n++) {
17160 GemmMicrokernelTester()
17161 .mr(4)
17162 .nr(8)
17163 .kr(1)
17164 .sr(1)
17165 .m(m)
17166 .n(n)
17167 .k(k)
17168 .iterations(1)
17169 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17170 }
17171 }
17172 }
17173 }
17174
17175 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8) {
17176 TEST_REQUIRES_ARM_NEON_FMA;
17177 for (uint32_t n = 9; n < 16; n++) {
17178 for (size_t k = 1; k <= 20; k += 5) {
17179 GemmMicrokernelTester()
17180 .mr(4)
17181 .nr(8)
17182 .kr(1)
17183 .sr(1)
17184 .m(4)
17185 .n(8)
17186 .k(k)
17187 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17188 }
17189 }
17190 }
17191
17192 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
17193 TEST_REQUIRES_ARM_NEON_FMA;
17194 for (uint32_t n = 9; n < 16; n++) {
17195 for (size_t k = 1; k <= 20; k += 5) {
17196 GemmMicrokernelTester()
17197 .mr(4)
17198 .nr(8)
17199 .kr(1)
17200 .sr(1)
17201 .m(4)
17202 .n(8)
17203 .k(k)
17204 .cn_stride(11)
17205 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17206 }
17207 }
17208 }
17209
17210 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
17211 TEST_REQUIRES_ARM_NEON_FMA;
17212 for (uint32_t n = 9; n < 16; n++) {
17213 for (size_t k = 1; k <= 20; k += 5) {
17214 GemmMicrokernelTester()
17215 .mr(4)
17216 .nr(8)
17217 .kr(1)
17218 .sr(1)
17219 .m(4)
17220 .n(n)
17221 .k(k)
17222 .a_stride(23)
17223 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17224 }
17225 }
17226 }
17227
17228 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
17229 TEST_REQUIRES_ARM_NEON_FMA;
17230 for (uint32_t n = 9; n < 16; n++) {
17231 for (size_t k = 1; k <= 20; k += 5) {
17232 for (uint32_t m = 1; m <= 4; m++) {
17233 GemmMicrokernelTester()
17234 .mr(4)
17235 .nr(8)
17236 .kr(1)
17237 .sr(1)
17238 .m(m)
17239 .n(n)
17240 .k(k)
17241 .iterations(1)
17242 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17243 }
17244 }
17245 }
17246 }
17247
17248 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8) {
17249 TEST_REQUIRES_ARM_NEON_FMA;
17250 for (uint32_t n = 16; n <= 24; n += 8) {
17251 for (size_t k = 1; k <= 20; k += 5) {
17252 GemmMicrokernelTester()
17253 .mr(4)
17254 .nr(8)
17255 .kr(1)
17256 .sr(1)
17257 .m(4)
17258 .n(8)
17259 .k(k)
17260 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17261 }
17262 }
17263 }
17264
17265 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
17266 TEST_REQUIRES_ARM_NEON_FMA;
17267 for (uint32_t n = 16; n <= 24; n += 8) {
17268 for (size_t k = 1; k <= 20; k += 5) {
17269 GemmMicrokernelTester()
17270 .mr(4)
17271 .nr(8)
17272 .kr(1)
17273 .sr(1)
17274 .m(4)
17275 .n(n)
17276 .k(k)
17277 .cn_stride(11)
17278 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17279 }
17280 }
17281 }
17282
17283 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
17284 TEST_REQUIRES_ARM_NEON_FMA;
17285 for (uint32_t n = 16; n <= 24; n += 8) {
17286 for (size_t k = 1; k <= 20; k += 5) {
17287 GemmMicrokernelTester()
17288 .mr(4)
17289 .nr(8)
17290 .kr(1)
17291 .sr(1)
17292 .m(4)
17293 .n(n)
17294 .k(k)
17295 .a_stride(23)
17296 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17297 }
17298 }
17299 }
17300
17301 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
17302 TEST_REQUIRES_ARM_NEON_FMA;
17303 for (uint32_t n = 16; n <= 24; n += 8) {
17304 for (size_t k = 1; k <= 20; k += 5) {
17305 for (uint32_t m = 1; m <= 4; m++) {
17306 GemmMicrokernelTester()
17307 .mr(4)
17308 .nr(8)
17309 .kr(1)
17310 .sr(1)
17311 .m(m)
17312 .n(n)
17313 .k(k)
17314 .iterations(1)
17315 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17316 }
17317 }
17318 }
17319 }
17320
17321 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
17322 TEST_REQUIRES_ARM_NEON_FMA;
17323 for (size_t k = 1; k <= 20; k += 5) {
17324 for (uint32_t m = 1; m <= 4; m++) {
17325 for (uint32_t n = 1; n <= 8; n++) {
17326 GemmMicrokernelTester()
17327 .mr(4)
17328 .nr(8)
17329 .kr(1)
17330 .sr(1)
17331 .m(m)
17332 .n(n)
17333 .k(k)
17334 .cm_stride(11)
17335 .iterations(1)
17336 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17337 }
17338 }
17339 }
17340 }
17341
17342 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, qmin) {
17343 TEST_REQUIRES_ARM_NEON_FMA;
17344 GemmMicrokernelTester()
17345 .mr(4)
17346 .nr(8)
17347 .kr(1)
17348 .sr(1)
17349 .m(4)
17350 .n(8)
17351 .k(4)
17352 .qmin(128)
17353 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17354 }
17355
17356 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, qmax) {
17357 TEST_REQUIRES_ARM_NEON_FMA;
17358 GemmMicrokernelTester()
17359 .mr(4)
17360 .nr(8)
17361 .kr(1)
17362 .sr(1)
17363 .m(4)
17364 .n(8)
17365 .k(4)
17366 .qmax(128)
17367 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17368 }
17369
17370 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cm) {
17371 TEST_REQUIRES_ARM_NEON_FMA;
17372 GemmMicrokernelTester()
17373 .mr(4)
17374 .nr(8)
17375 .kr(1)
17376 .sr(1)
17377 .m(4)
17378 .n(8)
17379 .k(4)
17380 .cm_stride(11)
17381 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
17382 }
17383#endif // XNN_ARCH_ARM64
17384
17385
17386#if XNN_ARCH_ARM64
17387 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2) {
17388 TEST_REQUIRES_ARM_NEON_FMA;
17389 GemmMicrokernelTester()
17390 .mr(5)
17391 .nr(8)
17392 .kr(1)
17393 .sr(1)
17394 .m(5)
17395 .n(8)
17396 .k(2)
17397 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17398 }
17399
17400 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cn) {
17401 TEST_REQUIRES_ARM_NEON_FMA;
17402 GemmMicrokernelTester()
17403 .mr(5)
17404 .nr(8)
17405 .kr(1)
17406 .sr(1)
17407 .m(5)
17408 .n(8)
17409 .k(2)
17410 .cn_stride(11)
17411 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17412 }
17413
17414 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
17415 TEST_REQUIRES_ARM_NEON_FMA;
17416 GemmMicrokernelTester()
17417 .mr(5)
17418 .nr(8)
17419 .kr(1)
17420 .sr(1)
17421 .m(5)
17422 .n(8)
17423 .k(2)
17424 .a_stride(5)
17425 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17426 }
17427
17428 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
17429 TEST_REQUIRES_ARM_NEON_FMA;
17430 for (uint32_t m = 1; m <= 5; m++) {
17431 for (uint32_t n = 1; n <= 8; n++) {
17432 GemmMicrokernelTester()
17433 .mr(5)
17434 .nr(8)
17435 .kr(1)
17436 .sr(1)
17437 .m(m)
17438 .n(n)
17439 .k(2)
17440 .iterations(1)
17441 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17442 }
17443 }
17444 }
17445
17446 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
17447 TEST_REQUIRES_ARM_NEON_FMA;
17448 for (uint32_t m = 1; m <= 5; m++) {
17449 GemmMicrokernelTester()
17450 .mr(5)
17451 .nr(8)
17452 .kr(1)
17453 .sr(1)
17454 .m(m)
17455 .n(8)
17456 .k(2)
17457 .iterations(1)
17458 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17459 }
17460 }
17461
17462 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
17463 TEST_REQUIRES_ARM_NEON_FMA;
17464 for (uint32_t n = 1; n <= 8; n++) {
17465 GemmMicrokernelTester()
17466 .mr(5)
17467 .nr(8)
17468 .kr(1)
17469 .sr(1)
17470 .m(5)
17471 .n(n)
17472 .k(2)
17473 .iterations(1)
17474 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17475 }
17476 }
17477
17478 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2) {
17479 TEST_REQUIRES_ARM_NEON_FMA;
17480 for (size_t k = 1; k < 2; k++) {
17481 GemmMicrokernelTester()
17482 .mr(5)
17483 .nr(8)
17484 .kr(1)
17485 .sr(1)
17486 .m(5)
17487 .n(8)
17488 .k(k)
17489 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17490 }
17491 }
17492
17493 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
17494 TEST_REQUIRES_ARM_NEON_FMA;
17495 for (size_t k = 1; k < 2; k++) {
17496 GemmMicrokernelTester()
17497 .mr(5)
17498 .nr(8)
17499 .kr(1)
17500 .sr(1)
17501 .m(5)
17502 .n(8)
17503 .k(k)
17504 .a_stride(5)
17505 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17506 }
17507 }
17508
17509 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
17510 TEST_REQUIRES_ARM_NEON_FMA;
17511 for (size_t k = 1; k < 2; k++) {
17512 for (uint32_t m = 1; m <= 5; m++) {
17513 for (uint32_t n = 1; n <= 8; n++) {
17514 GemmMicrokernelTester()
17515 .mr(5)
17516 .nr(8)
17517 .kr(1)
17518 .sr(1)
17519 .m(m)
17520 .n(n)
17521 .k(k)
17522 .iterations(1)
17523 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17524 }
17525 }
17526 }
17527 }
17528
17529 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2) {
17530 TEST_REQUIRES_ARM_NEON_FMA;
17531 for (size_t k = 3; k < 4; k++) {
17532 GemmMicrokernelTester()
17533 .mr(5)
17534 .nr(8)
17535 .kr(1)
17536 .sr(1)
17537 .m(5)
17538 .n(8)
17539 .k(k)
17540 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17541 }
17542 }
17543
17544 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
17545 TEST_REQUIRES_ARM_NEON_FMA;
17546 for (size_t k = 3; k < 4; k++) {
17547 GemmMicrokernelTester()
17548 .mr(5)
17549 .nr(8)
17550 .kr(1)
17551 .sr(1)
17552 .m(5)
17553 .n(8)
17554 .k(k)
17555 .a_stride(7)
17556 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17557 }
17558 }
17559
17560 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
17561 TEST_REQUIRES_ARM_NEON_FMA;
17562 for (size_t k = 3; k < 4; k++) {
17563 for (uint32_t m = 1; m <= 5; m++) {
17564 for (uint32_t n = 1; n <= 8; n++) {
17565 GemmMicrokernelTester()
17566 .mr(5)
17567 .nr(8)
17568 .kr(1)
17569 .sr(1)
17570 .m(m)
17571 .n(n)
17572 .k(k)
17573 .iterations(1)
17574 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17575 }
17576 }
17577 }
17578 }
17579
17580 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2) {
17581 TEST_REQUIRES_ARM_NEON_FMA;
17582 for (size_t k = 4; k <= 20; k += 2) {
17583 GemmMicrokernelTester()
17584 .mr(5)
17585 .nr(8)
17586 .kr(1)
17587 .sr(1)
17588 .m(5)
17589 .n(8)
17590 .k(k)
17591 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17592 }
17593 }
17594
17595 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
17596 TEST_REQUIRES_ARM_NEON_FMA;
17597 for (size_t k = 4; k <= 20; k += 2) {
17598 GemmMicrokernelTester()
17599 .mr(5)
17600 .nr(8)
17601 .kr(1)
17602 .sr(1)
17603 .m(5)
17604 .n(8)
17605 .k(k)
17606 .a_stride(23)
17607 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17608 }
17609 }
17610
17611 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
17612 TEST_REQUIRES_ARM_NEON_FMA;
17613 for (size_t k = 4; k <= 20; k += 2) {
17614 for (uint32_t m = 1; m <= 5; m++) {
17615 for (uint32_t n = 1; n <= 8; n++) {
17616 GemmMicrokernelTester()
17617 .mr(5)
17618 .nr(8)
17619 .kr(1)
17620 .sr(1)
17621 .m(m)
17622 .n(n)
17623 .k(k)
17624 .iterations(1)
17625 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17626 }
17627 }
17628 }
17629 }
17630
17631 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8) {
17632 TEST_REQUIRES_ARM_NEON_FMA;
17633 for (uint32_t n = 9; n < 16; n++) {
17634 for (size_t k = 1; k <= 10; k += 3) {
17635 GemmMicrokernelTester()
17636 .mr(5)
17637 .nr(8)
17638 .kr(1)
17639 .sr(1)
17640 .m(5)
17641 .n(8)
17642 .k(k)
17643 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17644 }
17645 }
17646 }
17647
17648 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
17649 TEST_REQUIRES_ARM_NEON_FMA;
17650 for (uint32_t n = 9; n < 16; n++) {
17651 for (size_t k = 1; k <= 10; k += 3) {
17652 GemmMicrokernelTester()
17653 .mr(5)
17654 .nr(8)
17655 .kr(1)
17656 .sr(1)
17657 .m(5)
17658 .n(8)
17659 .k(k)
17660 .cn_stride(11)
17661 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17662 }
17663 }
17664 }
17665
17666 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
17667 TEST_REQUIRES_ARM_NEON_FMA;
17668 for (uint32_t n = 9; n < 16; n++) {
17669 for (size_t k = 1; k <= 10; k += 3) {
17670 GemmMicrokernelTester()
17671 .mr(5)
17672 .nr(8)
17673 .kr(1)
17674 .sr(1)
17675 .m(5)
17676 .n(n)
17677 .k(k)
17678 .a_stride(13)
17679 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17680 }
17681 }
17682 }
17683
17684 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
17685 TEST_REQUIRES_ARM_NEON_FMA;
17686 for (uint32_t n = 9; n < 16; n++) {
17687 for (size_t k = 1; k <= 10; k += 3) {
17688 for (uint32_t m = 1; m <= 5; m++) {
17689 GemmMicrokernelTester()
17690 .mr(5)
17691 .nr(8)
17692 .kr(1)
17693 .sr(1)
17694 .m(m)
17695 .n(n)
17696 .k(k)
17697 .iterations(1)
17698 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17699 }
17700 }
17701 }
17702 }
17703
17704 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8) {
17705 TEST_REQUIRES_ARM_NEON_FMA;
17706 for (uint32_t n = 16; n <= 24; n += 8) {
17707 for (size_t k = 1; k <= 10; k += 3) {
17708 GemmMicrokernelTester()
17709 .mr(5)
17710 .nr(8)
17711 .kr(1)
17712 .sr(1)
17713 .m(5)
17714 .n(8)
17715 .k(k)
17716 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17717 }
17718 }
17719 }
17720
17721 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
17722 TEST_REQUIRES_ARM_NEON_FMA;
17723 for (uint32_t n = 16; n <= 24; n += 8) {
17724 for (size_t k = 1; k <= 10; k += 3) {
17725 GemmMicrokernelTester()
17726 .mr(5)
17727 .nr(8)
17728 .kr(1)
17729 .sr(1)
17730 .m(5)
17731 .n(n)
17732 .k(k)
17733 .cn_stride(11)
17734 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17735 }
17736 }
17737 }
17738
17739 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
17740 TEST_REQUIRES_ARM_NEON_FMA;
17741 for (uint32_t n = 16; n <= 24; n += 8) {
17742 for (size_t k = 1; k <= 10; k += 3) {
17743 GemmMicrokernelTester()
17744 .mr(5)
17745 .nr(8)
17746 .kr(1)
17747 .sr(1)
17748 .m(5)
17749 .n(n)
17750 .k(k)
17751 .a_stride(13)
17752 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17753 }
17754 }
17755 }
17756
17757 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
17758 TEST_REQUIRES_ARM_NEON_FMA;
17759 for (uint32_t n = 16; n <= 24; n += 8) {
17760 for (size_t k = 1; k <= 10; k += 3) {
17761 for (uint32_t m = 1; m <= 5; m++) {
17762 GemmMicrokernelTester()
17763 .mr(5)
17764 .nr(8)
17765 .kr(1)
17766 .sr(1)
17767 .m(m)
17768 .n(n)
17769 .k(k)
17770 .iterations(1)
17771 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17772 }
17773 }
17774 }
17775 }
17776
17777 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
17778 TEST_REQUIRES_ARM_NEON_FMA;
17779 for (size_t k = 1; k <= 10; k += 3) {
17780 for (uint32_t m = 1; m <= 5; m++) {
17781 for (uint32_t n = 1; n <= 8; n++) {
17782 GemmMicrokernelTester()
17783 .mr(5)
17784 .nr(8)
17785 .kr(1)
17786 .sr(1)
17787 .m(m)
17788 .n(n)
17789 .k(k)
17790 .cm_stride(11)
17791 .iterations(1)
17792 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17793 }
17794 }
17795 }
17796 }
17797
17798 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, qmin) {
17799 TEST_REQUIRES_ARM_NEON_FMA;
17800 GemmMicrokernelTester()
17801 .mr(5)
17802 .nr(8)
17803 .kr(1)
17804 .sr(1)
17805 .m(5)
17806 .n(8)
17807 .k(2)
17808 .qmin(128)
17809 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17810 }
17811
17812 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, qmax) {
17813 TEST_REQUIRES_ARM_NEON_FMA;
17814 GemmMicrokernelTester()
17815 .mr(5)
17816 .nr(8)
17817 .kr(1)
17818 .sr(1)
17819 .m(5)
17820 .n(8)
17821 .k(2)
17822 .qmax(128)
17823 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17824 }
17825
17826 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cm) {
17827 TEST_REQUIRES_ARM_NEON_FMA;
17828 GemmMicrokernelTester()
17829 .mr(5)
17830 .nr(8)
17831 .kr(1)
17832 .sr(1)
17833 .m(5)
17834 .n(8)
17835 .k(2)
17836 .cm_stride(11)
17837 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
17838 }
17839#endif // XNN_ARCH_ARM64
17840
17841
17842#if XNN_ARCH_ARM64
17843 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2) {
17844 TEST_REQUIRES_ARM_NEON_FMA;
17845 GemmMicrokernelTester()
17846 .mr(6)
17847 .nr(8)
17848 .kr(1)
17849 .sr(1)
17850 .m(6)
17851 .n(8)
17852 .k(2)
17853 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17854 }
17855
17856 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cn) {
17857 TEST_REQUIRES_ARM_NEON_FMA;
17858 GemmMicrokernelTester()
17859 .mr(6)
17860 .nr(8)
17861 .kr(1)
17862 .sr(1)
17863 .m(6)
17864 .n(8)
17865 .k(2)
17866 .cn_stride(11)
17867 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17868 }
17869
17870 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
17871 TEST_REQUIRES_ARM_NEON_FMA;
17872 GemmMicrokernelTester()
17873 .mr(6)
17874 .nr(8)
17875 .kr(1)
17876 .sr(1)
17877 .m(6)
17878 .n(8)
17879 .k(2)
17880 .a_stride(5)
17881 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17882 }
17883
17884 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
17885 TEST_REQUIRES_ARM_NEON_FMA;
17886 for (uint32_t m = 1; m <= 6; m++) {
17887 for (uint32_t n = 1; n <= 8; n++) {
17888 GemmMicrokernelTester()
17889 .mr(6)
17890 .nr(8)
17891 .kr(1)
17892 .sr(1)
17893 .m(m)
17894 .n(n)
17895 .k(2)
17896 .iterations(1)
17897 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17898 }
17899 }
17900 }
17901
17902 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
17903 TEST_REQUIRES_ARM_NEON_FMA;
17904 for (uint32_t m = 1; m <= 6; m++) {
17905 GemmMicrokernelTester()
17906 .mr(6)
17907 .nr(8)
17908 .kr(1)
17909 .sr(1)
17910 .m(m)
17911 .n(8)
17912 .k(2)
17913 .iterations(1)
17914 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17915 }
17916 }
17917
17918 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
17919 TEST_REQUIRES_ARM_NEON_FMA;
17920 for (uint32_t n = 1; n <= 8; n++) {
17921 GemmMicrokernelTester()
17922 .mr(6)
17923 .nr(8)
17924 .kr(1)
17925 .sr(1)
17926 .m(6)
17927 .n(n)
17928 .k(2)
17929 .iterations(1)
17930 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17931 }
17932 }
17933
17934 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2) {
17935 TEST_REQUIRES_ARM_NEON_FMA;
17936 for (size_t k = 1; k < 2; k++) {
17937 GemmMicrokernelTester()
17938 .mr(6)
17939 .nr(8)
17940 .kr(1)
17941 .sr(1)
17942 .m(6)
17943 .n(8)
17944 .k(k)
17945 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17946 }
17947 }
17948
17949 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
17950 TEST_REQUIRES_ARM_NEON_FMA;
17951 for (size_t k = 1; k < 2; k++) {
17952 GemmMicrokernelTester()
17953 .mr(6)
17954 .nr(8)
17955 .kr(1)
17956 .sr(1)
17957 .m(6)
17958 .n(8)
17959 .k(k)
17960 .a_stride(5)
17961 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17962 }
17963 }
17964
17965 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
17966 TEST_REQUIRES_ARM_NEON_FMA;
17967 for (size_t k = 1; k < 2; k++) {
17968 for (uint32_t m = 1; m <= 6; m++) {
17969 for (uint32_t n = 1; n <= 8; n++) {
17970 GemmMicrokernelTester()
17971 .mr(6)
17972 .nr(8)
17973 .kr(1)
17974 .sr(1)
17975 .m(m)
17976 .n(n)
17977 .k(k)
17978 .iterations(1)
17979 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17980 }
17981 }
17982 }
17983 }
17984
17985 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2) {
17986 TEST_REQUIRES_ARM_NEON_FMA;
17987 for (size_t k = 3; k < 4; k++) {
17988 GemmMicrokernelTester()
17989 .mr(6)
17990 .nr(8)
17991 .kr(1)
17992 .sr(1)
17993 .m(6)
17994 .n(8)
17995 .k(k)
17996 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
17997 }
17998 }
17999
18000 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
18001 TEST_REQUIRES_ARM_NEON_FMA;
18002 for (size_t k = 3; k < 4; k++) {
18003 GemmMicrokernelTester()
18004 .mr(6)
18005 .nr(8)
18006 .kr(1)
18007 .sr(1)
18008 .m(6)
18009 .n(8)
18010 .k(k)
18011 .a_stride(7)
18012 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18013 }
18014 }
18015
18016 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
18017 TEST_REQUIRES_ARM_NEON_FMA;
18018 for (size_t k = 3; k < 4; k++) {
18019 for (uint32_t m = 1; m <= 6; m++) {
18020 for (uint32_t n = 1; n <= 8; n++) {
18021 GemmMicrokernelTester()
18022 .mr(6)
18023 .nr(8)
18024 .kr(1)
18025 .sr(1)
18026 .m(m)
18027 .n(n)
18028 .k(k)
18029 .iterations(1)
18030 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18031 }
18032 }
18033 }
18034 }
18035
18036 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2) {
18037 TEST_REQUIRES_ARM_NEON_FMA;
18038 for (size_t k = 4; k <= 20; k += 2) {
18039 GemmMicrokernelTester()
18040 .mr(6)
18041 .nr(8)
18042 .kr(1)
18043 .sr(1)
18044 .m(6)
18045 .n(8)
18046 .k(k)
18047 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18048 }
18049 }
18050
18051 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
18052 TEST_REQUIRES_ARM_NEON_FMA;
18053 for (size_t k = 4; k <= 20; k += 2) {
18054 GemmMicrokernelTester()
18055 .mr(6)
18056 .nr(8)
18057 .kr(1)
18058 .sr(1)
18059 .m(6)
18060 .n(8)
18061 .k(k)
18062 .a_stride(23)
18063 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18064 }
18065 }
18066
18067 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
18068 TEST_REQUIRES_ARM_NEON_FMA;
18069 for (size_t k = 4; k <= 20; k += 2) {
18070 for (uint32_t m = 1; m <= 6; m++) {
18071 for (uint32_t n = 1; n <= 8; n++) {
18072 GemmMicrokernelTester()
18073 .mr(6)
18074 .nr(8)
18075 .kr(1)
18076 .sr(1)
18077 .m(m)
18078 .n(n)
18079 .k(k)
18080 .iterations(1)
18081 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18082 }
18083 }
18084 }
18085 }
18086
18087 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8) {
18088 TEST_REQUIRES_ARM_NEON_FMA;
18089 for (uint32_t n = 9; n < 16; n++) {
18090 for (size_t k = 1; k <= 10; k += 3) {
18091 GemmMicrokernelTester()
18092 .mr(6)
18093 .nr(8)
18094 .kr(1)
18095 .sr(1)
18096 .m(6)
18097 .n(8)
18098 .k(k)
18099 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18100 }
18101 }
18102 }
18103
18104 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
18105 TEST_REQUIRES_ARM_NEON_FMA;
18106 for (uint32_t n = 9; n < 16; n++) {
18107 for (size_t k = 1; k <= 10; k += 3) {
18108 GemmMicrokernelTester()
18109 .mr(6)
18110 .nr(8)
18111 .kr(1)
18112 .sr(1)
18113 .m(6)
18114 .n(8)
18115 .k(k)
18116 .cn_stride(11)
18117 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18118 }
18119 }
18120 }
18121
18122 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
18123 TEST_REQUIRES_ARM_NEON_FMA;
18124 for (uint32_t n = 9; n < 16; n++) {
18125 for (size_t k = 1; k <= 10; k += 3) {
18126 GemmMicrokernelTester()
18127 .mr(6)
18128 .nr(8)
18129 .kr(1)
18130 .sr(1)
18131 .m(6)
18132 .n(n)
18133 .k(k)
18134 .a_stride(13)
18135 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18136 }
18137 }
18138 }
18139
18140 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
18141 TEST_REQUIRES_ARM_NEON_FMA;
18142 for (uint32_t n = 9; n < 16; n++) {
18143 for (size_t k = 1; k <= 10; k += 3) {
18144 for (uint32_t m = 1; m <= 6; m++) {
18145 GemmMicrokernelTester()
18146 .mr(6)
18147 .nr(8)
18148 .kr(1)
18149 .sr(1)
18150 .m(m)
18151 .n(n)
18152 .k(k)
18153 .iterations(1)
18154 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18155 }
18156 }
18157 }
18158 }
18159
18160 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8) {
18161 TEST_REQUIRES_ARM_NEON_FMA;
18162 for (uint32_t n = 16; n <= 24; n += 8) {
18163 for (size_t k = 1; k <= 10; k += 3) {
18164 GemmMicrokernelTester()
18165 .mr(6)
18166 .nr(8)
18167 .kr(1)
18168 .sr(1)
18169 .m(6)
18170 .n(8)
18171 .k(k)
18172 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18173 }
18174 }
18175 }
18176
18177 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
18178 TEST_REQUIRES_ARM_NEON_FMA;
18179 for (uint32_t n = 16; n <= 24; n += 8) {
18180 for (size_t k = 1; k <= 10; k += 3) {
18181 GemmMicrokernelTester()
18182 .mr(6)
18183 .nr(8)
18184 .kr(1)
18185 .sr(1)
18186 .m(6)
18187 .n(n)
18188 .k(k)
18189 .cn_stride(11)
18190 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18191 }
18192 }
18193 }
18194
18195 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
18196 TEST_REQUIRES_ARM_NEON_FMA;
18197 for (uint32_t n = 16; n <= 24; n += 8) {
18198 for (size_t k = 1; k <= 10; k += 3) {
18199 GemmMicrokernelTester()
18200 .mr(6)
18201 .nr(8)
18202 .kr(1)
18203 .sr(1)
18204 .m(6)
18205 .n(n)
18206 .k(k)
18207 .a_stride(13)
18208 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18209 }
18210 }
18211 }
18212
18213 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
18214 TEST_REQUIRES_ARM_NEON_FMA;
18215 for (uint32_t n = 16; n <= 24; n += 8) {
18216 for (size_t k = 1; k <= 10; k += 3) {
18217 for (uint32_t m = 1; m <= 6; m++) {
18218 GemmMicrokernelTester()
18219 .mr(6)
18220 .nr(8)
18221 .kr(1)
18222 .sr(1)
18223 .m(m)
18224 .n(n)
18225 .k(k)
18226 .iterations(1)
18227 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18228 }
18229 }
18230 }
18231 }
18232
18233 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
18234 TEST_REQUIRES_ARM_NEON_FMA;
18235 for (size_t k = 1; k <= 10; k += 3) {
18236 for (uint32_t m = 1; m <= 6; m++) {
18237 for (uint32_t n = 1; n <= 8; n++) {
18238 GemmMicrokernelTester()
18239 .mr(6)
18240 .nr(8)
18241 .kr(1)
18242 .sr(1)
18243 .m(m)
18244 .n(n)
18245 .k(k)
18246 .cm_stride(11)
18247 .iterations(1)
18248 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18249 }
18250 }
18251 }
18252 }
18253
18254 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, qmin) {
18255 TEST_REQUIRES_ARM_NEON_FMA;
18256 GemmMicrokernelTester()
18257 .mr(6)
18258 .nr(8)
18259 .kr(1)
18260 .sr(1)
18261 .m(6)
18262 .n(8)
18263 .k(2)
18264 .qmin(128)
18265 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18266 }
18267
18268 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, qmax) {
18269 TEST_REQUIRES_ARM_NEON_FMA;
18270 GemmMicrokernelTester()
18271 .mr(6)
18272 .nr(8)
18273 .kr(1)
18274 .sr(1)
18275 .m(6)
18276 .n(8)
18277 .k(2)
18278 .qmax(128)
18279 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18280 }
18281
18282 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cm) {
18283 TEST_REQUIRES_ARM_NEON_FMA;
18284 GemmMicrokernelTester()
18285 .mr(6)
18286 .nr(8)
18287 .kr(1)
18288 .sr(1)
18289 .m(6)
18290 .n(8)
18291 .k(2)
18292 .cm_stride(11)
18293 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
18294 }
18295#endif // XNN_ARCH_ARM64
18296
18297
18298#if XNN_ARCH_ARM64
18299 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4) {
18300 TEST_REQUIRES_ARM_NEON_FMA;
18301 GemmMicrokernelTester()
18302 .mr(6)
18303 .nr(8)
18304 .kr(1)
18305 .sr(1)
18306 .m(6)
18307 .n(8)
18308 .k(4)
18309 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18310 }
18311
18312 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, strided_cn) {
18313 TEST_REQUIRES_ARM_NEON_FMA;
18314 GemmMicrokernelTester()
18315 .mr(6)
18316 .nr(8)
18317 .kr(1)
18318 .sr(1)
18319 .m(6)
18320 .n(8)
18321 .k(4)
18322 .cn_stride(11)
18323 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18324 }
18325
18326 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
18327 TEST_REQUIRES_ARM_NEON_FMA;
18328 GemmMicrokernelTester()
18329 .mr(6)
18330 .nr(8)
18331 .kr(1)
18332 .sr(1)
18333 .m(6)
18334 .n(8)
18335 .k(4)
18336 .a_stride(7)
18337 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18338 }
18339
18340 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
18341 TEST_REQUIRES_ARM_NEON_FMA;
18342 for (uint32_t m = 1; m <= 6; m++) {
18343 for (uint32_t n = 1; n <= 8; n++) {
18344 GemmMicrokernelTester()
18345 .mr(6)
18346 .nr(8)
18347 .kr(1)
18348 .sr(1)
18349 .m(m)
18350 .n(n)
18351 .k(4)
18352 .iterations(1)
18353 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18354 }
18355 }
18356 }
18357
18358 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
18359 TEST_REQUIRES_ARM_NEON_FMA;
18360 for (uint32_t m = 1; m <= 6; m++) {
18361 GemmMicrokernelTester()
18362 .mr(6)
18363 .nr(8)
18364 .kr(1)
18365 .sr(1)
18366 .m(m)
18367 .n(8)
18368 .k(4)
18369 .iterations(1)
18370 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18371 }
18372 }
18373
18374 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
18375 TEST_REQUIRES_ARM_NEON_FMA;
18376 for (uint32_t n = 1; n <= 8; n++) {
18377 GemmMicrokernelTester()
18378 .mr(6)
18379 .nr(8)
18380 .kr(1)
18381 .sr(1)
18382 .m(6)
18383 .n(n)
18384 .k(4)
18385 .iterations(1)
18386 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18387 }
18388 }
18389
18390 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_lt_4) {
18391 TEST_REQUIRES_ARM_NEON_FMA;
18392 for (size_t k = 1; k < 4; k++) {
18393 GemmMicrokernelTester()
18394 .mr(6)
18395 .nr(8)
18396 .kr(1)
18397 .sr(1)
18398 .m(6)
18399 .n(8)
18400 .k(k)
18401 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18402 }
18403 }
18404
18405 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
18406 TEST_REQUIRES_ARM_NEON_FMA;
18407 for (size_t k = 1; k < 4; k++) {
18408 GemmMicrokernelTester()
18409 .mr(6)
18410 .nr(8)
18411 .kr(1)
18412 .sr(1)
18413 .m(6)
18414 .n(8)
18415 .k(k)
18416 .a_stride(7)
18417 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18418 }
18419 }
18420
18421 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
18422 TEST_REQUIRES_ARM_NEON_FMA;
18423 for (size_t k = 1; k < 4; k++) {
18424 for (uint32_t m = 1; m <= 6; m++) {
18425 for (uint32_t n = 1; n <= 8; n++) {
18426 GemmMicrokernelTester()
18427 .mr(6)
18428 .nr(8)
18429 .kr(1)
18430 .sr(1)
18431 .m(m)
18432 .n(n)
18433 .k(k)
18434 .iterations(1)
18435 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18436 }
18437 }
18438 }
18439 }
18440
18441 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_gt_4) {
18442 TEST_REQUIRES_ARM_NEON_FMA;
18443 for (size_t k = 5; k < 8; k++) {
18444 GemmMicrokernelTester()
18445 .mr(6)
18446 .nr(8)
18447 .kr(1)
18448 .sr(1)
18449 .m(6)
18450 .n(8)
18451 .k(k)
18452 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18453 }
18454 }
18455
18456 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
18457 TEST_REQUIRES_ARM_NEON_FMA;
18458 for (size_t k = 5; k < 8; k++) {
18459 GemmMicrokernelTester()
18460 .mr(6)
18461 .nr(8)
18462 .kr(1)
18463 .sr(1)
18464 .m(6)
18465 .n(8)
18466 .k(k)
18467 .a_stride(11)
18468 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18469 }
18470 }
18471
18472 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
18473 TEST_REQUIRES_ARM_NEON_FMA;
18474 for (size_t k = 5; k < 8; k++) {
18475 for (uint32_t m = 1; m <= 6; m++) {
18476 for (uint32_t n = 1; n <= 8; n++) {
18477 GemmMicrokernelTester()
18478 .mr(6)
18479 .nr(8)
18480 .kr(1)
18481 .sr(1)
18482 .m(m)
18483 .n(n)
18484 .k(k)
18485 .iterations(1)
18486 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18487 }
18488 }
18489 }
18490 }
18491
18492 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_div_4) {
18493 TEST_REQUIRES_ARM_NEON_FMA;
18494 for (size_t k = 8; k <= 40; k += 4) {
18495 GemmMicrokernelTester()
18496 .mr(6)
18497 .nr(8)
18498 .kr(1)
18499 .sr(1)
18500 .m(6)
18501 .n(8)
18502 .k(k)
18503 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18504 }
18505 }
18506
18507 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
18508 TEST_REQUIRES_ARM_NEON_FMA;
18509 for (size_t k = 8; k <= 40; k += 4) {
18510 GemmMicrokernelTester()
18511 .mr(6)
18512 .nr(8)
18513 .kr(1)
18514 .sr(1)
18515 .m(6)
18516 .n(8)
18517 .k(k)
18518 .a_stride(43)
18519 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18520 }
18521 }
18522
18523 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
18524 TEST_REQUIRES_ARM_NEON_FMA;
18525 for (size_t k = 8; k <= 40; k += 4) {
18526 for (uint32_t m = 1; m <= 6; m++) {
18527 for (uint32_t n = 1; n <= 8; n++) {
18528 GemmMicrokernelTester()
18529 .mr(6)
18530 .nr(8)
18531 .kr(1)
18532 .sr(1)
18533 .m(m)
18534 .n(n)
18535 .k(k)
18536 .iterations(1)
18537 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18538 }
18539 }
18540 }
18541 }
18542
18543 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8) {
18544 TEST_REQUIRES_ARM_NEON_FMA;
18545 for (uint32_t n = 9; n < 16; n++) {
18546 for (size_t k = 1; k <= 20; k += 5) {
18547 GemmMicrokernelTester()
18548 .mr(6)
18549 .nr(8)
18550 .kr(1)
18551 .sr(1)
18552 .m(6)
18553 .n(8)
18554 .k(k)
18555 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18556 }
18557 }
18558 }
18559
18560 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
18561 TEST_REQUIRES_ARM_NEON_FMA;
18562 for (uint32_t n = 9; n < 16; n++) {
18563 for (size_t k = 1; k <= 20; k += 5) {
18564 GemmMicrokernelTester()
18565 .mr(6)
18566 .nr(8)
18567 .kr(1)
18568 .sr(1)
18569 .m(6)
18570 .n(8)
18571 .k(k)
18572 .cn_stride(11)
18573 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18574 }
18575 }
18576 }
18577
18578 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
18579 TEST_REQUIRES_ARM_NEON_FMA;
18580 for (uint32_t n = 9; n < 16; n++) {
18581 for (size_t k = 1; k <= 20; k += 5) {
18582 GemmMicrokernelTester()
18583 .mr(6)
18584 .nr(8)
18585 .kr(1)
18586 .sr(1)
18587 .m(6)
18588 .n(n)
18589 .k(k)
18590 .a_stride(23)
18591 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18592 }
18593 }
18594 }
18595
18596 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
18597 TEST_REQUIRES_ARM_NEON_FMA;
18598 for (uint32_t n = 9; n < 16; n++) {
18599 for (size_t k = 1; k <= 20; k += 5) {
18600 for (uint32_t m = 1; m <= 6; m++) {
18601 GemmMicrokernelTester()
18602 .mr(6)
18603 .nr(8)
18604 .kr(1)
18605 .sr(1)
18606 .m(m)
18607 .n(n)
18608 .k(k)
18609 .iterations(1)
18610 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18611 }
18612 }
18613 }
18614 }
18615
18616 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8) {
18617 TEST_REQUIRES_ARM_NEON_FMA;
18618 for (uint32_t n = 16; n <= 24; n += 8) {
18619 for (size_t k = 1; k <= 20; k += 5) {
18620 GemmMicrokernelTester()
18621 .mr(6)
18622 .nr(8)
18623 .kr(1)
18624 .sr(1)
18625 .m(6)
18626 .n(8)
18627 .k(k)
18628 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18629 }
18630 }
18631 }
18632
18633 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
18634 TEST_REQUIRES_ARM_NEON_FMA;
18635 for (uint32_t n = 16; n <= 24; n += 8) {
18636 for (size_t k = 1; k <= 20; k += 5) {
18637 GemmMicrokernelTester()
18638 .mr(6)
18639 .nr(8)
18640 .kr(1)
18641 .sr(1)
18642 .m(6)
18643 .n(n)
18644 .k(k)
18645 .cn_stride(11)
18646 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18647 }
18648 }
18649 }
18650
18651 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
18652 TEST_REQUIRES_ARM_NEON_FMA;
18653 for (uint32_t n = 16; n <= 24; n += 8) {
18654 for (size_t k = 1; k <= 20; k += 5) {
18655 GemmMicrokernelTester()
18656 .mr(6)
18657 .nr(8)
18658 .kr(1)
18659 .sr(1)
18660 .m(6)
18661 .n(n)
18662 .k(k)
18663 .a_stride(23)
18664 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18665 }
18666 }
18667 }
18668
18669 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
18670 TEST_REQUIRES_ARM_NEON_FMA;
18671 for (uint32_t n = 16; n <= 24; n += 8) {
18672 for (size_t k = 1; k <= 20; k += 5) {
18673 for (uint32_t m = 1; m <= 6; m++) {
18674 GemmMicrokernelTester()
18675 .mr(6)
18676 .nr(8)
18677 .kr(1)
18678 .sr(1)
18679 .m(m)
18680 .n(n)
18681 .k(k)
18682 .iterations(1)
18683 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18684 }
18685 }
18686 }
18687 }
18688
18689 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
18690 TEST_REQUIRES_ARM_NEON_FMA;
18691 for (size_t k = 1; k <= 20; k += 5) {
18692 for (uint32_t m = 1; m <= 6; m++) {
18693 for (uint32_t n = 1; n <= 8; n++) {
18694 GemmMicrokernelTester()
18695 .mr(6)
18696 .nr(8)
18697 .kr(1)
18698 .sr(1)
18699 .m(m)
18700 .n(n)
18701 .k(k)
18702 .cm_stride(11)
18703 .iterations(1)
18704 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18705 }
18706 }
18707 }
18708 }
18709
18710 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, qmin) {
18711 TEST_REQUIRES_ARM_NEON_FMA;
18712 GemmMicrokernelTester()
18713 .mr(6)
18714 .nr(8)
18715 .kr(1)
18716 .sr(1)
18717 .m(6)
18718 .n(8)
18719 .k(4)
18720 .qmin(128)
18721 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18722 }
18723
18724 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, qmax) {
18725 TEST_REQUIRES_ARM_NEON_FMA;
18726 GemmMicrokernelTester()
18727 .mr(6)
18728 .nr(8)
18729 .kr(1)
18730 .sr(1)
18731 .m(6)
18732 .n(8)
18733 .k(4)
18734 .qmax(128)
18735 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18736 }
18737
18738 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, strided_cm) {
18739 TEST_REQUIRES_ARM_NEON_FMA;
18740 GemmMicrokernelTester()
18741 .mr(6)
18742 .nr(8)
18743 .kr(1)
18744 .sr(1)
18745 .m(6)
18746 .n(8)
18747 .k(4)
18748 .cm_stride(11)
18749 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
18750 }
18751#endif // XNN_ARCH_ARM64
18752
18753
18754#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18755 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2) {
18756 TEST_REQUIRES_ARM_NEON;
18757 GemmMicrokernelTester()
18758 .mr(1)
18759 .nr(8)
18760 .kr(1)
18761 .sr(1)
18762 .m(1)
18763 .n(8)
18764 .k(2)
18765 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18766 }
18767
18768 TEST(F32_GEMM_1X8__NEON_DUP_LD64, strided_cn) {
18769 TEST_REQUIRES_ARM_NEON;
18770 GemmMicrokernelTester()
18771 .mr(1)
18772 .nr(8)
18773 .kr(1)
18774 .sr(1)
18775 .m(1)
18776 .n(8)
18777 .k(2)
18778 .cn_stride(11)
18779 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18780 }
18781
18782 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_strided_a) {
18783 TEST_REQUIRES_ARM_NEON;
18784 GemmMicrokernelTester()
18785 .mr(1)
18786 .nr(8)
18787 .kr(1)
18788 .sr(1)
18789 .m(1)
18790 .n(8)
18791 .k(2)
18792 .a_stride(5)
18793 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18794 }
18795
18796 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
18797 TEST_REQUIRES_ARM_NEON;
18798 for (uint32_t m = 1; m <= 1; m++) {
18799 for (uint32_t n = 1; n <= 8; n++) {
18800 GemmMicrokernelTester()
18801 .mr(1)
18802 .nr(8)
18803 .kr(1)
18804 .sr(1)
18805 .m(m)
18806 .n(n)
18807 .k(2)
18808 .iterations(1)
18809 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18810 }
18811 }
18812 }
18813
18814 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
18815 TEST_REQUIRES_ARM_NEON;
18816 for (uint32_t m = 1; m <= 1; m++) {
18817 GemmMicrokernelTester()
18818 .mr(1)
18819 .nr(8)
18820 .kr(1)
18821 .sr(1)
18822 .m(m)
18823 .n(8)
18824 .k(2)
18825 .iterations(1)
18826 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18827 }
18828 }
18829
18830 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
18831 TEST_REQUIRES_ARM_NEON;
18832 for (uint32_t n = 1; n <= 8; n++) {
18833 GemmMicrokernelTester()
18834 .mr(1)
18835 .nr(8)
18836 .kr(1)
18837 .sr(1)
18838 .m(1)
18839 .n(n)
18840 .k(2)
18841 .iterations(1)
18842 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18843 }
18844 }
18845
18846 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_lt_2) {
18847 TEST_REQUIRES_ARM_NEON;
18848 for (size_t k = 1; k < 2; k++) {
18849 GemmMicrokernelTester()
18850 .mr(1)
18851 .nr(8)
18852 .kr(1)
18853 .sr(1)
18854 .m(1)
18855 .n(8)
18856 .k(k)
18857 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18858 }
18859 }
18860
18861 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_lt_2_strided_a) {
18862 TEST_REQUIRES_ARM_NEON;
18863 for (size_t k = 1; k < 2; k++) {
18864 GemmMicrokernelTester()
18865 .mr(1)
18866 .nr(8)
18867 .kr(1)
18868 .sr(1)
18869 .m(1)
18870 .n(8)
18871 .k(k)
18872 .a_stride(5)
18873 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18874 }
18875 }
18876
18877 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
18878 TEST_REQUIRES_ARM_NEON;
18879 for (size_t k = 1; k < 2; k++) {
18880 for (uint32_t m = 1; m <= 1; m++) {
18881 for (uint32_t n = 1; n <= 8; n++) {
18882 GemmMicrokernelTester()
18883 .mr(1)
18884 .nr(8)
18885 .kr(1)
18886 .sr(1)
18887 .m(m)
18888 .n(n)
18889 .k(k)
18890 .iterations(1)
18891 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18892 }
18893 }
18894 }
18895 }
18896
18897 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_gt_2) {
18898 TEST_REQUIRES_ARM_NEON;
18899 for (size_t k = 3; k < 4; k++) {
18900 GemmMicrokernelTester()
18901 .mr(1)
18902 .nr(8)
18903 .kr(1)
18904 .sr(1)
18905 .m(1)
18906 .n(8)
18907 .k(k)
18908 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18909 }
18910 }
18911
18912 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_gt_2_strided_a) {
18913 TEST_REQUIRES_ARM_NEON;
18914 for (size_t k = 3; k < 4; k++) {
18915 GemmMicrokernelTester()
18916 .mr(1)
18917 .nr(8)
18918 .kr(1)
18919 .sr(1)
18920 .m(1)
18921 .n(8)
18922 .k(k)
18923 .a_stride(7)
18924 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18925 }
18926 }
18927
18928 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
18929 TEST_REQUIRES_ARM_NEON;
18930 for (size_t k = 3; k < 4; k++) {
18931 for (uint32_t m = 1; m <= 1; m++) {
18932 for (uint32_t n = 1; n <= 8; n++) {
18933 GemmMicrokernelTester()
18934 .mr(1)
18935 .nr(8)
18936 .kr(1)
18937 .sr(1)
18938 .m(m)
18939 .n(n)
18940 .k(k)
18941 .iterations(1)
18942 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18943 }
18944 }
18945 }
18946 }
18947
18948 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_div_2) {
18949 TEST_REQUIRES_ARM_NEON;
18950 for (size_t k = 4; k <= 20; k += 2) {
18951 GemmMicrokernelTester()
18952 .mr(1)
18953 .nr(8)
18954 .kr(1)
18955 .sr(1)
18956 .m(1)
18957 .n(8)
18958 .k(k)
18959 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18960 }
18961 }
18962
18963 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_div_2_strided_a) {
18964 TEST_REQUIRES_ARM_NEON;
18965 for (size_t k = 4; k <= 20; k += 2) {
18966 GemmMicrokernelTester()
18967 .mr(1)
18968 .nr(8)
18969 .kr(1)
18970 .sr(1)
18971 .m(1)
18972 .n(8)
18973 .k(k)
18974 .a_stride(23)
18975 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18976 }
18977 }
18978
18979 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_div_2_subtile) {
18980 TEST_REQUIRES_ARM_NEON;
18981 for (size_t k = 4; k <= 20; k += 2) {
18982 for (uint32_t m = 1; m <= 1; m++) {
18983 for (uint32_t n = 1; n <= 8; n++) {
18984 GemmMicrokernelTester()
18985 .mr(1)
18986 .nr(8)
18987 .kr(1)
18988 .sr(1)
18989 .m(m)
18990 .n(n)
18991 .k(k)
18992 .iterations(1)
18993 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
18994 }
18995 }
18996 }
18997 }
18998
18999 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8) {
19000 TEST_REQUIRES_ARM_NEON;
19001 for (uint32_t n = 9; n < 16; n++) {
19002 for (size_t k = 1; k <= 10; k += 3) {
19003 GemmMicrokernelTester()
19004 .mr(1)
19005 .nr(8)
19006 .kr(1)
19007 .sr(1)
19008 .m(1)
19009 .n(8)
19010 .k(k)
19011 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19012 }
19013 }
19014 }
19015
19016 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
19017 TEST_REQUIRES_ARM_NEON;
19018 for (uint32_t n = 9; n < 16; n++) {
19019 for (size_t k = 1; k <= 10; k += 3) {
19020 GemmMicrokernelTester()
19021 .mr(1)
19022 .nr(8)
19023 .kr(1)
19024 .sr(1)
19025 .m(1)
19026 .n(8)
19027 .k(k)
19028 .cn_stride(11)
19029 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19030 }
19031 }
19032 }
19033
19034 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8_strided_a) {
19035 TEST_REQUIRES_ARM_NEON;
19036 for (uint32_t n = 9; n < 16; n++) {
19037 for (size_t k = 1; k <= 10; k += 3) {
19038 GemmMicrokernelTester()
19039 .mr(1)
19040 .nr(8)
19041 .kr(1)
19042 .sr(1)
19043 .m(1)
19044 .n(n)
19045 .k(k)
19046 .a_stride(13)
19047 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19048 }
19049 }
19050 }
19051
19052 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
19053 TEST_REQUIRES_ARM_NEON;
19054 for (uint32_t n = 9; n < 16; n++) {
19055 for (size_t k = 1; k <= 10; k += 3) {
19056 for (uint32_t m = 1; m <= 1; m++) {
19057 GemmMicrokernelTester()
19058 .mr(1)
19059 .nr(8)
19060 .kr(1)
19061 .sr(1)
19062 .m(m)
19063 .n(n)
19064 .k(k)
19065 .iterations(1)
19066 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19067 }
19068 }
19069 }
19070 }
19071
19072 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8) {
19073 TEST_REQUIRES_ARM_NEON;
19074 for (uint32_t n = 16; n <= 24; n += 8) {
19075 for (size_t k = 1; k <= 10; k += 3) {
19076 GemmMicrokernelTester()
19077 .mr(1)
19078 .nr(8)
19079 .kr(1)
19080 .sr(1)
19081 .m(1)
19082 .n(8)
19083 .k(k)
19084 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19085 }
19086 }
19087 }
19088
19089 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
19090 TEST_REQUIRES_ARM_NEON;
19091 for (uint32_t n = 16; n <= 24; n += 8) {
19092 for (size_t k = 1; k <= 10; k += 3) {
19093 GemmMicrokernelTester()
19094 .mr(1)
19095 .nr(8)
19096 .kr(1)
19097 .sr(1)
19098 .m(1)
19099 .n(n)
19100 .k(k)
19101 .cn_stride(11)
19102 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19103 }
19104 }
19105 }
19106
19107 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8_strided_a) {
19108 TEST_REQUIRES_ARM_NEON;
19109 for (uint32_t n = 16; n <= 24; n += 8) {
19110 for (size_t k = 1; k <= 10; k += 3) {
19111 GemmMicrokernelTester()
19112 .mr(1)
19113 .nr(8)
19114 .kr(1)
19115 .sr(1)
19116 .m(1)
19117 .n(n)
19118 .k(k)
19119 .a_stride(13)
19120 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19121 }
19122 }
19123 }
19124
19125 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8_subtile) {
19126 TEST_REQUIRES_ARM_NEON;
19127 for (uint32_t n = 16; n <= 24; n += 8) {
19128 for (size_t k = 1; k <= 10; k += 3) {
19129 for (uint32_t m = 1; m <= 1; m++) {
19130 GemmMicrokernelTester()
19131 .mr(1)
19132 .nr(8)
19133 .kr(1)
19134 .sr(1)
19135 .m(m)
19136 .n(n)
19137 .k(k)
19138 .iterations(1)
19139 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19140 }
19141 }
19142 }
19143 }
19144
19145 TEST(F32_GEMM_1X8__NEON_DUP_LD64, strided_cm_subtile) {
19146 TEST_REQUIRES_ARM_NEON;
19147 for (size_t k = 1; k <= 10; k += 3) {
19148 for (uint32_t m = 1; m <= 1; m++) {
19149 for (uint32_t n = 1; n <= 8; n++) {
19150 GemmMicrokernelTester()
19151 .mr(1)
19152 .nr(8)
19153 .kr(1)
19154 .sr(1)
19155 .m(m)
19156 .n(n)
19157 .k(k)
19158 .cm_stride(11)
19159 .iterations(1)
19160 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19161 }
19162 }
19163 }
19164 }
19165
19166 TEST(F32_GEMM_1X8__NEON_DUP_LD64, qmin) {
19167 TEST_REQUIRES_ARM_NEON;
19168 GemmMicrokernelTester()
19169 .mr(1)
19170 .nr(8)
19171 .kr(1)
19172 .sr(1)
19173 .m(1)
19174 .n(8)
19175 .k(2)
19176 .qmin(128)
19177 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19178 }
19179
19180 TEST(F32_GEMM_1X8__NEON_DUP_LD64, qmax) {
19181 TEST_REQUIRES_ARM_NEON;
19182 GemmMicrokernelTester()
19183 .mr(1)
19184 .nr(8)
19185 .kr(1)
19186 .sr(1)
19187 .m(1)
19188 .n(8)
19189 .k(2)
19190 .qmax(128)
19191 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19192 }
19193
19194 TEST(F32_GEMM_1X8__NEON_DUP_LD64, strided_cm) {
19195 TEST_REQUIRES_ARM_NEON;
19196 GemmMicrokernelTester()
19197 .mr(1)
19198 .nr(8)
19199 .kr(1)
19200 .sr(1)
19201 .m(1)
19202 .n(8)
19203 .k(2)
19204 .cm_stride(11)
19205 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
19206 }
19207#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19208
19209
19210#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19211 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2) {
19212 TEST_REQUIRES_ARM_NEON;
19213 GemmMicrokernelTester()
19214 .mr(4)
19215 .nr(8)
19216 .kr(1)
19217 .sr(1)
19218 .m(4)
19219 .n(8)
19220 .k(2)
19221 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19222 }
19223
19224 TEST(F32_GEMM_4X8__NEON_DUP_LD64, strided_cn) {
19225 TEST_REQUIRES_ARM_NEON;
19226 GemmMicrokernelTester()
19227 .mr(4)
19228 .nr(8)
19229 .kr(1)
19230 .sr(1)
19231 .m(4)
19232 .n(8)
19233 .k(2)
19234 .cn_stride(11)
19235 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19236 }
19237
19238 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_strided_a) {
19239 TEST_REQUIRES_ARM_NEON;
19240 GemmMicrokernelTester()
19241 .mr(4)
19242 .nr(8)
19243 .kr(1)
19244 .sr(1)
19245 .m(4)
19246 .n(8)
19247 .k(2)
19248 .a_stride(5)
19249 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19250 }
19251
19252 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile) {
19253 TEST_REQUIRES_ARM_NEON;
19254 for (uint32_t m = 1; m <= 4; m++) {
19255 for (uint32_t n = 1; n <= 8; n++) {
19256 GemmMicrokernelTester()
19257 .mr(4)
19258 .nr(8)
19259 .kr(1)
19260 .sr(1)
19261 .m(m)
19262 .n(n)
19263 .k(2)
19264 .iterations(1)
19265 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19266 }
19267 }
19268 }
19269
19270 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
19271 TEST_REQUIRES_ARM_NEON;
19272 for (uint32_t m = 1; m <= 4; m++) {
19273 GemmMicrokernelTester()
19274 .mr(4)
19275 .nr(8)
19276 .kr(1)
19277 .sr(1)
19278 .m(m)
19279 .n(8)
19280 .k(2)
19281 .iterations(1)
19282 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19283 }
19284 }
19285
19286 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
19287 TEST_REQUIRES_ARM_NEON;
19288 for (uint32_t n = 1; n <= 8; n++) {
19289 GemmMicrokernelTester()
19290 .mr(4)
19291 .nr(8)
19292 .kr(1)
19293 .sr(1)
19294 .m(4)
19295 .n(n)
19296 .k(2)
19297 .iterations(1)
19298 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19299 }
19300 }
19301
19302 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_lt_2) {
19303 TEST_REQUIRES_ARM_NEON;
19304 for (size_t k = 1; k < 2; k++) {
19305 GemmMicrokernelTester()
19306 .mr(4)
19307 .nr(8)
19308 .kr(1)
19309 .sr(1)
19310 .m(4)
19311 .n(8)
19312 .k(k)
19313 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19314 }
19315 }
19316
19317 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_lt_2_strided_a) {
19318 TEST_REQUIRES_ARM_NEON;
19319 for (size_t k = 1; k < 2; k++) {
19320 GemmMicrokernelTester()
19321 .mr(4)
19322 .nr(8)
19323 .kr(1)
19324 .sr(1)
19325 .m(4)
19326 .n(8)
19327 .k(k)
19328 .a_stride(5)
19329 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19330 }
19331 }
19332
19333 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_lt_2_subtile) {
19334 TEST_REQUIRES_ARM_NEON;
19335 for (size_t k = 1; k < 2; k++) {
19336 for (uint32_t m = 1; m <= 4; m++) {
19337 for (uint32_t n = 1; n <= 8; n++) {
19338 GemmMicrokernelTester()
19339 .mr(4)
19340 .nr(8)
19341 .kr(1)
19342 .sr(1)
19343 .m(m)
19344 .n(n)
19345 .k(k)
19346 .iterations(1)
19347 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19348 }
19349 }
19350 }
19351 }
19352
19353 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_gt_2) {
19354 TEST_REQUIRES_ARM_NEON;
19355 for (size_t k = 3; k < 4; k++) {
19356 GemmMicrokernelTester()
19357 .mr(4)
19358 .nr(8)
19359 .kr(1)
19360 .sr(1)
19361 .m(4)
19362 .n(8)
19363 .k(k)
19364 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19365 }
19366 }
19367
19368 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_gt_2_strided_a) {
19369 TEST_REQUIRES_ARM_NEON;
19370 for (size_t k = 3; k < 4; k++) {
19371 GemmMicrokernelTester()
19372 .mr(4)
19373 .nr(8)
19374 .kr(1)
19375 .sr(1)
19376 .m(4)
19377 .n(8)
19378 .k(k)
19379 .a_stride(7)
19380 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19381 }
19382 }
19383
19384 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_gt_2_subtile) {
19385 TEST_REQUIRES_ARM_NEON;
19386 for (size_t k = 3; k < 4; k++) {
19387 for (uint32_t m = 1; m <= 4; m++) {
19388 for (uint32_t n = 1; n <= 8; n++) {
19389 GemmMicrokernelTester()
19390 .mr(4)
19391 .nr(8)
19392 .kr(1)
19393 .sr(1)
19394 .m(m)
19395 .n(n)
19396 .k(k)
19397 .iterations(1)
19398 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19399 }
19400 }
19401 }
19402 }
19403
19404 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_div_2) {
19405 TEST_REQUIRES_ARM_NEON;
19406 for (size_t k = 4; k <= 20; k += 2) {
19407 GemmMicrokernelTester()
19408 .mr(4)
19409 .nr(8)
19410 .kr(1)
19411 .sr(1)
19412 .m(4)
19413 .n(8)
19414 .k(k)
19415 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19416 }
19417 }
19418
19419 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_div_2_strided_a) {
19420 TEST_REQUIRES_ARM_NEON;
19421 for (size_t k = 4; k <= 20; k += 2) {
19422 GemmMicrokernelTester()
19423 .mr(4)
19424 .nr(8)
19425 .kr(1)
19426 .sr(1)
19427 .m(4)
19428 .n(8)
19429 .k(k)
19430 .a_stride(23)
19431 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19432 }
19433 }
19434
19435 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_div_2_subtile) {
19436 TEST_REQUIRES_ARM_NEON;
19437 for (size_t k = 4; k <= 20; k += 2) {
19438 for (uint32_t m = 1; m <= 4; m++) {
19439 for (uint32_t n = 1; n <= 8; n++) {
19440 GemmMicrokernelTester()
19441 .mr(4)
19442 .nr(8)
19443 .kr(1)
19444 .sr(1)
19445 .m(m)
19446 .n(n)
19447 .k(k)
19448 .iterations(1)
19449 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19450 }
19451 }
19452 }
19453 }
19454
19455 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8) {
19456 TEST_REQUIRES_ARM_NEON;
19457 for (uint32_t n = 9; n < 16; n++) {
19458 for (size_t k = 1; k <= 10; k += 3) {
19459 GemmMicrokernelTester()
19460 .mr(4)
19461 .nr(8)
19462 .kr(1)
19463 .sr(1)
19464 .m(4)
19465 .n(8)
19466 .k(k)
19467 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19468 }
19469 }
19470 }
19471
19472 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
19473 TEST_REQUIRES_ARM_NEON;
19474 for (uint32_t n = 9; n < 16; n++) {
19475 for (size_t k = 1; k <= 10; k += 3) {
19476 GemmMicrokernelTester()
19477 .mr(4)
19478 .nr(8)
19479 .kr(1)
19480 .sr(1)
19481 .m(4)
19482 .n(8)
19483 .k(k)
19484 .cn_stride(11)
19485 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19486 }
19487 }
19488 }
19489
19490 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8_strided_a) {
19491 TEST_REQUIRES_ARM_NEON;
19492 for (uint32_t n = 9; n < 16; n++) {
19493 for (size_t k = 1; k <= 10; k += 3) {
19494 GemmMicrokernelTester()
19495 .mr(4)
19496 .nr(8)
19497 .kr(1)
19498 .sr(1)
19499 .m(4)
19500 .n(n)
19501 .k(k)
19502 .a_stride(13)
19503 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19504 }
19505 }
19506 }
19507
19508 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8_subtile) {
19509 TEST_REQUIRES_ARM_NEON;
19510 for (uint32_t n = 9; n < 16; n++) {
19511 for (size_t k = 1; k <= 10; k += 3) {
19512 for (uint32_t m = 1; m <= 4; m++) {
19513 GemmMicrokernelTester()
19514 .mr(4)
19515 .nr(8)
19516 .kr(1)
19517 .sr(1)
19518 .m(m)
19519 .n(n)
19520 .k(k)
19521 .iterations(1)
19522 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19523 }
19524 }
19525 }
19526 }
19527
19528 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8) {
19529 TEST_REQUIRES_ARM_NEON;
19530 for (uint32_t n = 16; n <= 24; n += 8) {
19531 for (size_t k = 1; k <= 10; k += 3) {
19532 GemmMicrokernelTester()
19533 .mr(4)
19534 .nr(8)
19535 .kr(1)
19536 .sr(1)
19537 .m(4)
19538 .n(8)
19539 .k(k)
19540 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19541 }
19542 }
19543 }
19544
19545 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8_strided_cn) {
19546 TEST_REQUIRES_ARM_NEON;
19547 for (uint32_t n = 16; n <= 24; n += 8) {
19548 for (size_t k = 1; k <= 10; k += 3) {
19549 GemmMicrokernelTester()
19550 .mr(4)
19551 .nr(8)
19552 .kr(1)
19553 .sr(1)
19554 .m(4)
19555 .n(n)
19556 .k(k)
19557 .cn_stride(11)
19558 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19559 }
19560 }
19561 }
19562
19563 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8_strided_a) {
19564 TEST_REQUIRES_ARM_NEON;
19565 for (uint32_t n = 16; n <= 24; n += 8) {
19566 for (size_t k = 1; k <= 10; k += 3) {
19567 GemmMicrokernelTester()
19568 .mr(4)
19569 .nr(8)
19570 .kr(1)
19571 .sr(1)
19572 .m(4)
19573 .n(n)
19574 .k(k)
19575 .a_stride(13)
19576 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19577 }
19578 }
19579 }
19580
19581 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8_subtile) {
19582 TEST_REQUIRES_ARM_NEON;
19583 for (uint32_t n = 16; n <= 24; n += 8) {
19584 for (size_t k = 1; k <= 10; k += 3) {
19585 for (uint32_t m = 1; m <= 4; m++) {
19586 GemmMicrokernelTester()
19587 .mr(4)
19588 .nr(8)
19589 .kr(1)
19590 .sr(1)
19591 .m(m)
19592 .n(n)
19593 .k(k)
19594 .iterations(1)
19595 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19596 }
19597 }
19598 }
19599 }
19600
19601 TEST(F32_GEMM_4X8__NEON_DUP_LD64, strided_cm_subtile) {
19602 TEST_REQUIRES_ARM_NEON;
19603 for (size_t k = 1; k <= 10; k += 3) {
19604 for (uint32_t m = 1; m <= 4; m++) {
19605 for (uint32_t n = 1; n <= 8; n++) {
19606 GemmMicrokernelTester()
19607 .mr(4)
19608 .nr(8)
19609 .kr(1)
19610 .sr(1)
19611 .m(m)
19612 .n(n)
19613 .k(k)
19614 .cm_stride(11)
19615 .iterations(1)
19616 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19617 }
19618 }
19619 }
19620 }
19621
19622 TEST(F32_GEMM_4X8__NEON_DUP_LD64, qmin) {
19623 TEST_REQUIRES_ARM_NEON;
19624 GemmMicrokernelTester()
19625 .mr(4)
19626 .nr(8)
19627 .kr(1)
19628 .sr(1)
19629 .m(4)
19630 .n(8)
19631 .k(2)
19632 .qmin(128)
19633 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19634 }
19635
19636 TEST(F32_GEMM_4X8__NEON_DUP_LD64, qmax) {
19637 TEST_REQUIRES_ARM_NEON;
19638 GemmMicrokernelTester()
19639 .mr(4)
19640 .nr(8)
19641 .kr(1)
19642 .sr(1)
19643 .m(4)
19644 .n(8)
19645 .k(2)
19646 .qmax(128)
19647 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19648 }
19649
19650 TEST(F32_GEMM_4X8__NEON_DUP_LD64, strided_cm) {
19651 TEST_REQUIRES_ARM_NEON;
19652 GemmMicrokernelTester()
19653 .mr(4)
19654 .nr(8)
19655 .kr(1)
19656 .sr(1)
19657 .m(4)
19658 .n(8)
19659 .k(2)
19660 .cm_stride(11)
19661 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
19662 }
19663#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19664
19665
19666#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19667 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4) {
19668 TEST_REQUIRES_ARM_NEON;
19669 GemmMicrokernelTester()
19670 .mr(4)
19671 .nr(8)
19672 .kr(1)
19673 .sr(1)
19674 .m(4)
19675 .n(8)
19676 .k(4)
19677 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19678 }
19679
19680 TEST(F32_GEMM_4X8__NEON_DUP_LD128, strided_cn) {
19681 TEST_REQUIRES_ARM_NEON;
19682 GemmMicrokernelTester()
19683 .mr(4)
19684 .nr(8)
19685 .kr(1)
19686 .sr(1)
19687 .m(4)
19688 .n(8)
19689 .k(4)
19690 .cn_stride(11)
19691 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19692 }
19693
19694 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_strided_a) {
19695 TEST_REQUIRES_ARM_NEON;
19696 GemmMicrokernelTester()
19697 .mr(4)
19698 .nr(8)
19699 .kr(1)
19700 .sr(1)
19701 .m(4)
19702 .n(8)
19703 .k(4)
19704 .a_stride(7)
19705 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19706 }
19707
19708 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
19709 TEST_REQUIRES_ARM_NEON;
19710 for (uint32_t m = 1; m <= 4; m++) {
19711 for (uint32_t n = 1; n <= 8; n++) {
19712 GemmMicrokernelTester()
19713 .mr(4)
19714 .nr(8)
19715 .kr(1)
19716 .sr(1)
19717 .m(m)
19718 .n(n)
19719 .k(4)
19720 .iterations(1)
19721 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19722 }
19723 }
19724 }
19725
19726 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
19727 TEST_REQUIRES_ARM_NEON;
19728 for (uint32_t m = 1; m <= 4; m++) {
19729 GemmMicrokernelTester()
19730 .mr(4)
19731 .nr(8)
19732 .kr(1)
19733 .sr(1)
19734 .m(m)
19735 .n(8)
19736 .k(4)
19737 .iterations(1)
19738 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19739 }
19740 }
19741
19742 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
19743 TEST_REQUIRES_ARM_NEON;
19744 for (uint32_t n = 1; n <= 8; n++) {
19745 GemmMicrokernelTester()
19746 .mr(4)
19747 .nr(8)
19748 .kr(1)
19749 .sr(1)
19750 .m(4)
19751 .n(n)
19752 .k(4)
19753 .iterations(1)
19754 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19755 }
19756 }
19757
19758 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_lt_4) {
19759 TEST_REQUIRES_ARM_NEON;
19760 for (size_t k = 1; k < 4; k++) {
19761 GemmMicrokernelTester()
19762 .mr(4)
19763 .nr(8)
19764 .kr(1)
19765 .sr(1)
19766 .m(4)
19767 .n(8)
19768 .k(k)
19769 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19770 }
19771 }
19772
19773 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_lt_4_strided_a) {
19774 TEST_REQUIRES_ARM_NEON;
19775 for (size_t k = 1; k < 4; k++) {
19776 GemmMicrokernelTester()
19777 .mr(4)
19778 .nr(8)
19779 .kr(1)
19780 .sr(1)
19781 .m(4)
19782 .n(8)
19783 .k(k)
19784 .a_stride(7)
19785 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19786 }
19787 }
19788
19789 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
19790 TEST_REQUIRES_ARM_NEON;
19791 for (size_t k = 1; k < 4; k++) {
19792 for (uint32_t m = 1; m <= 4; m++) {
19793 for (uint32_t n = 1; n <= 8; n++) {
19794 GemmMicrokernelTester()
19795 .mr(4)
19796 .nr(8)
19797 .kr(1)
19798 .sr(1)
19799 .m(m)
19800 .n(n)
19801 .k(k)
19802 .iterations(1)
19803 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19804 }
19805 }
19806 }
19807 }
19808
19809 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_gt_4) {
19810 TEST_REQUIRES_ARM_NEON;
19811 for (size_t k = 5; k < 8; k++) {
19812 GemmMicrokernelTester()
19813 .mr(4)
19814 .nr(8)
19815 .kr(1)
19816 .sr(1)
19817 .m(4)
19818 .n(8)
19819 .k(k)
19820 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19821 }
19822 }
19823
19824 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_gt_4_strided_a) {
19825 TEST_REQUIRES_ARM_NEON;
19826 for (size_t k = 5; k < 8; k++) {
19827 GemmMicrokernelTester()
19828 .mr(4)
19829 .nr(8)
19830 .kr(1)
19831 .sr(1)
19832 .m(4)
19833 .n(8)
19834 .k(k)
19835 .a_stride(11)
19836 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19837 }
19838 }
19839
19840 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
19841 TEST_REQUIRES_ARM_NEON;
19842 for (size_t k = 5; k < 8; k++) {
19843 for (uint32_t m = 1; m <= 4; m++) {
19844 for (uint32_t n = 1; n <= 8; n++) {
19845 GemmMicrokernelTester()
19846 .mr(4)
19847 .nr(8)
19848 .kr(1)
19849 .sr(1)
19850 .m(m)
19851 .n(n)
19852 .k(k)
19853 .iterations(1)
19854 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19855 }
19856 }
19857 }
19858 }
19859
19860 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_div_4) {
19861 TEST_REQUIRES_ARM_NEON;
19862 for (size_t k = 8; k <= 40; k += 4) {
19863 GemmMicrokernelTester()
19864 .mr(4)
19865 .nr(8)
19866 .kr(1)
19867 .sr(1)
19868 .m(4)
19869 .n(8)
19870 .k(k)
19871 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19872 }
19873 }
19874
19875 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_div_4_strided_a) {
19876 TEST_REQUIRES_ARM_NEON;
19877 for (size_t k = 8; k <= 40; k += 4) {
19878 GemmMicrokernelTester()
19879 .mr(4)
19880 .nr(8)
19881 .kr(1)
19882 .sr(1)
19883 .m(4)
19884 .n(8)
19885 .k(k)
19886 .a_stride(43)
19887 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19888 }
19889 }
19890
19891 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_div_4_subtile) {
19892 TEST_REQUIRES_ARM_NEON;
19893 for (size_t k = 8; k <= 40; k += 4) {
19894 for (uint32_t m = 1; m <= 4; m++) {
19895 for (uint32_t n = 1; n <= 8; n++) {
19896 GemmMicrokernelTester()
19897 .mr(4)
19898 .nr(8)
19899 .kr(1)
19900 .sr(1)
19901 .m(m)
19902 .n(n)
19903 .k(k)
19904 .iterations(1)
19905 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19906 }
19907 }
19908 }
19909 }
19910
19911 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8) {
19912 TEST_REQUIRES_ARM_NEON;
19913 for (uint32_t n = 9; n < 16; n++) {
19914 for (size_t k = 1; k <= 20; k += 5) {
19915 GemmMicrokernelTester()
19916 .mr(4)
19917 .nr(8)
19918 .kr(1)
19919 .sr(1)
19920 .m(4)
19921 .n(8)
19922 .k(k)
19923 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19924 }
19925 }
19926 }
19927
19928 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
19929 TEST_REQUIRES_ARM_NEON;
19930 for (uint32_t n = 9; n < 16; n++) {
19931 for (size_t k = 1; k <= 20; k += 5) {
19932 GemmMicrokernelTester()
19933 .mr(4)
19934 .nr(8)
19935 .kr(1)
19936 .sr(1)
19937 .m(4)
19938 .n(8)
19939 .k(k)
19940 .cn_stride(11)
19941 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19942 }
19943 }
19944 }
19945
19946 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8_strided_a) {
19947 TEST_REQUIRES_ARM_NEON;
19948 for (uint32_t n = 9; n < 16; n++) {
19949 for (size_t k = 1; k <= 20; k += 5) {
19950 GemmMicrokernelTester()
19951 .mr(4)
19952 .nr(8)
19953 .kr(1)
19954 .sr(1)
19955 .m(4)
19956 .n(n)
19957 .k(k)
19958 .a_stride(23)
19959 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19960 }
19961 }
19962 }
19963
19964 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
19965 TEST_REQUIRES_ARM_NEON;
19966 for (uint32_t n = 9; n < 16; n++) {
19967 for (size_t k = 1; k <= 20; k += 5) {
19968 for (uint32_t m = 1; m <= 4; m++) {
19969 GemmMicrokernelTester()
19970 .mr(4)
19971 .nr(8)
19972 .kr(1)
19973 .sr(1)
19974 .m(m)
19975 .n(n)
19976 .k(k)
19977 .iterations(1)
19978 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19979 }
19980 }
19981 }
19982 }
19983
19984 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8) {
19985 TEST_REQUIRES_ARM_NEON;
19986 for (uint32_t n = 16; n <= 24; n += 8) {
19987 for (size_t k = 1; k <= 20; k += 5) {
19988 GemmMicrokernelTester()
19989 .mr(4)
19990 .nr(8)
19991 .kr(1)
19992 .sr(1)
19993 .m(4)
19994 .n(8)
19995 .k(k)
19996 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
19997 }
19998 }
19999 }
20000
20001 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
20002 TEST_REQUIRES_ARM_NEON;
20003 for (uint32_t n = 16; n <= 24; n += 8) {
20004 for (size_t k = 1; k <= 20; k += 5) {
20005 GemmMicrokernelTester()
20006 .mr(4)
20007 .nr(8)
20008 .kr(1)
20009 .sr(1)
20010 .m(4)
20011 .n(n)
20012 .k(k)
20013 .cn_stride(11)
20014 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20015 }
20016 }
20017 }
20018
20019 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8_strided_a) {
20020 TEST_REQUIRES_ARM_NEON;
20021 for (uint32_t n = 16; n <= 24; n += 8) {
20022 for (size_t k = 1; k <= 20; k += 5) {
20023 GemmMicrokernelTester()
20024 .mr(4)
20025 .nr(8)
20026 .kr(1)
20027 .sr(1)
20028 .m(4)
20029 .n(n)
20030 .k(k)
20031 .a_stride(23)
20032 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20033 }
20034 }
20035 }
20036
20037 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8_subtile) {
20038 TEST_REQUIRES_ARM_NEON;
20039 for (uint32_t n = 16; n <= 24; n += 8) {
20040 for (size_t k = 1; k <= 20; k += 5) {
20041 for (uint32_t m = 1; m <= 4; m++) {
20042 GemmMicrokernelTester()
20043 .mr(4)
20044 .nr(8)
20045 .kr(1)
20046 .sr(1)
20047 .m(m)
20048 .n(n)
20049 .k(k)
20050 .iterations(1)
20051 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20052 }
20053 }
20054 }
20055 }
20056
20057 TEST(F32_GEMM_4X8__NEON_DUP_LD128, strided_cm_subtile) {
20058 TEST_REQUIRES_ARM_NEON;
20059 for (size_t k = 1; k <= 20; k += 5) {
20060 for (uint32_t m = 1; m <= 4; m++) {
20061 for (uint32_t n = 1; n <= 8; n++) {
20062 GemmMicrokernelTester()
20063 .mr(4)
20064 .nr(8)
20065 .kr(1)
20066 .sr(1)
20067 .m(m)
20068 .n(n)
20069 .k(k)
20070 .cm_stride(11)
20071 .iterations(1)
20072 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20073 }
20074 }
20075 }
20076 }
20077
20078 TEST(F32_GEMM_4X8__NEON_DUP_LD128, qmin) {
20079 TEST_REQUIRES_ARM_NEON;
20080 GemmMicrokernelTester()
20081 .mr(4)
20082 .nr(8)
20083 .kr(1)
20084 .sr(1)
20085 .m(4)
20086 .n(8)
20087 .k(4)
20088 .qmin(128)
20089 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20090 }
20091
20092 TEST(F32_GEMM_4X8__NEON_DUP_LD128, qmax) {
20093 TEST_REQUIRES_ARM_NEON;
20094 GemmMicrokernelTester()
20095 .mr(4)
20096 .nr(8)
20097 .kr(1)
20098 .sr(1)
20099 .m(4)
20100 .n(8)
20101 .k(4)
20102 .qmax(128)
20103 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20104 }
20105
20106 TEST(F32_GEMM_4X8__NEON_DUP_LD128, strided_cm) {
20107 TEST_REQUIRES_ARM_NEON;
20108 GemmMicrokernelTester()
20109 .mr(4)
20110 .nr(8)
20111 .kr(1)
20112 .sr(1)
20113 .m(4)
20114 .n(8)
20115 .k(4)
20116 .cm_stride(11)
20117 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
20118 }
20119#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20120
20121
20122#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20123 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2) {
20124 TEST_REQUIRES_ARM_NEON;
20125 GemmMicrokernelTester()
20126 .mr(6)
20127 .nr(8)
20128 .kr(1)
20129 .sr(1)
20130 .m(6)
20131 .n(8)
20132 .k(2)
20133 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20134 }
20135
20136 TEST(F32_GEMM_6X8__NEON_DUP_LD64, strided_cn) {
20137 TEST_REQUIRES_ARM_NEON;
20138 GemmMicrokernelTester()
20139 .mr(6)
20140 .nr(8)
20141 .kr(1)
20142 .sr(1)
20143 .m(6)
20144 .n(8)
20145 .k(2)
20146 .cn_stride(11)
20147 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20148 }
20149
20150 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_strided_a) {
20151 TEST_REQUIRES_ARM_NEON;
20152 GemmMicrokernelTester()
20153 .mr(6)
20154 .nr(8)
20155 .kr(1)
20156 .sr(1)
20157 .m(6)
20158 .n(8)
20159 .k(2)
20160 .a_stride(5)
20161 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20162 }
20163
20164 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
20165 TEST_REQUIRES_ARM_NEON;
20166 for (uint32_t m = 1; m <= 6; m++) {
20167 for (uint32_t n = 1; n <= 8; n++) {
20168 GemmMicrokernelTester()
20169 .mr(6)
20170 .nr(8)
20171 .kr(1)
20172 .sr(1)
20173 .m(m)
20174 .n(n)
20175 .k(2)
20176 .iterations(1)
20177 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20178 }
20179 }
20180 }
20181
20182 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
20183 TEST_REQUIRES_ARM_NEON;
20184 for (uint32_t m = 1; m <= 6; m++) {
20185 GemmMicrokernelTester()
20186 .mr(6)
20187 .nr(8)
20188 .kr(1)
20189 .sr(1)
20190 .m(m)
20191 .n(8)
20192 .k(2)
20193 .iterations(1)
20194 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20195 }
20196 }
20197
20198 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
20199 TEST_REQUIRES_ARM_NEON;
20200 for (uint32_t n = 1; n <= 8; n++) {
20201 GemmMicrokernelTester()
20202 .mr(6)
20203 .nr(8)
20204 .kr(1)
20205 .sr(1)
20206 .m(6)
20207 .n(n)
20208 .k(2)
20209 .iterations(1)
20210 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20211 }
20212 }
20213
20214 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_lt_2) {
20215 TEST_REQUIRES_ARM_NEON;
20216 for (size_t k = 1; k < 2; k++) {
20217 GemmMicrokernelTester()
20218 .mr(6)
20219 .nr(8)
20220 .kr(1)
20221 .sr(1)
20222 .m(6)
20223 .n(8)
20224 .k(k)
20225 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20226 }
20227 }
20228
20229 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_lt_2_strided_a) {
20230 TEST_REQUIRES_ARM_NEON;
20231 for (size_t k = 1; k < 2; k++) {
20232 GemmMicrokernelTester()
20233 .mr(6)
20234 .nr(8)
20235 .kr(1)
20236 .sr(1)
20237 .m(6)
20238 .n(8)
20239 .k(k)
20240 .a_stride(5)
20241 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20242 }
20243 }
20244
20245 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
20246 TEST_REQUIRES_ARM_NEON;
20247 for (size_t k = 1; k < 2; k++) {
20248 for (uint32_t m = 1; m <= 6; m++) {
20249 for (uint32_t n = 1; n <= 8; n++) {
20250 GemmMicrokernelTester()
20251 .mr(6)
20252 .nr(8)
20253 .kr(1)
20254 .sr(1)
20255 .m(m)
20256 .n(n)
20257 .k(k)
20258 .iterations(1)
20259 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20260 }
20261 }
20262 }
20263 }
20264
20265 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_gt_2) {
20266 TEST_REQUIRES_ARM_NEON;
20267 for (size_t k = 3; k < 4; k++) {
20268 GemmMicrokernelTester()
20269 .mr(6)
20270 .nr(8)
20271 .kr(1)
20272 .sr(1)
20273 .m(6)
20274 .n(8)
20275 .k(k)
20276 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20277 }
20278 }
20279
20280 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_gt_2_strided_a) {
20281 TEST_REQUIRES_ARM_NEON;
20282 for (size_t k = 3; k < 4; k++) {
20283 GemmMicrokernelTester()
20284 .mr(6)
20285 .nr(8)
20286 .kr(1)
20287 .sr(1)
20288 .m(6)
20289 .n(8)
20290 .k(k)
20291 .a_stride(7)
20292 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20293 }
20294 }
20295
20296 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
20297 TEST_REQUIRES_ARM_NEON;
20298 for (size_t k = 3; k < 4; k++) {
20299 for (uint32_t m = 1; m <= 6; m++) {
20300 for (uint32_t n = 1; n <= 8; n++) {
20301 GemmMicrokernelTester()
20302 .mr(6)
20303 .nr(8)
20304 .kr(1)
20305 .sr(1)
20306 .m(m)
20307 .n(n)
20308 .k(k)
20309 .iterations(1)
20310 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20311 }
20312 }
20313 }
20314 }
20315
20316 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_div_2) {
20317 TEST_REQUIRES_ARM_NEON;
20318 for (size_t k = 4; k <= 20; k += 2) {
20319 GemmMicrokernelTester()
20320 .mr(6)
20321 .nr(8)
20322 .kr(1)
20323 .sr(1)
20324 .m(6)
20325 .n(8)
20326 .k(k)
20327 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20328 }
20329 }
20330
20331 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_div_2_strided_a) {
20332 TEST_REQUIRES_ARM_NEON;
20333 for (size_t k = 4; k <= 20; k += 2) {
20334 GemmMicrokernelTester()
20335 .mr(6)
20336 .nr(8)
20337 .kr(1)
20338 .sr(1)
20339 .m(6)
20340 .n(8)
20341 .k(k)
20342 .a_stride(23)
20343 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20344 }
20345 }
20346
20347 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_div_2_subtile) {
20348 TEST_REQUIRES_ARM_NEON;
20349 for (size_t k = 4; k <= 20; k += 2) {
20350 for (uint32_t m = 1; m <= 6; m++) {
20351 for (uint32_t n = 1; n <= 8; n++) {
20352 GemmMicrokernelTester()
20353 .mr(6)
20354 .nr(8)
20355 .kr(1)
20356 .sr(1)
20357 .m(m)
20358 .n(n)
20359 .k(k)
20360 .iterations(1)
20361 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20362 }
20363 }
20364 }
20365 }
20366
20367 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8) {
20368 TEST_REQUIRES_ARM_NEON;
20369 for (uint32_t n = 9; n < 16; n++) {
20370 for (size_t k = 1; k <= 10; k += 3) {
20371 GemmMicrokernelTester()
20372 .mr(6)
20373 .nr(8)
20374 .kr(1)
20375 .sr(1)
20376 .m(6)
20377 .n(8)
20378 .k(k)
20379 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20380 }
20381 }
20382 }
20383
20384 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
20385 TEST_REQUIRES_ARM_NEON;
20386 for (uint32_t n = 9; n < 16; n++) {
20387 for (size_t k = 1; k <= 10; k += 3) {
20388 GemmMicrokernelTester()
20389 .mr(6)
20390 .nr(8)
20391 .kr(1)
20392 .sr(1)
20393 .m(6)
20394 .n(8)
20395 .k(k)
20396 .cn_stride(11)
20397 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20398 }
20399 }
20400 }
20401
20402 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8_strided_a) {
20403 TEST_REQUIRES_ARM_NEON;
20404 for (uint32_t n = 9; n < 16; n++) {
20405 for (size_t k = 1; k <= 10; k += 3) {
20406 GemmMicrokernelTester()
20407 .mr(6)
20408 .nr(8)
20409 .kr(1)
20410 .sr(1)
20411 .m(6)
20412 .n(n)
20413 .k(k)
20414 .a_stride(13)
20415 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20416 }
20417 }
20418 }
20419
20420 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
20421 TEST_REQUIRES_ARM_NEON;
20422 for (uint32_t n = 9; n < 16; n++) {
20423 for (size_t k = 1; k <= 10; k += 3) {
20424 for (uint32_t m = 1; m <= 6; m++) {
20425 GemmMicrokernelTester()
20426 .mr(6)
20427 .nr(8)
20428 .kr(1)
20429 .sr(1)
20430 .m(m)
20431 .n(n)
20432 .k(k)
20433 .iterations(1)
20434 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20435 }
20436 }
20437 }
20438 }
20439
20440 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8) {
20441 TEST_REQUIRES_ARM_NEON;
20442 for (uint32_t n = 16; n <= 24; n += 8) {
20443 for (size_t k = 1; k <= 10; k += 3) {
20444 GemmMicrokernelTester()
20445 .mr(6)
20446 .nr(8)
20447 .kr(1)
20448 .sr(1)
20449 .m(6)
20450 .n(8)
20451 .k(k)
20452 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20453 }
20454 }
20455 }
20456
20457 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
20458 TEST_REQUIRES_ARM_NEON;
20459 for (uint32_t n = 16; n <= 24; n += 8) {
20460 for (size_t k = 1; k <= 10; k += 3) {
20461 GemmMicrokernelTester()
20462 .mr(6)
20463 .nr(8)
20464 .kr(1)
20465 .sr(1)
20466 .m(6)
20467 .n(n)
20468 .k(k)
20469 .cn_stride(11)
20470 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20471 }
20472 }
20473 }
20474
20475 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8_strided_a) {
20476 TEST_REQUIRES_ARM_NEON;
20477 for (uint32_t n = 16; n <= 24; n += 8) {
20478 for (size_t k = 1; k <= 10; k += 3) {
20479 GemmMicrokernelTester()
20480 .mr(6)
20481 .nr(8)
20482 .kr(1)
20483 .sr(1)
20484 .m(6)
20485 .n(n)
20486 .k(k)
20487 .a_stride(13)
20488 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20489 }
20490 }
20491 }
20492
20493 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8_subtile) {
20494 TEST_REQUIRES_ARM_NEON;
20495 for (uint32_t n = 16; n <= 24; n += 8) {
20496 for (size_t k = 1; k <= 10; k += 3) {
20497 for (uint32_t m = 1; m <= 6; m++) {
20498 GemmMicrokernelTester()
20499 .mr(6)
20500 .nr(8)
20501 .kr(1)
20502 .sr(1)
20503 .m(m)
20504 .n(n)
20505 .k(k)
20506 .iterations(1)
20507 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20508 }
20509 }
20510 }
20511 }
20512
20513 TEST(F32_GEMM_6X8__NEON_DUP_LD64, strided_cm_subtile) {
20514 TEST_REQUIRES_ARM_NEON;
20515 for (size_t k = 1; k <= 10; k += 3) {
20516 for (uint32_t m = 1; m <= 6; m++) {
20517 for (uint32_t n = 1; n <= 8; n++) {
20518 GemmMicrokernelTester()
20519 .mr(6)
20520 .nr(8)
20521 .kr(1)
20522 .sr(1)
20523 .m(m)
20524 .n(n)
20525 .k(k)
20526 .cm_stride(11)
20527 .iterations(1)
20528 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20529 }
20530 }
20531 }
20532 }
20533
20534 TEST(F32_GEMM_6X8__NEON_DUP_LD64, qmin) {
20535 TEST_REQUIRES_ARM_NEON;
20536 GemmMicrokernelTester()
20537 .mr(6)
20538 .nr(8)
20539 .kr(1)
20540 .sr(1)
20541 .m(6)
20542 .n(8)
20543 .k(2)
20544 .qmin(128)
20545 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20546 }
20547
20548 TEST(F32_GEMM_6X8__NEON_DUP_LD64, qmax) {
20549 TEST_REQUIRES_ARM_NEON;
20550 GemmMicrokernelTester()
20551 .mr(6)
20552 .nr(8)
20553 .kr(1)
20554 .sr(1)
20555 .m(6)
20556 .n(8)
20557 .k(2)
20558 .qmax(128)
20559 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20560 }
20561
20562 TEST(F32_GEMM_6X8__NEON_DUP_LD64, strided_cm) {
20563 TEST_REQUIRES_ARM_NEON;
20564 GemmMicrokernelTester()
20565 .mr(6)
20566 .nr(8)
20567 .kr(1)
20568 .sr(1)
20569 .m(6)
20570 .n(8)
20571 .k(2)
20572 .cm_stride(11)
20573 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
20574 }
20575#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20576
20577
20578#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20579 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4) {
20580 TEST_REQUIRES_ARM_NEON;
20581 GemmMicrokernelTester()
20582 .mr(6)
20583 .nr(8)
20584 .kr(1)
20585 .sr(1)
20586 .m(6)
20587 .n(8)
20588 .k(4)
20589 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20590 }
20591
20592 TEST(F32_GEMM_6X8__NEON_DUP_LD128, strided_cn) {
20593 TEST_REQUIRES_ARM_NEON;
20594 GemmMicrokernelTester()
20595 .mr(6)
20596 .nr(8)
20597 .kr(1)
20598 .sr(1)
20599 .m(6)
20600 .n(8)
20601 .k(4)
20602 .cn_stride(11)
20603 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20604 }
20605
20606 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_strided_a) {
20607 TEST_REQUIRES_ARM_NEON;
20608 GemmMicrokernelTester()
20609 .mr(6)
20610 .nr(8)
20611 .kr(1)
20612 .sr(1)
20613 .m(6)
20614 .n(8)
20615 .k(4)
20616 .a_stride(7)
20617 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20618 }
20619
20620 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile) {
20621 TEST_REQUIRES_ARM_NEON;
20622 for (uint32_t m = 1; m <= 6; m++) {
20623 for (uint32_t n = 1; n <= 8; n++) {
20624 GemmMicrokernelTester()
20625 .mr(6)
20626 .nr(8)
20627 .kr(1)
20628 .sr(1)
20629 .m(m)
20630 .n(n)
20631 .k(4)
20632 .iterations(1)
20633 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20634 }
20635 }
20636 }
20637
20638 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
20639 TEST_REQUIRES_ARM_NEON;
20640 for (uint32_t m = 1; m <= 6; m++) {
20641 GemmMicrokernelTester()
20642 .mr(6)
20643 .nr(8)
20644 .kr(1)
20645 .sr(1)
20646 .m(m)
20647 .n(8)
20648 .k(4)
20649 .iterations(1)
20650 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20651 }
20652 }
20653
20654 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
20655 TEST_REQUIRES_ARM_NEON;
20656 for (uint32_t n = 1; n <= 8; n++) {
20657 GemmMicrokernelTester()
20658 .mr(6)
20659 .nr(8)
20660 .kr(1)
20661 .sr(1)
20662 .m(6)
20663 .n(n)
20664 .k(4)
20665 .iterations(1)
20666 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20667 }
20668 }
20669
20670 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_lt_4) {
20671 TEST_REQUIRES_ARM_NEON;
20672 for (size_t k = 1; k < 4; k++) {
20673 GemmMicrokernelTester()
20674 .mr(6)
20675 .nr(8)
20676 .kr(1)
20677 .sr(1)
20678 .m(6)
20679 .n(8)
20680 .k(k)
20681 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20682 }
20683 }
20684
20685 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_lt_4_strided_a) {
20686 TEST_REQUIRES_ARM_NEON;
20687 for (size_t k = 1; k < 4; k++) {
20688 GemmMicrokernelTester()
20689 .mr(6)
20690 .nr(8)
20691 .kr(1)
20692 .sr(1)
20693 .m(6)
20694 .n(8)
20695 .k(k)
20696 .a_stride(7)
20697 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20698 }
20699 }
20700
20701 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_lt_4_subtile) {
20702 TEST_REQUIRES_ARM_NEON;
20703 for (size_t k = 1; k < 4; k++) {
20704 for (uint32_t m = 1; m <= 6; m++) {
20705 for (uint32_t n = 1; n <= 8; n++) {
20706 GemmMicrokernelTester()
20707 .mr(6)
20708 .nr(8)
20709 .kr(1)
20710 .sr(1)
20711 .m(m)
20712 .n(n)
20713 .k(k)
20714 .iterations(1)
20715 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20716 }
20717 }
20718 }
20719 }
20720
20721 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_gt_4) {
20722 TEST_REQUIRES_ARM_NEON;
20723 for (size_t k = 5; k < 8; k++) {
20724 GemmMicrokernelTester()
20725 .mr(6)
20726 .nr(8)
20727 .kr(1)
20728 .sr(1)
20729 .m(6)
20730 .n(8)
20731 .k(k)
20732 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20733 }
20734 }
20735
20736 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_gt_4_strided_a) {
20737 TEST_REQUIRES_ARM_NEON;
20738 for (size_t k = 5; k < 8; k++) {
20739 GemmMicrokernelTester()
20740 .mr(6)
20741 .nr(8)
20742 .kr(1)
20743 .sr(1)
20744 .m(6)
20745 .n(8)
20746 .k(k)
20747 .a_stride(11)
20748 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20749 }
20750 }
20751
20752 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_gt_4_subtile) {
20753 TEST_REQUIRES_ARM_NEON;
20754 for (size_t k = 5; k < 8; k++) {
20755 for (uint32_t m = 1; m <= 6; m++) {
20756 for (uint32_t n = 1; n <= 8; n++) {
20757 GemmMicrokernelTester()
20758 .mr(6)
20759 .nr(8)
20760 .kr(1)
20761 .sr(1)
20762 .m(m)
20763 .n(n)
20764 .k(k)
20765 .iterations(1)
20766 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20767 }
20768 }
20769 }
20770 }
20771
20772 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_div_4) {
20773 TEST_REQUIRES_ARM_NEON;
20774 for (size_t k = 8; k <= 40; k += 4) {
20775 GemmMicrokernelTester()
20776 .mr(6)
20777 .nr(8)
20778 .kr(1)
20779 .sr(1)
20780 .m(6)
20781 .n(8)
20782 .k(k)
20783 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20784 }
20785 }
20786
20787 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_div_4_strided_a) {
20788 TEST_REQUIRES_ARM_NEON;
20789 for (size_t k = 8; k <= 40; k += 4) {
20790 GemmMicrokernelTester()
20791 .mr(6)
20792 .nr(8)
20793 .kr(1)
20794 .sr(1)
20795 .m(6)
20796 .n(8)
20797 .k(k)
20798 .a_stride(43)
20799 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20800 }
20801 }
20802
20803 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_div_4_subtile) {
20804 TEST_REQUIRES_ARM_NEON;
20805 for (size_t k = 8; k <= 40; k += 4) {
20806 for (uint32_t m = 1; m <= 6; m++) {
20807 for (uint32_t n = 1; n <= 8; n++) {
20808 GemmMicrokernelTester()
20809 .mr(6)
20810 .nr(8)
20811 .kr(1)
20812 .sr(1)
20813 .m(m)
20814 .n(n)
20815 .k(k)
20816 .iterations(1)
20817 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20818 }
20819 }
20820 }
20821 }
20822
20823 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8) {
20824 TEST_REQUIRES_ARM_NEON;
20825 for (uint32_t n = 9; n < 16; n++) {
20826 for (size_t k = 1; k <= 20; k += 5) {
20827 GemmMicrokernelTester()
20828 .mr(6)
20829 .nr(8)
20830 .kr(1)
20831 .sr(1)
20832 .m(6)
20833 .n(8)
20834 .k(k)
20835 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20836 }
20837 }
20838 }
20839
20840 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
20841 TEST_REQUIRES_ARM_NEON;
20842 for (uint32_t n = 9; n < 16; n++) {
20843 for (size_t k = 1; k <= 20; k += 5) {
20844 GemmMicrokernelTester()
20845 .mr(6)
20846 .nr(8)
20847 .kr(1)
20848 .sr(1)
20849 .m(6)
20850 .n(8)
20851 .k(k)
20852 .cn_stride(11)
20853 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20854 }
20855 }
20856 }
20857
20858 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8_strided_a) {
20859 TEST_REQUIRES_ARM_NEON;
20860 for (uint32_t n = 9; n < 16; n++) {
20861 for (size_t k = 1; k <= 20; k += 5) {
20862 GemmMicrokernelTester()
20863 .mr(6)
20864 .nr(8)
20865 .kr(1)
20866 .sr(1)
20867 .m(6)
20868 .n(n)
20869 .k(k)
20870 .a_stride(23)
20871 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20872 }
20873 }
20874 }
20875
20876 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8_subtile) {
20877 TEST_REQUIRES_ARM_NEON;
20878 for (uint32_t n = 9; n < 16; n++) {
20879 for (size_t k = 1; k <= 20; k += 5) {
20880 for (uint32_t m = 1; m <= 6; m++) {
20881 GemmMicrokernelTester()
20882 .mr(6)
20883 .nr(8)
20884 .kr(1)
20885 .sr(1)
20886 .m(m)
20887 .n(n)
20888 .k(k)
20889 .iterations(1)
20890 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20891 }
20892 }
20893 }
20894 }
20895
20896 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8) {
20897 TEST_REQUIRES_ARM_NEON;
20898 for (uint32_t n = 16; n <= 24; n += 8) {
20899 for (size_t k = 1; k <= 20; k += 5) {
20900 GemmMicrokernelTester()
20901 .mr(6)
20902 .nr(8)
20903 .kr(1)
20904 .sr(1)
20905 .m(6)
20906 .n(8)
20907 .k(k)
20908 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20909 }
20910 }
20911 }
20912
20913 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8_strided_cn) {
20914 TEST_REQUIRES_ARM_NEON;
20915 for (uint32_t n = 16; n <= 24; n += 8) {
20916 for (size_t k = 1; k <= 20; k += 5) {
20917 GemmMicrokernelTester()
20918 .mr(6)
20919 .nr(8)
20920 .kr(1)
20921 .sr(1)
20922 .m(6)
20923 .n(n)
20924 .k(k)
20925 .cn_stride(11)
20926 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20927 }
20928 }
20929 }
20930
20931 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8_strided_a) {
20932 TEST_REQUIRES_ARM_NEON;
20933 for (uint32_t n = 16; n <= 24; n += 8) {
20934 for (size_t k = 1; k <= 20; k += 5) {
20935 GemmMicrokernelTester()
20936 .mr(6)
20937 .nr(8)
20938 .kr(1)
20939 .sr(1)
20940 .m(6)
20941 .n(n)
20942 .k(k)
20943 .a_stride(23)
20944 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20945 }
20946 }
20947 }
20948
20949 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8_subtile) {
20950 TEST_REQUIRES_ARM_NEON;
20951 for (uint32_t n = 16; n <= 24; n += 8) {
20952 for (size_t k = 1; k <= 20; k += 5) {
20953 for (uint32_t m = 1; m <= 6; m++) {
20954 GemmMicrokernelTester()
20955 .mr(6)
20956 .nr(8)
20957 .kr(1)
20958 .sr(1)
20959 .m(m)
20960 .n(n)
20961 .k(k)
20962 .iterations(1)
20963 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20964 }
20965 }
20966 }
20967 }
20968
20969 TEST(F32_GEMM_6X8__NEON_DUP_LD128, strided_cm_subtile) {
20970 TEST_REQUIRES_ARM_NEON;
20971 for (size_t k = 1; k <= 20; k += 5) {
20972 for (uint32_t m = 1; m <= 6; m++) {
20973 for (uint32_t n = 1; n <= 8; n++) {
20974 GemmMicrokernelTester()
20975 .mr(6)
20976 .nr(8)
20977 .kr(1)
20978 .sr(1)
20979 .m(m)
20980 .n(n)
20981 .k(k)
20982 .cm_stride(11)
20983 .iterations(1)
20984 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
20985 }
20986 }
20987 }
20988 }
20989
20990 TEST(F32_GEMM_6X8__NEON_DUP_LD128, qmin) {
20991 TEST_REQUIRES_ARM_NEON;
20992 GemmMicrokernelTester()
20993 .mr(6)
20994 .nr(8)
20995 .kr(1)
20996 .sr(1)
20997 .m(6)
20998 .n(8)
20999 .k(4)
21000 .qmin(128)
21001 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
21002 }
21003
21004 TEST(F32_GEMM_6X8__NEON_DUP_LD128, qmax) {
21005 TEST_REQUIRES_ARM_NEON;
21006 GemmMicrokernelTester()
21007 .mr(6)
21008 .nr(8)
21009 .kr(1)
21010 .sr(1)
21011 .m(6)
21012 .n(8)
21013 .k(4)
21014 .qmax(128)
21015 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
21016 }
21017
21018 TEST(F32_GEMM_6X8__NEON_DUP_LD128, strided_cm) {
21019 TEST_REQUIRES_ARM_NEON;
21020 GemmMicrokernelTester()
21021 .mr(6)
21022 .nr(8)
21023 .kr(1)
21024 .sr(1)
21025 .m(6)
21026 .n(8)
21027 .k(4)
21028 .cm_stride(11)
21029 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
21030 }
21031#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21032
21033
21034#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21035 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2) {
21036 TEST_REQUIRES_ARM_NEON_FMA;
21037 GemmMicrokernelTester()
21038 .mr(1)
21039 .nr(8)
21040 .kr(1)
21041 .sr(1)
21042 .m(1)
21043 .n(8)
21044 .k(2)
21045 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21046 }
21047
21048 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, strided_cn) {
21049 TEST_REQUIRES_ARM_NEON_FMA;
21050 GemmMicrokernelTester()
21051 .mr(1)
21052 .nr(8)
21053 .kr(1)
21054 .sr(1)
21055 .m(1)
21056 .n(8)
21057 .k(2)
21058 .cn_stride(11)
21059 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21060 }
21061
21062 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
21063 TEST_REQUIRES_ARM_NEON_FMA;
21064 GemmMicrokernelTester()
21065 .mr(1)
21066 .nr(8)
21067 .kr(1)
21068 .sr(1)
21069 .m(1)
21070 .n(8)
21071 .k(2)
21072 .a_stride(5)
21073 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21074 }
21075
21076 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
21077 TEST_REQUIRES_ARM_NEON_FMA;
21078 for (uint32_t m = 1; m <= 1; m++) {
21079 for (uint32_t n = 1; n <= 8; n++) {
21080 GemmMicrokernelTester()
21081 .mr(1)
21082 .nr(8)
21083 .kr(1)
21084 .sr(1)
21085 .m(m)
21086 .n(n)
21087 .k(2)
21088 .iterations(1)
21089 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21090 }
21091 }
21092 }
21093
21094 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
21095 TEST_REQUIRES_ARM_NEON_FMA;
21096 for (uint32_t m = 1; m <= 1; m++) {
21097 GemmMicrokernelTester()
21098 .mr(1)
21099 .nr(8)
21100 .kr(1)
21101 .sr(1)
21102 .m(m)
21103 .n(8)
21104 .k(2)
21105 .iterations(1)
21106 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21107 }
21108 }
21109
21110 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
21111 TEST_REQUIRES_ARM_NEON_FMA;
21112 for (uint32_t n = 1; n <= 8; n++) {
21113 GemmMicrokernelTester()
21114 .mr(1)
21115 .nr(8)
21116 .kr(1)
21117 .sr(1)
21118 .m(1)
21119 .n(n)
21120 .k(2)
21121 .iterations(1)
21122 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21123 }
21124 }
21125
21126 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_lt_2) {
21127 TEST_REQUIRES_ARM_NEON_FMA;
21128 for (size_t k = 1; k < 2; k++) {
21129 GemmMicrokernelTester()
21130 .mr(1)
21131 .nr(8)
21132 .kr(1)
21133 .sr(1)
21134 .m(1)
21135 .n(8)
21136 .k(k)
21137 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21138 }
21139 }
21140
21141 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
21142 TEST_REQUIRES_ARM_NEON_FMA;
21143 for (size_t k = 1; k < 2; k++) {
21144 GemmMicrokernelTester()
21145 .mr(1)
21146 .nr(8)
21147 .kr(1)
21148 .sr(1)
21149 .m(1)
21150 .n(8)
21151 .k(k)
21152 .a_stride(5)
21153 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21154 }
21155 }
21156
21157 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
21158 TEST_REQUIRES_ARM_NEON_FMA;
21159 for (size_t k = 1; k < 2; k++) {
21160 for (uint32_t m = 1; m <= 1; m++) {
21161 for (uint32_t n = 1; n <= 8; n++) {
21162 GemmMicrokernelTester()
21163 .mr(1)
21164 .nr(8)
21165 .kr(1)
21166 .sr(1)
21167 .m(m)
21168 .n(n)
21169 .k(k)
21170 .iterations(1)
21171 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21172 }
21173 }
21174 }
21175 }
21176
21177 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_gt_2) {
21178 TEST_REQUIRES_ARM_NEON_FMA;
21179 for (size_t k = 3; k < 4; k++) {
21180 GemmMicrokernelTester()
21181 .mr(1)
21182 .nr(8)
21183 .kr(1)
21184 .sr(1)
21185 .m(1)
21186 .n(8)
21187 .k(k)
21188 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21189 }
21190 }
21191
21192 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
21193 TEST_REQUIRES_ARM_NEON_FMA;
21194 for (size_t k = 3; k < 4; k++) {
21195 GemmMicrokernelTester()
21196 .mr(1)
21197 .nr(8)
21198 .kr(1)
21199 .sr(1)
21200 .m(1)
21201 .n(8)
21202 .k(k)
21203 .a_stride(7)
21204 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21205 }
21206 }
21207
21208 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
21209 TEST_REQUIRES_ARM_NEON_FMA;
21210 for (size_t k = 3; k < 4; k++) {
21211 for (uint32_t m = 1; m <= 1; m++) {
21212 for (uint32_t n = 1; n <= 8; n++) {
21213 GemmMicrokernelTester()
21214 .mr(1)
21215 .nr(8)
21216 .kr(1)
21217 .sr(1)
21218 .m(m)
21219 .n(n)
21220 .k(k)
21221 .iterations(1)
21222 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21223 }
21224 }
21225 }
21226 }
21227
21228 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_div_2) {
21229 TEST_REQUIRES_ARM_NEON_FMA;
21230 for (size_t k = 4; k <= 20; k += 2) {
21231 GemmMicrokernelTester()
21232 .mr(1)
21233 .nr(8)
21234 .kr(1)
21235 .sr(1)
21236 .m(1)
21237 .n(8)
21238 .k(k)
21239 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21240 }
21241 }
21242
21243 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
21244 TEST_REQUIRES_ARM_NEON_FMA;
21245 for (size_t k = 4; k <= 20; k += 2) {
21246 GemmMicrokernelTester()
21247 .mr(1)
21248 .nr(8)
21249 .kr(1)
21250 .sr(1)
21251 .m(1)
21252 .n(8)
21253 .k(k)
21254 .a_stride(23)
21255 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21256 }
21257 }
21258
21259 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
21260 TEST_REQUIRES_ARM_NEON_FMA;
21261 for (size_t k = 4; k <= 20; k += 2) {
21262 for (uint32_t m = 1; m <= 1; m++) {
21263 for (uint32_t n = 1; n <= 8; n++) {
21264 GemmMicrokernelTester()
21265 .mr(1)
21266 .nr(8)
21267 .kr(1)
21268 .sr(1)
21269 .m(m)
21270 .n(n)
21271 .k(k)
21272 .iterations(1)
21273 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21274 }
21275 }
21276 }
21277 }
21278
21279 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8) {
21280 TEST_REQUIRES_ARM_NEON_FMA;
21281 for (uint32_t n = 9; n < 16; n++) {
21282 for (size_t k = 1; k <= 10; k += 3) {
21283 GemmMicrokernelTester()
21284 .mr(1)
21285 .nr(8)
21286 .kr(1)
21287 .sr(1)
21288 .m(1)
21289 .n(8)
21290 .k(k)
21291 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21292 }
21293 }
21294 }
21295
21296 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
21297 TEST_REQUIRES_ARM_NEON_FMA;
21298 for (uint32_t n = 9; n < 16; n++) {
21299 for (size_t k = 1; k <= 10; k += 3) {
21300 GemmMicrokernelTester()
21301 .mr(1)
21302 .nr(8)
21303 .kr(1)
21304 .sr(1)
21305 .m(1)
21306 .n(8)
21307 .k(k)
21308 .cn_stride(11)
21309 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21310 }
21311 }
21312 }
21313
21314 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
21315 TEST_REQUIRES_ARM_NEON_FMA;
21316 for (uint32_t n = 9; n < 16; n++) {
21317 for (size_t k = 1; k <= 10; k += 3) {
21318 GemmMicrokernelTester()
21319 .mr(1)
21320 .nr(8)
21321 .kr(1)
21322 .sr(1)
21323 .m(1)
21324 .n(n)
21325 .k(k)
21326 .a_stride(13)
21327 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21328 }
21329 }
21330 }
21331
21332 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
21333 TEST_REQUIRES_ARM_NEON_FMA;
21334 for (uint32_t n = 9; n < 16; n++) {
21335 for (size_t k = 1; k <= 10; k += 3) {
21336 for (uint32_t m = 1; m <= 1; m++) {
21337 GemmMicrokernelTester()
21338 .mr(1)
21339 .nr(8)
21340 .kr(1)
21341 .sr(1)
21342 .m(m)
21343 .n(n)
21344 .k(k)
21345 .iterations(1)
21346 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21347 }
21348 }
21349 }
21350 }
21351
21352 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8) {
21353 TEST_REQUIRES_ARM_NEON_FMA;
21354 for (uint32_t n = 16; n <= 24; n += 8) {
21355 for (size_t k = 1; k <= 10; k += 3) {
21356 GemmMicrokernelTester()
21357 .mr(1)
21358 .nr(8)
21359 .kr(1)
21360 .sr(1)
21361 .m(1)
21362 .n(8)
21363 .k(k)
21364 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21365 }
21366 }
21367 }
21368
21369 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
21370 TEST_REQUIRES_ARM_NEON_FMA;
21371 for (uint32_t n = 16; n <= 24; n += 8) {
21372 for (size_t k = 1; k <= 10; k += 3) {
21373 GemmMicrokernelTester()
21374 .mr(1)
21375 .nr(8)
21376 .kr(1)
21377 .sr(1)
21378 .m(1)
21379 .n(n)
21380 .k(k)
21381 .cn_stride(11)
21382 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21383 }
21384 }
21385 }
21386
21387 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
21388 TEST_REQUIRES_ARM_NEON_FMA;
21389 for (uint32_t n = 16; n <= 24; n += 8) {
21390 for (size_t k = 1; k <= 10; k += 3) {
21391 GemmMicrokernelTester()
21392 .mr(1)
21393 .nr(8)
21394 .kr(1)
21395 .sr(1)
21396 .m(1)
21397 .n(n)
21398 .k(k)
21399 .a_stride(13)
21400 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21401 }
21402 }
21403 }
21404
21405 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
21406 TEST_REQUIRES_ARM_NEON_FMA;
21407 for (uint32_t n = 16; n <= 24; n += 8) {
21408 for (size_t k = 1; k <= 10; k += 3) {
21409 for (uint32_t m = 1; m <= 1; m++) {
21410 GemmMicrokernelTester()
21411 .mr(1)
21412 .nr(8)
21413 .kr(1)
21414 .sr(1)
21415 .m(m)
21416 .n(n)
21417 .k(k)
21418 .iterations(1)
21419 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21420 }
21421 }
21422 }
21423 }
21424
21425 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
21426 TEST_REQUIRES_ARM_NEON_FMA;
21427 for (size_t k = 1; k <= 10; k += 3) {
21428 for (uint32_t m = 1; m <= 1; m++) {
21429 for (uint32_t n = 1; n <= 8; n++) {
21430 GemmMicrokernelTester()
21431 .mr(1)
21432 .nr(8)
21433 .kr(1)
21434 .sr(1)
21435 .m(m)
21436 .n(n)
21437 .k(k)
21438 .cm_stride(11)
21439 .iterations(1)
21440 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21441 }
21442 }
21443 }
21444 }
21445
21446 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, qmin) {
21447 TEST_REQUIRES_ARM_NEON_FMA;
21448 GemmMicrokernelTester()
21449 .mr(1)
21450 .nr(8)
21451 .kr(1)
21452 .sr(1)
21453 .m(1)
21454 .n(8)
21455 .k(2)
21456 .qmin(128)
21457 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21458 }
21459
21460 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, qmax) {
21461 TEST_REQUIRES_ARM_NEON_FMA;
21462 GemmMicrokernelTester()
21463 .mr(1)
21464 .nr(8)
21465 .kr(1)
21466 .sr(1)
21467 .m(1)
21468 .n(8)
21469 .k(2)
21470 .qmax(128)
21471 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21472 }
21473
21474 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, strided_cm) {
21475 TEST_REQUIRES_ARM_NEON_FMA;
21476 GemmMicrokernelTester()
21477 .mr(1)
21478 .nr(8)
21479 .kr(1)
21480 .sr(1)
21481 .m(1)
21482 .n(8)
21483 .k(2)
21484 .cm_stride(11)
21485 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
21486 }
21487#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21488
21489
21490#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21491 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2) {
21492 TEST_REQUIRES_ARM_NEON_FMA;
21493 GemmMicrokernelTester()
21494 .mr(4)
21495 .nr(8)
21496 .kr(1)
21497 .sr(1)
21498 .m(4)
21499 .n(8)
21500 .k(2)
21501 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21502 }
21503
21504 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, strided_cn) {
21505 TEST_REQUIRES_ARM_NEON_FMA;
21506 GemmMicrokernelTester()
21507 .mr(4)
21508 .nr(8)
21509 .kr(1)
21510 .sr(1)
21511 .m(4)
21512 .n(8)
21513 .k(2)
21514 .cn_stride(11)
21515 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21516 }
21517
21518 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
21519 TEST_REQUIRES_ARM_NEON_FMA;
21520 GemmMicrokernelTester()
21521 .mr(4)
21522 .nr(8)
21523 .kr(1)
21524 .sr(1)
21525 .m(4)
21526 .n(8)
21527 .k(2)
21528 .a_stride(5)
21529 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21530 }
21531
21532 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
21533 TEST_REQUIRES_ARM_NEON_FMA;
21534 for (uint32_t m = 1; m <= 4; m++) {
21535 for (uint32_t n = 1; n <= 8; n++) {
21536 GemmMicrokernelTester()
21537 .mr(4)
21538 .nr(8)
21539 .kr(1)
21540 .sr(1)
21541 .m(m)
21542 .n(n)
21543 .k(2)
21544 .iterations(1)
21545 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21546 }
21547 }
21548 }
21549
21550 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
21551 TEST_REQUIRES_ARM_NEON_FMA;
21552 for (uint32_t m = 1; m <= 4; m++) {
21553 GemmMicrokernelTester()
21554 .mr(4)
21555 .nr(8)
21556 .kr(1)
21557 .sr(1)
21558 .m(m)
21559 .n(8)
21560 .k(2)
21561 .iterations(1)
21562 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21563 }
21564 }
21565
21566 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
21567 TEST_REQUIRES_ARM_NEON_FMA;
21568 for (uint32_t n = 1; n <= 8; n++) {
21569 GemmMicrokernelTester()
21570 .mr(4)
21571 .nr(8)
21572 .kr(1)
21573 .sr(1)
21574 .m(4)
21575 .n(n)
21576 .k(2)
21577 .iterations(1)
21578 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21579 }
21580 }
21581
21582 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_lt_2) {
21583 TEST_REQUIRES_ARM_NEON_FMA;
21584 for (size_t k = 1; k < 2; k++) {
21585 GemmMicrokernelTester()
21586 .mr(4)
21587 .nr(8)
21588 .kr(1)
21589 .sr(1)
21590 .m(4)
21591 .n(8)
21592 .k(k)
21593 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21594 }
21595 }
21596
21597 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
21598 TEST_REQUIRES_ARM_NEON_FMA;
21599 for (size_t k = 1; k < 2; k++) {
21600 GemmMicrokernelTester()
21601 .mr(4)
21602 .nr(8)
21603 .kr(1)
21604 .sr(1)
21605 .m(4)
21606 .n(8)
21607 .k(k)
21608 .a_stride(5)
21609 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21610 }
21611 }
21612
21613 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
21614 TEST_REQUIRES_ARM_NEON_FMA;
21615 for (size_t k = 1; k < 2; k++) {
21616 for (uint32_t m = 1; m <= 4; m++) {
21617 for (uint32_t n = 1; n <= 8; n++) {
21618 GemmMicrokernelTester()
21619 .mr(4)
21620 .nr(8)
21621 .kr(1)
21622 .sr(1)
21623 .m(m)
21624 .n(n)
21625 .k(k)
21626 .iterations(1)
21627 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21628 }
21629 }
21630 }
21631 }
21632
21633 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_gt_2) {
21634 TEST_REQUIRES_ARM_NEON_FMA;
21635 for (size_t k = 3; k < 4; k++) {
21636 GemmMicrokernelTester()
21637 .mr(4)
21638 .nr(8)
21639 .kr(1)
21640 .sr(1)
21641 .m(4)
21642 .n(8)
21643 .k(k)
21644 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21645 }
21646 }
21647
21648 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
21649 TEST_REQUIRES_ARM_NEON_FMA;
21650 for (size_t k = 3; k < 4; k++) {
21651 GemmMicrokernelTester()
21652 .mr(4)
21653 .nr(8)
21654 .kr(1)
21655 .sr(1)
21656 .m(4)
21657 .n(8)
21658 .k(k)
21659 .a_stride(7)
21660 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21661 }
21662 }
21663
21664 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
21665 TEST_REQUIRES_ARM_NEON_FMA;
21666 for (size_t k = 3; k < 4; k++) {
21667 for (uint32_t m = 1; m <= 4; m++) {
21668 for (uint32_t n = 1; n <= 8; n++) {
21669 GemmMicrokernelTester()
21670 .mr(4)
21671 .nr(8)
21672 .kr(1)
21673 .sr(1)
21674 .m(m)
21675 .n(n)
21676 .k(k)
21677 .iterations(1)
21678 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21679 }
21680 }
21681 }
21682 }
21683
21684 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_div_2) {
21685 TEST_REQUIRES_ARM_NEON_FMA;
21686 for (size_t k = 4; k <= 20; k += 2) {
21687 GemmMicrokernelTester()
21688 .mr(4)
21689 .nr(8)
21690 .kr(1)
21691 .sr(1)
21692 .m(4)
21693 .n(8)
21694 .k(k)
21695 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21696 }
21697 }
21698
21699 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
21700 TEST_REQUIRES_ARM_NEON_FMA;
21701 for (size_t k = 4; k <= 20; k += 2) {
21702 GemmMicrokernelTester()
21703 .mr(4)
21704 .nr(8)
21705 .kr(1)
21706 .sr(1)
21707 .m(4)
21708 .n(8)
21709 .k(k)
21710 .a_stride(23)
21711 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21712 }
21713 }
21714
21715 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
21716 TEST_REQUIRES_ARM_NEON_FMA;
21717 for (size_t k = 4; k <= 20; k += 2) {
21718 for (uint32_t m = 1; m <= 4; m++) {
21719 for (uint32_t n = 1; n <= 8; n++) {
21720 GemmMicrokernelTester()
21721 .mr(4)
21722 .nr(8)
21723 .kr(1)
21724 .sr(1)
21725 .m(m)
21726 .n(n)
21727 .k(k)
21728 .iterations(1)
21729 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21730 }
21731 }
21732 }
21733 }
21734
21735 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8) {
21736 TEST_REQUIRES_ARM_NEON_FMA;
21737 for (uint32_t n = 9; n < 16; n++) {
21738 for (size_t k = 1; k <= 10; k += 3) {
21739 GemmMicrokernelTester()
21740 .mr(4)
21741 .nr(8)
21742 .kr(1)
21743 .sr(1)
21744 .m(4)
21745 .n(8)
21746 .k(k)
21747 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21748 }
21749 }
21750 }
21751
21752 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
21753 TEST_REQUIRES_ARM_NEON_FMA;
21754 for (uint32_t n = 9; n < 16; n++) {
21755 for (size_t k = 1; k <= 10; k += 3) {
21756 GemmMicrokernelTester()
21757 .mr(4)
21758 .nr(8)
21759 .kr(1)
21760 .sr(1)
21761 .m(4)
21762 .n(8)
21763 .k(k)
21764 .cn_stride(11)
21765 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21766 }
21767 }
21768 }
21769
21770 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
21771 TEST_REQUIRES_ARM_NEON_FMA;
21772 for (uint32_t n = 9; n < 16; n++) {
21773 for (size_t k = 1; k <= 10; k += 3) {
21774 GemmMicrokernelTester()
21775 .mr(4)
21776 .nr(8)
21777 .kr(1)
21778 .sr(1)
21779 .m(4)
21780 .n(n)
21781 .k(k)
21782 .a_stride(13)
21783 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21784 }
21785 }
21786 }
21787
21788 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
21789 TEST_REQUIRES_ARM_NEON_FMA;
21790 for (uint32_t n = 9; n < 16; n++) {
21791 for (size_t k = 1; k <= 10; k += 3) {
21792 for (uint32_t m = 1; m <= 4; m++) {
21793 GemmMicrokernelTester()
21794 .mr(4)
21795 .nr(8)
21796 .kr(1)
21797 .sr(1)
21798 .m(m)
21799 .n(n)
21800 .k(k)
21801 .iterations(1)
21802 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21803 }
21804 }
21805 }
21806 }
21807
21808 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8) {
21809 TEST_REQUIRES_ARM_NEON_FMA;
21810 for (uint32_t n = 16; n <= 24; n += 8) {
21811 for (size_t k = 1; k <= 10; k += 3) {
21812 GemmMicrokernelTester()
21813 .mr(4)
21814 .nr(8)
21815 .kr(1)
21816 .sr(1)
21817 .m(4)
21818 .n(8)
21819 .k(k)
21820 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21821 }
21822 }
21823 }
21824
21825 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
21826 TEST_REQUIRES_ARM_NEON_FMA;
21827 for (uint32_t n = 16; n <= 24; n += 8) {
21828 for (size_t k = 1; k <= 10; k += 3) {
21829 GemmMicrokernelTester()
21830 .mr(4)
21831 .nr(8)
21832 .kr(1)
21833 .sr(1)
21834 .m(4)
21835 .n(n)
21836 .k(k)
21837 .cn_stride(11)
21838 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21839 }
21840 }
21841 }
21842
21843 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
21844 TEST_REQUIRES_ARM_NEON_FMA;
21845 for (uint32_t n = 16; n <= 24; n += 8) {
21846 for (size_t k = 1; k <= 10; k += 3) {
21847 GemmMicrokernelTester()
21848 .mr(4)
21849 .nr(8)
21850 .kr(1)
21851 .sr(1)
21852 .m(4)
21853 .n(n)
21854 .k(k)
21855 .a_stride(13)
21856 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21857 }
21858 }
21859 }
21860
21861 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
21862 TEST_REQUIRES_ARM_NEON_FMA;
21863 for (uint32_t n = 16; n <= 24; n += 8) {
21864 for (size_t k = 1; k <= 10; k += 3) {
21865 for (uint32_t m = 1; m <= 4; m++) {
21866 GemmMicrokernelTester()
21867 .mr(4)
21868 .nr(8)
21869 .kr(1)
21870 .sr(1)
21871 .m(m)
21872 .n(n)
21873 .k(k)
21874 .iterations(1)
21875 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21876 }
21877 }
21878 }
21879 }
21880
21881 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
21882 TEST_REQUIRES_ARM_NEON_FMA;
21883 for (size_t k = 1; k <= 10; k += 3) {
21884 for (uint32_t m = 1; m <= 4; m++) {
21885 for (uint32_t n = 1; n <= 8; n++) {
21886 GemmMicrokernelTester()
21887 .mr(4)
21888 .nr(8)
21889 .kr(1)
21890 .sr(1)
21891 .m(m)
21892 .n(n)
21893 .k(k)
21894 .cm_stride(11)
21895 .iterations(1)
21896 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21897 }
21898 }
21899 }
21900 }
21901
21902 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, qmin) {
21903 TEST_REQUIRES_ARM_NEON_FMA;
21904 GemmMicrokernelTester()
21905 .mr(4)
21906 .nr(8)
21907 .kr(1)
21908 .sr(1)
21909 .m(4)
21910 .n(8)
21911 .k(2)
21912 .qmin(128)
21913 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21914 }
21915
21916 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, qmax) {
21917 TEST_REQUIRES_ARM_NEON_FMA;
21918 GemmMicrokernelTester()
21919 .mr(4)
21920 .nr(8)
21921 .kr(1)
21922 .sr(1)
21923 .m(4)
21924 .n(8)
21925 .k(2)
21926 .qmax(128)
21927 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21928 }
21929
21930 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, strided_cm) {
21931 TEST_REQUIRES_ARM_NEON_FMA;
21932 GemmMicrokernelTester()
21933 .mr(4)
21934 .nr(8)
21935 .kr(1)
21936 .sr(1)
21937 .m(4)
21938 .n(8)
21939 .k(2)
21940 .cm_stride(11)
21941 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
21942 }
21943#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21944
21945
21946#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21947 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4) {
21948 TEST_REQUIRES_ARM_NEON_FMA;
21949 GemmMicrokernelTester()
21950 .mr(4)
21951 .nr(8)
21952 .kr(1)
21953 .sr(1)
21954 .m(4)
21955 .n(8)
21956 .k(4)
21957 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
21958 }
21959
21960 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, strided_cn) {
21961 TEST_REQUIRES_ARM_NEON_FMA;
21962 GemmMicrokernelTester()
21963 .mr(4)
21964 .nr(8)
21965 .kr(1)
21966 .sr(1)
21967 .m(4)
21968 .n(8)
21969 .k(4)
21970 .cn_stride(11)
21971 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
21972 }
21973
21974 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
21975 TEST_REQUIRES_ARM_NEON_FMA;
21976 GemmMicrokernelTester()
21977 .mr(4)
21978 .nr(8)
21979 .kr(1)
21980 .sr(1)
21981 .m(4)
21982 .n(8)
21983 .k(4)
21984 .a_stride(7)
21985 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
21986 }
21987
21988 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
21989 TEST_REQUIRES_ARM_NEON_FMA;
21990 for (uint32_t m = 1; m <= 4; m++) {
21991 for (uint32_t n = 1; n <= 8; n++) {
21992 GemmMicrokernelTester()
21993 .mr(4)
21994 .nr(8)
21995 .kr(1)
21996 .sr(1)
21997 .m(m)
21998 .n(n)
21999 .k(4)
22000 .iterations(1)
22001 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22002 }
22003 }
22004 }
22005
22006 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
22007 TEST_REQUIRES_ARM_NEON_FMA;
22008 for (uint32_t m = 1; m <= 4; m++) {
22009 GemmMicrokernelTester()
22010 .mr(4)
22011 .nr(8)
22012 .kr(1)
22013 .sr(1)
22014 .m(m)
22015 .n(8)
22016 .k(4)
22017 .iterations(1)
22018 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22019 }
22020 }
22021
22022 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
22023 TEST_REQUIRES_ARM_NEON_FMA;
22024 for (uint32_t n = 1; n <= 8; n++) {
22025 GemmMicrokernelTester()
22026 .mr(4)
22027 .nr(8)
22028 .kr(1)
22029 .sr(1)
22030 .m(4)
22031 .n(n)
22032 .k(4)
22033 .iterations(1)
22034 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22035 }
22036 }
22037
22038 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_lt_4) {
22039 TEST_REQUIRES_ARM_NEON_FMA;
22040 for (size_t k = 1; k < 4; k++) {
22041 GemmMicrokernelTester()
22042 .mr(4)
22043 .nr(8)
22044 .kr(1)
22045 .sr(1)
22046 .m(4)
22047 .n(8)
22048 .k(k)
22049 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22050 }
22051 }
22052
22053 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
22054 TEST_REQUIRES_ARM_NEON_FMA;
22055 for (size_t k = 1; k < 4; k++) {
22056 GemmMicrokernelTester()
22057 .mr(4)
22058 .nr(8)
22059 .kr(1)
22060 .sr(1)
22061 .m(4)
22062 .n(8)
22063 .k(k)
22064 .a_stride(7)
22065 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22066 }
22067 }
22068
22069 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
22070 TEST_REQUIRES_ARM_NEON_FMA;
22071 for (size_t k = 1; k < 4; k++) {
22072 for (uint32_t m = 1; m <= 4; m++) {
22073 for (uint32_t n = 1; n <= 8; n++) {
22074 GemmMicrokernelTester()
22075 .mr(4)
22076 .nr(8)
22077 .kr(1)
22078 .sr(1)
22079 .m(m)
22080 .n(n)
22081 .k(k)
22082 .iterations(1)
22083 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22084 }
22085 }
22086 }
22087 }
22088
22089 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_gt_4) {
22090 TEST_REQUIRES_ARM_NEON_FMA;
22091 for (size_t k = 5; k < 8; k++) {
22092 GemmMicrokernelTester()
22093 .mr(4)
22094 .nr(8)
22095 .kr(1)
22096 .sr(1)
22097 .m(4)
22098 .n(8)
22099 .k(k)
22100 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22101 }
22102 }
22103
22104 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
22105 TEST_REQUIRES_ARM_NEON_FMA;
22106 for (size_t k = 5; k < 8; k++) {
22107 GemmMicrokernelTester()
22108 .mr(4)
22109 .nr(8)
22110 .kr(1)
22111 .sr(1)
22112 .m(4)
22113 .n(8)
22114 .k(k)
22115 .a_stride(11)
22116 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22117 }
22118 }
22119
22120 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
22121 TEST_REQUIRES_ARM_NEON_FMA;
22122 for (size_t k = 5; k < 8; k++) {
22123 for (uint32_t m = 1; m <= 4; m++) {
22124 for (uint32_t n = 1; n <= 8; n++) {
22125 GemmMicrokernelTester()
22126 .mr(4)
22127 .nr(8)
22128 .kr(1)
22129 .sr(1)
22130 .m(m)
22131 .n(n)
22132 .k(k)
22133 .iterations(1)
22134 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22135 }
22136 }
22137 }
22138 }
22139
22140 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_div_4) {
22141 TEST_REQUIRES_ARM_NEON_FMA;
22142 for (size_t k = 8; k <= 40; k += 4) {
22143 GemmMicrokernelTester()
22144 .mr(4)
22145 .nr(8)
22146 .kr(1)
22147 .sr(1)
22148 .m(4)
22149 .n(8)
22150 .k(k)
22151 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22152 }
22153 }
22154
22155 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
22156 TEST_REQUIRES_ARM_NEON_FMA;
22157 for (size_t k = 8; k <= 40; k += 4) {
22158 GemmMicrokernelTester()
22159 .mr(4)
22160 .nr(8)
22161 .kr(1)
22162 .sr(1)
22163 .m(4)
22164 .n(8)
22165 .k(k)
22166 .a_stride(43)
22167 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22168 }
22169 }
22170
22171 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
22172 TEST_REQUIRES_ARM_NEON_FMA;
22173 for (size_t k = 8; k <= 40; k += 4) {
22174 for (uint32_t m = 1; m <= 4; m++) {
22175 for (uint32_t n = 1; n <= 8; n++) {
22176 GemmMicrokernelTester()
22177 .mr(4)
22178 .nr(8)
22179 .kr(1)
22180 .sr(1)
22181 .m(m)
22182 .n(n)
22183 .k(k)
22184 .iterations(1)
22185 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22186 }
22187 }
22188 }
22189 }
22190
22191 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8) {
22192 TEST_REQUIRES_ARM_NEON_FMA;
22193 for (uint32_t n = 9; n < 16; n++) {
22194 for (size_t k = 1; k <= 20; k += 5) {
22195 GemmMicrokernelTester()
22196 .mr(4)
22197 .nr(8)
22198 .kr(1)
22199 .sr(1)
22200 .m(4)
22201 .n(8)
22202 .k(k)
22203 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22204 }
22205 }
22206 }
22207
22208 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
22209 TEST_REQUIRES_ARM_NEON_FMA;
22210 for (uint32_t n = 9; n < 16; n++) {
22211 for (size_t k = 1; k <= 20; k += 5) {
22212 GemmMicrokernelTester()
22213 .mr(4)
22214 .nr(8)
22215 .kr(1)
22216 .sr(1)
22217 .m(4)
22218 .n(8)
22219 .k(k)
22220 .cn_stride(11)
22221 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22222 }
22223 }
22224 }
22225
22226 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
22227 TEST_REQUIRES_ARM_NEON_FMA;
22228 for (uint32_t n = 9; n < 16; n++) {
22229 for (size_t k = 1; k <= 20; k += 5) {
22230 GemmMicrokernelTester()
22231 .mr(4)
22232 .nr(8)
22233 .kr(1)
22234 .sr(1)
22235 .m(4)
22236 .n(n)
22237 .k(k)
22238 .a_stride(23)
22239 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22240 }
22241 }
22242 }
22243
22244 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
22245 TEST_REQUIRES_ARM_NEON_FMA;
22246 for (uint32_t n = 9; n < 16; n++) {
22247 for (size_t k = 1; k <= 20; k += 5) {
22248 for (uint32_t m = 1; m <= 4; m++) {
22249 GemmMicrokernelTester()
22250 .mr(4)
22251 .nr(8)
22252 .kr(1)
22253 .sr(1)
22254 .m(m)
22255 .n(n)
22256 .k(k)
22257 .iterations(1)
22258 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22259 }
22260 }
22261 }
22262 }
22263
22264 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8) {
22265 TEST_REQUIRES_ARM_NEON_FMA;
22266 for (uint32_t n = 16; n <= 24; n += 8) {
22267 for (size_t k = 1; k <= 20; k += 5) {
22268 GemmMicrokernelTester()
22269 .mr(4)
22270 .nr(8)
22271 .kr(1)
22272 .sr(1)
22273 .m(4)
22274 .n(8)
22275 .k(k)
22276 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22277 }
22278 }
22279 }
22280
22281 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
22282 TEST_REQUIRES_ARM_NEON_FMA;
22283 for (uint32_t n = 16; n <= 24; n += 8) {
22284 for (size_t k = 1; k <= 20; k += 5) {
22285 GemmMicrokernelTester()
22286 .mr(4)
22287 .nr(8)
22288 .kr(1)
22289 .sr(1)
22290 .m(4)
22291 .n(n)
22292 .k(k)
22293 .cn_stride(11)
22294 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22295 }
22296 }
22297 }
22298
22299 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
22300 TEST_REQUIRES_ARM_NEON_FMA;
22301 for (uint32_t n = 16; n <= 24; n += 8) {
22302 for (size_t k = 1; k <= 20; k += 5) {
22303 GemmMicrokernelTester()
22304 .mr(4)
22305 .nr(8)
22306 .kr(1)
22307 .sr(1)
22308 .m(4)
22309 .n(n)
22310 .k(k)
22311 .a_stride(23)
22312 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22313 }
22314 }
22315 }
22316
22317 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
22318 TEST_REQUIRES_ARM_NEON_FMA;
22319 for (uint32_t n = 16; n <= 24; n += 8) {
22320 for (size_t k = 1; k <= 20; k += 5) {
22321 for (uint32_t m = 1; m <= 4; m++) {
22322 GemmMicrokernelTester()
22323 .mr(4)
22324 .nr(8)
22325 .kr(1)
22326 .sr(1)
22327 .m(m)
22328 .n(n)
22329 .k(k)
22330 .iterations(1)
22331 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22332 }
22333 }
22334 }
22335 }
22336
22337 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
22338 TEST_REQUIRES_ARM_NEON_FMA;
22339 for (size_t k = 1; k <= 20; k += 5) {
22340 for (uint32_t m = 1; m <= 4; m++) {
22341 for (uint32_t n = 1; n <= 8; n++) {
22342 GemmMicrokernelTester()
22343 .mr(4)
22344 .nr(8)
22345 .kr(1)
22346 .sr(1)
22347 .m(m)
22348 .n(n)
22349 .k(k)
22350 .cm_stride(11)
22351 .iterations(1)
22352 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22353 }
22354 }
22355 }
22356 }
22357
22358 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, qmin) {
22359 TEST_REQUIRES_ARM_NEON_FMA;
22360 GemmMicrokernelTester()
22361 .mr(4)
22362 .nr(8)
22363 .kr(1)
22364 .sr(1)
22365 .m(4)
22366 .n(8)
22367 .k(4)
22368 .qmin(128)
22369 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22370 }
22371
22372 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, qmax) {
22373 TEST_REQUIRES_ARM_NEON_FMA;
22374 GemmMicrokernelTester()
22375 .mr(4)
22376 .nr(8)
22377 .kr(1)
22378 .sr(1)
22379 .m(4)
22380 .n(8)
22381 .k(4)
22382 .qmax(128)
22383 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22384 }
22385
22386 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, strided_cm) {
22387 TEST_REQUIRES_ARM_NEON_FMA;
22388 GemmMicrokernelTester()
22389 .mr(4)
22390 .nr(8)
22391 .kr(1)
22392 .sr(1)
22393 .m(4)
22394 .n(8)
22395 .k(4)
22396 .cm_stride(11)
22397 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
22398 }
22399#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22400
22401
22402#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22403 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2) {
22404 TEST_REQUIRES_ARM_NEON_FMA;
22405 GemmMicrokernelTester()
22406 .mr(6)
22407 .nr(8)
22408 .kr(1)
22409 .sr(1)
22410 .m(6)
22411 .n(8)
22412 .k(2)
22413 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22414 }
22415
22416 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, strided_cn) {
22417 TEST_REQUIRES_ARM_NEON_FMA;
22418 GemmMicrokernelTester()
22419 .mr(6)
22420 .nr(8)
22421 .kr(1)
22422 .sr(1)
22423 .m(6)
22424 .n(8)
22425 .k(2)
22426 .cn_stride(11)
22427 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22428 }
22429
22430 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
22431 TEST_REQUIRES_ARM_NEON_FMA;
22432 GemmMicrokernelTester()
22433 .mr(6)
22434 .nr(8)
22435 .kr(1)
22436 .sr(1)
22437 .m(6)
22438 .n(8)
22439 .k(2)
22440 .a_stride(5)
22441 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22442 }
22443
22444 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
22445 TEST_REQUIRES_ARM_NEON_FMA;
22446 for (uint32_t m = 1; m <= 6; m++) {
22447 for (uint32_t n = 1; n <= 8; n++) {
22448 GemmMicrokernelTester()
22449 .mr(6)
22450 .nr(8)
22451 .kr(1)
22452 .sr(1)
22453 .m(m)
22454 .n(n)
22455 .k(2)
22456 .iterations(1)
22457 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22458 }
22459 }
22460 }
22461
22462 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
22463 TEST_REQUIRES_ARM_NEON_FMA;
22464 for (uint32_t m = 1; m <= 6; m++) {
22465 GemmMicrokernelTester()
22466 .mr(6)
22467 .nr(8)
22468 .kr(1)
22469 .sr(1)
22470 .m(m)
22471 .n(8)
22472 .k(2)
22473 .iterations(1)
22474 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22475 }
22476 }
22477
22478 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
22479 TEST_REQUIRES_ARM_NEON_FMA;
22480 for (uint32_t n = 1; n <= 8; n++) {
22481 GemmMicrokernelTester()
22482 .mr(6)
22483 .nr(8)
22484 .kr(1)
22485 .sr(1)
22486 .m(6)
22487 .n(n)
22488 .k(2)
22489 .iterations(1)
22490 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22491 }
22492 }
22493
22494 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_lt_2) {
22495 TEST_REQUIRES_ARM_NEON_FMA;
22496 for (size_t k = 1; k < 2; k++) {
22497 GemmMicrokernelTester()
22498 .mr(6)
22499 .nr(8)
22500 .kr(1)
22501 .sr(1)
22502 .m(6)
22503 .n(8)
22504 .k(k)
22505 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22506 }
22507 }
22508
22509 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
22510 TEST_REQUIRES_ARM_NEON_FMA;
22511 for (size_t k = 1; k < 2; k++) {
22512 GemmMicrokernelTester()
22513 .mr(6)
22514 .nr(8)
22515 .kr(1)
22516 .sr(1)
22517 .m(6)
22518 .n(8)
22519 .k(k)
22520 .a_stride(5)
22521 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22522 }
22523 }
22524
22525 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
22526 TEST_REQUIRES_ARM_NEON_FMA;
22527 for (size_t k = 1; k < 2; k++) {
22528 for (uint32_t m = 1; m <= 6; m++) {
22529 for (uint32_t n = 1; n <= 8; n++) {
22530 GemmMicrokernelTester()
22531 .mr(6)
22532 .nr(8)
22533 .kr(1)
22534 .sr(1)
22535 .m(m)
22536 .n(n)
22537 .k(k)
22538 .iterations(1)
22539 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22540 }
22541 }
22542 }
22543 }
22544
22545 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_gt_2) {
22546 TEST_REQUIRES_ARM_NEON_FMA;
22547 for (size_t k = 3; k < 4; k++) {
22548 GemmMicrokernelTester()
22549 .mr(6)
22550 .nr(8)
22551 .kr(1)
22552 .sr(1)
22553 .m(6)
22554 .n(8)
22555 .k(k)
22556 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22557 }
22558 }
22559
22560 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
22561 TEST_REQUIRES_ARM_NEON_FMA;
22562 for (size_t k = 3; k < 4; k++) {
22563 GemmMicrokernelTester()
22564 .mr(6)
22565 .nr(8)
22566 .kr(1)
22567 .sr(1)
22568 .m(6)
22569 .n(8)
22570 .k(k)
22571 .a_stride(7)
22572 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22573 }
22574 }
22575
22576 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
22577 TEST_REQUIRES_ARM_NEON_FMA;
22578 for (size_t k = 3; k < 4; k++) {
22579 for (uint32_t m = 1; m <= 6; m++) {
22580 for (uint32_t n = 1; n <= 8; n++) {
22581 GemmMicrokernelTester()
22582 .mr(6)
22583 .nr(8)
22584 .kr(1)
22585 .sr(1)
22586 .m(m)
22587 .n(n)
22588 .k(k)
22589 .iterations(1)
22590 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22591 }
22592 }
22593 }
22594 }
22595
22596 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_div_2) {
22597 TEST_REQUIRES_ARM_NEON_FMA;
22598 for (size_t k = 4; k <= 20; k += 2) {
22599 GemmMicrokernelTester()
22600 .mr(6)
22601 .nr(8)
22602 .kr(1)
22603 .sr(1)
22604 .m(6)
22605 .n(8)
22606 .k(k)
22607 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22608 }
22609 }
22610
22611 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
22612 TEST_REQUIRES_ARM_NEON_FMA;
22613 for (size_t k = 4; k <= 20; k += 2) {
22614 GemmMicrokernelTester()
22615 .mr(6)
22616 .nr(8)
22617 .kr(1)
22618 .sr(1)
22619 .m(6)
22620 .n(8)
22621 .k(k)
22622 .a_stride(23)
22623 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22624 }
22625 }
22626
22627 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
22628 TEST_REQUIRES_ARM_NEON_FMA;
22629 for (size_t k = 4; k <= 20; k += 2) {
22630 for (uint32_t m = 1; m <= 6; m++) {
22631 for (uint32_t n = 1; n <= 8; n++) {
22632 GemmMicrokernelTester()
22633 .mr(6)
22634 .nr(8)
22635 .kr(1)
22636 .sr(1)
22637 .m(m)
22638 .n(n)
22639 .k(k)
22640 .iterations(1)
22641 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22642 }
22643 }
22644 }
22645 }
22646
22647 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8) {
22648 TEST_REQUIRES_ARM_NEON_FMA;
22649 for (uint32_t n = 9; n < 16; n++) {
22650 for (size_t k = 1; k <= 10; k += 3) {
22651 GemmMicrokernelTester()
22652 .mr(6)
22653 .nr(8)
22654 .kr(1)
22655 .sr(1)
22656 .m(6)
22657 .n(8)
22658 .k(k)
22659 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22660 }
22661 }
22662 }
22663
22664 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
22665 TEST_REQUIRES_ARM_NEON_FMA;
22666 for (uint32_t n = 9; n < 16; n++) {
22667 for (size_t k = 1; k <= 10; k += 3) {
22668 GemmMicrokernelTester()
22669 .mr(6)
22670 .nr(8)
22671 .kr(1)
22672 .sr(1)
22673 .m(6)
22674 .n(8)
22675 .k(k)
22676 .cn_stride(11)
22677 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22678 }
22679 }
22680 }
22681
22682 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
22683 TEST_REQUIRES_ARM_NEON_FMA;
22684 for (uint32_t n = 9; n < 16; n++) {
22685 for (size_t k = 1; k <= 10; k += 3) {
22686 GemmMicrokernelTester()
22687 .mr(6)
22688 .nr(8)
22689 .kr(1)
22690 .sr(1)
22691 .m(6)
22692 .n(n)
22693 .k(k)
22694 .a_stride(13)
22695 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22696 }
22697 }
22698 }
22699
22700 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
22701 TEST_REQUIRES_ARM_NEON_FMA;
22702 for (uint32_t n = 9; n < 16; n++) {
22703 for (size_t k = 1; k <= 10; k += 3) {
22704 for (uint32_t m = 1; m <= 6; m++) {
22705 GemmMicrokernelTester()
22706 .mr(6)
22707 .nr(8)
22708 .kr(1)
22709 .sr(1)
22710 .m(m)
22711 .n(n)
22712 .k(k)
22713 .iterations(1)
22714 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22715 }
22716 }
22717 }
22718 }
22719
22720 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8) {
22721 TEST_REQUIRES_ARM_NEON_FMA;
22722 for (uint32_t n = 16; n <= 24; n += 8) {
22723 for (size_t k = 1; k <= 10; k += 3) {
22724 GemmMicrokernelTester()
22725 .mr(6)
22726 .nr(8)
22727 .kr(1)
22728 .sr(1)
22729 .m(6)
22730 .n(8)
22731 .k(k)
22732 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22733 }
22734 }
22735 }
22736
22737 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
22738 TEST_REQUIRES_ARM_NEON_FMA;
22739 for (uint32_t n = 16; n <= 24; n += 8) {
22740 for (size_t k = 1; k <= 10; k += 3) {
22741 GemmMicrokernelTester()
22742 .mr(6)
22743 .nr(8)
22744 .kr(1)
22745 .sr(1)
22746 .m(6)
22747 .n(n)
22748 .k(k)
22749 .cn_stride(11)
22750 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22751 }
22752 }
22753 }
22754
22755 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
22756 TEST_REQUIRES_ARM_NEON_FMA;
22757 for (uint32_t n = 16; n <= 24; n += 8) {
22758 for (size_t k = 1; k <= 10; k += 3) {
22759 GemmMicrokernelTester()
22760 .mr(6)
22761 .nr(8)
22762 .kr(1)
22763 .sr(1)
22764 .m(6)
22765 .n(n)
22766 .k(k)
22767 .a_stride(13)
22768 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22769 }
22770 }
22771 }
22772
22773 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
22774 TEST_REQUIRES_ARM_NEON_FMA;
22775 for (uint32_t n = 16; n <= 24; n += 8) {
22776 for (size_t k = 1; k <= 10; k += 3) {
22777 for (uint32_t m = 1; m <= 6; m++) {
22778 GemmMicrokernelTester()
22779 .mr(6)
22780 .nr(8)
22781 .kr(1)
22782 .sr(1)
22783 .m(m)
22784 .n(n)
22785 .k(k)
22786 .iterations(1)
22787 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22788 }
22789 }
22790 }
22791 }
22792
22793 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
22794 TEST_REQUIRES_ARM_NEON_FMA;
22795 for (size_t k = 1; k <= 10; k += 3) {
22796 for (uint32_t m = 1; m <= 6; m++) {
22797 for (uint32_t n = 1; n <= 8; n++) {
22798 GemmMicrokernelTester()
22799 .mr(6)
22800 .nr(8)
22801 .kr(1)
22802 .sr(1)
22803 .m(m)
22804 .n(n)
22805 .k(k)
22806 .cm_stride(11)
22807 .iterations(1)
22808 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22809 }
22810 }
22811 }
22812 }
22813
22814 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, qmin) {
22815 TEST_REQUIRES_ARM_NEON_FMA;
22816 GemmMicrokernelTester()
22817 .mr(6)
22818 .nr(8)
22819 .kr(1)
22820 .sr(1)
22821 .m(6)
22822 .n(8)
22823 .k(2)
22824 .qmin(128)
22825 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22826 }
22827
22828 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, qmax) {
22829 TEST_REQUIRES_ARM_NEON_FMA;
22830 GemmMicrokernelTester()
22831 .mr(6)
22832 .nr(8)
22833 .kr(1)
22834 .sr(1)
22835 .m(6)
22836 .n(8)
22837 .k(2)
22838 .qmax(128)
22839 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22840 }
22841
22842 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, strided_cm) {
22843 TEST_REQUIRES_ARM_NEON_FMA;
22844 GemmMicrokernelTester()
22845 .mr(6)
22846 .nr(8)
22847 .kr(1)
22848 .sr(1)
22849 .m(6)
22850 .n(8)
22851 .k(2)
22852 .cm_stride(11)
22853 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
22854 }
22855#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22856
22857
22858#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22859 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4) {
22860 TEST_REQUIRES_ARM_NEON_FMA;
22861 GemmMicrokernelTester()
22862 .mr(6)
22863 .nr(8)
22864 .kr(1)
22865 .sr(1)
22866 .m(6)
22867 .n(8)
22868 .k(4)
22869 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22870 }
22871
22872 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, strided_cn) {
22873 TEST_REQUIRES_ARM_NEON_FMA;
22874 GemmMicrokernelTester()
22875 .mr(6)
22876 .nr(8)
22877 .kr(1)
22878 .sr(1)
22879 .m(6)
22880 .n(8)
22881 .k(4)
22882 .cn_stride(11)
22883 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22884 }
22885
22886 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
22887 TEST_REQUIRES_ARM_NEON_FMA;
22888 GemmMicrokernelTester()
22889 .mr(6)
22890 .nr(8)
22891 .kr(1)
22892 .sr(1)
22893 .m(6)
22894 .n(8)
22895 .k(4)
22896 .a_stride(7)
22897 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22898 }
22899
22900 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
22901 TEST_REQUIRES_ARM_NEON_FMA;
22902 for (uint32_t m = 1; m <= 6; m++) {
22903 for (uint32_t n = 1; n <= 8; n++) {
22904 GemmMicrokernelTester()
22905 .mr(6)
22906 .nr(8)
22907 .kr(1)
22908 .sr(1)
22909 .m(m)
22910 .n(n)
22911 .k(4)
22912 .iterations(1)
22913 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22914 }
22915 }
22916 }
22917
22918 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
22919 TEST_REQUIRES_ARM_NEON_FMA;
22920 for (uint32_t m = 1; m <= 6; m++) {
22921 GemmMicrokernelTester()
22922 .mr(6)
22923 .nr(8)
22924 .kr(1)
22925 .sr(1)
22926 .m(m)
22927 .n(8)
22928 .k(4)
22929 .iterations(1)
22930 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22931 }
22932 }
22933
22934 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
22935 TEST_REQUIRES_ARM_NEON_FMA;
22936 for (uint32_t n = 1; n <= 8; n++) {
22937 GemmMicrokernelTester()
22938 .mr(6)
22939 .nr(8)
22940 .kr(1)
22941 .sr(1)
22942 .m(6)
22943 .n(n)
22944 .k(4)
22945 .iterations(1)
22946 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22947 }
22948 }
22949
22950 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_lt_4) {
22951 TEST_REQUIRES_ARM_NEON_FMA;
22952 for (size_t k = 1; k < 4; k++) {
22953 GemmMicrokernelTester()
22954 .mr(6)
22955 .nr(8)
22956 .kr(1)
22957 .sr(1)
22958 .m(6)
22959 .n(8)
22960 .k(k)
22961 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22962 }
22963 }
22964
22965 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
22966 TEST_REQUIRES_ARM_NEON_FMA;
22967 for (size_t k = 1; k < 4; k++) {
22968 GemmMicrokernelTester()
22969 .mr(6)
22970 .nr(8)
22971 .kr(1)
22972 .sr(1)
22973 .m(6)
22974 .n(8)
22975 .k(k)
22976 .a_stride(7)
22977 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22978 }
22979 }
22980
22981 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
22982 TEST_REQUIRES_ARM_NEON_FMA;
22983 for (size_t k = 1; k < 4; k++) {
22984 for (uint32_t m = 1; m <= 6; m++) {
22985 for (uint32_t n = 1; n <= 8; n++) {
22986 GemmMicrokernelTester()
22987 .mr(6)
22988 .nr(8)
22989 .kr(1)
22990 .sr(1)
22991 .m(m)
22992 .n(n)
22993 .k(k)
22994 .iterations(1)
22995 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
22996 }
22997 }
22998 }
22999 }
23000
23001 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_gt_4) {
23002 TEST_REQUIRES_ARM_NEON_FMA;
23003 for (size_t k = 5; k < 8; k++) {
23004 GemmMicrokernelTester()
23005 .mr(6)
23006 .nr(8)
23007 .kr(1)
23008 .sr(1)
23009 .m(6)
23010 .n(8)
23011 .k(k)
23012 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23013 }
23014 }
23015
23016 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
23017 TEST_REQUIRES_ARM_NEON_FMA;
23018 for (size_t k = 5; k < 8; k++) {
23019 GemmMicrokernelTester()
23020 .mr(6)
23021 .nr(8)
23022 .kr(1)
23023 .sr(1)
23024 .m(6)
23025 .n(8)
23026 .k(k)
23027 .a_stride(11)
23028 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23029 }
23030 }
23031
23032 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
23033 TEST_REQUIRES_ARM_NEON_FMA;
23034 for (size_t k = 5; k < 8; k++) {
23035 for (uint32_t m = 1; m <= 6; m++) {
23036 for (uint32_t n = 1; n <= 8; n++) {
23037 GemmMicrokernelTester()
23038 .mr(6)
23039 .nr(8)
23040 .kr(1)
23041 .sr(1)
23042 .m(m)
23043 .n(n)
23044 .k(k)
23045 .iterations(1)
23046 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23047 }
23048 }
23049 }
23050 }
23051
23052 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_div_4) {
23053 TEST_REQUIRES_ARM_NEON_FMA;
23054 for (size_t k = 8; k <= 40; k += 4) {
23055 GemmMicrokernelTester()
23056 .mr(6)
23057 .nr(8)
23058 .kr(1)
23059 .sr(1)
23060 .m(6)
23061 .n(8)
23062 .k(k)
23063 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23064 }
23065 }
23066
23067 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
23068 TEST_REQUIRES_ARM_NEON_FMA;
23069 for (size_t k = 8; k <= 40; k += 4) {
23070 GemmMicrokernelTester()
23071 .mr(6)
23072 .nr(8)
23073 .kr(1)
23074 .sr(1)
23075 .m(6)
23076 .n(8)
23077 .k(k)
23078 .a_stride(43)
23079 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23080 }
23081 }
23082
23083 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
23084 TEST_REQUIRES_ARM_NEON_FMA;
23085 for (size_t k = 8; k <= 40; k += 4) {
23086 for (uint32_t m = 1; m <= 6; m++) {
23087 for (uint32_t n = 1; n <= 8; n++) {
23088 GemmMicrokernelTester()
23089 .mr(6)
23090 .nr(8)
23091 .kr(1)
23092 .sr(1)
23093 .m(m)
23094 .n(n)
23095 .k(k)
23096 .iterations(1)
23097 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23098 }
23099 }
23100 }
23101 }
23102
23103 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8) {
23104 TEST_REQUIRES_ARM_NEON_FMA;
23105 for (uint32_t n = 9; n < 16; n++) {
23106 for (size_t k = 1; k <= 20; k += 5) {
23107 GemmMicrokernelTester()
23108 .mr(6)
23109 .nr(8)
23110 .kr(1)
23111 .sr(1)
23112 .m(6)
23113 .n(8)
23114 .k(k)
23115 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23116 }
23117 }
23118 }
23119
23120 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
23121 TEST_REQUIRES_ARM_NEON_FMA;
23122 for (uint32_t n = 9; n < 16; n++) {
23123 for (size_t k = 1; k <= 20; k += 5) {
23124 GemmMicrokernelTester()
23125 .mr(6)
23126 .nr(8)
23127 .kr(1)
23128 .sr(1)
23129 .m(6)
23130 .n(8)
23131 .k(k)
23132 .cn_stride(11)
23133 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23134 }
23135 }
23136 }
23137
23138 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
23139 TEST_REQUIRES_ARM_NEON_FMA;
23140 for (uint32_t n = 9; n < 16; n++) {
23141 for (size_t k = 1; k <= 20; k += 5) {
23142 GemmMicrokernelTester()
23143 .mr(6)
23144 .nr(8)
23145 .kr(1)
23146 .sr(1)
23147 .m(6)
23148 .n(n)
23149 .k(k)
23150 .a_stride(23)
23151 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23152 }
23153 }
23154 }
23155
23156 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
23157 TEST_REQUIRES_ARM_NEON_FMA;
23158 for (uint32_t n = 9; n < 16; n++) {
23159 for (size_t k = 1; k <= 20; k += 5) {
23160 for (uint32_t m = 1; m <= 6; m++) {
23161 GemmMicrokernelTester()
23162 .mr(6)
23163 .nr(8)
23164 .kr(1)
23165 .sr(1)
23166 .m(m)
23167 .n(n)
23168 .k(k)
23169 .iterations(1)
23170 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23171 }
23172 }
23173 }
23174 }
23175
23176 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8) {
23177 TEST_REQUIRES_ARM_NEON_FMA;
23178 for (uint32_t n = 16; n <= 24; n += 8) {
23179 for (size_t k = 1; k <= 20; k += 5) {
23180 GemmMicrokernelTester()
23181 .mr(6)
23182 .nr(8)
23183 .kr(1)
23184 .sr(1)
23185 .m(6)
23186 .n(8)
23187 .k(k)
23188 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23189 }
23190 }
23191 }
23192
23193 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
23194 TEST_REQUIRES_ARM_NEON_FMA;
23195 for (uint32_t n = 16; n <= 24; n += 8) {
23196 for (size_t k = 1; k <= 20; k += 5) {
23197 GemmMicrokernelTester()
23198 .mr(6)
23199 .nr(8)
23200 .kr(1)
23201 .sr(1)
23202 .m(6)
23203 .n(n)
23204 .k(k)
23205 .cn_stride(11)
23206 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23207 }
23208 }
23209 }
23210
23211 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
23212 TEST_REQUIRES_ARM_NEON_FMA;
23213 for (uint32_t n = 16; n <= 24; n += 8) {
23214 for (size_t k = 1; k <= 20; k += 5) {
23215 GemmMicrokernelTester()
23216 .mr(6)
23217 .nr(8)
23218 .kr(1)
23219 .sr(1)
23220 .m(6)
23221 .n(n)
23222 .k(k)
23223 .a_stride(23)
23224 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23225 }
23226 }
23227 }
23228
23229 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
23230 TEST_REQUIRES_ARM_NEON_FMA;
23231 for (uint32_t n = 16; n <= 24; n += 8) {
23232 for (size_t k = 1; k <= 20; k += 5) {
23233 for (uint32_t m = 1; m <= 6; m++) {
23234 GemmMicrokernelTester()
23235 .mr(6)
23236 .nr(8)
23237 .kr(1)
23238 .sr(1)
23239 .m(m)
23240 .n(n)
23241 .k(k)
23242 .iterations(1)
23243 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23244 }
23245 }
23246 }
23247 }
23248
23249 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
23250 TEST_REQUIRES_ARM_NEON_FMA;
23251 for (size_t k = 1; k <= 20; k += 5) {
23252 for (uint32_t m = 1; m <= 6; m++) {
23253 for (uint32_t n = 1; n <= 8; n++) {
23254 GemmMicrokernelTester()
23255 .mr(6)
23256 .nr(8)
23257 .kr(1)
23258 .sr(1)
23259 .m(m)
23260 .n(n)
23261 .k(k)
23262 .cm_stride(11)
23263 .iterations(1)
23264 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23265 }
23266 }
23267 }
23268 }
23269
23270 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, qmin) {
23271 TEST_REQUIRES_ARM_NEON_FMA;
23272 GemmMicrokernelTester()
23273 .mr(6)
23274 .nr(8)
23275 .kr(1)
23276 .sr(1)
23277 .m(6)
23278 .n(8)
23279 .k(4)
23280 .qmin(128)
23281 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23282 }
23283
23284 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, qmax) {
23285 TEST_REQUIRES_ARM_NEON_FMA;
23286 GemmMicrokernelTester()
23287 .mr(6)
23288 .nr(8)
23289 .kr(1)
23290 .sr(1)
23291 .m(6)
23292 .n(8)
23293 .k(4)
23294 .qmax(128)
23295 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23296 }
23297
23298 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, strided_cm) {
23299 TEST_REQUIRES_ARM_NEON_FMA;
23300 GemmMicrokernelTester()
23301 .mr(6)
23302 .nr(8)
23303 .kr(1)
23304 .sr(1)
23305 .m(6)
23306 .n(8)
23307 .k(4)
23308 .cm_stride(11)
23309 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
23310 }
23311#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23312
23313
23314#if XNN_ARCH_ARM || XNN_ARCH_ARM64
23315 TEST(F32_GEMM_1X8S4__NEON, k_eq_4) {
23316 TEST_REQUIRES_ARM_NEON;
23317 GemmMicrokernelTester()
23318 .mr(1)
23319 .nr(8)
23320 .kr(1)
23321 .sr(4)
23322 .m(1)
23323 .n(8)
23324 .k(4)
23325 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23326 }
23327
23328 TEST(F32_GEMM_1X8S4__NEON, strided_cn) {
23329 TEST_REQUIRES_ARM_NEON;
23330 GemmMicrokernelTester()
23331 .mr(1)
23332 .nr(8)
23333 .kr(1)
23334 .sr(4)
23335 .m(1)
23336 .n(8)
23337 .k(4)
23338 .cn_stride(11)
23339 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23340 }
23341
23342 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_strided_a) {
23343 TEST_REQUIRES_ARM_NEON;
23344 GemmMicrokernelTester()
23345 .mr(1)
23346 .nr(8)
23347 .kr(1)
23348 .sr(4)
23349 .m(1)
23350 .n(8)
23351 .k(4)
23352 .a_stride(7)
23353 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23354 }
23355
23356 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_subtile) {
23357 TEST_REQUIRES_ARM_NEON;
23358 for (uint32_t m = 1; m <= 1; m++) {
23359 for (uint32_t n = 1; n <= 8; n++) {
23360 GemmMicrokernelTester()
23361 .mr(1)
23362 .nr(8)
23363 .kr(1)
23364 .sr(4)
23365 .m(m)
23366 .n(n)
23367 .k(4)
23368 .iterations(1)
23369 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23370 }
23371 }
23372 }
23373
23374 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_subtile_m) {
23375 TEST_REQUIRES_ARM_NEON;
23376 for (uint32_t m = 1; m <= 1; m++) {
23377 GemmMicrokernelTester()
23378 .mr(1)
23379 .nr(8)
23380 .kr(1)
23381 .sr(4)
23382 .m(m)
23383 .n(8)
23384 .k(4)
23385 .iterations(1)
23386 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23387 }
23388 }
23389
23390 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_subtile_n) {
23391 TEST_REQUIRES_ARM_NEON;
23392 for (uint32_t n = 1; n <= 8; n++) {
23393 GemmMicrokernelTester()
23394 .mr(1)
23395 .nr(8)
23396 .kr(1)
23397 .sr(4)
23398 .m(1)
23399 .n(n)
23400 .k(4)
23401 .iterations(1)
23402 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23403 }
23404 }
23405
23406 TEST(F32_GEMM_1X8S4__NEON, k_lt_4) {
23407 TEST_REQUIRES_ARM_NEON;
23408 for (size_t k = 1; k < 4; k++) {
23409 GemmMicrokernelTester()
23410 .mr(1)
23411 .nr(8)
23412 .kr(1)
23413 .sr(4)
23414 .m(1)
23415 .n(8)
23416 .k(k)
23417 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23418 }
23419 }
23420
23421 TEST(F32_GEMM_1X8S4__NEON, k_lt_4_strided_a) {
23422 TEST_REQUIRES_ARM_NEON;
23423 for (size_t k = 1; k < 4; k++) {
23424 GemmMicrokernelTester()
23425 .mr(1)
23426 .nr(8)
23427 .kr(1)
23428 .sr(4)
23429 .m(1)
23430 .n(8)
23431 .k(k)
23432 .a_stride(7)
23433 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23434 }
23435 }
23436
23437 TEST(F32_GEMM_1X8S4__NEON, k_lt_4_subtile) {
23438 TEST_REQUIRES_ARM_NEON;
23439 for (size_t k = 1; k < 4; k++) {
23440 for (uint32_t m = 1; m <= 1; m++) {
23441 for (uint32_t n = 1; n <= 8; n++) {
23442 GemmMicrokernelTester()
23443 .mr(1)
23444 .nr(8)
23445 .kr(1)
23446 .sr(4)
23447 .m(m)
23448 .n(n)
23449 .k(k)
23450 .iterations(1)
23451 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23452 }
23453 }
23454 }
23455 }
23456
23457 TEST(F32_GEMM_1X8S4__NEON, k_gt_4) {
23458 TEST_REQUIRES_ARM_NEON;
23459 for (size_t k = 5; k < 8; k++) {
23460 GemmMicrokernelTester()
23461 .mr(1)
23462 .nr(8)
23463 .kr(1)
23464 .sr(4)
23465 .m(1)
23466 .n(8)
23467 .k(k)
23468 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23469 }
23470 }
23471
23472 TEST(F32_GEMM_1X8S4__NEON, k_gt_4_strided_a) {
23473 TEST_REQUIRES_ARM_NEON;
23474 for (size_t k = 5; k < 8; k++) {
23475 GemmMicrokernelTester()
23476 .mr(1)
23477 .nr(8)
23478 .kr(1)
23479 .sr(4)
23480 .m(1)
23481 .n(8)
23482 .k(k)
23483 .a_stride(11)
23484 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23485 }
23486 }
23487
23488 TEST(F32_GEMM_1X8S4__NEON, k_gt_4_subtile) {
23489 TEST_REQUIRES_ARM_NEON;
23490 for (size_t k = 5; k < 8; k++) {
23491 for (uint32_t m = 1; m <= 1; m++) {
23492 for (uint32_t n = 1; n <= 8; n++) {
23493 GemmMicrokernelTester()
23494 .mr(1)
23495 .nr(8)
23496 .kr(1)
23497 .sr(4)
23498 .m(m)
23499 .n(n)
23500 .k(k)
23501 .iterations(1)
23502 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23503 }
23504 }
23505 }
23506 }
23507
23508 TEST(F32_GEMM_1X8S4__NEON, k_div_4) {
23509 TEST_REQUIRES_ARM_NEON;
23510 for (size_t k = 8; k <= 40; k += 4) {
23511 GemmMicrokernelTester()
23512 .mr(1)
23513 .nr(8)
23514 .kr(1)
23515 .sr(4)
23516 .m(1)
23517 .n(8)
23518 .k(k)
23519 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23520 }
23521 }
23522
23523 TEST(F32_GEMM_1X8S4__NEON, k_div_4_strided_a) {
23524 TEST_REQUIRES_ARM_NEON;
23525 for (size_t k = 8; k <= 40; k += 4) {
23526 GemmMicrokernelTester()
23527 .mr(1)
23528 .nr(8)
23529 .kr(1)
23530 .sr(4)
23531 .m(1)
23532 .n(8)
23533 .k(k)
23534 .a_stride(43)
23535 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23536 }
23537 }
23538
23539 TEST(F32_GEMM_1X8S4__NEON, k_div_4_subtile) {
23540 TEST_REQUIRES_ARM_NEON;
23541 for (size_t k = 8; k <= 40; k += 4) {
23542 for (uint32_t m = 1; m <= 1; m++) {
23543 for (uint32_t n = 1; n <= 8; n++) {
23544 GemmMicrokernelTester()
23545 .mr(1)
23546 .nr(8)
23547 .kr(1)
23548 .sr(4)
23549 .m(m)
23550 .n(n)
23551 .k(k)
23552 .iterations(1)
23553 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23554 }
23555 }
23556 }
23557 }
23558
23559 TEST(F32_GEMM_1X8S4__NEON, n_gt_8) {
23560 TEST_REQUIRES_ARM_NEON;
23561 for (uint32_t n = 9; n < 16; n++) {
23562 for (size_t k = 1; k <= 20; k += 5) {
23563 GemmMicrokernelTester()
23564 .mr(1)
23565 .nr(8)
23566 .kr(1)
23567 .sr(4)
23568 .m(1)
23569 .n(8)
23570 .k(k)
23571 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23572 }
23573 }
23574 }
23575
23576 TEST(F32_GEMM_1X8S4__NEON, n_gt_8_strided_cn) {
23577 TEST_REQUIRES_ARM_NEON;
23578 for (uint32_t n = 9; n < 16; n++) {
23579 for (size_t k = 1; k <= 20; k += 5) {
23580 GemmMicrokernelTester()
23581 .mr(1)
23582 .nr(8)
23583 .kr(1)
23584 .sr(4)
23585 .m(1)
23586 .n(8)
23587 .k(k)
23588 .cn_stride(11)
23589 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23590 }
23591 }
23592 }
23593
23594 TEST(F32_GEMM_1X8S4__NEON, n_gt_8_strided_a) {
23595 TEST_REQUIRES_ARM_NEON;
23596 for (uint32_t n = 9; n < 16; n++) {
23597 for (size_t k = 1; k <= 20; k += 5) {
23598 GemmMicrokernelTester()
23599 .mr(1)
23600 .nr(8)
23601 .kr(1)
23602 .sr(4)
23603 .m(1)
23604 .n(n)
23605 .k(k)
23606 .a_stride(23)
23607 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23608 }
23609 }
23610 }
23611
23612 TEST(F32_GEMM_1X8S4__NEON, n_gt_8_subtile) {
23613 TEST_REQUIRES_ARM_NEON;
23614 for (uint32_t n = 9; n < 16; n++) {
23615 for (size_t k = 1; k <= 20; k += 5) {
23616 for (uint32_t m = 1; m <= 1; m++) {
23617 GemmMicrokernelTester()
23618 .mr(1)
23619 .nr(8)
23620 .kr(1)
23621 .sr(4)
23622 .m(m)
23623 .n(n)
23624 .k(k)
23625 .iterations(1)
23626 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23627 }
23628 }
23629 }
23630 }
23631
23632 TEST(F32_GEMM_1X8S4__NEON, n_div_8) {
23633 TEST_REQUIRES_ARM_NEON;
23634 for (uint32_t n = 16; n <= 24; n += 8) {
23635 for (size_t k = 1; k <= 20; k += 5) {
23636 GemmMicrokernelTester()
23637 .mr(1)
23638 .nr(8)
23639 .kr(1)
23640 .sr(4)
23641 .m(1)
23642 .n(8)
23643 .k(k)
23644 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23645 }
23646 }
23647 }
23648
23649 TEST(F32_GEMM_1X8S4__NEON, n_div_8_strided_cn) {
23650 TEST_REQUIRES_ARM_NEON;
23651 for (uint32_t n = 16; n <= 24; n += 8) {
23652 for (size_t k = 1; k <= 20; k += 5) {
23653 GemmMicrokernelTester()
23654 .mr(1)
23655 .nr(8)
23656 .kr(1)
23657 .sr(4)
23658 .m(1)
23659 .n(n)
23660 .k(k)
23661 .cn_stride(11)
23662 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23663 }
23664 }
23665 }
23666
23667 TEST(F32_GEMM_1X8S4__NEON, n_div_8_strided_a) {
23668 TEST_REQUIRES_ARM_NEON;
23669 for (uint32_t n = 16; n <= 24; n += 8) {
23670 for (size_t k = 1; k <= 20; k += 5) {
23671 GemmMicrokernelTester()
23672 .mr(1)
23673 .nr(8)
23674 .kr(1)
23675 .sr(4)
23676 .m(1)
23677 .n(n)
23678 .k(k)
23679 .a_stride(23)
23680 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23681 }
23682 }
23683 }
23684
23685 TEST(F32_GEMM_1X8S4__NEON, n_div_8_subtile) {
23686 TEST_REQUIRES_ARM_NEON;
23687 for (uint32_t n = 16; n <= 24; n += 8) {
23688 for (size_t k = 1; k <= 20; k += 5) {
23689 for (uint32_t m = 1; m <= 1; m++) {
23690 GemmMicrokernelTester()
23691 .mr(1)
23692 .nr(8)
23693 .kr(1)
23694 .sr(4)
23695 .m(m)
23696 .n(n)
23697 .k(k)
23698 .iterations(1)
23699 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23700 }
23701 }
23702 }
23703 }
23704
23705 TEST(F32_GEMM_1X8S4__NEON, strided_cm_subtile) {
23706 TEST_REQUIRES_ARM_NEON;
23707 for (size_t k = 1; k <= 20; k += 5) {
23708 for (uint32_t m = 1; m <= 1; m++) {
23709 for (uint32_t n = 1; n <= 8; n++) {
23710 GemmMicrokernelTester()
23711 .mr(1)
23712 .nr(8)
23713 .kr(1)
23714 .sr(4)
23715 .m(m)
23716 .n(n)
23717 .k(k)
23718 .cm_stride(11)
23719 .iterations(1)
23720 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23721 }
23722 }
23723 }
23724 }
23725
23726 TEST(F32_GEMM_1X8S4__NEON, qmin) {
23727 TEST_REQUIRES_ARM_NEON;
23728 GemmMicrokernelTester()
23729 .mr(1)
23730 .nr(8)
23731 .kr(1)
23732 .sr(4)
23733 .m(1)
23734 .n(8)
23735 .k(4)
23736 .qmin(128)
23737 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23738 }
23739
23740 TEST(F32_GEMM_1X8S4__NEON, qmax) {
23741 TEST_REQUIRES_ARM_NEON;
23742 GemmMicrokernelTester()
23743 .mr(1)
23744 .nr(8)
23745 .kr(1)
23746 .sr(4)
23747 .m(1)
23748 .n(8)
23749 .k(4)
23750 .qmax(128)
23751 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23752 }
23753
23754 TEST(F32_GEMM_1X8S4__NEON, strided_cm) {
23755 TEST_REQUIRES_ARM_NEON;
23756 GemmMicrokernelTester()
23757 .mr(1)
23758 .nr(8)
23759 .kr(1)
23760 .sr(4)
23761 .m(1)
23762 .n(8)
23763 .k(4)
23764 .cm_stride(11)
23765 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
23766 }
23767#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23768
23769
23770#if XNN_ARCH_ARM || XNN_ARCH_ARM64
23771 TEST(F32_GEMM_4X8S4__NEON, k_eq_4) {
23772 TEST_REQUIRES_ARM_NEON;
23773 GemmMicrokernelTester()
23774 .mr(4)
23775 .nr(8)
23776 .kr(1)
23777 .sr(4)
23778 .m(4)
23779 .n(8)
23780 .k(4)
23781 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23782 }
23783
23784 TEST(F32_GEMM_4X8S4__NEON, strided_cn) {
23785 TEST_REQUIRES_ARM_NEON;
23786 GemmMicrokernelTester()
23787 .mr(4)
23788 .nr(8)
23789 .kr(1)
23790 .sr(4)
23791 .m(4)
23792 .n(8)
23793 .k(4)
23794 .cn_stride(11)
23795 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23796 }
23797
23798 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_strided_a) {
23799 TEST_REQUIRES_ARM_NEON;
23800 GemmMicrokernelTester()
23801 .mr(4)
23802 .nr(8)
23803 .kr(1)
23804 .sr(4)
23805 .m(4)
23806 .n(8)
23807 .k(4)
23808 .a_stride(7)
23809 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23810 }
23811
23812 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_subtile) {
23813 TEST_REQUIRES_ARM_NEON;
23814 for (uint32_t m = 1; m <= 4; m++) {
23815 for (uint32_t n = 1; n <= 8; n++) {
23816 GemmMicrokernelTester()
23817 .mr(4)
23818 .nr(8)
23819 .kr(1)
23820 .sr(4)
23821 .m(m)
23822 .n(n)
23823 .k(4)
23824 .iterations(1)
23825 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23826 }
23827 }
23828 }
23829
23830 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_subtile_m) {
23831 TEST_REQUIRES_ARM_NEON;
23832 for (uint32_t m = 1; m <= 4; m++) {
23833 GemmMicrokernelTester()
23834 .mr(4)
23835 .nr(8)
23836 .kr(1)
23837 .sr(4)
23838 .m(m)
23839 .n(8)
23840 .k(4)
23841 .iterations(1)
23842 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23843 }
23844 }
23845
23846 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_subtile_n) {
23847 TEST_REQUIRES_ARM_NEON;
23848 for (uint32_t n = 1; n <= 8; n++) {
23849 GemmMicrokernelTester()
23850 .mr(4)
23851 .nr(8)
23852 .kr(1)
23853 .sr(4)
23854 .m(4)
23855 .n(n)
23856 .k(4)
23857 .iterations(1)
23858 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23859 }
23860 }
23861
23862 TEST(F32_GEMM_4X8S4__NEON, k_lt_4) {
23863 TEST_REQUIRES_ARM_NEON;
23864 for (size_t k = 1; k < 4; k++) {
23865 GemmMicrokernelTester()
23866 .mr(4)
23867 .nr(8)
23868 .kr(1)
23869 .sr(4)
23870 .m(4)
23871 .n(8)
23872 .k(k)
23873 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23874 }
23875 }
23876
23877 TEST(F32_GEMM_4X8S4__NEON, k_lt_4_strided_a) {
23878 TEST_REQUIRES_ARM_NEON;
23879 for (size_t k = 1; k < 4; k++) {
23880 GemmMicrokernelTester()
23881 .mr(4)
23882 .nr(8)
23883 .kr(1)
23884 .sr(4)
23885 .m(4)
23886 .n(8)
23887 .k(k)
23888 .a_stride(7)
23889 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23890 }
23891 }
23892
23893 TEST(F32_GEMM_4X8S4__NEON, k_lt_4_subtile) {
23894 TEST_REQUIRES_ARM_NEON;
23895 for (size_t k = 1; k < 4; k++) {
23896 for (uint32_t m = 1; m <= 4; m++) {
23897 for (uint32_t n = 1; n <= 8; n++) {
23898 GemmMicrokernelTester()
23899 .mr(4)
23900 .nr(8)
23901 .kr(1)
23902 .sr(4)
23903 .m(m)
23904 .n(n)
23905 .k(k)
23906 .iterations(1)
23907 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23908 }
23909 }
23910 }
23911 }
23912
23913 TEST(F32_GEMM_4X8S4__NEON, k_gt_4) {
23914 TEST_REQUIRES_ARM_NEON;
23915 for (size_t k = 5; k < 8; k++) {
23916 GemmMicrokernelTester()
23917 .mr(4)
23918 .nr(8)
23919 .kr(1)
23920 .sr(4)
23921 .m(4)
23922 .n(8)
23923 .k(k)
23924 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23925 }
23926 }
23927
23928 TEST(F32_GEMM_4X8S4__NEON, k_gt_4_strided_a) {
23929 TEST_REQUIRES_ARM_NEON;
23930 for (size_t k = 5; k < 8; k++) {
23931 GemmMicrokernelTester()
23932 .mr(4)
23933 .nr(8)
23934 .kr(1)
23935 .sr(4)
23936 .m(4)
23937 .n(8)
23938 .k(k)
23939 .a_stride(11)
23940 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23941 }
23942 }
23943
23944 TEST(F32_GEMM_4X8S4__NEON, k_gt_4_subtile) {
23945 TEST_REQUIRES_ARM_NEON;
23946 for (size_t k = 5; k < 8; k++) {
23947 for (uint32_t m = 1; m <= 4; m++) {
23948 for (uint32_t n = 1; n <= 8; n++) {
23949 GemmMicrokernelTester()
23950 .mr(4)
23951 .nr(8)
23952 .kr(1)
23953 .sr(4)
23954 .m(m)
23955 .n(n)
23956 .k(k)
23957 .iterations(1)
23958 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23959 }
23960 }
23961 }
23962 }
23963
23964 TEST(F32_GEMM_4X8S4__NEON, k_div_4) {
23965 TEST_REQUIRES_ARM_NEON;
23966 for (size_t k = 8; k <= 40; k += 4) {
23967 GemmMicrokernelTester()
23968 .mr(4)
23969 .nr(8)
23970 .kr(1)
23971 .sr(4)
23972 .m(4)
23973 .n(8)
23974 .k(k)
23975 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23976 }
23977 }
23978
23979 TEST(F32_GEMM_4X8S4__NEON, k_div_4_strided_a) {
23980 TEST_REQUIRES_ARM_NEON;
23981 for (size_t k = 8; k <= 40; k += 4) {
23982 GemmMicrokernelTester()
23983 .mr(4)
23984 .nr(8)
23985 .kr(1)
23986 .sr(4)
23987 .m(4)
23988 .n(8)
23989 .k(k)
23990 .a_stride(43)
23991 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
23992 }
23993 }
23994
23995 TEST(F32_GEMM_4X8S4__NEON, k_div_4_subtile) {
23996 TEST_REQUIRES_ARM_NEON;
23997 for (size_t k = 8; k <= 40; k += 4) {
23998 for (uint32_t m = 1; m <= 4; m++) {
23999 for (uint32_t n = 1; n <= 8; n++) {
24000 GemmMicrokernelTester()
24001 .mr(4)
24002 .nr(8)
24003 .kr(1)
24004 .sr(4)
24005 .m(m)
24006 .n(n)
24007 .k(k)
24008 .iterations(1)
24009 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24010 }
24011 }
24012 }
24013 }
24014
24015 TEST(F32_GEMM_4X8S4__NEON, n_gt_8) {
24016 TEST_REQUIRES_ARM_NEON;
24017 for (uint32_t n = 9; n < 16; n++) {
24018 for (size_t k = 1; k <= 20; k += 5) {
24019 GemmMicrokernelTester()
24020 .mr(4)
24021 .nr(8)
24022 .kr(1)
24023 .sr(4)
24024 .m(4)
24025 .n(8)
24026 .k(k)
24027 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24028 }
24029 }
24030 }
24031
24032 TEST(F32_GEMM_4X8S4__NEON, n_gt_8_strided_cn) {
24033 TEST_REQUIRES_ARM_NEON;
24034 for (uint32_t n = 9; n < 16; n++) {
24035 for (size_t k = 1; k <= 20; k += 5) {
24036 GemmMicrokernelTester()
24037 .mr(4)
24038 .nr(8)
24039 .kr(1)
24040 .sr(4)
24041 .m(4)
24042 .n(8)
24043 .k(k)
24044 .cn_stride(11)
24045 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24046 }
24047 }
24048 }
24049
24050 TEST(F32_GEMM_4X8S4__NEON, n_gt_8_strided_a) {
24051 TEST_REQUIRES_ARM_NEON;
24052 for (uint32_t n = 9; n < 16; n++) {
24053 for (size_t k = 1; k <= 20; k += 5) {
24054 GemmMicrokernelTester()
24055 .mr(4)
24056 .nr(8)
24057 .kr(1)
24058 .sr(4)
24059 .m(4)
24060 .n(n)
24061 .k(k)
24062 .a_stride(23)
24063 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24064 }
24065 }
24066 }
24067
24068 TEST(F32_GEMM_4X8S4__NEON, n_gt_8_subtile) {
24069 TEST_REQUIRES_ARM_NEON;
24070 for (uint32_t n = 9; n < 16; n++) {
24071 for (size_t k = 1; k <= 20; k += 5) {
24072 for (uint32_t m = 1; m <= 4; m++) {
24073 GemmMicrokernelTester()
24074 .mr(4)
24075 .nr(8)
24076 .kr(1)
24077 .sr(4)
24078 .m(m)
24079 .n(n)
24080 .k(k)
24081 .iterations(1)
24082 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24083 }
24084 }
24085 }
24086 }
24087
24088 TEST(F32_GEMM_4X8S4__NEON, n_div_8) {
24089 TEST_REQUIRES_ARM_NEON;
24090 for (uint32_t n = 16; n <= 24; n += 8) {
24091 for (size_t k = 1; k <= 20; k += 5) {
24092 GemmMicrokernelTester()
24093 .mr(4)
24094 .nr(8)
24095 .kr(1)
24096 .sr(4)
24097 .m(4)
24098 .n(8)
24099 .k(k)
24100 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24101 }
24102 }
24103 }
24104
24105 TEST(F32_GEMM_4X8S4__NEON, n_div_8_strided_cn) {
24106 TEST_REQUIRES_ARM_NEON;
24107 for (uint32_t n = 16; n <= 24; n += 8) {
24108 for (size_t k = 1; k <= 20; k += 5) {
24109 GemmMicrokernelTester()
24110 .mr(4)
24111 .nr(8)
24112 .kr(1)
24113 .sr(4)
24114 .m(4)
24115 .n(n)
24116 .k(k)
24117 .cn_stride(11)
24118 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24119 }
24120 }
24121 }
24122
24123 TEST(F32_GEMM_4X8S4__NEON, n_div_8_strided_a) {
24124 TEST_REQUIRES_ARM_NEON;
24125 for (uint32_t n = 16; n <= 24; n += 8) {
24126 for (size_t k = 1; k <= 20; k += 5) {
24127 GemmMicrokernelTester()
24128 .mr(4)
24129 .nr(8)
24130 .kr(1)
24131 .sr(4)
24132 .m(4)
24133 .n(n)
24134 .k(k)
24135 .a_stride(23)
24136 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24137 }
24138 }
24139 }
24140
24141 TEST(F32_GEMM_4X8S4__NEON, n_div_8_subtile) {
24142 TEST_REQUIRES_ARM_NEON;
24143 for (uint32_t n = 16; n <= 24; n += 8) {
24144 for (size_t k = 1; k <= 20; k += 5) {
24145 for (uint32_t m = 1; m <= 4; m++) {
24146 GemmMicrokernelTester()
24147 .mr(4)
24148 .nr(8)
24149 .kr(1)
24150 .sr(4)
24151 .m(m)
24152 .n(n)
24153 .k(k)
24154 .iterations(1)
24155 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24156 }
24157 }
24158 }
24159 }
24160
24161 TEST(F32_GEMM_4X8S4__NEON, strided_cm_subtile) {
24162 TEST_REQUIRES_ARM_NEON;
24163 for (size_t k = 1; k <= 20; k += 5) {
24164 for (uint32_t m = 1; m <= 4; m++) {
24165 for (uint32_t n = 1; n <= 8; n++) {
24166 GemmMicrokernelTester()
24167 .mr(4)
24168 .nr(8)
24169 .kr(1)
24170 .sr(4)
24171 .m(m)
24172 .n(n)
24173 .k(k)
24174 .cm_stride(11)
24175 .iterations(1)
24176 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24177 }
24178 }
24179 }
24180 }
24181
24182 TEST(F32_GEMM_4X8S4__NEON, qmin) {
24183 TEST_REQUIRES_ARM_NEON;
24184 GemmMicrokernelTester()
24185 .mr(4)
24186 .nr(8)
24187 .kr(1)
24188 .sr(4)
24189 .m(4)
24190 .n(8)
24191 .k(4)
24192 .qmin(128)
24193 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24194 }
24195
24196 TEST(F32_GEMM_4X8S4__NEON, qmax) {
24197 TEST_REQUIRES_ARM_NEON;
24198 GemmMicrokernelTester()
24199 .mr(4)
24200 .nr(8)
24201 .kr(1)
24202 .sr(4)
24203 .m(4)
24204 .n(8)
24205 .k(4)
24206 .qmax(128)
24207 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24208 }
24209
24210 TEST(F32_GEMM_4X8S4__NEON, strided_cm) {
24211 TEST_REQUIRES_ARM_NEON;
24212 GemmMicrokernelTester()
24213 .mr(4)
24214 .nr(8)
24215 .kr(1)
24216 .sr(4)
24217 .m(4)
24218 .n(8)
24219 .k(4)
24220 .cm_stride(11)
24221 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
24222 }
24223#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24224
24225
24226#if XNN_ARCH_ARM || XNN_ARCH_ARM64
24227 TEST(F32_GEMM_6X8S4__NEON, k_eq_4) {
24228 TEST_REQUIRES_ARM_NEON;
24229 GemmMicrokernelTester()
24230 .mr(6)
24231 .nr(8)
24232 .kr(1)
24233 .sr(4)
24234 .m(6)
24235 .n(8)
24236 .k(4)
24237 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24238 }
24239
24240 TEST(F32_GEMM_6X8S4__NEON, strided_cn) {
24241 TEST_REQUIRES_ARM_NEON;
24242 GemmMicrokernelTester()
24243 .mr(6)
24244 .nr(8)
24245 .kr(1)
24246 .sr(4)
24247 .m(6)
24248 .n(8)
24249 .k(4)
24250 .cn_stride(11)
24251 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24252 }
24253
24254 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_strided_a) {
24255 TEST_REQUIRES_ARM_NEON;
24256 GemmMicrokernelTester()
24257 .mr(6)
24258 .nr(8)
24259 .kr(1)
24260 .sr(4)
24261 .m(6)
24262 .n(8)
24263 .k(4)
24264 .a_stride(7)
24265 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24266 }
24267
24268 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_subtile) {
24269 TEST_REQUIRES_ARM_NEON;
24270 for (uint32_t m = 1; m <= 6; m++) {
24271 for (uint32_t n = 1; n <= 8; n++) {
24272 GemmMicrokernelTester()
24273 .mr(6)
24274 .nr(8)
24275 .kr(1)
24276 .sr(4)
24277 .m(m)
24278 .n(n)
24279 .k(4)
24280 .iterations(1)
24281 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24282 }
24283 }
24284 }
24285
24286 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_subtile_m) {
24287 TEST_REQUIRES_ARM_NEON;
24288 for (uint32_t m = 1; m <= 6; m++) {
24289 GemmMicrokernelTester()
24290 .mr(6)
24291 .nr(8)
24292 .kr(1)
24293 .sr(4)
24294 .m(m)
24295 .n(8)
24296 .k(4)
24297 .iterations(1)
24298 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24299 }
24300 }
24301
24302 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_subtile_n) {
24303 TEST_REQUIRES_ARM_NEON;
24304 for (uint32_t n = 1; n <= 8; n++) {
24305 GemmMicrokernelTester()
24306 .mr(6)
24307 .nr(8)
24308 .kr(1)
24309 .sr(4)
24310 .m(6)
24311 .n(n)
24312 .k(4)
24313 .iterations(1)
24314 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24315 }
24316 }
24317
24318 TEST(F32_GEMM_6X8S4__NEON, k_lt_4) {
24319 TEST_REQUIRES_ARM_NEON;
24320 for (size_t k = 1; k < 4; k++) {
24321 GemmMicrokernelTester()
24322 .mr(6)
24323 .nr(8)
24324 .kr(1)
24325 .sr(4)
24326 .m(6)
24327 .n(8)
24328 .k(k)
24329 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24330 }
24331 }
24332
24333 TEST(F32_GEMM_6X8S4__NEON, k_lt_4_strided_a) {
24334 TEST_REQUIRES_ARM_NEON;
24335 for (size_t k = 1; k < 4; k++) {
24336 GemmMicrokernelTester()
24337 .mr(6)
24338 .nr(8)
24339 .kr(1)
24340 .sr(4)
24341 .m(6)
24342 .n(8)
24343 .k(k)
24344 .a_stride(7)
24345 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24346 }
24347 }
24348
24349 TEST(F32_GEMM_6X8S4__NEON, k_lt_4_subtile) {
24350 TEST_REQUIRES_ARM_NEON;
24351 for (size_t k = 1; k < 4; k++) {
24352 for (uint32_t m = 1; m <= 6; m++) {
24353 for (uint32_t n = 1; n <= 8; n++) {
24354 GemmMicrokernelTester()
24355 .mr(6)
24356 .nr(8)
24357 .kr(1)
24358 .sr(4)
24359 .m(m)
24360 .n(n)
24361 .k(k)
24362 .iterations(1)
24363 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24364 }
24365 }
24366 }
24367 }
24368
24369 TEST(F32_GEMM_6X8S4__NEON, k_gt_4) {
24370 TEST_REQUIRES_ARM_NEON;
24371 for (size_t k = 5; k < 8; k++) {
24372 GemmMicrokernelTester()
24373 .mr(6)
24374 .nr(8)
24375 .kr(1)
24376 .sr(4)
24377 .m(6)
24378 .n(8)
24379 .k(k)
24380 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24381 }
24382 }
24383
24384 TEST(F32_GEMM_6X8S4__NEON, k_gt_4_strided_a) {
24385 TEST_REQUIRES_ARM_NEON;
24386 for (size_t k = 5; k < 8; k++) {
24387 GemmMicrokernelTester()
24388 .mr(6)
24389 .nr(8)
24390 .kr(1)
24391 .sr(4)
24392 .m(6)
24393 .n(8)
24394 .k(k)
24395 .a_stride(11)
24396 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24397 }
24398 }
24399
24400 TEST(F32_GEMM_6X8S4__NEON, k_gt_4_subtile) {
24401 TEST_REQUIRES_ARM_NEON;
24402 for (size_t k = 5; k < 8; k++) {
24403 for (uint32_t m = 1; m <= 6; m++) {
24404 for (uint32_t n = 1; n <= 8; n++) {
24405 GemmMicrokernelTester()
24406 .mr(6)
24407 .nr(8)
24408 .kr(1)
24409 .sr(4)
24410 .m(m)
24411 .n(n)
24412 .k(k)
24413 .iterations(1)
24414 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24415 }
24416 }
24417 }
24418 }
24419
24420 TEST(F32_GEMM_6X8S4__NEON, k_div_4) {
24421 TEST_REQUIRES_ARM_NEON;
24422 for (size_t k = 8; k <= 40; k += 4) {
24423 GemmMicrokernelTester()
24424 .mr(6)
24425 .nr(8)
24426 .kr(1)
24427 .sr(4)
24428 .m(6)
24429 .n(8)
24430 .k(k)
24431 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24432 }
24433 }
24434
24435 TEST(F32_GEMM_6X8S4__NEON, k_div_4_strided_a) {
24436 TEST_REQUIRES_ARM_NEON;
24437 for (size_t k = 8; k <= 40; k += 4) {
24438 GemmMicrokernelTester()
24439 .mr(6)
24440 .nr(8)
24441 .kr(1)
24442 .sr(4)
24443 .m(6)
24444 .n(8)
24445 .k(k)
24446 .a_stride(43)
24447 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24448 }
24449 }
24450
24451 TEST(F32_GEMM_6X8S4__NEON, k_div_4_subtile) {
24452 TEST_REQUIRES_ARM_NEON;
24453 for (size_t k = 8; k <= 40; k += 4) {
24454 for (uint32_t m = 1; m <= 6; m++) {
24455 for (uint32_t n = 1; n <= 8; n++) {
24456 GemmMicrokernelTester()
24457 .mr(6)
24458 .nr(8)
24459 .kr(1)
24460 .sr(4)
24461 .m(m)
24462 .n(n)
24463 .k(k)
24464 .iterations(1)
24465 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24466 }
24467 }
24468 }
24469 }
24470
24471 TEST(F32_GEMM_6X8S4__NEON, n_gt_8) {
24472 TEST_REQUIRES_ARM_NEON;
24473 for (uint32_t n = 9; n < 16; n++) {
24474 for (size_t k = 1; k <= 20; k += 5) {
24475 GemmMicrokernelTester()
24476 .mr(6)
24477 .nr(8)
24478 .kr(1)
24479 .sr(4)
24480 .m(6)
24481 .n(8)
24482 .k(k)
24483 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24484 }
24485 }
24486 }
24487
24488 TEST(F32_GEMM_6X8S4__NEON, n_gt_8_strided_cn) {
24489 TEST_REQUIRES_ARM_NEON;
24490 for (uint32_t n = 9; n < 16; n++) {
24491 for (size_t k = 1; k <= 20; k += 5) {
24492 GemmMicrokernelTester()
24493 .mr(6)
24494 .nr(8)
24495 .kr(1)
24496 .sr(4)
24497 .m(6)
24498 .n(8)
24499 .k(k)
24500 .cn_stride(11)
24501 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24502 }
24503 }
24504 }
24505
24506 TEST(F32_GEMM_6X8S4__NEON, n_gt_8_strided_a) {
24507 TEST_REQUIRES_ARM_NEON;
24508 for (uint32_t n = 9; n < 16; n++) {
24509 for (size_t k = 1; k <= 20; k += 5) {
24510 GemmMicrokernelTester()
24511 .mr(6)
24512 .nr(8)
24513 .kr(1)
24514 .sr(4)
24515 .m(6)
24516 .n(n)
24517 .k(k)
24518 .a_stride(23)
24519 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24520 }
24521 }
24522 }
24523
24524 TEST(F32_GEMM_6X8S4__NEON, n_gt_8_subtile) {
24525 TEST_REQUIRES_ARM_NEON;
24526 for (uint32_t n = 9; n < 16; n++) {
24527 for (size_t k = 1; k <= 20; k += 5) {
24528 for (uint32_t m = 1; m <= 6; m++) {
24529 GemmMicrokernelTester()
24530 .mr(6)
24531 .nr(8)
24532 .kr(1)
24533 .sr(4)
24534 .m(m)
24535 .n(n)
24536 .k(k)
24537 .iterations(1)
24538 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24539 }
24540 }
24541 }
24542 }
24543
24544 TEST(F32_GEMM_6X8S4__NEON, n_div_8) {
24545 TEST_REQUIRES_ARM_NEON;
24546 for (uint32_t n = 16; n <= 24; n += 8) {
24547 for (size_t k = 1; k <= 20; k += 5) {
24548 GemmMicrokernelTester()
24549 .mr(6)
24550 .nr(8)
24551 .kr(1)
24552 .sr(4)
24553 .m(6)
24554 .n(8)
24555 .k(k)
24556 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24557 }
24558 }
24559 }
24560
24561 TEST(F32_GEMM_6X8S4__NEON, n_div_8_strided_cn) {
24562 TEST_REQUIRES_ARM_NEON;
24563 for (uint32_t n = 16; n <= 24; n += 8) {
24564 for (size_t k = 1; k <= 20; k += 5) {
24565 GemmMicrokernelTester()
24566 .mr(6)
24567 .nr(8)
24568 .kr(1)
24569 .sr(4)
24570 .m(6)
24571 .n(n)
24572 .k(k)
24573 .cn_stride(11)
24574 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24575 }
24576 }
24577 }
24578
24579 TEST(F32_GEMM_6X8S4__NEON, n_div_8_strided_a) {
24580 TEST_REQUIRES_ARM_NEON;
24581 for (uint32_t n = 16; n <= 24; n += 8) {
24582 for (size_t k = 1; k <= 20; k += 5) {
24583 GemmMicrokernelTester()
24584 .mr(6)
24585 .nr(8)
24586 .kr(1)
24587 .sr(4)
24588 .m(6)
24589 .n(n)
24590 .k(k)
24591 .a_stride(23)
24592 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24593 }
24594 }
24595 }
24596
24597 TEST(F32_GEMM_6X8S4__NEON, n_div_8_subtile) {
24598 TEST_REQUIRES_ARM_NEON;
24599 for (uint32_t n = 16; n <= 24; n += 8) {
24600 for (size_t k = 1; k <= 20; k += 5) {
24601 for (uint32_t m = 1; m <= 6; m++) {
24602 GemmMicrokernelTester()
24603 .mr(6)
24604 .nr(8)
24605 .kr(1)
24606 .sr(4)
24607 .m(m)
24608 .n(n)
24609 .k(k)
24610 .iterations(1)
24611 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24612 }
24613 }
24614 }
24615 }
24616
24617 TEST(F32_GEMM_6X8S4__NEON, strided_cm_subtile) {
24618 TEST_REQUIRES_ARM_NEON;
24619 for (size_t k = 1; k <= 20; k += 5) {
24620 for (uint32_t m = 1; m <= 6; m++) {
24621 for (uint32_t n = 1; n <= 8; n++) {
24622 GemmMicrokernelTester()
24623 .mr(6)
24624 .nr(8)
24625 .kr(1)
24626 .sr(4)
24627 .m(m)
24628 .n(n)
24629 .k(k)
24630 .cm_stride(11)
24631 .iterations(1)
24632 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24633 }
24634 }
24635 }
24636 }
24637
24638 TEST(F32_GEMM_6X8S4__NEON, qmin) {
24639 TEST_REQUIRES_ARM_NEON;
24640 GemmMicrokernelTester()
24641 .mr(6)
24642 .nr(8)
24643 .kr(1)
24644 .sr(4)
24645 .m(6)
24646 .n(8)
24647 .k(4)
24648 .qmin(128)
24649 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24650 }
24651
24652 TEST(F32_GEMM_6X8S4__NEON, qmax) {
24653 TEST_REQUIRES_ARM_NEON;
24654 GemmMicrokernelTester()
24655 .mr(6)
24656 .nr(8)
24657 .kr(1)
24658 .sr(4)
24659 .m(6)
24660 .n(8)
24661 .k(4)
24662 .qmax(128)
24663 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24664 }
24665
24666 TEST(F32_GEMM_6X8S4__NEON, strided_cm) {
24667 TEST_REQUIRES_ARM_NEON;
24668 GemmMicrokernelTester()
24669 .mr(6)
24670 .nr(8)
24671 .kr(1)
24672 .sr(4)
24673 .m(6)
24674 .n(8)
24675 .k(4)
24676 .cm_stride(11)
24677 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
24678 }
24679#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24680
24681
24682#if XNN_ARCH_ARM || XNN_ARCH_ARM64
24683 TEST(F32_GEMM_8X8S4__NEON, k_eq_4) {
24684 TEST_REQUIRES_ARM_NEON;
24685 GemmMicrokernelTester()
24686 .mr(8)
24687 .nr(8)
24688 .kr(1)
24689 .sr(4)
24690 .m(8)
24691 .n(8)
24692 .k(4)
24693 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24694 }
24695
24696 TEST(F32_GEMM_8X8S4__NEON, strided_cn) {
24697 TEST_REQUIRES_ARM_NEON;
24698 GemmMicrokernelTester()
24699 .mr(8)
24700 .nr(8)
24701 .kr(1)
24702 .sr(4)
24703 .m(8)
24704 .n(8)
24705 .k(4)
24706 .cn_stride(11)
24707 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24708 }
24709
24710 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_strided_a) {
24711 TEST_REQUIRES_ARM_NEON;
24712 GemmMicrokernelTester()
24713 .mr(8)
24714 .nr(8)
24715 .kr(1)
24716 .sr(4)
24717 .m(8)
24718 .n(8)
24719 .k(4)
24720 .a_stride(7)
24721 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24722 }
24723
24724 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_subtile) {
24725 TEST_REQUIRES_ARM_NEON;
24726 for (uint32_t m = 1; m <= 8; m++) {
24727 for (uint32_t n = 1; n <= 8; n++) {
24728 GemmMicrokernelTester()
24729 .mr(8)
24730 .nr(8)
24731 .kr(1)
24732 .sr(4)
24733 .m(m)
24734 .n(n)
24735 .k(4)
24736 .iterations(1)
24737 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24738 }
24739 }
24740 }
24741
24742 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_subtile_m) {
24743 TEST_REQUIRES_ARM_NEON;
24744 for (uint32_t m = 1; m <= 8; m++) {
24745 GemmMicrokernelTester()
24746 .mr(8)
24747 .nr(8)
24748 .kr(1)
24749 .sr(4)
24750 .m(m)
24751 .n(8)
24752 .k(4)
24753 .iterations(1)
24754 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24755 }
24756 }
24757
24758 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_subtile_n) {
24759 TEST_REQUIRES_ARM_NEON;
24760 for (uint32_t n = 1; n <= 8; n++) {
24761 GemmMicrokernelTester()
24762 .mr(8)
24763 .nr(8)
24764 .kr(1)
24765 .sr(4)
24766 .m(8)
24767 .n(n)
24768 .k(4)
24769 .iterations(1)
24770 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24771 }
24772 }
24773
24774 TEST(F32_GEMM_8X8S4__NEON, k_lt_4) {
24775 TEST_REQUIRES_ARM_NEON;
24776 for (size_t k = 1; k < 4; k++) {
24777 GemmMicrokernelTester()
24778 .mr(8)
24779 .nr(8)
24780 .kr(1)
24781 .sr(4)
24782 .m(8)
24783 .n(8)
24784 .k(k)
24785 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24786 }
24787 }
24788
24789 TEST(F32_GEMM_8X8S4__NEON, k_lt_4_strided_a) {
24790 TEST_REQUIRES_ARM_NEON;
24791 for (size_t k = 1; k < 4; k++) {
24792 GemmMicrokernelTester()
24793 .mr(8)
24794 .nr(8)
24795 .kr(1)
24796 .sr(4)
24797 .m(8)
24798 .n(8)
24799 .k(k)
24800 .a_stride(7)
24801 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24802 }
24803 }
24804
24805 TEST(F32_GEMM_8X8S4__NEON, k_lt_4_subtile) {
24806 TEST_REQUIRES_ARM_NEON;
24807 for (size_t k = 1; k < 4; k++) {
24808 for (uint32_t m = 1; m <= 8; m++) {
24809 for (uint32_t n = 1; n <= 8; n++) {
24810 GemmMicrokernelTester()
24811 .mr(8)
24812 .nr(8)
24813 .kr(1)
24814 .sr(4)
24815 .m(m)
24816 .n(n)
24817 .k(k)
24818 .iterations(1)
24819 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24820 }
24821 }
24822 }
24823 }
24824
24825 TEST(F32_GEMM_8X8S4__NEON, k_gt_4) {
24826 TEST_REQUIRES_ARM_NEON;
24827 for (size_t k = 5; k < 8; k++) {
24828 GemmMicrokernelTester()
24829 .mr(8)
24830 .nr(8)
24831 .kr(1)
24832 .sr(4)
24833 .m(8)
24834 .n(8)
24835 .k(k)
24836 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24837 }
24838 }
24839
24840 TEST(F32_GEMM_8X8S4__NEON, k_gt_4_strided_a) {
24841 TEST_REQUIRES_ARM_NEON;
24842 for (size_t k = 5; k < 8; k++) {
24843 GemmMicrokernelTester()
24844 .mr(8)
24845 .nr(8)
24846 .kr(1)
24847 .sr(4)
24848 .m(8)
24849 .n(8)
24850 .k(k)
24851 .a_stride(11)
24852 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24853 }
24854 }
24855
24856 TEST(F32_GEMM_8X8S4__NEON, k_gt_4_subtile) {
24857 TEST_REQUIRES_ARM_NEON;
24858 for (size_t k = 5; k < 8; k++) {
24859 for (uint32_t m = 1; m <= 8; m++) {
24860 for (uint32_t n = 1; n <= 8; n++) {
24861 GemmMicrokernelTester()
24862 .mr(8)
24863 .nr(8)
24864 .kr(1)
24865 .sr(4)
24866 .m(m)
24867 .n(n)
24868 .k(k)
24869 .iterations(1)
24870 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24871 }
24872 }
24873 }
24874 }
24875
24876 TEST(F32_GEMM_8X8S4__NEON, k_div_4) {
24877 TEST_REQUIRES_ARM_NEON;
24878 for (size_t k = 8; k <= 40; k += 4) {
24879 GemmMicrokernelTester()
24880 .mr(8)
24881 .nr(8)
24882 .kr(1)
24883 .sr(4)
24884 .m(8)
24885 .n(8)
24886 .k(k)
24887 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24888 }
24889 }
24890
24891 TEST(F32_GEMM_8X8S4__NEON, k_div_4_strided_a) {
24892 TEST_REQUIRES_ARM_NEON;
24893 for (size_t k = 8; k <= 40; k += 4) {
24894 GemmMicrokernelTester()
24895 .mr(8)
24896 .nr(8)
24897 .kr(1)
24898 .sr(4)
24899 .m(8)
24900 .n(8)
24901 .k(k)
24902 .a_stride(43)
24903 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24904 }
24905 }
24906
24907 TEST(F32_GEMM_8X8S4__NEON, k_div_4_subtile) {
24908 TEST_REQUIRES_ARM_NEON;
24909 for (size_t k = 8; k <= 40; k += 4) {
24910 for (uint32_t m = 1; m <= 8; m++) {
24911 for (uint32_t n = 1; n <= 8; n++) {
24912 GemmMicrokernelTester()
24913 .mr(8)
24914 .nr(8)
24915 .kr(1)
24916 .sr(4)
24917 .m(m)
24918 .n(n)
24919 .k(k)
24920 .iterations(1)
24921 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24922 }
24923 }
24924 }
24925 }
24926
24927 TEST(F32_GEMM_8X8S4__NEON, n_gt_8) {
24928 TEST_REQUIRES_ARM_NEON;
24929 for (uint32_t n = 9; n < 16; n++) {
24930 for (size_t k = 1; k <= 20; k += 5) {
24931 GemmMicrokernelTester()
24932 .mr(8)
24933 .nr(8)
24934 .kr(1)
24935 .sr(4)
24936 .m(8)
24937 .n(8)
24938 .k(k)
24939 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24940 }
24941 }
24942 }
24943
24944 TEST(F32_GEMM_8X8S4__NEON, n_gt_8_strided_cn) {
24945 TEST_REQUIRES_ARM_NEON;
24946 for (uint32_t n = 9; n < 16; n++) {
24947 for (size_t k = 1; k <= 20; k += 5) {
24948 GemmMicrokernelTester()
24949 .mr(8)
24950 .nr(8)
24951 .kr(1)
24952 .sr(4)
24953 .m(8)
24954 .n(8)
24955 .k(k)
24956 .cn_stride(11)
24957 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24958 }
24959 }
24960 }
24961
24962 TEST(F32_GEMM_8X8S4__NEON, n_gt_8_strided_a) {
24963 TEST_REQUIRES_ARM_NEON;
24964 for (uint32_t n = 9; n < 16; n++) {
24965 for (size_t k = 1; k <= 20; k += 5) {
24966 GemmMicrokernelTester()
24967 .mr(8)
24968 .nr(8)
24969 .kr(1)
24970 .sr(4)
24971 .m(8)
24972 .n(n)
24973 .k(k)
24974 .a_stride(23)
24975 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24976 }
24977 }
24978 }
24979
24980 TEST(F32_GEMM_8X8S4__NEON, n_gt_8_subtile) {
24981 TEST_REQUIRES_ARM_NEON;
24982 for (uint32_t n = 9; n < 16; n++) {
24983 for (size_t k = 1; k <= 20; k += 5) {
24984 for (uint32_t m = 1; m <= 8; m++) {
24985 GemmMicrokernelTester()
24986 .mr(8)
24987 .nr(8)
24988 .kr(1)
24989 .sr(4)
24990 .m(m)
24991 .n(n)
24992 .k(k)
24993 .iterations(1)
24994 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
24995 }
24996 }
24997 }
24998 }
24999
25000 TEST(F32_GEMM_8X8S4__NEON, n_div_8) {
25001 TEST_REQUIRES_ARM_NEON;
25002 for (uint32_t n = 16; n <= 24; n += 8) {
25003 for (size_t k = 1; k <= 20; k += 5) {
25004 GemmMicrokernelTester()
25005 .mr(8)
25006 .nr(8)
25007 .kr(1)
25008 .sr(4)
25009 .m(8)
25010 .n(8)
25011 .k(k)
25012 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25013 }
25014 }
25015 }
25016
25017 TEST(F32_GEMM_8X8S4__NEON, n_div_8_strided_cn) {
25018 TEST_REQUIRES_ARM_NEON;
25019 for (uint32_t n = 16; n <= 24; n += 8) {
25020 for (size_t k = 1; k <= 20; k += 5) {
25021 GemmMicrokernelTester()
25022 .mr(8)
25023 .nr(8)
25024 .kr(1)
25025 .sr(4)
25026 .m(8)
25027 .n(n)
25028 .k(k)
25029 .cn_stride(11)
25030 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25031 }
25032 }
25033 }
25034
25035 TEST(F32_GEMM_8X8S4__NEON, n_div_8_strided_a) {
25036 TEST_REQUIRES_ARM_NEON;
25037 for (uint32_t n = 16; n <= 24; n += 8) {
25038 for (size_t k = 1; k <= 20; k += 5) {
25039 GemmMicrokernelTester()
25040 .mr(8)
25041 .nr(8)
25042 .kr(1)
25043 .sr(4)
25044 .m(8)
25045 .n(n)
25046 .k(k)
25047 .a_stride(23)
25048 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25049 }
25050 }
25051 }
25052
25053 TEST(F32_GEMM_8X8S4__NEON, n_div_8_subtile) {
25054 TEST_REQUIRES_ARM_NEON;
25055 for (uint32_t n = 16; n <= 24; n += 8) {
25056 for (size_t k = 1; k <= 20; k += 5) {
25057 for (uint32_t m = 1; m <= 8; m++) {
25058 GemmMicrokernelTester()
25059 .mr(8)
25060 .nr(8)
25061 .kr(1)
25062 .sr(4)
25063 .m(m)
25064 .n(n)
25065 .k(k)
25066 .iterations(1)
25067 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25068 }
25069 }
25070 }
25071 }
25072
25073 TEST(F32_GEMM_8X8S4__NEON, strided_cm_subtile) {
25074 TEST_REQUIRES_ARM_NEON;
25075 for (size_t k = 1; k <= 20; k += 5) {
25076 for (uint32_t m = 1; m <= 8; m++) {
25077 for (uint32_t n = 1; n <= 8; n++) {
25078 GemmMicrokernelTester()
25079 .mr(8)
25080 .nr(8)
25081 .kr(1)
25082 .sr(4)
25083 .m(m)
25084 .n(n)
25085 .k(k)
25086 .cm_stride(11)
25087 .iterations(1)
25088 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25089 }
25090 }
25091 }
25092 }
25093
25094 TEST(F32_GEMM_8X8S4__NEON, qmin) {
25095 TEST_REQUIRES_ARM_NEON;
25096 GemmMicrokernelTester()
25097 .mr(8)
25098 .nr(8)
25099 .kr(1)
25100 .sr(4)
25101 .m(8)
25102 .n(8)
25103 .k(4)
25104 .qmin(128)
25105 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25106 }
25107
25108 TEST(F32_GEMM_8X8S4__NEON, qmax) {
25109 TEST_REQUIRES_ARM_NEON;
25110 GemmMicrokernelTester()
25111 .mr(8)
25112 .nr(8)
25113 .kr(1)
25114 .sr(4)
25115 .m(8)
25116 .n(8)
25117 .k(4)
25118 .qmax(128)
25119 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25120 }
25121
25122 TEST(F32_GEMM_8X8S4__NEON, strided_cm) {
25123 TEST_REQUIRES_ARM_NEON;
25124 GemmMicrokernelTester()
25125 .mr(8)
25126 .nr(8)
25127 .kr(1)
25128 .sr(4)
25129 .m(8)
25130 .n(8)
25131 .k(4)
25132 .cm_stride(11)
25133 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
25134 }
25135#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
25136
25137
25138#if XNN_ARCH_ARM || XNN_ARCH_ARM64
25139 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4) {
25140 TEST_REQUIRES_ARM_NEON_FMA;
25141 GemmMicrokernelTester()
25142 .mr(1)
25143 .nr(8)
25144 .kr(1)
25145 .sr(4)
25146 .m(1)
25147 .n(8)
25148 .k(4)
25149 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25150 }
25151
25152 TEST(F32_GEMM_1X8S4__NEONFMA, strided_cn) {
25153 TEST_REQUIRES_ARM_NEON_FMA;
25154 GemmMicrokernelTester()
25155 .mr(1)
25156 .nr(8)
25157 .kr(1)
25158 .sr(4)
25159 .m(1)
25160 .n(8)
25161 .k(4)
25162 .cn_stride(11)
25163 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25164 }
25165
25166 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_strided_a) {
25167 TEST_REQUIRES_ARM_NEON_FMA;
25168 GemmMicrokernelTester()
25169 .mr(1)
25170 .nr(8)
25171 .kr(1)
25172 .sr(4)
25173 .m(1)
25174 .n(8)
25175 .k(4)
25176 .a_stride(7)
25177 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25178 }
25179
25180 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_subtile) {
25181 TEST_REQUIRES_ARM_NEON_FMA;
25182 for (uint32_t m = 1; m <= 1; m++) {
25183 for (uint32_t n = 1; n <= 8; n++) {
25184 GemmMicrokernelTester()
25185 .mr(1)
25186 .nr(8)
25187 .kr(1)
25188 .sr(4)
25189 .m(m)
25190 .n(n)
25191 .k(4)
25192 .iterations(1)
25193 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25194 }
25195 }
25196 }
25197
25198 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_subtile_m) {
25199 TEST_REQUIRES_ARM_NEON_FMA;
25200 for (uint32_t m = 1; m <= 1; m++) {
25201 GemmMicrokernelTester()
25202 .mr(1)
25203 .nr(8)
25204 .kr(1)
25205 .sr(4)
25206 .m(m)
25207 .n(8)
25208 .k(4)
25209 .iterations(1)
25210 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25211 }
25212 }
25213
25214 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_subtile_n) {
25215 TEST_REQUIRES_ARM_NEON_FMA;
25216 for (uint32_t n = 1; n <= 8; n++) {
25217 GemmMicrokernelTester()
25218 .mr(1)
25219 .nr(8)
25220 .kr(1)
25221 .sr(4)
25222 .m(1)
25223 .n(n)
25224 .k(4)
25225 .iterations(1)
25226 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25227 }
25228 }
25229
25230 TEST(F32_GEMM_1X8S4__NEONFMA, k_lt_4) {
25231 TEST_REQUIRES_ARM_NEON_FMA;
25232 for (size_t k = 1; k < 4; k++) {
25233 GemmMicrokernelTester()
25234 .mr(1)
25235 .nr(8)
25236 .kr(1)
25237 .sr(4)
25238 .m(1)
25239 .n(8)
25240 .k(k)
25241 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25242 }
25243 }
25244
25245 TEST(F32_GEMM_1X8S4__NEONFMA, k_lt_4_strided_a) {
25246 TEST_REQUIRES_ARM_NEON_FMA;
25247 for (size_t k = 1; k < 4; k++) {
25248 GemmMicrokernelTester()
25249 .mr(1)
25250 .nr(8)
25251 .kr(1)
25252 .sr(4)
25253 .m(1)
25254 .n(8)
25255 .k(k)
25256 .a_stride(7)
25257 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25258 }
25259 }
25260
25261 TEST(F32_GEMM_1X8S4__NEONFMA, k_lt_4_subtile) {
25262 TEST_REQUIRES_ARM_NEON_FMA;
25263 for (size_t k = 1; k < 4; k++) {
25264 for (uint32_t m = 1; m <= 1; m++) {
25265 for (uint32_t n = 1; n <= 8; n++) {
25266 GemmMicrokernelTester()
25267 .mr(1)
25268 .nr(8)
25269 .kr(1)
25270 .sr(4)
25271 .m(m)
25272 .n(n)
25273 .k(k)
25274 .iterations(1)
25275 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25276 }
25277 }
25278 }
25279 }
25280
25281 TEST(F32_GEMM_1X8S4__NEONFMA, k_gt_4) {
25282 TEST_REQUIRES_ARM_NEON_FMA;
25283 for (size_t k = 5; k < 8; k++) {
25284 GemmMicrokernelTester()
25285 .mr(1)
25286 .nr(8)
25287 .kr(1)
25288 .sr(4)
25289 .m(1)
25290 .n(8)
25291 .k(k)
25292 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25293 }
25294 }
25295
25296 TEST(F32_GEMM_1X8S4__NEONFMA, k_gt_4_strided_a) {
25297 TEST_REQUIRES_ARM_NEON_FMA;
25298 for (size_t k = 5; k < 8; k++) {
25299 GemmMicrokernelTester()
25300 .mr(1)
25301 .nr(8)
25302 .kr(1)
25303 .sr(4)
25304 .m(1)
25305 .n(8)
25306 .k(k)
25307 .a_stride(11)
25308 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25309 }
25310 }
25311
25312 TEST(F32_GEMM_1X8S4__NEONFMA, k_gt_4_subtile) {
25313 TEST_REQUIRES_ARM_NEON_FMA;
25314 for (size_t k = 5; k < 8; k++) {
25315 for (uint32_t m = 1; m <= 1; m++) {
25316 for (uint32_t n = 1; n <= 8; n++) {
25317 GemmMicrokernelTester()
25318 .mr(1)
25319 .nr(8)
25320 .kr(1)
25321 .sr(4)
25322 .m(m)
25323 .n(n)
25324 .k(k)
25325 .iterations(1)
25326 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25327 }
25328 }
25329 }
25330 }
25331
25332 TEST(F32_GEMM_1X8S4__NEONFMA, k_div_4) {
25333 TEST_REQUIRES_ARM_NEON_FMA;
25334 for (size_t k = 8; k <= 40; k += 4) {
25335 GemmMicrokernelTester()
25336 .mr(1)
25337 .nr(8)
25338 .kr(1)
25339 .sr(4)
25340 .m(1)
25341 .n(8)
25342 .k(k)
25343 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25344 }
25345 }
25346
25347 TEST(F32_GEMM_1X8S4__NEONFMA, k_div_4_strided_a) {
25348 TEST_REQUIRES_ARM_NEON_FMA;
25349 for (size_t k = 8; k <= 40; k += 4) {
25350 GemmMicrokernelTester()
25351 .mr(1)
25352 .nr(8)
25353 .kr(1)
25354 .sr(4)
25355 .m(1)
25356 .n(8)
25357 .k(k)
25358 .a_stride(43)
25359 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25360 }
25361 }
25362
25363 TEST(F32_GEMM_1X8S4__NEONFMA, k_div_4_subtile) {
25364 TEST_REQUIRES_ARM_NEON_FMA;
25365 for (size_t k = 8; k <= 40; k += 4) {
25366 for (uint32_t m = 1; m <= 1; m++) {
25367 for (uint32_t n = 1; n <= 8; n++) {
25368 GemmMicrokernelTester()
25369 .mr(1)
25370 .nr(8)
25371 .kr(1)
25372 .sr(4)
25373 .m(m)
25374 .n(n)
25375 .k(k)
25376 .iterations(1)
25377 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25378 }
25379 }
25380 }
25381 }
25382
25383 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8) {
25384 TEST_REQUIRES_ARM_NEON_FMA;
25385 for (uint32_t n = 9; n < 16; n++) {
25386 for (size_t k = 1; k <= 20; k += 5) {
25387 GemmMicrokernelTester()
25388 .mr(1)
25389 .nr(8)
25390 .kr(1)
25391 .sr(4)
25392 .m(1)
25393 .n(8)
25394 .k(k)
25395 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25396 }
25397 }
25398 }
25399
25400 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8_strided_cn) {
25401 TEST_REQUIRES_ARM_NEON_FMA;
25402 for (uint32_t n = 9; n < 16; n++) {
25403 for (size_t k = 1; k <= 20; k += 5) {
25404 GemmMicrokernelTester()
25405 .mr(1)
25406 .nr(8)
25407 .kr(1)
25408 .sr(4)
25409 .m(1)
25410 .n(8)
25411 .k(k)
25412 .cn_stride(11)
25413 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25414 }
25415 }
25416 }
25417
25418 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8_strided_a) {
25419 TEST_REQUIRES_ARM_NEON_FMA;
25420 for (uint32_t n = 9; n < 16; n++) {
25421 for (size_t k = 1; k <= 20; k += 5) {
25422 GemmMicrokernelTester()
25423 .mr(1)
25424 .nr(8)
25425 .kr(1)
25426 .sr(4)
25427 .m(1)
25428 .n(n)
25429 .k(k)
25430 .a_stride(23)
25431 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25432 }
25433 }
25434 }
25435
25436 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8_subtile) {
25437 TEST_REQUIRES_ARM_NEON_FMA;
25438 for (uint32_t n = 9; n < 16; n++) {
25439 for (size_t k = 1; k <= 20; k += 5) {
25440 for (uint32_t m = 1; m <= 1; m++) {
25441 GemmMicrokernelTester()
25442 .mr(1)
25443 .nr(8)
25444 .kr(1)
25445 .sr(4)
25446 .m(m)
25447 .n(n)
25448 .k(k)
25449 .iterations(1)
25450 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25451 }
25452 }
25453 }
25454 }
25455
25456 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8) {
25457 TEST_REQUIRES_ARM_NEON_FMA;
25458 for (uint32_t n = 16; n <= 24; n += 8) {
25459 for (size_t k = 1; k <= 20; k += 5) {
25460 GemmMicrokernelTester()
25461 .mr(1)
25462 .nr(8)
25463 .kr(1)
25464 .sr(4)
25465 .m(1)
25466 .n(8)
25467 .k(k)
25468 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25469 }
25470 }
25471 }
25472
25473 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8_strided_cn) {
25474 TEST_REQUIRES_ARM_NEON_FMA;
25475 for (uint32_t n = 16; n <= 24; n += 8) {
25476 for (size_t k = 1; k <= 20; k += 5) {
25477 GemmMicrokernelTester()
25478 .mr(1)
25479 .nr(8)
25480 .kr(1)
25481 .sr(4)
25482 .m(1)
25483 .n(n)
25484 .k(k)
25485 .cn_stride(11)
25486 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25487 }
25488 }
25489 }
25490
25491 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8_strided_a) {
25492 TEST_REQUIRES_ARM_NEON_FMA;
25493 for (uint32_t n = 16; n <= 24; n += 8) {
25494 for (size_t k = 1; k <= 20; k += 5) {
25495 GemmMicrokernelTester()
25496 .mr(1)
25497 .nr(8)
25498 .kr(1)
25499 .sr(4)
25500 .m(1)
25501 .n(n)
25502 .k(k)
25503 .a_stride(23)
25504 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25505 }
25506 }
25507 }
25508
25509 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8_subtile) {
25510 TEST_REQUIRES_ARM_NEON_FMA;
25511 for (uint32_t n = 16; n <= 24; n += 8) {
25512 for (size_t k = 1; k <= 20; k += 5) {
25513 for (uint32_t m = 1; m <= 1; m++) {
25514 GemmMicrokernelTester()
25515 .mr(1)
25516 .nr(8)
25517 .kr(1)
25518 .sr(4)
25519 .m(m)
25520 .n(n)
25521 .k(k)
25522 .iterations(1)
25523 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25524 }
25525 }
25526 }
25527 }
25528
25529 TEST(F32_GEMM_1X8S4__NEONFMA, strided_cm_subtile) {
25530 TEST_REQUIRES_ARM_NEON_FMA;
25531 for (size_t k = 1; k <= 20; k += 5) {
25532 for (uint32_t m = 1; m <= 1; m++) {
25533 for (uint32_t n = 1; n <= 8; n++) {
25534 GemmMicrokernelTester()
25535 .mr(1)
25536 .nr(8)
25537 .kr(1)
25538 .sr(4)
25539 .m(m)
25540 .n(n)
25541 .k(k)
25542 .cm_stride(11)
25543 .iterations(1)
25544 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25545 }
25546 }
25547 }
25548 }
25549
25550 TEST(F32_GEMM_1X8S4__NEONFMA, qmin) {
25551 TEST_REQUIRES_ARM_NEON_FMA;
25552 GemmMicrokernelTester()
25553 .mr(1)
25554 .nr(8)
25555 .kr(1)
25556 .sr(4)
25557 .m(1)
25558 .n(8)
25559 .k(4)
25560 .qmin(128)
25561 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25562 }
25563
25564 TEST(F32_GEMM_1X8S4__NEONFMA, qmax) {
25565 TEST_REQUIRES_ARM_NEON_FMA;
25566 GemmMicrokernelTester()
25567 .mr(1)
25568 .nr(8)
25569 .kr(1)
25570 .sr(4)
25571 .m(1)
25572 .n(8)
25573 .k(4)
25574 .qmax(128)
25575 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25576 }
25577
25578 TEST(F32_GEMM_1X8S4__NEONFMA, strided_cm) {
25579 TEST_REQUIRES_ARM_NEON_FMA;
25580 GemmMicrokernelTester()
25581 .mr(1)
25582 .nr(8)
25583 .kr(1)
25584 .sr(4)
25585 .m(1)
25586 .n(8)
25587 .k(4)
25588 .cm_stride(11)
25589 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
25590 }
25591#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
25592
25593
25594#if XNN_ARCH_ARM || XNN_ARCH_ARM64
25595 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4) {
25596 TEST_REQUIRES_ARM_NEON_FMA;
25597 GemmMicrokernelTester()
25598 .mr(4)
25599 .nr(8)
25600 .kr(1)
25601 .sr(4)
25602 .m(4)
25603 .n(8)
25604 .k(4)
25605 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25606 }
25607
25608 TEST(F32_GEMM_4X8S4__NEONFMA, strided_cn) {
25609 TEST_REQUIRES_ARM_NEON_FMA;
25610 GemmMicrokernelTester()
25611 .mr(4)
25612 .nr(8)
25613 .kr(1)
25614 .sr(4)
25615 .m(4)
25616 .n(8)
25617 .k(4)
25618 .cn_stride(11)
25619 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25620 }
25621
25622 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_strided_a) {
25623 TEST_REQUIRES_ARM_NEON_FMA;
25624 GemmMicrokernelTester()
25625 .mr(4)
25626 .nr(8)
25627 .kr(1)
25628 .sr(4)
25629 .m(4)
25630 .n(8)
25631 .k(4)
25632 .a_stride(7)
25633 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25634 }
25635
25636 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_subtile) {
25637 TEST_REQUIRES_ARM_NEON_FMA;
25638 for (uint32_t m = 1; m <= 4; m++) {
25639 for (uint32_t n = 1; n <= 8; n++) {
25640 GemmMicrokernelTester()
25641 .mr(4)
25642 .nr(8)
25643 .kr(1)
25644 .sr(4)
25645 .m(m)
25646 .n(n)
25647 .k(4)
25648 .iterations(1)
25649 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25650 }
25651 }
25652 }
25653
25654 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_subtile_m) {
25655 TEST_REQUIRES_ARM_NEON_FMA;
25656 for (uint32_t m = 1; m <= 4; m++) {
25657 GemmMicrokernelTester()
25658 .mr(4)
25659 .nr(8)
25660 .kr(1)
25661 .sr(4)
25662 .m(m)
25663 .n(8)
25664 .k(4)
25665 .iterations(1)
25666 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25667 }
25668 }
25669
25670 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_subtile_n) {
25671 TEST_REQUIRES_ARM_NEON_FMA;
25672 for (uint32_t n = 1; n <= 8; n++) {
25673 GemmMicrokernelTester()
25674 .mr(4)
25675 .nr(8)
25676 .kr(1)
25677 .sr(4)
25678 .m(4)
25679 .n(n)
25680 .k(4)
25681 .iterations(1)
25682 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25683 }
25684 }
25685
25686 TEST(F32_GEMM_4X8S4__NEONFMA, k_lt_4) {
25687 TEST_REQUIRES_ARM_NEON_FMA;
25688 for (size_t k = 1; k < 4; k++) {
25689 GemmMicrokernelTester()
25690 .mr(4)
25691 .nr(8)
25692 .kr(1)
25693 .sr(4)
25694 .m(4)
25695 .n(8)
25696 .k(k)
25697 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25698 }
25699 }
25700
25701 TEST(F32_GEMM_4X8S4__NEONFMA, k_lt_4_strided_a) {
25702 TEST_REQUIRES_ARM_NEON_FMA;
25703 for (size_t k = 1; k < 4; k++) {
25704 GemmMicrokernelTester()
25705 .mr(4)
25706 .nr(8)
25707 .kr(1)
25708 .sr(4)
25709 .m(4)
25710 .n(8)
25711 .k(k)
25712 .a_stride(7)
25713 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25714 }
25715 }
25716
25717 TEST(F32_GEMM_4X8S4__NEONFMA, k_lt_4_subtile) {
25718 TEST_REQUIRES_ARM_NEON_FMA;
25719 for (size_t k = 1; k < 4; k++) {
25720 for (uint32_t m = 1; m <= 4; m++) {
25721 for (uint32_t n = 1; n <= 8; n++) {
25722 GemmMicrokernelTester()
25723 .mr(4)
25724 .nr(8)
25725 .kr(1)
25726 .sr(4)
25727 .m(m)
25728 .n(n)
25729 .k(k)
25730 .iterations(1)
25731 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25732 }
25733 }
25734 }
25735 }
25736
25737 TEST(F32_GEMM_4X8S4__NEONFMA, k_gt_4) {
25738 TEST_REQUIRES_ARM_NEON_FMA;
25739 for (size_t k = 5; k < 8; k++) {
25740 GemmMicrokernelTester()
25741 .mr(4)
25742 .nr(8)
25743 .kr(1)
25744 .sr(4)
25745 .m(4)
25746 .n(8)
25747 .k(k)
25748 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25749 }
25750 }
25751
25752 TEST(F32_GEMM_4X8S4__NEONFMA, k_gt_4_strided_a) {
25753 TEST_REQUIRES_ARM_NEON_FMA;
25754 for (size_t k = 5; k < 8; k++) {
25755 GemmMicrokernelTester()
25756 .mr(4)
25757 .nr(8)
25758 .kr(1)
25759 .sr(4)
25760 .m(4)
25761 .n(8)
25762 .k(k)
25763 .a_stride(11)
25764 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25765 }
25766 }
25767
25768 TEST(F32_GEMM_4X8S4__NEONFMA, k_gt_4_subtile) {
25769 TEST_REQUIRES_ARM_NEON_FMA;
25770 for (size_t k = 5; k < 8; k++) {
25771 for (uint32_t m = 1; m <= 4; m++) {
25772 for (uint32_t n = 1; n <= 8; n++) {
25773 GemmMicrokernelTester()
25774 .mr(4)
25775 .nr(8)
25776 .kr(1)
25777 .sr(4)
25778 .m(m)
25779 .n(n)
25780 .k(k)
25781 .iterations(1)
25782 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25783 }
25784 }
25785 }
25786 }
25787
25788 TEST(F32_GEMM_4X8S4__NEONFMA, k_div_4) {
25789 TEST_REQUIRES_ARM_NEON_FMA;
25790 for (size_t k = 8; k <= 40; k += 4) {
25791 GemmMicrokernelTester()
25792 .mr(4)
25793 .nr(8)
25794 .kr(1)
25795 .sr(4)
25796 .m(4)
25797 .n(8)
25798 .k(k)
25799 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25800 }
25801 }
25802
25803 TEST(F32_GEMM_4X8S4__NEONFMA, k_div_4_strided_a) {
25804 TEST_REQUIRES_ARM_NEON_FMA;
25805 for (size_t k = 8; k <= 40; k += 4) {
25806 GemmMicrokernelTester()
25807 .mr(4)
25808 .nr(8)
25809 .kr(1)
25810 .sr(4)
25811 .m(4)
25812 .n(8)
25813 .k(k)
25814 .a_stride(43)
25815 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25816 }
25817 }
25818
25819 TEST(F32_GEMM_4X8S4__NEONFMA, k_div_4_subtile) {
25820 TEST_REQUIRES_ARM_NEON_FMA;
25821 for (size_t k = 8; k <= 40; k += 4) {
25822 for (uint32_t m = 1; m <= 4; m++) {
25823 for (uint32_t n = 1; n <= 8; n++) {
25824 GemmMicrokernelTester()
25825 .mr(4)
25826 .nr(8)
25827 .kr(1)
25828 .sr(4)
25829 .m(m)
25830 .n(n)
25831 .k(k)
25832 .iterations(1)
25833 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25834 }
25835 }
25836 }
25837 }
25838
25839 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8) {
25840 TEST_REQUIRES_ARM_NEON_FMA;
25841 for (uint32_t n = 9; n < 16; n++) {
25842 for (size_t k = 1; k <= 20; k += 5) {
25843 GemmMicrokernelTester()
25844 .mr(4)
25845 .nr(8)
25846 .kr(1)
25847 .sr(4)
25848 .m(4)
25849 .n(8)
25850 .k(k)
25851 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25852 }
25853 }
25854 }
25855
25856 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8_strided_cn) {
25857 TEST_REQUIRES_ARM_NEON_FMA;
25858 for (uint32_t n = 9; n < 16; n++) {
25859 for (size_t k = 1; k <= 20; k += 5) {
25860 GemmMicrokernelTester()
25861 .mr(4)
25862 .nr(8)
25863 .kr(1)
25864 .sr(4)
25865 .m(4)
25866 .n(8)
25867 .k(k)
25868 .cn_stride(11)
25869 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25870 }
25871 }
25872 }
25873
25874 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8_strided_a) {
25875 TEST_REQUIRES_ARM_NEON_FMA;
25876 for (uint32_t n = 9; n < 16; n++) {
25877 for (size_t k = 1; k <= 20; k += 5) {
25878 GemmMicrokernelTester()
25879 .mr(4)
25880 .nr(8)
25881 .kr(1)
25882 .sr(4)
25883 .m(4)
25884 .n(n)
25885 .k(k)
25886 .a_stride(23)
25887 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25888 }
25889 }
25890 }
25891
25892 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8_subtile) {
25893 TEST_REQUIRES_ARM_NEON_FMA;
25894 for (uint32_t n = 9; n < 16; n++) {
25895 for (size_t k = 1; k <= 20; k += 5) {
25896 for (uint32_t m = 1; m <= 4; m++) {
25897 GemmMicrokernelTester()
25898 .mr(4)
25899 .nr(8)
25900 .kr(1)
25901 .sr(4)
25902 .m(m)
25903 .n(n)
25904 .k(k)
25905 .iterations(1)
25906 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25907 }
25908 }
25909 }
25910 }
25911
25912 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8) {
25913 TEST_REQUIRES_ARM_NEON_FMA;
25914 for (uint32_t n = 16; n <= 24; n += 8) {
25915 for (size_t k = 1; k <= 20; k += 5) {
25916 GemmMicrokernelTester()
25917 .mr(4)
25918 .nr(8)
25919 .kr(1)
25920 .sr(4)
25921 .m(4)
25922 .n(8)
25923 .k(k)
25924 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25925 }
25926 }
25927 }
25928
25929 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8_strided_cn) {
25930 TEST_REQUIRES_ARM_NEON_FMA;
25931 for (uint32_t n = 16; n <= 24; n += 8) {
25932 for (size_t k = 1; k <= 20; k += 5) {
25933 GemmMicrokernelTester()
25934 .mr(4)
25935 .nr(8)
25936 .kr(1)
25937 .sr(4)
25938 .m(4)
25939 .n(n)
25940 .k(k)
25941 .cn_stride(11)
25942 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25943 }
25944 }
25945 }
25946
25947 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8_strided_a) {
25948 TEST_REQUIRES_ARM_NEON_FMA;
25949 for (uint32_t n = 16; n <= 24; n += 8) {
25950 for (size_t k = 1; k <= 20; k += 5) {
25951 GemmMicrokernelTester()
25952 .mr(4)
25953 .nr(8)
25954 .kr(1)
25955 .sr(4)
25956 .m(4)
25957 .n(n)
25958 .k(k)
25959 .a_stride(23)
25960 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25961 }
25962 }
25963 }
25964
25965 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8_subtile) {
25966 TEST_REQUIRES_ARM_NEON_FMA;
25967 for (uint32_t n = 16; n <= 24; n += 8) {
25968 for (size_t k = 1; k <= 20; k += 5) {
25969 for (uint32_t m = 1; m <= 4; m++) {
25970 GemmMicrokernelTester()
25971 .mr(4)
25972 .nr(8)
25973 .kr(1)
25974 .sr(4)
25975 .m(m)
25976 .n(n)
25977 .k(k)
25978 .iterations(1)
25979 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
25980 }
25981 }
25982 }
25983 }
25984
25985 TEST(F32_GEMM_4X8S4__NEONFMA, strided_cm_subtile) {
25986 TEST_REQUIRES_ARM_NEON_FMA;
25987 for (size_t k = 1; k <= 20; k += 5) {
25988 for (uint32_t m = 1; m <= 4; m++) {
25989 for (uint32_t n = 1; n <= 8; n++) {
25990 GemmMicrokernelTester()
25991 .mr(4)
25992 .nr(8)
25993 .kr(1)
25994 .sr(4)
25995 .m(m)
25996 .n(n)
25997 .k(k)
25998 .cm_stride(11)
25999 .iterations(1)
26000 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
26001 }
26002 }
26003 }
26004 }
26005
26006 TEST(F32_GEMM_4X8S4__NEONFMA, qmin) {
26007 TEST_REQUIRES_ARM_NEON_FMA;
26008 GemmMicrokernelTester()
26009 .mr(4)
26010 .nr(8)
26011 .kr(1)
26012 .sr(4)
26013 .m(4)
26014 .n(8)
26015 .k(4)
26016 .qmin(128)
26017 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
26018 }
26019
26020 TEST(F32_GEMM_4X8S4__NEONFMA, qmax) {
26021 TEST_REQUIRES_ARM_NEON_FMA;
26022 GemmMicrokernelTester()
26023 .mr(4)
26024 .nr(8)
26025 .kr(1)
26026 .sr(4)
26027 .m(4)
26028 .n(8)
26029 .k(4)
26030 .qmax(128)
26031 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
26032 }
26033
26034 TEST(F32_GEMM_4X8S4__NEONFMA, strided_cm) {
26035 TEST_REQUIRES_ARM_NEON_FMA;
26036 GemmMicrokernelTester()
26037 .mr(4)
26038 .nr(8)
26039 .kr(1)
26040 .sr(4)
26041 .m(4)
26042 .n(8)
26043 .k(4)
26044 .cm_stride(11)
26045 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
26046 }
26047#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
26048
26049
26050#if XNN_ARCH_ARM || XNN_ARCH_ARM64
26051 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4) {
26052 TEST_REQUIRES_ARM_NEON_FMA;
26053 GemmMicrokernelTester()
26054 .mr(6)
26055 .nr(8)
26056 .kr(1)
26057 .sr(4)
26058 .m(6)
26059 .n(8)
26060 .k(4)
26061 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26062 }
26063
26064 TEST(F32_GEMM_6X8S4__NEONFMA, strided_cn) {
26065 TEST_REQUIRES_ARM_NEON_FMA;
26066 GemmMicrokernelTester()
26067 .mr(6)
26068 .nr(8)
26069 .kr(1)
26070 .sr(4)
26071 .m(6)
26072 .n(8)
26073 .k(4)
26074 .cn_stride(11)
26075 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26076 }
26077
26078 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_strided_a) {
26079 TEST_REQUIRES_ARM_NEON_FMA;
26080 GemmMicrokernelTester()
26081 .mr(6)
26082 .nr(8)
26083 .kr(1)
26084 .sr(4)
26085 .m(6)
26086 .n(8)
26087 .k(4)
26088 .a_stride(7)
26089 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26090 }
26091
26092 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_subtile) {
26093 TEST_REQUIRES_ARM_NEON_FMA;
26094 for (uint32_t m = 1; m <= 6; m++) {
26095 for (uint32_t n = 1; n <= 8; n++) {
26096 GemmMicrokernelTester()
26097 .mr(6)
26098 .nr(8)
26099 .kr(1)
26100 .sr(4)
26101 .m(m)
26102 .n(n)
26103 .k(4)
26104 .iterations(1)
26105 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26106 }
26107 }
26108 }
26109
26110 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_subtile_m) {
26111 TEST_REQUIRES_ARM_NEON_FMA;
26112 for (uint32_t m = 1; m <= 6; m++) {
26113 GemmMicrokernelTester()
26114 .mr(6)
26115 .nr(8)
26116 .kr(1)
26117 .sr(4)
26118 .m(m)
26119 .n(8)
26120 .k(4)
26121 .iterations(1)
26122 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26123 }
26124 }
26125
26126 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_subtile_n) {
26127 TEST_REQUIRES_ARM_NEON_FMA;
26128 for (uint32_t n = 1; n <= 8; n++) {
26129 GemmMicrokernelTester()
26130 .mr(6)
26131 .nr(8)
26132 .kr(1)
26133 .sr(4)
26134 .m(6)
26135 .n(n)
26136 .k(4)
26137 .iterations(1)
26138 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26139 }
26140 }
26141
26142 TEST(F32_GEMM_6X8S4__NEONFMA, k_lt_4) {
26143 TEST_REQUIRES_ARM_NEON_FMA;
26144 for (size_t k = 1; k < 4; k++) {
26145 GemmMicrokernelTester()
26146 .mr(6)
26147 .nr(8)
26148 .kr(1)
26149 .sr(4)
26150 .m(6)
26151 .n(8)
26152 .k(k)
26153 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26154 }
26155 }
26156
26157 TEST(F32_GEMM_6X8S4__NEONFMA, k_lt_4_strided_a) {
26158 TEST_REQUIRES_ARM_NEON_FMA;
26159 for (size_t k = 1; k < 4; k++) {
26160 GemmMicrokernelTester()
26161 .mr(6)
26162 .nr(8)
26163 .kr(1)
26164 .sr(4)
26165 .m(6)
26166 .n(8)
26167 .k(k)
26168 .a_stride(7)
26169 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26170 }
26171 }
26172
26173 TEST(F32_GEMM_6X8S4__NEONFMA, k_lt_4_subtile) {
26174 TEST_REQUIRES_ARM_NEON_FMA;
26175 for (size_t k = 1; k < 4; k++) {
26176 for (uint32_t m = 1; m <= 6; m++) {
26177 for (uint32_t n = 1; n <= 8; n++) {
26178 GemmMicrokernelTester()
26179 .mr(6)
26180 .nr(8)
26181 .kr(1)
26182 .sr(4)
26183 .m(m)
26184 .n(n)
26185 .k(k)
26186 .iterations(1)
26187 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26188 }
26189 }
26190 }
26191 }
26192
26193 TEST(F32_GEMM_6X8S4__NEONFMA, k_gt_4) {
26194 TEST_REQUIRES_ARM_NEON_FMA;
26195 for (size_t k = 5; k < 8; k++) {
26196 GemmMicrokernelTester()
26197 .mr(6)
26198 .nr(8)
26199 .kr(1)
26200 .sr(4)
26201 .m(6)
26202 .n(8)
26203 .k(k)
26204 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26205 }
26206 }
26207
26208 TEST(F32_GEMM_6X8S4__NEONFMA, k_gt_4_strided_a) {
26209 TEST_REQUIRES_ARM_NEON_FMA;
26210 for (size_t k = 5; k < 8; k++) {
26211 GemmMicrokernelTester()
26212 .mr(6)
26213 .nr(8)
26214 .kr(1)
26215 .sr(4)
26216 .m(6)
26217 .n(8)
26218 .k(k)
26219 .a_stride(11)
26220 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26221 }
26222 }
26223
26224 TEST(F32_GEMM_6X8S4__NEONFMA, k_gt_4_subtile) {
26225 TEST_REQUIRES_ARM_NEON_FMA;
26226 for (size_t k = 5; k < 8; k++) {
26227 for (uint32_t m = 1; m <= 6; m++) {
26228 for (uint32_t n = 1; n <= 8; n++) {
26229 GemmMicrokernelTester()
26230 .mr(6)
26231 .nr(8)
26232 .kr(1)
26233 .sr(4)
26234 .m(m)
26235 .n(n)
26236 .k(k)
26237 .iterations(1)
26238 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26239 }
26240 }
26241 }
26242 }
26243
26244 TEST(F32_GEMM_6X8S4__NEONFMA, k_div_4) {
26245 TEST_REQUIRES_ARM_NEON_FMA;
26246 for (size_t k = 8; k <= 40; k += 4) {
26247 GemmMicrokernelTester()
26248 .mr(6)
26249 .nr(8)
26250 .kr(1)
26251 .sr(4)
26252 .m(6)
26253 .n(8)
26254 .k(k)
26255 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26256 }
26257 }
26258
26259 TEST(F32_GEMM_6X8S4__NEONFMA, k_div_4_strided_a) {
26260 TEST_REQUIRES_ARM_NEON_FMA;
26261 for (size_t k = 8; k <= 40; k += 4) {
26262 GemmMicrokernelTester()
26263 .mr(6)
26264 .nr(8)
26265 .kr(1)
26266 .sr(4)
26267 .m(6)
26268 .n(8)
26269 .k(k)
26270 .a_stride(43)
26271 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26272 }
26273 }
26274
26275 TEST(F32_GEMM_6X8S4__NEONFMA, k_div_4_subtile) {
26276 TEST_REQUIRES_ARM_NEON_FMA;
26277 for (size_t k = 8; k <= 40; k += 4) {
26278 for (uint32_t m = 1; m <= 6; m++) {
26279 for (uint32_t n = 1; n <= 8; n++) {
26280 GemmMicrokernelTester()
26281 .mr(6)
26282 .nr(8)
26283 .kr(1)
26284 .sr(4)
26285 .m(m)
26286 .n(n)
26287 .k(k)
26288 .iterations(1)
26289 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26290 }
26291 }
26292 }
26293 }
26294
26295 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8) {
26296 TEST_REQUIRES_ARM_NEON_FMA;
26297 for (uint32_t n = 9; n < 16; n++) {
26298 for (size_t k = 1; k <= 20; k += 5) {
26299 GemmMicrokernelTester()
26300 .mr(6)
26301 .nr(8)
26302 .kr(1)
26303 .sr(4)
26304 .m(6)
26305 .n(8)
26306 .k(k)
26307 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26308 }
26309 }
26310 }
26311
26312 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8_strided_cn) {
26313 TEST_REQUIRES_ARM_NEON_FMA;
26314 for (uint32_t n = 9; n < 16; n++) {
26315 for (size_t k = 1; k <= 20; k += 5) {
26316 GemmMicrokernelTester()
26317 .mr(6)
26318 .nr(8)
26319 .kr(1)
26320 .sr(4)
26321 .m(6)
26322 .n(8)
26323 .k(k)
26324 .cn_stride(11)
26325 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26326 }
26327 }
26328 }
26329
26330 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8_strided_a) {
26331 TEST_REQUIRES_ARM_NEON_FMA;
26332 for (uint32_t n = 9; n < 16; n++) {
26333 for (size_t k = 1; k <= 20; k += 5) {
26334 GemmMicrokernelTester()
26335 .mr(6)
26336 .nr(8)
26337 .kr(1)
26338 .sr(4)
26339 .m(6)
26340 .n(n)
26341 .k(k)
26342 .a_stride(23)
26343 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26344 }
26345 }
26346 }
26347
26348 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8_subtile) {
26349 TEST_REQUIRES_ARM_NEON_FMA;
26350 for (uint32_t n = 9; n < 16; n++) {
26351 for (size_t k = 1; k <= 20; k += 5) {
26352 for (uint32_t m = 1; m <= 6; m++) {
26353 GemmMicrokernelTester()
26354 .mr(6)
26355 .nr(8)
26356 .kr(1)
26357 .sr(4)
26358 .m(m)
26359 .n(n)
26360 .k(k)
26361 .iterations(1)
26362 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26363 }
26364 }
26365 }
26366 }
26367
26368 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8) {
26369 TEST_REQUIRES_ARM_NEON_FMA;
26370 for (uint32_t n = 16; n <= 24; n += 8) {
26371 for (size_t k = 1; k <= 20; k += 5) {
26372 GemmMicrokernelTester()
26373 .mr(6)
26374 .nr(8)
26375 .kr(1)
26376 .sr(4)
26377 .m(6)
26378 .n(8)
26379 .k(k)
26380 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26381 }
26382 }
26383 }
26384
26385 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8_strided_cn) {
26386 TEST_REQUIRES_ARM_NEON_FMA;
26387 for (uint32_t n = 16; n <= 24; n += 8) {
26388 for (size_t k = 1; k <= 20; k += 5) {
26389 GemmMicrokernelTester()
26390 .mr(6)
26391 .nr(8)
26392 .kr(1)
26393 .sr(4)
26394 .m(6)
26395 .n(n)
26396 .k(k)
26397 .cn_stride(11)
26398 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26399 }
26400 }
26401 }
26402
26403 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8_strided_a) {
26404 TEST_REQUIRES_ARM_NEON_FMA;
26405 for (uint32_t n = 16; n <= 24; n += 8) {
26406 for (size_t k = 1; k <= 20; k += 5) {
26407 GemmMicrokernelTester()
26408 .mr(6)
26409 .nr(8)
26410 .kr(1)
26411 .sr(4)
26412 .m(6)
26413 .n(n)
26414 .k(k)
26415 .a_stride(23)
26416 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26417 }
26418 }
26419 }
26420
26421 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8_subtile) {
26422 TEST_REQUIRES_ARM_NEON_FMA;
26423 for (uint32_t n = 16; n <= 24; n += 8) {
26424 for (size_t k = 1; k <= 20; k += 5) {
26425 for (uint32_t m = 1; m <= 6; m++) {
26426 GemmMicrokernelTester()
26427 .mr(6)
26428 .nr(8)
26429 .kr(1)
26430 .sr(4)
26431 .m(m)
26432 .n(n)
26433 .k(k)
26434 .iterations(1)
26435 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26436 }
26437 }
26438 }
26439 }
26440
26441 TEST(F32_GEMM_6X8S4__NEONFMA, strided_cm_subtile) {
26442 TEST_REQUIRES_ARM_NEON_FMA;
26443 for (size_t k = 1; k <= 20; k += 5) {
26444 for (uint32_t m = 1; m <= 6; m++) {
26445 for (uint32_t n = 1; n <= 8; n++) {
26446 GemmMicrokernelTester()
26447 .mr(6)
26448 .nr(8)
26449 .kr(1)
26450 .sr(4)
26451 .m(m)
26452 .n(n)
26453 .k(k)
26454 .cm_stride(11)
26455 .iterations(1)
26456 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26457 }
26458 }
26459 }
26460 }
26461
26462 TEST(F32_GEMM_6X8S4__NEONFMA, qmin) {
26463 TEST_REQUIRES_ARM_NEON_FMA;
26464 GemmMicrokernelTester()
26465 .mr(6)
26466 .nr(8)
26467 .kr(1)
26468 .sr(4)
26469 .m(6)
26470 .n(8)
26471 .k(4)
26472 .qmin(128)
26473 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26474 }
26475
26476 TEST(F32_GEMM_6X8S4__NEONFMA, qmax) {
26477 TEST_REQUIRES_ARM_NEON_FMA;
26478 GemmMicrokernelTester()
26479 .mr(6)
26480 .nr(8)
26481 .kr(1)
26482 .sr(4)
26483 .m(6)
26484 .n(8)
26485 .k(4)
26486 .qmax(128)
26487 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26488 }
26489
26490 TEST(F32_GEMM_6X8S4__NEONFMA, strided_cm) {
26491 TEST_REQUIRES_ARM_NEON_FMA;
26492 GemmMicrokernelTester()
26493 .mr(6)
26494 .nr(8)
26495 .kr(1)
26496 .sr(4)
26497 .m(6)
26498 .n(8)
26499 .k(4)
26500 .cm_stride(11)
26501 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
26502 }
26503#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
26504
26505
26506#if XNN_ARCH_ARM || XNN_ARCH_ARM64
26507 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4) {
26508 TEST_REQUIRES_ARM_NEON_FMA;
26509 GemmMicrokernelTester()
26510 .mr(8)
26511 .nr(8)
26512 .kr(1)
26513 .sr(4)
26514 .m(8)
26515 .n(8)
26516 .k(4)
26517 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26518 }
26519
26520 TEST(F32_GEMM_8X8S4__NEONFMA, strided_cn) {
26521 TEST_REQUIRES_ARM_NEON_FMA;
26522 GemmMicrokernelTester()
26523 .mr(8)
26524 .nr(8)
26525 .kr(1)
26526 .sr(4)
26527 .m(8)
26528 .n(8)
26529 .k(4)
26530 .cn_stride(11)
26531 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26532 }
26533
26534 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_strided_a) {
26535 TEST_REQUIRES_ARM_NEON_FMA;
26536 GemmMicrokernelTester()
26537 .mr(8)
26538 .nr(8)
26539 .kr(1)
26540 .sr(4)
26541 .m(8)
26542 .n(8)
26543 .k(4)
26544 .a_stride(7)
26545 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26546 }
26547
26548 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_subtile) {
26549 TEST_REQUIRES_ARM_NEON_FMA;
26550 for (uint32_t m = 1; m <= 8; m++) {
26551 for (uint32_t n = 1; n <= 8; n++) {
26552 GemmMicrokernelTester()
26553 .mr(8)
26554 .nr(8)
26555 .kr(1)
26556 .sr(4)
26557 .m(m)
26558 .n(n)
26559 .k(4)
26560 .iterations(1)
26561 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26562 }
26563 }
26564 }
26565
26566 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_subtile_m) {
26567 TEST_REQUIRES_ARM_NEON_FMA;
26568 for (uint32_t m = 1; m <= 8; m++) {
26569 GemmMicrokernelTester()
26570 .mr(8)
26571 .nr(8)
26572 .kr(1)
26573 .sr(4)
26574 .m(m)
26575 .n(8)
26576 .k(4)
26577 .iterations(1)
26578 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26579 }
26580 }
26581
26582 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_subtile_n) {
26583 TEST_REQUIRES_ARM_NEON_FMA;
26584 for (uint32_t n = 1; n <= 8; n++) {
26585 GemmMicrokernelTester()
26586 .mr(8)
26587 .nr(8)
26588 .kr(1)
26589 .sr(4)
26590 .m(8)
26591 .n(n)
26592 .k(4)
26593 .iterations(1)
26594 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26595 }
26596 }
26597
26598 TEST(F32_GEMM_8X8S4__NEONFMA, k_lt_4) {
26599 TEST_REQUIRES_ARM_NEON_FMA;
26600 for (size_t k = 1; k < 4; k++) {
26601 GemmMicrokernelTester()
26602 .mr(8)
26603 .nr(8)
26604 .kr(1)
26605 .sr(4)
26606 .m(8)
26607 .n(8)
26608 .k(k)
26609 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26610 }
26611 }
26612
26613 TEST(F32_GEMM_8X8S4__NEONFMA, k_lt_4_strided_a) {
26614 TEST_REQUIRES_ARM_NEON_FMA;
26615 for (size_t k = 1; k < 4; k++) {
26616 GemmMicrokernelTester()
26617 .mr(8)
26618 .nr(8)
26619 .kr(1)
26620 .sr(4)
26621 .m(8)
26622 .n(8)
26623 .k(k)
26624 .a_stride(7)
26625 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26626 }
26627 }
26628
26629 TEST(F32_GEMM_8X8S4__NEONFMA, k_lt_4_subtile) {
26630 TEST_REQUIRES_ARM_NEON_FMA;
26631 for (size_t k = 1; k < 4; k++) {
26632 for (uint32_t m = 1; m <= 8; m++) {
26633 for (uint32_t n = 1; n <= 8; n++) {
26634 GemmMicrokernelTester()
26635 .mr(8)
26636 .nr(8)
26637 .kr(1)
26638 .sr(4)
26639 .m(m)
26640 .n(n)
26641 .k(k)
26642 .iterations(1)
26643 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26644 }
26645 }
26646 }
26647 }
26648
26649 TEST(F32_GEMM_8X8S4__NEONFMA, k_gt_4) {
26650 TEST_REQUIRES_ARM_NEON_FMA;
26651 for (size_t k = 5; k < 8; k++) {
26652 GemmMicrokernelTester()
26653 .mr(8)
26654 .nr(8)
26655 .kr(1)
26656 .sr(4)
26657 .m(8)
26658 .n(8)
26659 .k(k)
26660 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26661 }
26662 }
26663
26664 TEST(F32_GEMM_8X8S4__NEONFMA, k_gt_4_strided_a) {
26665 TEST_REQUIRES_ARM_NEON_FMA;
26666 for (size_t k = 5; k < 8; k++) {
26667 GemmMicrokernelTester()
26668 .mr(8)
26669 .nr(8)
26670 .kr(1)
26671 .sr(4)
26672 .m(8)
26673 .n(8)
26674 .k(k)
26675 .a_stride(11)
26676 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26677 }
26678 }
26679
26680 TEST(F32_GEMM_8X8S4__NEONFMA, k_gt_4_subtile) {
26681 TEST_REQUIRES_ARM_NEON_FMA;
26682 for (size_t k = 5; k < 8; k++) {
26683 for (uint32_t m = 1; m <= 8; m++) {
26684 for (uint32_t n = 1; n <= 8; n++) {
26685 GemmMicrokernelTester()
26686 .mr(8)
26687 .nr(8)
26688 .kr(1)
26689 .sr(4)
26690 .m(m)
26691 .n(n)
26692 .k(k)
26693 .iterations(1)
26694 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26695 }
26696 }
26697 }
26698 }
26699
26700 TEST(F32_GEMM_8X8S4__NEONFMA, k_div_4) {
26701 TEST_REQUIRES_ARM_NEON_FMA;
26702 for (size_t k = 8; k <= 40; k += 4) {
26703 GemmMicrokernelTester()
26704 .mr(8)
26705 .nr(8)
26706 .kr(1)
26707 .sr(4)
26708 .m(8)
26709 .n(8)
26710 .k(k)
26711 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26712 }
26713 }
26714
26715 TEST(F32_GEMM_8X8S4__NEONFMA, k_div_4_strided_a) {
26716 TEST_REQUIRES_ARM_NEON_FMA;
26717 for (size_t k = 8; k <= 40; k += 4) {
26718 GemmMicrokernelTester()
26719 .mr(8)
26720 .nr(8)
26721 .kr(1)
26722 .sr(4)
26723 .m(8)
26724 .n(8)
26725 .k(k)
26726 .a_stride(43)
26727 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26728 }
26729 }
26730
26731 TEST(F32_GEMM_8X8S4__NEONFMA, k_div_4_subtile) {
26732 TEST_REQUIRES_ARM_NEON_FMA;
26733 for (size_t k = 8; k <= 40; k += 4) {
26734 for (uint32_t m = 1; m <= 8; m++) {
26735 for (uint32_t n = 1; n <= 8; n++) {
26736 GemmMicrokernelTester()
26737 .mr(8)
26738 .nr(8)
26739 .kr(1)
26740 .sr(4)
26741 .m(m)
26742 .n(n)
26743 .k(k)
26744 .iterations(1)
26745 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26746 }
26747 }
26748 }
26749 }
26750
26751 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8) {
26752 TEST_REQUIRES_ARM_NEON_FMA;
26753 for (uint32_t n = 9; n < 16; n++) {
26754 for (size_t k = 1; k <= 20; k += 5) {
26755 GemmMicrokernelTester()
26756 .mr(8)
26757 .nr(8)
26758 .kr(1)
26759 .sr(4)
26760 .m(8)
26761 .n(8)
26762 .k(k)
26763 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26764 }
26765 }
26766 }
26767
26768 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8_strided_cn) {
26769 TEST_REQUIRES_ARM_NEON_FMA;
26770 for (uint32_t n = 9; n < 16; n++) {
26771 for (size_t k = 1; k <= 20; k += 5) {
26772 GemmMicrokernelTester()
26773 .mr(8)
26774 .nr(8)
26775 .kr(1)
26776 .sr(4)
26777 .m(8)
26778 .n(8)
26779 .k(k)
26780 .cn_stride(11)
26781 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26782 }
26783 }
26784 }
26785
26786 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8_strided_a) {
26787 TEST_REQUIRES_ARM_NEON_FMA;
26788 for (uint32_t n = 9; n < 16; n++) {
26789 for (size_t k = 1; k <= 20; k += 5) {
26790 GemmMicrokernelTester()
26791 .mr(8)
26792 .nr(8)
26793 .kr(1)
26794 .sr(4)
26795 .m(8)
26796 .n(n)
26797 .k(k)
26798 .a_stride(23)
26799 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26800 }
26801 }
26802 }
26803
26804 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8_subtile) {
26805 TEST_REQUIRES_ARM_NEON_FMA;
26806 for (uint32_t n = 9; n < 16; n++) {
26807 for (size_t k = 1; k <= 20; k += 5) {
26808 for (uint32_t m = 1; m <= 8; m++) {
26809 GemmMicrokernelTester()
26810 .mr(8)
26811 .nr(8)
26812 .kr(1)
26813 .sr(4)
26814 .m(m)
26815 .n(n)
26816 .k(k)
26817 .iterations(1)
26818 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26819 }
26820 }
26821 }
26822 }
26823
26824 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8) {
26825 TEST_REQUIRES_ARM_NEON_FMA;
26826 for (uint32_t n = 16; n <= 24; n += 8) {
26827 for (size_t k = 1; k <= 20; k += 5) {
26828 GemmMicrokernelTester()
26829 .mr(8)
26830 .nr(8)
26831 .kr(1)
26832 .sr(4)
26833 .m(8)
26834 .n(8)
26835 .k(k)
26836 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26837 }
26838 }
26839 }
26840
26841 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8_strided_cn) {
26842 TEST_REQUIRES_ARM_NEON_FMA;
26843 for (uint32_t n = 16; n <= 24; n += 8) {
26844 for (size_t k = 1; k <= 20; k += 5) {
26845 GemmMicrokernelTester()
26846 .mr(8)
26847 .nr(8)
26848 .kr(1)
26849 .sr(4)
26850 .m(8)
26851 .n(n)
26852 .k(k)
26853 .cn_stride(11)
26854 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26855 }
26856 }
26857 }
26858
26859 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8_strided_a) {
26860 TEST_REQUIRES_ARM_NEON_FMA;
26861 for (uint32_t n = 16; n <= 24; n += 8) {
26862 for (size_t k = 1; k <= 20; k += 5) {
26863 GemmMicrokernelTester()
26864 .mr(8)
26865 .nr(8)
26866 .kr(1)
26867 .sr(4)
26868 .m(8)
26869 .n(n)
26870 .k(k)
26871 .a_stride(23)
26872 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26873 }
26874 }
26875 }
26876
26877 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8_subtile) {
26878 TEST_REQUIRES_ARM_NEON_FMA;
26879 for (uint32_t n = 16; n <= 24; n += 8) {
26880 for (size_t k = 1; k <= 20; k += 5) {
26881 for (uint32_t m = 1; m <= 8; m++) {
26882 GemmMicrokernelTester()
26883 .mr(8)
26884 .nr(8)
26885 .kr(1)
26886 .sr(4)
26887 .m(m)
26888 .n(n)
26889 .k(k)
26890 .iterations(1)
26891 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26892 }
26893 }
26894 }
26895 }
26896
26897 TEST(F32_GEMM_8X8S4__NEONFMA, strided_cm_subtile) {
26898 TEST_REQUIRES_ARM_NEON_FMA;
26899 for (size_t k = 1; k <= 20; k += 5) {
26900 for (uint32_t m = 1; m <= 8; m++) {
26901 for (uint32_t n = 1; n <= 8; n++) {
26902 GemmMicrokernelTester()
26903 .mr(8)
26904 .nr(8)
26905 .kr(1)
26906 .sr(4)
26907 .m(m)
26908 .n(n)
26909 .k(k)
26910 .cm_stride(11)
26911 .iterations(1)
26912 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26913 }
26914 }
26915 }
26916 }
26917
26918 TEST(F32_GEMM_8X8S4__NEONFMA, qmin) {
26919 TEST_REQUIRES_ARM_NEON_FMA;
26920 GemmMicrokernelTester()
26921 .mr(8)
26922 .nr(8)
26923 .kr(1)
26924 .sr(4)
26925 .m(8)
26926 .n(8)
26927 .k(4)
26928 .qmin(128)
26929 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26930 }
26931
26932 TEST(F32_GEMM_8X8S4__NEONFMA, qmax) {
26933 TEST_REQUIRES_ARM_NEON_FMA;
26934 GemmMicrokernelTester()
26935 .mr(8)
26936 .nr(8)
26937 .kr(1)
26938 .sr(4)
26939 .m(8)
26940 .n(8)
26941 .k(4)
26942 .qmax(128)
26943 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26944 }
26945
26946 TEST(F32_GEMM_8X8S4__NEONFMA, strided_cm) {
26947 TEST_REQUIRES_ARM_NEON_FMA;
26948 GemmMicrokernelTester()
26949 .mr(8)
26950 .nr(8)
26951 .kr(1)
26952 .sr(4)
26953 .m(8)
26954 .n(8)
26955 .k(4)
26956 .cm_stride(11)
26957 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
26958 }
26959#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
26960
26961
26962#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26963 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1) {
26964 TEST_REQUIRES_X86_SSE;
26965 GemmMicrokernelTester()
26966 .mr(1)
26967 .nr(8)
26968 .kr(1)
26969 .sr(1)
26970 .m(1)
26971 .n(8)
26972 .k(1)
26973 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
26974 }
26975
26976 TEST(F32_GEMM_1X8__SSE_LOAD1, strided_cn) {
26977 TEST_REQUIRES_X86_SSE;
26978 GemmMicrokernelTester()
26979 .mr(1)
26980 .nr(8)
26981 .kr(1)
26982 .sr(1)
26983 .m(1)
26984 .n(8)
26985 .k(1)
26986 .cn_stride(11)
26987 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
26988 }
26989
26990 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_strided_a) {
26991 TEST_REQUIRES_X86_SSE;
26992 GemmMicrokernelTester()
26993 .mr(1)
26994 .nr(8)
26995 .kr(1)
26996 .sr(1)
26997 .m(1)
26998 .n(8)
26999 .k(1)
27000 .a_stride(3)
27001 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27002 }
27003
27004 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_subtile) {
27005 TEST_REQUIRES_X86_SSE;
27006 for (uint32_t m = 1; m <= 1; m++) {
27007 for (uint32_t n = 1; n <= 8; n++) {
27008 GemmMicrokernelTester()
27009 .mr(1)
27010 .nr(8)
27011 .kr(1)
27012 .sr(1)
27013 .m(m)
27014 .n(n)
27015 .k(1)
27016 .iterations(1)
27017 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27018 }
27019 }
27020 }
27021
27022 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_subtile_m) {
27023 TEST_REQUIRES_X86_SSE;
27024 for (uint32_t m = 1; m <= 1; m++) {
27025 GemmMicrokernelTester()
27026 .mr(1)
27027 .nr(8)
27028 .kr(1)
27029 .sr(1)
27030 .m(m)
27031 .n(8)
27032 .k(1)
27033 .iterations(1)
27034 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27035 }
27036 }
27037
27038 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_subtile_n) {
27039 TEST_REQUIRES_X86_SSE;
27040 for (uint32_t n = 1; n <= 8; n++) {
27041 GemmMicrokernelTester()
27042 .mr(1)
27043 .nr(8)
27044 .kr(1)
27045 .sr(1)
27046 .m(1)
27047 .n(n)
27048 .k(1)
27049 .iterations(1)
27050 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27051 }
27052 }
27053
27054 TEST(F32_GEMM_1X8__SSE_LOAD1, k_gt_1) {
27055 TEST_REQUIRES_X86_SSE;
27056 for (size_t k = 2; k < 10; k++) {
27057 GemmMicrokernelTester()
27058 .mr(1)
27059 .nr(8)
27060 .kr(1)
27061 .sr(1)
27062 .m(1)
27063 .n(8)
27064 .k(k)
27065 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27066 }
27067 }
27068
27069 TEST(F32_GEMM_1X8__SSE_LOAD1, k_gt_1_strided_a) {
27070 TEST_REQUIRES_X86_SSE;
27071 for (size_t k = 2; k < 10; k++) {
27072 GemmMicrokernelTester()
27073 .mr(1)
27074 .nr(8)
27075 .kr(1)
27076 .sr(1)
27077 .m(1)
27078 .n(8)
27079 .k(k)
27080 .a_stride(11)
27081 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27082 }
27083 }
27084
27085 TEST(F32_GEMM_1X8__SSE_LOAD1, k_gt_1_subtile) {
27086 TEST_REQUIRES_X86_SSE;
27087 for (size_t k = 2; k < 10; k++) {
27088 for (uint32_t m = 1; m <= 1; m++) {
27089 for (uint32_t n = 1; n <= 8; n++) {
27090 GemmMicrokernelTester()
27091 .mr(1)
27092 .nr(8)
27093 .kr(1)
27094 .sr(1)
27095 .m(m)
27096 .n(n)
27097 .k(k)
27098 .iterations(1)
27099 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27100 }
27101 }
27102 }
27103 }
27104
27105 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8) {
27106 TEST_REQUIRES_X86_SSE;
27107 for (uint32_t n = 9; n < 16; n++) {
27108 for (size_t k = 1; k <= 5; k += 2) {
27109 GemmMicrokernelTester()
27110 .mr(1)
27111 .nr(8)
27112 .kr(1)
27113 .sr(1)
27114 .m(1)
27115 .n(8)
27116 .k(k)
27117 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27118 }
27119 }
27120 }
27121
27122 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8_strided_cn) {
27123 TEST_REQUIRES_X86_SSE;
27124 for (uint32_t n = 9; n < 16; n++) {
27125 for (size_t k = 1; k <= 5; k += 2) {
27126 GemmMicrokernelTester()
27127 .mr(1)
27128 .nr(8)
27129 .kr(1)
27130 .sr(1)
27131 .m(1)
27132 .n(8)
27133 .k(k)
27134 .cn_stride(11)
27135 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27136 }
27137 }
27138 }
27139
27140 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8_strided_a) {
27141 TEST_REQUIRES_X86_SSE;
27142 for (uint32_t n = 9; n < 16; n++) {
27143 for (size_t k = 1; k <= 5; k += 2) {
27144 GemmMicrokernelTester()
27145 .mr(1)
27146 .nr(8)
27147 .kr(1)
27148 .sr(1)
27149 .m(1)
27150 .n(n)
27151 .k(k)
27152 .a_stride(7)
27153 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27154 }
27155 }
27156 }
27157
27158 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8_subtile) {
27159 TEST_REQUIRES_X86_SSE;
27160 for (uint32_t n = 9; n < 16; n++) {
27161 for (size_t k = 1; k <= 5; k += 2) {
27162 for (uint32_t m = 1; m <= 1; m++) {
27163 GemmMicrokernelTester()
27164 .mr(1)
27165 .nr(8)
27166 .kr(1)
27167 .sr(1)
27168 .m(m)
27169 .n(n)
27170 .k(k)
27171 .iterations(1)
27172 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27173 }
27174 }
27175 }
27176 }
27177
27178 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8) {
27179 TEST_REQUIRES_X86_SSE;
27180 for (uint32_t n = 16; n <= 24; n += 8) {
27181 for (size_t k = 1; k <= 5; k += 2) {
27182 GemmMicrokernelTester()
27183 .mr(1)
27184 .nr(8)
27185 .kr(1)
27186 .sr(1)
27187 .m(1)
27188 .n(8)
27189 .k(k)
27190 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27191 }
27192 }
27193 }
27194
27195 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8_strided_cn) {
27196 TEST_REQUIRES_X86_SSE;
27197 for (uint32_t n = 16; n <= 24; n += 8) {
27198 for (size_t k = 1; k <= 5; k += 2) {
27199 GemmMicrokernelTester()
27200 .mr(1)
27201 .nr(8)
27202 .kr(1)
27203 .sr(1)
27204 .m(1)
27205 .n(n)
27206 .k(k)
27207 .cn_stride(11)
27208 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27209 }
27210 }
27211 }
27212
27213 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8_strided_a) {
27214 TEST_REQUIRES_X86_SSE;
27215 for (uint32_t n = 16; n <= 24; n += 8) {
27216 for (size_t k = 1; k <= 5; k += 2) {
27217 GemmMicrokernelTester()
27218 .mr(1)
27219 .nr(8)
27220 .kr(1)
27221 .sr(1)
27222 .m(1)
27223 .n(n)
27224 .k(k)
27225 .a_stride(7)
27226 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27227 }
27228 }
27229 }
27230
27231 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8_subtile) {
27232 TEST_REQUIRES_X86_SSE;
27233 for (uint32_t n = 16; n <= 24; n += 8) {
27234 for (size_t k = 1; k <= 5; k += 2) {
27235 for (uint32_t m = 1; m <= 1; m++) {
27236 GemmMicrokernelTester()
27237 .mr(1)
27238 .nr(8)
27239 .kr(1)
27240 .sr(1)
27241 .m(m)
27242 .n(n)
27243 .k(k)
27244 .iterations(1)
27245 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27246 }
27247 }
27248 }
27249 }
27250
27251 TEST(F32_GEMM_1X8__SSE_LOAD1, strided_cm_subtile) {
27252 TEST_REQUIRES_X86_SSE;
27253 for (size_t k = 1; k <= 5; k += 2) {
27254 for (uint32_t m = 1; m <= 1; m++) {
27255 for (uint32_t n = 1; n <= 8; n++) {
27256 GemmMicrokernelTester()
27257 .mr(1)
27258 .nr(8)
27259 .kr(1)
27260 .sr(1)
27261 .m(m)
27262 .n(n)
27263 .k(k)
27264 .cm_stride(11)
27265 .iterations(1)
27266 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27267 }
27268 }
27269 }
27270 }
27271
27272 TEST(F32_GEMM_1X8__SSE_LOAD1, qmin) {
27273 TEST_REQUIRES_X86_SSE;
27274 GemmMicrokernelTester()
27275 .mr(1)
27276 .nr(8)
27277 .kr(1)
27278 .sr(1)
27279 .m(1)
27280 .n(8)
27281 .k(1)
27282 .qmin(128)
27283 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27284 }
27285
27286 TEST(F32_GEMM_1X8__SSE_LOAD1, qmax) {
27287 TEST_REQUIRES_X86_SSE;
27288 GemmMicrokernelTester()
27289 .mr(1)
27290 .nr(8)
27291 .kr(1)
27292 .sr(1)
27293 .m(1)
27294 .n(8)
27295 .k(1)
27296 .qmax(128)
27297 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27298 }
27299
27300 TEST(F32_GEMM_1X8__SSE_LOAD1, strided_cm) {
27301 TEST_REQUIRES_X86_SSE;
27302 GemmMicrokernelTester()
27303 .mr(1)
27304 .nr(8)
27305 .kr(1)
27306 .sr(1)
27307 .m(1)
27308 .n(8)
27309 .k(1)
27310 .cm_stride(11)
27311 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
27312 }
27313#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27314
27315
27316#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27317 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1) {
27318 TEST_REQUIRES_X86_SSE;
27319 GemmMicrokernelTester()
27320 .mr(4)
27321 .nr(8)
27322 .kr(1)
27323 .sr(1)
27324 .m(4)
27325 .n(8)
27326 .k(1)
27327 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27328 }
27329
27330 TEST(F32_GEMM_4X8__SSE_LOAD1, strided_cn) {
27331 TEST_REQUIRES_X86_SSE;
27332 GemmMicrokernelTester()
27333 .mr(4)
27334 .nr(8)
27335 .kr(1)
27336 .sr(1)
27337 .m(4)
27338 .n(8)
27339 .k(1)
27340 .cn_stride(11)
27341 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27342 }
27343
27344 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_strided_a) {
27345 TEST_REQUIRES_X86_SSE;
27346 GemmMicrokernelTester()
27347 .mr(4)
27348 .nr(8)
27349 .kr(1)
27350 .sr(1)
27351 .m(4)
27352 .n(8)
27353 .k(1)
27354 .a_stride(3)
27355 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27356 }
27357
27358 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_subtile) {
27359 TEST_REQUIRES_X86_SSE;
27360 for (uint32_t m = 1; m <= 4; m++) {
27361 for (uint32_t n = 1; n <= 8; n++) {
27362 GemmMicrokernelTester()
27363 .mr(4)
27364 .nr(8)
27365 .kr(1)
27366 .sr(1)
27367 .m(m)
27368 .n(n)
27369 .k(1)
27370 .iterations(1)
27371 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27372 }
27373 }
27374 }
27375
27376 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
27377 TEST_REQUIRES_X86_SSE;
27378 for (uint32_t m = 1; m <= 4; m++) {
27379 GemmMicrokernelTester()
27380 .mr(4)
27381 .nr(8)
27382 .kr(1)
27383 .sr(1)
27384 .m(m)
27385 .n(8)
27386 .k(1)
27387 .iterations(1)
27388 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27389 }
27390 }
27391
27392 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
27393 TEST_REQUIRES_X86_SSE;
27394 for (uint32_t n = 1; n <= 8; n++) {
27395 GemmMicrokernelTester()
27396 .mr(4)
27397 .nr(8)
27398 .kr(1)
27399 .sr(1)
27400 .m(4)
27401 .n(n)
27402 .k(1)
27403 .iterations(1)
27404 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27405 }
27406 }
27407
27408 TEST(F32_GEMM_4X8__SSE_LOAD1, k_gt_1) {
27409 TEST_REQUIRES_X86_SSE;
27410 for (size_t k = 2; k < 10; k++) {
27411 GemmMicrokernelTester()
27412 .mr(4)
27413 .nr(8)
27414 .kr(1)
27415 .sr(1)
27416 .m(4)
27417 .n(8)
27418 .k(k)
27419 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27420 }
27421 }
27422
27423 TEST(F32_GEMM_4X8__SSE_LOAD1, k_gt_1_strided_a) {
27424 TEST_REQUIRES_X86_SSE;
27425 for (size_t k = 2; k < 10; k++) {
27426 GemmMicrokernelTester()
27427 .mr(4)
27428 .nr(8)
27429 .kr(1)
27430 .sr(1)
27431 .m(4)
27432 .n(8)
27433 .k(k)
27434 .a_stride(11)
27435 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27436 }
27437 }
27438
27439 TEST(F32_GEMM_4X8__SSE_LOAD1, k_gt_1_subtile) {
27440 TEST_REQUIRES_X86_SSE;
27441 for (size_t k = 2; k < 10; k++) {
27442 for (uint32_t m = 1; m <= 4; m++) {
27443 for (uint32_t n = 1; n <= 8; n++) {
27444 GemmMicrokernelTester()
27445 .mr(4)
27446 .nr(8)
27447 .kr(1)
27448 .sr(1)
27449 .m(m)
27450 .n(n)
27451 .k(k)
27452 .iterations(1)
27453 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27454 }
27455 }
27456 }
27457 }
27458
27459 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8) {
27460 TEST_REQUIRES_X86_SSE;
27461 for (uint32_t n = 9; n < 16; n++) {
27462 for (size_t k = 1; k <= 5; k += 2) {
27463 GemmMicrokernelTester()
27464 .mr(4)
27465 .nr(8)
27466 .kr(1)
27467 .sr(1)
27468 .m(4)
27469 .n(8)
27470 .k(k)
27471 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27472 }
27473 }
27474 }
27475
27476 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
27477 TEST_REQUIRES_X86_SSE;
27478 for (uint32_t n = 9; n < 16; n++) {
27479 for (size_t k = 1; k <= 5; k += 2) {
27480 GemmMicrokernelTester()
27481 .mr(4)
27482 .nr(8)
27483 .kr(1)
27484 .sr(1)
27485 .m(4)
27486 .n(8)
27487 .k(k)
27488 .cn_stride(11)
27489 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27490 }
27491 }
27492 }
27493
27494 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8_strided_a) {
27495 TEST_REQUIRES_X86_SSE;
27496 for (uint32_t n = 9; n < 16; n++) {
27497 for (size_t k = 1; k <= 5; k += 2) {
27498 GemmMicrokernelTester()
27499 .mr(4)
27500 .nr(8)
27501 .kr(1)
27502 .sr(1)
27503 .m(4)
27504 .n(n)
27505 .k(k)
27506 .a_stride(7)
27507 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27508 }
27509 }
27510 }
27511
27512 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8_subtile) {
27513 TEST_REQUIRES_X86_SSE;
27514 for (uint32_t n = 9; n < 16; n++) {
27515 for (size_t k = 1; k <= 5; k += 2) {
27516 for (uint32_t m = 1; m <= 4; m++) {
27517 GemmMicrokernelTester()
27518 .mr(4)
27519 .nr(8)
27520 .kr(1)
27521 .sr(1)
27522 .m(m)
27523 .n(n)
27524 .k(k)
27525 .iterations(1)
27526 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27527 }
27528 }
27529 }
27530 }
27531
27532 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8) {
27533 TEST_REQUIRES_X86_SSE;
27534 for (uint32_t n = 16; n <= 24; n += 8) {
27535 for (size_t k = 1; k <= 5; k += 2) {
27536 GemmMicrokernelTester()
27537 .mr(4)
27538 .nr(8)
27539 .kr(1)
27540 .sr(1)
27541 .m(4)
27542 .n(8)
27543 .k(k)
27544 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27545 }
27546 }
27547 }
27548
27549 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8_strided_cn) {
27550 TEST_REQUIRES_X86_SSE;
27551 for (uint32_t n = 16; n <= 24; n += 8) {
27552 for (size_t k = 1; k <= 5; k += 2) {
27553 GemmMicrokernelTester()
27554 .mr(4)
27555 .nr(8)
27556 .kr(1)
27557 .sr(1)
27558 .m(4)
27559 .n(n)
27560 .k(k)
27561 .cn_stride(11)
27562 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27563 }
27564 }
27565 }
27566
27567 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8_strided_a) {
27568 TEST_REQUIRES_X86_SSE;
27569 for (uint32_t n = 16; n <= 24; n += 8) {
27570 for (size_t k = 1; k <= 5; k += 2) {
27571 GemmMicrokernelTester()
27572 .mr(4)
27573 .nr(8)
27574 .kr(1)
27575 .sr(1)
27576 .m(4)
27577 .n(n)
27578 .k(k)
27579 .a_stride(7)
27580 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27581 }
27582 }
27583 }
27584
27585 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8_subtile) {
27586 TEST_REQUIRES_X86_SSE;
27587 for (uint32_t n = 16; n <= 24; n += 8) {
27588 for (size_t k = 1; k <= 5; k += 2) {
27589 for (uint32_t m = 1; m <= 4; m++) {
27590 GemmMicrokernelTester()
27591 .mr(4)
27592 .nr(8)
27593 .kr(1)
27594 .sr(1)
27595 .m(m)
27596 .n(n)
27597 .k(k)
27598 .iterations(1)
27599 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27600 }
27601 }
27602 }
27603 }
27604
27605 TEST(F32_GEMM_4X8__SSE_LOAD1, strided_cm_subtile) {
27606 TEST_REQUIRES_X86_SSE;
27607 for (size_t k = 1; k <= 5; k += 2) {
27608 for (uint32_t m = 1; m <= 4; m++) {
27609 for (uint32_t n = 1; n <= 8; n++) {
27610 GemmMicrokernelTester()
27611 .mr(4)
27612 .nr(8)
27613 .kr(1)
27614 .sr(1)
27615 .m(m)
27616 .n(n)
27617 .k(k)
27618 .cm_stride(11)
27619 .iterations(1)
27620 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27621 }
27622 }
27623 }
27624 }
27625
27626 TEST(F32_GEMM_4X8__SSE_LOAD1, qmin) {
27627 TEST_REQUIRES_X86_SSE;
27628 GemmMicrokernelTester()
27629 .mr(4)
27630 .nr(8)
27631 .kr(1)
27632 .sr(1)
27633 .m(4)
27634 .n(8)
27635 .k(1)
27636 .qmin(128)
27637 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27638 }
27639
27640 TEST(F32_GEMM_4X8__SSE_LOAD1, qmax) {
27641 TEST_REQUIRES_X86_SSE;
27642 GemmMicrokernelTester()
27643 .mr(4)
27644 .nr(8)
27645 .kr(1)
27646 .sr(1)
27647 .m(4)
27648 .n(8)
27649 .k(1)
27650 .qmax(128)
27651 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27652 }
27653
27654 TEST(F32_GEMM_4X8__SSE_LOAD1, strided_cm) {
27655 TEST_REQUIRES_X86_SSE;
27656 GemmMicrokernelTester()
27657 .mr(4)
27658 .nr(8)
27659 .kr(1)
27660 .sr(1)
27661 .m(4)
27662 .n(8)
27663 .k(1)
27664 .cm_stride(11)
27665 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
27666 }
27667#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27668
27669
27670#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27671 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4) {
27672 TEST_REQUIRES_X86_SSE;
27673 GemmMicrokernelTester()
27674 .mr(1)
27675 .nr(8)
27676 .kr(1)
27677 .sr(1)
27678 .m(1)
27679 .n(8)
27680 .k(4)
27681 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27682 }
27683
27684 TEST(F32_GEMM_1X8__SSE_DUP, strided_cn) {
27685 TEST_REQUIRES_X86_SSE;
27686 GemmMicrokernelTester()
27687 .mr(1)
27688 .nr(8)
27689 .kr(1)
27690 .sr(1)
27691 .m(1)
27692 .n(8)
27693 .k(4)
27694 .cn_stride(11)
27695 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27696 }
27697
27698 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_strided_a) {
27699 TEST_REQUIRES_X86_SSE;
27700 GemmMicrokernelTester()
27701 .mr(1)
27702 .nr(8)
27703 .kr(1)
27704 .sr(1)
27705 .m(1)
27706 .n(8)
27707 .k(4)
27708 .a_stride(7)
27709 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27710 }
27711
27712 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_subtile) {
27713 TEST_REQUIRES_X86_SSE;
27714 for (uint32_t m = 1; m <= 1; m++) {
27715 for (uint32_t n = 1; n <= 8; n++) {
27716 GemmMicrokernelTester()
27717 .mr(1)
27718 .nr(8)
27719 .kr(1)
27720 .sr(1)
27721 .m(m)
27722 .n(n)
27723 .k(4)
27724 .iterations(1)
27725 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27726 }
27727 }
27728 }
27729
27730 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_subtile_m) {
27731 TEST_REQUIRES_X86_SSE;
27732 for (uint32_t m = 1; m <= 1; m++) {
27733 GemmMicrokernelTester()
27734 .mr(1)
27735 .nr(8)
27736 .kr(1)
27737 .sr(1)
27738 .m(m)
27739 .n(8)
27740 .k(4)
27741 .iterations(1)
27742 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27743 }
27744 }
27745
27746 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_subtile_n) {
27747 TEST_REQUIRES_X86_SSE;
27748 for (uint32_t n = 1; n <= 8; n++) {
27749 GemmMicrokernelTester()
27750 .mr(1)
27751 .nr(8)
27752 .kr(1)
27753 .sr(1)
27754 .m(1)
27755 .n(n)
27756 .k(4)
27757 .iterations(1)
27758 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27759 }
27760 }
27761
27762 TEST(F32_GEMM_1X8__SSE_DUP, k_lt_4) {
27763 TEST_REQUIRES_X86_SSE;
27764 for (size_t k = 1; k < 4; k++) {
27765 GemmMicrokernelTester()
27766 .mr(1)
27767 .nr(8)
27768 .kr(1)
27769 .sr(1)
27770 .m(1)
27771 .n(8)
27772 .k(k)
27773 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27774 }
27775 }
27776
27777 TEST(F32_GEMM_1X8__SSE_DUP, k_lt_4_strided_a) {
27778 TEST_REQUIRES_X86_SSE;
27779 for (size_t k = 1; k < 4; k++) {
27780 GemmMicrokernelTester()
27781 .mr(1)
27782 .nr(8)
27783 .kr(1)
27784 .sr(1)
27785 .m(1)
27786 .n(8)
27787 .k(k)
27788 .a_stride(7)
27789 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27790 }
27791 }
27792
27793 TEST(F32_GEMM_1X8__SSE_DUP, k_lt_4_subtile) {
27794 TEST_REQUIRES_X86_SSE;
27795 for (size_t k = 1; k < 4; k++) {
27796 for (uint32_t m = 1; m <= 1; m++) {
27797 for (uint32_t n = 1; n <= 8; n++) {
27798 GemmMicrokernelTester()
27799 .mr(1)
27800 .nr(8)
27801 .kr(1)
27802 .sr(1)
27803 .m(m)
27804 .n(n)
27805 .k(k)
27806 .iterations(1)
27807 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27808 }
27809 }
27810 }
27811 }
27812
27813 TEST(F32_GEMM_1X8__SSE_DUP, k_gt_4) {
27814 TEST_REQUIRES_X86_SSE;
27815 for (size_t k = 5; k < 8; k++) {
27816 GemmMicrokernelTester()
27817 .mr(1)
27818 .nr(8)
27819 .kr(1)
27820 .sr(1)
27821 .m(1)
27822 .n(8)
27823 .k(k)
27824 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27825 }
27826 }
27827
27828 TEST(F32_GEMM_1X8__SSE_DUP, k_gt_4_strided_a) {
27829 TEST_REQUIRES_X86_SSE;
27830 for (size_t k = 5; k < 8; k++) {
27831 GemmMicrokernelTester()
27832 .mr(1)
27833 .nr(8)
27834 .kr(1)
27835 .sr(1)
27836 .m(1)
27837 .n(8)
27838 .k(k)
27839 .a_stride(11)
27840 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27841 }
27842 }
27843
27844 TEST(F32_GEMM_1X8__SSE_DUP, k_gt_4_subtile) {
27845 TEST_REQUIRES_X86_SSE;
27846 for (size_t k = 5; k < 8; k++) {
27847 for (uint32_t m = 1; m <= 1; m++) {
27848 for (uint32_t n = 1; n <= 8; n++) {
27849 GemmMicrokernelTester()
27850 .mr(1)
27851 .nr(8)
27852 .kr(1)
27853 .sr(1)
27854 .m(m)
27855 .n(n)
27856 .k(k)
27857 .iterations(1)
27858 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27859 }
27860 }
27861 }
27862 }
27863
27864 TEST(F32_GEMM_1X8__SSE_DUP, k_div_4) {
27865 TEST_REQUIRES_X86_SSE;
27866 for (size_t k = 8; k <= 40; k += 4) {
27867 GemmMicrokernelTester()
27868 .mr(1)
27869 .nr(8)
27870 .kr(1)
27871 .sr(1)
27872 .m(1)
27873 .n(8)
27874 .k(k)
27875 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27876 }
27877 }
27878
27879 TEST(F32_GEMM_1X8__SSE_DUP, k_div_4_strided_a) {
27880 TEST_REQUIRES_X86_SSE;
27881 for (size_t k = 8; k <= 40; k += 4) {
27882 GemmMicrokernelTester()
27883 .mr(1)
27884 .nr(8)
27885 .kr(1)
27886 .sr(1)
27887 .m(1)
27888 .n(8)
27889 .k(k)
27890 .a_stride(43)
27891 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27892 }
27893 }
27894
27895 TEST(F32_GEMM_1X8__SSE_DUP, k_div_4_subtile) {
27896 TEST_REQUIRES_X86_SSE;
27897 for (size_t k = 8; k <= 40; k += 4) {
27898 for (uint32_t m = 1; m <= 1; m++) {
27899 for (uint32_t n = 1; n <= 8; n++) {
27900 GemmMicrokernelTester()
27901 .mr(1)
27902 .nr(8)
27903 .kr(1)
27904 .sr(1)
27905 .m(m)
27906 .n(n)
27907 .k(k)
27908 .iterations(1)
27909 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27910 }
27911 }
27912 }
27913 }
27914
27915 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8) {
27916 TEST_REQUIRES_X86_SSE;
27917 for (uint32_t n = 9; n < 16; n++) {
27918 for (size_t k = 1; k <= 20; k += 5) {
27919 GemmMicrokernelTester()
27920 .mr(1)
27921 .nr(8)
27922 .kr(1)
27923 .sr(1)
27924 .m(1)
27925 .n(8)
27926 .k(k)
27927 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27928 }
27929 }
27930 }
27931
27932 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8_strided_cn) {
27933 TEST_REQUIRES_X86_SSE;
27934 for (uint32_t n = 9; n < 16; n++) {
27935 for (size_t k = 1; k <= 20; k += 5) {
27936 GemmMicrokernelTester()
27937 .mr(1)
27938 .nr(8)
27939 .kr(1)
27940 .sr(1)
27941 .m(1)
27942 .n(8)
27943 .k(k)
27944 .cn_stride(11)
27945 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27946 }
27947 }
27948 }
27949
27950 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8_strided_a) {
27951 TEST_REQUIRES_X86_SSE;
27952 for (uint32_t n = 9; n < 16; n++) {
27953 for (size_t k = 1; k <= 20; k += 5) {
27954 GemmMicrokernelTester()
27955 .mr(1)
27956 .nr(8)
27957 .kr(1)
27958 .sr(1)
27959 .m(1)
27960 .n(n)
27961 .k(k)
27962 .a_stride(23)
27963 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27964 }
27965 }
27966 }
27967
27968 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8_subtile) {
27969 TEST_REQUIRES_X86_SSE;
27970 for (uint32_t n = 9; n < 16; n++) {
27971 for (size_t k = 1; k <= 20; k += 5) {
27972 for (uint32_t m = 1; m <= 1; m++) {
27973 GemmMicrokernelTester()
27974 .mr(1)
27975 .nr(8)
27976 .kr(1)
27977 .sr(1)
27978 .m(m)
27979 .n(n)
27980 .k(k)
27981 .iterations(1)
27982 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
27983 }
27984 }
27985 }
27986 }
27987
27988 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8) {
27989 TEST_REQUIRES_X86_SSE;
27990 for (uint32_t n = 16; n <= 24; n += 8) {
27991 for (size_t k = 1; k <= 20; k += 5) {
27992 GemmMicrokernelTester()
27993 .mr(1)
27994 .nr(8)
27995 .kr(1)
27996 .sr(1)
27997 .m(1)
27998 .n(8)
27999 .k(k)
28000 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28001 }
28002 }
28003 }
28004
28005 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8_strided_cn) {
28006 TEST_REQUIRES_X86_SSE;
28007 for (uint32_t n = 16; n <= 24; n += 8) {
28008 for (size_t k = 1; k <= 20; k += 5) {
28009 GemmMicrokernelTester()
28010 .mr(1)
28011 .nr(8)
28012 .kr(1)
28013 .sr(1)
28014 .m(1)
28015 .n(n)
28016 .k(k)
28017 .cn_stride(11)
28018 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28019 }
28020 }
28021 }
28022
28023 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8_strided_a) {
28024 TEST_REQUIRES_X86_SSE;
28025 for (uint32_t n = 16; n <= 24; n += 8) {
28026 for (size_t k = 1; k <= 20; k += 5) {
28027 GemmMicrokernelTester()
28028 .mr(1)
28029 .nr(8)
28030 .kr(1)
28031 .sr(1)
28032 .m(1)
28033 .n(n)
28034 .k(k)
28035 .a_stride(23)
28036 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28037 }
28038 }
28039 }
28040
28041 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8_subtile) {
28042 TEST_REQUIRES_X86_SSE;
28043 for (uint32_t n = 16; n <= 24; n += 8) {
28044 for (size_t k = 1; k <= 20; k += 5) {
28045 for (uint32_t m = 1; m <= 1; m++) {
28046 GemmMicrokernelTester()
28047 .mr(1)
28048 .nr(8)
28049 .kr(1)
28050 .sr(1)
28051 .m(m)
28052 .n(n)
28053 .k(k)
28054 .iterations(1)
28055 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28056 }
28057 }
28058 }
28059 }
28060
28061 TEST(F32_GEMM_1X8__SSE_DUP, strided_cm_subtile) {
28062 TEST_REQUIRES_X86_SSE;
28063 for (size_t k = 1; k <= 20; k += 5) {
28064 for (uint32_t m = 1; m <= 1; m++) {
28065 for (uint32_t n = 1; n <= 8; n++) {
28066 GemmMicrokernelTester()
28067 .mr(1)
28068 .nr(8)
28069 .kr(1)
28070 .sr(1)
28071 .m(m)
28072 .n(n)
28073 .k(k)
28074 .cm_stride(11)
28075 .iterations(1)
28076 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28077 }
28078 }
28079 }
28080 }
28081
28082 TEST(F32_GEMM_1X8__SSE_DUP, qmin) {
28083 TEST_REQUIRES_X86_SSE;
28084 GemmMicrokernelTester()
28085 .mr(1)
28086 .nr(8)
28087 .kr(1)
28088 .sr(1)
28089 .m(1)
28090 .n(8)
28091 .k(4)
28092 .qmin(128)
28093 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28094 }
28095
28096 TEST(F32_GEMM_1X8__SSE_DUP, qmax) {
28097 TEST_REQUIRES_X86_SSE;
28098 GemmMicrokernelTester()
28099 .mr(1)
28100 .nr(8)
28101 .kr(1)
28102 .sr(1)
28103 .m(1)
28104 .n(8)
28105 .k(4)
28106 .qmax(128)
28107 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28108 }
28109
28110 TEST(F32_GEMM_1X8__SSE_DUP, strided_cm) {
28111 TEST_REQUIRES_X86_SSE;
28112 GemmMicrokernelTester()
28113 .mr(1)
28114 .nr(8)
28115 .kr(1)
28116 .sr(1)
28117 .m(1)
28118 .n(8)
28119 .k(4)
28120 .cm_stride(11)
28121 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
28122 }
28123#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28124
28125
28126#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28127 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4) {
28128 TEST_REQUIRES_X86_SSE;
28129 GemmMicrokernelTester()
28130 .mr(4)
28131 .nr(8)
28132 .kr(1)
28133 .sr(1)
28134 .m(4)
28135 .n(8)
28136 .k(4)
28137 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28138 }
28139
28140 TEST(F32_GEMM_4X8__SSE_DUP, strided_cn) {
28141 TEST_REQUIRES_X86_SSE;
28142 GemmMicrokernelTester()
28143 .mr(4)
28144 .nr(8)
28145 .kr(1)
28146 .sr(1)
28147 .m(4)
28148 .n(8)
28149 .k(4)
28150 .cn_stride(11)
28151 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28152 }
28153
28154 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_strided_a) {
28155 TEST_REQUIRES_X86_SSE;
28156 GemmMicrokernelTester()
28157 .mr(4)
28158 .nr(8)
28159 .kr(1)
28160 .sr(1)
28161 .m(4)
28162 .n(8)
28163 .k(4)
28164 .a_stride(7)
28165 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28166 }
28167
28168 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_subtile) {
28169 TEST_REQUIRES_X86_SSE;
28170 for (uint32_t m = 1; m <= 4; m++) {
28171 for (uint32_t n = 1; n <= 8; n++) {
28172 GemmMicrokernelTester()
28173 .mr(4)
28174 .nr(8)
28175 .kr(1)
28176 .sr(1)
28177 .m(m)
28178 .n(n)
28179 .k(4)
28180 .iterations(1)
28181 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28182 }
28183 }
28184 }
28185
28186 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_subtile_m) {
28187 TEST_REQUIRES_X86_SSE;
28188 for (uint32_t m = 1; m <= 4; m++) {
28189 GemmMicrokernelTester()
28190 .mr(4)
28191 .nr(8)
28192 .kr(1)
28193 .sr(1)
28194 .m(m)
28195 .n(8)
28196 .k(4)
28197 .iterations(1)
28198 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28199 }
28200 }
28201
28202 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_subtile_n) {
28203 TEST_REQUIRES_X86_SSE;
28204 for (uint32_t n = 1; n <= 8; n++) {
28205 GemmMicrokernelTester()
28206 .mr(4)
28207 .nr(8)
28208 .kr(1)
28209 .sr(1)
28210 .m(4)
28211 .n(n)
28212 .k(4)
28213 .iterations(1)
28214 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28215 }
28216 }
28217
28218 TEST(F32_GEMM_4X8__SSE_DUP, k_lt_4) {
28219 TEST_REQUIRES_X86_SSE;
28220 for (size_t k = 1; k < 4; k++) {
28221 GemmMicrokernelTester()
28222 .mr(4)
28223 .nr(8)
28224 .kr(1)
28225 .sr(1)
28226 .m(4)
28227 .n(8)
28228 .k(k)
28229 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28230 }
28231 }
28232
28233 TEST(F32_GEMM_4X8__SSE_DUP, k_lt_4_strided_a) {
28234 TEST_REQUIRES_X86_SSE;
28235 for (size_t k = 1; k < 4; k++) {
28236 GemmMicrokernelTester()
28237 .mr(4)
28238 .nr(8)
28239 .kr(1)
28240 .sr(1)
28241 .m(4)
28242 .n(8)
28243 .k(k)
28244 .a_stride(7)
28245 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28246 }
28247 }
28248
28249 TEST(F32_GEMM_4X8__SSE_DUP, k_lt_4_subtile) {
28250 TEST_REQUIRES_X86_SSE;
28251 for (size_t k = 1; k < 4; k++) {
28252 for (uint32_t m = 1; m <= 4; m++) {
28253 for (uint32_t n = 1; n <= 8; n++) {
28254 GemmMicrokernelTester()
28255 .mr(4)
28256 .nr(8)
28257 .kr(1)
28258 .sr(1)
28259 .m(m)
28260 .n(n)
28261 .k(k)
28262 .iterations(1)
28263 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28264 }
28265 }
28266 }
28267 }
28268
28269 TEST(F32_GEMM_4X8__SSE_DUP, k_gt_4) {
28270 TEST_REQUIRES_X86_SSE;
28271 for (size_t k = 5; k < 8; k++) {
28272 GemmMicrokernelTester()
28273 .mr(4)
28274 .nr(8)
28275 .kr(1)
28276 .sr(1)
28277 .m(4)
28278 .n(8)
28279 .k(k)
28280 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28281 }
28282 }
28283
28284 TEST(F32_GEMM_4X8__SSE_DUP, k_gt_4_strided_a) {
28285 TEST_REQUIRES_X86_SSE;
28286 for (size_t k = 5; k < 8; k++) {
28287 GemmMicrokernelTester()
28288 .mr(4)
28289 .nr(8)
28290 .kr(1)
28291 .sr(1)
28292 .m(4)
28293 .n(8)
28294 .k(k)
28295 .a_stride(11)
28296 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28297 }
28298 }
28299
28300 TEST(F32_GEMM_4X8__SSE_DUP, k_gt_4_subtile) {
28301 TEST_REQUIRES_X86_SSE;
28302 for (size_t k = 5; k < 8; k++) {
28303 for (uint32_t m = 1; m <= 4; m++) {
28304 for (uint32_t n = 1; n <= 8; n++) {
28305 GemmMicrokernelTester()
28306 .mr(4)
28307 .nr(8)
28308 .kr(1)
28309 .sr(1)
28310 .m(m)
28311 .n(n)
28312 .k(k)
28313 .iterations(1)
28314 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28315 }
28316 }
28317 }
28318 }
28319
28320 TEST(F32_GEMM_4X8__SSE_DUP, k_div_4) {
28321 TEST_REQUIRES_X86_SSE;
28322 for (size_t k = 8; k <= 40; k += 4) {
28323 GemmMicrokernelTester()
28324 .mr(4)
28325 .nr(8)
28326 .kr(1)
28327 .sr(1)
28328 .m(4)
28329 .n(8)
28330 .k(k)
28331 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28332 }
28333 }
28334
28335 TEST(F32_GEMM_4X8__SSE_DUP, k_div_4_strided_a) {
28336 TEST_REQUIRES_X86_SSE;
28337 for (size_t k = 8; k <= 40; k += 4) {
28338 GemmMicrokernelTester()
28339 .mr(4)
28340 .nr(8)
28341 .kr(1)
28342 .sr(1)
28343 .m(4)
28344 .n(8)
28345 .k(k)
28346 .a_stride(43)
28347 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28348 }
28349 }
28350
28351 TEST(F32_GEMM_4X8__SSE_DUP, k_div_4_subtile) {
28352 TEST_REQUIRES_X86_SSE;
28353 for (size_t k = 8; k <= 40; k += 4) {
28354 for (uint32_t m = 1; m <= 4; m++) {
28355 for (uint32_t n = 1; n <= 8; n++) {
28356 GemmMicrokernelTester()
28357 .mr(4)
28358 .nr(8)
28359 .kr(1)
28360 .sr(1)
28361 .m(m)
28362 .n(n)
28363 .k(k)
28364 .iterations(1)
28365 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28366 }
28367 }
28368 }
28369 }
28370
28371 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8) {
28372 TEST_REQUIRES_X86_SSE;
28373 for (uint32_t n = 9; n < 16; n++) {
28374 for (size_t k = 1; k <= 20; k += 5) {
28375 GemmMicrokernelTester()
28376 .mr(4)
28377 .nr(8)
28378 .kr(1)
28379 .sr(1)
28380 .m(4)
28381 .n(8)
28382 .k(k)
28383 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28384 }
28385 }
28386 }
28387
28388 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8_strided_cn) {
28389 TEST_REQUIRES_X86_SSE;
28390 for (uint32_t n = 9; n < 16; n++) {
28391 for (size_t k = 1; k <= 20; k += 5) {
28392 GemmMicrokernelTester()
28393 .mr(4)
28394 .nr(8)
28395 .kr(1)
28396 .sr(1)
28397 .m(4)
28398 .n(8)
28399 .k(k)
28400 .cn_stride(11)
28401 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28402 }
28403 }
28404 }
28405
28406 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8_strided_a) {
28407 TEST_REQUIRES_X86_SSE;
28408 for (uint32_t n = 9; n < 16; n++) {
28409 for (size_t k = 1; k <= 20; k += 5) {
28410 GemmMicrokernelTester()
28411 .mr(4)
28412 .nr(8)
28413 .kr(1)
28414 .sr(1)
28415 .m(4)
28416 .n(n)
28417 .k(k)
28418 .a_stride(23)
28419 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28420 }
28421 }
28422 }
28423
28424 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8_subtile) {
28425 TEST_REQUIRES_X86_SSE;
28426 for (uint32_t n = 9; n < 16; n++) {
28427 for (size_t k = 1; k <= 20; k += 5) {
28428 for (uint32_t m = 1; m <= 4; m++) {
28429 GemmMicrokernelTester()
28430 .mr(4)
28431 .nr(8)
28432 .kr(1)
28433 .sr(1)
28434 .m(m)
28435 .n(n)
28436 .k(k)
28437 .iterations(1)
28438 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28439 }
28440 }
28441 }
28442 }
28443
28444 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8) {
28445 TEST_REQUIRES_X86_SSE;
28446 for (uint32_t n = 16; n <= 24; n += 8) {
28447 for (size_t k = 1; k <= 20; k += 5) {
28448 GemmMicrokernelTester()
28449 .mr(4)
28450 .nr(8)
28451 .kr(1)
28452 .sr(1)
28453 .m(4)
28454 .n(8)
28455 .k(k)
28456 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28457 }
28458 }
28459 }
28460
28461 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8_strided_cn) {
28462 TEST_REQUIRES_X86_SSE;
28463 for (uint32_t n = 16; n <= 24; n += 8) {
28464 for (size_t k = 1; k <= 20; k += 5) {
28465 GemmMicrokernelTester()
28466 .mr(4)
28467 .nr(8)
28468 .kr(1)
28469 .sr(1)
28470 .m(4)
28471 .n(n)
28472 .k(k)
28473 .cn_stride(11)
28474 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28475 }
28476 }
28477 }
28478
28479 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8_strided_a) {
28480 TEST_REQUIRES_X86_SSE;
28481 for (uint32_t n = 16; n <= 24; n += 8) {
28482 for (size_t k = 1; k <= 20; k += 5) {
28483 GemmMicrokernelTester()
28484 .mr(4)
28485 .nr(8)
28486 .kr(1)
28487 .sr(1)
28488 .m(4)
28489 .n(n)
28490 .k(k)
28491 .a_stride(23)
28492 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28493 }
28494 }
28495 }
28496
28497 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8_subtile) {
28498 TEST_REQUIRES_X86_SSE;
28499 for (uint32_t n = 16; n <= 24; n += 8) {
28500 for (size_t k = 1; k <= 20; k += 5) {
28501 for (uint32_t m = 1; m <= 4; m++) {
28502 GemmMicrokernelTester()
28503 .mr(4)
28504 .nr(8)
28505 .kr(1)
28506 .sr(1)
28507 .m(m)
28508 .n(n)
28509 .k(k)
28510 .iterations(1)
28511 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28512 }
28513 }
28514 }
28515 }
28516
28517 TEST(F32_GEMM_4X8__SSE_DUP, strided_cm_subtile) {
28518 TEST_REQUIRES_X86_SSE;
28519 for (size_t k = 1; k <= 20; k += 5) {
28520 for (uint32_t m = 1; m <= 4; m++) {
28521 for (uint32_t n = 1; n <= 8; n++) {
28522 GemmMicrokernelTester()
28523 .mr(4)
28524 .nr(8)
28525 .kr(1)
28526 .sr(1)
28527 .m(m)
28528 .n(n)
28529 .k(k)
28530 .cm_stride(11)
28531 .iterations(1)
28532 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28533 }
28534 }
28535 }
28536 }
28537
28538 TEST(F32_GEMM_4X8__SSE_DUP, qmin) {
28539 TEST_REQUIRES_X86_SSE;
28540 GemmMicrokernelTester()
28541 .mr(4)
28542 .nr(8)
28543 .kr(1)
28544 .sr(1)
28545 .m(4)
28546 .n(8)
28547 .k(4)
28548 .qmin(128)
28549 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28550 }
28551
28552 TEST(F32_GEMM_4X8__SSE_DUP, qmax) {
28553 TEST_REQUIRES_X86_SSE;
28554 GemmMicrokernelTester()
28555 .mr(4)
28556 .nr(8)
28557 .kr(1)
28558 .sr(1)
28559 .m(4)
28560 .n(8)
28561 .k(4)
28562 .qmax(128)
28563 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28564 }
28565
28566 TEST(F32_GEMM_4X8__SSE_DUP, strided_cm) {
28567 TEST_REQUIRES_X86_SSE;
28568 GemmMicrokernelTester()
28569 .mr(4)
28570 .nr(8)
28571 .kr(1)
28572 .sr(1)
28573 .m(4)
28574 .n(8)
28575 .k(4)
28576 .cm_stride(11)
28577 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
28578 }
28579#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28580
28581
28582#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28583 TEST(F32_GEMM_1X8S4__SSE, k_eq_4) {
28584 TEST_REQUIRES_X86_SSE;
28585 GemmMicrokernelTester()
28586 .mr(1)
28587 .nr(8)
28588 .kr(1)
28589 .sr(4)
28590 .m(1)
28591 .n(8)
28592 .k(4)
28593 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28594 }
28595
28596 TEST(F32_GEMM_1X8S4__SSE, strided_cn) {
28597 TEST_REQUIRES_X86_SSE;
28598 GemmMicrokernelTester()
28599 .mr(1)
28600 .nr(8)
28601 .kr(1)
28602 .sr(4)
28603 .m(1)
28604 .n(8)
28605 .k(4)
28606 .cn_stride(11)
28607 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28608 }
28609
28610 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_strided_a) {
28611 TEST_REQUIRES_X86_SSE;
28612 GemmMicrokernelTester()
28613 .mr(1)
28614 .nr(8)
28615 .kr(1)
28616 .sr(4)
28617 .m(1)
28618 .n(8)
28619 .k(4)
28620 .a_stride(7)
28621 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28622 }
28623
28624 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_subtile) {
28625 TEST_REQUIRES_X86_SSE;
28626 for (uint32_t m = 1; m <= 1; m++) {
28627 for (uint32_t n = 1; n <= 8; n++) {
28628 GemmMicrokernelTester()
28629 .mr(1)
28630 .nr(8)
28631 .kr(1)
28632 .sr(4)
28633 .m(m)
28634 .n(n)
28635 .k(4)
28636 .iterations(1)
28637 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28638 }
28639 }
28640 }
28641
28642 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_subtile_m) {
28643 TEST_REQUIRES_X86_SSE;
28644 for (uint32_t m = 1; m <= 1; m++) {
28645 GemmMicrokernelTester()
28646 .mr(1)
28647 .nr(8)
28648 .kr(1)
28649 .sr(4)
28650 .m(m)
28651 .n(8)
28652 .k(4)
28653 .iterations(1)
28654 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28655 }
28656 }
28657
28658 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_subtile_n) {
28659 TEST_REQUIRES_X86_SSE;
28660 for (uint32_t n = 1; n <= 8; n++) {
28661 GemmMicrokernelTester()
28662 .mr(1)
28663 .nr(8)
28664 .kr(1)
28665 .sr(4)
28666 .m(1)
28667 .n(n)
28668 .k(4)
28669 .iterations(1)
28670 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28671 }
28672 }
28673
28674 TEST(F32_GEMM_1X8S4__SSE, k_lt_4) {
28675 TEST_REQUIRES_X86_SSE;
28676 for (size_t k = 1; k < 4; k++) {
28677 GemmMicrokernelTester()
28678 .mr(1)
28679 .nr(8)
28680 .kr(1)
28681 .sr(4)
28682 .m(1)
28683 .n(8)
28684 .k(k)
28685 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28686 }
28687 }
28688
28689 TEST(F32_GEMM_1X8S4__SSE, k_lt_4_strided_a) {
28690 TEST_REQUIRES_X86_SSE;
28691 for (size_t k = 1; k < 4; k++) {
28692 GemmMicrokernelTester()
28693 .mr(1)
28694 .nr(8)
28695 .kr(1)
28696 .sr(4)
28697 .m(1)
28698 .n(8)
28699 .k(k)
28700 .a_stride(7)
28701 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28702 }
28703 }
28704
28705 TEST(F32_GEMM_1X8S4__SSE, k_lt_4_subtile) {
28706 TEST_REQUIRES_X86_SSE;
28707 for (size_t k = 1; k < 4; k++) {
28708 for (uint32_t m = 1; m <= 1; m++) {
28709 for (uint32_t n = 1; n <= 8; n++) {
28710 GemmMicrokernelTester()
28711 .mr(1)
28712 .nr(8)
28713 .kr(1)
28714 .sr(4)
28715 .m(m)
28716 .n(n)
28717 .k(k)
28718 .iterations(1)
28719 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28720 }
28721 }
28722 }
28723 }
28724
28725 TEST(F32_GEMM_1X8S4__SSE, k_gt_4) {
28726 TEST_REQUIRES_X86_SSE;
28727 for (size_t k = 5; k < 8; k++) {
28728 GemmMicrokernelTester()
28729 .mr(1)
28730 .nr(8)
28731 .kr(1)
28732 .sr(4)
28733 .m(1)
28734 .n(8)
28735 .k(k)
28736 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28737 }
28738 }
28739
28740 TEST(F32_GEMM_1X8S4__SSE, k_gt_4_strided_a) {
28741 TEST_REQUIRES_X86_SSE;
28742 for (size_t k = 5; k < 8; k++) {
28743 GemmMicrokernelTester()
28744 .mr(1)
28745 .nr(8)
28746 .kr(1)
28747 .sr(4)
28748 .m(1)
28749 .n(8)
28750 .k(k)
28751 .a_stride(11)
28752 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28753 }
28754 }
28755
28756 TEST(F32_GEMM_1X8S4__SSE, k_gt_4_subtile) {
28757 TEST_REQUIRES_X86_SSE;
28758 for (size_t k = 5; k < 8; k++) {
28759 for (uint32_t m = 1; m <= 1; m++) {
28760 for (uint32_t n = 1; n <= 8; n++) {
28761 GemmMicrokernelTester()
28762 .mr(1)
28763 .nr(8)
28764 .kr(1)
28765 .sr(4)
28766 .m(m)
28767 .n(n)
28768 .k(k)
28769 .iterations(1)
28770 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28771 }
28772 }
28773 }
28774 }
28775
28776 TEST(F32_GEMM_1X8S4__SSE, k_div_4) {
28777 TEST_REQUIRES_X86_SSE;
28778 for (size_t k = 8; k <= 40; k += 4) {
28779 GemmMicrokernelTester()
28780 .mr(1)
28781 .nr(8)
28782 .kr(1)
28783 .sr(4)
28784 .m(1)
28785 .n(8)
28786 .k(k)
28787 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28788 }
28789 }
28790
28791 TEST(F32_GEMM_1X8S4__SSE, k_div_4_strided_a) {
28792 TEST_REQUIRES_X86_SSE;
28793 for (size_t k = 8; k <= 40; k += 4) {
28794 GemmMicrokernelTester()
28795 .mr(1)
28796 .nr(8)
28797 .kr(1)
28798 .sr(4)
28799 .m(1)
28800 .n(8)
28801 .k(k)
28802 .a_stride(43)
28803 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28804 }
28805 }
28806
28807 TEST(F32_GEMM_1X8S4__SSE, k_div_4_subtile) {
28808 TEST_REQUIRES_X86_SSE;
28809 for (size_t k = 8; k <= 40; k += 4) {
28810 for (uint32_t m = 1; m <= 1; m++) {
28811 for (uint32_t n = 1; n <= 8; n++) {
28812 GemmMicrokernelTester()
28813 .mr(1)
28814 .nr(8)
28815 .kr(1)
28816 .sr(4)
28817 .m(m)
28818 .n(n)
28819 .k(k)
28820 .iterations(1)
28821 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28822 }
28823 }
28824 }
28825 }
28826
28827 TEST(F32_GEMM_1X8S4__SSE, n_gt_8) {
28828 TEST_REQUIRES_X86_SSE;
28829 for (uint32_t n = 9; n < 16; n++) {
28830 for (size_t k = 1; k <= 20; k += 5) {
28831 GemmMicrokernelTester()
28832 .mr(1)
28833 .nr(8)
28834 .kr(1)
28835 .sr(4)
28836 .m(1)
28837 .n(8)
28838 .k(k)
28839 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28840 }
28841 }
28842 }
28843
28844 TEST(F32_GEMM_1X8S4__SSE, n_gt_8_strided_cn) {
28845 TEST_REQUIRES_X86_SSE;
28846 for (uint32_t n = 9; n < 16; n++) {
28847 for (size_t k = 1; k <= 20; k += 5) {
28848 GemmMicrokernelTester()
28849 .mr(1)
28850 .nr(8)
28851 .kr(1)
28852 .sr(4)
28853 .m(1)
28854 .n(8)
28855 .k(k)
28856 .cn_stride(11)
28857 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28858 }
28859 }
28860 }
28861
28862 TEST(F32_GEMM_1X8S4__SSE, n_gt_8_strided_a) {
28863 TEST_REQUIRES_X86_SSE;
28864 for (uint32_t n = 9; n < 16; n++) {
28865 for (size_t k = 1; k <= 20; k += 5) {
28866 GemmMicrokernelTester()
28867 .mr(1)
28868 .nr(8)
28869 .kr(1)
28870 .sr(4)
28871 .m(1)
28872 .n(n)
28873 .k(k)
28874 .a_stride(23)
28875 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28876 }
28877 }
28878 }
28879
28880 TEST(F32_GEMM_1X8S4__SSE, n_gt_8_subtile) {
28881 TEST_REQUIRES_X86_SSE;
28882 for (uint32_t n = 9; n < 16; n++) {
28883 for (size_t k = 1; k <= 20; k += 5) {
28884 for (uint32_t m = 1; m <= 1; m++) {
28885 GemmMicrokernelTester()
28886 .mr(1)
28887 .nr(8)
28888 .kr(1)
28889 .sr(4)
28890 .m(m)
28891 .n(n)
28892 .k(k)
28893 .iterations(1)
28894 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28895 }
28896 }
28897 }
28898 }
28899
28900 TEST(F32_GEMM_1X8S4__SSE, n_div_8) {
28901 TEST_REQUIRES_X86_SSE;
28902 for (uint32_t n = 16; n <= 24; n += 8) {
28903 for (size_t k = 1; k <= 20; k += 5) {
28904 GemmMicrokernelTester()
28905 .mr(1)
28906 .nr(8)
28907 .kr(1)
28908 .sr(4)
28909 .m(1)
28910 .n(8)
28911 .k(k)
28912 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28913 }
28914 }
28915 }
28916
28917 TEST(F32_GEMM_1X8S4__SSE, n_div_8_strided_cn) {
28918 TEST_REQUIRES_X86_SSE;
28919 for (uint32_t n = 16; n <= 24; n += 8) {
28920 for (size_t k = 1; k <= 20; k += 5) {
28921 GemmMicrokernelTester()
28922 .mr(1)
28923 .nr(8)
28924 .kr(1)
28925 .sr(4)
28926 .m(1)
28927 .n(n)
28928 .k(k)
28929 .cn_stride(11)
28930 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28931 }
28932 }
28933 }
28934
28935 TEST(F32_GEMM_1X8S4__SSE, n_div_8_strided_a) {
28936 TEST_REQUIRES_X86_SSE;
28937 for (uint32_t n = 16; n <= 24; n += 8) {
28938 for (size_t k = 1; k <= 20; k += 5) {
28939 GemmMicrokernelTester()
28940 .mr(1)
28941 .nr(8)
28942 .kr(1)
28943 .sr(4)
28944 .m(1)
28945 .n(n)
28946 .k(k)
28947 .a_stride(23)
28948 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28949 }
28950 }
28951 }
28952
28953 TEST(F32_GEMM_1X8S4__SSE, n_div_8_subtile) {
28954 TEST_REQUIRES_X86_SSE;
28955 for (uint32_t n = 16; n <= 24; n += 8) {
28956 for (size_t k = 1; k <= 20; k += 5) {
28957 for (uint32_t m = 1; m <= 1; m++) {
28958 GemmMicrokernelTester()
28959 .mr(1)
28960 .nr(8)
28961 .kr(1)
28962 .sr(4)
28963 .m(m)
28964 .n(n)
28965 .k(k)
28966 .iterations(1)
28967 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28968 }
28969 }
28970 }
28971 }
28972
28973 TEST(F32_GEMM_1X8S4__SSE, strided_cm_subtile) {
28974 TEST_REQUIRES_X86_SSE;
28975 for (size_t k = 1; k <= 20; k += 5) {
28976 for (uint32_t m = 1; m <= 1; m++) {
28977 for (uint32_t n = 1; n <= 8; n++) {
28978 GemmMicrokernelTester()
28979 .mr(1)
28980 .nr(8)
28981 .kr(1)
28982 .sr(4)
28983 .m(m)
28984 .n(n)
28985 .k(k)
28986 .cm_stride(11)
28987 .iterations(1)
28988 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
28989 }
28990 }
28991 }
28992 }
28993
28994 TEST(F32_GEMM_1X8S4__SSE, qmin) {
28995 TEST_REQUIRES_X86_SSE;
28996 GemmMicrokernelTester()
28997 .mr(1)
28998 .nr(8)
28999 .kr(1)
29000 .sr(4)
29001 .m(1)
29002 .n(8)
29003 .k(4)
29004 .qmin(128)
29005 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
29006 }
29007
29008 TEST(F32_GEMM_1X8S4__SSE, qmax) {
29009 TEST_REQUIRES_X86_SSE;
29010 GemmMicrokernelTester()
29011 .mr(1)
29012 .nr(8)
29013 .kr(1)
29014 .sr(4)
29015 .m(1)
29016 .n(8)
29017 .k(4)
29018 .qmax(128)
29019 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
29020 }
29021
29022 TEST(F32_GEMM_1X8S4__SSE, strided_cm) {
29023 TEST_REQUIRES_X86_SSE;
29024 GemmMicrokernelTester()
29025 .mr(1)
29026 .nr(8)
29027 .kr(1)
29028 .sr(4)
29029 .m(1)
29030 .n(8)
29031 .k(4)
29032 .cm_stride(11)
29033 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
29034 }
29035#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29036
29037
29038#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29039 TEST(F32_GEMM_4X8S4__SSE, k_eq_4) {
29040 TEST_REQUIRES_X86_SSE;
29041 GemmMicrokernelTester()
29042 .mr(4)
29043 .nr(8)
29044 .kr(1)
29045 .sr(4)
29046 .m(4)
29047 .n(8)
29048 .k(4)
29049 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29050 }
29051
29052 TEST(F32_GEMM_4X8S4__SSE, strided_cn) {
29053 TEST_REQUIRES_X86_SSE;
29054 GemmMicrokernelTester()
29055 .mr(4)
29056 .nr(8)
29057 .kr(1)
29058 .sr(4)
29059 .m(4)
29060 .n(8)
29061 .k(4)
29062 .cn_stride(11)
29063 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29064 }
29065
29066 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_strided_a) {
29067 TEST_REQUIRES_X86_SSE;
29068 GemmMicrokernelTester()
29069 .mr(4)
29070 .nr(8)
29071 .kr(1)
29072 .sr(4)
29073 .m(4)
29074 .n(8)
29075 .k(4)
29076 .a_stride(7)
29077 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29078 }
29079
29080 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_subtile) {
29081 TEST_REQUIRES_X86_SSE;
29082 for (uint32_t m = 1; m <= 4; m++) {
29083 for (uint32_t n = 1; n <= 8; n++) {
29084 GemmMicrokernelTester()
29085 .mr(4)
29086 .nr(8)
29087 .kr(1)
29088 .sr(4)
29089 .m(m)
29090 .n(n)
29091 .k(4)
29092 .iterations(1)
29093 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29094 }
29095 }
29096 }
29097
29098 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_subtile_m) {
29099 TEST_REQUIRES_X86_SSE;
29100 for (uint32_t m = 1; m <= 4; m++) {
29101 GemmMicrokernelTester()
29102 .mr(4)
29103 .nr(8)
29104 .kr(1)
29105 .sr(4)
29106 .m(m)
29107 .n(8)
29108 .k(4)
29109 .iterations(1)
29110 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29111 }
29112 }
29113
29114 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_subtile_n) {
29115 TEST_REQUIRES_X86_SSE;
29116 for (uint32_t n = 1; n <= 8; n++) {
29117 GemmMicrokernelTester()
29118 .mr(4)
29119 .nr(8)
29120 .kr(1)
29121 .sr(4)
29122 .m(4)
29123 .n(n)
29124 .k(4)
29125 .iterations(1)
29126 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29127 }
29128 }
29129
29130 TEST(F32_GEMM_4X8S4__SSE, k_lt_4) {
29131 TEST_REQUIRES_X86_SSE;
29132 for (size_t k = 1; k < 4; k++) {
29133 GemmMicrokernelTester()
29134 .mr(4)
29135 .nr(8)
29136 .kr(1)
29137 .sr(4)
29138 .m(4)
29139 .n(8)
29140 .k(k)
29141 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29142 }
29143 }
29144
29145 TEST(F32_GEMM_4X8S4__SSE, k_lt_4_strided_a) {
29146 TEST_REQUIRES_X86_SSE;
29147 for (size_t k = 1; k < 4; k++) {
29148 GemmMicrokernelTester()
29149 .mr(4)
29150 .nr(8)
29151 .kr(1)
29152 .sr(4)
29153 .m(4)
29154 .n(8)
29155 .k(k)
29156 .a_stride(7)
29157 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29158 }
29159 }
29160
29161 TEST(F32_GEMM_4X8S4__SSE, k_lt_4_subtile) {
29162 TEST_REQUIRES_X86_SSE;
29163 for (size_t k = 1; k < 4; k++) {
29164 for (uint32_t m = 1; m <= 4; m++) {
29165 for (uint32_t n = 1; n <= 8; n++) {
29166 GemmMicrokernelTester()
29167 .mr(4)
29168 .nr(8)
29169 .kr(1)
29170 .sr(4)
29171 .m(m)
29172 .n(n)
29173 .k(k)
29174 .iterations(1)
29175 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29176 }
29177 }
29178 }
29179 }
29180
29181 TEST(F32_GEMM_4X8S4__SSE, k_gt_4) {
29182 TEST_REQUIRES_X86_SSE;
29183 for (size_t k = 5; k < 8; k++) {
29184 GemmMicrokernelTester()
29185 .mr(4)
29186 .nr(8)
29187 .kr(1)
29188 .sr(4)
29189 .m(4)
29190 .n(8)
29191 .k(k)
29192 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29193 }
29194 }
29195
29196 TEST(F32_GEMM_4X8S4__SSE, k_gt_4_strided_a) {
29197 TEST_REQUIRES_X86_SSE;
29198 for (size_t k = 5; k < 8; k++) {
29199 GemmMicrokernelTester()
29200 .mr(4)
29201 .nr(8)
29202 .kr(1)
29203 .sr(4)
29204 .m(4)
29205 .n(8)
29206 .k(k)
29207 .a_stride(11)
29208 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29209 }
29210 }
29211
29212 TEST(F32_GEMM_4X8S4__SSE, k_gt_4_subtile) {
29213 TEST_REQUIRES_X86_SSE;
29214 for (size_t k = 5; k < 8; k++) {
29215 for (uint32_t m = 1; m <= 4; m++) {
29216 for (uint32_t n = 1; n <= 8; n++) {
29217 GemmMicrokernelTester()
29218 .mr(4)
29219 .nr(8)
29220 .kr(1)
29221 .sr(4)
29222 .m(m)
29223 .n(n)
29224 .k(k)
29225 .iterations(1)
29226 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29227 }
29228 }
29229 }
29230 }
29231
29232 TEST(F32_GEMM_4X8S4__SSE, k_div_4) {
29233 TEST_REQUIRES_X86_SSE;
29234 for (size_t k = 8; k <= 40; k += 4) {
29235 GemmMicrokernelTester()
29236 .mr(4)
29237 .nr(8)
29238 .kr(1)
29239 .sr(4)
29240 .m(4)
29241 .n(8)
29242 .k(k)
29243 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29244 }
29245 }
29246
29247 TEST(F32_GEMM_4X8S4__SSE, k_div_4_strided_a) {
29248 TEST_REQUIRES_X86_SSE;
29249 for (size_t k = 8; k <= 40; k += 4) {
29250 GemmMicrokernelTester()
29251 .mr(4)
29252 .nr(8)
29253 .kr(1)
29254 .sr(4)
29255 .m(4)
29256 .n(8)
29257 .k(k)
29258 .a_stride(43)
29259 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29260 }
29261 }
29262
29263 TEST(F32_GEMM_4X8S4__SSE, k_div_4_subtile) {
29264 TEST_REQUIRES_X86_SSE;
29265 for (size_t k = 8; k <= 40; k += 4) {
29266 for (uint32_t m = 1; m <= 4; m++) {
29267 for (uint32_t n = 1; n <= 8; n++) {
29268 GemmMicrokernelTester()
29269 .mr(4)
29270 .nr(8)
29271 .kr(1)
29272 .sr(4)
29273 .m(m)
29274 .n(n)
29275 .k(k)
29276 .iterations(1)
29277 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29278 }
29279 }
29280 }
29281 }
29282
29283 TEST(F32_GEMM_4X8S4__SSE, n_gt_8) {
29284 TEST_REQUIRES_X86_SSE;
29285 for (uint32_t n = 9; n < 16; n++) {
29286 for (size_t k = 1; k <= 20; k += 5) {
29287 GemmMicrokernelTester()
29288 .mr(4)
29289 .nr(8)
29290 .kr(1)
29291 .sr(4)
29292 .m(4)
29293 .n(8)
29294 .k(k)
29295 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29296 }
29297 }
29298 }
29299
29300 TEST(F32_GEMM_4X8S4__SSE, n_gt_8_strided_cn) {
29301 TEST_REQUIRES_X86_SSE;
29302 for (uint32_t n = 9; n < 16; n++) {
29303 for (size_t k = 1; k <= 20; k += 5) {
29304 GemmMicrokernelTester()
29305 .mr(4)
29306 .nr(8)
29307 .kr(1)
29308 .sr(4)
29309 .m(4)
29310 .n(8)
29311 .k(k)
29312 .cn_stride(11)
29313 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29314 }
29315 }
29316 }
29317
29318 TEST(F32_GEMM_4X8S4__SSE, n_gt_8_strided_a) {
29319 TEST_REQUIRES_X86_SSE;
29320 for (uint32_t n = 9; n < 16; n++) {
29321 for (size_t k = 1; k <= 20; k += 5) {
29322 GemmMicrokernelTester()
29323 .mr(4)
29324 .nr(8)
29325 .kr(1)
29326 .sr(4)
29327 .m(4)
29328 .n(n)
29329 .k(k)
29330 .a_stride(23)
29331 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29332 }
29333 }
29334 }
29335
29336 TEST(F32_GEMM_4X8S4__SSE, n_gt_8_subtile) {
29337 TEST_REQUIRES_X86_SSE;
29338 for (uint32_t n = 9; n < 16; n++) {
29339 for (size_t k = 1; k <= 20; k += 5) {
29340 for (uint32_t m = 1; m <= 4; m++) {
29341 GemmMicrokernelTester()
29342 .mr(4)
29343 .nr(8)
29344 .kr(1)
29345 .sr(4)
29346 .m(m)
29347 .n(n)
29348 .k(k)
29349 .iterations(1)
29350 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29351 }
29352 }
29353 }
29354 }
29355
29356 TEST(F32_GEMM_4X8S4__SSE, n_div_8) {
29357 TEST_REQUIRES_X86_SSE;
29358 for (uint32_t n = 16; n <= 24; n += 8) {
29359 for (size_t k = 1; k <= 20; k += 5) {
29360 GemmMicrokernelTester()
29361 .mr(4)
29362 .nr(8)
29363 .kr(1)
29364 .sr(4)
29365 .m(4)
29366 .n(8)
29367 .k(k)
29368 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29369 }
29370 }
29371 }
29372
29373 TEST(F32_GEMM_4X8S4__SSE, n_div_8_strided_cn) {
29374 TEST_REQUIRES_X86_SSE;
29375 for (uint32_t n = 16; n <= 24; n += 8) {
29376 for (size_t k = 1; k <= 20; k += 5) {
29377 GemmMicrokernelTester()
29378 .mr(4)
29379 .nr(8)
29380 .kr(1)
29381 .sr(4)
29382 .m(4)
29383 .n(n)
29384 .k(k)
29385 .cn_stride(11)
29386 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29387 }
29388 }
29389 }
29390
29391 TEST(F32_GEMM_4X8S4__SSE, n_div_8_strided_a) {
29392 TEST_REQUIRES_X86_SSE;
29393 for (uint32_t n = 16; n <= 24; n += 8) {
29394 for (size_t k = 1; k <= 20; k += 5) {
29395 GemmMicrokernelTester()
29396 .mr(4)
29397 .nr(8)
29398 .kr(1)
29399 .sr(4)
29400 .m(4)
29401 .n(n)
29402 .k(k)
29403 .a_stride(23)
29404 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29405 }
29406 }
29407 }
29408
29409 TEST(F32_GEMM_4X8S4__SSE, n_div_8_subtile) {
29410 TEST_REQUIRES_X86_SSE;
29411 for (uint32_t n = 16; n <= 24; n += 8) {
29412 for (size_t k = 1; k <= 20; k += 5) {
29413 for (uint32_t m = 1; m <= 4; m++) {
29414 GemmMicrokernelTester()
29415 .mr(4)
29416 .nr(8)
29417 .kr(1)
29418 .sr(4)
29419 .m(m)
29420 .n(n)
29421 .k(k)
29422 .iterations(1)
29423 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29424 }
29425 }
29426 }
29427 }
29428
29429 TEST(F32_GEMM_4X8S4__SSE, strided_cm_subtile) {
29430 TEST_REQUIRES_X86_SSE;
29431 for (size_t k = 1; k <= 20; k += 5) {
29432 for (uint32_t m = 1; m <= 4; m++) {
29433 for (uint32_t n = 1; n <= 8; n++) {
29434 GemmMicrokernelTester()
29435 .mr(4)
29436 .nr(8)
29437 .kr(1)
29438 .sr(4)
29439 .m(m)
29440 .n(n)
29441 .k(k)
29442 .cm_stride(11)
29443 .iterations(1)
29444 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29445 }
29446 }
29447 }
29448 }
29449
29450 TEST(F32_GEMM_4X8S4__SSE, qmin) {
29451 TEST_REQUIRES_X86_SSE;
29452 GemmMicrokernelTester()
29453 .mr(4)
29454 .nr(8)
29455 .kr(1)
29456 .sr(4)
29457 .m(4)
29458 .n(8)
29459 .k(4)
29460 .qmin(128)
29461 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29462 }
29463
29464 TEST(F32_GEMM_4X8S4__SSE, qmax) {
29465 TEST_REQUIRES_X86_SSE;
29466 GemmMicrokernelTester()
29467 .mr(4)
29468 .nr(8)
29469 .kr(1)
29470 .sr(4)
29471 .m(4)
29472 .n(8)
29473 .k(4)
29474 .qmax(128)
29475 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29476 }
29477
29478 TEST(F32_GEMM_4X8S4__SSE, strided_cm) {
29479 TEST_REQUIRES_X86_SSE;
29480 GemmMicrokernelTester()
29481 .mr(4)
29482 .nr(8)
29483 .kr(1)
29484 .sr(4)
29485 .m(4)
29486 .n(8)
29487 .k(4)
29488 .cm_stride(11)
29489 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
29490 }
29491#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29492
29493
29494#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29495 TEST(F32_GEMM_4X2C4__SSE, k_eq_4) {
29496 TEST_REQUIRES_X86_SSE;
29497 GemmMicrokernelTester()
29498 .mr(4)
29499 .nr(2)
29500 .kr(4)
29501 .sr(1)
29502 .m(4)
29503 .n(2)
29504 .k(4)
29505 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29506 }
29507
29508 TEST(F32_GEMM_4X2C4__SSE, strided_cn) {
29509 TEST_REQUIRES_X86_SSE;
29510 GemmMicrokernelTester()
29511 .mr(4)
29512 .nr(2)
29513 .kr(4)
29514 .sr(1)
29515 .m(4)
29516 .n(2)
29517 .k(4)
29518 .cn_stride(5)
29519 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29520 }
29521
29522 TEST(F32_GEMM_4X2C4__SSE, k_eq_4_strided_a) {
29523 TEST_REQUIRES_X86_SSE;
29524 GemmMicrokernelTester()
29525 .mr(4)
29526 .nr(2)
29527 .kr(4)
29528 .sr(1)
29529 .m(4)
29530 .n(2)
29531 .k(4)
29532 .a_stride(7)
29533 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29534 }
29535
29536 TEST(F32_GEMM_4X2C4__SSE, k_eq_4_subtile) {
29537 TEST_REQUIRES_X86_SSE;
29538 for (uint32_t m = 1; m <= 4; m++) {
29539 for (uint32_t n = 1; n <= 2; n++) {
29540 GemmMicrokernelTester()
29541 .mr(4)
29542 .nr(2)
29543 .kr(4)
29544 .sr(1)
29545 .m(m)
29546 .n(n)
29547 .k(4)
29548 .iterations(1)
29549 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29550 }
29551 }
29552 }
29553
29554 TEST(F32_GEMM_4X2C4__SSE, k_eq_4_subtile_m) {
29555 TEST_REQUIRES_X86_SSE;
29556 for (uint32_t m = 1; m <= 4; m++) {
29557 GemmMicrokernelTester()
29558 .mr(4)
29559 .nr(2)
29560 .kr(4)
29561 .sr(1)
29562 .m(m)
29563 .n(2)
29564 .k(4)
29565 .iterations(1)
29566 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29567 }
29568 }
29569
29570 TEST(F32_GEMM_4X2C4__SSE, k_eq_4_subtile_n) {
29571 TEST_REQUIRES_X86_SSE;
29572 for (uint32_t n = 1; n <= 2; n++) {
29573 GemmMicrokernelTester()
29574 .mr(4)
29575 .nr(2)
29576 .kr(4)
29577 .sr(1)
29578 .m(4)
29579 .n(n)
29580 .k(4)
29581 .iterations(1)
29582 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29583 }
29584 }
29585
29586 TEST(F32_GEMM_4X2C4__SSE, k_lt_4) {
29587 TEST_REQUIRES_X86_SSE;
29588 for (size_t k = 1; k < 4; k++) {
29589 GemmMicrokernelTester()
29590 .mr(4)
29591 .nr(2)
29592 .kr(4)
29593 .sr(1)
29594 .m(4)
29595 .n(2)
29596 .k(k)
29597 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29598 }
29599 }
29600
29601 TEST(F32_GEMM_4X2C4__SSE, k_lt_4_strided_a) {
29602 TEST_REQUIRES_X86_SSE;
29603 for (size_t k = 1; k < 4; k++) {
29604 GemmMicrokernelTester()
29605 .mr(4)
29606 .nr(2)
29607 .kr(4)
29608 .sr(1)
29609 .m(4)
29610 .n(2)
29611 .k(k)
29612 .a_stride(7)
29613 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29614 }
29615 }
29616
29617 TEST(F32_GEMM_4X2C4__SSE, k_lt_4_subtile) {
29618 TEST_REQUIRES_X86_SSE;
29619 for (size_t k = 1; k < 4; k++) {
29620 for (uint32_t m = 1; m <= 4; m++) {
29621 for (uint32_t n = 1; n <= 2; n++) {
29622 GemmMicrokernelTester()
29623 .mr(4)
29624 .nr(2)
29625 .kr(4)
29626 .sr(1)
29627 .m(m)
29628 .n(n)
29629 .k(k)
29630 .iterations(1)
29631 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29632 }
29633 }
29634 }
29635 }
29636
29637 TEST(F32_GEMM_4X2C4__SSE, k_gt_4) {
29638 TEST_REQUIRES_X86_SSE;
29639 for (size_t k = 5; k < 8; k++) {
29640 GemmMicrokernelTester()
29641 .mr(4)
29642 .nr(2)
29643 .kr(4)
29644 .sr(1)
29645 .m(4)
29646 .n(2)
29647 .k(k)
29648 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29649 }
29650 }
29651
29652 TEST(F32_GEMM_4X2C4__SSE, k_gt_4_strided_a) {
29653 TEST_REQUIRES_X86_SSE;
29654 for (size_t k = 5; k < 8; k++) {
29655 GemmMicrokernelTester()
29656 .mr(4)
29657 .nr(2)
29658 .kr(4)
29659 .sr(1)
29660 .m(4)
29661 .n(2)
29662 .k(k)
29663 .a_stride(11)
29664 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29665 }
29666 }
29667
29668 TEST(F32_GEMM_4X2C4__SSE, k_gt_4_subtile) {
29669 TEST_REQUIRES_X86_SSE;
29670 for (size_t k = 5; k < 8; k++) {
29671 for (uint32_t m = 1; m <= 4; m++) {
29672 for (uint32_t n = 1; n <= 2; n++) {
29673 GemmMicrokernelTester()
29674 .mr(4)
29675 .nr(2)
29676 .kr(4)
29677 .sr(1)
29678 .m(m)
29679 .n(n)
29680 .k(k)
29681 .iterations(1)
29682 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29683 }
29684 }
29685 }
29686 }
29687
29688 TEST(F32_GEMM_4X2C4__SSE, k_div_4) {
29689 TEST_REQUIRES_X86_SSE;
29690 for (size_t k = 8; k <= 40; k += 4) {
29691 GemmMicrokernelTester()
29692 .mr(4)
29693 .nr(2)
29694 .kr(4)
29695 .sr(1)
29696 .m(4)
29697 .n(2)
29698 .k(k)
29699 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29700 }
29701 }
29702
29703 TEST(F32_GEMM_4X2C4__SSE, k_div_4_strided_a) {
29704 TEST_REQUIRES_X86_SSE;
29705 for (size_t k = 8; k <= 40; k += 4) {
29706 GemmMicrokernelTester()
29707 .mr(4)
29708 .nr(2)
29709 .kr(4)
29710 .sr(1)
29711 .m(4)
29712 .n(2)
29713 .k(k)
29714 .a_stride(43)
29715 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29716 }
29717 }
29718
29719 TEST(F32_GEMM_4X2C4__SSE, k_div_4_subtile) {
29720 TEST_REQUIRES_X86_SSE;
29721 for (size_t k = 8; k <= 40; k += 4) {
29722 for (uint32_t m = 1; m <= 4; m++) {
29723 for (uint32_t n = 1; n <= 2; n++) {
29724 GemmMicrokernelTester()
29725 .mr(4)
29726 .nr(2)
29727 .kr(4)
29728 .sr(1)
29729 .m(m)
29730 .n(n)
29731 .k(k)
29732 .iterations(1)
29733 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29734 }
29735 }
29736 }
29737 }
29738
29739 TEST(F32_GEMM_4X2C4__SSE, n_gt_2) {
29740 TEST_REQUIRES_X86_SSE;
29741 for (uint32_t n = 3; n < 4; n++) {
29742 for (size_t k = 1; k <= 20; k += 5) {
29743 GemmMicrokernelTester()
29744 .mr(4)
29745 .nr(2)
29746 .kr(4)
29747 .sr(1)
29748 .m(4)
29749 .n(2)
29750 .k(k)
29751 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29752 }
29753 }
29754 }
29755
29756 TEST(F32_GEMM_4X2C4__SSE, n_gt_2_strided_cn) {
29757 TEST_REQUIRES_X86_SSE;
29758 for (uint32_t n = 3; n < 4; n++) {
29759 for (size_t k = 1; k <= 20; k += 5) {
29760 GemmMicrokernelTester()
29761 .mr(4)
29762 .nr(2)
29763 .kr(4)
29764 .sr(1)
29765 .m(4)
29766 .n(2)
29767 .k(k)
29768 .cn_stride(5)
29769 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29770 }
29771 }
29772 }
29773
29774 TEST(F32_GEMM_4X2C4__SSE, n_gt_2_strided_a) {
29775 TEST_REQUIRES_X86_SSE;
29776 for (uint32_t n = 3; n < 4; n++) {
29777 for (size_t k = 1; k <= 20; k += 5) {
29778 GemmMicrokernelTester()
29779 .mr(4)
29780 .nr(2)
29781 .kr(4)
29782 .sr(1)
29783 .m(4)
29784 .n(n)
29785 .k(k)
29786 .a_stride(23)
29787 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29788 }
29789 }
29790 }
29791
29792 TEST(F32_GEMM_4X2C4__SSE, n_gt_2_subtile) {
29793 TEST_REQUIRES_X86_SSE;
29794 for (uint32_t n = 3; n < 4; n++) {
29795 for (size_t k = 1; k <= 20; k += 5) {
29796 for (uint32_t m = 1; m <= 4; m++) {
29797 GemmMicrokernelTester()
29798 .mr(4)
29799 .nr(2)
29800 .kr(4)
29801 .sr(1)
29802 .m(m)
29803 .n(n)
29804 .k(k)
29805 .iterations(1)
29806 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29807 }
29808 }
29809 }
29810 }
29811
29812 TEST(F32_GEMM_4X2C4__SSE, n_div_2) {
29813 TEST_REQUIRES_X86_SSE;
29814 for (uint32_t n = 4; n <= 6; n += 2) {
29815 for (size_t k = 1; k <= 20; k += 5) {
29816 GemmMicrokernelTester()
29817 .mr(4)
29818 .nr(2)
29819 .kr(4)
29820 .sr(1)
29821 .m(4)
29822 .n(2)
29823 .k(k)
29824 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29825 }
29826 }
29827 }
29828
29829 TEST(F32_GEMM_4X2C4__SSE, n_div_2_strided_cn) {
29830 TEST_REQUIRES_X86_SSE;
29831 for (uint32_t n = 4; n <= 6; n += 2) {
29832 for (size_t k = 1; k <= 20; k += 5) {
29833 GemmMicrokernelTester()
29834 .mr(4)
29835 .nr(2)
29836 .kr(4)
29837 .sr(1)
29838 .m(4)
29839 .n(n)
29840 .k(k)
29841 .cn_stride(5)
29842 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29843 }
29844 }
29845 }
29846
29847 TEST(F32_GEMM_4X2C4__SSE, n_div_2_strided_a) {
29848 TEST_REQUIRES_X86_SSE;
29849 for (uint32_t n = 4; n <= 6; n += 2) {
29850 for (size_t k = 1; k <= 20; k += 5) {
29851 GemmMicrokernelTester()
29852 .mr(4)
29853 .nr(2)
29854 .kr(4)
29855 .sr(1)
29856 .m(4)
29857 .n(n)
29858 .k(k)
29859 .a_stride(23)
29860 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29861 }
29862 }
29863 }
29864
29865 TEST(F32_GEMM_4X2C4__SSE, n_div_2_subtile) {
29866 TEST_REQUIRES_X86_SSE;
29867 for (uint32_t n = 4; n <= 6; n += 2) {
29868 for (size_t k = 1; k <= 20; k += 5) {
29869 for (uint32_t m = 1; m <= 4; m++) {
29870 GemmMicrokernelTester()
29871 .mr(4)
29872 .nr(2)
29873 .kr(4)
29874 .sr(1)
29875 .m(m)
29876 .n(n)
29877 .k(k)
29878 .iterations(1)
29879 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29880 }
29881 }
29882 }
29883 }
29884
29885 TEST(F32_GEMM_4X2C4__SSE, strided_cm_subtile) {
29886 TEST_REQUIRES_X86_SSE;
29887 for (size_t k = 1; k <= 20; k += 5) {
29888 for (uint32_t m = 1; m <= 4; m++) {
29889 for (uint32_t n = 1; n <= 2; n++) {
29890 GemmMicrokernelTester()
29891 .mr(4)
29892 .nr(2)
29893 .kr(4)
29894 .sr(1)
29895 .m(m)
29896 .n(n)
29897 .k(k)
29898 .cm_stride(5)
29899 .iterations(1)
29900 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29901 }
29902 }
29903 }
29904 }
29905
29906 TEST(F32_GEMM_4X2C4__SSE, qmin) {
29907 TEST_REQUIRES_X86_SSE;
29908 GemmMicrokernelTester()
29909 .mr(4)
29910 .nr(2)
29911 .kr(4)
29912 .sr(1)
29913 .m(4)
29914 .n(2)
29915 .k(4)
29916 .qmin(128)
29917 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29918 }
29919
29920 TEST(F32_GEMM_4X2C4__SSE, qmax) {
29921 TEST_REQUIRES_X86_SSE;
29922 GemmMicrokernelTester()
29923 .mr(4)
29924 .nr(2)
29925 .kr(4)
29926 .sr(1)
29927 .m(4)
29928 .n(2)
29929 .k(4)
29930 .qmax(128)
29931 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29932 }
29933
29934 TEST(F32_GEMM_4X2C4__SSE, strided_cm) {
29935 TEST_REQUIRES_X86_SSE;
29936 GemmMicrokernelTester()
29937 .mr(4)
29938 .nr(2)
29939 .kr(4)
29940 .sr(1)
29941 .m(4)
29942 .n(2)
29943 .k(4)
29944 .cm_stride(5)
29945 .Test(xnn_f32_gemm_ukernel_4x2c4__sse);
29946 }
29947#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29948
29949
29950#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29951 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1) {
29952 TEST_REQUIRES_X86_AVX;
29953 GemmMicrokernelTester()
29954 .mr(1)
29955 .nr(8)
29956 .kr(1)
29957 .sr(1)
29958 .m(1)
29959 .n(8)
29960 .k(1)
29961 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
29962 }
29963
29964 TEST(F32_GEMM_1X8__AVX_BROADCAST, strided_cn) {
29965 TEST_REQUIRES_X86_AVX;
29966 GemmMicrokernelTester()
29967 .mr(1)
29968 .nr(8)
29969 .kr(1)
29970 .sr(1)
29971 .m(1)
29972 .n(8)
29973 .k(1)
29974 .cn_stride(11)
29975 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
29976 }
29977
29978 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_strided_a) {
29979 TEST_REQUIRES_X86_AVX;
29980 GemmMicrokernelTester()
29981 .mr(1)
29982 .nr(8)
29983 .kr(1)
29984 .sr(1)
29985 .m(1)
29986 .n(8)
29987 .k(1)
29988 .a_stride(3)
29989 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
29990 }
29991
29992 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_subtile) {
29993 TEST_REQUIRES_X86_AVX;
29994 for (uint32_t m = 1; m <= 1; m++) {
29995 for (uint32_t n = 1; n <= 8; n++) {
29996 GemmMicrokernelTester()
29997 .mr(1)
29998 .nr(8)
29999 .kr(1)
30000 .sr(1)
30001 .m(m)
30002 .n(n)
30003 .k(1)
30004 .iterations(1)
30005 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30006 }
30007 }
30008 }
30009
30010 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
30011 TEST_REQUIRES_X86_AVX;
30012 for (uint32_t m = 1; m <= 1; m++) {
30013 GemmMicrokernelTester()
30014 .mr(1)
30015 .nr(8)
30016 .kr(1)
30017 .sr(1)
30018 .m(m)
30019 .n(8)
30020 .k(1)
30021 .iterations(1)
30022 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30023 }
30024 }
30025
30026 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
30027 TEST_REQUIRES_X86_AVX;
30028 for (uint32_t n = 1; n <= 8; n++) {
30029 GemmMicrokernelTester()
30030 .mr(1)
30031 .nr(8)
30032 .kr(1)
30033 .sr(1)
30034 .m(1)
30035 .n(n)
30036 .k(1)
30037 .iterations(1)
30038 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30039 }
30040 }
30041
30042 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_gt_1) {
30043 TEST_REQUIRES_X86_AVX;
30044 for (size_t k = 2; k < 10; k++) {
30045 GemmMicrokernelTester()
30046 .mr(1)
30047 .nr(8)
30048 .kr(1)
30049 .sr(1)
30050 .m(1)
30051 .n(8)
30052 .k(k)
30053 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30054 }
30055 }
30056
30057 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_gt_1_strided_a) {
30058 TEST_REQUIRES_X86_AVX;
30059 for (size_t k = 2; k < 10; k++) {
30060 GemmMicrokernelTester()
30061 .mr(1)
30062 .nr(8)
30063 .kr(1)
30064 .sr(1)
30065 .m(1)
30066 .n(8)
30067 .k(k)
30068 .a_stride(11)
30069 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30070 }
30071 }
30072
30073 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_gt_1_subtile) {
30074 TEST_REQUIRES_X86_AVX;
30075 for (size_t k = 2; k < 10; k++) {
30076 for (uint32_t m = 1; m <= 1; m++) {
30077 for (uint32_t n = 1; n <= 8; n++) {
30078 GemmMicrokernelTester()
30079 .mr(1)
30080 .nr(8)
30081 .kr(1)
30082 .sr(1)
30083 .m(m)
30084 .n(n)
30085 .k(k)
30086 .iterations(1)
30087 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30088 }
30089 }
30090 }
30091 }
30092
30093 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8) {
30094 TEST_REQUIRES_X86_AVX;
30095 for (uint32_t n = 9; n < 16; n++) {
30096 for (size_t k = 1; k <= 5; k += 2) {
30097 GemmMicrokernelTester()
30098 .mr(1)
30099 .nr(8)
30100 .kr(1)
30101 .sr(1)
30102 .m(1)
30103 .n(8)
30104 .k(k)
30105 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30106 }
30107 }
30108 }
30109
30110 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
30111 TEST_REQUIRES_X86_AVX;
30112 for (uint32_t n = 9; n < 16; n++) {
30113 for (size_t k = 1; k <= 5; k += 2) {
30114 GemmMicrokernelTester()
30115 .mr(1)
30116 .nr(8)
30117 .kr(1)
30118 .sr(1)
30119 .m(1)
30120 .n(8)
30121 .k(k)
30122 .cn_stride(11)
30123 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30124 }
30125 }
30126 }
30127
30128 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8_strided_a) {
30129 TEST_REQUIRES_X86_AVX;
30130 for (uint32_t n = 9; n < 16; n++) {
30131 for (size_t k = 1; k <= 5; k += 2) {
30132 GemmMicrokernelTester()
30133 .mr(1)
30134 .nr(8)
30135 .kr(1)
30136 .sr(1)
30137 .m(1)
30138 .n(n)
30139 .k(k)
30140 .a_stride(7)
30141 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30142 }
30143 }
30144 }
30145
30146 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8_subtile) {
30147 TEST_REQUIRES_X86_AVX;
30148 for (uint32_t n = 9; n < 16; n++) {
30149 for (size_t k = 1; k <= 5; k += 2) {
30150 for (uint32_t m = 1; m <= 1; m++) {
30151 GemmMicrokernelTester()
30152 .mr(1)
30153 .nr(8)
30154 .kr(1)
30155 .sr(1)
30156 .m(m)
30157 .n(n)
30158 .k(k)
30159 .iterations(1)
30160 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30161 }
30162 }
30163 }
30164 }
30165
30166 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8) {
30167 TEST_REQUIRES_X86_AVX;
30168 for (uint32_t n = 16; n <= 24; n += 8) {
30169 for (size_t k = 1; k <= 5; k += 2) {
30170 GemmMicrokernelTester()
30171 .mr(1)
30172 .nr(8)
30173 .kr(1)
30174 .sr(1)
30175 .m(1)
30176 .n(8)
30177 .k(k)
30178 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30179 }
30180 }
30181 }
30182
30183 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
30184 TEST_REQUIRES_X86_AVX;
30185 for (uint32_t n = 16; n <= 24; n += 8) {
30186 for (size_t k = 1; k <= 5; k += 2) {
30187 GemmMicrokernelTester()
30188 .mr(1)
30189 .nr(8)
30190 .kr(1)
30191 .sr(1)
30192 .m(1)
30193 .n(n)
30194 .k(k)
30195 .cn_stride(11)
30196 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30197 }
30198 }
30199 }
30200
30201 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8_strided_a) {
30202 TEST_REQUIRES_X86_AVX;
30203 for (uint32_t n = 16; n <= 24; n += 8) {
30204 for (size_t k = 1; k <= 5; k += 2) {
30205 GemmMicrokernelTester()
30206 .mr(1)
30207 .nr(8)
30208 .kr(1)
30209 .sr(1)
30210 .m(1)
30211 .n(n)
30212 .k(k)
30213 .a_stride(7)
30214 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30215 }
30216 }
30217 }
30218
30219 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8_subtile) {
30220 TEST_REQUIRES_X86_AVX;
30221 for (uint32_t n = 16; n <= 24; n += 8) {
30222 for (size_t k = 1; k <= 5; k += 2) {
30223 for (uint32_t m = 1; m <= 1; m++) {
30224 GemmMicrokernelTester()
30225 .mr(1)
30226 .nr(8)
30227 .kr(1)
30228 .sr(1)
30229 .m(m)
30230 .n(n)
30231 .k(k)
30232 .iterations(1)
30233 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30234 }
30235 }
30236 }
30237 }
30238
30239 TEST(F32_GEMM_1X8__AVX_BROADCAST, strided_cm_subtile) {
30240 TEST_REQUIRES_X86_AVX;
30241 for (size_t k = 1; k <= 5; k += 2) {
30242 for (uint32_t m = 1; m <= 1; m++) {
30243 for (uint32_t n = 1; n <= 8; n++) {
30244 GemmMicrokernelTester()
30245 .mr(1)
30246 .nr(8)
30247 .kr(1)
30248 .sr(1)
30249 .m(m)
30250 .n(n)
30251 .k(k)
30252 .cm_stride(11)
30253 .iterations(1)
30254 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30255 }
30256 }
30257 }
30258 }
30259
30260 TEST(F32_GEMM_1X8__AVX_BROADCAST, qmin) {
30261 TEST_REQUIRES_X86_AVX;
30262 GemmMicrokernelTester()
30263 .mr(1)
30264 .nr(8)
30265 .kr(1)
30266 .sr(1)
30267 .m(1)
30268 .n(8)
30269 .k(1)
30270 .qmin(128)
30271 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30272 }
30273
30274 TEST(F32_GEMM_1X8__AVX_BROADCAST, qmax) {
30275 TEST_REQUIRES_X86_AVX;
30276 GemmMicrokernelTester()
30277 .mr(1)
30278 .nr(8)
30279 .kr(1)
30280 .sr(1)
30281 .m(1)
30282 .n(8)
30283 .k(1)
30284 .qmax(128)
30285 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30286 }
30287
30288 TEST(F32_GEMM_1X8__AVX_BROADCAST, strided_cm) {
30289 TEST_REQUIRES_X86_AVX;
30290 GemmMicrokernelTester()
30291 .mr(1)
30292 .nr(8)
30293 .kr(1)
30294 .sr(1)
30295 .m(1)
30296 .n(8)
30297 .k(1)
30298 .cm_stride(11)
30299 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
30300 }
30301#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30302
30303
30304#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30305 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1) {
30306 TEST_REQUIRES_X86_AVX;
30307 GemmMicrokernelTester()
30308 .mr(4)
30309 .nr(8)
30310 .kr(1)
30311 .sr(1)
30312 .m(4)
30313 .n(8)
30314 .k(1)
30315 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30316 }
30317
30318 TEST(F32_GEMM_4X8__AVX_BROADCAST, strided_cn) {
30319 TEST_REQUIRES_X86_AVX;
30320 GemmMicrokernelTester()
30321 .mr(4)
30322 .nr(8)
30323 .kr(1)
30324 .sr(1)
30325 .m(4)
30326 .n(8)
30327 .k(1)
30328 .cn_stride(11)
30329 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30330 }
30331
30332 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_strided_a) {
30333 TEST_REQUIRES_X86_AVX;
30334 GemmMicrokernelTester()
30335 .mr(4)
30336 .nr(8)
30337 .kr(1)
30338 .sr(1)
30339 .m(4)
30340 .n(8)
30341 .k(1)
30342 .a_stride(3)
30343 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30344 }
30345
30346 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_subtile) {
30347 TEST_REQUIRES_X86_AVX;
30348 for (uint32_t m = 1; m <= 4; m++) {
30349 for (uint32_t n = 1; n <= 8; n++) {
30350 GemmMicrokernelTester()
30351 .mr(4)
30352 .nr(8)
30353 .kr(1)
30354 .sr(1)
30355 .m(m)
30356 .n(n)
30357 .k(1)
30358 .iterations(1)
30359 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30360 }
30361 }
30362 }
30363
30364 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_m) {
30365 TEST_REQUIRES_X86_AVX;
30366 for (uint32_t m = 1; m <= 4; m++) {
30367 GemmMicrokernelTester()
30368 .mr(4)
30369 .nr(8)
30370 .kr(1)
30371 .sr(1)
30372 .m(m)
30373 .n(8)
30374 .k(1)
30375 .iterations(1)
30376 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30377 }
30378 }
30379
30380 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_n) {
30381 TEST_REQUIRES_X86_AVX;
30382 for (uint32_t n = 1; n <= 8; n++) {
30383 GemmMicrokernelTester()
30384 .mr(4)
30385 .nr(8)
30386 .kr(1)
30387 .sr(1)
30388 .m(4)
30389 .n(n)
30390 .k(1)
30391 .iterations(1)
30392 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30393 }
30394 }
30395
30396 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_gt_1) {
30397 TEST_REQUIRES_X86_AVX;
30398 for (size_t k = 2; k < 10; k++) {
30399 GemmMicrokernelTester()
30400 .mr(4)
30401 .nr(8)
30402 .kr(1)
30403 .sr(1)
30404 .m(4)
30405 .n(8)
30406 .k(k)
30407 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30408 }
30409 }
30410
30411 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_gt_1_strided_a) {
30412 TEST_REQUIRES_X86_AVX;
30413 for (size_t k = 2; k < 10; k++) {
30414 GemmMicrokernelTester()
30415 .mr(4)
30416 .nr(8)
30417 .kr(1)
30418 .sr(1)
30419 .m(4)
30420 .n(8)
30421 .k(k)
30422 .a_stride(11)
30423 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30424 }
30425 }
30426
30427 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_gt_1_subtile) {
30428 TEST_REQUIRES_X86_AVX;
30429 for (size_t k = 2; k < 10; k++) {
30430 for (uint32_t m = 1; m <= 4; m++) {
30431 for (uint32_t n = 1; n <= 8; n++) {
30432 GemmMicrokernelTester()
30433 .mr(4)
30434 .nr(8)
30435 .kr(1)
30436 .sr(1)
30437 .m(m)
30438 .n(n)
30439 .k(k)
30440 .iterations(1)
30441 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30442 }
30443 }
30444 }
30445 }
30446
30447 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8) {
30448 TEST_REQUIRES_X86_AVX;
30449 for (uint32_t n = 9; n < 16; n++) {
30450 for (size_t k = 1; k <= 5; k += 2) {
30451 GemmMicrokernelTester()
30452 .mr(4)
30453 .nr(8)
30454 .kr(1)
30455 .sr(1)
30456 .m(4)
30457 .n(8)
30458 .k(k)
30459 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30460 }
30461 }
30462 }
30463
30464 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8_strided_cn) {
30465 TEST_REQUIRES_X86_AVX;
30466 for (uint32_t n = 9; n < 16; n++) {
30467 for (size_t k = 1; k <= 5; k += 2) {
30468 GemmMicrokernelTester()
30469 .mr(4)
30470 .nr(8)
30471 .kr(1)
30472 .sr(1)
30473 .m(4)
30474 .n(8)
30475 .k(k)
30476 .cn_stride(11)
30477 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30478 }
30479 }
30480 }
30481
30482 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8_strided_a) {
30483 TEST_REQUIRES_X86_AVX;
30484 for (uint32_t n = 9; n < 16; n++) {
30485 for (size_t k = 1; k <= 5; k += 2) {
30486 GemmMicrokernelTester()
30487 .mr(4)
30488 .nr(8)
30489 .kr(1)
30490 .sr(1)
30491 .m(4)
30492 .n(n)
30493 .k(k)
30494 .a_stride(7)
30495 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30496 }
30497 }
30498 }
30499
30500 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8_subtile) {
30501 TEST_REQUIRES_X86_AVX;
30502 for (uint32_t n = 9; n < 16; n++) {
30503 for (size_t k = 1; k <= 5; k += 2) {
30504 for (uint32_t m = 1; m <= 4; m++) {
30505 GemmMicrokernelTester()
30506 .mr(4)
30507 .nr(8)
30508 .kr(1)
30509 .sr(1)
30510 .m(m)
30511 .n(n)
30512 .k(k)
30513 .iterations(1)
30514 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30515 }
30516 }
30517 }
30518 }
30519
30520 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8) {
30521 TEST_REQUIRES_X86_AVX;
30522 for (uint32_t n = 16; n <= 24; n += 8) {
30523 for (size_t k = 1; k <= 5; k += 2) {
30524 GemmMicrokernelTester()
30525 .mr(4)
30526 .nr(8)
30527 .kr(1)
30528 .sr(1)
30529 .m(4)
30530 .n(8)
30531 .k(k)
30532 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30533 }
30534 }
30535 }
30536
30537 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8_strided_cn) {
30538 TEST_REQUIRES_X86_AVX;
30539 for (uint32_t n = 16; n <= 24; n += 8) {
30540 for (size_t k = 1; k <= 5; k += 2) {
30541 GemmMicrokernelTester()
30542 .mr(4)
30543 .nr(8)
30544 .kr(1)
30545 .sr(1)
30546 .m(4)
30547 .n(n)
30548 .k(k)
30549 .cn_stride(11)
30550 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30551 }
30552 }
30553 }
30554
30555 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8_strided_a) {
30556 TEST_REQUIRES_X86_AVX;
30557 for (uint32_t n = 16; n <= 24; n += 8) {
30558 for (size_t k = 1; k <= 5; k += 2) {
30559 GemmMicrokernelTester()
30560 .mr(4)
30561 .nr(8)
30562 .kr(1)
30563 .sr(1)
30564 .m(4)
30565 .n(n)
30566 .k(k)
30567 .a_stride(7)
30568 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30569 }
30570 }
30571 }
30572
30573 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8_subtile) {
30574 TEST_REQUIRES_X86_AVX;
30575 for (uint32_t n = 16; n <= 24; n += 8) {
30576 for (size_t k = 1; k <= 5; k += 2) {
30577 for (uint32_t m = 1; m <= 4; m++) {
30578 GemmMicrokernelTester()
30579 .mr(4)
30580 .nr(8)
30581 .kr(1)
30582 .sr(1)
30583 .m(m)
30584 .n(n)
30585 .k(k)
30586 .iterations(1)
30587 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30588 }
30589 }
30590 }
30591 }
30592
30593 TEST(F32_GEMM_4X8__AVX_BROADCAST, strided_cm_subtile) {
30594 TEST_REQUIRES_X86_AVX;
30595 for (size_t k = 1; k <= 5; k += 2) {
30596 for (uint32_t m = 1; m <= 4; m++) {
30597 for (uint32_t n = 1; n <= 8; n++) {
30598 GemmMicrokernelTester()
30599 .mr(4)
30600 .nr(8)
30601 .kr(1)
30602 .sr(1)
30603 .m(m)
30604 .n(n)
30605 .k(k)
30606 .cm_stride(11)
30607 .iterations(1)
30608 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30609 }
30610 }
30611 }
30612 }
30613
30614 TEST(F32_GEMM_4X8__AVX_BROADCAST, qmin) {
30615 TEST_REQUIRES_X86_AVX;
30616 GemmMicrokernelTester()
30617 .mr(4)
30618 .nr(8)
30619 .kr(1)
30620 .sr(1)
30621 .m(4)
30622 .n(8)
30623 .k(1)
30624 .qmin(128)
30625 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30626 }
30627
30628 TEST(F32_GEMM_4X8__AVX_BROADCAST, qmax) {
30629 TEST_REQUIRES_X86_AVX;
30630 GemmMicrokernelTester()
30631 .mr(4)
30632 .nr(8)
30633 .kr(1)
30634 .sr(1)
30635 .m(4)
30636 .n(8)
30637 .k(1)
30638 .qmax(128)
30639 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30640 }
30641
30642 TEST(F32_GEMM_4X8__AVX_BROADCAST, strided_cm) {
30643 TEST_REQUIRES_X86_AVX;
30644 GemmMicrokernelTester()
30645 .mr(4)
30646 .nr(8)
30647 .kr(1)
30648 .sr(1)
30649 .m(4)
30650 .n(8)
30651 .k(1)
30652 .cm_stride(11)
30653 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
30654 }
30655#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30656
30657
30658#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30659 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1) {
30660 TEST_REQUIRES_X86_AVX;
30661 GemmMicrokernelTester()
30662 .mr(5)
30663 .nr(8)
30664 .kr(1)
30665 .sr(1)
30666 .m(5)
30667 .n(8)
30668 .k(1)
30669 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30670 }
30671
30672 TEST(F32_GEMM_5X8__AVX_BROADCAST, strided_cn) {
30673 TEST_REQUIRES_X86_AVX;
30674 GemmMicrokernelTester()
30675 .mr(5)
30676 .nr(8)
30677 .kr(1)
30678 .sr(1)
30679 .m(5)
30680 .n(8)
30681 .k(1)
30682 .cn_stride(11)
30683 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30684 }
30685
30686 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_strided_a) {
30687 TEST_REQUIRES_X86_AVX;
30688 GemmMicrokernelTester()
30689 .mr(5)
30690 .nr(8)
30691 .kr(1)
30692 .sr(1)
30693 .m(5)
30694 .n(8)
30695 .k(1)
30696 .a_stride(3)
30697 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30698 }
30699
30700 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_subtile) {
30701 TEST_REQUIRES_X86_AVX;
30702 for (uint32_t m = 1; m <= 5; m++) {
30703 for (uint32_t n = 1; n <= 8; n++) {
30704 GemmMicrokernelTester()
30705 .mr(5)
30706 .nr(8)
30707 .kr(1)
30708 .sr(1)
30709 .m(m)
30710 .n(n)
30711 .k(1)
30712 .iterations(1)
30713 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30714 }
30715 }
30716 }
30717
30718 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
30719 TEST_REQUIRES_X86_AVX;
30720 for (uint32_t m = 1; m <= 5; m++) {
30721 GemmMicrokernelTester()
30722 .mr(5)
30723 .nr(8)
30724 .kr(1)
30725 .sr(1)
30726 .m(m)
30727 .n(8)
30728 .k(1)
30729 .iterations(1)
30730 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30731 }
30732 }
30733
30734 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
30735 TEST_REQUIRES_X86_AVX;
30736 for (uint32_t n = 1; n <= 8; n++) {
30737 GemmMicrokernelTester()
30738 .mr(5)
30739 .nr(8)
30740 .kr(1)
30741 .sr(1)
30742 .m(5)
30743 .n(n)
30744 .k(1)
30745 .iterations(1)
30746 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30747 }
30748 }
30749
30750 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_gt_1) {
30751 TEST_REQUIRES_X86_AVX;
30752 for (size_t k = 2; k < 10; k++) {
30753 GemmMicrokernelTester()
30754 .mr(5)
30755 .nr(8)
30756 .kr(1)
30757 .sr(1)
30758 .m(5)
30759 .n(8)
30760 .k(k)
30761 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30762 }
30763 }
30764
30765 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_gt_1_strided_a) {
30766 TEST_REQUIRES_X86_AVX;
30767 for (size_t k = 2; k < 10; k++) {
30768 GemmMicrokernelTester()
30769 .mr(5)
30770 .nr(8)
30771 .kr(1)
30772 .sr(1)
30773 .m(5)
30774 .n(8)
30775 .k(k)
30776 .a_stride(11)
30777 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30778 }
30779 }
30780
30781 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_gt_1_subtile) {
30782 TEST_REQUIRES_X86_AVX;
30783 for (size_t k = 2; k < 10; k++) {
30784 for (uint32_t m = 1; m <= 5; m++) {
30785 for (uint32_t n = 1; n <= 8; n++) {
30786 GemmMicrokernelTester()
30787 .mr(5)
30788 .nr(8)
30789 .kr(1)
30790 .sr(1)
30791 .m(m)
30792 .n(n)
30793 .k(k)
30794 .iterations(1)
30795 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30796 }
30797 }
30798 }
30799 }
30800
30801 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8) {
30802 TEST_REQUIRES_X86_AVX;
30803 for (uint32_t n = 9; n < 16; n++) {
30804 for (size_t k = 1; k <= 5; k += 2) {
30805 GemmMicrokernelTester()
30806 .mr(5)
30807 .nr(8)
30808 .kr(1)
30809 .sr(1)
30810 .m(5)
30811 .n(8)
30812 .k(k)
30813 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30814 }
30815 }
30816 }
30817
30818 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
30819 TEST_REQUIRES_X86_AVX;
30820 for (uint32_t n = 9; n < 16; n++) {
30821 for (size_t k = 1; k <= 5; k += 2) {
30822 GemmMicrokernelTester()
30823 .mr(5)
30824 .nr(8)
30825 .kr(1)
30826 .sr(1)
30827 .m(5)
30828 .n(8)
30829 .k(k)
30830 .cn_stride(11)
30831 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30832 }
30833 }
30834 }
30835
30836 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8_strided_a) {
30837 TEST_REQUIRES_X86_AVX;
30838 for (uint32_t n = 9; n < 16; n++) {
30839 for (size_t k = 1; k <= 5; k += 2) {
30840 GemmMicrokernelTester()
30841 .mr(5)
30842 .nr(8)
30843 .kr(1)
30844 .sr(1)
30845 .m(5)
30846 .n(n)
30847 .k(k)
30848 .a_stride(7)
30849 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30850 }
30851 }
30852 }
30853
30854 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8_subtile) {
30855 TEST_REQUIRES_X86_AVX;
30856 for (uint32_t n = 9; n < 16; n++) {
30857 for (size_t k = 1; k <= 5; k += 2) {
30858 for (uint32_t m = 1; m <= 5; m++) {
30859 GemmMicrokernelTester()
30860 .mr(5)
30861 .nr(8)
30862 .kr(1)
30863 .sr(1)
30864 .m(m)
30865 .n(n)
30866 .k(k)
30867 .iterations(1)
30868 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30869 }
30870 }
30871 }
30872 }
30873
30874 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8) {
30875 TEST_REQUIRES_X86_AVX;
30876 for (uint32_t n = 16; n <= 24; n += 8) {
30877 for (size_t k = 1; k <= 5; k += 2) {
30878 GemmMicrokernelTester()
30879 .mr(5)
30880 .nr(8)
30881 .kr(1)
30882 .sr(1)
30883 .m(5)
30884 .n(8)
30885 .k(k)
30886 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30887 }
30888 }
30889 }
30890
30891 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
30892 TEST_REQUIRES_X86_AVX;
30893 for (uint32_t n = 16; n <= 24; n += 8) {
30894 for (size_t k = 1; k <= 5; k += 2) {
30895 GemmMicrokernelTester()
30896 .mr(5)
30897 .nr(8)
30898 .kr(1)
30899 .sr(1)
30900 .m(5)
30901 .n(n)
30902 .k(k)
30903 .cn_stride(11)
30904 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30905 }
30906 }
30907 }
30908
30909 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8_strided_a) {
30910 TEST_REQUIRES_X86_AVX;
30911 for (uint32_t n = 16; n <= 24; n += 8) {
30912 for (size_t k = 1; k <= 5; k += 2) {
30913 GemmMicrokernelTester()
30914 .mr(5)
30915 .nr(8)
30916 .kr(1)
30917 .sr(1)
30918 .m(5)
30919 .n(n)
30920 .k(k)
30921 .a_stride(7)
30922 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30923 }
30924 }
30925 }
30926
30927 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8_subtile) {
30928 TEST_REQUIRES_X86_AVX;
30929 for (uint32_t n = 16; n <= 24; n += 8) {
30930 for (size_t k = 1; k <= 5; k += 2) {
30931 for (uint32_t m = 1; m <= 5; m++) {
30932 GemmMicrokernelTester()
30933 .mr(5)
30934 .nr(8)
30935 .kr(1)
30936 .sr(1)
30937 .m(m)
30938 .n(n)
30939 .k(k)
30940 .iterations(1)
30941 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30942 }
30943 }
30944 }
30945 }
30946
30947 TEST(F32_GEMM_5X8__AVX_BROADCAST, strided_cm_subtile) {
30948 TEST_REQUIRES_X86_AVX;
30949 for (size_t k = 1; k <= 5; k += 2) {
30950 for (uint32_t m = 1; m <= 5; m++) {
30951 for (uint32_t n = 1; n <= 8; n++) {
30952 GemmMicrokernelTester()
30953 .mr(5)
30954 .nr(8)
30955 .kr(1)
30956 .sr(1)
30957 .m(m)
30958 .n(n)
30959 .k(k)
30960 .cm_stride(11)
30961 .iterations(1)
30962 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30963 }
30964 }
30965 }
30966 }
30967
30968 TEST(F32_GEMM_5X8__AVX_BROADCAST, qmin) {
30969 TEST_REQUIRES_X86_AVX;
30970 GemmMicrokernelTester()
30971 .mr(5)
30972 .nr(8)
30973 .kr(1)
30974 .sr(1)
30975 .m(5)
30976 .n(8)
30977 .k(1)
30978 .qmin(128)
30979 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30980 }
30981
30982 TEST(F32_GEMM_5X8__AVX_BROADCAST, qmax) {
30983 TEST_REQUIRES_X86_AVX;
30984 GemmMicrokernelTester()
30985 .mr(5)
30986 .nr(8)
30987 .kr(1)
30988 .sr(1)
30989 .m(5)
30990 .n(8)
30991 .k(1)
30992 .qmax(128)
30993 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
30994 }
30995
30996 TEST(F32_GEMM_5X8__AVX_BROADCAST, strided_cm) {
30997 TEST_REQUIRES_X86_AVX;
30998 GemmMicrokernelTester()
30999 .mr(5)
31000 .nr(8)
31001 .kr(1)
31002 .sr(1)
31003 .m(5)
31004 .n(8)
31005 .k(1)
31006 .cm_stride(11)
31007 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
31008 }
31009#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31010
31011
31012#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31013 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1) {
31014 TEST_REQUIRES_X86_AVX;
31015 GemmMicrokernelTester()
31016 .mr(6)
31017 .nr(8)
31018 .kr(1)
31019 .sr(1)
31020 .m(6)
31021 .n(8)
31022 .k(1)
31023 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31024 }
31025
31026 TEST(F32_GEMM_6X8__AVX_BROADCAST, strided_cn) {
31027 TEST_REQUIRES_X86_AVX;
31028 GemmMicrokernelTester()
31029 .mr(6)
31030 .nr(8)
31031 .kr(1)
31032 .sr(1)
31033 .m(6)
31034 .n(8)
31035 .k(1)
31036 .cn_stride(11)
31037 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31038 }
31039
31040 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_strided_a) {
31041 TEST_REQUIRES_X86_AVX;
31042 GemmMicrokernelTester()
31043 .mr(6)
31044 .nr(8)
31045 .kr(1)
31046 .sr(1)
31047 .m(6)
31048 .n(8)
31049 .k(1)
31050 .a_stride(3)
31051 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31052 }
31053
31054 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_subtile) {
31055 TEST_REQUIRES_X86_AVX;
31056 for (uint32_t m = 1; m <= 6; m++) {
31057 for (uint32_t n = 1; n <= 8; n++) {
31058 GemmMicrokernelTester()
31059 .mr(6)
31060 .nr(8)
31061 .kr(1)
31062 .sr(1)
31063 .m(m)
31064 .n(n)
31065 .k(1)
31066 .iterations(1)
31067 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31068 }
31069 }
31070 }
31071
31072 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
31073 TEST_REQUIRES_X86_AVX;
31074 for (uint32_t m = 1; m <= 6; m++) {
31075 GemmMicrokernelTester()
31076 .mr(6)
31077 .nr(8)
31078 .kr(1)
31079 .sr(1)
31080 .m(m)
31081 .n(8)
31082 .k(1)
31083 .iterations(1)
31084 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31085 }
31086 }
31087
31088 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
31089 TEST_REQUIRES_X86_AVX;
31090 for (uint32_t n = 1; n <= 8; n++) {
31091 GemmMicrokernelTester()
31092 .mr(6)
31093 .nr(8)
31094 .kr(1)
31095 .sr(1)
31096 .m(6)
31097 .n(n)
31098 .k(1)
31099 .iterations(1)
31100 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31101 }
31102 }
31103
31104 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_gt_1) {
31105 TEST_REQUIRES_X86_AVX;
31106 for (size_t k = 2; k < 10; k++) {
31107 GemmMicrokernelTester()
31108 .mr(6)
31109 .nr(8)
31110 .kr(1)
31111 .sr(1)
31112 .m(6)
31113 .n(8)
31114 .k(k)
31115 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31116 }
31117 }
31118
31119 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_gt_1_strided_a) {
31120 TEST_REQUIRES_X86_AVX;
31121 for (size_t k = 2; k < 10; k++) {
31122 GemmMicrokernelTester()
31123 .mr(6)
31124 .nr(8)
31125 .kr(1)
31126 .sr(1)
31127 .m(6)
31128 .n(8)
31129 .k(k)
31130 .a_stride(11)
31131 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31132 }
31133 }
31134
31135 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_gt_1_subtile) {
31136 TEST_REQUIRES_X86_AVX;
31137 for (size_t k = 2; k < 10; k++) {
31138 for (uint32_t m = 1; m <= 6; m++) {
31139 for (uint32_t n = 1; n <= 8; n++) {
31140 GemmMicrokernelTester()
31141 .mr(6)
31142 .nr(8)
31143 .kr(1)
31144 .sr(1)
31145 .m(m)
31146 .n(n)
31147 .k(k)
31148 .iterations(1)
31149 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31150 }
31151 }
31152 }
31153 }
31154
31155 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8) {
31156 TEST_REQUIRES_X86_AVX;
31157 for (uint32_t n = 9; n < 16; n++) {
31158 for (size_t k = 1; k <= 5; k += 2) {
31159 GemmMicrokernelTester()
31160 .mr(6)
31161 .nr(8)
31162 .kr(1)
31163 .sr(1)
31164 .m(6)
31165 .n(8)
31166 .k(k)
31167 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31168 }
31169 }
31170 }
31171
31172 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
31173 TEST_REQUIRES_X86_AVX;
31174 for (uint32_t n = 9; n < 16; n++) {
31175 for (size_t k = 1; k <= 5; k += 2) {
31176 GemmMicrokernelTester()
31177 .mr(6)
31178 .nr(8)
31179 .kr(1)
31180 .sr(1)
31181 .m(6)
31182 .n(8)
31183 .k(k)
31184 .cn_stride(11)
31185 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31186 }
31187 }
31188 }
31189
31190 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8_strided_a) {
31191 TEST_REQUIRES_X86_AVX;
31192 for (uint32_t n = 9; n < 16; n++) {
31193 for (size_t k = 1; k <= 5; k += 2) {
31194 GemmMicrokernelTester()
31195 .mr(6)
31196 .nr(8)
31197 .kr(1)
31198 .sr(1)
31199 .m(6)
31200 .n(n)
31201 .k(k)
31202 .a_stride(7)
31203 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31204 }
31205 }
31206 }
31207
31208 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8_subtile) {
31209 TEST_REQUIRES_X86_AVX;
31210 for (uint32_t n = 9; n < 16; n++) {
31211 for (size_t k = 1; k <= 5; k += 2) {
31212 for (uint32_t m = 1; m <= 6; m++) {
31213 GemmMicrokernelTester()
31214 .mr(6)
31215 .nr(8)
31216 .kr(1)
31217 .sr(1)
31218 .m(m)
31219 .n(n)
31220 .k(k)
31221 .iterations(1)
31222 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31223 }
31224 }
31225 }
31226 }
31227
31228 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8) {
31229 TEST_REQUIRES_X86_AVX;
31230 for (uint32_t n = 16; n <= 24; n += 8) {
31231 for (size_t k = 1; k <= 5; k += 2) {
31232 GemmMicrokernelTester()
31233 .mr(6)
31234 .nr(8)
31235 .kr(1)
31236 .sr(1)
31237 .m(6)
31238 .n(8)
31239 .k(k)
31240 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31241 }
31242 }
31243 }
31244
31245 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
31246 TEST_REQUIRES_X86_AVX;
31247 for (uint32_t n = 16; n <= 24; n += 8) {
31248 for (size_t k = 1; k <= 5; k += 2) {
31249 GemmMicrokernelTester()
31250 .mr(6)
31251 .nr(8)
31252 .kr(1)
31253 .sr(1)
31254 .m(6)
31255 .n(n)
31256 .k(k)
31257 .cn_stride(11)
31258 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31259 }
31260 }
31261 }
31262
31263 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8_strided_a) {
31264 TEST_REQUIRES_X86_AVX;
31265 for (uint32_t n = 16; n <= 24; n += 8) {
31266 for (size_t k = 1; k <= 5; k += 2) {
31267 GemmMicrokernelTester()
31268 .mr(6)
31269 .nr(8)
31270 .kr(1)
31271 .sr(1)
31272 .m(6)
31273 .n(n)
31274 .k(k)
31275 .a_stride(7)
31276 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31277 }
31278 }
31279 }
31280
31281 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8_subtile) {
31282 TEST_REQUIRES_X86_AVX;
31283 for (uint32_t n = 16; n <= 24; n += 8) {
31284 for (size_t k = 1; k <= 5; k += 2) {
31285 for (uint32_t m = 1; m <= 6; m++) {
31286 GemmMicrokernelTester()
31287 .mr(6)
31288 .nr(8)
31289 .kr(1)
31290 .sr(1)
31291 .m(m)
31292 .n(n)
31293 .k(k)
31294 .iterations(1)
31295 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31296 }
31297 }
31298 }
31299 }
31300
31301 TEST(F32_GEMM_6X8__AVX_BROADCAST, strided_cm_subtile) {
31302 TEST_REQUIRES_X86_AVX;
31303 for (size_t k = 1; k <= 5; k += 2) {
31304 for (uint32_t m = 1; m <= 6; m++) {
31305 for (uint32_t n = 1; n <= 8; n++) {
31306 GemmMicrokernelTester()
31307 .mr(6)
31308 .nr(8)
31309 .kr(1)
31310 .sr(1)
31311 .m(m)
31312 .n(n)
31313 .k(k)
31314 .cm_stride(11)
31315 .iterations(1)
31316 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31317 }
31318 }
31319 }
31320 }
31321
31322 TEST(F32_GEMM_6X8__AVX_BROADCAST, qmin) {
31323 TEST_REQUIRES_X86_AVX;
31324 GemmMicrokernelTester()
31325 .mr(6)
31326 .nr(8)
31327 .kr(1)
31328 .sr(1)
31329 .m(6)
31330 .n(8)
31331 .k(1)
31332 .qmin(128)
31333 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31334 }
31335
31336 TEST(F32_GEMM_6X8__AVX_BROADCAST, qmax) {
31337 TEST_REQUIRES_X86_AVX;
31338 GemmMicrokernelTester()
31339 .mr(6)
31340 .nr(8)
31341 .kr(1)
31342 .sr(1)
31343 .m(6)
31344 .n(8)
31345 .k(1)
31346 .qmax(128)
31347 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31348 }
31349
31350 TEST(F32_GEMM_6X8__AVX_BROADCAST, strided_cm) {
31351 TEST_REQUIRES_X86_AVX;
31352 GemmMicrokernelTester()
31353 .mr(6)
31354 .nr(8)
31355 .kr(1)
31356 .sr(1)
31357 .m(6)
31358 .n(8)
31359 .k(1)
31360 .cm_stride(11)
31361 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
31362 }
31363#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31364
31365
31366#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31367 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1) {
31368 TEST_REQUIRES_X86_AVX;
31369 GemmMicrokernelTester()
31370 .mr(7)
31371 .nr(8)
31372 .kr(1)
31373 .sr(1)
31374 .m(7)
31375 .n(8)
31376 .k(1)
31377 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31378 }
31379
31380 TEST(F32_GEMM_7X8__AVX_BROADCAST, strided_cn) {
31381 TEST_REQUIRES_X86_AVX;
31382 GemmMicrokernelTester()
31383 .mr(7)
31384 .nr(8)
31385 .kr(1)
31386 .sr(1)
31387 .m(7)
31388 .n(8)
31389 .k(1)
31390 .cn_stride(11)
31391 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31392 }
31393
31394 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_strided_a) {
31395 TEST_REQUIRES_X86_AVX;
31396 GemmMicrokernelTester()
31397 .mr(7)
31398 .nr(8)
31399 .kr(1)
31400 .sr(1)
31401 .m(7)
31402 .n(8)
31403 .k(1)
31404 .a_stride(3)
31405 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31406 }
31407
31408 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_subtile) {
31409 TEST_REQUIRES_X86_AVX;
31410 for (uint32_t m = 1; m <= 7; m++) {
31411 for (uint32_t n = 1; n <= 8; n++) {
31412 GemmMicrokernelTester()
31413 .mr(7)
31414 .nr(8)
31415 .kr(1)
31416 .sr(1)
31417 .m(m)
31418 .n(n)
31419 .k(1)
31420 .iterations(1)
31421 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31422 }
31423 }
31424 }
31425
31426 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_m) {
31427 TEST_REQUIRES_X86_AVX;
31428 for (uint32_t m = 1; m <= 7; m++) {
31429 GemmMicrokernelTester()
31430 .mr(7)
31431 .nr(8)
31432 .kr(1)
31433 .sr(1)
31434 .m(m)
31435 .n(8)
31436 .k(1)
31437 .iterations(1)
31438 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31439 }
31440 }
31441
31442 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_n) {
31443 TEST_REQUIRES_X86_AVX;
31444 for (uint32_t n = 1; n <= 8; n++) {
31445 GemmMicrokernelTester()
31446 .mr(7)
31447 .nr(8)
31448 .kr(1)
31449 .sr(1)
31450 .m(7)
31451 .n(n)
31452 .k(1)
31453 .iterations(1)
31454 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31455 }
31456 }
31457
31458 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_gt_1) {
31459 TEST_REQUIRES_X86_AVX;
31460 for (size_t k = 2; k < 10; k++) {
31461 GemmMicrokernelTester()
31462 .mr(7)
31463 .nr(8)
31464 .kr(1)
31465 .sr(1)
31466 .m(7)
31467 .n(8)
31468 .k(k)
31469 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31470 }
31471 }
31472
31473 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_gt_1_strided_a) {
31474 TEST_REQUIRES_X86_AVX;
31475 for (size_t k = 2; k < 10; k++) {
31476 GemmMicrokernelTester()
31477 .mr(7)
31478 .nr(8)
31479 .kr(1)
31480 .sr(1)
31481 .m(7)
31482 .n(8)
31483 .k(k)
31484 .a_stride(11)
31485 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31486 }
31487 }
31488
31489 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_gt_1_subtile) {
31490 TEST_REQUIRES_X86_AVX;
31491 for (size_t k = 2; k < 10; k++) {
31492 for (uint32_t m = 1; m <= 7; m++) {
31493 for (uint32_t n = 1; n <= 8; n++) {
31494 GemmMicrokernelTester()
31495 .mr(7)
31496 .nr(8)
31497 .kr(1)
31498 .sr(1)
31499 .m(m)
31500 .n(n)
31501 .k(k)
31502 .iterations(1)
31503 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31504 }
31505 }
31506 }
31507 }
31508
31509 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8) {
31510 TEST_REQUIRES_X86_AVX;
31511 for (uint32_t n = 9; n < 16; n++) {
31512 for (size_t k = 1; k <= 5; k += 2) {
31513 GemmMicrokernelTester()
31514 .mr(7)
31515 .nr(8)
31516 .kr(1)
31517 .sr(1)
31518 .m(7)
31519 .n(8)
31520 .k(k)
31521 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31522 }
31523 }
31524 }
31525
31526 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8_strided_cn) {
31527 TEST_REQUIRES_X86_AVX;
31528 for (uint32_t n = 9; n < 16; n++) {
31529 for (size_t k = 1; k <= 5; k += 2) {
31530 GemmMicrokernelTester()
31531 .mr(7)
31532 .nr(8)
31533 .kr(1)
31534 .sr(1)
31535 .m(7)
31536 .n(8)
31537 .k(k)
31538 .cn_stride(11)
31539 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31540 }
31541 }
31542 }
31543
31544 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8_strided_a) {
31545 TEST_REQUIRES_X86_AVX;
31546 for (uint32_t n = 9; n < 16; n++) {
31547 for (size_t k = 1; k <= 5; k += 2) {
31548 GemmMicrokernelTester()
31549 .mr(7)
31550 .nr(8)
31551 .kr(1)
31552 .sr(1)
31553 .m(7)
31554 .n(n)
31555 .k(k)
31556 .a_stride(7)
31557 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31558 }
31559 }
31560 }
31561
31562 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8_subtile) {
31563 TEST_REQUIRES_X86_AVX;
31564 for (uint32_t n = 9; n < 16; n++) {
31565 for (size_t k = 1; k <= 5; k += 2) {
31566 for (uint32_t m = 1; m <= 7; m++) {
31567 GemmMicrokernelTester()
31568 .mr(7)
31569 .nr(8)
31570 .kr(1)
31571 .sr(1)
31572 .m(m)
31573 .n(n)
31574 .k(k)
31575 .iterations(1)
31576 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31577 }
31578 }
31579 }
31580 }
31581
31582 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8) {
31583 TEST_REQUIRES_X86_AVX;
31584 for (uint32_t n = 16; n <= 24; n += 8) {
31585 for (size_t k = 1; k <= 5; k += 2) {
31586 GemmMicrokernelTester()
31587 .mr(7)
31588 .nr(8)
31589 .kr(1)
31590 .sr(1)
31591 .m(7)
31592 .n(8)
31593 .k(k)
31594 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31595 }
31596 }
31597 }
31598
31599 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8_strided_cn) {
31600 TEST_REQUIRES_X86_AVX;
31601 for (uint32_t n = 16; n <= 24; n += 8) {
31602 for (size_t k = 1; k <= 5; k += 2) {
31603 GemmMicrokernelTester()
31604 .mr(7)
31605 .nr(8)
31606 .kr(1)
31607 .sr(1)
31608 .m(7)
31609 .n(n)
31610 .k(k)
31611 .cn_stride(11)
31612 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31613 }
31614 }
31615 }
31616
31617 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8_strided_a) {
31618 TEST_REQUIRES_X86_AVX;
31619 for (uint32_t n = 16; n <= 24; n += 8) {
31620 for (size_t k = 1; k <= 5; k += 2) {
31621 GemmMicrokernelTester()
31622 .mr(7)
31623 .nr(8)
31624 .kr(1)
31625 .sr(1)
31626 .m(7)
31627 .n(n)
31628 .k(k)
31629 .a_stride(7)
31630 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31631 }
31632 }
31633 }
31634
31635 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8_subtile) {
31636 TEST_REQUIRES_X86_AVX;
31637 for (uint32_t n = 16; n <= 24; n += 8) {
31638 for (size_t k = 1; k <= 5; k += 2) {
31639 for (uint32_t m = 1; m <= 7; m++) {
31640 GemmMicrokernelTester()
31641 .mr(7)
31642 .nr(8)
31643 .kr(1)
31644 .sr(1)
31645 .m(m)
31646 .n(n)
31647 .k(k)
31648 .iterations(1)
31649 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31650 }
31651 }
31652 }
31653 }
31654
31655 TEST(F32_GEMM_7X8__AVX_BROADCAST, strided_cm_subtile) {
31656 TEST_REQUIRES_X86_AVX;
31657 for (size_t k = 1; k <= 5; k += 2) {
31658 for (uint32_t m = 1; m <= 7; m++) {
31659 for (uint32_t n = 1; n <= 8; n++) {
31660 GemmMicrokernelTester()
31661 .mr(7)
31662 .nr(8)
31663 .kr(1)
31664 .sr(1)
31665 .m(m)
31666 .n(n)
31667 .k(k)
31668 .cm_stride(11)
31669 .iterations(1)
31670 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31671 }
31672 }
31673 }
31674 }
31675
31676 TEST(F32_GEMM_7X8__AVX_BROADCAST, qmin) {
31677 TEST_REQUIRES_X86_AVX;
31678 GemmMicrokernelTester()
31679 .mr(7)
31680 .nr(8)
31681 .kr(1)
31682 .sr(1)
31683 .m(7)
31684 .n(8)
31685 .k(1)
31686 .qmin(128)
31687 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31688 }
31689
31690 TEST(F32_GEMM_7X8__AVX_BROADCAST, qmax) {
31691 TEST_REQUIRES_X86_AVX;
31692 GemmMicrokernelTester()
31693 .mr(7)
31694 .nr(8)
31695 .kr(1)
31696 .sr(1)
31697 .m(7)
31698 .n(8)
31699 .k(1)
31700 .qmax(128)
31701 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31702 }
31703
31704 TEST(F32_GEMM_7X8__AVX_BROADCAST, strided_cm) {
31705 TEST_REQUIRES_X86_AVX;
31706 GemmMicrokernelTester()
31707 .mr(7)
31708 .nr(8)
31709 .kr(1)
31710 .sr(1)
31711 .m(7)
31712 .n(8)
31713 .k(1)
31714 .cm_stride(11)
31715 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
31716 }
31717#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31718
31719
31720#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31721 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1) {
31722 TEST_REQUIRES_X86_AVX;
31723 GemmMicrokernelTester()
31724 .mr(1)
31725 .nr(16)
31726 .kr(1)
31727 .sr(1)
31728 .m(1)
31729 .n(16)
31730 .k(1)
31731 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31732 }
31733
31734 TEST(F32_GEMM_1X16__AVX_BROADCAST, strided_cn) {
31735 TEST_REQUIRES_X86_AVX;
31736 GemmMicrokernelTester()
31737 .mr(1)
31738 .nr(16)
31739 .kr(1)
31740 .sr(1)
31741 .m(1)
31742 .n(16)
31743 .k(1)
31744 .cn_stride(19)
31745 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31746 }
31747
31748 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_strided_a) {
31749 TEST_REQUIRES_X86_AVX;
31750 GemmMicrokernelTester()
31751 .mr(1)
31752 .nr(16)
31753 .kr(1)
31754 .sr(1)
31755 .m(1)
31756 .n(16)
31757 .k(1)
31758 .a_stride(3)
31759 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31760 }
31761
31762 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_subtile) {
31763 TEST_REQUIRES_X86_AVX;
31764 for (uint32_t m = 1; m <= 1; m++) {
31765 for (uint32_t n = 1; n <= 16; n++) {
31766 GemmMicrokernelTester()
31767 .mr(1)
31768 .nr(16)
31769 .kr(1)
31770 .sr(1)
31771 .m(m)
31772 .n(n)
31773 .k(1)
31774 .iterations(1)
31775 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31776 }
31777 }
31778 }
31779
31780 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_m) {
31781 TEST_REQUIRES_X86_AVX;
31782 for (uint32_t m = 1; m <= 1; m++) {
31783 GemmMicrokernelTester()
31784 .mr(1)
31785 .nr(16)
31786 .kr(1)
31787 .sr(1)
31788 .m(m)
31789 .n(16)
31790 .k(1)
31791 .iterations(1)
31792 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31793 }
31794 }
31795
31796 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_n) {
31797 TEST_REQUIRES_X86_AVX;
31798 for (uint32_t n = 1; n <= 16; n++) {
31799 GemmMicrokernelTester()
31800 .mr(1)
31801 .nr(16)
31802 .kr(1)
31803 .sr(1)
31804 .m(1)
31805 .n(n)
31806 .k(1)
31807 .iterations(1)
31808 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31809 }
31810 }
31811
31812 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_gt_1) {
31813 TEST_REQUIRES_X86_AVX;
31814 for (size_t k = 2; k < 10; k++) {
31815 GemmMicrokernelTester()
31816 .mr(1)
31817 .nr(16)
31818 .kr(1)
31819 .sr(1)
31820 .m(1)
31821 .n(16)
31822 .k(k)
31823 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31824 }
31825 }
31826
31827 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_gt_1_strided_a) {
31828 TEST_REQUIRES_X86_AVX;
31829 for (size_t k = 2; k < 10; k++) {
31830 GemmMicrokernelTester()
31831 .mr(1)
31832 .nr(16)
31833 .kr(1)
31834 .sr(1)
31835 .m(1)
31836 .n(16)
31837 .k(k)
31838 .a_stride(11)
31839 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31840 }
31841 }
31842
31843 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_gt_1_subtile) {
31844 TEST_REQUIRES_X86_AVX;
31845 for (size_t k = 2; k < 10; k++) {
31846 for (uint32_t m = 1; m <= 1; m++) {
31847 for (uint32_t n = 1; n <= 16; n++) {
31848 GemmMicrokernelTester()
31849 .mr(1)
31850 .nr(16)
31851 .kr(1)
31852 .sr(1)
31853 .m(m)
31854 .n(n)
31855 .k(k)
31856 .iterations(1)
31857 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31858 }
31859 }
31860 }
31861 }
31862
31863 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16) {
31864 TEST_REQUIRES_X86_AVX;
31865 for (uint32_t n = 17; n < 32; n++) {
31866 for (size_t k = 1; k <= 5; k += 2) {
31867 GemmMicrokernelTester()
31868 .mr(1)
31869 .nr(16)
31870 .kr(1)
31871 .sr(1)
31872 .m(1)
31873 .n(16)
31874 .k(k)
31875 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31876 }
31877 }
31878 }
31879
31880 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16_strided_cn) {
31881 TEST_REQUIRES_X86_AVX;
31882 for (uint32_t n = 17; n < 32; n++) {
31883 for (size_t k = 1; k <= 5; k += 2) {
31884 GemmMicrokernelTester()
31885 .mr(1)
31886 .nr(16)
31887 .kr(1)
31888 .sr(1)
31889 .m(1)
31890 .n(16)
31891 .k(k)
31892 .cn_stride(19)
31893 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31894 }
31895 }
31896 }
31897
31898 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16_strided_a) {
31899 TEST_REQUIRES_X86_AVX;
31900 for (uint32_t n = 17; n < 32; n++) {
31901 for (size_t k = 1; k <= 5; k += 2) {
31902 GemmMicrokernelTester()
31903 .mr(1)
31904 .nr(16)
31905 .kr(1)
31906 .sr(1)
31907 .m(1)
31908 .n(n)
31909 .k(k)
31910 .a_stride(7)
31911 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31912 }
31913 }
31914 }
31915
31916 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16_subtile) {
31917 TEST_REQUIRES_X86_AVX;
31918 for (uint32_t n = 17; n < 32; n++) {
31919 for (size_t k = 1; k <= 5; k += 2) {
31920 for (uint32_t m = 1; m <= 1; m++) {
31921 GemmMicrokernelTester()
31922 .mr(1)
31923 .nr(16)
31924 .kr(1)
31925 .sr(1)
31926 .m(m)
31927 .n(n)
31928 .k(k)
31929 .iterations(1)
31930 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31931 }
31932 }
31933 }
31934 }
31935
31936 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16) {
31937 TEST_REQUIRES_X86_AVX;
31938 for (uint32_t n = 32; n <= 48; n += 16) {
31939 for (size_t k = 1; k <= 5; k += 2) {
31940 GemmMicrokernelTester()
31941 .mr(1)
31942 .nr(16)
31943 .kr(1)
31944 .sr(1)
31945 .m(1)
31946 .n(16)
31947 .k(k)
31948 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31949 }
31950 }
31951 }
31952
31953 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16_strided_cn) {
31954 TEST_REQUIRES_X86_AVX;
31955 for (uint32_t n = 32; n <= 48; n += 16) {
31956 for (size_t k = 1; k <= 5; k += 2) {
31957 GemmMicrokernelTester()
31958 .mr(1)
31959 .nr(16)
31960 .kr(1)
31961 .sr(1)
31962 .m(1)
31963 .n(n)
31964 .k(k)
31965 .cn_stride(19)
31966 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31967 }
31968 }
31969 }
31970
31971 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16_strided_a) {
31972 TEST_REQUIRES_X86_AVX;
31973 for (uint32_t n = 32; n <= 48; n += 16) {
31974 for (size_t k = 1; k <= 5; k += 2) {
31975 GemmMicrokernelTester()
31976 .mr(1)
31977 .nr(16)
31978 .kr(1)
31979 .sr(1)
31980 .m(1)
31981 .n(n)
31982 .k(k)
31983 .a_stride(7)
31984 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
31985 }
31986 }
31987 }
31988
31989 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16_subtile) {
31990 TEST_REQUIRES_X86_AVX;
31991 for (uint32_t n = 32; n <= 48; n += 16) {
31992 for (size_t k = 1; k <= 5; k += 2) {
31993 for (uint32_t m = 1; m <= 1; m++) {
31994 GemmMicrokernelTester()
31995 .mr(1)
31996 .nr(16)
31997 .kr(1)
31998 .sr(1)
31999 .m(m)
32000 .n(n)
32001 .k(k)
32002 .iterations(1)
32003 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
32004 }
32005 }
32006 }
32007 }
32008
32009 TEST(F32_GEMM_1X16__AVX_BROADCAST, strided_cm_subtile) {
32010 TEST_REQUIRES_X86_AVX;
32011 for (size_t k = 1; k <= 5; k += 2) {
32012 for (uint32_t m = 1; m <= 1; m++) {
32013 for (uint32_t n = 1; n <= 16; n++) {
32014 GemmMicrokernelTester()
32015 .mr(1)
32016 .nr(16)
32017 .kr(1)
32018 .sr(1)
32019 .m(m)
32020 .n(n)
32021 .k(k)
32022 .cm_stride(19)
32023 .iterations(1)
32024 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
32025 }
32026 }
32027 }
32028 }
32029
32030 TEST(F32_GEMM_1X16__AVX_BROADCAST, qmin) {
32031 TEST_REQUIRES_X86_AVX;
32032 GemmMicrokernelTester()
32033 .mr(1)
32034 .nr(16)
32035 .kr(1)
32036 .sr(1)
32037 .m(1)
32038 .n(16)
32039 .k(1)
32040 .qmin(128)
32041 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
32042 }
32043
32044 TEST(F32_GEMM_1X16__AVX_BROADCAST, qmax) {
32045 TEST_REQUIRES_X86_AVX;
32046 GemmMicrokernelTester()
32047 .mr(1)
32048 .nr(16)
32049 .kr(1)
32050 .sr(1)
32051 .m(1)
32052 .n(16)
32053 .k(1)
32054 .qmax(128)
32055 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
32056 }
32057
32058 TEST(F32_GEMM_1X16__AVX_BROADCAST, strided_cm) {
32059 TEST_REQUIRES_X86_AVX;
32060 GemmMicrokernelTester()
32061 .mr(1)
32062 .nr(16)
32063 .kr(1)
32064 .sr(1)
32065 .m(1)
32066 .n(16)
32067 .k(1)
32068 .cm_stride(19)
32069 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
32070 }
32071#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32072
32073
32074#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32075 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1) {
32076 TEST_REQUIRES_X86_AVX;
32077 GemmMicrokernelTester()
32078 .mr(3)
32079 .nr(16)
32080 .kr(1)
32081 .sr(1)
32082 .m(3)
32083 .n(16)
32084 .k(1)
32085 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32086 }
32087
32088 TEST(F32_GEMM_3X16__AVX_BROADCAST, strided_cn) {
32089 TEST_REQUIRES_X86_AVX;
32090 GemmMicrokernelTester()
32091 .mr(3)
32092 .nr(16)
32093 .kr(1)
32094 .sr(1)
32095 .m(3)
32096 .n(16)
32097 .k(1)
32098 .cn_stride(19)
32099 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32100 }
32101
32102 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_strided_a) {
32103 TEST_REQUIRES_X86_AVX;
32104 GemmMicrokernelTester()
32105 .mr(3)
32106 .nr(16)
32107 .kr(1)
32108 .sr(1)
32109 .m(3)
32110 .n(16)
32111 .k(1)
32112 .a_stride(3)
32113 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32114 }
32115
32116 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_subtile) {
32117 TEST_REQUIRES_X86_AVX;
32118 for (uint32_t m = 1; m <= 3; m++) {
32119 for (uint32_t n = 1; n <= 16; n++) {
32120 GemmMicrokernelTester()
32121 .mr(3)
32122 .nr(16)
32123 .kr(1)
32124 .sr(1)
32125 .m(m)
32126 .n(n)
32127 .k(1)
32128 .iterations(1)
32129 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32130 }
32131 }
32132 }
32133
32134 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
32135 TEST_REQUIRES_X86_AVX;
32136 for (uint32_t m = 1; m <= 3; m++) {
32137 GemmMicrokernelTester()
32138 .mr(3)
32139 .nr(16)
32140 .kr(1)
32141 .sr(1)
32142 .m(m)
32143 .n(16)
32144 .k(1)
32145 .iterations(1)
32146 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32147 }
32148 }
32149
32150 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
32151 TEST_REQUIRES_X86_AVX;
32152 for (uint32_t n = 1; n <= 16; n++) {
32153 GemmMicrokernelTester()
32154 .mr(3)
32155 .nr(16)
32156 .kr(1)
32157 .sr(1)
32158 .m(3)
32159 .n(n)
32160 .k(1)
32161 .iterations(1)
32162 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32163 }
32164 }
32165
32166 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_gt_1) {
32167 TEST_REQUIRES_X86_AVX;
32168 for (size_t k = 2; k < 10; k++) {
32169 GemmMicrokernelTester()
32170 .mr(3)
32171 .nr(16)
32172 .kr(1)
32173 .sr(1)
32174 .m(3)
32175 .n(16)
32176 .k(k)
32177 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32178 }
32179 }
32180
32181 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_gt_1_strided_a) {
32182 TEST_REQUIRES_X86_AVX;
32183 for (size_t k = 2; k < 10; k++) {
32184 GemmMicrokernelTester()
32185 .mr(3)
32186 .nr(16)
32187 .kr(1)
32188 .sr(1)
32189 .m(3)
32190 .n(16)
32191 .k(k)
32192 .a_stride(11)
32193 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32194 }
32195 }
32196
32197 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_gt_1_subtile) {
32198 TEST_REQUIRES_X86_AVX;
32199 for (size_t k = 2; k < 10; k++) {
32200 for (uint32_t m = 1; m <= 3; m++) {
32201 for (uint32_t n = 1; n <= 16; n++) {
32202 GemmMicrokernelTester()
32203 .mr(3)
32204 .nr(16)
32205 .kr(1)
32206 .sr(1)
32207 .m(m)
32208 .n(n)
32209 .k(k)
32210 .iterations(1)
32211 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32212 }
32213 }
32214 }
32215 }
32216
32217 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16) {
32218 TEST_REQUIRES_X86_AVX;
32219 for (uint32_t n = 17; n < 32; n++) {
32220 for (size_t k = 1; k <= 5; k += 2) {
32221 GemmMicrokernelTester()
32222 .mr(3)
32223 .nr(16)
32224 .kr(1)
32225 .sr(1)
32226 .m(3)
32227 .n(16)
32228 .k(k)
32229 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32230 }
32231 }
32232 }
32233
32234 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
32235 TEST_REQUIRES_X86_AVX;
32236 for (uint32_t n = 17; n < 32; n++) {
32237 for (size_t k = 1; k <= 5; k += 2) {
32238 GemmMicrokernelTester()
32239 .mr(3)
32240 .nr(16)
32241 .kr(1)
32242 .sr(1)
32243 .m(3)
32244 .n(16)
32245 .k(k)
32246 .cn_stride(19)
32247 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32248 }
32249 }
32250 }
32251
32252 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16_strided_a) {
32253 TEST_REQUIRES_X86_AVX;
32254 for (uint32_t n = 17; n < 32; n++) {
32255 for (size_t k = 1; k <= 5; k += 2) {
32256 GemmMicrokernelTester()
32257 .mr(3)
32258 .nr(16)
32259 .kr(1)
32260 .sr(1)
32261 .m(3)
32262 .n(n)
32263 .k(k)
32264 .a_stride(7)
32265 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32266 }
32267 }
32268 }
32269
32270 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16_subtile) {
32271 TEST_REQUIRES_X86_AVX;
32272 for (uint32_t n = 17; n < 32; n++) {
32273 for (size_t k = 1; k <= 5; k += 2) {
32274 for (uint32_t m = 1; m <= 3; m++) {
32275 GemmMicrokernelTester()
32276 .mr(3)
32277 .nr(16)
32278 .kr(1)
32279 .sr(1)
32280 .m(m)
32281 .n(n)
32282 .k(k)
32283 .iterations(1)
32284 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32285 }
32286 }
32287 }
32288 }
32289
32290 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16) {
32291 TEST_REQUIRES_X86_AVX;
32292 for (uint32_t n = 32; n <= 48; n += 16) {
32293 for (size_t k = 1; k <= 5; k += 2) {
32294 GemmMicrokernelTester()
32295 .mr(3)
32296 .nr(16)
32297 .kr(1)
32298 .sr(1)
32299 .m(3)
32300 .n(16)
32301 .k(k)
32302 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32303 }
32304 }
32305 }
32306
32307 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
32308 TEST_REQUIRES_X86_AVX;
32309 for (uint32_t n = 32; n <= 48; n += 16) {
32310 for (size_t k = 1; k <= 5; k += 2) {
32311 GemmMicrokernelTester()
32312 .mr(3)
32313 .nr(16)
32314 .kr(1)
32315 .sr(1)
32316 .m(3)
32317 .n(n)
32318 .k(k)
32319 .cn_stride(19)
32320 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32321 }
32322 }
32323 }
32324
32325 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16_strided_a) {
32326 TEST_REQUIRES_X86_AVX;
32327 for (uint32_t n = 32; n <= 48; n += 16) {
32328 for (size_t k = 1; k <= 5; k += 2) {
32329 GemmMicrokernelTester()
32330 .mr(3)
32331 .nr(16)
32332 .kr(1)
32333 .sr(1)
32334 .m(3)
32335 .n(n)
32336 .k(k)
32337 .a_stride(7)
32338 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32339 }
32340 }
32341 }
32342
32343 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16_subtile) {
32344 TEST_REQUIRES_X86_AVX;
32345 for (uint32_t n = 32; n <= 48; n += 16) {
32346 for (size_t k = 1; k <= 5; k += 2) {
32347 for (uint32_t m = 1; m <= 3; m++) {
32348 GemmMicrokernelTester()
32349 .mr(3)
32350 .nr(16)
32351 .kr(1)
32352 .sr(1)
32353 .m(m)
32354 .n(n)
32355 .k(k)
32356 .iterations(1)
32357 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32358 }
32359 }
32360 }
32361 }
32362
32363 TEST(F32_GEMM_3X16__AVX_BROADCAST, strided_cm_subtile) {
32364 TEST_REQUIRES_X86_AVX;
32365 for (size_t k = 1; k <= 5; k += 2) {
32366 for (uint32_t m = 1; m <= 3; m++) {
32367 for (uint32_t n = 1; n <= 16; n++) {
32368 GemmMicrokernelTester()
32369 .mr(3)
32370 .nr(16)
32371 .kr(1)
32372 .sr(1)
32373 .m(m)
32374 .n(n)
32375 .k(k)
32376 .cm_stride(19)
32377 .iterations(1)
32378 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32379 }
32380 }
32381 }
32382 }
32383
32384 TEST(F32_GEMM_3X16__AVX_BROADCAST, qmin) {
32385 TEST_REQUIRES_X86_AVX;
32386 GemmMicrokernelTester()
32387 .mr(3)
32388 .nr(16)
32389 .kr(1)
32390 .sr(1)
32391 .m(3)
32392 .n(16)
32393 .k(1)
32394 .qmin(128)
32395 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32396 }
32397
32398 TEST(F32_GEMM_3X16__AVX_BROADCAST, qmax) {
32399 TEST_REQUIRES_X86_AVX;
32400 GemmMicrokernelTester()
32401 .mr(3)
32402 .nr(16)
32403 .kr(1)
32404 .sr(1)
32405 .m(3)
32406 .n(16)
32407 .k(1)
32408 .qmax(128)
32409 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32410 }
32411
32412 TEST(F32_GEMM_3X16__AVX_BROADCAST, strided_cm) {
32413 TEST_REQUIRES_X86_AVX;
32414 GemmMicrokernelTester()
32415 .mr(3)
32416 .nr(16)
32417 .kr(1)
32418 .sr(1)
32419 .m(3)
32420 .n(16)
32421 .k(1)
32422 .cm_stride(19)
32423 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
32424 }
32425#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32426
32427
32428#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32429 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1) {
32430 TEST_REQUIRES_X86_AVX;
32431 GemmMicrokernelTester()
32432 .mr(4)
32433 .nr(16)
32434 .kr(1)
32435 .sr(1)
32436 .m(4)
32437 .n(16)
32438 .k(1)
32439 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32440 }
32441
32442 TEST(F32_GEMM_4X16__AVX_BROADCAST, strided_cn) {
32443 TEST_REQUIRES_X86_AVX;
32444 GemmMicrokernelTester()
32445 .mr(4)
32446 .nr(16)
32447 .kr(1)
32448 .sr(1)
32449 .m(4)
32450 .n(16)
32451 .k(1)
32452 .cn_stride(19)
32453 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32454 }
32455
32456 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_strided_a) {
32457 TEST_REQUIRES_X86_AVX;
32458 GemmMicrokernelTester()
32459 .mr(4)
32460 .nr(16)
32461 .kr(1)
32462 .sr(1)
32463 .m(4)
32464 .n(16)
32465 .k(1)
32466 .a_stride(3)
32467 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32468 }
32469
32470 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_subtile) {
32471 TEST_REQUIRES_X86_AVX;
32472 for (uint32_t m = 1; m <= 4; m++) {
32473 for (uint32_t n = 1; n <= 16; n++) {
32474 GemmMicrokernelTester()
32475 .mr(4)
32476 .nr(16)
32477 .kr(1)
32478 .sr(1)
32479 .m(m)
32480 .n(n)
32481 .k(1)
32482 .iterations(1)
32483 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32484 }
32485 }
32486 }
32487
32488 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_m) {
32489 TEST_REQUIRES_X86_AVX;
32490 for (uint32_t m = 1; m <= 4; m++) {
32491 GemmMicrokernelTester()
32492 .mr(4)
32493 .nr(16)
32494 .kr(1)
32495 .sr(1)
32496 .m(m)
32497 .n(16)
32498 .k(1)
32499 .iterations(1)
32500 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32501 }
32502 }
32503
32504 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_n) {
32505 TEST_REQUIRES_X86_AVX;
32506 for (uint32_t n = 1; n <= 16; n++) {
32507 GemmMicrokernelTester()
32508 .mr(4)
32509 .nr(16)
32510 .kr(1)
32511 .sr(1)
32512 .m(4)
32513 .n(n)
32514 .k(1)
32515 .iterations(1)
32516 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32517 }
32518 }
32519
32520 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_gt_1) {
32521 TEST_REQUIRES_X86_AVX;
32522 for (size_t k = 2; k < 10; k++) {
32523 GemmMicrokernelTester()
32524 .mr(4)
32525 .nr(16)
32526 .kr(1)
32527 .sr(1)
32528 .m(4)
32529 .n(16)
32530 .k(k)
32531 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32532 }
32533 }
32534
32535 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_gt_1_strided_a) {
32536 TEST_REQUIRES_X86_AVX;
32537 for (size_t k = 2; k < 10; k++) {
32538 GemmMicrokernelTester()
32539 .mr(4)
32540 .nr(16)
32541 .kr(1)
32542 .sr(1)
32543 .m(4)
32544 .n(16)
32545 .k(k)
32546 .a_stride(11)
32547 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32548 }
32549 }
32550
32551 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_gt_1_subtile) {
32552 TEST_REQUIRES_X86_AVX;
32553 for (size_t k = 2; k < 10; k++) {
32554 for (uint32_t m = 1; m <= 4; m++) {
32555 for (uint32_t n = 1; n <= 16; n++) {
32556 GemmMicrokernelTester()
32557 .mr(4)
32558 .nr(16)
32559 .kr(1)
32560 .sr(1)
32561 .m(m)
32562 .n(n)
32563 .k(k)
32564 .iterations(1)
32565 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32566 }
32567 }
32568 }
32569 }
32570
32571 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16) {
32572 TEST_REQUIRES_X86_AVX;
32573 for (uint32_t n = 17; n < 32; n++) {
32574 for (size_t k = 1; k <= 5; k += 2) {
32575 GemmMicrokernelTester()
32576 .mr(4)
32577 .nr(16)
32578 .kr(1)
32579 .sr(1)
32580 .m(4)
32581 .n(16)
32582 .k(k)
32583 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32584 }
32585 }
32586 }
32587
32588 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16_strided_cn) {
32589 TEST_REQUIRES_X86_AVX;
32590 for (uint32_t n = 17; n < 32; n++) {
32591 for (size_t k = 1; k <= 5; k += 2) {
32592 GemmMicrokernelTester()
32593 .mr(4)
32594 .nr(16)
32595 .kr(1)
32596 .sr(1)
32597 .m(4)
32598 .n(16)
32599 .k(k)
32600 .cn_stride(19)
32601 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32602 }
32603 }
32604 }
32605
32606 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16_strided_a) {
32607 TEST_REQUIRES_X86_AVX;
32608 for (uint32_t n = 17; n < 32; n++) {
32609 for (size_t k = 1; k <= 5; k += 2) {
32610 GemmMicrokernelTester()
32611 .mr(4)
32612 .nr(16)
32613 .kr(1)
32614 .sr(1)
32615 .m(4)
32616 .n(n)
32617 .k(k)
32618 .a_stride(7)
32619 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32620 }
32621 }
32622 }
32623
32624 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16_subtile) {
32625 TEST_REQUIRES_X86_AVX;
32626 for (uint32_t n = 17; n < 32; n++) {
32627 for (size_t k = 1; k <= 5; k += 2) {
32628 for (uint32_t m = 1; m <= 4; m++) {
32629 GemmMicrokernelTester()
32630 .mr(4)
32631 .nr(16)
32632 .kr(1)
32633 .sr(1)
32634 .m(m)
32635 .n(n)
32636 .k(k)
32637 .iterations(1)
32638 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32639 }
32640 }
32641 }
32642 }
32643
32644 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16) {
32645 TEST_REQUIRES_X86_AVX;
32646 for (uint32_t n = 32; n <= 48; n += 16) {
32647 for (size_t k = 1; k <= 5; k += 2) {
32648 GemmMicrokernelTester()
32649 .mr(4)
32650 .nr(16)
32651 .kr(1)
32652 .sr(1)
32653 .m(4)
32654 .n(16)
32655 .k(k)
32656 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32657 }
32658 }
32659 }
32660
32661 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16_strided_cn) {
32662 TEST_REQUIRES_X86_AVX;
32663 for (uint32_t n = 32; n <= 48; n += 16) {
32664 for (size_t k = 1; k <= 5; k += 2) {
32665 GemmMicrokernelTester()
32666 .mr(4)
32667 .nr(16)
32668 .kr(1)
32669 .sr(1)
32670 .m(4)
32671 .n(n)
32672 .k(k)
32673 .cn_stride(19)
32674 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32675 }
32676 }
32677 }
32678
32679 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16_strided_a) {
32680 TEST_REQUIRES_X86_AVX;
32681 for (uint32_t n = 32; n <= 48; n += 16) {
32682 for (size_t k = 1; k <= 5; k += 2) {
32683 GemmMicrokernelTester()
32684 .mr(4)
32685 .nr(16)
32686 .kr(1)
32687 .sr(1)
32688 .m(4)
32689 .n(n)
32690 .k(k)
32691 .a_stride(7)
32692 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32693 }
32694 }
32695 }
32696
32697 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16_subtile) {
32698 TEST_REQUIRES_X86_AVX;
32699 for (uint32_t n = 32; n <= 48; n += 16) {
32700 for (size_t k = 1; k <= 5; k += 2) {
32701 for (uint32_t m = 1; m <= 4; m++) {
32702 GemmMicrokernelTester()
32703 .mr(4)
32704 .nr(16)
32705 .kr(1)
32706 .sr(1)
32707 .m(m)
32708 .n(n)
32709 .k(k)
32710 .iterations(1)
32711 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32712 }
32713 }
32714 }
32715 }
32716
32717 TEST(F32_GEMM_4X16__AVX_BROADCAST, strided_cm_subtile) {
32718 TEST_REQUIRES_X86_AVX;
32719 for (size_t k = 1; k <= 5; k += 2) {
32720 for (uint32_t m = 1; m <= 4; m++) {
32721 for (uint32_t n = 1; n <= 16; n++) {
32722 GemmMicrokernelTester()
32723 .mr(4)
32724 .nr(16)
32725 .kr(1)
32726 .sr(1)
32727 .m(m)
32728 .n(n)
32729 .k(k)
32730 .cm_stride(19)
32731 .iterations(1)
32732 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32733 }
32734 }
32735 }
32736 }
32737
32738 TEST(F32_GEMM_4X16__AVX_BROADCAST, qmin) {
32739 TEST_REQUIRES_X86_AVX;
32740 GemmMicrokernelTester()
32741 .mr(4)
32742 .nr(16)
32743 .kr(1)
32744 .sr(1)
32745 .m(4)
32746 .n(16)
32747 .k(1)
32748 .qmin(128)
32749 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32750 }
32751
32752 TEST(F32_GEMM_4X16__AVX_BROADCAST, qmax) {
32753 TEST_REQUIRES_X86_AVX;
32754 GemmMicrokernelTester()
32755 .mr(4)
32756 .nr(16)
32757 .kr(1)
32758 .sr(1)
32759 .m(4)
32760 .n(16)
32761 .k(1)
32762 .qmax(128)
32763 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32764 }
32765
32766 TEST(F32_GEMM_4X16__AVX_BROADCAST, strided_cm) {
32767 TEST_REQUIRES_X86_AVX;
32768 GemmMicrokernelTester()
32769 .mr(4)
32770 .nr(16)
32771 .kr(1)
32772 .sr(1)
32773 .m(4)
32774 .n(16)
32775 .k(1)
32776 .cm_stride(19)
32777 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
32778 }
32779#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32780
32781
32782#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32783 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1) {
32784 TEST_REQUIRES_X86_AVX;
32785 GemmMicrokernelTester()
32786 .mr(5)
32787 .nr(16)
32788 .kr(1)
32789 .sr(1)
32790 .m(5)
32791 .n(16)
32792 .k(1)
32793 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32794 }
32795
32796 TEST(F32_GEMM_5X16__AVX_BROADCAST, strided_cn) {
32797 TEST_REQUIRES_X86_AVX;
32798 GemmMicrokernelTester()
32799 .mr(5)
32800 .nr(16)
32801 .kr(1)
32802 .sr(1)
32803 .m(5)
32804 .n(16)
32805 .k(1)
32806 .cn_stride(19)
32807 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32808 }
32809
32810 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_strided_a) {
32811 TEST_REQUIRES_X86_AVX;
32812 GemmMicrokernelTester()
32813 .mr(5)
32814 .nr(16)
32815 .kr(1)
32816 .sr(1)
32817 .m(5)
32818 .n(16)
32819 .k(1)
32820 .a_stride(3)
32821 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32822 }
32823
32824 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_subtile) {
32825 TEST_REQUIRES_X86_AVX;
32826 for (uint32_t m = 1; m <= 5; m++) {
32827 for (uint32_t n = 1; n <= 16; n++) {
32828 GemmMicrokernelTester()
32829 .mr(5)
32830 .nr(16)
32831 .kr(1)
32832 .sr(1)
32833 .m(m)
32834 .n(n)
32835 .k(1)
32836 .iterations(1)
32837 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32838 }
32839 }
32840 }
32841
32842 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_m) {
32843 TEST_REQUIRES_X86_AVX;
32844 for (uint32_t m = 1; m <= 5; m++) {
32845 GemmMicrokernelTester()
32846 .mr(5)
32847 .nr(16)
32848 .kr(1)
32849 .sr(1)
32850 .m(m)
32851 .n(16)
32852 .k(1)
32853 .iterations(1)
32854 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32855 }
32856 }
32857
32858 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_n) {
32859 TEST_REQUIRES_X86_AVX;
32860 for (uint32_t n = 1; n <= 16; n++) {
32861 GemmMicrokernelTester()
32862 .mr(5)
32863 .nr(16)
32864 .kr(1)
32865 .sr(1)
32866 .m(5)
32867 .n(n)
32868 .k(1)
32869 .iterations(1)
32870 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32871 }
32872 }
32873
32874 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_gt_1) {
32875 TEST_REQUIRES_X86_AVX;
32876 for (size_t k = 2; k < 10; k++) {
32877 GemmMicrokernelTester()
32878 .mr(5)
32879 .nr(16)
32880 .kr(1)
32881 .sr(1)
32882 .m(5)
32883 .n(16)
32884 .k(k)
32885 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32886 }
32887 }
32888
32889 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_gt_1_strided_a) {
32890 TEST_REQUIRES_X86_AVX;
32891 for (size_t k = 2; k < 10; k++) {
32892 GemmMicrokernelTester()
32893 .mr(5)
32894 .nr(16)
32895 .kr(1)
32896 .sr(1)
32897 .m(5)
32898 .n(16)
32899 .k(k)
32900 .a_stride(11)
32901 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32902 }
32903 }
32904
32905 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_gt_1_subtile) {
32906 TEST_REQUIRES_X86_AVX;
32907 for (size_t k = 2; k < 10; k++) {
32908 for (uint32_t m = 1; m <= 5; m++) {
32909 for (uint32_t n = 1; n <= 16; n++) {
32910 GemmMicrokernelTester()
32911 .mr(5)
32912 .nr(16)
32913 .kr(1)
32914 .sr(1)
32915 .m(m)
32916 .n(n)
32917 .k(k)
32918 .iterations(1)
32919 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32920 }
32921 }
32922 }
32923 }
32924
32925 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16) {
32926 TEST_REQUIRES_X86_AVX;
32927 for (uint32_t n = 17; n < 32; n++) {
32928 for (size_t k = 1; k <= 5; k += 2) {
32929 GemmMicrokernelTester()
32930 .mr(5)
32931 .nr(16)
32932 .kr(1)
32933 .sr(1)
32934 .m(5)
32935 .n(16)
32936 .k(k)
32937 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32938 }
32939 }
32940 }
32941
32942 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16_strided_cn) {
32943 TEST_REQUIRES_X86_AVX;
32944 for (uint32_t n = 17; n < 32; n++) {
32945 for (size_t k = 1; k <= 5; k += 2) {
32946 GemmMicrokernelTester()
32947 .mr(5)
32948 .nr(16)
32949 .kr(1)
32950 .sr(1)
32951 .m(5)
32952 .n(16)
32953 .k(k)
32954 .cn_stride(19)
32955 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32956 }
32957 }
32958 }
32959
32960 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16_strided_a) {
32961 TEST_REQUIRES_X86_AVX;
32962 for (uint32_t n = 17; n < 32; n++) {
32963 for (size_t k = 1; k <= 5; k += 2) {
32964 GemmMicrokernelTester()
32965 .mr(5)
32966 .nr(16)
32967 .kr(1)
32968 .sr(1)
32969 .m(5)
32970 .n(n)
32971 .k(k)
32972 .a_stride(7)
32973 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32974 }
32975 }
32976 }
32977
32978 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16_subtile) {
32979 TEST_REQUIRES_X86_AVX;
32980 for (uint32_t n = 17; n < 32; n++) {
32981 for (size_t k = 1; k <= 5; k += 2) {
32982 for (uint32_t m = 1; m <= 5; m++) {
32983 GemmMicrokernelTester()
32984 .mr(5)
32985 .nr(16)
32986 .kr(1)
32987 .sr(1)
32988 .m(m)
32989 .n(n)
32990 .k(k)
32991 .iterations(1)
32992 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
32993 }
32994 }
32995 }
32996 }
32997
32998 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16) {
32999 TEST_REQUIRES_X86_AVX;
33000 for (uint32_t n = 32; n <= 48; n += 16) {
33001 for (size_t k = 1; k <= 5; k += 2) {
33002 GemmMicrokernelTester()
33003 .mr(5)
33004 .nr(16)
33005 .kr(1)
33006 .sr(1)
33007 .m(5)
33008 .n(16)
33009 .k(k)
33010 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33011 }
33012 }
33013 }
33014
33015 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16_strided_cn) {
33016 TEST_REQUIRES_X86_AVX;
33017 for (uint32_t n = 32; n <= 48; n += 16) {
33018 for (size_t k = 1; k <= 5; k += 2) {
33019 GemmMicrokernelTester()
33020 .mr(5)
33021 .nr(16)
33022 .kr(1)
33023 .sr(1)
33024 .m(5)
33025 .n(n)
33026 .k(k)
33027 .cn_stride(19)
33028 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33029 }
33030 }
33031 }
33032
33033 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16_strided_a) {
33034 TEST_REQUIRES_X86_AVX;
33035 for (uint32_t n = 32; n <= 48; n += 16) {
33036 for (size_t k = 1; k <= 5; k += 2) {
33037 GemmMicrokernelTester()
33038 .mr(5)
33039 .nr(16)
33040 .kr(1)
33041 .sr(1)
33042 .m(5)
33043 .n(n)
33044 .k(k)
33045 .a_stride(7)
33046 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33047 }
33048 }
33049 }
33050
33051 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16_subtile) {
33052 TEST_REQUIRES_X86_AVX;
33053 for (uint32_t n = 32; n <= 48; n += 16) {
33054 for (size_t k = 1; k <= 5; k += 2) {
33055 for (uint32_t m = 1; m <= 5; m++) {
33056 GemmMicrokernelTester()
33057 .mr(5)
33058 .nr(16)
33059 .kr(1)
33060 .sr(1)
33061 .m(m)
33062 .n(n)
33063 .k(k)
33064 .iterations(1)
33065 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33066 }
33067 }
33068 }
33069 }
33070
33071 TEST(F32_GEMM_5X16__AVX_BROADCAST, strided_cm_subtile) {
33072 TEST_REQUIRES_X86_AVX;
33073 for (size_t k = 1; k <= 5; k += 2) {
33074 for (uint32_t m = 1; m <= 5; m++) {
33075 for (uint32_t n = 1; n <= 16; n++) {
33076 GemmMicrokernelTester()
33077 .mr(5)
33078 .nr(16)
33079 .kr(1)
33080 .sr(1)
33081 .m(m)
33082 .n(n)
33083 .k(k)
33084 .cm_stride(19)
33085 .iterations(1)
33086 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33087 }
33088 }
33089 }
33090 }
33091
33092 TEST(F32_GEMM_5X16__AVX_BROADCAST, qmin) {
33093 TEST_REQUIRES_X86_AVX;
33094 GemmMicrokernelTester()
33095 .mr(5)
33096 .nr(16)
33097 .kr(1)
33098 .sr(1)
33099 .m(5)
33100 .n(16)
33101 .k(1)
33102 .qmin(128)
33103 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33104 }
33105
33106 TEST(F32_GEMM_5X16__AVX_BROADCAST, qmax) {
33107 TEST_REQUIRES_X86_AVX;
33108 GemmMicrokernelTester()
33109 .mr(5)
33110 .nr(16)
33111 .kr(1)
33112 .sr(1)
33113 .m(5)
33114 .n(16)
33115 .k(1)
33116 .qmax(128)
33117 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33118 }
33119
33120 TEST(F32_GEMM_5X16__AVX_BROADCAST, strided_cm) {
33121 TEST_REQUIRES_X86_AVX;
33122 GemmMicrokernelTester()
33123 .mr(5)
33124 .nr(16)
33125 .kr(1)
33126 .sr(1)
33127 .m(5)
33128 .n(16)
33129 .k(1)
33130 .cm_stride(19)
33131 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
33132 }
33133#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33134
33135
33136#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33137 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1) {
33138 TEST_REQUIRES_X86_FMA3;
33139 GemmMicrokernelTester()
33140 .mr(1)
33141 .nr(8)
33142 .kr(1)
33143 .sr(1)
33144 .m(1)
33145 .n(8)
33146 .k(1)
33147 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33148 }
33149
33150 TEST(F32_GEMM_1X8__FMA3_BROADCAST, strided_cn) {
33151 TEST_REQUIRES_X86_FMA3;
33152 GemmMicrokernelTester()
33153 .mr(1)
33154 .nr(8)
33155 .kr(1)
33156 .sr(1)
33157 .m(1)
33158 .n(8)
33159 .k(1)
33160 .cn_stride(11)
33161 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33162 }
33163
33164 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_strided_a) {
33165 TEST_REQUIRES_X86_FMA3;
33166 GemmMicrokernelTester()
33167 .mr(1)
33168 .nr(8)
33169 .kr(1)
33170 .sr(1)
33171 .m(1)
33172 .n(8)
33173 .k(1)
33174 .a_stride(3)
33175 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33176 }
33177
33178 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
33179 TEST_REQUIRES_X86_FMA3;
33180 for (uint32_t m = 1; m <= 1; m++) {
33181 for (uint32_t n = 1; n <= 8; n++) {
33182 GemmMicrokernelTester()
33183 .mr(1)
33184 .nr(8)
33185 .kr(1)
33186 .sr(1)
33187 .m(m)
33188 .n(n)
33189 .k(1)
33190 .iterations(1)
33191 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33192 }
33193 }
33194 }
33195
33196 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
33197 TEST_REQUIRES_X86_FMA3;
33198 for (uint32_t m = 1; m <= 1; m++) {
33199 GemmMicrokernelTester()
33200 .mr(1)
33201 .nr(8)
33202 .kr(1)
33203 .sr(1)
33204 .m(m)
33205 .n(8)
33206 .k(1)
33207 .iterations(1)
33208 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33209 }
33210 }
33211
33212 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
33213 TEST_REQUIRES_X86_FMA3;
33214 for (uint32_t n = 1; n <= 8; n++) {
33215 GemmMicrokernelTester()
33216 .mr(1)
33217 .nr(8)
33218 .kr(1)
33219 .sr(1)
33220 .m(1)
33221 .n(n)
33222 .k(1)
33223 .iterations(1)
33224 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33225 }
33226 }
33227
33228 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_gt_1) {
33229 TEST_REQUIRES_X86_FMA3;
33230 for (size_t k = 2; k < 10; k++) {
33231 GemmMicrokernelTester()
33232 .mr(1)
33233 .nr(8)
33234 .kr(1)
33235 .sr(1)
33236 .m(1)
33237 .n(8)
33238 .k(k)
33239 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33240 }
33241 }
33242
33243 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_gt_1_strided_a) {
33244 TEST_REQUIRES_X86_FMA3;
33245 for (size_t k = 2; k < 10; k++) {
33246 GemmMicrokernelTester()
33247 .mr(1)
33248 .nr(8)
33249 .kr(1)
33250 .sr(1)
33251 .m(1)
33252 .n(8)
33253 .k(k)
33254 .a_stride(11)
33255 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33256 }
33257 }
33258
33259 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
33260 TEST_REQUIRES_X86_FMA3;
33261 for (size_t k = 2; k < 10; k++) {
33262 for (uint32_t m = 1; m <= 1; m++) {
33263 for (uint32_t n = 1; n <= 8; n++) {
33264 GemmMicrokernelTester()
33265 .mr(1)
33266 .nr(8)
33267 .kr(1)
33268 .sr(1)
33269 .m(m)
33270 .n(n)
33271 .k(k)
33272 .iterations(1)
33273 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33274 }
33275 }
33276 }
33277 }
33278
33279 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8) {
33280 TEST_REQUIRES_X86_FMA3;
33281 for (uint32_t n = 9; n < 16; n++) {
33282 for (size_t k = 1; k <= 5; k += 2) {
33283 GemmMicrokernelTester()
33284 .mr(1)
33285 .nr(8)
33286 .kr(1)
33287 .sr(1)
33288 .m(1)
33289 .n(8)
33290 .k(k)
33291 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33292 }
33293 }
33294 }
33295
33296 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
33297 TEST_REQUIRES_X86_FMA3;
33298 for (uint32_t n = 9; n < 16; n++) {
33299 for (size_t k = 1; k <= 5; k += 2) {
33300 GemmMicrokernelTester()
33301 .mr(1)
33302 .nr(8)
33303 .kr(1)
33304 .sr(1)
33305 .m(1)
33306 .n(8)
33307 .k(k)
33308 .cn_stride(11)
33309 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33310 }
33311 }
33312 }
33313
33314 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8_strided_a) {
33315 TEST_REQUIRES_X86_FMA3;
33316 for (uint32_t n = 9; n < 16; n++) {
33317 for (size_t k = 1; k <= 5; k += 2) {
33318 GemmMicrokernelTester()
33319 .mr(1)
33320 .nr(8)
33321 .kr(1)
33322 .sr(1)
33323 .m(1)
33324 .n(n)
33325 .k(k)
33326 .a_stride(7)
33327 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33328 }
33329 }
33330 }
33331
33332 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
33333 TEST_REQUIRES_X86_FMA3;
33334 for (uint32_t n = 9; n < 16; n++) {
33335 for (size_t k = 1; k <= 5; k += 2) {
33336 for (uint32_t m = 1; m <= 1; m++) {
33337 GemmMicrokernelTester()
33338 .mr(1)
33339 .nr(8)
33340 .kr(1)
33341 .sr(1)
33342 .m(m)
33343 .n(n)
33344 .k(k)
33345 .iterations(1)
33346 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33347 }
33348 }
33349 }
33350 }
33351
33352 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8) {
33353 TEST_REQUIRES_X86_FMA3;
33354 for (uint32_t n = 16; n <= 24; n += 8) {
33355 for (size_t k = 1; k <= 5; k += 2) {
33356 GemmMicrokernelTester()
33357 .mr(1)
33358 .nr(8)
33359 .kr(1)
33360 .sr(1)
33361 .m(1)
33362 .n(8)
33363 .k(k)
33364 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33365 }
33366 }
33367 }
33368
33369 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
33370 TEST_REQUIRES_X86_FMA3;
33371 for (uint32_t n = 16; n <= 24; n += 8) {
33372 for (size_t k = 1; k <= 5; k += 2) {
33373 GemmMicrokernelTester()
33374 .mr(1)
33375 .nr(8)
33376 .kr(1)
33377 .sr(1)
33378 .m(1)
33379 .n(n)
33380 .k(k)
33381 .cn_stride(11)
33382 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33383 }
33384 }
33385 }
33386
33387 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8_strided_a) {
33388 TEST_REQUIRES_X86_FMA3;
33389 for (uint32_t n = 16; n <= 24; n += 8) {
33390 for (size_t k = 1; k <= 5; k += 2) {
33391 GemmMicrokernelTester()
33392 .mr(1)
33393 .nr(8)
33394 .kr(1)
33395 .sr(1)
33396 .m(1)
33397 .n(n)
33398 .k(k)
33399 .a_stride(7)
33400 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33401 }
33402 }
33403 }
33404
33405 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8_subtile) {
33406 TEST_REQUIRES_X86_FMA3;
33407 for (uint32_t n = 16; n <= 24; n += 8) {
33408 for (size_t k = 1; k <= 5; k += 2) {
33409 for (uint32_t m = 1; m <= 1; m++) {
33410 GemmMicrokernelTester()
33411 .mr(1)
33412 .nr(8)
33413 .kr(1)
33414 .sr(1)
33415 .m(m)
33416 .n(n)
33417 .k(k)
33418 .iterations(1)
33419 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33420 }
33421 }
33422 }
33423 }
33424
33425 TEST(F32_GEMM_1X8__FMA3_BROADCAST, strided_cm_subtile) {
33426 TEST_REQUIRES_X86_FMA3;
33427 for (size_t k = 1; k <= 5; k += 2) {
33428 for (uint32_t m = 1; m <= 1; m++) {
33429 for (uint32_t n = 1; n <= 8; n++) {
33430 GemmMicrokernelTester()
33431 .mr(1)
33432 .nr(8)
33433 .kr(1)
33434 .sr(1)
33435 .m(m)
33436 .n(n)
33437 .k(k)
33438 .cm_stride(11)
33439 .iterations(1)
33440 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33441 }
33442 }
33443 }
33444 }
33445
33446 TEST(F32_GEMM_1X8__FMA3_BROADCAST, qmin) {
33447 TEST_REQUIRES_X86_FMA3;
33448 GemmMicrokernelTester()
33449 .mr(1)
33450 .nr(8)
33451 .kr(1)
33452 .sr(1)
33453 .m(1)
33454 .n(8)
33455 .k(1)
33456 .qmin(128)
33457 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33458 }
33459
33460 TEST(F32_GEMM_1X8__FMA3_BROADCAST, qmax) {
33461 TEST_REQUIRES_X86_FMA3;
33462 GemmMicrokernelTester()
33463 .mr(1)
33464 .nr(8)
33465 .kr(1)
33466 .sr(1)
33467 .m(1)
33468 .n(8)
33469 .k(1)
33470 .qmax(128)
33471 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33472 }
33473
33474 TEST(F32_GEMM_1X8__FMA3_BROADCAST, strided_cm) {
33475 TEST_REQUIRES_X86_FMA3;
33476 GemmMicrokernelTester()
33477 .mr(1)
33478 .nr(8)
33479 .kr(1)
33480 .sr(1)
33481 .m(1)
33482 .n(8)
33483 .k(1)
33484 .cm_stride(11)
33485 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
33486 }
33487#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33488
33489
33490#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33491 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1) {
33492 TEST_REQUIRES_X86_FMA3;
33493 GemmMicrokernelTester()
33494 .mr(4)
33495 .nr(8)
33496 .kr(1)
33497 .sr(1)
33498 .m(4)
33499 .n(8)
33500 .k(1)
33501 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33502 }
33503
33504 TEST(F32_GEMM_4X8__FMA3_BROADCAST, strided_cn) {
33505 TEST_REQUIRES_X86_FMA3;
33506 GemmMicrokernelTester()
33507 .mr(4)
33508 .nr(8)
33509 .kr(1)
33510 .sr(1)
33511 .m(4)
33512 .n(8)
33513 .k(1)
33514 .cn_stride(11)
33515 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33516 }
33517
33518 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_strided_a) {
33519 TEST_REQUIRES_X86_FMA3;
33520 GemmMicrokernelTester()
33521 .mr(4)
33522 .nr(8)
33523 .kr(1)
33524 .sr(1)
33525 .m(4)
33526 .n(8)
33527 .k(1)
33528 .a_stride(3)
33529 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33530 }
33531
33532 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
33533 TEST_REQUIRES_X86_FMA3;
33534 for (uint32_t m = 1; m <= 4; m++) {
33535 for (uint32_t n = 1; n <= 8; n++) {
33536 GemmMicrokernelTester()
33537 .mr(4)
33538 .nr(8)
33539 .kr(1)
33540 .sr(1)
33541 .m(m)
33542 .n(n)
33543 .k(1)
33544 .iterations(1)
33545 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33546 }
33547 }
33548 }
33549
33550 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
33551 TEST_REQUIRES_X86_FMA3;
33552 for (uint32_t m = 1; m <= 4; m++) {
33553 GemmMicrokernelTester()
33554 .mr(4)
33555 .nr(8)
33556 .kr(1)
33557 .sr(1)
33558 .m(m)
33559 .n(8)
33560 .k(1)
33561 .iterations(1)
33562 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33563 }
33564 }
33565
33566 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
33567 TEST_REQUIRES_X86_FMA3;
33568 for (uint32_t n = 1; n <= 8; n++) {
33569 GemmMicrokernelTester()
33570 .mr(4)
33571 .nr(8)
33572 .kr(1)
33573 .sr(1)
33574 .m(4)
33575 .n(n)
33576 .k(1)
33577 .iterations(1)
33578 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33579 }
33580 }
33581
33582 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_gt_1) {
33583 TEST_REQUIRES_X86_FMA3;
33584 for (size_t k = 2; k < 10; k++) {
33585 GemmMicrokernelTester()
33586 .mr(4)
33587 .nr(8)
33588 .kr(1)
33589 .sr(1)
33590 .m(4)
33591 .n(8)
33592 .k(k)
33593 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33594 }
33595 }
33596
33597 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_gt_1_strided_a) {
33598 TEST_REQUIRES_X86_FMA3;
33599 for (size_t k = 2; k < 10; k++) {
33600 GemmMicrokernelTester()
33601 .mr(4)
33602 .nr(8)
33603 .kr(1)
33604 .sr(1)
33605 .m(4)
33606 .n(8)
33607 .k(k)
33608 .a_stride(11)
33609 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33610 }
33611 }
33612
33613 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
33614 TEST_REQUIRES_X86_FMA3;
33615 for (size_t k = 2; k < 10; k++) {
33616 for (uint32_t m = 1; m <= 4; m++) {
33617 for (uint32_t n = 1; n <= 8; n++) {
33618 GemmMicrokernelTester()
33619 .mr(4)
33620 .nr(8)
33621 .kr(1)
33622 .sr(1)
33623 .m(m)
33624 .n(n)
33625 .k(k)
33626 .iterations(1)
33627 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33628 }
33629 }
33630 }
33631 }
33632
33633 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8) {
33634 TEST_REQUIRES_X86_FMA3;
33635 for (uint32_t n = 9; n < 16; n++) {
33636 for (size_t k = 1; k <= 5; k += 2) {
33637 GemmMicrokernelTester()
33638 .mr(4)
33639 .nr(8)
33640 .kr(1)
33641 .sr(1)
33642 .m(4)
33643 .n(8)
33644 .k(k)
33645 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33646 }
33647 }
33648 }
33649
33650 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
33651 TEST_REQUIRES_X86_FMA3;
33652 for (uint32_t n = 9; n < 16; n++) {
33653 for (size_t k = 1; k <= 5; k += 2) {
33654 GemmMicrokernelTester()
33655 .mr(4)
33656 .nr(8)
33657 .kr(1)
33658 .sr(1)
33659 .m(4)
33660 .n(8)
33661 .k(k)
33662 .cn_stride(11)
33663 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33664 }
33665 }
33666 }
33667
33668 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8_strided_a) {
33669 TEST_REQUIRES_X86_FMA3;
33670 for (uint32_t n = 9; n < 16; n++) {
33671 for (size_t k = 1; k <= 5; k += 2) {
33672 GemmMicrokernelTester()
33673 .mr(4)
33674 .nr(8)
33675 .kr(1)
33676 .sr(1)
33677 .m(4)
33678 .n(n)
33679 .k(k)
33680 .a_stride(7)
33681 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33682 }
33683 }
33684 }
33685
33686 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
33687 TEST_REQUIRES_X86_FMA3;
33688 for (uint32_t n = 9; n < 16; n++) {
33689 for (size_t k = 1; k <= 5; k += 2) {
33690 for (uint32_t m = 1; m <= 4; m++) {
33691 GemmMicrokernelTester()
33692 .mr(4)
33693 .nr(8)
33694 .kr(1)
33695 .sr(1)
33696 .m(m)
33697 .n(n)
33698 .k(k)
33699 .iterations(1)
33700 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33701 }
33702 }
33703 }
33704 }
33705
33706 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8) {
33707 TEST_REQUIRES_X86_FMA3;
33708 for (uint32_t n = 16; n <= 24; n += 8) {
33709 for (size_t k = 1; k <= 5; k += 2) {
33710 GemmMicrokernelTester()
33711 .mr(4)
33712 .nr(8)
33713 .kr(1)
33714 .sr(1)
33715 .m(4)
33716 .n(8)
33717 .k(k)
33718 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33719 }
33720 }
33721 }
33722
33723 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
33724 TEST_REQUIRES_X86_FMA3;
33725 for (uint32_t n = 16; n <= 24; n += 8) {
33726 for (size_t k = 1; k <= 5; k += 2) {
33727 GemmMicrokernelTester()
33728 .mr(4)
33729 .nr(8)
33730 .kr(1)
33731 .sr(1)
33732 .m(4)
33733 .n(n)
33734 .k(k)
33735 .cn_stride(11)
33736 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33737 }
33738 }
33739 }
33740
33741 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8_strided_a) {
33742 TEST_REQUIRES_X86_FMA3;
33743 for (uint32_t n = 16; n <= 24; n += 8) {
33744 for (size_t k = 1; k <= 5; k += 2) {
33745 GemmMicrokernelTester()
33746 .mr(4)
33747 .nr(8)
33748 .kr(1)
33749 .sr(1)
33750 .m(4)
33751 .n(n)
33752 .k(k)
33753 .a_stride(7)
33754 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33755 }
33756 }
33757 }
33758
33759 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8_subtile) {
33760 TEST_REQUIRES_X86_FMA3;
33761 for (uint32_t n = 16; n <= 24; n += 8) {
33762 for (size_t k = 1; k <= 5; k += 2) {
33763 for (uint32_t m = 1; m <= 4; m++) {
33764 GemmMicrokernelTester()
33765 .mr(4)
33766 .nr(8)
33767 .kr(1)
33768 .sr(1)
33769 .m(m)
33770 .n(n)
33771 .k(k)
33772 .iterations(1)
33773 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33774 }
33775 }
33776 }
33777 }
33778
33779 TEST(F32_GEMM_4X8__FMA3_BROADCAST, strided_cm_subtile) {
33780 TEST_REQUIRES_X86_FMA3;
33781 for (size_t k = 1; k <= 5; k += 2) {
33782 for (uint32_t m = 1; m <= 4; m++) {
33783 for (uint32_t n = 1; n <= 8; n++) {
33784 GemmMicrokernelTester()
33785 .mr(4)
33786 .nr(8)
33787 .kr(1)
33788 .sr(1)
33789 .m(m)
33790 .n(n)
33791 .k(k)
33792 .cm_stride(11)
33793 .iterations(1)
33794 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33795 }
33796 }
33797 }
33798 }
33799
33800 TEST(F32_GEMM_4X8__FMA3_BROADCAST, qmin) {
33801 TEST_REQUIRES_X86_FMA3;
33802 GemmMicrokernelTester()
33803 .mr(4)
33804 .nr(8)
33805 .kr(1)
33806 .sr(1)
33807 .m(4)
33808 .n(8)
33809 .k(1)
33810 .qmin(128)
33811 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33812 }
33813
33814 TEST(F32_GEMM_4X8__FMA3_BROADCAST, qmax) {
33815 TEST_REQUIRES_X86_FMA3;
33816 GemmMicrokernelTester()
33817 .mr(4)
33818 .nr(8)
33819 .kr(1)
33820 .sr(1)
33821 .m(4)
33822 .n(8)
33823 .k(1)
33824 .qmax(128)
33825 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33826 }
33827
33828 TEST(F32_GEMM_4X8__FMA3_BROADCAST, strided_cm) {
33829 TEST_REQUIRES_X86_FMA3;
33830 GemmMicrokernelTester()
33831 .mr(4)
33832 .nr(8)
33833 .kr(1)
33834 .sr(1)
33835 .m(4)
33836 .n(8)
33837 .k(1)
33838 .cm_stride(11)
33839 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
33840 }
33841#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33842
33843
33844#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33845 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1) {
33846 TEST_REQUIRES_X86_FMA3;
33847 GemmMicrokernelTester()
33848 .mr(5)
33849 .nr(8)
33850 .kr(1)
33851 .sr(1)
33852 .m(5)
33853 .n(8)
33854 .k(1)
33855 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33856 }
33857
33858 TEST(F32_GEMM_5X8__FMA3_BROADCAST, strided_cn) {
33859 TEST_REQUIRES_X86_FMA3;
33860 GemmMicrokernelTester()
33861 .mr(5)
33862 .nr(8)
33863 .kr(1)
33864 .sr(1)
33865 .m(5)
33866 .n(8)
33867 .k(1)
33868 .cn_stride(11)
33869 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33870 }
33871
33872 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_strided_a) {
33873 TEST_REQUIRES_X86_FMA3;
33874 GemmMicrokernelTester()
33875 .mr(5)
33876 .nr(8)
33877 .kr(1)
33878 .sr(1)
33879 .m(5)
33880 .n(8)
33881 .k(1)
33882 .a_stride(3)
33883 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33884 }
33885
33886 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
33887 TEST_REQUIRES_X86_FMA3;
33888 for (uint32_t m = 1; m <= 5; m++) {
33889 for (uint32_t n = 1; n <= 8; n++) {
33890 GemmMicrokernelTester()
33891 .mr(5)
33892 .nr(8)
33893 .kr(1)
33894 .sr(1)
33895 .m(m)
33896 .n(n)
33897 .k(1)
33898 .iterations(1)
33899 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33900 }
33901 }
33902 }
33903
33904 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
33905 TEST_REQUIRES_X86_FMA3;
33906 for (uint32_t m = 1; m <= 5; m++) {
33907 GemmMicrokernelTester()
33908 .mr(5)
33909 .nr(8)
33910 .kr(1)
33911 .sr(1)
33912 .m(m)
33913 .n(8)
33914 .k(1)
33915 .iterations(1)
33916 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33917 }
33918 }
33919
33920 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
33921 TEST_REQUIRES_X86_FMA3;
33922 for (uint32_t n = 1; n <= 8; n++) {
33923 GemmMicrokernelTester()
33924 .mr(5)
33925 .nr(8)
33926 .kr(1)
33927 .sr(1)
33928 .m(5)
33929 .n(n)
33930 .k(1)
33931 .iterations(1)
33932 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33933 }
33934 }
33935
33936 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_gt_1) {
33937 TEST_REQUIRES_X86_FMA3;
33938 for (size_t k = 2; k < 10; k++) {
33939 GemmMicrokernelTester()
33940 .mr(5)
33941 .nr(8)
33942 .kr(1)
33943 .sr(1)
33944 .m(5)
33945 .n(8)
33946 .k(k)
33947 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33948 }
33949 }
33950
33951 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_gt_1_strided_a) {
33952 TEST_REQUIRES_X86_FMA3;
33953 for (size_t k = 2; k < 10; k++) {
33954 GemmMicrokernelTester()
33955 .mr(5)
33956 .nr(8)
33957 .kr(1)
33958 .sr(1)
33959 .m(5)
33960 .n(8)
33961 .k(k)
33962 .a_stride(11)
33963 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33964 }
33965 }
33966
33967 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
33968 TEST_REQUIRES_X86_FMA3;
33969 for (size_t k = 2; k < 10; k++) {
33970 for (uint32_t m = 1; m <= 5; m++) {
33971 for (uint32_t n = 1; n <= 8; n++) {
33972 GemmMicrokernelTester()
33973 .mr(5)
33974 .nr(8)
33975 .kr(1)
33976 .sr(1)
33977 .m(m)
33978 .n(n)
33979 .k(k)
33980 .iterations(1)
33981 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
33982 }
33983 }
33984 }
33985 }
33986
33987 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8) {
33988 TEST_REQUIRES_X86_FMA3;
33989 for (uint32_t n = 9; n < 16; n++) {
33990 for (size_t k = 1; k <= 5; k += 2) {
33991 GemmMicrokernelTester()
33992 .mr(5)
33993 .nr(8)
33994 .kr(1)
33995 .sr(1)
33996 .m(5)
33997 .n(8)
33998 .k(k)
33999 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34000 }
34001 }
34002 }
34003
34004 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
34005 TEST_REQUIRES_X86_FMA3;
34006 for (uint32_t n = 9; n < 16; n++) {
34007 for (size_t k = 1; k <= 5; k += 2) {
34008 GemmMicrokernelTester()
34009 .mr(5)
34010 .nr(8)
34011 .kr(1)
34012 .sr(1)
34013 .m(5)
34014 .n(8)
34015 .k(k)
34016 .cn_stride(11)
34017 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34018 }
34019 }
34020 }
34021
34022 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8_strided_a) {
34023 TEST_REQUIRES_X86_FMA3;
34024 for (uint32_t n = 9; n < 16; n++) {
34025 for (size_t k = 1; k <= 5; k += 2) {
34026 GemmMicrokernelTester()
34027 .mr(5)
34028 .nr(8)
34029 .kr(1)
34030 .sr(1)
34031 .m(5)
34032 .n(n)
34033 .k(k)
34034 .a_stride(7)
34035 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34036 }
34037 }
34038 }
34039
34040 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
34041 TEST_REQUIRES_X86_FMA3;
34042 for (uint32_t n = 9; n < 16; n++) {
34043 for (size_t k = 1; k <= 5; k += 2) {
34044 for (uint32_t m = 1; m <= 5; m++) {
34045 GemmMicrokernelTester()
34046 .mr(5)
34047 .nr(8)
34048 .kr(1)
34049 .sr(1)
34050 .m(m)
34051 .n(n)
34052 .k(k)
34053 .iterations(1)
34054 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34055 }
34056 }
34057 }
34058 }
34059
34060 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8) {
34061 TEST_REQUIRES_X86_FMA3;
34062 for (uint32_t n = 16; n <= 24; n += 8) {
34063 for (size_t k = 1; k <= 5; k += 2) {
34064 GemmMicrokernelTester()
34065 .mr(5)
34066 .nr(8)
34067 .kr(1)
34068 .sr(1)
34069 .m(5)
34070 .n(8)
34071 .k(k)
34072 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34073 }
34074 }
34075 }
34076
34077 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
34078 TEST_REQUIRES_X86_FMA3;
34079 for (uint32_t n = 16; n <= 24; n += 8) {
34080 for (size_t k = 1; k <= 5; k += 2) {
34081 GemmMicrokernelTester()
34082 .mr(5)
34083 .nr(8)
34084 .kr(1)
34085 .sr(1)
34086 .m(5)
34087 .n(n)
34088 .k(k)
34089 .cn_stride(11)
34090 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34091 }
34092 }
34093 }
34094
34095 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8_strided_a) {
34096 TEST_REQUIRES_X86_FMA3;
34097 for (uint32_t n = 16; n <= 24; n += 8) {
34098 for (size_t k = 1; k <= 5; k += 2) {
34099 GemmMicrokernelTester()
34100 .mr(5)
34101 .nr(8)
34102 .kr(1)
34103 .sr(1)
34104 .m(5)
34105 .n(n)
34106 .k(k)
34107 .a_stride(7)
34108 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34109 }
34110 }
34111 }
34112
34113 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8_subtile) {
34114 TEST_REQUIRES_X86_FMA3;
34115 for (uint32_t n = 16; n <= 24; n += 8) {
34116 for (size_t k = 1; k <= 5; k += 2) {
34117 for (uint32_t m = 1; m <= 5; m++) {
34118 GemmMicrokernelTester()
34119 .mr(5)
34120 .nr(8)
34121 .kr(1)
34122 .sr(1)
34123 .m(m)
34124 .n(n)
34125 .k(k)
34126 .iterations(1)
34127 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34128 }
34129 }
34130 }
34131 }
34132
34133 TEST(F32_GEMM_5X8__FMA3_BROADCAST, strided_cm_subtile) {
34134 TEST_REQUIRES_X86_FMA3;
34135 for (size_t k = 1; k <= 5; k += 2) {
34136 for (uint32_t m = 1; m <= 5; m++) {
34137 for (uint32_t n = 1; n <= 8; n++) {
34138 GemmMicrokernelTester()
34139 .mr(5)
34140 .nr(8)
34141 .kr(1)
34142 .sr(1)
34143 .m(m)
34144 .n(n)
34145 .k(k)
34146 .cm_stride(11)
34147 .iterations(1)
34148 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34149 }
34150 }
34151 }
34152 }
34153
34154 TEST(F32_GEMM_5X8__FMA3_BROADCAST, qmin) {
34155 TEST_REQUIRES_X86_FMA3;
34156 GemmMicrokernelTester()
34157 .mr(5)
34158 .nr(8)
34159 .kr(1)
34160 .sr(1)
34161 .m(5)
34162 .n(8)
34163 .k(1)
34164 .qmin(128)
34165 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34166 }
34167
34168 TEST(F32_GEMM_5X8__FMA3_BROADCAST, qmax) {
34169 TEST_REQUIRES_X86_FMA3;
34170 GemmMicrokernelTester()
34171 .mr(5)
34172 .nr(8)
34173 .kr(1)
34174 .sr(1)
34175 .m(5)
34176 .n(8)
34177 .k(1)
34178 .qmax(128)
34179 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34180 }
34181
34182 TEST(F32_GEMM_5X8__FMA3_BROADCAST, strided_cm) {
34183 TEST_REQUIRES_X86_FMA3;
34184 GemmMicrokernelTester()
34185 .mr(5)
34186 .nr(8)
34187 .kr(1)
34188 .sr(1)
34189 .m(5)
34190 .n(8)
34191 .k(1)
34192 .cm_stride(11)
34193 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
34194 }
34195#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34196
34197
34198#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34199 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1) {
34200 TEST_REQUIRES_X86_FMA3;
34201 GemmMicrokernelTester()
34202 .mr(6)
34203 .nr(8)
34204 .kr(1)
34205 .sr(1)
34206 .m(6)
34207 .n(8)
34208 .k(1)
34209 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34210 }
34211
34212 TEST(F32_GEMM_6X8__FMA3_BROADCAST, strided_cn) {
34213 TEST_REQUIRES_X86_FMA3;
34214 GemmMicrokernelTester()
34215 .mr(6)
34216 .nr(8)
34217 .kr(1)
34218 .sr(1)
34219 .m(6)
34220 .n(8)
34221 .k(1)
34222 .cn_stride(11)
34223 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34224 }
34225
34226 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_strided_a) {
34227 TEST_REQUIRES_X86_FMA3;
34228 GemmMicrokernelTester()
34229 .mr(6)
34230 .nr(8)
34231 .kr(1)
34232 .sr(1)
34233 .m(6)
34234 .n(8)
34235 .k(1)
34236 .a_stride(3)
34237 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34238 }
34239
34240 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile) {
34241 TEST_REQUIRES_X86_FMA3;
34242 for (uint32_t m = 1; m <= 6; m++) {
34243 for (uint32_t n = 1; n <= 8; n++) {
34244 GemmMicrokernelTester()
34245 .mr(6)
34246 .nr(8)
34247 .kr(1)
34248 .sr(1)
34249 .m(m)
34250 .n(n)
34251 .k(1)
34252 .iterations(1)
34253 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34254 }
34255 }
34256 }
34257
34258 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
34259 TEST_REQUIRES_X86_FMA3;
34260 for (uint32_t m = 1; m <= 6; m++) {
34261 GemmMicrokernelTester()
34262 .mr(6)
34263 .nr(8)
34264 .kr(1)
34265 .sr(1)
34266 .m(m)
34267 .n(8)
34268 .k(1)
34269 .iterations(1)
34270 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34271 }
34272 }
34273
34274 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
34275 TEST_REQUIRES_X86_FMA3;
34276 for (uint32_t n = 1; n <= 8; n++) {
34277 GemmMicrokernelTester()
34278 .mr(6)
34279 .nr(8)
34280 .kr(1)
34281 .sr(1)
34282 .m(6)
34283 .n(n)
34284 .k(1)
34285 .iterations(1)
34286 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34287 }
34288 }
34289
34290 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_gt_1) {
34291 TEST_REQUIRES_X86_FMA3;
34292 for (size_t k = 2; k < 10; k++) {
34293 GemmMicrokernelTester()
34294 .mr(6)
34295 .nr(8)
34296 .kr(1)
34297 .sr(1)
34298 .m(6)
34299 .n(8)
34300 .k(k)
34301 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34302 }
34303 }
34304
34305 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_gt_1_strided_a) {
34306 TEST_REQUIRES_X86_FMA3;
34307 for (size_t k = 2; k < 10; k++) {
34308 GemmMicrokernelTester()
34309 .mr(6)
34310 .nr(8)
34311 .kr(1)
34312 .sr(1)
34313 .m(6)
34314 .n(8)
34315 .k(k)
34316 .a_stride(11)
34317 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34318 }
34319 }
34320
34321 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_gt_1_subtile) {
34322 TEST_REQUIRES_X86_FMA3;
34323 for (size_t k = 2; k < 10; k++) {
34324 for (uint32_t m = 1; m <= 6; m++) {
34325 for (uint32_t n = 1; n <= 8; n++) {
34326 GemmMicrokernelTester()
34327 .mr(6)
34328 .nr(8)
34329 .kr(1)
34330 .sr(1)
34331 .m(m)
34332 .n(n)
34333 .k(k)
34334 .iterations(1)
34335 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34336 }
34337 }
34338 }
34339 }
34340
34341 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8) {
34342 TEST_REQUIRES_X86_FMA3;
34343 for (uint32_t n = 9; n < 16; n++) {
34344 for (size_t k = 1; k <= 5; k += 2) {
34345 GemmMicrokernelTester()
34346 .mr(6)
34347 .nr(8)
34348 .kr(1)
34349 .sr(1)
34350 .m(6)
34351 .n(8)
34352 .k(k)
34353 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34354 }
34355 }
34356 }
34357
34358 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
34359 TEST_REQUIRES_X86_FMA3;
34360 for (uint32_t n = 9; n < 16; n++) {
34361 for (size_t k = 1; k <= 5; k += 2) {
34362 GemmMicrokernelTester()
34363 .mr(6)
34364 .nr(8)
34365 .kr(1)
34366 .sr(1)
34367 .m(6)
34368 .n(8)
34369 .k(k)
34370 .cn_stride(11)
34371 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34372 }
34373 }
34374 }
34375
34376 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8_strided_a) {
34377 TEST_REQUIRES_X86_FMA3;
34378 for (uint32_t n = 9; n < 16; n++) {
34379 for (size_t k = 1; k <= 5; k += 2) {
34380 GemmMicrokernelTester()
34381 .mr(6)
34382 .nr(8)
34383 .kr(1)
34384 .sr(1)
34385 .m(6)
34386 .n(n)
34387 .k(k)
34388 .a_stride(7)
34389 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34390 }
34391 }
34392 }
34393
34394 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8_subtile) {
34395 TEST_REQUIRES_X86_FMA3;
34396 for (uint32_t n = 9; n < 16; n++) {
34397 for (size_t k = 1; k <= 5; k += 2) {
34398 for (uint32_t m = 1; m <= 6; m++) {
34399 GemmMicrokernelTester()
34400 .mr(6)
34401 .nr(8)
34402 .kr(1)
34403 .sr(1)
34404 .m(m)
34405 .n(n)
34406 .k(k)
34407 .iterations(1)
34408 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34409 }
34410 }
34411 }
34412 }
34413
34414 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8) {
34415 TEST_REQUIRES_X86_FMA3;
34416 for (uint32_t n = 16; n <= 24; n += 8) {
34417 for (size_t k = 1; k <= 5; k += 2) {
34418 GemmMicrokernelTester()
34419 .mr(6)
34420 .nr(8)
34421 .kr(1)
34422 .sr(1)
34423 .m(6)
34424 .n(8)
34425 .k(k)
34426 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34427 }
34428 }
34429 }
34430
34431 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8_strided_cn) {
34432 TEST_REQUIRES_X86_FMA3;
34433 for (uint32_t n = 16; n <= 24; n += 8) {
34434 for (size_t k = 1; k <= 5; k += 2) {
34435 GemmMicrokernelTester()
34436 .mr(6)
34437 .nr(8)
34438 .kr(1)
34439 .sr(1)
34440 .m(6)
34441 .n(n)
34442 .k(k)
34443 .cn_stride(11)
34444 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34445 }
34446 }
34447 }
34448
34449 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8_strided_a) {
34450 TEST_REQUIRES_X86_FMA3;
34451 for (uint32_t n = 16; n <= 24; n += 8) {
34452 for (size_t k = 1; k <= 5; k += 2) {
34453 GemmMicrokernelTester()
34454 .mr(6)
34455 .nr(8)
34456 .kr(1)
34457 .sr(1)
34458 .m(6)
34459 .n(n)
34460 .k(k)
34461 .a_stride(7)
34462 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34463 }
34464 }
34465 }
34466
34467 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8_subtile) {
34468 TEST_REQUIRES_X86_FMA3;
34469 for (uint32_t n = 16; n <= 24; n += 8) {
34470 for (size_t k = 1; k <= 5; k += 2) {
34471 for (uint32_t m = 1; m <= 6; m++) {
34472 GemmMicrokernelTester()
34473 .mr(6)
34474 .nr(8)
34475 .kr(1)
34476 .sr(1)
34477 .m(m)
34478 .n(n)
34479 .k(k)
34480 .iterations(1)
34481 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34482 }
34483 }
34484 }
34485 }
34486
34487 TEST(F32_GEMM_6X8__FMA3_BROADCAST, strided_cm_subtile) {
34488 TEST_REQUIRES_X86_FMA3;
34489 for (size_t k = 1; k <= 5; k += 2) {
34490 for (uint32_t m = 1; m <= 6; m++) {
34491 for (uint32_t n = 1; n <= 8; n++) {
34492 GemmMicrokernelTester()
34493 .mr(6)
34494 .nr(8)
34495 .kr(1)
34496 .sr(1)
34497 .m(m)
34498 .n(n)
34499 .k(k)
34500 .cm_stride(11)
34501 .iterations(1)
34502 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34503 }
34504 }
34505 }
34506 }
34507
34508 TEST(F32_GEMM_6X8__FMA3_BROADCAST, qmin) {
34509 TEST_REQUIRES_X86_FMA3;
34510 GemmMicrokernelTester()
34511 .mr(6)
34512 .nr(8)
34513 .kr(1)
34514 .sr(1)
34515 .m(6)
34516 .n(8)
34517 .k(1)
34518 .qmin(128)
34519 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34520 }
34521
34522 TEST(F32_GEMM_6X8__FMA3_BROADCAST, qmax) {
34523 TEST_REQUIRES_X86_FMA3;
34524 GemmMicrokernelTester()
34525 .mr(6)
34526 .nr(8)
34527 .kr(1)
34528 .sr(1)
34529 .m(6)
34530 .n(8)
34531 .k(1)
34532 .qmax(128)
34533 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34534 }
34535
34536 TEST(F32_GEMM_6X8__FMA3_BROADCAST, strided_cm) {
34537 TEST_REQUIRES_X86_FMA3;
34538 GemmMicrokernelTester()
34539 .mr(6)
34540 .nr(8)
34541 .kr(1)
34542 .sr(1)
34543 .m(6)
34544 .n(8)
34545 .k(1)
34546 .cm_stride(11)
34547 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
34548 }
34549#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34550
34551
34552#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34553 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1) {
34554 TEST_REQUIRES_X86_FMA3;
34555 GemmMicrokernelTester()
34556 .mr(7)
34557 .nr(8)
34558 .kr(1)
34559 .sr(1)
34560 .m(7)
34561 .n(8)
34562 .k(1)
34563 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34564 }
34565
34566 TEST(F32_GEMM_7X8__FMA3_BROADCAST, strided_cn) {
34567 TEST_REQUIRES_X86_FMA3;
34568 GemmMicrokernelTester()
34569 .mr(7)
34570 .nr(8)
34571 .kr(1)
34572 .sr(1)
34573 .m(7)
34574 .n(8)
34575 .k(1)
34576 .cn_stride(11)
34577 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34578 }
34579
34580 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_strided_a) {
34581 TEST_REQUIRES_X86_FMA3;
34582 GemmMicrokernelTester()
34583 .mr(7)
34584 .nr(8)
34585 .kr(1)
34586 .sr(1)
34587 .m(7)
34588 .n(8)
34589 .k(1)
34590 .a_stride(3)
34591 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34592 }
34593
34594 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile) {
34595 TEST_REQUIRES_X86_FMA3;
34596 for (uint32_t m = 1; m <= 7; m++) {
34597 for (uint32_t n = 1; n <= 8; n++) {
34598 GemmMicrokernelTester()
34599 .mr(7)
34600 .nr(8)
34601 .kr(1)
34602 .sr(1)
34603 .m(m)
34604 .n(n)
34605 .k(1)
34606 .iterations(1)
34607 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34608 }
34609 }
34610 }
34611
34612 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
34613 TEST_REQUIRES_X86_FMA3;
34614 for (uint32_t m = 1; m <= 7; m++) {
34615 GemmMicrokernelTester()
34616 .mr(7)
34617 .nr(8)
34618 .kr(1)
34619 .sr(1)
34620 .m(m)
34621 .n(8)
34622 .k(1)
34623 .iterations(1)
34624 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34625 }
34626 }
34627
34628 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
34629 TEST_REQUIRES_X86_FMA3;
34630 for (uint32_t n = 1; n <= 8; n++) {
34631 GemmMicrokernelTester()
34632 .mr(7)
34633 .nr(8)
34634 .kr(1)
34635 .sr(1)
34636 .m(7)
34637 .n(n)
34638 .k(1)
34639 .iterations(1)
34640 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34641 }
34642 }
34643
34644 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_gt_1) {
34645 TEST_REQUIRES_X86_FMA3;
34646 for (size_t k = 2; k < 10; k++) {
34647 GemmMicrokernelTester()
34648 .mr(7)
34649 .nr(8)
34650 .kr(1)
34651 .sr(1)
34652 .m(7)
34653 .n(8)
34654 .k(k)
34655 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34656 }
34657 }
34658
34659 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_gt_1_strided_a) {
34660 TEST_REQUIRES_X86_FMA3;
34661 for (size_t k = 2; k < 10; k++) {
34662 GemmMicrokernelTester()
34663 .mr(7)
34664 .nr(8)
34665 .kr(1)
34666 .sr(1)
34667 .m(7)
34668 .n(8)
34669 .k(k)
34670 .a_stride(11)
34671 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34672 }
34673 }
34674
34675 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_gt_1_subtile) {
34676 TEST_REQUIRES_X86_FMA3;
34677 for (size_t k = 2; k < 10; k++) {
34678 for (uint32_t m = 1; m <= 7; m++) {
34679 for (uint32_t n = 1; n <= 8; n++) {
34680 GemmMicrokernelTester()
34681 .mr(7)
34682 .nr(8)
34683 .kr(1)
34684 .sr(1)
34685 .m(m)
34686 .n(n)
34687 .k(k)
34688 .iterations(1)
34689 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34690 }
34691 }
34692 }
34693 }
34694
34695 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8) {
34696 TEST_REQUIRES_X86_FMA3;
34697 for (uint32_t n = 9; n < 16; n++) {
34698 for (size_t k = 1; k <= 5; k += 2) {
34699 GemmMicrokernelTester()
34700 .mr(7)
34701 .nr(8)
34702 .kr(1)
34703 .sr(1)
34704 .m(7)
34705 .n(8)
34706 .k(k)
34707 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34708 }
34709 }
34710 }
34711
34712 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
34713 TEST_REQUIRES_X86_FMA3;
34714 for (uint32_t n = 9; n < 16; n++) {
34715 for (size_t k = 1; k <= 5; k += 2) {
34716 GemmMicrokernelTester()
34717 .mr(7)
34718 .nr(8)
34719 .kr(1)
34720 .sr(1)
34721 .m(7)
34722 .n(8)
34723 .k(k)
34724 .cn_stride(11)
34725 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34726 }
34727 }
34728 }
34729
34730 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8_strided_a) {
34731 TEST_REQUIRES_X86_FMA3;
34732 for (uint32_t n = 9; n < 16; n++) {
34733 for (size_t k = 1; k <= 5; k += 2) {
34734 GemmMicrokernelTester()
34735 .mr(7)
34736 .nr(8)
34737 .kr(1)
34738 .sr(1)
34739 .m(7)
34740 .n(n)
34741 .k(k)
34742 .a_stride(7)
34743 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34744 }
34745 }
34746 }
34747
34748 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8_subtile) {
34749 TEST_REQUIRES_X86_FMA3;
34750 for (uint32_t n = 9; n < 16; n++) {
34751 for (size_t k = 1; k <= 5; k += 2) {
34752 for (uint32_t m = 1; m <= 7; m++) {
34753 GemmMicrokernelTester()
34754 .mr(7)
34755 .nr(8)
34756 .kr(1)
34757 .sr(1)
34758 .m(m)
34759 .n(n)
34760 .k(k)
34761 .iterations(1)
34762 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34763 }
34764 }
34765 }
34766 }
34767
34768 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8) {
34769 TEST_REQUIRES_X86_FMA3;
34770 for (uint32_t n = 16; n <= 24; n += 8) {
34771 for (size_t k = 1; k <= 5; k += 2) {
34772 GemmMicrokernelTester()
34773 .mr(7)
34774 .nr(8)
34775 .kr(1)
34776 .sr(1)
34777 .m(7)
34778 .n(8)
34779 .k(k)
34780 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34781 }
34782 }
34783 }
34784
34785 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8_strided_cn) {
34786 TEST_REQUIRES_X86_FMA3;
34787 for (uint32_t n = 16; n <= 24; n += 8) {
34788 for (size_t k = 1; k <= 5; k += 2) {
34789 GemmMicrokernelTester()
34790 .mr(7)
34791 .nr(8)
34792 .kr(1)
34793 .sr(1)
34794 .m(7)
34795 .n(n)
34796 .k(k)
34797 .cn_stride(11)
34798 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34799 }
34800 }
34801 }
34802
34803 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8_strided_a) {
34804 TEST_REQUIRES_X86_FMA3;
34805 for (uint32_t n = 16; n <= 24; n += 8) {
34806 for (size_t k = 1; k <= 5; k += 2) {
34807 GemmMicrokernelTester()
34808 .mr(7)
34809 .nr(8)
34810 .kr(1)
34811 .sr(1)
34812 .m(7)
34813 .n(n)
34814 .k(k)
34815 .a_stride(7)
34816 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34817 }
34818 }
34819 }
34820
34821 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8_subtile) {
34822 TEST_REQUIRES_X86_FMA3;
34823 for (uint32_t n = 16; n <= 24; n += 8) {
34824 for (size_t k = 1; k <= 5; k += 2) {
34825 for (uint32_t m = 1; m <= 7; m++) {
34826 GemmMicrokernelTester()
34827 .mr(7)
34828 .nr(8)
34829 .kr(1)
34830 .sr(1)
34831 .m(m)
34832 .n(n)
34833 .k(k)
34834 .iterations(1)
34835 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34836 }
34837 }
34838 }
34839 }
34840
34841 TEST(F32_GEMM_7X8__FMA3_BROADCAST, strided_cm_subtile) {
34842 TEST_REQUIRES_X86_FMA3;
34843 for (size_t k = 1; k <= 5; k += 2) {
34844 for (uint32_t m = 1; m <= 7; m++) {
34845 for (uint32_t n = 1; n <= 8; n++) {
34846 GemmMicrokernelTester()
34847 .mr(7)
34848 .nr(8)
34849 .kr(1)
34850 .sr(1)
34851 .m(m)
34852 .n(n)
34853 .k(k)
34854 .cm_stride(11)
34855 .iterations(1)
34856 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34857 }
34858 }
34859 }
34860 }
34861
34862 TEST(F32_GEMM_7X8__FMA3_BROADCAST, qmin) {
34863 TEST_REQUIRES_X86_FMA3;
34864 GemmMicrokernelTester()
34865 .mr(7)
34866 .nr(8)
34867 .kr(1)
34868 .sr(1)
34869 .m(7)
34870 .n(8)
34871 .k(1)
34872 .qmin(128)
34873 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34874 }
34875
34876 TEST(F32_GEMM_7X8__FMA3_BROADCAST, qmax) {
34877 TEST_REQUIRES_X86_FMA3;
34878 GemmMicrokernelTester()
34879 .mr(7)
34880 .nr(8)
34881 .kr(1)
34882 .sr(1)
34883 .m(7)
34884 .n(8)
34885 .k(1)
34886 .qmax(128)
34887 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34888 }
34889
34890 TEST(F32_GEMM_7X8__FMA3_BROADCAST, strided_cm) {
34891 TEST_REQUIRES_X86_FMA3;
34892 GemmMicrokernelTester()
34893 .mr(7)
34894 .nr(8)
34895 .kr(1)
34896 .sr(1)
34897 .m(7)
34898 .n(8)
34899 .k(1)
34900 .cm_stride(11)
34901 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
34902 }
34903#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34904
34905
34906#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34907 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1) {
34908 TEST_REQUIRES_X86_FMA3;
34909 GemmMicrokernelTester()
34910 .mr(8)
34911 .nr(8)
34912 .kr(1)
34913 .sr(1)
34914 .m(8)
34915 .n(8)
34916 .k(1)
34917 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
34918 }
34919
34920 TEST(F32_GEMM_8X8__FMA3_BROADCAST, strided_cn) {
34921 TEST_REQUIRES_X86_FMA3;
34922 GemmMicrokernelTester()
34923 .mr(8)
34924 .nr(8)
34925 .kr(1)
34926 .sr(1)
34927 .m(8)
34928 .n(8)
34929 .k(1)
34930 .cn_stride(11)
34931 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
34932 }
34933
34934 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_strided_a) {
34935 TEST_REQUIRES_X86_FMA3;
34936 GemmMicrokernelTester()
34937 .mr(8)
34938 .nr(8)
34939 .kr(1)
34940 .sr(1)
34941 .m(8)
34942 .n(8)
34943 .k(1)
34944 .a_stride(3)
34945 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
34946 }
34947
34948 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile) {
34949 TEST_REQUIRES_X86_FMA3;
34950 for (uint32_t m = 1; m <= 8; m++) {
34951 for (uint32_t n = 1; n <= 8; n++) {
34952 GemmMicrokernelTester()
34953 .mr(8)
34954 .nr(8)
34955 .kr(1)
34956 .sr(1)
34957 .m(m)
34958 .n(n)
34959 .k(1)
34960 .iterations(1)
34961 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
34962 }
34963 }
34964 }
34965
34966 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
34967 TEST_REQUIRES_X86_FMA3;
34968 for (uint32_t m = 1; m <= 8; m++) {
34969 GemmMicrokernelTester()
34970 .mr(8)
34971 .nr(8)
34972 .kr(1)
34973 .sr(1)
34974 .m(m)
34975 .n(8)
34976 .k(1)
34977 .iterations(1)
34978 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
34979 }
34980 }
34981
34982 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
34983 TEST_REQUIRES_X86_FMA3;
34984 for (uint32_t n = 1; n <= 8; n++) {
34985 GemmMicrokernelTester()
34986 .mr(8)
34987 .nr(8)
34988 .kr(1)
34989 .sr(1)
34990 .m(8)
34991 .n(n)
34992 .k(1)
34993 .iterations(1)
34994 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
34995 }
34996 }
34997
34998 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_gt_1) {
34999 TEST_REQUIRES_X86_FMA3;
35000 for (size_t k = 2; k < 10; k++) {
35001 GemmMicrokernelTester()
35002 .mr(8)
35003 .nr(8)
35004 .kr(1)
35005 .sr(1)
35006 .m(8)
35007 .n(8)
35008 .k(k)
35009 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35010 }
35011 }
35012
35013 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_gt_1_strided_a) {
35014 TEST_REQUIRES_X86_FMA3;
35015 for (size_t k = 2; k < 10; k++) {
35016 GemmMicrokernelTester()
35017 .mr(8)
35018 .nr(8)
35019 .kr(1)
35020 .sr(1)
35021 .m(8)
35022 .n(8)
35023 .k(k)
35024 .a_stride(11)
35025 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35026 }
35027 }
35028
35029 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_gt_1_subtile) {
35030 TEST_REQUIRES_X86_FMA3;
35031 for (size_t k = 2; k < 10; k++) {
35032 for (uint32_t m = 1; m <= 8; m++) {
35033 for (uint32_t n = 1; n <= 8; n++) {
35034 GemmMicrokernelTester()
35035 .mr(8)
35036 .nr(8)
35037 .kr(1)
35038 .sr(1)
35039 .m(m)
35040 .n(n)
35041 .k(k)
35042 .iterations(1)
35043 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35044 }
35045 }
35046 }
35047 }
35048
35049 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8) {
35050 TEST_REQUIRES_X86_FMA3;
35051 for (uint32_t n = 9; n < 16; n++) {
35052 for (size_t k = 1; k <= 5; k += 2) {
35053 GemmMicrokernelTester()
35054 .mr(8)
35055 .nr(8)
35056 .kr(1)
35057 .sr(1)
35058 .m(8)
35059 .n(8)
35060 .k(k)
35061 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35062 }
35063 }
35064 }
35065
35066 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
35067 TEST_REQUIRES_X86_FMA3;
35068 for (uint32_t n = 9; n < 16; n++) {
35069 for (size_t k = 1; k <= 5; k += 2) {
35070 GemmMicrokernelTester()
35071 .mr(8)
35072 .nr(8)
35073 .kr(1)
35074 .sr(1)
35075 .m(8)
35076 .n(8)
35077 .k(k)
35078 .cn_stride(11)
35079 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35080 }
35081 }
35082 }
35083
35084 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8_strided_a) {
35085 TEST_REQUIRES_X86_FMA3;
35086 for (uint32_t n = 9; n < 16; n++) {
35087 for (size_t k = 1; k <= 5; k += 2) {
35088 GemmMicrokernelTester()
35089 .mr(8)
35090 .nr(8)
35091 .kr(1)
35092 .sr(1)
35093 .m(8)
35094 .n(n)
35095 .k(k)
35096 .a_stride(7)
35097 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35098 }
35099 }
35100 }
35101
35102 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8_subtile) {
35103 TEST_REQUIRES_X86_FMA3;
35104 for (uint32_t n = 9; n < 16; n++) {
35105 for (size_t k = 1; k <= 5; k += 2) {
35106 for (uint32_t m = 1; m <= 8; m++) {
35107 GemmMicrokernelTester()
35108 .mr(8)
35109 .nr(8)
35110 .kr(1)
35111 .sr(1)
35112 .m(m)
35113 .n(n)
35114 .k(k)
35115 .iterations(1)
35116 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35117 }
35118 }
35119 }
35120 }
35121
35122 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8) {
35123 TEST_REQUIRES_X86_FMA3;
35124 for (uint32_t n = 16; n <= 24; n += 8) {
35125 for (size_t k = 1; k <= 5; k += 2) {
35126 GemmMicrokernelTester()
35127 .mr(8)
35128 .nr(8)
35129 .kr(1)
35130 .sr(1)
35131 .m(8)
35132 .n(8)
35133 .k(k)
35134 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35135 }
35136 }
35137 }
35138
35139 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8_strided_cn) {
35140 TEST_REQUIRES_X86_FMA3;
35141 for (uint32_t n = 16; n <= 24; n += 8) {
35142 for (size_t k = 1; k <= 5; k += 2) {
35143 GemmMicrokernelTester()
35144 .mr(8)
35145 .nr(8)
35146 .kr(1)
35147 .sr(1)
35148 .m(8)
35149 .n(n)
35150 .k(k)
35151 .cn_stride(11)
35152 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35153 }
35154 }
35155 }
35156
35157 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8_strided_a) {
35158 TEST_REQUIRES_X86_FMA3;
35159 for (uint32_t n = 16; n <= 24; n += 8) {
35160 for (size_t k = 1; k <= 5; k += 2) {
35161 GemmMicrokernelTester()
35162 .mr(8)
35163 .nr(8)
35164 .kr(1)
35165 .sr(1)
35166 .m(8)
35167 .n(n)
35168 .k(k)
35169 .a_stride(7)
35170 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35171 }
35172 }
35173 }
35174
35175 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8_subtile) {
35176 TEST_REQUIRES_X86_FMA3;
35177 for (uint32_t n = 16; n <= 24; n += 8) {
35178 for (size_t k = 1; k <= 5; k += 2) {
35179 for (uint32_t m = 1; m <= 8; m++) {
35180 GemmMicrokernelTester()
35181 .mr(8)
35182 .nr(8)
35183 .kr(1)
35184 .sr(1)
35185 .m(m)
35186 .n(n)
35187 .k(k)
35188 .iterations(1)
35189 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35190 }
35191 }
35192 }
35193 }
35194
35195 TEST(F32_GEMM_8X8__FMA3_BROADCAST, strided_cm_subtile) {
35196 TEST_REQUIRES_X86_FMA3;
35197 for (size_t k = 1; k <= 5; k += 2) {
35198 for (uint32_t m = 1; m <= 8; m++) {
35199 for (uint32_t n = 1; n <= 8; n++) {
35200 GemmMicrokernelTester()
35201 .mr(8)
35202 .nr(8)
35203 .kr(1)
35204 .sr(1)
35205 .m(m)
35206 .n(n)
35207 .k(k)
35208 .cm_stride(11)
35209 .iterations(1)
35210 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35211 }
35212 }
35213 }
35214 }
35215
35216 TEST(F32_GEMM_8X8__FMA3_BROADCAST, qmin) {
35217 TEST_REQUIRES_X86_FMA3;
35218 GemmMicrokernelTester()
35219 .mr(8)
35220 .nr(8)
35221 .kr(1)
35222 .sr(1)
35223 .m(8)
35224 .n(8)
35225 .k(1)
35226 .qmin(128)
35227 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35228 }
35229
35230 TEST(F32_GEMM_8X8__FMA3_BROADCAST, qmax) {
35231 TEST_REQUIRES_X86_FMA3;
35232 GemmMicrokernelTester()
35233 .mr(8)
35234 .nr(8)
35235 .kr(1)
35236 .sr(1)
35237 .m(8)
35238 .n(8)
35239 .k(1)
35240 .qmax(128)
35241 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35242 }
35243
35244 TEST(F32_GEMM_8X8__FMA3_BROADCAST, strided_cm) {
35245 TEST_REQUIRES_X86_FMA3;
35246 GemmMicrokernelTester()
35247 .mr(8)
35248 .nr(8)
35249 .kr(1)
35250 .sr(1)
35251 .m(8)
35252 .n(8)
35253 .k(1)
35254 .cm_stride(11)
35255 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
35256 }
35257#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35258
35259
35260#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35261 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1) {
35262 TEST_REQUIRES_X86_FMA3;
35263 GemmMicrokernelTester()
35264 .mr(1)
35265 .nr(16)
35266 .kr(1)
35267 .sr(1)
35268 .m(1)
35269 .n(16)
35270 .k(1)
35271 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35272 }
35273
35274 TEST(F32_GEMM_1X16__FMA3_BROADCAST, strided_cn) {
35275 TEST_REQUIRES_X86_FMA3;
35276 GemmMicrokernelTester()
35277 .mr(1)
35278 .nr(16)
35279 .kr(1)
35280 .sr(1)
35281 .m(1)
35282 .n(16)
35283 .k(1)
35284 .cn_stride(19)
35285 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35286 }
35287
35288 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_strided_a) {
35289 TEST_REQUIRES_X86_FMA3;
35290 GemmMicrokernelTester()
35291 .mr(1)
35292 .nr(16)
35293 .kr(1)
35294 .sr(1)
35295 .m(1)
35296 .n(16)
35297 .k(1)
35298 .a_stride(3)
35299 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35300 }
35301
35302 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
35303 TEST_REQUIRES_X86_FMA3;
35304 for (uint32_t m = 1; m <= 1; m++) {
35305 for (uint32_t n = 1; n <= 16; n++) {
35306 GemmMicrokernelTester()
35307 .mr(1)
35308 .nr(16)
35309 .kr(1)
35310 .sr(1)
35311 .m(m)
35312 .n(n)
35313 .k(1)
35314 .iterations(1)
35315 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35316 }
35317 }
35318 }
35319
35320 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
35321 TEST_REQUIRES_X86_FMA3;
35322 for (uint32_t m = 1; m <= 1; m++) {
35323 GemmMicrokernelTester()
35324 .mr(1)
35325 .nr(16)
35326 .kr(1)
35327 .sr(1)
35328 .m(m)
35329 .n(16)
35330 .k(1)
35331 .iterations(1)
35332 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35333 }
35334 }
35335
35336 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
35337 TEST_REQUIRES_X86_FMA3;
35338 for (uint32_t n = 1; n <= 16; n++) {
35339 GemmMicrokernelTester()
35340 .mr(1)
35341 .nr(16)
35342 .kr(1)
35343 .sr(1)
35344 .m(1)
35345 .n(n)
35346 .k(1)
35347 .iterations(1)
35348 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35349 }
35350 }
35351
35352 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_gt_1) {
35353 TEST_REQUIRES_X86_FMA3;
35354 for (size_t k = 2; k < 10; k++) {
35355 GemmMicrokernelTester()
35356 .mr(1)
35357 .nr(16)
35358 .kr(1)
35359 .sr(1)
35360 .m(1)
35361 .n(16)
35362 .k(k)
35363 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35364 }
35365 }
35366
35367 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_gt_1_strided_a) {
35368 TEST_REQUIRES_X86_FMA3;
35369 for (size_t k = 2; k < 10; k++) {
35370 GemmMicrokernelTester()
35371 .mr(1)
35372 .nr(16)
35373 .kr(1)
35374 .sr(1)
35375 .m(1)
35376 .n(16)
35377 .k(k)
35378 .a_stride(11)
35379 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35380 }
35381 }
35382
35383 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
35384 TEST_REQUIRES_X86_FMA3;
35385 for (size_t k = 2; k < 10; k++) {
35386 for (uint32_t m = 1; m <= 1; m++) {
35387 for (uint32_t n = 1; n <= 16; n++) {
35388 GemmMicrokernelTester()
35389 .mr(1)
35390 .nr(16)
35391 .kr(1)
35392 .sr(1)
35393 .m(m)
35394 .n(n)
35395 .k(k)
35396 .iterations(1)
35397 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35398 }
35399 }
35400 }
35401 }
35402
35403 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16) {
35404 TEST_REQUIRES_X86_FMA3;
35405 for (uint32_t n = 17; n < 32; n++) {
35406 for (size_t k = 1; k <= 5; k += 2) {
35407 GemmMicrokernelTester()
35408 .mr(1)
35409 .nr(16)
35410 .kr(1)
35411 .sr(1)
35412 .m(1)
35413 .n(16)
35414 .k(k)
35415 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35416 }
35417 }
35418 }
35419
35420 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
35421 TEST_REQUIRES_X86_FMA3;
35422 for (uint32_t n = 17; n < 32; n++) {
35423 for (size_t k = 1; k <= 5; k += 2) {
35424 GemmMicrokernelTester()
35425 .mr(1)
35426 .nr(16)
35427 .kr(1)
35428 .sr(1)
35429 .m(1)
35430 .n(16)
35431 .k(k)
35432 .cn_stride(19)
35433 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35434 }
35435 }
35436 }
35437
35438 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16_strided_a) {
35439 TEST_REQUIRES_X86_FMA3;
35440 for (uint32_t n = 17; n < 32; n++) {
35441 for (size_t k = 1; k <= 5; k += 2) {
35442 GemmMicrokernelTester()
35443 .mr(1)
35444 .nr(16)
35445 .kr(1)
35446 .sr(1)
35447 .m(1)
35448 .n(n)
35449 .k(k)
35450 .a_stride(7)
35451 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35452 }
35453 }
35454 }
35455
35456 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
35457 TEST_REQUIRES_X86_FMA3;
35458 for (uint32_t n = 17; n < 32; n++) {
35459 for (size_t k = 1; k <= 5; k += 2) {
35460 for (uint32_t m = 1; m <= 1; m++) {
35461 GemmMicrokernelTester()
35462 .mr(1)
35463 .nr(16)
35464 .kr(1)
35465 .sr(1)
35466 .m(m)
35467 .n(n)
35468 .k(k)
35469 .iterations(1)
35470 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35471 }
35472 }
35473 }
35474 }
35475
35476 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16) {
35477 TEST_REQUIRES_X86_FMA3;
35478 for (uint32_t n = 32; n <= 48; n += 16) {
35479 for (size_t k = 1; k <= 5; k += 2) {
35480 GemmMicrokernelTester()
35481 .mr(1)
35482 .nr(16)
35483 .kr(1)
35484 .sr(1)
35485 .m(1)
35486 .n(16)
35487 .k(k)
35488 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35489 }
35490 }
35491 }
35492
35493 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
35494 TEST_REQUIRES_X86_FMA3;
35495 for (uint32_t n = 32; n <= 48; n += 16) {
35496 for (size_t k = 1; k <= 5; k += 2) {
35497 GemmMicrokernelTester()
35498 .mr(1)
35499 .nr(16)
35500 .kr(1)
35501 .sr(1)
35502 .m(1)
35503 .n(n)
35504 .k(k)
35505 .cn_stride(19)
35506 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35507 }
35508 }
35509 }
35510
35511 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16_strided_a) {
35512 TEST_REQUIRES_X86_FMA3;
35513 for (uint32_t n = 32; n <= 48; n += 16) {
35514 for (size_t k = 1; k <= 5; k += 2) {
35515 GemmMicrokernelTester()
35516 .mr(1)
35517 .nr(16)
35518 .kr(1)
35519 .sr(1)
35520 .m(1)
35521 .n(n)
35522 .k(k)
35523 .a_stride(7)
35524 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35525 }
35526 }
35527 }
35528
35529 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16_subtile) {
35530 TEST_REQUIRES_X86_FMA3;
35531 for (uint32_t n = 32; n <= 48; n += 16) {
35532 for (size_t k = 1; k <= 5; k += 2) {
35533 for (uint32_t m = 1; m <= 1; m++) {
35534 GemmMicrokernelTester()
35535 .mr(1)
35536 .nr(16)
35537 .kr(1)
35538 .sr(1)
35539 .m(m)
35540 .n(n)
35541 .k(k)
35542 .iterations(1)
35543 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35544 }
35545 }
35546 }
35547 }
35548
35549 TEST(F32_GEMM_1X16__FMA3_BROADCAST, strided_cm_subtile) {
35550 TEST_REQUIRES_X86_FMA3;
35551 for (size_t k = 1; k <= 5; k += 2) {
35552 for (uint32_t m = 1; m <= 1; m++) {
35553 for (uint32_t n = 1; n <= 16; n++) {
35554 GemmMicrokernelTester()
35555 .mr(1)
35556 .nr(16)
35557 .kr(1)
35558 .sr(1)
35559 .m(m)
35560 .n(n)
35561 .k(k)
35562 .cm_stride(19)
35563 .iterations(1)
35564 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35565 }
35566 }
35567 }
35568 }
35569
35570 TEST(F32_GEMM_1X16__FMA3_BROADCAST, qmin) {
35571 TEST_REQUIRES_X86_FMA3;
35572 GemmMicrokernelTester()
35573 .mr(1)
35574 .nr(16)
35575 .kr(1)
35576 .sr(1)
35577 .m(1)
35578 .n(16)
35579 .k(1)
35580 .qmin(128)
35581 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35582 }
35583
35584 TEST(F32_GEMM_1X16__FMA3_BROADCAST, qmax) {
35585 TEST_REQUIRES_X86_FMA3;
35586 GemmMicrokernelTester()
35587 .mr(1)
35588 .nr(16)
35589 .kr(1)
35590 .sr(1)
35591 .m(1)
35592 .n(16)
35593 .k(1)
35594 .qmax(128)
35595 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35596 }
35597
35598 TEST(F32_GEMM_1X16__FMA3_BROADCAST, strided_cm) {
35599 TEST_REQUIRES_X86_FMA3;
35600 GemmMicrokernelTester()
35601 .mr(1)
35602 .nr(16)
35603 .kr(1)
35604 .sr(1)
35605 .m(1)
35606 .n(16)
35607 .k(1)
35608 .cm_stride(19)
35609 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
35610 }
35611#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35612
35613
35614#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35615 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1) {
35616 TEST_REQUIRES_X86_FMA3;
35617 GemmMicrokernelTester()
35618 .mr(3)
35619 .nr(16)
35620 .kr(1)
35621 .sr(1)
35622 .m(3)
35623 .n(16)
35624 .k(1)
35625 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35626 }
35627
35628 TEST(F32_GEMM_3X16__FMA3_BROADCAST, strided_cn) {
35629 TEST_REQUIRES_X86_FMA3;
35630 GemmMicrokernelTester()
35631 .mr(3)
35632 .nr(16)
35633 .kr(1)
35634 .sr(1)
35635 .m(3)
35636 .n(16)
35637 .k(1)
35638 .cn_stride(19)
35639 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35640 }
35641
35642 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_strided_a) {
35643 TEST_REQUIRES_X86_FMA3;
35644 GemmMicrokernelTester()
35645 .mr(3)
35646 .nr(16)
35647 .kr(1)
35648 .sr(1)
35649 .m(3)
35650 .n(16)
35651 .k(1)
35652 .a_stride(3)
35653 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35654 }
35655
35656 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
35657 TEST_REQUIRES_X86_FMA3;
35658 for (uint32_t m = 1; m <= 3; m++) {
35659 for (uint32_t n = 1; n <= 16; n++) {
35660 GemmMicrokernelTester()
35661 .mr(3)
35662 .nr(16)
35663 .kr(1)
35664 .sr(1)
35665 .m(m)
35666 .n(n)
35667 .k(1)
35668 .iterations(1)
35669 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35670 }
35671 }
35672 }
35673
35674 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
35675 TEST_REQUIRES_X86_FMA3;
35676 for (uint32_t m = 1; m <= 3; m++) {
35677 GemmMicrokernelTester()
35678 .mr(3)
35679 .nr(16)
35680 .kr(1)
35681 .sr(1)
35682 .m(m)
35683 .n(16)
35684 .k(1)
35685 .iterations(1)
35686 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35687 }
35688 }
35689
35690 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
35691 TEST_REQUIRES_X86_FMA3;
35692 for (uint32_t n = 1; n <= 16; n++) {
35693 GemmMicrokernelTester()
35694 .mr(3)
35695 .nr(16)
35696 .kr(1)
35697 .sr(1)
35698 .m(3)
35699 .n(n)
35700 .k(1)
35701 .iterations(1)
35702 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35703 }
35704 }
35705
35706 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_gt_1) {
35707 TEST_REQUIRES_X86_FMA3;
35708 for (size_t k = 2; k < 10; k++) {
35709 GemmMicrokernelTester()
35710 .mr(3)
35711 .nr(16)
35712 .kr(1)
35713 .sr(1)
35714 .m(3)
35715 .n(16)
35716 .k(k)
35717 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35718 }
35719 }
35720
35721 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_gt_1_strided_a) {
35722 TEST_REQUIRES_X86_FMA3;
35723 for (size_t k = 2; k < 10; k++) {
35724 GemmMicrokernelTester()
35725 .mr(3)
35726 .nr(16)
35727 .kr(1)
35728 .sr(1)
35729 .m(3)
35730 .n(16)
35731 .k(k)
35732 .a_stride(11)
35733 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35734 }
35735 }
35736
35737 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
35738 TEST_REQUIRES_X86_FMA3;
35739 for (size_t k = 2; k < 10; k++) {
35740 for (uint32_t m = 1; m <= 3; m++) {
35741 for (uint32_t n = 1; n <= 16; n++) {
35742 GemmMicrokernelTester()
35743 .mr(3)
35744 .nr(16)
35745 .kr(1)
35746 .sr(1)
35747 .m(m)
35748 .n(n)
35749 .k(k)
35750 .iterations(1)
35751 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35752 }
35753 }
35754 }
35755 }
35756
35757 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16) {
35758 TEST_REQUIRES_X86_FMA3;
35759 for (uint32_t n = 17; n < 32; n++) {
35760 for (size_t k = 1; k <= 5; k += 2) {
35761 GemmMicrokernelTester()
35762 .mr(3)
35763 .nr(16)
35764 .kr(1)
35765 .sr(1)
35766 .m(3)
35767 .n(16)
35768 .k(k)
35769 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35770 }
35771 }
35772 }
35773
35774 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
35775 TEST_REQUIRES_X86_FMA3;
35776 for (uint32_t n = 17; n < 32; n++) {
35777 for (size_t k = 1; k <= 5; k += 2) {
35778 GemmMicrokernelTester()
35779 .mr(3)
35780 .nr(16)
35781 .kr(1)
35782 .sr(1)
35783 .m(3)
35784 .n(16)
35785 .k(k)
35786 .cn_stride(19)
35787 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35788 }
35789 }
35790 }
35791
35792 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16_strided_a) {
35793 TEST_REQUIRES_X86_FMA3;
35794 for (uint32_t n = 17; n < 32; n++) {
35795 for (size_t k = 1; k <= 5; k += 2) {
35796 GemmMicrokernelTester()
35797 .mr(3)
35798 .nr(16)
35799 .kr(1)
35800 .sr(1)
35801 .m(3)
35802 .n(n)
35803 .k(k)
35804 .a_stride(7)
35805 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35806 }
35807 }
35808 }
35809
35810 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
35811 TEST_REQUIRES_X86_FMA3;
35812 for (uint32_t n = 17; n < 32; n++) {
35813 for (size_t k = 1; k <= 5; k += 2) {
35814 for (uint32_t m = 1; m <= 3; m++) {
35815 GemmMicrokernelTester()
35816 .mr(3)
35817 .nr(16)
35818 .kr(1)
35819 .sr(1)
35820 .m(m)
35821 .n(n)
35822 .k(k)
35823 .iterations(1)
35824 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35825 }
35826 }
35827 }
35828 }
35829
35830 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16) {
35831 TEST_REQUIRES_X86_FMA3;
35832 for (uint32_t n = 32; n <= 48; n += 16) {
35833 for (size_t k = 1; k <= 5; k += 2) {
35834 GemmMicrokernelTester()
35835 .mr(3)
35836 .nr(16)
35837 .kr(1)
35838 .sr(1)
35839 .m(3)
35840 .n(16)
35841 .k(k)
35842 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35843 }
35844 }
35845 }
35846
35847 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
35848 TEST_REQUIRES_X86_FMA3;
35849 for (uint32_t n = 32; n <= 48; n += 16) {
35850 for (size_t k = 1; k <= 5; k += 2) {
35851 GemmMicrokernelTester()
35852 .mr(3)
35853 .nr(16)
35854 .kr(1)
35855 .sr(1)
35856 .m(3)
35857 .n(n)
35858 .k(k)
35859 .cn_stride(19)
35860 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35861 }
35862 }
35863 }
35864
35865 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16_strided_a) {
35866 TEST_REQUIRES_X86_FMA3;
35867 for (uint32_t n = 32; n <= 48; n += 16) {
35868 for (size_t k = 1; k <= 5; k += 2) {
35869 GemmMicrokernelTester()
35870 .mr(3)
35871 .nr(16)
35872 .kr(1)
35873 .sr(1)
35874 .m(3)
35875 .n(n)
35876 .k(k)
35877 .a_stride(7)
35878 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35879 }
35880 }
35881 }
35882
35883 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16_subtile) {
35884 TEST_REQUIRES_X86_FMA3;
35885 for (uint32_t n = 32; n <= 48; n += 16) {
35886 for (size_t k = 1; k <= 5; k += 2) {
35887 for (uint32_t m = 1; m <= 3; m++) {
35888 GemmMicrokernelTester()
35889 .mr(3)
35890 .nr(16)
35891 .kr(1)
35892 .sr(1)
35893 .m(m)
35894 .n(n)
35895 .k(k)
35896 .iterations(1)
35897 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35898 }
35899 }
35900 }
35901 }
35902
35903 TEST(F32_GEMM_3X16__FMA3_BROADCAST, strided_cm_subtile) {
35904 TEST_REQUIRES_X86_FMA3;
35905 for (size_t k = 1; k <= 5; k += 2) {
35906 for (uint32_t m = 1; m <= 3; m++) {
35907 for (uint32_t n = 1; n <= 16; n++) {
35908 GemmMicrokernelTester()
35909 .mr(3)
35910 .nr(16)
35911 .kr(1)
35912 .sr(1)
35913 .m(m)
35914 .n(n)
35915 .k(k)
35916 .cm_stride(19)
35917 .iterations(1)
35918 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35919 }
35920 }
35921 }
35922 }
35923
35924 TEST(F32_GEMM_3X16__FMA3_BROADCAST, qmin) {
35925 TEST_REQUIRES_X86_FMA3;
35926 GemmMicrokernelTester()
35927 .mr(3)
35928 .nr(16)
35929 .kr(1)
35930 .sr(1)
35931 .m(3)
35932 .n(16)
35933 .k(1)
35934 .qmin(128)
35935 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35936 }
35937
35938 TEST(F32_GEMM_3X16__FMA3_BROADCAST, qmax) {
35939 TEST_REQUIRES_X86_FMA3;
35940 GemmMicrokernelTester()
35941 .mr(3)
35942 .nr(16)
35943 .kr(1)
35944 .sr(1)
35945 .m(3)
35946 .n(16)
35947 .k(1)
35948 .qmax(128)
35949 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35950 }
35951
35952 TEST(F32_GEMM_3X16__FMA3_BROADCAST, strided_cm) {
35953 TEST_REQUIRES_X86_FMA3;
35954 GemmMicrokernelTester()
35955 .mr(3)
35956 .nr(16)
35957 .kr(1)
35958 .sr(1)
35959 .m(3)
35960 .n(16)
35961 .k(1)
35962 .cm_stride(19)
35963 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
35964 }
35965#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35966
35967
35968#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35969 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1) {
35970 TEST_REQUIRES_X86_FMA3;
35971 GemmMicrokernelTester()
35972 .mr(4)
35973 .nr(16)
35974 .kr(1)
35975 .sr(1)
35976 .m(4)
35977 .n(16)
35978 .k(1)
35979 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
35980 }
35981
35982 TEST(F32_GEMM_4X16__FMA3_BROADCAST, strided_cn) {
35983 TEST_REQUIRES_X86_FMA3;
35984 GemmMicrokernelTester()
35985 .mr(4)
35986 .nr(16)
35987 .kr(1)
35988 .sr(1)
35989 .m(4)
35990 .n(16)
35991 .k(1)
35992 .cn_stride(19)
35993 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
35994 }
35995
35996 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_strided_a) {
35997 TEST_REQUIRES_X86_FMA3;
35998 GemmMicrokernelTester()
35999 .mr(4)
36000 .nr(16)
36001 .kr(1)
36002 .sr(1)
36003 .m(4)
36004 .n(16)
36005 .k(1)
36006 .a_stride(3)
36007 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36008 }
36009
36010 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile) {
36011 TEST_REQUIRES_X86_FMA3;
36012 for (uint32_t m = 1; m <= 4; m++) {
36013 for (uint32_t n = 1; n <= 16; n++) {
36014 GemmMicrokernelTester()
36015 .mr(4)
36016 .nr(16)
36017 .kr(1)
36018 .sr(1)
36019 .m(m)
36020 .n(n)
36021 .k(1)
36022 .iterations(1)
36023 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36024 }
36025 }
36026 }
36027
36028 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
36029 TEST_REQUIRES_X86_FMA3;
36030 for (uint32_t m = 1; m <= 4; m++) {
36031 GemmMicrokernelTester()
36032 .mr(4)
36033 .nr(16)
36034 .kr(1)
36035 .sr(1)
36036 .m(m)
36037 .n(16)
36038 .k(1)
36039 .iterations(1)
36040 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36041 }
36042 }
36043
36044 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
36045 TEST_REQUIRES_X86_FMA3;
36046 for (uint32_t n = 1; n <= 16; n++) {
36047 GemmMicrokernelTester()
36048 .mr(4)
36049 .nr(16)
36050 .kr(1)
36051 .sr(1)
36052 .m(4)
36053 .n(n)
36054 .k(1)
36055 .iterations(1)
36056 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36057 }
36058 }
36059
36060 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_gt_1) {
36061 TEST_REQUIRES_X86_FMA3;
36062 for (size_t k = 2; k < 10; k++) {
36063 GemmMicrokernelTester()
36064 .mr(4)
36065 .nr(16)
36066 .kr(1)
36067 .sr(1)
36068 .m(4)
36069 .n(16)
36070 .k(k)
36071 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36072 }
36073 }
36074
36075 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_gt_1_strided_a) {
36076 TEST_REQUIRES_X86_FMA3;
36077 for (size_t k = 2; k < 10; k++) {
36078 GemmMicrokernelTester()
36079 .mr(4)
36080 .nr(16)
36081 .kr(1)
36082 .sr(1)
36083 .m(4)
36084 .n(16)
36085 .k(k)
36086 .a_stride(11)
36087 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36088 }
36089 }
36090
36091 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_gt_1_subtile) {
36092 TEST_REQUIRES_X86_FMA3;
36093 for (size_t k = 2; k < 10; k++) {
36094 for (uint32_t m = 1; m <= 4; m++) {
36095 for (uint32_t n = 1; n <= 16; n++) {
36096 GemmMicrokernelTester()
36097 .mr(4)
36098 .nr(16)
36099 .kr(1)
36100 .sr(1)
36101 .m(m)
36102 .n(n)
36103 .k(k)
36104 .iterations(1)
36105 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36106 }
36107 }
36108 }
36109 }
36110
36111 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16) {
36112 TEST_REQUIRES_X86_FMA3;
36113 for (uint32_t n = 17; n < 32; n++) {
36114 for (size_t k = 1; k <= 5; k += 2) {
36115 GemmMicrokernelTester()
36116 .mr(4)
36117 .nr(16)
36118 .kr(1)
36119 .sr(1)
36120 .m(4)
36121 .n(16)
36122 .k(k)
36123 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36124 }
36125 }
36126 }
36127
36128 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
36129 TEST_REQUIRES_X86_FMA3;
36130 for (uint32_t n = 17; n < 32; n++) {
36131 for (size_t k = 1; k <= 5; k += 2) {
36132 GemmMicrokernelTester()
36133 .mr(4)
36134 .nr(16)
36135 .kr(1)
36136 .sr(1)
36137 .m(4)
36138 .n(16)
36139 .k(k)
36140 .cn_stride(19)
36141 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36142 }
36143 }
36144 }
36145
36146 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16_strided_a) {
36147 TEST_REQUIRES_X86_FMA3;
36148 for (uint32_t n = 17; n < 32; n++) {
36149 for (size_t k = 1; k <= 5; k += 2) {
36150 GemmMicrokernelTester()
36151 .mr(4)
36152 .nr(16)
36153 .kr(1)
36154 .sr(1)
36155 .m(4)
36156 .n(n)
36157 .k(k)
36158 .a_stride(7)
36159 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36160 }
36161 }
36162 }
36163
36164 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16_subtile) {
36165 TEST_REQUIRES_X86_FMA3;
36166 for (uint32_t n = 17; n < 32; n++) {
36167 for (size_t k = 1; k <= 5; k += 2) {
36168 for (uint32_t m = 1; m <= 4; m++) {
36169 GemmMicrokernelTester()
36170 .mr(4)
36171 .nr(16)
36172 .kr(1)
36173 .sr(1)
36174 .m(m)
36175 .n(n)
36176 .k(k)
36177 .iterations(1)
36178 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36179 }
36180 }
36181 }
36182 }
36183
36184 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16) {
36185 TEST_REQUIRES_X86_FMA3;
36186 for (uint32_t n = 32; n <= 48; n += 16) {
36187 for (size_t k = 1; k <= 5; k += 2) {
36188 GemmMicrokernelTester()
36189 .mr(4)
36190 .nr(16)
36191 .kr(1)
36192 .sr(1)
36193 .m(4)
36194 .n(16)
36195 .k(k)
36196 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36197 }
36198 }
36199 }
36200
36201 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16_strided_cn) {
36202 TEST_REQUIRES_X86_FMA3;
36203 for (uint32_t n = 32; n <= 48; n += 16) {
36204 for (size_t k = 1; k <= 5; k += 2) {
36205 GemmMicrokernelTester()
36206 .mr(4)
36207 .nr(16)
36208 .kr(1)
36209 .sr(1)
36210 .m(4)
36211 .n(n)
36212 .k(k)
36213 .cn_stride(19)
36214 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36215 }
36216 }
36217 }
36218
36219 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16_strided_a) {
36220 TEST_REQUIRES_X86_FMA3;
36221 for (uint32_t n = 32; n <= 48; n += 16) {
36222 for (size_t k = 1; k <= 5; k += 2) {
36223 GemmMicrokernelTester()
36224 .mr(4)
36225 .nr(16)
36226 .kr(1)
36227 .sr(1)
36228 .m(4)
36229 .n(n)
36230 .k(k)
36231 .a_stride(7)
36232 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36233 }
36234 }
36235 }
36236
36237 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16_subtile) {
36238 TEST_REQUIRES_X86_FMA3;
36239 for (uint32_t n = 32; n <= 48; n += 16) {
36240 for (size_t k = 1; k <= 5; k += 2) {
36241 for (uint32_t m = 1; m <= 4; m++) {
36242 GemmMicrokernelTester()
36243 .mr(4)
36244 .nr(16)
36245 .kr(1)
36246 .sr(1)
36247 .m(m)
36248 .n(n)
36249 .k(k)
36250 .iterations(1)
36251 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36252 }
36253 }
36254 }
36255 }
36256
36257 TEST(F32_GEMM_4X16__FMA3_BROADCAST, strided_cm_subtile) {
36258 TEST_REQUIRES_X86_FMA3;
36259 for (size_t k = 1; k <= 5; k += 2) {
36260 for (uint32_t m = 1; m <= 4; m++) {
36261 for (uint32_t n = 1; n <= 16; n++) {
36262 GemmMicrokernelTester()
36263 .mr(4)
36264 .nr(16)
36265 .kr(1)
36266 .sr(1)
36267 .m(m)
36268 .n(n)
36269 .k(k)
36270 .cm_stride(19)
36271 .iterations(1)
36272 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36273 }
36274 }
36275 }
36276 }
36277
36278 TEST(F32_GEMM_4X16__FMA3_BROADCAST, qmin) {
36279 TEST_REQUIRES_X86_FMA3;
36280 GemmMicrokernelTester()
36281 .mr(4)
36282 .nr(16)
36283 .kr(1)
36284 .sr(1)
36285 .m(4)
36286 .n(16)
36287 .k(1)
36288 .qmin(128)
36289 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36290 }
36291
36292 TEST(F32_GEMM_4X16__FMA3_BROADCAST, qmax) {
36293 TEST_REQUIRES_X86_FMA3;
36294 GemmMicrokernelTester()
36295 .mr(4)
36296 .nr(16)
36297 .kr(1)
36298 .sr(1)
36299 .m(4)
36300 .n(16)
36301 .k(1)
36302 .qmax(128)
36303 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36304 }
36305
36306 TEST(F32_GEMM_4X16__FMA3_BROADCAST, strided_cm) {
36307 TEST_REQUIRES_X86_FMA3;
36308 GemmMicrokernelTester()
36309 .mr(4)
36310 .nr(16)
36311 .kr(1)
36312 .sr(1)
36313 .m(4)
36314 .n(16)
36315 .k(1)
36316 .cm_stride(19)
36317 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
36318 }
36319#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36320
36321
36322#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36323 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1) {
36324 TEST_REQUIRES_X86_FMA3;
36325 GemmMicrokernelTester()
36326 .mr(5)
36327 .nr(16)
36328 .kr(1)
36329 .sr(1)
36330 .m(5)
36331 .n(16)
36332 .k(1)
36333 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36334 }
36335
36336 TEST(F32_GEMM_5X16__FMA3_BROADCAST, strided_cn) {
36337 TEST_REQUIRES_X86_FMA3;
36338 GemmMicrokernelTester()
36339 .mr(5)
36340 .nr(16)
36341 .kr(1)
36342 .sr(1)
36343 .m(5)
36344 .n(16)
36345 .k(1)
36346 .cn_stride(19)
36347 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36348 }
36349
36350 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_strided_a) {
36351 TEST_REQUIRES_X86_FMA3;
36352 GemmMicrokernelTester()
36353 .mr(5)
36354 .nr(16)
36355 .kr(1)
36356 .sr(1)
36357 .m(5)
36358 .n(16)
36359 .k(1)
36360 .a_stride(3)
36361 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36362 }
36363
36364 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile) {
36365 TEST_REQUIRES_X86_FMA3;
36366 for (uint32_t m = 1; m <= 5; m++) {
36367 for (uint32_t n = 1; n <= 16; n++) {
36368 GemmMicrokernelTester()
36369 .mr(5)
36370 .nr(16)
36371 .kr(1)
36372 .sr(1)
36373 .m(m)
36374 .n(n)
36375 .k(1)
36376 .iterations(1)
36377 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36378 }
36379 }
36380 }
36381
36382 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
36383 TEST_REQUIRES_X86_FMA3;
36384 for (uint32_t m = 1; m <= 5; m++) {
36385 GemmMicrokernelTester()
36386 .mr(5)
36387 .nr(16)
36388 .kr(1)
36389 .sr(1)
36390 .m(m)
36391 .n(16)
36392 .k(1)
36393 .iterations(1)
36394 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36395 }
36396 }
36397
36398 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
36399 TEST_REQUIRES_X86_FMA3;
36400 for (uint32_t n = 1; n <= 16; n++) {
36401 GemmMicrokernelTester()
36402 .mr(5)
36403 .nr(16)
36404 .kr(1)
36405 .sr(1)
36406 .m(5)
36407 .n(n)
36408 .k(1)
36409 .iterations(1)
36410 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36411 }
36412 }
36413
36414 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_gt_1) {
36415 TEST_REQUIRES_X86_FMA3;
36416 for (size_t k = 2; k < 10; k++) {
36417 GemmMicrokernelTester()
36418 .mr(5)
36419 .nr(16)
36420 .kr(1)
36421 .sr(1)
36422 .m(5)
36423 .n(16)
36424 .k(k)
36425 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36426 }
36427 }
36428
36429 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_gt_1_strided_a) {
36430 TEST_REQUIRES_X86_FMA3;
36431 for (size_t k = 2; k < 10; k++) {
36432 GemmMicrokernelTester()
36433 .mr(5)
36434 .nr(16)
36435 .kr(1)
36436 .sr(1)
36437 .m(5)
36438 .n(16)
36439 .k(k)
36440 .a_stride(11)
36441 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36442 }
36443 }
36444
36445 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_gt_1_subtile) {
36446 TEST_REQUIRES_X86_FMA3;
36447 for (size_t k = 2; k < 10; k++) {
36448 for (uint32_t m = 1; m <= 5; m++) {
36449 for (uint32_t n = 1; n <= 16; n++) {
36450 GemmMicrokernelTester()
36451 .mr(5)
36452 .nr(16)
36453 .kr(1)
36454 .sr(1)
36455 .m(m)
36456 .n(n)
36457 .k(k)
36458 .iterations(1)
36459 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36460 }
36461 }
36462 }
36463 }
36464
36465 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16) {
36466 TEST_REQUIRES_X86_FMA3;
36467 for (uint32_t n = 17; n < 32; n++) {
36468 for (size_t k = 1; k <= 5; k += 2) {
36469 GemmMicrokernelTester()
36470 .mr(5)
36471 .nr(16)
36472 .kr(1)
36473 .sr(1)
36474 .m(5)
36475 .n(16)
36476 .k(k)
36477 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36478 }
36479 }
36480 }
36481
36482 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
36483 TEST_REQUIRES_X86_FMA3;
36484 for (uint32_t n = 17; n < 32; n++) {
36485 for (size_t k = 1; k <= 5; k += 2) {
36486 GemmMicrokernelTester()
36487 .mr(5)
36488 .nr(16)
36489 .kr(1)
36490 .sr(1)
36491 .m(5)
36492 .n(16)
36493 .k(k)
36494 .cn_stride(19)
36495 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36496 }
36497 }
36498 }
36499
36500 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16_strided_a) {
36501 TEST_REQUIRES_X86_FMA3;
36502 for (uint32_t n = 17; n < 32; n++) {
36503 for (size_t k = 1; k <= 5; k += 2) {
36504 GemmMicrokernelTester()
36505 .mr(5)
36506 .nr(16)
36507 .kr(1)
36508 .sr(1)
36509 .m(5)
36510 .n(n)
36511 .k(k)
36512 .a_stride(7)
36513 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36514 }
36515 }
36516 }
36517
36518 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16_subtile) {
36519 TEST_REQUIRES_X86_FMA3;
36520 for (uint32_t n = 17; n < 32; n++) {
36521 for (size_t k = 1; k <= 5; k += 2) {
36522 for (uint32_t m = 1; m <= 5; m++) {
36523 GemmMicrokernelTester()
36524 .mr(5)
36525 .nr(16)
36526 .kr(1)
36527 .sr(1)
36528 .m(m)
36529 .n(n)
36530 .k(k)
36531 .iterations(1)
36532 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36533 }
36534 }
36535 }
36536 }
36537
36538 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16) {
36539 TEST_REQUIRES_X86_FMA3;
36540 for (uint32_t n = 32; n <= 48; n += 16) {
36541 for (size_t k = 1; k <= 5; k += 2) {
36542 GemmMicrokernelTester()
36543 .mr(5)
36544 .nr(16)
36545 .kr(1)
36546 .sr(1)
36547 .m(5)
36548 .n(16)
36549 .k(k)
36550 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36551 }
36552 }
36553 }
36554
36555 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16_strided_cn) {
36556 TEST_REQUIRES_X86_FMA3;
36557 for (uint32_t n = 32; n <= 48; n += 16) {
36558 for (size_t k = 1; k <= 5; k += 2) {
36559 GemmMicrokernelTester()
36560 .mr(5)
36561 .nr(16)
36562 .kr(1)
36563 .sr(1)
36564 .m(5)
36565 .n(n)
36566 .k(k)
36567 .cn_stride(19)
36568 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36569 }
36570 }
36571 }
36572
36573 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16_strided_a) {
36574 TEST_REQUIRES_X86_FMA3;
36575 for (uint32_t n = 32; n <= 48; n += 16) {
36576 for (size_t k = 1; k <= 5; k += 2) {
36577 GemmMicrokernelTester()
36578 .mr(5)
36579 .nr(16)
36580 .kr(1)
36581 .sr(1)
36582 .m(5)
36583 .n(n)
36584 .k(k)
36585 .a_stride(7)
36586 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36587 }
36588 }
36589 }
36590
36591 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16_subtile) {
36592 TEST_REQUIRES_X86_FMA3;
36593 for (uint32_t n = 32; n <= 48; n += 16) {
36594 for (size_t k = 1; k <= 5; k += 2) {
36595 for (uint32_t m = 1; m <= 5; m++) {
36596 GemmMicrokernelTester()
36597 .mr(5)
36598 .nr(16)
36599 .kr(1)
36600 .sr(1)
36601 .m(m)
36602 .n(n)
36603 .k(k)
36604 .iterations(1)
36605 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36606 }
36607 }
36608 }
36609 }
36610
36611 TEST(F32_GEMM_5X16__FMA3_BROADCAST, strided_cm_subtile) {
36612 TEST_REQUIRES_X86_FMA3;
36613 for (size_t k = 1; k <= 5; k += 2) {
36614 for (uint32_t m = 1; m <= 5; m++) {
36615 for (uint32_t n = 1; n <= 16; n++) {
36616 GemmMicrokernelTester()
36617 .mr(5)
36618 .nr(16)
36619 .kr(1)
36620 .sr(1)
36621 .m(m)
36622 .n(n)
36623 .k(k)
36624 .cm_stride(19)
36625 .iterations(1)
36626 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36627 }
36628 }
36629 }
36630 }
36631
36632 TEST(F32_GEMM_5X16__FMA3_BROADCAST, qmin) {
36633 TEST_REQUIRES_X86_FMA3;
36634 GemmMicrokernelTester()
36635 .mr(5)
36636 .nr(16)
36637 .kr(1)
36638 .sr(1)
36639 .m(5)
36640 .n(16)
36641 .k(1)
36642 .qmin(128)
36643 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36644 }
36645
36646 TEST(F32_GEMM_5X16__FMA3_BROADCAST, qmax) {
36647 TEST_REQUIRES_X86_FMA3;
36648 GemmMicrokernelTester()
36649 .mr(5)
36650 .nr(16)
36651 .kr(1)
36652 .sr(1)
36653 .m(5)
36654 .n(16)
36655 .k(1)
36656 .qmax(128)
36657 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36658 }
36659
36660 TEST(F32_GEMM_5X16__FMA3_BROADCAST, strided_cm) {
36661 TEST_REQUIRES_X86_FMA3;
36662 GemmMicrokernelTester()
36663 .mr(5)
36664 .nr(16)
36665 .kr(1)
36666 .sr(1)
36667 .m(5)
36668 .n(16)
36669 .k(1)
36670 .cm_stride(19)
36671 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
36672 }
36673#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36674
36675
36676#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36677 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4) {
36678 TEST_REQUIRES_X86_FMA3;
36679 GemmMicrokernelTester()
36680 .mr(1)
36681 .nr(16)
36682 .kr(1)
36683 .sr(4)
36684 .m(1)
36685 .n(16)
36686 .k(4)
36687 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36688 }
36689
36690 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, strided_cn) {
36691 TEST_REQUIRES_X86_FMA3;
36692 GemmMicrokernelTester()
36693 .mr(1)
36694 .nr(16)
36695 .kr(1)
36696 .sr(4)
36697 .m(1)
36698 .n(16)
36699 .k(4)
36700 .cn_stride(19)
36701 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36702 }
36703
36704 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
36705 TEST_REQUIRES_X86_FMA3;
36706 GemmMicrokernelTester()
36707 .mr(1)
36708 .nr(16)
36709 .kr(1)
36710 .sr(4)
36711 .m(1)
36712 .n(16)
36713 .k(4)
36714 .a_stride(7)
36715 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36716 }
36717
36718 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
36719 TEST_REQUIRES_X86_FMA3;
36720 for (uint32_t m = 1; m <= 1; m++) {
36721 for (uint32_t n = 1; n <= 16; n++) {
36722 GemmMicrokernelTester()
36723 .mr(1)
36724 .nr(16)
36725 .kr(1)
36726 .sr(4)
36727 .m(m)
36728 .n(n)
36729 .k(4)
36730 .iterations(1)
36731 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36732 }
36733 }
36734 }
36735
36736 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
36737 TEST_REQUIRES_X86_FMA3;
36738 for (uint32_t m = 1; m <= 1; m++) {
36739 GemmMicrokernelTester()
36740 .mr(1)
36741 .nr(16)
36742 .kr(1)
36743 .sr(4)
36744 .m(m)
36745 .n(16)
36746 .k(4)
36747 .iterations(1)
36748 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36749 }
36750 }
36751
36752 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
36753 TEST_REQUIRES_X86_FMA3;
36754 for (uint32_t n = 1; n <= 16; n++) {
36755 GemmMicrokernelTester()
36756 .mr(1)
36757 .nr(16)
36758 .kr(1)
36759 .sr(4)
36760 .m(1)
36761 .n(n)
36762 .k(4)
36763 .iterations(1)
36764 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36765 }
36766 }
36767
36768 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_lt_4) {
36769 TEST_REQUIRES_X86_FMA3;
36770 for (size_t k = 1; k < 4; k++) {
36771 GemmMicrokernelTester()
36772 .mr(1)
36773 .nr(16)
36774 .kr(1)
36775 .sr(4)
36776 .m(1)
36777 .n(16)
36778 .k(k)
36779 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36780 }
36781 }
36782
36783 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
36784 TEST_REQUIRES_X86_FMA3;
36785 for (size_t k = 1; k < 4; k++) {
36786 GemmMicrokernelTester()
36787 .mr(1)
36788 .nr(16)
36789 .kr(1)
36790 .sr(4)
36791 .m(1)
36792 .n(16)
36793 .k(k)
36794 .a_stride(7)
36795 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36796 }
36797 }
36798
36799 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
36800 TEST_REQUIRES_X86_FMA3;
36801 for (size_t k = 1; k < 4; k++) {
36802 for (uint32_t m = 1; m <= 1; m++) {
36803 for (uint32_t n = 1; n <= 16; n++) {
36804 GemmMicrokernelTester()
36805 .mr(1)
36806 .nr(16)
36807 .kr(1)
36808 .sr(4)
36809 .m(m)
36810 .n(n)
36811 .k(k)
36812 .iterations(1)
36813 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36814 }
36815 }
36816 }
36817 }
36818
36819 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_gt_4) {
36820 TEST_REQUIRES_X86_FMA3;
36821 for (size_t k = 5; k < 8; k++) {
36822 GemmMicrokernelTester()
36823 .mr(1)
36824 .nr(16)
36825 .kr(1)
36826 .sr(4)
36827 .m(1)
36828 .n(16)
36829 .k(k)
36830 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36831 }
36832 }
36833
36834 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
36835 TEST_REQUIRES_X86_FMA3;
36836 for (size_t k = 5; k < 8; k++) {
36837 GemmMicrokernelTester()
36838 .mr(1)
36839 .nr(16)
36840 .kr(1)
36841 .sr(4)
36842 .m(1)
36843 .n(16)
36844 .k(k)
36845 .a_stride(11)
36846 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36847 }
36848 }
36849
36850 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
36851 TEST_REQUIRES_X86_FMA3;
36852 for (size_t k = 5; k < 8; k++) {
36853 for (uint32_t m = 1; m <= 1; m++) {
36854 for (uint32_t n = 1; n <= 16; n++) {
36855 GemmMicrokernelTester()
36856 .mr(1)
36857 .nr(16)
36858 .kr(1)
36859 .sr(4)
36860 .m(m)
36861 .n(n)
36862 .k(k)
36863 .iterations(1)
36864 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36865 }
36866 }
36867 }
36868 }
36869
36870 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_div_4) {
36871 TEST_REQUIRES_X86_FMA3;
36872 for (size_t k = 8; k <= 40; k += 4) {
36873 GemmMicrokernelTester()
36874 .mr(1)
36875 .nr(16)
36876 .kr(1)
36877 .sr(4)
36878 .m(1)
36879 .n(16)
36880 .k(k)
36881 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36882 }
36883 }
36884
36885 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
36886 TEST_REQUIRES_X86_FMA3;
36887 for (size_t k = 8; k <= 40; k += 4) {
36888 GemmMicrokernelTester()
36889 .mr(1)
36890 .nr(16)
36891 .kr(1)
36892 .sr(4)
36893 .m(1)
36894 .n(16)
36895 .k(k)
36896 .a_stride(43)
36897 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36898 }
36899 }
36900
36901 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_div_4_subtile) {
36902 TEST_REQUIRES_X86_FMA3;
36903 for (size_t k = 8; k <= 40; k += 4) {
36904 for (uint32_t m = 1; m <= 1; m++) {
36905 for (uint32_t n = 1; n <= 16; n++) {
36906 GemmMicrokernelTester()
36907 .mr(1)
36908 .nr(16)
36909 .kr(1)
36910 .sr(4)
36911 .m(m)
36912 .n(n)
36913 .k(k)
36914 .iterations(1)
36915 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36916 }
36917 }
36918 }
36919 }
36920
36921 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16) {
36922 TEST_REQUIRES_X86_FMA3;
36923 for (uint32_t n = 17; n < 32; n++) {
36924 for (size_t k = 1; k <= 20; k += 5) {
36925 GemmMicrokernelTester()
36926 .mr(1)
36927 .nr(16)
36928 .kr(1)
36929 .sr(4)
36930 .m(1)
36931 .n(16)
36932 .k(k)
36933 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36934 }
36935 }
36936 }
36937
36938 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
36939 TEST_REQUIRES_X86_FMA3;
36940 for (uint32_t n = 17; n < 32; n++) {
36941 for (size_t k = 1; k <= 20; k += 5) {
36942 GemmMicrokernelTester()
36943 .mr(1)
36944 .nr(16)
36945 .kr(1)
36946 .sr(4)
36947 .m(1)
36948 .n(16)
36949 .k(k)
36950 .cn_stride(19)
36951 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36952 }
36953 }
36954 }
36955
36956 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
36957 TEST_REQUIRES_X86_FMA3;
36958 for (uint32_t n = 17; n < 32; n++) {
36959 for (size_t k = 1; k <= 20; k += 5) {
36960 GemmMicrokernelTester()
36961 .mr(1)
36962 .nr(16)
36963 .kr(1)
36964 .sr(4)
36965 .m(1)
36966 .n(n)
36967 .k(k)
36968 .a_stride(23)
36969 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36970 }
36971 }
36972 }
36973
36974 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
36975 TEST_REQUIRES_X86_FMA3;
36976 for (uint32_t n = 17; n < 32; n++) {
36977 for (size_t k = 1; k <= 20; k += 5) {
36978 for (uint32_t m = 1; m <= 1; m++) {
36979 GemmMicrokernelTester()
36980 .mr(1)
36981 .nr(16)
36982 .kr(1)
36983 .sr(4)
36984 .m(m)
36985 .n(n)
36986 .k(k)
36987 .iterations(1)
36988 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
36989 }
36990 }
36991 }
36992 }
36993
36994 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16) {
36995 TEST_REQUIRES_X86_FMA3;
36996 for (uint32_t n = 32; n <= 48; n += 16) {
36997 for (size_t k = 1; k <= 20; k += 5) {
36998 GemmMicrokernelTester()
36999 .mr(1)
37000 .nr(16)
37001 .kr(1)
37002 .sr(4)
37003 .m(1)
37004 .n(16)
37005 .k(k)
37006 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37007 }
37008 }
37009 }
37010
37011 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
37012 TEST_REQUIRES_X86_FMA3;
37013 for (uint32_t n = 32; n <= 48; n += 16) {
37014 for (size_t k = 1; k <= 20; k += 5) {
37015 GemmMicrokernelTester()
37016 .mr(1)
37017 .nr(16)
37018 .kr(1)
37019 .sr(4)
37020 .m(1)
37021 .n(n)
37022 .k(k)
37023 .cn_stride(19)
37024 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37025 }
37026 }
37027 }
37028
37029 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
37030 TEST_REQUIRES_X86_FMA3;
37031 for (uint32_t n = 32; n <= 48; n += 16) {
37032 for (size_t k = 1; k <= 20; k += 5) {
37033 GemmMicrokernelTester()
37034 .mr(1)
37035 .nr(16)
37036 .kr(1)
37037 .sr(4)
37038 .m(1)
37039 .n(n)
37040 .k(k)
37041 .a_stride(23)
37042 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37043 }
37044 }
37045 }
37046
37047 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16_subtile) {
37048 TEST_REQUIRES_X86_FMA3;
37049 for (uint32_t n = 32; n <= 48; n += 16) {
37050 for (size_t k = 1; k <= 20; k += 5) {
37051 for (uint32_t m = 1; m <= 1; m++) {
37052 GemmMicrokernelTester()
37053 .mr(1)
37054 .nr(16)
37055 .kr(1)
37056 .sr(4)
37057 .m(m)
37058 .n(n)
37059 .k(k)
37060 .iterations(1)
37061 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37062 }
37063 }
37064 }
37065 }
37066
37067 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, strided_cm_subtile) {
37068 TEST_REQUIRES_X86_FMA3;
37069 for (size_t k = 1; k <= 20; k += 5) {
37070 for (uint32_t m = 1; m <= 1; m++) {
37071 for (uint32_t n = 1; n <= 16; n++) {
37072 GemmMicrokernelTester()
37073 .mr(1)
37074 .nr(16)
37075 .kr(1)
37076 .sr(4)
37077 .m(m)
37078 .n(n)
37079 .k(k)
37080 .cm_stride(19)
37081 .iterations(1)
37082 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37083 }
37084 }
37085 }
37086 }
37087
37088 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, qmin) {
37089 TEST_REQUIRES_X86_FMA3;
37090 GemmMicrokernelTester()
37091 .mr(1)
37092 .nr(16)
37093 .kr(1)
37094 .sr(4)
37095 .m(1)
37096 .n(16)
37097 .k(4)
37098 .qmin(128)
37099 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37100 }
37101
37102 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, qmax) {
37103 TEST_REQUIRES_X86_FMA3;
37104 GemmMicrokernelTester()
37105 .mr(1)
37106 .nr(16)
37107 .kr(1)
37108 .sr(4)
37109 .m(1)
37110 .n(16)
37111 .k(4)
37112 .qmax(128)
37113 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37114 }
37115
37116 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, strided_cm) {
37117 TEST_REQUIRES_X86_FMA3;
37118 GemmMicrokernelTester()
37119 .mr(1)
37120 .nr(16)
37121 .kr(1)
37122 .sr(4)
37123 .m(1)
37124 .n(16)
37125 .k(4)
37126 .cm_stride(19)
37127 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
37128 }
37129#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37130
37131
37132#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37133 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4) {
37134 TEST_REQUIRES_X86_FMA3;
37135 GemmMicrokernelTester()
37136 .mr(3)
37137 .nr(16)
37138 .kr(1)
37139 .sr(4)
37140 .m(3)
37141 .n(16)
37142 .k(4)
37143 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37144 }
37145
37146 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, strided_cn) {
37147 TEST_REQUIRES_X86_FMA3;
37148 GemmMicrokernelTester()
37149 .mr(3)
37150 .nr(16)
37151 .kr(1)
37152 .sr(4)
37153 .m(3)
37154 .n(16)
37155 .k(4)
37156 .cn_stride(19)
37157 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37158 }
37159
37160 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
37161 TEST_REQUIRES_X86_FMA3;
37162 GemmMicrokernelTester()
37163 .mr(3)
37164 .nr(16)
37165 .kr(1)
37166 .sr(4)
37167 .m(3)
37168 .n(16)
37169 .k(4)
37170 .a_stride(7)
37171 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37172 }
37173
37174 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
37175 TEST_REQUIRES_X86_FMA3;
37176 for (uint32_t m = 1; m <= 3; m++) {
37177 for (uint32_t n = 1; n <= 16; n++) {
37178 GemmMicrokernelTester()
37179 .mr(3)
37180 .nr(16)
37181 .kr(1)
37182 .sr(4)
37183 .m(m)
37184 .n(n)
37185 .k(4)
37186 .iterations(1)
37187 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37188 }
37189 }
37190 }
37191
37192 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
37193 TEST_REQUIRES_X86_FMA3;
37194 for (uint32_t m = 1; m <= 3; m++) {
37195 GemmMicrokernelTester()
37196 .mr(3)
37197 .nr(16)
37198 .kr(1)
37199 .sr(4)
37200 .m(m)
37201 .n(16)
37202 .k(4)
37203 .iterations(1)
37204 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37205 }
37206 }
37207
37208 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
37209 TEST_REQUIRES_X86_FMA3;
37210 for (uint32_t n = 1; n <= 16; n++) {
37211 GemmMicrokernelTester()
37212 .mr(3)
37213 .nr(16)
37214 .kr(1)
37215 .sr(4)
37216 .m(3)
37217 .n(n)
37218 .k(4)
37219 .iterations(1)
37220 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37221 }
37222 }
37223
37224 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_lt_4) {
37225 TEST_REQUIRES_X86_FMA3;
37226 for (size_t k = 1; k < 4; k++) {
37227 GemmMicrokernelTester()
37228 .mr(3)
37229 .nr(16)
37230 .kr(1)
37231 .sr(4)
37232 .m(3)
37233 .n(16)
37234 .k(k)
37235 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37236 }
37237 }
37238
37239 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
37240 TEST_REQUIRES_X86_FMA3;
37241 for (size_t k = 1; k < 4; k++) {
37242 GemmMicrokernelTester()
37243 .mr(3)
37244 .nr(16)
37245 .kr(1)
37246 .sr(4)
37247 .m(3)
37248 .n(16)
37249 .k(k)
37250 .a_stride(7)
37251 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37252 }
37253 }
37254
37255 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
37256 TEST_REQUIRES_X86_FMA3;
37257 for (size_t k = 1; k < 4; k++) {
37258 for (uint32_t m = 1; m <= 3; m++) {
37259 for (uint32_t n = 1; n <= 16; n++) {
37260 GemmMicrokernelTester()
37261 .mr(3)
37262 .nr(16)
37263 .kr(1)
37264 .sr(4)
37265 .m(m)
37266 .n(n)
37267 .k(k)
37268 .iterations(1)
37269 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37270 }
37271 }
37272 }
37273 }
37274
37275 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_gt_4) {
37276 TEST_REQUIRES_X86_FMA3;
37277 for (size_t k = 5; k < 8; k++) {
37278 GemmMicrokernelTester()
37279 .mr(3)
37280 .nr(16)
37281 .kr(1)
37282 .sr(4)
37283 .m(3)
37284 .n(16)
37285 .k(k)
37286 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37287 }
37288 }
37289
37290 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
37291 TEST_REQUIRES_X86_FMA3;
37292 for (size_t k = 5; k < 8; k++) {
37293 GemmMicrokernelTester()
37294 .mr(3)
37295 .nr(16)
37296 .kr(1)
37297 .sr(4)
37298 .m(3)
37299 .n(16)
37300 .k(k)
37301 .a_stride(11)
37302 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37303 }
37304 }
37305
37306 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
37307 TEST_REQUIRES_X86_FMA3;
37308 for (size_t k = 5; k < 8; k++) {
37309 for (uint32_t m = 1; m <= 3; m++) {
37310 for (uint32_t n = 1; n <= 16; n++) {
37311 GemmMicrokernelTester()
37312 .mr(3)
37313 .nr(16)
37314 .kr(1)
37315 .sr(4)
37316 .m(m)
37317 .n(n)
37318 .k(k)
37319 .iterations(1)
37320 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37321 }
37322 }
37323 }
37324 }
37325
37326 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_div_4) {
37327 TEST_REQUIRES_X86_FMA3;
37328 for (size_t k = 8; k <= 40; k += 4) {
37329 GemmMicrokernelTester()
37330 .mr(3)
37331 .nr(16)
37332 .kr(1)
37333 .sr(4)
37334 .m(3)
37335 .n(16)
37336 .k(k)
37337 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37338 }
37339 }
37340
37341 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
37342 TEST_REQUIRES_X86_FMA3;
37343 for (size_t k = 8; k <= 40; k += 4) {
37344 GemmMicrokernelTester()
37345 .mr(3)
37346 .nr(16)
37347 .kr(1)
37348 .sr(4)
37349 .m(3)
37350 .n(16)
37351 .k(k)
37352 .a_stride(43)
37353 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37354 }
37355 }
37356
37357 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_div_4_subtile) {
37358 TEST_REQUIRES_X86_FMA3;
37359 for (size_t k = 8; k <= 40; k += 4) {
37360 for (uint32_t m = 1; m <= 3; m++) {
37361 for (uint32_t n = 1; n <= 16; n++) {
37362 GemmMicrokernelTester()
37363 .mr(3)
37364 .nr(16)
37365 .kr(1)
37366 .sr(4)
37367 .m(m)
37368 .n(n)
37369 .k(k)
37370 .iterations(1)
37371 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37372 }
37373 }
37374 }
37375 }
37376
37377 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16) {
37378 TEST_REQUIRES_X86_FMA3;
37379 for (uint32_t n = 17; n < 32; n++) {
37380 for (size_t k = 1; k <= 20; k += 5) {
37381 GemmMicrokernelTester()
37382 .mr(3)
37383 .nr(16)
37384 .kr(1)
37385 .sr(4)
37386 .m(3)
37387 .n(16)
37388 .k(k)
37389 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37390 }
37391 }
37392 }
37393
37394 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
37395 TEST_REQUIRES_X86_FMA3;
37396 for (uint32_t n = 17; n < 32; n++) {
37397 for (size_t k = 1; k <= 20; k += 5) {
37398 GemmMicrokernelTester()
37399 .mr(3)
37400 .nr(16)
37401 .kr(1)
37402 .sr(4)
37403 .m(3)
37404 .n(16)
37405 .k(k)
37406 .cn_stride(19)
37407 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37408 }
37409 }
37410 }
37411
37412 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
37413 TEST_REQUIRES_X86_FMA3;
37414 for (uint32_t n = 17; n < 32; n++) {
37415 for (size_t k = 1; k <= 20; k += 5) {
37416 GemmMicrokernelTester()
37417 .mr(3)
37418 .nr(16)
37419 .kr(1)
37420 .sr(4)
37421 .m(3)
37422 .n(n)
37423 .k(k)
37424 .a_stride(23)
37425 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37426 }
37427 }
37428 }
37429
37430 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
37431 TEST_REQUIRES_X86_FMA3;
37432 for (uint32_t n = 17; n < 32; n++) {
37433 for (size_t k = 1; k <= 20; k += 5) {
37434 for (uint32_t m = 1; m <= 3; m++) {
37435 GemmMicrokernelTester()
37436 .mr(3)
37437 .nr(16)
37438 .kr(1)
37439 .sr(4)
37440 .m(m)
37441 .n(n)
37442 .k(k)
37443 .iterations(1)
37444 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37445 }
37446 }
37447 }
37448 }
37449
37450 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16) {
37451 TEST_REQUIRES_X86_FMA3;
37452 for (uint32_t n = 32; n <= 48; n += 16) {
37453 for (size_t k = 1; k <= 20; k += 5) {
37454 GemmMicrokernelTester()
37455 .mr(3)
37456 .nr(16)
37457 .kr(1)
37458 .sr(4)
37459 .m(3)
37460 .n(16)
37461 .k(k)
37462 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37463 }
37464 }
37465 }
37466
37467 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
37468 TEST_REQUIRES_X86_FMA3;
37469 for (uint32_t n = 32; n <= 48; n += 16) {
37470 for (size_t k = 1; k <= 20; k += 5) {
37471 GemmMicrokernelTester()
37472 .mr(3)
37473 .nr(16)
37474 .kr(1)
37475 .sr(4)
37476 .m(3)
37477 .n(n)
37478 .k(k)
37479 .cn_stride(19)
37480 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37481 }
37482 }
37483 }
37484
37485 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
37486 TEST_REQUIRES_X86_FMA3;
37487 for (uint32_t n = 32; n <= 48; n += 16) {
37488 for (size_t k = 1; k <= 20; k += 5) {
37489 GemmMicrokernelTester()
37490 .mr(3)
37491 .nr(16)
37492 .kr(1)
37493 .sr(4)
37494 .m(3)
37495 .n(n)
37496 .k(k)
37497 .a_stride(23)
37498 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37499 }
37500 }
37501 }
37502
37503 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16_subtile) {
37504 TEST_REQUIRES_X86_FMA3;
37505 for (uint32_t n = 32; n <= 48; n += 16) {
37506 for (size_t k = 1; k <= 20; k += 5) {
37507 for (uint32_t m = 1; m <= 3; m++) {
37508 GemmMicrokernelTester()
37509 .mr(3)
37510 .nr(16)
37511 .kr(1)
37512 .sr(4)
37513 .m(m)
37514 .n(n)
37515 .k(k)
37516 .iterations(1)
37517 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37518 }
37519 }
37520 }
37521 }
37522
37523 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, strided_cm_subtile) {
37524 TEST_REQUIRES_X86_FMA3;
37525 for (size_t k = 1; k <= 20; k += 5) {
37526 for (uint32_t m = 1; m <= 3; m++) {
37527 for (uint32_t n = 1; n <= 16; n++) {
37528 GemmMicrokernelTester()
37529 .mr(3)
37530 .nr(16)
37531 .kr(1)
37532 .sr(4)
37533 .m(m)
37534 .n(n)
37535 .k(k)
37536 .cm_stride(19)
37537 .iterations(1)
37538 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37539 }
37540 }
37541 }
37542 }
37543
37544 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, qmin) {
37545 TEST_REQUIRES_X86_FMA3;
37546 GemmMicrokernelTester()
37547 .mr(3)
37548 .nr(16)
37549 .kr(1)
37550 .sr(4)
37551 .m(3)
37552 .n(16)
37553 .k(4)
37554 .qmin(128)
37555 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37556 }
37557
37558 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, qmax) {
37559 TEST_REQUIRES_X86_FMA3;
37560 GemmMicrokernelTester()
37561 .mr(3)
37562 .nr(16)
37563 .kr(1)
37564 .sr(4)
37565 .m(3)
37566 .n(16)
37567 .k(4)
37568 .qmax(128)
37569 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37570 }
37571
37572 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, strided_cm) {
37573 TEST_REQUIRES_X86_FMA3;
37574 GemmMicrokernelTester()
37575 .mr(3)
37576 .nr(16)
37577 .kr(1)
37578 .sr(4)
37579 .m(3)
37580 .n(16)
37581 .k(4)
37582 .cm_stride(19)
37583 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
37584 }
37585#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37586
37587
37588#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37589 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4) {
37590 TEST_REQUIRES_X86_FMA3;
37591 GemmMicrokernelTester()
37592 .mr(4)
37593 .nr(16)
37594 .kr(1)
37595 .sr(4)
37596 .m(4)
37597 .n(16)
37598 .k(4)
37599 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37600 }
37601
37602 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, strided_cn) {
37603 TEST_REQUIRES_X86_FMA3;
37604 GemmMicrokernelTester()
37605 .mr(4)
37606 .nr(16)
37607 .kr(1)
37608 .sr(4)
37609 .m(4)
37610 .n(16)
37611 .k(4)
37612 .cn_stride(19)
37613 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37614 }
37615
37616 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
37617 TEST_REQUIRES_X86_FMA3;
37618 GemmMicrokernelTester()
37619 .mr(4)
37620 .nr(16)
37621 .kr(1)
37622 .sr(4)
37623 .m(4)
37624 .n(16)
37625 .k(4)
37626 .a_stride(7)
37627 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37628 }
37629
37630 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
37631 TEST_REQUIRES_X86_FMA3;
37632 for (uint32_t m = 1; m <= 4; m++) {
37633 for (uint32_t n = 1; n <= 16; n++) {
37634 GemmMicrokernelTester()
37635 .mr(4)
37636 .nr(16)
37637 .kr(1)
37638 .sr(4)
37639 .m(m)
37640 .n(n)
37641 .k(4)
37642 .iterations(1)
37643 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37644 }
37645 }
37646 }
37647
37648 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
37649 TEST_REQUIRES_X86_FMA3;
37650 for (uint32_t m = 1; m <= 4; m++) {
37651 GemmMicrokernelTester()
37652 .mr(4)
37653 .nr(16)
37654 .kr(1)
37655 .sr(4)
37656 .m(m)
37657 .n(16)
37658 .k(4)
37659 .iterations(1)
37660 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37661 }
37662 }
37663
37664 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
37665 TEST_REQUIRES_X86_FMA3;
37666 for (uint32_t n = 1; n <= 16; n++) {
37667 GemmMicrokernelTester()
37668 .mr(4)
37669 .nr(16)
37670 .kr(1)
37671 .sr(4)
37672 .m(4)
37673 .n(n)
37674 .k(4)
37675 .iterations(1)
37676 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37677 }
37678 }
37679
37680 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_lt_4) {
37681 TEST_REQUIRES_X86_FMA3;
37682 for (size_t k = 1; k < 4; k++) {
37683 GemmMicrokernelTester()
37684 .mr(4)
37685 .nr(16)
37686 .kr(1)
37687 .sr(4)
37688 .m(4)
37689 .n(16)
37690 .k(k)
37691 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37692 }
37693 }
37694
37695 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
37696 TEST_REQUIRES_X86_FMA3;
37697 for (size_t k = 1; k < 4; k++) {
37698 GemmMicrokernelTester()
37699 .mr(4)
37700 .nr(16)
37701 .kr(1)
37702 .sr(4)
37703 .m(4)
37704 .n(16)
37705 .k(k)
37706 .a_stride(7)
37707 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37708 }
37709 }
37710
37711 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
37712 TEST_REQUIRES_X86_FMA3;
37713 for (size_t k = 1; k < 4; k++) {
37714 for (uint32_t m = 1; m <= 4; m++) {
37715 for (uint32_t n = 1; n <= 16; n++) {
37716 GemmMicrokernelTester()
37717 .mr(4)
37718 .nr(16)
37719 .kr(1)
37720 .sr(4)
37721 .m(m)
37722 .n(n)
37723 .k(k)
37724 .iterations(1)
37725 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37726 }
37727 }
37728 }
37729 }
37730
37731 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_gt_4) {
37732 TEST_REQUIRES_X86_FMA3;
37733 for (size_t k = 5; k < 8; k++) {
37734 GemmMicrokernelTester()
37735 .mr(4)
37736 .nr(16)
37737 .kr(1)
37738 .sr(4)
37739 .m(4)
37740 .n(16)
37741 .k(k)
37742 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37743 }
37744 }
37745
37746 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
37747 TEST_REQUIRES_X86_FMA3;
37748 for (size_t k = 5; k < 8; k++) {
37749 GemmMicrokernelTester()
37750 .mr(4)
37751 .nr(16)
37752 .kr(1)
37753 .sr(4)
37754 .m(4)
37755 .n(16)
37756 .k(k)
37757 .a_stride(11)
37758 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37759 }
37760 }
37761
37762 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
37763 TEST_REQUIRES_X86_FMA3;
37764 for (size_t k = 5; k < 8; k++) {
37765 for (uint32_t m = 1; m <= 4; m++) {
37766 for (uint32_t n = 1; n <= 16; n++) {
37767 GemmMicrokernelTester()
37768 .mr(4)
37769 .nr(16)
37770 .kr(1)
37771 .sr(4)
37772 .m(m)
37773 .n(n)
37774 .k(k)
37775 .iterations(1)
37776 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37777 }
37778 }
37779 }
37780 }
37781
37782 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_div_4) {
37783 TEST_REQUIRES_X86_FMA3;
37784 for (size_t k = 8; k <= 40; k += 4) {
37785 GemmMicrokernelTester()
37786 .mr(4)
37787 .nr(16)
37788 .kr(1)
37789 .sr(4)
37790 .m(4)
37791 .n(16)
37792 .k(k)
37793 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37794 }
37795 }
37796
37797 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
37798 TEST_REQUIRES_X86_FMA3;
37799 for (size_t k = 8; k <= 40; k += 4) {
37800 GemmMicrokernelTester()
37801 .mr(4)
37802 .nr(16)
37803 .kr(1)
37804 .sr(4)
37805 .m(4)
37806 .n(16)
37807 .k(k)
37808 .a_stride(43)
37809 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37810 }
37811 }
37812
37813 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_div_4_subtile) {
37814 TEST_REQUIRES_X86_FMA3;
37815 for (size_t k = 8; k <= 40; k += 4) {
37816 for (uint32_t m = 1; m <= 4; m++) {
37817 for (uint32_t n = 1; n <= 16; n++) {
37818 GemmMicrokernelTester()
37819 .mr(4)
37820 .nr(16)
37821 .kr(1)
37822 .sr(4)
37823 .m(m)
37824 .n(n)
37825 .k(k)
37826 .iterations(1)
37827 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37828 }
37829 }
37830 }
37831 }
37832
37833 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16) {
37834 TEST_REQUIRES_X86_FMA3;
37835 for (uint32_t n = 17; n < 32; n++) {
37836 for (size_t k = 1; k <= 20; k += 5) {
37837 GemmMicrokernelTester()
37838 .mr(4)
37839 .nr(16)
37840 .kr(1)
37841 .sr(4)
37842 .m(4)
37843 .n(16)
37844 .k(k)
37845 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37846 }
37847 }
37848 }
37849
37850 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
37851 TEST_REQUIRES_X86_FMA3;
37852 for (uint32_t n = 17; n < 32; n++) {
37853 for (size_t k = 1; k <= 20; k += 5) {
37854 GemmMicrokernelTester()
37855 .mr(4)
37856 .nr(16)
37857 .kr(1)
37858 .sr(4)
37859 .m(4)
37860 .n(16)
37861 .k(k)
37862 .cn_stride(19)
37863 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37864 }
37865 }
37866 }
37867
37868 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
37869 TEST_REQUIRES_X86_FMA3;
37870 for (uint32_t n = 17; n < 32; n++) {
37871 for (size_t k = 1; k <= 20; k += 5) {
37872 GemmMicrokernelTester()
37873 .mr(4)
37874 .nr(16)
37875 .kr(1)
37876 .sr(4)
37877 .m(4)
37878 .n(n)
37879 .k(k)
37880 .a_stride(23)
37881 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37882 }
37883 }
37884 }
37885
37886 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
37887 TEST_REQUIRES_X86_FMA3;
37888 for (uint32_t n = 17; n < 32; n++) {
37889 for (size_t k = 1; k <= 20; k += 5) {
37890 for (uint32_t m = 1; m <= 4; m++) {
37891 GemmMicrokernelTester()
37892 .mr(4)
37893 .nr(16)
37894 .kr(1)
37895 .sr(4)
37896 .m(m)
37897 .n(n)
37898 .k(k)
37899 .iterations(1)
37900 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37901 }
37902 }
37903 }
37904 }
37905
37906 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16) {
37907 TEST_REQUIRES_X86_FMA3;
37908 for (uint32_t n = 32; n <= 48; n += 16) {
37909 for (size_t k = 1; k <= 20; k += 5) {
37910 GemmMicrokernelTester()
37911 .mr(4)
37912 .nr(16)
37913 .kr(1)
37914 .sr(4)
37915 .m(4)
37916 .n(16)
37917 .k(k)
37918 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37919 }
37920 }
37921 }
37922
37923 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
37924 TEST_REQUIRES_X86_FMA3;
37925 for (uint32_t n = 32; n <= 48; n += 16) {
37926 for (size_t k = 1; k <= 20; k += 5) {
37927 GemmMicrokernelTester()
37928 .mr(4)
37929 .nr(16)
37930 .kr(1)
37931 .sr(4)
37932 .m(4)
37933 .n(n)
37934 .k(k)
37935 .cn_stride(19)
37936 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37937 }
37938 }
37939 }
37940
37941 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
37942 TEST_REQUIRES_X86_FMA3;
37943 for (uint32_t n = 32; n <= 48; n += 16) {
37944 for (size_t k = 1; k <= 20; k += 5) {
37945 GemmMicrokernelTester()
37946 .mr(4)
37947 .nr(16)
37948 .kr(1)
37949 .sr(4)
37950 .m(4)
37951 .n(n)
37952 .k(k)
37953 .a_stride(23)
37954 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37955 }
37956 }
37957 }
37958
37959 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16_subtile) {
37960 TEST_REQUIRES_X86_FMA3;
37961 for (uint32_t n = 32; n <= 48; n += 16) {
37962 for (size_t k = 1; k <= 20; k += 5) {
37963 for (uint32_t m = 1; m <= 4; m++) {
37964 GemmMicrokernelTester()
37965 .mr(4)
37966 .nr(16)
37967 .kr(1)
37968 .sr(4)
37969 .m(m)
37970 .n(n)
37971 .k(k)
37972 .iterations(1)
37973 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37974 }
37975 }
37976 }
37977 }
37978
37979 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, strided_cm_subtile) {
37980 TEST_REQUIRES_X86_FMA3;
37981 for (size_t k = 1; k <= 20; k += 5) {
37982 for (uint32_t m = 1; m <= 4; m++) {
37983 for (uint32_t n = 1; n <= 16; n++) {
37984 GemmMicrokernelTester()
37985 .mr(4)
37986 .nr(16)
37987 .kr(1)
37988 .sr(4)
37989 .m(m)
37990 .n(n)
37991 .k(k)
37992 .cm_stride(19)
37993 .iterations(1)
37994 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
37995 }
37996 }
37997 }
37998 }
37999
38000 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, qmin) {
38001 TEST_REQUIRES_X86_FMA3;
38002 GemmMicrokernelTester()
38003 .mr(4)
38004 .nr(16)
38005 .kr(1)
38006 .sr(4)
38007 .m(4)
38008 .n(16)
38009 .k(4)
38010 .qmin(128)
38011 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
38012 }
38013
38014 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, qmax) {
38015 TEST_REQUIRES_X86_FMA3;
38016 GemmMicrokernelTester()
38017 .mr(4)
38018 .nr(16)
38019 .kr(1)
38020 .sr(4)
38021 .m(4)
38022 .n(16)
38023 .k(4)
38024 .qmax(128)
38025 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
38026 }
38027
38028 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, strided_cm) {
38029 TEST_REQUIRES_X86_FMA3;
38030 GemmMicrokernelTester()
38031 .mr(4)
38032 .nr(16)
38033 .kr(1)
38034 .sr(4)
38035 .m(4)
38036 .n(16)
38037 .k(4)
38038 .cm_stride(19)
38039 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
38040 }
38041#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38042
38043
38044#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38045 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4) {
38046 TEST_REQUIRES_X86_FMA3;
38047 GemmMicrokernelTester()
38048 .mr(5)
38049 .nr(16)
38050 .kr(1)
38051 .sr(4)
38052 .m(5)
38053 .n(16)
38054 .k(4)
38055 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38056 }
38057
38058 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, strided_cn) {
38059 TEST_REQUIRES_X86_FMA3;
38060 GemmMicrokernelTester()
38061 .mr(5)
38062 .nr(16)
38063 .kr(1)
38064 .sr(4)
38065 .m(5)
38066 .n(16)
38067 .k(4)
38068 .cn_stride(19)
38069 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38070 }
38071
38072 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
38073 TEST_REQUIRES_X86_FMA3;
38074 GemmMicrokernelTester()
38075 .mr(5)
38076 .nr(16)
38077 .kr(1)
38078 .sr(4)
38079 .m(5)
38080 .n(16)
38081 .k(4)
38082 .a_stride(7)
38083 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38084 }
38085
38086 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
38087 TEST_REQUIRES_X86_FMA3;
38088 for (uint32_t m = 1; m <= 5; m++) {
38089 for (uint32_t n = 1; n <= 16; n++) {
38090 GemmMicrokernelTester()
38091 .mr(5)
38092 .nr(16)
38093 .kr(1)
38094 .sr(4)
38095 .m(m)
38096 .n(n)
38097 .k(4)
38098 .iterations(1)
38099 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38100 }
38101 }
38102 }
38103
38104 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
38105 TEST_REQUIRES_X86_FMA3;
38106 for (uint32_t m = 1; m <= 5; m++) {
38107 GemmMicrokernelTester()
38108 .mr(5)
38109 .nr(16)
38110 .kr(1)
38111 .sr(4)
38112 .m(m)
38113 .n(16)
38114 .k(4)
38115 .iterations(1)
38116 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38117 }
38118 }
38119
38120 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
38121 TEST_REQUIRES_X86_FMA3;
38122 for (uint32_t n = 1; n <= 16; n++) {
38123 GemmMicrokernelTester()
38124 .mr(5)
38125 .nr(16)
38126 .kr(1)
38127 .sr(4)
38128 .m(5)
38129 .n(n)
38130 .k(4)
38131 .iterations(1)
38132 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38133 }
38134 }
38135
38136 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_lt_4) {
38137 TEST_REQUIRES_X86_FMA3;
38138 for (size_t k = 1; k < 4; k++) {
38139 GemmMicrokernelTester()
38140 .mr(5)
38141 .nr(16)
38142 .kr(1)
38143 .sr(4)
38144 .m(5)
38145 .n(16)
38146 .k(k)
38147 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38148 }
38149 }
38150
38151 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
38152 TEST_REQUIRES_X86_FMA3;
38153 for (size_t k = 1; k < 4; k++) {
38154 GemmMicrokernelTester()
38155 .mr(5)
38156 .nr(16)
38157 .kr(1)
38158 .sr(4)
38159 .m(5)
38160 .n(16)
38161 .k(k)
38162 .a_stride(7)
38163 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38164 }
38165 }
38166
38167 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
38168 TEST_REQUIRES_X86_FMA3;
38169 for (size_t k = 1; k < 4; k++) {
38170 for (uint32_t m = 1; m <= 5; m++) {
38171 for (uint32_t n = 1; n <= 16; n++) {
38172 GemmMicrokernelTester()
38173 .mr(5)
38174 .nr(16)
38175 .kr(1)
38176 .sr(4)
38177 .m(m)
38178 .n(n)
38179 .k(k)
38180 .iterations(1)
38181 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38182 }
38183 }
38184 }
38185 }
38186
38187 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_gt_4) {
38188 TEST_REQUIRES_X86_FMA3;
38189 for (size_t k = 5; k < 8; k++) {
38190 GemmMicrokernelTester()
38191 .mr(5)
38192 .nr(16)
38193 .kr(1)
38194 .sr(4)
38195 .m(5)
38196 .n(16)
38197 .k(k)
38198 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38199 }
38200 }
38201
38202 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
38203 TEST_REQUIRES_X86_FMA3;
38204 for (size_t k = 5; k < 8; k++) {
38205 GemmMicrokernelTester()
38206 .mr(5)
38207 .nr(16)
38208 .kr(1)
38209 .sr(4)
38210 .m(5)
38211 .n(16)
38212 .k(k)
38213 .a_stride(11)
38214 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38215 }
38216 }
38217
38218 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
38219 TEST_REQUIRES_X86_FMA3;
38220 for (size_t k = 5; k < 8; k++) {
38221 for (uint32_t m = 1; m <= 5; m++) {
38222 for (uint32_t n = 1; n <= 16; n++) {
38223 GemmMicrokernelTester()
38224 .mr(5)
38225 .nr(16)
38226 .kr(1)
38227 .sr(4)
38228 .m(m)
38229 .n(n)
38230 .k(k)
38231 .iterations(1)
38232 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38233 }
38234 }
38235 }
38236 }
38237
38238 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_div_4) {
38239 TEST_REQUIRES_X86_FMA3;
38240 for (size_t k = 8; k <= 40; k += 4) {
38241 GemmMicrokernelTester()
38242 .mr(5)
38243 .nr(16)
38244 .kr(1)
38245 .sr(4)
38246 .m(5)
38247 .n(16)
38248 .k(k)
38249 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38250 }
38251 }
38252
38253 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
38254 TEST_REQUIRES_X86_FMA3;
38255 for (size_t k = 8; k <= 40; k += 4) {
38256 GemmMicrokernelTester()
38257 .mr(5)
38258 .nr(16)
38259 .kr(1)
38260 .sr(4)
38261 .m(5)
38262 .n(16)
38263 .k(k)
38264 .a_stride(43)
38265 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38266 }
38267 }
38268
38269 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_div_4_subtile) {
38270 TEST_REQUIRES_X86_FMA3;
38271 for (size_t k = 8; k <= 40; k += 4) {
38272 for (uint32_t m = 1; m <= 5; m++) {
38273 for (uint32_t n = 1; n <= 16; n++) {
38274 GemmMicrokernelTester()
38275 .mr(5)
38276 .nr(16)
38277 .kr(1)
38278 .sr(4)
38279 .m(m)
38280 .n(n)
38281 .k(k)
38282 .iterations(1)
38283 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38284 }
38285 }
38286 }
38287 }
38288
38289 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16) {
38290 TEST_REQUIRES_X86_FMA3;
38291 for (uint32_t n = 17; n < 32; n++) {
38292 for (size_t k = 1; k <= 20; k += 5) {
38293 GemmMicrokernelTester()
38294 .mr(5)
38295 .nr(16)
38296 .kr(1)
38297 .sr(4)
38298 .m(5)
38299 .n(16)
38300 .k(k)
38301 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38302 }
38303 }
38304 }
38305
38306 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
38307 TEST_REQUIRES_X86_FMA3;
38308 for (uint32_t n = 17; n < 32; n++) {
38309 for (size_t k = 1; k <= 20; k += 5) {
38310 GemmMicrokernelTester()
38311 .mr(5)
38312 .nr(16)
38313 .kr(1)
38314 .sr(4)
38315 .m(5)
38316 .n(16)
38317 .k(k)
38318 .cn_stride(19)
38319 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38320 }
38321 }
38322 }
38323
38324 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
38325 TEST_REQUIRES_X86_FMA3;
38326 for (uint32_t n = 17; n < 32; n++) {
38327 for (size_t k = 1; k <= 20; k += 5) {
38328 GemmMicrokernelTester()
38329 .mr(5)
38330 .nr(16)
38331 .kr(1)
38332 .sr(4)
38333 .m(5)
38334 .n(n)
38335 .k(k)
38336 .a_stride(23)
38337 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38338 }
38339 }
38340 }
38341
38342 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
38343 TEST_REQUIRES_X86_FMA3;
38344 for (uint32_t n = 17; n < 32; n++) {
38345 for (size_t k = 1; k <= 20; k += 5) {
38346 for (uint32_t m = 1; m <= 5; m++) {
38347 GemmMicrokernelTester()
38348 .mr(5)
38349 .nr(16)
38350 .kr(1)
38351 .sr(4)
38352 .m(m)
38353 .n(n)
38354 .k(k)
38355 .iterations(1)
38356 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38357 }
38358 }
38359 }
38360 }
38361
38362 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16) {
38363 TEST_REQUIRES_X86_FMA3;
38364 for (uint32_t n = 32; n <= 48; n += 16) {
38365 for (size_t k = 1; k <= 20; k += 5) {
38366 GemmMicrokernelTester()
38367 .mr(5)
38368 .nr(16)
38369 .kr(1)
38370 .sr(4)
38371 .m(5)
38372 .n(16)
38373 .k(k)
38374 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38375 }
38376 }
38377 }
38378
38379 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
38380 TEST_REQUIRES_X86_FMA3;
38381 for (uint32_t n = 32; n <= 48; n += 16) {
38382 for (size_t k = 1; k <= 20; k += 5) {
38383 GemmMicrokernelTester()
38384 .mr(5)
38385 .nr(16)
38386 .kr(1)
38387 .sr(4)
38388 .m(5)
38389 .n(n)
38390 .k(k)
38391 .cn_stride(19)
38392 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38393 }
38394 }
38395 }
38396
38397 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
38398 TEST_REQUIRES_X86_FMA3;
38399 for (uint32_t n = 32; n <= 48; n += 16) {
38400 for (size_t k = 1; k <= 20; k += 5) {
38401 GemmMicrokernelTester()
38402 .mr(5)
38403 .nr(16)
38404 .kr(1)
38405 .sr(4)
38406 .m(5)
38407 .n(n)
38408 .k(k)
38409 .a_stride(23)
38410 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38411 }
38412 }
38413 }
38414
38415 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16_subtile) {
38416 TEST_REQUIRES_X86_FMA3;
38417 for (uint32_t n = 32; n <= 48; n += 16) {
38418 for (size_t k = 1; k <= 20; k += 5) {
38419 for (uint32_t m = 1; m <= 5; m++) {
38420 GemmMicrokernelTester()
38421 .mr(5)
38422 .nr(16)
38423 .kr(1)
38424 .sr(4)
38425 .m(m)
38426 .n(n)
38427 .k(k)
38428 .iterations(1)
38429 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38430 }
38431 }
38432 }
38433 }
38434
38435 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, strided_cm_subtile) {
38436 TEST_REQUIRES_X86_FMA3;
38437 for (size_t k = 1; k <= 20; k += 5) {
38438 for (uint32_t m = 1; m <= 5; m++) {
38439 for (uint32_t n = 1; n <= 16; n++) {
38440 GemmMicrokernelTester()
38441 .mr(5)
38442 .nr(16)
38443 .kr(1)
38444 .sr(4)
38445 .m(m)
38446 .n(n)
38447 .k(k)
38448 .cm_stride(19)
38449 .iterations(1)
38450 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38451 }
38452 }
38453 }
38454 }
38455
38456 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, qmin) {
38457 TEST_REQUIRES_X86_FMA3;
38458 GemmMicrokernelTester()
38459 .mr(5)
38460 .nr(16)
38461 .kr(1)
38462 .sr(4)
38463 .m(5)
38464 .n(16)
38465 .k(4)
38466 .qmin(128)
38467 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38468 }
38469
38470 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, qmax) {
38471 TEST_REQUIRES_X86_FMA3;
38472 GemmMicrokernelTester()
38473 .mr(5)
38474 .nr(16)
38475 .kr(1)
38476 .sr(4)
38477 .m(5)
38478 .n(16)
38479 .k(4)
38480 .qmax(128)
38481 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38482 }
38483
38484 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, strided_cm) {
38485 TEST_REQUIRES_X86_FMA3;
38486 GemmMicrokernelTester()
38487 .mr(5)
38488 .nr(16)
38489 .kr(1)
38490 .sr(4)
38491 .m(5)
38492 .n(16)
38493 .k(4)
38494 .cm_stride(19)
38495 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
38496 }
38497#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38498
38499
38500#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38501 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1) {
38502 TEST_REQUIRES_X86_AVX512F;
38503 GemmMicrokernelTester()
38504 .mr(1)
38505 .nr(16)
38506 .kr(1)
38507 .sr(1)
38508 .m(1)
38509 .n(16)
38510 .k(1)
38511 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38512 }
38513
38514 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, strided_cn) {
38515 TEST_REQUIRES_X86_AVX512F;
38516 GemmMicrokernelTester()
38517 .mr(1)
38518 .nr(16)
38519 .kr(1)
38520 .sr(1)
38521 .m(1)
38522 .n(16)
38523 .k(1)
38524 .cn_stride(19)
38525 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38526 }
38527
38528 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
38529 TEST_REQUIRES_X86_AVX512F;
38530 GemmMicrokernelTester()
38531 .mr(1)
38532 .nr(16)
38533 .kr(1)
38534 .sr(1)
38535 .m(1)
38536 .n(16)
38537 .k(1)
38538 .a_stride(3)
38539 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38540 }
38541
38542 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
38543 TEST_REQUIRES_X86_AVX512F;
38544 for (uint32_t m = 1; m <= 1; m++) {
38545 for (uint32_t n = 1; n <= 16; n++) {
38546 GemmMicrokernelTester()
38547 .mr(1)
38548 .nr(16)
38549 .kr(1)
38550 .sr(1)
38551 .m(m)
38552 .n(n)
38553 .k(1)
38554 .iterations(1)
38555 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38556 }
38557 }
38558 }
38559
38560 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
38561 TEST_REQUIRES_X86_AVX512F;
38562 for (uint32_t m = 1; m <= 1; m++) {
38563 GemmMicrokernelTester()
38564 .mr(1)
38565 .nr(16)
38566 .kr(1)
38567 .sr(1)
38568 .m(m)
38569 .n(16)
38570 .k(1)
38571 .iterations(1)
38572 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38573 }
38574 }
38575
38576 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
38577 TEST_REQUIRES_X86_AVX512F;
38578 for (uint32_t n = 1; n <= 16; n++) {
38579 GemmMicrokernelTester()
38580 .mr(1)
38581 .nr(16)
38582 .kr(1)
38583 .sr(1)
38584 .m(1)
38585 .n(n)
38586 .k(1)
38587 .iterations(1)
38588 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38589 }
38590 }
38591
38592 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_gt_1) {
38593 TEST_REQUIRES_X86_AVX512F;
38594 for (size_t k = 2; k < 10; k++) {
38595 GemmMicrokernelTester()
38596 .mr(1)
38597 .nr(16)
38598 .kr(1)
38599 .sr(1)
38600 .m(1)
38601 .n(16)
38602 .k(k)
38603 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38604 }
38605 }
38606
38607 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
38608 TEST_REQUIRES_X86_AVX512F;
38609 for (size_t k = 2; k < 10; k++) {
38610 GemmMicrokernelTester()
38611 .mr(1)
38612 .nr(16)
38613 .kr(1)
38614 .sr(1)
38615 .m(1)
38616 .n(16)
38617 .k(k)
38618 .a_stride(11)
38619 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38620 }
38621 }
38622
38623 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
38624 TEST_REQUIRES_X86_AVX512F;
38625 for (size_t k = 2; k < 10; k++) {
38626 for (uint32_t m = 1; m <= 1; m++) {
38627 for (uint32_t n = 1; n <= 16; n++) {
38628 GemmMicrokernelTester()
38629 .mr(1)
38630 .nr(16)
38631 .kr(1)
38632 .sr(1)
38633 .m(m)
38634 .n(n)
38635 .k(k)
38636 .iterations(1)
38637 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38638 }
38639 }
38640 }
38641 }
38642
38643 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16) {
38644 TEST_REQUIRES_X86_AVX512F;
38645 for (uint32_t n = 17; n < 32; n++) {
38646 for (size_t k = 1; k <= 5; k += 2) {
38647 GemmMicrokernelTester()
38648 .mr(1)
38649 .nr(16)
38650 .kr(1)
38651 .sr(1)
38652 .m(1)
38653 .n(16)
38654 .k(k)
38655 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38656 }
38657 }
38658 }
38659
38660 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
38661 TEST_REQUIRES_X86_AVX512F;
38662 for (uint32_t n = 17; n < 32; n++) {
38663 for (size_t k = 1; k <= 5; k += 2) {
38664 GemmMicrokernelTester()
38665 .mr(1)
38666 .nr(16)
38667 .kr(1)
38668 .sr(1)
38669 .m(1)
38670 .n(16)
38671 .k(k)
38672 .cn_stride(19)
38673 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38674 }
38675 }
38676 }
38677
38678 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
38679 TEST_REQUIRES_X86_AVX512F;
38680 for (uint32_t n = 17; n < 32; n++) {
38681 for (size_t k = 1; k <= 5; k += 2) {
38682 GemmMicrokernelTester()
38683 .mr(1)
38684 .nr(16)
38685 .kr(1)
38686 .sr(1)
38687 .m(1)
38688 .n(n)
38689 .k(k)
38690 .a_stride(7)
38691 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38692 }
38693 }
38694 }
38695
38696 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
38697 TEST_REQUIRES_X86_AVX512F;
38698 for (uint32_t n = 17; n < 32; n++) {
38699 for (size_t k = 1; k <= 5; k += 2) {
38700 for (uint32_t m = 1; m <= 1; m++) {
38701 GemmMicrokernelTester()
38702 .mr(1)
38703 .nr(16)
38704 .kr(1)
38705 .sr(1)
38706 .m(m)
38707 .n(n)
38708 .k(k)
38709 .iterations(1)
38710 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38711 }
38712 }
38713 }
38714 }
38715
38716 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16) {
38717 TEST_REQUIRES_X86_AVX512F;
38718 for (uint32_t n = 32; n <= 48; n += 16) {
38719 for (size_t k = 1; k <= 5; k += 2) {
38720 GemmMicrokernelTester()
38721 .mr(1)
38722 .nr(16)
38723 .kr(1)
38724 .sr(1)
38725 .m(1)
38726 .n(16)
38727 .k(k)
38728 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38729 }
38730 }
38731 }
38732
38733 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
38734 TEST_REQUIRES_X86_AVX512F;
38735 for (uint32_t n = 32; n <= 48; n += 16) {
38736 for (size_t k = 1; k <= 5; k += 2) {
38737 GemmMicrokernelTester()
38738 .mr(1)
38739 .nr(16)
38740 .kr(1)
38741 .sr(1)
38742 .m(1)
38743 .n(n)
38744 .k(k)
38745 .cn_stride(19)
38746 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38747 }
38748 }
38749 }
38750
38751 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16_strided_a) {
38752 TEST_REQUIRES_X86_AVX512F;
38753 for (uint32_t n = 32; n <= 48; n += 16) {
38754 for (size_t k = 1; k <= 5; k += 2) {
38755 GemmMicrokernelTester()
38756 .mr(1)
38757 .nr(16)
38758 .kr(1)
38759 .sr(1)
38760 .m(1)
38761 .n(n)
38762 .k(k)
38763 .a_stride(7)
38764 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38765 }
38766 }
38767 }
38768
38769 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
38770 TEST_REQUIRES_X86_AVX512F;
38771 for (uint32_t n = 32; n <= 48; n += 16) {
38772 for (size_t k = 1; k <= 5; k += 2) {
38773 for (uint32_t m = 1; m <= 1; m++) {
38774 GemmMicrokernelTester()
38775 .mr(1)
38776 .nr(16)
38777 .kr(1)
38778 .sr(1)
38779 .m(m)
38780 .n(n)
38781 .k(k)
38782 .iterations(1)
38783 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38784 }
38785 }
38786 }
38787 }
38788
38789 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
38790 TEST_REQUIRES_X86_AVX512F;
38791 for (size_t k = 1; k <= 5; k += 2) {
38792 for (uint32_t m = 1; m <= 1; m++) {
38793 for (uint32_t n = 1; n <= 16; n++) {
38794 GemmMicrokernelTester()
38795 .mr(1)
38796 .nr(16)
38797 .kr(1)
38798 .sr(1)
38799 .m(m)
38800 .n(n)
38801 .k(k)
38802 .cm_stride(19)
38803 .iterations(1)
38804 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38805 }
38806 }
38807 }
38808 }
38809
38810 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, qmin) {
38811 TEST_REQUIRES_X86_AVX512F;
38812 GemmMicrokernelTester()
38813 .mr(1)
38814 .nr(16)
38815 .kr(1)
38816 .sr(1)
38817 .m(1)
38818 .n(16)
38819 .k(1)
38820 .qmin(128)
38821 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38822 }
38823
38824 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, qmax) {
38825 TEST_REQUIRES_X86_AVX512F;
38826 GemmMicrokernelTester()
38827 .mr(1)
38828 .nr(16)
38829 .kr(1)
38830 .sr(1)
38831 .m(1)
38832 .n(16)
38833 .k(1)
38834 .qmax(128)
38835 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38836 }
38837
38838 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, strided_cm) {
38839 TEST_REQUIRES_X86_AVX512F;
38840 GemmMicrokernelTester()
38841 .mr(1)
38842 .nr(16)
38843 .kr(1)
38844 .sr(1)
38845 .m(1)
38846 .n(16)
38847 .k(1)
38848 .cm_stride(19)
38849 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
38850 }
38851#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38852
38853
38854#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38855 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1) {
38856 TEST_REQUIRES_X86_AVX512F;
38857 GemmMicrokernelTester()
38858 .mr(4)
38859 .nr(16)
38860 .kr(1)
38861 .sr(1)
38862 .m(4)
38863 .n(16)
38864 .k(1)
38865 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38866 }
38867
38868 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, strided_cn) {
38869 TEST_REQUIRES_X86_AVX512F;
38870 GemmMicrokernelTester()
38871 .mr(4)
38872 .nr(16)
38873 .kr(1)
38874 .sr(1)
38875 .m(4)
38876 .n(16)
38877 .k(1)
38878 .cn_stride(19)
38879 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38880 }
38881
38882 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
38883 TEST_REQUIRES_X86_AVX512F;
38884 GemmMicrokernelTester()
38885 .mr(4)
38886 .nr(16)
38887 .kr(1)
38888 .sr(1)
38889 .m(4)
38890 .n(16)
38891 .k(1)
38892 .a_stride(3)
38893 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38894 }
38895
38896 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile) {
38897 TEST_REQUIRES_X86_AVX512F;
38898 for (uint32_t m = 1; m <= 4; m++) {
38899 for (uint32_t n = 1; n <= 16; n++) {
38900 GemmMicrokernelTester()
38901 .mr(4)
38902 .nr(16)
38903 .kr(1)
38904 .sr(1)
38905 .m(m)
38906 .n(n)
38907 .k(1)
38908 .iterations(1)
38909 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38910 }
38911 }
38912 }
38913
38914 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
38915 TEST_REQUIRES_X86_AVX512F;
38916 for (uint32_t m = 1; m <= 4; m++) {
38917 GemmMicrokernelTester()
38918 .mr(4)
38919 .nr(16)
38920 .kr(1)
38921 .sr(1)
38922 .m(m)
38923 .n(16)
38924 .k(1)
38925 .iterations(1)
38926 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38927 }
38928 }
38929
38930 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
38931 TEST_REQUIRES_X86_AVX512F;
38932 for (uint32_t n = 1; n <= 16; n++) {
38933 GemmMicrokernelTester()
38934 .mr(4)
38935 .nr(16)
38936 .kr(1)
38937 .sr(1)
38938 .m(4)
38939 .n(n)
38940 .k(1)
38941 .iterations(1)
38942 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38943 }
38944 }
38945
38946 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_gt_1) {
38947 TEST_REQUIRES_X86_AVX512F;
38948 for (size_t k = 2; k < 10; k++) {
38949 GemmMicrokernelTester()
38950 .mr(4)
38951 .nr(16)
38952 .kr(1)
38953 .sr(1)
38954 .m(4)
38955 .n(16)
38956 .k(k)
38957 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38958 }
38959 }
38960
38961 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
38962 TEST_REQUIRES_X86_AVX512F;
38963 for (size_t k = 2; k < 10; k++) {
38964 GemmMicrokernelTester()
38965 .mr(4)
38966 .nr(16)
38967 .kr(1)
38968 .sr(1)
38969 .m(4)
38970 .n(16)
38971 .k(k)
38972 .a_stride(11)
38973 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38974 }
38975 }
38976
38977 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_gt_1_subtile) {
38978 TEST_REQUIRES_X86_AVX512F;
38979 for (size_t k = 2; k < 10; k++) {
38980 for (uint32_t m = 1; m <= 4; m++) {
38981 for (uint32_t n = 1; n <= 16; n++) {
38982 GemmMicrokernelTester()
38983 .mr(4)
38984 .nr(16)
38985 .kr(1)
38986 .sr(1)
38987 .m(m)
38988 .n(n)
38989 .k(k)
38990 .iterations(1)
38991 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
38992 }
38993 }
38994 }
38995 }
38996
38997 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16) {
38998 TEST_REQUIRES_X86_AVX512F;
38999 for (uint32_t n = 17; n < 32; n++) {
39000 for (size_t k = 1; k <= 5; k += 2) {
39001 GemmMicrokernelTester()
39002 .mr(4)
39003 .nr(16)
39004 .kr(1)
39005 .sr(1)
39006 .m(4)
39007 .n(16)
39008 .k(k)
39009 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39010 }
39011 }
39012 }
39013
39014 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
39015 TEST_REQUIRES_X86_AVX512F;
39016 for (uint32_t n = 17; n < 32; n++) {
39017 for (size_t k = 1; k <= 5; k += 2) {
39018 GemmMicrokernelTester()
39019 .mr(4)
39020 .nr(16)
39021 .kr(1)
39022 .sr(1)
39023 .m(4)
39024 .n(16)
39025 .k(k)
39026 .cn_stride(19)
39027 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39028 }
39029 }
39030 }
39031
39032 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
39033 TEST_REQUIRES_X86_AVX512F;
39034 for (uint32_t n = 17; n < 32; n++) {
39035 for (size_t k = 1; k <= 5; k += 2) {
39036 GemmMicrokernelTester()
39037 .mr(4)
39038 .nr(16)
39039 .kr(1)
39040 .sr(1)
39041 .m(4)
39042 .n(n)
39043 .k(k)
39044 .a_stride(7)
39045 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39046 }
39047 }
39048 }
39049
39050 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16_subtile) {
39051 TEST_REQUIRES_X86_AVX512F;
39052 for (uint32_t n = 17; n < 32; n++) {
39053 for (size_t k = 1; k <= 5; k += 2) {
39054 for (uint32_t m = 1; m <= 4; m++) {
39055 GemmMicrokernelTester()
39056 .mr(4)
39057 .nr(16)
39058 .kr(1)
39059 .sr(1)
39060 .m(m)
39061 .n(n)
39062 .k(k)
39063 .iterations(1)
39064 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39065 }
39066 }
39067 }
39068 }
39069
39070 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16) {
39071 TEST_REQUIRES_X86_AVX512F;
39072 for (uint32_t n = 32; n <= 48; n += 16) {
39073 for (size_t k = 1; k <= 5; k += 2) {
39074 GemmMicrokernelTester()
39075 .mr(4)
39076 .nr(16)
39077 .kr(1)
39078 .sr(1)
39079 .m(4)
39080 .n(16)
39081 .k(k)
39082 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39083 }
39084 }
39085 }
39086
39087 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
39088 TEST_REQUIRES_X86_AVX512F;
39089 for (uint32_t n = 32; n <= 48; n += 16) {
39090 for (size_t k = 1; k <= 5; k += 2) {
39091 GemmMicrokernelTester()
39092 .mr(4)
39093 .nr(16)
39094 .kr(1)
39095 .sr(1)
39096 .m(4)
39097 .n(n)
39098 .k(k)
39099 .cn_stride(19)
39100 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39101 }
39102 }
39103 }
39104
39105 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16_strided_a) {
39106 TEST_REQUIRES_X86_AVX512F;
39107 for (uint32_t n = 32; n <= 48; n += 16) {
39108 for (size_t k = 1; k <= 5; k += 2) {
39109 GemmMicrokernelTester()
39110 .mr(4)
39111 .nr(16)
39112 .kr(1)
39113 .sr(1)
39114 .m(4)
39115 .n(n)
39116 .k(k)
39117 .a_stride(7)
39118 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39119 }
39120 }
39121 }
39122
39123 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16_subtile) {
39124 TEST_REQUIRES_X86_AVX512F;
39125 for (uint32_t n = 32; n <= 48; n += 16) {
39126 for (size_t k = 1; k <= 5; k += 2) {
39127 for (uint32_t m = 1; m <= 4; m++) {
39128 GemmMicrokernelTester()
39129 .mr(4)
39130 .nr(16)
39131 .kr(1)
39132 .sr(1)
39133 .m(m)
39134 .n(n)
39135 .k(k)
39136 .iterations(1)
39137 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39138 }
39139 }
39140 }
39141 }
39142
39143 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, strided_cm_subtile) {
39144 TEST_REQUIRES_X86_AVX512F;
39145 for (size_t k = 1; k <= 5; k += 2) {
39146 for (uint32_t m = 1; m <= 4; m++) {
39147 for (uint32_t n = 1; n <= 16; n++) {
39148 GemmMicrokernelTester()
39149 .mr(4)
39150 .nr(16)
39151 .kr(1)
39152 .sr(1)
39153 .m(m)
39154 .n(n)
39155 .k(k)
39156 .cm_stride(19)
39157 .iterations(1)
39158 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39159 }
39160 }
39161 }
39162 }
39163
39164 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, qmin) {
39165 TEST_REQUIRES_X86_AVX512F;
39166 GemmMicrokernelTester()
39167 .mr(4)
39168 .nr(16)
39169 .kr(1)
39170 .sr(1)
39171 .m(4)
39172 .n(16)
39173 .k(1)
39174 .qmin(128)
39175 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39176 }
39177
39178 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, qmax) {
39179 TEST_REQUIRES_X86_AVX512F;
39180 GemmMicrokernelTester()
39181 .mr(4)
39182 .nr(16)
39183 .kr(1)
39184 .sr(1)
39185 .m(4)
39186 .n(16)
39187 .k(1)
39188 .qmax(128)
39189 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39190 }
39191
39192 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, strided_cm) {
39193 TEST_REQUIRES_X86_AVX512F;
39194 GemmMicrokernelTester()
39195 .mr(4)
39196 .nr(16)
39197 .kr(1)
39198 .sr(1)
39199 .m(4)
39200 .n(16)
39201 .k(1)
39202 .cm_stride(19)
39203 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
39204 }
39205#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
39206
39207
39208#if XNN_ARCH_X86 || XNN_ARCH_X86_64
39209 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1) {
39210 TEST_REQUIRES_X86_AVX512F;
39211 GemmMicrokernelTester()
39212 .mr(5)
39213 .nr(16)
39214 .kr(1)
39215 .sr(1)
39216 .m(5)
39217 .n(16)
39218 .k(1)
39219 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39220 }
39221
39222 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, strided_cn) {
39223 TEST_REQUIRES_X86_AVX512F;
39224 GemmMicrokernelTester()
39225 .mr(5)
39226 .nr(16)
39227 .kr(1)
39228 .sr(1)
39229 .m(5)
39230 .n(16)
39231 .k(1)
39232 .cn_stride(19)
39233 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39234 }
39235
39236 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
39237 TEST_REQUIRES_X86_AVX512F;
39238 GemmMicrokernelTester()
39239 .mr(5)
39240 .nr(16)
39241 .kr(1)
39242 .sr(1)
39243 .m(5)
39244 .n(16)
39245 .k(1)
39246 .a_stride(3)
39247 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39248 }
39249
39250 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile) {
39251 TEST_REQUIRES_X86_AVX512F;
39252 for (uint32_t m = 1; m <= 5; m++) {
39253 for (uint32_t n = 1; n <= 16; n++) {
39254 GemmMicrokernelTester()
39255 .mr(5)
39256 .nr(16)
39257 .kr(1)
39258 .sr(1)
39259 .m(m)
39260 .n(n)
39261 .k(1)
39262 .iterations(1)
39263 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39264 }
39265 }
39266 }
39267
39268 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
39269 TEST_REQUIRES_X86_AVX512F;
39270 for (uint32_t m = 1; m <= 5; m++) {
39271 GemmMicrokernelTester()
39272 .mr(5)
39273 .nr(16)
39274 .kr(1)
39275 .sr(1)
39276 .m(m)
39277 .n(16)
39278 .k(1)
39279 .iterations(1)
39280 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39281 }
39282 }
39283
39284 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
39285 TEST_REQUIRES_X86_AVX512F;
39286 for (uint32_t n = 1; n <= 16; n++) {
39287 GemmMicrokernelTester()
39288 .mr(5)
39289 .nr(16)
39290 .kr(1)
39291 .sr(1)
39292 .m(5)
39293 .n(n)
39294 .k(1)
39295 .iterations(1)
39296 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39297 }
39298 }
39299
39300 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_gt_1) {
39301 TEST_REQUIRES_X86_AVX512F;
39302 for (size_t k = 2; k < 10; k++) {
39303 GemmMicrokernelTester()
39304 .mr(5)
39305 .nr(16)
39306 .kr(1)
39307 .sr(1)
39308 .m(5)
39309 .n(16)
39310 .k(k)
39311 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39312 }
39313 }
39314
39315 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
39316 TEST_REQUIRES_X86_AVX512F;
39317 for (size_t k = 2; k < 10; k++) {
39318 GemmMicrokernelTester()
39319 .mr(5)
39320 .nr(16)
39321 .kr(1)
39322 .sr(1)
39323 .m(5)
39324 .n(16)
39325 .k(k)
39326 .a_stride(11)
39327 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39328 }
39329 }
39330
39331 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_gt_1_subtile) {
39332 TEST_REQUIRES_X86_AVX512F;
39333 for (size_t k = 2; k < 10; k++) {
39334 for (uint32_t m = 1; m <= 5; m++) {
39335 for (uint32_t n = 1; n <= 16; n++) {
39336 GemmMicrokernelTester()
39337 .mr(5)
39338 .nr(16)
39339 .kr(1)
39340 .sr(1)
39341 .m(m)
39342 .n(n)
39343 .k(k)
39344 .iterations(1)
39345 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39346 }
39347 }
39348 }
39349 }
39350
39351 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16) {
39352 TEST_REQUIRES_X86_AVX512F;
39353 for (uint32_t n = 17; n < 32; n++) {
39354 for (size_t k = 1; k <= 5; k += 2) {
39355 GemmMicrokernelTester()
39356 .mr(5)
39357 .nr(16)
39358 .kr(1)
39359 .sr(1)
39360 .m(5)
39361 .n(16)
39362 .k(k)
39363 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39364 }
39365 }
39366 }
39367
39368 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
39369 TEST_REQUIRES_X86_AVX512F;
39370 for (uint32_t n = 17; n < 32; n++) {
39371 for (size_t k = 1; k <= 5; k += 2) {
39372 GemmMicrokernelTester()
39373 .mr(5)
39374 .nr(16)
39375 .kr(1)
39376 .sr(1)
39377 .m(5)
39378 .n(16)
39379 .k(k)
39380 .cn_stride(19)
39381 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39382 }
39383 }
39384 }
39385
39386 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
39387 TEST_REQUIRES_X86_AVX512F;
39388 for (uint32_t n = 17; n < 32; n++) {
39389 for (size_t k = 1; k <= 5; k += 2) {
39390 GemmMicrokernelTester()
39391 .mr(5)
39392 .nr(16)
39393 .kr(1)
39394 .sr(1)
39395 .m(5)
39396 .n(n)
39397 .k(k)
39398 .a_stride(7)
39399 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39400 }
39401 }
39402 }
39403
39404 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16_subtile) {
39405 TEST_REQUIRES_X86_AVX512F;
39406 for (uint32_t n = 17; n < 32; n++) {
39407 for (size_t k = 1; k <= 5; k += 2) {
39408 for (uint32_t m = 1; m <= 5; m++) {
39409 GemmMicrokernelTester()
39410 .mr(5)
39411 .nr(16)
39412 .kr(1)
39413 .sr(1)
39414 .m(m)
39415 .n(n)
39416 .k(k)
39417 .iterations(1)
39418 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39419 }
39420 }
39421 }
39422 }
39423
39424 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16) {
39425 TEST_REQUIRES_X86_AVX512F;
39426 for (uint32_t n = 32; n <= 48; n += 16) {
39427 for (size_t k = 1; k <= 5; k += 2) {
39428 GemmMicrokernelTester()
39429 .mr(5)
39430 .nr(16)
39431 .kr(1)
39432 .sr(1)
39433 .m(5)
39434 .n(16)
39435 .k(k)
39436 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39437 }
39438 }
39439 }
39440
39441 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
39442 TEST_REQUIRES_X86_AVX512F;
39443 for (uint32_t n = 32; n <= 48; n += 16) {
39444 for (size_t k = 1; k <= 5; k += 2) {
39445 GemmMicrokernelTester()
39446 .mr(5)
39447 .nr(16)
39448 .kr(1)
39449 .sr(1)
39450 .m(5)
39451 .n(n)
39452 .k(k)
39453 .cn_stride(19)
39454 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39455 }
39456 }
39457 }
39458
39459 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16_strided_a) {
39460 TEST_REQUIRES_X86_AVX512F;
39461 for (uint32_t n = 32; n <= 48; n += 16) {
39462 for (size_t k = 1; k <= 5; k += 2) {
39463 GemmMicrokernelTester()
39464 .mr(5)
39465 .nr(16)
39466 .kr(1)
39467 .sr(1)
39468 .m(5)
39469 .n(n)
39470 .k(k)
39471 .a_stride(7)
39472 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39473 }
39474 }
39475 }
39476
39477 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16_subtile) {
39478 TEST_REQUIRES_X86_AVX512F;
39479 for (uint32_t n = 32; n <= 48; n += 16) {
39480 for (size_t k = 1; k <= 5; k += 2) {
39481 for (uint32_t m = 1; m <= 5; m++) {
39482 GemmMicrokernelTester()
39483 .mr(5)
39484 .nr(16)
39485 .kr(1)
39486 .sr(1)
39487 .m(m)
39488 .n(n)
39489 .k(k)
39490 .iterations(1)
39491 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39492 }
39493 }
39494 }
39495 }
39496
39497 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, strided_cm_subtile) {
39498 TEST_REQUIRES_X86_AVX512F;
39499 for (size_t k = 1; k <= 5; k += 2) {
39500 for (uint32_t m = 1; m <= 5; m++) {
39501 for (uint32_t n = 1; n <= 16; n++) {
39502 GemmMicrokernelTester()
39503 .mr(5)
39504 .nr(16)
39505 .kr(1)
39506 .sr(1)
39507 .m(m)
39508 .n(n)
39509 .k(k)
39510 .cm_stride(19)
39511 .iterations(1)
39512 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39513 }
39514 }
39515 }
39516 }
39517
39518 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, qmin) {
39519 TEST_REQUIRES_X86_AVX512F;
39520 GemmMicrokernelTester()
39521 .mr(5)
39522 .nr(16)
39523 .kr(1)
39524 .sr(1)
39525 .m(5)
39526 .n(16)
39527 .k(1)
39528 .qmin(128)
39529 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39530 }
39531
39532 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, qmax) {
39533 TEST_REQUIRES_X86_AVX512F;
39534 GemmMicrokernelTester()
39535 .mr(5)
39536 .nr(16)
39537 .kr(1)
39538 .sr(1)
39539 .m(5)
39540 .n(16)
39541 .k(1)
39542 .qmax(128)
39543 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39544 }
39545
39546 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, strided_cm) {
39547 TEST_REQUIRES_X86_AVX512F;
39548 GemmMicrokernelTester()
39549 .mr(5)
39550 .nr(16)
39551 .kr(1)
39552 .sr(1)
39553 .m(5)
39554 .n(16)
39555 .k(1)
39556 .cm_stride(19)
39557 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
39558 }
39559#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
39560
39561
39562#if XNN_ARCH_X86 || XNN_ARCH_X86_64
39563 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1) {
39564 TEST_REQUIRES_X86_AVX512F;
39565 GemmMicrokernelTester()
39566 .mr(6)
39567 .nr(16)
39568 .kr(1)
39569 .sr(1)
39570 .m(6)
39571 .n(16)
39572 .k(1)
39573 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39574 }
39575
39576 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, strided_cn) {
39577 TEST_REQUIRES_X86_AVX512F;
39578 GemmMicrokernelTester()
39579 .mr(6)
39580 .nr(16)
39581 .kr(1)
39582 .sr(1)
39583 .m(6)
39584 .n(16)
39585 .k(1)
39586 .cn_stride(19)
39587 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39588 }
39589
39590 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
39591 TEST_REQUIRES_X86_AVX512F;
39592 GemmMicrokernelTester()
39593 .mr(6)
39594 .nr(16)
39595 .kr(1)
39596 .sr(1)
39597 .m(6)
39598 .n(16)
39599 .k(1)
39600 .a_stride(3)
39601 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39602 }
39603
39604 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
39605 TEST_REQUIRES_X86_AVX512F;
39606 for (uint32_t m = 1; m <= 6; m++) {
39607 for (uint32_t n = 1; n <= 16; n++) {
39608 GemmMicrokernelTester()
39609 .mr(6)
39610 .nr(16)
39611 .kr(1)
39612 .sr(1)
39613 .m(m)
39614 .n(n)
39615 .k(1)
39616 .iterations(1)
39617 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39618 }
39619 }
39620 }
39621
39622 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
39623 TEST_REQUIRES_X86_AVX512F;
39624 for (uint32_t m = 1; m <= 6; m++) {
39625 GemmMicrokernelTester()
39626 .mr(6)
39627 .nr(16)
39628 .kr(1)
39629 .sr(1)
39630 .m(m)
39631 .n(16)
39632 .k(1)
39633 .iterations(1)
39634 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39635 }
39636 }
39637
39638 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
39639 TEST_REQUIRES_X86_AVX512F;
39640 for (uint32_t n = 1; n <= 16; n++) {
39641 GemmMicrokernelTester()
39642 .mr(6)
39643 .nr(16)
39644 .kr(1)
39645 .sr(1)
39646 .m(6)
39647 .n(n)
39648 .k(1)
39649 .iterations(1)
39650 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39651 }
39652 }
39653
39654 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_gt_1) {
39655 TEST_REQUIRES_X86_AVX512F;
39656 for (size_t k = 2; k < 10; k++) {
39657 GemmMicrokernelTester()
39658 .mr(6)
39659 .nr(16)
39660 .kr(1)
39661 .sr(1)
39662 .m(6)
39663 .n(16)
39664 .k(k)
39665 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39666 }
39667 }
39668
39669 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
39670 TEST_REQUIRES_X86_AVX512F;
39671 for (size_t k = 2; k < 10; k++) {
39672 GemmMicrokernelTester()
39673 .mr(6)
39674 .nr(16)
39675 .kr(1)
39676 .sr(1)
39677 .m(6)
39678 .n(16)
39679 .k(k)
39680 .a_stride(11)
39681 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39682 }
39683 }
39684
39685 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
39686 TEST_REQUIRES_X86_AVX512F;
39687 for (size_t k = 2; k < 10; k++) {
39688 for (uint32_t m = 1; m <= 6; m++) {
39689 for (uint32_t n = 1; n <= 16; n++) {
39690 GemmMicrokernelTester()
39691 .mr(6)
39692 .nr(16)
39693 .kr(1)
39694 .sr(1)
39695 .m(m)
39696 .n(n)
39697 .k(k)
39698 .iterations(1)
39699 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39700 }
39701 }
39702 }
39703 }
39704
39705 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16) {
39706 TEST_REQUIRES_X86_AVX512F;
39707 for (uint32_t n = 17; n < 32; n++) {
39708 for (size_t k = 1; k <= 5; k += 2) {
39709 GemmMicrokernelTester()
39710 .mr(6)
39711 .nr(16)
39712 .kr(1)
39713 .sr(1)
39714 .m(6)
39715 .n(16)
39716 .k(k)
39717 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39718 }
39719 }
39720 }
39721
39722 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
39723 TEST_REQUIRES_X86_AVX512F;
39724 for (uint32_t n = 17; n < 32; n++) {
39725 for (size_t k = 1; k <= 5; k += 2) {
39726 GemmMicrokernelTester()
39727 .mr(6)
39728 .nr(16)
39729 .kr(1)
39730 .sr(1)
39731 .m(6)
39732 .n(16)
39733 .k(k)
39734 .cn_stride(19)
39735 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39736 }
39737 }
39738 }
39739
39740 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
39741 TEST_REQUIRES_X86_AVX512F;
39742 for (uint32_t n = 17; n < 32; n++) {
39743 for (size_t k = 1; k <= 5; k += 2) {
39744 GemmMicrokernelTester()
39745 .mr(6)
39746 .nr(16)
39747 .kr(1)
39748 .sr(1)
39749 .m(6)
39750 .n(n)
39751 .k(k)
39752 .a_stride(7)
39753 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39754 }
39755 }
39756 }
39757
39758 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
39759 TEST_REQUIRES_X86_AVX512F;
39760 for (uint32_t n = 17; n < 32; n++) {
39761 for (size_t k = 1; k <= 5; k += 2) {
39762 for (uint32_t m = 1; m <= 6; m++) {
39763 GemmMicrokernelTester()
39764 .mr(6)
39765 .nr(16)
39766 .kr(1)
39767 .sr(1)
39768 .m(m)
39769 .n(n)
39770 .k(k)
39771 .iterations(1)
39772 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39773 }
39774 }
39775 }
39776 }
39777
39778 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16) {
39779 TEST_REQUIRES_X86_AVX512F;
39780 for (uint32_t n = 32; n <= 48; n += 16) {
39781 for (size_t k = 1; k <= 5; k += 2) {
39782 GemmMicrokernelTester()
39783 .mr(6)
39784 .nr(16)
39785 .kr(1)
39786 .sr(1)
39787 .m(6)
39788 .n(16)
39789 .k(k)
39790 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39791 }
39792 }
39793 }
39794
39795 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
39796 TEST_REQUIRES_X86_AVX512F;
39797 for (uint32_t n = 32; n <= 48; n += 16) {
39798 for (size_t k = 1; k <= 5; k += 2) {
39799 GemmMicrokernelTester()
39800 .mr(6)
39801 .nr(16)
39802 .kr(1)
39803 .sr(1)
39804 .m(6)
39805 .n(n)
39806 .k(k)
39807 .cn_stride(19)
39808 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39809 }
39810 }
39811 }
39812
39813 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16_strided_a) {
39814 TEST_REQUIRES_X86_AVX512F;
39815 for (uint32_t n = 32; n <= 48; n += 16) {
39816 for (size_t k = 1; k <= 5; k += 2) {
39817 GemmMicrokernelTester()
39818 .mr(6)
39819 .nr(16)
39820 .kr(1)
39821 .sr(1)
39822 .m(6)
39823 .n(n)
39824 .k(k)
39825 .a_stride(7)
39826 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39827 }
39828 }
39829 }
39830
39831 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
39832 TEST_REQUIRES_X86_AVX512F;
39833 for (uint32_t n = 32; n <= 48; n += 16) {
39834 for (size_t k = 1; k <= 5; k += 2) {
39835 for (uint32_t m = 1; m <= 6; m++) {
39836 GemmMicrokernelTester()
39837 .mr(6)
39838 .nr(16)
39839 .kr(1)
39840 .sr(1)
39841 .m(m)
39842 .n(n)
39843 .k(k)
39844 .iterations(1)
39845 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39846 }
39847 }
39848 }
39849 }
39850
39851 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
39852 TEST_REQUIRES_X86_AVX512F;
39853 for (size_t k = 1; k <= 5; k += 2) {
39854 for (uint32_t m = 1; m <= 6; m++) {
39855 for (uint32_t n = 1; n <= 16; n++) {
39856 GemmMicrokernelTester()
39857 .mr(6)
39858 .nr(16)
39859 .kr(1)
39860 .sr(1)
39861 .m(m)
39862 .n(n)
39863 .k(k)
39864 .cm_stride(19)
39865 .iterations(1)
39866 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39867 }
39868 }
39869 }
39870 }
39871
39872 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, qmin) {
39873 TEST_REQUIRES_X86_AVX512F;
39874 GemmMicrokernelTester()
39875 .mr(6)
39876 .nr(16)
39877 .kr(1)
39878 .sr(1)
39879 .m(6)
39880 .n(16)
39881 .k(1)
39882 .qmin(128)
39883 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39884 }
39885
39886 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, qmax) {
39887 TEST_REQUIRES_X86_AVX512F;
39888 GemmMicrokernelTester()
39889 .mr(6)
39890 .nr(16)
39891 .kr(1)
39892 .sr(1)
39893 .m(6)
39894 .n(16)
39895 .k(1)
39896 .qmax(128)
39897 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39898 }
39899
39900 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, strided_cm) {
39901 TEST_REQUIRES_X86_AVX512F;
39902 GemmMicrokernelTester()
39903 .mr(6)
39904 .nr(16)
39905 .kr(1)
39906 .sr(1)
39907 .m(6)
39908 .n(16)
39909 .k(1)
39910 .cm_stride(19)
39911 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
39912 }
39913#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
39914
39915
39916#if XNN_ARCH_X86 || XNN_ARCH_X86_64
39917 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1) {
39918 TEST_REQUIRES_X86_AVX512F;
39919 GemmMicrokernelTester()
39920 .mr(7)
39921 .nr(16)
39922 .kr(1)
39923 .sr(1)
39924 .m(7)
39925 .n(16)
39926 .k(1)
39927 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
39928 }
39929
39930 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, strided_cn) {
39931 TEST_REQUIRES_X86_AVX512F;
39932 GemmMicrokernelTester()
39933 .mr(7)
39934 .nr(16)
39935 .kr(1)
39936 .sr(1)
39937 .m(7)
39938 .n(16)
39939 .k(1)
39940 .cn_stride(19)
39941 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
39942 }
39943
39944 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
39945 TEST_REQUIRES_X86_AVX512F;
39946 GemmMicrokernelTester()
39947 .mr(7)
39948 .nr(16)
39949 .kr(1)
39950 .sr(1)
39951 .m(7)
39952 .n(16)
39953 .k(1)
39954 .a_stride(3)
39955 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
39956 }
39957
39958 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
39959 TEST_REQUIRES_X86_AVX512F;
39960 for (uint32_t m = 1; m <= 7; m++) {
39961 for (uint32_t n = 1; n <= 16; n++) {
39962 GemmMicrokernelTester()
39963 .mr(7)
39964 .nr(16)
39965 .kr(1)
39966 .sr(1)
39967 .m(m)
39968 .n(n)
39969 .k(1)
39970 .iterations(1)
39971 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
39972 }
39973 }
39974 }
39975
39976 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
39977 TEST_REQUIRES_X86_AVX512F;
39978 for (uint32_t m = 1; m <= 7; m++) {
39979 GemmMicrokernelTester()
39980 .mr(7)
39981 .nr(16)
39982 .kr(1)
39983 .sr(1)
39984 .m(m)
39985 .n(16)
39986 .k(1)
39987 .iterations(1)
39988 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
39989 }
39990 }
39991
39992 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
39993 TEST_REQUIRES_X86_AVX512F;
39994 for (uint32_t n = 1; n <= 16; n++) {
39995 GemmMicrokernelTester()
39996 .mr(7)
39997 .nr(16)
39998 .kr(1)
39999 .sr(1)
40000 .m(7)
40001 .n(n)
40002 .k(1)
40003 .iterations(1)
40004 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40005 }
40006 }
40007
40008 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_gt_1) {
40009 TEST_REQUIRES_X86_AVX512F;
40010 for (size_t k = 2; k < 10; k++) {
40011 GemmMicrokernelTester()
40012 .mr(7)
40013 .nr(16)
40014 .kr(1)
40015 .sr(1)
40016 .m(7)
40017 .n(16)
40018 .k(k)
40019 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40020 }
40021 }
40022
40023 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
40024 TEST_REQUIRES_X86_AVX512F;
40025 for (size_t k = 2; k < 10; k++) {
40026 GemmMicrokernelTester()
40027 .mr(7)
40028 .nr(16)
40029 .kr(1)
40030 .sr(1)
40031 .m(7)
40032 .n(16)
40033 .k(k)
40034 .a_stride(11)
40035 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40036 }
40037 }
40038
40039 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
40040 TEST_REQUIRES_X86_AVX512F;
40041 for (size_t k = 2; k < 10; k++) {
40042 for (uint32_t m = 1; m <= 7; m++) {
40043 for (uint32_t n = 1; n <= 16; n++) {
40044 GemmMicrokernelTester()
40045 .mr(7)
40046 .nr(16)
40047 .kr(1)
40048 .sr(1)
40049 .m(m)
40050 .n(n)
40051 .k(k)
40052 .iterations(1)
40053 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40054 }
40055 }
40056 }
40057 }
40058
40059 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16) {
40060 TEST_REQUIRES_X86_AVX512F;
40061 for (uint32_t n = 17; n < 32; n++) {
40062 for (size_t k = 1; k <= 5; k += 2) {
40063 GemmMicrokernelTester()
40064 .mr(7)
40065 .nr(16)
40066 .kr(1)
40067 .sr(1)
40068 .m(7)
40069 .n(16)
40070 .k(k)
40071 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40072 }
40073 }
40074 }
40075
40076 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
40077 TEST_REQUIRES_X86_AVX512F;
40078 for (uint32_t n = 17; n < 32; n++) {
40079 for (size_t k = 1; k <= 5; k += 2) {
40080 GemmMicrokernelTester()
40081 .mr(7)
40082 .nr(16)
40083 .kr(1)
40084 .sr(1)
40085 .m(7)
40086 .n(16)
40087 .k(k)
40088 .cn_stride(19)
40089 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40090 }
40091 }
40092 }
40093
40094 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
40095 TEST_REQUIRES_X86_AVX512F;
40096 for (uint32_t n = 17; n < 32; n++) {
40097 for (size_t k = 1; k <= 5; k += 2) {
40098 GemmMicrokernelTester()
40099 .mr(7)
40100 .nr(16)
40101 .kr(1)
40102 .sr(1)
40103 .m(7)
40104 .n(n)
40105 .k(k)
40106 .a_stride(7)
40107 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40108 }
40109 }
40110 }
40111
40112 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
40113 TEST_REQUIRES_X86_AVX512F;
40114 for (uint32_t n = 17; n < 32; n++) {
40115 for (size_t k = 1; k <= 5; k += 2) {
40116 for (uint32_t m = 1; m <= 7; m++) {
40117 GemmMicrokernelTester()
40118 .mr(7)
40119 .nr(16)
40120 .kr(1)
40121 .sr(1)
40122 .m(m)
40123 .n(n)
40124 .k(k)
40125 .iterations(1)
40126 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40127 }
40128 }
40129 }
40130 }
40131
40132 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16) {
40133 TEST_REQUIRES_X86_AVX512F;
40134 for (uint32_t n = 32; n <= 48; n += 16) {
40135 for (size_t k = 1; k <= 5; k += 2) {
40136 GemmMicrokernelTester()
40137 .mr(7)
40138 .nr(16)
40139 .kr(1)
40140 .sr(1)
40141 .m(7)
40142 .n(16)
40143 .k(k)
40144 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40145 }
40146 }
40147 }
40148
40149 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
40150 TEST_REQUIRES_X86_AVX512F;
40151 for (uint32_t n = 32; n <= 48; n += 16) {
40152 for (size_t k = 1; k <= 5; k += 2) {
40153 GemmMicrokernelTester()
40154 .mr(7)
40155 .nr(16)
40156 .kr(1)
40157 .sr(1)
40158 .m(7)
40159 .n(n)
40160 .k(k)
40161 .cn_stride(19)
40162 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40163 }
40164 }
40165 }
40166
40167 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16_strided_a) {
40168 TEST_REQUIRES_X86_AVX512F;
40169 for (uint32_t n = 32; n <= 48; n += 16) {
40170 for (size_t k = 1; k <= 5; k += 2) {
40171 GemmMicrokernelTester()
40172 .mr(7)
40173 .nr(16)
40174 .kr(1)
40175 .sr(1)
40176 .m(7)
40177 .n(n)
40178 .k(k)
40179 .a_stride(7)
40180 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40181 }
40182 }
40183 }
40184
40185 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
40186 TEST_REQUIRES_X86_AVX512F;
40187 for (uint32_t n = 32; n <= 48; n += 16) {
40188 for (size_t k = 1; k <= 5; k += 2) {
40189 for (uint32_t m = 1; m <= 7; m++) {
40190 GemmMicrokernelTester()
40191 .mr(7)
40192 .nr(16)
40193 .kr(1)
40194 .sr(1)
40195 .m(m)
40196 .n(n)
40197 .k(k)
40198 .iterations(1)
40199 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40200 }
40201 }
40202 }
40203 }
40204
40205 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
40206 TEST_REQUIRES_X86_AVX512F;
40207 for (size_t k = 1; k <= 5; k += 2) {
40208 for (uint32_t m = 1; m <= 7; m++) {
40209 for (uint32_t n = 1; n <= 16; n++) {
40210 GemmMicrokernelTester()
40211 .mr(7)
40212 .nr(16)
40213 .kr(1)
40214 .sr(1)
40215 .m(m)
40216 .n(n)
40217 .k(k)
40218 .cm_stride(19)
40219 .iterations(1)
40220 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40221 }
40222 }
40223 }
40224 }
40225
40226 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, qmin) {
40227 TEST_REQUIRES_X86_AVX512F;
40228 GemmMicrokernelTester()
40229 .mr(7)
40230 .nr(16)
40231 .kr(1)
40232 .sr(1)
40233 .m(7)
40234 .n(16)
40235 .k(1)
40236 .qmin(128)
40237 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40238 }
40239
40240 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, qmax) {
40241 TEST_REQUIRES_X86_AVX512F;
40242 GemmMicrokernelTester()
40243 .mr(7)
40244 .nr(16)
40245 .kr(1)
40246 .sr(1)
40247 .m(7)
40248 .n(16)
40249 .k(1)
40250 .qmax(128)
40251 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40252 }
40253
40254 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, strided_cm) {
40255 TEST_REQUIRES_X86_AVX512F;
40256 GemmMicrokernelTester()
40257 .mr(7)
40258 .nr(16)
40259 .kr(1)
40260 .sr(1)
40261 .m(7)
40262 .n(16)
40263 .k(1)
40264 .cm_stride(19)
40265 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
40266 }
40267#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
40268
40269
40270#if XNN_ARCH_X86 || XNN_ARCH_X86_64
40271 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1) {
40272 TEST_REQUIRES_X86_AVX512F;
40273 GemmMicrokernelTester()
40274 .mr(8)
40275 .nr(16)
40276 .kr(1)
40277 .sr(1)
40278 .m(8)
40279 .n(16)
40280 .k(1)
40281 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40282 }
40283
40284 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, strided_cn) {
40285 TEST_REQUIRES_X86_AVX512F;
40286 GemmMicrokernelTester()
40287 .mr(8)
40288 .nr(16)
40289 .kr(1)
40290 .sr(1)
40291 .m(8)
40292 .n(16)
40293 .k(1)
40294 .cn_stride(19)
40295 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40296 }
40297
40298 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
40299 TEST_REQUIRES_X86_AVX512F;
40300 GemmMicrokernelTester()
40301 .mr(8)
40302 .nr(16)
40303 .kr(1)
40304 .sr(1)
40305 .m(8)
40306 .n(16)
40307 .k(1)
40308 .a_stride(3)
40309 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40310 }
40311
40312 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
40313 TEST_REQUIRES_X86_AVX512F;
40314 for (uint32_t m = 1; m <= 8; m++) {
40315 for (uint32_t n = 1; n <= 16; n++) {
40316 GemmMicrokernelTester()
40317 .mr(8)
40318 .nr(16)
40319 .kr(1)
40320 .sr(1)
40321 .m(m)
40322 .n(n)
40323 .k(1)
40324 .iterations(1)
40325 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40326 }
40327 }
40328 }
40329
40330 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
40331 TEST_REQUIRES_X86_AVX512F;
40332 for (uint32_t m = 1; m <= 8; m++) {
40333 GemmMicrokernelTester()
40334 .mr(8)
40335 .nr(16)
40336 .kr(1)
40337 .sr(1)
40338 .m(m)
40339 .n(16)
40340 .k(1)
40341 .iterations(1)
40342 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40343 }
40344 }
40345
40346 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
40347 TEST_REQUIRES_X86_AVX512F;
40348 for (uint32_t n = 1; n <= 16; n++) {
40349 GemmMicrokernelTester()
40350 .mr(8)
40351 .nr(16)
40352 .kr(1)
40353 .sr(1)
40354 .m(8)
40355 .n(n)
40356 .k(1)
40357 .iterations(1)
40358 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40359 }
40360 }
40361
40362 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_gt_1) {
40363 TEST_REQUIRES_X86_AVX512F;
40364 for (size_t k = 2; k < 10; k++) {
40365 GemmMicrokernelTester()
40366 .mr(8)
40367 .nr(16)
40368 .kr(1)
40369 .sr(1)
40370 .m(8)
40371 .n(16)
40372 .k(k)
40373 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40374 }
40375 }
40376
40377 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
40378 TEST_REQUIRES_X86_AVX512F;
40379 for (size_t k = 2; k < 10; k++) {
40380 GemmMicrokernelTester()
40381 .mr(8)
40382 .nr(16)
40383 .kr(1)
40384 .sr(1)
40385 .m(8)
40386 .n(16)
40387 .k(k)
40388 .a_stride(11)
40389 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40390 }
40391 }
40392
40393 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
40394 TEST_REQUIRES_X86_AVX512F;
40395 for (size_t k = 2; k < 10; k++) {
40396 for (uint32_t m = 1; m <= 8; m++) {
40397 for (uint32_t n = 1; n <= 16; n++) {
40398 GemmMicrokernelTester()
40399 .mr(8)
40400 .nr(16)
40401 .kr(1)
40402 .sr(1)
40403 .m(m)
40404 .n(n)
40405 .k(k)
40406 .iterations(1)
40407 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40408 }
40409 }
40410 }
40411 }
40412
40413 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16) {
40414 TEST_REQUIRES_X86_AVX512F;
40415 for (uint32_t n = 17; n < 32; n++) {
40416 for (size_t k = 1; k <= 5; k += 2) {
40417 GemmMicrokernelTester()
40418 .mr(8)
40419 .nr(16)
40420 .kr(1)
40421 .sr(1)
40422 .m(8)
40423 .n(16)
40424 .k(k)
40425 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40426 }
40427 }
40428 }
40429
40430 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
40431 TEST_REQUIRES_X86_AVX512F;
40432 for (uint32_t n = 17; n < 32; n++) {
40433 for (size_t k = 1; k <= 5; k += 2) {
40434 GemmMicrokernelTester()
40435 .mr(8)
40436 .nr(16)
40437 .kr(1)
40438 .sr(1)
40439 .m(8)
40440 .n(16)
40441 .k(k)
40442 .cn_stride(19)
40443 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40444 }
40445 }
40446 }
40447
40448 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
40449 TEST_REQUIRES_X86_AVX512F;
40450 for (uint32_t n = 17; n < 32; n++) {
40451 for (size_t k = 1; k <= 5; k += 2) {
40452 GemmMicrokernelTester()
40453 .mr(8)
40454 .nr(16)
40455 .kr(1)
40456 .sr(1)
40457 .m(8)
40458 .n(n)
40459 .k(k)
40460 .a_stride(7)
40461 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40462 }
40463 }
40464 }
40465
40466 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
40467 TEST_REQUIRES_X86_AVX512F;
40468 for (uint32_t n = 17; n < 32; n++) {
40469 for (size_t k = 1; k <= 5; k += 2) {
40470 for (uint32_t m = 1; m <= 8; m++) {
40471 GemmMicrokernelTester()
40472 .mr(8)
40473 .nr(16)
40474 .kr(1)
40475 .sr(1)
40476 .m(m)
40477 .n(n)
40478 .k(k)
40479 .iterations(1)
40480 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40481 }
40482 }
40483 }
40484 }
40485
40486 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16) {
40487 TEST_REQUIRES_X86_AVX512F;
40488 for (uint32_t n = 32; n <= 48; n += 16) {
40489 for (size_t k = 1; k <= 5; k += 2) {
40490 GemmMicrokernelTester()
40491 .mr(8)
40492 .nr(16)
40493 .kr(1)
40494 .sr(1)
40495 .m(8)
40496 .n(16)
40497 .k(k)
40498 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40499 }
40500 }
40501 }
40502
40503 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
40504 TEST_REQUIRES_X86_AVX512F;
40505 for (uint32_t n = 32; n <= 48; n += 16) {
40506 for (size_t k = 1; k <= 5; k += 2) {
40507 GemmMicrokernelTester()
40508 .mr(8)
40509 .nr(16)
40510 .kr(1)
40511 .sr(1)
40512 .m(8)
40513 .n(n)
40514 .k(k)
40515 .cn_stride(19)
40516 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40517 }
40518 }
40519 }
40520
40521 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16_strided_a) {
40522 TEST_REQUIRES_X86_AVX512F;
40523 for (uint32_t n = 32; n <= 48; n += 16) {
40524 for (size_t k = 1; k <= 5; k += 2) {
40525 GemmMicrokernelTester()
40526 .mr(8)
40527 .nr(16)
40528 .kr(1)
40529 .sr(1)
40530 .m(8)
40531 .n(n)
40532 .k(k)
40533 .a_stride(7)
40534 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40535 }
40536 }
40537 }
40538
40539 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
40540 TEST_REQUIRES_X86_AVX512F;
40541 for (uint32_t n = 32; n <= 48; n += 16) {
40542 for (size_t k = 1; k <= 5; k += 2) {
40543 for (uint32_t m = 1; m <= 8; m++) {
40544 GemmMicrokernelTester()
40545 .mr(8)
40546 .nr(16)
40547 .kr(1)
40548 .sr(1)
40549 .m(m)
40550 .n(n)
40551 .k(k)
40552 .iterations(1)
40553 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40554 }
40555 }
40556 }
40557 }
40558
40559 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
40560 TEST_REQUIRES_X86_AVX512F;
40561 for (size_t k = 1; k <= 5; k += 2) {
40562 for (uint32_t m = 1; m <= 8; m++) {
40563 for (uint32_t n = 1; n <= 16; n++) {
40564 GemmMicrokernelTester()
40565 .mr(8)
40566 .nr(16)
40567 .kr(1)
40568 .sr(1)
40569 .m(m)
40570 .n(n)
40571 .k(k)
40572 .cm_stride(19)
40573 .iterations(1)
40574 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40575 }
40576 }
40577 }
40578 }
40579
40580 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, qmin) {
40581 TEST_REQUIRES_X86_AVX512F;
40582 GemmMicrokernelTester()
40583 .mr(8)
40584 .nr(16)
40585 .kr(1)
40586 .sr(1)
40587 .m(8)
40588 .n(16)
40589 .k(1)
40590 .qmin(128)
40591 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40592 }
40593
40594 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, qmax) {
40595 TEST_REQUIRES_X86_AVX512F;
40596 GemmMicrokernelTester()
40597 .mr(8)
40598 .nr(16)
40599 .kr(1)
40600 .sr(1)
40601 .m(8)
40602 .n(16)
40603 .k(1)
40604 .qmax(128)
40605 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40606 }
40607
40608 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, strided_cm) {
40609 TEST_REQUIRES_X86_AVX512F;
40610 GemmMicrokernelTester()
40611 .mr(8)
40612 .nr(16)
40613 .kr(1)
40614 .sr(1)
40615 .m(8)
40616 .n(16)
40617 .k(1)
40618 .cm_stride(19)
40619 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
40620 }
40621#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
40622
40623
40624#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
40625 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1) {
40626 TEST_REQUIRES_PSIMD;
40627 GemmMicrokernelTester()
40628 .mr(1)
40629 .nr(8)
40630 .kr(1)
40631 .sr(1)
40632 .m(1)
40633 .n(8)
40634 .k(1)
40635 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40636 }
40637
40638 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, strided_cn) {
40639 TEST_REQUIRES_PSIMD;
40640 GemmMicrokernelTester()
40641 .mr(1)
40642 .nr(8)
40643 .kr(1)
40644 .sr(1)
40645 .m(1)
40646 .n(8)
40647 .k(1)
40648 .cn_stride(11)
40649 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40650 }
40651
40652 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
40653 TEST_REQUIRES_PSIMD;
40654 GemmMicrokernelTester()
40655 .mr(1)
40656 .nr(8)
40657 .kr(1)
40658 .sr(1)
40659 .m(1)
40660 .n(8)
40661 .k(1)
40662 .a_stride(3)
40663 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40664 }
40665
40666 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
40667 TEST_REQUIRES_PSIMD;
40668 for (uint32_t m = 1; m <= 1; m++) {
40669 for (uint32_t n = 1; n <= 8; n++) {
40670 GemmMicrokernelTester()
40671 .mr(1)
40672 .nr(8)
40673 .kr(1)
40674 .sr(1)
40675 .m(m)
40676 .n(n)
40677 .k(1)
40678 .iterations(1)
40679 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40680 }
40681 }
40682 }
40683
40684 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
40685 TEST_REQUIRES_PSIMD;
40686 for (uint32_t m = 1; m <= 1; m++) {
40687 GemmMicrokernelTester()
40688 .mr(1)
40689 .nr(8)
40690 .kr(1)
40691 .sr(1)
40692 .m(m)
40693 .n(8)
40694 .k(1)
40695 .iterations(1)
40696 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40697 }
40698 }
40699
40700 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
40701 TEST_REQUIRES_PSIMD;
40702 for (uint32_t n = 1; n <= 8; n++) {
40703 GemmMicrokernelTester()
40704 .mr(1)
40705 .nr(8)
40706 .kr(1)
40707 .sr(1)
40708 .m(1)
40709 .n(n)
40710 .k(1)
40711 .iterations(1)
40712 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40713 }
40714 }
40715
40716 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_gt_1) {
40717 TEST_REQUIRES_PSIMD;
40718 for (size_t k = 2; k < 10; k++) {
40719 GemmMicrokernelTester()
40720 .mr(1)
40721 .nr(8)
40722 .kr(1)
40723 .sr(1)
40724 .m(1)
40725 .n(8)
40726 .k(k)
40727 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40728 }
40729 }
40730
40731 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
40732 TEST_REQUIRES_PSIMD;
40733 for (size_t k = 2; k < 10; k++) {
40734 GemmMicrokernelTester()
40735 .mr(1)
40736 .nr(8)
40737 .kr(1)
40738 .sr(1)
40739 .m(1)
40740 .n(8)
40741 .k(k)
40742 .a_stride(11)
40743 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40744 }
40745 }
40746
40747 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
40748 TEST_REQUIRES_PSIMD;
40749 for (size_t k = 2; k < 10; k++) {
40750 for (uint32_t m = 1; m <= 1; m++) {
40751 for (uint32_t n = 1; n <= 8; n++) {
40752 GemmMicrokernelTester()
40753 .mr(1)
40754 .nr(8)
40755 .kr(1)
40756 .sr(1)
40757 .m(m)
40758 .n(n)
40759 .k(k)
40760 .iterations(1)
40761 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40762 }
40763 }
40764 }
40765 }
40766
40767 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8) {
40768 TEST_REQUIRES_PSIMD;
40769 for (uint32_t n = 9; n < 16; n++) {
40770 for (size_t k = 1; k <= 5; k += 2) {
40771 GemmMicrokernelTester()
40772 .mr(1)
40773 .nr(8)
40774 .kr(1)
40775 .sr(1)
40776 .m(1)
40777 .n(8)
40778 .k(k)
40779 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40780 }
40781 }
40782 }
40783
40784 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
40785 TEST_REQUIRES_PSIMD;
40786 for (uint32_t n = 9; n < 16; n++) {
40787 for (size_t k = 1; k <= 5; k += 2) {
40788 GemmMicrokernelTester()
40789 .mr(1)
40790 .nr(8)
40791 .kr(1)
40792 .sr(1)
40793 .m(1)
40794 .n(8)
40795 .k(k)
40796 .cn_stride(11)
40797 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40798 }
40799 }
40800 }
40801
40802 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
40803 TEST_REQUIRES_PSIMD;
40804 for (uint32_t n = 9; n < 16; n++) {
40805 for (size_t k = 1; k <= 5; k += 2) {
40806 GemmMicrokernelTester()
40807 .mr(1)
40808 .nr(8)
40809 .kr(1)
40810 .sr(1)
40811 .m(1)
40812 .n(n)
40813 .k(k)
40814 .a_stride(7)
40815 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40816 }
40817 }
40818 }
40819
40820 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
40821 TEST_REQUIRES_PSIMD;
40822 for (uint32_t n = 9; n < 16; n++) {
40823 for (size_t k = 1; k <= 5; k += 2) {
40824 for (uint32_t m = 1; m <= 1; m++) {
40825 GemmMicrokernelTester()
40826 .mr(1)
40827 .nr(8)
40828 .kr(1)
40829 .sr(1)
40830 .m(m)
40831 .n(n)
40832 .k(k)
40833 .iterations(1)
40834 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40835 }
40836 }
40837 }
40838 }
40839
40840 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8) {
40841 TEST_REQUIRES_PSIMD;
40842 for (uint32_t n = 16; n <= 24; n += 8) {
40843 for (size_t k = 1; k <= 5; k += 2) {
40844 GemmMicrokernelTester()
40845 .mr(1)
40846 .nr(8)
40847 .kr(1)
40848 .sr(1)
40849 .m(1)
40850 .n(8)
40851 .k(k)
40852 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40853 }
40854 }
40855 }
40856
40857 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
40858 TEST_REQUIRES_PSIMD;
40859 for (uint32_t n = 16; n <= 24; n += 8) {
40860 for (size_t k = 1; k <= 5; k += 2) {
40861 GemmMicrokernelTester()
40862 .mr(1)
40863 .nr(8)
40864 .kr(1)
40865 .sr(1)
40866 .m(1)
40867 .n(n)
40868 .k(k)
40869 .cn_stride(11)
40870 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40871 }
40872 }
40873 }
40874
40875 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
40876 TEST_REQUIRES_PSIMD;
40877 for (uint32_t n = 16; n <= 24; n += 8) {
40878 for (size_t k = 1; k <= 5; k += 2) {
40879 GemmMicrokernelTester()
40880 .mr(1)
40881 .nr(8)
40882 .kr(1)
40883 .sr(1)
40884 .m(1)
40885 .n(n)
40886 .k(k)
40887 .a_stride(7)
40888 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40889 }
40890 }
40891 }
40892
40893 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
40894 TEST_REQUIRES_PSIMD;
40895 for (uint32_t n = 16; n <= 24; n += 8) {
40896 for (size_t k = 1; k <= 5; k += 2) {
40897 for (uint32_t m = 1; m <= 1; m++) {
40898 GemmMicrokernelTester()
40899 .mr(1)
40900 .nr(8)
40901 .kr(1)
40902 .sr(1)
40903 .m(m)
40904 .n(n)
40905 .k(k)
40906 .iterations(1)
40907 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40908 }
40909 }
40910 }
40911 }
40912
40913 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
40914 TEST_REQUIRES_PSIMD;
40915 for (size_t k = 1; k <= 5; k += 2) {
40916 for (uint32_t m = 1; m <= 1; m++) {
40917 for (uint32_t n = 1; n <= 8; n++) {
40918 GemmMicrokernelTester()
40919 .mr(1)
40920 .nr(8)
40921 .kr(1)
40922 .sr(1)
40923 .m(m)
40924 .n(n)
40925 .k(k)
40926 .cm_stride(11)
40927 .iterations(1)
40928 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40929 }
40930 }
40931 }
40932 }
40933
40934 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, qmin) {
40935 TEST_REQUIRES_PSIMD;
40936 GemmMicrokernelTester()
40937 .mr(1)
40938 .nr(8)
40939 .kr(1)
40940 .sr(1)
40941 .m(1)
40942 .n(8)
40943 .k(1)
40944 .qmin(128)
40945 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40946 }
40947
40948 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, qmax) {
40949 TEST_REQUIRES_PSIMD;
40950 GemmMicrokernelTester()
40951 .mr(1)
40952 .nr(8)
40953 .kr(1)
40954 .sr(1)
40955 .m(1)
40956 .n(8)
40957 .k(1)
40958 .qmax(128)
40959 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40960 }
40961
40962 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, strided_cm) {
40963 TEST_REQUIRES_PSIMD;
40964 GemmMicrokernelTester()
40965 .mr(1)
40966 .nr(8)
40967 .kr(1)
40968 .sr(1)
40969 .m(1)
40970 .n(8)
40971 .k(1)
40972 .cm_stride(11)
40973 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40974 }
40975#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
40976
40977
40978#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
40979 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1) {
40980 TEST_REQUIRES_PSIMD;
40981 GemmMicrokernelTester()
40982 .mr(4)
40983 .nr(8)
40984 .kr(1)
40985 .sr(1)
40986 .m(4)
40987 .n(8)
40988 .k(1)
40989 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
40990 }
40991
40992 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, strided_cn) {
40993 TEST_REQUIRES_PSIMD;
40994 GemmMicrokernelTester()
40995 .mr(4)
40996 .nr(8)
40997 .kr(1)
40998 .sr(1)
40999 .m(4)
41000 .n(8)
41001 .k(1)
41002 .cn_stride(11)
41003 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41004 }
41005
41006 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
41007 TEST_REQUIRES_PSIMD;
41008 GemmMicrokernelTester()
41009 .mr(4)
41010 .nr(8)
41011 .kr(1)
41012 .sr(1)
41013 .m(4)
41014 .n(8)
41015 .k(1)
41016 .a_stride(3)
41017 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41018 }
41019
41020 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
41021 TEST_REQUIRES_PSIMD;
41022 for (uint32_t m = 1; m <= 4; m++) {
41023 for (uint32_t n = 1; n <= 8; n++) {
41024 GemmMicrokernelTester()
41025 .mr(4)
41026 .nr(8)
41027 .kr(1)
41028 .sr(1)
41029 .m(m)
41030 .n(n)
41031 .k(1)
41032 .iterations(1)
41033 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41034 }
41035 }
41036 }
41037
41038 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
41039 TEST_REQUIRES_PSIMD;
41040 for (uint32_t m = 1; m <= 4; m++) {
41041 GemmMicrokernelTester()
41042 .mr(4)
41043 .nr(8)
41044 .kr(1)
41045 .sr(1)
41046 .m(m)
41047 .n(8)
41048 .k(1)
41049 .iterations(1)
41050 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41051 }
41052 }
41053
41054 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
41055 TEST_REQUIRES_PSIMD;
41056 for (uint32_t n = 1; n <= 8; n++) {
41057 GemmMicrokernelTester()
41058 .mr(4)
41059 .nr(8)
41060 .kr(1)
41061 .sr(1)
41062 .m(4)
41063 .n(n)
41064 .k(1)
41065 .iterations(1)
41066 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41067 }
41068 }
41069
41070 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_gt_1) {
41071 TEST_REQUIRES_PSIMD;
41072 for (size_t k = 2; k < 10; k++) {
41073 GemmMicrokernelTester()
41074 .mr(4)
41075 .nr(8)
41076 .kr(1)
41077 .sr(1)
41078 .m(4)
41079 .n(8)
41080 .k(k)
41081 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41082 }
41083 }
41084
41085 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
41086 TEST_REQUIRES_PSIMD;
41087 for (size_t k = 2; k < 10; k++) {
41088 GemmMicrokernelTester()
41089 .mr(4)
41090 .nr(8)
41091 .kr(1)
41092 .sr(1)
41093 .m(4)
41094 .n(8)
41095 .k(k)
41096 .a_stride(11)
41097 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41098 }
41099 }
41100
41101 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
41102 TEST_REQUIRES_PSIMD;
41103 for (size_t k = 2; k < 10; k++) {
41104 for (uint32_t m = 1; m <= 4; m++) {
41105 for (uint32_t n = 1; n <= 8; n++) {
41106 GemmMicrokernelTester()
41107 .mr(4)
41108 .nr(8)
41109 .kr(1)
41110 .sr(1)
41111 .m(m)
41112 .n(n)
41113 .k(k)
41114 .iterations(1)
41115 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41116 }
41117 }
41118 }
41119 }
41120
41121 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8) {
41122 TEST_REQUIRES_PSIMD;
41123 for (uint32_t n = 9; n < 16; n++) {
41124 for (size_t k = 1; k <= 5; k += 2) {
41125 GemmMicrokernelTester()
41126 .mr(4)
41127 .nr(8)
41128 .kr(1)
41129 .sr(1)
41130 .m(4)
41131 .n(8)
41132 .k(k)
41133 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41134 }
41135 }
41136 }
41137
41138 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
41139 TEST_REQUIRES_PSIMD;
41140 for (uint32_t n = 9; n < 16; n++) {
41141 for (size_t k = 1; k <= 5; k += 2) {
41142 GemmMicrokernelTester()
41143 .mr(4)
41144 .nr(8)
41145 .kr(1)
41146 .sr(1)
41147 .m(4)
41148 .n(8)
41149 .k(k)
41150 .cn_stride(11)
41151 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41152 }
41153 }
41154 }
41155
41156 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
41157 TEST_REQUIRES_PSIMD;
41158 for (uint32_t n = 9; n < 16; n++) {
41159 for (size_t k = 1; k <= 5; k += 2) {
41160 GemmMicrokernelTester()
41161 .mr(4)
41162 .nr(8)
41163 .kr(1)
41164 .sr(1)
41165 .m(4)
41166 .n(n)
41167 .k(k)
41168 .a_stride(7)
41169 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41170 }
41171 }
41172 }
41173
41174 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
41175 TEST_REQUIRES_PSIMD;
41176 for (uint32_t n = 9; n < 16; n++) {
41177 for (size_t k = 1; k <= 5; k += 2) {
41178 for (uint32_t m = 1; m <= 4; m++) {
41179 GemmMicrokernelTester()
41180 .mr(4)
41181 .nr(8)
41182 .kr(1)
41183 .sr(1)
41184 .m(m)
41185 .n(n)
41186 .k(k)
41187 .iterations(1)
41188 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41189 }
41190 }
41191 }
41192 }
41193
41194 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8) {
41195 TEST_REQUIRES_PSIMD;
41196 for (uint32_t n = 16; n <= 24; n += 8) {
41197 for (size_t k = 1; k <= 5; k += 2) {
41198 GemmMicrokernelTester()
41199 .mr(4)
41200 .nr(8)
41201 .kr(1)
41202 .sr(1)
41203 .m(4)
41204 .n(8)
41205 .k(k)
41206 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41207 }
41208 }
41209 }
41210
41211 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
41212 TEST_REQUIRES_PSIMD;
41213 for (uint32_t n = 16; n <= 24; n += 8) {
41214 for (size_t k = 1; k <= 5; k += 2) {
41215 GemmMicrokernelTester()
41216 .mr(4)
41217 .nr(8)
41218 .kr(1)
41219 .sr(1)
41220 .m(4)
41221 .n(n)
41222 .k(k)
41223 .cn_stride(11)
41224 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41225 }
41226 }
41227 }
41228
41229 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
41230 TEST_REQUIRES_PSIMD;
41231 for (uint32_t n = 16; n <= 24; n += 8) {
41232 for (size_t k = 1; k <= 5; k += 2) {
41233 GemmMicrokernelTester()
41234 .mr(4)
41235 .nr(8)
41236 .kr(1)
41237 .sr(1)
41238 .m(4)
41239 .n(n)
41240 .k(k)
41241 .a_stride(7)
41242 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41243 }
41244 }
41245 }
41246
41247 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
41248 TEST_REQUIRES_PSIMD;
41249 for (uint32_t n = 16; n <= 24; n += 8) {
41250 for (size_t k = 1; k <= 5; k += 2) {
41251 for (uint32_t m = 1; m <= 4; m++) {
41252 GemmMicrokernelTester()
41253 .mr(4)
41254 .nr(8)
41255 .kr(1)
41256 .sr(1)
41257 .m(m)
41258 .n(n)
41259 .k(k)
41260 .iterations(1)
41261 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41262 }
41263 }
41264 }
41265 }
41266
41267 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
41268 TEST_REQUIRES_PSIMD;
41269 for (size_t k = 1; k <= 5; k += 2) {
41270 for (uint32_t m = 1; m <= 4; m++) {
41271 for (uint32_t n = 1; n <= 8; n++) {
41272 GemmMicrokernelTester()
41273 .mr(4)
41274 .nr(8)
41275 .kr(1)
41276 .sr(1)
41277 .m(m)
41278 .n(n)
41279 .k(k)
41280 .cm_stride(11)
41281 .iterations(1)
41282 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41283 }
41284 }
41285 }
41286 }
41287
41288 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, qmin) {
41289 TEST_REQUIRES_PSIMD;
41290 GemmMicrokernelTester()
41291 .mr(4)
41292 .nr(8)
41293 .kr(1)
41294 .sr(1)
41295 .m(4)
41296 .n(8)
41297 .k(1)
41298 .qmin(128)
41299 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41300 }
41301
41302 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, qmax) {
41303 TEST_REQUIRES_PSIMD;
41304 GemmMicrokernelTester()
41305 .mr(4)
41306 .nr(8)
41307 .kr(1)
41308 .sr(1)
41309 .m(4)
41310 .n(8)
41311 .k(1)
41312 .qmax(128)
41313 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41314 }
41315
41316 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, strided_cm) {
41317 TEST_REQUIRES_PSIMD;
41318 GemmMicrokernelTester()
41319 .mr(4)
41320 .nr(8)
41321 .kr(1)
41322 .sr(1)
41323 .m(4)
41324 .n(8)
41325 .k(1)
41326 .cm_stride(11)
41327 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41328 }
41329#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41330
41331
41332#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41333 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1) {
41334 TEST_REQUIRES_PSIMD;
41335 GemmMicrokernelTester()
41336 .mr(6)
41337 .nr(8)
41338 .kr(1)
41339 .sr(1)
41340 .m(6)
41341 .n(8)
41342 .k(1)
41343 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41344 }
41345
41346 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, strided_cn) {
41347 TEST_REQUIRES_PSIMD;
41348 GemmMicrokernelTester()
41349 .mr(6)
41350 .nr(8)
41351 .kr(1)
41352 .sr(1)
41353 .m(6)
41354 .n(8)
41355 .k(1)
41356 .cn_stride(11)
41357 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41358 }
41359
41360 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
41361 TEST_REQUIRES_PSIMD;
41362 GemmMicrokernelTester()
41363 .mr(6)
41364 .nr(8)
41365 .kr(1)
41366 .sr(1)
41367 .m(6)
41368 .n(8)
41369 .k(1)
41370 .a_stride(3)
41371 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41372 }
41373
41374 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
41375 TEST_REQUIRES_PSIMD;
41376 for (uint32_t m = 1; m <= 6; m++) {
41377 for (uint32_t n = 1; n <= 8; n++) {
41378 GemmMicrokernelTester()
41379 .mr(6)
41380 .nr(8)
41381 .kr(1)
41382 .sr(1)
41383 .m(m)
41384 .n(n)
41385 .k(1)
41386 .iterations(1)
41387 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41388 }
41389 }
41390 }
41391
41392 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
41393 TEST_REQUIRES_PSIMD;
41394 for (uint32_t m = 1; m <= 6; m++) {
41395 GemmMicrokernelTester()
41396 .mr(6)
41397 .nr(8)
41398 .kr(1)
41399 .sr(1)
41400 .m(m)
41401 .n(8)
41402 .k(1)
41403 .iterations(1)
41404 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41405 }
41406 }
41407
41408 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
41409 TEST_REQUIRES_PSIMD;
41410 for (uint32_t n = 1; n <= 8; n++) {
41411 GemmMicrokernelTester()
41412 .mr(6)
41413 .nr(8)
41414 .kr(1)
41415 .sr(1)
41416 .m(6)
41417 .n(n)
41418 .k(1)
41419 .iterations(1)
41420 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41421 }
41422 }
41423
41424 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_gt_1) {
41425 TEST_REQUIRES_PSIMD;
41426 for (size_t k = 2; k < 10; k++) {
41427 GemmMicrokernelTester()
41428 .mr(6)
41429 .nr(8)
41430 .kr(1)
41431 .sr(1)
41432 .m(6)
41433 .n(8)
41434 .k(k)
41435 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41436 }
41437 }
41438
41439 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
41440 TEST_REQUIRES_PSIMD;
41441 for (size_t k = 2; k < 10; k++) {
41442 GemmMicrokernelTester()
41443 .mr(6)
41444 .nr(8)
41445 .kr(1)
41446 .sr(1)
41447 .m(6)
41448 .n(8)
41449 .k(k)
41450 .a_stride(11)
41451 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41452 }
41453 }
41454
41455 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
41456 TEST_REQUIRES_PSIMD;
41457 for (size_t k = 2; k < 10; k++) {
41458 for (uint32_t m = 1; m <= 6; m++) {
41459 for (uint32_t n = 1; n <= 8; n++) {
41460 GemmMicrokernelTester()
41461 .mr(6)
41462 .nr(8)
41463 .kr(1)
41464 .sr(1)
41465 .m(m)
41466 .n(n)
41467 .k(k)
41468 .iterations(1)
41469 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41470 }
41471 }
41472 }
41473 }
41474
41475 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8) {
41476 TEST_REQUIRES_PSIMD;
41477 for (uint32_t n = 9; n < 16; n++) {
41478 for (size_t k = 1; k <= 5; k += 2) {
41479 GemmMicrokernelTester()
41480 .mr(6)
41481 .nr(8)
41482 .kr(1)
41483 .sr(1)
41484 .m(6)
41485 .n(8)
41486 .k(k)
41487 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41488 }
41489 }
41490 }
41491
41492 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
41493 TEST_REQUIRES_PSIMD;
41494 for (uint32_t n = 9; n < 16; n++) {
41495 for (size_t k = 1; k <= 5; k += 2) {
41496 GemmMicrokernelTester()
41497 .mr(6)
41498 .nr(8)
41499 .kr(1)
41500 .sr(1)
41501 .m(6)
41502 .n(8)
41503 .k(k)
41504 .cn_stride(11)
41505 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41506 }
41507 }
41508 }
41509
41510 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
41511 TEST_REQUIRES_PSIMD;
41512 for (uint32_t n = 9; n < 16; n++) {
41513 for (size_t k = 1; k <= 5; k += 2) {
41514 GemmMicrokernelTester()
41515 .mr(6)
41516 .nr(8)
41517 .kr(1)
41518 .sr(1)
41519 .m(6)
41520 .n(n)
41521 .k(k)
41522 .a_stride(7)
41523 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41524 }
41525 }
41526 }
41527
41528 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
41529 TEST_REQUIRES_PSIMD;
41530 for (uint32_t n = 9; n < 16; n++) {
41531 for (size_t k = 1; k <= 5; k += 2) {
41532 for (uint32_t m = 1; m <= 6; m++) {
41533 GemmMicrokernelTester()
41534 .mr(6)
41535 .nr(8)
41536 .kr(1)
41537 .sr(1)
41538 .m(m)
41539 .n(n)
41540 .k(k)
41541 .iterations(1)
41542 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41543 }
41544 }
41545 }
41546 }
41547
41548 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8) {
41549 TEST_REQUIRES_PSIMD;
41550 for (uint32_t n = 16; n <= 24; n += 8) {
41551 for (size_t k = 1; k <= 5; k += 2) {
41552 GemmMicrokernelTester()
41553 .mr(6)
41554 .nr(8)
41555 .kr(1)
41556 .sr(1)
41557 .m(6)
41558 .n(8)
41559 .k(k)
41560 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41561 }
41562 }
41563 }
41564
41565 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
41566 TEST_REQUIRES_PSIMD;
41567 for (uint32_t n = 16; n <= 24; n += 8) {
41568 for (size_t k = 1; k <= 5; k += 2) {
41569 GemmMicrokernelTester()
41570 .mr(6)
41571 .nr(8)
41572 .kr(1)
41573 .sr(1)
41574 .m(6)
41575 .n(n)
41576 .k(k)
41577 .cn_stride(11)
41578 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41579 }
41580 }
41581 }
41582
41583 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
41584 TEST_REQUIRES_PSIMD;
41585 for (uint32_t n = 16; n <= 24; n += 8) {
41586 for (size_t k = 1; k <= 5; k += 2) {
41587 GemmMicrokernelTester()
41588 .mr(6)
41589 .nr(8)
41590 .kr(1)
41591 .sr(1)
41592 .m(6)
41593 .n(n)
41594 .k(k)
41595 .a_stride(7)
41596 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41597 }
41598 }
41599 }
41600
41601 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
41602 TEST_REQUIRES_PSIMD;
41603 for (uint32_t n = 16; n <= 24; n += 8) {
41604 for (size_t k = 1; k <= 5; k += 2) {
41605 for (uint32_t m = 1; m <= 6; m++) {
41606 GemmMicrokernelTester()
41607 .mr(6)
41608 .nr(8)
41609 .kr(1)
41610 .sr(1)
41611 .m(m)
41612 .n(n)
41613 .k(k)
41614 .iterations(1)
41615 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41616 }
41617 }
41618 }
41619 }
41620
41621 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
41622 TEST_REQUIRES_PSIMD;
41623 for (size_t k = 1; k <= 5; k += 2) {
41624 for (uint32_t m = 1; m <= 6; m++) {
41625 for (uint32_t n = 1; n <= 8; n++) {
41626 GemmMicrokernelTester()
41627 .mr(6)
41628 .nr(8)
41629 .kr(1)
41630 .sr(1)
41631 .m(m)
41632 .n(n)
41633 .k(k)
41634 .cm_stride(11)
41635 .iterations(1)
41636 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41637 }
41638 }
41639 }
41640 }
41641
41642 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, qmin) {
41643 TEST_REQUIRES_PSIMD;
41644 GemmMicrokernelTester()
41645 .mr(6)
41646 .nr(8)
41647 .kr(1)
41648 .sr(1)
41649 .m(6)
41650 .n(8)
41651 .k(1)
41652 .qmin(128)
41653 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41654 }
41655
41656 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, qmax) {
41657 TEST_REQUIRES_PSIMD;
41658 GemmMicrokernelTester()
41659 .mr(6)
41660 .nr(8)
41661 .kr(1)
41662 .sr(1)
41663 .m(6)
41664 .n(8)
41665 .k(1)
41666 .qmax(128)
41667 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41668 }
41669
41670 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, strided_cm) {
41671 TEST_REQUIRES_PSIMD;
41672 GemmMicrokernelTester()
41673 .mr(6)
41674 .nr(8)
41675 .kr(1)
41676 .sr(1)
41677 .m(6)
41678 .n(8)
41679 .k(1)
41680 .cm_stride(11)
41681 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41682 }
41683#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41684
41685
41686#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41687 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4) {
41688 TEST_REQUIRES_PSIMD;
41689 GemmMicrokernelTester()
41690 .mr(1)
41691 .nr(8)
41692 .kr(1)
41693 .sr(1)
41694 .m(1)
41695 .n(8)
41696 .k(4)
41697 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41698 }
41699
41700 TEST(F32_GEMM_1X8__PSIMD_SPLAT, strided_cn) {
41701 TEST_REQUIRES_PSIMD;
41702 GemmMicrokernelTester()
41703 .mr(1)
41704 .nr(8)
41705 .kr(1)
41706 .sr(1)
41707 .m(1)
41708 .n(8)
41709 .k(4)
41710 .cn_stride(11)
41711 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41712 }
41713
41714 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_strided_a) {
41715 TEST_REQUIRES_PSIMD;
41716 GemmMicrokernelTester()
41717 .mr(1)
41718 .nr(8)
41719 .kr(1)
41720 .sr(1)
41721 .m(1)
41722 .n(8)
41723 .k(4)
41724 .a_stride(7)
41725 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41726 }
41727
41728 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile) {
41729 TEST_REQUIRES_PSIMD;
41730 for (uint32_t m = 1; m <= 1; m++) {
41731 for (uint32_t n = 1; n <= 8; n++) {
41732 GemmMicrokernelTester()
41733 .mr(1)
41734 .nr(8)
41735 .kr(1)
41736 .sr(1)
41737 .m(m)
41738 .n(n)
41739 .k(4)
41740 .iterations(1)
41741 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41742 }
41743 }
41744 }
41745
41746 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
41747 TEST_REQUIRES_PSIMD;
41748 for (uint32_t m = 1; m <= 1; m++) {
41749 GemmMicrokernelTester()
41750 .mr(1)
41751 .nr(8)
41752 .kr(1)
41753 .sr(1)
41754 .m(m)
41755 .n(8)
41756 .k(4)
41757 .iterations(1)
41758 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41759 }
41760 }
41761
41762 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
41763 TEST_REQUIRES_PSIMD;
41764 for (uint32_t n = 1; n <= 8; n++) {
41765 GemmMicrokernelTester()
41766 .mr(1)
41767 .nr(8)
41768 .kr(1)
41769 .sr(1)
41770 .m(1)
41771 .n(n)
41772 .k(4)
41773 .iterations(1)
41774 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41775 }
41776 }
41777
41778 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_lt_4) {
41779 TEST_REQUIRES_PSIMD;
41780 for (size_t k = 1; k < 4; k++) {
41781 GemmMicrokernelTester()
41782 .mr(1)
41783 .nr(8)
41784 .kr(1)
41785 .sr(1)
41786 .m(1)
41787 .n(8)
41788 .k(k)
41789 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41790 }
41791 }
41792
41793 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_lt_4_strided_a) {
41794 TEST_REQUIRES_PSIMD;
41795 for (size_t k = 1; k < 4; k++) {
41796 GemmMicrokernelTester()
41797 .mr(1)
41798 .nr(8)
41799 .kr(1)
41800 .sr(1)
41801 .m(1)
41802 .n(8)
41803 .k(k)
41804 .a_stride(7)
41805 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41806 }
41807 }
41808
41809 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_lt_4_subtile) {
41810 TEST_REQUIRES_PSIMD;
41811 for (size_t k = 1; k < 4; k++) {
41812 for (uint32_t m = 1; m <= 1; m++) {
41813 for (uint32_t n = 1; n <= 8; n++) {
41814 GemmMicrokernelTester()
41815 .mr(1)
41816 .nr(8)
41817 .kr(1)
41818 .sr(1)
41819 .m(m)
41820 .n(n)
41821 .k(k)
41822 .iterations(1)
41823 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41824 }
41825 }
41826 }
41827 }
41828
41829 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_gt_4) {
41830 TEST_REQUIRES_PSIMD;
41831 for (size_t k = 5; k < 8; k++) {
41832 GemmMicrokernelTester()
41833 .mr(1)
41834 .nr(8)
41835 .kr(1)
41836 .sr(1)
41837 .m(1)
41838 .n(8)
41839 .k(k)
41840 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41841 }
41842 }
41843
41844 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_gt_4_strided_a) {
41845 TEST_REQUIRES_PSIMD;
41846 for (size_t k = 5; k < 8; k++) {
41847 GemmMicrokernelTester()
41848 .mr(1)
41849 .nr(8)
41850 .kr(1)
41851 .sr(1)
41852 .m(1)
41853 .n(8)
41854 .k(k)
41855 .a_stride(11)
41856 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41857 }
41858 }
41859
41860 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_gt_4_subtile) {
41861 TEST_REQUIRES_PSIMD;
41862 for (size_t k = 5; k < 8; k++) {
41863 for (uint32_t m = 1; m <= 1; m++) {
41864 for (uint32_t n = 1; n <= 8; n++) {
41865 GemmMicrokernelTester()
41866 .mr(1)
41867 .nr(8)
41868 .kr(1)
41869 .sr(1)
41870 .m(m)
41871 .n(n)
41872 .k(k)
41873 .iterations(1)
41874 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41875 }
41876 }
41877 }
41878 }
41879
41880 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_div_4) {
41881 TEST_REQUIRES_PSIMD;
41882 for (size_t k = 8; k <= 40; k += 4) {
41883 GemmMicrokernelTester()
41884 .mr(1)
41885 .nr(8)
41886 .kr(1)
41887 .sr(1)
41888 .m(1)
41889 .n(8)
41890 .k(k)
41891 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41892 }
41893 }
41894
41895 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_div_4_strided_a) {
41896 TEST_REQUIRES_PSIMD;
41897 for (size_t k = 8; k <= 40; k += 4) {
41898 GemmMicrokernelTester()
41899 .mr(1)
41900 .nr(8)
41901 .kr(1)
41902 .sr(1)
41903 .m(1)
41904 .n(8)
41905 .k(k)
41906 .a_stride(43)
41907 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41908 }
41909 }
41910
41911 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_div_4_subtile) {
41912 TEST_REQUIRES_PSIMD;
41913 for (size_t k = 8; k <= 40; k += 4) {
41914 for (uint32_t m = 1; m <= 1; m++) {
41915 for (uint32_t n = 1; n <= 8; n++) {
41916 GemmMicrokernelTester()
41917 .mr(1)
41918 .nr(8)
41919 .kr(1)
41920 .sr(1)
41921 .m(m)
41922 .n(n)
41923 .k(k)
41924 .iterations(1)
41925 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41926 }
41927 }
41928 }
41929 }
41930
41931 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8) {
41932 TEST_REQUIRES_PSIMD;
41933 for (uint32_t n = 9; n < 16; n++) {
41934 for (size_t k = 1; k <= 20; k += 5) {
41935 GemmMicrokernelTester()
41936 .mr(1)
41937 .nr(8)
41938 .kr(1)
41939 .sr(1)
41940 .m(1)
41941 .n(8)
41942 .k(k)
41943 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41944 }
41945 }
41946 }
41947
41948 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
41949 TEST_REQUIRES_PSIMD;
41950 for (uint32_t n = 9; n < 16; n++) {
41951 for (size_t k = 1; k <= 20; k += 5) {
41952 GemmMicrokernelTester()
41953 .mr(1)
41954 .nr(8)
41955 .kr(1)
41956 .sr(1)
41957 .m(1)
41958 .n(8)
41959 .k(k)
41960 .cn_stride(11)
41961 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41962 }
41963 }
41964 }
41965
41966 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8_strided_a) {
41967 TEST_REQUIRES_PSIMD;
41968 for (uint32_t n = 9; n < 16; n++) {
41969 for (size_t k = 1; k <= 20; k += 5) {
41970 GemmMicrokernelTester()
41971 .mr(1)
41972 .nr(8)
41973 .kr(1)
41974 .sr(1)
41975 .m(1)
41976 .n(n)
41977 .k(k)
41978 .a_stride(23)
41979 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41980 }
41981 }
41982 }
41983
41984 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8_subtile) {
41985 TEST_REQUIRES_PSIMD;
41986 for (uint32_t n = 9; n < 16; n++) {
41987 for (size_t k = 1; k <= 20; k += 5) {
41988 for (uint32_t m = 1; m <= 1; m++) {
41989 GemmMicrokernelTester()
41990 .mr(1)
41991 .nr(8)
41992 .kr(1)
41993 .sr(1)
41994 .m(m)
41995 .n(n)
41996 .k(k)
41997 .iterations(1)
41998 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
41999 }
42000 }
42001 }
42002 }
42003
42004 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8) {
42005 TEST_REQUIRES_PSIMD;
42006 for (uint32_t n = 16; n <= 24; n += 8) {
42007 for (size_t k = 1; k <= 20; k += 5) {
42008 GemmMicrokernelTester()
42009 .mr(1)
42010 .nr(8)
42011 .kr(1)
42012 .sr(1)
42013 .m(1)
42014 .n(8)
42015 .k(k)
42016 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42017 }
42018 }
42019 }
42020
42021 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8_strided_cn) {
42022 TEST_REQUIRES_PSIMD;
42023 for (uint32_t n = 16; n <= 24; n += 8) {
42024 for (size_t k = 1; k <= 20; k += 5) {
42025 GemmMicrokernelTester()
42026 .mr(1)
42027 .nr(8)
42028 .kr(1)
42029 .sr(1)
42030 .m(1)
42031 .n(n)
42032 .k(k)
42033 .cn_stride(11)
42034 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42035 }
42036 }
42037 }
42038
42039 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8_strided_a) {
42040 TEST_REQUIRES_PSIMD;
42041 for (uint32_t n = 16; n <= 24; n += 8) {
42042 for (size_t k = 1; k <= 20; k += 5) {
42043 GemmMicrokernelTester()
42044 .mr(1)
42045 .nr(8)
42046 .kr(1)
42047 .sr(1)
42048 .m(1)
42049 .n(n)
42050 .k(k)
42051 .a_stride(23)
42052 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42053 }
42054 }
42055 }
42056
42057 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8_subtile) {
42058 TEST_REQUIRES_PSIMD;
42059 for (uint32_t n = 16; n <= 24; n += 8) {
42060 for (size_t k = 1; k <= 20; k += 5) {
42061 for (uint32_t m = 1; m <= 1; m++) {
42062 GemmMicrokernelTester()
42063 .mr(1)
42064 .nr(8)
42065 .kr(1)
42066 .sr(1)
42067 .m(m)
42068 .n(n)
42069 .k(k)
42070 .iterations(1)
42071 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42072 }
42073 }
42074 }
42075 }
42076
42077 TEST(F32_GEMM_1X8__PSIMD_SPLAT, strided_cm_subtile) {
42078 TEST_REQUIRES_PSIMD;
42079 for (size_t k = 1; k <= 20; k += 5) {
42080 for (uint32_t m = 1; m <= 1; m++) {
42081 for (uint32_t n = 1; n <= 8; n++) {
42082 GemmMicrokernelTester()
42083 .mr(1)
42084 .nr(8)
42085 .kr(1)
42086 .sr(1)
42087 .m(m)
42088 .n(n)
42089 .k(k)
42090 .cm_stride(11)
42091 .iterations(1)
42092 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42093 }
42094 }
42095 }
42096 }
42097
42098 TEST(F32_GEMM_1X8__PSIMD_SPLAT, qmin) {
42099 TEST_REQUIRES_PSIMD;
42100 GemmMicrokernelTester()
42101 .mr(1)
42102 .nr(8)
42103 .kr(1)
42104 .sr(1)
42105 .m(1)
42106 .n(8)
42107 .k(4)
42108 .qmin(128)
42109 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42110 }
42111
42112 TEST(F32_GEMM_1X8__PSIMD_SPLAT, qmax) {
42113 TEST_REQUIRES_PSIMD;
42114 GemmMicrokernelTester()
42115 .mr(1)
42116 .nr(8)
42117 .kr(1)
42118 .sr(1)
42119 .m(1)
42120 .n(8)
42121 .k(4)
42122 .qmax(128)
42123 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42124 }
42125
42126 TEST(F32_GEMM_1X8__PSIMD_SPLAT, strided_cm) {
42127 TEST_REQUIRES_PSIMD;
42128 GemmMicrokernelTester()
42129 .mr(1)
42130 .nr(8)
42131 .kr(1)
42132 .sr(1)
42133 .m(1)
42134 .n(8)
42135 .k(4)
42136 .cm_stride(11)
42137 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42138 }
42139#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42140
42141
42142#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42143 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4) {
42144 TEST_REQUIRES_PSIMD;
42145 GemmMicrokernelTester()
42146 .mr(4)
42147 .nr(8)
42148 .kr(1)
42149 .sr(1)
42150 .m(4)
42151 .n(8)
42152 .k(4)
42153 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42154 }
42155
42156 TEST(F32_GEMM_4X8__PSIMD_SPLAT, strided_cn) {
42157 TEST_REQUIRES_PSIMD;
42158 GemmMicrokernelTester()
42159 .mr(4)
42160 .nr(8)
42161 .kr(1)
42162 .sr(1)
42163 .m(4)
42164 .n(8)
42165 .k(4)
42166 .cn_stride(11)
42167 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42168 }
42169
42170 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_strided_a) {
42171 TEST_REQUIRES_PSIMD;
42172 GemmMicrokernelTester()
42173 .mr(4)
42174 .nr(8)
42175 .kr(1)
42176 .sr(1)
42177 .m(4)
42178 .n(8)
42179 .k(4)
42180 .a_stride(7)
42181 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42182 }
42183
42184 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile) {
42185 TEST_REQUIRES_PSIMD;
42186 for (uint32_t m = 1; m <= 4; m++) {
42187 for (uint32_t n = 1; n <= 8; n++) {
42188 GemmMicrokernelTester()
42189 .mr(4)
42190 .nr(8)
42191 .kr(1)
42192 .sr(1)
42193 .m(m)
42194 .n(n)
42195 .k(4)
42196 .iterations(1)
42197 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42198 }
42199 }
42200 }
42201
42202 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
42203 TEST_REQUIRES_PSIMD;
42204 for (uint32_t m = 1; m <= 4; m++) {
42205 GemmMicrokernelTester()
42206 .mr(4)
42207 .nr(8)
42208 .kr(1)
42209 .sr(1)
42210 .m(m)
42211 .n(8)
42212 .k(4)
42213 .iterations(1)
42214 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42215 }
42216 }
42217
42218 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
42219 TEST_REQUIRES_PSIMD;
42220 for (uint32_t n = 1; n <= 8; n++) {
42221 GemmMicrokernelTester()
42222 .mr(4)
42223 .nr(8)
42224 .kr(1)
42225 .sr(1)
42226 .m(4)
42227 .n(n)
42228 .k(4)
42229 .iterations(1)
42230 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42231 }
42232 }
42233
42234 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_lt_4) {
42235 TEST_REQUIRES_PSIMD;
42236 for (size_t k = 1; k < 4; k++) {
42237 GemmMicrokernelTester()
42238 .mr(4)
42239 .nr(8)
42240 .kr(1)
42241 .sr(1)
42242 .m(4)
42243 .n(8)
42244 .k(k)
42245 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42246 }
42247 }
42248
42249 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_lt_4_strided_a) {
42250 TEST_REQUIRES_PSIMD;
42251 for (size_t k = 1; k < 4; k++) {
42252 GemmMicrokernelTester()
42253 .mr(4)
42254 .nr(8)
42255 .kr(1)
42256 .sr(1)
42257 .m(4)
42258 .n(8)
42259 .k(k)
42260 .a_stride(7)
42261 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42262 }
42263 }
42264
42265 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_lt_4_subtile) {
42266 TEST_REQUIRES_PSIMD;
42267 for (size_t k = 1; k < 4; k++) {
42268 for (uint32_t m = 1; m <= 4; m++) {
42269 for (uint32_t n = 1; n <= 8; n++) {
42270 GemmMicrokernelTester()
42271 .mr(4)
42272 .nr(8)
42273 .kr(1)
42274 .sr(1)
42275 .m(m)
42276 .n(n)
42277 .k(k)
42278 .iterations(1)
42279 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42280 }
42281 }
42282 }
42283 }
42284
42285 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_gt_4) {
42286 TEST_REQUIRES_PSIMD;
42287 for (size_t k = 5; k < 8; k++) {
42288 GemmMicrokernelTester()
42289 .mr(4)
42290 .nr(8)
42291 .kr(1)
42292 .sr(1)
42293 .m(4)
42294 .n(8)
42295 .k(k)
42296 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42297 }
42298 }
42299
42300 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_gt_4_strided_a) {
42301 TEST_REQUIRES_PSIMD;
42302 for (size_t k = 5; k < 8; k++) {
42303 GemmMicrokernelTester()
42304 .mr(4)
42305 .nr(8)
42306 .kr(1)
42307 .sr(1)
42308 .m(4)
42309 .n(8)
42310 .k(k)
42311 .a_stride(11)
42312 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42313 }
42314 }
42315
42316 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_gt_4_subtile) {
42317 TEST_REQUIRES_PSIMD;
42318 for (size_t k = 5; k < 8; k++) {
42319 for (uint32_t m = 1; m <= 4; m++) {
42320 for (uint32_t n = 1; n <= 8; n++) {
42321 GemmMicrokernelTester()
42322 .mr(4)
42323 .nr(8)
42324 .kr(1)
42325 .sr(1)
42326 .m(m)
42327 .n(n)
42328 .k(k)
42329 .iterations(1)
42330 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42331 }
42332 }
42333 }
42334 }
42335
42336 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_div_4) {
42337 TEST_REQUIRES_PSIMD;
42338 for (size_t k = 8; k <= 40; k += 4) {
42339 GemmMicrokernelTester()
42340 .mr(4)
42341 .nr(8)
42342 .kr(1)
42343 .sr(1)
42344 .m(4)
42345 .n(8)
42346 .k(k)
42347 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42348 }
42349 }
42350
42351 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_div_4_strided_a) {
42352 TEST_REQUIRES_PSIMD;
42353 for (size_t k = 8; k <= 40; k += 4) {
42354 GemmMicrokernelTester()
42355 .mr(4)
42356 .nr(8)
42357 .kr(1)
42358 .sr(1)
42359 .m(4)
42360 .n(8)
42361 .k(k)
42362 .a_stride(43)
42363 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42364 }
42365 }
42366
42367 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_div_4_subtile) {
42368 TEST_REQUIRES_PSIMD;
42369 for (size_t k = 8; k <= 40; k += 4) {
42370 for (uint32_t m = 1; m <= 4; m++) {
42371 for (uint32_t n = 1; n <= 8; n++) {
42372 GemmMicrokernelTester()
42373 .mr(4)
42374 .nr(8)
42375 .kr(1)
42376 .sr(1)
42377 .m(m)
42378 .n(n)
42379 .k(k)
42380 .iterations(1)
42381 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42382 }
42383 }
42384 }
42385 }
42386
42387 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8) {
42388 TEST_REQUIRES_PSIMD;
42389 for (uint32_t n = 9; n < 16; n++) {
42390 for (size_t k = 1; k <= 20; k += 5) {
42391 GemmMicrokernelTester()
42392 .mr(4)
42393 .nr(8)
42394 .kr(1)
42395 .sr(1)
42396 .m(4)
42397 .n(8)
42398 .k(k)
42399 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42400 }
42401 }
42402 }
42403
42404 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
42405 TEST_REQUIRES_PSIMD;
42406 for (uint32_t n = 9; n < 16; n++) {
42407 for (size_t k = 1; k <= 20; k += 5) {
42408 GemmMicrokernelTester()
42409 .mr(4)
42410 .nr(8)
42411 .kr(1)
42412 .sr(1)
42413 .m(4)
42414 .n(8)
42415 .k(k)
42416 .cn_stride(11)
42417 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42418 }
42419 }
42420 }
42421
42422 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8_strided_a) {
42423 TEST_REQUIRES_PSIMD;
42424 for (uint32_t n = 9; n < 16; n++) {
42425 for (size_t k = 1; k <= 20; k += 5) {
42426 GemmMicrokernelTester()
42427 .mr(4)
42428 .nr(8)
42429 .kr(1)
42430 .sr(1)
42431 .m(4)
42432 .n(n)
42433 .k(k)
42434 .a_stride(23)
42435 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42436 }
42437 }
42438 }
42439
42440 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8_subtile) {
42441 TEST_REQUIRES_PSIMD;
42442 for (uint32_t n = 9; n < 16; n++) {
42443 for (size_t k = 1; k <= 20; k += 5) {
42444 for (uint32_t m = 1; m <= 4; m++) {
42445 GemmMicrokernelTester()
42446 .mr(4)
42447 .nr(8)
42448 .kr(1)
42449 .sr(1)
42450 .m(m)
42451 .n(n)
42452 .k(k)
42453 .iterations(1)
42454 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42455 }
42456 }
42457 }
42458 }
42459
42460 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8) {
42461 TEST_REQUIRES_PSIMD;
42462 for (uint32_t n = 16; n <= 24; n += 8) {
42463 for (size_t k = 1; k <= 20; k += 5) {
42464 GemmMicrokernelTester()
42465 .mr(4)
42466 .nr(8)
42467 .kr(1)
42468 .sr(1)
42469 .m(4)
42470 .n(8)
42471 .k(k)
42472 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42473 }
42474 }
42475 }
42476
42477 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8_strided_cn) {
42478 TEST_REQUIRES_PSIMD;
42479 for (uint32_t n = 16; n <= 24; n += 8) {
42480 for (size_t k = 1; k <= 20; k += 5) {
42481 GemmMicrokernelTester()
42482 .mr(4)
42483 .nr(8)
42484 .kr(1)
42485 .sr(1)
42486 .m(4)
42487 .n(n)
42488 .k(k)
42489 .cn_stride(11)
42490 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42491 }
42492 }
42493 }
42494
42495 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8_strided_a) {
42496 TEST_REQUIRES_PSIMD;
42497 for (uint32_t n = 16; n <= 24; n += 8) {
42498 for (size_t k = 1; k <= 20; k += 5) {
42499 GemmMicrokernelTester()
42500 .mr(4)
42501 .nr(8)
42502 .kr(1)
42503 .sr(1)
42504 .m(4)
42505 .n(n)
42506 .k(k)
42507 .a_stride(23)
42508 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42509 }
42510 }
42511 }
42512
42513 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8_subtile) {
42514 TEST_REQUIRES_PSIMD;
42515 for (uint32_t n = 16; n <= 24; n += 8) {
42516 for (size_t k = 1; k <= 20; k += 5) {
42517 for (uint32_t m = 1; m <= 4; m++) {
42518 GemmMicrokernelTester()
42519 .mr(4)
42520 .nr(8)
42521 .kr(1)
42522 .sr(1)
42523 .m(m)
42524 .n(n)
42525 .k(k)
42526 .iterations(1)
42527 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42528 }
42529 }
42530 }
42531 }
42532
42533 TEST(F32_GEMM_4X8__PSIMD_SPLAT, strided_cm_subtile) {
42534 TEST_REQUIRES_PSIMD;
42535 for (size_t k = 1; k <= 20; k += 5) {
42536 for (uint32_t m = 1; m <= 4; m++) {
42537 for (uint32_t n = 1; n <= 8; n++) {
42538 GemmMicrokernelTester()
42539 .mr(4)
42540 .nr(8)
42541 .kr(1)
42542 .sr(1)
42543 .m(m)
42544 .n(n)
42545 .k(k)
42546 .cm_stride(11)
42547 .iterations(1)
42548 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42549 }
42550 }
42551 }
42552 }
42553
42554 TEST(F32_GEMM_4X8__PSIMD_SPLAT, qmin) {
42555 TEST_REQUIRES_PSIMD;
42556 GemmMicrokernelTester()
42557 .mr(4)
42558 .nr(8)
42559 .kr(1)
42560 .sr(1)
42561 .m(4)
42562 .n(8)
42563 .k(4)
42564 .qmin(128)
42565 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42566 }
42567
42568 TEST(F32_GEMM_4X8__PSIMD_SPLAT, qmax) {
42569 TEST_REQUIRES_PSIMD;
42570 GemmMicrokernelTester()
42571 .mr(4)
42572 .nr(8)
42573 .kr(1)
42574 .sr(1)
42575 .m(4)
42576 .n(8)
42577 .k(4)
42578 .qmax(128)
42579 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42580 }
42581
42582 TEST(F32_GEMM_4X8__PSIMD_SPLAT, strided_cm) {
42583 TEST_REQUIRES_PSIMD;
42584 GemmMicrokernelTester()
42585 .mr(4)
42586 .nr(8)
42587 .kr(1)
42588 .sr(1)
42589 .m(4)
42590 .n(8)
42591 .k(4)
42592 .cm_stride(11)
42593 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42594 }
42595#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42596
42597
42598#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42599 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4) {
42600 TEST_REQUIRES_PSIMD;
42601 GemmMicrokernelTester()
42602 .mr(6)
42603 .nr(8)
42604 .kr(1)
42605 .sr(1)
42606 .m(6)
42607 .n(8)
42608 .k(4)
42609 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42610 }
42611
42612 TEST(F32_GEMM_6X8__PSIMD_SPLAT, strided_cn) {
42613 TEST_REQUIRES_PSIMD;
42614 GemmMicrokernelTester()
42615 .mr(6)
42616 .nr(8)
42617 .kr(1)
42618 .sr(1)
42619 .m(6)
42620 .n(8)
42621 .k(4)
42622 .cn_stride(11)
42623 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42624 }
42625
42626 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_strided_a) {
42627 TEST_REQUIRES_PSIMD;
42628 GemmMicrokernelTester()
42629 .mr(6)
42630 .nr(8)
42631 .kr(1)
42632 .sr(1)
42633 .m(6)
42634 .n(8)
42635 .k(4)
42636 .a_stride(7)
42637 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42638 }
42639
42640 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile) {
42641 TEST_REQUIRES_PSIMD;
42642 for (uint32_t m = 1; m <= 6; m++) {
42643 for (uint32_t n = 1; n <= 8; n++) {
42644 GemmMicrokernelTester()
42645 .mr(6)
42646 .nr(8)
42647 .kr(1)
42648 .sr(1)
42649 .m(m)
42650 .n(n)
42651 .k(4)
42652 .iterations(1)
42653 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42654 }
42655 }
42656 }
42657
42658 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
42659 TEST_REQUIRES_PSIMD;
42660 for (uint32_t m = 1; m <= 6; m++) {
42661 GemmMicrokernelTester()
42662 .mr(6)
42663 .nr(8)
42664 .kr(1)
42665 .sr(1)
42666 .m(m)
42667 .n(8)
42668 .k(4)
42669 .iterations(1)
42670 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42671 }
42672 }
42673
42674 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
42675 TEST_REQUIRES_PSIMD;
42676 for (uint32_t n = 1; n <= 8; n++) {
42677 GemmMicrokernelTester()
42678 .mr(6)
42679 .nr(8)
42680 .kr(1)
42681 .sr(1)
42682 .m(6)
42683 .n(n)
42684 .k(4)
42685 .iterations(1)
42686 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42687 }
42688 }
42689
42690 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_lt_4) {
42691 TEST_REQUIRES_PSIMD;
42692 for (size_t k = 1; k < 4; k++) {
42693 GemmMicrokernelTester()
42694 .mr(6)
42695 .nr(8)
42696 .kr(1)
42697 .sr(1)
42698 .m(6)
42699 .n(8)
42700 .k(k)
42701 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42702 }
42703 }
42704
42705 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_lt_4_strided_a) {
42706 TEST_REQUIRES_PSIMD;
42707 for (size_t k = 1; k < 4; k++) {
42708 GemmMicrokernelTester()
42709 .mr(6)
42710 .nr(8)
42711 .kr(1)
42712 .sr(1)
42713 .m(6)
42714 .n(8)
42715 .k(k)
42716 .a_stride(7)
42717 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42718 }
42719 }
42720
42721 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_lt_4_subtile) {
42722 TEST_REQUIRES_PSIMD;
42723 for (size_t k = 1; k < 4; k++) {
42724 for (uint32_t m = 1; m <= 6; m++) {
42725 for (uint32_t n = 1; n <= 8; n++) {
42726 GemmMicrokernelTester()
42727 .mr(6)
42728 .nr(8)
42729 .kr(1)
42730 .sr(1)
42731 .m(m)
42732 .n(n)
42733 .k(k)
42734 .iterations(1)
42735 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42736 }
42737 }
42738 }
42739 }
42740
42741 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_gt_4) {
42742 TEST_REQUIRES_PSIMD;
42743 for (size_t k = 5; k < 8; k++) {
42744 GemmMicrokernelTester()
42745 .mr(6)
42746 .nr(8)
42747 .kr(1)
42748 .sr(1)
42749 .m(6)
42750 .n(8)
42751 .k(k)
42752 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42753 }
42754 }
42755
42756 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_gt_4_strided_a) {
42757 TEST_REQUIRES_PSIMD;
42758 for (size_t k = 5; k < 8; k++) {
42759 GemmMicrokernelTester()
42760 .mr(6)
42761 .nr(8)
42762 .kr(1)
42763 .sr(1)
42764 .m(6)
42765 .n(8)
42766 .k(k)
42767 .a_stride(11)
42768 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42769 }
42770 }
42771
42772 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_gt_4_subtile) {
42773 TEST_REQUIRES_PSIMD;
42774 for (size_t k = 5; k < 8; k++) {
42775 for (uint32_t m = 1; m <= 6; m++) {
42776 for (uint32_t n = 1; n <= 8; n++) {
42777 GemmMicrokernelTester()
42778 .mr(6)
42779 .nr(8)
42780 .kr(1)
42781 .sr(1)
42782 .m(m)
42783 .n(n)
42784 .k(k)
42785 .iterations(1)
42786 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42787 }
42788 }
42789 }
42790 }
42791
42792 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_div_4) {
42793 TEST_REQUIRES_PSIMD;
42794 for (size_t k = 8; k <= 40; k += 4) {
42795 GemmMicrokernelTester()
42796 .mr(6)
42797 .nr(8)
42798 .kr(1)
42799 .sr(1)
42800 .m(6)
42801 .n(8)
42802 .k(k)
42803 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42804 }
42805 }
42806
42807 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_div_4_strided_a) {
42808 TEST_REQUIRES_PSIMD;
42809 for (size_t k = 8; k <= 40; k += 4) {
42810 GemmMicrokernelTester()
42811 .mr(6)
42812 .nr(8)
42813 .kr(1)
42814 .sr(1)
42815 .m(6)
42816 .n(8)
42817 .k(k)
42818 .a_stride(43)
42819 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42820 }
42821 }
42822
42823 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_div_4_subtile) {
42824 TEST_REQUIRES_PSIMD;
42825 for (size_t k = 8; k <= 40; k += 4) {
42826 for (uint32_t m = 1; m <= 6; m++) {
42827 for (uint32_t n = 1; n <= 8; n++) {
42828 GemmMicrokernelTester()
42829 .mr(6)
42830 .nr(8)
42831 .kr(1)
42832 .sr(1)
42833 .m(m)
42834 .n(n)
42835 .k(k)
42836 .iterations(1)
42837 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42838 }
42839 }
42840 }
42841 }
42842
42843 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8) {
42844 TEST_REQUIRES_PSIMD;
42845 for (uint32_t n = 9; n < 16; n++) {
42846 for (size_t k = 1; k <= 20; k += 5) {
42847 GemmMicrokernelTester()
42848 .mr(6)
42849 .nr(8)
42850 .kr(1)
42851 .sr(1)
42852 .m(6)
42853 .n(8)
42854 .k(k)
42855 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42856 }
42857 }
42858 }
42859
42860 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
42861 TEST_REQUIRES_PSIMD;
42862 for (uint32_t n = 9; n < 16; n++) {
42863 for (size_t k = 1; k <= 20; k += 5) {
42864 GemmMicrokernelTester()
42865 .mr(6)
42866 .nr(8)
42867 .kr(1)
42868 .sr(1)
42869 .m(6)
42870 .n(8)
42871 .k(k)
42872 .cn_stride(11)
42873 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42874 }
42875 }
42876 }
42877
42878 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8_strided_a) {
42879 TEST_REQUIRES_PSIMD;
42880 for (uint32_t n = 9; n < 16; n++) {
42881 for (size_t k = 1; k <= 20; k += 5) {
42882 GemmMicrokernelTester()
42883 .mr(6)
42884 .nr(8)
42885 .kr(1)
42886 .sr(1)
42887 .m(6)
42888 .n(n)
42889 .k(k)
42890 .a_stride(23)
42891 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42892 }
42893 }
42894 }
42895
42896 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8_subtile) {
42897 TEST_REQUIRES_PSIMD;
42898 for (uint32_t n = 9; n < 16; n++) {
42899 for (size_t k = 1; k <= 20; k += 5) {
42900 for (uint32_t m = 1; m <= 6; m++) {
42901 GemmMicrokernelTester()
42902 .mr(6)
42903 .nr(8)
42904 .kr(1)
42905 .sr(1)
42906 .m(m)
42907 .n(n)
42908 .k(k)
42909 .iterations(1)
42910 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42911 }
42912 }
42913 }
42914 }
42915
42916 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8) {
42917 TEST_REQUIRES_PSIMD;
42918 for (uint32_t n = 16; n <= 24; n += 8) {
42919 for (size_t k = 1; k <= 20; k += 5) {
42920 GemmMicrokernelTester()
42921 .mr(6)
42922 .nr(8)
42923 .kr(1)
42924 .sr(1)
42925 .m(6)
42926 .n(8)
42927 .k(k)
42928 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42929 }
42930 }
42931 }
42932
42933 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8_strided_cn) {
42934 TEST_REQUIRES_PSIMD;
42935 for (uint32_t n = 16; n <= 24; n += 8) {
42936 for (size_t k = 1; k <= 20; k += 5) {
42937 GemmMicrokernelTester()
42938 .mr(6)
42939 .nr(8)
42940 .kr(1)
42941 .sr(1)
42942 .m(6)
42943 .n(n)
42944 .k(k)
42945 .cn_stride(11)
42946 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42947 }
42948 }
42949 }
42950
42951 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8_strided_a) {
42952 TEST_REQUIRES_PSIMD;
42953 for (uint32_t n = 16; n <= 24; n += 8) {
42954 for (size_t k = 1; k <= 20; k += 5) {
42955 GemmMicrokernelTester()
42956 .mr(6)
42957 .nr(8)
42958 .kr(1)
42959 .sr(1)
42960 .m(6)
42961 .n(n)
42962 .k(k)
42963 .a_stride(23)
42964 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42965 }
42966 }
42967 }
42968
42969 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8_subtile) {
42970 TEST_REQUIRES_PSIMD;
42971 for (uint32_t n = 16; n <= 24; n += 8) {
42972 for (size_t k = 1; k <= 20; k += 5) {
42973 for (uint32_t m = 1; m <= 6; m++) {
42974 GemmMicrokernelTester()
42975 .mr(6)
42976 .nr(8)
42977 .kr(1)
42978 .sr(1)
42979 .m(m)
42980 .n(n)
42981 .k(k)
42982 .iterations(1)
42983 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42984 }
42985 }
42986 }
42987 }
42988
42989 TEST(F32_GEMM_6X8__PSIMD_SPLAT, strided_cm_subtile) {
42990 TEST_REQUIRES_PSIMD;
42991 for (size_t k = 1; k <= 20; k += 5) {
42992 for (uint32_t m = 1; m <= 6; m++) {
42993 for (uint32_t n = 1; n <= 8; n++) {
42994 GemmMicrokernelTester()
42995 .mr(6)
42996 .nr(8)
42997 .kr(1)
42998 .sr(1)
42999 .m(m)
43000 .n(n)
43001 .k(k)
43002 .cm_stride(11)
43003 .iterations(1)
43004 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43005 }
43006 }
43007 }
43008 }
43009
43010 TEST(F32_GEMM_6X8__PSIMD_SPLAT, qmin) {
43011 TEST_REQUIRES_PSIMD;
43012 GemmMicrokernelTester()
43013 .mr(6)
43014 .nr(8)
43015 .kr(1)
43016 .sr(1)
43017 .m(6)
43018 .n(8)
43019 .k(4)
43020 .qmin(128)
43021 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43022 }
43023
43024 TEST(F32_GEMM_6X8__PSIMD_SPLAT, qmax) {
43025 TEST_REQUIRES_PSIMD;
43026 GemmMicrokernelTester()
43027 .mr(6)
43028 .nr(8)
43029 .kr(1)
43030 .sr(1)
43031 .m(6)
43032 .n(8)
43033 .k(4)
43034 .qmax(128)
43035 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43036 }
43037
43038 TEST(F32_GEMM_6X8__PSIMD_SPLAT, strided_cm) {
43039 TEST_REQUIRES_PSIMD;
43040 GemmMicrokernelTester()
43041 .mr(6)
43042 .nr(8)
43043 .kr(1)
43044 .sr(1)
43045 .m(6)
43046 .n(8)
43047 .k(4)
43048 .cm_stride(11)
43049 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43050 }
43051#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43052
43053
43054#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43055 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4) {
43056 TEST_REQUIRES_PSIMD;
43057 GemmMicrokernelTester()
43058 .mr(1)
43059 .nr(8)
43060 .kr(1)
43061 .sr(4)
43062 .m(1)
43063 .n(8)
43064 .k(4)
43065 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43066 }
43067
43068 TEST(F32_GEMM_1X8S4__PSIMD, strided_cn) {
43069 TEST_REQUIRES_PSIMD;
43070 GemmMicrokernelTester()
43071 .mr(1)
43072 .nr(8)
43073 .kr(1)
43074 .sr(4)
43075 .m(1)
43076 .n(8)
43077 .k(4)
43078 .cn_stride(11)
43079 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43080 }
43081
43082 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_strided_a) {
43083 TEST_REQUIRES_PSIMD;
43084 GemmMicrokernelTester()
43085 .mr(1)
43086 .nr(8)
43087 .kr(1)
43088 .sr(4)
43089 .m(1)
43090 .n(8)
43091 .k(4)
43092 .a_stride(7)
43093 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43094 }
43095
43096 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_subtile) {
43097 TEST_REQUIRES_PSIMD;
43098 for (uint32_t m = 1; m <= 1; m++) {
43099 for (uint32_t n = 1; n <= 8; n++) {
43100 GemmMicrokernelTester()
43101 .mr(1)
43102 .nr(8)
43103 .kr(1)
43104 .sr(4)
43105 .m(m)
43106 .n(n)
43107 .k(4)
43108 .iterations(1)
43109 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43110 }
43111 }
43112 }
43113
43114 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_subtile_m) {
43115 TEST_REQUIRES_PSIMD;
43116 for (uint32_t m = 1; m <= 1; m++) {
43117 GemmMicrokernelTester()
43118 .mr(1)
43119 .nr(8)
43120 .kr(1)
43121 .sr(4)
43122 .m(m)
43123 .n(8)
43124 .k(4)
43125 .iterations(1)
43126 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43127 }
43128 }
43129
43130 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_subtile_n) {
43131 TEST_REQUIRES_PSIMD;
43132 for (uint32_t n = 1; n <= 8; n++) {
43133 GemmMicrokernelTester()
43134 .mr(1)
43135 .nr(8)
43136 .kr(1)
43137 .sr(4)
43138 .m(1)
43139 .n(n)
43140 .k(4)
43141 .iterations(1)
43142 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43143 }
43144 }
43145
43146 TEST(F32_GEMM_1X8S4__PSIMD, k_lt_4) {
43147 TEST_REQUIRES_PSIMD;
43148 for (size_t k = 1; k < 4; k++) {
43149 GemmMicrokernelTester()
43150 .mr(1)
43151 .nr(8)
43152 .kr(1)
43153 .sr(4)
43154 .m(1)
43155 .n(8)
43156 .k(k)
43157 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43158 }
43159 }
43160
43161 TEST(F32_GEMM_1X8S4__PSIMD, k_lt_4_strided_a) {
43162 TEST_REQUIRES_PSIMD;
43163 for (size_t k = 1; k < 4; k++) {
43164 GemmMicrokernelTester()
43165 .mr(1)
43166 .nr(8)
43167 .kr(1)
43168 .sr(4)
43169 .m(1)
43170 .n(8)
43171 .k(k)
43172 .a_stride(7)
43173 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43174 }
43175 }
43176
43177 TEST(F32_GEMM_1X8S4__PSIMD, k_lt_4_subtile) {
43178 TEST_REQUIRES_PSIMD;
43179 for (size_t k = 1; k < 4; k++) {
43180 for (uint32_t m = 1; m <= 1; m++) {
43181 for (uint32_t n = 1; n <= 8; n++) {
43182 GemmMicrokernelTester()
43183 .mr(1)
43184 .nr(8)
43185 .kr(1)
43186 .sr(4)
43187 .m(m)
43188 .n(n)
43189 .k(k)
43190 .iterations(1)
43191 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43192 }
43193 }
43194 }
43195 }
43196
43197 TEST(F32_GEMM_1X8S4__PSIMD, k_gt_4) {
43198 TEST_REQUIRES_PSIMD;
43199 for (size_t k = 5; k < 8; k++) {
43200 GemmMicrokernelTester()
43201 .mr(1)
43202 .nr(8)
43203 .kr(1)
43204 .sr(4)
43205 .m(1)
43206 .n(8)
43207 .k(k)
43208 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43209 }
43210 }
43211
43212 TEST(F32_GEMM_1X8S4__PSIMD, k_gt_4_strided_a) {
43213 TEST_REQUIRES_PSIMD;
43214 for (size_t k = 5; k < 8; k++) {
43215 GemmMicrokernelTester()
43216 .mr(1)
43217 .nr(8)
43218 .kr(1)
43219 .sr(4)
43220 .m(1)
43221 .n(8)
43222 .k(k)
43223 .a_stride(11)
43224 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43225 }
43226 }
43227
43228 TEST(F32_GEMM_1X8S4__PSIMD, k_gt_4_subtile) {
43229 TEST_REQUIRES_PSIMD;
43230 for (size_t k = 5; k < 8; k++) {
43231 for (uint32_t m = 1; m <= 1; m++) {
43232 for (uint32_t n = 1; n <= 8; n++) {
43233 GemmMicrokernelTester()
43234 .mr(1)
43235 .nr(8)
43236 .kr(1)
43237 .sr(4)
43238 .m(m)
43239 .n(n)
43240 .k(k)
43241 .iterations(1)
43242 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43243 }
43244 }
43245 }
43246 }
43247
43248 TEST(F32_GEMM_1X8S4__PSIMD, k_div_4) {
43249 TEST_REQUIRES_PSIMD;
43250 for (size_t k = 8; k <= 40; k += 4) {
43251 GemmMicrokernelTester()
43252 .mr(1)
43253 .nr(8)
43254 .kr(1)
43255 .sr(4)
43256 .m(1)
43257 .n(8)
43258 .k(k)
43259 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43260 }
43261 }
43262
43263 TEST(F32_GEMM_1X8S4__PSIMD, k_div_4_strided_a) {
43264 TEST_REQUIRES_PSIMD;
43265 for (size_t k = 8; k <= 40; k += 4) {
43266 GemmMicrokernelTester()
43267 .mr(1)
43268 .nr(8)
43269 .kr(1)
43270 .sr(4)
43271 .m(1)
43272 .n(8)
43273 .k(k)
43274 .a_stride(43)
43275 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43276 }
43277 }
43278
43279 TEST(F32_GEMM_1X8S4__PSIMD, k_div_4_subtile) {
43280 TEST_REQUIRES_PSIMD;
43281 for (size_t k = 8; k <= 40; k += 4) {
43282 for (uint32_t m = 1; m <= 1; m++) {
43283 for (uint32_t n = 1; n <= 8; n++) {
43284 GemmMicrokernelTester()
43285 .mr(1)
43286 .nr(8)
43287 .kr(1)
43288 .sr(4)
43289 .m(m)
43290 .n(n)
43291 .k(k)
43292 .iterations(1)
43293 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43294 }
43295 }
43296 }
43297 }
43298
43299 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8) {
43300 TEST_REQUIRES_PSIMD;
43301 for (uint32_t n = 9; n < 16; n++) {
43302 for (size_t k = 1; k <= 20; k += 5) {
43303 GemmMicrokernelTester()
43304 .mr(1)
43305 .nr(8)
43306 .kr(1)
43307 .sr(4)
43308 .m(1)
43309 .n(8)
43310 .k(k)
43311 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43312 }
43313 }
43314 }
43315
43316 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8_strided_cn) {
43317 TEST_REQUIRES_PSIMD;
43318 for (uint32_t n = 9; n < 16; n++) {
43319 for (size_t k = 1; k <= 20; k += 5) {
43320 GemmMicrokernelTester()
43321 .mr(1)
43322 .nr(8)
43323 .kr(1)
43324 .sr(4)
43325 .m(1)
43326 .n(8)
43327 .k(k)
43328 .cn_stride(11)
43329 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43330 }
43331 }
43332 }
43333
43334 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8_strided_a) {
43335 TEST_REQUIRES_PSIMD;
43336 for (uint32_t n = 9; n < 16; n++) {
43337 for (size_t k = 1; k <= 20; k += 5) {
43338 GemmMicrokernelTester()
43339 .mr(1)
43340 .nr(8)
43341 .kr(1)
43342 .sr(4)
43343 .m(1)
43344 .n(n)
43345 .k(k)
43346 .a_stride(23)
43347 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43348 }
43349 }
43350 }
43351
43352 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8_subtile) {
43353 TEST_REQUIRES_PSIMD;
43354 for (uint32_t n = 9; n < 16; n++) {
43355 for (size_t k = 1; k <= 20; k += 5) {
43356 for (uint32_t m = 1; m <= 1; m++) {
43357 GemmMicrokernelTester()
43358 .mr(1)
43359 .nr(8)
43360 .kr(1)
43361 .sr(4)
43362 .m(m)
43363 .n(n)
43364 .k(k)
43365 .iterations(1)
43366 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43367 }
43368 }
43369 }
43370 }
43371
43372 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8) {
43373 TEST_REQUIRES_PSIMD;
43374 for (uint32_t n = 16; n <= 24; n += 8) {
43375 for (size_t k = 1; k <= 20; k += 5) {
43376 GemmMicrokernelTester()
43377 .mr(1)
43378 .nr(8)
43379 .kr(1)
43380 .sr(4)
43381 .m(1)
43382 .n(8)
43383 .k(k)
43384 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43385 }
43386 }
43387 }
43388
43389 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8_strided_cn) {
43390 TEST_REQUIRES_PSIMD;
43391 for (uint32_t n = 16; n <= 24; n += 8) {
43392 for (size_t k = 1; k <= 20; k += 5) {
43393 GemmMicrokernelTester()
43394 .mr(1)
43395 .nr(8)
43396 .kr(1)
43397 .sr(4)
43398 .m(1)
43399 .n(n)
43400 .k(k)
43401 .cn_stride(11)
43402 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43403 }
43404 }
43405 }
43406
43407 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8_strided_a) {
43408 TEST_REQUIRES_PSIMD;
43409 for (uint32_t n = 16; n <= 24; n += 8) {
43410 for (size_t k = 1; k <= 20; k += 5) {
43411 GemmMicrokernelTester()
43412 .mr(1)
43413 .nr(8)
43414 .kr(1)
43415 .sr(4)
43416 .m(1)
43417 .n(n)
43418 .k(k)
43419 .a_stride(23)
43420 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43421 }
43422 }
43423 }
43424
43425 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8_subtile) {
43426 TEST_REQUIRES_PSIMD;
43427 for (uint32_t n = 16; n <= 24; n += 8) {
43428 for (size_t k = 1; k <= 20; k += 5) {
43429 for (uint32_t m = 1; m <= 1; m++) {
43430 GemmMicrokernelTester()
43431 .mr(1)
43432 .nr(8)
43433 .kr(1)
43434 .sr(4)
43435 .m(m)
43436 .n(n)
43437 .k(k)
43438 .iterations(1)
43439 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43440 }
43441 }
43442 }
43443 }
43444
43445 TEST(F32_GEMM_1X8S4__PSIMD, strided_cm_subtile) {
43446 TEST_REQUIRES_PSIMD;
43447 for (size_t k = 1; k <= 20; k += 5) {
43448 for (uint32_t m = 1; m <= 1; m++) {
43449 for (uint32_t n = 1; n <= 8; n++) {
43450 GemmMicrokernelTester()
43451 .mr(1)
43452 .nr(8)
43453 .kr(1)
43454 .sr(4)
43455 .m(m)
43456 .n(n)
43457 .k(k)
43458 .cm_stride(11)
43459 .iterations(1)
43460 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43461 }
43462 }
43463 }
43464 }
43465
43466 TEST(F32_GEMM_1X8S4__PSIMD, qmin) {
43467 TEST_REQUIRES_PSIMD;
43468 GemmMicrokernelTester()
43469 .mr(1)
43470 .nr(8)
43471 .kr(1)
43472 .sr(4)
43473 .m(1)
43474 .n(8)
43475 .k(4)
43476 .qmin(128)
43477 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43478 }
43479
43480 TEST(F32_GEMM_1X8S4__PSIMD, qmax) {
43481 TEST_REQUIRES_PSIMD;
43482 GemmMicrokernelTester()
43483 .mr(1)
43484 .nr(8)
43485 .kr(1)
43486 .sr(4)
43487 .m(1)
43488 .n(8)
43489 .k(4)
43490 .qmax(128)
43491 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43492 }
43493
43494 TEST(F32_GEMM_1X8S4__PSIMD, strided_cm) {
43495 TEST_REQUIRES_PSIMD;
43496 GemmMicrokernelTester()
43497 .mr(1)
43498 .nr(8)
43499 .kr(1)
43500 .sr(4)
43501 .m(1)
43502 .n(8)
43503 .k(4)
43504 .cm_stride(11)
43505 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43506 }
43507#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43508
43509
43510#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43511 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4) {
43512 TEST_REQUIRES_PSIMD;
43513 GemmMicrokernelTester()
43514 .mr(4)
43515 .nr(8)
43516 .kr(1)
43517 .sr(4)
43518 .m(4)
43519 .n(8)
43520 .k(4)
43521 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43522 }
43523
43524 TEST(F32_GEMM_4X8S4__PSIMD, strided_cn) {
43525 TEST_REQUIRES_PSIMD;
43526 GemmMicrokernelTester()
43527 .mr(4)
43528 .nr(8)
43529 .kr(1)
43530 .sr(4)
43531 .m(4)
43532 .n(8)
43533 .k(4)
43534 .cn_stride(11)
43535 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43536 }
43537
43538 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_strided_a) {
43539 TEST_REQUIRES_PSIMD;
43540 GemmMicrokernelTester()
43541 .mr(4)
43542 .nr(8)
43543 .kr(1)
43544 .sr(4)
43545 .m(4)
43546 .n(8)
43547 .k(4)
43548 .a_stride(7)
43549 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43550 }
43551
43552 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_subtile) {
43553 TEST_REQUIRES_PSIMD;
43554 for (uint32_t m = 1; m <= 4; m++) {
43555 for (uint32_t n = 1; n <= 8; n++) {
43556 GemmMicrokernelTester()
43557 .mr(4)
43558 .nr(8)
43559 .kr(1)
43560 .sr(4)
43561 .m(m)
43562 .n(n)
43563 .k(4)
43564 .iterations(1)
43565 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43566 }
43567 }
43568 }
43569
43570 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_subtile_m) {
43571 TEST_REQUIRES_PSIMD;
43572 for (uint32_t m = 1; m <= 4; m++) {
43573 GemmMicrokernelTester()
43574 .mr(4)
43575 .nr(8)
43576 .kr(1)
43577 .sr(4)
43578 .m(m)
43579 .n(8)
43580 .k(4)
43581 .iterations(1)
43582 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43583 }
43584 }
43585
43586 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_subtile_n) {
43587 TEST_REQUIRES_PSIMD;
43588 for (uint32_t n = 1; n <= 8; n++) {
43589 GemmMicrokernelTester()
43590 .mr(4)
43591 .nr(8)
43592 .kr(1)
43593 .sr(4)
43594 .m(4)
43595 .n(n)
43596 .k(4)
43597 .iterations(1)
43598 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43599 }
43600 }
43601
43602 TEST(F32_GEMM_4X8S4__PSIMD, k_lt_4) {
43603 TEST_REQUIRES_PSIMD;
43604 for (size_t k = 1; k < 4; k++) {
43605 GemmMicrokernelTester()
43606 .mr(4)
43607 .nr(8)
43608 .kr(1)
43609 .sr(4)
43610 .m(4)
43611 .n(8)
43612 .k(k)
43613 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43614 }
43615 }
43616
43617 TEST(F32_GEMM_4X8S4__PSIMD, k_lt_4_strided_a) {
43618 TEST_REQUIRES_PSIMD;
43619 for (size_t k = 1; k < 4; k++) {
43620 GemmMicrokernelTester()
43621 .mr(4)
43622 .nr(8)
43623 .kr(1)
43624 .sr(4)
43625 .m(4)
43626 .n(8)
43627 .k(k)
43628 .a_stride(7)
43629 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43630 }
43631 }
43632
43633 TEST(F32_GEMM_4X8S4__PSIMD, k_lt_4_subtile) {
43634 TEST_REQUIRES_PSIMD;
43635 for (size_t k = 1; k < 4; k++) {
43636 for (uint32_t m = 1; m <= 4; m++) {
43637 for (uint32_t n = 1; n <= 8; n++) {
43638 GemmMicrokernelTester()
43639 .mr(4)
43640 .nr(8)
43641 .kr(1)
43642 .sr(4)
43643 .m(m)
43644 .n(n)
43645 .k(k)
43646 .iterations(1)
43647 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43648 }
43649 }
43650 }
43651 }
43652
43653 TEST(F32_GEMM_4X8S4__PSIMD, k_gt_4) {
43654 TEST_REQUIRES_PSIMD;
43655 for (size_t k = 5; k < 8; k++) {
43656 GemmMicrokernelTester()
43657 .mr(4)
43658 .nr(8)
43659 .kr(1)
43660 .sr(4)
43661 .m(4)
43662 .n(8)
43663 .k(k)
43664 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43665 }
43666 }
43667
43668 TEST(F32_GEMM_4X8S4__PSIMD, k_gt_4_strided_a) {
43669 TEST_REQUIRES_PSIMD;
43670 for (size_t k = 5; k < 8; k++) {
43671 GemmMicrokernelTester()
43672 .mr(4)
43673 .nr(8)
43674 .kr(1)
43675 .sr(4)
43676 .m(4)
43677 .n(8)
43678 .k(k)
43679 .a_stride(11)
43680 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43681 }
43682 }
43683
43684 TEST(F32_GEMM_4X8S4__PSIMD, k_gt_4_subtile) {
43685 TEST_REQUIRES_PSIMD;
43686 for (size_t k = 5; k < 8; k++) {
43687 for (uint32_t m = 1; m <= 4; m++) {
43688 for (uint32_t n = 1; n <= 8; n++) {
43689 GemmMicrokernelTester()
43690 .mr(4)
43691 .nr(8)
43692 .kr(1)
43693 .sr(4)
43694 .m(m)
43695 .n(n)
43696 .k(k)
43697 .iterations(1)
43698 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43699 }
43700 }
43701 }
43702 }
43703
43704 TEST(F32_GEMM_4X8S4__PSIMD, k_div_4) {
43705 TEST_REQUIRES_PSIMD;
43706 for (size_t k = 8; k <= 40; k += 4) {
43707 GemmMicrokernelTester()
43708 .mr(4)
43709 .nr(8)
43710 .kr(1)
43711 .sr(4)
43712 .m(4)
43713 .n(8)
43714 .k(k)
43715 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43716 }
43717 }
43718
43719 TEST(F32_GEMM_4X8S4__PSIMD, k_div_4_strided_a) {
43720 TEST_REQUIRES_PSIMD;
43721 for (size_t k = 8; k <= 40; k += 4) {
43722 GemmMicrokernelTester()
43723 .mr(4)
43724 .nr(8)
43725 .kr(1)
43726 .sr(4)
43727 .m(4)
43728 .n(8)
43729 .k(k)
43730 .a_stride(43)
43731 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43732 }
43733 }
43734
43735 TEST(F32_GEMM_4X8S4__PSIMD, k_div_4_subtile) {
43736 TEST_REQUIRES_PSIMD;
43737 for (size_t k = 8; k <= 40; k += 4) {
43738 for (uint32_t m = 1; m <= 4; m++) {
43739 for (uint32_t n = 1; n <= 8; n++) {
43740 GemmMicrokernelTester()
43741 .mr(4)
43742 .nr(8)
43743 .kr(1)
43744 .sr(4)
43745 .m(m)
43746 .n(n)
43747 .k(k)
43748 .iterations(1)
43749 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43750 }
43751 }
43752 }
43753 }
43754
43755 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8) {
43756 TEST_REQUIRES_PSIMD;
43757 for (uint32_t n = 9; n < 16; n++) {
43758 for (size_t k = 1; k <= 20; k += 5) {
43759 GemmMicrokernelTester()
43760 .mr(4)
43761 .nr(8)
43762 .kr(1)
43763 .sr(4)
43764 .m(4)
43765 .n(8)
43766 .k(k)
43767 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43768 }
43769 }
43770 }
43771
43772 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8_strided_cn) {
43773 TEST_REQUIRES_PSIMD;
43774 for (uint32_t n = 9; n < 16; n++) {
43775 for (size_t k = 1; k <= 20; k += 5) {
43776 GemmMicrokernelTester()
43777 .mr(4)
43778 .nr(8)
43779 .kr(1)
43780 .sr(4)
43781 .m(4)
43782 .n(8)
43783 .k(k)
43784 .cn_stride(11)
43785 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43786 }
43787 }
43788 }
43789
43790 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8_strided_a) {
43791 TEST_REQUIRES_PSIMD;
43792 for (uint32_t n = 9; n < 16; n++) {
43793 for (size_t k = 1; k <= 20; k += 5) {
43794 GemmMicrokernelTester()
43795 .mr(4)
43796 .nr(8)
43797 .kr(1)
43798 .sr(4)
43799 .m(4)
43800 .n(n)
43801 .k(k)
43802 .a_stride(23)
43803 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43804 }
43805 }
43806 }
43807
43808 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8_subtile) {
43809 TEST_REQUIRES_PSIMD;
43810 for (uint32_t n = 9; n < 16; n++) {
43811 for (size_t k = 1; k <= 20; k += 5) {
43812 for (uint32_t m = 1; m <= 4; m++) {
43813 GemmMicrokernelTester()
43814 .mr(4)
43815 .nr(8)
43816 .kr(1)
43817 .sr(4)
43818 .m(m)
43819 .n(n)
43820 .k(k)
43821 .iterations(1)
43822 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43823 }
43824 }
43825 }
43826 }
43827
43828 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8) {
43829 TEST_REQUIRES_PSIMD;
43830 for (uint32_t n = 16; n <= 24; n += 8) {
43831 for (size_t k = 1; k <= 20; k += 5) {
43832 GemmMicrokernelTester()
43833 .mr(4)
43834 .nr(8)
43835 .kr(1)
43836 .sr(4)
43837 .m(4)
43838 .n(8)
43839 .k(k)
43840 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43841 }
43842 }
43843 }
43844
43845 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8_strided_cn) {
43846 TEST_REQUIRES_PSIMD;
43847 for (uint32_t n = 16; n <= 24; n += 8) {
43848 for (size_t k = 1; k <= 20; k += 5) {
43849 GemmMicrokernelTester()
43850 .mr(4)
43851 .nr(8)
43852 .kr(1)
43853 .sr(4)
43854 .m(4)
43855 .n(n)
43856 .k(k)
43857 .cn_stride(11)
43858 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43859 }
43860 }
43861 }
43862
43863 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8_strided_a) {
43864 TEST_REQUIRES_PSIMD;
43865 for (uint32_t n = 16; n <= 24; n += 8) {
43866 for (size_t k = 1; k <= 20; k += 5) {
43867 GemmMicrokernelTester()
43868 .mr(4)
43869 .nr(8)
43870 .kr(1)
43871 .sr(4)
43872 .m(4)
43873 .n(n)
43874 .k(k)
43875 .a_stride(23)
43876 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43877 }
43878 }
43879 }
43880
43881 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8_subtile) {
43882 TEST_REQUIRES_PSIMD;
43883 for (uint32_t n = 16; n <= 24; n += 8) {
43884 for (size_t k = 1; k <= 20; k += 5) {
43885 for (uint32_t m = 1; m <= 4; m++) {
43886 GemmMicrokernelTester()
43887 .mr(4)
43888 .nr(8)
43889 .kr(1)
43890 .sr(4)
43891 .m(m)
43892 .n(n)
43893 .k(k)
43894 .iterations(1)
43895 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43896 }
43897 }
43898 }
43899 }
43900
43901 TEST(F32_GEMM_4X8S4__PSIMD, strided_cm_subtile) {
43902 TEST_REQUIRES_PSIMD;
43903 for (size_t k = 1; k <= 20; k += 5) {
43904 for (uint32_t m = 1; m <= 4; m++) {
43905 for (uint32_t n = 1; n <= 8; n++) {
43906 GemmMicrokernelTester()
43907 .mr(4)
43908 .nr(8)
43909 .kr(1)
43910 .sr(4)
43911 .m(m)
43912 .n(n)
43913 .k(k)
43914 .cm_stride(11)
43915 .iterations(1)
43916 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43917 }
43918 }
43919 }
43920 }
43921
43922 TEST(F32_GEMM_4X8S4__PSIMD, qmin) {
43923 TEST_REQUIRES_PSIMD;
43924 GemmMicrokernelTester()
43925 .mr(4)
43926 .nr(8)
43927 .kr(1)
43928 .sr(4)
43929 .m(4)
43930 .n(8)
43931 .k(4)
43932 .qmin(128)
43933 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43934 }
43935
43936 TEST(F32_GEMM_4X8S4__PSIMD, qmax) {
43937 TEST_REQUIRES_PSIMD;
43938 GemmMicrokernelTester()
43939 .mr(4)
43940 .nr(8)
43941 .kr(1)
43942 .sr(4)
43943 .m(4)
43944 .n(8)
43945 .k(4)
43946 .qmax(128)
43947 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43948 }
43949
43950 TEST(F32_GEMM_4X8S4__PSIMD, strided_cm) {
43951 TEST_REQUIRES_PSIMD;
43952 GemmMicrokernelTester()
43953 .mr(4)
43954 .nr(8)
43955 .kr(1)
43956 .sr(4)
43957 .m(4)
43958 .n(8)
43959 .k(4)
43960 .cm_stride(11)
43961 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43962 }
43963#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43964
43965
43966#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43967 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4) {
43968 TEST_REQUIRES_PSIMD;
43969 GemmMicrokernelTester()
43970 .mr(6)
43971 .nr(8)
43972 .kr(1)
43973 .sr(4)
43974 .m(6)
43975 .n(8)
43976 .k(4)
43977 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43978 }
43979
43980 TEST(F32_GEMM_6X8S4__PSIMD, strided_cn) {
43981 TEST_REQUIRES_PSIMD;
43982 GemmMicrokernelTester()
43983 .mr(6)
43984 .nr(8)
43985 .kr(1)
43986 .sr(4)
43987 .m(6)
43988 .n(8)
43989 .k(4)
43990 .cn_stride(11)
43991 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43992 }
43993
43994 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_strided_a) {
43995 TEST_REQUIRES_PSIMD;
43996 GemmMicrokernelTester()
43997 .mr(6)
43998 .nr(8)
43999 .kr(1)
44000 .sr(4)
44001 .m(6)
44002 .n(8)
44003 .k(4)
44004 .a_stride(7)
44005 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44006 }
44007
44008 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_subtile) {
44009 TEST_REQUIRES_PSIMD;
44010 for (uint32_t m = 1; m <= 6; m++) {
44011 for (uint32_t n = 1; n <= 8; n++) {
44012 GemmMicrokernelTester()
44013 .mr(6)
44014 .nr(8)
44015 .kr(1)
44016 .sr(4)
44017 .m(m)
44018 .n(n)
44019 .k(4)
44020 .iterations(1)
44021 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44022 }
44023 }
44024 }
44025
44026 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_subtile_m) {
44027 TEST_REQUIRES_PSIMD;
44028 for (uint32_t m = 1; m <= 6; m++) {
44029 GemmMicrokernelTester()
44030 .mr(6)
44031 .nr(8)
44032 .kr(1)
44033 .sr(4)
44034 .m(m)
44035 .n(8)
44036 .k(4)
44037 .iterations(1)
44038 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44039 }
44040 }
44041
44042 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_subtile_n) {
44043 TEST_REQUIRES_PSIMD;
44044 for (uint32_t n = 1; n <= 8; n++) {
44045 GemmMicrokernelTester()
44046 .mr(6)
44047 .nr(8)
44048 .kr(1)
44049 .sr(4)
44050 .m(6)
44051 .n(n)
44052 .k(4)
44053 .iterations(1)
44054 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44055 }
44056 }
44057
44058 TEST(F32_GEMM_6X8S4__PSIMD, k_lt_4) {
44059 TEST_REQUIRES_PSIMD;
44060 for (size_t k = 1; k < 4; k++) {
44061 GemmMicrokernelTester()
44062 .mr(6)
44063 .nr(8)
44064 .kr(1)
44065 .sr(4)
44066 .m(6)
44067 .n(8)
44068 .k(k)
44069 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44070 }
44071 }
44072
44073 TEST(F32_GEMM_6X8S4__PSIMD, k_lt_4_strided_a) {
44074 TEST_REQUIRES_PSIMD;
44075 for (size_t k = 1; k < 4; k++) {
44076 GemmMicrokernelTester()
44077 .mr(6)
44078 .nr(8)
44079 .kr(1)
44080 .sr(4)
44081 .m(6)
44082 .n(8)
44083 .k(k)
44084 .a_stride(7)
44085 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44086 }
44087 }
44088
44089 TEST(F32_GEMM_6X8S4__PSIMD, k_lt_4_subtile) {
44090 TEST_REQUIRES_PSIMD;
44091 for (size_t k = 1; k < 4; k++) {
44092 for (uint32_t m = 1; m <= 6; m++) {
44093 for (uint32_t n = 1; n <= 8; n++) {
44094 GemmMicrokernelTester()
44095 .mr(6)
44096 .nr(8)
44097 .kr(1)
44098 .sr(4)
44099 .m(m)
44100 .n(n)
44101 .k(k)
44102 .iterations(1)
44103 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44104 }
44105 }
44106 }
44107 }
44108
44109 TEST(F32_GEMM_6X8S4__PSIMD, k_gt_4) {
44110 TEST_REQUIRES_PSIMD;
44111 for (size_t k = 5; k < 8; k++) {
44112 GemmMicrokernelTester()
44113 .mr(6)
44114 .nr(8)
44115 .kr(1)
44116 .sr(4)
44117 .m(6)
44118 .n(8)
44119 .k(k)
44120 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44121 }
44122 }
44123
44124 TEST(F32_GEMM_6X8S4__PSIMD, k_gt_4_strided_a) {
44125 TEST_REQUIRES_PSIMD;
44126 for (size_t k = 5; k < 8; k++) {
44127 GemmMicrokernelTester()
44128 .mr(6)
44129 .nr(8)
44130 .kr(1)
44131 .sr(4)
44132 .m(6)
44133 .n(8)
44134 .k(k)
44135 .a_stride(11)
44136 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44137 }
44138 }
44139
44140 TEST(F32_GEMM_6X8S4__PSIMD, k_gt_4_subtile) {
44141 TEST_REQUIRES_PSIMD;
44142 for (size_t k = 5; k < 8; k++) {
44143 for (uint32_t m = 1; m <= 6; m++) {
44144 for (uint32_t n = 1; n <= 8; n++) {
44145 GemmMicrokernelTester()
44146 .mr(6)
44147 .nr(8)
44148 .kr(1)
44149 .sr(4)
44150 .m(m)
44151 .n(n)
44152 .k(k)
44153 .iterations(1)
44154 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44155 }
44156 }
44157 }
44158 }
44159
44160 TEST(F32_GEMM_6X8S4__PSIMD, k_div_4) {
44161 TEST_REQUIRES_PSIMD;
44162 for (size_t k = 8; k <= 40; k += 4) {
44163 GemmMicrokernelTester()
44164 .mr(6)
44165 .nr(8)
44166 .kr(1)
44167 .sr(4)
44168 .m(6)
44169 .n(8)
44170 .k(k)
44171 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44172 }
44173 }
44174
44175 TEST(F32_GEMM_6X8S4__PSIMD, k_div_4_strided_a) {
44176 TEST_REQUIRES_PSIMD;
44177 for (size_t k = 8; k <= 40; k += 4) {
44178 GemmMicrokernelTester()
44179 .mr(6)
44180 .nr(8)
44181 .kr(1)
44182 .sr(4)
44183 .m(6)
44184 .n(8)
44185 .k(k)
44186 .a_stride(43)
44187 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44188 }
44189 }
44190
44191 TEST(F32_GEMM_6X8S4__PSIMD, k_div_4_subtile) {
44192 TEST_REQUIRES_PSIMD;
44193 for (size_t k = 8; k <= 40; k += 4) {
44194 for (uint32_t m = 1; m <= 6; m++) {
44195 for (uint32_t n = 1; n <= 8; n++) {
44196 GemmMicrokernelTester()
44197 .mr(6)
44198 .nr(8)
44199 .kr(1)
44200 .sr(4)
44201 .m(m)
44202 .n(n)
44203 .k(k)
44204 .iterations(1)
44205 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44206 }
44207 }
44208 }
44209 }
44210
44211 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8) {
44212 TEST_REQUIRES_PSIMD;
44213 for (uint32_t n = 9; n < 16; n++) {
44214 for (size_t k = 1; k <= 20; k += 5) {
44215 GemmMicrokernelTester()
44216 .mr(6)
44217 .nr(8)
44218 .kr(1)
44219 .sr(4)
44220 .m(6)
44221 .n(8)
44222 .k(k)
44223 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44224 }
44225 }
44226 }
44227
44228 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8_strided_cn) {
44229 TEST_REQUIRES_PSIMD;
44230 for (uint32_t n = 9; n < 16; n++) {
44231 for (size_t k = 1; k <= 20; k += 5) {
44232 GemmMicrokernelTester()
44233 .mr(6)
44234 .nr(8)
44235 .kr(1)
44236 .sr(4)
44237 .m(6)
44238 .n(8)
44239 .k(k)
44240 .cn_stride(11)
44241 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44242 }
44243 }
44244 }
44245
44246 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8_strided_a) {
44247 TEST_REQUIRES_PSIMD;
44248 for (uint32_t n = 9; n < 16; n++) {
44249 for (size_t k = 1; k <= 20; k += 5) {
44250 GemmMicrokernelTester()
44251 .mr(6)
44252 .nr(8)
44253 .kr(1)
44254 .sr(4)
44255 .m(6)
44256 .n(n)
44257 .k(k)
44258 .a_stride(23)
44259 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44260 }
44261 }
44262 }
44263
44264 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8_subtile) {
44265 TEST_REQUIRES_PSIMD;
44266 for (uint32_t n = 9; n < 16; n++) {
44267 for (size_t k = 1; k <= 20; k += 5) {
44268 for (uint32_t m = 1; m <= 6; m++) {
44269 GemmMicrokernelTester()
44270 .mr(6)
44271 .nr(8)
44272 .kr(1)
44273 .sr(4)
44274 .m(m)
44275 .n(n)
44276 .k(k)
44277 .iterations(1)
44278 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44279 }
44280 }
44281 }
44282 }
44283
44284 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8) {
44285 TEST_REQUIRES_PSIMD;
44286 for (uint32_t n = 16; n <= 24; n += 8) {
44287 for (size_t k = 1; k <= 20; k += 5) {
44288 GemmMicrokernelTester()
44289 .mr(6)
44290 .nr(8)
44291 .kr(1)
44292 .sr(4)
44293 .m(6)
44294 .n(8)
44295 .k(k)
44296 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44297 }
44298 }
44299 }
44300
44301 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8_strided_cn) {
44302 TEST_REQUIRES_PSIMD;
44303 for (uint32_t n = 16; n <= 24; n += 8) {
44304 for (size_t k = 1; k <= 20; k += 5) {
44305 GemmMicrokernelTester()
44306 .mr(6)
44307 .nr(8)
44308 .kr(1)
44309 .sr(4)
44310 .m(6)
44311 .n(n)
44312 .k(k)
44313 .cn_stride(11)
44314 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44315 }
44316 }
44317 }
44318
44319 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8_strided_a) {
44320 TEST_REQUIRES_PSIMD;
44321 for (uint32_t n = 16; n <= 24; n += 8) {
44322 for (size_t k = 1; k <= 20; k += 5) {
44323 GemmMicrokernelTester()
44324 .mr(6)
44325 .nr(8)
44326 .kr(1)
44327 .sr(4)
44328 .m(6)
44329 .n(n)
44330 .k(k)
44331 .a_stride(23)
44332 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44333 }
44334 }
44335 }
44336
44337 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8_subtile) {
44338 TEST_REQUIRES_PSIMD;
44339 for (uint32_t n = 16; n <= 24; n += 8) {
44340 for (size_t k = 1; k <= 20; k += 5) {
44341 for (uint32_t m = 1; m <= 6; m++) {
44342 GemmMicrokernelTester()
44343 .mr(6)
44344 .nr(8)
44345 .kr(1)
44346 .sr(4)
44347 .m(m)
44348 .n(n)
44349 .k(k)
44350 .iterations(1)
44351 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44352 }
44353 }
44354 }
44355 }
44356
44357 TEST(F32_GEMM_6X8S4__PSIMD, strided_cm_subtile) {
44358 TEST_REQUIRES_PSIMD;
44359 for (size_t k = 1; k <= 20; k += 5) {
44360 for (uint32_t m = 1; m <= 6; m++) {
44361 for (uint32_t n = 1; n <= 8; n++) {
44362 GemmMicrokernelTester()
44363 .mr(6)
44364 .nr(8)
44365 .kr(1)
44366 .sr(4)
44367 .m(m)
44368 .n(n)
44369 .k(k)
44370 .cm_stride(11)
44371 .iterations(1)
44372 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44373 }
44374 }
44375 }
44376 }
44377
44378 TEST(F32_GEMM_6X8S4__PSIMD, qmin) {
44379 TEST_REQUIRES_PSIMD;
44380 GemmMicrokernelTester()
44381 .mr(6)
44382 .nr(8)
44383 .kr(1)
44384 .sr(4)
44385 .m(6)
44386 .n(8)
44387 .k(4)
44388 .qmin(128)
44389 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44390 }
44391
44392 TEST(F32_GEMM_6X8S4__PSIMD, qmax) {
44393 TEST_REQUIRES_PSIMD;
44394 GemmMicrokernelTester()
44395 .mr(6)
44396 .nr(8)
44397 .kr(1)
44398 .sr(4)
44399 .m(6)
44400 .n(8)
44401 .k(4)
44402 .qmax(128)
44403 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44404 }
44405
44406 TEST(F32_GEMM_6X8S4__PSIMD, strided_cm) {
44407 TEST_REQUIRES_PSIMD;
44408 GemmMicrokernelTester()
44409 .mr(6)
44410 .nr(8)
44411 .kr(1)
44412 .sr(4)
44413 .m(6)
44414 .n(8)
44415 .k(4)
44416 .cm_stride(11)
44417 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44418 }
44419#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44420
44421
44422#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44423 TEST(F32_GEMM_4X2C4__PSIMD, k_eq_4) {
44424 TEST_REQUIRES_PSIMD;
44425 GemmMicrokernelTester()
44426 .mr(4)
44427 .nr(2)
44428 .kr(4)
44429 .sr(1)
44430 .m(4)
44431 .n(2)
44432 .k(4)
44433 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44434 }
44435
44436 TEST(F32_GEMM_4X2C4__PSIMD, strided_cn) {
44437 TEST_REQUIRES_PSIMD;
44438 GemmMicrokernelTester()
44439 .mr(4)
44440 .nr(2)
44441 .kr(4)
44442 .sr(1)
44443 .m(4)
44444 .n(2)
44445 .k(4)
44446 .cn_stride(5)
44447 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44448 }
44449
44450 TEST(F32_GEMM_4X2C4__PSIMD, k_eq_4_strided_a) {
44451 TEST_REQUIRES_PSIMD;
44452 GemmMicrokernelTester()
44453 .mr(4)
44454 .nr(2)
44455 .kr(4)
44456 .sr(1)
44457 .m(4)
44458 .n(2)
44459 .k(4)
44460 .a_stride(7)
44461 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44462 }
44463
44464 TEST(F32_GEMM_4X2C4__PSIMD, k_eq_4_subtile) {
44465 TEST_REQUIRES_PSIMD;
44466 for (uint32_t m = 1; m <= 4; m++) {
44467 for (uint32_t n = 1; n <= 2; n++) {
44468 GemmMicrokernelTester()
44469 .mr(4)
44470 .nr(2)
44471 .kr(4)
44472 .sr(1)
44473 .m(m)
44474 .n(n)
44475 .k(4)
44476 .iterations(1)
44477 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44478 }
44479 }
44480 }
44481
44482 TEST(F32_GEMM_4X2C4__PSIMD, k_eq_4_subtile_m) {
44483 TEST_REQUIRES_PSIMD;
44484 for (uint32_t m = 1; m <= 4; m++) {
44485 GemmMicrokernelTester()
44486 .mr(4)
44487 .nr(2)
44488 .kr(4)
44489 .sr(1)
44490 .m(m)
44491 .n(2)
44492 .k(4)
44493 .iterations(1)
44494 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44495 }
44496 }
44497
44498 TEST(F32_GEMM_4X2C4__PSIMD, k_eq_4_subtile_n) {
44499 TEST_REQUIRES_PSIMD;
44500 for (uint32_t n = 1; n <= 2; n++) {
44501 GemmMicrokernelTester()
44502 .mr(4)
44503 .nr(2)
44504 .kr(4)
44505 .sr(1)
44506 .m(4)
44507 .n(n)
44508 .k(4)
44509 .iterations(1)
44510 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44511 }
44512 }
44513
44514 TEST(F32_GEMM_4X2C4__PSIMD, k_lt_4) {
44515 TEST_REQUIRES_PSIMD;
44516 for (size_t k = 1; k < 4; k++) {
44517 GemmMicrokernelTester()
44518 .mr(4)
44519 .nr(2)
44520 .kr(4)
44521 .sr(1)
44522 .m(4)
44523 .n(2)
44524 .k(k)
44525 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44526 }
44527 }
44528
44529 TEST(F32_GEMM_4X2C4__PSIMD, k_lt_4_strided_a) {
44530 TEST_REQUIRES_PSIMD;
44531 for (size_t k = 1; k < 4; k++) {
44532 GemmMicrokernelTester()
44533 .mr(4)
44534 .nr(2)
44535 .kr(4)
44536 .sr(1)
44537 .m(4)
44538 .n(2)
44539 .k(k)
44540 .a_stride(7)
44541 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44542 }
44543 }
44544
44545 TEST(F32_GEMM_4X2C4__PSIMD, k_lt_4_subtile) {
44546 TEST_REQUIRES_PSIMD;
44547 for (size_t k = 1; k < 4; k++) {
44548 for (uint32_t m = 1; m <= 4; m++) {
44549 for (uint32_t n = 1; n <= 2; n++) {
44550 GemmMicrokernelTester()
44551 .mr(4)
44552 .nr(2)
44553 .kr(4)
44554 .sr(1)
44555 .m(m)
44556 .n(n)
44557 .k(k)
44558 .iterations(1)
44559 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44560 }
44561 }
44562 }
44563 }
44564
44565 TEST(F32_GEMM_4X2C4__PSIMD, k_gt_4) {
44566 TEST_REQUIRES_PSIMD;
44567 for (size_t k = 5; k < 8; k++) {
44568 GemmMicrokernelTester()
44569 .mr(4)
44570 .nr(2)
44571 .kr(4)
44572 .sr(1)
44573 .m(4)
44574 .n(2)
44575 .k(k)
44576 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44577 }
44578 }
44579
44580 TEST(F32_GEMM_4X2C4__PSIMD, k_gt_4_strided_a) {
44581 TEST_REQUIRES_PSIMD;
44582 for (size_t k = 5; k < 8; k++) {
44583 GemmMicrokernelTester()
44584 .mr(4)
44585 .nr(2)
44586 .kr(4)
44587 .sr(1)
44588 .m(4)
44589 .n(2)
44590 .k(k)
44591 .a_stride(11)
44592 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44593 }
44594 }
44595
44596 TEST(F32_GEMM_4X2C4__PSIMD, k_gt_4_subtile) {
44597 TEST_REQUIRES_PSIMD;
44598 for (size_t k = 5; k < 8; k++) {
44599 for (uint32_t m = 1; m <= 4; m++) {
44600 for (uint32_t n = 1; n <= 2; n++) {
44601 GemmMicrokernelTester()
44602 .mr(4)
44603 .nr(2)
44604 .kr(4)
44605 .sr(1)
44606 .m(m)
44607 .n(n)
44608 .k(k)
44609 .iterations(1)
44610 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44611 }
44612 }
44613 }
44614 }
44615
44616 TEST(F32_GEMM_4X2C4__PSIMD, k_div_4) {
44617 TEST_REQUIRES_PSIMD;
44618 for (size_t k = 8; k <= 40; k += 4) {
44619 GemmMicrokernelTester()
44620 .mr(4)
44621 .nr(2)
44622 .kr(4)
44623 .sr(1)
44624 .m(4)
44625 .n(2)
44626 .k(k)
44627 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44628 }
44629 }
44630
44631 TEST(F32_GEMM_4X2C4__PSIMD, k_div_4_strided_a) {
44632 TEST_REQUIRES_PSIMD;
44633 for (size_t k = 8; k <= 40; k += 4) {
44634 GemmMicrokernelTester()
44635 .mr(4)
44636 .nr(2)
44637 .kr(4)
44638 .sr(1)
44639 .m(4)
44640 .n(2)
44641 .k(k)
44642 .a_stride(43)
44643 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44644 }
44645 }
44646
44647 TEST(F32_GEMM_4X2C4__PSIMD, k_div_4_subtile) {
44648 TEST_REQUIRES_PSIMD;
44649 for (size_t k = 8; k <= 40; k += 4) {
44650 for (uint32_t m = 1; m <= 4; m++) {
44651 for (uint32_t n = 1; n <= 2; n++) {
44652 GemmMicrokernelTester()
44653 .mr(4)
44654 .nr(2)
44655 .kr(4)
44656 .sr(1)
44657 .m(m)
44658 .n(n)
44659 .k(k)
44660 .iterations(1)
44661 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44662 }
44663 }
44664 }
44665 }
44666
44667 TEST(F32_GEMM_4X2C4__PSIMD, n_gt_2) {
44668 TEST_REQUIRES_PSIMD;
44669 for (uint32_t n = 3; n < 4; n++) {
44670 for (size_t k = 1; k <= 20; k += 5) {
44671 GemmMicrokernelTester()
44672 .mr(4)
44673 .nr(2)
44674 .kr(4)
44675 .sr(1)
44676 .m(4)
44677 .n(2)
44678 .k(k)
44679 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44680 }
44681 }
44682 }
44683
44684 TEST(F32_GEMM_4X2C4__PSIMD, n_gt_2_strided_cn) {
44685 TEST_REQUIRES_PSIMD;
44686 for (uint32_t n = 3; n < 4; n++) {
44687 for (size_t k = 1; k <= 20; k += 5) {
44688 GemmMicrokernelTester()
44689 .mr(4)
44690 .nr(2)
44691 .kr(4)
44692 .sr(1)
44693 .m(4)
44694 .n(2)
44695 .k(k)
44696 .cn_stride(5)
44697 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44698 }
44699 }
44700 }
44701
44702 TEST(F32_GEMM_4X2C4__PSIMD, n_gt_2_strided_a) {
44703 TEST_REQUIRES_PSIMD;
44704 for (uint32_t n = 3; n < 4; n++) {
44705 for (size_t k = 1; k <= 20; k += 5) {
44706 GemmMicrokernelTester()
44707 .mr(4)
44708 .nr(2)
44709 .kr(4)
44710 .sr(1)
44711 .m(4)
44712 .n(n)
44713 .k(k)
44714 .a_stride(23)
44715 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44716 }
44717 }
44718 }
44719
44720 TEST(F32_GEMM_4X2C4__PSIMD, n_gt_2_subtile) {
44721 TEST_REQUIRES_PSIMD;
44722 for (uint32_t n = 3; n < 4; n++) {
44723 for (size_t k = 1; k <= 20; k += 5) {
44724 for (uint32_t m = 1; m <= 4; m++) {
44725 GemmMicrokernelTester()
44726 .mr(4)
44727 .nr(2)
44728 .kr(4)
44729 .sr(1)
44730 .m(m)
44731 .n(n)
44732 .k(k)
44733 .iterations(1)
44734 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44735 }
44736 }
44737 }
44738 }
44739
44740 TEST(F32_GEMM_4X2C4__PSIMD, n_div_2) {
44741 TEST_REQUIRES_PSIMD;
44742 for (uint32_t n = 4; n <= 6; n += 2) {
44743 for (size_t k = 1; k <= 20; k += 5) {
44744 GemmMicrokernelTester()
44745 .mr(4)
44746 .nr(2)
44747 .kr(4)
44748 .sr(1)
44749 .m(4)
44750 .n(2)
44751 .k(k)
44752 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44753 }
44754 }
44755 }
44756
44757 TEST(F32_GEMM_4X2C4__PSIMD, n_div_2_strided_cn) {
44758 TEST_REQUIRES_PSIMD;
44759 for (uint32_t n = 4; n <= 6; n += 2) {
44760 for (size_t k = 1; k <= 20; k += 5) {
44761 GemmMicrokernelTester()
44762 .mr(4)
44763 .nr(2)
44764 .kr(4)
44765 .sr(1)
44766 .m(4)
44767 .n(n)
44768 .k(k)
44769 .cn_stride(5)
44770 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44771 }
44772 }
44773 }
44774
44775 TEST(F32_GEMM_4X2C4__PSIMD, n_div_2_strided_a) {
44776 TEST_REQUIRES_PSIMD;
44777 for (uint32_t n = 4; n <= 6; n += 2) {
44778 for (size_t k = 1; k <= 20; k += 5) {
44779 GemmMicrokernelTester()
44780 .mr(4)
44781 .nr(2)
44782 .kr(4)
44783 .sr(1)
44784 .m(4)
44785 .n(n)
44786 .k(k)
44787 .a_stride(23)
44788 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44789 }
44790 }
44791 }
44792
44793 TEST(F32_GEMM_4X2C4__PSIMD, n_div_2_subtile) {
44794 TEST_REQUIRES_PSIMD;
44795 for (uint32_t n = 4; n <= 6; n += 2) {
44796 for (size_t k = 1; k <= 20; k += 5) {
44797 for (uint32_t m = 1; m <= 4; m++) {
44798 GemmMicrokernelTester()
44799 .mr(4)
44800 .nr(2)
44801 .kr(4)
44802 .sr(1)
44803 .m(m)
44804 .n(n)
44805 .k(k)
44806 .iterations(1)
44807 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44808 }
44809 }
44810 }
44811 }
44812
44813 TEST(F32_GEMM_4X2C4__PSIMD, strided_cm_subtile) {
44814 TEST_REQUIRES_PSIMD;
44815 for (size_t k = 1; k <= 20; k += 5) {
44816 for (uint32_t m = 1; m <= 4; m++) {
44817 for (uint32_t n = 1; n <= 2; n++) {
44818 GemmMicrokernelTester()
44819 .mr(4)
44820 .nr(2)
44821 .kr(4)
44822 .sr(1)
44823 .m(m)
44824 .n(n)
44825 .k(k)
44826 .cm_stride(5)
44827 .iterations(1)
44828 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44829 }
44830 }
44831 }
44832 }
44833
44834 TEST(F32_GEMM_4X2C4__PSIMD, qmin) {
44835 TEST_REQUIRES_PSIMD;
44836 GemmMicrokernelTester()
44837 .mr(4)
44838 .nr(2)
44839 .kr(4)
44840 .sr(1)
44841 .m(4)
44842 .n(2)
44843 .k(4)
44844 .qmin(128)
44845 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44846 }
44847
44848 TEST(F32_GEMM_4X2C4__PSIMD, qmax) {
44849 TEST_REQUIRES_PSIMD;
44850 GemmMicrokernelTester()
44851 .mr(4)
44852 .nr(2)
44853 .kr(4)
44854 .sr(1)
44855 .m(4)
44856 .n(2)
44857 .k(4)
44858 .qmax(128)
44859 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44860 }
44861
44862 TEST(F32_GEMM_4X2C4__PSIMD, strided_cm) {
44863 TEST_REQUIRES_PSIMD;
44864 GemmMicrokernelTester()
44865 .mr(4)
44866 .nr(2)
44867 .kr(4)
44868 .sr(1)
44869 .m(4)
44870 .n(2)
44871 .k(4)
44872 .cm_stride(5)
44873 .Test(xnn_f32_gemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
44874 }
44875#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44876
44877
44878#if XNN_ARCH_WASM
44879 TEST(F32_GEMM_1X4__WASM, k_eq_1) {
44880 GemmMicrokernelTester()
44881 .mr(1)
44882 .nr(4)
44883 .kr(1)
44884 .sr(1)
44885 .m(1)
44886 .n(4)
44887 .k(1)
44888 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44889 }
44890
44891 TEST(F32_GEMM_1X4__WASM, strided_cn) {
44892 GemmMicrokernelTester()
44893 .mr(1)
44894 .nr(4)
44895 .kr(1)
44896 .sr(1)
44897 .m(1)
44898 .n(4)
44899 .k(1)
44900 .cn_stride(7)
44901 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44902 }
44903
44904 TEST(F32_GEMM_1X4__WASM, k_eq_1_strided_a) {
44905 GemmMicrokernelTester()
44906 .mr(1)
44907 .nr(4)
44908 .kr(1)
44909 .sr(1)
44910 .m(1)
44911 .n(4)
44912 .k(1)
44913 .a_stride(3)
44914 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44915 }
44916
44917 TEST(F32_GEMM_1X4__WASM, k_eq_1_subtile) {
44918 for (uint32_t m = 1; m <= 1; m++) {
44919 for (uint32_t n = 1; n <= 4; n++) {
44920 GemmMicrokernelTester()
44921 .mr(1)
44922 .nr(4)
44923 .kr(1)
44924 .sr(1)
44925 .m(m)
44926 .n(n)
44927 .k(1)
44928 .iterations(1)
44929 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44930 }
44931 }
44932 }
44933
44934 TEST(F32_GEMM_1X4__WASM, k_eq_1_subtile_m) {
44935 for (uint32_t m = 1; m <= 1; m++) {
44936 GemmMicrokernelTester()
44937 .mr(1)
44938 .nr(4)
44939 .kr(1)
44940 .sr(1)
44941 .m(m)
44942 .n(4)
44943 .k(1)
44944 .iterations(1)
44945 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44946 }
44947 }
44948
44949 TEST(F32_GEMM_1X4__WASM, k_eq_1_subtile_n) {
44950 for (uint32_t n = 1; n <= 4; n++) {
44951 GemmMicrokernelTester()
44952 .mr(1)
44953 .nr(4)
44954 .kr(1)
44955 .sr(1)
44956 .m(1)
44957 .n(n)
44958 .k(1)
44959 .iterations(1)
44960 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44961 }
44962 }
44963
44964 TEST(F32_GEMM_1X4__WASM, k_gt_1) {
44965 for (size_t k = 2; k < 10; k++) {
44966 GemmMicrokernelTester()
44967 .mr(1)
44968 .nr(4)
44969 .kr(1)
44970 .sr(1)
44971 .m(1)
44972 .n(4)
44973 .k(k)
44974 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44975 }
44976 }
44977
44978 TEST(F32_GEMM_1X4__WASM, k_gt_1_strided_a) {
44979 for (size_t k = 2; k < 10; k++) {
44980 GemmMicrokernelTester()
44981 .mr(1)
44982 .nr(4)
44983 .kr(1)
44984 .sr(1)
44985 .m(1)
44986 .n(4)
44987 .k(k)
44988 .a_stride(11)
44989 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
44990 }
44991 }
44992
44993 TEST(F32_GEMM_1X4__WASM, k_gt_1_subtile) {
44994 for (size_t k = 2; k < 10; k++) {
44995 for (uint32_t m = 1; m <= 1; m++) {
44996 for (uint32_t n = 1; n <= 4; n++) {
44997 GemmMicrokernelTester()
44998 .mr(1)
44999 .nr(4)
45000 .kr(1)
45001 .sr(1)
45002 .m(m)
45003 .n(n)
45004 .k(k)
45005 .iterations(1)
45006 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45007 }
45008 }
45009 }
45010 }
45011
45012 TEST(F32_GEMM_1X4__WASM, n_gt_4) {
45013 for (uint32_t n = 5; n < 8; n++) {
45014 for (size_t k = 1; k <= 5; k += 2) {
45015 GemmMicrokernelTester()
45016 .mr(1)
45017 .nr(4)
45018 .kr(1)
45019 .sr(1)
45020 .m(1)
45021 .n(4)
45022 .k(k)
45023 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45024 }
45025 }
45026 }
45027
45028 TEST(F32_GEMM_1X4__WASM, n_gt_4_strided_cn) {
45029 for (uint32_t n = 5; n < 8; n++) {
45030 for (size_t k = 1; k <= 5; k += 2) {
45031 GemmMicrokernelTester()
45032 .mr(1)
45033 .nr(4)
45034 .kr(1)
45035 .sr(1)
45036 .m(1)
45037 .n(4)
45038 .k(k)
45039 .cn_stride(7)
45040 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45041 }
45042 }
45043 }
45044
45045 TEST(F32_GEMM_1X4__WASM, n_gt_4_strided_a) {
45046 for (uint32_t n = 5; n < 8; n++) {
45047 for (size_t k = 1; k <= 5; k += 2) {
45048 GemmMicrokernelTester()
45049 .mr(1)
45050 .nr(4)
45051 .kr(1)
45052 .sr(1)
45053 .m(1)
45054 .n(n)
45055 .k(k)
45056 .a_stride(7)
45057 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45058 }
45059 }
45060 }
45061
45062 TEST(F32_GEMM_1X4__WASM, n_gt_4_subtile) {
45063 for (uint32_t n = 5; n < 8; n++) {
45064 for (size_t k = 1; k <= 5; k += 2) {
45065 for (uint32_t m = 1; m <= 1; m++) {
45066 GemmMicrokernelTester()
45067 .mr(1)
45068 .nr(4)
45069 .kr(1)
45070 .sr(1)
45071 .m(m)
45072 .n(n)
45073 .k(k)
45074 .iterations(1)
45075 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45076 }
45077 }
45078 }
45079 }
45080
45081 TEST(F32_GEMM_1X4__WASM, n_div_4) {
45082 for (uint32_t n = 8; n <= 12; n += 4) {
45083 for (size_t k = 1; k <= 5; k += 2) {
45084 GemmMicrokernelTester()
45085 .mr(1)
45086 .nr(4)
45087 .kr(1)
45088 .sr(1)
45089 .m(1)
45090 .n(4)
45091 .k(k)
45092 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45093 }
45094 }
45095 }
45096
45097 TEST(F32_GEMM_1X4__WASM, n_div_4_strided_cn) {
45098 for (uint32_t n = 8; n <= 12; n += 4) {
45099 for (size_t k = 1; k <= 5; k += 2) {
45100 GemmMicrokernelTester()
45101 .mr(1)
45102 .nr(4)
45103 .kr(1)
45104 .sr(1)
45105 .m(1)
45106 .n(n)
45107 .k(k)
45108 .cn_stride(7)
45109 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45110 }
45111 }
45112 }
45113
45114 TEST(F32_GEMM_1X4__WASM, n_div_4_strided_a) {
45115 for (uint32_t n = 8; n <= 12; n += 4) {
45116 for (size_t k = 1; k <= 5; k += 2) {
45117 GemmMicrokernelTester()
45118 .mr(1)
45119 .nr(4)
45120 .kr(1)
45121 .sr(1)
45122 .m(1)
45123 .n(n)
45124 .k(k)
45125 .a_stride(7)
45126 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45127 }
45128 }
45129 }
45130
45131 TEST(F32_GEMM_1X4__WASM, n_div_4_subtile) {
45132 for (uint32_t n = 8; n <= 12; n += 4) {
45133 for (size_t k = 1; k <= 5; k += 2) {
45134 for (uint32_t m = 1; m <= 1; m++) {
45135 GemmMicrokernelTester()
45136 .mr(1)
45137 .nr(4)
45138 .kr(1)
45139 .sr(1)
45140 .m(m)
45141 .n(n)
45142 .k(k)
45143 .iterations(1)
45144 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45145 }
45146 }
45147 }
45148 }
45149
45150 TEST(F32_GEMM_1X4__WASM, strided_cm_subtile) {
45151 for (size_t k = 1; k <= 5; k += 2) {
45152 for (uint32_t m = 1; m <= 1; m++) {
45153 for (uint32_t n = 1; n <= 4; n++) {
45154 GemmMicrokernelTester()
45155 .mr(1)
45156 .nr(4)
45157 .kr(1)
45158 .sr(1)
45159 .m(m)
45160 .n(n)
45161 .k(k)
45162 .cm_stride(7)
45163 .iterations(1)
45164 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45165 }
45166 }
45167 }
45168 }
45169
45170 TEST(F32_GEMM_1X4__WASM, qmin) {
45171 GemmMicrokernelTester()
45172 .mr(1)
45173 .nr(4)
45174 .kr(1)
45175 .sr(1)
45176 .m(1)
45177 .n(4)
45178 .k(1)
45179 .qmin(128)
45180 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45181 }
45182
45183 TEST(F32_GEMM_1X4__WASM, qmax) {
45184 GemmMicrokernelTester()
45185 .mr(1)
45186 .nr(4)
45187 .kr(1)
45188 .sr(1)
45189 .m(1)
45190 .n(4)
45191 .k(1)
45192 .qmax(128)
45193 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45194 }
45195
45196 TEST(F32_GEMM_1X4__WASM, strided_cm) {
45197 GemmMicrokernelTester()
45198 .mr(1)
45199 .nr(4)
45200 .kr(1)
45201 .sr(1)
45202 .m(1)
45203 .n(4)
45204 .k(1)
45205 .cm_stride(7)
45206 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45207 }
45208#endif // XNN_ARCH_WASM
45209
45210
45211#if XNN_ARCH_WASM
45212 TEST(F32_GEMM_2X4__WASM, k_eq_1) {
45213 GemmMicrokernelTester()
45214 .mr(2)
45215 .nr(4)
45216 .kr(1)
45217 .sr(1)
45218 .m(2)
45219 .n(4)
45220 .k(1)
45221 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45222 }
45223
45224 TEST(F32_GEMM_2X4__WASM, strided_cn) {
45225 GemmMicrokernelTester()
45226 .mr(2)
45227 .nr(4)
45228 .kr(1)
45229 .sr(1)
45230 .m(2)
45231 .n(4)
45232 .k(1)
45233 .cn_stride(7)
45234 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45235 }
45236
45237 TEST(F32_GEMM_2X4__WASM, k_eq_1_strided_a) {
45238 GemmMicrokernelTester()
45239 .mr(2)
45240 .nr(4)
45241 .kr(1)
45242 .sr(1)
45243 .m(2)
45244 .n(4)
45245 .k(1)
45246 .a_stride(3)
45247 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45248 }
45249
45250 TEST(F32_GEMM_2X4__WASM, k_eq_1_subtile) {
45251 for (uint32_t m = 1; m <= 2; m++) {
45252 for (uint32_t n = 1; n <= 4; n++) {
45253 GemmMicrokernelTester()
45254 .mr(2)
45255 .nr(4)
45256 .kr(1)
45257 .sr(1)
45258 .m(m)
45259 .n(n)
45260 .k(1)
45261 .iterations(1)
45262 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45263 }
45264 }
45265 }
45266
45267 TEST(F32_GEMM_2X4__WASM, k_eq_1_subtile_m) {
45268 for (uint32_t m = 1; m <= 2; m++) {
45269 GemmMicrokernelTester()
45270 .mr(2)
45271 .nr(4)
45272 .kr(1)
45273 .sr(1)
45274 .m(m)
45275 .n(4)
45276 .k(1)
45277 .iterations(1)
45278 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45279 }
45280 }
45281
45282 TEST(F32_GEMM_2X4__WASM, k_eq_1_subtile_n) {
45283 for (uint32_t n = 1; n <= 4; n++) {
45284 GemmMicrokernelTester()
45285 .mr(2)
45286 .nr(4)
45287 .kr(1)
45288 .sr(1)
45289 .m(2)
45290 .n(n)
45291 .k(1)
45292 .iterations(1)
45293 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45294 }
45295 }
45296
45297 TEST(F32_GEMM_2X4__WASM, k_gt_1) {
45298 for (size_t k = 2; k < 10; k++) {
45299 GemmMicrokernelTester()
45300 .mr(2)
45301 .nr(4)
45302 .kr(1)
45303 .sr(1)
45304 .m(2)
45305 .n(4)
45306 .k(k)
45307 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45308 }
45309 }
45310
45311 TEST(F32_GEMM_2X4__WASM, k_gt_1_strided_a) {
45312 for (size_t k = 2; k < 10; k++) {
45313 GemmMicrokernelTester()
45314 .mr(2)
45315 .nr(4)
45316 .kr(1)
45317 .sr(1)
45318 .m(2)
45319 .n(4)
45320 .k(k)
45321 .a_stride(11)
45322 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45323 }
45324 }
45325
45326 TEST(F32_GEMM_2X4__WASM, k_gt_1_subtile) {
45327 for (size_t k = 2; k < 10; k++) {
45328 for (uint32_t m = 1; m <= 2; m++) {
45329 for (uint32_t n = 1; n <= 4; n++) {
45330 GemmMicrokernelTester()
45331 .mr(2)
45332 .nr(4)
45333 .kr(1)
45334 .sr(1)
45335 .m(m)
45336 .n(n)
45337 .k(k)
45338 .iterations(1)
45339 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45340 }
45341 }
45342 }
45343 }
45344
45345 TEST(F32_GEMM_2X4__WASM, n_gt_4) {
45346 for (uint32_t n = 5; n < 8; n++) {
45347 for (size_t k = 1; k <= 5; k += 2) {
45348 GemmMicrokernelTester()
45349 .mr(2)
45350 .nr(4)
45351 .kr(1)
45352 .sr(1)
45353 .m(2)
45354 .n(4)
45355 .k(k)
45356 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45357 }
45358 }
45359 }
45360
45361 TEST(F32_GEMM_2X4__WASM, n_gt_4_strided_cn) {
45362 for (uint32_t n = 5; n < 8; n++) {
45363 for (size_t k = 1; k <= 5; k += 2) {
45364 GemmMicrokernelTester()
45365 .mr(2)
45366 .nr(4)
45367 .kr(1)
45368 .sr(1)
45369 .m(2)
45370 .n(4)
45371 .k(k)
45372 .cn_stride(7)
45373 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45374 }
45375 }
45376 }
45377
45378 TEST(F32_GEMM_2X4__WASM, n_gt_4_strided_a) {
45379 for (uint32_t n = 5; n < 8; n++) {
45380 for (size_t k = 1; k <= 5; k += 2) {
45381 GemmMicrokernelTester()
45382 .mr(2)
45383 .nr(4)
45384 .kr(1)
45385 .sr(1)
45386 .m(2)
45387 .n(n)
45388 .k(k)
45389 .a_stride(7)
45390 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45391 }
45392 }
45393 }
45394
45395 TEST(F32_GEMM_2X4__WASM, n_gt_4_subtile) {
45396 for (uint32_t n = 5; n < 8; n++) {
45397 for (size_t k = 1; k <= 5; k += 2) {
45398 for (uint32_t m = 1; m <= 2; m++) {
45399 GemmMicrokernelTester()
45400 .mr(2)
45401 .nr(4)
45402 .kr(1)
45403 .sr(1)
45404 .m(m)
45405 .n(n)
45406 .k(k)
45407 .iterations(1)
45408 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45409 }
45410 }
45411 }
45412 }
45413
45414 TEST(F32_GEMM_2X4__WASM, n_div_4) {
45415 for (uint32_t n = 8; n <= 12; n += 4) {
45416 for (size_t k = 1; k <= 5; k += 2) {
45417 GemmMicrokernelTester()
45418 .mr(2)
45419 .nr(4)
45420 .kr(1)
45421 .sr(1)
45422 .m(2)
45423 .n(4)
45424 .k(k)
45425 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45426 }
45427 }
45428 }
45429
45430 TEST(F32_GEMM_2X4__WASM, n_div_4_strided_cn) {
45431 for (uint32_t n = 8; n <= 12; n += 4) {
45432 for (size_t k = 1; k <= 5; k += 2) {
45433 GemmMicrokernelTester()
45434 .mr(2)
45435 .nr(4)
45436 .kr(1)
45437 .sr(1)
45438 .m(2)
45439 .n(n)
45440 .k(k)
45441 .cn_stride(7)
45442 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45443 }
45444 }
45445 }
45446
45447 TEST(F32_GEMM_2X4__WASM, n_div_4_strided_a) {
45448 for (uint32_t n = 8; n <= 12; n += 4) {
45449 for (size_t k = 1; k <= 5; k += 2) {
45450 GemmMicrokernelTester()
45451 .mr(2)
45452 .nr(4)
45453 .kr(1)
45454 .sr(1)
45455 .m(2)
45456 .n(n)
45457 .k(k)
45458 .a_stride(7)
45459 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45460 }
45461 }
45462 }
45463
45464 TEST(F32_GEMM_2X4__WASM, n_div_4_subtile) {
45465 for (uint32_t n = 8; n <= 12; n += 4) {
45466 for (size_t k = 1; k <= 5; k += 2) {
45467 for (uint32_t m = 1; m <= 2; m++) {
45468 GemmMicrokernelTester()
45469 .mr(2)
45470 .nr(4)
45471 .kr(1)
45472 .sr(1)
45473 .m(m)
45474 .n(n)
45475 .k(k)
45476 .iterations(1)
45477 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45478 }
45479 }
45480 }
45481 }
45482
45483 TEST(F32_GEMM_2X4__WASM, strided_cm_subtile) {
45484 for (size_t k = 1; k <= 5; k += 2) {
45485 for (uint32_t m = 1; m <= 2; m++) {
45486 for (uint32_t n = 1; n <= 4; n++) {
45487 GemmMicrokernelTester()
45488 .mr(2)
45489 .nr(4)
45490 .kr(1)
45491 .sr(1)
45492 .m(m)
45493 .n(n)
45494 .k(k)
45495 .cm_stride(7)
45496 .iterations(1)
45497 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45498 }
45499 }
45500 }
45501 }
45502
45503 TEST(F32_GEMM_2X4__WASM, qmin) {
45504 GemmMicrokernelTester()
45505 .mr(2)
45506 .nr(4)
45507 .kr(1)
45508 .sr(1)
45509 .m(2)
45510 .n(4)
45511 .k(1)
45512 .qmin(128)
45513 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45514 }
45515
45516 TEST(F32_GEMM_2X4__WASM, qmax) {
45517 GemmMicrokernelTester()
45518 .mr(2)
45519 .nr(4)
45520 .kr(1)
45521 .sr(1)
45522 .m(2)
45523 .n(4)
45524 .k(1)
45525 .qmax(128)
45526 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45527 }
45528
45529 TEST(F32_GEMM_2X4__WASM, strided_cm) {
45530 GemmMicrokernelTester()
45531 .mr(2)
45532 .nr(4)
45533 .kr(1)
45534 .sr(1)
45535 .m(2)
45536 .n(4)
45537 .k(1)
45538 .cm_stride(7)
45539 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45540 }
45541#endif // XNN_ARCH_WASM
45542
45543
45544#if XNN_ARCH_WASM
45545 TEST(F32_GEMM_4X4__WASM, k_eq_1) {
45546 GemmMicrokernelTester()
45547 .mr(4)
45548 .nr(4)
45549 .kr(1)
45550 .sr(1)
45551 .m(4)
45552 .n(4)
45553 .k(1)
45554 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45555 }
45556
45557 TEST(F32_GEMM_4X4__WASM, strided_cn) {
45558 GemmMicrokernelTester()
45559 .mr(4)
45560 .nr(4)
45561 .kr(1)
45562 .sr(1)
45563 .m(4)
45564 .n(4)
45565 .k(1)
45566 .cn_stride(7)
45567 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45568 }
45569
45570 TEST(F32_GEMM_4X4__WASM, k_eq_1_strided_a) {
45571 GemmMicrokernelTester()
45572 .mr(4)
45573 .nr(4)
45574 .kr(1)
45575 .sr(1)
45576 .m(4)
45577 .n(4)
45578 .k(1)
45579 .a_stride(3)
45580 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45581 }
45582
45583 TEST(F32_GEMM_4X4__WASM, k_eq_1_subtile) {
45584 for (uint32_t m = 1; m <= 4; m++) {
45585 for (uint32_t n = 1; n <= 4; n++) {
45586 GemmMicrokernelTester()
45587 .mr(4)
45588 .nr(4)
45589 .kr(1)
45590 .sr(1)
45591 .m(m)
45592 .n(n)
45593 .k(1)
45594 .iterations(1)
45595 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45596 }
45597 }
45598 }
45599
45600 TEST(F32_GEMM_4X4__WASM, k_eq_1_subtile_m) {
45601 for (uint32_t m = 1; m <= 4; m++) {
45602 GemmMicrokernelTester()
45603 .mr(4)
45604 .nr(4)
45605 .kr(1)
45606 .sr(1)
45607 .m(m)
45608 .n(4)
45609 .k(1)
45610 .iterations(1)
45611 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45612 }
45613 }
45614
45615 TEST(F32_GEMM_4X4__WASM, k_eq_1_subtile_n) {
45616 for (uint32_t n = 1; n <= 4; n++) {
45617 GemmMicrokernelTester()
45618 .mr(4)
45619 .nr(4)
45620 .kr(1)
45621 .sr(1)
45622 .m(4)
45623 .n(n)
45624 .k(1)
45625 .iterations(1)
45626 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45627 }
45628 }
45629
45630 TEST(F32_GEMM_4X4__WASM, k_gt_1) {
45631 for (size_t k = 2; k < 10; k++) {
45632 GemmMicrokernelTester()
45633 .mr(4)
45634 .nr(4)
45635 .kr(1)
45636 .sr(1)
45637 .m(4)
45638 .n(4)
45639 .k(k)
45640 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45641 }
45642 }
45643
45644 TEST(F32_GEMM_4X4__WASM, k_gt_1_strided_a) {
45645 for (size_t k = 2; k < 10; k++) {
45646 GemmMicrokernelTester()
45647 .mr(4)
45648 .nr(4)
45649 .kr(1)
45650 .sr(1)
45651 .m(4)
45652 .n(4)
45653 .k(k)
45654 .a_stride(11)
45655 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45656 }
45657 }
45658
45659 TEST(F32_GEMM_4X4__WASM, k_gt_1_subtile) {
45660 for (size_t k = 2; k < 10; k++) {
45661 for (uint32_t m = 1; m <= 4; m++) {
45662 for (uint32_t n = 1; n <= 4; n++) {
45663 GemmMicrokernelTester()
45664 .mr(4)
45665 .nr(4)
45666 .kr(1)
45667 .sr(1)
45668 .m(m)
45669 .n(n)
45670 .k(k)
45671 .iterations(1)
45672 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45673 }
45674 }
45675 }
45676 }
45677
45678 TEST(F32_GEMM_4X4__WASM, n_gt_4) {
45679 for (uint32_t n = 5; n < 8; n++) {
45680 for (size_t k = 1; k <= 5; k += 2) {
45681 GemmMicrokernelTester()
45682 .mr(4)
45683 .nr(4)
45684 .kr(1)
45685 .sr(1)
45686 .m(4)
45687 .n(4)
45688 .k(k)
45689 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45690 }
45691 }
45692 }
45693
45694 TEST(F32_GEMM_4X4__WASM, n_gt_4_strided_cn) {
45695 for (uint32_t n = 5; n < 8; n++) {
45696 for (size_t k = 1; k <= 5; k += 2) {
45697 GemmMicrokernelTester()
45698 .mr(4)
45699 .nr(4)
45700 .kr(1)
45701 .sr(1)
45702 .m(4)
45703 .n(4)
45704 .k(k)
45705 .cn_stride(7)
45706 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45707 }
45708 }
45709 }
45710
45711 TEST(F32_GEMM_4X4__WASM, n_gt_4_strided_a) {
45712 for (uint32_t n = 5; n < 8; n++) {
45713 for (size_t k = 1; k <= 5; k += 2) {
45714 GemmMicrokernelTester()
45715 .mr(4)
45716 .nr(4)
45717 .kr(1)
45718 .sr(1)
45719 .m(4)
45720 .n(n)
45721 .k(k)
45722 .a_stride(7)
45723 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45724 }
45725 }
45726 }
45727
45728 TEST(F32_GEMM_4X4__WASM, n_gt_4_subtile) {
45729 for (uint32_t n = 5; n < 8; n++) {
45730 for (size_t k = 1; k <= 5; k += 2) {
45731 for (uint32_t m = 1; m <= 4; m++) {
45732 GemmMicrokernelTester()
45733 .mr(4)
45734 .nr(4)
45735 .kr(1)
45736 .sr(1)
45737 .m(m)
45738 .n(n)
45739 .k(k)
45740 .iterations(1)
45741 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45742 }
45743 }
45744 }
45745 }
45746
45747 TEST(F32_GEMM_4X4__WASM, n_div_4) {
45748 for (uint32_t n = 8; n <= 12; n += 4) {
45749 for (size_t k = 1; k <= 5; k += 2) {
45750 GemmMicrokernelTester()
45751 .mr(4)
45752 .nr(4)
45753 .kr(1)
45754 .sr(1)
45755 .m(4)
45756 .n(4)
45757 .k(k)
45758 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45759 }
45760 }
45761 }
45762
45763 TEST(F32_GEMM_4X4__WASM, n_div_4_strided_cn) {
45764 for (uint32_t n = 8; n <= 12; n += 4) {
45765 for (size_t k = 1; k <= 5; k += 2) {
45766 GemmMicrokernelTester()
45767 .mr(4)
45768 .nr(4)
45769 .kr(1)
45770 .sr(1)
45771 .m(4)
45772 .n(n)
45773 .k(k)
45774 .cn_stride(7)
45775 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45776 }
45777 }
45778 }
45779
45780 TEST(F32_GEMM_4X4__WASM, n_div_4_strided_a) {
45781 for (uint32_t n = 8; n <= 12; n += 4) {
45782 for (size_t k = 1; k <= 5; k += 2) {
45783 GemmMicrokernelTester()
45784 .mr(4)
45785 .nr(4)
45786 .kr(1)
45787 .sr(1)
45788 .m(4)
45789 .n(n)
45790 .k(k)
45791 .a_stride(7)
45792 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45793 }
45794 }
45795 }
45796
45797 TEST(F32_GEMM_4X4__WASM, n_div_4_subtile) {
45798 for (uint32_t n = 8; n <= 12; n += 4) {
45799 for (size_t k = 1; k <= 5; k += 2) {
45800 for (uint32_t m = 1; m <= 4; m++) {
45801 GemmMicrokernelTester()
45802 .mr(4)
45803 .nr(4)
45804 .kr(1)
45805 .sr(1)
45806 .m(m)
45807 .n(n)
45808 .k(k)
45809 .iterations(1)
45810 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45811 }
45812 }
45813 }
45814 }
45815
45816 TEST(F32_GEMM_4X4__WASM, strided_cm_subtile) {
45817 for (size_t k = 1; k <= 5; k += 2) {
45818 for (uint32_t m = 1; m <= 4; m++) {
45819 for (uint32_t n = 1; n <= 4; n++) {
45820 GemmMicrokernelTester()
45821 .mr(4)
45822 .nr(4)
45823 .kr(1)
45824 .sr(1)
45825 .m(m)
45826 .n(n)
45827 .k(k)
45828 .cm_stride(7)
45829 .iterations(1)
45830 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45831 }
45832 }
45833 }
45834 }
45835
45836 TEST(F32_GEMM_4X4__WASM, qmin) {
45837 GemmMicrokernelTester()
45838 .mr(4)
45839 .nr(4)
45840 .kr(1)
45841 .sr(1)
45842 .m(4)
45843 .n(4)
45844 .k(1)
45845 .qmin(128)
45846 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45847 }
45848
45849 TEST(F32_GEMM_4X4__WASM, qmax) {
45850 GemmMicrokernelTester()
45851 .mr(4)
45852 .nr(4)
45853 .kr(1)
45854 .sr(1)
45855 .m(4)
45856 .n(4)
45857 .k(1)
45858 .qmax(128)
45859 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45860 }
45861
45862 TEST(F32_GEMM_4X4__WASM, strided_cm) {
45863 GemmMicrokernelTester()
45864 .mr(4)
45865 .nr(4)
45866 .kr(1)
45867 .sr(1)
45868 .m(4)
45869 .n(4)
45870 .k(1)
45871 .cm_stride(7)
45872 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45873 }
45874#endif // XNN_ARCH_WASM
45875
45876
45877#if XNN_ARCH_WASM
45878 TEST(F32_GEMM_4X2__WASM, k_eq_1) {
45879 GemmMicrokernelTester()
45880 .mr(4)
45881 .nr(2)
45882 .kr(1)
45883 .sr(1)
45884 .m(4)
45885 .n(2)
45886 .k(1)
45887 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45888 }
45889
45890 TEST(F32_GEMM_4X2__WASM, strided_cn) {
45891 GemmMicrokernelTester()
45892 .mr(4)
45893 .nr(2)
45894 .kr(1)
45895 .sr(1)
45896 .m(4)
45897 .n(2)
45898 .k(1)
45899 .cn_stride(5)
45900 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45901 }
45902
45903 TEST(F32_GEMM_4X2__WASM, k_eq_1_strided_a) {
45904 GemmMicrokernelTester()
45905 .mr(4)
45906 .nr(2)
45907 .kr(1)
45908 .sr(1)
45909 .m(4)
45910 .n(2)
45911 .k(1)
45912 .a_stride(3)
45913 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45914 }
45915
45916 TEST(F32_GEMM_4X2__WASM, k_eq_1_subtile) {
45917 for (uint32_t m = 1; m <= 4; m++) {
45918 for (uint32_t n = 1; n <= 2; n++) {
45919 GemmMicrokernelTester()
45920 .mr(4)
45921 .nr(2)
45922 .kr(1)
45923 .sr(1)
45924 .m(m)
45925 .n(n)
45926 .k(1)
45927 .iterations(1)
45928 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45929 }
45930 }
45931 }
45932
45933 TEST(F32_GEMM_4X2__WASM, k_eq_1_subtile_m) {
45934 for (uint32_t m = 1; m <= 4; m++) {
45935 GemmMicrokernelTester()
45936 .mr(4)
45937 .nr(2)
45938 .kr(1)
45939 .sr(1)
45940 .m(m)
45941 .n(2)
45942 .k(1)
45943 .iterations(1)
45944 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45945 }
45946 }
45947
45948 TEST(F32_GEMM_4X2__WASM, k_eq_1_subtile_n) {
45949 for (uint32_t n = 1; n <= 2; n++) {
45950 GemmMicrokernelTester()
45951 .mr(4)
45952 .nr(2)
45953 .kr(1)
45954 .sr(1)
45955 .m(4)
45956 .n(n)
45957 .k(1)
45958 .iterations(1)
45959 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45960 }
45961 }
45962
45963 TEST(F32_GEMM_4X2__WASM, k_gt_1) {
45964 for (size_t k = 2; k < 10; k++) {
45965 GemmMicrokernelTester()
45966 .mr(4)
45967 .nr(2)
45968 .kr(1)
45969 .sr(1)
45970 .m(4)
45971 .n(2)
45972 .k(k)
45973 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45974 }
45975 }
45976
45977 TEST(F32_GEMM_4X2__WASM, k_gt_1_strided_a) {
45978 for (size_t k = 2; k < 10; k++) {
45979 GemmMicrokernelTester()
45980 .mr(4)
45981 .nr(2)
45982 .kr(1)
45983 .sr(1)
45984 .m(4)
45985 .n(2)
45986 .k(k)
45987 .a_stride(11)
45988 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
45989 }
45990 }
45991
45992 TEST(F32_GEMM_4X2__WASM, k_gt_1_subtile) {
45993 for (size_t k = 2; k < 10; k++) {
45994 for (uint32_t m = 1; m <= 4; m++) {
45995 for (uint32_t n = 1; n <= 2; n++) {
45996 GemmMicrokernelTester()
45997 .mr(4)
45998 .nr(2)
45999 .kr(1)
46000 .sr(1)
46001 .m(m)
46002 .n(n)
46003 .k(k)
46004 .iterations(1)
46005 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46006 }
46007 }
46008 }
46009 }
46010
46011 TEST(F32_GEMM_4X2__WASM, n_gt_2) {
46012 for (uint32_t n = 3; n < 4; n++) {
46013 for (size_t k = 1; k <= 5; k += 2) {
46014 GemmMicrokernelTester()
46015 .mr(4)
46016 .nr(2)
46017 .kr(1)
46018 .sr(1)
46019 .m(4)
46020 .n(2)
46021 .k(k)
46022 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46023 }
46024 }
46025 }
46026
46027 TEST(F32_GEMM_4X2__WASM, n_gt_2_strided_cn) {
46028 for (uint32_t n = 3; n < 4; n++) {
46029 for (size_t k = 1; k <= 5; k += 2) {
46030 GemmMicrokernelTester()
46031 .mr(4)
46032 .nr(2)
46033 .kr(1)
46034 .sr(1)
46035 .m(4)
46036 .n(2)
46037 .k(k)
46038 .cn_stride(5)
46039 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46040 }
46041 }
46042 }
46043
46044 TEST(F32_GEMM_4X2__WASM, n_gt_2_strided_a) {
46045 for (uint32_t n = 3; n < 4; n++) {
46046 for (size_t k = 1; k <= 5; k += 2) {
46047 GemmMicrokernelTester()
46048 .mr(4)
46049 .nr(2)
46050 .kr(1)
46051 .sr(1)
46052 .m(4)
46053 .n(n)
46054 .k(k)
46055 .a_stride(7)
46056 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46057 }
46058 }
46059 }
46060
46061 TEST(F32_GEMM_4X2__WASM, n_gt_2_subtile) {
46062 for (uint32_t n = 3; n < 4; n++) {
46063 for (size_t k = 1; k <= 5; k += 2) {
46064 for (uint32_t m = 1; m <= 4; m++) {
46065 GemmMicrokernelTester()
46066 .mr(4)
46067 .nr(2)
46068 .kr(1)
46069 .sr(1)
46070 .m(m)
46071 .n(n)
46072 .k(k)
46073 .iterations(1)
46074 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46075 }
46076 }
46077 }
46078 }
46079
46080 TEST(F32_GEMM_4X2__WASM, n_div_2) {
46081 for (uint32_t n = 4; n <= 6; n += 2) {
46082 for (size_t k = 1; k <= 5; k += 2) {
46083 GemmMicrokernelTester()
46084 .mr(4)
46085 .nr(2)
46086 .kr(1)
46087 .sr(1)
46088 .m(4)
46089 .n(2)
46090 .k(k)
46091 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46092 }
46093 }
46094 }
46095
46096 TEST(F32_GEMM_4X2__WASM, n_div_2_strided_cn) {
46097 for (uint32_t n = 4; n <= 6; n += 2) {
46098 for (size_t k = 1; k <= 5; k += 2) {
46099 GemmMicrokernelTester()
46100 .mr(4)
46101 .nr(2)
46102 .kr(1)
46103 .sr(1)
46104 .m(4)
46105 .n(n)
46106 .k(k)
46107 .cn_stride(5)
46108 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46109 }
46110 }
46111 }
46112
46113 TEST(F32_GEMM_4X2__WASM, n_div_2_strided_a) {
46114 for (uint32_t n = 4; n <= 6; n += 2) {
46115 for (size_t k = 1; k <= 5; k += 2) {
46116 GemmMicrokernelTester()
46117 .mr(4)
46118 .nr(2)
46119 .kr(1)
46120 .sr(1)
46121 .m(4)
46122 .n(n)
46123 .k(k)
46124 .a_stride(7)
46125 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46126 }
46127 }
46128 }
46129
46130 TEST(F32_GEMM_4X2__WASM, n_div_2_subtile) {
46131 for (uint32_t n = 4; n <= 6; n += 2) {
46132 for (size_t k = 1; k <= 5; k += 2) {
46133 for (uint32_t m = 1; m <= 4; m++) {
46134 GemmMicrokernelTester()
46135 .mr(4)
46136 .nr(2)
46137 .kr(1)
46138 .sr(1)
46139 .m(m)
46140 .n(n)
46141 .k(k)
46142 .iterations(1)
46143 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46144 }
46145 }
46146 }
46147 }
46148
46149 TEST(F32_GEMM_4X2__WASM, strided_cm_subtile) {
46150 for (size_t k = 1; k <= 5; k += 2) {
46151 for (uint32_t m = 1; m <= 4; m++) {
46152 for (uint32_t n = 1; n <= 2; n++) {
46153 GemmMicrokernelTester()
46154 .mr(4)
46155 .nr(2)
46156 .kr(1)
46157 .sr(1)
46158 .m(m)
46159 .n(n)
46160 .k(k)
46161 .cm_stride(5)
46162 .iterations(1)
46163 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46164 }
46165 }
46166 }
46167 }
46168
46169 TEST(F32_GEMM_4X2__WASM, qmin) {
46170 GemmMicrokernelTester()
46171 .mr(4)
46172 .nr(2)
46173 .kr(1)
46174 .sr(1)
46175 .m(4)
46176 .n(2)
46177 .k(1)
46178 .qmin(128)
46179 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46180 }
46181
46182 TEST(F32_GEMM_4X2__WASM, qmax) {
46183 GemmMicrokernelTester()
46184 .mr(4)
46185 .nr(2)
46186 .kr(1)
46187 .sr(1)
46188 .m(4)
46189 .n(2)
46190 .k(1)
46191 .qmax(128)
46192 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46193 }
46194
46195 TEST(F32_GEMM_4X2__WASM, strided_cm) {
46196 GemmMicrokernelTester()
46197 .mr(4)
46198 .nr(2)
46199 .kr(1)
46200 .sr(1)
46201 .m(4)
46202 .n(2)
46203 .k(1)
46204 .cm_stride(5)
46205 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46206 }
46207#endif // XNN_ARCH_WASM
46208
46209
46210TEST(F32_GEMM_1X4__SCALAR, k_eq_1) {
46211 GemmMicrokernelTester()
46212 .mr(1)
46213 .nr(4)
46214 .kr(1)
46215 .sr(1)
46216 .m(1)
46217 .n(4)
46218 .k(1)
46219 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46220}
46221
46222TEST(F32_GEMM_1X4__SCALAR, strided_cn) {
46223 GemmMicrokernelTester()
46224 .mr(1)
46225 .nr(4)
46226 .kr(1)
46227 .sr(1)
46228 .m(1)
46229 .n(4)
46230 .k(1)
46231 .cn_stride(7)
46232 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46233}
46234
46235TEST(F32_GEMM_1X4__SCALAR, k_eq_1_strided_a) {
46236 GemmMicrokernelTester()
46237 .mr(1)
46238 .nr(4)
46239 .kr(1)
46240 .sr(1)
46241 .m(1)
46242 .n(4)
46243 .k(1)
46244 .a_stride(3)
46245 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46246}
46247
46248TEST(F32_GEMM_1X4__SCALAR, k_eq_1_subtile) {
46249 for (uint32_t m = 1; m <= 1; m++) {
46250 for (uint32_t n = 1; n <= 4; n++) {
46251 GemmMicrokernelTester()
46252 .mr(1)
46253 .nr(4)
46254 .kr(1)
46255 .sr(1)
46256 .m(m)
46257 .n(n)
46258 .k(1)
46259 .iterations(1)
46260 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46261 }
46262 }
46263}
46264
46265TEST(F32_GEMM_1X4__SCALAR, k_eq_1_subtile_m) {
46266 for (uint32_t m = 1; m <= 1; m++) {
46267 GemmMicrokernelTester()
46268 .mr(1)
46269 .nr(4)
46270 .kr(1)
46271 .sr(1)
46272 .m(m)
46273 .n(4)
46274 .k(1)
46275 .iterations(1)
46276 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46277 }
46278}
46279
46280TEST(F32_GEMM_1X4__SCALAR, k_eq_1_subtile_n) {
46281 for (uint32_t n = 1; n <= 4; n++) {
46282 GemmMicrokernelTester()
46283 .mr(1)
46284 .nr(4)
46285 .kr(1)
46286 .sr(1)
46287 .m(1)
46288 .n(n)
46289 .k(1)
46290 .iterations(1)
46291 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46292 }
46293}
46294
46295TEST(F32_GEMM_1X4__SCALAR, k_gt_1) {
46296 for (size_t k = 2; k < 10; k++) {
46297 GemmMicrokernelTester()
46298 .mr(1)
46299 .nr(4)
46300 .kr(1)
46301 .sr(1)
46302 .m(1)
46303 .n(4)
46304 .k(k)
46305 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46306 }
46307}
46308
46309TEST(F32_GEMM_1X4__SCALAR, k_gt_1_strided_a) {
46310 for (size_t k = 2; k < 10; k++) {
46311 GemmMicrokernelTester()
46312 .mr(1)
46313 .nr(4)
46314 .kr(1)
46315 .sr(1)
46316 .m(1)
46317 .n(4)
46318 .k(k)
46319 .a_stride(11)
46320 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46321 }
46322}
46323
46324TEST(F32_GEMM_1X4__SCALAR, k_gt_1_subtile) {
46325 for (size_t k = 2; k < 10; k++) {
46326 for (uint32_t m = 1; m <= 1; m++) {
46327 for (uint32_t n = 1; n <= 4; n++) {
46328 GemmMicrokernelTester()
46329 .mr(1)
46330 .nr(4)
46331 .kr(1)
46332 .sr(1)
46333 .m(m)
46334 .n(n)
46335 .k(k)
46336 .iterations(1)
46337 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46338 }
46339 }
46340 }
46341}
46342
46343TEST(F32_GEMM_1X4__SCALAR, n_gt_4) {
46344 for (uint32_t n = 5; n < 8; n++) {
46345 for (size_t k = 1; k <= 5; k += 2) {
46346 GemmMicrokernelTester()
46347 .mr(1)
46348 .nr(4)
46349 .kr(1)
46350 .sr(1)
46351 .m(1)
46352 .n(4)
46353 .k(k)
46354 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46355 }
46356 }
46357}
46358
46359TEST(F32_GEMM_1X4__SCALAR, n_gt_4_strided_cn) {
46360 for (uint32_t n = 5; n < 8; n++) {
46361 for (size_t k = 1; k <= 5; k += 2) {
46362 GemmMicrokernelTester()
46363 .mr(1)
46364 .nr(4)
46365 .kr(1)
46366 .sr(1)
46367 .m(1)
46368 .n(4)
46369 .k(k)
46370 .cn_stride(7)
46371 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46372 }
46373 }
46374}
46375
46376TEST(F32_GEMM_1X4__SCALAR, n_gt_4_strided_a) {
46377 for (uint32_t n = 5; n < 8; n++) {
46378 for (size_t k = 1; k <= 5; k += 2) {
46379 GemmMicrokernelTester()
46380 .mr(1)
46381 .nr(4)
46382 .kr(1)
46383 .sr(1)
46384 .m(1)
46385 .n(n)
46386 .k(k)
46387 .a_stride(7)
46388 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46389 }
46390 }
46391}
46392
46393TEST(F32_GEMM_1X4__SCALAR, n_gt_4_subtile) {
46394 for (uint32_t n = 5; n < 8; n++) {
46395 for (size_t k = 1; k <= 5; k += 2) {
46396 for (uint32_t m = 1; m <= 1; m++) {
46397 GemmMicrokernelTester()
46398 .mr(1)
46399 .nr(4)
46400 .kr(1)
46401 .sr(1)
46402 .m(m)
46403 .n(n)
46404 .k(k)
46405 .iterations(1)
46406 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46407 }
46408 }
46409 }
46410}
46411
46412TEST(F32_GEMM_1X4__SCALAR, n_div_4) {
46413 for (uint32_t n = 8; n <= 12; n += 4) {
46414 for (size_t k = 1; k <= 5; k += 2) {
46415 GemmMicrokernelTester()
46416 .mr(1)
46417 .nr(4)
46418 .kr(1)
46419 .sr(1)
46420 .m(1)
46421 .n(4)
46422 .k(k)
46423 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46424 }
46425 }
46426}
46427
46428TEST(F32_GEMM_1X4__SCALAR, n_div_4_strided_cn) {
46429 for (uint32_t n = 8; n <= 12; n += 4) {
46430 for (size_t k = 1; k <= 5; k += 2) {
46431 GemmMicrokernelTester()
46432 .mr(1)
46433 .nr(4)
46434 .kr(1)
46435 .sr(1)
46436 .m(1)
46437 .n(n)
46438 .k(k)
46439 .cn_stride(7)
46440 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46441 }
46442 }
46443}
46444
46445TEST(F32_GEMM_1X4__SCALAR, n_div_4_strided_a) {
46446 for (uint32_t n = 8; n <= 12; n += 4) {
46447 for (size_t k = 1; k <= 5; k += 2) {
46448 GemmMicrokernelTester()
46449 .mr(1)
46450 .nr(4)
46451 .kr(1)
46452 .sr(1)
46453 .m(1)
46454 .n(n)
46455 .k(k)
46456 .a_stride(7)
46457 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46458 }
46459 }
46460}
46461
46462TEST(F32_GEMM_1X4__SCALAR, n_div_4_subtile) {
46463 for (uint32_t n = 8; n <= 12; n += 4) {
46464 for (size_t k = 1; k <= 5; k += 2) {
46465 for (uint32_t m = 1; m <= 1; m++) {
46466 GemmMicrokernelTester()
46467 .mr(1)
46468 .nr(4)
46469 .kr(1)
46470 .sr(1)
46471 .m(m)
46472 .n(n)
46473 .k(k)
46474 .iterations(1)
46475 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46476 }
46477 }
46478 }
46479}
46480
46481TEST(F32_GEMM_1X4__SCALAR, strided_cm_subtile) {
46482 for (size_t k = 1; k <= 5; k += 2) {
46483 for (uint32_t m = 1; m <= 1; m++) {
46484 for (uint32_t n = 1; n <= 4; n++) {
46485 GemmMicrokernelTester()
46486 .mr(1)
46487 .nr(4)
46488 .kr(1)
46489 .sr(1)
46490 .m(m)
46491 .n(n)
46492 .k(k)
46493 .cm_stride(7)
46494 .iterations(1)
46495 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46496 }
46497 }
46498 }
46499}
46500
46501TEST(F32_GEMM_1X4__SCALAR, qmin) {
46502 GemmMicrokernelTester()
46503 .mr(1)
46504 .nr(4)
46505 .kr(1)
46506 .sr(1)
46507 .m(1)
46508 .n(4)
46509 .k(1)
46510 .qmin(128)
46511 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46512}
46513
46514TEST(F32_GEMM_1X4__SCALAR, qmax) {
46515 GemmMicrokernelTester()
46516 .mr(1)
46517 .nr(4)
46518 .kr(1)
46519 .sr(1)
46520 .m(1)
46521 .n(4)
46522 .k(1)
46523 .qmax(128)
46524 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46525}
46526
46527TEST(F32_GEMM_1X4__SCALAR, strided_cm) {
46528 GemmMicrokernelTester()
46529 .mr(1)
46530 .nr(4)
46531 .kr(1)
46532 .sr(1)
46533 .m(1)
46534 .n(4)
46535 .k(1)
46536 .cm_stride(7)
46537 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46538}
46539
46540
46541TEST(F32_GEMM_2X4__SCALAR, k_eq_1) {
46542 GemmMicrokernelTester()
46543 .mr(2)
46544 .nr(4)
46545 .kr(1)
46546 .sr(1)
46547 .m(2)
46548 .n(4)
46549 .k(1)
46550 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46551}
46552
46553TEST(F32_GEMM_2X4__SCALAR, strided_cn) {
46554 GemmMicrokernelTester()
46555 .mr(2)
46556 .nr(4)
46557 .kr(1)
46558 .sr(1)
46559 .m(2)
46560 .n(4)
46561 .k(1)
46562 .cn_stride(7)
46563 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46564}
46565
46566TEST(F32_GEMM_2X4__SCALAR, k_eq_1_strided_a) {
46567 GemmMicrokernelTester()
46568 .mr(2)
46569 .nr(4)
46570 .kr(1)
46571 .sr(1)
46572 .m(2)
46573 .n(4)
46574 .k(1)
46575 .a_stride(3)
46576 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46577}
46578
46579TEST(F32_GEMM_2X4__SCALAR, k_eq_1_subtile) {
46580 for (uint32_t m = 1; m <= 2; m++) {
46581 for (uint32_t n = 1; n <= 4; n++) {
46582 GemmMicrokernelTester()
46583 .mr(2)
46584 .nr(4)
46585 .kr(1)
46586 .sr(1)
46587 .m(m)
46588 .n(n)
46589 .k(1)
46590 .iterations(1)
46591 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46592 }
46593 }
46594}
46595
46596TEST(F32_GEMM_2X4__SCALAR, k_eq_1_subtile_m) {
46597 for (uint32_t m = 1; m <= 2; m++) {
46598 GemmMicrokernelTester()
46599 .mr(2)
46600 .nr(4)
46601 .kr(1)
46602 .sr(1)
46603 .m(m)
46604 .n(4)
46605 .k(1)
46606 .iterations(1)
46607 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46608 }
46609}
46610
46611TEST(F32_GEMM_2X4__SCALAR, k_eq_1_subtile_n) {
46612 for (uint32_t n = 1; n <= 4; n++) {
46613 GemmMicrokernelTester()
46614 .mr(2)
46615 .nr(4)
46616 .kr(1)
46617 .sr(1)
46618 .m(2)
46619 .n(n)
46620 .k(1)
46621 .iterations(1)
46622 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46623 }
46624}
46625
46626TEST(F32_GEMM_2X4__SCALAR, k_gt_1) {
46627 for (size_t k = 2; k < 10; k++) {
46628 GemmMicrokernelTester()
46629 .mr(2)
46630 .nr(4)
46631 .kr(1)
46632 .sr(1)
46633 .m(2)
46634 .n(4)
46635 .k(k)
46636 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46637 }
46638}
46639
46640TEST(F32_GEMM_2X4__SCALAR, k_gt_1_strided_a) {
46641 for (size_t k = 2; k < 10; k++) {
46642 GemmMicrokernelTester()
46643 .mr(2)
46644 .nr(4)
46645 .kr(1)
46646 .sr(1)
46647 .m(2)
46648 .n(4)
46649 .k(k)
46650 .a_stride(11)
46651 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46652 }
46653}
46654
46655TEST(F32_GEMM_2X4__SCALAR, k_gt_1_subtile) {
46656 for (size_t k = 2; k < 10; k++) {
46657 for (uint32_t m = 1; m <= 2; m++) {
46658 for (uint32_t n = 1; n <= 4; n++) {
46659 GemmMicrokernelTester()
46660 .mr(2)
46661 .nr(4)
46662 .kr(1)
46663 .sr(1)
46664 .m(m)
46665 .n(n)
46666 .k(k)
46667 .iterations(1)
46668 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46669 }
46670 }
46671 }
46672}
46673
46674TEST(F32_GEMM_2X4__SCALAR, n_gt_4) {
46675 for (uint32_t n = 5; n < 8; n++) {
46676 for (size_t k = 1; k <= 5; k += 2) {
46677 GemmMicrokernelTester()
46678 .mr(2)
46679 .nr(4)
46680 .kr(1)
46681 .sr(1)
46682 .m(2)
46683 .n(4)
46684 .k(k)
46685 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46686 }
46687 }
46688}
46689
46690TEST(F32_GEMM_2X4__SCALAR, n_gt_4_strided_cn) {
46691 for (uint32_t n = 5; n < 8; n++) {
46692 for (size_t k = 1; k <= 5; k += 2) {
46693 GemmMicrokernelTester()
46694 .mr(2)
46695 .nr(4)
46696 .kr(1)
46697 .sr(1)
46698 .m(2)
46699 .n(4)
46700 .k(k)
46701 .cn_stride(7)
46702 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46703 }
46704 }
46705}
46706
46707TEST(F32_GEMM_2X4__SCALAR, n_gt_4_strided_a) {
46708 for (uint32_t n = 5; n < 8; n++) {
46709 for (size_t k = 1; k <= 5; k += 2) {
46710 GemmMicrokernelTester()
46711 .mr(2)
46712 .nr(4)
46713 .kr(1)
46714 .sr(1)
46715 .m(2)
46716 .n(n)
46717 .k(k)
46718 .a_stride(7)
46719 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46720 }
46721 }
46722}
46723
46724TEST(F32_GEMM_2X4__SCALAR, n_gt_4_subtile) {
46725 for (uint32_t n = 5; n < 8; n++) {
46726 for (size_t k = 1; k <= 5; k += 2) {
46727 for (uint32_t m = 1; m <= 2; m++) {
46728 GemmMicrokernelTester()
46729 .mr(2)
46730 .nr(4)
46731 .kr(1)
46732 .sr(1)
46733 .m(m)
46734 .n(n)
46735 .k(k)
46736 .iterations(1)
46737 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46738 }
46739 }
46740 }
46741}
46742
46743TEST(F32_GEMM_2X4__SCALAR, n_div_4) {
46744 for (uint32_t n = 8; n <= 12; n += 4) {
46745 for (size_t k = 1; k <= 5; k += 2) {
46746 GemmMicrokernelTester()
46747 .mr(2)
46748 .nr(4)
46749 .kr(1)
46750 .sr(1)
46751 .m(2)
46752 .n(4)
46753 .k(k)
46754 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46755 }
46756 }
46757}
46758
46759TEST(F32_GEMM_2X4__SCALAR, n_div_4_strided_cn) {
46760 for (uint32_t n = 8; n <= 12; n += 4) {
46761 for (size_t k = 1; k <= 5; k += 2) {
46762 GemmMicrokernelTester()
46763 .mr(2)
46764 .nr(4)
46765 .kr(1)
46766 .sr(1)
46767 .m(2)
46768 .n(n)
46769 .k(k)
46770 .cn_stride(7)
46771 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46772 }
46773 }
46774}
46775
46776TEST(F32_GEMM_2X4__SCALAR, n_div_4_strided_a) {
46777 for (uint32_t n = 8; n <= 12; n += 4) {
46778 for (size_t k = 1; k <= 5; k += 2) {
46779 GemmMicrokernelTester()
46780 .mr(2)
46781 .nr(4)
46782 .kr(1)
46783 .sr(1)
46784 .m(2)
46785 .n(n)
46786 .k(k)
46787 .a_stride(7)
46788 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46789 }
46790 }
46791}
46792
46793TEST(F32_GEMM_2X4__SCALAR, n_div_4_subtile) {
46794 for (uint32_t n = 8; n <= 12; n += 4) {
46795 for (size_t k = 1; k <= 5; k += 2) {
46796 for (uint32_t m = 1; m <= 2; m++) {
46797 GemmMicrokernelTester()
46798 .mr(2)
46799 .nr(4)
46800 .kr(1)
46801 .sr(1)
46802 .m(m)
46803 .n(n)
46804 .k(k)
46805 .iterations(1)
46806 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46807 }
46808 }
46809 }
46810}
46811
46812TEST(F32_GEMM_2X4__SCALAR, strided_cm_subtile) {
46813 for (size_t k = 1; k <= 5; k += 2) {
46814 for (uint32_t m = 1; m <= 2; m++) {
46815 for (uint32_t n = 1; n <= 4; n++) {
46816 GemmMicrokernelTester()
46817 .mr(2)
46818 .nr(4)
46819 .kr(1)
46820 .sr(1)
46821 .m(m)
46822 .n(n)
46823 .k(k)
46824 .cm_stride(7)
46825 .iterations(1)
46826 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46827 }
46828 }
46829 }
46830}
46831
46832TEST(F32_GEMM_2X4__SCALAR, qmin) {
46833 GemmMicrokernelTester()
46834 .mr(2)
46835 .nr(4)
46836 .kr(1)
46837 .sr(1)
46838 .m(2)
46839 .n(4)
46840 .k(1)
46841 .qmin(128)
46842 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46843}
46844
46845TEST(F32_GEMM_2X4__SCALAR, qmax) {
46846 GemmMicrokernelTester()
46847 .mr(2)
46848 .nr(4)
46849 .kr(1)
46850 .sr(1)
46851 .m(2)
46852 .n(4)
46853 .k(1)
46854 .qmax(128)
46855 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46856}
46857
46858TEST(F32_GEMM_2X4__SCALAR, strided_cm) {
46859 GemmMicrokernelTester()
46860 .mr(2)
46861 .nr(4)
46862 .kr(1)
46863 .sr(1)
46864 .m(2)
46865 .n(4)
46866 .k(1)
46867 .cm_stride(7)
46868 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46869}
46870
46871
46872TEST(F32_GEMM_4X4__SCALAR, k_eq_1) {
46873 GemmMicrokernelTester()
46874 .mr(4)
46875 .nr(4)
46876 .kr(1)
46877 .sr(1)
46878 .m(4)
46879 .n(4)
46880 .k(1)
46881 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46882}
46883
46884TEST(F32_GEMM_4X4__SCALAR, strided_cn) {
46885 GemmMicrokernelTester()
46886 .mr(4)
46887 .nr(4)
46888 .kr(1)
46889 .sr(1)
46890 .m(4)
46891 .n(4)
46892 .k(1)
46893 .cn_stride(7)
46894 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46895}
46896
46897TEST(F32_GEMM_4X4__SCALAR, k_eq_1_strided_a) {
46898 GemmMicrokernelTester()
46899 .mr(4)
46900 .nr(4)
46901 .kr(1)
46902 .sr(1)
46903 .m(4)
46904 .n(4)
46905 .k(1)
46906 .a_stride(3)
46907 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46908}
46909
46910TEST(F32_GEMM_4X4__SCALAR, k_eq_1_subtile) {
46911 for (uint32_t m = 1; m <= 4; m++) {
46912 for (uint32_t n = 1; n <= 4; n++) {
46913 GemmMicrokernelTester()
46914 .mr(4)
46915 .nr(4)
46916 .kr(1)
46917 .sr(1)
46918 .m(m)
46919 .n(n)
46920 .k(1)
46921 .iterations(1)
46922 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46923 }
46924 }
46925}
46926
46927TEST(F32_GEMM_4X4__SCALAR, k_eq_1_subtile_m) {
46928 for (uint32_t m = 1; m <= 4; m++) {
46929 GemmMicrokernelTester()
46930 .mr(4)
46931 .nr(4)
46932 .kr(1)
46933 .sr(1)
46934 .m(m)
46935 .n(4)
46936 .k(1)
46937 .iterations(1)
46938 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46939 }
46940}
46941
46942TEST(F32_GEMM_4X4__SCALAR, k_eq_1_subtile_n) {
46943 for (uint32_t n = 1; n <= 4; n++) {
46944 GemmMicrokernelTester()
46945 .mr(4)
46946 .nr(4)
46947 .kr(1)
46948 .sr(1)
46949 .m(4)
46950 .n(n)
46951 .k(1)
46952 .iterations(1)
46953 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46954 }
46955}
46956
46957TEST(F32_GEMM_4X4__SCALAR, k_gt_1) {
46958 for (size_t k = 2; k < 10; k++) {
46959 GemmMicrokernelTester()
46960 .mr(4)
46961 .nr(4)
46962 .kr(1)
46963 .sr(1)
46964 .m(4)
46965 .n(4)
46966 .k(k)
46967 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46968 }
46969}
46970
46971TEST(F32_GEMM_4X4__SCALAR, k_gt_1_strided_a) {
46972 for (size_t k = 2; k < 10; k++) {
46973 GemmMicrokernelTester()
46974 .mr(4)
46975 .nr(4)
46976 .kr(1)
46977 .sr(1)
46978 .m(4)
46979 .n(4)
46980 .k(k)
46981 .a_stride(11)
46982 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46983 }
46984}
46985
46986TEST(F32_GEMM_4X4__SCALAR, k_gt_1_subtile) {
46987 for (size_t k = 2; k < 10; k++) {
46988 for (uint32_t m = 1; m <= 4; m++) {
46989 for (uint32_t n = 1; n <= 4; n++) {
46990 GemmMicrokernelTester()
46991 .mr(4)
46992 .nr(4)
46993 .kr(1)
46994 .sr(1)
46995 .m(m)
46996 .n(n)
46997 .k(k)
46998 .iterations(1)
46999 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47000 }
47001 }
47002 }
47003}
47004
47005TEST(F32_GEMM_4X4__SCALAR, n_gt_4) {
47006 for (uint32_t n = 5; n < 8; n++) {
47007 for (size_t k = 1; k <= 5; k += 2) {
47008 GemmMicrokernelTester()
47009 .mr(4)
47010 .nr(4)
47011 .kr(1)
47012 .sr(1)
47013 .m(4)
47014 .n(4)
47015 .k(k)
47016 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47017 }
47018 }
47019}
47020
47021TEST(F32_GEMM_4X4__SCALAR, n_gt_4_strided_cn) {
47022 for (uint32_t n = 5; n < 8; n++) {
47023 for (size_t k = 1; k <= 5; k += 2) {
47024 GemmMicrokernelTester()
47025 .mr(4)
47026 .nr(4)
47027 .kr(1)
47028 .sr(1)
47029 .m(4)
47030 .n(4)
47031 .k(k)
47032 .cn_stride(7)
47033 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47034 }
47035 }
47036}
47037
47038TEST(F32_GEMM_4X4__SCALAR, n_gt_4_strided_a) {
47039 for (uint32_t n = 5; n < 8; n++) {
47040 for (size_t k = 1; k <= 5; k += 2) {
47041 GemmMicrokernelTester()
47042 .mr(4)
47043 .nr(4)
47044 .kr(1)
47045 .sr(1)
47046 .m(4)
47047 .n(n)
47048 .k(k)
47049 .a_stride(7)
47050 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47051 }
47052 }
47053}
47054
47055TEST(F32_GEMM_4X4__SCALAR, n_gt_4_subtile) {
47056 for (uint32_t n = 5; n < 8; n++) {
47057 for (size_t k = 1; k <= 5; k += 2) {
47058 for (uint32_t m = 1; m <= 4; m++) {
47059 GemmMicrokernelTester()
47060 .mr(4)
47061 .nr(4)
47062 .kr(1)
47063 .sr(1)
47064 .m(m)
47065 .n(n)
47066 .k(k)
47067 .iterations(1)
47068 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47069 }
47070 }
47071 }
47072}
47073
47074TEST(F32_GEMM_4X4__SCALAR, n_div_4) {
47075 for (uint32_t n = 8; n <= 12; n += 4) {
47076 for (size_t k = 1; k <= 5; k += 2) {
47077 GemmMicrokernelTester()
47078 .mr(4)
47079 .nr(4)
47080 .kr(1)
47081 .sr(1)
47082 .m(4)
47083 .n(4)
47084 .k(k)
47085 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47086 }
47087 }
47088}
47089
47090TEST(F32_GEMM_4X4__SCALAR, n_div_4_strided_cn) {
47091 for (uint32_t n = 8; n <= 12; n += 4) {
47092 for (size_t k = 1; k <= 5; k += 2) {
47093 GemmMicrokernelTester()
47094 .mr(4)
47095 .nr(4)
47096 .kr(1)
47097 .sr(1)
47098 .m(4)
47099 .n(n)
47100 .k(k)
47101 .cn_stride(7)
47102 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47103 }
47104 }
47105}
47106
47107TEST(F32_GEMM_4X4__SCALAR, n_div_4_strided_a) {
47108 for (uint32_t n = 8; n <= 12; n += 4) {
47109 for (size_t k = 1; k <= 5; k += 2) {
47110 GemmMicrokernelTester()
47111 .mr(4)
47112 .nr(4)
47113 .kr(1)
47114 .sr(1)
47115 .m(4)
47116 .n(n)
47117 .k(k)
47118 .a_stride(7)
47119 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47120 }
47121 }
47122}
47123
47124TEST(F32_GEMM_4X4__SCALAR, n_div_4_subtile) {
47125 for (uint32_t n = 8; n <= 12; n += 4) {
47126 for (size_t k = 1; k <= 5; k += 2) {
47127 for (uint32_t m = 1; m <= 4; m++) {
47128 GemmMicrokernelTester()
47129 .mr(4)
47130 .nr(4)
47131 .kr(1)
47132 .sr(1)
47133 .m(m)
47134 .n(n)
47135 .k(k)
47136 .iterations(1)
47137 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47138 }
47139 }
47140 }
47141}
47142
47143TEST(F32_GEMM_4X4__SCALAR, strided_cm_subtile) {
47144 for (size_t k = 1; k <= 5; k += 2) {
47145 for (uint32_t m = 1; m <= 4; m++) {
47146 for (uint32_t n = 1; n <= 4; n++) {
47147 GemmMicrokernelTester()
47148 .mr(4)
47149 .nr(4)
47150 .kr(1)
47151 .sr(1)
47152 .m(m)
47153 .n(n)
47154 .k(k)
47155 .cm_stride(7)
47156 .iterations(1)
47157 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47158 }
47159 }
47160 }
47161}
47162
47163TEST(F32_GEMM_4X4__SCALAR, qmin) {
47164 GemmMicrokernelTester()
47165 .mr(4)
47166 .nr(4)
47167 .kr(1)
47168 .sr(1)
47169 .m(4)
47170 .n(4)
47171 .k(1)
47172 .qmin(128)
47173 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47174}
47175
47176TEST(F32_GEMM_4X4__SCALAR, qmax) {
47177 GemmMicrokernelTester()
47178 .mr(4)
47179 .nr(4)
47180 .kr(1)
47181 .sr(1)
47182 .m(4)
47183 .n(4)
47184 .k(1)
47185 .qmax(128)
47186 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47187}
47188
47189TEST(F32_GEMM_4X4__SCALAR, strided_cm) {
47190 GemmMicrokernelTester()
47191 .mr(4)
47192 .nr(4)
47193 .kr(1)
47194 .sr(1)
47195 .m(4)
47196 .n(4)
47197 .k(1)
47198 .cm_stride(7)
47199 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47200}
47201
47202
47203TEST(F32_GEMM_4X2__SCALAR, k_eq_1) {
47204 GemmMicrokernelTester()
47205 .mr(4)
47206 .nr(2)
47207 .kr(1)
47208 .sr(1)
47209 .m(4)
47210 .n(2)
47211 .k(1)
47212 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47213}
47214
47215TEST(F32_GEMM_4X2__SCALAR, strided_cn) {
47216 GemmMicrokernelTester()
47217 .mr(4)
47218 .nr(2)
47219 .kr(1)
47220 .sr(1)
47221 .m(4)
47222 .n(2)
47223 .k(1)
47224 .cn_stride(5)
47225 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47226}
47227
47228TEST(F32_GEMM_4X2__SCALAR, k_eq_1_strided_a) {
47229 GemmMicrokernelTester()
47230 .mr(4)
47231 .nr(2)
47232 .kr(1)
47233 .sr(1)
47234 .m(4)
47235 .n(2)
47236 .k(1)
47237 .a_stride(3)
47238 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47239}
47240
47241TEST(F32_GEMM_4X2__SCALAR, k_eq_1_subtile) {
47242 for (uint32_t m = 1; m <= 4; m++) {
47243 for (uint32_t n = 1; n <= 2; n++) {
47244 GemmMicrokernelTester()
47245 .mr(4)
47246 .nr(2)
47247 .kr(1)
47248 .sr(1)
47249 .m(m)
47250 .n(n)
47251 .k(1)
47252 .iterations(1)
47253 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47254 }
47255 }
47256}
47257
47258TEST(F32_GEMM_4X2__SCALAR, k_eq_1_subtile_m) {
47259 for (uint32_t m = 1; m <= 4; m++) {
47260 GemmMicrokernelTester()
47261 .mr(4)
47262 .nr(2)
47263 .kr(1)
47264 .sr(1)
47265 .m(m)
47266 .n(2)
47267 .k(1)
47268 .iterations(1)
47269 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47270 }
47271}
47272
47273TEST(F32_GEMM_4X2__SCALAR, k_eq_1_subtile_n) {
47274 for (uint32_t n = 1; n <= 2; n++) {
47275 GemmMicrokernelTester()
47276 .mr(4)
47277 .nr(2)
47278 .kr(1)
47279 .sr(1)
47280 .m(4)
47281 .n(n)
47282 .k(1)
47283 .iterations(1)
47284 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47285 }
47286}
47287
47288TEST(F32_GEMM_4X2__SCALAR, k_gt_1) {
47289 for (size_t k = 2; k < 10; k++) {
47290 GemmMicrokernelTester()
47291 .mr(4)
47292 .nr(2)
47293 .kr(1)
47294 .sr(1)
47295 .m(4)
47296 .n(2)
47297 .k(k)
47298 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47299 }
47300}
47301
47302TEST(F32_GEMM_4X2__SCALAR, k_gt_1_strided_a) {
47303 for (size_t k = 2; k < 10; k++) {
47304 GemmMicrokernelTester()
47305 .mr(4)
47306 .nr(2)
47307 .kr(1)
47308 .sr(1)
47309 .m(4)
47310 .n(2)
47311 .k(k)
47312 .a_stride(11)
47313 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47314 }
47315}
47316
47317TEST(F32_GEMM_4X2__SCALAR, k_gt_1_subtile) {
47318 for (size_t k = 2; k < 10; k++) {
47319 for (uint32_t m = 1; m <= 4; m++) {
47320 for (uint32_t n = 1; n <= 2; n++) {
47321 GemmMicrokernelTester()
47322 .mr(4)
47323 .nr(2)
47324 .kr(1)
47325 .sr(1)
47326 .m(m)
47327 .n(n)
47328 .k(k)
47329 .iterations(1)
47330 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47331 }
47332 }
47333 }
47334}
47335
47336TEST(F32_GEMM_4X2__SCALAR, n_gt_2) {
47337 for (uint32_t n = 3; n < 4; n++) {
47338 for (size_t k = 1; k <= 5; k += 2) {
47339 GemmMicrokernelTester()
47340 .mr(4)
47341 .nr(2)
47342 .kr(1)
47343 .sr(1)
47344 .m(4)
47345 .n(2)
47346 .k(k)
47347 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47348 }
47349 }
47350}
47351
47352TEST(F32_GEMM_4X2__SCALAR, n_gt_2_strided_cn) {
47353 for (uint32_t n = 3; n < 4; n++) {
47354 for (size_t k = 1; k <= 5; k += 2) {
47355 GemmMicrokernelTester()
47356 .mr(4)
47357 .nr(2)
47358 .kr(1)
47359 .sr(1)
47360 .m(4)
47361 .n(2)
47362 .k(k)
47363 .cn_stride(5)
47364 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47365 }
47366 }
47367}
47368
47369TEST(F32_GEMM_4X2__SCALAR, n_gt_2_strided_a) {
47370 for (uint32_t n = 3; n < 4; n++) {
47371 for (size_t k = 1; k <= 5; k += 2) {
47372 GemmMicrokernelTester()
47373 .mr(4)
47374 .nr(2)
47375 .kr(1)
47376 .sr(1)
47377 .m(4)
47378 .n(n)
47379 .k(k)
47380 .a_stride(7)
47381 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47382 }
47383 }
47384}
47385
47386TEST(F32_GEMM_4X2__SCALAR, n_gt_2_subtile) {
47387 for (uint32_t n = 3; n < 4; n++) {
47388 for (size_t k = 1; k <= 5; k += 2) {
47389 for (uint32_t m = 1; m <= 4; m++) {
47390 GemmMicrokernelTester()
47391 .mr(4)
47392 .nr(2)
47393 .kr(1)
47394 .sr(1)
47395 .m(m)
47396 .n(n)
47397 .k(k)
47398 .iterations(1)
47399 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47400 }
47401 }
47402 }
47403}
47404
47405TEST(F32_GEMM_4X2__SCALAR, n_div_2) {
47406 for (uint32_t n = 4; n <= 6; n += 2) {
47407 for (size_t k = 1; k <= 5; k += 2) {
47408 GemmMicrokernelTester()
47409 .mr(4)
47410 .nr(2)
47411 .kr(1)
47412 .sr(1)
47413 .m(4)
47414 .n(2)
47415 .k(k)
47416 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47417 }
47418 }
47419}
47420
47421TEST(F32_GEMM_4X2__SCALAR, n_div_2_strided_cn) {
47422 for (uint32_t n = 4; n <= 6; n += 2) {
47423 for (size_t k = 1; k <= 5; k += 2) {
47424 GemmMicrokernelTester()
47425 .mr(4)
47426 .nr(2)
47427 .kr(1)
47428 .sr(1)
47429 .m(4)
47430 .n(n)
47431 .k(k)
47432 .cn_stride(5)
47433 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47434 }
47435 }
47436}
47437
47438TEST(F32_GEMM_4X2__SCALAR, n_div_2_strided_a) {
47439 for (uint32_t n = 4; n <= 6; n += 2) {
47440 for (size_t k = 1; k <= 5; k += 2) {
47441 GemmMicrokernelTester()
47442 .mr(4)
47443 .nr(2)
47444 .kr(1)
47445 .sr(1)
47446 .m(4)
47447 .n(n)
47448 .k(k)
47449 .a_stride(7)
47450 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47451 }
47452 }
47453}
47454
47455TEST(F32_GEMM_4X2__SCALAR, n_div_2_subtile) {
47456 for (uint32_t n = 4; n <= 6; n += 2) {
47457 for (size_t k = 1; k <= 5; k += 2) {
47458 for (uint32_t m = 1; m <= 4; m++) {
47459 GemmMicrokernelTester()
47460 .mr(4)
47461 .nr(2)
47462 .kr(1)
47463 .sr(1)
47464 .m(m)
47465 .n(n)
47466 .k(k)
47467 .iterations(1)
47468 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47469 }
47470 }
47471 }
47472}
47473
47474TEST(F32_GEMM_4X2__SCALAR, strided_cm_subtile) {
47475 for (size_t k = 1; k <= 5; k += 2) {
47476 for (uint32_t m = 1; m <= 4; m++) {
47477 for (uint32_t n = 1; n <= 2; n++) {
47478 GemmMicrokernelTester()
47479 .mr(4)
47480 .nr(2)
47481 .kr(1)
47482 .sr(1)
47483 .m(m)
47484 .n(n)
47485 .k(k)
47486 .cm_stride(5)
47487 .iterations(1)
47488 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47489 }
47490 }
47491 }
47492}
47493
47494TEST(F32_GEMM_4X2__SCALAR, qmin) {
47495 GemmMicrokernelTester()
47496 .mr(4)
47497 .nr(2)
47498 .kr(1)
47499 .sr(1)
47500 .m(4)
47501 .n(2)
47502 .k(1)
47503 .qmin(128)
47504 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47505}
47506
47507TEST(F32_GEMM_4X2__SCALAR, qmax) {
47508 GemmMicrokernelTester()
47509 .mr(4)
47510 .nr(2)
47511 .kr(1)
47512 .sr(1)
47513 .m(4)
47514 .n(2)
47515 .k(1)
47516 .qmax(128)
47517 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47518}
47519
47520TEST(F32_GEMM_4X2__SCALAR, strided_cm) {
47521 GemmMicrokernelTester()
47522 .mr(4)
47523 .nr(2)
47524 .kr(1)
47525 .sr(1)
47526 .m(4)
47527 .n(2)
47528 .k(1)
47529 .cm_stride(5)
47530 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
47531}