// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
//   Specification: test/f32-gemminc-minmax.yaml
//   Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
22#include "gemm-microkernel-tester.h"
23
24
25#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
26 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
27 TEST_REQUIRES_ARM_NEON_FMA;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(8)
36 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
37 }
38
39 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FMA;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(8)
49 .cn_stride(11)
50 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
51 }
52
53 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
54 TEST_REQUIRES_ARM_NEON_FMA;
55 GemmMicrokernelTester()
56 .mr(1)
57 .nr(8)
58 .kr(1)
59 .sr(1)
60 .m(1)
61 .n(8)
62 .k(8)
63 .a_stride(11)
64 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
65 }
66
67 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
68 TEST_REQUIRES_ARM_NEON_FMA;
69 for (uint32_t m = 1; m <= 1; m++) {
70 for (uint32_t n = 1; n <= 8; n++) {
71 GemmMicrokernelTester()
72 .mr(1)
73 .nr(8)
74 .kr(1)
75 .sr(1)
76 .m(m)
77 .n(n)
78 .k(8)
79 .iterations(1)
80 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
81 }
82 }
83 }
84
85 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
86 TEST_REQUIRES_ARM_NEON_FMA;
87 for (uint32_t m = 1; m <= 1; m++) {
88 GemmMicrokernelTester()
89 .mr(1)
90 .nr(8)
91 .kr(1)
92 .sr(1)
93 .m(m)
94 .n(8)
95 .k(8)
96 .iterations(1)
97 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
98 }
99 }
100
101 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
102 TEST_REQUIRES_ARM_NEON_FMA;
103 for (uint32_t n = 1; n <= 8; n++) {
104 GemmMicrokernelTester()
105 .mr(1)
106 .nr(8)
107 .kr(1)
108 .sr(1)
109 .m(1)
110 .n(n)
111 .k(8)
112 .iterations(1)
113 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
114 }
115 }
116
117 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
118 TEST_REQUIRES_ARM_NEON_FMA;
119 GemmMicrokernelTester()
120 .mr(1)
121 .nr(8)
122 .kr(1)
123 .sr(1)
124 .m(1)
125 .n(8)
126 .k(16)
127 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
128 }
129
130 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_strided_a) {
131 TEST_REQUIRES_ARM_NEON_FMA;
132 GemmMicrokernelTester()
133 .mr(1)
134 .nr(8)
135 .kr(1)
136 .sr(1)
137 .m(1)
138 .n(8)
139 .k(16)
140 .a_stride(19)
141 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
142 }
143
144 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
145 TEST_REQUIRES_ARM_NEON_FMA;
146 for (uint32_t m = 1; m <= 1; m++) {
147 for (uint32_t n = 1; n <= 8; n++) {
148 GemmMicrokernelTester()
149 .mr(1)
150 .nr(8)
151 .kr(1)
152 .sr(1)
153 .m(m)
154 .n(n)
155 .k(16)
156 .iterations(1)
157 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
158 }
159 }
160 }
161
162 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
163 TEST_REQUIRES_ARM_NEON_FMA;
164 for (size_t k = 1; k < 16; k++) {
165 GemmMicrokernelTester()
166 .mr(1)
167 .nr(8)
168 .kr(1)
169 .sr(1)
170 .m(1)
171 .n(8)
172 .k(k)
173 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
174 }
175 }
176
177 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_strided_a) {
178 TEST_REQUIRES_ARM_NEON_FMA;
179 for (size_t k = 1; k < 16; k++) {
180 GemmMicrokernelTester()
181 .mr(1)
182 .nr(8)
183 .kr(1)
184 .sr(1)
185 .m(1)
186 .n(8)
187 .k(k)
188 .a_stride(19)
189 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
190 }
191 }
192
193 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
194 TEST_REQUIRES_ARM_NEON_FMA;
195 for (size_t k = 1; k < 16; k++) {
196 for (uint32_t m = 1; m <= 1; m++) {
197 for (uint32_t n = 1; n <= 8; n++) {
198 GemmMicrokernelTester()
199 .mr(1)
200 .nr(8)
201 .kr(1)
202 .sr(1)
203 .m(m)
204 .n(n)
205 .k(k)
206 .iterations(1)
207 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
208 }
209 }
210 }
211 }
212
213 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
214 TEST_REQUIRES_ARM_NEON_FMA;
215 for (size_t k = 17; k < 16; k++) {
216 GemmMicrokernelTester()
217 .mr(1)
218 .nr(8)
219 .kr(1)
220 .sr(1)
221 .m(1)
222 .n(8)
223 .k(k)
224 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
225 }
226 }
227
228 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
229 TEST_REQUIRES_ARM_NEON_FMA;
230 for (size_t k = 17; k < 16; k++) {
231 GemmMicrokernelTester()
232 .mr(1)
233 .nr(8)
234 .kr(1)
235 .sr(1)
236 .m(1)
237 .n(8)
238 .k(k)
239 .a_stride(19)
240 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
241 }
242 }
243
244 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
245 TEST_REQUIRES_ARM_NEON_FMA;
246 for (size_t k = 17; k < 16; k++) {
247 for (uint32_t m = 1; m <= 1; m++) {
248 for (uint32_t n = 1; n <= 8; n++) {
249 GemmMicrokernelTester()
250 .mr(1)
251 .nr(8)
252 .kr(1)
253 .sr(1)
254 .m(m)
255 .n(n)
256 .k(k)
257 .iterations(1)
258 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
259 }
260 }
261 }
262 }
263
264 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
265 TEST_REQUIRES_ARM_NEON_FMA;
266 for (size_t k = 24; k <= 80; k += 8) {
267 GemmMicrokernelTester()
268 .mr(1)
269 .nr(8)
270 .kr(1)
271 .sr(1)
272 .m(1)
273 .n(8)
274 .k(k)
275 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
276 }
277 }
278
279 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_strided_a) {
280 TEST_REQUIRES_ARM_NEON_FMA;
281 for (size_t k = 24; k <= 80; k += 8) {
282 GemmMicrokernelTester()
283 .mr(1)
284 .nr(8)
285 .kr(1)
286 .sr(1)
287 .m(1)
288 .n(8)
289 .k(k)
290 .a_stride(83)
291 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
292 }
293 }
294
295 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
296 TEST_REQUIRES_ARM_NEON_FMA;
297 for (size_t k = 24; k <= 80; k += 8) {
298 for (uint32_t m = 1; m <= 1; m++) {
299 for (uint32_t n = 1; n <= 8; n++) {
300 GemmMicrokernelTester()
301 .mr(1)
302 .nr(8)
303 .kr(1)
304 .sr(1)
305 .m(m)
306 .n(n)
307 .k(k)
308 .iterations(1)
309 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
310 }
311 }
312 }
313 }
314
315 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
316 TEST_REQUIRES_ARM_NEON_FMA;
317 for (uint32_t n = 9; n < 16; n++) {
318 for (size_t k = 1; k <= 40; k += 9) {
319 GemmMicrokernelTester()
320 .mr(1)
321 .nr(8)
322 .kr(1)
323 .sr(1)
324 .m(1)
325 .n(8)
326 .k(k)
327 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
328 }
329 }
330 }
331
332 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
333 TEST_REQUIRES_ARM_NEON_FMA;
334 for (uint32_t n = 9; n < 16; n++) {
335 for (size_t k = 1; k <= 40; k += 9) {
336 GemmMicrokernelTester()
337 .mr(1)
338 .nr(8)
339 .kr(1)
340 .sr(1)
341 .m(1)
342 .n(8)
343 .k(k)
344 .cn_stride(11)
345 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
346 }
347 }
348 }
349
350 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
351 TEST_REQUIRES_ARM_NEON_FMA;
352 for (uint32_t n = 9; n < 16; n++) {
353 for (size_t k = 1; k <= 40; k += 9) {
354 GemmMicrokernelTester()
355 .mr(1)
356 .nr(8)
357 .kr(1)
358 .sr(1)
359 .m(1)
360 .n(n)
361 .k(k)
362 .a_stride(43)
363 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
364 }
365 }
366 }
367
368 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
369 TEST_REQUIRES_ARM_NEON_FMA;
370 for (uint32_t n = 9; n < 16; n++) {
371 for (size_t k = 1; k <= 40; k += 9) {
372 for (uint32_t m = 1; m <= 1; m++) {
373 GemmMicrokernelTester()
374 .mr(1)
375 .nr(8)
376 .kr(1)
377 .sr(1)
378 .m(m)
379 .n(n)
380 .k(k)
381 .iterations(1)
382 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
383 }
384 }
385 }
386 }
387
388 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
389 TEST_REQUIRES_ARM_NEON_FMA;
390 for (uint32_t n = 16; n <= 24; n += 8) {
391 for (size_t k = 1; k <= 40; k += 9) {
392 GemmMicrokernelTester()
393 .mr(1)
394 .nr(8)
395 .kr(1)
396 .sr(1)
397 .m(1)
398 .n(8)
399 .k(k)
400 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
401 }
402 }
403 }
404
405 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
406 TEST_REQUIRES_ARM_NEON_FMA;
407 for (uint32_t n = 16; n <= 24; n += 8) {
408 for (size_t k = 1; k <= 40; k += 9) {
409 GemmMicrokernelTester()
410 .mr(1)
411 .nr(8)
412 .kr(1)
413 .sr(1)
414 .m(1)
415 .n(n)
416 .k(k)
417 .cn_stride(11)
418 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
419 }
420 }
421 }
422
423 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
424 TEST_REQUIRES_ARM_NEON_FMA;
425 for (uint32_t n = 16; n <= 24; n += 8) {
426 for (size_t k = 1; k <= 40; k += 9) {
427 GemmMicrokernelTester()
428 .mr(1)
429 .nr(8)
430 .kr(1)
431 .sr(1)
432 .m(1)
433 .n(n)
434 .k(k)
435 .a_stride(43)
436 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
437 }
438 }
439 }
440
441 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
442 TEST_REQUIRES_ARM_NEON_FMA;
443 for (uint32_t n = 16; n <= 24; n += 8) {
444 for (size_t k = 1; k <= 40; k += 9) {
445 for (uint32_t m = 1; m <= 1; m++) {
446 GemmMicrokernelTester()
447 .mr(1)
448 .nr(8)
449 .kr(1)
450 .sr(1)
451 .m(m)
452 .n(n)
453 .k(k)
454 .iterations(1)
455 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
456 }
457 }
458 }
459 }
460
461 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
462 TEST_REQUIRES_ARM_NEON_FMA;
463 for (size_t k = 1; k <= 40; k += 9) {
464 for (uint32_t m = 1; m <= 1; m++) {
465 for (uint32_t n = 1; n <= 8; n++) {
466 GemmMicrokernelTester()
467 .mr(1)
468 .nr(8)
469 .kr(1)
470 .sr(1)
471 .m(m)
472 .n(n)
473 .k(k)
474 .cm_stride(11)
475 .iterations(1)
476 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
477 }
478 }
479 }
480 }
481
482 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
483 TEST_REQUIRES_ARM_NEON_FMA;
484 GemmMicrokernelTester()
485 .mr(1)
486 .nr(8)
487 .kr(1)
488 .sr(1)
489 .m(1)
490 .n(8)
491 .k(8)
492 .qmin(128)
493 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
494 }
495
496 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
497 TEST_REQUIRES_ARM_NEON_FMA;
498 GemmMicrokernelTester()
499 .mr(1)
500 .nr(8)
501 .kr(1)
502 .sr(1)
503 .m(1)
504 .n(8)
505 .k(8)
506 .qmax(128)
507 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
508 }
509
510 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
511 TEST_REQUIRES_ARM_NEON_FMA;
512 GemmMicrokernelTester()
513 .mr(1)
514 .nr(8)
515 .kr(1)
516 .sr(1)
517 .m(1)
518 .n(8)
519 .k(8)
520 .cm_stride(11)
521 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
522 }
523#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
524
525
526#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
527 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
528 TEST_REQUIRES_ARM_NEON_FMA;
529 GemmMicrokernelTester()
530 .mr(1)
531 .nr(8)
532 .kr(1)
533 .sr(1)
534 .m(1)
535 .n(8)
536 .k(8)
537 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
538 }
539
540 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
541 TEST_REQUIRES_ARM_NEON_FMA;
542 GemmMicrokernelTester()
543 .mr(1)
544 .nr(8)
545 .kr(1)
546 .sr(1)
547 .m(1)
548 .n(8)
549 .k(8)
550 .cn_stride(11)
551 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
552 }
553
554 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
555 TEST_REQUIRES_ARM_NEON_FMA;
556 GemmMicrokernelTester()
557 .mr(1)
558 .nr(8)
559 .kr(1)
560 .sr(1)
561 .m(1)
562 .n(8)
563 .k(8)
564 .a_stride(11)
565 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
566 }
567
568 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
569 TEST_REQUIRES_ARM_NEON_FMA;
570 for (uint32_t m = 1; m <= 1; m++) {
571 for (uint32_t n = 1; n <= 8; n++) {
572 GemmMicrokernelTester()
573 .mr(1)
574 .nr(8)
575 .kr(1)
576 .sr(1)
577 .m(m)
578 .n(n)
579 .k(8)
580 .iterations(1)
581 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
582 }
583 }
584 }
585
586 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
587 TEST_REQUIRES_ARM_NEON_FMA;
588 for (uint32_t m = 1; m <= 1; m++) {
589 GemmMicrokernelTester()
590 .mr(1)
591 .nr(8)
592 .kr(1)
593 .sr(1)
594 .m(m)
595 .n(8)
596 .k(8)
597 .iterations(1)
598 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
599 }
600 }
601
602 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
603 TEST_REQUIRES_ARM_NEON_FMA;
604 for (uint32_t n = 1; n <= 8; n++) {
605 GemmMicrokernelTester()
606 .mr(1)
607 .nr(8)
608 .kr(1)
609 .sr(1)
610 .m(1)
611 .n(n)
612 .k(8)
613 .iterations(1)
614 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
615 }
616 }
617
618 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
619 TEST_REQUIRES_ARM_NEON_FMA;
620 GemmMicrokernelTester()
621 .mr(1)
622 .nr(8)
623 .kr(1)
624 .sr(1)
625 .m(1)
626 .n(8)
627 .k(16)
628 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
629 }
630
631 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
632 TEST_REQUIRES_ARM_NEON_FMA;
633 GemmMicrokernelTester()
634 .mr(1)
635 .nr(8)
636 .kr(1)
637 .sr(1)
638 .m(1)
639 .n(8)
640 .k(16)
641 .a_stride(19)
642 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
643 }
644
645 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
646 TEST_REQUIRES_ARM_NEON_FMA;
647 for (uint32_t m = 1; m <= 1; m++) {
648 for (uint32_t n = 1; n <= 8; n++) {
649 GemmMicrokernelTester()
650 .mr(1)
651 .nr(8)
652 .kr(1)
653 .sr(1)
654 .m(m)
655 .n(n)
656 .k(16)
657 .iterations(1)
658 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
659 }
660 }
661 }
662
663 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
664 TEST_REQUIRES_ARM_NEON_FMA;
665 for (size_t k = 1; k < 16; k++) {
666 GemmMicrokernelTester()
667 .mr(1)
668 .nr(8)
669 .kr(1)
670 .sr(1)
671 .m(1)
672 .n(8)
673 .k(k)
674 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
675 }
676 }
677
678 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
679 TEST_REQUIRES_ARM_NEON_FMA;
680 for (size_t k = 1; k < 16; k++) {
681 GemmMicrokernelTester()
682 .mr(1)
683 .nr(8)
684 .kr(1)
685 .sr(1)
686 .m(1)
687 .n(8)
688 .k(k)
689 .a_stride(19)
690 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
691 }
692 }
693
694 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
695 TEST_REQUIRES_ARM_NEON_FMA;
696 for (size_t k = 1; k < 16; k++) {
697 for (uint32_t m = 1; m <= 1; m++) {
698 for (uint32_t n = 1; n <= 8; n++) {
699 GemmMicrokernelTester()
700 .mr(1)
701 .nr(8)
702 .kr(1)
703 .sr(1)
704 .m(m)
705 .n(n)
706 .k(k)
707 .iterations(1)
708 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
709 }
710 }
711 }
712 }
713
714 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
715 TEST_REQUIRES_ARM_NEON_FMA;
716 for (size_t k = 17; k < 16; k++) {
717 GemmMicrokernelTester()
718 .mr(1)
719 .nr(8)
720 .kr(1)
721 .sr(1)
722 .m(1)
723 .n(8)
724 .k(k)
725 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
726 }
727 }
728
729 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
730 TEST_REQUIRES_ARM_NEON_FMA;
731 for (size_t k = 17; k < 16; k++) {
732 GemmMicrokernelTester()
733 .mr(1)
734 .nr(8)
735 .kr(1)
736 .sr(1)
737 .m(1)
738 .n(8)
739 .k(k)
740 .a_stride(19)
741 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
742 }
743 }
744
745 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
746 TEST_REQUIRES_ARM_NEON_FMA;
747 for (size_t k = 17; k < 16; k++) {
748 for (uint32_t m = 1; m <= 1; m++) {
749 for (uint32_t n = 1; n <= 8; n++) {
750 GemmMicrokernelTester()
751 .mr(1)
752 .nr(8)
753 .kr(1)
754 .sr(1)
755 .m(m)
756 .n(n)
757 .k(k)
758 .iterations(1)
759 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
760 }
761 }
762 }
763 }
764
765 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
766 TEST_REQUIRES_ARM_NEON_FMA;
767 for (size_t k = 24; k <= 80; k += 8) {
768 GemmMicrokernelTester()
769 .mr(1)
770 .nr(8)
771 .kr(1)
772 .sr(1)
773 .m(1)
774 .n(8)
775 .k(k)
776 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
777 }
778 }
779
780 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
781 TEST_REQUIRES_ARM_NEON_FMA;
782 for (size_t k = 24; k <= 80; k += 8) {
783 GemmMicrokernelTester()
784 .mr(1)
785 .nr(8)
786 .kr(1)
787 .sr(1)
788 .m(1)
789 .n(8)
790 .k(k)
791 .a_stride(83)
792 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
793 }
794 }
795
796 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
797 TEST_REQUIRES_ARM_NEON_FMA;
798 for (size_t k = 24; k <= 80; k += 8) {
799 for (uint32_t m = 1; m <= 1; m++) {
800 for (uint32_t n = 1; n <= 8; n++) {
801 GemmMicrokernelTester()
802 .mr(1)
803 .nr(8)
804 .kr(1)
805 .sr(1)
806 .m(m)
807 .n(n)
808 .k(k)
809 .iterations(1)
810 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
811 }
812 }
813 }
814 }
815
816 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
817 TEST_REQUIRES_ARM_NEON_FMA;
818 for (uint32_t n = 9; n < 16; n++) {
819 for (size_t k = 1; k <= 40; k += 9) {
820 GemmMicrokernelTester()
821 .mr(1)
822 .nr(8)
823 .kr(1)
824 .sr(1)
825 .m(1)
826 .n(8)
827 .k(k)
828 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
829 }
830 }
831 }
832
833 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
834 TEST_REQUIRES_ARM_NEON_FMA;
835 for (uint32_t n = 9; n < 16; n++) {
836 for (size_t k = 1; k <= 40; k += 9) {
837 GemmMicrokernelTester()
838 .mr(1)
839 .nr(8)
840 .kr(1)
841 .sr(1)
842 .m(1)
843 .n(8)
844 .k(k)
845 .cn_stride(11)
846 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
847 }
848 }
849 }
850
851 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
852 TEST_REQUIRES_ARM_NEON_FMA;
853 for (uint32_t n = 9; n < 16; n++) {
854 for (size_t k = 1; k <= 40; k += 9) {
855 GemmMicrokernelTester()
856 .mr(1)
857 .nr(8)
858 .kr(1)
859 .sr(1)
860 .m(1)
861 .n(n)
862 .k(k)
863 .a_stride(43)
864 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
865 }
866 }
867 }
868
869 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
870 TEST_REQUIRES_ARM_NEON_FMA;
871 for (uint32_t n = 9; n < 16; n++) {
872 for (size_t k = 1; k <= 40; k += 9) {
873 for (uint32_t m = 1; m <= 1; m++) {
874 GemmMicrokernelTester()
875 .mr(1)
876 .nr(8)
877 .kr(1)
878 .sr(1)
879 .m(m)
880 .n(n)
881 .k(k)
882 .iterations(1)
883 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
884 }
885 }
886 }
887 }
888
889 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
890 TEST_REQUIRES_ARM_NEON_FMA;
891 for (uint32_t n = 16; n <= 24; n += 8) {
892 for (size_t k = 1; k <= 40; k += 9) {
893 GemmMicrokernelTester()
894 .mr(1)
895 .nr(8)
896 .kr(1)
897 .sr(1)
898 .m(1)
899 .n(8)
900 .k(k)
901 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
902 }
903 }
904 }
905
906 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
907 TEST_REQUIRES_ARM_NEON_FMA;
908 for (uint32_t n = 16; n <= 24; n += 8) {
909 for (size_t k = 1; k <= 40; k += 9) {
910 GemmMicrokernelTester()
911 .mr(1)
912 .nr(8)
913 .kr(1)
914 .sr(1)
915 .m(1)
916 .n(n)
917 .k(k)
918 .cn_stride(11)
919 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
920 }
921 }
922 }
923
924 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
925 TEST_REQUIRES_ARM_NEON_FMA;
926 for (uint32_t n = 16; n <= 24; n += 8) {
927 for (size_t k = 1; k <= 40; k += 9) {
928 GemmMicrokernelTester()
929 .mr(1)
930 .nr(8)
931 .kr(1)
932 .sr(1)
933 .m(1)
934 .n(n)
935 .k(k)
936 .a_stride(43)
937 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
938 }
939 }
940 }
941
942 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
943 TEST_REQUIRES_ARM_NEON_FMA;
944 for (uint32_t n = 16; n <= 24; n += 8) {
945 for (size_t k = 1; k <= 40; k += 9) {
946 for (uint32_t m = 1; m <= 1; m++) {
947 GemmMicrokernelTester()
948 .mr(1)
949 .nr(8)
950 .kr(1)
951 .sr(1)
952 .m(m)
953 .n(n)
954 .k(k)
955 .iterations(1)
956 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
957 }
958 }
959 }
960 }
961
962 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
963 TEST_REQUIRES_ARM_NEON_FMA;
964 for (size_t k = 1; k <= 40; k += 9) {
965 for (uint32_t m = 1; m <= 1; m++) {
966 for (uint32_t n = 1; n <= 8; n++) {
967 GemmMicrokernelTester()
968 .mr(1)
969 .nr(8)
970 .kr(1)
971 .sr(1)
972 .m(m)
973 .n(n)
974 .k(k)
975 .cm_stride(11)
976 .iterations(1)
977 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
978 }
979 }
980 }
981 }
982
983 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
984 TEST_REQUIRES_ARM_NEON_FMA;
985 GemmMicrokernelTester()
986 .mr(1)
987 .nr(8)
988 .kr(1)
989 .sr(1)
990 .m(1)
991 .n(8)
992 .k(8)
993 .qmin(128)
994 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
995 }
996
997 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
998 TEST_REQUIRES_ARM_NEON_FMA;
999 GemmMicrokernelTester()
1000 .mr(1)
1001 .nr(8)
1002 .kr(1)
1003 .sr(1)
1004 .m(1)
1005 .n(8)
1006 .k(8)
1007 .qmax(128)
1008 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
1009 }
1010
1011 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
1012 TEST_REQUIRES_ARM_NEON_FMA;
1013 GemmMicrokernelTester()
1014 .mr(1)
1015 .nr(8)
1016 .kr(1)
1017 .sr(1)
1018 .m(1)
1019 .n(8)
1020 .k(8)
1021 .cm_stride(11)
1022 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
1023 }
1024#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1025
1026
1027#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1028 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
1029 TEST_REQUIRES_ARM_NEON_FMA;
1030 GemmMicrokernelTester()
1031 .mr(1)
1032 .nr(8)
1033 .kr(1)
1034 .sr(1)
1035 .m(1)
1036 .n(8)
1037 .k(8)
1038 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1039 }
1040
1041 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
1042 TEST_REQUIRES_ARM_NEON_FMA;
1043 GemmMicrokernelTester()
1044 .mr(1)
1045 .nr(8)
1046 .kr(1)
1047 .sr(1)
1048 .m(1)
1049 .n(8)
1050 .k(8)
1051 .cn_stride(11)
1052 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1053 }
1054
1055 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
1056 TEST_REQUIRES_ARM_NEON_FMA;
1057 GemmMicrokernelTester()
1058 .mr(1)
1059 .nr(8)
1060 .kr(1)
1061 .sr(1)
1062 .m(1)
1063 .n(8)
1064 .k(8)
1065 .a_stride(11)
1066 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1067 }
1068
1069 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
1070 TEST_REQUIRES_ARM_NEON_FMA;
1071 for (uint32_t m = 1; m <= 1; m++) {
1072 for (uint32_t n = 1; n <= 8; n++) {
1073 GemmMicrokernelTester()
1074 .mr(1)
1075 .nr(8)
1076 .kr(1)
1077 .sr(1)
1078 .m(m)
1079 .n(n)
1080 .k(8)
1081 .iterations(1)
1082 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1083 }
1084 }
1085 }
1086
1087 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
1088 TEST_REQUIRES_ARM_NEON_FMA;
1089 for (uint32_t m = 1; m <= 1; m++) {
1090 GemmMicrokernelTester()
1091 .mr(1)
1092 .nr(8)
1093 .kr(1)
1094 .sr(1)
1095 .m(m)
1096 .n(8)
1097 .k(8)
1098 .iterations(1)
1099 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1100 }
1101 }
1102
1103 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
1104 TEST_REQUIRES_ARM_NEON_FMA;
1105 for (uint32_t n = 1; n <= 8; n++) {
1106 GemmMicrokernelTester()
1107 .mr(1)
1108 .nr(8)
1109 .kr(1)
1110 .sr(1)
1111 .m(1)
1112 .n(n)
1113 .k(8)
1114 .iterations(1)
1115 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1116 }
1117 }
1118
1119 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
1120 TEST_REQUIRES_ARM_NEON_FMA;
1121 GemmMicrokernelTester()
1122 .mr(1)
1123 .nr(8)
1124 .kr(1)
1125 .sr(1)
1126 .m(1)
1127 .n(8)
1128 .k(16)
1129 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1130 }
1131
1132 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
1133 TEST_REQUIRES_ARM_NEON_FMA;
1134 GemmMicrokernelTester()
1135 .mr(1)
1136 .nr(8)
1137 .kr(1)
1138 .sr(1)
1139 .m(1)
1140 .n(8)
1141 .k(16)
1142 .a_stride(19)
1143 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1144 }
1145
1146 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
1147 TEST_REQUIRES_ARM_NEON_FMA;
1148 for (uint32_t m = 1; m <= 1; m++) {
1149 for (uint32_t n = 1; n <= 8; n++) {
1150 GemmMicrokernelTester()
1151 .mr(1)
1152 .nr(8)
1153 .kr(1)
1154 .sr(1)
1155 .m(m)
1156 .n(n)
1157 .k(16)
1158 .iterations(1)
1159 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1160 }
1161 }
1162 }
1163
1164 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
1165 TEST_REQUIRES_ARM_NEON_FMA;
1166 for (size_t k = 1; k < 16; k++) {
1167 GemmMicrokernelTester()
1168 .mr(1)
1169 .nr(8)
1170 .kr(1)
1171 .sr(1)
1172 .m(1)
1173 .n(8)
1174 .k(k)
1175 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1176 }
1177 }
1178
1179 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
1180 TEST_REQUIRES_ARM_NEON_FMA;
1181 for (size_t k = 1; k < 16; k++) {
1182 GemmMicrokernelTester()
1183 .mr(1)
1184 .nr(8)
1185 .kr(1)
1186 .sr(1)
1187 .m(1)
1188 .n(8)
1189 .k(k)
1190 .a_stride(19)
1191 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1192 }
1193 }
1194
1195 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
1196 TEST_REQUIRES_ARM_NEON_FMA;
1197 for (size_t k = 1; k < 16; k++) {
1198 for (uint32_t m = 1; m <= 1; m++) {
1199 for (uint32_t n = 1; n <= 8; n++) {
1200 GemmMicrokernelTester()
1201 .mr(1)
1202 .nr(8)
1203 .kr(1)
1204 .sr(1)
1205 .m(m)
1206 .n(n)
1207 .k(k)
1208 .iterations(1)
1209 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1210 }
1211 }
1212 }
1213 }
1214
1215 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
1216 TEST_REQUIRES_ARM_NEON_FMA;
1217 for (size_t k = 17; k < 16; k++) {
1218 GemmMicrokernelTester()
1219 .mr(1)
1220 .nr(8)
1221 .kr(1)
1222 .sr(1)
1223 .m(1)
1224 .n(8)
1225 .k(k)
1226 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1227 }
1228 }
1229
1230 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
1231 TEST_REQUIRES_ARM_NEON_FMA;
1232 for (size_t k = 17; k < 16; k++) {
1233 GemmMicrokernelTester()
1234 .mr(1)
1235 .nr(8)
1236 .kr(1)
1237 .sr(1)
1238 .m(1)
1239 .n(8)
1240 .k(k)
1241 .a_stride(19)
1242 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1243 }
1244 }
1245
1246 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
1247 TEST_REQUIRES_ARM_NEON_FMA;
1248 for (size_t k = 17; k < 16; k++) {
1249 for (uint32_t m = 1; m <= 1; m++) {
1250 for (uint32_t n = 1; n <= 8; n++) {
1251 GemmMicrokernelTester()
1252 .mr(1)
1253 .nr(8)
1254 .kr(1)
1255 .sr(1)
1256 .m(m)
1257 .n(n)
1258 .k(k)
1259 .iterations(1)
1260 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1261 }
1262 }
1263 }
1264 }
1265
1266 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
1267 TEST_REQUIRES_ARM_NEON_FMA;
1268 for (size_t k = 24; k <= 80; k += 8) {
1269 GemmMicrokernelTester()
1270 .mr(1)
1271 .nr(8)
1272 .kr(1)
1273 .sr(1)
1274 .m(1)
1275 .n(8)
1276 .k(k)
1277 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1278 }
1279 }
1280
1281 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
1282 TEST_REQUIRES_ARM_NEON_FMA;
1283 for (size_t k = 24; k <= 80; k += 8) {
1284 GemmMicrokernelTester()
1285 .mr(1)
1286 .nr(8)
1287 .kr(1)
1288 .sr(1)
1289 .m(1)
1290 .n(8)
1291 .k(k)
1292 .a_stride(83)
1293 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1294 }
1295 }
1296
1297 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
1298 TEST_REQUIRES_ARM_NEON_FMA;
1299 for (size_t k = 24; k <= 80; k += 8) {
1300 for (uint32_t m = 1; m <= 1; m++) {
1301 for (uint32_t n = 1; n <= 8; n++) {
1302 GemmMicrokernelTester()
1303 .mr(1)
1304 .nr(8)
1305 .kr(1)
1306 .sr(1)
1307 .m(m)
1308 .n(n)
1309 .k(k)
1310 .iterations(1)
1311 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1312 }
1313 }
1314 }
1315 }
1316
1317 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
1318 TEST_REQUIRES_ARM_NEON_FMA;
1319 for (uint32_t n = 9; n < 16; n++) {
1320 for (size_t k = 1; k <= 40; k += 9) {
1321 GemmMicrokernelTester()
1322 .mr(1)
1323 .nr(8)
1324 .kr(1)
1325 .sr(1)
1326 .m(1)
1327 .n(8)
1328 .k(k)
1329 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1330 }
1331 }
1332 }
1333
1334 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
1335 TEST_REQUIRES_ARM_NEON_FMA;
1336 for (uint32_t n = 9; n < 16; n++) {
1337 for (size_t k = 1; k <= 40; k += 9) {
1338 GemmMicrokernelTester()
1339 .mr(1)
1340 .nr(8)
1341 .kr(1)
1342 .sr(1)
1343 .m(1)
1344 .n(8)
1345 .k(k)
1346 .cn_stride(11)
1347 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1348 }
1349 }
1350 }
1351
1352 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
1353 TEST_REQUIRES_ARM_NEON_FMA;
1354 for (uint32_t n = 9; n < 16; n++) {
1355 for (size_t k = 1; k <= 40; k += 9) {
1356 GemmMicrokernelTester()
1357 .mr(1)
1358 .nr(8)
1359 .kr(1)
1360 .sr(1)
1361 .m(1)
1362 .n(n)
1363 .k(k)
1364 .a_stride(43)
1365 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1366 }
1367 }
1368 }
1369
1370 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
1371 TEST_REQUIRES_ARM_NEON_FMA;
1372 for (uint32_t n = 9; n < 16; n++) {
1373 for (size_t k = 1; k <= 40; k += 9) {
1374 for (uint32_t m = 1; m <= 1; m++) {
1375 GemmMicrokernelTester()
1376 .mr(1)
1377 .nr(8)
1378 .kr(1)
1379 .sr(1)
1380 .m(m)
1381 .n(n)
1382 .k(k)
1383 .iterations(1)
1384 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1385 }
1386 }
1387 }
1388 }
1389
1390 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
1391 TEST_REQUIRES_ARM_NEON_FMA;
1392 for (uint32_t n = 16; n <= 24; n += 8) {
1393 for (size_t k = 1; k <= 40; k += 9) {
1394 GemmMicrokernelTester()
1395 .mr(1)
1396 .nr(8)
1397 .kr(1)
1398 .sr(1)
1399 .m(1)
1400 .n(8)
1401 .k(k)
1402 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1403 }
1404 }
1405 }
1406
1407 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
1408 TEST_REQUIRES_ARM_NEON_FMA;
1409 for (uint32_t n = 16; n <= 24; n += 8) {
1410 for (size_t k = 1; k <= 40; k += 9) {
1411 GemmMicrokernelTester()
1412 .mr(1)
1413 .nr(8)
1414 .kr(1)
1415 .sr(1)
1416 .m(1)
1417 .n(n)
1418 .k(k)
1419 .cn_stride(11)
1420 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1421 }
1422 }
1423 }
1424
1425 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
1426 TEST_REQUIRES_ARM_NEON_FMA;
1427 for (uint32_t n = 16; n <= 24; n += 8) {
1428 for (size_t k = 1; k <= 40; k += 9) {
1429 GemmMicrokernelTester()
1430 .mr(1)
1431 .nr(8)
1432 .kr(1)
1433 .sr(1)
1434 .m(1)
1435 .n(n)
1436 .k(k)
1437 .a_stride(43)
1438 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1439 }
1440 }
1441 }
1442
1443 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
1444 TEST_REQUIRES_ARM_NEON_FMA;
1445 for (uint32_t n = 16; n <= 24; n += 8) {
1446 for (size_t k = 1; k <= 40; k += 9) {
1447 for (uint32_t m = 1; m <= 1; m++) {
1448 GemmMicrokernelTester()
1449 .mr(1)
1450 .nr(8)
1451 .kr(1)
1452 .sr(1)
1453 .m(m)
1454 .n(n)
1455 .k(k)
1456 .iterations(1)
1457 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1458 }
1459 }
1460 }
1461 }
1462
1463 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
1464 TEST_REQUIRES_ARM_NEON_FMA;
1465 for (size_t k = 1; k <= 40; k += 9) {
1466 for (uint32_t m = 1; m <= 1; m++) {
1467 for (uint32_t n = 1; n <= 8; n++) {
1468 GemmMicrokernelTester()
1469 .mr(1)
1470 .nr(8)
1471 .kr(1)
1472 .sr(1)
1473 .m(m)
1474 .n(n)
1475 .k(k)
1476 .cm_stride(11)
1477 .iterations(1)
1478 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1479 }
1480 }
1481 }
1482 }
1483
1484 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
1485 TEST_REQUIRES_ARM_NEON_FMA;
1486 GemmMicrokernelTester()
1487 .mr(1)
1488 .nr(8)
1489 .kr(1)
1490 .sr(1)
1491 .m(1)
1492 .n(8)
1493 .k(8)
1494 .qmin(128)
1495 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1496 }
1497
1498 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
1499 TEST_REQUIRES_ARM_NEON_FMA;
1500 GemmMicrokernelTester()
1501 .mr(1)
1502 .nr(8)
1503 .kr(1)
1504 .sr(1)
1505 .m(1)
1506 .n(8)
1507 .k(8)
1508 .qmax(128)
1509 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1510 }
1511
1512 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
1513 TEST_REQUIRES_ARM_NEON_FMA;
1514 GemmMicrokernelTester()
1515 .mr(1)
1516 .nr(8)
1517 .kr(1)
1518 .sr(1)
1519 .m(1)
1520 .n(8)
1521 .k(8)
1522 .cm_stride(11)
1523 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1524 }
1525#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1526
1527
1528#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1529 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
1530 TEST_REQUIRES_ARM_NEON_FMA;
1531 GemmMicrokernelTester()
1532 .mr(4)
1533 .nr(8)
1534 .kr(1)
1535 .sr(1)
1536 .m(4)
1537 .n(8)
1538 .k(4)
1539 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1540 }
1541
1542 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
1543 TEST_REQUIRES_ARM_NEON_FMA;
1544 GemmMicrokernelTester()
1545 .mr(4)
1546 .nr(8)
1547 .kr(1)
1548 .sr(1)
1549 .m(4)
1550 .n(8)
1551 .k(4)
1552 .cn_stride(11)
1553 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1554 }
1555
1556 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
1557 TEST_REQUIRES_ARM_NEON_FMA;
1558 GemmMicrokernelTester()
1559 .mr(4)
1560 .nr(8)
1561 .kr(1)
1562 .sr(1)
1563 .m(4)
1564 .n(8)
1565 .k(4)
1566 .a_stride(7)
1567 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1568 }
1569
1570 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
1571 TEST_REQUIRES_ARM_NEON_FMA;
1572 for (uint32_t m = 1; m <= 4; m++) {
1573 for (uint32_t n = 1; n <= 8; n++) {
1574 GemmMicrokernelTester()
1575 .mr(4)
1576 .nr(8)
1577 .kr(1)
1578 .sr(1)
1579 .m(m)
1580 .n(n)
1581 .k(4)
1582 .iterations(1)
1583 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1584 }
1585 }
1586 }
1587
1588 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
1589 TEST_REQUIRES_ARM_NEON_FMA;
1590 for (uint32_t m = 1; m <= 4; m++) {
1591 GemmMicrokernelTester()
1592 .mr(4)
1593 .nr(8)
1594 .kr(1)
1595 .sr(1)
1596 .m(m)
1597 .n(8)
1598 .k(4)
1599 .iterations(1)
1600 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1601 }
1602 }
1603
1604 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
1605 TEST_REQUIRES_ARM_NEON_FMA;
1606 for (uint32_t n = 1; n <= 8; n++) {
1607 GemmMicrokernelTester()
1608 .mr(4)
1609 .nr(8)
1610 .kr(1)
1611 .sr(1)
1612 .m(4)
1613 .n(n)
1614 .k(4)
1615 .iterations(1)
1616 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1617 }
1618 }
1619
1620 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
1621 TEST_REQUIRES_ARM_NEON_FMA;
1622 GemmMicrokernelTester()
1623 .mr(4)
1624 .nr(8)
1625 .kr(1)
1626 .sr(1)
1627 .m(4)
1628 .n(8)
1629 .k(8)
1630 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1631 }
1632
1633 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
1634 TEST_REQUIRES_ARM_NEON_FMA;
1635 GemmMicrokernelTester()
1636 .mr(4)
1637 .nr(8)
1638 .kr(1)
1639 .sr(1)
1640 .m(4)
1641 .n(8)
1642 .k(8)
1643 .a_stride(11)
1644 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1645 }
1646
1647 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
1648 TEST_REQUIRES_ARM_NEON_FMA;
1649 for (uint32_t m = 1; m <= 4; m++) {
1650 for (uint32_t n = 1; n <= 8; n++) {
1651 GemmMicrokernelTester()
1652 .mr(4)
1653 .nr(8)
1654 .kr(1)
1655 .sr(1)
1656 .m(m)
1657 .n(n)
1658 .k(8)
1659 .iterations(1)
1660 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1661 }
1662 }
1663 }
1664
1665 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
1666 TEST_REQUIRES_ARM_NEON_FMA;
1667 for (size_t k = 1; k < 8; k++) {
1668 GemmMicrokernelTester()
1669 .mr(4)
1670 .nr(8)
1671 .kr(1)
1672 .sr(1)
1673 .m(4)
1674 .n(8)
1675 .k(k)
1676 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1677 }
1678 }
1679
1680 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
1681 TEST_REQUIRES_ARM_NEON_FMA;
1682 for (size_t k = 1; k < 8; k++) {
1683 GemmMicrokernelTester()
1684 .mr(4)
1685 .nr(8)
1686 .kr(1)
1687 .sr(1)
1688 .m(4)
1689 .n(8)
1690 .k(k)
1691 .a_stride(11)
1692 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1693 }
1694 }
1695
1696 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
1697 TEST_REQUIRES_ARM_NEON_FMA;
1698 for (size_t k = 1; k < 8; k++) {
1699 for (uint32_t m = 1; m <= 4; m++) {
1700 for (uint32_t n = 1; n <= 8; n++) {
1701 GemmMicrokernelTester()
1702 .mr(4)
1703 .nr(8)
1704 .kr(1)
1705 .sr(1)
1706 .m(m)
1707 .n(n)
1708 .k(k)
1709 .iterations(1)
1710 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1711 }
1712 }
1713 }
1714 }
1715
1716 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
1717 TEST_REQUIRES_ARM_NEON_FMA;
1718 for (size_t k = 9; k < 8; k++) {
1719 GemmMicrokernelTester()
1720 .mr(4)
1721 .nr(8)
1722 .kr(1)
1723 .sr(1)
1724 .m(4)
1725 .n(8)
1726 .k(k)
1727 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1728 }
1729 }
1730
1731 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
1732 TEST_REQUIRES_ARM_NEON_FMA;
1733 for (size_t k = 9; k < 8; k++) {
1734 GemmMicrokernelTester()
1735 .mr(4)
1736 .nr(8)
1737 .kr(1)
1738 .sr(1)
1739 .m(4)
1740 .n(8)
1741 .k(k)
1742 .a_stride(11)
1743 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1744 }
1745 }
1746
1747 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
1748 TEST_REQUIRES_ARM_NEON_FMA;
1749 for (size_t k = 9; k < 8; k++) {
1750 for (uint32_t m = 1; m <= 4; m++) {
1751 for (uint32_t n = 1; n <= 8; n++) {
1752 GemmMicrokernelTester()
1753 .mr(4)
1754 .nr(8)
1755 .kr(1)
1756 .sr(1)
1757 .m(m)
1758 .n(n)
1759 .k(k)
1760 .iterations(1)
1761 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1762 }
1763 }
1764 }
1765 }
1766
1767 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
1768 TEST_REQUIRES_ARM_NEON_FMA;
1769 for (size_t k = 12; k <= 40; k += 4) {
1770 GemmMicrokernelTester()
1771 .mr(4)
1772 .nr(8)
1773 .kr(1)
1774 .sr(1)
1775 .m(4)
1776 .n(8)
1777 .k(k)
1778 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1779 }
1780 }
1781
1782 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
1783 TEST_REQUIRES_ARM_NEON_FMA;
1784 for (size_t k = 12; k <= 40; k += 4) {
1785 GemmMicrokernelTester()
1786 .mr(4)
1787 .nr(8)
1788 .kr(1)
1789 .sr(1)
1790 .m(4)
1791 .n(8)
1792 .k(k)
1793 .a_stride(43)
1794 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1795 }
1796 }
1797
1798 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
1799 TEST_REQUIRES_ARM_NEON_FMA;
1800 for (size_t k = 12; k <= 40; k += 4) {
1801 for (uint32_t m = 1; m <= 4; m++) {
1802 for (uint32_t n = 1; n <= 8; n++) {
1803 GemmMicrokernelTester()
1804 .mr(4)
1805 .nr(8)
1806 .kr(1)
1807 .sr(1)
1808 .m(m)
1809 .n(n)
1810 .k(k)
1811 .iterations(1)
1812 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1813 }
1814 }
1815 }
1816 }
1817
1818 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
1819 TEST_REQUIRES_ARM_NEON_FMA;
1820 for (uint32_t n = 9; n < 16; n++) {
1821 for (size_t k = 1; k <= 20; k += 5) {
1822 GemmMicrokernelTester()
1823 .mr(4)
1824 .nr(8)
1825 .kr(1)
1826 .sr(1)
1827 .m(4)
1828 .n(8)
1829 .k(k)
1830 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1831 }
1832 }
1833 }
1834
1835 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
1836 TEST_REQUIRES_ARM_NEON_FMA;
1837 for (uint32_t n = 9; n < 16; n++) {
1838 for (size_t k = 1; k <= 20; k += 5) {
1839 GemmMicrokernelTester()
1840 .mr(4)
1841 .nr(8)
1842 .kr(1)
1843 .sr(1)
1844 .m(4)
1845 .n(8)
1846 .k(k)
1847 .cn_stride(11)
1848 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1849 }
1850 }
1851 }
1852
1853 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
1854 TEST_REQUIRES_ARM_NEON_FMA;
1855 for (uint32_t n = 9; n < 16; n++) {
1856 for (size_t k = 1; k <= 20; k += 5) {
1857 GemmMicrokernelTester()
1858 .mr(4)
1859 .nr(8)
1860 .kr(1)
1861 .sr(1)
1862 .m(4)
1863 .n(n)
1864 .k(k)
1865 .a_stride(23)
1866 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1867 }
1868 }
1869 }
1870
1871 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
1872 TEST_REQUIRES_ARM_NEON_FMA;
1873 for (uint32_t n = 9; n < 16; n++) {
1874 for (size_t k = 1; k <= 20; k += 5) {
1875 for (uint32_t m = 1; m <= 4; m++) {
1876 GemmMicrokernelTester()
1877 .mr(4)
1878 .nr(8)
1879 .kr(1)
1880 .sr(1)
1881 .m(m)
1882 .n(n)
1883 .k(k)
1884 .iterations(1)
1885 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1886 }
1887 }
1888 }
1889 }
1890
1891 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
1892 TEST_REQUIRES_ARM_NEON_FMA;
1893 for (uint32_t n = 16; n <= 24; n += 8) {
1894 for (size_t k = 1; k <= 20; k += 5) {
1895 GemmMicrokernelTester()
1896 .mr(4)
1897 .nr(8)
1898 .kr(1)
1899 .sr(1)
1900 .m(4)
1901 .n(8)
1902 .k(k)
1903 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1904 }
1905 }
1906 }
1907
1908 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
1909 TEST_REQUIRES_ARM_NEON_FMA;
1910 for (uint32_t n = 16; n <= 24; n += 8) {
1911 for (size_t k = 1; k <= 20; k += 5) {
1912 GemmMicrokernelTester()
1913 .mr(4)
1914 .nr(8)
1915 .kr(1)
1916 .sr(1)
1917 .m(4)
1918 .n(n)
1919 .k(k)
1920 .cn_stride(11)
1921 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1922 }
1923 }
1924 }
1925
1926 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
1927 TEST_REQUIRES_ARM_NEON_FMA;
1928 for (uint32_t n = 16; n <= 24; n += 8) {
1929 for (size_t k = 1; k <= 20; k += 5) {
1930 GemmMicrokernelTester()
1931 .mr(4)
1932 .nr(8)
1933 .kr(1)
1934 .sr(1)
1935 .m(4)
1936 .n(n)
1937 .k(k)
1938 .a_stride(23)
1939 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1940 }
1941 }
1942 }
1943
1944 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
1945 TEST_REQUIRES_ARM_NEON_FMA;
1946 for (uint32_t n = 16; n <= 24; n += 8) {
1947 for (size_t k = 1; k <= 20; k += 5) {
1948 for (uint32_t m = 1; m <= 4; m++) {
1949 GemmMicrokernelTester()
1950 .mr(4)
1951 .nr(8)
1952 .kr(1)
1953 .sr(1)
1954 .m(m)
1955 .n(n)
1956 .k(k)
1957 .iterations(1)
1958 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1959 }
1960 }
1961 }
1962 }
1963
1964 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
1965 TEST_REQUIRES_ARM_NEON_FMA;
1966 for (size_t k = 1; k <= 20; k += 5) {
1967 for (uint32_t m = 1; m <= 4; m++) {
1968 for (uint32_t n = 1; n <= 8; n++) {
1969 GemmMicrokernelTester()
1970 .mr(4)
1971 .nr(8)
1972 .kr(1)
1973 .sr(1)
1974 .m(m)
1975 .n(n)
1976 .k(k)
1977 .cm_stride(11)
1978 .iterations(1)
1979 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1980 }
1981 }
1982 }
1983 }
1984
1985 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
1986 TEST_REQUIRES_ARM_NEON_FMA;
1987 GemmMicrokernelTester()
1988 .mr(4)
1989 .nr(8)
1990 .kr(1)
1991 .sr(1)
1992 .m(4)
1993 .n(8)
1994 .k(4)
1995 .qmin(128)
1996 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1997 }
1998
1999 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
2000 TEST_REQUIRES_ARM_NEON_FMA;
2001 GemmMicrokernelTester()
2002 .mr(4)
2003 .nr(8)
2004 .kr(1)
2005 .sr(1)
2006 .m(4)
2007 .n(8)
2008 .k(4)
2009 .qmax(128)
2010 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
2011 }
2012
2013 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
2014 TEST_REQUIRES_ARM_NEON_FMA;
2015 GemmMicrokernelTester()
2016 .mr(4)
2017 .nr(8)
2018 .kr(1)
2019 .sr(1)
2020 .m(4)
2021 .n(8)
2022 .k(4)
2023 .cm_stride(11)
2024 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
2025 }
2026#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2027
2028
2029#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2030 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
2031 TEST_REQUIRES_ARM_NEON_FMA;
2032 GemmMicrokernelTester()
2033 .mr(4)
2034 .nr(8)
2035 .kr(1)
2036 .sr(1)
2037 .m(4)
2038 .n(8)
2039 .k(4)
2040 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2041 }
2042
2043 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
2044 TEST_REQUIRES_ARM_NEON_FMA;
2045 GemmMicrokernelTester()
2046 .mr(4)
2047 .nr(8)
2048 .kr(1)
2049 .sr(1)
2050 .m(4)
2051 .n(8)
2052 .k(4)
2053 .cn_stride(11)
2054 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2055 }
2056
2057 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_strided_a) {
2058 TEST_REQUIRES_ARM_NEON_FMA;
2059 GemmMicrokernelTester()
2060 .mr(4)
2061 .nr(8)
2062 .kr(1)
2063 .sr(1)
2064 .m(4)
2065 .n(8)
2066 .k(4)
2067 .a_stride(7)
2068 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2069 }
2070
2071 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
2072 TEST_REQUIRES_ARM_NEON_FMA;
2073 for (uint32_t m = 1; m <= 4; m++) {
2074 for (uint32_t n = 1; n <= 8; n++) {
2075 GemmMicrokernelTester()
2076 .mr(4)
2077 .nr(8)
2078 .kr(1)
2079 .sr(1)
2080 .m(m)
2081 .n(n)
2082 .k(4)
2083 .iterations(1)
2084 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2085 }
2086 }
2087 }
2088
2089 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
2090 TEST_REQUIRES_ARM_NEON_FMA;
2091 for (uint32_t m = 1; m <= 4; m++) {
2092 GemmMicrokernelTester()
2093 .mr(4)
2094 .nr(8)
2095 .kr(1)
2096 .sr(1)
2097 .m(m)
2098 .n(8)
2099 .k(4)
2100 .iterations(1)
2101 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2102 }
2103 }
2104
2105 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
2106 TEST_REQUIRES_ARM_NEON_FMA;
2107 for (uint32_t n = 1; n <= 8; n++) {
2108 GemmMicrokernelTester()
2109 .mr(4)
2110 .nr(8)
2111 .kr(1)
2112 .sr(1)
2113 .m(4)
2114 .n(n)
2115 .k(4)
2116 .iterations(1)
2117 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2118 }
2119 }
2120
2121 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
2122 TEST_REQUIRES_ARM_NEON_FMA;
2123 GemmMicrokernelTester()
2124 .mr(4)
2125 .nr(8)
2126 .kr(1)
2127 .sr(1)
2128 .m(4)
2129 .n(8)
2130 .k(8)
2131 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2132 }
2133
2134 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_strided_a) {
2135 TEST_REQUIRES_ARM_NEON_FMA;
2136 GemmMicrokernelTester()
2137 .mr(4)
2138 .nr(8)
2139 .kr(1)
2140 .sr(1)
2141 .m(4)
2142 .n(8)
2143 .k(8)
2144 .a_stride(11)
2145 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2146 }
2147
2148 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
2149 TEST_REQUIRES_ARM_NEON_FMA;
2150 for (uint32_t m = 1; m <= 4; m++) {
2151 for (uint32_t n = 1; n <= 8; n++) {
2152 GemmMicrokernelTester()
2153 .mr(4)
2154 .nr(8)
2155 .kr(1)
2156 .sr(1)
2157 .m(m)
2158 .n(n)
2159 .k(8)
2160 .iterations(1)
2161 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2162 }
2163 }
2164 }
2165
2166 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
2167 TEST_REQUIRES_ARM_NEON_FMA;
2168 for (size_t k = 1; k < 8; k++) {
2169 GemmMicrokernelTester()
2170 .mr(4)
2171 .nr(8)
2172 .kr(1)
2173 .sr(1)
2174 .m(4)
2175 .n(8)
2176 .k(k)
2177 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2178 }
2179 }
2180
2181 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_strided_a) {
2182 TEST_REQUIRES_ARM_NEON_FMA;
2183 for (size_t k = 1; k < 8; k++) {
2184 GemmMicrokernelTester()
2185 .mr(4)
2186 .nr(8)
2187 .kr(1)
2188 .sr(1)
2189 .m(4)
2190 .n(8)
2191 .k(k)
2192 .a_stride(11)
2193 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2194 }
2195 }
2196
2197 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
2198 TEST_REQUIRES_ARM_NEON_FMA;
2199 for (size_t k = 1; k < 8; k++) {
2200 for (uint32_t m = 1; m <= 4; m++) {
2201 for (uint32_t n = 1; n <= 8; n++) {
2202 GemmMicrokernelTester()
2203 .mr(4)
2204 .nr(8)
2205 .kr(1)
2206 .sr(1)
2207 .m(m)
2208 .n(n)
2209 .k(k)
2210 .iterations(1)
2211 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2212 }
2213 }
2214 }
2215 }
2216
2217 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
2218 TEST_REQUIRES_ARM_NEON_FMA;
2219 for (size_t k = 9; k < 8; k++) {
2220 GemmMicrokernelTester()
2221 .mr(4)
2222 .nr(8)
2223 .kr(1)
2224 .sr(1)
2225 .m(4)
2226 .n(8)
2227 .k(k)
2228 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2229 }
2230 }
2231
2232 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_strided_a) {
2233 TEST_REQUIRES_ARM_NEON_FMA;
2234 for (size_t k = 9; k < 8; k++) {
2235 GemmMicrokernelTester()
2236 .mr(4)
2237 .nr(8)
2238 .kr(1)
2239 .sr(1)
2240 .m(4)
2241 .n(8)
2242 .k(k)
2243 .a_stride(11)
2244 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2245 }
2246 }
2247
2248 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_subtile) {
2249 TEST_REQUIRES_ARM_NEON_FMA;
2250 for (size_t k = 9; k < 8; k++) {
2251 for (uint32_t m = 1; m <= 4; m++) {
2252 for (uint32_t n = 1; n <= 8; n++) {
2253 GemmMicrokernelTester()
2254 .mr(4)
2255 .nr(8)
2256 .kr(1)
2257 .sr(1)
2258 .m(m)
2259 .n(n)
2260 .k(k)
2261 .iterations(1)
2262 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2263 }
2264 }
2265 }
2266 }
2267
2268 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
2269 TEST_REQUIRES_ARM_NEON_FMA;
2270 for (size_t k = 12; k <= 40; k += 4) {
2271 GemmMicrokernelTester()
2272 .mr(4)
2273 .nr(8)
2274 .kr(1)
2275 .sr(1)
2276 .m(4)
2277 .n(8)
2278 .k(k)
2279 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2280 }
2281 }
2282
2283 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_strided_a) {
2284 TEST_REQUIRES_ARM_NEON_FMA;
2285 for (size_t k = 12; k <= 40; k += 4) {
2286 GemmMicrokernelTester()
2287 .mr(4)
2288 .nr(8)
2289 .kr(1)
2290 .sr(1)
2291 .m(4)
2292 .n(8)
2293 .k(k)
2294 .a_stride(43)
2295 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2296 }
2297 }
2298
2299 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
2300 TEST_REQUIRES_ARM_NEON_FMA;
2301 for (size_t k = 12; k <= 40; k += 4) {
2302 for (uint32_t m = 1; m <= 4; m++) {
2303 for (uint32_t n = 1; n <= 8; n++) {
2304 GemmMicrokernelTester()
2305 .mr(4)
2306 .nr(8)
2307 .kr(1)
2308 .sr(1)
2309 .m(m)
2310 .n(n)
2311 .k(k)
2312 .iterations(1)
2313 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2314 }
2315 }
2316 }
2317 }
2318
2319 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
2320 TEST_REQUIRES_ARM_NEON_FMA;
2321 for (uint32_t n = 9; n < 16; n++) {
2322 for (size_t k = 1; k <= 20; k += 5) {
2323 GemmMicrokernelTester()
2324 .mr(4)
2325 .nr(8)
2326 .kr(1)
2327 .sr(1)
2328 .m(4)
2329 .n(8)
2330 .k(k)
2331 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2332 }
2333 }
2334 }
2335
2336 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
2337 TEST_REQUIRES_ARM_NEON_FMA;
2338 for (uint32_t n = 9; n < 16; n++) {
2339 for (size_t k = 1; k <= 20; k += 5) {
2340 GemmMicrokernelTester()
2341 .mr(4)
2342 .nr(8)
2343 .kr(1)
2344 .sr(1)
2345 .m(4)
2346 .n(8)
2347 .k(k)
2348 .cn_stride(11)
2349 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2350 }
2351 }
2352 }
2353
2354 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_a) {
2355 TEST_REQUIRES_ARM_NEON_FMA;
2356 for (uint32_t n = 9; n < 16; n++) {
2357 for (size_t k = 1; k <= 20; k += 5) {
2358 GemmMicrokernelTester()
2359 .mr(4)
2360 .nr(8)
2361 .kr(1)
2362 .sr(1)
2363 .m(4)
2364 .n(n)
2365 .k(k)
2366 .a_stride(23)
2367 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2368 }
2369 }
2370 }
2371
2372 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
2373 TEST_REQUIRES_ARM_NEON_FMA;
2374 for (uint32_t n = 9; n < 16; n++) {
2375 for (size_t k = 1; k <= 20; k += 5) {
2376 for (uint32_t m = 1; m <= 4; m++) {
2377 GemmMicrokernelTester()
2378 .mr(4)
2379 .nr(8)
2380 .kr(1)
2381 .sr(1)
2382 .m(m)
2383 .n(n)
2384 .k(k)
2385 .iterations(1)
2386 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2387 }
2388 }
2389 }
2390 }
2391
2392 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
2393 TEST_REQUIRES_ARM_NEON_FMA;
2394 for (uint32_t n = 16; n <= 24; n += 8) {
2395 for (size_t k = 1; k <= 20; k += 5) {
2396 GemmMicrokernelTester()
2397 .mr(4)
2398 .nr(8)
2399 .kr(1)
2400 .sr(1)
2401 .m(4)
2402 .n(8)
2403 .k(k)
2404 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2405 }
2406 }
2407 }
2408
2409 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
2410 TEST_REQUIRES_ARM_NEON_FMA;
2411 for (uint32_t n = 16; n <= 24; n += 8) {
2412 for (size_t k = 1; k <= 20; k += 5) {
2413 GemmMicrokernelTester()
2414 .mr(4)
2415 .nr(8)
2416 .kr(1)
2417 .sr(1)
2418 .m(4)
2419 .n(n)
2420 .k(k)
2421 .cn_stride(11)
2422 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2423 }
2424 }
2425 }
2426
2427 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_a) {
2428 TEST_REQUIRES_ARM_NEON_FMA;
2429 for (uint32_t n = 16; n <= 24; n += 8) {
2430 for (size_t k = 1; k <= 20; k += 5) {
2431 GemmMicrokernelTester()
2432 .mr(4)
2433 .nr(8)
2434 .kr(1)
2435 .sr(1)
2436 .m(4)
2437 .n(n)
2438 .k(k)
2439 .a_stride(23)
2440 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2441 }
2442 }
2443 }
2444
2445 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
2446 TEST_REQUIRES_ARM_NEON_FMA;
2447 for (uint32_t n = 16; n <= 24; n += 8) {
2448 for (size_t k = 1; k <= 20; k += 5) {
2449 for (uint32_t m = 1; m <= 4; m++) {
2450 GemmMicrokernelTester()
2451 .mr(4)
2452 .nr(8)
2453 .kr(1)
2454 .sr(1)
2455 .m(m)
2456 .n(n)
2457 .k(k)
2458 .iterations(1)
2459 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2460 }
2461 }
2462 }
2463 }
2464
2465 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
2466 TEST_REQUIRES_ARM_NEON_FMA;
2467 for (size_t k = 1; k <= 20; k += 5) {
2468 for (uint32_t m = 1; m <= 4; m++) {
2469 for (uint32_t n = 1; n <= 8; n++) {
2470 GemmMicrokernelTester()
2471 .mr(4)
2472 .nr(8)
2473 .kr(1)
2474 .sr(1)
2475 .m(m)
2476 .n(n)
2477 .k(k)
2478 .cm_stride(11)
2479 .iterations(1)
2480 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2481 }
2482 }
2483 }
2484 }
2485
2486 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
2487 TEST_REQUIRES_ARM_NEON_FMA;
2488 GemmMicrokernelTester()
2489 .mr(4)
2490 .nr(8)
2491 .kr(1)
2492 .sr(1)
2493 .m(4)
2494 .n(8)
2495 .k(4)
2496 .qmin(128)
2497 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2498 }
2499
2500 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
2501 TEST_REQUIRES_ARM_NEON_FMA;
2502 GemmMicrokernelTester()
2503 .mr(4)
2504 .nr(8)
2505 .kr(1)
2506 .sr(1)
2507 .m(4)
2508 .n(8)
2509 .k(4)
2510 .qmax(128)
2511 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2512 }
2513
2514 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
2515 TEST_REQUIRES_ARM_NEON_FMA;
2516 GemmMicrokernelTester()
2517 .mr(4)
2518 .nr(8)
2519 .kr(1)
2520 .sr(1)
2521 .m(4)
2522 .n(8)
2523 .k(4)
2524 .cm_stride(11)
2525 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a55);
2526 }
2527#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2528
2529
2530#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2531 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
2532 TEST_REQUIRES_ARM_NEON_FMA;
2533 GemmMicrokernelTester()
2534 .mr(4)
2535 .nr(8)
2536 .kr(1)
2537 .sr(1)
2538 .m(4)
2539 .n(8)
2540 .k(8)
2541 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2542 }
2543
2544 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
2545 TEST_REQUIRES_ARM_NEON_FMA;
2546 GemmMicrokernelTester()
2547 .mr(4)
2548 .nr(8)
2549 .kr(1)
2550 .sr(1)
2551 .m(4)
2552 .n(8)
2553 .k(8)
2554 .cn_stride(11)
2555 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2556 }
2557
2558 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
2559 TEST_REQUIRES_ARM_NEON_FMA;
2560 GemmMicrokernelTester()
2561 .mr(4)
2562 .nr(8)
2563 .kr(1)
2564 .sr(1)
2565 .m(4)
2566 .n(8)
2567 .k(8)
2568 .a_stride(11)
2569 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2570 }
2571
2572 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
2573 TEST_REQUIRES_ARM_NEON_FMA;
2574 for (uint32_t m = 1; m <= 4; m++) {
2575 for (uint32_t n = 1; n <= 8; n++) {
2576 GemmMicrokernelTester()
2577 .mr(4)
2578 .nr(8)
2579 .kr(1)
2580 .sr(1)
2581 .m(m)
2582 .n(n)
2583 .k(8)
2584 .iterations(1)
2585 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2586 }
2587 }
2588 }
2589
2590 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
2591 TEST_REQUIRES_ARM_NEON_FMA;
2592 for (uint32_t m = 1; m <= 4; m++) {
2593 GemmMicrokernelTester()
2594 .mr(4)
2595 .nr(8)
2596 .kr(1)
2597 .sr(1)
2598 .m(m)
2599 .n(8)
2600 .k(8)
2601 .iterations(1)
2602 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2603 }
2604 }
2605
2606 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
2607 TEST_REQUIRES_ARM_NEON_FMA;
2608 for (uint32_t n = 1; n <= 8; n++) {
2609 GemmMicrokernelTester()
2610 .mr(4)
2611 .nr(8)
2612 .kr(1)
2613 .sr(1)
2614 .m(4)
2615 .n(n)
2616 .k(8)
2617 .iterations(1)
2618 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2619 }
2620 }
2621
2622 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
2623 TEST_REQUIRES_ARM_NEON_FMA;
2624 GemmMicrokernelTester()
2625 .mr(4)
2626 .nr(8)
2627 .kr(1)
2628 .sr(1)
2629 .m(4)
2630 .n(8)
2631 .k(16)
2632 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2633 }
2634
2635 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
2636 TEST_REQUIRES_ARM_NEON_FMA;
2637 GemmMicrokernelTester()
2638 .mr(4)
2639 .nr(8)
2640 .kr(1)
2641 .sr(1)
2642 .m(4)
2643 .n(8)
2644 .k(16)
2645 .a_stride(19)
2646 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2647 }
2648
2649 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
2650 TEST_REQUIRES_ARM_NEON_FMA;
2651 for (uint32_t m = 1; m <= 4; m++) {
2652 for (uint32_t n = 1; n <= 8; n++) {
2653 GemmMicrokernelTester()
2654 .mr(4)
2655 .nr(8)
2656 .kr(1)
2657 .sr(1)
2658 .m(m)
2659 .n(n)
2660 .k(16)
2661 .iterations(1)
2662 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2663 }
2664 }
2665 }
2666
2667 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
2668 TEST_REQUIRES_ARM_NEON_FMA;
2669 for (size_t k = 1; k < 16; k++) {
2670 GemmMicrokernelTester()
2671 .mr(4)
2672 .nr(8)
2673 .kr(1)
2674 .sr(1)
2675 .m(4)
2676 .n(8)
2677 .k(k)
2678 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2679 }
2680 }
2681
2682 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
2683 TEST_REQUIRES_ARM_NEON_FMA;
2684 for (size_t k = 1; k < 16; k++) {
2685 GemmMicrokernelTester()
2686 .mr(4)
2687 .nr(8)
2688 .kr(1)
2689 .sr(1)
2690 .m(4)
2691 .n(8)
2692 .k(k)
2693 .a_stride(19)
2694 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2695 }
2696 }
2697
2698 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
2699 TEST_REQUIRES_ARM_NEON_FMA;
2700 for (size_t k = 1; k < 16; k++) {
2701 for (uint32_t m = 1; m <= 4; m++) {
2702 for (uint32_t n = 1; n <= 8; n++) {
2703 GemmMicrokernelTester()
2704 .mr(4)
2705 .nr(8)
2706 .kr(1)
2707 .sr(1)
2708 .m(m)
2709 .n(n)
2710 .k(k)
2711 .iterations(1)
2712 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2713 }
2714 }
2715 }
2716 }
2717
2718 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
2719 TEST_REQUIRES_ARM_NEON_FMA;
2720 for (size_t k = 17; k < 16; k++) {
2721 GemmMicrokernelTester()
2722 .mr(4)
2723 .nr(8)
2724 .kr(1)
2725 .sr(1)
2726 .m(4)
2727 .n(8)
2728 .k(k)
2729 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2730 }
2731 }
2732
2733 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
2734 TEST_REQUIRES_ARM_NEON_FMA;
2735 for (size_t k = 17; k < 16; k++) {
2736 GemmMicrokernelTester()
2737 .mr(4)
2738 .nr(8)
2739 .kr(1)
2740 .sr(1)
2741 .m(4)
2742 .n(8)
2743 .k(k)
2744 .a_stride(19)
2745 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2746 }
2747 }
2748
2749 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
2750 TEST_REQUIRES_ARM_NEON_FMA;
2751 for (size_t k = 17; k < 16; k++) {
2752 for (uint32_t m = 1; m <= 4; m++) {
2753 for (uint32_t n = 1; n <= 8; n++) {
2754 GemmMicrokernelTester()
2755 .mr(4)
2756 .nr(8)
2757 .kr(1)
2758 .sr(1)
2759 .m(m)
2760 .n(n)
2761 .k(k)
2762 .iterations(1)
2763 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2764 }
2765 }
2766 }
2767 }
2768
2769 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
2770 TEST_REQUIRES_ARM_NEON_FMA;
2771 for (size_t k = 24; k <= 80; k += 8) {
2772 GemmMicrokernelTester()
2773 .mr(4)
2774 .nr(8)
2775 .kr(1)
2776 .sr(1)
2777 .m(4)
2778 .n(8)
2779 .k(k)
2780 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2781 }
2782 }
2783
2784 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
2785 TEST_REQUIRES_ARM_NEON_FMA;
2786 for (size_t k = 24; k <= 80; k += 8) {
2787 GemmMicrokernelTester()
2788 .mr(4)
2789 .nr(8)
2790 .kr(1)
2791 .sr(1)
2792 .m(4)
2793 .n(8)
2794 .k(k)
2795 .a_stride(83)
2796 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2797 }
2798 }
2799
2800 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
2801 TEST_REQUIRES_ARM_NEON_FMA;
2802 for (size_t k = 24; k <= 80; k += 8) {
2803 for (uint32_t m = 1; m <= 4; m++) {
2804 for (uint32_t n = 1; n <= 8; n++) {
2805 GemmMicrokernelTester()
2806 .mr(4)
2807 .nr(8)
2808 .kr(1)
2809 .sr(1)
2810 .m(m)
2811 .n(n)
2812 .k(k)
2813 .iterations(1)
2814 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2815 }
2816 }
2817 }
2818 }
2819
2820 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
2821 TEST_REQUIRES_ARM_NEON_FMA;
2822 for (uint32_t n = 9; n < 16; n++) {
2823 for (size_t k = 1; k <= 40; k += 9) {
2824 GemmMicrokernelTester()
2825 .mr(4)
2826 .nr(8)
2827 .kr(1)
2828 .sr(1)
2829 .m(4)
2830 .n(8)
2831 .k(k)
2832 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2833 }
2834 }
2835 }
2836
2837 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
2838 TEST_REQUIRES_ARM_NEON_FMA;
2839 for (uint32_t n = 9; n < 16; n++) {
2840 for (size_t k = 1; k <= 40; k += 9) {
2841 GemmMicrokernelTester()
2842 .mr(4)
2843 .nr(8)
2844 .kr(1)
2845 .sr(1)
2846 .m(4)
2847 .n(8)
2848 .k(k)
2849 .cn_stride(11)
2850 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2851 }
2852 }
2853 }
2854
2855 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
2856 TEST_REQUIRES_ARM_NEON_FMA;
2857 for (uint32_t n = 9; n < 16; n++) {
2858 for (size_t k = 1; k <= 40; k += 9) {
2859 GemmMicrokernelTester()
2860 .mr(4)
2861 .nr(8)
2862 .kr(1)
2863 .sr(1)
2864 .m(4)
2865 .n(n)
2866 .k(k)
2867 .a_stride(43)
2868 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2869 }
2870 }
2871 }
2872
2873 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
2874 TEST_REQUIRES_ARM_NEON_FMA;
2875 for (uint32_t n = 9; n < 16; n++) {
2876 for (size_t k = 1; k <= 40; k += 9) {
2877 for (uint32_t m = 1; m <= 4; m++) {
2878 GemmMicrokernelTester()
2879 .mr(4)
2880 .nr(8)
2881 .kr(1)
2882 .sr(1)
2883 .m(m)
2884 .n(n)
2885 .k(k)
2886 .iterations(1)
2887 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2888 }
2889 }
2890 }
2891 }
2892
2893 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
2894 TEST_REQUIRES_ARM_NEON_FMA;
2895 for (uint32_t n = 16; n <= 24; n += 8) {
2896 for (size_t k = 1; k <= 40; k += 9) {
2897 GemmMicrokernelTester()
2898 .mr(4)
2899 .nr(8)
2900 .kr(1)
2901 .sr(1)
2902 .m(4)
2903 .n(8)
2904 .k(k)
2905 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2906 }
2907 }
2908 }
2909
2910 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
2911 TEST_REQUIRES_ARM_NEON_FMA;
2912 for (uint32_t n = 16; n <= 24; n += 8) {
2913 for (size_t k = 1; k <= 40; k += 9) {
2914 GemmMicrokernelTester()
2915 .mr(4)
2916 .nr(8)
2917 .kr(1)
2918 .sr(1)
2919 .m(4)
2920 .n(n)
2921 .k(k)
2922 .cn_stride(11)
2923 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2924 }
2925 }
2926 }
2927
2928 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
2929 TEST_REQUIRES_ARM_NEON_FMA;
2930 for (uint32_t n = 16; n <= 24; n += 8) {
2931 for (size_t k = 1; k <= 40; k += 9) {
2932 GemmMicrokernelTester()
2933 .mr(4)
2934 .nr(8)
2935 .kr(1)
2936 .sr(1)
2937 .m(4)
2938 .n(n)
2939 .k(k)
2940 .a_stride(43)
2941 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2942 }
2943 }
2944 }
2945
2946 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
2947 TEST_REQUIRES_ARM_NEON_FMA;
2948 for (uint32_t n = 16; n <= 24; n += 8) {
2949 for (size_t k = 1; k <= 40; k += 9) {
2950 for (uint32_t m = 1; m <= 4; m++) {
2951 GemmMicrokernelTester()
2952 .mr(4)
2953 .nr(8)
2954 .kr(1)
2955 .sr(1)
2956 .m(m)
2957 .n(n)
2958 .k(k)
2959 .iterations(1)
2960 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2961 }
2962 }
2963 }
2964 }
2965
2966 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
2967 TEST_REQUIRES_ARM_NEON_FMA;
2968 for (size_t k = 1; k <= 40; k += 9) {
2969 for (uint32_t m = 1; m <= 4; m++) {
2970 for (uint32_t n = 1; n <= 8; n++) {
2971 GemmMicrokernelTester()
2972 .mr(4)
2973 .nr(8)
2974 .kr(1)
2975 .sr(1)
2976 .m(m)
2977 .n(n)
2978 .k(k)
2979 .cm_stride(11)
2980 .iterations(1)
2981 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2982 }
2983 }
2984 }
2985 }
2986
2987 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
2988 TEST_REQUIRES_ARM_NEON_FMA;
2989 GemmMicrokernelTester()
2990 .mr(4)
2991 .nr(8)
2992 .kr(1)
2993 .sr(1)
2994 .m(4)
2995 .n(8)
2996 .k(8)
2997 .qmin(128)
2998 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2999 }
3000
3001 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
3002 TEST_REQUIRES_ARM_NEON_FMA;
3003 GemmMicrokernelTester()
3004 .mr(4)
3005 .nr(8)
3006 .kr(1)
3007 .sr(1)
3008 .m(4)
3009 .n(8)
3010 .k(8)
3011 .qmax(128)
3012 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
3013 }
3014
3015 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
3016 TEST_REQUIRES_ARM_NEON_FMA;
3017 GemmMicrokernelTester()
3018 .mr(4)
3019 .nr(8)
3020 .kr(1)
3021 .sr(1)
3022 .m(4)
3023 .n(8)
3024 .k(8)
3025 .cm_stride(11)
3026 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
3027 }
3028#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3029
3030
3031#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3032 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
3033 TEST_REQUIRES_ARM_NEON_FMA;
3034 GemmMicrokernelTester()
3035 .mr(4)
3036 .nr(8)
3037 .kr(1)
3038 .sr(1)
3039 .m(4)
3040 .n(8)
3041 .k(8)
3042 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3043 }
3044
3045 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
3046 TEST_REQUIRES_ARM_NEON_FMA;
3047 GemmMicrokernelTester()
3048 .mr(4)
3049 .nr(8)
3050 .kr(1)
3051 .sr(1)
3052 .m(4)
3053 .n(8)
3054 .k(8)
3055 .cn_stride(11)
3056 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3057 }
3058
3059 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
3060 TEST_REQUIRES_ARM_NEON_FMA;
3061 GemmMicrokernelTester()
3062 .mr(4)
3063 .nr(8)
3064 .kr(1)
3065 .sr(1)
3066 .m(4)
3067 .n(8)
3068 .k(8)
3069 .a_stride(11)
3070 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3071 }
3072
3073 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
3074 TEST_REQUIRES_ARM_NEON_FMA;
3075 for (uint32_t m = 1; m <= 4; m++) {
3076 for (uint32_t n = 1; n <= 8; n++) {
3077 GemmMicrokernelTester()
3078 .mr(4)
3079 .nr(8)
3080 .kr(1)
3081 .sr(1)
3082 .m(m)
3083 .n(n)
3084 .k(8)
3085 .iterations(1)
3086 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3087 }
3088 }
3089 }
3090
3091 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
3092 TEST_REQUIRES_ARM_NEON_FMA;
3093 for (uint32_t m = 1; m <= 4; m++) {
3094 GemmMicrokernelTester()
3095 .mr(4)
3096 .nr(8)
3097 .kr(1)
3098 .sr(1)
3099 .m(m)
3100 .n(8)
3101 .k(8)
3102 .iterations(1)
3103 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3104 }
3105 }
3106
3107 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
3108 TEST_REQUIRES_ARM_NEON_FMA;
3109 for (uint32_t n = 1; n <= 8; n++) {
3110 GemmMicrokernelTester()
3111 .mr(4)
3112 .nr(8)
3113 .kr(1)
3114 .sr(1)
3115 .m(4)
3116 .n(n)
3117 .k(8)
3118 .iterations(1)
3119 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3120 }
3121 }
3122
3123 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
3124 TEST_REQUIRES_ARM_NEON_FMA;
3125 GemmMicrokernelTester()
3126 .mr(4)
3127 .nr(8)
3128 .kr(1)
3129 .sr(1)
3130 .m(4)
3131 .n(8)
3132 .k(16)
3133 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3134 }
3135
3136 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
3137 TEST_REQUIRES_ARM_NEON_FMA;
3138 GemmMicrokernelTester()
3139 .mr(4)
3140 .nr(8)
3141 .kr(1)
3142 .sr(1)
3143 .m(4)
3144 .n(8)
3145 .k(16)
3146 .a_stride(19)
3147 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3148 }
3149
3150 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
3151 TEST_REQUIRES_ARM_NEON_FMA;
3152 for (uint32_t m = 1; m <= 4; m++) {
3153 for (uint32_t n = 1; n <= 8; n++) {
3154 GemmMicrokernelTester()
3155 .mr(4)
3156 .nr(8)
3157 .kr(1)
3158 .sr(1)
3159 .m(m)
3160 .n(n)
3161 .k(16)
3162 .iterations(1)
3163 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3164 }
3165 }
3166 }
3167
3168 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
3169 TEST_REQUIRES_ARM_NEON_FMA;
3170 for (size_t k = 1; k < 16; k++) {
3171 GemmMicrokernelTester()
3172 .mr(4)
3173 .nr(8)
3174 .kr(1)
3175 .sr(1)
3176 .m(4)
3177 .n(8)
3178 .k(k)
3179 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3180 }
3181 }
3182
3183 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
3184 TEST_REQUIRES_ARM_NEON_FMA;
3185 for (size_t k = 1; k < 16; k++) {
3186 GemmMicrokernelTester()
3187 .mr(4)
3188 .nr(8)
3189 .kr(1)
3190 .sr(1)
3191 .m(4)
3192 .n(8)
3193 .k(k)
3194 .a_stride(19)
3195 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3196 }
3197 }
3198
3199 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
3200 TEST_REQUIRES_ARM_NEON_FMA;
3201 for (size_t k = 1; k < 16; k++) {
3202 for (uint32_t m = 1; m <= 4; m++) {
3203 for (uint32_t n = 1; n <= 8; n++) {
3204 GemmMicrokernelTester()
3205 .mr(4)
3206 .nr(8)
3207 .kr(1)
3208 .sr(1)
3209 .m(m)
3210 .n(n)
3211 .k(k)
3212 .iterations(1)
3213 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3214 }
3215 }
3216 }
3217 }
3218
3219 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
3220 TEST_REQUIRES_ARM_NEON_FMA;
3221 for (size_t k = 17; k < 16; k++) {
3222 GemmMicrokernelTester()
3223 .mr(4)
3224 .nr(8)
3225 .kr(1)
3226 .sr(1)
3227 .m(4)
3228 .n(8)
3229 .k(k)
3230 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3231 }
3232 }
3233
3234 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
3235 TEST_REQUIRES_ARM_NEON_FMA;
3236 for (size_t k = 17; k < 16; k++) {
3237 GemmMicrokernelTester()
3238 .mr(4)
3239 .nr(8)
3240 .kr(1)
3241 .sr(1)
3242 .m(4)
3243 .n(8)
3244 .k(k)
3245 .a_stride(19)
3246 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3247 }
3248 }
3249
3250 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
3251 TEST_REQUIRES_ARM_NEON_FMA;
3252 for (size_t k = 17; k < 16; k++) {
3253 for (uint32_t m = 1; m <= 4; m++) {
3254 for (uint32_t n = 1; n <= 8; n++) {
3255 GemmMicrokernelTester()
3256 .mr(4)
3257 .nr(8)
3258 .kr(1)
3259 .sr(1)
3260 .m(m)
3261 .n(n)
3262 .k(k)
3263 .iterations(1)
3264 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3265 }
3266 }
3267 }
3268 }
3269
3270 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
3271 TEST_REQUIRES_ARM_NEON_FMA;
3272 for (size_t k = 24; k <= 80; k += 8) {
3273 GemmMicrokernelTester()
3274 .mr(4)
3275 .nr(8)
3276 .kr(1)
3277 .sr(1)
3278 .m(4)
3279 .n(8)
3280 .k(k)
3281 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3282 }
3283 }
3284
3285 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
3286 TEST_REQUIRES_ARM_NEON_FMA;
3287 for (size_t k = 24; k <= 80; k += 8) {
3288 GemmMicrokernelTester()
3289 .mr(4)
3290 .nr(8)
3291 .kr(1)
3292 .sr(1)
3293 .m(4)
3294 .n(8)
3295 .k(k)
3296 .a_stride(83)
3297 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3298 }
3299 }
3300
3301 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
3302 TEST_REQUIRES_ARM_NEON_FMA;
3303 for (size_t k = 24; k <= 80; k += 8) {
3304 for (uint32_t m = 1; m <= 4; m++) {
3305 for (uint32_t n = 1; n <= 8; n++) {
3306 GemmMicrokernelTester()
3307 .mr(4)
3308 .nr(8)
3309 .kr(1)
3310 .sr(1)
3311 .m(m)
3312 .n(n)
3313 .k(k)
3314 .iterations(1)
3315 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3316 }
3317 }
3318 }
3319 }
3320
3321 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
3322 TEST_REQUIRES_ARM_NEON_FMA;
3323 for (uint32_t n = 9; n < 16; n++) {
3324 for (size_t k = 1; k <= 40; k += 9) {
3325 GemmMicrokernelTester()
3326 .mr(4)
3327 .nr(8)
3328 .kr(1)
3329 .sr(1)
3330 .m(4)
3331 .n(8)
3332 .k(k)
3333 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3334 }
3335 }
3336 }
3337
3338 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
3339 TEST_REQUIRES_ARM_NEON_FMA;
3340 for (uint32_t n = 9; n < 16; n++) {
3341 for (size_t k = 1; k <= 40; k += 9) {
3342 GemmMicrokernelTester()
3343 .mr(4)
3344 .nr(8)
3345 .kr(1)
3346 .sr(1)
3347 .m(4)
3348 .n(8)
3349 .k(k)
3350 .cn_stride(11)
3351 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3352 }
3353 }
3354 }
3355
3356 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
3357 TEST_REQUIRES_ARM_NEON_FMA;
3358 for (uint32_t n = 9; n < 16; n++) {
3359 for (size_t k = 1; k <= 40; k += 9) {
3360 GemmMicrokernelTester()
3361 .mr(4)
3362 .nr(8)
3363 .kr(1)
3364 .sr(1)
3365 .m(4)
3366 .n(n)
3367 .k(k)
3368 .a_stride(43)
3369 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3370 }
3371 }
3372 }
3373
3374 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
3375 TEST_REQUIRES_ARM_NEON_FMA;
3376 for (uint32_t n = 9; n < 16; n++) {
3377 for (size_t k = 1; k <= 40; k += 9) {
3378 for (uint32_t m = 1; m <= 4; m++) {
3379 GemmMicrokernelTester()
3380 .mr(4)
3381 .nr(8)
3382 .kr(1)
3383 .sr(1)
3384 .m(m)
3385 .n(n)
3386 .k(k)
3387 .iterations(1)
3388 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3389 }
3390 }
3391 }
3392 }
3393
3394 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
3395 TEST_REQUIRES_ARM_NEON_FMA;
3396 for (uint32_t n = 16; n <= 24; n += 8) {
3397 for (size_t k = 1; k <= 40; k += 9) {
3398 GemmMicrokernelTester()
3399 .mr(4)
3400 .nr(8)
3401 .kr(1)
3402 .sr(1)
3403 .m(4)
3404 .n(8)
3405 .k(k)
3406 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3407 }
3408 }
3409 }
3410
3411 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
3412 TEST_REQUIRES_ARM_NEON_FMA;
3413 for (uint32_t n = 16; n <= 24; n += 8) {
3414 for (size_t k = 1; k <= 40; k += 9) {
3415 GemmMicrokernelTester()
3416 .mr(4)
3417 .nr(8)
3418 .kr(1)
3419 .sr(1)
3420 .m(4)
3421 .n(n)
3422 .k(k)
3423 .cn_stride(11)
3424 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3425 }
3426 }
3427 }
3428
3429 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
3430 TEST_REQUIRES_ARM_NEON_FMA;
3431 for (uint32_t n = 16; n <= 24; n += 8) {
3432 for (size_t k = 1; k <= 40; k += 9) {
3433 GemmMicrokernelTester()
3434 .mr(4)
3435 .nr(8)
3436 .kr(1)
3437 .sr(1)
3438 .m(4)
3439 .n(n)
3440 .k(k)
3441 .a_stride(43)
3442 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3443 }
3444 }
3445 }
3446
3447 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
3448 TEST_REQUIRES_ARM_NEON_FMA;
3449 for (uint32_t n = 16; n <= 24; n += 8) {
3450 for (size_t k = 1; k <= 40; k += 9) {
3451 for (uint32_t m = 1; m <= 4; m++) {
3452 GemmMicrokernelTester()
3453 .mr(4)
3454 .nr(8)
3455 .kr(1)
3456 .sr(1)
3457 .m(m)
3458 .n(n)
3459 .k(k)
3460 .iterations(1)
3461 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3462 }
3463 }
3464 }
3465 }
3466
3467 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
3468 TEST_REQUIRES_ARM_NEON_FMA;
3469 for (size_t k = 1; k <= 40; k += 9) {
3470 for (uint32_t m = 1; m <= 4; m++) {
3471 for (uint32_t n = 1; n <= 8; n++) {
3472 GemmMicrokernelTester()
3473 .mr(4)
3474 .nr(8)
3475 .kr(1)
3476 .sr(1)
3477 .m(m)
3478 .n(n)
3479 .k(k)
3480 .cm_stride(11)
3481 .iterations(1)
3482 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3483 }
3484 }
3485 }
3486 }
3487
3488 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
3489 TEST_REQUIRES_ARM_NEON_FMA;
3490 GemmMicrokernelTester()
3491 .mr(4)
3492 .nr(8)
3493 .kr(1)
3494 .sr(1)
3495 .m(4)
3496 .n(8)
3497 .k(8)
3498 .qmin(128)
3499 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3500 }
3501
3502 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
3503 TEST_REQUIRES_ARM_NEON_FMA;
3504 GemmMicrokernelTester()
3505 .mr(4)
3506 .nr(8)
3507 .kr(1)
3508 .sr(1)
3509 .m(4)
3510 .n(8)
3511 .k(8)
3512 .qmax(128)
3513 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3514 }
3515
3516 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3517 TEST_REQUIRES_ARM_NEON_FMA;
3518 GemmMicrokernelTester()
3519 .mr(4)
3520 .nr(8)
3521 .kr(1)
3522 .sr(1)
3523 .m(4)
3524 .n(8)
3525 .k(8)
3526 .cm_stride(11)
3527 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3528 }
3529#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3530
3531
3532#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3533 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
3534 TEST_REQUIRES_ARM_NEON_FMA;
3535 GemmMicrokernelTester()
3536 .mr(5)
3537 .nr(8)
3538 .kr(1)
3539 .sr(1)
3540 .m(5)
3541 .n(8)
3542 .k(8)
3543 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3544 }
3545
3546 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
3547 TEST_REQUIRES_ARM_NEON_FMA;
3548 GemmMicrokernelTester()
3549 .mr(5)
3550 .nr(8)
3551 .kr(1)
3552 .sr(1)
3553 .m(5)
3554 .n(8)
3555 .k(8)
3556 .cn_stride(11)
3557 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3558 }
3559
3560 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
3561 TEST_REQUIRES_ARM_NEON_FMA;
3562 GemmMicrokernelTester()
3563 .mr(5)
3564 .nr(8)
3565 .kr(1)
3566 .sr(1)
3567 .m(5)
3568 .n(8)
3569 .k(8)
3570 .a_stride(11)
3571 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3572 }
3573
3574 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
3575 TEST_REQUIRES_ARM_NEON_FMA;
3576 for (uint32_t m = 1; m <= 5; m++) {
3577 for (uint32_t n = 1; n <= 8; n++) {
3578 GemmMicrokernelTester()
3579 .mr(5)
3580 .nr(8)
3581 .kr(1)
3582 .sr(1)
3583 .m(m)
3584 .n(n)
3585 .k(8)
3586 .iterations(1)
3587 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3588 }
3589 }
3590 }
3591
3592 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
3593 TEST_REQUIRES_ARM_NEON_FMA;
3594 for (uint32_t m = 1; m <= 5; m++) {
3595 GemmMicrokernelTester()
3596 .mr(5)
3597 .nr(8)
3598 .kr(1)
3599 .sr(1)
3600 .m(m)
3601 .n(8)
3602 .k(8)
3603 .iterations(1)
3604 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3605 }
3606 }
3607
3608 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
3609 TEST_REQUIRES_ARM_NEON_FMA;
3610 for (uint32_t n = 1; n <= 8; n++) {
3611 GemmMicrokernelTester()
3612 .mr(5)
3613 .nr(8)
3614 .kr(1)
3615 .sr(1)
3616 .m(5)
3617 .n(n)
3618 .k(8)
3619 .iterations(1)
3620 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3621 }
3622 }
3623
3624 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
3625 TEST_REQUIRES_ARM_NEON_FMA;
3626 GemmMicrokernelTester()
3627 .mr(5)
3628 .nr(8)
3629 .kr(1)
3630 .sr(1)
3631 .m(5)
3632 .n(8)
3633 .k(16)
3634 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3635 }
3636
3637 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
3638 TEST_REQUIRES_ARM_NEON_FMA;
3639 GemmMicrokernelTester()
3640 .mr(5)
3641 .nr(8)
3642 .kr(1)
3643 .sr(1)
3644 .m(5)
3645 .n(8)
3646 .k(16)
3647 .a_stride(19)
3648 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3649 }
3650
3651 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
3652 TEST_REQUIRES_ARM_NEON_FMA;
3653 for (uint32_t m = 1; m <= 5; m++) {
3654 for (uint32_t n = 1; n <= 8; n++) {
3655 GemmMicrokernelTester()
3656 .mr(5)
3657 .nr(8)
3658 .kr(1)
3659 .sr(1)
3660 .m(m)
3661 .n(n)
3662 .k(16)
3663 .iterations(1)
3664 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3665 }
3666 }
3667 }
3668
3669 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
3670 TEST_REQUIRES_ARM_NEON_FMA;
3671 for (size_t k = 1; k < 16; k++) {
3672 GemmMicrokernelTester()
3673 .mr(5)
3674 .nr(8)
3675 .kr(1)
3676 .sr(1)
3677 .m(5)
3678 .n(8)
3679 .k(k)
3680 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3681 }
3682 }
3683
3684 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
3685 TEST_REQUIRES_ARM_NEON_FMA;
3686 for (size_t k = 1; k < 16; k++) {
3687 GemmMicrokernelTester()
3688 .mr(5)
3689 .nr(8)
3690 .kr(1)
3691 .sr(1)
3692 .m(5)
3693 .n(8)
3694 .k(k)
3695 .a_stride(19)
3696 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3697 }
3698 }
3699
3700 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
3701 TEST_REQUIRES_ARM_NEON_FMA;
3702 for (size_t k = 1; k < 16; k++) {
3703 for (uint32_t m = 1; m <= 5; m++) {
3704 for (uint32_t n = 1; n <= 8; n++) {
3705 GemmMicrokernelTester()
3706 .mr(5)
3707 .nr(8)
3708 .kr(1)
3709 .sr(1)
3710 .m(m)
3711 .n(n)
3712 .k(k)
3713 .iterations(1)
3714 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3715 }
3716 }
3717 }
3718 }
3719
3720 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
3721 TEST_REQUIRES_ARM_NEON_FMA;
3722 for (size_t k = 17; k < 16; k++) {
3723 GemmMicrokernelTester()
3724 .mr(5)
3725 .nr(8)
3726 .kr(1)
3727 .sr(1)
3728 .m(5)
3729 .n(8)
3730 .k(k)
3731 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3732 }
3733 }
3734
3735 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
3736 TEST_REQUIRES_ARM_NEON_FMA;
3737 for (size_t k = 17; k < 16; k++) {
3738 GemmMicrokernelTester()
3739 .mr(5)
3740 .nr(8)
3741 .kr(1)
3742 .sr(1)
3743 .m(5)
3744 .n(8)
3745 .k(k)
3746 .a_stride(19)
3747 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3748 }
3749 }
3750
3751 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
3752 TEST_REQUIRES_ARM_NEON_FMA;
3753 for (size_t k = 17; k < 16; k++) {
3754 for (uint32_t m = 1; m <= 5; m++) {
3755 for (uint32_t n = 1; n <= 8; n++) {
3756 GemmMicrokernelTester()
3757 .mr(5)
3758 .nr(8)
3759 .kr(1)
3760 .sr(1)
3761 .m(m)
3762 .n(n)
3763 .k(k)
3764 .iterations(1)
3765 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3766 }
3767 }
3768 }
3769 }
3770
3771 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
3772 TEST_REQUIRES_ARM_NEON_FMA;
3773 for (size_t k = 24; k <= 80; k += 8) {
3774 GemmMicrokernelTester()
3775 .mr(5)
3776 .nr(8)
3777 .kr(1)
3778 .sr(1)
3779 .m(5)
3780 .n(8)
3781 .k(k)
3782 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3783 }
3784 }
3785
3786 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
3787 TEST_REQUIRES_ARM_NEON_FMA;
3788 for (size_t k = 24; k <= 80; k += 8) {
3789 GemmMicrokernelTester()
3790 .mr(5)
3791 .nr(8)
3792 .kr(1)
3793 .sr(1)
3794 .m(5)
3795 .n(8)
3796 .k(k)
3797 .a_stride(83)
3798 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3799 }
3800 }
3801
3802 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
3803 TEST_REQUIRES_ARM_NEON_FMA;
3804 for (size_t k = 24; k <= 80; k += 8) {
3805 for (uint32_t m = 1; m <= 5; m++) {
3806 for (uint32_t n = 1; n <= 8; n++) {
3807 GemmMicrokernelTester()
3808 .mr(5)
3809 .nr(8)
3810 .kr(1)
3811 .sr(1)
3812 .m(m)
3813 .n(n)
3814 .k(k)
3815 .iterations(1)
3816 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3817 }
3818 }
3819 }
3820 }
3821
3822 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
3823 TEST_REQUIRES_ARM_NEON_FMA;
3824 for (uint32_t n = 9; n < 16; n++) {
3825 for (size_t k = 1; k <= 40; k += 9) {
3826 GemmMicrokernelTester()
3827 .mr(5)
3828 .nr(8)
3829 .kr(1)
3830 .sr(1)
3831 .m(5)
3832 .n(8)
3833 .k(k)
3834 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3835 }
3836 }
3837 }
3838
3839 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
3840 TEST_REQUIRES_ARM_NEON_FMA;
3841 for (uint32_t n = 9; n < 16; n++) {
3842 for (size_t k = 1; k <= 40; k += 9) {
3843 GemmMicrokernelTester()
3844 .mr(5)
3845 .nr(8)
3846 .kr(1)
3847 .sr(1)
3848 .m(5)
3849 .n(8)
3850 .k(k)
3851 .cn_stride(11)
3852 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3853 }
3854 }
3855 }
3856
3857 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
3858 TEST_REQUIRES_ARM_NEON_FMA;
3859 for (uint32_t n = 9; n < 16; n++) {
3860 for (size_t k = 1; k <= 40; k += 9) {
3861 GemmMicrokernelTester()
3862 .mr(5)
3863 .nr(8)
3864 .kr(1)
3865 .sr(1)
3866 .m(5)
3867 .n(n)
3868 .k(k)
3869 .a_stride(43)
3870 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3871 }
3872 }
3873 }
3874
3875 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
3876 TEST_REQUIRES_ARM_NEON_FMA;
3877 for (uint32_t n = 9; n < 16; n++) {
3878 for (size_t k = 1; k <= 40; k += 9) {
3879 for (uint32_t m = 1; m <= 5; m++) {
3880 GemmMicrokernelTester()
3881 .mr(5)
3882 .nr(8)
3883 .kr(1)
3884 .sr(1)
3885 .m(m)
3886 .n(n)
3887 .k(k)
3888 .iterations(1)
3889 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3890 }
3891 }
3892 }
3893 }
3894
3895 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
3896 TEST_REQUIRES_ARM_NEON_FMA;
3897 for (uint32_t n = 16; n <= 24; n += 8) {
3898 for (size_t k = 1; k <= 40; k += 9) {
3899 GemmMicrokernelTester()
3900 .mr(5)
3901 .nr(8)
3902 .kr(1)
3903 .sr(1)
3904 .m(5)
3905 .n(8)
3906 .k(k)
3907 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3908 }
3909 }
3910 }
3911
3912 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
3913 TEST_REQUIRES_ARM_NEON_FMA;
3914 for (uint32_t n = 16; n <= 24; n += 8) {
3915 for (size_t k = 1; k <= 40; k += 9) {
3916 GemmMicrokernelTester()
3917 .mr(5)
3918 .nr(8)
3919 .kr(1)
3920 .sr(1)
3921 .m(5)
3922 .n(n)
3923 .k(k)
3924 .cn_stride(11)
3925 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3926 }
3927 }
3928 }
3929
3930 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
3931 TEST_REQUIRES_ARM_NEON_FMA;
3932 for (uint32_t n = 16; n <= 24; n += 8) {
3933 for (size_t k = 1; k <= 40; k += 9) {
3934 GemmMicrokernelTester()
3935 .mr(5)
3936 .nr(8)
3937 .kr(1)
3938 .sr(1)
3939 .m(5)
3940 .n(n)
3941 .k(k)
3942 .a_stride(43)
3943 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3944 }
3945 }
3946 }
3947
3948 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
3949 TEST_REQUIRES_ARM_NEON_FMA;
3950 for (uint32_t n = 16; n <= 24; n += 8) {
3951 for (size_t k = 1; k <= 40; k += 9) {
3952 for (uint32_t m = 1; m <= 5; m++) {
3953 GemmMicrokernelTester()
3954 .mr(5)
3955 .nr(8)
3956 .kr(1)
3957 .sr(1)
3958 .m(m)
3959 .n(n)
3960 .k(k)
3961 .iterations(1)
3962 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3963 }
3964 }
3965 }
3966 }
3967
3968 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
3969 TEST_REQUIRES_ARM_NEON_FMA;
3970 for (size_t k = 1; k <= 40; k += 9) {
3971 for (uint32_t m = 1; m <= 5; m++) {
3972 for (uint32_t n = 1; n <= 8; n++) {
3973 GemmMicrokernelTester()
3974 .mr(5)
3975 .nr(8)
3976 .kr(1)
3977 .sr(1)
3978 .m(m)
3979 .n(n)
3980 .k(k)
3981 .cm_stride(11)
3982 .iterations(1)
3983 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3984 }
3985 }
3986 }
3987 }
3988
3989 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
3990 TEST_REQUIRES_ARM_NEON_FMA;
3991 GemmMicrokernelTester()
3992 .mr(5)
3993 .nr(8)
3994 .kr(1)
3995 .sr(1)
3996 .m(5)
3997 .n(8)
3998 .k(8)
3999 .qmin(128)
4000 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
4001 }
4002
4003 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
4004 TEST_REQUIRES_ARM_NEON_FMA;
4005 GemmMicrokernelTester()
4006 .mr(5)
4007 .nr(8)
4008 .kr(1)
4009 .sr(1)
4010 .m(5)
4011 .n(8)
4012 .k(8)
4013 .qmax(128)
4014 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
4015 }
4016
4017 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
4018 TEST_REQUIRES_ARM_NEON_FMA;
4019 GemmMicrokernelTester()
4020 .mr(5)
4021 .nr(8)
4022 .kr(1)
4023 .sr(1)
4024 .m(5)
4025 .n(8)
4026 .k(8)
4027 .cm_stride(11)
4028 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
4029 }
4030#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4031
4032
4033#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4034 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
4035 TEST_REQUIRES_ARM_NEON_FMA;
4036 GemmMicrokernelTester()
4037 .mr(6)
4038 .nr(8)
4039 .kr(1)
4040 .sr(1)
4041 .m(6)
4042 .n(8)
4043 .k(4)
4044 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4045 }
4046
4047 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
4048 TEST_REQUIRES_ARM_NEON_FMA;
4049 GemmMicrokernelTester()
4050 .mr(6)
4051 .nr(8)
4052 .kr(1)
4053 .sr(1)
4054 .m(6)
4055 .n(8)
4056 .k(4)
4057 .cn_stride(11)
4058 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4059 }
4060
4061 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
4062 TEST_REQUIRES_ARM_NEON_FMA;
4063 GemmMicrokernelTester()
4064 .mr(6)
4065 .nr(8)
4066 .kr(1)
4067 .sr(1)
4068 .m(6)
4069 .n(8)
4070 .k(4)
4071 .a_stride(7)
4072 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4073 }
4074
4075 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
4076 TEST_REQUIRES_ARM_NEON_FMA;
4077 for (uint32_t m = 1; m <= 6; m++) {
4078 for (uint32_t n = 1; n <= 8; n++) {
4079 GemmMicrokernelTester()
4080 .mr(6)
4081 .nr(8)
4082 .kr(1)
4083 .sr(1)
4084 .m(m)
4085 .n(n)
4086 .k(4)
4087 .iterations(1)
4088 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4089 }
4090 }
4091 }
4092
4093 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
4094 TEST_REQUIRES_ARM_NEON_FMA;
4095 for (uint32_t m = 1; m <= 6; m++) {
4096 GemmMicrokernelTester()
4097 .mr(6)
4098 .nr(8)
4099 .kr(1)
4100 .sr(1)
4101 .m(m)
4102 .n(8)
4103 .k(4)
4104 .iterations(1)
4105 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4106 }
4107 }
4108
4109 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
4110 TEST_REQUIRES_ARM_NEON_FMA;
4111 for (uint32_t n = 1; n <= 8; n++) {
4112 GemmMicrokernelTester()
4113 .mr(6)
4114 .nr(8)
4115 .kr(1)
4116 .sr(1)
4117 .m(6)
4118 .n(n)
4119 .k(4)
4120 .iterations(1)
4121 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4122 }
4123 }
4124
4125 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
4126 TEST_REQUIRES_ARM_NEON_FMA;
4127 GemmMicrokernelTester()
4128 .mr(6)
4129 .nr(8)
4130 .kr(1)
4131 .sr(1)
4132 .m(6)
4133 .n(8)
4134 .k(8)
4135 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4136 }
4137
4138 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
4139 TEST_REQUIRES_ARM_NEON_FMA;
4140 GemmMicrokernelTester()
4141 .mr(6)
4142 .nr(8)
4143 .kr(1)
4144 .sr(1)
4145 .m(6)
4146 .n(8)
4147 .k(8)
4148 .a_stride(11)
4149 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4150 }
4151
4152 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
4153 TEST_REQUIRES_ARM_NEON_FMA;
4154 for (uint32_t m = 1; m <= 6; m++) {
4155 for (uint32_t n = 1; n <= 8; n++) {
4156 GemmMicrokernelTester()
4157 .mr(6)
4158 .nr(8)
4159 .kr(1)
4160 .sr(1)
4161 .m(m)
4162 .n(n)
4163 .k(8)
4164 .iterations(1)
4165 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4166 }
4167 }
4168 }
4169
4170 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
4171 TEST_REQUIRES_ARM_NEON_FMA;
4172 for (size_t k = 1; k < 8; k++) {
4173 GemmMicrokernelTester()
4174 .mr(6)
4175 .nr(8)
4176 .kr(1)
4177 .sr(1)
4178 .m(6)
4179 .n(8)
4180 .k(k)
4181 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4182 }
4183 }
4184
4185 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
4186 TEST_REQUIRES_ARM_NEON_FMA;
4187 for (size_t k = 1; k < 8; k++) {
4188 GemmMicrokernelTester()
4189 .mr(6)
4190 .nr(8)
4191 .kr(1)
4192 .sr(1)
4193 .m(6)
4194 .n(8)
4195 .k(k)
4196 .a_stride(11)
4197 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4198 }
4199 }
4200
4201 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
4202 TEST_REQUIRES_ARM_NEON_FMA;
4203 for (size_t k = 1; k < 8; k++) {
4204 for (uint32_t m = 1; m <= 6; m++) {
4205 for (uint32_t n = 1; n <= 8; n++) {
4206 GemmMicrokernelTester()
4207 .mr(6)
4208 .nr(8)
4209 .kr(1)
4210 .sr(1)
4211 .m(m)
4212 .n(n)
4213 .k(k)
4214 .iterations(1)
4215 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4216 }
4217 }
4218 }
4219 }
4220
4221 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
4222 TEST_REQUIRES_ARM_NEON_FMA;
4223 for (size_t k = 9; k < 8; k++) {
4224 GemmMicrokernelTester()
4225 .mr(6)
4226 .nr(8)
4227 .kr(1)
4228 .sr(1)
4229 .m(6)
4230 .n(8)
4231 .k(k)
4232 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4233 }
4234 }
4235
4236 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
4237 TEST_REQUIRES_ARM_NEON_FMA;
4238 for (size_t k = 9; k < 8; k++) {
4239 GemmMicrokernelTester()
4240 .mr(6)
4241 .nr(8)
4242 .kr(1)
4243 .sr(1)
4244 .m(6)
4245 .n(8)
4246 .k(k)
4247 .a_stride(11)
4248 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4249 }
4250 }
4251
4252 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
4253 TEST_REQUIRES_ARM_NEON_FMA;
4254 for (size_t k = 9; k < 8; k++) {
4255 for (uint32_t m = 1; m <= 6; m++) {
4256 for (uint32_t n = 1; n <= 8; n++) {
4257 GemmMicrokernelTester()
4258 .mr(6)
4259 .nr(8)
4260 .kr(1)
4261 .sr(1)
4262 .m(m)
4263 .n(n)
4264 .k(k)
4265 .iterations(1)
4266 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4267 }
4268 }
4269 }
4270 }
4271
4272 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
4273 TEST_REQUIRES_ARM_NEON_FMA;
4274 for (size_t k = 12; k <= 40; k += 4) {
4275 GemmMicrokernelTester()
4276 .mr(6)
4277 .nr(8)
4278 .kr(1)
4279 .sr(1)
4280 .m(6)
4281 .n(8)
4282 .k(k)
4283 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4284 }
4285 }
4286
4287 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
4288 TEST_REQUIRES_ARM_NEON_FMA;
4289 for (size_t k = 12; k <= 40; k += 4) {
4290 GemmMicrokernelTester()
4291 .mr(6)
4292 .nr(8)
4293 .kr(1)
4294 .sr(1)
4295 .m(6)
4296 .n(8)
4297 .k(k)
4298 .a_stride(43)
4299 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4300 }
4301 }
4302
4303 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
4304 TEST_REQUIRES_ARM_NEON_FMA;
4305 for (size_t k = 12; k <= 40; k += 4) {
4306 for (uint32_t m = 1; m <= 6; m++) {
4307 for (uint32_t n = 1; n <= 8; n++) {
4308 GemmMicrokernelTester()
4309 .mr(6)
4310 .nr(8)
4311 .kr(1)
4312 .sr(1)
4313 .m(m)
4314 .n(n)
4315 .k(k)
4316 .iterations(1)
4317 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4318 }
4319 }
4320 }
4321 }
4322
4323 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
4324 TEST_REQUIRES_ARM_NEON_FMA;
4325 for (uint32_t n = 9; n < 16; n++) {
4326 for (size_t k = 1; k <= 20; k += 5) {
4327 GemmMicrokernelTester()
4328 .mr(6)
4329 .nr(8)
4330 .kr(1)
4331 .sr(1)
4332 .m(6)
4333 .n(8)
4334 .k(k)
4335 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4336 }
4337 }
4338 }
4339
4340 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
4341 TEST_REQUIRES_ARM_NEON_FMA;
4342 for (uint32_t n = 9; n < 16; n++) {
4343 for (size_t k = 1; k <= 20; k += 5) {
4344 GemmMicrokernelTester()
4345 .mr(6)
4346 .nr(8)
4347 .kr(1)
4348 .sr(1)
4349 .m(6)
4350 .n(8)
4351 .k(k)
4352 .cn_stride(11)
4353 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4354 }
4355 }
4356 }
4357
4358 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
4359 TEST_REQUIRES_ARM_NEON_FMA;
4360 for (uint32_t n = 9; n < 16; n++) {
4361 for (size_t k = 1; k <= 20; k += 5) {
4362 GemmMicrokernelTester()
4363 .mr(6)
4364 .nr(8)
4365 .kr(1)
4366 .sr(1)
4367 .m(6)
4368 .n(n)
4369 .k(k)
4370 .a_stride(23)
4371 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4372 }
4373 }
4374 }
4375
4376 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
4377 TEST_REQUIRES_ARM_NEON_FMA;
4378 for (uint32_t n = 9; n < 16; n++) {
4379 for (size_t k = 1; k <= 20; k += 5) {
4380 for (uint32_t m = 1; m <= 6; m++) {
4381 GemmMicrokernelTester()
4382 .mr(6)
4383 .nr(8)
4384 .kr(1)
4385 .sr(1)
4386 .m(m)
4387 .n(n)
4388 .k(k)
4389 .iterations(1)
4390 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4391 }
4392 }
4393 }
4394 }
4395
4396 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
4397 TEST_REQUIRES_ARM_NEON_FMA;
4398 for (uint32_t n = 16; n <= 24; n += 8) {
4399 for (size_t k = 1; k <= 20; k += 5) {
4400 GemmMicrokernelTester()
4401 .mr(6)
4402 .nr(8)
4403 .kr(1)
4404 .sr(1)
4405 .m(6)
4406 .n(8)
4407 .k(k)
4408 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4409 }
4410 }
4411 }
4412
4413 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
4414 TEST_REQUIRES_ARM_NEON_FMA;
4415 for (uint32_t n = 16; n <= 24; n += 8) {
4416 for (size_t k = 1; k <= 20; k += 5) {
4417 GemmMicrokernelTester()
4418 .mr(6)
4419 .nr(8)
4420 .kr(1)
4421 .sr(1)
4422 .m(6)
4423 .n(n)
4424 .k(k)
4425 .cn_stride(11)
4426 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4427 }
4428 }
4429 }
4430
4431 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
4432 TEST_REQUIRES_ARM_NEON_FMA;
4433 for (uint32_t n = 16; n <= 24; n += 8) {
4434 for (size_t k = 1; k <= 20; k += 5) {
4435 GemmMicrokernelTester()
4436 .mr(6)
4437 .nr(8)
4438 .kr(1)
4439 .sr(1)
4440 .m(6)
4441 .n(n)
4442 .k(k)
4443 .a_stride(23)
4444 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4445 }
4446 }
4447 }
4448
4449 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
4450 TEST_REQUIRES_ARM_NEON_FMA;
4451 for (uint32_t n = 16; n <= 24; n += 8) {
4452 for (size_t k = 1; k <= 20; k += 5) {
4453 for (uint32_t m = 1; m <= 6; m++) {
4454 GemmMicrokernelTester()
4455 .mr(6)
4456 .nr(8)
4457 .kr(1)
4458 .sr(1)
4459 .m(m)
4460 .n(n)
4461 .k(k)
4462 .iterations(1)
4463 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4464 }
4465 }
4466 }
4467 }
4468
4469 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
4470 TEST_REQUIRES_ARM_NEON_FMA;
4471 for (size_t k = 1; k <= 20; k += 5) {
4472 for (uint32_t m = 1; m <= 6; m++) {
4473 for (uint32_t n = 1; n <= 8; n++) {
4474 GemmMicrokernelTester()
4475 .mr(6)
4476 .nr(8)
4477 .kr(1)
4478 .sr(1)
4479 .m(m)
4480 .n(n)
4481 .k(k)
4482 .cm_stride(11)
4483 .iterations(1)
4484 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4485 }
4486 }
4487 }
4488 }
4489
4490 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
4491 TEST_REQUIRES_ARM_NEON_FMA;
4492 GemmMicrokernelTester()
4493 .mr(6)
4494 .nr(8)
4495 .kr(1)
4496 .sr(1)
4497 .m(6)
4498 .n(8)
4499 .k(4)
4500 .qmin(128)
4501 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4502 }
4503
4504 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
4505 TEST_REQUIRES_ARM_NEON_FMA;
4506 GemmMicrokernelTester()
4507 .mr(6)
4508 .nr(8)
4509 .kr(1)
4510 .sr(1)
4511 .m(6)
4512 .n(8)
4513 .k(4)
4514 .qmax(128)
4515 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4516 }
4517
4518 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
4519 TEST_REQUIRES_ARM_NEON_FMA;
4520 GemmMicrokernelTester()
4521 .mr(6)
4522 .nr(8)
4523 .kr(1)
4524 .sr(1)
4525 .m(6)
4526 .n(8)
4527 .k(4)
4528 .cm_stride(11)
4529 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4530 }
4531#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4532
4533
4534#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4535 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
4536 TEST_REQUIRES_ARM_NEON_FMA;
4537 GemmMicrokernelTester()
4538 .mr(6)
4539 .nr(8)
4540 .kr(1)
4541 .sr(1)
4542 .m(6)
4543 .n(8)
4544 .k(4)
4545 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4546 }
4547
4548 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
4549 TEST_REQUIRES_ARM_NEON_FMA;
4550 GemmMicrokernelTester()
4551 .mr(6)
4552 .nr(8)
4553 .kr(1)
4554 .sr(1)
4555 .m(6)
4556 .n(8)
4557 .k(4)
4558 .cn_stride(11)
4559 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4560 }
4561
4562 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_strided_a) {
4563 TEST_REQUIRES_ARM_NEON_FMA;
4564 GemmMicrokernelTester()
4565 .mr(6)
4566 .nr(8)
4567 .kr(1)
4568 .sr(1)
4569 .m(6)
4570 .n(8)
4571 .k(4)
4572 .a_stride(7)
4573 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4574 }
4575
4576 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
4577 TEST_REQUIRES_ARM_NEON_FMA;
4578 for (uint32_t m = 1; m <= 6; m++) {
4579 for (uint32_t n = 1; n <= 8; n++) {
4580 GemmMicrokernelTester()
4581 .mr(6)
4582 .nr(8)
4583 .kr(1)
4584 .sr(1)
4585 .m(m)
4586 .n(n)
4587 .k(4)
4588 .iterations(1)
4589 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4590 }
4591 }
4592 }
4593
4594 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
4595 TEST_REQUIRES_ARM_NEON_FMA;
4596 for (uint32_t m = 1; m <= 6; m++) {
4597 GemmMicrokernelTester()
4598 .mr(6)
4599 .nr(8)
4600 .kr(1)
4601 .sr(1)
4602 .m(m)
4603 .n(8)
4604 .k(4)
4605 .iterations(1)
4606 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4607 }
4608 }
4609
4610 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
4611 TEST_REQUIRES_ARM_NEON_FMA;
4612 for (uint32_t n = 1; n <= 8; n++) {
4613 GemmMicrokernelTester()
4614 .mr(6)
4615 .nr(8)
4616 .kr(1)
4617 .sr(1)
4618 .m(6)
4619 .n(n)
4620 .k(4)
4621 .iterations(1)
4622 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4623 }
4624 }
4625
4626 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
4627 TEST_REQUIRES_ARM_NEON_FMA;
4628 GemmMicrokernelTester()
4629 .mr(6)
4630 .nr(8)
4631 .kr(1)
4632 .sr(1)
4633 .m(6)
4634 .n(8)
4635 .k(8)
4636 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4637 }
4638
4639 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_strided_a) {
4640 TEST_REQUIRES_ARM_NEON_FMA;
4641 GemmMicrokernelTester()
4642 .mr(6)
4643 .nr(8)
4644 .kr(1)
4645 .sr(1)
4646 .m(6)
4647 .n(8)
4648 .k(8)
4649 .a_stride(11)
4650 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4651 }
4652
4653 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
4654 TEST_REQUIRES_ARM_NEON_FMA;
4655 for (uint32_t m = 1; m <= 6; m++) {
4656 for (uint32_t n = 1; n <= 8; n++) {
4657 GemmMicrokernelTester()
4658 .mr(6)
4659 .nr(8)
4660 .kr(1)
4661 .sr(1)
4662 .m(m)
4663 .n(n)
4664 .k(8)
4665 .iterations(1)
4666 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4667 }
4668 }
4669 }
4670
4671 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
4672 TEST_REQUIRES_ARM_NEON_FMA;
4673 for (size_t k = 1; k < 8; k++) {
4674 GemmMicrokernelTester()
4675 .mr(6)
4676 .nr(8)
4677 .kr(1)
4678 .sr(1)
4679 .m(6)
4680 .n(8)
4681 .k(k)
4682 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4683 }
4684 }
4685
4686 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_strided_a) {
4687 TEST_REQUIRES_ARM_NEON_FMA;
4688 for (size_t k = 1; k < 8; k++) {
4689 GemmMicrokernelTester()
4690 .mr(6)
4691 .nr(8)
4692 .kr(1)
4693 .sr(1)
4694 .m(6)
4695 .n(8)
4696 .k(k)
4697 .a_stride(11)
4698 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4699 }
4700 }
4701
4702 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
4703 TEST_REQUIRES_ARM_NEON_FMA;
4704 for (size_t k = 1; k < 8; k++) {
4705 for (uint32_t m = 1; m <= 6; m++) {
4706 for (uint32_t n = 1; n <= 8; n++) {
4707 GemmMicrokernelTester()
4708 .mr(6)
4709 .nr(8)
4710 .kr(1)
4711 .sr(1)
4712 .m(m)
4713 .n(n)
4714 .k(k)
4715 .iterations(1)
4716 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4717 }
4718 }
4719 }
4720 }
4721
4722 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
4723 TEST_REQUIRES_ARM_NEON_FMA;
4724 for (size_t k = 9; k < 8; k++) {
4725 GemmMicrokernelTester()
4726 .mr(6)
4727 .nr(8)
4728 .kr(1)
4729 .sr(1)
4730 .m(6)
4731 .n(8)
4732 .k(k)
4733 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4734 }
4735 }
4736
4737 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_strided_a) {
4738 TEST_REQUIRES_ARM_NEON_FMA;
4739 for (size_t k = 9; k < 8; k++) {
4740 GemmMicrokernelTester()
4741 .mr(6)
4742 .nr(8)
4743 .kr(1)
4744 .sr(1)
4745 .m(6)
4746 .n(8)
4747 .k(k)
4748 .a_stride(11)
4749 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4750 }
4751 }
4752
4753 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_subtile) {
4754 TEST_REQUIRES_ARM_NEON_FMA;
4755 for (size_t k = 9; k < 8; k++) {
4756 for (uint32_t m = 1; m <= 6; m++) {
4757 for (uint32_t n = 1; n <= 8; n++) {
4758 GemmMicrokernelTester()
4759 .mr(6)
4760 .nr(8)
4761 .kr(1)
4762 .sr(1)
4763 .m(m)
4764 .n(n)
4765 .k(k)
4766 .iterations(1)
4767 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4768 }
4769 }
4770 }
4771 }
4772
4773 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
4774 TEST_REQUIRES_ARM_NEON_FMA;
4775 for (size_t k = 12; k <= 40; k += 4) {
4776 GemmMicrokernelTester()
4777 .mr(6)
4778 .nr(8)
4779 .kr(1)
4780 .sr(1)
4781 .m(6)
4782 .n(8)
4783 .k(k)
4784 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4785 }
4786 }
4787
4788 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_strided_a) {
4789 TEST_REQUIRES_ARM_NEON_FMA;
4790 for (size_t k = 12; k <= 40; k += 4) {
4791 GemmMicrokernelTester()
4792 .mr(6)
4793 .nr(8)
4794 .kr(1)
4795 .sr(1)
4796 .m(6)
4797 .n(8)
4798 .k(k)
4799 .a_stride(43)
4800 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4801 }
4802 }
4803
4804 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
4805 TEST_REQUIRES_ARM_NEON_FMA;
4806 for (size_t k = 12; k <= 40; k += 4) {
4807 for (uint32_t m = 1; m <= 6; m++) {
4808 for (uint32_t n = 1; n <= 8; n++) {
4809 GemmMicrokernelTester()
4810 .mr(6)
4811 .nr(8)
4812 .kr(1)
4813 .sr(1)
4814 .m(m)
4815 .n(n)
4816 .k(k)
4817 .iterations(1)
4818 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4819 }
4820 }
4821 }
4822 }
4823
4824 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
4825 TEST_REQUIRES_ARM_NEON_FMA;
4826 for (uint32_t n = 9; n < 16; n++) {
4827 for (size_t k = 1; k <= 20; k += 5) {
4828 GemmMicrokernelTester()
4829 .mr(6)
4830 .nr(8)
4831 .kr(1)
4832 .sr(1)
4833 .m(6)
4834 .n(8)
4835 .k(k)
4836 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4837 }
4838 }
4839 }
4840
4841 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
4842 TEST_REQUIRES_ARM_NEON_FMA;
4843 for (uint32_t n = 9; n < 16; n++) {
4844 for (size_t k = 1; k <= 20; k += 5) {
4845 GemmMicrokernelTester()
4846 .mr(6)
4847 .nr(8)
4848 .kr(1)
4849 .sr(1)
4850 .m(6)
4851 .n(8)
4852 .k(k)
4853 .cn_stride(11)
4854 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4855 }
4856 }
4857 }
4858
4859 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_a) {
4860 TEST_REQUIRES_ARM_NEON_FMA;
4861 for (uint32_t n = 9; n < 16; n++) {
4862 for (size_t k = 1; k <= 20; k += 5) {
4863 GemmMicrokernelTester()
4864 .mr(6)
4865 .nr(8)
4866 .kr(1)
4867 .sr(1)
4868 .m(6)
4869 .n(n)
4870 .k(k)
4871 .a_stride(23)
4872 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4873 }
4874 }
4875 }
4876
4877 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
4878 TEST_REQUIRES_ARM_NEON_FMA;
4879 for (uint32_t n = 9; n < 16; n++) {
4880 for (size_t k = 1; k <= 20; k += 5) {
4881 for (uint32_t m = 1; m <= 6; m++) {
4882 GemmMicrokernelTester()
4883 .mr(6)
4884 .nr(8)
4885 .kr(1)
4886 .sr(1)
4887 .m(m)
4888 .n(n)
4889 .k(k)
4890 .iterations(1)
4891 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4892 }
4893 }
4894 }
4895 }
4896
4897 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
4898 TEST_REQUIRES_ARM_NEON_FMA;
4899 for (uint32_t n = 16; n <= 24; n += 8) {
4900 for (size_t k = 1; k <= 20; k += 5) {
4901 GemmMicrokernelTester()
4902 .mr(6)
4903 .nr(8)
4904 .kr(1)
4905 .sr(1)
4906 .m(6)
4907 .n(8)
4908 .k(k)
4909 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4910 }
4911 }
4912 }
4913
4914 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
4915 TEST_REQUIRES_ARM_NEON_FMA;
4916 for (uint32_t n = 16; n <= 24; n += 8) {
4917 for (size_t k = 1; k <= 20; k += 5) {
4918 GemmMicrokernelTester()
4919 .mr(6)
4920 .nr(8)
4921 .kr(1)
4922 .sr(1)
4923 .m(6)
4924 .n(n)
4925 .k(k)
4926 .cn_stride(11)
4927 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4928 }
4929 }
4930 }
4931
4932 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_a) {
4933 TEST_REQUIRES_ARM_NEON_FMA;
4934 for (uint32_t n = 16; n <= 24; n += 8) {
4935 for (size_t k = 1; k <= 20; k += 5) {
4936 GemmMicrokernelTester()
4937 .mr(6)
4938 .nr(8)
4939 .kr(1)
4940 .sr(1)
4941 .m(6)
4942 .n(n)
4943 .k(k)
4944 .a_stride(23)
4945 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4946 }
4947 }
4948 }
4949
4950 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
4951 TEST_REQUIRES_ARM_NEON_FMA;
4952 for (uint32_t n = 16; n <= 24; n += 8) {
4953 for (size_t k = 1; k <= 20; k += 5) {
4954 for (uint32_t m = 1; m <= 6; m++) {
4955 GemmMicrokernelTester()
4956 .mr(6)
4957 .nr(8)
4958 .kr(1)
4959 .sr(1)
4960 .m(m)
4961 .n(n)
4962 .k(k)
4963 .iterations(1)
4964 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4965 }
4966 }
4967 }
4968 }
4969
4970 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
4971 TEST_REQUIRES_ARM_NEON_FMA;
4972 for (size_t k = 1; k <= 20; k += 5) {
4973 for (uint32_t m = 1; m <= 6; m++) {
4974 for (uint32_t n = 1; n <= 8; n++) {
4975 GemmMicrokernelTester()
4976 .mr(6)
4977 .nr(8)
4978 .kr(1)
4979 .sr(1)
4980 .m(m)
4981 .n(n)
4982 .k(k)
4983 .cm_stride(11)
4984 .iterations(1)
4985 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
4986 }
4987 }
4988 }
4989 }
4990
4991 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
4992 TEST_REQUIRES_ARM_NEON_FMA;
4993 GemmMicrokernelTester()
4994 .mr(6)
4995 .nr(8)
4996 .kr(1)
4997 .sr(1)
4998 .m(6)
4999 .n(8)
5000 .k(4)
5001 .qmin(128)
5002 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
5003 }
5004
5005 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
5006 TEST_REQUIRES_ARM_NEON_FMA;
5007 GemmMicrokernelTester()
5008 .mr(6)
5009 .nr(8)
5010 .kr(1)
5011 .sr(1)
5012 .m(6)
5013 .n(8)
5014 .k(4)
5015 .qmax(128)
5016 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
5017 }
5018
5019 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
5020 TEST_REQUIRES_ARM_NEON_FMA;
5021 GemmMicrokernelTester()
5022 .mr(6)
5023 .nr(8)
5024 .kr(1)
5025 .sr(1)
5026 .m(6)
5027 .n(8)
5028 .k(4)
5029 .cm_stride(11)
5030 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a55);
5031 }
5032#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5033
5034
5035#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5036 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4) {
5037 TEST_REQUIRES_ARM_NEON_FMA;
5038 GemmMicrokernelTester()
5039 .mr(6)
5040 .nr(8)
5041 .kr(1)
5042 .sr(1)
5043 .m(6)
5044 .n(8)
5045 .k(4)
5046 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5047 }
5048
5049 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
5050 TEST_REQUIRES_ARM_NEON_FMA;
5051 GemmMicrokernelTester()
5052 .mr(6)
5053 .nr(8)
5054 .kr(1)
5055 .sr(1)
5056 .m(6)
5057 .n(8)
5058 .k(4)
5059 .cn_stride(11)
5060 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5061 }
5062
5063 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_strided_a) {
5064 TEST_REQUIRES_ARM_NEON_FMA;
5065 GemmMicrokernelTester()
5066 .mr(6)
5067 .nr(8)
5068 .kr(1)
5069 .sr(1)
5070 .m(6)
5071 .n(8)
5072 .k(4)
5073 .a_stride(7)
5074 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5075 }
5076
5077 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_subtile) {
5078 TEST_REQUIRES_ARM_NEON_FMA;
5079 for (uint32_t m = 1; m <= 6; m++) {
5080 for (uint32_t n = 1; n <= 8; n++) {
5081 GemmMicrokernelTester()
5082 .mr(6)
5083 .nr(8)
5084 .kr(1)
5085 .sr(1)
5086 .m(m)
5087 .n(n)
5088 .k(4)
5089 .iterations(1)
5090 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5091 }
5092 }
5093 }
5094
5095 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_subtile_m) {
5096 TEST_REQUIRES_ARM_NEON_FMA;
5097 for (uint32_t m = 1; m <= 6; m++) {
5098 GemmMicrokernelTester()
5099 .mr(6)
5100 .nr(8)
5101 .kr(1)
5102 .sr(1)
5103 .m(m)
5104 .n(8)
5105 .k(4)
5106 .iterations(1)
5107 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5108 }
5109 }
5110
5111 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_subtile_n) {
5112 TEST_REQUIRES_ARM_NEON_FMA;
5113 for (uint32_t n = 1; n <= 8; n++) {
5114 GemmMicrokernelTester()
5115 .mr(6)
5116 .nr(8)
5117 .kr(1)
5118 .sr(1)
5119 .m(6)
5120 .n(n)
5121 .k(4)
5122 .iterations(1)
5123 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5124 }
5125 }
5126
5127 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_4) {
5128 TEST_REQUIRES_ARM_NEON_FMA;
5129 for (size_t k = 1; k < 4; k++) {
5130 GemmMicrokernelTester()
5131 .mr(6)
5132 .nr(8)
5133 .kr(1)
5134 .sr(1)
5135 .m(6)
5136 .n(8)
5137 .k(k)
5138 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5139 }
5140 }
5141
5142 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_4_strided_a) {
5143 TEST_REQUIRES_ARM_NEON_FMA;
5144 for (size_t k = 1; k < 4; k++) {
5145 GemmMicrokernelTester()
5146 .mr(6)
5147 .nr(8)
5148 .kr(1)
5149 .sr(1)
5150 .m(6)
5151 .n(8)
5152 .k(k)
5153 .a_stride(7)
5154 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5155 }
5156 }
5157
5158 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_4_subtile) {
5159 TEST_REQUIRES_ARM_NEON_FMA;
5160 for (size_t k = 1; k < 4; k++) {
5161 for (uint32_t m = 1; m <= 6; m++) {
5162 for (uint32_t n = 1; n <= 8; n++) {
5163 GemmMicrokernelTester()
5164 .mr(6)
5165 .nr(8)
5166 .kr(1)
5167 .sr(1)
5168 .m(m)
5169 .n(n)
5170 .k(k)
5171 .iterations(1)
5172 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5173 }
5174 }
5175 }
5176 }
5177
5178 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_4) {
5179 TEST_REQUIRES_ARM_NEON_FMA;
5180 for (size_t k = 5; k < 8; k++) {
5181 GemmMicrokernelTester()
5182 .mr(6)
5183 .nr(8)
5184 .kr(1)
5185 .sr(1)
5186 .m(6)
5187 .n(8)
5188 .k(k)
5189 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5190 }
5191 }
5192
5193 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_4_strided_a) {
5194 TEST_REQUIRES_ARM_NEON_FMA;
5195 for (size_t k = 5; k < 8; k++) {
5196 GemmMicrokernelTester()
5197 .mr(6)
5198 .nr(8)
5199 .kr(1)
5200 .sr(1)
5201 .m(6)
5202 .n(8)
5203 .k(k)
5204 .a_stride(11)
5205 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5206 }
5207 }
5208
5209 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_4_subtile) {
5210 TEST_REQUIRES_ARM_NEON_FMA;
5211 for (size_t k = 5; k < 8; k++) {
5212 for (uint32_t m = 1; m <= 6; m++) {
5213 for (uint32_t n = 1; n <= 8; n++) {
5214 GemmMicrokernelTester()
5215 .mr(6)
5216 .nr(8)
5217 .kr(1)
5218 .sr(1)
5219 .m(m)
5220 .n(n)
5221 .k(k)
5222 .iterations(1)
5223 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5224 }
5225 }
5226 }
5227 }
5228
5229 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_4) {
5230 TEST_REQUIRES_ARM_NEON_FMA;
5231 for (size_t k = 8; k <= 40; k += 4) {
5232 GemmMicrokernelTester()
5233 .mr(6)
5234 .nr(8)
5235 .kr(1)
5236 .sr(1)
5237 .m(6)
5238 .n(8)
5239 .k(k)
5240 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5241 }
5242 }
5243
5244 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_4_strided_a) {
5245 TEST_REQUIRES_ARM_NEON_FMA;
5246 for (size_t k = 8; k <= 40; k += 4) {
5247 GemmMicrokernelTester()
5248 .mr(6)
5249 .nr(8)
5250 .kr(1)
5251 .sr(1)
5252 .m(6)
5253 .n(8)
5254 .k(k)
5255 .a_stride(43)
5256 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5257 }
5258 }
5259
5260 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_4_subtile) {
5261 TEST_REQUIRES_ARM_NEON_FMA;
5262 for (size_t k = 8; k <= 40; k += 4) {
5263 for (uint32_t m = 1; m <= 6; m++) {
5264 for (uint32_t n = 1; n <= 8; n++) {
5265 GemmMicrokernelTester()
5266 .mr(6)
5267 .nr(8)
5268 .kr(1)
5269 .sr(1)
5270 .m(m)
5271 .n(n)
5272 .k(k)
5273 .iterations(1)
5274 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5275 }
5276 }
5277 }
5278 }
5279
5280 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
5281 TEST_REQUIRES_ARM_NEON_FMA;
5282 for (uint32_t n = 9; n < 16; n++) {
5283 for (size_t k = 1; k <= 20; k += 5) {
5284 GemmMicrokernelTester()
5285 .mr(6)
5286 .nr(8)
5287 .kr(1)
5288 .sr(1)
5289 .m(6)
5290 .n(8)
5291 .k(k)
5292 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5293 }
5294 }
5295 }
5296
5297 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
5298 TEST_REQUIRES_ARM_NEON_FMA;
5299 for (uint32_t n = 9; n < 16; n++) {
5300 for (size_t k = 1; k <= 20; k += 5) {
5301 GemmMicrokernelTester()
5302 .mr(6)
5303 .nr(8)
5304 .kr(1)
5305 .sr(1)
5306 .m(6)
5307 .n(8)
5308 .k(k)
5309 .cn_stride(11)
5310 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5311 }
5312 }
5313 }
5314
5315 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
5316 TEST_REQUIRES_ARM_NEON_FMA;
5317 for (uint32_t n = 9; n < 16; n++) {
5318 for (size_t k = 1; k <= 20; k += 5) {
5319 GemmMicrokernelTester()
5320 .mr(6)
5321 .nr(8)
5322 .kr(1)
5323 .sr(1)
5324 .m(6)
5325 .n(n)
5326 .k(k)
5327 .a_stride(23)
5328 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5329 }
5330 }
5331 }
5332
5333 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
5334 TEST_REQUIRES_ARM_NEON_FMA;
5335 for (uint32_t n = 9; n < 16; n++) {
5336 for (size_t k = 1; k <= 20; k += 5) {
5337 for (uint32_t m = 1; m <= 6; m++) {
5338 GemmMicrokernelTester()
5339 .mr(6)
5340 .nr(8)
5341 .kr(1)
5342 .sr(1)
5343 .m(m)
5344 .n(n)
5345 .k(k)
5346 .iterations(1)
5347 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5348 }
5349 }
5350 }
5351 }
5352
5353 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
5354 TEST_REQUIRES_ARM_NEON_FMA;
5355 for (uint32_t n = 16; n <= 24; n += 8) {
5356 for (size_t k = 1; k <= 20; k += 5) {
5357 GemmMicrokernelTester()
5358 .mr(6)
5359 .nr(8)
5360 .kr(1)
5361 .sr(1)
5362 .m(6)
5363 .n(8)
5364 .k(k)
5365 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5366 }
5367 }
5368 }
5369
5370 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
5371 TEST_REQUIRES_ARM_NEON_FMA;
5372 for (uint32_t n = 16; n <= 24; n += 8) {
5373 for (size_t k = 1; k <= 20; k += 5) {
5374 GemmMicrokernelTester()
5375 .mr(6)
5376 .nr(8)
5377 .kr(1)
5378 .sr(1)
5379 .m(6)
5380 .n(n)
5381 .k(k)
5382 .cn_stride(11)
5383 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5384 }
5385 }
5386 }
5387
5388 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
5389 TEST_REQUIRES_ARM_NEON_FMA;
5390 for (uint32_t n = 16; n <= 24; n += 8) {
5391 for (size_t k = 1; k <= 20; k += 5) {
5392 GemmMicrokernelTester()
5393 .mr(6)
5394 .nr(8)
5395 .kr(1)
5396 .sr(1)
5397 .m(6)
5398 .n(n)
5399 .k(k)
5400 .a_stride(23)
5401 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5402 }
5403 }
5404 }
5405
5406 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
5407 TEST_REQUIRES_ARM_NEON_FMA;
5408 for (uint32_t n = 16; n <= 24; n += 8) {
5409 for (size_t k = 1; k <= 20; k += 5) {
5410 for (uint32_t m = 1; m <= 6; m++) {
5411 GemmMicrokernelTester()
5412 .mr(6)
5413 .nr(8)
5414 .kr(1)
5415 .sr(1)
5416 .m(m)
5417 .n(n)
5418 .k(k)
5419 .iterations(1)
5420 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5421 }
5422 }
5423 }
5424 }
5425
5426 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
5427 TEST_REQUIRES_ARM_NEON_FMA;
5428 for (size_t k = 1; k <= 20; k += 5) {
5429 for (uint32_t m = 1; m <= 6; m++) {
5430 for (uint32_t n = 1; n <= 8; n++) {
5431 GemmMicrokernelTester()
5432 .mr(6)
5433 .nr(8)
5434 .kr(1)
5435 .sr(1)
5436 .m(m)
5437 .n(n)
5438 .k(k)
5439 .cm_stride(11)
5440 .iterations(1)
5441 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5442 }
5443 }
5444 }
5445 }
5446
5447 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
5448 TEST_REQUIRES_ARM_NEON_FMA;
5449 GemmMicrokernelTester()
5450 .mr(6)
5451 .nr(8)
5452 .kr(1)
5453 .sr(1)
5454 .m(6)
5455 .n(8)
5456 .k(4)
5457 .qmin(128)
5458 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5459 }
5460
5461 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
5462 TEST_REQUIRES_ARM_NEON_FMA;
5463 GemmMicrokernelTester()
5464 .mr(6)
5465 .nr(8)
5466 .kr(1)
5467 .sr(1)
5468 .m(6)
5469 .n(8)
5470 .k(4)
5471 .qmax(128)
5472 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5473 }
5474
5475 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
5476 TEST_REQUIRES_ARM_NEON_FMA;
5477 GemmMicrokernelTester()
5478 .mr(6)
5479 .nr(8)
5480 .kr(1)
5481 .sr(1)
5482 .m(6)
5483 .n(8)
5484 .k(4)
5485 .cm_stride(11)
5486 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
5487 }
5488#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5489
5490
5491#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5492 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
5493 TEST_REQUIRES_ARM_NEON_FMA;
5494 GemmMicrokernelTester()
5495 .mr(6)
5496 .nr(8)
5497 .kr(1)
5498 .sr(1)
5499 .m(6)
5500 .n(8)
5501 .k(8)
5502 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5503 }
5504
5505 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cn) {
5506 TEST_REQUIRES_ARM_NEON_FMA;
5507 GemmMicrokernelTester()
5508 .mr(6)
5509 .nr(8)
5510 .kr(1)
5511 .sr(1)
5512 .m(6)
5513 .n(8)
5514 .k(8)
5515 .cn_stride(11)
5516 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5517 }
5518
5519 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_strided_a) {
5520 TEST_REQUIRES_ARM_NEON_FMA;
5521 GemmMicrokernelTester()
5522 .mr(6)
5523 .nr(8)
5524 .kr(1)
5525 .sr(1)
5526 .m(6)
5527 .n(8)
5528 .k(8)
5529 .a_stride(11)
5530 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5531 }
5532
5533 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile) {
5534 TEST_REQUIRES_ARM_NEON_FMA;
5535 for (uint32_t m = 1; m <= 6; m++) {
5536 for (uint32_t n = 1; n <= 8; n++) {
5537 GemmMicrokernelTester()
5538 .mr(6)
5539 .nr(8)
5540 .kr(1)
5541 .sr(1)
5542 .m(m)
5543 .n(n)
5544 .k(8)
5545 .iterations(1)
5546 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5547 }
5548 }
5549 }
5550
5551 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_m) {
5552 TEST_REQUIRES_ARM_NEON_FMA;
5553 for (uint32_t m = 1; m <= 6; m++) {
5554 GemmMicrokernelTester()
5555 .mr(6)
5556 .nr(8)
5557 .kr(1)
5558 .sr(1)
5559 .m(m)
5560 .n(8)
5561 .k(8)
5562 .iterations(1)
5563 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5564 }
5565 }
5566
5567 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_n) {
5568 TEST_REQUIRES_ARM_NEON_FMA;
5569 for (uint32_t n = 1; n <= 8; n++) {
5570 GemmMicrokernelTester()
5571 .mr(6)
5572 .nr(8)
5573 .kr(1)
5574 .sr(1)
5575 .m(6)
5576 .n(n)
5577 .k(8)
5578 .iterations(1)
5579 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5580 }
5581 }
5582
5583 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16) {
5584 TEST_REQUIRES_ARM_NEON_FMA;
5585 GemmMicrokernelTester()
5586 .mr(6)
5587 .nr(8)
5588 .kr(1)
5589 .sr(1)
5590 .m(6)
5591 .n(8)
5592 .k(16)
5593 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5594 }
5595
5596 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_strided_a) {
5597 TEST_REQUIRES_ARM_NEON_FMA;
5598 GemmMicrokernelTester()
5599 .mr(6)
5600 .nr(8)
5601 .kr(1)
5602 .sr(1)
5603 .m(6)
5604 .n(8)
5605 .k(16)
5606 .a_stride(19)
5607 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5608 }
5609
5610 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_subtile) {
5611 TEST_REQUIRES_ARM_NEON_FMA;
5612 for (uint32_t m = 1; m <= 6; m++) {
5613 for (uint32_t n = 1; n <= 8; n++) {
5614 GemmMicrokernelTester()
5615 .mr(6)
5616 .nr(8)
5617 .kr(1)
5618 .sr(1)
5619 .m(m)
5620 .n(n)
5621 .k(16)
5622 .iterations(1)
5623 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5624 }
5625 }
5626 }
5627
5628 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16) {
5629 TEST_REQUIRES_ARM_NEON_FMA;
5630 for (size_t k = 1; k < 16; k++) {
5631 GemmMicrokernelTester()
5632 .mr(6)
5633 .nr(8)
5634 .kr(1)
5635 .sr(1)
5636 .m(6)
5637 .n(8)
5638 .k(k)
5639 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5640 }
5641 }
5642
5643 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_strided_a) {
5644 TEST_REQUIRES_ARM_NEON_FMA;
5645 for (size_t k = 1; k < 16; k++) {
5646 GemmMicrokernelTester()
5647 .mr(6)
5648 .nr(8)
5649 .kr(1)
5650 .sr(1)
5651 .m(6)
5652 .n(8)
5653 .k(k)
5654 .a_stride(19)
5655 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5656 }
5657 }
5658
5659 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_subtile) {
5660 TEST_REQUIRES_ARM_NEON_FMA;
5661 for (size_t k = 1; k < 16; k++) {
5662 for (uint32_t m = 1; m <= 6; m++) {
5663 for (uint32_t n = 1; n <= 8; n++) {
5664 GemmMicrokernelTester()
5665 .mr(6)
5666 .nr(8)
5667 .kr(1)
5668 .sr(1)
5669 .m(m)
5670 .n(n)
5671 .k(k)
5672 .iterations(1)
5673 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5674 }
5675 }
5676 }
5677 }
5678
5679 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_16) {
5680 TEST_REQUIRES_ARM_NEON_FMA;
5681 for (size_t k = 17; k < 16; k++) {
5682 GemmMicrokernelTester()
5683 .mr(6)
5684 .nr(8)
5685 .kr(1)
5686 .sr(1)
5687 .m(6)
5688 .n(8)
5689 .k(k)
5690 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5691 }
5692 }
5693
5694 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_strided_a) {
5695 TEST_REQUIRES_ARM_NEON_FMA;
5696 for (size_t k = 17; k < 16; k++) {
5697 GemmMicrokernelTester()
5698 .mr(6)
5699 .nr(8)
5700 .kr(1)
5701 .sr(1)
5702 .m(6)
5703 .n(8)
5704 .k(k)
5705 .a_stride(19)
5706 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5707 }
5708 }
5709
5710 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_subtile) {
5711 TEST_REQUIRES_ARM_NEON_FMA;
5712 for (size_t k = 17; k < 16; k++) {
5713 for (uint32_t m = 1; m <= 6; m++) {
5714 for (uint32_t n = 1; n <= 8; n++) {
5715 GemmMicrokernelTester()
5716 .mr(6)
5717 .nr(8)
5718 .kr(1)
5719 .sr(1)
5720 .m(m)
5721 .n(n)
5722 .k(k)
5723 .iterations(1)
5724 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5725 }
5726 }
5727 }
5728 }
5729
5730 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8) {
5731 TEST_REQUIRES_ARM_NEON_FMA;
5732 for (size_t k = 24; k <= 80; k += 8) {
5733 GemmMicrokernelTester()
5734 .mr(6)
5735 .nr(8)
5736 .kr(1)
5737 .sr(1)
5738 .m(6)
5739 .n(8)
5740 .k(k)
5741 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5742 }
5743 }
5744
5745 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_strided_a) {
5746 TEST_REQUIRES_ARM_NEON_FMA;
5747 for (size_t k = 24; k <= 80; k += 8) {
5748 GemmMicrokernelTester()
5749 .mr(6)
5750 .nr(8)
5751 .kr(1)
5752 .sr(1)
5753 .m(6)
5754 .n(8)
5755 .k(k)
5756 .a_stride(83)
5757 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5758 }
5759 }
5760
5761 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_subtile) {
5762 TEST_REQUIRES_ARM_NEON_FMA;
5763 for (size_t k = 24; k <= 80; k += 8) {
5764 for (uint32_t m = 1; m <= 6; m++) {
5765 for (uint32_t n = 1; n <= 8; n++) {
5766 GemmMicrokernelTester()
5767 .mr(6)
5768 .nr(8)
5769 .kr(1)
5770 .sr(1)
5771 .m(m)
5772 .n(n)
5773 .k(k)
5774 .iterations(1)
5775 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5776 }
5777 }
5778 }
5779 }
5780
5781 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8) {
5782 TEST_REQUIRES_ARM_NEON_FMA;
5783 for (uint32_t n = 9; n < 16; n++) {
5784 for (size_t k = 1; k <= 40; k += 9) {
5785 GemmMicrokernelTester()
5786 .mr(6)
5787 .nr(8)
5788 .kr(1)
5789 .sr(1)
5790 .m(6)
5791 .n(8)
5792 .k(k)
5793 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5794 }
5795 }
5796 }
5797
5798 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_cn) {
5799 TEST_REQUIRES_ARM_NEON_FMA;
5800 for (uint32_t n = 9; n < 16; n++) {
5801 for (size_t k = 1; k <= 40; k += 9) {
5802 GemmMicrokernelTester()
5803 .mr(6)
5804 .nr(8)
5805 .kr(1)
5806 .sr(1)
5807 .m(6)
5808 .n(8)
5809 .k(k)
5810 .cn_stride(11)
5811 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5812 }
5813 }
5814 }
5815
5816 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_a) {
5817 TEST_REQUIRES_ARM_NEON_FMA;
5818 for (uint32_t n = 9; n < 16; n++) {
5819 for (size_t k = 1; k <= 40; k += 9) {
5820 GemmMicrokernelTester()
5821 .mr(6)
5822 .nr(8)
5823 .kr(1)
5824 .sr(1)
5825 .m(6)
5826 .n(n)
5827 .k(k)
5828 .a_stride(43)
5829 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5830 }
5831 }
5832 }
5833
5834 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_subtile) {
5835 TEST_REQUIRES_ARM_NEON_FMA;
5836 for (uint32_t n = 9; n < 16; n++) {
5837 for (size_t k = 1; k <= 40; k += 9) {
5838 for (uint32_t m = 1; m <= 6; m++) {
5839 GemmMicrokernelTester()
5840 .mr(6)
5841 .nr(8)
5842 .kr(1)
5843 .sr(1)
5844 .m(m)
5845 .n(n)
5846 .k(k)
5847 .iterations(1)
5848 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5849 }
5850 }
5851 }
5852 }
5853
5854 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8) {
5855 TEST_REQUIRES_ARM_NEON_FMA;
5856 for (uint32_t n = 16; n <= 24; n += 8) {
5857 for (size_t k = 1; k <= 40; k += 9) {
5858 GemmMicrokernelTester()
5859 .mr(6)
5860 .nr(8)
5861 .kr(1)
5862 .sr(1)
5863 .m(6)
5864 .n(8)
5865 .k(k)
5866 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5867 }
5868 }
5869 }
5870
5871 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_cn) {
5872 TEST_REQUIRES_ARM_NEON_FMA;
5873 for (uint32_t n = 16; n <= 24; n += 8) {
5874 for (size_t k = 1; k <= 40; k += 9) {
5875 GemmMicrokernelTester()
5876 .mr(6)
5877 .nr(8)
5878 .kr(1)
5879 .sr(1)
5880 .m(6)
5881 .n(n)
5882 .k(k)
5883 .cn_stride(11)
5884 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5885 }
5886 }
5887 }
5888
5889 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_a) {
5890 TEST_REQUIRES_ARM_NEON_FMA;
5891 for (uint32_t n = 16; n <= 24; n += 8) {
5892 for (size_t k = 1; k <= 40; k += 9) {
5893 GemmMicrokernelTester()
5894 .mr(6)
5895 .nr(8)
5896 .kr(1)
5897 .sr(1)
5898 .m(6)
5899 .n(n)
5900 .k(k)
5901 .a_stride(43)
5902 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5903 }
5904 }
5905 }
5906
5907 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_subtile) {
5908 TEST_REQUIRES_ARM_NEON_FMA;
5909 for (uint32_t n = 16; n <= 24; n += 8) {
5910 for (size_t k = 1; k <= 40; k += 9) {
5911 for (uint32_t m = 1; m <= 6; m++) {
5912 GemmMicrokernelTester()
5913 .mr(6)
5914 .nr(8)
5915 .kr(1)
5916 .sr(1)
5917 .m(m)
5918 .n(n)
5919 .k(k)
5920 .iterations(1)
5921 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5922 }
5923 }
5924 }
5925 }
5926
5927 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm_subtile) {
5928 TEST_REQUIRES_ARM_NEON_FMA;
5929 for (size_t k = 1; k <= 40; k += 9) {
5930 for (uint32_t m = 1; m <= 6; m++) {
5931 for (uint32_t n = 1; n <= 8; n++) {
5932 GemmMicrokernelTester()
5933 .mr(6)
5934 .nr(8)
5935 .kr(1)
5936 .sr(1)
5937 .m(m)
5938 .n(n)
5939 .k(k)
5940 .cm_stride(11)
5941 .iterations(1)
5942 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5943 }
5944 }
5945 }
5946 }
5947
5948 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, qmin) {
5949 TEST_REQUIRES_ARM_NEON_FMA;
5950 GemmMicrokernelTester()
5951 .mr(6)
5952 .nr(8)
5953 .kr(1)
5954 .sr(1)
5955 .m(6)
5956 .n(8)
5957 .k(8)
5958 .qmin(128)
5959 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5960 }
5961
5962 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, qmax) {
5963 TEST_REQUIRES_ARM_NEON_FMA;
5964 GemmMicrokernelTester()
5965 .mr(6)
5966 .nr(8)
5967 .kr(1)
5968 .sr(1)
5969 .m(6)
5970 .n(8)
5971 .k(8)
5972 .qmax(128)
5973 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5974 }
5975
5976 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm) {
5977 TEST_REQUIRES_ARM_NEON_FMA;
5978 GemmMicrokernelTester()
5979 .mr(6)
5980 .nr(8)
5981 .kr(1)
5982 .sr(1)
5983 .m(6)
5984 .n(8)
5985 .k(8)
5986 .cm_stride(11)
5987 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
5988 }
5989#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5990
5991
5992#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5993 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
5994 TEST_REQUIRES_ARM_NEON_FMA;
5995 GemmMicrokernelTester()
5996 .mr(6)
5997 .nr(8)
5998 .kr(1)
5999 .sr(1)
6000 .m(6)
6001 .n(8)
6002 .k(8)
6003 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6004 }
6005
6006 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
6007 TEST_REQUIRES_ARM_NEON_FMA;
6008 GemmMicrokernelTester()
6009 .mr(6)
6010 .nr(8)
6011 .kr(1)
6012 .sr(1)
6013 .m(6)
6014 .n(8)
6015 .k(8)
6016 .cn_stride(11)
6017 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6018 }
6019
6020 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
6021 TEST_REQUIRES_ARM_NEON_FMA;
6022 GemmMicrokernelTester()
6023 .mr(6)
6024 .nr(8)
6025 .kr(1)
6026 .sr(1)
6027 .m(6)
6028 .n(8)
6029 .k(8)
6030 .a_stride(11)
6031 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6032 }
6033
6034 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
6035 TEST_REQUIRES_ARM_NEON_FMA;
6036 for (uint32_t m = 1; m <= 6; m++) {
6037 for (uint32_t n = 1; n <= 8; n++) {
6038 GemmMicrokernelTester()
6039 .mr(6)
6040 .nr(8)
6041 .kr(1)
6042 .sr(1)
6043 .m(m)
6044 .n(n)
6045 .k(8)
6046 .iterations(1)
6047 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6048 }
6049 }
6050 }
6051
6052 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
6053 TEST_REQUIRES_ARM_NEON_FMA;
6054 for (uint32_t m = 1; m <= 6; m++) {
6055 GemmMicrokernelTester()
6056 .mr(6)
6057 .nr(8)
6058 .kr(1)
6059 .sr(1)
6060 .m(m)
6061 .n(8)
6062 .k(8)
6063 .iterations(1)
6064 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6065 }
6066 }
6067
6068 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
6069 TEST_REQUIRES_ARM_NEON_FMA;
6070 for (uint32_t n = 1; n <= 8; n++) {
6071 GemmMicrokernelTester()
6072 .mr(6)
6073 .nr(8)
6074 .kr(1)
6075 .sr(1)
6076 .m(6)
6077 .n(n)
6078 .k(8)
6079 .iterations(1)
6080 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6081 }
6082 }
6083
6084 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
6085 TEST_REQUIRES_ARM_NEON_FMA;
6086 GemmMicrokernelTester()
6087 .mr(6)
6088 .nr(8)
6089 .kr(1)
6090 .sr(1)
6091 .m(6)
6092 .n(8)
6093 .k(16)
6094 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6095 }
6096
6097 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
6098 TEST_REQUIRES_ARM_NEON_FMA;
6099 GemmMicrokernelTester()
6100 .mr(6)
6101 .nr(8)
6102 .kr(1)
6103 .sr(1)
6104 .m(6)
6105 .n(8)
6106 .k(16)
6107 .a_stride(19)
6108 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6109 }
6110
6111 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
6112 TEST_REQUIRES_ARM_NEON_FMA;
6113 for (uint32_t m = 1; m <= 6; m++) {
6114 for (uint32_t n = 1; n <= 8; n++) {
6115 GemmMicrokernelTester()
6116 .mr(6)
6117 .nr(8)
6118 .kr(1)
6119 .sr(1)
6120 .m(m)
6121 .n(n)
6122 .k(16)
6123 .iterations(1)
6124 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6125 }
6126 }
6127 }
6128
6129 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
6130 TEST_REQUIRES_ARM_NEON_FMA;
6131 for (size_t k = 1; k < 16; k++) {
6132 GemmMicrokernelTester()
6133 .mr(6)
6134 .nr(8)
6135 .kr(1)
6136 .sr(1)
6137 .m(6)
6138 .n(8)
6139 .k(k)
6140 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6141 }
6142 }
6143
6144 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
6145 TEST_REQUIRES_ARM_NEON_FMA;
6146 for (size_t k = 1; k < 16; k++) {
6147 GemmMicrokernelTester()
6148 .mr(6)
6149 .nr(8)
6150 .kr(1)
6151 .sr(1)
6152 .m(6)
6153 .n(8)
6154 .k(k)
6155 .a_stride(19)
6156 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6157 }
6158 }
6159
6160 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
6161 TEST_REQUIRES_ARM_NEON_FMA;
6162 for (size_t k = 1; k < 16; k++) {
6163 for (uint32_t m = 1; m <= 6; m++) {
6164 for (uint32_t n = 1; n <= 8; n++) {
6165 GemmMicrokernelTester()
6166 .mr(6)
6167 .nr(8)
6168 .kr(1)
6169 .sr(1)
6170 .m(m)
6171 .n(n)
6172 .k(k)
6173 .iterations(1)
6174 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6175 }
6176 }
6177 }
6178 }
6179
6180 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
6181 TEST_REQUIRES_ARM_NEON_FMA;
6182 for (size_t k = 17; k < 16; k++) {
6183 GemmMicrokernelTester()
6184 .mr(6)
6185 .nr(8)
6186 .kr(1)
6187 .sr(1)
6188 .m(6)
6189 .n(8)
6190 .k(k)
6191 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6192 }
6193 }
6194
6195 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
6196 TEST_REQUIRES_ARM_NEON_FMA;
6197 for (size_t k = 17; k < 16; k++) {
6198 GemmMicrokernelTester()
6199 .mr(6)
6200 .nr(8)
6201 .kr(1)
6202 .sr(1)
6203 .m(6)
6204 .n(8)
6205 .k(k)
6206 .a_stride(19)
6207 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6208 }
6209 }
6210
6211 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
6212 TEST_REQUIRES_ARM_NEON_FMA;
6213 for (size_t k = 17; k < 16; k++) {
6214 for (uint32_t m = 1; m <= 6; m++) {
6215 for (uint32_t n = 1; n <= 8; n++) {
6216 GemmMicrokernelTester()
6217 .mr(6)
6218 .nr(8)
6219 .kr(1)
6220 .sr(1)
6221 .m(m)
6222 .n(n)
6223 .k(k)
6224 .iterations(1)
6225 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6226 }
6227 }
6228 }
6229 }
6230
6231 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
6232 TEST_REQUIRES_ARM_NEON_FMA;
6233 for (size_t k = 24; k <= 80; k += 8) {
6234 GemmMicrokernelTester()
6235 .mr(6)
6236 .nr(8)
6237 .kr(1)
6238 .sr(1)
6239 .m(6)
6240 .n(8)
6241 .k(k)
6242 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6243 }
6244 }
6245
6246 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
6247 TEST_REQUIRES_ARM_NEON_FMA;
6248 for (size_t k = 24; k <= 80; k += 8) {
6249 GemmMicrokernelTester()
6250 .mr(6)
6251 .nr(8)
6252 .kr(1)
6253 .sr(1)
6254 .m(6)
6255 .n(8)
6256 .k(k)
6257 .a_stride(83)
6258 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6259 }
6260 }
6261
6262 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
6263 TEST_REQUIRES_ARM_NEON_FMA;
6264 for (size_t k = 24; k <= 80; k += 8) {
6265 for (uint32_t m = 1; m <= 6; m++) {
6266 for (uint32_t n = 1; n <= 8; n++) {
6267 GemmMicrokernelTester()
6268 .mr(6)
6269 .nr(8)
6270 .kr(1)
6271 .sr(1)
6272 .m(m)
6273 .n(n)
6274 .k(k)
6275 .iterations(1)
6276 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6277 }
6278 }
6279 }
6280 }
6281
6282 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
6283 TEST_REQUIRES_ARM_NEON_FMA;
6284 for (uint32_t n = 9; n < 16; n++) {
6285 for (size_t k = 1; k <= 40; k += 9) {
6286 GemmMicrokernelTester()
6287 .mr(6)
6288 .nr(8)
6289 .kr(1)
6290 .sr(1)
6291 .m(6)
6292 .n(8)
6293 .k(k)
6294 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6295 }
6296 }
6297 }
6298
6299 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
6300 TEST_REQUIRES_ARM_NEON_FMA;
6301 for (uint32_t n = 9; n < 16; n++) {
6302 for (size_t k = 1; k <= 40; k += 9) {
6303 GemmMicrokernelTester()
6304 .mr(6)
6305 .nr(8)
6306 .kr(1)
6307 .sr(1)
6308 .m(6)
6309 .n(8)
6310 .k(k)
6311 .cn_stride(11)
6312 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6313 }
6314 }
6315 }
6316
6317 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
6318 TEST_REQUIRES_ARM_NEON_FMA;
6319 for (uint32_t n = 9; n < 16; n++) {
6320 for (size_t k = 1; k <= 40; k += 9) {
6321 GemmMicrokernelTester()
6322 .mr(6)
6323 .nr(8)
6324 .kr(1)
6325 .sr(1)
6326 .m(6)
6327 .n(n)
6328 .k(k)
6329 .a_stride(43)
6330 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6331 }
6332 }
6333 }
6334
6335 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
6336 TEST_REQUIRES_ARM_NEON_FMA;
6337 for (uint32_t n = 9; n < 16; n++) {
6338 for (size_t k = 1; k <= 40; k += 9) {
6339 for (uint32_t m = 1; m <= 6; m++) {
6340 GemmMicrokernelTester()
6341 .mr(6)
6342 .nr(8)
6343 .kr(1)
6344 .sr(1)
6345 .m(m)
6346 .n(n)
6347 .k(k)
6348 .iterations(1)
6349 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6350 }
6351 }
6352 }
6353 }
6354
6355 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
6356 TEST_REQUIRES_ARM_NEON_FMA;
6357 for (uint32_t n = 16; n <= 24; n += 8) {
6358 for (size_t k = 1; k <= 40; k += 9) {
6359 GemmMicrokernelTester()
6360 .mr(6)
6361 .nr(8)
6362 .kr(1)
6363 .sr(1)
6364 .m(6)
6365 .n(8)
6366 .k(k)
6367 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6368 }
6369 }
6370 }
6371
6372 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
6373 TEST_REQUIRES_ARM_NEON_FMA;
6374 for (uint32_t n = 16; n <= 24; n += 8) {
6375 for (size_t k = 1; k <= 40; k += 9) {
6376 GemmMicrokernelTester()
6377 .mr(6)
6378 .nr(8)
6379 .kr(1)
6380 .sr(1)
6381 .m(6)
6382 .n(n)
6383 .k(k)
6384 .cn_stride(11)
6385 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6386 }
6387 }
6388 }
6389
6390 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
6391 TEST_REQUIRES_ARM_NEON_FMA;
6392 for (uint32_t n = 16; n <= 24; n += 8) {
6393 for (size_t k = 1; k <= 40; k += 9) {
6394 GemmMicrokernelTester()
6395 .mr(6)
6396 .nr(8)
6397 .kr(1)
6398 .sr(1)
6399 .m(6)
6400 .n(n)
6401 .k(k)
6402 .a_stride(43)
6403 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6404 }
6405 }
6406 }
6407
6408 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
6409 TEST_REQUIRES_ARM_NEON_FMA;
6410 for (uint32_t n = 16; n <= 24; n += 8) {
6411 for (size_t k = 1; k <= 40; k += 9) {
6412 for (uint32_t m = 1; m <= 6; m++) {
6413 GemmMicrokernelTester()
6414 .mr(6)
6415 .nr(8)
6416 .kr(1)
6417 .sr(1)
6418 .m(m)
6419 .n(n)
6420 .k(k)
6421 .iterations(1)
6422 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6423 }
6424 }
6425 }
6426 }
6427
6428 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
6429 TEST_REQUIRES_ARM_NEON_FMA;
6430 for (size_t k = 1; k <= 40; k += 9) {
6431 for (uint32_t m = 1; m <= 6; m++) {
6432 for (uint32_t n = 1; n <= 8; n++) {
6433 GemmMicrokernelTester()
6434 .mr(6)
6435 .nr(8)
6436 .kr(1)
6437 .sr(1)
6438 .m(m)
6439 .n(n)
6440 .k(k)
6441 .cm_stride(11)
6442 .iterations(1)
6443 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6444 }
6445 }
6446 }
6447 }
6448
6449 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
6450 TEST_REQUIRES_ARM_NEON_FMA;
6451 GemmMicrokernelTester()
6452 .mr(6)
6453 .nr(8)
6454 .kr(1)
6455 .sr(1)
6456 .m(6)
6457 .n(8)
6458 .k(8)
6459 .qmin(128)
6460 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6461 }
6462
6463 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
6464 TEST_REQUIRES_ARM_NEON_FMA;
6465 GemmMicrokernelTester()
6466 .mr(6)
6467 .nr(8)
6468 .kr(1)
6469 .sr(1)
6470 .m(6)
6471 .n(8)
6472 .k(8)
6473 .qmax(128)
6474 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6475 }
6476
6477 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
6478 TEST_REQUIRES_ARM_NEON_FMA;
6479 GemmMicrokernelTester()
6480 .mr(6)
6481 .nr(8)
6482 .kr(1)
6483 .sr(1)
6484 .m(6)
6485 .n(8)
6486 .k(8)
6487 .cm_stride(11)
6488 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
6489 }
6490#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6491
6492
6493#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6494 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8) {
6495 TEST_REQUIRES_ARM_NEON_FMA;
6496 GemmMicrokernelTester()
6497 .mr(6)
6498 .nr(8)
6499 .kr(1)
6500 .sr(1)
6501 .m(6)
6502 .n(8)
6503 .k(8)
6504 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6505 }
6506
6507 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, strided_cn) {
6508 TEST_REQUIRES_ARM_NEON_FMA;
6509 GemmMicrokernelTester()
6510 .mr(6)
6511 .nr(8)
6512 .kr(1)
6513 .sr(1)
6514 .m(6)
6515 .n(8)
6516 .k(8)
6517 .cn_stride(11)
6518 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6519 }
6520
6521 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_strided_a) {
6522 TEST_REQUIRES_ARM_NEON_FMA;
6523 GemmMicrokernelTester()
6524 .mr(6)
6525 .nr(8)
6526 .kr(1)
6527 .sr(1)
6528 .m(6)
6529 .n(8)
6530 .k(8)
6531 .a_stride(11)
6532 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6533 }
6534
6535 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile) {
6536 TEST_REQUIRES_ARM_NEON_FMA;
6537 for (uint32_t m = 1; m <= 6; m++) {
6538 for (uint32_t n = 1; n <= 8; n++) {
6539 GemmMicrokernelTester()
6540 .mr(6)
6541 .nr(8)
6542 .kr(1)
6543 .sr(1)
6544 .m(m)
6545 .n(n)
6546 .k(8)
6547 .iterations(1)
6548 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6549 }
6550 }
6551 }
6552
6553 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_m) {
6554 TEST_REQUIRES_ARM_NEON_FMA;
6555 for (uint32_t m = 1; m <= 6; m++) {
6556 GemmMicrokernelTester()
6557 .mr(6)
6558 .nr(8)
6559 .kr(1)
6560 .sr(1)
6561 .m(m)
6562 .n(8)
6563 .k(8)
6564 .iterations(1)
6565 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6566 }
6567 }
6568
6569 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_n) {
6570 TEST_REQUIRES_ARM_NEON_FMA;
6571 for (uint32_t n = 1; n <= 8; n++) {
6572 GemmMicrokernelTester()
6573 .mr(6)
6574 .nr(8)
6575 .kr(1)
6576 .sr(1)
6577 .m(6)
6578 .n(n)
6579 .k(8)
6580 .iterations(1)
6581 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6582 }
6583 }
6584
6585 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_16) {
6586 TEST_REQUIRES_ARM_NEON_FMA;
6587 GemmMicrokernelTester()
6588 .mr(6)
6589 .nr(8)
6590 .kr(1)
6591 .sr(1)
6592 .m(6)
6593 .n(8)
6594 .k(16)
6595 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6596 }
6597
6598 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_16_strided_a) {
6599 TEST_REQUIRES_ARM_NEON_FMA;
6600 GemmMicrokernelTester()
6601 .mr(6)
6602 .nr(8)
6603 .kr(1)
6604 .sr(1)
6605 .m(6)
6606 .n(8)
6607 .k(16)
6608 .a_stride(19)
6609 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6610 }
6611
6612 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_eq_16_subtile) {
6613 TEST_REQUIRES_ARM_NEON_FMA;
6614 for (uint32_t m = 1; m <= 6; m++) {
6615 for (uint32_t n = 1; n <= 8; n++) {
6616 GemmMicrokernelTester()
6617 .mr(6)
6618 .nr(8)
6619 .kr(1)
6620 .sr(1)
6621 .m(m)
6622 .n(n)
6623 .k(16)
6624 .iterations(1)
6625 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6626 }
6627 }
6628 }
6629
6630 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_lt_16) {
6631 TEST_REQUIRES_ARM_NEON_FMA;
6632 for (size_t k = 1; k < 16; k++) {
6633 GemmMicrokernelTester()
6634 .mr(6)
6635 .nr(8)
6636 .kr(1)
6637 .sr(1)
6638 .m(6)
6639 .n(8)
6640 .k(k)
6641 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6642 }
6643 }
6644
6645 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_lt_16_strided_a) {
6646 TEST_REQUIRES_ARM_NEON_FMA;
6647 for (size_t k = 1; k < 16; k++) {
6648 GemmMicrokernelTester()
6649 .mr(6)
6650 .nr(8)
6651 .kr(1)
6652 .sr(1)
6653 .m(6)
6654 .n(8)
6655 .k(k)
6656 .a_stride(19)
6657 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6658 }
6659 }
6660
6661 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_lt_16_subtile) {
6662 TEST_REQUIRES_ARM_NEON_FMA;
6663 for (size_t k = 1; k < 16; k++) {
6664 for (uint32_t m = 1; m <= 6; m++) {
6665 for (uint32_t n = 1; n <= 8; n++) {
6666 GemmMicrokernelTester()
6667 .mr(6)
6668 .nr(8)
6669 .kr(1)
6670 .sr(1)
6671 .m(m)
6672 .n(n)
6673 .k(k)
6674 .iterations(1)
6675 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6676 }
6677 }
6678 }
6679 }
6680
6681 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_gt_16) {
6682 TEST_REQUIRES_ARM_NEON_FMA;
6683 for (size_t k = 17; k < 16; k++) {
6684 GemmMicrokernelTester()
6685 .mr(6)
6686 .nr(8)
6687 .kr(1)
6688 .sr(1)
6689 .m(6)
6690 .n(8)
6691 .k(k)
6692 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6693 }
6694 }
6695
6696 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_gt_8_strided_a) {
6697 TEST_REQUIRES_ARM_NEON_FMA;
6698 for (size_t k = 17; k < 16; k++) {
6699 GemmMicrokernelTester()
6700 .mr(6)
6701 .nr(8)
6702 .kr(1)
6703 .sr(1)
6704 .m(6)
6705 .n(8)
6706 .k(k)
6707 .a_stride(19)
6708 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6709 }
6710 }
6711
6712 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_gt_8_subtile) {
6713 TEST_REQUIRES_ARM_NEON_FMA;
6714 for (size_t k = 17; k < 16; k++) {
6715 for (uint32_t m = 1; m <= 6; m++) {
6716 for (uint32_t n = 1; n <= 8; n++) {
6717 GemmMicrokernelTester()
6718 .mr(6)
6719 .nr(8)
6720 .kr(1)
6721 .sr(1)
6722 .m(m)
6723 .n(n)
6724 .k(k)
6725 .iterations(1)
6726 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6727 }
6728 }
6729 }
6730 }
6731
6732 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_div_8) {
6733 TEST_REQUIRES_ARM_NEON_FMA;
6734 for (size_t k = 24; k <= 80; k += 8) {
6735 GemmMicrokernelTester()
6736 .mr(6)
6737 .nr(8)
6738 .kr(1)
6739 .sr(1)
6740 .m(6)
6741 .n(8)
6742 .k(k)
6743 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6744 }
6745 }
6746
6747 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_div_8_strided_a) {
6748 TEST_REQUIRES_ARM_NEON_FMA;
6749 for (size_t k = 24; k <= 80; k += 8) {
6750 GemmMicrokernelTester()
6751 .mr(6)
6752 .nr(8)
6753 .kr(1)
6754 .sr(1)
6755 .m(6)
6756 .n(8)
6757 .k(k)
6758 .a_stride(83)
6759 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6760 }
6761 }
6762
6763 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, k_div_8_subtile) {
6764 TEST_REQUIRES_ARM_NEON_FMA;
6765 for (size_t k = 24; k <= 80; k += 8) {
6766 for (uint32_t m = 1; m <= 6; m++) {
6767 for (uint32_t n = 1; n <= 8; n++) {
6768 GemmMicrokernelTester()
6769 .mr(6)
6770 .nr(8)
6771 .kr(1)
6772 .sr(1)
6773 .m(m)
6774 .n(n)
6775 .k(k)
6776 .iterations(1)
6777 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6778 }
6779 }
6780 }
6781 }
6782
6783 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8) {
6784 TEST_REQUIRES_ARM_NEON_FMA;
6785 for (uint32_t n = 9; n < 16; n++) {
6786 for (size_t k = 1; k <= 40; k += 9) {
6787 GemmMicrokernelTester()
6788 .mr(6)
6789 .nr(8)
6790 .kr(1)
6791 .sr(1)
6792 .m(6)
6793 .n(8)
6794 .k(k)
6795 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6796 }
6797 }
6798 }
6799
6800 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_cn) {
6801 TEST_REQUIRES_ARM_NEON_FMA;
6802 for (uint32_t n = 9; n < 16; n++) {
6803 for (size_t k = 1; k <= 40; k += 9) {
6804 GemmMicrokernelTester()
6805 .mr(6)
6806 .nr(8)
6807 .kr(1)
6808 .sr(1)
6809 .m(6)
6810 .n(8)
6811 .k(k)
6812 .cn_stride(11)
6813 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6814 }
6815 }
6816 }
6817
6818 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_a) {
6819 TEST_REQUIRES_ARM_NEON_FMA;
6820 for (uint32_t n = 9; n < 16; n++) {
6821 for (size_t k = 1; k <= 40; k += 9) {
6822 GemmMicrokernelTester()
6823 .mr(6)
6824 .nr(8)
6825 .kr(1)
6826 .sr(1)
6827 .m(6)
6828 .n(n)
6829 .k(k)
6830 .a_stride(43)
6831 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6832 }
6833 }
6834 }
6835
6836 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_gt_8_subtile) {
6837 TEST_REQUIRES_ARM_NEON_FMA;
6838 for (uint32_t n = 9; n < 16; n++) {
6839 for (size_t k = 1; k <= 40; k += 9) {
6840 for (uint32_t m = 1; m <= 6; m++) {
6841 GemmMicrokernelTester()
6842 .mr(6)
6843 .nr(8)
6844 .kr(1)
6845 .sr(1)
6846 .m(m)
6847 .n(n)
6848 .k(k)
6849 .iterations(1)
6850 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6851 }
6852 }
6853 }
6854 }
6855
6856 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8) {
6857 TEST_REQUIRES_ARM_NEON_FMA;
6858 for (uint32_t n = 16; n <= 24; n += 8) {
6859 for (size_t k = 1; k <= 40; k += 9) {
6860 GemmMicrokernelTester()
6861 .mr(6)
6862 .nr(8)
6863 .kr(1)
6864 .sr(1)
6865 .m(6)
6866 .n(8)
6867 .k(k)
6868 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6869 }
6870 }
6871 }
6872
6873 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_cn) {
6874 TEST_REQUIRES_ARM_NEON_FMA;
6875 for (uint32_t n = 16; n <= 24; n += 8) {
6876 for (size_t k = 1; k <= 40; k += 9) {
6877 GemmMicrokernelTester()
6878 .mr(6)
6879 .nr(8)
6880 .kr(1)
6881 .sr(1)
6882 .m(6)
6883 .n(n)
6884 .k(k)
6885 .cn_stride(11)
6886 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6887 }
6888 }
6889 }
6890
6891 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_a) {
6892 TEST_REQUIRES_ARM_NEON_FMA;
6893 for (uint32_t n = 16; n <= 24; n += 8) {
6894 for (size_t k = 1; k <= 40; k += 9) {
6895 GemmMicrokernelTester()
6896 .mr(6)
6897 .nr(8)
6898 .kr(1)
6899 .sr(1)
6900 .m(6)
6901 .n(n)
6902 .k(k)
6903 .a_stride(43)
6904 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6905 }
6906 }
6907 }
6908
6909 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, n_div_8_subtile) {
6910 TEST_REQUIRES_ARM_NEON_FMA;
6911 for (uint32_t n = 16; n <= 24; n += 8) {
6912 for (size_t k = 1; k <= 40; k += 9) {
6913 for (uint32_t m = 1; m <= 6; m++) {
6914 GemmMicrokernelTester()
6915 .mr(6)
6916 .nr(8)
6917 .kr(1)
6918 .sr(1)
6919 .m(m)
6920 .n(n)
6921 .k(k)
6922 .iterations(1)
6923 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6924 }
6925 }
6926 }
6927 }
6928
6929 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, strided_cm_subtile) {
6930 TEST_REQUIRES_ARM_NEON_FMA;
6931 for (size_t k = 1; k <= 40; k += 9) {
6932 for (uint32_t m = 1; m <= 6; m++) {
6933 for (uint32_t n = 1; n <= 8; n++) {
6934 GemmMicrokernelTester()
6935 .mr(6)
6936 .nr(8)
6937 .kr(1)
6938 .sr(1)
6939 .m(m)
6940 .n(n)
6941 .k(k)
6942 .cm_stride(11)
6943 .iterations(1)
6944 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6945 }
6946 }
6947 }
6948 }
6949
6950 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, qmin) {
6951 TEST_REQUIRES_ARM_NEON_FMA;
6952 GemmMicrokernelTester()
6953 .mr(6)
6954 .nr(8)
6955 .kr(1)
6956 .sr(1)
6957 .m(6)
6958 .n(8)
6959 .k(8)
6960 .qmin(128)
6961 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6962 }
6963
6964 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, qmax) {
6965 TEST_REQUIRES_ARM_NEON_FMA;
6966 GemmMicrokernelTester()
6967 .mr(6)
6968 .nr(8)
6969 .kr(1)
6970 .sr(1)
6971 .m(6)
6972 .n(8)
6973 .k(8)
6974 .qmax(128)
6975 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6976 }
6977
6978 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_IOS, strided_cm) {
6979 TEST_REQUIRES_ARM_NEON_FMA;
6980 GemmMicrokernelTester()
6981 .mr(6)
6982 .nr(8)
6983 .kr(1)
6984 .sr(1)
6985 .m(6)
6986 .n(8)
6987 .k(8)
6988 .cm_stride(11)
6989 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ios);
6990 }
6991#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6992
6993
6994#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6995 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
6996 TEST_REQUIRES_ARM_NEON_FMA;
6997 GemmMicrokernelTester()
6998 .mr(1)
6999 .nr(12)
7000 .kr(1)
7001 .sr(1)
7002 .m(1)
7003 .n(12)
7004 .k(4)
7005 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7006 }
7007
7008 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
7009 TEST_REQUIRES_ARM_NEON_FMA;
7010 GemmMicrokernelTester()
7011 .mr(1)
7012 .nr(12)
7013 .kr(1)
7014 .sr(1)
7015 .m(1)
7016 .n(12)
7017 .k(4)
7018 .cn_stride(17)
7019 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7020 }
7021
7022 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
7023 TEST_REQUIRES_ARM_NEON_FMA;
7024 GemmMicrokernelTester()
7025 .mr(1)
7026 .nr(12)
7027 .kr(1)
7028 .sr(1)
7029 .m(1)
7030 .n(12)
7031 .k(4)
7032 .a_stride(7)
7033 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7034 }
7035
7036 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
7037 TEST_REQUIRES_ARM_NEON_FMA;
7038 for (uint32_t m = 1; m <= 1; m++) {
7039 for (uint32_t n = 1; n <= 12; n++) {
7040 GemmMicrokernelTester()
7041 .mr(1)
7042 .nr(12)
7043 .kr(1)
7044 .sr(1)
7045 .m(m)
7046 .n(n)
7047 .k(4)
7048 .iterations(1)
7049 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7050 }
7051 }
7052 }
7053
7054 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
7055 TEST_REQUIRES_ARM_NEON_FMA;
7056 for (uint32_t m = 1; m <= 1; m++) {
7057 GemmMicrokernelTester()
7058 .mr(1)
7059 .nr(12)
7060 .kr(1)
7061 .sr(1)
7062 .m(m)
7063 .n(12)
7064 .k(4)
7065 .iterations(1)
7066 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7067 }
7068 }
7069
7070 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
7071 TEST_REQUIRES_ARM_NEON_FMA;
7072 for (uint32_t n = 1; n <= 12; n++) {
7073 GemmMicrokernelTester()
7074 .mr(1)
7075 .nr(12)
7076 .kr(1)
7077 .sr(1)
7078 .m(1)
7079 .n(n)
7080 .k(4)
7081 .iterations(1)
7082 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7083 }
7084 }
7085
7086 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
7087 TEST_REQUIRES_ARM_NEON_FMA;
7088 GemmMicrokernelTester()
7089 .mr(1)
7090 .nr(12)
7091 .kr(1)
7092 .sr(1)
7093 .m(1)
7094 .n(12)
7095 .k(8)
7096 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7097 }
7098
7099 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
7100 TEST_REQUIRES_ARM_NEON_FMA;
7101 GemmMicrokernelTester()
7102 .mr(1)
7103 .nr(12)
7104 .kr(1)
7105 .sr(1)
7106 .m(1)
7107 .n(12)
7108 .k(8)
7109 .a_stride(11)
7110 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7111 }
7112
7113 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
7114 TEST_REQUIRES_ARM_NEON_FMA;
7115 for (uint32_t m = 1; m <= 1; m++) {
7116 for (uint32_t n = 1; n <= 12; n++) {
7117 GemmMicrokernelTester()
7118 .mr(1)
7119 .nr(12)
7120 .kr(1)
7121 .sr(1)
7122 .m(m)
7123 .n(n)
7124 .k(8)
7125 .iterations(1)
7126 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7127 }
7128 }
7129 }
7130
7131 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
7132 TEST_REQUIRES_ARM_NEON_FMA;
7133 for (size_t k = 1; k < 8; k++) {
7134 GemmMicrokernelTester()
7135 .mr(1)
7136 .nr(12)
7137 .kr(1)
7138 .sr(1)
7139 .m(1)
7140 .n(12)
7141 .k(k)
7142 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7143 }
7144 }
7145
7146 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
7147 TEST_REQUIRES_ARM_NEON_FMA;
7148 for (size_t k = 1; k < 8; k++) {
7149 GemmMicrokernelTester()
7150 .mr(1)
7151 .nr(12)
7152 .kr(1)
7153 .sr(1)
7154 .m(1)
7155 .n(12)
7156 .k(k)
7157 .a_stride(11)
7158 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7159 }
7160 }
7161
7162 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
7163 TEST_REQUIRES_ARM_NEON_FMA;
7164 for (size_t k = 1; k < 8; k++) {
7165 for (uint32_t m = 1; m <= 1; m++) {
7166 for (uint32_t n = 1; n <= 12; n++) {
7167 GemmMicrokernelTester()
7168 .mr(1)
7169 .nr(12)
7170 .kr(1)
7171 .sr(1)
7172 .m(m)
7173 .n(n)
7174 .k(k)
7175 .iterations(1)
7176 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7177 }
7178 }
7179 }
7180 }
7181
7182 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
7183 TEST_REQUIRES_ARM_NEON_FMA;
7184 for (size_t k = 9; k < 8; k++) {
7185 GemmMicrokernelTester()
7186 .mr(1)
7187 .nr(12)
7188 .kr(1)
7189 .sr(1)
7190 .m(1)
7191 .n(12)
7192 .k(k)
7193 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7194 }
7195 }
7196
7197 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
7198 TEST_REQUIRES_ARM_NEON_FMA;
7199 for (size_t k = 9; k < 8; k++) {
7200 GemmMicrokernelTester()
7201 .mr(1)
7202 .nr(12)
7203 .kr(1)
7204 .sr(1)
7205 .m(1)
7206 .n(12)
7207 .k(k)
7208 .a_stride(11)
7209 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7210 }
7211 }
7212
7213 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
7214 TEST_REQUIRES_ARM_NEON_FMA;
7215 for (size_t k = 9; k < 8; k++) {
7216 for (uint32_t m = 1; m <= 1; m++) {
7217 for (uint32_t n = 1; n <= 12; n++) {
7218 GemmMicrokernelTester()
7219 .mr(1)
7220 .nr(12)
7221 .kr(1)
7222 .sr(1)
7223 .m(m)
7224 .n(n)
7225 .k(k)
7226 .iterations(1)
7227 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7228 }
7229 }
7230 }
7231 }
7232
7233 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
7234 TEST_REQUIRES_ARM_NEON_FMA;
7235 for (size_t k = 12; k <= 40; k += 4) {
7236 GemmMicrokernelTester()
7237 .mr(1)
7238 .nr(12)
7239 .kr(1)
7240 .sr(1)
7241 .m(1)
7242 .n(12)
7243 .k(k)
7244 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7245 }
7246 }
7247
7248 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
7249 TEST_REQUIRES_ARM_NEON_FMA;
7250 for (size_t k = 12; k <= 40; k += 4) {
7251 GemmMicrokernelTester()
7252 .mr(1)
7253 .nr(12)
7254 .kr(1)
7255 .sr(1)
7256 .m(1)
7257 .n(12)
7258 .k(k)
7259 .a_stride(43)
7260 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7261 }
7262 }
7263
7264 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
7265 TEST_REQUIRES_ARM_NEON_FMA;
7266 for (size_t k = 12; k <= 40; k += 4) {
7267 for (uint32_t m = 1; m <= 1; m++) {
7268 for (uint32_t n = 1; n <= 12; n++) {
7269 GemmMicrokernelTester()
7270 .mr(1)
7271 .nr(12)
7272 .kr(1)
7273 .sr(1)
7274 .m(m)
7275 .n(n)
7276 .k(k)
7277 .iterations(1)
7278 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7279 }
7280 }
7281 }
7282 }
7283
7284 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
7285 TEST_REQUIRES_ARM_NEON_FMA;
7286 for (uint32_t n = 13; n < 24; n++) {
7287 for (size_t k = 1; k <= 20; k += 5) {
7288 GemmMicrokernelTester()
7289 .mr(1)
7290 .nr(12)
7291 .kr(1)
7292 .sr(1)
7293 .m(1)
7294 .n(12)
7295 .k(k)
7296 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7297 }
7298 }
7299 }
7300
7301 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
7302 TEST_REQUIRES_ARM_NEON_FMA;
7303 for (uint32_t n = 13; n < 24; n++) {
7304 for (size_t k = 1; k <= 20; k += 5) {
7305 GemmMicrokernelTester()
7306 .mr(1)
7307 .nr(12)
7308 .kr(1)
7309 .sr(1)
7310 .m(1)
7311 .n(12)
7312 .k(k)
7313 .cn_stride(17)
7314 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7315 }
7316 }
7317 }
7318
7319 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
7320 TEST_REQUIRES_ARM_NEON_FMA;
7321 for (uint32_t n = 13; n < 24; n++) {
7322 for (size_t k = 1; k <= 20; k += 5) {
7323 GemmMicrokernelTester()
7324 .mr(1)
7325 .nr(12)
7326 .kr(1)
7327 .sr(1)
7328 .m(1)
7329 .n(n)
7330 .k(k)
7331 .a_stride(23)
7332 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7333 }
7334 }
7335 }
7336
7337 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
7338 TEST_REQUIRES_ARM_NEON_FMA;
7339 for (uint32_t n = 13; n < 24; n++) {
7340 for (size_t k = 1; k <= 20; k += 5) {
7341 for (uint32_t m = 1; m <= 1; m++) {
7342 GemmMicrokernelTester()
7343 .mr(1)
7344 .nr(12)
7345 .kr(1)
7346 .sr(1)
7347 .m(m)
7348 .n(n)
7349 .k(k)
7350 .iterations(1)
7351 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7352 }
7353 }
7354 }
7355 }
7356
7357 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
7358 TEST_REQUIRES_ARM_NEON_FMA;
7359 for (uint32_t n = 24; n <= 36; n += 12) {
7360 for (size_t k = 1; k <= 20; k += 5) {
7361 GemmMicrokernelTester()
7362 .mr(1)
7363 .nr(12)
7364 .kr(1)
7365 .sr(1)
7366 .m(1)
7367 .n(12)
7368 .k(k)
7369 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7370 }
7371 }
7372 }
7373
7374 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
7375 TEST_REQUIRES_ARM_NEON_FMA;
7376 for (uint32_t n = 24; n <= 36; n += 12) {
7377 for (size_t k = 1; k <= 20; k += 5) {
7378 GemmMicrokernelTester()
7379 .mr(1)
7380 .nr(12)
7381 .kr(1)
7382 .sr(1)
7383 .m(1)
7384 .n(n)
7385 .k(k)
7386 .cn_stride(17)
7387 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7388 }
7389 }
7390 }
7391
7392 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
7393 TEST_REQUIRES_ARM_NEON_FMA;
7394 for (uint32_t n = 24; n <= 36; n += 12) {
7395 for (size_t k = 1; k <= 20; k += 5) {
7396 GemmMicrokernelTester()
7397 .mr(1)
7398 .nr(12)
7399 .kr(1)
7400 .sr(1)
7401 .m(1)
7402 .n(n)
7403 .k(k)
7404 .a_stride(23)
7405 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7406 }
7407 }
7408 }
7409
7410 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
7411 TEST_REQUIRES_ARM_NEON_FMA;
7412 for (uint32_t n = 24; n <= 36; n += 12) {
7413 for (size_t k = 1; k <= 20; k += 5) {
7414 for (uint32_t m = 1; m <= 1; m++) {
7415 GemmMicrokernelTester()
7416 .mr(1)
7417 .nr(12)
7418 .kr(1)
7419 .sr(1)
7420 .m(m)
7421 .n(n)
7422 .k(k)
7423 .iterations(1)
7424 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7425 }
7426 }
7427 }
7428 }
7429
7430 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
7431 TEST_REQUIRES_ARM_NEON_FMA;
7432 for (size_t k = 1; k <= 20; k += 5) {
7433 for (uint32_t m = 1; m <= 1; m++) {
7434 for (uint32_t n = 1; n <= 12; n++) {
7435 GemmMicrokernelTester()
7436 .mr(1)
7437 .nr(12)
7438 .kr(1)
7439 .sr(1)
7440 .m(m)
7441 .n(n)
7442 .k(k)
7443 .cm_stride(17)
7444 .iterations(1)
7445 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7446 }
7447 }
7448 }
7449 }
7450
7451 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
7452 TEST_REQUIRES_ARM_NEON_FMA;
7453 GemmMicrokernelTester()
7454 .mr(1)
7455 .nr(12)
7456 .kr(1)
7457 .sr(1)
7458 .m(1)
7459 .n(12)
7460 .k(4)
7461 .qmin(128)
7462 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7463 }
7464
7465 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
7466 TEST_REQUIRES_ARM_NEON_FMA;
7467 GemmMicrokernelTester()
7468 .mr(1)
7469 .nr(12)
7470 .kr(1)
7471 .sr(1)
7472 .m(1)
7473 .n(12)
7474 .k(4)
7475 .qmax(128)
7476 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7477 }
7478
7479 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
7480 TEST_REQUIRES_ARM_NEON_FMA;
7481 GemmMicrokernelTester()
7482 .mr(1)
7483 .nr(12)
7484 .kr(1)
7485 .sr(1)
7486 .m(1)
7487 .n(12)
7488 .k(4)
7489 .cm_stride(17)
7490 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
7491 }
7492#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7493
7494
7495#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7496 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
7497 TEST_REQUIRES_ARM_NEON_FMA;
7498 GemmMicrokernelTester()
7499 .mr(4)
7500 .nr(12)
7501 .kr(1)
7502 .sr(1)
7503 .m(4)
7504 .n(12)
7505 .k(4)
7506 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7507 }
7508
7509 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
7510 TEST_REQUIRES_ARM_NEON_FMA;
7511 GemmMicrokernelTester()
7512 .mr(4)
7513 .nr(12)
7514 .kr(1)
7515 .sr(1)
7516 .m(4)
7517 .n(12)
7518 .k(4)
7519 .cn_stride(17)
7520 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7521 }
7522
7523 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
7524 TEST_REQUIRES_ARM_NEON_FMA;
7525 GemmMicrokernelTester()
7526 .mr(4)
7527 .nr(12)
7528 .kr(1)
7529 .sr(1)
7530 .m(4)
7531 .n(12)
7532 .k(4)
7533 .a_stride(7)
7534 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7535 }
7536
7537 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
7538 TEST_REQUIRES_ARM_NEON_FMA;
7539 for (uint32_t m = 1; m <= 4; m++) {
7540 for (uint32_t n = 1; n <= 12; n++) {
7541 GemmMicrokernelTester()
7542 .mr(4)
7543 .nr(12)
7544 .kr(1)
7545 .sr(1)
7546 .m(m)
7547 .n(n)
7548 .k(4)
7549 .iterations(1)
7550 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7551 }
7552 }
7553 }
7554
7555 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
7556 TEST_REQUIRES_ARM_NEON_FMA;
7557 for (uint32_t m = 1; m <= 4; m++) {
7558 GemmMicrokernelTester()
7559 .mr(4)
7560 .nr(12)
7561 .kr(1)
7562 .sr(1)
7563 .m(m)
7564 .n(12)
7565 .k(4)
7566 .iterations(1)
7567 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7568 }
7569 }
7570
7571 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
7572 TEST_REQUIRES_ARM_NEON_FMA;
7573 for (uint32_t n = 1; n <= 12; n++) {
7574 GemmMicrokernelTester()
7575 .mr(4)
7576 .nr(12)
7577 .kr(1)
7578 .sr(1)
7579 .m(4)
7580 .n(n)
7581 .k(4)
7582 .iterations(1)
7583 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7584 }
7585 }
7586
7587 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
7588 TEST_REQUIRES_ARM_NEON_FMA;
7589 GemmMicrokernelTester()
7590 .mr(4)
7591 .nr(12)
7592 .kr(1)
7593 .sr(1)
7594 .m(4)
7595 .n(12)
7596 .k(8)
7597 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7598 }
7599
7600 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
7601 TEST_REQUIRES_ARM_NEON_FMA;
7602 GemmMicrokernelTester()
7603 .mr(4)
7604 .nr(12)
7605 .kr(1)
7606 .sr(1)
7607 .m(4)
7608 .n(12)
7609 .k(8)
7610 .a_stride(11)
7611 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7612 }
7613
7614 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
7615 TEST_REQUIRES_ARM_NEON_FMA;
7616 for (uint32_t m = 1; m <= 4; m++) {
7617 for (uint32_t n = 1; n <= 12; n++) {
7618 GemmMicrokernelTester()
7619 .mr(4)
7620 .nr(12)
7621 .kr(1)
7622 .sr(1)
7623 .m(m)
7624 .n(n)
7625 .k(8)
7626 .iterations(1)
7627 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7628 }
7629 }
7630 }
7631
7632 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
7633 TEST_REQUIRES_ARM_NEON_FMA;
7634 for (size_t k = 1; k < 8; k++) {
7635 GemmMicrokernelTester()
7636 .mr(4)
7637 .nr(12)
7638 .kr(1)
7639 .sr(1)
7640 .m(4)
7641 .n(12)
7642 .k(k)
7643 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7644 }
7645 }
7646
7647 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
7648 TEST_REQUIRES_ARM_NEON_FMA;
7649 for (size_t k = 1; k < 8; k++) {
7650 GemmMicrokernelTester()
7651 .mr(4)
7652 .nr(12)
7653 .kr(1)
7654 .sr(1)
7655 .m(4)
7656 .n(12)
7657 .k(k)
7658 .a_stride(11)
7659 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7660 }
7661 }
7662
7663 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
7664 TEST_REQUIRES_ARM_NEON_FMA;
7665 for (size_t k = 1; k < 8; k++) {
7666 for (uint32_t m = 1; m <= 4; m++) {
7667 for (uint32_t n = 1; n <= 12; n++) {
7668 GemmMicrokernelTester()
7669 .mr(4)
7670 .nr(12)
7671 .kr(1)
7672 .sr(1)
7673 .m(m)
7674 .n(n)
7675 .k(k)
7676 .iterations(1)
7677 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7678 }
7679 }
7680 }
7681 }
7682
7683 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
7684 TEST_REQUIRES_ARM_NEON_FMA;
7685 for (size_t k = 9; k < 8; k++) {
7686 GemmMicrokernelTester()
7687 .mr(4)
7688 .nr(12)
7689 .kr(1)
7690 .sr(1)
7691 .m(4)
7692 .n(12)
7693 .k(k)
7694 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7695 }
7696 }
7697
7698 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
7699 TEST_REQUIRES_ARM_NEON_FMA;
7700 for (size_t k = 9; k < 8; k++) {
7701 GemmMicrokernelTester()
7702 .mr(4)
7703 .nr(12)
7704 .kr(1)
7705 .sr(1)
7706 .m(4)
7707 .n(12)
7708 .k(k)
7709 .a_stride(11)
7710 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7711 }
7712 }
7713
7714 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
7715 TEST_REQUIRES_ARM_NEON_FMA;
7716 for (size_t k = 9; k < 8; k++) {
7717 for (uint32_t m = 1; m <= 4; m++) {
7718 for (uint32_t n = 1; n <= 12; n++) {
7719 GemmMicrokernelTester()
7720 .mr(4)
7721 .nr(12)
7722 .kr(1)
7723 .sr(1)
7724 .m(m)
7725 .n(n)
7726 .k(k)
7727 .iterations(1)
7728 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7729 }
7730 }
7731 }
7732 }
7733
7734 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
7735 TEST_REQUIRES_ARM_NEON_FMA;
7736 for (size_t k = 12; k <= 40; k += 4) {
7737 GemmMicrokernelTester()
7738 .mr(4)
7739 .nr(12)
7740 .kr(1)
7741 .sr(1)
7742 .m(4)
7743 .n(12)
7744 .k(k)
7745 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7746 }
7747 }
7748
7749 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
7750 TEST_REQUIRES_ARM_NEON_FMA;
7751 for (size_t k = 12; k <= 40; k += 4) {
7752 GemmMicrokernelTester()
7753 .mr(4)
7754 .nr(12)
7755 .kr(1)
7756 .sr(1)
7757 .m(4)
7758 .n(12)
7759 .k(k)
7760 .a_stride(43)
7761 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7762 }
7763 }
7764
7765 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
7766 TEST_REQUIRES_ARM_NEON_FMA;
7767 for (size_t k = 12; k <= 40; k += 4) {
7768 for (uint32_t m = 1; m <= 4; m++) {
7769 for (uint32_t n = 1; n <= 12; n++) {
7770 GemmMicrokernelTester()
7771 .mr(4)
7772 .nr(12)
7773 .kr(1)
7774 .sr(1)
7775 .m(m)
7776 .n(n)
7777 .k(k)
7778 .iterations(1)
7779 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7780 }
7781 }
7782 }
7783 }
7784
7785 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
7786 TEST_REQUIRES_ARM_NEON_FMA;
7787 for (uint32_t n = 13; n < 24; n++) {
7788 for (size_t k = 1; k <= 20; k += 5) {
7789 GemmMicrokernelTester()
7790 .mr(4)
7791 .nr(12)
7792 .kr(1)
7793 .sr(1)
7794 .m(4)
7795 .n(12)
7796 .k(k)
7797 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7798 }
7799 }
7800 }
7801
7802 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
7803 TEST_REQUIRES_ARM_NEON_FMA;
7804 for (uint32_t n = 13; n < 24; n++) {
7805 for (size_t k = 1; k <= 20; k += 5) {
7806 GemmMicrokernelTester()
7807 .mr(4)
7808 .nr(12)
7809 .kr(1)
7810 .sr(1)
7811 .m(4)
7812 .n(12)
7813 .k(k)
7814 .cn_stride(17)
7815 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7816 }
7817 }
7818 }
7819
7820 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
7821 TEST_REQUIRES_ARM_NEON_FMA;
7822 for (uint32_t n = 13; n < 24; n++) {
7823 for (size_t k = 1; k <= 20; k += 5) {
7824 GemmMicrokernelTester()
7825 .mr(4)
7826 .nr(12)
7827 .kr(1)
7828 .sr(1)
7829 .m(4)
7830 .n(n)
7831 .k(k)
7832 .a_stride(23)
7833 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7834 }
7835 }
7836 }
7837
7838 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
7839 TEST_REQUIRES_ARM_NEON_FMA;
7840 for (uint32_t n = 13; n < 24; n++) {
7841 for (size_t k = 1; k <= 20; k += 5) {
7842 for (uint32_t m = 1; m <= 4; m++) {
7843 GemmMicrokernelTester()
7844 .mr(4)
7845 .nr(12)
7846 .kr(1)
7847 .sr(1)
7848 .m(m)
7849 .n(n)
7850 .k(k)
7851 .iterations(1)
7852 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7853 }
7854 }
7855 }
7856 }
7857
7858 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
7859 TEST_REQUIRES_ARM_NEON_FMA;
7860 for (uint32_t n = 24; n <= 36; n += 12) {
7861 for (size_t k = 1; k <= 20; k += 5) {
7862 GemmMicrokernelTester()
7863 .mr(4)
7864 .nr(12)
7865 .kr(1)
7866 .sr(1)
7867 .m(4)
7868 .n(12)
7869 .k(k)
7870 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7871 }
7872 }
7873 }
7874
7875 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
7876 TEST_REQUIRES_ARM_NEON_FMA;
7877 for (uint32_t n = 24; n <= 36; n += 12) {
7878 for (size_t k = 1; k <= 20; k += 5) {
7879 GemmMicrokernelTester()
7880 .mr(4)
7881 .nr(12)
7882 .kr(1)
7883 .sr(1)
7884 .m(4)
7885 .n(n)
7886 .k(k)
7887 .cn_stride(17)
7888 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7889 }
7890 }
7891 }
7892
7893 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
7894 TEST_REQUIRES_ARM_NEON_FMA;
7895 for (uint32_t n = 24; n <= 36; n += 12) {
7896 for (size_t k = 1; k <= 20; k += 5) {
7897 GemmMicrokernelTester()
7898 .mr(4)
7899 .nr(12)
7900 .kr(1)
7901 .sr(1)
7902 .m(4)
7903 .n(n)
7904 .k(k)
7905 .a_stride(23)
7906 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7907 }
7908 }
7909 }
7910
7911 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
7912 TEST_REQUIRES_ARM_NEON_FMA;
7913 for (uint32_t n = 24; n <= 36; n += 12) {
7914 for (size_t k = 1; k <= 20; k += 5) {
7915 for (uint32_t m = 1; m <= 4; m++) {
7916 GemmMicrokernelTester()
7917 .mr(4)
7918 .nr(12)
7919 .kr(1)
7920 .sr(1)
7921 .m(m)
7922 .n(n)
7923 .k(k)
7924 .iterations(1)
7925 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7926 }
7927 }
7928 }
7929 }
7930
7931 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
7932 TEST_REQUIRES_ARM_NEON_FMA;
7933 for (size_t k = 1; k <= 20; k += 5) {
7934 for (uint32_t m = 1; m <= 4; m++) {
7935 for (uint32_t n = 1; n <= 12; n++) {
7936 GemmMicrokernelTester()
7937 .mr(4)
7938 .nr(12)
7939 .kr(1)
7940 .sr(1)
7941 .m(m)
7942 .n(n)
7943 .k(k)
7944 .cm_stride(17)
7945 .iterations(1)
7946 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7947 }
7948 }
7949 }
7950 }
7951
7952 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
7953 TEST_REQUIRES_ARM_NEON_FMA;
7954 GemmMicrokernelTester()
7955 .mr(4)
7956 .nr(12)
7957 .kr(1)
7958 .sr(1)
7959 .m(4)
7960 .n(12)
7961 .k(4)
7962 .qmin(128)
7963 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7964 }
7965
7966 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
7967 TEST_REQUIRES_ARM_NEON_FMA;
7968 GemmMicrokernelTester()
7969 .mr(4)
7970 .nr(12)
7971 .kr(1)
7972 .sr(1)
7973 .m(4)
7974 .n(12)
7975 .k(4)
7976 .qmax(128)
7977 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7978 }
7979
7980 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
7981 TEST_REQUIRES_ARM_NEON_FMA;
7982 GemmMicrokernelTester()
7983 .mr(4)
7984 .nr(12)
7985 .kr(1)
7986 .sr(1)
7987 .m(4)
7988 .n(12)
7989 .k(4)
7990 .cm_stride(17)
7991 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
7992 }
7993#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7994
7995
7996#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7997 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2) {
7998 TEST_REQUIRES_ARM_NEON_FMA;
7999 GemmMicrokernelTester()
8000 .mr(4)
8001 .nr(8)
8002 .kr(1)
8003 .sr(1)
8004 .m(4)
8005 .n(8)
8006 .k(2)
8007 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8008 }
8009
8010 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, strided_cn) {
8011 TEST_REQUIRES_ARM_NEON_FMA;
8012 GemmMicrokernelTester()
8013 .mr(4)
8014 .nr(8)
8015 .kr(1)
8016 .sr(1)
8017 .m(4)
8018 .n(8)
8019 .k(2)
8020 .cn_stride(11)
8021 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8022 }
8023
8024 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
8025 TEST_REQUIRES_ARM_NEON_FMA;
8026 GemmMicrokernelTester()
8027 .mr(4)
8028 .nr(8)
8029 .kr(1)
8030 .sr(1)
8031 .m(4)
8032 .n(8)
8033 .k(2)
8034 .a_stride(5)
8035 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8036 }
8037
8038 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
8039 TEST_REQUIRES_ARM_NEON_FMA;
8040 for (uint32_t m = 1; m <= 4; m++) {
8041 for (uint32_t n = 1; n <= 8; n++) {
8042 GemmMicrokernelTester()
8043 .mr(4)
8044 .nr(8)
8045 .kr(1)
8046 .sr(1)
8047 .m(m)
8048 .n(n)
8049 .k(2)
8050 .iterations(1)
8051 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8052 }
8053 }
8054 }
8055
8056 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
8057 TEST_REQUIRES_ARM_NEON_FMA;
8058 for (uint32_t m = 1; m <= 4; m++) {
8059 GemmMicrokernelTester()
8060 .mr(4)
8061 .nr(8)
8062 .kr(1)
8063 .sr(1)
8064 .m(m)
8065 .n(8)
8066 .k(2)
8067 .iterations(1)
8068 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8069 }
8070 }
8071
8072 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
8073 TEST_REQUIRES_ARM_NEON_FMA;
8074 for (uint32_t n = 1; n <= 8; n++) {
8075 GemmMicrokernelTester()
8076 .mr(4)
8077 .nr(8)
8078 .kr(1)
8079 .sr(1)
8080 .m(4)
8081 .n(n)
8082 .k(2)
8083 .iterations(1)
8084 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8085 }
8086 }
8087
8088 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_lt_2) {
8089 TEST_REQUIRES_ARM_NEON_FMA;
8090 for (size_t k = 1; k < 2; k++) {
8091 GemmMicrokernelTester()
8092 .mr(4)
8093 .nr(8)
8094 .kr(1)
8095 .sr(1)
8096 .m(4)
8097 .n(8)
8098 .k(k)
8099 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8100 }
8101 }
8102
8103 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
8104 TEST_REQUIRES_ARM_NEON_FMA;
8105 for (size_t k = 1; k < 2; k++) {
8106 GemmMicrokernelTester()
8107 .mr(4)
8108 .nr(8)
8109 .kr(1)
8110 .sr(1)
8111 .m(4)
8112 .n(8)
8113 .k(k)
8114 .a_stride(5)
8115 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8116 }
8117 }
8118
8119 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
8120 TEST_REQUIRES_ARM_NEON_FMA;
8121 for (size_t k = 1; k < 2; k++) {
8122 for (uint32_t m = 1; m <= 4; m++) {
8123 for (uint32_t n = 1; n <= 8; n++) {
8124 GemmMicrokernelTester()
8125 .mr(4)
8126 .nr(8)
8127 .kr(1)
8128 .sr(1)
8129 .m(m)
8130 .n(n)
8131 .k(k)
8132 .iterations(1)
8133 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8134 }
8135 }
8136 }
8137 }
8138
8139 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_gt_2) {
8140 TEST_REQUIRES_ARM_NEON_FMA;
8141 for (size_t k = 3; k < 4; k++) {
8142 GemmMicrokernelTester()
8143 .mr(4)
8144 .nr(8)
8145 .kr(1)
8146 .sr(1)
8147 .m(4)
8148 .n(8)
8149 .k(k)
8150 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8151 }
8152 }
8153
8154 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
8155 TEST_REQUIRES_ARM_NEON_FMA;
8156 for (size_t k = 3; k < 4; k++) {
8157 GemmMicrokernelTester()
8158 .mr(4)
8159 .nr(8)
8160 .kr(1)
8161 .sr(1)
8162 .m(4)
8163 .n(8)
8164 .k(k)
8165 .a_stride(7)
8166 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8167 }
8168 }
8169
8170 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
8171 TEST_REQUIRES_ARM_NEON_FMA;
8172 for (size_t k = 3; k < 4; k++) {
8173 for (uint32_t m = 1; m <= 4; m++) {
8174 for (uint32_t n = 1; n <= 8; n++) {
8175 GemmMicrokernelTester()
8176 .mr(4)
8177 .nr(8)
8178 .kr(1)
8179 .sr(1)
8180 .m(m)
8181 .n(n)
8182 .k(k)
8183 .iterations(1)
8184 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8185 }
8186 }
8187 }
8188 }
8189
8190 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_div_2) {
8191 TEST_REQUIRES_ARM_NEON_FMA;
8192 for (size_t k = 4; k <= 20; k += 2) {
8193 GemmMicrokernelTester()
8194 .mr(4)
8195 .nr(8)
8196 .kr(1)
8197 .sr(1)
8198 .m(4)
8199 .n(8)
8200 .k(k)
8201 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8202 }
8203 }
8204
8205 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
8206 TEST_REQUIRES_ARM_NEON_FMA;
8207 for (size_t k = 4; k <= 20; k += 2) {
8208 GemmMicrokernelTester()
8209 .mr(4)
8210 .nr(8)
8211 .kr(1)
8212 .sr(1)
8213 .m(4)
8214 .n(8)
8215 .k(k)
8216 .a_stride(23)
8217 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8218 }
8219 }
8220
8221 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
8222 TEST_REQUIRES_ARM_NEON_FMA;
8223 for (size_t k = 4; k <= 20; k += 2) {
8224 for (uint32_t m = 1; m <= 4; m++) {
8225 for (uint32_t n = 1; n <= 8; n++) {
8226 GemmMicrokernelTester()
8227 .mr(4)
8228 .nr(8)
8229 .kr(1)
8230 .sr(1)
8231 .m(m)
8232 .n(n)
8233 .k(k)
8234 .iterations(1)
8235 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8236 }
8237 }
8238 }
8239 }
8240
8241 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8) {
8242 TEST_REQUIRES_ARM_NEON_FMA;
8243 for (uint32_t n = 9; n < 16; n++) {
8244 for (size_t k = 1; k <= 10; k += 3) {
8245 GemmMicrokernelTester()
8246 .mr(4)
8247 .nr(8)
8248 .kr(1)
8249 .sr(1)
8250 .m(4)
8251 .n(8)
8252 .k(k)
8253 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8254 }
8255 }
8256 }
8257
8258 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
8259 TEST_REQUIRES_ARM_NEON_FMA;
8260 for (uint32_t n = 9; n < 16; n++) {
8261 for (size_t k = 1; k <= 10; k += 3) {
8262 GemmMicrokernelTester()
8263 .mr(4)
8264 .nr(8)
8265 .kr(1)
8266 .sr(1)
8267 .m(4)
8268 .n(8)
8269 .k(k)
8270 .cn_stride(11)
8271 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8272 }
8273 }
8274 }
8275
8276 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
8277 TEST_REQUIRES_ARM_NEON_FMA;
8278 for (uint32_t n = 9; n < 16; n++) {
8279 for (size_t k = 1; k <= 10; k += 3) {
8280 GemmMicrokernelTester()
8281 .mr(4)
8282 .nr(8)
8283 .kr(1)
8284 .sr(1)
8285 .m(4)
8286 .n(n)
8287 .k(k)
8288 .a_stride(13)
8289 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8290 }
8291 }
8292 }
8293
8294 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
8295 TEST_REQUIRES_ARM_NEON_FMA;
8296 for (uint32_t n = 9; n < 16; n++) {
8297 for (size_t k = 1; k <= 10; k += 3) {
8298 for (uint32_t m = 1; m <= 4; m++) {
8299 GemmMicrokernelTester()
8300 .mr(4)
8301 .nr(8)
8302 .kr(1)
8303 .sr(1)
8304 .m(m)
8305 .n(n)
8306 .k(k)
8307 .iterations(1)
8308 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8309 }
8310 }
8311 }
8312 }
8313
8314 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8) {
8315 TEST_REQUIRES_ARM_NEON_FMA;
8316 for (uint32_t n = 16; n <= 24; n += 8) {
8317 for (size_t k = 1; k <= 10; k += 3) {
8318 GemmMicrokernelTester()
8319 .mr(4)
8320 .nr(8)
8321 .kr(1)
8322 .sr(1)
8323 .m(4)
8324 .n(8)
8325 .k(k)
8326 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8327 }
8328 }
8329 }
8330
8331 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
8332 TEST_REQUIRES_ARM_NEON_FMA;
8333 for (uint32_t n = 16; n <= 24; n += 8) {
8334 for (size_t k = 1; k <= 10; k += 3) {
8335 GemmMicrokernelTester()
8336 .mr(4)
8337 .nr(8)
8338 .kr(1)
8339 .sr(1)
8340 .m(4)
8341 .n(n)
8342 .k(k)
8343 .cn_stride(11)
8344 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8345 }
8346 }
8347 }
8348
8349 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
8350 TEST_REQUIRES_ARM_NEON_FMA;
8351 for (uint32_t n = 16; n <= 24; n += 8) {
8352 for (size_t k = 1; k <= 10; k += 3) {
8353 GemmMicrokernelTester()
8354 .mr(4)
8355 .nr(8)
8356 .kr(1)
8357 .sr(1)
8358 .m(4)
8359 .n(n)
8360 .k(k)
8361 .a_stride(13)
8362 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8363 }
8364 }
8365 }
8366
8367 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
8368 TEST_REQUIRES_ARM_NEON_FMA;
8369 for (uint32_t n = 16; n <= 24; n += 8) {
8370 for (size_t k = 1; k <= 10; k += 3) {
8371 for (uint32_t m = 1; m <= 4; m++) {
8372 GemmMicrokernelTester()
8373 .mr(4)
8374 .nr(8)
8375 .kr(1)
8376 .sr(1)
8377 .m(m)
8378 .n(n)
8379 .k(k)
8380 .iterations(1)
8381 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8382 }
8383 }
8384 }
8385 }
8386
8387 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
8388 TEST_REQUIRES_ARM_NEON_FMA;
8389 for (size_t k = 1; k <= 10; k += 3) {
8390 for (uint32_t m = 1; m <= 4; m++) {
8391 for (uint32_t n = 1; n <= 8; n++) {
8392 GemmMicrokernelTester()
8393 .mr(4)
8394 .nr(8)
8395 .kr(1)
8396 .sr(1)
8397 .m(m)
8398 .n(n)
8399 .k(k)
8400 .cm_stride(11)
8401 .iterations(1)
8402 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8403 }
8404 }
8405 }
8406 }
8407
8408 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, qmin) {
8409 TEST_REQUIRES_ARM_NEON_FMA;
8410 GemmMicrokernelTester()
8411 .mr(4)
8412 .nr(8)
8413 .kr(1)
8414 .sr(1)
8415 .m(4)
8416 .n(8)
8417 .k(2)
8418 .qmin(128)
8419 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8420 }
8421
8422 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, qmax) {
8423 TEST_REQUIRES_ARM_NEON_FMA;
8424 GemmMicrokernelTester()
8425 .mr(4)
8426 .nr(8)
8427 .kr(1)
8428 .sr(1)
8429 .m(4)
8430 .n(8)
8431 .k(2)
8432 .qmax(128)
8433 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8434 }
8435
8436 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, strided_cm) {
8437 TEST_REQUIRES_ARM_NEON_FMA;
8438 GemmMicrokernelTester()
8439 .mr(4)
8440 .nr(8)
8441 .kr(1)
8442 .sr(1)
8443 .m(4)
8444 .n(8)
8445 .k(2)
8446 .cm_stride(11)
8447 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
8448 }
8449#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8450
8451
8452#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8453 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
8454 TEST_REQUIRES_ARM_NEON_FMA;
8455 GemmMicrokernelTester()
8456 .mr(4)
8457 .nr(8)
8458 .kr(1)
8459 .sr(1)
8460 .m(4)
8461 .n(8)
8462 .k(4)
8463 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8464 }
8465
8466 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, strided_cn) {
8467 TEST_REQUIRES_ARM_NEON_FMA;
8468 GemmMicrokernelTester()
8469 .mr(4)
8470 .nr(8)
8471 .kr(1)
8472 .sr(1)
8473 .m(4)
8474 .n(8)
8475 .k(4)
8476 .cn_stride(11)
8477 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8478 }
8479
8480 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
8481 TEST_REQUIRES_ARM_NEON_FMA;
8482 GemmMicrokernelTester()
8483 .mr(4)
8484 .nr(8)
8485 .kr(1)
8486 .sr(1)
8487 .m(4)
8488 .n(8)
8489 .k(4)
8490 .a_stride(7)
8491 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8492 }
8493
8494 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
8495 TEST_REQUIRES_ARM_NEON_FMA;
8496 for (uint32_t m = 1; m <= 4; m++) {
8497 for (uint32_t n = 1; n <= 8; n++) {
8498 GemmMicrokernelTester()
8499 .mr(4)
8500 .nr(8)
8501 .kr(1)
8502 .sr(1)
8503 .m(m)
8504 .n(n)
8505 .k(4)
8506 .iterations(1)
8507 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8508 }
8509 }
8510 }
8511
8512 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
8513 TEST_REQUIRES_ARM_NEON_FMA;
8514 for (uint32_t m = 1; m <= 4; m++) {
8515 GemmMicrokernelTester()
8516 .mr(4)
8517 .nr(8)
8518 .kr(1)
8519 .sr(1)
8520 .m(m)
8521 .n(8)
8522 .k(4)
8523 .iterations(1)
8524 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8525 }
8526 }
8527
8528 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
8529 TEST_REQUIRES_ARM_NEON_FMA;
8530 for (uint32_t n = 1; n <= 8; n++) {
8531 GemmMicrokernelTester()
8532 .mr(4)
8533 .nr(8)
8534 .kr(1)
8535 .sr(1)
8536 .m(4)
8537 .n(n)
8538 .k(4)
8539 .iterations(1)
8540 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8541 }
8542 }
8543
8544 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_lt_4) {
8545 TEST_REQUIRES_ARM_NEON_FMA;
8546 for (size_t k = 1; k < 4; k++) {
8547 GemmMicrokernelTester()
8548 .mr(4)
8549 .nr(8)
8550 .kr(1)
8551 .sr(1)
8552 .m(4)
8553 .n(8)
8554 .k(k)
8555 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8556 }
8557 }
8558
8559 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
8560 TEST_REQUIRES_ARM_NEON_FMA;
8561 for (size_t k = 1; k < 4; k++) {
8562 GemmMicrokernelTester()
8563 .mr(4)
8564 .nr(8)
8565 .kr(1)
8566 .sr(1)
8567 .m(4)
8568 .n(8)
8569 .k(k)
8570 .a_stride(7)
8571 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8572 }
8573 }
8574
8575 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
8576 TEST_REQUIRES_ARM_NEON_FMA;
8577 for (size_t k = 1; k < 4; k++) {
8578 for (uint32_t m = 1; m <= 4; m++) {
8579 for (uint32_t n = 1; n <= 8; n++) {
8580 GemmMicrokernelTester()
8581 .mr(4)
8582 .nr(8)
8583 .kr(1)
8584 .sr(1)
8585 .m(m)
8586 .n(n)
8587 .k(k)
8588 .iterations(1)
8589 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8590 }
8591 }
8592 }
8593 }
8594
8595 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_gt_4) {
8596 TEST_REQUIRES_ARM_NEON_FMA;
8597 for (size_t k = 5; k < 8; k++) {
8598 GemmMicrokernelTester()
8599 .mr(4)
8600 .nr(8)
8601 .kr(1)
8602 .sr(1)
8603 .m(4)
8604 .n(8)
8605 .k(k)
8606 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8607 }
8608 }
8609
8610 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
8611 TEST_REQUIRES_ARM_NEON_FMA;
8612 for (size_t k = 5; k < 8; k++) {
8613 GemmMicrokernelTester()
8614 .mr(4)
8615 .nr(8)
8616 .kr(1)
8617 .sr(1)
8618 .m(4)
8619 .n(8)
8620 .k(k)
8621 .a_stride(11)
8622 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8623 }
8624 }
8625
8626 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
8627 TEST_REQUIRES_ARM_NEON_FMA;
8628 for (size_t k = 5; k < 8; k++) {
8629 for (uint32_t m = 1; m <= 4; m++) {
8630 for (uint32_t n = 1; n <= 8; n++) {
8631 GemmMicrokernelTester()
8632 .mr(4)
8633 .nr(8)
8634 .kr(1)
8635 .sr(1)
8636 .m(m)
8637 .n(n)
8638 .k(k)
8639 .iterations(1)
8640 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8641 }
8642 }
8643 }
8644 }
8645
8646 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_div_4) {
8647 TEST_REQUIRES_ARM_NEON_FMA;
8648 for (size_t k = 8; k <= 40; k += 4) {
8649 GemmMicrokernelTester()
8650 .mr(4)
8651 .nr(8)
8652 .kr(1)
8653 .sr(1)
8654 .m(4)
8655 .n(8)
8656 .k(k)
8657 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8658 }
8659 }
8660
8661 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
8662 TEST_REQUIRES_ARM_NEON_FMA;
8663 for (size_t k = 8; k <= 40; k += 4) {
8664 GemmMicrokernelTester()
8665 .mr(4)
8666 .nr(8)
8667 .kr(1)
8668 .sr(1)
8669 .m(4)
8670 .n(8)
8671 .k(k)
8672 .a_stride(43)
8673 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8674 }
8675 }
8676
8677 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
8678 TEST_REQUIRES_ARM_NEON_FMA;
8679 for (size_t k = 8; k <= 40; k += 4) {
8680 for (uint32_t m = 1; m <= 4; m++) {
8681 for (uint32_t n = 1; n <= 8; n++) {
8682 GemmMicrokernelTester()
8683 .mr(4)
8684 .nr(8)
8685 .kr(1)
8686 .sr(1)
8687 .m(m)
8688 .n(n)
8689 .k(k)
8690 .iterations(1)
8691 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8692 }
8693 }
8694 }
8695 }
8696
8697 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8) {
8698 TEST_REQUIRES_ARM_NEON_FMA;
8699 for (uint32_t n = 9; n < 16; n++) {
8700 for (size_t k = 1; k <= 20; k += 5) {
8701 GemmMicrokernelTester()
8702 .mr(4)
8703 .nr(8)
8704 .kr(1)
8705 .sr(1)
8706 .m(4)
8707 .n(8)
8708 .k(k)
8709 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8710 }
8711 }
8712 }
8713
8714 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
8715 TEST_REQUIRES_ARM_NEON_FMA;
8716 for (uint32_t n = 9; n < 16; n++) {
8717 for (size_t k = 1; k <= 20; k += 5) {
8718 GemmMicrokernelTester()
8719 .mr(4)
8720 .nr(8)
8721 .kr(1)
8722 .sr(1)
8723 .m(4)
8724 .n(8)
8725 .k(k)
8726 .cn_stride(11)
8727 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8728 }
8729 }
8730 }
8731
8732 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
8733 TEST_REQUIRES_ARM_NEON_FMA;
8734 for (uint32_t n = 9; n < 16; n++) {
8735 for (size_t k = 1; k <= 20; k += 5) {
8736 GemmMicrokernelTester()
8737 .mr(4)
8738 .nr(8)
8739 .kr(1)
8740 .sr(1)
8741 .m(4)
8742 .n(n)
8743 .k(k)
8744 .a_stride(23)
8745 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8746 }
8747 }
8748 }
8749
8750 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
8751 TEST_REQUIRES_ARM_NEON_FMA;
8752 for (uint32_t n = 9; n < 16; n++) {
8753 for (size_t k = 1; k <= 20; k += 5) {
8754 for (uint32_t m = 1; m <= 4; m++) {
8755 GemmMicrokernelTester()
8756 .mr(4)
8757 .nr(8)
8758 .kr(1)
8759 .sr(1)
8760 .m(m)
8761 .n(n)
8762 .k(k)
8763 .iterations(1)
8764 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8765 }
8766 }
8767 }
8768 }
8769
8770 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8) {
8771 TEST_REQUIRES_ARM_NEON_FMA;
8772 for (uint32_t n = 16; n <= 24; n += 8) {
8773 for (size_t k = 1; k <= 20; k += 5) {
8774 GemmMicrokernelTester()
8775 .mr(4)
8776 .nr(8)
8777 .kr(1)
8778 .sr(1)
8779 .m(4)
8780 .n(8)
8781 .k(k)
8782 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8783 }
8784 }
8785 }
8786
8787 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
8788 TEST_REQUIRES_ARM_NEON_FMA;
8789 for (uint32_t n = 16; n <= 24; n += 8) {
8790 for (size_t k = 1; k <= 20; k += 5) {
8791 GemmMicrokernelTester()
8792 .mr(4)
8793 .nr(8)
8794 .kr(1)
8795 .sr(1)
8796 .m(4)
8797 .n(n)
8798 .k(k)
8799 .cn_stride(11)
8800 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8801 }
8802 }
8803 }
8804
8805 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
8806 TEST_REQUIRES_ARM_NEON_FMA;
8807 for (uint32_t n = 16; n <= 24; n += 8) {
8808 for (size_t k = 1; k <= 20; k += 5) {
8809 GemmMicrokernelTester()
8810 .mr(4)
8811 .nr(8)
8812 .kr(1)
8813 .sr(1)
8814 .m(4)
8815 .n(n)
8816 .k(k)
8817 .a_stride(23)
8818 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8819 }
8820 }
8821 }
8822
8823 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
8824 TEST_REQUIRES_ARM_NEON_FMA;
8825 for (uint32_t n = 16; n <= 24; n += 8) {
8826 for (size_t k = 1; k <= 20; k += 5) {
8827 for (uint32_t m = 1; m <= 4; m++) {
8828 GemmMicrokernelTester()
8829 .mr(4)
8830 .nr(8)
8831 .kr(1)
8832 .sr(1)
8833 .m(m)
8834 .n(n)
8835 .k(k)
8836 .iterations(1)
8837 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8838 }
8839 }
8840 }
8841 }
8842
8843 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
8844 TEST_REQUIRES_ARM_NEON_FMA;
8845 for (size_t k = 1; k <= 20; k += 5) {
8846 for (uint32_t m = 1; m <= 4; m++) {
8847 for (uint32_t n = 1; n <= 8; n++) {
8848 GemmMicrokernelTester()
8849 .mr(4)
8850 .nr(8)
8851 .kr(1)
8852 .sr(1)
8853 .m(m)
8854 .n(n)
8855 .k(k)
8856 .cm_stride(11)
8857 .iterations(1)
8858 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8859 }
8860 }
8861 }
8862 }
8863
8864 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, qmin) {
8865 TEST_REQUIRES_ARM_NEON_FMA;
8866 GemmMicrokernelTester()
8867 .mr(4)
8868 .nr(8)
8869 .kr(1)
8870 .sr(1)
8871 .m(4)
8872 .n(8)
8873 .k(4)
8874 .qmin(128)
8875 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8876 }
8877
8878 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, qmax) {
8879 TEST_REQUIRES_ARM_NEON_FMA;
8880 GemmMicrokernelTester()
8881 .mr(4)
8882 .nr(8)
8883 .kr(1)
8884 .sr(1)
8885 .m(4)
8886 .n(8)
8887 .k(4)
8888 .qmax(128)
8889 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8890 }
8891
8892 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, strided_cm) {
8893 TEST_REQUIRES_ARM_NEON_FMA;
8894 GemmMicrokernelTester()
8895 .mr(4)
8896 .nr(8)
8897 .kr(1)
8898 .sr(1)
8899 .m(4)
8900 .n(8)
8901 .k(4)
8902 .cm_stride(11)
8903 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
8904 }
8905#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8906
8907
8908#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8909 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
8910 TEST_REQUIRES_ARM_NEON_FMA;
8911 GemmMicrokernelTester()
8912 .mr(6)
8913 .nr(8)
8914 .kr(1)
8915 .sr(1)
8916 .m(6)
8917 .n(8)
8918 .k(2)
8919 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
8920 }
8921
8922 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, strided_cn) {
8923 TEST_REQUIRES_ARM_NEON_FMA;
8924 GemmMicrokernelTester()
8925 .mr(6)
8926 .nr(8)
8927 .kr(1)
8928 .sr(1)
8929 .m(6)
8930 .n(8)
8931 .k(2)
8932 .cn_stride(11)
8933 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
8934 }
8935
8936 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
8937 TEST_REQUIRES_ARM_NEON_FMA;
8938 GemmMicrokernelTester()
8939 .mr(6)
8940 .nr(8)
8941 .kr(1)
8942 .sr(1)
8943 .m(6)
8944 .n(8)
8945 .k(2)
8946 .a_stride(5)
8947 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
8948 }
8949
8950 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
8951 TEST_REQUIRES_ARM_NEON_FMA;
8952 for (uint32_t m = 1; m <= 6; m++) {
8953 for (uint32_t n = 1; n <= 8; n++) {
8954 GemmMicrokernelTester()
8955 .mr(6)
8956 .nr(8)
8957 .kr(1)
8958 .sr(1)
8959 .m(m)
8960 .n(n)
8961 .k(2)
8962 .iterations(1)
8963 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
8964 }
8965 }
8966 }
8967
8968 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
8969 TEST_REQUIRES_ARM_NEON_FMA;
8970 for (uint32_t m = 1; m <= 6; m++) {
8971 GemmMicrokernelTester()
8972 .mr(6)
8973 .nr(8)
8974 .kr(1)
8975 .sr(1)
8976 .m(m)
8977 .n(8)
8978 .k(2)
8979 .iterations(1)
8980 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
8981 }
8982 }
8983
8984 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
8985 TEST_REQUIRES_ARM_NEON_FMA;
8986 for (uint32_t n = 1; n <= 8; n++) {
8987 GemmMicrokernelTester()
8988 .mr(6)
8989 .nr(8)
8990 .kr(1)
8991 .sr(1)
8992 .m(6)
8993 .n(n)
8994 .k(2)
8995 .iterations(1)
8996 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
8997 }
8998 }
8999
9000 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_lt_2) {
9001 TEST_REQUIRES_ARM_NEON_FMA;
9002 for (size_t k = 1; k < 2; k++) {
9003 GemmMicrokernelTester()
9004 .mr(6)
9005 .nr(8)
9006 .kr(1)
9007 .sr(1)
9008 .m(6)
9009 .n(8)
9010 .k(k)
9011 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9012 }
9013 }
9014
9015 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
9016 TEST_REQUIRES_ARM_NEON_FMA;
9017 for (size_t k = 1; k < 2; k++) {
9018 GemmMicrokernelTester()
9019 .mr(6)
9020 .nr(8)
9021 .kr(1)
9022 .sr(1)
9023 .m(6)
9024 .n(8)
9025 .k(k)
9026 .a_stride(5)
9027 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9028 }
9029 }
9030
9031 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
9032 TEST_REQUIRES_ARM_NEON_FMA;
9033 for (size_t k = 1; k < 2; k++) {
9034 for (uint32_t m = 1; m <= 6; m++) {
9035 for (uint32_t n = 1; n <= 8; n++) {
9036 GemmMicrokernelTester()
9037 .mr(6)
9038 .nr(8)
9039 .kr(1)
9040 .sr(1)
9041 .m(m)
9042 .n(n)
9043 .k(k)
9044 .iterations(1)
9045 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9046 }
9047 }
9048 }
9049 }
9050
9051 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_gt_2) {
9052 TEST_REQUIRES_ARM_NEON_FMA;
9053 for (size_t k = 3; k < 4; k++) {
9054 GemmMicrokernelTester()
9055 .mr(6)
9056 .nr(8)
9057 .kr(1)
9058 .sr(1)
9059 .m(6)
9060 .n(8)
9061 .k(k)
9062 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9063 }
9064 }
9065
9066 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
9067 TEST_REQUIRES_ARM_NEON_FMA;
9068 for (size_t k = 3; k < 4; k++) {
9069 GemmMicrokernelTester()
9070 .mr(6)
9071 .nr(8)
9072 .kr(1)
9073 .sr(1)
9074 .m(6)
9075 .n(8)
9076 .k(k)
9077 .a_stride(7)
9078 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9079 }
9080 }
9081
9082 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
9083 TEST_REQUIRES_ARM_NEON_FMA;
9084 for (size_t k = 3; k < 4; k++) {
9085 for (uint32_t m = 1; m <= 6; m++) {
9086 for (uint32_t n = 1; n <= 8; n++) {
9087 GemmMicrokernelTester()
9088 .mr(6)
9089 .nr(8)
9090 .kr(1)
9091 .sr(1)
9092 .m(m)
9093 .n(n)
9094 .k(k)
9095 .iterations(1)
9096 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9097 }
9098 }
9099 }
9100 }
9101
9102 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_div_2) {
9103 TEST_REQUIRES_ARM_NEON_FMA;
9104 for (size_t k = 4; k <= 20; k += 2) {
9105 GemmMicrokernelTester()
9106 .mr(6)
9107 .nr(8)
9108 .kr(1)
9109 .sr(1)
9110 .m(6)
9111 .n(8)
9112 .k(k)
9113 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9114 }
9115 }
9116
9117 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
9118 TEST_REQUIRES_ARM_NEON_FMA;
9119 for (size_t k = 4; k <= 20; k += 2) {
9120 GemmMicrokernelTester()
9121 .mr(6)
9122 .nr(8)
9123 .kr(1)
9124 .sr(1)
9125 .m(6)
9126 .n(8)
9127 .k(k)
9128 .a_stride(23)
9129 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9130 }
9131 }
9132
9133 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
9134 TEST_REQUIRES_ARM_NEON_FMA;
9135 for (size_t k = 4; k <= 20; k += 2) {
9136 for (uint32_t m = 1; m <= 6; m++) {
9137 for (uint32_t n = 1; n <= 8; n++) {
9138 GemmMicrokernelTester()
9139 .mr(6)
9140 .nr(8)
9141 .kr(1)
9142 .sr(1)
9143 .m(m)
9144 .n(n)
9145 .k(k)
9146 .iterations(1)
9147 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9148 }
9149 }
9150 }
9151 }
9152
9153 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8) {
9154 TEST_REQUIRES_ARM_NEON_FMA;
9155 for (uint32_t n = 9; n < 16; n++) {
9156 for (size_t k = 1; k <= 10; k += 3) {
9157 GemmMicrokernelTester()
9158 .mr(6)
9159 .nr(8)
9160 .kr(1)
9161 .sr(1)
9162 .m(6)
9163 .n(8)
9164 .k(k)
9165 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9166 }
9167 }
9168 }
9169
9170 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
9171 TEST_REQUIRES_ARM_NEON_FMA;
9172 for (uint32_t n = 9; n < 16; n++) {
9173 for (size_t k = 1; k <= 10; k += 3) {
9174 GemmMicrokernelTester()
9175 .mr(6)
9176 .nr(8)
9177 .kr(1)
9178 .sr(1)
9179 .m(6)
9180 .n(8)
9181 .k(k)
9182 .cn_stride(11)
9183 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9184 }
9185 }
9186 }
9187
9188 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
9189 TEST_REQUIRES_ARM_NEON_FMA;
9190 for (uint32_t n = 9; n < 16; n++) {
9191 for (size_t k = 1; k <= 10; k += 3) {
9192 GemmMicrokernelTester()
9193 .mr(6)
9194 .nr(8)
9195 .kr(1)
9196 .sr(1)
9197 .m(6)
9198 .n(n)
9199 .k(k)
9200 .a_stride(13)
9201 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9202 }
9203 }
9204 }
9205
9206 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
9207 TEST_REQUIRES_ARM_NEON_FMA;
9208 for (uint32_t n = 9; n < 16; n++) {
9209 for (size_t k = 1; k <= 10; k += 3) {
9210 for (uint32_t m = 1; m <= 6; m++) {
9211 GemmMicrokernelTester()
9212 .mr(6)
9213 .nr(8)
9214 .kr(1)
9215 .sr(1)
9216 .m(m)
9217 .n(n)
9218 .k(k)
9219 .iterations(1)
9220 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9221 }
9222 }
9223 }
9224 }
9225
9226 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8) {
9227 TEST_REQUIRES_ARM_NEON_FMA;
9228 for (uint32_t n = 16; n <= 24; n += 8) {
9229 for (size_t k = 1; k <= 10; k += 3) {
9230 GemmMicrokernelTester()
9231 .mr(6)
9232 .nr(8)
9233 .kr(1)
9234 .sr(1)
9235 .m(6)
9236 .n(8)
9237 .k(k)
9238 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9239 }
9240 }
9241 }
9242
9243 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
9244 TEST_REQUIRES_ARM_NEON_FMA;
9245 for (uint32_t n = 16; n <= 24; n += 8) {
9246 for (size_t k = 1; k <= 10; k += 3) {
9247 GemmMicrokernelTester()
9248 .mr(6)
9249 .nr(8)
9250 .kr(1)
9251 .sr(1)
9252 .m(6)
9253 .n(n)
9254 .k(k)
9255 .cn_stride(11)
9256 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9257 }
9258 }
9259 }
9260
9261 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
9262 TEST_REQUIRES_ARM_NEON_FMA;
9263 for (uint32_t n = 16; n <= 24; n += 8) {
9264 for (size_t k = 1; k <= 10; k += 3) {
9265 GemmMicrokernelTester()
9266 .mr(6)
9267 .nr(8)
9268 .kr(1)
9269 .sr(1)
9270 .m(6)
9271 .n(n)
9272 .k(k)
9273 .a_stride(13)
9274 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9275 }
9276 }
9277 }
9278
9279 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
9280 TEST_REQUIRES_ARM_NEON_FMA;
9281 for (uint32_t n = 16; n <= 24; n += 8) {
9282 for (size_t k = 1; k <= 10; k += 3) {
9283 for (uint32_t m = 1; m <= 6; m++) {
9284 GemmMicrokernelTester()
9285 .mr(6)
9286 .nr(8)
9287 .kr(1)
9288 .sr(1)
9289 .m(m)
9290 .n(n)
9291 .k(k)
9292 .iterations(1)
9293 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9294 }
9295 }
9296 }
9297 }
9298
9299 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
9300 TEST_REQUIRES_ARM_NEON_FMA;
9301 for (size_t k = 1; k <= 10; k += 3) {
9302 for (uint32_t m = 1; m <= 6; m++) {
9303 for (uint32_t n = 1; n <= 8; n++) {
9304 GemmMicrokernelTester()
9305 .mr(6)
9306 .nr(8)
9307 .kr(1)
9308 .sr(1)
9309 .m(m)
9310 .n(n)
9311 .k(k)
9312 .cm_stride(11)
9313 .iterations(1)
9314 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9315 }
9316 }
9317 }
9318 }
9319
9320 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, qmin) {
9321 TEST_REQUIRES_ARM_NEON_FMA;
9322 GemmMicrokernelTester()
9323 .mr(6)
9324 .nr(8)
9325 .kr(1)
9326 .sr(1)
9327 .m(6)
9328 .n(8)
9329 .k(2)
9330 .qmin(128)
9331 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9332 }
9333
9334 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, qmax) {
9335 TEST_REQUIRES_ARM_NEON_FMA;
9336 GemmMicrokernelTester()
9337 .mr(6)
9338 .nr(8)
9339 .kr(1)
9340 .sr(1)
9341 .m(6)
9342 .n(8)
9343 .k(2)
9344 .qmax(128)
9345 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9346 }
9347
9348 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, strided_cm) {
9349 TEST_REQUIRES_ARM_NEON_FMA;
9350 GemmMicrokernelTester()
9351 .mr(6)
9352 .nr(8)
9353 .kr(1)
9354 .sr(1)
9355 .m(6)
9356 .n(8)
9357 .k(2)
9358 .cm_stride(11)
9359 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
9360 }
9361#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9362
9363
9364#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9365 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
9366 TEST_REQUIRES_ARM_NEON_FMA;
9367 GemmMicrokernelTester()
9368 .mr(6)
9369 .nr(8)
9370 .kr(1)
9371 .sr(1)
9372 .m(6)
9373 .n(8)
9374 .k(4)
9375 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9376 }
9377
9378 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, strided_cn) {
9379 TEST_REQUIRES_ARM_NEON_FMA;
9380 GemmMicrokernelTester()
9381 .mr(6)
9382 .nr(8)
9383 .kr(1)
9384 .sr(1)
9385 .m(6)
9386 .n(8)
9387 .k(4)
9388 .cn_stride(11)
9389 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9390 }
9391
9392 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
9393 TEST_REQUIRES_ARM_NEON_FMA;
9394 GemmMicrokernelTester()
9395 .mr(6)
9396 .nr(8)
9397 .kr(1)
9398 .sr(1)
9399 .m(6)
9400 .n(8)
9401 .k(4)
9402 .a_stride(7)
9403 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9404 }
9405
9406 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
9407 TEST_REQUIRES_ARM_NEON_FMA;
9408 for (uint32_t m = 1; m <= 6; m++) {
9409 for (uint32_t n = 1; n <= 8; n++) {
9410 GemmMicrokernelTester()
9411 .mr(6)
9412 .nr(8)
9413 .kr(1)
9414 .sr(1)
9415 .m(m)
9416 .n(n)
9417 .k(4)
9418 .iterations(1)
9419 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9420 }
9421 }
9422 }
9423
9424 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
9425 TEST_REQUIRES_ARM_NEON_FMA;
9426 for (uint32_t m = 1; m <= 6; m++) {
9427 GemmMicrokernelTester()
9428 .mr(6)
9429 .nr(8)
9430 .kr(1)
9431 .sr(1)
9432 .m(m)
9433 .n(8)
9434 .k(4)
9435 .iterations(1)
9436 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9437 }
9438 }
9439
9440 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
9441 TEST_REQUIRES_ARM_NEON_FMA;
9442 for (uint32_t n = 1; n <= 8; n++) {
9443 GemmMicrokernelTester()
9444 .mr(6)
9445 .nr(8)
9446 .kr(1)
9447 .sr(1)
9448 .m(6)
9449 .n(n)
9450 .k(4)
9451 .iterations(1)
9452 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9453 }
9454 }
9455
9456 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_lt_4) {
9457 TEST_REQUIRES_ARM_NEON_FMA;
9458 for (size_t k = 1; k < 4; k++) {
9459 GemmMicrokernelTester()
9460 .mr(6)
9461 .nr(8)
9462 .kr(1)
9463 .sr(1)
9464 .m(6)
9465 .n(8)
9466 .k(k)
9467 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9468 }
9469 }
9470
9471 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
9472 TEST_REQUIRES_ARM_NEON_FMA;
9473 for (size_t k = 1; k < 4; k++) {
9474 GemmMicrokernelTester()
9475 .mr(6)
9476 .nr(8)
9477 .kr(1)
9478 .sr(1)
9479 .m(6)
9480 .n(8)
9481 .k(k)
9482 .a_stride(7)
9483 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9484 }
9485 }
9486
9487 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
9488 TEST_REQUIRES_ARM_NEON_FMA;
9489 for (size_t k = 1; k < 4; k++) {
9490 for (uint32_t m = 1; m <= 6; m++) {
9491 for (uint32_t n = 1; n <= 8; n++) {
9492 GemmMicrokernelTester()
9493 .mr(6)
9494 .nr(8)
9495 .kr(1)
9496 .sr(1)
9497 .m(m)
9498 .n(n)
9499 .k(k)
9500 .iterations(1)
9501 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9502 }
9503 }
9504 }
9505 }
9506
9507 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_gt_4) {
9508 TEST_REQUIRES_ARM_NEON_FMA;
9509 for (size_t k = 5; k < 8; k++) {
9510 GemmMicrokernelTester()
9511 .mr(6)
9512 .nr(8)
9513 .kr(1)
9514 .sr(1)
9515 .m(6)
9516 .n(8)
9517 .k(k)
9518 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9519 }
9520 }
9521
9522 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
9523 TEST_REQUIRES_ARM_NEON_FMA;
9524 for (size_t k = 5; k < 8; k++) {
9525 GemmMicrokernelTester()
9526 .mr(6)
9527 .nr(8)
9528 .kr(1)
9529 .sr(1)
9530 .m(6)
9531 .n(8)
9532 .k(k)
9533 .a_stride(11)
9534 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9535 }
9536 }
9537
9538 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
9539 TEST_REQUIRES_ARM_NEON_FMA;
9540 for (size_t k = 5; k < 8; k++) {
9541 for (uint32_t m = 1; m <= 6; m++) {
9542 for (uint32_t n = 1; n <= 8; n++) {
9543 GemmMicrokernelTester()
9544 .mr(6)
9545 .nr(8)
9546 .kr(1)
9547 .sr(1)
9548 .m(m)
9549 .n(n)
9550 .k(k)
9551 .iterations(1)
9552 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9553 }
9554 }
9555 }
9556 }
9557
9558 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_div_4) {
9559 TEST_REQUIRES_ARM_NEON_FMA;
9560 for (size_t k = 8; k <= 40; k += 4) {
9561 GemmMicrokernelTester()
9562 .mr(6)
9563 .nr(8)
9564 .kr(1)
9565 .sr(1)
9566 .m(6)
9567 .n(8)
9568 .k(k)
9569 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9570 }
9571 }
9572
9573 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
9574 TEST_REQUIRES_ARM_NEON_FMA;
9575 for (size_t k = 8; k <= 40; k += 4) {
9576 GemmMicrokernelTester()
9577 .mr(6)
9578 .nr(8)
9579 .kr(1)
9580 .sr(1)
9581 .m(6)
9582 .n(8)
9583 .k(k)
9584 .a_stride(43)
9585 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9586 }
9587 }
9588
9589 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
9590 TEST_REQUIRES_ARM_NEON_FMA;
9591 for (size_t k = 8; k <= 40; k += 4) {
9592 for (uint32_t m = 1; m <= 6; m++) {
9593 for (uint32_t n = 1; n <= 8; n++) {
9594 GemmMicrokernelTester()
9595 .mr(6)
9596 .nr(8)
9597 .kr(1)
9598 .sr(1)
9599 .m(m)
9600 .n(n)
9601 .k(k)
9602 .iterations(1)
9603 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9604 }
9605 }
9606 }
9607 }
9608
9609 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8) {
9610 TEST_REQUIRES_ARM_NEON_FMA;
9611 for (uint32_t n = 9; n < 16; n++) {
9612 for (size_t k = 1; k <= 20; k += 5) {
9613 GemmMicrokernelTester()
9614 .mr(6)
9615 .nr(8)
9616 .kr(1)
9617 .sr(1)
9618 .m(6)
9619 .n(8)
9620 .k(k)
9621 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9622 }
9623 }
9624 }
9625
9626 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
9627 TEST_REQUIRES_ARM_NEON_FMA;
9628 for (uint32_t n = 9; n < 16; n++) {
9629 for (size_t k = 1; k <= 20; k += 5) {
9630 GemmMicrokernelTester()
9631 .mr(6)
9632 .nr(8)
9633 .kr(1)
9634 .sr(1)
9635 .m(6)
9636 .n(8)
9637 .k(k)
9638 .cn_stride(11)
9639 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9640 }
9641 }
9642 }
9643
9644 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
9645 TEST_REQUIRES_ARM_NEON_FMA;
9646 for (uint32_t n = 9; n < 16; n++) {
9647 for (size_t k = 1; k <= 20; k += 5) {
9648 GemmMicrokernelTester()
9649 .mr(6)
9650 .nr(8)
9651 .kr(1)
9652 .sr(1)
9653 .m(6)
9654 .n(n)
9655 .k(k)
9656 .a_stride(23)
9657 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9658 }
9659 }
9660 }
9661
9662 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
9663 TEST_REQUIRES_ARM_NEON_FMA;
9664 for (uint32_t n = 9; n < 16; n++) {
9665 for (size_t k = 1; k <= 20; k += 5) {
9666 for (uint32_t m = 1; m <= 6; m++) {
9667 GemmMicrokernelTester()
9668 .mr(6)
9669 .nr(8)
9670 .kr(1)
9671 .sr(1)
9672 .m(m)
9673 .n(n)
9674 .k(k)
9675 .iterations(1)
9676 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9677 }
9678 }
9679 }
9680 }
9681
9682 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8) {
9683 TEST_REQUIRES_ARM_NEON_FMA;
9684 for (uint32_t n = 16; n <= 24; n += 8) {
9685 for (size_t k = 1; k <= 20; k += 5) {
9686 GemmMicrokernelTester()
9687 .mr(6)
9688 .nr(8)
9689 .kr(1)
9690 .sr(1)
9691 .m(6)
9692 .n(8)
9693 .k(k)
9694 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9695 }
9696 }
9697 }
9698
9699 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
9700 TEST_REQUIRES_ARM_NEON_FMA;
9701 for (uint32_t n = 16; n <= 24; n += 8) {
9702 for (size_t k = 1; k <= 20; k += 5) {
9703 GemmMicrokernelTester()
9704 .mr(6)
9705 .nr(8)
9706 .kr(1)
9707 .sr(1)
9708 .m(6)
9709 .n(n)
9710 .k(k)
9711 .cn_stride(11)
9712 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9713 }
9714 }
9715 }
9716
9717 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
9718 TEST_REQUIRES_ARM_NEON_FMA;
9719 for (uint32_t n = 16; n <= 24; n += 8) {
9720 for (size_t k = 1; k <= 20; k += 5) {
9721 GemmMicrokernelTester()
9722 .mr(6)
9723 .nr(8)
9724 .kr(1)
9725 .sr(1)
9726 .m(6)
9727 .n(n)
9728 .k(k)
9729 .a_stride(23)
9730 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9731 }
9732 }
9733 }
9734
9735 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
9736 TEST_REQUIRES_ARM_NEON_FMA;
9737 for (uint32_t n = 16; n <= 24; n += 8) {
9738 for (size_t k = 1; k <= 20; k += 5) {
9739 for (uint32_t m = 1; m <= 6; m++) {
9740 GemmMicrokernelTester()
9741 .mr(6)
9742 .nr(8)
9743 .kr(1)
9744 .sr(1)
9745 .m(m)
9746 .n(n)
9747 .k(k)
9748 .iterations(1)
9749 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9750 }
9751 }
9752 }
9753 }
9754
9755 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
9756 TEST_REQUIRES_ARM_NEON_FMA;
9757 for (size_t k = 1; k <= 20; k += 5) {
9758 for (uint32_t m = 1; m <= 6; m++) {
9759 for (uint32_t n = 1; n <= 8; n++) {
9760 GemmMicrokernelTester()
9761 .mr(6)
9762 .nr(8)
9763 .kr(1)
9764 .sr(1)
9765 .m(m)
9766 .n(n)
9767 .k(k)
9768 .cm_stride(11)
9769 .iterations(1)
9770 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9771 }
9772 }
9773 }
9774 }
9775
9776 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, qmin) {
9777 TEST_REQUIRES_ARM_NEON_FMA;
9778 GemmMicrokernelTester()
9779 .mr(6)
9780 .nr(8)
9781 .kr(1)
9782 .sr(1)
9783 .m(6)
9784 .n(8)
9785 .k(4)
9786 .qmin(128)
9787 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9788 }
9789
9790 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, qmax) {
9791 TEST_REQUIRES_ARM_NEON_FMA;
9792 GemmMicrokernelTester()
9793 .mr(6)
9794 .nr(8)
9795 .kr(1)
9796 .sr(1)
9797 .m(6)
9798 .n(8)
9799 .k(4)
9800 .qmax(128)
9801 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9802 }
9803
9804 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, strided_cm) {
9805 TEST_REQUIRES_ARM_NEON_FMA;
9806 GemmMicrokernelTester()
9807 .mr(6)
9808 .nr(8)
9809 .kr(1)
9810 .sr(1)
9811 .m(6)
9812 .n(8)
9813 .k(4)
9814 .cm_stride(11)
9815 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
9816 }
9817#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9818
9819
9820#if XNN_ARCH_ARM || XNN_ARCH_ARM64
9821 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2) {
9822 TEST_REQUIRES_ARM_NEON;
9823 GemmMicrokernelTester()
9824 .mr(1)
9825 .nr(8)
9826 .kr(1)
9827 .sr(1)
9828 .m(1)
9829 .n(8)
9830 .k(2)
9831 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9832 }
9833
9834 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cn) {
9835 TEST_REQUIRES_ARM_NEON;
9836 GemmMicrokernelTester()
9837 .mr(1)
9838 .nr(8)
9839 .kr(1)
9840 .sr(1)
9841 .m(1)
9842 .n(8)
9843 .k(2)
9844 .cn_stride(11)
9845 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9846 }
9847
9848 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
9849 TEST_REQUIRES_ARM_NEON;
9850 GemmMicrokernelTester()
9851 .mr(1)
9852 .nr(8)
9853 .kr(1)
9854 .sr(1)
9855 .m(1)
9856 .n(8)
9857 .k(2)
9858 .a_stride(5)
9859 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9860 }
9861
9862 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
9863 TEST_REQUIRES_ARM_NEON;
9864 for (uint32_t m = 1; m <= 1; m++) {
9865 for (uint32_t n = 1; n <= 8; n++) {
9866 GemmMicrokernelTester()
9867 .mr(1)
9868 .nr(8)
9869 .kr(1)
9870 .sr(1)
9871 .m(m)
9872 .n(n)
9873 .k(2)
9874 .iterations(1)
9875 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9876 }
9877 }
9878 }
9879
9880 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
9881 TEST_REQUIRES_ARM_NEON;
9882 for (uint32_t m = 1; m <= 1; m++) {
9883 GemmMicrokernelTester()
9884 .mr(1)
9885 .nr(8)
9886 .kr(1)
9887 .sr(1)
9888 .m(m)
9889 .n(8)
9890 .k(2)
9891 .iterations(1)
9892 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9893 }
9894 }
9895
9896 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
9897 TEST_REQUIRES_ARM_NEON;
9898 for (uint32_t n = 1; n <= 8; n++) {
9899 GemmMicrokernelTester()
9900 .mr(1)
9901 .nr(8)
9902 .kr(1)
9903 .sr(1)
9904 .m(1)
9905 .n(n)
9906 .k(2)
9907 .iterations(1)
9908 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9909 }
9910 }
9911
9912 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2) {
9913 TEST_REQUIRES_ARM_NEON;
9914 for (size_t k = 1; k < 2; k++) {
9915 GemmMicrokernelTester()
9916 .mr(1)
9917 .nr(8)
9918 .kr(1)
9919 .sr(1)
9920 .m(1)
9921 .n(8)
9922 .k(k)
9923 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9924 }
9925 }
9926
9927 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
9928 TEST_REQUIRES_ARM_NEON;
9929 for (size_t k = 1; k < 2; k++) {
9930 GemmMicrokernelTester()
9931 .mr(1)
9932 .nr(8)
9933 .kr(1)
9934 .sr(1)
9935 .m(1)
9936 .n(8)
9937 .k(k)
9938 .a_stride(5)
9939 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9940 }
9941 }
9942
9943 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
9944 TEST_REQUIRES_ARM_NEON;
9945 for (size_t k = 1; k < 2; k++) {
9946 for (uint32_t m = 1; m <= 1; m++) {
9947 for (uint32_t n = 1; n <= 8; n++) {
9948 GemmMicrokernelTester()
9949 .mr(1)
9950 .nr(8)
9951 .kr(1)
9952 .sr(1)
9953 .m(m)
9954 .n(n)
9955 .k(k)
9956 .iterations(1)
9957 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9958 }
9959 }
9960 }
9961 }
9962
9963 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2) {
9964 TEST_REQUIRES_ARM_NEON;
9965 for (size_t k = 3; k < 4; k++) {
9966 GemmMicrokernelTester()
9967 .mr(1)
9968 .nr(8)
9969 .kr(1)
9970 .sr(1)
9971 .m(1)
9972 .n(8)
9973 .k(k)
9974 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9975 }
9976 }
9977
9978 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
9979 TEST_REQUIRES_ARM_NEON;
9980 for (size_t k = 3; k < 4; k++) {
9981 GemmMicrokernelTester()
9982 .mr(1)
9983 .nr(8)
9984 .kr(1)
9985 .sr(1)
9986 .m(1)
9987 .n(8)
9988 .k(k)
9989 .a_stride(7)
9990 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
9991 }
9992 }
9993
9994 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
9995 TEST_REQUIRES_ARM_NEON;
9996 for (size_t k = 3; k < 4; k++) {
9997 for (uint32_t m = 1; m <= 1; m++) {
9998 for (uint32_t n = 1; n <= 8; n++) {
9999 GemmMicrokernelTester()
10000 .mr(1)
10001 .nr(8)
10002 .kr(1)
10003 .sr(1)
10004 .m(m)
10005 .n(n)
10006 .k(k)
10007 .iterations(1)
10008 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10009 }
10010 }
10011 }
10012 }
10013
10014 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2) {
10015 TEST_REQUIRES_ARM_NEON;
10016 for (size_t k = 4; k <= 20; k += 2) {
10017 GemmMicrokernelTester()
10018 .mr(1)
10019 .nr(8)
10020 .kr(1)
10021 .sr(1)
10022 .m(1)
10023 .n(8)
10024 .k(k)
10025 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10026 }
10027 }
10028
10029 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
10030 TEST_REQUIRES_ARM_NEON;
10031 for (size_t k = 4; k <= 20; k += 2) {
10032 GemmMicrokernelTester()
10033 .mr(1)
10034 .nr(8)
10035 .kr(1)
10036 .sr(1)
10037 .m(1)
10038 .n(8)
10039 .k(k)
10040 .a_stride(23)
10041 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10042 }
10043 }
10044
10045 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2_subtile) {
10046 TEST_REQUIRES_ARM_NEON;
10047 for (size_t k = 4; k <= 20; k += 2) {
10048 for (uint32_t m = 1; m <= 1; m++) {
10049 for (uint32_t n = 1; n <= 8; n++) {
10050 GemmMicrokernelTester()
10051 .mr(1)
10052 .nr(8)
10053 .kr(1)
10054 .sr(1)
10055 .m(m)
10056 .n(n)
10057 .k(k)
10058 .iterations(1)
10059 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10060 }
10061 }
10062 }
10063 }
10064
10065 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8) {
10066 TEST_REQUIRES_ARM_NEON;
10067 for (uint32_t n = 9; n < 16; n++) {
10068 for (size_t k = 1; k <= 10; k += 3) {
10069 GemmMicrokernelTester()
10070 .mr(1)
10071 .nr(8)
10072 .kr(1)
10073 .sr(1)
10074 .m(1)
10075 .n(8)
10076 .k(k)
10077 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10078 }
10079 }
10080 }
10081
10082 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
10083 TEST_REQUIRES_ARM_NEON;
10084 for (uint32_t n = 9; n < 16; n++) {
10085 for (size_t k = 1; k <= 10; k += 3) {
10086 GemmMicrokernelTester()
10087 .mr(1)
10088 .nr(8)
10089 .kr(1)
10090 .sr(1)
10091 .m(1)
10092 .n(8)
10093 .k(k)
10094 .cn_stride(11)
10095 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10096 }
10097 }
10098 }
10099
10100 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
10101 TEST_REQUIRES_ARM_NEON;
10102 for (uint32_t n = 9; n < 16; n++) {
10103 for (size_t k = 1; k <= 10; k += 3) {
10104 GemmMicrokernelTester()
10105 .mr(1)
10106 .nr(8)
10107 .kr(1)
10108 .sr(1)
10109 .m(1)
10110 .n(n)
10111 .k(k)
10112 .a_stride(13)
10113 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10114 }
10115 }
10116 }
10117
10118 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
10119 TEST_REQUIRES_ARM_NEON;
10120 for (uint32_t n = 9; n < 16; n++) {
10121 for (size_t k = 1; k <= 10; k += 3) {
10122 for (uint32_t m = 1; m <= 1; m++) {
10123 GemmMicrokernelTester()
10124 .mr(1)
10125 .nr(8)
10126 .kr(1)
10127 .sr(1)
10128 .m(m)
10129 .n(n)
10130 .k(k)
10131 .iterations(1)
10132 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10133 }
10134 }
10135 }
10136 }
10137
10138 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8) {
10139 TEST_REQUIRES_ARM_NEON;
10140 for (uint32_t n = 16; n <= 24; n += 8) {
10141 for (size_t k = 1; k <= 10; k += 3) {
10142 GemmMicrokernelTester()
10143 .mr(1)
10144 .nr(8)
10145 .kr(1)
10146 .sr(1)
10147 .m(1)
10148 .n(8)
10149 .k(k)
10150 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10151 }
10152 }
10153 }
10154
10155 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
10156 TEST_REQUIRES_ARM_NEON;
10157 for (uint32_t n = 16; n <= 24; n += 8) {
10158 for (size_t k = 1; k <= 10; k += 3) {
10159 GemmMicrokernelTester()
10160 .mr(1)
10161 .nr(8)
10162 .kr(1)
10163 .sr(1)
10164 .m(1)
10165 .n(n)
10166 .k(k)
10167 .cn_stride(11)
10168 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10169 }
10170 }
10171 }
10172
10173 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
10174 TEST_REQUIRES_ARM_NEON;
10175 for (uint32_t n = 16; n <= 24; n += 8) {
10176 for (size_t k = 1; k <= 10; k += 3) {
10177 GemmMicrokernelTester()
10178 .mr(1)
10179 .nr(8)
10180 .kr(1)
10181 .sr(1)
10182 .m(1)
10183 .n(n)
10184 .k(k)
10185 .a_stride(13)
10186 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10187 }
10188 }
10189 }
10190
10191 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_subtile) {
10192 TEST_REQUIRES_ARM_NEON;
10193 for (uint32_t n = 16; n <= 24; n += 8) {
10194 for (size_t k = 1; k <= 10; k += 3) {
10195 for (uint32_t m = 1; m <= 1; m++) {
10196 GemmMicrokernelTester()
10197 .mr(1)
10198 .nr(8)
10199 .kr(1)
10200 .sr(1)
10201 .m(m)
10202 .n(n)
10203 .k(k)
10204 .iterations(1)
10205 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10206 }
10207 }
10208 }
10209 }
10210
10211 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cm_subtile) {
10212 TEST_REQUIRES_ARM_NEON;
10213 for (size_t k = 1; k <= 10; k += 3) {
10214 for (uint32_t m = 1; m <= 1; m++) {
10215 for (uint32_t n = 1; n <= 8; n++) {
10216 GemmMicrokernelTester()
10217 .mr(1)
10218 .nr(8)
10219 .kr(1)
10220 .sr(1)
10221 .m(m)
10222 .n(n)
10223 .k(k)
10224 .cm_stride(11)
10225 .iterations(1)
10226 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10227 }
10228 }
10229 }
10230 }
10231
10232 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, qmin) {
10233 TEST_REQUIRES_ARM_NEON;
10234 GemmMicrokernelTester()
10235 .mr(1)
10236 .nr(8)
10237 .kr(1)
10238 .sr(1)
10239 .m(1)
10240 .n(8)
10241 .k(2)
10242 .qmin(128)
10243 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10244 }
10245
10246 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, qmax) {
10247 TEST_REQUIRES_ARM_NEON;
10248 GemmMicrokernelTester()
10249 .mr(1)
10250 .nr(8)
10251 .kr(1)
10252 .sr(1)
10253 .m(1)
10254 .n(8)
10255 .k(2)
10256 .qmax(128)
10257 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10258 }
10259
10260 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cm) {
10261 TEST_REQUIRES_ARM_NEON;
10262 GemmMicrokernelTester()
10263 .mr(1)
10264 .nr(8)
10265 .kr(1)
10266 .sr(1)
10267 .m(1)
10268 .n(8)
10269 .k(2)
10270 .cm_stride(11)
10271 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
10272 }
10273#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10274
10275
10276#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10277 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2) {
10278 TEST_REQUIRES_ARM_NEON;
10279 GemmMicrokernelTester()
10280 .mr(4)
10281 .nr(8)
10282 .kr(1)
10283 .sr(1)
10284 .m(4)
10285 .n(8)
10286 .k(2)
10287 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10288 }
10289
10290 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cn) {
10291 TEST_REQUIRES_ARM_NEON;
10292 GemmMicrokernelTester()
10293 .mr(4)
10294 .nr(8)
10295 .kr(1)
10296 .sr(1)
10297 .m(4)
10298 .n(8)
10299 .k(2)
10300 .cn_stride(11)
10301 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10302 }
10303
10304 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
10305 TEST_REQUIRES_ARM_NEON;
10306 GemmMicrokernelTester()
10307 .mr(4)
10308 .nr(8)
10309 .kr(1)
10310 .sr(1)
10311 .m(4)
10312 .n(8)
10313 .k(2)
10314 .a_stride(5)
10315 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10316 }
10317
10318 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
10319 TEST_REQUIRES_ARM_NEON;
10320 for (uint32_t m = 1; m <= 4; m++) {
10321 for (uint32_t n = 1; n <= 8; n++) {
10322 GemmMicrokernelTester()
10323 .mr(4)
10324 .nr(8)
10325 .kr(1)
10326 .sr(1)
10327 .m(m)
10328 .n(n)
10329 .k(2)
10330 .iterations(1)
10331 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10332 }
10333 }
10334 }
10335
10336 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
10337 TEST_REQUIRES_ARM_NEON;
10338 for (uint32_t m = 1; m <= 4; m++) {
10339 GemmMicrokernelTester()
10340 .mr(4)
10341 .nr(8)
10342 .kr(1)
10343 .sr(1)
10344 .m(m)
10345 .n(8)
10346 .k(2)
10347 .iterations(1)
10348 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10349 }
10350 }
10351
10352 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
10353 TEST_REQUIRES_ARM_NEON;
10354 for (uint32_t n = 1; n <= 8; n++) {
10355 GemmMicrokernelTester()
10356 .mr(4)
10357 .nr(8)
10358 .kr(1)
10359 .sr(1)
10360 .m(4)
10361 .n(n)
10362 .k(2)
10363 .iterations(1)
10364 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10365 }
10366 }
10367
10368 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2) {
10369 TEST_REQUIRES_ARM_NEON;
10370 for (size_t k = 1; k < 2; k++) {
10371 GemmMicrokernelTester()
10372 .mr(4)
10373 .nr(8)
10374 .kr(1)
10375 .sr(1)
10376 .m(4)
10377 .n(8)
10378 .k(k)
10379 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10380 }
10381 }
10382
10383 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
10384 TEST_REQUIRES_ARM_NEON;
10385 for (size_t k = 1; k < 2; k++) {
10386 GemmMicrokernelTester()
10387 .mr(4)
10388 .nr(8)
10389 .kr(1)
10390 .sr(1)
10391 .m(4)
10392 .n(8)
10393 .k(k)
10394 .a_stride(5)
10395 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10396 }
10397 }
10398
10399 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
10400 TEST_REQUIRES_ARM_NEON;
10401 for (size_t k = 1; k < 2; k++) {
10402 for (uint32_t m = 1; m <= 4; m++) {
10403 for (uint32_t n = 1; n <= 8; n++) {
10404 GemmMicrokernelTester()
10405 .mr(4)
10406 .nr(8)
10407 .kr(1)
10408 .sr(1)
10409 .m(m)
10410 .n(n)
10411 .k(k)
10412 .iterations(1)
10413 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10414 }
10415 }
10416 }
10417 }
10418
10419 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2) {
10420 TEST_REQUIRES_ARM_NEON;
10421 for (size_t k = 3; k < 4; k++) {
10422 GemmMicrokernelTester()
10423 .mr(4)
10424 .nr(8)
10425 .kr(1)
10426 .sr(1)
10427 .m(4)
10428 .n(8)
10429 .k(k)
10430 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10431 }
10432 }
10433
10434 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
10435 TEST_REQUIRES_ARM_NEON;
10436 for (size_t k = 3; k < 4; k++) {
10437 GemmMicrokernelTester()
10438 .mr(4)
10439 .nr(8)
10440 .kr(1)
10441 .sr(1)
10442 .m(4)
10443 .n(8)
10444 .k(k)
10445 .a_stride(7)
10446 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10447 }
10448 }
10449
10450 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
10451 TEST_REQUIRES_ARM_NEON;
10452 for (size_t k = 3; k < 4; k++) {
10453 for (uint32_t m = 1; m <= 4; m++) {
10454 for (uint32_t n = 1; n <= 8; n++) {
10455 GemmMicrokernelTester()
10456 .mr(4)
10457 .nr(8)
10458 .kr(1)
10459 .sr(1)
10460 .m(m)
10461 .n(n)
10462 .k(k)
10463 .iterations(1)
10464 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10465 }
10466 }
10467 }
10468 }
10469
10470 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2) {
10471 TEST_REQUIRES_ARM_NEON;
10472 for (size_t k = 4; k <= 20; k += 2) {
10473 GemmMicrokernelTester()
10474 .mr(4)
10475 .nr(8)
10476 .kr(1)
10477 .sr(1)
10478 .m(4)
10479 .n(8)
10480 .k(k)
10481 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10482 }
10483 }
10484
10485 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
10486 TEST_REQUIRES_ARM_NEON;
10487 for (size_t k = 4; k <= 20; k += 2) {
10488 GemmMicrokernelTester()
10489 .mr(4)
10490 .nr(8)
10491 .kr(1)
10492 .sr(1)
10493 .m(4)
10494 .n(8)
10495 .k(k)
10496 .a_stride(23)
10497 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10498 }
10499 }
10500
10501 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2_subtile) {
10502 TEST_REQUIRES_ARM_NEON;
10503 for (size_t k = 4; k <= 20; k += 2) {
10504 for (uint32_t m = 1; m <= 4; m++) {
10505 for (uint32_t n = 1; n <= 8; n++) {
10506 GemmMicrokernelTester()
10507 .mr(4)
10508 .nr(8)
10509 .kr(1)
10510 .sr(1)
10511 .m(m)
10512 .n(n)
10513 .k(k)
10514 .iterations(1)
10515 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10516 }
10517 }
10518 }
10519 }
10520
10521 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8) {
10522 TEST_REQUIRES_ARM_NEON;
10523 for (uint32_t n = 9; n < 16; n++) {
10524 for (size_t k = 1; k <= 10; k += 3) {
10525 GemmMicrokernelTester()
10526 .mr(4)
10527 .nr(8)
10528 .kr(1)
10529 .sr(1)
10530 .m(4)
10531 .n(8)
10532 .k(k)
10533 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10534 }
10535 }
10536 }
10537
10538 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
10539 TEST_REQUIRES_ARM_NEON;
10540 for (uint32_t n = 9; n < 16; n++) {
10541 for (size_t k = 1; k <= 10; k += 3) {
10542 GemmMicrokernelTester()
10543 .mr(4)
10544 .nr(8)
10545 .kr(1)
10546 .sr(1)
10547 .m(4)
10548 .n(8)
10549 .k(k)
10550 .cn_stride(11)
10551 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10552 }
10553 }
10554 }
10555
10556 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
10557 TEST_REQUIRES_ARM_NEON;
10558 for (uint32_t n = 9; n < 16; n++) {
10559 for (size_t k = 1; k <= 10; k += 3) {
10560 GemmMicrokernelTester()
10561 .mr(4)
10562 .nr(8)
10563 .kr(1)
10564 .sr(1)
10565 .m(4)
10566 .n(n)
10567 .k(k)
10568 .a_stride(13)
10569 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10570 }
10571 }
10572 }
10573
10574 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
10575 TEST_REQUIRES_ARM_NEON;
10576 for (uint32_t n = 9; n < 16; n++) {
10577 for (size_t k = 1; k <= 10; k += 3) {
10578 for (uint32_t m = 1; m <= 4; m++) {
10579 GemmMicrokernelTester()
10580 .mr(4)
10581 .nr(8)
10582 .kr(1)
10583 .sr(1)
10584 .m(m)
10585 .n(n)
10586 .k(k)
10587 .iterations(1)
10588 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10589 }
10590 }
10591 }
10592 }
10593
10594 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8) {
10595 TEST_REQUIRES_ARM_NEON;
10596 for (uint32_t n = 16; n <= 24; n += 8) {
10597 for (size_t k = 1; k <= 10; k += 3) {
10598 GemmMicrokernelTester()
10599 .mr(4)
10600 .nr(8)
10601 .kr(1)
10602 .sr(1)
10603 .m(4)
10604 .n(8)
10605 .k(k)
10606 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10607 }
10608 }
10609 }
10610
10611 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
10612 TEST_REQUIRES_ARM_NEON;
10613 for (uint32_t n = 16; n <= 24; n += 8) {
10614 for (size_t k = 1; k <= 10; k += 3) {
10615 GemmMicrokernelTester()
10616 .mr(4)
10617 .nr(8)
10618 .kr(1)
10619 .sr(1)
10620 .m(4)
10621 .n(n)
10622 .k(k)
10623 .cn_stride(11)
10624 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10625 }
10626 }
10627 }
10628
10629 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
10630 TEST_REQUIRES_ARM_NEON;
10631 for (uint32_t n = 16; n <= 24; n += 8) {
10632 for (size_t k = 1; k <= 10; k += 3) {
10633 GemmMicrokernelTester()
10634 .mr(4)
10635 .nr(8)
10636 .kr(1)
10637 .sr(1)
10638 .m(4)
10639 .n(n)
10640 .k(k)
10641 .a_stride(13)
10642 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10643 }
10644 }
10645 }
10646
10647 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_subtile) {
10648 TEST_REQUIRES_ARM_NEON;
10649 for (uint32_t n = 16; n <= 24; n += 8) {
10650 for (size_t k = 1; k <= 10; k += 3) {
10651 for (uint32_t m = 1; m <= 4; m++) {
10652 GemmMicrokernelTester()
10653 .mr(4)
10654 .nr(8)
10655 .kr(1)
10656 .sr(1)
10657 .m(m)
10658 .n(n)
10659 .k(k)
10660 .iterations(1)
10661 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10662 }
10663 }
10664 }
10665 }
10666
10667 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cm_subtile) {
10668 TEST_REQUIRES_ARM_NEON;
10669 for (size_t k = 1; k <= 10; k += 3) {
10670 for (uint32_t m = 1; m <= 4; m++) {
10671 for (uint32_t n = 1; n <= 8; n++) {
10672 GemmMicrokernelTester()
10673 .mr(4)
10674 .nr(8)
10675 .kr(1)
10676 .sr(1)
10677 .m(m)
10678 .n(n)
10679 .k(k)
10680 .cm_stride(11)
10681 .iterations(1)
10682 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10683 }
10684 }
10685 }
10686 }
10687
10688 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, qmin) {
10689 TEST_REQUIRES_ARM_NEON;
10690 GemmMicrokernelTester()
10691 .mr(4)
10692 .nr(8)
10693 .kr(1)
10694 .sr(1)
10695 .m(4)
10696 .n(8)
10697 .k(2)
10698 .qmin(128)
10699 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10700 }
10701
10702 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, qmax) {
10703 TEST_REQUIRES_ARM_NEON;
10704 GemmMicrokernelTester()
10705 .mr(4)
10706 .nr(8)
10707 .kr(1)
10708 .sr(1)
10709 .m(4)
10710 .n(8)
10711 .k(2)
10712 .qmax(128)
10713 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10714 }
10715
10716 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cm) {
10717 TEST_REQUIRES_ARM_NEON;
10718 GemmMicrokernelTester()
10719 .mr(4)
10720 .nr(8)
10721 .kr(1)
10722 .sr(1)
10723 .m(4)
10724 .n(8)
10725 .k(2)
10726 .cm_stride(11)
10727 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
10728 }
10729#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
10730
10731
10732#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10733 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4) {
10734 TEST_REQUIRES_ARM_NEON;
10735 GemmMicrokernelTester()
10736 .mr(4)
10737 .nr(8)
10738 .kr(1)
10739 .sr(1)
10740 .m(4)
10741 .n(8)
10742 .k(4)
10743 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10744 }
10745
10746 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cn) {
10747 TEST_REQUIRES_ARM_NEON;
10748 GemmMicrokernelTester()
10749 .mr(4)
10750 .nr(8)
10751 .kr(1)
10752 .sr(1)
10753 .m(4)
10754 .n(8)
10755 .k(4)
10756 .cn_stride(11)
10757 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10758 }
10759
10760 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
10761 TEST_REQUIRES_ARM_NEON;
10762 GemmMicrokernelTester()
10763 .mr(4)
10764 .nr(8)
10765 .kr(1)
10766 .sr(1)
10767 .m(4)
10768 .n(8)
10769 .k(4)
10770 .a_stride(7)
10771 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10772 }
10773
10774 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
10775 TEST_REQUIRES_ARM_NEON;
10776 for (uint32_t m = 1; m <= 4; m++) {
10777 for (uint32_t n = 1; n <= 8; n++) {
10778 GemmMicrokernelTester()
10779 .mr(4)
10780 .nr(8)
10781 .kr(1)
10782 .sr(1)
10783 .m(m)
10784 .n(n)
10785 .k(4)
10786 .iterations(1)
10787 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10788 }
10789 }
10790 }
10791
10792 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
10793 TEST_REQUIRES_ARM_NEON;
10794 for (uint32_t m = 1; m <= 4; m++) {
10795 GemmMicrokernelTester()
10796 .mr(4)
10797 .nr(8)
10798 .kr(1)
10799 .sr(1)
10800 .m(m)
10801 .n(8)
10802 .k(4)
10803 .iterations(1)
10804 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10805 }
10806 }
10807
10808 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
10809 TEST_REQUIRES_ARM_NEON;
10810 for (uint32_t n = 1; n <= 8; n++) {
10811 GemmMicrokernelTester()
10812 .mr(4)
10813 .nr(8)
10814 .kr(1)
10815 .sr(1)
10816 .m(4)
10817 .n(n)
10818 .k(4)
10819 .iterations(1)
10820 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10821 }
10822 }
10823
10824 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4) {
10825 TEST_REQUIRES_ARM_NEON;
10826 for (size_t k = 1; k < 4; k++) {
10827 GemmMicrokernelTester()
10828 .mr(4)
10829 .nr(8)
10830 .kr(1)
10831 .sr(1)
10832 .m(4)
10833 .n(8)
10834 .k(k)
10835 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10836 }
10837 }
10838
10839 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
10840 TEST_REQUIRES_ARM_NEON;
10841 for (size_t k = 1; k < 4; k++) {
10842 GemmMicrokernelTester()
10843 .mr(4)
10844 .nr(8)
10845 .kr(1)
10846 .sr(1)
10847 .m(4)
10848 .n(8)
10849 .k(k)
10850 .a_stride(7)
10851 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10852 }
10853 }
10854
10855 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
10856 TEST_REQUIRES_ARM_NEON;
10857 for (size_t k = 1; k < 4; k++) {
10858 for (uint32_t m = 1; m <= 4; m++) {
10859 for (uint32_t n = 1; n <= 8; n++) {
10860 GemmMicrokernelTester()
10861 .mr(4)
10862 .nr(8)
10863 .kr(1)
10864 .sr(1)
10865 .m(m)
10866 .n(n)
10867 .k(k)
10868 .iterations(1)
10869 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10870 }
10871 }
10872 }
10873 }
10874
10875 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4) {
10876 TEST_REQUIRES_ARM_NEON;
10877 for (size_t k = 5; k < 8; k++) {
10878 GemmMicrokernelTester()
10879 .mr(4)
10880 .nr(8)
10881 .kr(1)
10882 .sr(1)
10883 .m(4)
10884 .n(8)
10885 .k(k)
10886 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10887 }
10888 }
10889
10890 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
10891 TEST_REQUIRES_ARM_NEON;
10892 for (size_t k = 5; k < 8; k++) {
10893 GemmMicrokernelTester()
10894 .mr(4)
10895 .nr(8)
10896 .kr(1)
10897 .sr(1)
10898 .m(4)
10899 .n(8)
10900 .k(k)
10901 .a_stride(11)
10902 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10903 }
10904 }
10905
10906 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
10907 TEST_REQUIRES_ARM_NEON;
10908 for (size_t k = 5; k < 8; k++) {
10909 for (uint32_t m = 1; m <= 4; m++) {
10910 for (uint32_t n = 1; n <= 8; n++) {
10911 GemmMicrokernelTester()
10912 .mr(4)
10913 .nr(8)
10914 .kr(1)
10915 .sr(1)
10916 .m(m)
10917 .n(n)
10918 .k(k)
10919 .iterations(1)
10920 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10921 }
10922 }
10923 }
10924 }
10925
10926 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4) {
10927 TEST_REQUIRES_ARM_NEON;
10928 for (size_t k = 8; k <= 40; k += 4) {
10929 GemmMicrokernelTester()
10930 .mr(4)
10931 .nr(8)
10932 .kr(1)
10933 .sr(1)
10934 .m(4)
10935 .n(8)
10936 .k(k)
10937 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10938 }
10939 }
10940
10941 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
10942 TEST_REQUIRES_ARM_NEON;
10943 for (size_t k = 8; k <= 40; k += 4) {
10944 GemmMicrokernelTester()
10945 .mr(4)
10946 .nr(8)
10947 .kr(1)
10948 .sr(1)
10949 .m(4)
10950 .n(8)
10951 .k(k)
10952 .a_stride(43)
10953 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10954 }
10955 }
10956
10957 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4_subtile) {
10958 TEST_REQUIRES_ARM_NEON;
10959 for (size_t k = 8; k <= 40; k += 4) {
10960 for (uint32_t m = 1; m <= 4; m++) {
10961 for (uint32_t n = 1; n <= 8; n++) {
10962 GemmMicrokernelTester()
10963 .mr(4)
10964 .nr(8)
10965 .kr(1)
10966 .sr(1)
10967 .m(m)
10968 .n(n)
10969 .k(k)
10970 .iterations(1)
10971 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10972 }
10973 }
10974 }
10975 }
10976
10977 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8) {
10978 TEST_REQUIRES_ARM_NEON;
10979 for (uint32_t n = 9; n < 16; n++) {
10980 for (size_t k = 1; k <= 20; k += 5) {
10981 GemmMicrokernelTester()
10982 .mr(4)
10983 .nr(8)
10984 .kr(1)
10985 .sr(1)
10986 .m(4)
10987 .n(8)
10988 .k(k)
10989 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
10990 }
10991 }
10992 }
10993
10994 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
10995 TEST_REQUIRES_ARM_NEON;
10996 for (uint32_t n = 9; n < 16; n++) {
10997 for (size_t k = 1; k <= 20; k += 5) {
10998 GemmMicrokernelTester()
10999 .mr(4)
11000 .nr(8)
11001 .kr(1)
11002 .sr(1)
11003 .m(4)
11004 .n(8)
11005 .k(k)
11006 .cn_stride(11)
11007 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11008 }
11009 }
11010 }
11011
11012 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
11013 TEST_REQUIRES_ARM_NEON;
11014 for (uint32_t n = 9; n < 16; n++) {
11015 for (size_t k = 1; k <= 20; k += 5) {
11016 GemmMicrokernelTester()
11017 .mr(4)
11018 .nr(8)
11019 .kr(1)
11020 .sr(1)
11021 .m(4)
11022 .n(n)
11023 .k(k)
11024 .a_stride(23)
11025 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11026 }
11027 }
11028 }
11029
11030 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
11031 TEST_REQUIRES_ARM_NEON;
11032 for (uint32_t n = 9; n < 16; n++) {
11033 for (size_t k = 1; k <= 20; k += 5) {
11034 for (uint32_t m = 1; m <= 4; m++) {
11035 GemmMicrokernelTester()
11036 .mr(4)
11037 .nr(8)
11038 .kr(1)
11039 .sr(1)
11040 .m(m)
11041 .n(n)
11042 .k(k)
11043 .iterations(1)
11044 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11045 }
11046 }
11047 }
11048 }
11049
11050 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8) {
11051 TEST_REQUIRES_ARM_NEON;
11052 for (uint32_t n = 16; n <= 24; n += 8) {
11053 for (size_t k = 1; k <= 20; k += 5) {
11054 GemmMicrokernelTester()
11055 .mr(4)
11056 .nr(8)
11057 .kr(1)
11058 .sr(1)
11059 .m(4)
11060 .n(8)
11061 .k(k)
11062 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11063 }
11064 }
11065 }
11066
11067 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
11068 TEST_REQUIRES_ARM_NEON;
11069 for (uint32_t n = 16; n <= 24; n += 8) {
11070 for (size_t k = 1; k <= 20; k += 5) {
11071 GemmMicrokernelTester()
11072 .mr(4)
11073 .nr(8)
11074 .kr(1)
11075 .sr(1)
11076 .m(4)
11077 .n(n)
11078 .k(k)
11079 .cn_stride(11)
11080 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11081 }
11082 }
11083 }
11084
11085 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
11086 TEST_REQUIRES_ARM_NEON;
11087 for (uint32_t n = 16; n <= 24; n += 8) {
11088 for (size_t k = 1; k <= 20; k += 5) {
11089 GemmMicrokernelTester()
11090 .mr(4)
11091 .nr(8)
11092 .kr(1)
11093 .sr(1)
11094 .m(4)
11095 .n(n)
11096 .k(k)
11097 .a_stride(23)
11098 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11099 }
11100 }
11101 }
11102
11103 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_subtile) {
11104 TEST_REQUIRES_ARM_NEON;
11105 for (uint32_t n = 16; n <= 24; n += 8) {
11106 for (size_t k = 1; k <= 20; k += 5) {
11107 for (uint32_t m = 1; m <= 4; m++) {
11108 GemmMicrokernelTester()
11109 .mr(4)
11110 .nr(8)
11111 .kr(1)
11112 .sr(1)
11113 .m(m)
11114 .n(n)
11115 .k(k)
11116 .iterations(1)
11117 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11118 }
11119 }
11120 }
11121 }
11122
11123 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cm_subtile) {
11124 TEST_REQUIRES_ARM_NEON;
11125 for (size_t k = 1; k <= 20; k += 5) {
11126 for (uint32_t m = 1; m <= 4; m++) {
11127 for (uint32_t n = 1; n <= 8; n++) {
11128 GemmMicrokernelTester()
11129 .mr(4)
11130 .nr(8)
11131 .kr(1)
11132 .sr(1)
11133 .m(m)
11134 .n(n)
11135 .k(k)
11136 .cm_stride(11)
11137 .iterations(1)
11138 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11139 }
11140 }
11141 }
11142 }
11143
11144 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, qmin) {
11145 TEST_REQUIRES_ARM_NEON;
11146 GemmMicrokernelTester()
11147 .mr(4)
11148 .nr(8)
11149 .kr(1)
11150 .sr(1)
11151 .m(4)
11152 .n(8)
11153 .k(4)
11154 .qmin(128)
11155 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11156 }
11157
11158 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, qmax) {
11159 TEST_REQUIRES_ARM_NEON;
11160 GemmMicrokernelTester()
11161 .mr(4)
11162 .nr(8)
11163 .kr(1)
11164 .sr(1)
11165 .m(4)
11166 .n(8)
11167 .k(4)
11168 .qmax(128)
11169 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11170 }
11171
11172 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cm) {
11173 TEST_REQUIRES_ARM_NEON;
11174 GemmMicrokernelTester()
11175 .mr(4)
11176 .nr(8)
11177 .kr(1)
11178 .sr(1)
11179 .m(4)
11180 .n(8)
11181 .k(4)
11182 .cm_stride(11)
11183 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
11184 }
11185#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11186
11187
11188#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11189 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2) {
11190 TEST_REQUIRES_ARM_NEON;
11191 GemmMicrokernelTester()
11192 .mr(5)
11193 .nr(8)
11194 .kr(1)
11195 .sr(1)
11196 .m(5)
11197 .n(8)
11198 .k(2)
11199 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11200 }
11201
11202 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cn) {
11203 TEST_REQUIRES_ARM_NEON;
11204 GemmMicrokernelTester()
11205 .mr(5)
11206 .nr(8)
11207 .kr(1)
11208 .sr(1)
11209 .m(5)
11210 .n(8)
11211 .k(2)
11212 .cn_stride(11)
11213 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11214 }
11215
11216 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
11217 TEST_REQUIRES_ARM_NEON;
11218 GemmMicrokernelTester()
11219 .mr(5)
11220 .nr(8)
11221 .kr(1)
11222 .sr(1)
11223 .m(5)
11224 .n(8)
11225 .k(2)
11226 .a_stride(5)
11227 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11228 }
11229
11230 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
11231 TEST_REQUIRES_ARM_NEON;
11232 for (uint32_t m = 1; m <= 5; m++) {
11233 for (uint32_t n = 1; n <= 8; n++) {
11234 GemmMicrokernelTester()
11235 .mr(5)
11236 .nr(8)
11237 .kr(1)
11238 .sr(1)
11239 .m(m)
11240 .n(n)
11241 .k(2)
11242 .iterations(1)
11243 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11244 }
11245 }
11246 }
11247
11248 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
11249 TEST_REQUIRES_ARM_NEON;
11250 for (uint32_t m = 1; m <= 5; m++) {
11251 GemmMicrokernelTester()
11252 .mr(5)
11253 .nr(8)
11254 .kr(1)
11255 .sr(1)
11256 .m(m)
11257 .n(8)
11258 .k(2)
11259 .iterations(1)
11260 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11261 }
11262 }
11263
11264 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
11265 TEST_REQUIRES_ARM_NEON;
11266 for (uint32_t n = 1; n <= 8; n++) {
11267 GemmMicrokernelTester()
11268 .mr(5)
11269 .nr(8)
11270 .kr(1)
11271 .sr(1)
11272 .m(5)
11273 .n(n)
11274 .k(2)
11275 .iterations(1)
11276 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11277 }
11278 }
11279
11280 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2) {
11281 TEST_REQUIRES_ARM_NEON;
11282 for (size_t k = 1; k < 2; k++) {
11283 GemmMicrokernelTester()
11284 .mr(5)
11285 .nr(8)
11286 .kr(1)
11287 .sr(1)
11288 .m(5)
11289 .n(8)
11290 .k(k)
11291 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11292 }
11293 }
11294
11295 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
11296 TEST_REQUIRES_ARM_NEON;
11297 for (size_t k = 1; k < 2; k++) {
11298 GemmMicrokernelTester()
11299 .mr(5)
11300 .nr(8)
11301 .kr(1)
11302 .sr(1)
11303 .m(5)
11304 .n(8)
11305 .k(k)
11306 .a_stride(5)
11307 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11308 }
11309 }
11310
11311 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
11312 TEST_REQUIRES_ARM_NEON;
11313 for (size_t k = 1; k < 2; k++) {
11314 for (uint32_t m = 1; m <= 5; m++) {
11315 for (uint32_t n = 1; n <= 8; n++) {
11316 GemmMicrokernelTester()
11317 .mr(5)
11318 .nr(8)
11319 .kr(1)
11320 .sr(1)
11321 .m(m)
11322 .n(n)
11323 .k(k)
11324 .iterations(1)
11325 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11326 }
11327 }
11328 }
11329 }
11330
11331 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2) {
11332 TEST_REQUIRES_ARM_NEON;
11333 for (size_t k = 3; k < 4; k++) {
11334 GemmMicrokernelTester()
11335 .mr(5)
11336 .nr(8)
11337 .kr(1)
11338 .sr(1)
11339 .m(5)
11340 .n(8)
11341 .k(k)
11342 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11343 }
11344 }
11345
11346 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
11347 TEST_REQUIRES_ARM_NEON;
11348 for (size_t k = 3; k < 4; k++) {
11349 GemmMicrokernelTester()
11350 .mr(5)
11351 .nr(8)
11352 .kr(1)
11353 .sr(1)
11354 .m(5)
11355 .n(8)
11356 .k(k)
11357 .a_stride(7)
11358 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11359 }
11360 }
11361
11362 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
11363 TEST_REQUIRES_ARM_NEON;
11364 for (size_t k = 3; k < 4; k++) {
11365 for (uint32_t m = 1; m <= 5; m++) {
11366 for (uint32_t n = 1; n <= 8; n++) {
11367 GemmMicrokernelTester()
11368 .mr(5)
11369 .nr(8)
11370 .kr(1)
11371 .sr(1)
11372 .m(m)
11373 .n(n)
11374 .k(k)
11375 .iterations(1)
11376 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11377 }
11378 }
11379 }
11380 }
11381
11382 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2) {
11383 TEST_REQUIRES_ARM_NEON;
11384 for (size_t k = 4; k <= 20; k += 2) {
11385 GemmMicrokernelTester()
11386 .mr(5)
11387 .nr(8)
11388 .kr(1)
11389 .sr(1)
11390 .m(5)
11391 .n(8)
11392 .k(k)
11393 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11394 }
11395 }
11396
11397 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
11398 TEST_REQUIRES_ARM_NEON;
11399 for (size_t k = 4; k <= 20; k += 2) {
11400 GemmMicrokernelTester()
11401 .mr(5)
11402 .nr(8)
11403 .kr(1)
11404 .sr(1)
11405 .m(5)
11406 .n(8)
11407 .k(k)
11408 .a_stride(23)
11409 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11410 }
11411 }
11412
11413 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2_subtile) {
11414 TEST_REQUIRES_ARM_NEON;
11415 for (size_t k = 4; k <= 20; k += 2) {
11416 for (uint32_t m = 1; m <= 5; m++) {
11417 for (uint32_t n = 1; n <= 8; n++) {
11418 GemmMicrokernelTester()
11419 .mr(5)
11420 .nr(8)
11421 .kr(1)
11422 .sr(1)
11423 .m(m)
11424 .n(n)
11425 .k(k)
11426 .iterations(1)
11427 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11428 }
11429 }
11430 }
11431 }
11432
11433 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8) {
11434 TEST_REQUIRES_ARM_NEON;
11435 for (uint32_t n = 9; n < 16; n++) {
11436 for (size_t k = 1; k <= 10; k += 3) {
11437 GemmMicrokernelTester()
11438 .mr(5)
11439 .nr(8)
11440 .kr(1)
11441 .sr(1)
11442 .m(5)
11443 .n(8)
11444 .k(k)
11445 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11446 }
11447 }
11448 }
11449
11450 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
11451 TEST_REQUIRES_ARM_NEON;
11452 for (uint32_t n = 9; n < 16; n++) {
11453 for (size_t k = 1; k <= 10; k += 3) {
11454 GemmMicrokernelTester()
11455 .mr(5)
11456 .nr(8)
11457 .kr(1)
11458 .sr(1)
11459 .m(5)
11460 .n(8)
11461 .k(k)
11462 .cn_stride(11)
11463 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11464 }
11465 }
11466 }
11467
11468 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
11469 TEST_REQUIRES_ARM_NEON;
11470 for (uint32_t n = 9; n < 16; n++) {
11471 for (size_t k = 1; k <= 10; k += 3) {
11472 GemmMicrokernelTester()
11473 .mr(5)
11474 .nr(8)
11475 .kr(1)
11476 .sr(1)
11477 .m(5)
11478 .n(n)
11479 .k(k)
11480 .a_stride(13)
11481 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11482 }
11483 }
11484 }
11485
11486 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
11487 TEST_REQUIRES_ARM_NEON;
11488 for (uint32_t n = 9; n < 16; n++) {
11489 for (size_t k = 1; k <= 10; k += 3) {
11490 for (uint32_t m = 1; m <= 5; m++) {
11491 GemmMicrokernelTester()
11492 .mr(5)
11493 .nr(8)
11494 .kr(1)
11495 .sr(1)
11496 .m(m)
11497 .n(n)
11498 .k(k)
11499 .iterations(1)
11500 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11501 }
11502 }
11503 }
11504 }
11505
11506 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8) {
11507 TEST_REQUIRES_ARM_NEON;
11508 for (uint32_t n = 16; n <= 24; n += 8) {
11509 for (size_t k = 1; k <= 10; k += 3) {
11510 GemmMicrokernelTester()
11511 .mr(5)
11512 .nr(8)
11513 .kr(1)
11514 .sr(1)
11515 .m(5)
11516 .n(8)
11517 .k(k)
11518 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11519 }
11520 }
11521 }
11522
11523 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
11524 TEST_REQUIRES_ARM_NEON;
11525 for (uint32_t n = 16; n <= 24; n += 8) {
11526 for (size_t k = 1; k <= 10; k += 3) {
11527 GemmMicrokernelTester()
11528 .mr(5)
11529 .nr(8)
11530 .kr(1)
11531 .sr(1)
11532 .m(5)
11533 .n(n)
11534 .k(k)
11535 .cn_stride(11)
11536 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11537 }
11538 }
11539 }
11540
11541 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
11542 TEST_REQUIRES_ARM_NEON;
11543 for (uint32_t n = 16; n <= 24; n += 8) {
11544 for (size_t k = 1; k <= 10; k += 3) {
11545 GemmMicrokernelTester()
11546 .mr(5)
11547 .nr(8)
11548 .kr(1)
11549 .sr(1)
11550 .m(5)
11551 .n(n)
11552 .k(k)
11553 .a_stride(13)
11554 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11555 }
11556 }
11557 }
11558
11559 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_subtile) {
11560 TEST_REQUIRES_ARM_NEON;
11561 for (uint32_t n = 16; n <= 24; n += 8) {
11562 for (size_t k = 1; k <= 10; k += 3) {
11563 for (uint32_t m = 1; m <= 5; m++) {
11564 GemmMicrokernelTester()
11565 .mr(5)
11566 .nr(8)
11567 .kr(1)
11568 .sr(1)
11569 .m(m)
11570 .n(n)
11571 .k(k)
11572 .iterations(1)
11573 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11574 }
11575 }
11576 }
11577 }
11578
11579 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cm_subtile) {
11580 TEST_REQUIRES_ARM_NEON;
11581 for (size_t k = 1; k <= 10; k += 3) {
11582 for (uint32_t m = 1; m <= 5; m++) {
11583 for (uint32_t n = 1; n <= 8; n++) {
11584 GemmMicrokernelTester()
11585 .mr(5)
11586 .nr(8)
11587 .kr(1)
11588 .sr(1)
11589 .m(m)
11590 .n(n)
11591 .k(k)
11592 .cm_stride(11)
11593 .iterations(1)
11594 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11595 }
11596 }
11597 }
11598 }
11599
11600 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, qmin) {
11601 TEST_REQUIRES_ARM_NEON;
11602 GemmMicrokernelTester()
11603 .mr(5)
11604 .nr(8)
11605 .kr(1)
11606 .sr(1)
11607 .m(5)
11608 .n(8)
11609 .k(2)
11610 .qmin(128)
11611 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11612 }
11613
11614 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, qmax) {
11615 TEST_REQUIRES_ARM_NEON;
11616 GemmMicrokernelTester()
11617 .mr(5)
11618 .nr(8)
11619 .kr(1)
11620 .sr(1)
11621 .m(5)
11622 .n(8)
11623 .k(2)
11624 .qmax(128)
11625 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11626 }
11627
11628 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cm) {
11629 TEST_REQUIRES_ARM_NEON;
11630 GemmMicrokernelTester()
11631 .mr(5)
11632 .nr(8)
11633 .kr(1)
11634 .sr(1)
11635 .m(5)
11636 .n(8)
11637 .k(2)
11638 .cm_stride(11)
11639 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
11640 }
11641#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11642
11643
11644#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11645 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2) {
11646 TEST_REQUIRES_ARM_NEON;
11647 GemmMicrokernelTester()
11648 .mr(6)
11649 .nr(8)
11650 .kr(1)
11651 .sr(1)
11652 .m(6)
11653 .n(8)
11654 .k(2)
11655 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11656 }
11657
11658 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cn) {
11659 TEST_REQUIRES_ARM_NEON;
11660 GemmMicrokernelTester()
11661 .mr(6)
11662 .nr(8)
11663 .kr(1)
11664 .sr(1)
11665 .m(6)
11666 .n(8)
11667 .k(2)
11668 .cn_stride(11)
11669 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11670 }
11671
11672 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_strided_a) {
11673 TEST_REQUIRES_ARM_NEON;
11674 GemmMicrokernelTester()
11675 .mr(6)
11676 .nr(8)
11677 .kr(1)
11678 .sr(1)
11679 .m(6)
11680 .n(8)
11681 .k(2)
11682 .a_stride(5)
11683 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11684 }
11685
11686 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
11687 TEST_REQUIRES_ARM_NEON;
11688 for (uint32_t m = 1; m <= 6; m++) {
11689 for (uint32_t n = 1; n <= 8; n++) {
11690 GemmMicrokernelTester()
11691 .mr(6)
11692 .nr(8)
11693 .kr(1)
11694 .sr(1)
11695 .m(m)
11696 .n(n)
11697 .k(2)
11698 .iterations(1)
11699 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11700 }
11701 }
11702 }
11703
11704 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
11705 TEST_REQUIRES_ARM_NEON;
11706 for (uint32_t m = 1; m <= 6; m++) {
11707 GemmMicrokernelTester()
11708 .mr(6)
11709 .nr(8)
11710 .kr(1)
11711 .sr(1)
11712 .m(m)
11713 .n(8)
11714 .k(2)
11715 .iterations(1)
11716 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11717 }
11718 }
11719
11720 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
11721 TEST_REQUIRES_ARM_NEON;
11722 for (uint32_t n = 1; n <= 8; n++) {
11723 GemmMicrokernelTester()
11724 .mr(6)
11725 .nr(8)
11726 .kr(1)
11727 .sr(1)
11728 .m(6)
11729 .n(n)
11730 .k(2)
11731 .iterations(1)
11732 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11733 }
11734 }
11735
11736 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2) {
11737 TEST_REQUIRES_ARM_NEON;
11738 for (size_t k = 1; k < 2; k++) {
11739 GemmMicrokernelTester()
11740 .mr(6)
11741 .nr(8)
11742 .kr(1)
11743 .sr(1)
11744 .m(6)
11745 .n(8)
11746 .k(k)
11747 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11748 }
11749 }
11750
11751 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2_strided_a) {
11752 TEST_REQUIRES_ARM_NEON;
11753 for (size_t k = 1; k < 2; k++) {
11754 GemmMicrokernelTester()
11755 .mr(6)
11756 .nr(8)
11757 .kr(1)
11758 .sr(1)
11759 .m(6)
11760 .n(8)
11761 .k(k)
11762 .a_stride(5)
11763 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11764 }
11765 }
11766
11767 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
11768 TEST_REQUIRES_ARM_NEON;
11769 for (size_t k = 1; k < 2; k++) {
11770 for (uint32_t m = 1; m <= 6; m++) {
11771 for (uint32_t n = 1; n <= 8; n++) {
11772 GemmMicrokernelTester()
11773 .mr(6)
11774 .nr(8)
11775 .kr(1)
11776 .sr(1)
11777 .m(m)
11778 .n(n)
11779 .k(k)
11780 .iterations(1)
11781 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11782 }
11783 }
11784 }
11785 }
11786
11787 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2) {
11788 TEST_REQUIRES_ARM_NEON;
11789 for (size_t k = 3; k < 4; k++) {
11790 GemmMicrokernelTester()
11791 .mr(6)
11792 .nr(8)
11793 .kr(1)
11794 .sr(1)
11795 .m(6)
11796 .n(8)
11797 .k(k)
11798 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11799 }
11800 }
11801
11802 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2_strided_a) {
11803 TEST_REQUIRES_ARM_NEON;
11804 for (size_t k = 3; k < 4; k++) {
11805 GemmMicrokernelTester()
11806 .mr(6)
11807 .nr(8)
11808 .kr(1)
11809 .sr(1)
11810 .m(6)
11811 .n(8)
11812 .k(k)
11813 .a_stride(7)
11814 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11815 }
11816 }
11817
11818 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
11819 TEST_REQUIRES_ARM_NEON;
11820 for (size_t k = 3; k < 4; k++) {
11821 for (uint32_t m = 1; m <= 6; m++) {
11822 for (uint32_t n = 1; n <= 8; n++) {
11823 GemmMicrokernelTester()
11824 .mr(6)
11825 .nr(8)
11826 .kr(1)
11827 .sr(1)
11828 .m(m)
11829 .n(n)
11830 .k(k)
11831 .iterations(1)
11832 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11833 }
11834 }
11835 }
11836 }
11837
11838 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2) {
11839 TEST_REQUIRES_ARM_NEON;
11840 for (size_t k = 4; k <= 20; k += 2) {
11841 GemmMicrokernelTester()
11842 .mr(6)
11843 .nr(8)
11844 .kr(1)
11845 .sr(1)
11846 .m(6)
11847 .n(8)
11848 .k(k)
11849 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11850 }
11851 }
11852
11853 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2_strided_a) {
11854 TEST_REQUIRES_ARM_NEON;
11855 for (size_t k = 4; k <= 20; k += 2) {
11856 GemmMicrokernelTester()
11857 .mr(6)
11858 .nr(8)
11859 .kr(1)
11860 .sr(1)
11861 .m(6)
11862 .n(8)
11863 .k(k)
11864 .a_stride(23)
11865 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11866 }
11867 }
11868
11869 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2_subtile) {
11870 TEST_REQUIRES_ARM_NEON;
11871 for (size_t k = 4; k <= 20; k += 2) {
11872 for (uint32_t m = 1; m <= 6; m++) {
11873 for (uint32_t n = 1; n <= 8; n++) {
11874 GemmMicrokernelTester()
11875 .mr(6)
11876 .nr(8)
11877 .kr(1)
11878 .sr(1)
11879 .m(m)
11880 .n(n)
11881 .k(k)
11882 .iterations(1)
11883 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11884 }
11885 }
11886 }
11887 }
11888
11889 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8) {
11890 TEST_REQUIRES_ARM_NEON;
11891 for (uint32_t n = 9; n < 16; n++) {
11892 for (size_t k = 1; k <= 10; k += 3) {
11893 GemmMicrokernelTester()
11894 .mr(6)
11895 .nr(8)
11896 .kr(1)
11897 .sr(1)
11898 .m(6)
11899 .n(8)
11900 .k(k)
11901 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11902 }
11903 }
11904 }
11905
11906 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
11907 TEST_REQUIRES_ARM_NEON;
11908 for (uint32_t n = 9; n < 16; n++) {
11909 for (size_t k = 1; k <= 10; k += 3) {
11910 GemmMicrokernelTester()
11911 .mr(6)
11912 .nr(8)
11913 .kr(1)
11914 .sr(1)
11915 .m(6)
11916 .n(8)
11917 .k(k)
11918 .cn_stride(11)
11919 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11920 }
11921 }
11922 }
11923
11924 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_strided_a) {
11925 TEST_REQUIRES_ARM_NEON;
11926 for (uint32_t n = 9; n < 16; n++) {
11927 for (size_t k = 1; k <= 10; k += 3) {
11928 GemmMicrokernelTester()
11929 .mr(6)
11930 .nr(8)
11931 .kr(1)
11932 .sr(1)
11933 .m(6)
11934 .n(n)
11935 .k(k)
11936 .a_stride(13)
11937 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11938 }
11939 }
11940 }
11941
11942 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
11943 TEST_REQUIRES_ARM_NEON;
11944 for (uint32_t n = 9; n < 16; n++) {
11945 for (size_t k = 1; k <= 10; k += 3) {
11946 for (uint32_t m = 1; m <= 6; m++) {
11947 GemmMicrokernelTester()
11948 .mr(6)
11949 .nr(8)
11950 .kr(1)
11951 .sr(1)
11952 .m(m)
11953 .n(n)
11954 .k(k)
11955 .iterations(1)
11956 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11957 }
11958 }
11959 }
11960 }
11961
11962 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8) {
11963 TEST_REQUIRES_ARM_NEON;
11964 for (uint32_t n = 16; n <= 24; n += 8) {
11965 for (size_t k = 1; k <= 10; k += 3) {
11966 GemmMicrokernelTester()
11967 .mr(6)
11968 .nr(8)
11969 .kr(1)
11970 .sr(1)
11971 .m(6)
11972 .n(8)
11973 .k(k)
11974 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11975 }
11976 }
11977 }
11978
11979 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
11980 TEST_REQUIRES_ARM_NEON;
11981 for (uint32_t n = 16; n <= 24; n += 8) {
11982 for (size_t k = 1; k <= 10; k += 3) {
11983 GemmMicrokernelTester()
11984 .mr(6)
11985 .nr(8)
11986 .kr(1)
11987 .sr(1)
11988 .m(6)
11989 .n(n)
11990 .k(k)
11991 .cn_stride(11)
11992 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
11993 }
11994 }
11995 }
11996
11997 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_strided_a) {
11998 TEST_REQUIRES_ARM_NEON;
11999 for (uint32_t n = 16; n <= 24; n += 8) {
12000 for (size_t k = 1; k <= 10; k += 3) {
12001 GemmMicrokernelTester()
12002 .mr(6)
12003 .nr(8)
12004 .kr(1)
12005 .sr(1)
12006 .m(6)
12007 .n(n)
12008 .k(k)
12009 .a_stride(13)
12010 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
12011 }
12012 }
12013 }
12014
12015 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_subtile) {
12016 TEST_REQUIRES_ARM_NEON;
12017 for (uint32_t n = 16; n <= 24; n += 8) {
12018 for (size_t k = 1; k <= 10; k += 3) {
12019 for (uint32_t m = 1; m <= 6; m++) {
12020 GemmMicrokernelTester()
12021 .mr(6)
12022 .nr(8)
12023 .kr(1)
12024 .sr(1)
12025 .m(m)
12026 .n(n)
12027 .k(k)
12028 .iterations(1)
12029 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
12030 }
12031 }
12032 }
12033 }
12034
12035 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cm_subtile) {
12036 TEST_REQUIRES_ARM_NEON;
12037 for (size_t k = 1; k <= 10; k += 3) {
12038 for (uint32_t m = 1; m <= 6; m++) {
12039 for (uint32_t n = 1; n <= 8; n++) {
12040 GemmMicrokernelTester()
12041 .mr(6)
12042 .nr(8)
12043 .kr(1)
12044 .sr(1)
12045 .m(m)
12046 .n(n)
12047 .k(k)
12048 .cm_stride(11)
12049 .iterations(1)
12050 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
12051 }
12052 }
12053 }
12054 }
12055
12056 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, qmin) {
12057 TEST_REQUIRES_ARM_NEON;
12058 GemmMicrokernelTester()
12059 .mr(6)
12060 .nr(8)
12061 .kr(1)
12062 .sr(1)
12063 .m(6)
12064 .n(8)
12065 .k(2)
12066 .qmin(128)
12067 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
12068 }
12069
12070 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, qmax) {
12071 TEST_REQUIRES_ARM_NEON;
12072 GemmMicrokernelTester()
12073 .mr(6)
12074 .nr(8)
12075 .kr(1)
12076 .sr(1)
12077 .m(6)
12078 .n(8)
12079 .k(2)
12080 .qmax(128)
12081 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
12082 }
12083
12084 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cm) {
12085 TEST_REQUIRES_ARM_NEON;
12086 GemmMicrokernelTester()
12087 .mr(6)
12088 .nr(8)
12089 .kr(1)
12090 .sr(1)
12091 .m(6)
12092 .n(8)
12093 .k(2)
12094 .cm_stride(11)
12095 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
12096 }
12097#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12098
12099
12100#if XNN_ARCH_ARM || XNN_ARCH_ARM64
12101 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4) {
12102 TEST_REQUIRES_ARM_NEON;
12103 GemmMicrokernelTester()
12104 .mr(6)
12105 .nr(8)
12106 .kr(1)
12107 .sr(1)
12108 .m(6)
12109 .n(8)
12110 .k(4)
12111 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12112 }
12113
12114 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, strided_cn) {
12115 TEST_REQUIRES_ARM_NEON;
12116 GemmMicrokernelTester()
12117 .mr(6)
12118 .nr(8)
12119 .kr(1)
12120 .sr(1)
12121 .m(6)
12122 .n(8)
12123 .k(4)
12124 .cn_stride(11)
12125 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12126 }
12127
12128 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_strided_a) {
12129 TEST_REQUIRES_ARM_NEON;
12130 GemmMicrokernelTester()
12131 .mr(6)
12132 .nr(8)
12133 .kr(1)
12134 .sr(1)
12135 .m(6)
12136 .n(8)
12137 .k(4)
12138 .a_stride(7)
12139 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12140 }
12141
12142 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
12143 TEST_REQUIRES_ARM_NEON;
12144 for (uint32_t m = 1; m <= 6; m++) {
12145 for (uint32_t n = 1; n <= 8; n++) {
12146 GemmMicrokernelTester()
12147 .mr(6)
12148 .nr(8)
12149 .kr(1)
12150 .sr(1)
12151 .m(m)
12152 .n(n)
12153 .k(4)
12154 .iterations(1)
12155 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12156 }
12157 }
12158 }
12159
12160 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
12161 TEST_REQUIRES_ARM_NEON;
12162 for (uint32_t m = 1; m <= 6; m++) {
12163 GemmMicrokernelTester()
12164 .mr(6)
12165 .nr(8)
12166 .kr(1)
12167 .sr(1)
12168 .m(m)
12169 .n(8)
12170 .k(4)
12171 .iterations(1)
12172 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12173 }
12174 }
12175
12176 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
12177 TEST_REQUIRES_ARM_NEON;
12178 for (uint32_t n = 1; n <= 8; n++) {
12179 GemmMicrokernelTester()
12180 .mr(6)
12181 .nr(8)
12182 .kr(1)
12183 .sr(1)
12184 .m(6)
12185 .n(n)
12186 .k(4)
12187 .iterations(1)
12188 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12189 }
12190 }
12191
12192 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_lt_4) {
12193 TEST_REQUIRES_ARM_NEON;
12194 for (size_t k = 1; k < 4; k++) {
12195 GemmMicrokernelTester()
12196 .mr(6)
12197 .nr(8)
12198 .kr(1)
12199 .sr(1)
12200 .m(6)
12201 .n(8)
12202 .k(k)
12203 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12204 }
12205 }
12206
12207 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_lt_4_strided_a) {
12208 TEST_REQUIRES_ARM_NEON;
12209 for (size_t k = 1; k < 4; k++) {
12210 GemmMicrokernelTester()
12211 .mr(6)
12212 .nr(8)
12213 .kr(1)
12214 .sr(1)
12215 .m(6)
12216 .n(8)
12217 .k(k)
12218 .a_stride(7)
12219 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12220 }
12221 }
12222
12223 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
12224 TEST_REQUIRES_ARM_NEON;
12225 for (size_t k = 1; k < 4; k++) {
12226 for (uint32_t m = 1; m <= 6; m++) {
12227 for (uint32_t n = 1; n <= 8; n++) {
12228 GemmMicrokernelTester()
12229 .mr(6)
12230 .nr(8)
12231 .kr(1)
12232 .sr(1)
12233 .m(m)
12234 .n(n)
12235 .k(k)
12236 .iterations(1)
12237 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12238 }
12239 }
12240 }
12241 }
12242
12243 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_gt_4) {
12244 TEST_REQUIRES_ARM_NEON;
12245 for (size_t k = 5; k < 8; k++) {
12246 GemmMicrokernelTester()
12247 .mr(6)
12248 .nr(8)
12249 .kr(1)
12250 .sr(1)
12251 .m(6)
12252 .n(8)
12253 .k(k)
12254 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12255 }
12256 }
12257
12258 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_gt_4_strided_a) {
12259 TEST_REQUIRES_ARM_NEON;
12260 for (size_t k = 5; k < 8; k++) {
12261 GemmMicrokernelTester()
12262 .mr(6)
12263 .nr(8)
12264 .kr(1)
12265 .sr(1)
12266 .m(6)
12267 .n(8)
12268 .k(k)
12269 .a_stride(11)
12270 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12271 }
12272 }
12273
12274 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
12275 TEST_REQUIRES_ARM_NEON;
12276 for (size_t k = 5; k < 8; k++) {
12277 for (uint32_t m = 1; m <= 6; m++) {
12278 for (uint32_t n = 1; n <= 8; n++) {
12279 GemmMicrokernelTester()
12280 .mr(6)
12281 .nr(8)
12282 .kr(1)
12283 .sr(1)
12284 .m(m)
12285 .n(n)
12286 .k(k)
12287 .iterations(1)
12288 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12289 }
12290 }
12291 }
12292 }
12293
12294 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_div_4) {
12295 TEST_REQUIRES_ARM_NEON;
12296 for (size_t k = 8; k <= 40; k += 4) {
12297 GemmMicrokernelTester()
12298 .mr(6)
12299 .nr(8)
12300 .kr(1)
12301 .sr(1)
12302 .m(6)
12303 .n(8)
12304 .k(k)
12305 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12306 }
12307 }
12308
12309 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_div_4_strided_a) {
12310 TEST_REQUIRES_ARM_NEON;
12311 for (size_t k = 8; k <= 40; k += 4) {
12312 GemmMicrokernelTester()
12313 .mr(6)
12314 .nr(8)
12315 .kr(1)
12316 .sr(1)
12317 .m(6)
12318 .n(8)
12319 .k(k)
12320 .a_stride(43)
12321 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12322 }
12323 }
12324
12325 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_div_4_subtile) {
12326 TEST_REQUIRES_ARM_NEON;
12327 for (size_t k = 8; k <= 40; k += 4) {
12328 for (uint32_t m = 1; m <= 6; m++) {
12329 for (uint32_t n = 1; n <= 8; n++) {
12330 GemmMicrokernelTester()
12331 .mr(6)
12332 .nr(8)
12333 .kr(1)
12334 .sr(1)
12335 .m(m)
12336 .n(n)
12337 .k(k)
12338 .iterations(1)
12339 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12340 }
12341 }
12342 }
12343 }
12344
12345 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8) {
12346 TEST_REQUIRES_ARM_NEON;
12347 for (uint32_t n = 9; n < 16; n++) {
12348 for (size_t k = 1; k <= 20; k += 5) {
12349 GemmMicrokernelTester()
12350 .mr(6)
12351 .nr(8)
12352 .kr(1)
12353 .sr(1)
12354 .m(6)
12355 .n(8)
12356 .k(k)
12357 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12358 }
12359 }
12360 }
12361
12362 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
12363 TEST_REQUIRES_ARM_NEON;
12364 for (uint32_t n = 9; n < 16; n++) {
12365 for (size_t k = 1; k <= 20; k += 5) {
12366 GemmMicrokernelTester()
12367 .mr(6)
12368 .nr(8)
12369 .kr(1)
12370 .sr(1)
12371 .m(6)
12372 .n(8)
12373 .k(k)
12374 .cn_stride(11)
12375 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12376 }
12377 }
12378 }
12379
12380 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8_strided_a) {
12381 TEST_REQUIRES_ARM_NEON;
12382 for (uint32_t n = 9; n < 16; n++) {
12383 for (size_t k = 1; k <= 20; k += 5) {
12384 GemmMicrokernelTester()
12385 .mr(6)
12386 .nr(8)
12387 .kr(1)
12388 .sr(1)
12389 .m(6)
12390 .n(n)
12391 .k(k)
12392 .a_stride(23)
12393 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12394 }
12395 }
12396 }
12397
12398 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
12399 TEST_REQUIRES_ARM_NEON;
12400 for (uint32_t n = 9; n < 16; n++) {
12401 for (size_t k = 1; k <= 20; k += 5) {
12402 for (uint32_t m = 1; m <= 6; m++) {
12403 GemmMicrokernelTester()
12404 .mr(6)
12405 .nr(8)
12406 .kr(1)
12407 .sr(1)
12408 .m(m)
12409 .n(n)
12410 .k(k)
12411 .iterations(1)
12412 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12413 }
12414 }
12415 }
12416 }
12417
12418 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8) {
12419 TEST_REQUIRES_ARM_NEON;
12420 for (uint32_t n = 16; n <= 24; n += 8) {
12421 for (size_t k = 1; k <= 20; k += 5) {
12422 GemmMicrokernelTester()
12423 .mr(6)
12424 .nr(8)
12425 .kr(1)
12426 .sr(1)
12427 .m(6)
12428 .n(8)
12429 .k(k)
12430 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12431 }
12432 }
12433 }
12434
12435 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
12436 TEST_REQUIRES_ARM_NEON;
12437 for (uint32_t n = 16; n <= 24; n += 8) {
12438 for (size_t k = 1; k <= 20; k += 5) {
12439 GemmMicrokernelTester()
12440 .mr(6)
12441 .nr(8)
12442 .kr(1)
12443 .sr(1)
12444 .m(6)
12445 .n(n)
12446 .k(k)
12447 .cn_stride(11)
12448 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12449 }
12450 }
12451 }
12452
12453 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8_strided_a) {
12454 TEST_REQUIRES_ARM_NEON;
12455 for (uint32_t n = 16; n <= 24; n += 8) {
12456 for (size_t k = 1; k <= 20; k += 5) {
12457 GemmMicrokernelTester()
12458 .mr(6)
12459 .nr(8)
12460 .kr(1)
12461 .sr(1)
12462 .m(6)
12463 .n(n)
12464 .k(k)
12465 .a_stride(23)
12466 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12467 }
12468 }
12469 }
12470
12471 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8_subtile) {
12472 TEST_REQUIRES_ARM_NEON;
12473 for (uint32_t n = 16; n <= 24; n += 8) {
12474 for (size_t k = 1; k <= 20; k += 5) {
12475 for (uint32_t m = 1; m <= 6; m++) {
12476 GemmMicrokernelTester()
12477 .mr(6)
12478 .nr(8)
12479 .kr(1)
12480 .sr(1)
12481 .m(m)
12482 .n(n)
12483 .k(k)
12484 .iterations(1)
12485 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12486 }
12487 }
12488 }
12489 }
12490
12491 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, strided_cm_subtile) {
12492 TEST_REQUIRES_ARM_NEON;
12493 for (size_t k = 1; k <= 20; k += 5) {
12494 for (uint32_t m = 1; m <= 6; m++) {
12495 for (uint32_t n = 1; n <= 8; n++) {
12496 GemmMicrokernelTester()
12497 .mr(6)
12498 .nr(8)
12499 .kr(1)
12500 .sr(1)
12501 .m(m)
12502 .n(n)
12503 .k(k)
12504 .cm_stride(11)
12505 .iterations(1)
12506 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12507 }
12508 }
12509 }
12510 }
12511
12512 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, qmin) {
12513 TEST_REQUIRES_ARM_NEON;
12514 GemmMicrokernelTester()
12515 .mr(6)
12516 .nr(8)
12517 .kr(1)
12518 .sr(1)
12519 .m(6)
12520 .n(8)
12521 .k(4)
12522 .qmin(128)
12523 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12524 }
12525
12526 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, qmax) {
12527 TEST_REQUIRES_ARM_NEON;
12528 GemmMicrokernelTester()
12529 .mr(6)
12530 .nr(8)
12531 .kr(1)
12532 .sr(1)
12533 .m(6)
12534 .n(8)
12535 .k(4)
12536 .qmax(128)
12537 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12538 }
12539
12540 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, strided_cm) {
12541 TEST_REQUIRES_ARM_NEON;
12542 GemmMicrokernelTester()
12543 .mr(6)
12544 .nr(8)
12545 .kr(1)
12546 .sr(1)
12547 .m(6)
12548 .n(8)
12549 .k(4)
12550 .cm_stride(11)
12551 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
12552 }
12553#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12554
12555
12556#if XNN_ARCH_ARM64
12557 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2) {
12558 TEST_REQUIRES_ARM_NEON_FMA;
12559 GemmMicrokernelTester()
12560 .mr(1)
12561 .nr(8)
12562 .kr(1)
12563 .sr(1)
12564 .m(1)
12565 .n(8)
12566 .k(2)
12567 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12568 }
12569
12570 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cn) {
12571 TEST_REQUIRES_ARM_NEON_FMA;
12572 GemmMicrokernelTester()
12573 .mr(1)
12574 .nr(8)
12575 .kr(1)
12576 .sr(1)
12577 .m(1)
12578 .n(8)
12579 .k(2)
12580 .cn_stride(11)
12581 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12582 }
12583
12584 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
12585 TEST_REQUIRES_ARM_NEON_FMA;
12586 GemmMicrokernelTester()
12587 .mr(1)
12588 .nr(8)
12589 .kr(1)
12590 .sr(1)
12591 .m(1)
12592 .n(8)
12593 .k(2)
12594 .a_stride(5)
12595 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12596 }
12597
12598 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
12599 TEST_REQUIRES_ARM_NEON_FMA;
12600 for (uint32_t m = 1; m <= 1; m++) {
12601 for (uint32_t n = 1; n <= 8; n++) {
12602 GemmMicrokernelTester()
12603 .mr(1)
12604 .nr(8)
12605 .kr(1)
12606 .sr(1)
12607 .m(m)
12608 .n(n)
12609 .k(2)
12610 .iterations(1)
12611 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12612 }
12613 }
12614 }
12615
12616 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
12617 TEST_REQUIRES_ARM_NEON_FMA;
12618 for (uint32_t m = 1; m <= 1; m++) {
12619 GemmMicrokernelTester()
12620 .mr(1)
12621 .nr(8)
12622 .kr(1)
12623 .sr(1)
12624 .m(m)
12625 .n(8)
12626 .k(2)
12627 .iterations(1)
12628 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12629 }
12630 }
12631
12632 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
12633 TEST_REQUIRES_ARM_NEON_FMA;
12634 for (uint32_t n = 1; n <= 8; n++) {
12635 GemmMicrokernelTester()
12636 .mr(1)
12637 .nr(8)
12638 .kr(1)
12639 .sr(1)
12640 .m(1)
12641 .n(n)
12642 .k(2)
12643 .iterations(1)
12644 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12645 }
12646 }
12647
12648 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2) {
12649 TEST_REQUIRES_ARM_NEON_FMA;
12650 for (size_t k = 1; k < 2; k++) {
12651 GemmMicrokernelTester()
12652 .mr(1)
12653 .nr(8)
12654 .kr(1)
12655 .sr(1)
12656 .m(1)
12657 .n(8)
12658 .k(k)
12659 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12660 }
12661 }
12662
12663 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
12664 TEST_REQUIRES_ARM_NEON_FMA;
12665 for (size_t k = 1; k < 2; k++) {
12666 GemmMicrokernelTester()
12667 .mr(1)
12668 .nr(8)
12669 .kr(1)
12670 .sr(1)
12671 .m(1)
12672 .n(8)
12673 .k(k)
12674 .a_stride(5)
12675 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12676 }
12677 }
12678
12679 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
12680 TEST_REQUIRES_ARM_NEON_FMA;
12681 for (size_t k = 1; k < 2; k++) {
12682 for (uint32_t m = 1; m <= 1; m++) {
12683 for (uint32_t n = 1; n <= 8; n++) {
12684 GemmMicrokernelTester()
12685 .mr(1)
12686 .nr(8)
12687 .kr(1)
12688 .sr(1)
12689 .m(m)
12690 .n(n)
12691 .k(k)
12692 .iterations(1)
12693 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12694 }
12695 }
12696 }
12697 }
12698
12699 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2) {
12700 TEST_REQUIRES_ARM_NEON_FMA;
12701 for (size_t k = 3; k < 4; k++) {
12702 GemmMicrokernelTester()
12703 .mr(1)
12704 .nr(8)
12705 .kr(1)
12706 .sr(1)
12707 .m(1)
12708 .n(8)
12709 .k(k)
12710 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12711 }
12712 }
12713
12714 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
12715 TEST_REQUIRES_ARM_NEON_FMA;
12716 for (size_t k = 3; k < 4; k++) {
12717 GemmMicrokernelTester()
12718 .mr(1)
12719 .nr(8)
12720 .kr(1)
12721 .sr(1)
12722 .m(1)
12723 .n(8)
12724 .k(k)
12725 .a_stride(7)
12726 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12727 }
12728 }
12729
12730 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
12731 TEST_REQUIRES_ARM_NEON_FMA;
12732 for (size_t k = 3; k < 4; k++) {
12733 for (uint32_t m = 1; m <= 1; m++) {
12734 for (uint32_t n = 1; n <= 8; n++) {
12735 GemmMicrokernelTester()
12736 .mr(1)
12737 .nr(8)
12738 .kr(1)
12739 .sr(1)
12740 .m(m)
12741 .n(n)
12742 .k(k)
12743 .iterations(1)
12744 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12745 }
12746 }
12747 }
12748 }
12749
12750 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2) {
12751 TEST_REQUIRES_ARM_NEON_FMA;
12752 for (size_t k = 4; k <= 20; k += 2) {
12753 GemmMicrokernelTester()
12754 .mr(1)
12755 .nr(8)
12756 .kr(1)
12757 .sr(1)
12758 .m(1)
12759 .n(8)
12760 .k(k)
12761 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12762 }
12763 }
12764
12765 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
12766 TEST_REQUIRES_ARM_NEON_FMA;
12767 for (size_t k = 4; k <= 20; k += 2) {
12768 GemmMicrokernelTester()
12769 .mr(1)
12770 .nr(8)
12771 .kr(1)
12772 .sr(1)
12773 .m(1)
12774 .n(8)
12775 .k(k)
12776 .a_stride(23)
12777 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12778 }
12779 }
12780
12781 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
12782 TEST_REQUIRES_ARM_NEON_FMA;
12783 for (size_t k = 4; k <= 20; k += 2) {
12784 for (uint32_t m = 1; m <= 1; m++) {
12785 for (uint32_t n = 1; n <= 8; n++) {
12786 GemmMicrokernelTester()
12787 .mr(1)
12788 .nr(8)
12789 .kr(1)
12790 .sr(1)
12791 .m(m)
12792 .n(n)
12793 .k(k)
12794 .iterations(1)
12795 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12796 }
12797 }
12798 }
12799 }
12800
12801 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8) {
12802 TEST_REQUIRES_ARM_NEON_FMA;
12803 for (uint32_t n = 9; n < 16; n++) {
12804 for (size_t k = 1; k <= 10; k += 3) {
12805 GemmMicrokernelTester()
12806 .mr(1)
12807 .nr(8)
12808 .kr(1)
12809 .sr(1)
12810 .m(1)
12811 .n(8)
12812 .k(k)
12813 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12814 }
12815 }
12816 }
12817
12818 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
12819 TEST_REQUIRES_ARM_NEON_FMA;
12820 for (uint32_t n = 9; n < 16; n++) {
12821 for (size_t k = 1; k <= 10; k += 3) {
12822 GemmMicrokernelTester()
12823 .mr(1)
12824 .nr(8)
12825 .kr(1)
12826 .sr(1)
12827 .m(1)
12828 .n(8)
12829 .k(k)
12830 .cn_stride(11)
12831 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12832 }
12833 }
12834 }
12835
12836 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
12837 TEST_REQUIRES_ARM_NEON_FMA;
12838 for (uint32_t n = 9; n < 16; n++) {
12839 for (size_t k = 1; k <= 10; k += 3) {
12840 GemmMicrokernelTester()
12841 .mr(1)
12842 .nr(8)
12843 .kr(1)
12844 .sr(1)
12845 .m(1)
12846 .n(n)
12847 .k(k)
12848 .a_stride(13)
12849 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12850 }
12851 }
12852 }
12853
12854 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
12855 TEST_REQUIRES_ARM_NEON_FMA;
12856 for (uint32_t n = 9; n < 16; n++) {
12857 for (size_t k = 1; k <= 10; k += 3) {
12858 for (uint32_t m = 1; m <= 1; m++) {
12859 GemmMicrokernelTester()
12860 .mr(1)
12861 .nr(8)
12862 .kr(1)
12863 .sr(1)
12864 .m(m)
12865 .n(n)
12866 .k(k)
12867 .iterations(1)
12868 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12869 }
12870 }
12871 }
12872 }
12873
12874 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8) {
12875 TEST_REQUIRES_ARM_NEON_FMA;
12876 for (uint32_t n = 16; n <= 24; n += 8) {
12877 for (size_t k = 1; k <= 10; k += 3) {
12878 GemmMicrokernelTester()
12879 .mr(1)
12880 .nr(8)
12881 .kr(1)
12882 .sr(1)
12883 .m(1)
12884 .n(8)
12885 .k(k)
12886 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12887 }
12888 }
12889 }
12890
12891 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
12892 TEST_REQUIRES_ARM_NEON_FMA;
12893 for (uint32_t n = 16; n <= 24; n += 8) {
12894 for (size_t k = 1; k <= 10; k += 3) {
12895 GemmMicrokernelTester()
12896 .mr(1)
12897 .nr(8)
12898 .kr(1)
12899 .sr(1)
12900 .m(1)
12901 .n(n)
12902 .k(k)
12903 .cn_stride(11)
12904 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12905 }
12906 }
12907 }
12908
12909 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
12910 TEST_REQUIRES_ARM_NEON_FMA;
12911 for (uint32_t n = 16; n <= 24; n += 8) {
12912 for (size_t k = 1; k <= 10; k += 3) {
12913 GemmMicrokernelTester()
12914 .mr(1)
12915 .nr(8)
12916 .kr(1)
12917 .sr(1)
12918 .m(1)
12919 .n(n)
12920 .k(k)
12921 .a_stride(13)
12922 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12923 }
12924 }
12925 }
12926
12927 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
12928 TEST_REQUIRES_ARM_NEON_FMA;
12929 for (uint32_t n = 16; n <= 24; n += 8) {
12930 for (size_t k = 1; k <= 10; k += 3) {
12931 for (uint32_t m = 1; m <= 1; m++) {
12932 GemmMicrokernelTester()
12933 .mr(1)
12934 .nr(8)
12935 .kr(1)
12936 .sr(1)
12937 .m(m)
12938 .n(n)
12939 .k(k)
12940 .iterations(1)
12941 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12942 }
12943 }
12944 }
12945 }
12946
12947 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
12948 TEST_REQUIRES_ARM_NEON_FMA;
12949 for (size_t k = 1; k <= 10; k += 3) {
12950 for (uint32_t m = 1; m <= 1; m++) {
12951 for (uint32_t n = 1; n <= 8; n++) {
12952 GemmMicrokernelTester()
12953 .mr(1)
12954 .nr(8)
12955 .kr(1)
12956 .sr(1)
12957 .m(m)
12958 .n(n)
12959 .k(k)
12960 .cm_stride(11)
12961 .iterations(1)
12962 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12963 }
12964 }
12965 }
12966 }
12967
12968 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, qmin) {
12969 TEST_REQUIRES_ARM_NEON_FMA;
12970 GemmMicrokernelTester()
12971 .mr(1)
12972 .nr(8)
12973 .kr(1)
12974 .sr(1)
12975 .m(1)
12976 .n(8)
12977 .k(2)
12978 .qmin(128)
12979 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12980 }
12981
12982 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, qmax) {
12983 TEST_REQUIRES_ARM_NEON_FMA;
12984 GemmMicrokernelTester()
12985 .mr(1)
12986 .nr(8)
12987 .kr(1)
12988 .sr(1)
12989 .m(1)
12990 .n(8)
12991 .k(2)
12992 .qmax(128)
12993 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
12994 }
12995
12996 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cm) {
12997 TEST_REQUIRES_ARM_NEON_FMA;
12998 GemmMicrokernelTester()
12999 .mr(1)
13000 .nr(8)
13001 .kr(1)
13002 .sr(1)
13003 .m(1)
13004 .n(8)
13005 .k(2)
13006 .cm_stride(11)
13007 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
13008 }
13009#endif // XNN_ARCH_ARM64
13010
13011
13012#if XNN_ARCH_ARM64
13013 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2) {
13014 TEST_REQUIRES_ARM_NEON_FMA;
13015 GemmMicrokernelTester()
13016 .mr(4)
13017 .nr(8)
13018 .kr(1)
13019 .sr(1)
13020 .m(4)
13021 .n(8)
13022 .k(2)
13023 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13024 }
13025
13026 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cn) {
13027 TEST_REQUIRES_ARM_NEON_FMA;
13028 GemmMicrokernelTester()
13029 .mr(4)
13030 .nr(8)
13031 .kr(1)
13032 .sr(1)
13033 .m(4)
13034 .n(8)
13035 .k(2)
13036 .cn_stride(11)
13037 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13038 }
13039
13040 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
13041 TEST_REQUIRES_ARM_NEON_FMA;
13042 GemmMicrokernelTester()
13043 .mr(4)
13044 .nr(8)
13045 .kr(1)
13046 .sr(1)
13047 .m(4)
13048 .n(8)
13049 .k(2)
13050 .a_stride(5)
13051 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13052 }
13053
13054 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
13055 TEST_REQUIRES_ARM_NEON_FMA;
13056 for (uint32_t m = 1; m <= 4; m++) {
13057 for (uint32_t n = 1; n <= 8; n++) {
13058 GemmMicrokernelTester()
13059 .mr(4)
13060 .nr(8)
13061 .kr(1)
13062 .sr(1)
13063 .m(m)
13064 .n(n)
13065 .k(2)
13066 .iterations(1)
13067 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13068 }
13069 }
13070 }
13071
13072 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
13073 TEST_REQUIRES_ARM_NEON_FMA;
13074 for (uint32_t m = 1; m <= 4; m++) {
13075 GemmMicrokernelTester()
13076 .mr(4)
13077 .nr(8)
13078 .kr(1)
13079 .sr(1)
13080 .m(m)
13081 .n(8)
13082 .k(2)
13083 .iterations(1)
13084 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13085 }
13086 }
13087
13088 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
13089 TEST_REQUIRES_ARM_NEON_FMA;
13090 for (uint32_t n = 1; n <= 8; n++) {
13091 GemmMicrokernelTester()
13092 .mr(4)
13093 .nr(8)
13094 .kr(1)
13095 .sr(1)
13096 .m(4)
13097 .n(n)
13098 .k(2)
13099 .iterations(1)
13100 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13101 }
13102 }
13103
13104 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2) {
13105 TEST_REQUIRES_ARM_NEON_FMA;
13106 for (size_t k = 1; k < 2; k++) {
13107 GemmMicrokernelTester()
13108 .mr(4)
13109 .nr(8)
13110 .kr(1)
13111 .sr(1)
13112 .m(4)
13113 .n(8)
13114 .k(k)
13115 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13116 }
13117 }
13118
13119 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
13120 TEST_REQUIRES_ARM_NEON_FMA;
13121 for (size_t k = 1; k < 2; k++) {
13122 GemmMicrokernelTester()
13123 .mr(4)
13124 .nr(8)
13125 .kr(1)
13126 .sr(1)
13127 .m(4)
13128 .n(8)
13129 .k(k)
13130 .a_stride(5)
13131 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13132 }
13133 }
13134
13135 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
13136 TEST_REQUIRES_ARM_NEON_FMA;
13137 for (size_t k = 1; k < 2; k++) {
13138 for (uint32_t m = 1; m <= 4; m++) {
13139 for (uint32_t n = 1; n <= 8; n++) {
13140 GemmMicrokernelTester()
13141 .mr(4)
13142 .nr(8)
13143 .kr(1)
13144 .sr(1)
13145 .m(m)
13146 .n(n)
13147 .k(k)
13148 .iterations(1)
13149 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13150 }
13151 }
13152 }
13153 }
13154
13155 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2) {
13156 TEST_REQUIRES_ARM_NEON_FMA;
13157 for (size_t k = 3; k < 4; k++) {
13158 GemmMicrokernelTester()
13159 .mr(4)
13160 .nr(8)
13161 .kr(1)
13162 .sr(1)
13163 .m(4)
13164 .n(8)
13165 .k(k)
13166 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13167 }
13168 }
13169
13170 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
13171 TEST_REQUIRES_ARM_NEON_FMA;
13172 for (size_t k = 3; k < 4; k++) {
13173 GemmMicrokernelTester()
13174 .mr(4)
13175 .nr(8)
13176 .kr(1)
13177 .sr(1)
13178 .m(4)
13179 .n(8)
13180 .k(k)
13181 .a_stride(7)
13182 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13183 }
13184 }
13185
13186 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
13187 TEST_REQUIRES_ARM_NEON_FMA;
13188 for (size_t k = 3; k < 4; k++) {
13189 for (uint32_t m = 1; m <= 4; m++) {
13190 for (uint32_t n = 1; n <= 8; n++) {
13191 GemmMicrokernelTester()
13192 .mr(4)
13193 .nr(8)
13194 .kr(1)
13195 .sr(1)
13196 .m(m)
13197 .n(n)
13198 .k(k)
13199 .iterations(1)
13200 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13201 }
13202 }
13203 }
13204 }
13205
13206 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2) {
13207 TEST_REQUIRES_ARM_NEON_FMA;
13208 for (size_t k = 4; k <= 20; k += 2) {
13209 GemmMicrokernelTester()
13210 .mr(4)
13211 .nr(8)
13212 .kr(1)
13213 .sr(1)
13214 .m(4)
13215 .n(8)
13216 .k(k)
13217 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13218 }
13219 }
13220
13221 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
13222 TEST_REQUIRES_ARM_NEON_FMA;
13223 for (size_t k = 4; k <= 20; k += 2) {
13224 GemmMicrokernelTester()
13225 .mr(4)
13226 .nr(8)
13227 .kr(1)
13228 .sr(1)
13229 .m(4)
13230 .n(8)
13231 .k(k)
13232 .a_stride(23)
13233 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13234 }
13235 }
13236
13237 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
13238 TEST_REQUIRES_ARM_NEON_FMA;
13239 for (size_t k = 4; k <= 20; k += 2) {
13240 for (uint32_t m = 1; m <= 4; m++) {
13241 for (uint32_t n = 1; n <= 8; n++) {
13242 GemmMicrokernelTester()
13243 .mr(4)
13244 .nr(8)
13245 .kr(1)
13246 .sr(1)
13247 .m(m)
13248 .n(n)
13249 .k(k)
13250 .iterations(1)
13251 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13252 }
13253 }
13254 }
13255 }
13256
13257 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8) {
13258 TEST_REQUIRES_ARM_NEON_FMA;
13259 for (uint32_t n = 9; n < 16; n++) {
13260 for (size_t k = 1; k <= 10; k += 3) {
13261 GemmMicrokernelTester()
13262 .mr(4)
13263 .nr(8)
13264 .kr(1)
13265 .sr(1)
13266 .m(4)
13267 .n(8)
13268 .k(k)
13269 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13270 }
13271 }
13272 }
13273
13274 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
13275 TEST_REQUIRES_ARM_NEON_FMA;
13276 for (uint32_t n = 9; n < 16; n++) {
13277 for (size_t k = 1; k <= 10; k += 3) {
13278 GemmMicrokernelTester()
13279 .mr(4)
13280 .nr(8)
13281 .kr(1)
13282 .sr(1)
13283 .m(4)
13284 .n(8)
13285 .k(k)
13286 .cn_stride(11)
13287 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13288 }
13289 }
13290 }
13291
13292 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
13293 TEST_REQUIRES_ARM_NEON_FMA;
13294 for (uint32_t n = 9; n < 16; n++) {
13295 for (size_t k = 1; k <= 10; k += 3) {
13296 GemmMicrokernelTester()
13297 .mr(4)
13298 .nr(8)
13299 .kr(1)
13300 .sr(1)
13301 .m(4)
13302 .n(n)
13303 .k(k)
13304 .a_stride(13)
13305 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13306 }
13307 }
13308 }
13309
13310 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
13311 TEST_REQUIRES_ARM_NEON_FMA;
13312 for (uint32_t n = 9; n < 16; n++) {
13313 for (size_t k = 1; k <= 10; k += 3) {
13314 for (uint32_t m = 1; m <= 4; m++) {
13315 GemmMicrokernelTester()
13316 .mr(4)
13317 .nr(8)
13318 .kr(1)
13319 .sr(1)
13320 .m(m)
13321 .n(n)
13322 .k(k)
13323 .iterations(1)
13324 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13325 }
13326 }
13327 }
13328 }
13329
13330 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8) {
13331 TEST_REQUIRES_ARM_NEON_FMA;
13332 for (uint32_t n = 16; n <= 24; n += 8) {
13333 for (size_t k = 1; k <= 10; k += 3) {
13334 GemmMicrokernelTester()
13335 .mr(4)
13336 .nr(8)
13337 .kr(1)
13338 .sr(1)
13339 .m(4)
13340 .n(8)
13341 .k(k)
13342 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13343 }
13344 }
13345 }
13346
13347 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
13348 TEST_REQUIRES_ARM_NEON_FMA;
13349 for (uint32_t n = 16; n <= 24; n += 8) {
13350 for (size_t k = 1; k <= 10; k += 3) {
13351 GemmMicrokernelTester()
13352 .mr(4)
13353 .nr(8)
13354 .kr(1)
13355 .sr(1)
13356 .m(4)
13357 .n(n)
13358 .k(k)
13359 .cn_stride(11)
13360 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13361 }
13362 }
13363 }
13364
13365 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
13366 TEST_REQUIRES_ARM_NEON_FMA;
13367 for (uint32_t n = 16; n <= 24; n += 8) {
13368 for (size_t k = 1; k <= 10; k += 3) {
13369 GemmMicrokernelTester()
13370 .mr(4)
13371 .nr(8)
13372 .kr(1)
13373 .sr(1)
13374 .m(4)
13375 .n(n)
13376 .k(k)
13377 .a_stride(13)
13378 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13379 }
13380 }
13381 }
13382
13383 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
13384 TEST_REQUIRES_ARM_NEON_FMA;
13385 for (uint32_t n = 16; n <= 24; n += 8) {
13386 for (size_t k = 1; k <= 10; k += 3) {
13387 for (uint32_t m = 1; m <= 4; m++) {
13388 GemmMicrokernelTester()
13389 .mr(4)
13390 .nr(8)
13391 .kr(1)
13392 .sr(1)
13393 .m(m)
13394 .n(n)
13395 .k(k)
13396 .iterations(1)
13397 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13398 }
13399 }
13400 }
13401 }
13402
13403 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
13404 TEST_REQUIRES_ARM_NEON_FMA;
13405 for (size_t k = 1; k <= 10; k += 3) {
13406 for (uint32_t m = 1; m <= 4; m++) {
13407 for (uint32_t n = 1; n <= 8; n++) {
13408 GemmMicrokernelTester()
13409 .mr(4)
13410 .nr(8)
13411 .kr(1)
13412 .sr(1)
13413 .m(m)
13414 .n(n)
13415 .k(k)
13416 .cm_stride(11)
13417 .iterations(1)
13418 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13419 }
13420 }
13421 }
13422 }
13423
13424 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, qmin) {
13425 TEST_REQUIRES_ARM_NEON_FMA;
13426 GemmMicrokernelTester()
13427 .mr(4)
13428 .nr(8)
13429 .kr(1)
13430 .sr(1)
13431 .m(4)
13432 .n(8)
13433 .k(2)
13434 .qmin(128)
13435 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13436 }
13437
13438 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, qmax) {
13439 TEST_REQUIRES_ARM_NEON_FMA;
13440 GemmMicrokernelTester()
13441 .mr(4)
13442 .nr(8)
13443 .kr(1)
13444 .sr(1)
13445 .m(4)
13446 .n(8)
13447 .k(2)
13448 .qmax(128)
13449 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13450 }
13451
13452 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cm) {
13453 TEST_REQUIRES_ARM_NEON_FMA;
13454 GemmMicrokernelTester()
13455 .mr(4)
13456 .nr(8)
13457 .kr(1)
13458 .sr(1)
13459 .m(4)
13460 .n(8)
13461 .k(2)
13462 .cm_stride(11)
13463 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
13464 }
13465#endif // XNN_ARCH_ARM64
13466
13467
13468#if XNN_ARCH_ARM64
13469 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4) {
13470 TEST_REQUIRES_ARM_NEON_FMA;
13471 GemmMicrokernelTester()
13472 .mr(4)
13473 .nr(8)
13474 .kr(1)
13475 .sr(1)
13476 .m(4)
13477 .n(8)
13478 .k(4)
13479 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13480 }
13481
13482 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cn) {
13483 TEST_REQUIRES_ARM_NEON_FMA;
13484 GemmMicrokernelTester()
13485 .mr(4)
13486 .nr(8)
13487 .kr(1)
13488 .sr(1)
13489 .m(4)
13490 .n(8)
13491 .k(4)
13492 .cn_stride(11)
13493 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13494 }
13495
13496 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
13497 TEST_REQUIRES_ARM_NEON_FMA;
13498 GemmMicrokernelTester()
13499 .mr(4)
13500 .nr(8)
13501 .kr(1)
13502 .sr(1)
13503 .m(4)
13504 .n(8)
13505 .k(4)
13506 .a_stride(7)
13507 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13508 }
13509
13510 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
13511 TEST_REQUIRES_ARM_NEON_FMA;
13512 for (uint32_t m = 1; m <= 4; m++) {
13513 for (uint32_t n = 1; n <= 8; n++) {
13514 GemmMicrokernelTester()
13515 .mr(4)
13516 .nr(8)
13517 .kr(1)
13518 .sr(1)
13519 .m(m)
13520 .n(n)
13521 .k(4)
13522 .iterations(1)
13523 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13524 }
13525 }
13526 }
13527
13528 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
13529 TEST_REQUIRES_ARM_NEON_FMA;
13530 for (uint32_t m = 1; m <= 4; m++) {
13531 GemmMicrokernelTester()
13532 .mr(4)
13533 .nr(8)
13534 .kr(1)
13535 .sr(1)
13536 .m(m)
13537 .n(8)
13538 .k(4)
13539 .iterations(1)
13540 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13541 }
13542 }
13543
13544 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
13545 TEST_REQUIRES_ARM_NEON_FMA;
13546 for (uint32_t n = 1; n <= 8; n++) {
13547 GemmMicrokernelTester()
13548 .mr(4)
13549 .nr(8)
13550 .kr(1)
13551 .sr(1)
13552 .m(4)
13553 .n(n)
13554 .k(4)
13555 .iterations(1)
13556 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13557 }
13558 }
13559
13560 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4) {
13561 TEST_REQUIRES_ARM_NEON_FMA;
13562 for (size_t k = 1; k < 4; k++) {
13563 GemmMicrokernelTester()
13564 .mr(4)
13565 .nr(8)
13566 .kr(1)
13567 .sr(1)
13568 .m(4)
13569 .n(8)
13570 .k(k)
13571 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13572 }
13573 }
13574
13575 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
13576 TEST_REQUIRES_ARM_NEON_FMA;
13577 for (size_t k = 1; k < 4; k++) {
13578 GemmMicrokernelTester()
13579 .mr(4)
13580 .nr(8)
13581 .kr(1)
13582 .sr(1)
13583 .m(4)
13584 .n(8)
13585 .k(k)
13586 .a_stride(7)
13587 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13588 }
13589 }
13590
13591 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
13592 TEST_REQUIRES_ARM_NEON_FMA;
13593 for (size_t k = 1; k < 4; k++) {
13594 for (uint32_t m = 1; m <= 4; m++) {
13595 for (uint32_t n = 1; n <= 8; n++) {
13596 GemmMicrokernelTester()
13597 .mr(4)
13598 .nr(8)
13599 .kr(1)
13600 .sr(1)
13601 .m(m)
13602 .n(n)
13603 .k(k)
13604 .iterations(1)
13605 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13606 }
13607 }
13608 }
13609 }
13610
13611 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4) {
13612 TEST_REQUIRES_ARM_NEON_FMA;
13613 for (size_t k = 5; k < 8; k++) {
13614 GemmMicrokernelTester()
13615 .mr(4)
13616 .nr(8)
13617 .kr(1)
13618 .sr(1)
13619 .m(4)
13620 .n(8)
13621 .k(k)
13622 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13623 }
13624 }
13625
13626 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
13627 TEST_REQUIRES_ARM_NEON_FMA;
13628 for (size_t k = 5; k < 8; k++) {
13629 GemmMicrokernelTester()
13630 .mr(4)
13631 .nr(8)
13632 .kr(1)
13633 .sr(1)
13634 .m(4)
13635 .n(8)
13636 .k(k)
13637 .a_stride(11)
13638 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13639 }
13640 }
13641
13642 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
13643 TEST_REQUIRES_ARM_NEON_FMA;
13644 for (size_t k = 5; k < 8; k++) {
13645 for (uint32_t m = 1; m <= 4; m++) {
13646 for (uint32_t n = 1; n <= 8; n++) {
13647 GemmMicrokernelTester()
13648 .mr(4)
13649 .nr(8)
13650 .kr(1)
13651 .sr(1)
13652 .m(m)
13653 .n(n)
13654 .k(k)
13655 .iterations(1)
13656 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13657 }
13658 }
13659 }
13660 }
13661
13662 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4) {
13663 TEST_REQUIRES_ARM_NEON_FMA;
13664 for (size_t k = 8; k <= 40; k += 4) {
13665 GemmMicrokernelTester()
13666 .mr(4)
13667 .nr(8)
13668 .kr(1)
13669 .sr(1)
13670 .m(4)
13671 .n(8)
13672 .k(k)
13673 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13674 }
13675 }
13676
13677 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
13678 TEST_REQUIRES_ARM_NEON_FMA;
13679 for (size_t k = 8; k <= 40; k += 4) {
13680 GemmMicrokernelTester()
13681 .mr(4)
13682 .nr(8)
13683 .kr(1)
13684 .sr(1)
13685 .m(4)
13686 .n(8)
13687 .k(k)
13688 .a_stride(43)
13689 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13690 }
13691 }
13692
13693 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
13694 TEST_REQUIRES_ARM_NEON_FMA;
13695 for (size_t k = 8; k <= 40; k += 4) {
13696 for (uint32_t m = 1; m <= 4; m++) {
13697 for (uint32_t n = 1; n <= 8; n++) {
13698 GemmMicrokernelTester()
13699 .mr(4)
13700 .nr(8)
13701 .kr(1)
13702 .sr(1)
13703 .m(m)
13704 .n(n)
13705 .k(k)
13706 .iterations(1)
13707 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13708 }
13709 }
13710 }
13711 }
13712
13713 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8) {
13714 TEST_REQUIRES_ARM_NEON_FMA;
13715 for (uint32_t n = 9; n < 16; n++) {
13716 for (size_t k = 1; k <= 20; k += 5) {
13717 GemmMicrokernelTester()
13718 .mr(4)
13719 .nr(8)
13720 .kr(1)
13721 .sr(1)
13722 .m(4)
13723 .n(8)
13724 .k(k)
13725 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13726 }
13727 }
13728 }
13729
13730 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
13731 TEST_REQUIRES_ARM_NEON_FMA;
13732 for (uint32_t n = 9; n < 16; n++) {
13733 for (size_t k = 1; k <= 20; k += 5) {
13734 GemmMicrokernelTester()
13735 .mr(4)
13736 .nr(8)
13737 .kr(1)
13738 .sr(1)
13739 .m(4)
13740 .n(8)
13741 .k(k)
13742 .cn_stride(11)
13743 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13744 }
13745 }
13746 }
13747
13748 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
13749 TEST_REQUIRES_ARM_NEON_FMA;
13750 for (uint32_t n = 9; n < 16; n++) {
13751 for (size_t k = 1; k <= 20; k += 5) {
13752 GemmMicrokernelTester()
13753 .mr(4)
13754 .nr(8)
13755 .kr(1)
13756 .sr(1)
13757 .m(4)
13758 .n(n)
13759 .k(k)
13760 .a_stride(23)
13761 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13762 }
13763 }
13764 }
13765
13766 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
13767 TEST_REQUIRES_ARM_NEON_FMA;
13768 for (uint32_t n = 9; n < 16; n++) {
13769 for (size_t k = 1; k <= 20; k += 5) {
13770 for (uint32_t m = 1; m <= 4; m++) {
13771 GemmMicrokernelTester()
13772 .mr(4)
13773 .nr(8)
13774 .kr(1)
13775 .sr(1)
13776 .m(m)
13777 .n(n)
13778 .k(k)
13779 .iterations(1)
13780 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13781 }
13782 }
13783 }
13784 }
13785
13786 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8) {
13787 TEST_REQUIRES_ARM_NEON_FMA;
13788 for (uint32_t n = 16; n <= 24; n += 8) {
13789 for (size_t k = 1; k <= 20; k += 5) {
13790 GemmMicrokernelTester()
13791 .mr(4)
13792 .nr(8)
13793 .kr(1)
13794 .sr(1)
13795 .m(4)
13796 .n(8)
13797 .k(k)
13798 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13799 }
13800 }
13801 }
13802
13803 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
13804 TEST_REQUIRES_ARM_NEON_FMA;
13805 for (uint32_t n = 16; n <= 24; n += 8) {
13806 for (size_t k = 1; k <= 20; k += 5) {
13807 GemmMicrokernelTester()
13808 .mr(4)
13809 .nr(8)
13810 .kr(1)
13811 .sr(1)
13812 .m(4)
13813 .n(n)
13814 .k(k)
13815 .cn_stride(11)
13816 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13817 }
13818 }
13819 }
13820
13821 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
13822 TEST_REQUIRES_ARM_NEON_FMA;
13823 for (uint32_t n = 16; n <= 24; n += 8) {
13824 for (size_t k = 1; k <= 20; k += 5) {
13825 GemmMicrokernelTester()
13826 .mr(4)
13827 .nr(8)
13828 .kr(1)
13829 .sr(1)
13830 .m(4)
13831 .n(n)
13832 .k(k)
13833 .a_stride(23)
13834 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13835 }
13836 }
13837 }
13838
13839 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
13840 TEST_REQUIRES_ARM_NEON_FMA;
13841 for (uint32_t n = 16; n <= 24; n += 8) {
13842 for (size_t k = 1; k <= 20; k += 5) {
13843 for (uint32_t m = 1; m <= 4; m++) {
13844 GemmMicrokernelTester()
13845 .mr(4)
13846 .nr(8)
13847 .kr(1)
13848 .sr(1)
13849 .m(m)
13850 .n(n)
13851 .k(k)
13852 .iterations(1)
13853 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13854 }
13855 }
13856 }
13857 }
13858
13859 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
13860 TEST_REQUIRES_ARM_NEON_FMA;
13861 for (size_t k = 1; k <= 20; k += 5) {
13862 for (uint32_t m = 1; m <= 4; m++) {
13863 for (uint32_t n = 1; n <= 8; n++) {
13864 GemmMicrokernelTester()
13865 .mr(4)
13866 .nr(8)
13867 .kr(1)
13868 .sr(1)
13869 .m(m)
13870 .n(n)
13871 .k(k)
13872 .cm_stride(11)
13873 .iterations(1)
13874 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13875 }
13876 }
13877 }
13878 }
13879
13880 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, qmin) {
13881 TEST_REQUIRES_ARM_NEON_FMA;
13882 GemmMicrokernelTester()
13883 .mr(4)
13884 .nr(8)
13885 .kr(1)
13886 .sr(1)
13887 .m(4)
13888 .n(8)
13889 .k(4)
13890 .qmin(128)
13891 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13892 }
13893
13894 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, qmax) {
13895 TEST_REQUIRES_ARM_NEON_FMA;
13896 GemmMicrokernelTester()
13897 .mr(4)
13898 .nr(8)
13899 .kr(1)
13900 .sr(1)
13901 .m(4)
13902 .n(8)
13903 .k(4)
13904 .qmax(128)
13905 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13906 }
13907
13908 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cm) {
13909 TEST_REQUIRES_ARM_NEON_FMA;
13910 GemmMicrokernelTester()
13911 .mr(4)
13912 .nr(8)
13913 .kr(1)
13914 .sr(1)
13915 .m(4)
13916 .n(8)
13917 .k(4)
13918 .cm_stride(11)
13919 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
13920 }
13921#endif // XNN_ARCH_ARM64
13922
13923
13924#if XNN_ARCH_ARM64
13925 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2) {
13926 TEST_REQUIRES_ARM_NEON_FMA;
13927 GemmMicrokernelTester()
13928 .mr(5)
13929 .nr(8)
13930 .kr(1)
13931 .sr(1)
13932 .m(5)
13933 .n(8)
13934 .k(2)
13935 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
13936 }
13937
13938 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cn) {
13939 TEST_REQUIRES_ARM_NEON_FMA;
13940 GemmMicrokernelTester()
13941 .mr(5)
13942 .nr(8)
13943 .kr(1)
13944 .sr(1)
13945 .m(5)
13946 .n(8)
13947 .k(2)
13948 .cn_stride(11)
13949 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
13950 }
13951
13952 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
13953 TEST_REQUIRES_ARM_NEON_FMA;
13954 GemmMicrokernelTester()
13955 .mr(5)
13956 .nr(8)
13957 .kr(1)
13958 .sr(1)
13959 .m(5)
13960 .n(8)
13961 .k(2)
13962 .a_stride(5)
13963 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
13964 }
13965
13966 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
13967 TEST_REQUIRES_ARM_NEON_FMA;
13968 for (uint32_t m = 1; m <= 5; m++) {
13969 for (uint32_t n = 1; n <= 8; n++) {
13970 GemmMicrokernelTester()
13971 .mr(5)
13972 .nr(8)
13973 .kr(1)
13974 .sr(1)
13975 .m(m)
13976 .n(n)
13977 .k(2)
13978 .iterations(1)
13979 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
13980 }
13981 }
13982 }
13983
13984 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
13985 TEST_REQUIRES_ARM_NEON_FMA;
13986 for (uint32_t m = 1; m <= 5; m++) {
13987 GemmMicrokernelTester()
13988 .mr(5)
13989 .nr(8)
13990 .kr(1)
13991 .sr(1)
13992 .m(m)
13993 .n(8)
13994 .k(2)
13995 .iterations(1)
13996 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
13997 }
13998 }
13999
14000 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
14001 TEST_REQUIRES_ARM_NEON_FMA;
14002 for (uint32_t n = 1; n <= 8; n++) {
14003 GemmMicrokernelTester()
14004 .mr(5)
14005 .nr(8)
14006 .kr(1)
14007 .sr(1)
14008 .m(5)
14009 .n(n)
14010 .k(2)
14011 .iterations(1)
14012 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14013 }
14014 }
14015
14016 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2) {
14017 TEST_REQUIRES_ARM_NEON_FMA;
14018 for (size_t k = 1; k < 2; k++) {
14019 GemmMicrokernelTester()
14020 .mr(5)
14021 .nr(8)
14022 .kr(1)
14023 .sr(1)
14024 .m(5)
14025 .n(8)
14026 .k(k)
14027 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14028 }
14029 }
14030
14031 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
14032 TEST_REQUIRES_ARM_NEON_FMA;
14033 for (size_t k = 1; k < 2; k++) {
14034 GemmMicrokernelTester()
14035 .mr(5)
14036 .nr(8)
14037 .kr(1)
14038 .sr(1)
14039 .m(5)
14040 .n(8)
14041 .k(k)
14042 .a_stride(5)
14043 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14044 }
14045 }
14046
14047 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
14048 TEST_REQUIRES_ARM_NEON_FMA;
14049 for (size_t k = 1; k < 2; k++) {
14050 for (uint32_t m = 1; m <= 5; m++) {
14051 for (uint32_t n = 1; n <= 8; n++) {
14052 GemmMicrokernelTester()
14053 .mr(5)
14054 .nr(8)
14055 .kr(1)
14056 .sr(1)
14057 .m(m)
14058 .n(n)
14059 .k(k)
14060 .iterations(1)
14061 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14062 }
14063 }
14064 }
14065 }
14066
14067 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2) {
14068 TEST_REQUIRES_ARM_NEON_FMA;
14069 for (size_t k = 3; k < 4; k++) {
14070 GemmMicrokernelTester()
14071 .mr(5)
14072 .nr(8)
14073 .kr(1)
14074 .sr(1)
14075 .m(5)
14076 .n(8)
14077 .k(k)
14078 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14079 }
14080 }
14081
14082 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
14083 TEST_REQUIRES_ARM_NEON_FMA;
14084 for (size_t k = 3; k < 4; k++) {
14085 GemmMicrokernelTester()
14086 .mr(5)
14087 .nr(8)
14088 .kr(1)
14089 .sr(1)
14090 .m(5)
14091 .n(8)
14092 .k(k)
14093 .a_stride(7)
14094 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14095 }
14096 }
14097
14098 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
14099 TEST_REQUIRES_ARM_NEON_FMA;
14100 for (size_t k = 3; k < 4; k++) {
14101 for (uint32_t m = 1; m <= 5; m++) {
14102 for (uint32_t n = 1; n <= 8; n++) {
14103 GemmMicrokernelTester()
14104 .mr(5)
14105 .nr(8)
14106 .kr(1)
14107 .sr(1)
14108 .m(m)
14109 .n(n)
14110 .k(k)
14111 .iterations(1)
14112 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14113 }
14114 }
14115 }
14116 }
14117
14118 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2) {
14119 TEST_REQUIRES_ARM_NEON_FMA;
14120 for (size_t k = 4; k <= 20; k += 2) {
14121 GemmMicrokernelTester()
14122 .mr(5)
14123 .nr(8)
14124 .kr(1)
14125 .sr(1)
14126 .m(5)
14127 .n(8)
14128 .k(k)
14129 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14130 }
14131 }
14132
14133 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
14134 TEST_REQUIRES_ARM_NEON_FMA;
14135 for (size_t k = 4; k <= 20; k += 2) {
14136 GemmMicrokernelTester()
14137 .mr(5)
14138 .nr(8)
14139 .kr(1)
14140 .sr(1)
14141 .m(5)
14142 .n(8)
14143 .k(k)
14144 .a_stride(23)
14145 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14146 }
14147 }
14148
14149 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
14150 TEST_REQUIRES_ARM_NEON_FMA;
14151 for (size_t k = 4; k <= 20; k += 2) {
14152 for (uint32_t m = 1; m <= 5; m++) {
14153 for (uint32_t n = 1; n <= 8; n++) {
14154 GemmMicrokernelTester()
14155 .mr(5)
14156 .nr(8)
14157 .kr(1)
14158 .sr(1)
14159 .m(m)
14160 .n(n)
14161 .k(k)
14162 .iterations(1)
14163 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14164 }
14165 }
14166 }
14167 }
14168
14169 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8) {
14170 TEST_REQUIRES_ARM_NEON_FMA;
14171 for (uint32_t n = 9; n < 16; n++) {
14172 for (size_t k = 1; k <= 10; k += 3) {
14173 GemmMicrokernelTester()
14174 .mr(5)
14175 .nr(8)
14176 .kr(1)
14177 .sr(1)
14178 .m(5)
14179 .n(8)
14180 .k(k)
14181 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14182 }
14183 }
14184 }
14185
14186 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
14187 TEST_REQUIRES_ARM_NEON_FMA;
14188 for (uint32_t n = 9; n < 16; n++) {
14189 for (size_t k = 1; k <= 10; k += 3) {
14190 GemmMicrokernelTester()
14191 .mr(5)
14192 .nr(8)
14193 .kr(1)
14194 .sr(1)
14195 .m(5)
14196 .n(8)
14197 .k(k)
14198 .cn_stride(11)
14199 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14200 }
14201 }
14202 }
14203
14204 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
14205 TEST_REQUIRES_ARM_NEON_FMA;
14206 for (uint32_t n = 9; n < 16; n++) {
14207 for (size_t k = 1; k <= 10; k += 3) {
14208 GemmMicrokernelTester()
14209 .mr(5)
14210 .nr(8)
14211 .kr(1)
14212 .sr(1)
14213 .m(5)
14214 .n(n)
14215 .k(k)
14216 .a_stride(13)
14217 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14218 }
14219 }
14220 }
14221
14222 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
14223 TEST_REQUIRES_ARM_NEON_FMA;
14224 for (uint32_t n = 9; n < 16; n++) {
14225 for (size_t k = 1; k <= 10; k += 3) {
14226 for (uint32_t m = 1; m <= 5; m++) {
14227 GemmMicrokernelTester()
14228 .mr(5)
14229 .nr(8)
14230 .kr(1)
14231 .sr(1)
14232 .m(m)
14233 .n(n)
14234 .k(k)
14235 .iterations(1)
14236 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14237 }
14238 }
14239 }
14240 }
14241
14242 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8) {
14243 TEST_REQUIRES_ARM_NEON_FMA;
14244 for (uint32_t n = 16; n <= 24; n += 8) {
14245 for (size_t k = 1; k <= 10; k += 3) {
14246 GemmMicrokernelTester()
14247 .mr(5)
14248 .nr(8)
14249 .kr(1)
14250 .sr(1)
14251 .m(5)
14252 .n(8)
14253 .k(k)
14254 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14255 }
14256 }
14257 }
14258
14259 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
14260 TEST_REQUIRES_ARM_NEON_FMA;
14261 for (uint32_t n = 16; n <= 24; n += 8) {
14262 for (size_t k = 1; k <= 10; k += 3) {
14263 GemmMicrokernelTester()
14264 .mr(5)
14265 .nr(8)
14266 .kr(1)
14267 .sr(1)
14268 .m(5)
14269 .n(n)
14270 .k(k)
14271 .cn_stride(11)
14272 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14273 }
14274 }
14275 }
14276
14277 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
14278 TEST_REQUIRES_ARM_NEON_FMA;
14279 for (uint32_t n = 16; n <= 24; n += 8) {
14280 for (size_t k = 1; k <= 10; k += 3) {
14281 GemmMicrokernelTester()
14282 .mr(5)
14283 .nr(8)
14284 .kr(1)
14285 .sr(1)
14286 .m(5)
14287 .n(n)
14288 .k(k)
14289 .a_stride(13)
14290 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14291 }
14292 }
14293 }
14294
14295 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
14296 TEST_REQUIRES_ARM_NEON_FMA;
14297 for (uint32_t n = 16; n <= 24; n += 8) {
14298 for (size_t k = 1; k <= 10; k += 3) {
14299 for (uint32_t m = 1; m <= 5; m++) {
14300 GemmMicrokernelTester()
14301 .mr(5)
14302 .nr(8)
14303 .kr(1)
14304 .sr(1)
14305 .m(m)
14306 .n(n)
14307 .k(k)
14308 .iterations(1)
14309 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14310 }
14311 }
14312 }
14313 }
14314
14315 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
14316 TEST_REQUIRES_ARM_NEON_FMA;
14317 for (size_t k = 1; k <= 10; k += 3) {
14318 for (uint32_t m = 1; m <= 5; m++) {
14319 for (uint32_t n = 1; n <= 8; n++) {
14320 GemmMicrokernelTester()
14321 .mr(5)
14322 .nr(8)
14323 .kr(1)
14324 .sr(1)
14325 .m(m)
14326 .n(n)
14327 .k(k)
14328 .cm_stride(11)
14329 .iterations(1)
14330 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14331 }
14332 }
14333 }
14334 }
14335
14336 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, qmin) {
14337 TEST_REQUIRES_ARM_NEON_FMA;
14338 GemmMicrokernelTester()
14339 .mr(5)
14340 .nr(8)
14341 .kr(1)
14342 .sr(1)
14343 .m(5)
14344 .n(8)
14345 .k(2)
14346 .qmin(128)
14347 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14348 }
14349
14350 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, qmax) {
14351 TEST_REQUIRES_ARM_NEON_FMA;
14352 GemmMicrokernelTester()
14353 .mr(5)
14354 .nr(8)
14355 .kr(1)
14356 .sr(1)
14357 .m(5)
14358 .n(8)
14359 .k(2)
14360 .qmax(128)
14361 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14362 }
14363
14364 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cm) {
14365 TEST_REQUIRES_ARM_NEON_FMA;
14366 GemmMicrokernelTester()
14367 .mr(5)
14368 .nr(8)
14369 .kr(1)
14370 .sr(1)
14371 .m(5)
14372 .n(8)
14373 .k(2)
14374 .cm_stride(11)
14375 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
14376 }
14377#endif // XNN_ARCH_ARM64
14378
14379
14380#if XNN_ARCH_ARM64
14381 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2) {
14382 TEST_REQUIRES_ARM_NEON_FMA;
14383 GemmMicrokernelTester()
14384 .mr(6)
14385 .nr(8)
14386 .kr(1)
14387 .sr(1)
14388 .m(6)
14389 .n(8)
14390 .k(2)
14391 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14392 }
14393
14394 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cn) {
14395 TEST_REQUIRES_ARM_NEON_FMA;
14396 GemmMicrokernelTester()
14397 .mr(6)
14398 .nr(8)
14399 .kr(1)
14400 .sr(1)
14401 .m(6)
14402 .n(8)
14403 .k(2)
14404 .cn_stride(11)
14405 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14406 }
14407
14408 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
14409 TEST_REQUIRES_ARM_NEON_FMA;
14410 GemmMicrokernelTester()
14411 .mr(6)
14412 .nr(8)
14413 .kr(1)
14414 .sr(1)
14415 .m(6)
14416 .n(8)
14417 .k(2)
14418 .a_stride(5)
14419 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14420 }
14421
14422 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
14423 TEST_REQUIRES_ARM_NEON_FMA;
14424 for (uint32_t m = 1; m <= 6; m++) {
14425 for (uint32_t n = 1; n <= 8; n++) {
14426 GemmMicrokernelTester()
14427 .mr(6)
14428 .nr(8)
14429 .kr(1)
14430 .sr(1)
14431 .m(m)
14432 .n(n)
14433 .k(2)
14434 .iterations(1)
14435 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14436 }
14437 }
14438 }
14439
14440 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
14441 TEST_REQUIRES_ARM_NEON_FMA;
14442 for (uint32_t m = 1; m <= 6; m++) {
14443 GemmMicrokernelTester()
14444 .mr(6)
14445 .nr(8)
14446 .kr(1)
14447 .sr(1)
14448 .m(m)
14449 .n(8)
14450 .k(2)
14451 .iterations(1)
14452 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14453 }
14454 }
14455
14456 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
14457 TEST_REQUIRES_ARM_NEON_FMA;
14458 for (uint32_t n = 1; n <= 8; n++) {
14459 GemmMicrokernelTester()
14460 .mr(6)
14461 .nr(8)
14462 .kr(1)
14463 .sr(1)
14464 .m(6)
14465 .n(n)
14466 .k(2)
14467 .iterations(1)
14468 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14469 }
14470 }
14471
14472 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2) {
14473 TEST_REQUIRES_ARM_NEON_FMA;
14474 for (size_t k = 1; k < 2; k++) {
14475 GemmMicrokernelTester()
14476 .mr(6)
14477 .nr(8)
14478 .kr(1)
14479 .sr(1)
14480 .m(6)
14481 .n(8)
14482 .k(k)
14483 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14484 }
14485 }
14486
14487 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
14488 TEST_REQUIRES_ARM_NEON_FMA;
14489 for (size_t k = 1; k < 2; k++) {
14490 GemmMicrokernelTester()
14491 .mr(6)
14492 .nr(8)
14493 .kr(1)
14494 .sr(1)
14495 .m(6)
14496 .n(8)
14497 .k(k)
14498 .a_stride(5)
14499 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14500 }
14501 }
14502
14503 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
14504 TEST_REQUIRES_ARM_NEON_FMA;
14505 for (size_t k = 1; k < 2; k++) {
14506 for (uint32_t m = 1; m <= 6; m++) {
14507 for (uint32_t n = 1; n <= 8; n++) {
14508 GemmMicrokernelTester()
14509 .mr(6)
14510 .nr(8)
14511 .kr(1)
14512 .sr(1)
14513 .m(m)
14514 .n(n)
14515 .k(k)
14516 .iterations(1)
14517 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14518 }
14519 }
14520 }
14521 }
14522
14523 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2) {
14524 TEST_REQUIRES_ARM_NEON_FMA;
14525 for (size_t k = 3; k < 4; k++) {
14526 GemmMicrokernelTester()
14527 .mr(6)
14528 .nr(8)
14529 .kr(1)
14530 .sr(1)
14531 .m(6)
14532 .n(8)
14533 .k(k)
14534 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14535 }
14536 }
14537
14538 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
14539 TEST_REQUIRES_ARM_NEON_FMA;
14540 for (size_t k = 3; k < 4; k++) {
14541 GemmMicrokernelTester()
14542 .mr(6)
14543 .nr(8)
14544 .kr(1)
14545 .sr(1)
14546 .m(6)
14547 .n(8)
14548 .k(k)
14549 .a_stride(7)
14550 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14551 }
14552 }
14553
14554 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
14555 TEST_REQUIRES_ARM_NEON_FMA;
14556 for (size_t k = 3; k < 4; k++) {
14557 for (uint32_t m = 1; m <= 6; m++) {
14558 for (uint32_t n = 1; n <= 8; n++) {
14559 GemmMicrokernelTester()
14560 .mr(6)
14561 .nr(8)
14562 .kr(1)
14563 .sr(1)
14564 .m(m)
14565 .n(n)
14566 .k(k)
14567 .iterations(1)
14568 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14569 }
14570 }
14571 }
14572 }
14573
14574 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2) {
14575 TEST_REQUIRES_ARM_NEON_FMA;
14576 for (size_t k = 4; k <= 20; k += 2) {
14577 GemmMicrokernelTester()
14578 .mr(6)
14579 .nr(8)
14580 .kr(1)
14581 .sr(1)
14582 .m(6)
14583 .n(8)
14584 .k(k)
14585 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14586 }
14587 }
14588
14589 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
14590 TEST_REQUIRES_ARM_NEON_FMA;
14591 for (size_t k = 4; k <= 20; k += 2) {
14592 GemmMicrokernelTester()
14593 .mr(6)
14594 .nr(8)
14595 .kr(1)
14596 .sr(1)
14597 .m(6)
14598 .n(8)
14599 .k(k)
14600 .a_stride(23)
14601 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14602 }
14603 }
14604
14605 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
14606 TEST_REQUIRES_ARM_NEON_FMA;
14607 for (size_t k = 4; k <= 20; k += 2) {
14608 for (uint32_t m = 1; m <= 6; m++) {
14609 for (uint32_t n = 1; n <= 8; n++) {
14610 GemmMicrokernelTester()
14611 .mr(6)
14612 .nr(8)
14613 .kr(1)
14614 .sr(1)
14615 .m(m)
14616 .n(n)
14617 .k(k)
14618 .iterations(1)
14619 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14620 }
14621 }
14622 }
14623 }
14624
14625 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8) {
14626 TEST_REQUIRES_ARM_NEON_FMA;
14627 for (uint32_t n = 9; n < 16; n++) {
14628 for (size_t k = 1; k <= 10; k += 3) {
14629 GemmMicrokernelTester()
14630 .mr(6)
14631 .nr(8)
14632 .kr(1)
14633 .sr(1)
14634 .m(6)
14635 .n(8)
14636 .k(k)
14637 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14638 }
14639 }
14640 }
14641
14642 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
14643 TEST_REQUIRES_ARM_NEON_FMA;
14644 for (uint32_t n = 9; n < 16; n++) {
14645 for (size_t k = 1; k <= 10; k += 3) {
14646 GemmMicrokernelTester()
14647 .mr(6)
14648 .nr(8)
14649 .kr(1)
14650 .sr(1)
14651 .m(6)
14652 .n(8)
14653 .k(k)
14654 .cn_stride(11)
14655 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14656 }
14657 }
14658 }
14659
14660 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
14661 TEST_REQUIRES_ARM_NEON_FMA;
14662 for (uint32_t n = 9; n < 16; n++) {
14663 for (size_t k = 1; k <= 10; k += 3) {
14664 GemmMicrokernelTester()
14665 .mr(6)
14666 .nr(8)
14667 .kr(1)
14668 .sr(1)
14669 .m(6)
14670 .n(n)
14671 .k(k)
14672 .a_stride(13)
14673 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14674 }
14675 }
14676 }
14677
14678 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
14679 TEST_REQUIRES_ARM_NEON_FMA;
14680 for (uint32_t n = 9; n < 16; n++) {
14681 for (size_t k = 1; k <= 10; k += 3) {
14682 for (uint32_t m = 1; m <= 6; m++) {
14683 GemmMicrokernelTester()
14684 .mr(6)
14685 .nr(8)
14686 .kr(1)
14687 .sr(1)
14688 .m(m)
14689 .n(n)
14690 .k(k)
14691 .iterations(1)
14692 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14693 }
14694 }
14695 }
14696 }
14697
14698 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8) {
14699 TEST_REQUIRES_ARM_NEON_FMA;
14700 for (uint32_t n = 16; n <= 24; n += 8) {
14701 for (size_t k = 1; k <= 10; k += 3) {
14702 GemmMicrokernelTester()
14703 .mr(6)
14704 .nr(8)
14705 .kr(1)
14706 .sr(1)
14707 .m(6)
14708 .n(8)
14709 .k(k)
14710 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14711 }
14712 }
14713 }
14714
14715 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
14716 TEST_REQUIRES_ARM_NEON_FMA;
14717 for (uint32_t n = 16; n <= 24; n += 8) {
14718 for (size_t k = 1; k <= 10; k += 3) {
14719 GemmMicrokernelTester()
14720 .mr(6)
14721 .nr(8)
14722 .kr(1)
14723 .sr(1)
14724 .m(6)
14725 .n(n)
14726 .k(k)
14727 .cn_stride(11)
14728 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14729 }
14730 }
14731 }
14732
14733 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
14734 TEST_REQUIRES_ARM_NEON_FMA;
14735 for (uint32_t n = 16; n <= 24; n += 8) {
14736 for (size_t k = 1; k <= 10; k += 3) {
14737 GemmMicrokernelTester()
14738 .mr(6)
14739 .nr(8)
14740 .kr(1)
14741 .sr(1)
14742 .m(6)
14743 .n(n)
14744 .k(k)
14745 .a_stride(13)
14746 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14747 }
14748 }
14749 }
14750
14751 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
14752 TEST_REQUIRES_ARM_NEON_FMA;
14753 for (uint32_t n = 16; n <= 24; n += 8) {
14754 for (size_t k = 1; k <= 10; k += 3) {
14755 for (uint32_t m = 1; m <= 6; m++) {
14756 GemmMicrokernelTester()
14757 .mr(6)
14758 .nr(8)
14759 .kr(1)
14760 .sr(1)
14761 .m(m)
14762 .n(n)
14763 .k(k)
14764 .iterations(1)
14765 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14766 }
14767 }
14768 }
14769 }
14770
14771 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
14772 TEST_REQUIRES_ARM_NEON_FMA;
14773 for (size_t k = 1; k <= 10; k += 3) {
14774 for (uint32_t m = 1; m <= 6; m++) {
14775 for (uint32_t n = 1; n <= 8; n++) {
14776 GemmMicrokernelTester()
14777 .mr(6)
14778 .nr(8)
14779 .kr(1)
14780 .sr(1)
14781 .m(m)
14782 .n(n)
14783 .k(k)
14784 .cm_stride(11)
14785 .iterations(1)
14786 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14787 }
14788 }
14789 }
14790 }
14791
14792 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, qmin) {
14793 TEST_REQUIRES_ARM_NEON_FMA;
14794 GemmMicrokernelTester()
14795 .mr(6)
14796 .nr(8)
14797 .kr(1)
14798 .sr(1)
14799 .m(6)
14800 .n(8)
14801 .k(2)
14802 .qmin(128)
14803 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14804 }
14805
14806 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, qmax) {
14807 TEST_REQUIRES_ARM_NEON_FMA;
14808 GemmMicrokernelTester()
14809 .mr(6)
14810 .nr(8)
14811 .kr(1)
14812 .sr(1)
14813 .m(6)
14814 .n(8)
14815 .k(2)
14816 .qmax(128)
14817 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14818 }
14819
14820 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cm) {
14821 TEST_REQUIRES_ARM_NEON_FMA;
14822 GemmMicrokernelTester()
14823 .mr(6)
14824 .nr(8)
14825 .kr(1)
14826 .sr(1)
14827 .m(6)
14828 .n(8)
14829 .k(2)
14830 .cm_stride(11)
14831 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
14832 }
14833#endif // XNN_ARCH_ARM64
14834
14835
14836#if XNN_ARCH_ARM64
14837 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4) {
14838 TEST_REQUIRES_ARM_NEON_FMA;
14839 GemmMicrokernelTester()
14840 .mr(6)
14841 .nr(8)
14842 .kr(1)
14843 .sr(1)
14844 .m(6)
14845 .n(8)
14846 .k(4)
14847 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14848 }
14849
14850 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, strided_cn) {
14851 TEST_REQUIRES_ARM_NEON_FMA;
14852 GemmMicrokernelTester()
14853 .mr(6)
14854 .nr(8)
14855 .kr(1)
14856 .sr(1)
14857 .m(6)
14858 .n(8)
14859 .k(4)
14860 .cn_stride(11)
14861 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14862 }
14863
14864 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
14865 TEST_REQUIRES_ARM_NEON_FMA;
14866 GemmMicrokernelTester()
14867 .mr(6)
14868 .nr(8)
14869 .kr(1)
14870 .sr(1)
14871 .m(6)
14872 .n(8)
14873 .k(4)
14874 .a_stride(7)
14875 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14876 }
14877
14878 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
14879 TEST_REQUIRES_ARM_NEON_FMA;
14880 for (uint32_t m = 1; m <= 6; m++) {
14881 for (uint32_t n = 1; n <= 8; n++) {
14882 GemmMicrokernelTester()
14883 .mr(6)
14884 .nr(8)
14885 .kr(1)
14886 .sr(1)
14887 .m(m)
14888 .n(n)
14889 .k(4)
14890 .iterations(1)
14891 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14892 }
14893 }
14894 }
14895
14896 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
14897 TEST_REQUIRES_ARM_NEON_FMA;
14898 for (uint32_t m = 1; m <= 6; m++) {
14899 GemmMicrokernelTester()
14900 .mr(6)
14901 .nr(8)
14902 .kr(1)
14903 .sr(1)
14904 .m(m)
14905 .n(8)
14906 .k(4)
14907 .iterations(1)
14908 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14909 }
14910 }
14911
14912 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
14913 TEST_REQUIRES_ARM_NEON_FMA;
14914 for (uint32_t n = 1; n <= 8; n++) {
14915 GemmMicrokernelTester()
14916 .mr(6)
14917 .nr(8)
14918 .kr(1)
14919 .sr(1)
14920 .m(6)
14921 .n(n)
14922 .k(4)
14923 .iterations(1)
14924 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14925 }
14926 }
14927
14928 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_lt_4) {
14929 TEST_REQUIRES_ARM_NEON_FMA;
14930 for (size_t k = 1; k < 4; k++) {
14931 GemmMicrokernelTester()
14932 .mr(6)
14933 .nr(8)
14934 .kr(1)
14935 .sr(1)
14936 .m(6)
14937 .n(8)
14938 .k(k)
14939 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14940 }
14941 }
14942
14943 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
14944 TEST_REQUIRES_ARM_NEON_FMA;
14945 for (size_t k = 1; k < 4; k++) {
14946 GemmMicrokernelTester()
14947 .mr(6)
14948 .nr(8)
14949 .kr(1)
14950 .sr(1)
14951 .m(6)
14952 .n(8)
14953 .k(k)
14954 .a_stride(7)
14955 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14956 }
14957 }
14958
14959 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
14960 TEST_REQUIRES_ARM_NEON_FMA;
14961 for (size_t k = 1; k < 4; k++) {
14962 for (uint32_t m = 1; m <= 6; m++) {
14963 for (uint32_t n = 1; n <= 8; n++) {
14964 GemmMicrokernelTester()
14965 .mr(6)
14966 .nr(8)
14967 .kr(1)
14968 .sr(1)
14969 .m(m)
14970 .n(n)
14971 .k(k)
14972 .iterations(1)
14973 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14974 }
14975 }
14976 }
14977 }
14978
14979 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_gt_4) {
14980 TEST_REQUIRES_ARM_NEON_FMA;
14981 for (size_t k = 5; k < 8; k++) {
14982 GemmMicrokernelTester()
14983 .mr(6)
14984 .nr(8)
14985 .kr(1)
14986 .sr(1)
14987 .m(6)
14988 .n(8)
14989 .k(k)
14990 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
14991 }
14992 }
14993
14994 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
14995 TEST_REQUIRES_ARM_NEON_FMA;
14996 for (size_t k = 5; k < 8; k++) {
14997 GemmMicrokernelTester()
14998 .mr(6)
14999 .nr(8)
15000 .kr(1)
15001 .sr(1)
15002 .m(6)
15003 .n(8)
15004 .k(k)
15005 .a_stride(11)
15006 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15007 }
15008 }
15009
15010 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
15011 TEST_REQUIRES_ARM_NEON_FMA;
15012 for (size_t k = 5; k < 8; k++) {
15013 for (uint32_t m = 1; m <= 6; m++) {
15014 for (uint32_t n = 1; n <= 8; n++) {
15015 GemmMicrokernelTester()
15016 .mr(6)
15017 .nr(8)
15018 .kr(1)
15019 .sr(1)
15020 .m(m)
15021 .n(n)
15022 .k(k)
15023 .iterations(1)
15024 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15025 }
15026 }
15027 }
15028 }
15029
15030 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_div_4) {
15031 TEST_REQUIRES_ARM_NEON_FMA;
15032 for (size_t k = 8; k <= 40; k += 4) {
15033 GemmMicrokernelTester()
15034 .mr(6)
15035 .nr(8)
15036 .kr(1)
15037 .sr(1)
15038 .m(6)
15039 .n(8)
15040 .k(k)
15041 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15042 }
15043 }
15044
15045 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
15046 TEST_REQUIRES_ARM_NEON_FMA;
15047 for (size_t k = 8; k <= 40; k += 4) {
15048 GemmMicrokernelTester()
15049 .mr(6)
15050 .nr(8)
15051 .kr(1)
15052 .sr(1)
15053 .m(6)
15054 .n(8)
15055 .k(k)
15056 .a_stride(43)
15057 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15058 }
15059 }
15060
15061 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
15062 TEST_REQUIRES_ARM_NEON_FMA;
15063 for (size_t k = 8; k <= 40; k += 4) {
15064 for (uint32_t m = 1; m <= 6; m++) {
15065 for (uint32_t n = 1; n <= 8; n++) {
15066 GemmMicrokernelTester()
15067 .mr(6)
15068 .nr(8)
15069 .kr(1)
15070 .sr(1)
15071 .m(m)
15072 .n(n)
15073 .k(k)
15074 .iterations(1)
15075 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15076 }
15077 }
15078 }
15079 }
15080
15081 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8) {
15082 TEST_REQUIRES_ARM_NEON_FMA;
15083 for (uint32_t n = 9; n < 16; n++) {
15084 for (size_t k = 1; k <= 20; k += 5) {
15085 GemmMicrokernelTester()
15086 .mr(6)
15087 .nr(8)
15088 .kr(1)
15089 .sr(1)
15090 .m(6)
15091 .n(8)
15092 .k(k)
15093 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15094 }
15095 }
15096 }
15097
15098 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
15099 TEST_REQUIRES_ARM_NEON_FMA;
15100 for (uint32_t n = 9; n < 16; n++) {
15101 for (size_t k = 1; k <= 20; k += 5) {
15102 GemmMicrokernelTester()
15103 .mr(6)
15104 .nr(8)
15105 .kr(1)
15106 .sr(1)
15107 .m(6)
15108 .n(8)
15109 .k(k)
15110 .cn_stride(11)
15111 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15112 }
15113 }
15114 }
15115
15116 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
15117 TEST_REQUIRES_ARM_NEON_FMA;
15118 for (uint32_t n = 9; n < 16; n++) {
15119 for (size_t k = 1; k <= 20; k += 5) {
15120 GemmMicrokernelTester()
15121 .mr(6)
15122 .nr(8)
15123 .kr(1)
15124 .sr(1)
15125 .m(6)
15126 .n(n)
15127 .k(k)
15128 .a_stride(23)
15129 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15130 }
15131 }
15132 }
15133
15134 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
15135 TEST_REQUIRES_ARM_NEON_FMA;
15136 for (uint32_t n = 9; n < 16; n++) {
15137 for (size_t k = 1; k <= 20; k += 5) {
15138 for (uint32_t m = 1; m <= 6; m++) {
15139 GemmMicrokernelTester()
15140 .mr(6)
15141 .nr(8)
15142 .kr(1)
15143 .sr(1)
15144 .m(m)
15145 .n(n)
15146 .k(k)
15147 .iterations(1)
15148 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15149 }
15150 }
15151 }
15152 }
15153
15154 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8) {
15155 TEST_REQUIRES_ARM_NEON_FMA;
15156 for (uint32_t n = 16; n <= 24; n += 8) {
15157 for (size_t k = 1; k <= 20; k += 5) {
15158 GemmMicrokernelTester()
15159 .mr(6)
15160 .nr(8)
15161 .kr(1)
15162 .sr(1)
15163 .m(6)
15164 .n(8)
15165 .k(k)
15166 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15167 }
15168 }
15169 }
15170
15171 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
15172 TEST_REQUIRES_ARM_NEON_FMA;
15173 for (uint32_t n = 16; n <= 24; n += 8) {
15174 for (size_t k = 1; k <= 20; k += 5) {
15175 GemmMicrokernelTester()
15176 .mr(6)
15177 .nr(8)
15178 .kr(1)
15179 .sr(1)
15180 .m(6)
15181 .n(n)
15182 .k(k)
15183 .cn_stride(11)
15184 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15185 }
15186 }
15187 }
15188
15189 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
15190 TEST_REQUIRES_ARM_NEON_FMA;
15191 for (uint32_t n = 16; n <= 24; n += 8) {
15192 for (size_t k = 1; k <= 20; k += 5) {
15193 GemmMicrokernelTester()
15194 .mr(6)
15195 .nr(8)
15196 .kr(1)
15197 .sr(1)
15198 .m(6)
15199 .n(n)
15200 .k(k)
15201 .a_stride(23)
15202 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15203 }
15204 }
15205 }
15206
15207 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
15208 TEST_REQUIRES_ARM_NEON_FMA;
15209 for (uint32_t n = 16; n <= 24; n += 8) {
15210 for (size_t k = 1; k <= 20; k += 5) {
15211 for (uint32_t m = 1; m <= 6; m++) {
15212 GemmMicrokernelTester()
15213 .mr(6)
15214 .nr(8)
15215 .kr(1)
15216 .sr(1)
15217 .m(m)
15218 .n(n)
15219 .k(k)
15220 .iterations(1)
15221 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15222 }
15223 }
15224 }
15225 }
15226
15227 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
15228 TEST_REQUIRES_ARM_NEON_FMA;
15229 for (size_t k = 1; k <= 20; k += 5) {
15230 for (uint32_t m = 1; m <= 6; m++) {
15231 for (uint32_t n = 1; n <= 8; n++) {
15232 GemmMicrokernelTester()
15233 .mr(6)
15234 .nr(8)
15235 .kr(1)
15236 .sr(1)
15237 .m(m)
15238 .n(n)
15239 .k(k)
15240 .cm_stride(11)
15241 .iterations(1)
15242 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15243 }
15244 }
15245 }
15246 }
15247
15248 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, qmin) {
15249 TEST_REQUIRES_ARM_NEON_FMA;
15250 GemmMicrokernelTester()
15251 .mr(6)
15252 .nr(8)
15253 .kr(1)
15254 .sr(1)
15255 .m(6)
15256 .n(8)
15257 .k(4)
15258 .qmin(128)
15259 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15260 }
15261
15262 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, qmax) {
15263 TEST_REQUIRES_ARM_NEON_FMA;
15264 GemmMicrokernelTester()
15265 .mr(6)
15266 .nr(8)
15267 .kr(1)
15268 .sr(1)
15269 .m(6)
15270 .n(8)
15271 .k(4)
15272 .qmax(128)
15273 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15274 }
15275
15276 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, strided_cm) {
15277 TEST_REQUIRES_ARM_NEON_FMA;
15278 GemmMicrokernelTester()
15279 .mr(6)
15280 .nr(8)
15281 .kr(1)
15282 .sr(1)
15283 .m(6)
15284 .n(8)
15285 .k(4)
15286 .cm_stride(11)
15287 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
15288 }
15289#endif // XNN_ARCH_ARM64
15290
15291
15292#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15293 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2) {
15294 TEST_REQUIRES_ARM_NEON;
15295 GemmMicrokernelTester()
15296 .mr(1)
15297 .nr(8)
15298 .kr(1)
15299 .sr(1)
15300 .m(1)
15301 .n(8)
15302 .k(2)
15303 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15304 }
15305
15306 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, strided_cn) {
15307 TEST_REQUIRES_ARM_NEON;
15308 GemmMicrokernelTester()
15309 .mr(1)
15310 .nr(8)
15311 .kr(1)
15312 .sr(1)
15313 .m(1)
15314 .n(8)
15315 .k(2)
15316 .cn_stride(11)
15317 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15318 }
15319
15320 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_strided_a) {
15321 TEST_REQUIRES_ARM_NEON;
15322 GemmMicrokernelTester()
15323 .mr(1)
15324 .nr(8)
15325 .kr(1)
15326 .sr(1)
15327 .m(1)
15328 .n(8)
15329 .k(2)
15330 .a_stride(5)
15331 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15332 }
15333
15334 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
15335 TEST_REQUIRES_ARM_NEON;
15336 for (uint32_t m = 1; m <= 1; m++) {
15337 for (uint32_t n = 1; n <= 8; n++) {
15338 GemmMicrokernelTester()
15339 .mr(1)
15340 .nr(8)
15341 .kr(1)
15342 .sr(1)
15343 .m(m)
15344 .n(n)
15345 .k(2)
15346 .iterations(1)
15347 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15348 }
15349 }
15350 }
15351
15352 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
15353 TEST_REQUIRES_ARM_NEON;
15354 for (uint32_t m = 1; m <= 1; m++) {
15355 GemmMicrokernelTester()
15356 .mr(1)
15357 .nr(8)
15358 .kr(1)
15359 .sr(1)
15360 .m(m)
15361 .n(8)
15362 .k(2)
15363 .iterations(1)
15364 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15365 }
15366 }
15367
15368 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
15369 TEST_REQUIRES_ARM_NEON;
15370 for (uint32_t n = 1; n <= 8; n++) {
15371 GemmMicrokernelTester()
15372 .mr(1)
15373 .nr(8)
15374 .kr(1)
15375 .sr(1)
15376 .m(1)
15377 .n(n)
15378 .k(2)
15379 .iterations(1)
15380 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15381 }
15382 }
15383
15384 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_lt_2) {
15385 TEST_REQUIRES_ARM_NEON;
15386 for (size_t k = 1; k < 2; k++) {
15387 GemmMicrokernelTester()
15388 .mr(1)
15389 .nr(8)
15390 .kr(1)
15391 .sr(1)
15392 .m(1)
15393 .n(8)
15394 .k(k)
15395 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15396 }
15397 }
15398
15399 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_lt_2_strided_a) {
15400 TEST_REQUIRES_ARM_NEON;
15401 for (size_t k = 1; k < 2; k++) {
15402 GemmMicrokernelTester()
15403 .mr(1)
15404 .nr(8)
15405 .kr(1)
15406 .sr(1)
15407 .m(1)
15408 .n(8)
15409 .k(k)
15410 .a_stride(5)
15411 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15412 }
15413 }
15414
15415 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
15416 TEST_REQUIRES_ARM_NEON;
15417 for (size_t k = 1; k < 2; k++) {
15418 for (uint32_t m = 1; m <= 1; m++) {
15419 for (uint32_t n = 1; n <= 8; n++) {
15420 GemmMicrokernelTester()
15421 .mr(1)
15422 .nr(8)
15423 .kr(1)
15424 .sr(1)
15425 .m(m)
15426 .n(n)
15427 .k(k)
15428 .iterations(1)
15429 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15430 }
15431 }
15432 }
15433 }
15434
15435 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_gt_2) {
15436 TEST_REQUIRES_ARM_NEON;
15437 for (size_t k = 3; k < 4; k++) {
15438 GemmMicrokernelTester()
15439 .mr(1)
15440 .nr(8)
15441 .kr(1)
15442 .sr(1)
15443 .m(1)
15444 .n(8)
15445 .k(k)
15446 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15447 }
15448 }
15449
15450 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_gt_2_strided_a) {
15451 TEST_REQUIRES_ARM_NEON;
15452 for (size_t k = 3; k < 4; k++) {
15453 GemmMicrokernelTester()
15454 .mr(1)
15455 .nr(8)
15456 .kr(1)
15457 .sr(1)
15458 .m(1)
15459 .n(8)
15460 .k(k)
15461 .a_stride(7)
15462 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15463 }
15464 }
15465
15466 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
15467 TEST_REQUIRES_ARM_NEON;
15468 for (size_t k = 3; k < 4; k++) {
15469 for (uint32_t m = 1; m <= 1; m++) {
15470 for (uint32_t n = 1; n <= 8; n++) {
15471 GemmMicrokernelTester()
15472 .mr(1)
15473 .nr(8)
15474 .kr(1)
15475 .sr(1)
15476 .m(m)
15477 .n(n)
15478 .k(k)
15479 .iterations(1)
15480 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15481 }
15482 }
15483 }
15484 }
15485
15486 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_div_2) {
15487 TEST_REQUIRES_ARM_NEON;
15488 for (size_t k = 4; k <= 20; k += 2) {
15489 GemmMicrokernelTester()
15490 .mr(1)
15491 .nr(8)
15492 .kr(1)
15493 .sr(1)
15494 .m(1)
15495 .n(8)
15496 .k(k)
15497 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15498 }
15499 }
15500
15501 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_div_2_strided_a) {
15502 TEST_REQUIRES_ARM_NEON;
15503 for (size_t k = 4; k <= 20; k += 2) {
15504 GemmMicrokernelTester()
15505 .mr(1)
15506 .nr(8)
15507 .kr(1)
15508 .sr(1)
15509 .m(1)
15510 .n(8)
15511 .k(k)
15512 .a_stride(23)
15513 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15514 }
15515 }
15516
15517 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_div_2_subtile) {
15518 TEST_REQUIRES_ARM_NEON;
15519 for (size_t k = 4; k <= 20; k += 2) {
15520 for (uint32_t m = 1; m <= 1; m++) {
15521 for (uint32_t n = 1; n <= 8; n++) {
15522 GemmMicrokernelTester()
15523 .mr(1)
15524 .nr(8)
15525 .kr(1)
15526 .sr(1)
15527 .m(m)
15528 .n(n)
15529 .k(k)
15530 .iterations(1)
15531 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15532 }
15533 }
15534 }
15535 }
15536
15537 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8) {
15538 TEST_REQUIRES_ARM_NEON;
15539 for (uint32_t n = 9; n < 16; n++) {
15540 for (size_t k = 1; k <= 10; k += 3) {
15541 GemmMicrokernelTester()
15542 .mr(1)
15543 .nr(8)
15544 .kr(1)
15545 .sr(1)
15546 .m(1)
15547 .n(8)
15548 .k(k)
15549 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15550 }
15551 }
15552 }
15553
15554 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
15555 TEST_REQUIRES_ARM_NEON;
15556 for (uint32_t n = 9; n < 16; n++) {
15557 for (size_t k = 1; k <= 10; k += 3) {
15558 GemmMicrokernelTester()
15559 .mr(1)
15560 .nr(8)
15561 .kr(1)
15562 .sr(1)
15563 .m(1)
15564 .n(8)
15565 .k(k)
15566 .cn_stride(11)
15567 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15568 }
15569 }
15570 }
15571
15572 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8_strided_a) {
15573 TEST_REQUIRES_ARM_NEON;
15574 for (uint32_t n = 9; n < 16; n++) {
15575 for (size_t k = 1; k <= 10; k += 3) {
15576 GemmMicrokernelTester()
15577 .mr(1)
15578 .nr(8)
15579 .kr(1)
15580 .sr(1)
15581 .m(1)
15582 .n(n)
15583 .k(k)
15584 .a_stride(13)
15585 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15586 }
15587 }
15588 }
15589
15590 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
15591 TEST_REQUIRES_ARM_NEON;
15592 for (uint32_t n = 9; n < 16; n++) {
15593 for (size_t k = 1; k <= 10; k += 3) {
15594 for (uint32_t m = 1; m <= 1; m++) {
15595 GemmMicrokernelTester()
15596 .mr(1)
15597 .nr(8)
15598 .kr(1)
15599 .sr(1)
15600 .m(m)
15601 .n(n)
15602 .k(k)
15603 .iterations(1)
15604 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15605 }
15606 }
15607 }
15608 }
15609
15610 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8) {
15611 TEST_REQUIRES_ARM_NEON;
15612 for (uint32_t n = 16; n <= 24; n += 8) {
15613 for (size_t k = 1; k <= 10; k += 3) {
15614 GemmMicrokernelTester()
15615 .mr(1)
15616 .nr(8)
15617 .kr(1)
15618 .sr(1)
15619 .m(1)
15620 .n(8)
15621 .k(k)
15622 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15623 }
15624 }
15625 }
15626
15627 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
15628 TEST_REQUIRES_ARM_NEON;
15629 for (uint32_t n = 16; n <= 24; n += 8) {
15630 for (size_t k = 1; k <= 10; k += 3) {
15631 GemmMicrokernelTester()
15632 .mr(1)
15633 .nr(8)
15634 .kr(1)
15635 .sr(1)
15636 .m(1)
15637 .n(n)
15638 .k(k)
15639 .cn_stride(11)
15640 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15641 }
15642 }
15643 }
15644
15645 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8_strided_a) {
15646 TEST_REQUIRES_ARM_NEON;
15647 for (uint32_t n = 16; n <= 24; n += 8) {
15648 for (size_t k = 1; k <= 10; k += 3) {
15649 GemmMicrokernelTester()
15650 .mr(1)
15651 .nr(8)
15652 .kr(1)
15653 .sr(1)
15654 .m(1)
15655 .n(n)
15656 .k(k)
15657 .a_stride(13)
15658 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15659 }
15660 }
15661 }
15662
15663 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8_subtile) {
15664 TEST_REQUIRES_ARM_NEON;
15665 for (uint32_t n = 16; n <= 24; n += 8) {
15666 for (size_t k = 1; k <= 10; k += 3) {
15667 for (uint32_t m = 1; m <= 1; m++) {
15668 GemmMicrokernelTester()
15669 .mr(1)
15670 .nr(8)
15671 .kr(1)
15672 .sr(1)
15673 .m(m)
15674 .n(n)
15675 .k(k)
15676 .iterations(1)
15677 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15678 }
15679 }
15680 }
15681 }
15682
15683 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, strided_cm_subtile) {
15684 TEST_REQUIRES_ARM_NEON;
15685 for (size_t k = 1; k <= 10; k += 3) {
15686 for (uint32_t m = 1; m <= 1; m++) {
15687 for (uint32_t n = 1; n <= 8; n++) {
15688 GemmMicrokernelTester()
15689 .mr(1)
15690 .nr(8)
15691 .kr(1)
15692 .sr(1)
15693 .m(m)
15694 .n(n)
15695 .k(k)
15696 .cm_stride(11)
15697 .iterations(1)
15698 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15699 }
15700 }
15701 }
15702 }
15703
15704 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, qmin) {
15705 TEST_REQUIRES_ARM_NEON;
15706 GemmMicrokernelTester()
15707 .mr(1)
15708 .nr(8)
15709 .kr(1)
15710 .sr(1)
15711 .m(1)
15712 .n(8)
15713 .k(2)
15714 .qmin(128)
15715 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15716 }
15717
15718 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, qmax) {
15719 TEST_REQUIRES_ARM_NEON;
15720 GemmMicrokernelTester()
15721 .mr(1)
15722 .nr(8)
15723 .kr(1)
15724 .sr(1)
15725 .m(1)
15726 .n(8)
15727 .k(2)
15728 .qmax(128)
15729 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15730 }
15731
15732 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, strided_cm) {
15733 TEST_REQUIRES_ARM_NEON;
15734 GemmMicrokernelTester()
15735 .mr(1)
15736 .nr(8)
15737 .kr(1)
15738 .sr(1)
15739 .m(1)
15740 .n(8)
15741 .k(2)
15742 .cm_stride(11)
15743 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
15744 }
15745#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15746
15747
15748#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15749 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2) {
15750 TEST_REQUIRES_ARM_NEON;
15751 GemmMicrokernelTester()
15752 .mr(4)
15753 .nr(8)
15754 .kr(1)
15755 .sr(1)
15756 .m(4)
15757 .n(8)
15758 .k(2)
15759 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15760 }
15761
15762 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, strided_cn) {
15763 TEST_REQUIRES_ARM_NEON;
15764 GemmMicrokernelTester()
15765 .mr(4)
15766 .nr(8)
15767 .kr(1)
15768 .sr(1)
15769 .m(4)
15770 .n(8)
15771 .k(2)
15772 .cn_stride(11)
15773 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15774 }
15775
15776 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_strided_a) {
15777 TEST_REQUIRES_ARM_NEON;
15778 GemmMicrokernelTester()
15779 .mr(4)
15780 .nr(8)
15781 .kr(1)
15782 .sr(1)
15783 .m(4)
15784 .n(8)
15785 .k(2)
15786 .a_stride(5)
15787 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15788 }
15789
15790 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_subtile) {
15791 TEST_REQUIRES_ARM_NEON;
15792 for (uint32_t m = 1; m <= 4; m++) {
15793 for (uint32_t n = 1; n <= 8; n++) {
15794 GemmMicrokernelTester()
15795 .mr(4)
15796 .nr(8)
15797 .kr(1)
15798 .sr(1)
15799 .m(m)
15800 .n(n)
15801 .k(2)
15802 .iterations(1)
15803 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15804 }
15805 }
15806 }
15807
15808 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
15809 TEST_REQUIRES_ARM_NEON;
15810 for (uint32_t m = 1; m <= 4; m++) {
15811 GemmMicrokernelTester()
15812 .mr(4)
15813 .nr(8)
15814 .kr(1)
15815 .sr(1)
15816 .m(m)
15817 .n(8)
15818 .k(2)
15819 .iterations(1)
15820 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15821 }
15822 }
15823
15824 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
15825 TEST_REQUIRES_ARM_NEON;
15826 for (uint32_t n = 1; n <= 8; n++) {
15827 GemmMicrokernelTester()
15828 .mr(4)
15829 .nr(8)
15830 .kr(1)
15831 .sr(1)
15832 .m(4)
15833 .n(n)
15834 .k(2)
15835 .iterations(1)
15836 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15837 }
15838 }
15839
15840 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_lt_2) {
15841 TEST_REQUIRES_ARM_NEON;
15842 for (size_t k = 1; k < 2; k++) {
15843 GemmMicrokernelTester()
15844 .mr(4)
15845 .nr(8)
15846 .kr(1)
15847 .sr(1)
15848 .m(4)
15849 .n(8)
15850 .k(k)
15851 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15852 }
15853 }
15854
15855 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_lt_2_strided_a) {
15856 TEST_REQUIRES_ARM_NEON;
15857 for (size_t k = 1; k < 2; k++) {
15858 GemmMicrokernelTester()
15859 .mr(4)
15860 .nr(8)
15861 .kr(1)
15862 .sr(1)
15863 .m(4)
15864 .n(8)
15865 .k(k)
15866 .a_stride(5)
15867 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15868 }
15869 }
15870
15871 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_lt_2_subtile) {
15872 TEST_REQUIRES_ARM_NEON;
15873 for (size_t k = 1; k < 2; k++) {
15874 for (uint32_t m = 1; m <= 4; m++) {
15875 for (uint32_t n = 1; n <= 8; n++) {
15876 GemmMicrokernelTester()
15877 .mr(4)
15878 .nr(8)
15879 .kr(1)
15880 .sr(1)
15881 .m(m)
15882 .n(n)
15883 .k(k)
15884 .iterations(1)
15885 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15886 }
15887 }
15888 }
15889 }
15890
15891 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_gt_2) {
15892 TEST_REQUIRES_ARM_NEON;
15893 for (size_t k = 3; k < 4; k++) {
15894 GemmMicrokernelTester()
15895 .mr(4)
15896 .nr(8)
15897 .kr(1)
15898 .sr(1)
15899 .m(4)
15900 .n(8)
15901 .k(k)
15902 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15903 }
15904 }
15905
15906 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_gt_2_strided_a) {
15907 TEST_REQUIRES_ARM_NEON;
15908 for (size_t k = 3; k < 4; k++) {
15909 GemmMicrokernelTester()
15910 .mr(4)
15911 .nr(8)
15912 .kr(1)
15913 .sr(1)
15914 .m(4)
15915 .n(8)
15916 .k(k)
15917 .a_stride(7)
15918 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15919 }
15920 }
15921
15922 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_gt_2_subtile) {
15923 TEST_REQUIRES_ARM_NEON;
15924 for (size_t k = 3; k < 4; k++) {
15925 for (uint32_t m = 1; m <= 4; m++) {
15926 for (uint32_t n = 1; n <= 8; n++) {
15927 GemmMicrokernelTester()
15928 .mr(4)
15929 .nr(8)
15930 .kr(1)
15931 .sr(1)
15932 .m(m)
15933 .n(n)
15934 .k(k)
15935 .iterations(1)
15936 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15937 }
15938 }
15939 }
15940 }
15941
15942 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_div_2) {
15943 TEST_REQUIRES_ARM_NEON;
15944 for (size_t k = 4; k <= 20; k += 2) {
15945 GemmMicrokernelTester()
15946 .mr(4)
15947 .nr(8)
15948 .kr(1)
15949 .sr(1)
15950 .m(4)
15951 .n(8)
15952 .k(k)
15953 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15954 }
15955 }
15956
15957 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_div_2_strided_a) {
15958 TEST_REQUIRES_ARM_NEON;
15959 for (size_t k = 4; k <= 20; k += 2) {
15960 GemmMicrokernelTester()
15961 .mr(4)
15962 .nr(8)
15963 .kr(1)
15964 .sr(1)
15965 .m(4)
15966 .n(8)
15967 .k(k)
15968 .a_stride(23)
15969 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15970 }
15971 }
15972
15973 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_div_2_subtile) {
15974 TEST_REQUIRES_ARM_NEON;
15975 for (size_t k = 4; k <= 20; k += 2) {
15976 for (uint32_t m = 1; m <= 4; m++) {
15977 for (uint32_t n = 1; n <= 8; n++) {
15978 GemmMicrokernelTester()
15979 .mr(4)
15980 .nr(8)
15981 .kr(1)
15982 .sr(1)
15983 .m(m)
15984 .n(n)
15985 .k(k)
15986 .iterations(1)
15987 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
15988 }
15989 }
15990 }
15991 }
15992
15993 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8) {
15994 TEST_REQUIRES_ARM_NEON;
15995 for (uint32_t n = 9; n < 16; n++) {
15996 for (size_t k = 1; k <= 10; k += 3) {
15997 GemmMicrokernelTester()
15998 .mr(4)
15999 .nr(8)
16000 .kr(1)
16001 .sr(1)
16002 .m(4)
16003 .n(8)
16004 .k(k)
16005 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16006 }
16007 }
16008 }
16009
16010 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
16011 TEST_REQUIRES_ARM_NEON;
16012 for (uint32_t n = 9; n < 16; n++) {
16013 for (size_t k = 1; k <= 10; k += 3) {
16014 GemmMicrokernelTester()
16015 .mr(4)
16016 .nr(8)
16017 .kr(1)
16018 .sr(1)
16019 .m(4)
16020 .n(8)
16021 .k(k)
16022 .cn_stride(11)
16023 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16024 }
16025 }
16026 }
16027
16028 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8_strided_a) {
16029 TEST_REQUIRES_ARM_NEON;
16030 for (uint32_t n = 9; n < 16; n++) {
16031 for (size_t k = 1; k <= 10; k += 3) {
16032 GemmMicrokernelTester()
16033 .mr(4)
16034 .nr(8)
16035 .kr(1)
16036 .sr(1)
16037 .m(4)
16038 .n(n)
16039 .k(k)
16040 .a_stride(13)
16041 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16042 }
16043 }
16044 }
16045
16046 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8_subtile) {
16047 TEST_REQUIRES_ARM_NEON;
16048 for (uint32_t n = 9; n < 16; n++) {
16049 for (size_t k = 1; k <= 10; k += 3) {
16050 for (uint32_t m = 1; m <= 4; m++) {
16051 GemmMicrokernelTester()
16052 .mr(4)
16053 .nr(8)
16054 .kr(1)
16055 .sr(1)
16056 .m(m)
16057 .n(n)
16058 .k(k)
16059 .iterations(1)
16060 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16061 }
16062 }
16063 }
16064 }
16065
16066 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8) {
16067 TEST_REQUIRES_ARM_NEON;
16068 for (uint32_t n = 16; n <= 24; n += 8) {
16069 for (size_t k = 1; k <= 10; k += 3) {
16070 GemmMicrokernelTester()
16071 .mr(4)
16072 .nr(8)
16073 .kr(1)
16074 .sr(1)
16075 .m(4)
16076 .n(8)
16077 .k(k)
16078 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16079 }
16080 }
16081 }
16082
16083 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8_strided_cn) {
16084 TEST_REQUIRES_ARM_NEON;
16085 for (uint32_t n = 16; n <= 24; n += 8) {
16086 for (size_t k = 1; k <= 10; k += 3) {
16087 GemmMicrokernelTester()
16088 .mr(4)
16089 .nr(8)
16090 .kr(1)
16091 .sr(1)
16092 .m(4)
16093 .n(n)
16094 .k(k)
16095 .cn_stride(11)
16096 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16097 }
16098 }
16099 }
16100
16101 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8_strided_a) {
16102 TEST_REQUIRES_ARM_NEON;
16103 for (uint32_t n = 16; n <= 24; n += 8) {
16104 for (size_t k = 1; k <= 10; k += 3) {
16105 GemmMicrokernelTester()
16106 .mr(4)
16107 .nr(8)
16108 .kr(1)
16109 .sr(1)
16110 .m(4)
16111 .n(n)
16112 .k(k)
16113 .a_stride(13)
16114 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16115 }
16116 }
16117 }
16118
16119 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8_subtile) {
16120 TEST_REQUIRES_ARM_NEON;
16121 for (uint32_t n = 16; n <= 24; n += 8) {
16122 for (size_t k = 1; k <= 10; k += 3) {
16123 for (uint32_t m = 1; m <= 4; m++) {
16124 GemmMicrokernelTester()
16125 .mr(4)
16126 .nr(8)
16127 .kr(1)
16128 .sr(1)
16129 .m(m)
16130 .n(n)
16131 .k(k)
16132 .iterations(1)
16133 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16134 }
16135 }
16136 }
16137 }
16138
16139 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, strided_cm_subtile) {
16140 TEST_REQUIRES_ARM_NEON;
16141 for (size_t k = 1; k <= 10; k += 3) {
16142 for (uint32_t m = 1; m <= 4; m++) {
16143 for (uint32_t n = 1; n <= 8; n++) {
16144 GemmMicrokernelTester()
16145 .mr(4)
16146 .nr(8)
16147 .kr(1)
16148 .sr(1)
16149 .m(m)
16150 .n(n)
16151 .k(k)
16152 .cm_stride(11)
16153 .iterations(1)
16154 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16155 }
16156 }
16157 }
16158 }
16159
16160 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, qmin) {
16161 TEST_REQUIRES_ARM_NEON;
16162 GemmMicrokernelTester()
16163 .mr(4)
16164 .nr(8)
16165 .kr(1)
16166 .sr(1)
16167 .m(4)
16168 .n(8)
16169 .k(2)
16170 .qmin(128)
16171 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16172 }
16173
16174 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, qmax) {
16175 TEST_REQUIRES_ARM_NEON;
16176 GemmMicrokernelTester()
16177 .mr(4)
16178 .nr(8)
16179 .kr(1)
16180 .sr(1)
16181 .m(4)
16182 .n(8)
16183 .k(2)
16184 .qmax(128)
16185 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16186 }
16187
16188 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, strided_cm) {
16189 TEST_REQUIRES_ARM_NEON;
16190 GemmMicrokernelTester()
16191 .mr(4)
16192 .nr(8)
16193 .kr(1)
16194 .sr(1)
16195 .m(4)
16196 .n(8)
16197 .k(2)
16198 .cm_stride(11)
16199 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
16200 }
16201#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16202
16203
16204#if XNN_ARCH_ARM || XNN_ARCH_ARM64
16205 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4) {
16206 TEST_REQUIRES_ARM_NEON;
16207 GemmMicrokernelTester()
16208 .mr(4)
16209 .nr(8)
16210 .kr(1)
16211 .sr(1)
16212 .m(4)
16213 .n(8)
16214 .k(4)
16215 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16216 }
16217
16218 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, strided_cn) {
16219 TEST_REQUIRES_ARM_NEON;
16220 GemmMicrokernelTester()
16221 .mr(4)
16222 .nr(8)
16223 .kr(1)
16224 .sr(1)
16225 .m(4)
16226 .n(8)
16227 .k(4)
16228 .cn_stride(11)
16229 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16230 }
16231
16232 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_strided_a) {
16233 TEST_REQUIRES_ARM_NEON;
16234 GemmMicrokernelTester()
16235 .mr(4)
16236 .nr(8)
16237 .kr(1)
16238 .sr(1)
16239 .m(4)
16240 .n(8)
16241 .k(4)
16242 .a_stride(7)
16243 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16244 }
16245
16246 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
16247 TEST_REQUIRES_ARM_NEON;
16248 for (uint32_t m = 1; m <= 4; m++) {
16249 for (uint32_t n = 1; n <= 8; n++) {
16250 GemmMicrokernelTester()
16251 .mr(4)
16252 .nr(8)
16253 .kr(1)
16254 .sr(1)
16255 .m(m)
16256 .n(n)
16257 .k(4)
16258 .iterations(1)
16259 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16260 }
16261 }
16262 }
16263
16264 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
16265 TEST_REQUIRES_ARM_NEON;
16266 for (uint32_t m = 1; m <= 4; m++) {
16267 GemmMicrokernelTester()
16268 .mr(4)
16269 .nr(8)
16270 .kr(1)
16271 .sr(1)
16272 .m(m)
16273 .n(8)
16274 .k(4)
16275 .iterations(1)
16276 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16277 }
16278 }
16279
16280 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
16281 TEST_REQUIRES_ARM_NEON;
16282 for (uint32_t n = 1; n <= 8; n++) {
16283 GemmMicrokernelTester()
16284 .mr(4)
16285 .nr(8)
16286 .kr(1)
16287 .sr(1)
16288 .m(4)
16289 .n(n)
16290 .k(4)
16291 .iterations(1)
16292 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16293 }
16294 }
16295
16296 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_lt_4) {
16297 TEST_REQUIRES_ARM_NEON;
16298 for (size_t k = 1; k < 4; k++) {
16299 GemmMicrokernelTester()
16300 .mr(4)
16301 .nr(8)
16302 .kr(1)
16303 .sr(1)
16304 .m(4)
16305 .n(8)
16306 .k(k)
16307 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16308 }
16309 }
16310
16311 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_lt_4_strided_a) {
16312 TEST_REQUIRES_ARM_NEON;
16313 for (size_t k = 1; k < 4; k++) {
16314 GemmMicrokernelTester()
16315 .mr(4)
16316 .nr(8)
16317 .kr(1)
16318 .sr(1)
16319 .m(4)
16320 .n(8)
16321 .k(k)
16322 .a_stride(7)
16323 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16324 }
16325 }
16326
16327 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
16328 TEST_REQUIRES_ARM_NEON;
16329 for (size_t k = 1; k < 4; k++) {
16330 for (uint32_t m = 1; m <= 4; m++) {
16331 for (uint32_t n = 1; n <= 8; n++) {
16332 GemmMicrokernelTester()
16333 .mr(4)
16334 .nr(8)
16335 .kr(1)
16336 .sr(1)
16337 .m(m)
16338 .n(n)
16339 .k(k)
16340 .iterations(1)
16341 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16342 }
16343 }
16344 }
16345 }
16346
16347 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_gt_4) {
16348 TEST_REQUIRES_ARM_NEON;
16349 for (size_t k = 5; k < 8; k++) {
16350 GemmMicrokernelTester()
16351 .mr(4)
16352 .nr(8)
16353 .kr(1)
16354 .sr(1)
16355 .m(4)
16356 .n(8)
16357 .k(k)
16358 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16359 }
16360 }
16361
16362 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_gt_4_strided_a) {
16363 TEST_REQUIRES_ARM_NEON;
16364 for (size_t k = 5; k < 8; k++) {
16365 GemmMicrokernelTester()
16366 .mr(4)
16367 .nr(8)
16368 .kr(1)
16369 .sr(1)
16370 .m(4)
16371 .n(8)
16372 .k(k)
16373 .a_stride(11)
16374 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16375 }
16376 }
16377
16378 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
16379 TEST_REQUIRES_ARM_NEON;
16380 for (size_t k = 5; k < 8; k++) {
16381 for (uint32_t m = 1; m <= 4; m++) {
16382 for (uint32_t n = 1; n <= 8; n++) {
16383 GemmMicrokernelTester()
16384 .mr(4)
16385 .nr(8)
16386 .kr(1)
16387 .sr(1)
16388 .m(m)
16389 .n(n)
16390 .k(k)
16391 .iterations(1)
16392 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16393 }
16394 }
16395 }
16396 }
16397
16398 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_div_4) {
16399 TEST_REQUIRES_ARM_NEON;
16400 for (size_t k = 8; k <= 40; k += 4) {
16401 GemmMicrokernelTester()
16402 .mr(4)
16403 .nr(8)
16404 .kr(1)
16405 .sr(1)
16406 .m(4)
16407 .n(8)
16408 .k(k)
16409 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16410 }
16411 }
16412
16413 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_div_4_strided_a) {
16414 TEST_REQUIRES_ARM_NEON;
16415 for (size_t k = 8; k <= 40; k += 4) {
16416 GemmMicrokernelTester()
16417 .mr(4)
16418 .nr(8)
16419 .kr(1)
16420 .sr(1)
16421 .m(4)
16422 .n(8)
16423 .k(k)
16424 .a_stride(43)
16425 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16426 }
16427 }
16428
16429 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_div_4_subtile) {
16430 TEST_REQUIRES_ARM_NEON;
16431 for (size_t k = 8; k <= 40; k += 4) {
16432 for (uint32_t m = 1; m <= 4; m++) {
16433 for (uint32_t n = 1; n <= 8; n++) {
16434 GemmMicrokernelTester()
16435 .mr(4)
16436 .nr(8)
16437 .kr(1)
16438 .sr(1)
16439 .m(m)
16440 .n(n)
16441 .k(k)
16442 .iterations(1)
16443 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16444 }
16445 }
16446 }
16447 }
16448
16449 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8) {
16450 TEST_REQUIRES_ARM_NEON;
16451 for (uint32_t n = 9; n < 16; n++) {
16452 for (size_t k = 1; k <= 20; k += 5) {
16453 GemmMicrokernelTester()
16454 .mr(4)
16455 .nr(8)
16456 .kr(1)
16457 .sr(1)
16458 .m(4)
16459 .n(8)
16460 .k(k)
16461 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16462 }
16463 }
16464 }
16465
16466 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
16467 TEST_REQUIRES_ARM_NEON;
16468 for (uint32_t n = 9; n < 16; n++) {
16469 for (size_t k = 1; k <= 20; k += 5) {
16470 GemmMicrokernelTester()
16471 .mr(4)
16472 .nr(8)
16473 .kr(1)
16474 .sr(1)
16475 .m(4)
16476 .n(8)
16477 .k(k)
16478 .cn_stride(11)
16479 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16480 }
16481 }
16482 }
16483
16484 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8_strided_a) {
16485 TEST_REQUIRES_ARM_NEON;
16486 for (uint32_t n = 9; n < 16; n++) {
16487 for (size_t k = 1; k <= 20; k += 5) {
16488 GemmMicrokernelTester()
16489 .mr(4)
16490 .nr(8)
16491 .kr(1)
16492 .sr(1)
16493 .m(4)
16494 .n(n)
16495 .k(k)
16496 .a_stride(23)
16497 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16498 }
16499 }
16500 }
16501
16502 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
16503 TEST_REQUIRES_ARM_NEON;
16504 for (uint32_t n = 9; n < 16; n++) {
16505 for (size_t k = 1; k <= 20; k += 5) {
16506 for (uint32_t m = 1; m <= 4; m++) {
16507 GemmMicrokernelTester()
16508 .mr(4)
16509 .nr(8)
16510 .kr(1)
16511 .sr(1)
16512 .m(m)
16513 .n(n)
16514 .k(k)
16515 .iterations(1)
16516 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16517 }
16518 }
16519 }
16520 }
16521
16522 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8) {
16523 TEST_REQUIRES_ARM_NEON;
16524 for (uint32_t n = 16; n <= 24; n += 8) {
16525 for (size_t k = 1; k <= 20; k += 5) {
16526 GemmMicrokernelTester()
16527 .mr(4)
16528 .nr(8)
16529 .kr(1)
16530 .sr(1)
16531 .m(4)
16532 .n(8)
16533 .k(k)
16534 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16535 }
16536 }
16537 }
16538
16539 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
16540 TEST_REQUIRES_ARM_NEON;
16541 for (uint32_t n = 16; n <= 24; n += 8) {
16542 for (size_t k = 1; k <= 20; k += 5) {
16543 GemmMicrokernelTester()
16544 .mr(4)
16545 .nr(8)
16546 .kr(1)
16547 .sr(1)
16548 .m(4)
16549 .n(n)
16550 .k(k)
16551 .cn_stride(11)
16552 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16553 }
16554 }
16555 }
16556
16557 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8_strided_a) {
16558 TEST_REQUIRES_ARM_NEON;
16559 for (uint32_t n = 16; n <= 24; n += 8) {
16560 for (size_t k = 1; k <= 20; k += 5) {
16561 GemmMicrokernelTester()
16562 .mr(4)
16563 .nr(8)
16564 .kr(1)
16565 .sr(1)
16566 .m(4)
16567 .n(n)
16568 .k(k)
16569 .a_stride(23)
16570 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16571 }
16572 }
16573 }
16574
16575 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8_subtile) {
16576 TEST_REQUIRES_ARM_NEON;
16577 for (uint32_t n = 16; n <= 24; n += 8) {
16578 for (size_t k = 1; k <= 20; k += 5) {
16579 for (uint32_t m = 1; m <= 4; m++) {
16580 GemmMicrokernelTester()
16581 .mr(4)
16582 .nr(8)
16583 .kr(1)
16584 .sr(1)
16585 .m(m)
16586 .n(n)
16587 .k(k)
16588 .iterations(1)
16589 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16590 }
16591 }
16592 }
16593 }
16594
16595 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, strided_cm_subtile) {
16596 TEST_REQUIRES_ARM_NEON;
16597 for (size_t k = 1; k <= 20; k += 5) {
16598 for (uint32_t m = 1; m <= 4; m++) {
16599 for (uint32_t n = 1; n <= 8; n++) {
16600 GemmMicrokernelTester()
16601 .mr(4)
16602 .nr(8)
16603 .kr(1)
16604 .sr(1)
16605 .m(m)
16606 .n(n)
16607 .k(k)
16608 .cm_stride(11)
16609 .iterations(1)
16610 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16611 }
16612 }
16613 }
16614 }
16615
16616 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, qmin) {
16617 TEST_REQUIRES_ARM_NEON;
16618 GemmMicrokernelTester()
16619 .mr(4)
16620 .nr(8)
16621 .kr(1)
16622 .sr(1)
16623 .m(4)
16624 .n(8)
16625 .k(4)
16626 .qmin(128)
16627 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16628 }
16629
16630 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, qmax) {
16631 TEST_REQUIRES_ARM_NEON;
16632 GemmMicrokernelTester()
16633 .mr(4)
16634 .nr(8)
16635 .kr(1)
16636 .sr(1)
16637 .m(4)
16638 .n(8)
16639 .k(4)
16640 .qmax(128)
16641 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16642 }
16643
16644 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, strided_cm) {
16645 TEST_REQUIRES_ARM_NEON;
16646 GemmMicrokernelTester()
16647 .mr(4)
16648 .nr(8)
16649 .kr(1)
16650 .sr(1)
16651 .m(4)
16652 .n(8)
16653 .k(4)
16654 .cm_stride(11)
16655 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
16656 }
16657#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16658
16659
16660#if XNN_ARCH_ARM || XNN_ARCH_ARM64
16661 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2) {
16662 TEST_REQUIRES_ARM_NEON;
16663 GemmMicrokernelTester()
16664 .mr(6)
16665 .nr(8)
16666 .kr(1)
16667 .sr(1)
16668 .m(6)
16669 .n(8)
16670 .k(2)
16671 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16672 }
16673
16674 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, strided_cn) {
16675 TEST_REQUIRES_ARM_NEON;
16676 GemmMicrokernelTester()
16677 .mr(6)
16678 .nr(8)
16679 .kr(1)
16680 .sr(1)
16681 .m(6)
16682 .n(8)
16683 .k(2)
16684 .cn_stride(11)
16685 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16686 }
16687
16688 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_strided_a) {
16689 TEST_REQUIRES_ARM_NEON;
16690 GemmMicrokernelTester()
16691 .mr(6)
16692 .nr(8)
16693 .kr(1)
16694 .sr(1)
16695 .m(6)
16696 .n(8)
16697 .k(2)
16698 .a_stride(5)
16699 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16700 }
16701
16702 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
16703 TEST_REQUIRES_ARM_NEON;
16704 for (uint32_t m = 1; m <= 6; m++) {
16705 for (uint32_t n = 1; n <= 8; n++) {
16706 GemmMicrokernelTester()
16707 .mr(6)
16708 .nr(8)
16709 .kr(1)
16710 .sr(1)
16711 .m(m)
16712 .n(n)
16713 .k(2)
16714 .iterations(1)
16715 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16716 }
16717 }
16718 }
16719
16720 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
16721 TEST_REQUIRES_ARM_NEON;
16722 for (uint32_t m = 1; m <= 6; m++) {
16723 GemmMicrokernelTester()
16724 .mr(6)
16725 .nr(8)
16726 .kr(1)
16727 .sr(1)
16728 .m(m)
16729 .n(8)
16730 .k(2)
16731 .iterations(1)
16732 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16733 }
16734 }
16735
16736 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
16737 TEST_REQUIRES_ARM_NEON;
16738 for (uint32_t n = 1; n <= 8; n++) {
16739 GemmMicrokernelTester()
16740 .mr(6)
16741 .nr(8)
16742 .kr(1)
16743 .sr(1)
16744 .m(6)
16745 .n(n)
16746 .k(2)
16747 .iterations(1)
16748 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16749 }
16750 }
16751
16752 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_lt_2) {
16753 TEST_REQUIRES_ARM_NEON;
16754 for (size_t k = 1; k < 2; k++) {
16755 GemmMicrokernelTester()
16756 .mr(6)
16757 .nr(8)
16758 .kr(1)
16759 .sr(1)
16760 .m(6)
16761 .n(8)
16762 .k(k)
16763 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16764 }
16765 }
16766
16767 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_lt_2_strided_a) {
16768 TEST_REQUIRES_ARM_NEON;
16769 for (size_t k = 1; k < 2; k++) {
16770 GemmMicrokernelTester()
16771 .mr(6)
16772 .nr(8)
16773 .kr(1)
16774 .sr(1)
16775 .m(6)
16776 .n(8)
16777 .k(k)
16778 .a_stride(5)
16779 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16780 }
16781 }
16782
16783 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
16784 TEST_REQUIRES_ARM_NEON;
16785 for (size_t k = 1; k < 2; k++) {
16786 for (uint32_t m = 1; m <= 6; m++) {
16787 for (uint32_t n = 1; n <= 8; n++) {
16788 GemmMicrokernelTester()
16789 .mr(6)
16790 .nr(8)
16791 .kr(1)
16792 .sr(1)
16793 .m(m)
16794 .n(n)
16795 .k(k)
16796 .iterations(1)
16797 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16798 }
16799 }
16800 }
16801 }
16802
16803 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_gt_2) {
16804 TEST_REQUIRES_ARM_NEON;
16805 for (size_t k = 3; k < 4; k++) {
16806 GemmMicrokernelTester()
16807 .mr(6)
16808 .nr(8)
16809 .kr(1)
16810 .sr(1)
16811 .m(6)
16812 .n(8)
16813 .k(k)
16814 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16815 }
16816 }
16817
16818 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_gt_2_strided_a) {
16819 TEST_REQUIRES_ARM_NEON;
16820 for (size_t k = 3; k < 4; k++) {
16821 GemmMicrokernelTester()
16822 .mr(6)
16823 .nr(8)
16824 .kr(1)
16825 .sr(1)
16826 .m(6)
16827 .n(8)
16828 .k(k)
16829 .a_stride(7)
16830 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16831 }
16832 }
16833
16834 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
16835 TEST_REQUIRES_ARM_NEON;
16836 for (size_t k = 3; k < 4; k++) {
16837 for (uint32_t m = 1; m <= 6; m++) {
16838 for (uint32_t n = 1; n <= 8; n++) {
16839 GemmMicrokernelTester()
16840 .mr(6)
16841 .nr(8)
16842 .kr(1)
16843 .sr(1)
16844 .m(m)
16845 .n(n)
16846 .k(k)
16847 .iterations(1)
16848 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16849 }
16850 }
16851 }
16852 }
16853
16854 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_div_2) {
16855 TEST_REQUIRES_ARM_NEON;
16856 for (size_t k = 4; k <= 20; k += 2) {
16857 GemmMicrokernelTester()
16858 .mr(6)
16859 .nr(8)
16860 .kr(1)
16861 .sr(1)
16862 .m(6)
16863 .n(8)
16864 .k(k)
16865 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16866 }
16867 }
16868
16869 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_div_2_strided_a) {
16870 TEST_REQUIRES_ARM_NEON;
16871 for (size_t k = 4; k <= 20; k += 2) {
16872 GemmMicrokernelTester()
16873 .mr(6)
16874 .nr(8)
16875 .kr(1)
16876 .sr(1)
16877 .m(6)
16878 .n(8)
16879 .k(k)
16880 .a_stride(23)
16881 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16882 }
16883 }
16884
16885 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_div_2_subtile) {
16886 TEST_REQUIRES_ARM_NEON;
16887 for (size_t k = 4; k <= 20; k += 2) {
16888 for (uint32_t m = 1; m <= 6; m++) {
16889 for (uint32_t n = 1; n <= 8; n++) {
16890 GemmMicrokernelTester()
16891 .mr(6)
16892 .nr(8)
16893 .kr(1)
16894 .sr(1)
16895 .m(m)
16896 .n(n)
16897 .k(k)
16898 .iterations(1)
16899 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16900 }
16901 }
16902 }
16903 }
16904
16905 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8) {
16906 TEST_REQUIRES_ARM_NEON;
16907 for (uint32_t n = 9; n < 16; n++) {
16908 for (size_t k = 1; k <= 10; k += 3) {
16909 GemmMicrokernelTester()
16910 .mr(6)
16911 .nr(8)
16912 .kr(1)
16913 .sr(1)
16914 .m(6)
16915 .n(8)
16916 .k(k)
16917 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16918 }
16919 }
16920 }
16921
16922 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
16923 TEST_REQUIRES_ARM_NEON;
16924 for (uint32_t n = 9; n < 16; n++) {
16925 for (size_t k = 1; k <= 10; k += 3) {
16926 GemmMicrokernelTester()
16927 .mr(6)
16928 .nr(8)
16929 .kr(1)
16930 .sr(1)
16931 .m(6)
16932 .n(8)
16933 .k(k)
16934 .cn_stride(11)
16935 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16936 }
16937 }
16938 }
16939
16940 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8_strided_a) {
16941 TEST_REQUIRES_ARM_NEON;
16942 for (uint32_t n = 9; n < 16; n++) {
16943 for (size_t k = 1; k <= 10; k += 3) {
16944 GemmMicrokernelTester()
16945 .mr(6)
16946 .nr(8)
16947 .kr(1)
16948 .sr(1)
16949 .m(6)
16950 .n(n)
16951 .k(k)
16952 .a_stride(13)
16953 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16954 }
16955 }
16956 }
16957
16958 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
16959 TEST_REQUIRES_ARM_NEON;
16960 for (uint32_t n = 9; n < 16; n++) {
16961 for (size_t k = 1; k <= 10; k += 3) {
16962 for (uint32_t m = 1; m <= 6; m++) {
16963 GemmMicrokernelTester()
16964 .mr(6)
16965 .nr(8)
16966 .kr(1)
16967 .sr(1)
16968 .m(m)
16969 .n(n)
16970 .k(k)
16971 .iterations(1)
16972 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16973 }
16974 }
16975 }
16976 }
16977
16978 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8) {
16979 TEST_REQUIRES_ARM_NEON;
16980 for (uint32_t n = 16; n <= 24; n += 8) {
16981 for (size_t k = 1; k <= 10; k += 3) {
16982 GemmMicrokernelTester()
16983 .mr(6)
16984 .nr(8)
16985 .kr(1)
16986 .sr(1)
16987 .m(6)
16988 .n(8)
16989 .k(k)
16990 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
16991 }
16992 }
16993 }
16994
16995 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
16996 TEST_REQUIRES_ARM_NEON;
16997 for (uint32_t n = 16; n <= 24; n += 8) {
16998 for (size_t k = 1; k <= 10; k += 3) {
16999 GemmMicrokernelTester()
17000 .mr(6)
17001 .nr(8)
17002 .kr(1)
17003 .sr(1)
17004 .m(6)
17005 .n(n)
17006 .k(k)
17007 .cn_stride(11)
17008 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17009 }
17010 }
17011 }
17012
17013 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8_strided_a) {
17014 TEST_REQUIRES_ARM_NEON;
17015 for (uint32_t n = 16; n <= 24; n += 8) {
17016 for (size_t k = 1; k <= 10; k += 3) {
17017 GemmMicrokernelTester()
17018 .mr(6)
17019 .nr(8)
17020 .kr(1)
17021 .sr(1)
17022 .m(6)
17023 .n(n)
17024 .k(k)
17025 .a_stride(13)
17026 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17027 }
17028 }
17029 }
17030
17031 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8_subtile) {
17032 TEST_REQUIRES_ARM_NEON;
17033 for (uint32_t n = 16; n <= 24; n += 8) {
17034 for (size_t k = 1; k <= 10; k += 3) {
17035 for (uint32_t m = 1; m <= 6; m++) {
17036 GemmMicrokernelTester()
17037 .mr(6)
17038 .nr(8)
17039 .kr(1)
17040 .sr(1)
17041 .m(m)
17042 .n(n)
17043 .k(k)
17044 .iterations(1)
17045 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17046 }
17047 }
17048 }
17049 }
17050
17051 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, strided_cm_subtile) {
17052 TEST_REQUIRES_ARM_NEON;
17053 for (size_t k = 1; k <= 10; k += 3) {
17054 for (uint32_t m = 1; m <= 6; m++) {
17055 for (uint32_t n = 1; n <= 8; n++) {
17056 GemmMicrokernelTester()
17057 .mr(6)
17058 .nr(8)
17059 .kr(1)
17060 .sr(1)
17061 .m(m)
17062 .n(n)
17063 .k(k)
17064 .cm_stride(11)
17065 .iterations(1)
17066 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17067 }
17068 }
17069 }
17070 }
17071
17072 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, qmin) {
17073 TEST_REQUIRES_ARM_NEON;
17074 GemmMicrokernelTester()
17075 .mr(6)
17076 .nr(8)
17077 .kr(1)
17078 .sr(1)
17079 .m(6)
17080 .n(8)
17081 .k(2)
17082 .qmin(128)
17083 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17084 }
17085
17086 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, qmax) {
17087 TEST_REQUIRES_ARM_NEON;
17088 GemmMicrokernelTester()
17089 .mr(6)
17090 .nr(8)
17091 .kr(1)
17092 .sr(1)
17093 .m(6)
17094 .n(8)
17095 .k(2)
17096 .qmax(128)
17097 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17098 }
17099
17100 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, strided_cm) {
17101 TEST_REQUIRES_ARM_NEON;
17102 GemmMicrokernelTester()
17103 .mr(6)
17104 .nr(8)
17105 .kr(1)
17106 .sr(1)
17107 .m(6)
17108 .n(8)
17109 .k(2)
17110 .cm_stride(11)
17111 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
17112 }
17113#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17114
17115
17116#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17117 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4) {
17118 TEST_REQUIRES_ARM_NEON;
17119 GemmMicrokernelTester()
17120 .mr(6)
17121 .nr(8)
17122 .kr(1)
17123 .sr(1)
17124 .m(6)
17125 .n(8)
17126 .k(4)
17127 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17128 }
17129
17130 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, strided_cn) {
17131 TEST_REQUIRES_ARM_NEON;
17132 GemmMicrokernelTester()
17133 .mr(6)
17134 .nr(8)
17135 .kr(1)
17136 .sr(1)
17137 .m(6)
17138 .n(8)
17139 .k(4)
17140 .cn_stride(11)
17141 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17142 }
17143
17144 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_strided_a) {
17145 TEST_REQUIRES_ARM_NEON;
17146 GemmMicrokernelTester()
17147 .mr(6)
17148 .nr(8)
17149 .kr(1)
17150 .sr(1)
17151 .m(6)
17152 .n(8)
17153 .k(4)
17154 .a_stride(7)
17155 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17156 }
17157
17158 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_subtile) {
17159 TEST_REQUIRES_ARM_NEON;
17160 for (uint32_t m = 1; m <= 6; m++) {
17161 for (uint32_t n = 1; n <= 8; n++) {
17162 GemmMicrokernelTester()
17163 .mr(6)
17164 .nr(8)
17165 .kr(1)
17166 .sr(1)
17167 .m(m)
17168 .n(n)
17169 .k(4)
17170 .iterations(1)
17171 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17172 }
17173 }
17174 }
17175
17176 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
17177 TEST_REQUIRES_ARM_NEON;
17178 for (uint32_t m = 1; m <= 6; m++) {
17179 GemmMicrokernelTester()
17180 .mr(6)
17181 .nr(8)
17182 .kr(1)
17183 .sr(1)
17184 .m(m)
17185 .n(8)
17186 .k(4)
17187 .iterations(1)
17188 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17189 }
17190 }
17191
17192 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
17193 TEST_REQUIRES_ARM_NEON;
17194 for (uint32_t n = 1; n <= 8; n++) {
17195 GemmMicrokernelTester()
17196 .mr(6)
17197 .nr(8)
17198 .kr(1)
17199 .sr(1)
17200 .m(6)
17201 .n(n)
17202 .k(4)
17203 .iterations(1)
17204 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17205 }
17206 }
17207
17208 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_lt_4) {
17209 TEST_REQUIRES_ARM_NEON;
17210 for (size_t k = 1; k < 4; k++) {
17211 GemmMicrokernelTester()
17212 .mr(6)
17213 .nr(8)
17214 .kr(1)
17215 .sr(1)
17216 .m(6)
17217 .n(8)
17218 .k(k)
17219 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17220 }
17221 }
17222
17223 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_lt_4_strided_a) {
17224 TEST_REQUIRES_ARM_NEON;
17225 for (size_t k = 1; k < 4; k++) {
17226 GemmMicrokernelTester()
17227 .mr(6)
17228 .nr(8)
17229 .kr(1)
17230 .sr(1)
17231 .m(6)
17232 .n(8)
17233 .k(k)
17234 .a_stride(7)
17235 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17236 }
17237 }
17238
17239 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_lt_4_subtile) {
17240 TEST_REQUIRES_ARM_NEON;
17241 for (size_t k = 1; k < 4; k++) {
17242 for (uint32_t m = 1; m <= 6; m++) {
17243 for (uint32_t n = 1; n <= 8; n++) {
17244 GemmMicrokernelTester()
17245 .mr(6)
17246 .nr(8)
17247 .kr(1)
17248 .sr(1)
17249 .m(m)
17250 .n(n)
17251 .k(k)
17252 .iterations(1)
17253 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17254 }
17255 }
17256 }
17257 }
17258
17259 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_gt_4) {
17260 TEST_REQUIRES_ARM_NEON;
17261 for (size_t k = 5; k < 8; k++) {
17262 GemmMicrokernelTester()
17263 .mr(6)
17264 .nr(8)
17265 .kr(1)
17266 .sr(1)
17267 .m(6)
17268 .n(8)
17269 .k(k)
17270 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17271 }
17272 }
17273
17274 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_gt_4_strided_a) {
17275 TEST_REQUIRES_ARM_NEON;
17276 for (size_t k = 5; k < 8; k++) {
17277 GemmMicrokernelTester()
17278 .mr(6)
17279 .nr(8)
17280 .kr(1)
17281 .sr(1)
17282 .m(6)
17283 .n(8)
17284 .k(k)
17285 .a_stride(11)
17286 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17287 }
17288 }
17289
17290 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_gt_4_subtile) {
17291 TEST_REQUIRES_ARM_NEON;
17292 for (size_t k = 5; k < 8; k++) {
17293 for (uint32_t m = 1; m <= 6; m++) {
17294 for (uint32_t n = 1; n <= 8; n++) {
17295 GemmMicrokernelTester()
17296 .mr(6)
17297 .nr(8)
17298 .kr(1)
17299 .sr(1)
17300 .m(m)
17301 .n(n)
17302 .k(k)
17303 .iterations(1)
17304 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17305 }
17306 }
17307 }
17308 }
17309
17310 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_div_4) {
17311 TEST_REQUIRES_ARM_NEON;
17312 for (size_t k = 8; k <= 40; k += 4) {
17313 GemmMicrokernelTester()
17314 .mr(6)
17315 .nr(8)
17316 .kr(1)
17317 .sr(1)
17318 .m(6)
17319 .n(8)
17320 .k(k)
17321 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17322 }
17323 }
17324
17325 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_div_4_strided_a) {
17326 TEST_REQUIRES_ARM_NEON;
17327 for (size_t k = 8; k <= 40; k += 4) {
17328 GemmMicrokernelTester()
17329 .mr(6)
17330 .nr(8)
17331 .kr(1)
17332 .sr(1)
17333 .m(6)
17334 .n(8)
17335 .k(k)
17336 .a_stride(43)
17337 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17338 }
17339 }
17340
17341 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_div_4_subtile) {
17342 TEST_REQUIRES_ARM_NEON;
17343 for (size_t k = 8; k <= 40; k += 4) {
17344 for (uint32_t m = 1; m <= 6; m++) {
17345 for (uint32_t n = 1; n <= 8; n++) {
17346 GemmMicrokernelTester()
17347 .mr(6)
17348 .nr(8)
17349 .kr(1)
17350 .sr(1)
17351 .m(m)
17352 .n(n)
17353 .k(k)
17354 .iterations(1)
17355 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17356 }
17357 }
17358 }
17359 }
17360
17361 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8) {
17362 TEST_REQUIRES_ARM_NEON;
17363 for (uint32_t n = 9; n < 16; n++) {
17364 for (size_t k = 1; k <= 20; k += 5) {
17365 GemmMicrokernelTester()
17366 .mr(6)
17367 .nr(8)
17368 .kr(1)
17369 .sr(1)
17370 .m(6)
17371 .n(8)
17372 .k(k)
17373 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17374 }
17375 }
17376 }
17377
17378 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
17379 TEST_REQUIRES_ARM_NEON;
17380 for (uint32_t n = 9; n < 16; n++) {
17381 for (size_t k = 1; k <= 20; k += 5) {
17382 GemmMicrokernelTester()
17383 .mr(6)
17384 .nr(8)
17385 .kr(1)
17386 .sr(1)
17387 .m(6)
17388 .n(8)
17389 .k(k)
17390 .cn_stride(11)
17391 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17392 }
17393 }
17394 }
17395
17396 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8_strided_a) {
17397 TEST_REQUIRES_ARM_NEON;
17398 for (uint32_t n = 9; n < 16; n++) {
17399 for (size_t k = 1; k <= 20; k += 5) {
17400 GemmMicrokernelTester()
17401 .mr(6)
17402 .nr(8)
17403 .kr(1)
17404 .sr(1)
17405 .m(6)
17406 .n(n)
17407 .k(k)
17408 .a_stride(23)
17409 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17410 }
17411 }
17412 }
17413
17414 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8_subtile) {
17415 TEST_REQUIRES_ARM_NEON;
17416 for (uint32_t n = 9; n < 16; n++) {
17417 for (size_t k = 1; k <= 20; k += 5) {
17418 for (uint32_t m = 1; m <= 6; m++) {
17419 GemmMicrokernelTester()
17420 .mr(6)
17421 .nr(8)
17422 .kr(1)
17423 .sr(1)
17424 .m(m)
17425 .n(n)
17426 .k(k)
17427 .iterations(1)
17428 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17429 }
17430 }
17431 }
17432 }
17433
17434 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8) {
17435 TEST_REQUIRES_ARM_NEON;
17436 for (uint32_t n = 16; n <= 24; n += 8) {
17437 for (size_t k = 1; k <= 20; k += 5) {
17438 GemmMicrokernelTester()
17439 .mr(6)
17440 .nr(8)
17441 .kr(1)
17442 .sr(1)
17443 .m(6)
17444 .n(8)
17445 .k(k)
17446 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17447 }
17448 }
17449 }
17450
17451 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8_strided_cn) {
17452 TEST_REQUIRES_ARM_NEON;
17453 for (uint32_t n = 16; n <= 24; n += 8) {
17454 for (size_t k = 1; k <= 20; k += 5) {
17455 GemmMicrokernelTester()
17456 .mr(6)
17457 .nr(8)
17458 .kr(1)
17459 .sr(1)
17460 .m(6)
17461 .n(n)
17462 .k(k)
17463 .cn_stride(11)
17464 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17465 }
17466 }
17467 }
17468
17469 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8_strided_a) {
17470 TEST_REQUIRES_ARM_NEON;
17471 for (uint32_t n = 16; n <= 24; n += 8) {
17472 for (size_t k = 1; k <= 20; k += 5) {
17473 GemmMicrokernelTester()
17474 .mr(6)
17475 .nr(8)
17476 .kr(1)
17477 .sr(1)
17478 .m(6)
17479 .n(n)
17480 .k(k)
17481 .a_stride(23)
17482 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17483 }
17484 }
17485 }
17486
17487 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8_subtile) {
17488 TEST_REQUIRES_ARM_NEON;
17489 for (uint32_t n = 16; n <= 24; n += 8) {
17490 for (size_t k = 1; k <= 20; k += 5) {
17491 for (uint32_t m = 1; m <= 6; m++) {
17492 GemmMicrokernelTester()
17493 .mr(6)
17494 .nr(8)
17495 .kr(1)
17496 .sr(1)
17497 .m(m)
17498 .n(n)
17499 .k(k)
17500 .iterations(1)
17501 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17502 }
17503 }
17504 }
17505 }
17506
17507 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, strided_cm_subtile) {
17508 TEST_REQUIRES_ARM_NEON;
17509 for (size_t k = 1; k <= 20; k += 5) {
17510 for (uint32_t m = 1; m <= 6; m++) {
17511 for (uint32_t n = 1; n <= 8; n++) {
17512 GemmMicrokernelTester()
17513 .mr(6)
17514 .nr(8)
17515 .kr(1)
17516 .sr(1)
17517 .m(m)
17518 .n(n)
17519 .k(k)
17520 .cm_stride(11)
17521 .iterations(1)
17522 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17523 }
17524 }
17525 }
17526 }
17527
17528 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, qmin) {
17529 TEST_REQUIRES_ARM_NEON;
17530 GemmMicrokernelTester()
17531 .mr(6)
17532 .nr(8)
17533 .kr(1)
17534 .sr(1)
17535 .m(6)
17536 .n(8)
17537 .k(4)
17538 .qmin(128)
17539 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17540 }
17541
17542 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, qmax) {
17543 TEST_REQUIRES_ARM_NEON;
17544 GemmMicrokernelTester()
17545 .mr(6)
17546 .nr(8)
17547 .kr(1)
17548 .sr(1)
17549 .m(6)
17550 .n(8)
17551 .k(4)
17552 .qmax(128)
17553 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17554 }
17555
17556 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, strided_cm) {
17557 TEST_REQUIRES_ARM_NEON;
17558 GemmMicrokernelTester()
17559 .mr(6)
17560 .nr(8)
17561 .kr(1)
17562 .sr(1)
17563 .m(6)
17564 .n(8)
17565 .k(4)
17566 .cm_stride(11)
17567 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
17568 }
17569#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17570
17571
17572#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17573 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2) {
17574 TEST_REQUIRES_ARM_NEON_FMA;
17575 GemmMicrokernelTester()
17576 .mr(1)
17577 .nr(8)
17578 .kr(1)
17579 .sr(1)
17580 .m(1)
17581 .n(8)
17582 .k(2)
17583 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17584 }
17585
17586 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, strided_cn) {
17587 TEST_REQUIRES_ARM_NEON_FMA;
17588 GemmMicrokernelTester()
17589 .mr(1)
17590 .nr(8)
17591 .kr(1)
17592 .sr(1)
17593 .m(1)
17594 .n(8)
17595 .k(2)
17596 .cn_stride(11)
17597 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17598 }
17599
17600 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
17601 TEST_REQUIRES_ARM_NEON_FMA;
17602 GemmMicrokernelTester()
17603 .mr(1)
17604 .nr(8)
17605 .kr(1)
17606 .sr(1)
17607 .m(1)
17608 .n(8)
17609 .k(2)
17610 .a_stride(5)
17611 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17612 }
17613
17614 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
17615 TEST_REQUIRES_ARM_NEON_FMA;
17616 for (uint32_t m = 1; m <= 1; m++) {
17617 for (uint32_t n = 1; n <= 8; n++) {
17618 GemmMicrokernelTester()
17619 .mr(1)
17620 .nr(8)
17621 .kr(1)
17622 .sr(1)
17623 .m(m)
17624 .n(n)
17625 .k(2)
17626 .iterations(1)
17627 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17628 }
17629 }
17630 }
17631
17632 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
17633 TEST_REQUIRES_ARM_NEON_FMA;
17634 for (uint32_t m = 1; m <= 1; m++) {
17635 GemmMicrokernelTester()
17636 .mr(1)
17637 .nr(8)
17638 .kr(1)
17639 .sr(1)
17640 .m(m)
17641 .n(8)
17642 .k(2)
17643 .iterations(1)
17644 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17645 }
17646 }
17647
17648 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
17649 TEST_REQUIRES_ARM_NEON_FMA;
17650 for (uint32_t n = 1; n <= 8; n++) {
17651 GemmMicrokernelTester()
17652 .mr(1)
17653 .nr(8)
17654 .kr(1)
17655 .sr(1)
17656 .m(1)
17657 .n(n)
17658 .k(2)
17659 .iterations(1)
17660 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17661 }
17662 }
17663
17664 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_lt_2) {
17665 TEST_REQUIRES_ARM_NEON_FMA;
17666 for (size_t k = 1; k < 2; k++) {
17667 GemmMicrokernelTester()
17668 .mr(1)
17669 .nr(8)
17670 .kr(1)
17671 .sr(1)
17672 .m(1)
17673 .n(8)
17674 .k(k)
17675 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17676 }
17677 }
17678
17679 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
17680 TEST_REQUIRES_ARM_NEON_FMA;
17681 for (size_t k = 1; k < 2; k++) {
17682 GemmMicrokernelTester()
17683 .mr(1)
17684 .nr(8)
17685 .kr(1)
17686 .sr(1)
17687 .m(1)
17688 .n(8)
17689 .k(k)
17690 .a_stride(5)
17691 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17692 }
17693 }
17694
17695 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
17696 TEST_REQUIRES_ARM_NEON_FMA;
17697 for (size_t k = 1; k < 2; k++) {
17698 for (uint32_t m = 1; m <= 1; m++) {
17699 for (uint32_t n = 1; n <= 8; n++) {
17700 GemmMicrokernelTester()
17701 .mr(1)
17702 .nr(8)
17703 .kr(1)
17704 .sr(1)
17705 .m(m)
17706 .n(n)
17707 .k(k)
17708 .iterations(1)
17709 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17710 }
17711 }
17712 }
17713 }
17714
17715 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_gt_2) {
17716 TEST_REQUIRES_ARM_NEON_FMA;
17717 for (size_t k = 3; k < 4; k++) {
17718 GemmMicrokernelTester()
17719 .mr(1)
17720 .nr(8)
17721 .kr(1)
17722 .sr(1)
17723 .m(1)
17724 .n(8)
17725 .k(k)
17726 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17727 }
17728 }
17729
17730 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
17731 TEST_REQUIRES_ARM_NEON_FMA;
17732 for (size_t k = 3; k < 4; k++) {
17733 GemmMicrokernelTester()
17734 .mr(1)
17735 .nr(8)
17736 .kr(1)
17737 .sr(1)
17738 .m(1)
17739 .n(8)
17740 .k(k)
17741 .a_stride(7)
17742 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17743 }
17744 }
17745
17746 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
17747 TEST_REQUIRES_ARM_NEON_FMA;
17748 for (size_t k = 3; k < 4; k++) {
17749 for (uint32_t m = 1; m <= 1; m++) {
17750 for (uint32_t n = 1; n <= 8; n++) {
17751 GemmMicrokernelTester()
17752 .mr(1)
17753 .nr(8)
17754 .kr(1)
17755 .sr(1)
17756 .m(m)
17757 .n(n)
17758 .k(k)
17759 .iterations(1)
17760 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17761 }
17762 }
17763 }
17764 }
17765
17766 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_div_2) {
17767 TEST_REQUIRES_ARM_NEON_FMA;
17768 for (size_t k = 4; k <= 20; k += 2) {
17769 GemmMicrokernelTester()
17770 .mr(1)
17771 .nr(8)
17772 .kr(1)
17773 .sr(1)
17774 .m(1)
17775 .n(8)
17776 .k(k)
17777 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17778 }
17779 }
17780
17781 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
17782 TEST_REQUIRES_ARM_NEON_FMA;
17783 for (size_t k = 4; k <= 20; k += 2) {
17784 GemmMicrokernelTester()
17785 .mr(1)
17786 .nr(8)
17787 .kr(1)
17788 .sr(1)
17789 .m(1)
17790 .n(8)
17791 .k(k)
17792 .a_stride(23)
17793 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17794 }
17795 }
17796
17797 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
17798 TEST_REQUIRES_ARM_NEON_FMA;
17799 for (size_t k = 4; k <= 20; k += 2) {
17800 for (uint32_t m = 1; m <= 1; m++) {
17801 for (uint32_t n = 1; n <= 8; n++) {
17802 GemmMicrokernelTester()
17803 .mr(1)
17804 .nr(8)
17805 .kr(1)
17806 .sr(1)
17807 .m(m)
17808 .n(n)
17809 .k(k)
17810 .iterations(1)
17811 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17812 }
17813 }
17814 }
17815 }
17816
17817 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8) {
17818 TEST_REQUIRES_ARM_NEON_FMA;
17819 for (uint32_t n = 9; n < 16; n++) {
17820 for (size_t k = 1; k <= 10; k += 3) {
17821 GemmMicrokernelTester()
17822 .mr(1)
17823 .nr(8)
17824 .kr(1)
17825 .sr(1)
17826 .m(1)
17827 .n(8)
17828 .k(k)
17829 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17830 }
17831 }
17832 }
17833
17834 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
17835 TEST_REQUIRES_ARM_NEON_FMA;
17836 for (uint32_t n = 9; n < 16; n++) {
17837 for (size_t k = 1; k <= 10; k += 3) {
17838 GemmMicrokernelTester()
17839 .mr(1)
17840 .nr(8)
17841 .kr(1)
17842 .sr(1)
17843 .m(1)
17844 .n(8)
17845 .k(k)
17846 .cn_stride(11)
17847 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17848 }
17849 }
17850 }
17851
17852 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
17853 TEST_REQUIRES_ARM_NEON_FMA;
17854 for (uint32_t n = 9; n < 16; n++) {
17855 for (size_t k = 1; k <= 10; k += 3) {
17856 GemmMicrokernelTester()
17857 .mr(1)
17858 .nr(8)
17859 .kr(1)
17860 .sr(1)
17861 .m(1)
17862 .n(n)
17863 .k(k)
17864 .a_stride(13)
17865 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17866 }
17867 }
17868 }
17869
17870 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
17871 TEST_REQUIRES_ARM_NEON_FMA;
17872 for (uint32_t n = 9; n < 16; n++) {
17873 for (size_t k = 1; k <= 10; k += 3) {
17874 for (uint32_t m = 1; m <= 1; m++) {
17875 GemmMicrokernelTester()
17876 .mr(1)
17877 .nr(8)
17878 .kr(1)
17879 .sr(1)
17880 .m(m)
17881 .n(n)
17882 .k(k)
17883 .iterations(1)
17884 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17885 }
17886 }
17887 }
17888 }
17889
17890 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8) {
17891 TEST_REQUIRES_ARM_NEON_FMA;
17892 for (uint32_t n = 16; n <= 24; n += 8) {
17893 for (size_t k = 1; k <= 10; k += 3) {
17894 GemmMicrokernelTester()
17895 .mr(1)
17896 .nr(8)
17897 .kr(1)
17898 .sr(1)
17899 .m(1)
17900 .n(8)
17901 .k(k)
17902 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17903 }
17904 }
17905 }
17906
17907 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
17908 TEST_REQUIRES_ARM_NEON_FMA;
17909 for (uint32_t n = 16; n <= 24; n += 8) {
17910 for (size_t k = 1; k <= 10; k += 3) {
17911 GemmMicrokernelTester()
17912 .mr(1)
17913 .nr(8)
17914 .kr(1)
17915 .sr(1)
17916 .m(1)
17917 .n(n)
17918 .k(k)
17919 .cn_stride(11)
17920 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17921 }
17922 }
17923 }
17924
17925 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
17926 TEST_REQUIRES_ARM_NEON_FMA;
17927 for (uint32_t n = 16; n <= 24; n += 8) {
17928 for (size_t k = 1; k <= 10; k += 3) {
17929 GemmMicrokernelTester()
17930 .mr(1)
17931 .nr(8)
17932 .kr(1)
17933 .sr(1)
17934 .m(1)
17935 .n(n)
17936 .k(k)
17937 .a_stride(13)
17938 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17939 }
17940 }
17941 }
17942
17943 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
17944 TEST_REQUIRES_ARM_NEON_FMA;
17945 for (uint32_t n = 16; n <= 24; n += 8) {
17946 for (size_t k = 1; k <= 10; k += 3) {
17947 for (uint32_t m = 1; m <= 1; m++) {
17948 GemmMicrokernelTester()
17949 .mr(1)
17950 .nr(8)
17951 .kr(1)
17952 .sr(1)
17953 .m(m)
17954 .n(n)
17955 .k(k)
17956 .iterations(1)
17957 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17958 }
17959 }
17960 }
17961 }
17962
17963 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
17964 TEST_REQUIRES_ARM_NEON_FMA;
17965 for (size_t k = 1; k <= 10; k += 3) {
17966 for (uint32_t m = 1; m <= 1; m++) {
17967 for (uint32_t n = 1; n <= 8; n++) {
17968 GemmMicrokernelTester()
17969 .mr(1)
17970 .nr(8)
17971 .kr(1)
17972 .sr(1)
17973 .m(m)
17974 .n(n)
17975 .k(k)
17976 .cm_stride(11)
17977 .iterations(1)
17978 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17979 }
17980 }
17981 }
17982 }
17983
17984 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, qmin) {
17985 TEST_REQUIRES_ARM_NEON_FMA;
17986 GemmMicrokernelTester()
17987 .mr(1)
17988 .nr(8)
17989 .kr(1)
17990 .sr(1)
17991 .m(1)
17992 .n(8)
17993 .k(2)
17994 .qmin(128)
17995 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
17996 }
17997
17998 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, qmax) {
17999 TEST_REQUIRES_ARM_NEON_FMA;
18000 GemmMicrokernelTester()
18001 .mr(1)
18002 .nr(8)
18003 .kr(1)
18004 .sr(1)
18005 .m(1)
18006 .n(8)
18007 .k(2)
18008 .qmax(128)
18009 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
18010 }
18011
18012 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, strided_cm) {
18013 TEST_REQUIRES_ARM_NEON_FMA;
18014 GemmMicrokernelTester()
18015 .mr(1)
18016 .nr(8)
18017 .kr(1)
18018 .sr(1)
18019 .m(1)
18020 .n(8)
18021 .k(2)
18022 .cm_stride(11)
18023 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
18024 }
18025#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18026
18027
18028#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18029 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2) {
18030 TEST_REQUIRES_ARM_NEON_FMA;
18031 GemmMicrokernelTester()
18032 .mr(4)
18033 .nr(8)
18034 .kr(1)
18035 .sr(1)
18036 .m(4)
18037 .n(8)
18038 .k(2)
18039 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18040 }
18041
18042 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, strided_cn) {
18043 TEST_REQUIRES_ARM_NEON_FMA;
18044 GemmMicrokernelTester()
18045 .mr(4)
18046 .nr(8)
18047 .kr(1)
18048 .sr(1)
18049 .m(4)
18050 .n(8)
18051 .k(2)
18052 .cn_stride(11)
18053 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18054 }
18055
18056 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
18057 TEST_REQUIRES_ARM_NEON_FMA;
18058 GemmMicrokernelTester()
18059 .mr(4)
18060 .nr(8)
18061 .kr(1)
18062 .sr(1)
18063 .m(4)
18064 .n(8)
18065 .k(2)
18066 .a_stride(5)
18067 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18068 }
18069
18070 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
18071 TEST_REQUIRES_ARM_NEON_FMA;
18072 for (uint32_t m = 1; m <= 4; m++) {
18073 for (uint32_t n = 1; n <= 8; n++) {
18074 GemmMicrokernelTester()
18075 .mr(4)
18076 .nr(8)
18077 .kr(1)
18078 .sr(1)
18079 .m(m)
18080 .n(n)
18081 .k(2)
18082 .iterations(1)
18083 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18084 }
18085 }
18086 }
18087
18088 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
18089 TEST_REQUIRES_ARM_NEON_FMA;
18090 for (uint32_t m = 1; m <= 4; m++) {
18091 GemmMicrokernelTester()
18092 .mr(4)
18093 .nr(8)
18094 .kr(1)
18095 .sr(1)
18096 .m(m)
18097 .n(8)
18098 .k(2)
18099 .iterations(1)
18100 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18101 }
18102 }
18103
18104 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
18105 TEST_REQUIRES_ARM_NEON_FMA;
18106 for (uint32_t n = 1; n <= 8; n++) {
18107 GemmMicrokernelTester()
18108 .mr(4)
18109 .nr(8)
18110 .kr(1)
18111 .sr(1)
18112 .m(4)
18113 .n(n)
18114 .k(2)
18115 .iterations(1)
18116 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18117 }
18118 }
18119
18120 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_lt_2) {
18121 TEST_REQUIRES_ARM_NEON_FMA;
18122 for (size_t k = 1; k < 2; k++) {
18123 GemmMicrokernelTester()
18124 .mr(4)
18125 .nr(8)
18126 .kr(1)
18127 .sr(1)
18128 .m(4)
18129 .n(8)
18130 .k(k)
18131 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18132 }
18133 }
18134
18135 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
18136 TEST_REQUIRES_ARM_NEON_FMA;
18137 for (size_t k = 1; k < 2; k++) {
18138 GemmMicrokernelTester()
18139 .mr(4)
18140 .nr(8)
18141 .kr(1)
18142 .sr(1)
18143 .m(4)
18144 .n(8)
18145 .k(k)
18146 .a_stride(5)
18147 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18148 }
18149 }
18150
18151 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
18152 TEST_REQUIRES_ARM_NEON_FMA;
18153 for (size_t k = 1; k < 2; k++) {
18154 for (uint32_t m = 1; m <= 4; m++) {
18155 for (uint32_t n = 1; n <= 8; n++) {
18156 GemmMicrokernelTester()
18157 .mr(4)
18158 .nr(8)
18159 .kr(1)
18160 .sr(1)
18161 .m(m)
18162 .n(n)
18163 .k(k)
18164 .iterations(1)
18165 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18166 }
18167 }
18168 }
18169 }
18170
18171 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_gt_2) {
18172 TEST_REQUIRES_ARM_NEON_FMA;
18173 for (size_t k = 3; k < 4; k++) {
18174 GemmMicrokernelTester()
18175 .mr(4)
18176 .nr(8)
18177 .kr(1)
18178 .sr(1)
18179 .m(4)
18180 .n(8)
18181 .k(k)
18182 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18183 }
18184 }
18185
18186 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
18187 TEST_REQUIRES_ARM_NEON_FMA;
18188 for (size_t k = 3; k < 4; k++) {
18189 GemmMicrokernelTester()
18190 .mr(4)
18191 .nr(8)
18192 .kr(1)
18193 .sr(1)
18194 .m(4)
18195 .n(8)
18196 .k(k)
18197 .a_stride(7)
18198 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18199 }
18200 }
18201
18202 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
18203 TEST_REQUIRES_ARM_NEON_FMA;
18204 for (size_t k = 3; k < 4; k++) {
18205 for (uint32_t m = 1; m <= 4; m++) {
18206 for (uint32_t n = 1; n <= 8; n++) {
18207 GemmMicrokernelTester()
18208 .mr(4)
18209 .nr(8)
18210 .kr(1)
18211 .sr(1)
18212 .m(m)
18213 .n(n)
18214 .k(k)
18215 .iterations(1)
18216 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18217 }
18218 }
18219 }
18220 }
18221
18222 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_div_2) {
18223 TEST_REQUIRES_ARM_NEON_FMA;
18224 for (size_t k = 4; k <= 20; k += 2) {
18225 GemmMicrokernelTester()
18226 .mr(4)
18227 .nr(8)
18228 .kr(1)
18229 .sr(1)
18230 .m(4)
18231 .n(8)
18232 .k(k)
18233 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18234 }
18235 }
18236
18237 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
18238 TEST_REQUIRES_ARM_NEON_FMA;
18239 for (size_t k = 4; k <= 20; k += 2) {
18240 GemmMicrokernelTester()
18241 .mr(4)
18242 .nr(8)
18243 .kr(1)
18244 .sr(1)
18245 .m(4)
18246 .n(8)
18247 .k(k)
18248 .a_stride(23)
18249 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18250 }
18251 }
18252
18253 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
18254 TEST_REQUIRES_ARM_NEON_FMA;
18255 for (size_t k = 4; k <= 20; k += 2) {
18256 for (uint32_t m = 1; m <= 4; m++) {
18257 for (uint32_t n = 1; n <= 8; n++) {
18258 GemmMicrokernelTester()
18259 .mr(4)
18260 .nr(8)
18261 .kr(1)
18262 .sr(1)
18263 .m(m)
18264 .n(n)
18265 .k(k)
18266 .iterations(1)
18267 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18268 }
18269 }
18270 }
18271 }
18272
18273 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8) {
18274 TEST_REQUIRES_ARM_NEON_FMA;
18275 for (uint32_t n = 9; n < 16; n++) {
18276 for (size_t k = 1; k <= 10; k += 3) {
18277 GemmMicrokernelTester()
18278 .mr(4)
18279 .nr(8)
18280 .kr(1)
18281 .sr(1)
18282 .m(4)
18283 .n(8)
18284 .k(k)
18285 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18286 }
18287 }
18288 }
18289
18290 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
18291 TEST_REQUIRES_ARM_NEON_FMA;
18292 for (uint32_t n = 9; n < 16; n++) {
18293 for (size_t k = 1; k <= 10; k += 3) {
18294 GemmMicrokernelTester()
18295 .mr(4)
18296 .nr(8)
18297 .kr(1)
18298 .sr(1)
18299 .m(4)
18300 .n(8)
18301 .k(k)
18302 .cn_stride(11)
18303 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18304 }
18305 }
18306 }
18307
18308 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
18309 TEST_REQUIRES_ARM_NEON_FMA;
18310 for (uint32_t n = 9; n < 16; n++) {
18311 for (size_t k = 1; k <= 10; k += 3) {
18312 GemmMicrokernelTester()
18313 .mr(4)
18314 .nr(8)
18315 .kr(1)
18316 .sr(1)
18317 .m(4)
18318 .n(n)
18319 .k(k)
18320 .a_stride(13)
18321 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18322 }
18323 }
18324 }
18325
18326 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
18327 TEST_REQUIRES_ARM_NEON_FMA;
18328 for (uint32_t n = 9; n < 16; n++) {
18329 for (size_t k = 1; k <= 10; k += 3) {
18330 for (uint32_t m = 1; m <= 4; m++) {
18331 GemmMicrokernelTester()
18332 .mr(4)
18333 .nr(8)
18334 .kr(1)
18335 .sr(1)
18336 .m(m)
18337 .n(n)
18338 .k(k)
18339 .iterations(1)
18340 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18341 }
18342 }
18343 }
18344 }
18345
18346 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8) {
18347 TEST_REQUIRES_ARM_NEON_FMA;
18348 for (uint32_t n = 16; n <= 24; n += 8) {
18349 for (size_t k = 1; k <= 10; k += 3) {
18350 GemmMicrokernelTester()
18351 .mr(4)
18352 .nr(8)
18353 .kr(1)
18354 .sr(1)
18355 .m(4)
18356 .n(8)
18357 .k(k)
18358 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18359 }
18360 }
18361 }
18362
18363 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
18364 TEST_REQUIRES_ARM_NEON_FMA;
18365 for (uint32_t n = 16; n <= 24; n += 8) {
18366 for (size_t k = 1; k <= 10; k += 3) {
18367 GemmMicrokernelTester()
18368 .mr(4)
18369 .nr(8)
18370 .kr(1)
18371 .sr(1)
18372 .m(4)
18373 .n(n)
18374 .k(k)
18375 .cn_stride(11)
18376 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18377 }
18378 }
18379 }
18380
18381 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
18382 TEST_REQUIRES_ARM_NEON_FMA;
18383 for (uint32_t n = 16; n <= 24; n += 8) {
18384 for (size_t k = 1; k <= 10; k += 3) {
18385 GemmMicrokernelTester()
18386 .mr(4)
18387 .nr(8)
18388 .kr(1)
18389 .sr(1)
18390 .m(4)
18391 .n(n)
18392 .k(k)
18393 .a_stride(13)
18394 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18395 }
18396 }
18397 }
18398
18399 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
18400 TEST_REQUIRES_ARM_NEON_FMA;
18401 for (uint32_t n = 16; n <= 24; n += 8) {
18402 for (size_t k = 1; k <= 10; k += 3) {
18403 for (uint32_t m = 1; m <= 4; m++) {
18404 GemmMicrokernelTester()
18405 .mr(4)
18406 .nr(8)
18407 .kr(1)
18408 .sr(1)
18409 .m(m)
18410 .n(n)
18411 .k(k)
18412 .iterations(1)
18413 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18414 }
18415 }
18416 }
18417 }
18418
18419 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
18420 TEST_REQUIRES_ARM_NEON_FMA;
18421 for (size_t k = 1; k <= 10; k += 3) {
18422 for (uint32_t m = 1; m <= 4; m++) {
18423 for (uint32_t n = 1; n <= 8; n++) {
18424 GemmMicrokernelTester()
18425 .mr(4)
18426 .nr(8)
18427 .kr(1)
18428 .sr(1)
18429 .m(m)
18430 .n(n)
18431 .k(k)
18432 .cm_stride(11)
18433 .iterations(1)
18434 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18435 }
18436 }
18437 }
18438 }
18439
18440 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, qmin) {
18441 TEST_REQUIRES_ARM_NEON_FMA;
18442 GemmMicrokernelTester()
18443 .mr(4)
18444 .nr(8)
18445 .kr(1)
18446 .sr(1)
18447 .m(4)
18448 .n(8)
18449 .k(2)
18450 .qmin(128)
18451 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18452 }
18453
18454 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, qmax) {
18455 TEST_REQUIRES_ARM_NEON_FMA;
18456 GemmMicrokernelTester()
18457 .mr(4)
18458 .nr(8)
18459 .kr(1)
18460 .sr(1)
18461 .m(4)
18462 .n(8)
18463 .k(2)
18464 .qmax(128)
18465 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18466 }
18467
18468 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, strided_cm) {
18469 TEST_REQUIRES_ARM_NEON_FMA;
18470 GemmMicrokernelTester()
18471 .mr(4)
18472 .nr(8)
18473 .kr(1)
18474 .sr(1)
18475 .m(4)
18476 .n(8)
18477 .k(2)
18478 .cm_stride(11)
18479 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
18480 }
18481#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18482
18483
18484#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18485 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4) {
18486 TEST_REQUIRES_ARM_NEON_FMA;
18487 GemmMicrokernelTester()
18488 .mr(4)
18489 .nr(8)
18490 .kr(1)
18491 .sr(1)
18492 .m(4)
18493 .n(8)
18494 .k(4)
18495 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18496 }
18497
18498 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, strided_cn) {
18499 TEST_REQUIRES_ARM_NEON_FMA;
18500 GemmMicrokernelTester()
18501 .mr(4)
18502 .nr(8)
18503 .kr(1)
18504 .sr(1)
18505 .m(4)
18506 .n(8)
18507 .k(4)
18508 .cn_stride(11)
18509 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18510 }
18511
18512 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
18513 TEST_REQUIRES_ARM_NEON_FMA;
18514 GemmMicrokernelTester()
18515 .mr(4)
18516 .nr(8)
18517 .kr(1)
18518 .sr(1)
18519 .m(4)
18520 .n(8)
18521 .k(4)
18522 .a_stride(7)
18523 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18524 }
18525
18526 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
18527 TEST_REQUIRES_ARM_NEON_FMA;
18528 for (uint32_t m = 1; m <= 4; m++) {
18529 for (uint32_t n = 1; n <= 8; n++) {
18530 GemmMicrokernelTester()
18531 .mr(4)
18532 .nr(8)
18533 .kr(1)
18534 .sr(1)
18535 .m(m)
18536 .n(n)
18537 .k(4)
18538 .iterations(1)
18539 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18540 }
18541 }
18542 }
18543
18544 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
18545 TEST_REQUIRES_ARM_NEON_FMA;
18546 for (uint32_t m = 1; m <= 4; m++) {
18547 GemmMicrokernelTester()
18548 .mr(4)
18549 .nr(8)
18550 .kr(1)
18551 .sr(1)
18552 .m(m)
18553 .n(8)
18554 .k(4)
18555 .iterations(1)
18556 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18557 }
18558 }
18559
18560 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
18561 TEST_REQUIRES_ARM_NEON_FMA;
18562 for (uint32_t n = 1; n <= 8; n++) {
18563 GemmMicrokernelTester()
18564 .mr(4)
18565 .nr(8)
18566 .kr(1)
18567 .sr(1)
18568 .m(4)
18569 .n(n)
18570 .k(4)
18571 .iterations(1)
18572 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18573 }
18574 }
18575
18576 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_lt_4) {
18577 TEST_REQUIRES_ARM_NEON_FMA;
18578 for (size_t k = 1; k < 4; k++) {
18579 GemmMicrokernelTester()
18580 .mr(4)
18581 .nr(8)
18582 .kr(1)
18583 .sr(1)
18584 .m(4)
18585 .n(8)
18586 .k(k)
18587 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18588 }
18589 }
18590
18591 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
18592 TEST_REQUIRES_ARM_NEON_FMA;
18593 for (size_t k = 1; k < 4; k++) {
18594 GemmMicrokernelTester()
18595 .mr(4)
18596 .nr(8)
18597 .kr(1)
18598 .sr(1)
18599 .m(4)
18600 .n(8)
18601 .k(k)
18602 .a_stride(7)
18603 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18604 }
18605 }
18606
18607 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
18608 TEST_REQUIRES_ARM_NEON_FMA;
18609 for (size_t k = 1; k < 4; k++) {
18610 for (uint32_t m = 1; m <= 4; m++) {
18611 for (uint32_t n = 1; n <= 8; n++) {
18612 GemmMicrokernelTester()
18613 .mr(4)
18614 .nr(8)
18615 .kr(1)
18616 .sr(1)
18617 .m(m)
18618 .n(n)
18619 .k(k)
18620 .iterations(1)
18621 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18622 }
18623 }
18624 }
18625 }
18626
18627 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_gt_4) {
18628 TEST_REQUIRES_ARM_NEON_FMA;
18629 for (size_t k = 5; k < 8; k++) {
18630 GemmMicrokernelTester()
18631 .mr(4)
18632 .nr(8)
18633 .kr(1)
18634 .sr(1)
18635 .m(4)
18636 .n(8)
18637 .k(k)
18638 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18639 }
18640 }
18641
18642 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
18643 TEST_REQUIRES_ARM_NEON_FMA;
18644 for (size_t k = 5; k < 8; k++) {
18645 GemmMicrokernelTester()
18646 .mr(4)
18647 .nr(8)
18648 .kr(1)
18649 .sr(1)
18650 .m(4)
18651 .n(8)
18652 .k(k)
18653 .a_stride(11)
18654 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18655 }
18656 }
18657
18658 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
18659 TEST_REQUIRES_ARM_NEON_FMA;
18660 for (size_t k = 5; k < 8; k++) {
18661 for (uint32_t m = 1; m <= 4; m++) {
18662 for (uint32_t n = 1; n <= 8; n++) {
18663 GemmMicrokernelTester()
18664 .mr(4)
18665 .nr(8)
18666 .kr(1)
18667 .sr(1)
18668 .m(m)
18669 .n(n)
18670 .k(k)
18671 .iterations(1)
18672 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18673 }
18674 }
18675 }
18676 }
18677
18678 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_div_4) {
18679 TEST_REQUIRES_ARM_NEON_FMA;
18680 for (size_t k = 8; k <= 40; k += 4) {
18681 GemmMicrokernelTester()
18682 .mr(4)
18683 .nr(8)
18684 .kr(1)
18685 .sr(1)
18686 .m(4)
18687 .n(8)
18688 .k(k)
18689 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18690 }
18691 }
18692
18693 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
18694 TEST_REQUIRES_ARM_NEON_FMA;
18695 for (size_t k = 8; k <= 40; k += 4) {
18696 GemmMicrokernelTester()
18697 .mr(4)
18698 .nr(8)
18699 .kr(1)
18700 .sr(1)
18701 .m(4)
18702 .n(8)
18703 .k(k)
18704 .a_stride(43)
18705 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18706 }
18707 }
18708
18709 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
18710 TEST_REQUIRES_ARM_NEON_FMA;
18711 for (size_t k = 8; k <= 40; k += 4) {
18712 for (uint32_t m = 1; m <= 4; m++) {
18713 for (uint32_t n = 1; n <= 8; n++) {
18714 GemmMicrokernelTester()
18715 .mr(4)
18716 .nr(8)
18717 .kr(1)
18718 .sr(1)
18719 .m(m)
18720 .n(n)
18721 .k(k)
18722 .iterations(1)
18723 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18724 }
18725 }
18726 }
18727 }
18728
18729 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8) {
18730 TEST_REQUIRES_ARM_NEON_FMA;
18731 for (uint32_t n = 9; n < 16; n++) {
18732 for (size_t k = 1; k <= 20; k += 5) {
18733 GemmMicrokernelTester()
18734 .mr(4)
18735 .nr(8)
18736 .kr(1)
18737 .sr(1)
18738 .m(4)
18739 .n(8)
18740 .k(k)
18741 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18742 }
18743 }
18744 }
18745
18746 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
18747 TEST_REQUIRES_ARM_NEON_FMA;
18748 for (uint32_t n = 9; n < 16; n++) {
18749 for (size_t k = 1; k <= 20; k += 5) {
18750 GemmMicrokernelTester()
18751 .mr(4)
18752 .nr(8)
18753 .kr(1)
18754 .sr(1)
18755 .m(4)
18756 .n(8)
18757 .k(k)
18758 .cn_stride(11)
18759 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18760 }
18761 }
18762 }
18763
18764 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
18765 TEST_REQUIRES_ARM_NEON_FMA;
18766 for (uint32_t n = 9; n < 16; n++) {
18767 for (size_t k = 1; k <= 20; k += 5) {
18768 GemmMicrokernelTester()
18769 .mr(4)
18770 .nr(8)
18771 .kr(1)
18772 .sr(1)
18773 .m(4)
18774 .n(n)
18775 .k(k)
18776 .a_stride(23)
18777 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18778 }
18779 }
18780 }
18781
18782 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
18783 TEST_REQUIRES_ARM_NEON_FMA;
18784 for (uint32_t n = 9; n < 16; n++) {
18785 for (size_t k = 1; k <= 20; k += 5) {
18786 for (uint32_t m = 1; m <= 4; m++) {
18787 GemmMicrokernelTester()
18788 .mr(4)
18789 .nr(8)
18790 .kr(1)
18791 .sr(1)
18792 .m(m)
18793 .n(n)
18794 .k(k)
18795 .iterations(1)
18796 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18797 }
18798 }
18799 }
18800 }
18801
18802 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8) {
18803 TEST_REQUIRES_ARM_NEON_FMA;
18804 for (uint32_t n = 16; n <= 24; n += 8) {
18805 for (size_t k = 1; k <= 20; k += 5) {
18806 GemmMicrokernelTester()
18807 .mr(4)
18808 .nr(8)
18809 .kr(1)
18810 .sr(1)
18811 .m(4)
18812 .n(8)
18813 .k(k)
18814 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18815 }
18816 }
18817 }
18818
18819 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
18820 TEST_REQUIRES_ARM_NEON_FMA;
18821 for (uint32_t n = 16; n <= 24; n += 8) {
18822 for (size_t k = 1; k <= 20; k += 5) {
18823 GemmMicrokernelTester()
18824 .mr(4)
18825 .nr(8)
18826 .kr(1)
18827 .sr(1)
18828 .m(4)
18829 .n(n)
18830 .k(k)
18831 .cn_stride(11)
18832 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18833 }
18834 }
18835 }
18836
18837 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
18838 TEST_REQUIRES_ARM_NEON_FMA;
18839 for (uint32_t n = 16; n <= 24; n += 8) {
18840 for (size_t k = 1; k <= 20; k += 5) {
18841 GemmMicrokernelTester()
18842 .mr(4)
18843 .nr(8)
18844 .kr(1)
18845 .sr(1)
18846 .m(4)
18847 .n(n)
18848 .k(k)
18849 .a_stride(23)
18850 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18851 }
18852 }
18853 }
18854
18855 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
18856 TEST_REQUIRES_ARM_NEON_FMA;
18857 for (uint32_t n = 16; n <= 24; n += 8) {
18858 for (size_t k = 1; k <= 20; k += 5) {
18859 for (uint32_t m = 1; m <= 4; m++) {
18860 GemmMicrokernelTester()
18861 .mr(4)
18862 .nr(8)
18863 .kr(1)
18864 .sr(1)
18865 .m(m)
18866 .n(n)
18867 .k(k)
18868 .iterations(1)
18869 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18870 }
18871 }
18872 }
18873 }
18874
18875 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
18876 TEST_REQUIRES_ARM_NEON_FMA;
18877 for (size_t k = 1; k <= 20; k += 5) {
18878 for (uint32_t m = 1; m <= 4; m++) {
18879 for (uint32_t n = 1; n <= 8; n++) {
18880 GemmMicrokernelTester()
18881 .mr(4)
18882 .nr(8)
18883 .kr(1)
18884 .sr(1)
18885 .m(m)
18886 .n(n)
18887 .k(k)
18888 .cm_stride(11)
18889 .iterations(1)
18890 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18891 }
18892 }
18893 }
18894 }
18895
18896 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, qmin) {
18897 TEST_REQUIRES_ARM_NEON_FMA;
18898 GemmMicrokernelTester()
18899 .mr(4)
18900 .nr(8)
18901 .kr(1)
18902 .sr(1)
18903 .m(4)
18904 .n(8)
18905 .k(4)
18906 .qmin(128)
18907 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18908 }
18909
18910 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, qmax) {
18911 TEST_REQUIRES_ARM_NEON_FMA;
18912 GemmMicrokernelTester()
18913 .mr(4)
18914 .nr(8)
18915 .kr(1)
18916 .sr(1)
18917 .m(4)
18918 .n(8)
18919 .k(4)
18920 .qmax(128)
18921 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18922 }
18923
18924 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, strided_cm) {
18925 TEST_REQUIRES_ARM_NEON_FMA;
18926 GemmMicrokernelTester()
18927 .mr(4)
18928 .nr(8)
18929 .kr(1)
18930 .sr(1)
18931 .m(4)
18932 .n(8)
18933 .k(4)
18934 .cm_stride(11)
18935 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
18936 }
18937#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18938
18939
18940#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18941 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2) {
18942 TEST_REQUIRES_ARM_NEON_FMA;
18943 GemmMicrokernelTester()
18944 .mr(6)
18945 .nr(8)
18946 .kr(1)
18947 .sr(1)
18948 .m(6)
18949 .n(8)
18950 .k(2)
18951 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
18952 }
18953
18954 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, strided_cn) {
18955 TEST_REQUIRES_ARM_NEON_FMA;
18956 GemmMicrokernelTester()
18957 .mr(6)
18958 .nr(8)
18959 .kr(1)
18960 .sr(1)
18961 .m(6)
18962 .n(8)
18963 .k(2)
18964 .cn_stride(11)
18965 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
18966 }
18967
18968 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
18969 TEST_REQUIRES_ARM_NEON_FMA;
18970 GemmMicrokernelTester()
18971 .mr(6)
18972 .nr(8)
18973 .kr(1)
18974 .sr(1)
18975 .m(6)
18976 .n(8)
18977 .k(2)
18978 .a_stride(5)
18979 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
18980 }
18981
18982 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
18983 TEST_REQUIRES_ARM_NEON_FMA;
18984 for (uint32_t m = 1; m <= 6; m++) {
18985 for (uint32_t n = 1; n <= 8; n++) {
18986 GemmMicrokernelTester()
18987 .mr(6)
18988 .nr(8)
18989 .kr(1)
18990 .sr(1)
18991 .m(m)
18992 .n(n)
18993 .k(2)
18994 .iterations(1)
18995 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
18996 }
18997 }
18998 }
18999
19000 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
19001 TEST_REQUIRES_ARM_NEON_FMA;
19002 for (uint32_t m = 1; m <= 6; m++) {
19003 GemmMicrokernelTester()
19004 .mr(6)
19005 .nr(8)
19006 .kr(1)
19007 .sr(1)
19008 .m(m)
19009 .n(8)
19010 .k(2)
19011 .iterations(1)
19012 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19013 }
19014 }
19015
19016 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
19017 TEST_REQUIRES_ARM_NEON_FMA;
19018 for (uint32_t n = 1; n <= 8; n++) {
19019 GemmMicrokernelTester()
19020 .mr(6)
19021 .nr(8)
19022 .kr(1)
19023 .sr(1)
19024 .m(6)
19025 .n(n)
19026 .k(2)
19027 .iterations(1)
19028 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19029 }
19030 }
19031
19032 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_lt_2) {
19033 TEST_REQUIRES_ARM_NEON_FMA;
19034 for (size_t k = 1; k < 2; k++) {
19035 GemmMicrokernelTester()
19036 .mr(6)
19037 .nr(8)
19038 .kr(1)
19039 .sr(1)
19040 .m(6)
19041 .n(8)
19042 .k(k)
19043 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19044 }
19045 }
19046
19047 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
19048 TEST_REQUIRES_ARM_NEON_FMA;
19049 for (size_t k = 1; k < 2; k++) {
19050 GemmMicrokernelTester()
19051 .mr(6)
19052 .nr(8)
19053 .kr(1)
19054 .sr(1)
19055 .m(6)
19056 .n(8)
19057 .k(k)
19058 .a_stride(5)
19059 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19060 }
19061 }
19062
19063 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
19064 TEST_REQUIRES_ARM_NEON_FMA;
19065 for (size_t k = 1; k < 2; k++) {
19066 for (uint32_t m = 1; m <= 6; m++) {
19067 for (uint32_t n = 1; n <= 8; n++) {
19068 GemmMicrokernelTester()
19069 .mr(6)
19070 .nr(8)
19071 .kr(1)
19072 .sr(1)
19073 .m(m)
19074 .n(n)
19075 .k(k)
19076 .iterations(1)
19077 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19078 }
19079 }
19080 }
19081 }
19082
19083 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_gt_2) {
19084 TEST_REQUIRES_ARM_NEON_FMA;
19085 for (size_t k = 3; k < 4; k++) {
19086 GemmMicrokernelTester()
19087 .mr(6)
19088 .nr(8)
19089 .kr(1)
19090 .sr(1)
19091 .m(6)
19092 .n(8)
19093 .k(k)
19094 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19095 }
19096 }
19097
19098 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
19099 TEST_REQUIRES_ARM_NEON_FMA;
19100 for (size_t k = 3; k < 4; k++) {
19101 GemmMicrokernelTester()
19102 .mr(6)
19103 .nr(8)
19104 .kr(1)
19105 .sr(1)
19106 .m(6)
19107 .n(8)
19108 .k(k)
19109 .a_stride(7)
19110 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19111 }
19112 }
19113
19114 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
19115 TEST_REQUIRES_ARM_NEON_FMA;
19116 for (size_t k = 3; k < 4; k++) {
19117 for (uint32_t m = 1; m <= 6; m++) {
19118 for (uint32_t n = 1; n <= 8; n++) {
19119 GemmMicrokernelTester()
19120 .mr(6)
19121 .nr(8)
19122 .kr(1)
19123 .sr(1)
19124 .m(m)
19125 .n(n)
19126 .k(k)
19127 .iterations(1)
19128 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19129 }
19130 }
19131 }
19132 }
19133
19134 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_div_2) {
19135 TEST_REQUIRES_ARM_NEON_FMA;
19136 for (size_t k = 4; k <= 20; k += 2) {
19137 GemmMicrokernelTester()
19138 .mr(6)
19139 .nr(8)
19140 .kr(1)
19141 .sr(1)
19142 .m(6)
19143 .n(8)
19144 .k(k)
19145 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19146 }
19147 }
19148
19149 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
19150 TEST_REQUIRES_ARM_NEON_FMA;
19151 for (size_t k = 4; k <= 20; k += 2) {
19152 GemmMicrokernelTester()
19153 .mr(6)
19154 .nr(8)
19155 .kr(1)
19156 .sr(1)
19157 .m(6)
19158 .n(8)
19159 .k(k)
19160 .a_stride(23)
19161 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19162 }
19163 }
19164
19165 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
19166 TEST_REQUIRES_ARM_NEON_FMA;
19167 for (size_t k = 4; k <= 20; k += 2) {
19168 for (uint32_t m = 1; m <= 6; m++) {
19169 for (uint32_t n = 1; n <= 8; n++) {
19170 GemmMicrokernelTester()
19171 .mr(6)
19172 .nr(8)
19173 .kr(1)
19174 .sr(1)
19175 .m(m)
19176 .n(n)
19177 .k(k)
19178 .iterations(1)
19179 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19180 }
19181 }
19182 }
19183 }
19184
19185 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8) {
19186 TEST_REQUIRES_ARM_NEON_FMA;
19187 for (uint32_t n = 9; n < 16; n++) {
19188 for (size_t k = 1; k <= 10; k += 3) {
19189 GemmMicrokernelTester()
19190 .mr(6)
19191 .nr(8)
19192 .kr(1)
19193 .sr(1)
19194 .m(6)
19195 .n(8)
19196 .k(k)
19197 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19198 }
19199 }
19200 }
19201
19202 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
19203 TEST_REQUIRES_ARM_NEON_FMA;
19204 for (uint32_t n = 9; n < 16; n++) {
19205 for (size_t k = 1; k <= 10; k += 3) {
19206 GemmMicrokernelTester()
19207 .mr(6)
19208 .nr(8)
19209 .kr(1)
19210 .sr(1)
19211 .m(6)
19212 .n(8)
19213 .k(k)
19214 .cn_stride(11)
19215 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19216 }
19217 }
19218 }
19219
19220 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
19221 TEST_REQUIRES_ARM_NEON_FMA;
19222 for (uint32_t n = 9; n < 16; n++) {
19223 for (size_t k = 1; k <= 10; k += 3) {
19224 GemmMicrokernelTester()
19225 .mr(6)
19226 .nr(8)
19227 .kr(1)
19228 .sr(1)
19229 .m(6)
19230 .n(n)
19231 .k(k)
19232 .a_stride(13)
19233 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19234 }
19235 }
19236 }
19237
19238 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
19239 TEST_REQUIRES_ARM_NEON_FMA;
19240 for (uint32_t n = 9; n < 16; n++) {
19241 for (size_t k = 1; k <= 10; k += 3) {
19242 for (uint32_t m = 1; m <= 6; m++) {
19243 GemmMicrokernelTester()
19244 .mr(6)
19245 .nr(8)
19246 .kr(1)
19247 .sr(1)
19248 .m(m)
19249 .n(n)
19250 .k(k)
19251 .iterations(1)
19252 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19253 }
19254 }
19255 }
19256 }
19257
19258 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8) {
19259 TEST_REQUIRES_ARM_NEON_FMA;
19260 for (uint32_t n = 16; n <= 24; n += 8) {
19261 for (size_t k = 1; k <= 10; k += 3) {
19262 GemmMicrokernelTester()
19263 .mr(6)
19264 .nr(8)
19265 .kr(1)
19266 .sr(1)
19267 .m(6)
19268 .n(8)
19269 .k(k)
19270 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19271 }
19272 }
19273 }
19274
19275 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
19276 TEST_REQUIRES_ARM_NEON_FMA;
19277 for (uint32_t n = 16; n <= 24; n += 8) {
19278 for (size_t k = 1; k <= 10; k += 3) {
19279 GemmMicrokernelTester()
19280 .mr(6)
19281 .nr(8)
19282 .kr(1)
19283 .sr(1)
19284 .m(6)
19285 .n(n)
19286 .k(k)
19287 .cn_stride(11)
19288 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19289 }
19290 }
19291 }
19292
19293 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
19294 TEST_REQUIRES_ARM_NEON_FMA;
19295 for (uint32_t n = 16; n <= 24; n += 8) {
19296 for (size_t k = 1; k <= 10; k += 3) {
19297 GemmMicrokernelTester()
19298 .mr(6)
19299 .nr(8)
19300 .kr(1)
19301 .sr(1)
19302 .m(6)
19303 .n(n)
19304 .k(k)
19305 .a_stride(13)
19306 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19307 }
19308 }
19309 }
19310
19311 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
19312 TEST_REQUIRES_ARM_NEON_FMA;
19313 for (uint32_t n = 16; n <= 24; n += 8) {
19314 for (size_t k = 1; k <= 10; k += 3) {
19315 for (uint32_t m = 1; m <= 6; m++) {
19316 GemmMicrokernelTester()
19317 .mr(6)
19318 .nr(8)
19319 .kr(1)
19320 .sr(1)
19321 .m(m)
19322 .n(n)
19323 .k(k)
19324 .iterations(1)
19325 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19326 }
19327 }
19328 }
19329 }
19330
19331 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
19332 TEST_REQUIRES_ARM_NEON_FMA;
19333 for (size_t k = 1; k <= 10; k += 3) {
19334 for (uint32_t m = 1; m <= 6; m++) {
19335 for (uint32_t n = 1; n <= 8; n++) {
19336 GemmMicrokernelTester()
19337 .mr(6)
19338 .nr(8)
19339 .kr(1)
19340 .sr(1)
19341 .m(m)
19342 .n(n)
19343 .k(k)
19344 .cm_stride(11)
19345 .iterations(1)
19346 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19347 }
19348 }
19349 }
19350 }
19351
19352 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, qmin) {
19353 TEST_REQUIRES_ARM_NEON_FMA;
19354 GemmMicrokernelTester()
19355 .mr(6)
19356 .nr(8)
19357 .kr(1)
19358 .sr(1)
19359 .m(6)
19360 .n(8)
19361 .k(2)
19362 .qmin(128)
19363 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19364 }
19365
19366 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, qmax) {
19367 TEST_REQUIRES_ARM_NEON_FMA;
19368 GemmMicrokernelTester()
19369 .mr(6)
19370 .nr(8)
19371 .kr(1)
19372 .sr(1)
19373 .m(6)
19374 .n(8)
19375 .k(2)
19376 .qmax(128)
19377 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19378 }
19379
19380 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, strided_cm) {
19381 TEST_REQUIRES_ARM_NEON_FMA;
19382 GemmMicrokernelTester()
19383 .mr(6)
19384 .nr(8)
19385 .kr(1)
19386 .sr(1)
19387 .m(6)
19388 .n(8)
19389 .k(2)
19390 .cm_stride(11)
19391 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
19392 }
19393#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19394
19395
19396#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19397 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4) {
19398 TEST_REQUIRES_ARM_NEON_FMA;
19399 GemmMicrokernelTester()
19400 .mr(6)
19401 .nr(8)
19402 .kr(1)
19403 .sr(1)
19404 .m(6)
19405 .n(8)
19406 .k(4)
19407 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19408 }
19409
19410 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, strided_cn) {
19411 TEST_REQUIRES_ARM_NEON_FMA;
19412 GemmMicrokernelTester()
19413 .mr(6)
19414 .nr(8)
19415 .kr(1)
19416 .sr(1)
19417 .m(6)
19418 .n(8)
19419 .k(4)
19420 .cn_stride(11)
19421 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19422 }
19423
19424 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
19425 TEST_REQUIRES_ARM_NEON_FMA;
19426 GemmMicrokernelTester()
19427 .mr(6)
19428 .nr(8)
19429 .kr(1)
19430 .sr(1)
19431 .m(6)
19432 .n(8)
19433 .k(4)
19434 .a_stride(7)
19435 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19436 }
19437
19438 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
19439 TEST_REQUIRES_ARM_NEON_FMA;
19440 for (uint32_t m = 1; m <= 6; m++) {
19441 for (uint32_t n = 1; n <= 8; n++) {
19442 GemmMicrokernelTester()
19443 .mr(6)
19444 .nr(8)
19445 .kr(1)
19446 .sr(1)
19447 .m(m)
19448 .n(n)
19449 .k(4)
19450 .iterations(1)
19451 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19452 }
19453 }
19454 }
19455
19456 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
19457 TEST_REQUIRES_ARM_NEON_FMA;
19458 for (uint32_t m = 1; m <= 6; m++) {
19459 GemmMicrokernelTester()
19460 .mr(6)
19461 .nr(8)
19462 .kr(1)
19463 .sr(1)
19464 .m(m)
19465 .n(8)
19466 .k(4)
19467 .iterations(1)
19468 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19469 }
19470 }
19471
19472 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
19473 TEST_REQUIRES_ARM_NEON_FMA;
19474 for (uint32_t n = 1; n <= 8; n++) {
19475 GemmMicrokernelTester()
19476 .mr(6)
19477 .nr(8)
19478 .kr(1)
19479 .sr(1)
19480 .m(6)
19481 .n(n)
19482 .k(4)
19483 .iterations(1)
19484 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19485 }
19486 }
19487
19488 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_lt_4) {
19489 TEST_REQUIRES_ARM_NEON_FMA;
19490 for (size_t k = 1; k < 4; k++) {
19491 GemmMicrokernelTester()
19492 .mr(6)
19493 .nr(8)
19494 .kr(1)
19495 .sr(1)
19496 .m(6)
19497 .n(8)
19498 .k(k)
19499 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19500 }
19501 }
19502
19503 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
19504 TEST_REQUIRES_ARM_NEON_FMA;
19505 for (size_t k = 1; k < 4; k++) {
19506 GemmMicrokernelTester()
19507 .mr(6)
19508 .nr(8)
19509 .kr(1)
19510 .sr(1)
19511 .m(6)
19512 .n(8)
19513 .k(k)
19514 .a_stride(7)
19515 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19516 }
19517 }
19518
19519 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
19520 TEST_REQUIRES_ARM_NEON_FMA;
19521 for (size_t k = 1; k < 4; k++) {
19522 for (uint32_t m = 1; m <= 6; m++) {
19523 for (uint32_t n = 1; n <= 8; n++) {
19524 GemmMicrokernelTester()
19525 .mr(6)
19526 .nr(8)
19527 .kr(1)
19528 .sr(1)
19529 .m(m)
19530 .n(n)
19531 .k(k)
19532 .iterations(1)
19533 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19534 }
19535 }
19536 }
19537 }
19538
19539 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_gt_4) {
19540 TEST_REQUIRES_ARM_NEON_FMA;
19541 for (size_t k = 5; k < 8; k++) {
19542 GemmMicrokernelTester()
19543 .mr(6)
19544 .nr(8)
19545 .kr(1)
19546 .sr(1)
19547 .m(6)
19548 .n(8)
19549 .k(k)
19550 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19551 }
19552 }
19553
19554 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
19555 TEST_REQUIRES_ARM_NEON_FMA;
19556 for (size_t k = 5; k < 8; k++) {
19557 GemmMicrokernelTester()
19558 .mr(6)
19559 .nr(8)
19560 .kr(1)
19561 .sr(1)
19562 .m(6)
19563 .n(8)
19564 .k(k)
19565 .a_stride(11)
19566 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19567 }
19568 }
19569
19570 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
19571 TEST_REQUIRES_ARM_NEON_FMA;
19572 for (size_t k = 5; k < 8; k++) {
19573 for (uint32_t m = 1; m <= 6; m++) {
19574 for (uint32_t n = 1; n <= 8; n++) {
19575 GemmMicrokernelTester()
19576 .mr(6)
19577 .nr(8)
19578 .kr(1)
19579 .sr(1)
19580 .m(m)
19581 .n(n)
19582 .k(k)
19583 .iterations(1)
19584 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19585 }
19586 }
19587 }
19588 }
19589
19590 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_div_4) {
19591 TEST_REQUIRES_ARM_NEON_FMA;
19592 for (size_t k = 8; k <= 40; k += 4) {
19593 GemmMicrokernelTester()
19594 .mr(6)
19595 .nr(8)
19596 .kr(1)
19597 .sr(1)
19598 .m(6)
19599 .n(8)
19600 .k(k)
19601 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19602 }
19603 }
19604
19605 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
19606 TEST_REQUIRES_ARM_NEON_FMA;
19607 for (size_t k = 8; k <= 40; k += 4) {
19608 GemmMicrokernelTester()
19609 .mr(6)
19610 .nr(8)
19611 .kr(1)
19612 .sr(1)
19613 .m(6)
19614 .n(8)
19615 .k(k)
19616 .a_stride(43)
19617 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19618 }
19619 }
19620
19621 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
19622 TEST_REQUIRES_ARM_NEON_FMA;
19623 for (size_t k = 8; k <= 40; k += 4) {
19624 for (uint32_t m = 1; m <= 6; m++) {
19625 for (uint32_t n = 1; n <= 8; n++) {
19626 GemmMicrokernelTester()
19627 .mr(6)
19628 .nr(8)
19629 .kr(1)
19630 .sr(1)
19631 .m(m)
19632 .n(n)
19633 .k(k)
19634 .iterations(1)
19635 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19636 }
19637 }
19638 }
19639 }
19640
19641 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8) {
19642 TEST_REQUIRES_ARM_NEON_FMA;
19643 for (uint32_t n = 9; n < 16; n++) {
19644 for (size_t k = 1; k <= 20; k += 5) {
19645 GemmMicrokernelTester()
19646 .mr(6)
19647 .nr(8)
19648 .kr(1)
19649 .sr(1)
19650 .m(6)
19651 .n(8)
19652 .k(k)
19653 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19654 }
19655 }
19656 }
19657
19658 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
19659 TEST_REQUIRES_ARM_NEON_FMA;
19660 for (uint32_t n = 9; n < 16; n++) {
19661 for (size_t k = 1; k <= 20; k += 5) {
19662 GemmMicrokernelTester()
19663 .mr(6)
19664 .nr(8)
19665 .kr(1)
19666 .sr(1)
19667 .m(6)
19668 .n(8)
19669 .k(k)
19670 .cn_stride(11)
19671 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19672 }
19673 }
19674 }
19675
19676 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
19677 TEST_REQUIRES_ARM_NEON_FMA;
19678 for (uint32_t n = 9; n < 16; n++) {
19679 for (size_t k = 1; k <= 20; k += 5) {
19680 GemmMicrokernelTester()
19681 .mr(6)
19682 .nr(8)
19683 .kr(1)
19684 .sr(1)
19685 .m(6)
19686 .n(n)
19687 .k(k)
19688 .a_stride(23)
19689 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19690 }
19691 }
19692 }
19693
19694 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
19695 TEST_REQUIRES_ARM_NEON_FMA;
19696 for (uint32_t n = 9; n < 16; n++) {
19697 for (size_t k = 1; k <= 20; k += 5) {
19698 for (uint32_t m = 1; m <= 6; m++) {
19699 GemmMicrokernelTester()
19700 .mr(6)
19701 .nr(8)
19702 .kr(1)
19703 .sr(1)
19704 .m(m)
19705 .n(n)
19706 .k(k)
19707 .iterations(1)
19708 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19709 }
19710 }
19711 }
19712 }
19713
19714 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8) {
19715 TEST_REQUIRES_ARM_NEON_FMA;
19716 for (uint32_t n = 16; n <= 24; n += 8) {
19717 for (size_t k = 1; k <= 20; k += 5) {
19718 GemmMicrokernelTester()
19719 .mr(6)
19720 .nr(8)
19721 .kr(1)
19722 .sr(1)
19723 .m(6)
19724 .n(8)
19725 .k(k)
19726 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19727 }
19728 }
19729 }
19730
19731 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
19732 TEST_REQUIRES_ARM_NEON_FMA;
19733 for (uint32_t n = 16; n <= 24; n += 8) {
19734 for (size_t k = 1; k <= 20; k += 5) {
19735 GemmMicrokernelTester()
19736 .mr(6)
19737 .nr(8)
19738 .kr(1)
19739 .sr(1)
19740 .m(6)
19741 .n(n)
19742 .k(k)
19743 .cn_stride(11)
19744 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19745 }
19746 }
19747 }
19748
19749 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
19750 TEST_REQUIRES_ARM_NEON_FMA;
19751 for (uint32_t n = 16; n <= 24; n += 8) {
19752 for (size_t k = 1; k <= 20; k += 5) {
19753 GemmMicrokernelTester()
19754 .mr(6)
19755 .nr(8)
19756 .kr(1)
19757 .sr(1)
19758 .m(6)
19759 .n(n)
19760 .k(k)
19761 .a_stride(23)
19762 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19763 }
19764 }
19765 }
19766
19767 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
19768 TEST_REQUIRES_ARM_NEON_FMA;
19769 for (uint32_t n = 16; n <= 24; n += 8) {
19770 for (size_t k = 1; k <= 20; k += 5) {
19771 for (uint32_t m = 1; m <= 6; m++) {
19772 GemmMicrokernelTester()
19773 .mr(6)
19774 .nr(8)
19775 .kr(1)
19776 .sr(1)
19777 .m(m)
19778 .n(n)
19779 .k(k)
19780 .iterations(1)
19781 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19782 }
19783 }
19784 }
19785 }
19786
19787 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
19788 TEST_REQUIRES_ARM_NEON_FMA;
19789 for (size_t k = 1; k <= 20; k += 5) {
19790 for (uint32_t m = 1; m <= 6; m++) {
19791 for (uint32_t n = 1; n <= 8; n++) {
19792 GemmMicrokernelTester()
19793 .mr(6)
19794 .nr(8)
19795 .kr(1)
19796 .sr(1)
19797 .m(m)
19798 .n(n)
19799 .k(k)
19800 .cm_stride(11)
19801 .iterations(1)
19802 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19803 }
19804 }
19805 }
19806 }
19807
19808 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, qmin) {
19809 TEST_REQUIRES_ARM_NEON_FMA;
19810 GemmMicrokernelTester()
19811 .mr(6)
19812 .nr(8)
19813 .kr(1)
19814 .sr(1)
19815 .m(6)
19816 .n(8)
19817 .k(4)
19818 .qmin(128)
19819 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19820 }
19821
19822 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, qmax) {
19823 TEST_REQUIRES_ARM_NEON_FMA;
19824 GemmMicrokernelTester()
19825 .mr(6)
19826 .nr(8)
19827 .kr(1)
19828 .sr(1)
19829 .m(6)
19830 .n(8)
19831 .k(4)
19832 .qmax(128)
19833 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19834 }
19835
19836 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, strided_cm) {
19837 TEST_REQUIRES_ARM_NEON_FMA;
19838 GemmMicrokernelTester()
19839 .mr(6)
19840 .nr(8)
19841 .kr(1)
19842 .sr(1)
19843 .m(6)
19844 .n(8)
19845 .k(4)
19846 .cm_stride(11)
19847 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
19848 }
19849#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19850
19851
19852#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19853 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4) {
19854 TEST_REQUIRES_ARM_NEON;
19855 GemmMicrokernelTester()
19856 .mr(1)
19857 .nr(8)
19858 .kr(1)
19859 .sr(4)
19860 .m(1)
19861 .n(8)
19862 .k(4)
19863 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19864 }
19865
19866 TEST(F32_GEMMINC_1X8S4__NEON, strided_cn) {
19867 TEST_REQUIRES_ARM_NEON;
19868 GemmMicrokernelTester()
19869 .mr(1)
19870 .nr(8)
19871 .kr(1)
19872 .sr(4)
19873 .m(1)
19874 .n(8)
19875 .k(4)
19876 .cn_stride(11)
19877 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19878 }
19879
19880 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_strided_a) {
19881 TEST_REQUIRES_ARM_NEON;
19882 GemmMicrokernelTester()
19883 .mr(1)
19884 .nr(8)
19885 .kr(1)
19886 .sr(4)
19887 .m(1)
19888 .n(8)
19889 .k(4)
19890 .a_stride(7)
19891 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19892 }
19893
19894 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_subtile) {
19895 TEST_REQUIRES_ARM_NEON;
19896 for (uint32_t m = 1; m <= 1; m++) {
19897 for (uint32_t n = 1; n <= 8; n++) {
19898 GemmMicrokernelTester()
19899 .mr(1)
19900 .nr(8)
19901 .kr(1)
19902 .sr(4)
19903 .m(m)
19904 .n(n)
19905 .k(4)
19906 .iterations(1)
19907 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19908 }
19909 }
19910 }
19911
19912 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_subtile_m) {
19913 TEST_REQUIRES_ARM_NEON;
19914 for (uint32_t m = 1; m <= 1; m++) {
19915 GemmMicrokernelTester()
19916 .mr(1)
19917 .nr(8)
19918 .kr(1)
19919 .sr(4)
19920 .m(m)
19921 .n(8)
19922 .k(4)
19923 .iterations(1)
19924 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19925 }
19926 }
19927
19928 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_subtile_n) {
19929 TEST_REQUIRES_ARM_NEON;
19930 for (uint32_t n = 1; n <= 8; n++) {
19931 GemmMicrokernelTester()
19932 .mr(1)
19933 .nr(8)
19934 .kr(1)
19935 .sr(4)
19936 .m(1)
19937 .n(n)
19938 .k(4)
19939 .iterations(1)
19940 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19941 }
19942 }
19943
19944 TEST(F32_GEMMINC_1X8S4__NEON, k_lt_4) {
19945 TEST_REQUIRES_ARM_NEON;
19946 for (size_t k = 1; k < 4; k++) {
19947 GemmMicrokernelTester()
19948 .mr(1)
19949 .nr(8)
19950 .kr(1)
19951 .sr(4)
19952 .m(1)
19953 .n(8)
19954 .k(k)
19955 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19956 }
19957 }
19958
19959 TEST(F32_GEMMINC_1X8S4__NEON, k_lt_4_strided_a) {
19960 TEST_REQUIRES_ARM_NEON;
19961 for (size_t k = 1; k < 4; k++) {
19962 GemmMicrokernelTester()
19963 .mr(1)
19964 .nr(8)
19965 .kr(1)
19966 .sr(4)
19967 .m(1)
19968 .n(8)
19969 .k(k)
19970 .a_stride(7)
19971 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19972 }
19973 }
19974
19975 TEST(F32_GEMMINC_1X8S4__NEON, k_lt_4_subtile) {
19976 TEST_REQUIRES_ARM_NEON;
19977 for (size_t k = 1; k < 4; k++) {
19978 for (uint32_t m = 1; m <= 1; m++) {
19979 for (uint32_t n = 1; n <= 8; n++) {
19980 GemmMicrokernelTester()
19981 .mr(1)
19982 .nr(8)
19983 .kr(1)
19984 .sr(4)
19985 .m(m)
19986 .n(n)
19987 .k(k)
19988 .iterations(1)
19989 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
19990 }
19991 }
19992 }
19993 }
19994
19995 TEST(F32_GEMMINC_1X8S4__NEON, k_gt_4) {
19996 TEST_REQUIRES_ARM_NEON;
19997 for (size_t k = 5; k < 8; k++) {
19998 GemmMicrokernelTester()
19999 .mr(1)
20000 .nr(8)
20001 .kr(1)
20002 .sr(4)
20003 .m(1)
20004 .n(8)
20005 .k(k)
20006 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20007 }
20008 }
20009
20010 TEST(F32_GEMMINC_1X8S4__NEON, k_gt_4_strided_a) {
20011 TEST_REQUIRES_ARM_NEON;
20012 for (size_t k = 5; k < 8; k++) {
20013 GemmMicrokernelTester()
20014 .mr(1)
20015 .nr(8)
20016 .kr(1)
20017 .sr(4)
20018 .m(1)
20019 .n(8)
20020 .k(k)
20021 .a_stride(11)
20022 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20023 }
20024 }
20025
20026 TEST(F32_GEMMINC_1X8S4__NEON, k_gt_4_subtile) {
20027 TEST_REQUIRES_ARM_NEON;
20028 for (size_t k = 5; k < 8; k++) {
20029 for (uint32_t m = 1; m <= 1; m++) {
20030 for (uint32_t n = 1; n <= 8; n++) {
20031 GemmMicrokernelTester()
20032 .mr(1)
20033 .nr(8)
20034 .kr(1)
20035 .sr(4)
20036 .m(m)
20037 .n(n)
20038 .k(k)
20039 .iterations(1)
20040 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20041 }
20042 }
20043 }
20044 }
20045
20046 TEST(F32_GEMMINC_1X8S4__NEON, k_div_4) {
20047 TEST_REQUIRES_ARM_NEON;
20048 for (size_t k = 8; k <= 40; k += 4) {
20049 GemmMicrokernelTester()
20050 .mr(1)
20051 .nr(8)
20052 .kr(1)
20053 .sr(4)
20054 .m(1)
20055 .n(8)
20056 .k(k)
20057 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20058 }
20059 }
20060
20061 TEST(F32_GEMMINC_1X8S4__NEON, k_div_4_strided_a) {
20062 TEST_REQUIRES_ARM_NEON;
20063 for (size_t k = 8; k <= 40; k += 4) {
20064 GemmMicrokernelTester()
20065 .mr(1)
20066 .nr(8)
20067 .kr(1)
20068 .sr(4)
20069 .m(1)
20070 .n(8)
20071 .k(k)
20072 .a_stride(43)
20073 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20074 }
20075 }
20076
20077 TEST(F32_GEMMINC_1X8S4__NEON, k_div_4_subtile) {
20078 TEST_REQUIRES_ARM_NEON;
20079 for (size_t k = 8; k <= 40; k += 4) {
20080 for (uint32_t m = 1; m <= 1; m++) {
20081 for (uint32_t n = 1; n <= 8; n++) {
20082 GemmMicrokernelTester()
20083 .mr(1)
20084 .nr(8)
20085 .kr(1)
20086 .sr(4)
20087 .m(m)
20088 .n(n)
20089 .k(k)
20090 .iterations(1)
20091 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20092 }
20093 }
20094 }
20095 }
20096
20097 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8) {
20098 TEST_REQUIRES_ARM_NEON;
20099 for (uint32_t n = 9; n < 16; n++) {
20100 for (size_t k = 1; k <= 20; k += 5) {
20101 GemmMicrokernelTester()
20102 .mr(1)
20103 .nr(8)
20104 .kr(1)
20105 .sr(4)
20106 .m(1)
20107 .n(8)
20108 .k(k)
20109 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20110 }
20111 }
20112 }
20113
20114 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8_strided_cn) {
20115 TEST_REQUIRES_ARM_NEON;
20116 for (uint32_t n = 9; n < 16; n++) {
20117 for (size_t k = 1; k <= 20; k += 5) {
20118 GemmMicrokernelTester()
20119 .mr(1)
20120 .nr(8)
20121 .kr(1)
20122 .sr(4)
20123 .m(1)
20124 .n(8)
20125 .k(k)
20126 .cn_stride(11)
20127 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20128 }
20129 }
20130 }
20131
20132 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8_strided_a) {
20133 TEST_REQUIRES_ARM_NEON;
20134 for (uint32_t n = 9; n < 16; n++) {
20135 for (size_t k = 1; k <= 20; k += 5) {
20136 GemmMicrokernelTester()
20137 .mr(1)
20138 .nr(8)
20139 .kr(1)
20140 .sr(4)
20141 .m(1)
20142 .n(n)
20143 .k(k)
20144 .a_stride(23)
20145 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20146 }
20147 }
20148 }
20149
20150 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8_subtile) {
20151 TEST_REQUIRES_ARM_NEON;
20152 for (uint32_t n = 9; n < 16; n++) {
20153 for (size_t k = 1; k <= 20; k += 5) {
20154 for (uint32_t m = 1; m <= 1; m++) {
20155 GemmMicrokernelTester()
20156 .mr(1)
20157 .nr(8)
20158 .kr(1)
20159 .sr(4)
20160 .m(m)
20161 .n(n)
20162 .k(k)
20163 .iterations(1)
20164 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20165 }
20166 }
20167 }
20168 }
20169
20170 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8) {
20171 TEST_REQUIRES_ARM_NEON;
20172 for (uint32_t n = 16; n <= 24; n += 8) {
20173 for (size_t k = 1; k <= 20; k += 5) {
20174 GemmMicrokernelTester()
20175 .mr(1)
20176 .nr(8)
20177 .kr(1)
20178 .sr(4)
20179 .m(1)
20180 .n(8)
20181 .k(k)
20182 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20183 }
20184 }
20185 }
20186
20187 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8_strided_cn) {
20188 TEST_REQUIRES_ARM_NEON;
20189 for (uint32_t n = 16; n <= 24; n += 8) {
20190 for (size_t k = 1; k <= 20; k += 5) {
20191 GemmMicrokernelTester()
20192 .mr(1)
20193 .nr(8)
20194 .kr(1)
20195 .sr(4)
20196 .m(1)
20197 .n(n)
20198 .k(k)
20199 .cn_stride(11)
20200 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20201 }
20202 }
20203 }
20204
20205 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8_strided_a) {
20206 TEST_REQUIRES_ARM_NEON;
20207 for (uint32_t n = 16; n <= 24; n += 8) {
20208 for (size_t k = 1; k <= 20; k += 5) {
20209 GemmMicrokernelTester()
20210 .mr(1)
20211 .nr(8)
20212 .kr(1)
20213 .sr(4)
20214 .m(1)
20215 .n(n)
20216 .k(k)
20217 .a_stride(23)
20218 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20219 }
20220 }
20221 }
20222
20223 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8_subtile) {
20224 TEST_REQUIRES_ARM_NEON;
20225 for (uint32_t n = 16; n <= 24; n += 8) {
20226 for (size_t k = 1; k <= 20; k += 5) {
20227 for (uint32_t m = 1; m <= 1; m++) {
20228 GemmMicrokernelTester()
20229 .mr(1)
20230 .nr(8)
20231 .kr(1)
20232 .sr(4)
20233 .m(m)
20234 .n(n)
20235 .k(k)
20236 .iterations(1)
20237 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20238 }
20239 }
20240 }
20241 }
20242
20243 TEST(F32_GEMMINC_1X8S4__NEON, strided_cm_subtile) {
20244 TEST_REQUIRES_ARM_NEON;
20245 for (size_t k = 1; k <= 20; k += 5) {
20246 for (uint32_t m = 1; m <= 1; m++) {
20247 for (uint32_t n = 1; n <= 8; n++) {
20248 GemmMicrokernelTester()
20249 .mr(1)
20250 .nr(8)
20251 .kr(1)
20252 .sr(4)
20253 .m(m)
20254 .n(n)
20255 .k(k)
20256 .cm_stride(11)
20257 .iterations(1)
20258 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20259 }
20260 }
20261 }
20262 }
20263
20264 TEST(F32_GEMMINC_1X8S4__NEON, qmin) {
20265 TEST_REQUIRES_ARM_NEON;
20266 GemmMicrokernelTester()
20267 .mr(1)
20268 .nr(8)
20269 .kr(1)
20270 .sr(4)
20271 .m(1)
20272 .n(8)
20273 .k(4)
20274 .qmin(128)
20275 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20276 }
20277
20278 TEST(F32_GEMMINC_1X8S4__NEON, qmax) {
20279 TEST_REQUIRES_ARM_NEON;
20280 GemmMicrokernelTester()
20281 .mr(1)
20282 .nr(8)
20283 .kr(1)
20284 .sr(4)
20285 .m(1)
20286 .n(8)
20287 .k(4)
20288 .qmax(128)
20289 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20290 }
20291
20292 TEST(F32_GEMMINC_1X8S4__NEON, strided_cm) {
20293 TEST_REQUIRES_ARM_NEON;
20294 GemmMicrokernelTester()
20295 .mr(1)
20296 .nr(8)
20297 .kr(1)
20298 .sr(4)
20299 .m(1)
20300 .n(8)
20301 .k(4)
20302 .cm_stride(11)
20303 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
20304 }
20305#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20306
20307
20308#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20309 TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4) {
20310 TEST_REQUIRES_ARM_NEON;
20311 GemmMicrokernelTester()
20312 .mr(4)
20313 .nr(8)
20314 .kr(1)
20315 .sr(4)
20316 .m(4)
20317 .n(8)
20318 .k(4)
20319 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20320 }
20321
20322 TEST(F32_GEMMINC_4X8S4__NEON, strided_cn) {
20323 TEST_REQUIRES_ARM_NEON;
20324 GemmMicrokernelTester()
20325 .mr(4)
20326 .nr(8)
20327 .kr(1)
20328 .sr(4)
20329 .m(4)
20330 .n(8)
20331 .k(4)
20332 .cn_stride(11)
20333 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20334 }
20335
20336 TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_strided_a) {
20337 TEST_REQUIRES_ARM_NEON;
20338 GemmMicrokernelTester()
20339 .mr(4)
20340 .nr(8)
20341 .kr(1)
20342 .sr(4)
20343 .m(4)
20344 .n(8)
20345 .k(4)
20346 .a_stride(7)
20347 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20348 }
20349
20350 TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_subtile) {
20351 TEST_REQUIRES_ARM_NEON;
20352 for (uint32_t m = 1; m <= 4; m++) {
20353 for (uint32_t n = 1; n <= 8; n++) {
20354 GemmMicrokernelTester()
20355 .mr(4)
20356 .nr(8)
20357 .kr(1)
20358 .sr(4)
20359 .m(m)
20360 .n(n)
20361 .k(4)
20362 .iterations(1)
20363 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20364 }
20365 }
20366 }
20367
20368 TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_subtile_m) {
20369 TEST_REQUIRES_ARM_NEON;
20370 for (uint32_t m = 1; m <= 4; m++) {
20371 GemmMicrokernelTester()
20372 .mr(4)
20373 .nr(8)
20374 .kr(1)
20375 .sr(4)
20376 .m(m)
20377 .n(8)
20378 .k(4)
20379 .iterations(1)
20380 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20381 }
20382 }
20383
20384 TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_subtile_n) {
20385 TEST_REQUIRES_ARM_NEON;
20386 for (uint32_t n = 1; n <= 8; n++) {
20387 GemmMicrokernelTester()
20388 .mr(4)
20389 .nr(8)
20390 .kr(1)
20391 .sr(4)
20392 .m(4)
20393 .n(n)
20394 .k(4)
20395 .iterations(1)
20396 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20397 }
20398 }
20399
20400 TEST(F32_GEMMINC_4X8S4__NEON, k_lt_4) {
20401 TEST_REQUIRES_ARM_NEON;
20402 for (size_t k = 1; k < 4; k++) {
20403 GemmMicrokernelTester()
20404 .mr(4)
20405 .nr(8)
20406 .kr(1)
20407 .sr(4)
20408 .m(4)
20409 .n(8)
20410 .k(k)
20411 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20412 }
20413 }
20414
20415 TEST(F32_GEMMINC_4X8S4__NEON, k_lt_4_strided_a) {
20416 TEST_REQUIRES_ARM_NEON;
20417 for (size_t k = 1; k < 4; k++) {
20418 GemmMicrokernelTester()
20419 .mr(4)
20420 .nr(8)
20421 .kr(1)
20422 .sr(4)
20423 .m(4)
20424 .n(8)
20425 .k(k)
20426 .a_stride(7)
20427 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20428 }
20429 }
20430
20431 TEST(F32_GEMMINC_4X8S4__NEON, k_lt_4_subtile) {
20432 TEST_REQUIRES_ARM_NEON;
20433 for (size_t k = 1; k < 4; k++) {
20434 for (uint32_t m = 1; m <= 4; m++) {
20435 for (uint32_t n = 1; n <= 8; n++) {
20436 GemmMicrokernelTester()
20437 .mr(4)
20438 .nr(8)
20439 .kr(1)
20440 .sr(4)
20441 .m(m)
20442 .n(n)
20443 .k(k)
20444 .iterations(1)
20445 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20446 }
20447 }
20448 }
20449 }
20450
20451 TEST(F32_GEMMINC_4X8S4__NEON, k_gt_4) {
20452 TEST_REQUIRES_ARM_NEON;
20453 for (size_t k = 5; k < 8; k++) {
20454 GemmMicrokernelTester()
20455 .mr(4)
20456 .nr(8)
20457 .kr(1)
20458 .sr(4)
20459 .m(4)
20460 .n(8)
20461 .k(k)
20462 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20463 }
20464 }
20465
20466 TEST(F32_GEMMINC_4X8S4__NEON, k_gt_4_strided_a) {
20467 TEST_REQUIRES_ARM_NEON;
20468 for (size_t k = 5; k < 8; k++) {
20469 GemmMicrokernelTester()
20470 .mr(4)
20471 .nr(8)
20472 .kr(1)
20473 .sr(4)
20474 .m(4)
20475 .n(8)
20476 .k(k)
20477 .a_stride(11)
20478 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20479 }
20480 }
20481
20482 TEST(F32_GEMMINC_4X8S4__NEON, k_gt_4_subtile) {
20483 TEST_REQUIRES_ARM_NEON;
20484 for (size_t k = 5; k < 8; k++) {
20485 for (uint32_t m = 1; m <= 4; m++) {
20486 for (uint32_t n = 1; n <= 8; n++) {
20487 GemmMicrokernelTester()
20488 .mr(4)
20489 .nr(8)
20490 .kr(1)
20491 .sr(4)
20492 .m(m)
20493 .n(n)
20494 .k(k)
20495 .iterations(1)
20496 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20497 }
20498 }
20499 }
20500 }
20501
20502 TEST(F32_GEMMINC_4X8S4__NEON, k_div_4) {
20503 TEST_REQUIRES_ARM_NEON;
20504 for (size_t k = 8; k <= 40; k += 4) {
20505 GemmMicrokernelTester()
20506 .mr(4)
20507 .nr(8)
20508 .kr(1)
20509 .sr(4)
20510 .m(4)
20511 .n(8)
20512 .k(k)
20513 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20514 }
20515 }
20516
20517 TEST(F32_GEMMINC_4X8S4__NEON, k_div_4_strided_a) {
20518 TEST_REQUIRES_ARM_NEON;
20519 for (size_t k = 8; k <= 40; k += 4) {
20520 GemmMicrokernelTester()
20521 .mr(4)
20522 .nr(8)
20523 .kr(1)
20524 .sr(4)
20525 .m(4)
20526 .n(8)
20527 .k(k)
20528 .a_stride(43)
20529 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20530 }
20531 }
20532
20533 TEST(F32_GEMMINC_4X8S4__NEON, k_div_4_subtile) {
20534 TEST_REQUIRES_ARM_NEON;
20535 for (size_t k = 8; k <= 40; k += 4) {
20536 for (uint32_t m = 1; m <= 4; m++) {
20537 for (uint32_t n = 1; n <= 8; n++) {
20538 GemmMicrokernelTester()
20539 .mr(4)
20540 .nr(8)
20541 .kr(1)
20542 .sr(4)
20543 .m(m)
20544 .n(n)
20545 .k(k)
20546 .iterations(1)
20547 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20548 }
20549 }
20550 }
20551 }
20552
20553 TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8) {
20554 TEST_REQUIRES_ARM_NEON;
20555 for (uint32_t n = 9; n < 16; n++) {
20556 for (size_t k = 1; k <= 20; k += 5) {
20557 GemmMicrokernelTester()
20558 .mr(4)
20559 .nr(8)
20560 .kr(1)
20561 .sr(4)
20562 .m(4)
20563 .n(8)
20564 .k(k)
20565 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20566 }
20567 }
20568 }
20569
20570 TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8_strided_cn) {
20571 TEST_REQUIRES_ARM_NEON;
20572 for (uint32_t n = 9; n < 16; n++) {
20573 for (size_t k = 1; k <= 20; k += 5) {
20574 GemmMicrokernelTester()
20575 .mr(4)
20576 .nr(8)
20577 .kr(1)
20578 .sr(4)
20579 .m(4)
20580 .n(8)
20581 .k(k)
20582 .cn_stride(11)
20583 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20584 }
20585 }
20586 }
20587
20588 TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8_strided_a) {
20589 TEST_REQUIRES_ARM_NEON;
20590 for (uint32_t n = 9; n < 16; n++) {
20591 for (size_t k = 1; k <= 20; k += 5) {
20592 GemmMicrokernelTester()
20593 .mr(4)
20594 .nr(8)
20595 .kr(1)
20596 .sr(4)
20597 .m(4)
20598 .n(n)
20599 .k(k)
20600 .a_stride(23)
20601 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20602 }
20603 }
20604 }
20605
20606 TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8_subtile) {
20607 TEST_REQUIRES_ARM_NEON;
20608 for (uint32_t n = 9; n < 16; n++) {
20609 for (size_t k = 1; k <= 20; k += 5) {
20610 for (uint32_t m = 1; m <= 4; m++) {
20611 GemmMicrokernelTester()
20612 .mr(4)
20613 .nr(8)
20614 .kr(1)
20615 .sr(4)
20616 .m(m)
20617 .n(n)
20618 .k(k)
20619 .iterations(1)
20620 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20621 }
20622 }
20623 }
20624 }
20625
20626 TEST(F32_GEMMINC_4X8S4__NEON, n_div_8) {
20627 TEST_REQUIRES_ARM_NEON;
20628 for (uint32_t n = 16; n <= 24; n += 8) {
20629 for (size_t k = 1; k <= 20; k += 5) {
20630 GemmMicrokernelTester()
20631 .mr(4)
20632 .nr(8)
20633 .kr(1)
20634 .sr(4)
20635 .m(4)
20636 .n(8)
20637 .k(k)
20638 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20639 }
20640 }
20641 }
20642
20643 TEST(F32_GEMMINC_4X8S4__NEON, n_div_8_strided_cn) {
20644 TEST_REQUIRES_ARM_NEON;
20645 for (uint32_t n = 16; n <= 24; n += 8) {
20646 for (size_t k = 1; k <= 20; k += 5) {
20647 GemmMicrokernelTester()
20648 .mr(4)
20649 .nr(8)
20650 .kr(1)
20651 .sr(4)
20652 .m(4)
20653 .n(n)
20654 .k(k)
20655 .cn_stride(11)
20656 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20657 }
20658 }
20659 }
20660
20661 TEST(F32_GEMMINC_4X8S4__NEON, n_div_8_strided_a) {
20662 TEST_REQUIRES_ARM_NEON;
20663 for (uint32_t n = 16; n <= 24; n += 8) {
20664 for (size_t k = 1; k <= 20; k += 5) {
20665 GemmMicrokernelTester()
20666 .mr(4)
20667 .nr(8)
20668 .kr(1)
20669 .sr(4)
20670 .m(4)
20671 .n(n)
20672 .k(k)
20673 .a_stride(23)
20674 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20675 }
20676 }
20677 }
20678
20679 TEST(F32_GEMMINC_4X8S4__NEON, n_div_8_subtile) {
20680 TEST_REQUIRES_ARM_NEON;
20681 for (uint32_t n = 16; n <= 24; n += 8) {
20682 for (size_t k = 1; k <= 20; k += 5) {
20683 for (uint32_t m = 1; m <= 4; m++) {
20684 GemmMicrokernelTester()
20685 .mr(4)
20686 .nr(8)
20687 .kr(1)
20688 .sr(4)
20689 .m(m)
20690 .n(n)
20691 .k(k)
20692 .iterations(1)
20693 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20694 }
20695 }
20696 }
20697 }
20698
20699 TEST(F32_GEMMINC_4X8S4__NEON, strided_cm_subtile) {
20700 TEST_REQUIRES_ARM_NEON;
20701 for (size_t k = 1; k <= 20; k += 5) {
20702 for (uint32_t m = 1; m <= 4; m++) {
20703 for (uint32_t n = 1; n <= 8; n++) {
20704 GemmMicrokernelTester()
20705 .mr(4)
20706 .nr(8)
20707 .kr(1)
20708 .sr(4)
20709 .m(m)
20710 .n(n)
20711 .k(k)
20712 .cm_stride(11)
20713 .iterations(1)
20714 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20715 }
20716 }
20717 }
20718 }
20719
20720 TEST(F32_GEMMINC_4X8S4__NEON, qmin) {
20721 TEST_REQUIRES_ARM_NEON;
20722 GemmMicrokernelTester()
20723 .mr(4)
20724 .nr(8)
20725 .kr(1)
20726 .sr(4)
20727 .m(4)
20728 .n(8)
20729 .k(4)
20730 .qmin(128)
20731 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20732 }
20733
20734 TEST(F32_GEMMINC_4X8S4__NEON, qmax) {
20735 TEST_REQUIRES_ARM_NEON;
20736 GemmMicrokernelTester()
20737 .mr(4)
20738 .nr(8)
20739 .kr(1)
20740 .sr(4)
20741 .m(4)
20742 .n(8)
20743 .k(4)
20744 .qmax(128)
20745 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20746 }
20747
20748 TEST(F32_GEMMINC_4X8S4__NEON, strided_cm) {
20749 TEST_REQUIRES_ARM_NEON;
20750 GemmMicrokernelTester()
20751 .mr(4)
20752 .nr(8)
20753 .kr(1)
20754 .sr(4)
20755 .m(4)
20756 .n(8)
20757 .k(4)
20758 .cm_stride(11)
20759 .Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
20760 }
20761#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20762
20763
20764#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20765 TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4) {
20766 TEST_REQUIRES_ARM_NEON;
20767 GemmMicrokernelTester()
20768 .mr(6)
20769 .nr(8)
20770 .kr(1)
20771 .sr(4)
20772 .m(6)
20773 .n(8)
20774 .k(4)
20775 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20776 }
20777
20778 TEST(F32_GEMMINC_6X8S4__NEON, strided_cn) {
20779 TEST_REQUIRES_ARM_NEON;
20780 GemmMicrokernelTester()
20781 .mr(6)
20782 .nr(8)
20783 .kr(1)
20784 .sr(4)
20785 .m(6)
20786 .n(8)
20787 .k(4)
20788 .cn_stride(11)
20789 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20790 }
20791
20792 TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_strided_a) {
20793 TEST_REQUIRES_ARM_NEON;
20794 GemmMicrokernelTester()
20795 .mr(6)
20796 .nr(8)
20797 .kr(1)
20798 .sr(4)
20799 .m(6)
20800 .n(8)
20801 .k(4)
20802 .a_stride(7)
20803 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20804 }
20805
20806 TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_subtile) {
20807 TEST_REQUIRES_ARM_NEON;
20808 for (uint32_t m = 1; m <= 6; m++) {
20809 for (uint32_t n = 1; n <= 8; n++) {
20810 GemmMicrokernelTester()
20811 .mr(6)
20812 .nr(8)
20813 .kr(1)
20814 .sr(4)
20815 .m(m)
20816 .n(n)
20817 .k(4)
20818 .iterations(1)
20819 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20820 }
20821 }
20822 }
20823
20824 TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_subtile_m) {
20825 TEST_REQUIRES_ARM_NEON;
20826 for (uint32_t m = 1; m <= 6; m++) {
20827 GemmMicrokernelTester()
20828 .mr(6)
20829 .nr(8)
20830 .kr(1)
20831 .sr(4)
20832 .m(m)
20833 .n(8)
20834 .k(4)
20835 .iterations(1)
20836 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20837 }
20838 }
20839
20840 TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_subtile_n) {
20841 TEST_REQUIRES_ARM_NEON;
20842 for (uint32_t n = 1; n <= 8; n++) {
20843 GemmMicrokernelTester()
20844 .mr(6)
20845 .nr(8)
20846 .kr(1)
20847 .sr(4)
20848 .m(6)
20849 .n(n)
20850 .k(4)
20851 .iterations(1)
20852 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20853 }
20854 }
20855
20856 TEST(F32_GEMMINC_6X8S4__NEON, k_lt_4) {
20857 TEST_REQUIRES_ARM_NEON;
20858 for (size_t k = 1; k < 4; k++) {
20859 GemmMicrokernelTester()
20860 .mr(6)
20861 .nr(8)
20862 .kr(1)
20863 .sr(4)
20864 .m(6)
20865 .n(8)
20866 .k(k)
20867 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20868 }
20869 }
20870
20871 TEST(F32_GEMMINC_6X8S4__NEON, k_lt_4_strided_a) {
20872 TEST_REQUIRES_ARM_NEON;
20873 for (size_t k = 1; k < 4; k++) {
20874 GemmMicrokernelTester()
20875 .mr(6)
20876 .nr(8)
20877 .kr(1)
20878 .sr(4)
20879 .m(6)
20880 .n(8)
20881 .k(k)
20882 .a_stride(7)
20883 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20884 }
20885 }
20886
20887 TEST(F32_GEMMINC_6X8S4__NEON, k_lt_4_subtile) {
20888 TEST_REQUIRES_ARM_NEON;
20889 for (size_t k = 1; k < 4; k++) {
20890 for (uint32_t m = 1; m <= 6; m++) {
20891 for (uint32_t n = 1; n <= 8; n++) {
20892 GemmMicrokernelTester()
20893 .mr(6)
20894 .nr(8)
20895 .kr(1)
20896 .sr(4)
20897 .m(m)
20898 .n(n)
20899 .k(k)
20900 .iterations(1)
20901 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20902 }
20903 }
20904 }
20905 }
20906
20907 TEST(F32_GEMMINC_6X8S4__NEON, k_gt_4) {
20908 TEST_REQUIRES_ARM_NEON;
20909 for (size_t k = 5; k < 8; k++) {
20910 GemmMicrokernelTester()
20911 .mr(6)
20912 .nr(8)
20913 .kr(1)
20914 .sr(4)
20915 .m(6)
20916 .n(8)
20917 .k(k)
20918 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20919 }
20920 }
20921
20922 TEST(F32_GEMMINC_6X8S4__NEON, k_gt_4_strided_a) {
20923 TEST_REQUIRES_ARM_NEON;
20924 for (size_t k = 5; k < 8; k++) {
20925 GemmMicrokernelTester()
20926 .mr(6)
20927 .nr(8)
20928 .kr(1)
20929 .sr(4)
20930 .m(6)
20931 .n(8)
20932 .k(k)
20933 .a_stride(11)
20934 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20935 }
20936 }
20937
20938 TEST(F32_GEMMINC_6X8S4__NEON, k_gt_4_subtile) {
20939 TEST_REQUIRES_ARM_NEON;
20940 for (size_t k = 5; k < 8; k++) {
20941 for (uint32_t m = 1; m <= 6; m++) {
20942 for (uint32_t n = 1; n <= 8; n++) {
20943 GemmMicrokernelTester()
20944 .mr(6)
20945 .nr(8)
20946 .kr(1)
20947 .sr(4)
20948 .m(m)
20949 .n(n)
20950 .k(k)
20951 .iterations(1)
20952 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20953 }
20954 }
20955 }
20956 }
20957
20958 TEST(F32_GEMMINC_6X8S4__NEON, k_div_4) {
20959 TEST_REQUIRES_ARM_NEON;
20960 for (size_t k = 8; k <= 40; k += 4) {
20961 GemmMicrokernelTester()
20962 .mr(6)
20963 .nr(8)
20964 .kr(1)
20965 .sr(4)
20966 .m(6)
20967 .n(8)
20968 .k(k)
20969 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20970 }
20971 }
20972
20973 TEST(F32_GEMMINC_6X8S4__NEON, k_div_4_strided_a) {
20974 TEST_REQUIRES_ARM_NEON;
20975 for (size_t k = 8; k <= 40; k += 4) {
20976 GemmMicrokernelTester()
20977 .mr(6)
20978 .nr(8)
20979 .kr(1)
20980 .sr(4)
20981 .m(6)
20982 .n(8)
20983 .k(k)
20984 .a_stride(43)
20985 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
20986 }
20987 }
20988
20989 TEST(F32_GEMMINC_6X8S4__NEON, k_div_4_subtile) {
20990 TEST_REQUIRES_ARM_NEON;
20991 for (size_t k = 8; k <= 40; k += 4) {
20992 for (uint32_t m = 1; m <= 6; m++) {
20993 for (uint32_t n = 1; n <= 8; n++) {
20994 GemmMicrokernelTester()
20995 .mr(6)
20996 .nr(8)
20997 .kr(1)
20998 .sr(4)
20999 .m(m)
21000 .n(n)
21001 .k(k)
21002 .iterations(1)
21003 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21004 }
21005 }
21006 }
21007 }
21008
21009 TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8) {
21010 TEST_REQUIRES_ARM_NEON;
21011 for (uint32_t n = 9; n < 16; n++) {
21012 for (size_t k = 1; k <= 20; k += 5) {
21013 GemmMicrokernelTester()
21014 .mr(6)
21015 .nr(8)
21016 .kr(1)
21017 .sr(4)
21018 .m(6)
21019 .n(8)
21020 .k(k)
21021 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21022 }
21023 }
21024 }
21025
21026 TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8_strided_cn) {
21027 TEST_REQUIRES_ARM_NEON;
21028 for (uint32_t n = 9; n < 16; n++) {
21029 for (size_t k = 1; k <= 20; k += 5) {
21030 GemmMicrokernelTester()
21031 .mr(6)
21032 .nr(8)
21033 .kr(1)
21034 .sr(4)
21035 .m(6)
21036 .n(8)
21037 .k(k)
21038 .cn_stride(11)
21039 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21040 }
21041 }
21042 }
21043
21044 TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8_strided_a) {
21045 TEST_REQUIRES_ARM_NEON;
21046 for (uint32_t n = 9; n < 16; n++) {
21047 for (size_t k = 1; k <= 20; k += 5) {
21048 GemmMicrokernelTester()
21049 .mr(6)
21050 .nr(8)
21051 .kr(1)
21052 .sr(4)
21053 .m(6)
21054 .n(n)
21055 .k(k)
21056 .a_stride(23)
21057 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21058 }
21059 }
21060 }
21061
21062 TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8_subtile) {
21063 TEST_REQUIRES_ARM_NEON;
21064 for (uint32_t n = 9; n < 16; n++) {
21065 for (size_t k = 1; k <= 20; k += 5) {
21066 for (uint32_t m = 1; m <= 6; m++) {
21067 GemmMicrokernelTester()
21068 .mr(6)
21069 .nr(8)
21070 .kr(1)
21071 .sr(4)
21072 .m(m)
21073 .n(n)
21074 .k(k)
21075 .iterations(1)
21076 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21077 }
21078 }
21079 }
21080 }
21081
21082 TEST(F32_GEMMINC_6X8S4__NEON, n_div_8) {
21083 TEST_REQUIRES_ARM_NEON;
21084 for (uint32_t n = 16; n <= 24; n += 8) {
21085 for (size_t k = 1; k <= 20; k += 5) {
21086 GemmMicrokernelTester()
21087 .mr(6)
21088 .nr(8)
21089 .kr(1)
21090 .sr(4)
21091 .m(6)
21092 .n(8)
21093 .k(k)
21094 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21095 }
21096 }
21097 }
21098
21099 TEST(F32_GEMMINC_6X8S4__NEON, n_div_8_strided_cn) {
21100 TEST_REQUIRES_ARM_NEON;
21101 for (uint32_t n = 16; n <= 24; n += 8) {
21102 for (size_t k = 1; k <= 20; k += 5) {
21103 GemmMicrokernelTester()
21104 .mr(6)
21105 .nr(8)
21106 .kr(1)
21107 .sr(4)
21108 .m(6)
21109 .n(n)
21110 .k(k)
21111 .cn_stride(11)
21112 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21113 }
21114 }
21115 }
21116
21117 TEST(F32_GEMMINC_6X8S4__NEON, n_div_8_strided_a) {
21118 TEST_REQUIRES_ARM_NEON;
21119 for (uint32_t n = 16; n <= 24; n += 8) {
21120 for (size_t k = 1; k <= 20; k += 5) {
21121 GemmMicrokernelTester()
21122 .mr(6)
21123 .nr(8)
21124 .kr(1)
21125 .sr(4)
21126 .m(6)
21127 .n(n)
21128 .k(k)
21129 .a_stride(23)
21130 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21131 }
21132 }
21133 }
21134
21135 TEST(F32_GEMMINC_6X8S4__NEON, n_div_8_subtile) {
21136 TEST_REQUIRES_ARM_NEON;
21137 for (uint32_t n = 16; n <= 24; n += 8) {
21138 for (size_t k = 1; k <= 20; k += 5) {
21139 for (uint32_t m = 1; m <= 6; m++) {
21140 GemmMicrokernelTester()
21141 .mr(6)
21142 .nr(8)
21143 .kr(1)
21144 .sr(4)
21145 .m(m)
21146 .n(n)
21147 .k(k)
21148 .iterations(1)
21149 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21150 }
21151 }
21152 }
21153 }
21154
21155 TEST(F32_GEMMINC_6X8S4__NEON, strided_cm_subtile) {
21156 TEST_REQUIRES_ARM_NEON;
21157 for (size_t k = 1; k <= 20; k += 5) {
21158 for (uint32_t m = 1; m <= 6; m++) {
21159 for (uint32_t n = 1; n <= 8; n++) {
21160 GemmMicrokernelTester()
21161 .mr(6)
21162 .nr(8)
21163 .kr(1)
21164 .sr(4)
21165 .m(m)
21166 .n(n)
21167 .k(k)
21168 .cm_stride(11)
21169 .iterations(1)
21170 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21171 }
21172 }
21173 }
21174 }
21175
21176 TEST(F32_GEMMINC_6X8S4__NEON, qmin) {
21177 TEST_REQUIRES_ARM_NEON;
21178 GemmMicrokernelTester()
21179 .mr(6)
21180 .nr(8)
21181 .kr(1)
21182 .sr(4)
21183 .m(6)
21184 .n(8)
21185 .k(4)
21186 .qmin(128)
21187 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21188 }
21189
21190 TEST(F32_GEMMINC_6X8S4__NEON, qmax) {
21191 TEST_REQUIRES_ARM_NEON;
21192 GemmMicrokernelTester()
21193 .mr(6)
21194 .nr(8)
21195 .kr(1)
21196 .sr(4)
21197 .m(6)
21198 .n(8)
21199 .k(4)
21200 .qmax(128)
21201 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21202 }
21203
21204 TEST(F32_GEMMINC_6X8S4__NEON, strided_cm) {
21205 TEST_REQUIRES_ARM_NEON;
21206 GemmMicrokernelTester()
21207 .mr(6)
21208 .nr(8)
21209 .kr(1)
21210 .sr(4)
21211 .m(6)
21212 .n(8)
21213 .k(4)
21214 .cm_stride(11)
21215 .Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
21216 }
21217#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21218
21219
21220#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21221 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4) {
21222 TEST_REQUIRES_ARM_NEON;
21223 GemmMicrokernelTester()
21224 .mr(8)
21225 .nr(8)
21226 .kr(1)
21227 .sr(4)
21228 .m(8)
21229 .n(8)
21230 .k(4)
21231 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21232 }
21233
21234 TEST(F32_GEMMINC_8X8S4__NEON, strided_cn) {
21235 TEST_REQUIRES_ARM_NEON;
21236 GemmMicrokernelTester()
21237 .mr(8)
21238 .nr(8)
21239 .kr(1)
21240 .sr(4)
21241 .m(8)
21242 .n(8)
21243 .k(4)
21244 .cn_stride(11)
21245 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21246 }
21247
21248 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_strided_a) {
21249 TEST_REQUIRES_ARM_NEON;
21250 GemmMicrokernelTester()
21251 .mr(8)
21252 .nr(8)
21253 .kr(1)
21254 .sr(4)
21255 .m(8)
21256 .n(8)
21257 .k(4)
21258 .a_stride(7)
21259 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21260 }
21261
21262 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_subtile) {
21263 TEST_REQUIRES_ARM_NEON;
21264 for (uint32_t m = 1; m <= 8; m++) {
21265 for (uint32_t n = 1; n <= 8; n++) {
21266 GemmMicrokernelTester()
21267 .mr(8)
21268 .nr(8)
21269 .kr(1)
21270 .sr(4)
21271 .m(m)
21272 .n(n)
21273 .k(4)
21274 .iterations(1)
21275 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21276 }
21277 }
21278 }
21279
21280 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_subtile_m) {
21281 TEST_REQUIRES_ARM_NEON;
21282 for (uint32_t m = 1; m <= 8; m++) {
21283 GemmMicrokernelTester()
21284 .mr(8)
21285 .nr(8)
21286 .kr(1)
21287 .sr(4)
21288 .m(m)
21289 .n(8)
21290 .k(4)
21291 .iterations(1)
21292 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21293 }
21294 }
21295
21296 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_subtile_n) {
21297 TEST_REQUIRES_ARM_NEON;
21298 for (uint32_t n = 1; n <= 8; n++) {
21299 GemmMicrokernelTester()
21300 .mr(8)
21301 .nr(8)
21302 .kr(1)
21303 .sr(4)
21304 .m(8)
21305 .n(n)
21306 .k(4)
21307 .iterations(1)
21308 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21309 }
21310 }
21311
21312 TEST(F32_GEMMINC_8X8S4__NEON, k_lt_4) {
21313 TEST_REQUIRES_ARM_NEON;
21314 for (size_t k = 1; k < 4; k++) {
21315 GemmMicrokernelTester()
21316 .mr(8)
21317 .nr(8)
21318 .kr(1)
21319 .sr(4)
21320 .m(8)
21321 .n(8)
21322 .k(k)
21323 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21324 }
21325 }
21326
21327 TEST(F32_GEMMINC_8X8S4__NEON, k_lt_4_strided_a) {
21328 TEST_REQUIRES_ARM_NEON;
21329 for (size_t k = 1; k < 4; k++) {
21330 GemmMicrokernelTester()
21331 .mr(8)
21332 .nr(8)
21333 .kr(1)
21334 .sr(4)
21335 .m(8)
21336 .n(8)
21337 .k(k)
21338 .a_stride(7)
21339 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21340 }
21341 }
21342
21343 TEST(F32_GEMMINC_8X8S4__NEON, k_lt_4_subtile) {
21344 TEST_REQUIRES_ARM_NEON;
21345 for (size_t k = 1; k < 4; k++) {
21346 for (uint32_t m = 1; m <= 8; m++) {
21347 for (uint32_t n = 1; n <= 8; n++) {
21348 GemmMicrokernelTester()
21349 .mr(8)
21350 .nr(8)
21351 .kr(1)
21352 .sr(4)
21353 .m(m)
21354 .n(n)
21355 .k(k)
21356 .iterations(1)
21357 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21358 }
21359 }
21360 }
21361 }
21362
21363 TEST(F32_GEMMINC_8X8S4__NEON, k_gt_4) {
21364 TEST_REQUIRES_ARM_NEON;
21365 for (size_t k = 5; k < 8; k++) {
21366 GemmMicrokernelTester()
21367 .mr(8)
21368 .nr(8)
21369 .kr(1)
21370 .sr(4)
21371 .m(8)
21372 .n(8)
21373 .k(k)
21374 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21375 }
21376 }
21377
21378 TEST(F32_GEMMINC_8X8S4__NEON, k_gt_4_strided_a) {
21379 TEST_REQUIRES_ARM_NEON;
21380 for (size_t k = 5; k < 8; k++) {
21381 GemmMicrokernelTester()
21382 .mr(8)
21383 .nr(8)
21384 .kr(1)
21385 .sr(4)
21386 .m(8)
21387 .n(8)
21388 .k(k)
21389 .a_stride(11)
21390 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21391 }
21392 }
21393
21394 TEST(F32_GEMMINC_8X8S4__NEON, k_gt_4_subtile) {
21395 TEST_REQUIRES_ARM_NEON;
21396 for (size_t k = 5; k < 8; k++) {
21397 for (uint32_t m = 1; m <= 8; m++) {
21398 for (uint32_t n = 1; n <= 8; n++) {
21399 GemmMicrokernelTester()
21400 .mr(8)
21401 .nr(8)
21402 .kr(1)
21403 .sr(4)
21404 .m(m)
21405 .n(n)
21406 .k(k)
21407 .iterations(1)
21408 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21409 }
21410 }
21411 }
21412 }
21413
21414 TEST(F32_GEMMINC_8X8S4__NEON, k_div_4) {
21415 TEST_REQUIRES_ARM_NEON;
21416 for (size_t k = 8; k <= 40; k += 4) {
21417 GemmMicrokernelTester()
21418 .mr(8)
21419 .nr(8)
21420 .kr(1)
21421 .sr(4)
21422 .m(8)
21423 .n(8)
21424 .k(k)
21425 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21426 }
21427 }
21428
21429 TEST(F32_GEMMINC_8X8S4__NEON, k_div_4_strided_a) {
21430 TEST_REQUIRES_ARM_NEON;
21431 for (size_t k = 8; k <= 40; k += 4) {
21432 GemmMicrokernelTester()
21433 .mr(8)
21434 .nr(8)
21435 .kr(1)
21436 .sr(4)
21437 .m(8)
21438 .n(8)
21439 .k(k)
21440 .a_stride(43)
21441 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21442 }
21443 }
21444
21445 TEST(F32_GEMMINC_8X8S4__NEON, k_div_4_subtile) {
21446 TEST_REQUIRES_ARM_NEON;
21447 for (size_t k = 8; k <= 40; k += 4) {
21448 for (uint32_t m = 1; m <= 8; m++) {
21449 for (uint32_t n = 1; n <= 8; n++) {
21450 GemmMicrokernelTester()
21451 .mr(8)
21452 .nr(8)
21453 .kr(1)
21454 .sr(4)
21455 .m(m)
21456 .n(n)
21457 .k(k)
21458 .iterations(1)
21459 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21460 }
21461 }
21462 }
21463 }
21464
21465 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8) {
21466 TEST_REQUIRES_ARM_NEON;
21467 for (uint32_t n = 9; n < 16; n++) {
21468 for (size_t k = 1; k <= 20; k += 5) {
21469 GemmMicrokernelTester()
21470 .mr(8)
21471 .nr(8)
21472 .kr(1)
21473 .sr(4)
21474 .m(8)
21475 .n(8)
21476 .k(k)
21477 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21478 }
21479 }
21480 }
21481
21482 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8_strided_cn) {
21483 TEST_REQUIRES_ARM_NEON;
21484 for (uint32_t n = 9; n < 16; n++) {
21485 for (size_t k = 1; k <= 20; k += 5) {
21486 GemmMicrokernelTester()
21487 .mr(8)
21488 .nr(8)
21489 .kr(1)
21490 .sr(4)
21491 .m(8)
21492 .n(8)
21493 .k(k)
21494 .cn_stride(11)
21495 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21496 }
21497 }
21498 }
21499
21500 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8_strided_a) {
21501 TEST_REQUIRES_ARM_NEON;
21502 for (uint32_t n = 9; n < 16; n++) {
21503 for (size_t k = 1; k <= 20; k += 5) {
21504 GemmMicrokernelTester()
21505 .mr(8)
21506 .nr(8)
21507 .kr(1)
21508 .sr(4)
21509 .m(8)
21510 .n(n)
21511 .k(k)
21512 .a_stride(23)
21513 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21514 }
21515 }
21516 }
21517
21518 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8_subtile) {
21519 TEST_REQUIRES_ARM_NEON;
21520 for (uint32_t n = 9; n < 16; n++) {
21521 for (size_t k = 1; k <= 20; k += 5) {
21522 for (uint32_t m = 1; m <= 8; m++) {
21523 GemmMicrokernelTester()
21524 .mr(8)
21525 .nr(8)
21526 .kr(1)
21527 .sr(4)
21528 .m(m)
21529 .n(n)
21530 .k(k)
21531 .iterations(1)
21532 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21533 }
21534 }
21535 }
21536 }
21537
21538 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8) {
21539 TEST_REQUIRES_ARM_NEON;
21540 for (uint32_t n = 16; n <= 24; n += 8) {
21541 for (size_t k = 1; k <= 20; k += 5) {
21542 GemmMicrokernelTester()
21543 .mr(8)
21544 .nr(8)
21545 .kr(1)
21546 .sr(4)
21547 .m(8)
21548 .n(8)
21549 .k(k)
21550 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21551 }
21552 }
21553 }
21554
21555 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8_strided_cn) {
21556 TEST_REQUIRES_ARM_NEON;
21557 for (uint32_t n = 16; n <= 24; n += 8) {
21558 for (size_t k = 1; k <= 20; k += 5) {
21559 GemmMicrokernelTester()
21560 .mr(8)
21561 .nr(8)
21562 .kr(1)
21563 .sr(4)
21564 .m(8)
21565 .n(n)
21566 .k(k)
21567 .cn_stride(11)
21568 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21569 }
21570 }
21571 }
21572
21573 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8_strided_a) {
21574 TEST_REQUIRES_ARM_NEON;
21575 for (uint32_t n = 16; n <= 24; n += 8) {
21576 for (size_t k = 1; k <= 20; k += 5) {
21577 GemmMicrokernelTester()
21578 .mr(8)
21579 .nr(8)
21580 .kr(1)
21581 .sr(4)
21582 .m(8)
21583 .n(n)
21584 .k(k)
21585 .a_stride(23)
21586 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21587 }
21588 }
21589 }
21590
21591 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8_subtile) {
21592 TEST_REQUIRES_ARM_NEON;
21593 for (uint32_t n = 16; n <= 24; n += 8) {
21594 for (size_t k = 1; k <= 20; k += 5) {
21595 for (uint32_t m = 1; m <= 8; m++) {
21596 GemmMicrokernelTester()
21597 .mr(8)
21598 .nr(8)
21599 .kr(1)
21600 .sr(4)
21601 .m(m)
21602 .n(n)
21603 .k(k)
21604 .iterations(1)
21605 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21606 }
21607 }
21608 }
21609 }
21610
21611 TEST(F32_GEMMINC_8X8S4__NEON, strided_cm_subtile) {
21612 TEST_REQUIRES_ARM_NEON;
21613 for (size_t k = 1; k <= 20; k += 5) {
21614 for (uint32_t m = 1; m <= 8; m++) {
21615 for (uint32_t n = 1; n <= 8; n++) {
21616 GemmMicrokernelTester()
21617 .mr(8)
21618 .nr(8)
21619 .kr(1)
21620 .sr(4)
21621 .m(m)
21622 .n(n)
21623 .k(k)
21624 .cm_stride(11)
21625 .iterations(1)
21626 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21627 }
21628 }
21629 }
21630 }
21631
21632 TEST(F32_GEMMINC_8X8S4__NEON, qmin) {
21633 TEST_REQUIRES_ARM_NEON;
21634 GemmMicrokernelTester()
21635 .mr(8)
21636 .nr(8)
21637 .kr(1)
21638 .sr(4)
21639 .m(8)
21640 .n(8)
21641 .k(4)
21642 .qmin(128)
21643 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21644 }
21645
21646 TEST(F32_GEMMINC_8X8S4__NEON, qmax) {
21647 TEST_REQUIRES_ARM_NEON;
21648 GemmMicrokernelTester()
21649 .mr(8)
21650 .nr(8)
21651 .kr(1)
21652 .sr(4)
21653 .m(8)
21654 .n(8)
21655 .k(4)
21656 .qmax(128)
21657 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21658 }
21659
21660 TEST(F32_GEMMINC_8X8S4__NEON, strided_cm) {
21661 TEST_REQUIRES_ARM_NEON;
21662 GemmMicrokernelTester()
21663 .mr(8)
21664 .nr(8)
21665 .kr(1)
21666 .sr(4)
21667 .m(8)
21668 .n(8)
21669 .k(4)
21670 .cm_stride(11)
21671 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
21672 }
21673#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21674
21675
21676#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21677 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4) {
21678 TEST_REQUIRES_ARM_NEON_FMA;
21679 GemmMicrokernelTester()
21680 .mr(1)
21681 .nr(8)
21682 .kr(1)
21683 .sr(4)
21684 .m(1)
21685 .n(8)
21686 .k(4)
21687 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21688 }
21689
21690 TEST(F32_GEMMINC_1X8S4__NEONFMA, strided_cn) {
21691 TEST_REQUIRES_ARM_NEON_FMA;
21692 GemmMicrokernelTester()
21693 .mr(1)
21694 .nr(8)
21695 .kr(1)
21696 .sr(4)
21697 .m(1)
21698 .n(8)
21699 .k(4)
21700 .cn_stride(11)
21701 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21702 }
21703
21704 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_strided_a) {
21705 TEST_REQUIRES_ARM_NEON_FMA;
21706 GemmMicrokernelTester()
21707 .mr(1)
21708 .nr(8)
21709 .kr(1)
21710 .sr(4)
21711 .m(1)
21712 .n(8)
21713 .k(4)
21714 .a_stride(7)
21715 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21716 }
21717
21718 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_subtile) {
21719 TEST_REQUIRES_ARM_NEON_FMA;
21720 for (uint32_t m = 1; m <= 1; m++) {
21721 for (uint32_t n = 1; n <= 8; n++) {
21722 GemmMicrokernelTester()
21723 .mr(1)
21724 .nr(8)
21725 .kr(1)
21726 .sr(4)
21727 .m(m)
21728 .n(n)
21729 .k(4)
21730 .iterations(1)
21731 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21732 }
21733 }
21734 }
21735
21736 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_subtile_m) {
21737 TEST_REQUIRES_ARM_NEON_FMA;
21738 for (uint32_t m = 1; m <= 1; m++) {
21739 GemmMicrokernelTester()
21740 .mr(1)
21741 .nr(8)
21742 .kr(1)
21743 .sr(4)
21744 .m(m)
21745 .n(8)
21746 .k(4)
21747 .iterations(1)
21748 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21749 }
21750 }
21751
21752 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_subtile_n) {
21753 TEST_REQUIRES_ARM_NEON_FMA;
21754 for (uint32_t n = 1; n <= 8; n++) {
21755 GemmMicrokernelTester()
21756 .mr(1)
21757 .nr(8)
21758 .kr(1)
21759 .sr(4)
21760 .m(1)
21761 .n(n)
21762 .k(4)
21763 .iterations(1)
21764 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21765 }
21766 }
21767
21768 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_lt_4) {
21769 TEST_REQUIRES_ARM_NEON_FMA;
21770 for (size_t k = 1; k < 4; k++) {
21771 GemmMicrokernelTester()
21772 .mr(1)
21773 .nr(8)
21774 .kr(1)
21775 .sr(4)
21776 .m(1)
21777 .n(8)
21778 .k(k)
21779 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21780 }
21781 }
21782
21783 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_lt_4_strided_a) {
21784 TEST_REQUIRES_ARM_NEON_FMA;
21785 for (size_t k = 1; k < 4; k++) {
21786 GemmMicrokernelTester()
21787 .mr(1)
21788 .nr(8)
21789 .kr(1)
21790 .sr(4)
21791 .m(1)
21792 .n(8)
21793 .k(k)
21794 .a_stride(7)
21795 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21796 }
21797 }
21798
21799 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_lt_4_subtile) {
21800 TEST_REQUIRES_ARM_NEON_FMA;
21801 for (size_t k = 1; k < 4; k++) {
21802 for (uint32_t m = 1; m <= 1; m++) {
21803 for (uint32_t n = 1; n <= 8; n++) {
21804 GemmMicrokernelTester()
21805 .mr(1)
21806 .nr(8)
21807 .kr(1)
21808 .sr(4)
21809 .m(m)
21810 .n(n)
21811 .k(k)
21812 .iterations(1)
21813 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21814 }
21815 }
21816 }
21817 }
21818
21819 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_gt_4) {
21820 TEST_REQUIRES_ARM_NEON_FMA;
21821 for (size_t k = 5; k < 8; k++) {
21822 GemmMicrokernelTester()
21823 .mr(1)
21824 .nr(8)
21825 .kr(1)
21826 .sr(4)
21827 .m(1)
21828 .n(8)
21829 .k(k)
21830 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21831 }
21832 }
21833
21834 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_gt_4_strided_a) {
21835 TEST_REQUIRES_ARM_NEON_FMA;
21836 for (size_t k = 5; k < 8; k++) {
21837 GemmMicrokernelTester()
21838 .mr(1)
21839 .nr(8)
21840 .kr(1)
21841 .sr(4)
21842 .m(1)
21843 .n(8)
21844 .k(k)
21845 .a_stride(11)
21846 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21847 }
21848 }
21849
21850 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_gt_4_subtile) {
21851 TEST_REQUIRES_ARM_NEON_FMA;
21852 for (size_t k = 5; k < 8; k++) {
21853 for (uint32_t m = 1; m <= 1; m++) {
21854 for (uint32_t n = 1; n <= 8; n++) {
21855 GemmMicrokernelTester()
21856 .mr(1)
21857 .nr(8)
21858 .kr(1)
21859 .sr(4)
21860 .m(m)
21861 .n(n)
21862 .k(k)
21863 .iterations(1)
21864 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21865 }
21866 }
21867 }
21868 }
21869
21870 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_div_4) {
21871 TEST_REQUIRES_ARM_NEON_FMA;
21872 for (size_t k = 8; k <= 40; k += 4) {
21873 GemmMicrokernelTester()
21874 .mr(1)
21875 .nr(8)
21876 .kr(1)
21877 .sr(4)
21878 .m(1)
21879 .n(8)
21880 .k(k)
21881 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21882 }
21883 }
21884
21885 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_div_4_strided_a) {
21886 TEST_REQUIRES_ARM_NEON_FMA;
21887 for (size_t k = 8; k <= 40; k += 4) {
21888 GemmMicrokernelTester()
21889 .mr(1)
21890 .nr(8)
21891 .kr(1)
21892 .sr(4)
21893 .m(1)
21894 .n(8)
21895 .k(k)
21896 .a_stride(43)
21897 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21898 }
21899 }
21900
21901 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_div_4_subtile) {
21902 TEST_REQUIRES_ARM_NEON_FMA;
21903 for (size_t k = 8; k <= 40; k += 4) {
21904 for (uint32_t m = 1; m <= 1; m++) {
21905 for (uint32_t n = 1; n <= 8; n++) {
21906 GemmMicrokernelTester()
21907 .mr(1)
21908 .nr(8)
21909 .kr(1)
21910 .sr(4)
21911 .m(m)
21912 .n(n)
21913 .k(k)
21914 .iterations(1)
21915 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21916 }
21917 }
21918 }
21919 }
21920
21921 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8) {
21922 TEST_REQUIRES_ARM_NEON_FMA;
21923 for (uint32_t n = 9; n < 16; n++) {
21924 for (size_t k = 1; k <= 20; k += 5) {
21925 GemmMicrokernelTester()
21926 .mr(1)
21927 .nr(8)
21928 .kr(1)
21929 .sr(4)
21930 .m(1)
21931 .n(8)
21932 .k(k)
21933 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21934 }
21935 }
21936 }
21937
21938 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8_strided_cn) {
21939 TEST_REQUIRES_ARM_NEON_FMA;
21940 for (uint32_t n = 9; n < 16; n++) {
21941 for (size_t k = 1; k <= 20; k += 5) {
21942 GemmMicrokernelTester()
21943 .mr(1)
21944 .nr(8)
21945 .kr(1)
21946 .sr(4)
21947 .m(1)
21948 .n(8)
21949 .k(k)
21950 .cn_stride(11)
21951 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21952 }
21953 }
21954 }
21955
21956 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8_strided_a) {
21957 TEST_REQUIRES_ARM_NEON_FMA;
21958 for (uint32_t n = 9; n < 16; n++) {
21959 for (size_t k = 1; k <= 20; k += 5) {
21960 GemmMicrokernelTester()
21961 .mr(1)
21962 .nr(8)
21963 .kr(1)
21964 .sr(4)
21965 .m(1)
21966 .n(n)
21967 .k(k)
21968 .a_stride(23)
21969 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21970 }
21971 }
21972 }
21973
21974 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8_subtile) {
21975 TEST_REQUIRES_ARM_NEON_FMA;
21976 for (uint32_t n = 9; n < 16; n++) {
21977 for (size_t k = 1; k <= 20; k += 5) {
21978 for (uint32_t m = 1; m <= 1; m++) {
21979 GemmMicrokernelTester()
21980 .mr(1)
21981 .nr(8)
21982 .kr(1)
21983 .sr(4)
21984 .m(m)
21985 .n(n)
21986 .k(k)
21987 .iterations(1)
21988 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
21989 }
21990 }
21991 }
21992 }
21993
21994 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8) {
21995 TEST_REQUIRES_ARM_NEON_FMA;
21996 for (uint32_t n = 16; n <= 24; n += 8) {
21997 for (size_t k = 1; k <= 20; k += 5) {
21998 GemmMicrokernelTester()
21999 .mr(1)
22000 .nr(8)
22001 .kr(1)
22002 .sr(4)
22003 .m(1)
22004 .n(8)
22005 .k(k)
22006 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22007 }
22008 }
22009 }
22010
22011 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8_strided_cn) {
22012 TEST_REQUIRES_ARM_NEON_FMA;
22013 for (uint32_t n = 16; n <= 24; n += 8) {
22014 for (size_t k = 1; k <= 20; k += 5) {
22015 GemmMicrokernelTester()
22016 .mr(1)
22017 .nr(8)
22018 .kr(1)
22019 .sr(4)
22020 .m(1)
22021 .n(n)
22022 .k(k)
22023 .cn_stride(11)
22024 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22025 }
22026 }
22027 }
22028
22029 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8_strided_a) {
22030 TEST_REQUIRES_ARM_NEON_FMA;
22031 for (uint32_t n = 16; n <= 24; n += 8) {
22032 for (size_t k = 1; k <= 20; k += 5) {
22033 GemmMicrokernelTester()
22034 .mr(1)
22035 .nr(8)
22036 .kr(1)
22037 .sr(4)
22038 .m(1)
22039 .n(n)
22040 .k(k)
22041 .a_stride(23)
22042 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22043 }
22044 }
22045 }
22046
22047 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8_subtile) {
22048 TEST_REQUIRES_ARM_NEON_FMA;
22049 for (uint32_t n = 16; n <= 24; n += 8) {
22050 for (size_t k = 1; k <= 20; k += 5) {
22051 for (uint32_t m = 1; m <= 1; m++) {
22052 GemmMicrokernelTester()
22053 .mr(1)
22054 .nr(8)
22055 .kr(1)
22056 .sr(4)
22057 .m(m)
22058 .n(n)
22059 .k(k)
22060 .iterations(1)
22061 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22062 }
22063 }
22064 }
22065 }
22066
22067 TEST(F32_GEMMINC_1X8S4__NEONFMA, strided_cm_subtile) {
22068 TEST_REQUIRES_ARM_NEON_FMA;
22069 for (size_t k = 1; k <= 20; k += 5) {
22070 for (uint32_t m = 1; m <= 1; m++) {
22071 for (uint32_t n = 1; n <= 8; n++) {
22072 GemmMicrokernelTester()
22073 .mr(1)
22074 .nr(8)
22075 .kr(1)
22076 .sr(4)
22077 .m(m)
22078 .n(n)
22079 .k(k)
22080 .cm_stride(11)
22081 .iterations(1)
22082 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22083 }
22084 }
22085 }
22086 }
22087
22088 TEST(F32_GEMMINC_1X8S4__NEONFMA, qmin) {
22089 TEST_REQUIRES_ARM_NEON_FMA;
22090 GemmMicrokernelTester()
22091 .mr(1)
22092 .nr(8)
22093 .kr(1)
22094 .sr(4)
22095 .m(1)
22096 .n(8)
22097 .k(4)
22098 .qmin(128)
22099 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22100 }
22101
22102 TEST(F32_GEMMINC_1X8S4__NEONFMA, qmax) {
22103 TEST_REQUIRES_ARM_NEON_FMA;
22104 GemmMicrokernelTester()
22105 .mr(1)
22106 .nr(8)
22107 .kr(1)
22108 .sr(4)
22109 .m(1)
22110 .n(8)
22111 .k(4)
22112 .qmax(128)
22113 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22114 }
22115
22116 TEST(F32_GEMMINC_1X8S4__NEONFMA, strided_cm) {
22117 TEST_REQUIRES_ARM_NEON_FMA;
22118 GemmMicrokernelTester()
22119 .mr(1)
22120 .nr(8)
22121 .kr(1)
22122 .sr(4)
22123 .m(1)
22124 .n(8)
22125 .k(4)
22126 .cm_stride(11)
22127 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
22128 }
22129#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22130
22131
22132#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22133 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4) {
22134 TEST_REQUIRES_ARM_NEON_FMA;
22135 GemmMicrokernelTester()
22136 .mr(4)
22137 .nr(8)
22138 .kr(1)
22139 .sr(4)
22140 .m(4)
22141 .n(8)
22142 .k(4)
22143 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22144 }
22145
22146 TEST(F32_GEMMINC_4X8S4__NEONFMA, strided_cn) {
22147 TEST_REQUIRES_ARM_NEON_FMA;
22148 GemmMicrokernelTester()
22149 .mr(4)
22150 .nr(8)
22151 .kr(1)
22152 .sr(4)
22153 .m(4)
22154 .n(8)
22155 .k(4)
22156 .cn_stride(11)
22157 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22158 }
22159
22160 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_strided_a) {
22161 TEST_REQUIRES_ARM_NEON_FMA;
22162 GemmMicrokernelTester()
22163 .mr(4)
22164 .nr(8)
22165 .kr(1)
22166 .sr(4)
22167 .m(4)
22168 .n(8)
22169 .k(4)
22170 .a_stride(7)
22171 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22172 }
22173
22174 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_subtile) {
22175 TEST_REQUIRES_ARM_NEON_FMA;
22176 for (uint32_t m = 1; m <= 4; m++) {
22177 for (uint32_t n = 1; n <= 8; n++) {
22178 GemmMicrokernelTester()
22179 .mr(4)
22180 .nr(8)
22181 .kr(1)
22182 .sr(4)
22183 .m(m)
22184 .n(n)
22185 .k(4)
22186 .iterations(1)
22187 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22188 }
22189 }
22190 }
22191
22192 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_subtile_m) {
22193 TEST_REQUIRES_ARM_NEON_FMA;
22194 for (uint32_t m = 1; m <= 4; m++) {
22195 GemmMicrokernelTester()
22196 .mr(4)
22197 .nr(8)
22198 .kr(1)
22199 .sr(4)
22200 .m(m)
22201 .n(8)
22202 .k(4)
22203 .iterations(1)
22204 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22205 }
22206 }
22207
22208 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_subtile_n) {
22209 TEST_REQUIRES_ARM_NEON_FMA;
22210 for (uint32_t n = 1; n <= 8; n++) {
22211 GemmMicrokernelTester()
22212 .mr(4)
22213 .nr(8)
22214 .kr(1)
22215 .sr(4)
22216 .m(4)
22217 .n(n)
22218 .k(4)
22219 .iterations(1)
22220 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22221 }
22222 }
22223
22224 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_lt_4) {
22225 TEST_REQUIRES_ARM_NEON_FMA;
22226 for (size_t k = 1; k < 4; k++) {
22227 GemmMicrokernelTester()
22228 .mr(4)
22229 .nr(8)
22230 .kr(1)
22231 .sr(4)
22232 .m(4)
22233 .n(8)
22234 .k(k)
22235 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22236 }
22237 }
22238
22239 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_lt_4_strided_a) {
22240 TEST_REQUIRES_ARM_NEON_FMA;
22241 for (size_t k = 1; k < 4; k++) {
22242 GemmMicrokernelTester()
22243 .mr(4)
22244 .nr(8)
22245 .kr(1)
22246 .sr(4)
22247 .m(4)
22248 .n(8)
22249 .k(k)
22250 .a_stride(7)
22251 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22252 }
22253 }
22254
22255 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_lt_4_subtile) {
22256 TEST_REQUIRES_ARM_NEON_FMA;
22257 for (size_t k = 1; k < 4; k++) {
22258 for (uint32_t m = 1; m <= 4; m++) {
22259 for (uint32_t n = 1; n <= 8; n++) {
22260 GemmMicrokernelTester()
22261 .mr(4)
22262 .nr(8)
22263 .kr(1)
22264 .sr(4)
22265 .m(m)
22266 .n(n)
22267 .k(k)
22268 .iterations(1)
22269 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22270 }
22271 }
22272 }
22273 }
22274
22275 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_gt_4) {
22276 TEST_REQUIRES_ARM_NEON_FMA;
22277 for (size_t k = 5; k < 8; k++) {
22278 GemmMicrokernelTester()
22279 .mr(4)
22280 .nr(8)
22281 .kr(1)
22282 .sr(4)
22283 .m(4)
22284 .n(8)
22285 .k(k)
22286 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22287 }
22288 }
22289
22290 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_gt_4_strided_a) {
22291 TEST_REQUIRES_ARM_NEON_FMA;
22292 for (size_t k = 5; k < 8; k++) {
22293 GemmMicrokernelTester()
22294 .mr(4)
22295 .nr(8)
22296 .kr(1)
22297 .sr(4)
22298 .m(4)
22299 .n(8)
22300 .k(k)
22301 .a_stride(11)
22302 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22303 }
22304 }
22305
22306 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_gt_4_subtile) {
22307 TEST_REQUIRES_ARM_NEON_FMA;
22308 for (size_t k = 5; k < 8; k++) {
22309 for (uint32_t m = 1; m <= 4; m++) {
22310 for (uint32_t n = 1; n <= 8; n++) {
22311 GemmMicrokernelTester()
22312 .mr(4)
22313 .nr(8)
22314 .kr(1)
22315 .sr(4)
22316 .m(m)
22317 .n(n)
22318 .k(k)
22319 .iterations(1)
22320 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22321 }
22322 }
22323 }
22324 }
22325
22326 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_div_4) {
22327 TEST_REQUIRES_ARM_NEON_FMA;
22328 for (size_t k = 8; k <= 40; k += 4) {
22329 GemmMicrokernelTester()
22330 .mr(4)
22331 .nr(8)
22332 .kr(1)
22333 .sr(4)
22334 .m(4)
22335 .n(8)
22336 .k(k)
22337 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22338 }
22339 }
22340
22341 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_div_4_strided_a) {
22342 TEST_REQUIRES_ARM_NEON_FMA;
22343 for (size_t k = 8; k <= 40; k += 4) {
22344 GemmMicrokernelTester()
22345 .mr(4)
22346 .nr(8)
22347 .kr(1)
22348 .sr(4)
22349 .m(4)
22350 .n(8)
22351 .k(k)
22352 .a_stride(43)
22353 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22354 }
22355 }
22356
22357 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_div_4_subtile) {
22358 TEST_REQUIRES_ARM_NEON_FMA;
22359 for (size_t k = 8; k <= 40; k += 4) {
22360 for (uint32_t m = 1; m <= 4; m++) {
22361 for (uint32_t n = 1; n <= 8; n++) {
22362 GemmMicrokernelTester()
22363 .mr(4)
22364 .nr(8)
22365 .kr(1)
22366 .sr(4)
22367 .m(m)
22368 .n(n)
22369 .k(k)
22370 .iterations(1)
22371 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22372 }
22373 }
22374 }
22375 }
22376
22377 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8) {
22378 TEST_REQUIRES_ARM_NEON_FMA;
22379 for (uint32_t n = 9; n < 16; n++) {
22380 for (size_t k = 1; k <= 20; k += 5) {
22381 GemmMicrokernelTester()
22382 .mr(4)
22383 .nr(8)
22384 .kr(1)
22385 .sr(4)
22386 .m(4)
22387 .n(8)
22388 .k(k)
22389 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22390 }
22391 }
22392 }
22393
22394 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8_strided_cn) {
22395 TEST_REQUIRES_ARM_NEON_FMA;
22396 for (uint32_t n = 9; n < 16; n++) {
22397 for (size_t k = 1; k <= 20; k += 5) {
22398 GemmMicrokernelTester()
22399 .mr(4)
22400 .nr(8)
22401 .kr(1)
22402 .sr(4)
22403 .m(4)
22404 .n(8)
22405 .k(k)
22406 .cn_stride(11)
22407 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22408 }
22409 }
22410 }
22411
22412 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8_strided_a) {
22413 TEST_REQUIRES_ARM_NEON_FMA;
22414 for (uint32_t n = 9; n < 16; n++) {
22415 for (size_t k = 1; k <= 20; k += 5) {
22416 GemmMicrokernelTester()
22417 .mr(4)
22418 .nr(8)
22419 .kr(1)
22420 .sr(4)
22421 .m(4)
22422 .n(n)
22423 .k(k)
22424 .a_stride(23)
22425 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22426 }
22427 }
22428 }
22429
22430 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8_subtile) {
22431 TEST_REQUIRES_ARM_NEON_FMA;
22432 for (uint32_t n = 9; n < 16; n++) {
22433 for (size_t k = 1; k <= 20; k += 5) {
22434 for (uint32_t m = 1; m <= 4; m++) {
22435 GemmMicrokernelTester()
22436 .mr(4)
22437 .nr(8)
22438 .kr(1)
22439 .sr(4)
22440 .m(m)
22441 .n(n)
22442 .k(k)
22443 .iterations(1)
22444 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22445 }
22446 }
22447 }
22448 }
22449
22450 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8) {
22451 TEST_REQUIRES_ARM_NEON_FMA;
22452 for (uint32_t n = 16; n <= 24; n += 8) {
22453 for (size_t k = 1; k <= 20; k += 5) {
22454 GemmMicrokernelTester()
22455 .mr(4)
22456 .nr(8)
22457 .kr(1)
22458 .sr(4)
22459 .m(4)
22460 .n(8)
22461 .k(k)
22462 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22463 }
22464 }
22465 }
22466
22467 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8_strided_cn) {
22468 TEST_REQUIRES_ARM_NEON_FMA;
22469 for (uint32_t n = 16; n <= 24; n += 8) {
22470 for (size_t k = 1; k <= 20; k += 5) {
22471 GemmMicrokernelTester()
22472 .mr(4)
22473 .nr(8)
22474 .kr(1)
22475 .sr(4)
22476 .m(4)
22477 .n(n)
22478 .k(k)
22479 .cn_stride(11)
22480 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22481 }
22482 }
22483 }
22484
22485 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8_strided_a) {
22486 TEST_REQUIRES_ARM_NEON_FMA;
22487 for (uint32_t n = 16; n <= 24; n += 8) {
22488 for (size_t k = 1; k <= 20; k += 5) {
22489 GemmMicrokernelTester()
22490 .mr(4)
22491 .nr(8)
22492 .kr(1)
22493 .sr(4)
22494 .m(4)
22495 .n(n)
22496 .k(k)
22497 .a_stride(23)
22498 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22499 }
22500 }
22501 }
22502
22503 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8_subtile) {
22504 TEST_REQUIRES_ARM_NEON_FMA;
22505 for (uint32_t n = 16; n <= 24; n += 8) {
22506 for (size_t k = 1; k <= 20; k += 5) {
22507 for (uint32_t m = 1; m <= 4; m++) {
22508 GemmMicrokernelTester()
22509 .mr(4)
22510 .nr(8)
22511 .kr(1)
22512 .sr(4)
22513 .m(m)
22514 .n(n)
22515 .k(k)
22516 .iterations(1)
22517 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22518 }
22519 }
22520 }
22521 }
22522
22523 TEST(F32_GEMMINC_4X8S4__NEONFMA, strided_cm_subtile) {
22524 TEST_REQUIRES_ARM_NEON_FMA;
22525 for (size_t k = 1; k <= 20; k += 5) {
22526 for (uint32_t m = 1; m <= 4; m++) {
22527 for (uint32_t n = 1; n <= 8; n++) {
22528 GemmMicrokernelTester()
22529 .mr(4)
22530 .nr(8)
22531 .kr(1)
22532 .sr(4)
22533 .m(m)
22534 .n(n)
22535 .k(k)
22536 .cm_stride(11)
22537 .iterations(1)
22538 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22539 }
22540 }
22541 }
22542 }
22543
22544 TEST(F32_GEMMINC_4X8S4__NEONFMA, qmin) {
22545 TEST_REQUIRES_ARM_NEON_FMA;
22546 GemmMicrokernelTester()
22547 .mr(4)
22548 .nr(8)
22549 .kr(1)
22550 .sr(4)
22551 .m(4)
22552 .n(8)
22553 .k(4)
22554 .qmin(128)
22555 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22556 }
22557
22558 TEST(F32_GEMMINC_4X8S4__NEONFMA, qmax) {
22559 TEST_REQUIRES_ARM_NEON_FMA;
22560 GemmMicrokernelTester()
22561 .mr(4)
22562 .nr(8)
22563 .kr(1)
22564 .sr(4)
22565 .m(4)
22566 .n(8)
22567 .k(4)
22568 .qmax(128)
22569 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22570 }
22571
22572 TEST(F32_GEMMINC_4X8S4__NEONFMA, strided_cm) {
22573 TEST_REQUIRES_ARM_NEON_FMA;
22574 GemmMicrokernelTester()
22575 .mr(4)
22576 .nr(8)
22577 .kr(1)
22578 .sr(4)
22579 .m(4)
22580 .n(8)
22581 .k(4)
22582 .cm_stride(11)
22583 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
22584 }
22585#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22586
22587
22588#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22589 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4) {
22590 TEST_REQUIRES_ARM_NEON_FMA;
22591 GemmMicrokernelTester()
22592 .mr(6)
22593 .nr(8)
22594 .kr(1)
22595 .sr(4)
22596 .m(6)
22597 .n(8)
22598 .k(4)
22599 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22600 }
22601
22602 TEST(F32_GEMMINC_6X8S4__NEONFMA, strided_cn) {
22603 TEST_REQUIRES_ARM_NEON_FMA;
22604 GemmMicrokernelTester()
22605 .mr(6)
22606 .nr(8)
22607 .kr(1)
22608 .sr(4)
22609 .m(6)
22610 .n(8)
22611 .k(4)
22612 .cn_stride(11)
22613 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22614 }
22615
22616 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_strided_a) {
22617 TEST_REQUIRES_ARM_NEON_FMA;
22618 GemmMicrokernelTester()
22619 .mr(6)
22620 .nr(8)
22621 .kr(1)
22622 .sr(4)
22623 .m(6)
22624 .n(8)
22625 .k(4)
22626 .a_stride(7)
22627 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22628 }
22629
22630 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_subtile) {
22631 TEST_REQUIRES_ARM_NEON_FMA;
22632 for (uint32_t m = 1; m <= 6; m++) {
22633 for (uint32_t n = 1; n <= 8; n++) {
22634 GemmMicrokernelTester()
22635 .mr(6)
22636 .nr(8)
22637 .kr(1)
22638 .sr(4)
22639 .m(m)
22640 .n(n)
22641 .k(4)
22642 .iterations(1)
22643 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22644 }
22645 }
22646 }
22647
22648 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_subtile_m) {
22649 TEST_REQUIRES_ARM_NEON_FMA;
22650 for (uint32_t m = 1; m <= 6; m++) {
22651 GemmMicrokernelTester()
22652 .mr(6)
22653 .nr(8)
22654 .kr(1)
22655 .sr(4)
22656 .m(m)
22657 .n(8)
22658 .k(4)
22659 .iterations(1)
22660 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22661 }
22662 }
22663
22664 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_subtile_n) {
22665 TEST_REQUIRES_ARM_NEON_FMA;
22666 for (uint32_t n = 1; n <= 8; n++) {
22667 GemmMicrokernelTester()
22668 .mr(6)
22669 .nr(8)
22670 .kr(1)
22671 .sr(4)
22672 .m(6)
22673 .n(n)
22674 .k(4)
22675 .iterations(1)
22676 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22677 }
22678 }
22679
22680 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_lt_4) {
22681 TEST_REQUIRES_ARM_NEON_FMA;
22682 for (size_t k = 1; k < 4; k++) {
22683 GemmMicrokernelTester()
22684 .mr(6)
22685 .nr(8)
22686 .kr(1)
22687 .sr(4)
22688 .m(6)
22689 .n(8)
22690 .k(k)
22691 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22692 }
22693 }
22694
22695 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_lt_4_strided_a) {
22696 TEST_REQUIRES_ARM_NEON_FMA;
22697 for (size_t k = 1; k < 4; k++) {
22698 GemmMicrokernelTester()
22699 .mr(6)
22700 .nr(8)
22701 .kr(1)
22702 .sr(4)
22703 .m(6)
22704 .n(8)
22705 .k(k)
22706 .a_stride(7)
22707 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22708 }
22709 }
22710
22711 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_lt_4_subtile) {
22712 TEST_REQUIRES_ARM_NEON_FMA;
22713 for (size_t k = 1; k < 4; k++) {
22714 for (uint32_t m = 1; m <= 6; m++) {
22715 for (uint32_t n = 1; n <= 8; n++) {
22716 GemmMicrokernelTester()
22717 .mr(6)
22718 .nr(8)
22719 .kr(1)
22720 .sr(4)
22721 .m(m)
22722 .n(n)
22723 .k(k)
22724 .iterations(1)
22725 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22726 }
22727 }
22728 }
22729 }
22730
22731 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_gt_4) {
22732 TEST_REQUIRES_ARM_NEON_FMA;
22733 for (size_t k = 5; k < 8; k++) {
22734 GemmMicrokernelTester()
22735 .mr(6)
22736 .nr(8)
22737 .kr(1)
22738 .sr(4)
22739 .m(6)
22740 .n(8)
22741 .k(k)
22742 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22743 }
22744 }
22745
22746 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_gt_4_strided_a) {
22747 TEST_REQUIRES_ARM_NEON_FMA;
22748 for (size_t k = 5; k < 8; k++) {
22749 GemmMicrokernelTester()
22750 .mr(6)
22751 .nr(8)
22752 .kr(1)
22753 .sr(4)
22754 .m(6)
22755 .n(8)
22756 .k(k)
22757 .a_stride(11)
22758 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22759 }
22760 }
22761
22762 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_gt_4_subtile) {
22763 TEST_REQUIRES_ARM_NEON_FMA;
22764 for (size_t k = 5; k < 8; k++) {
22765 for (uint32_t m = 1; m <= 6; m++) {
22766 for (uint32_t n = 1; n <= 8; n++) {
22767 GemmMicrokernelTester()
22768 .mr(6)
22769 .nr(8)
22770 .kr(1)
22771 .sr(4)
22772 .m(m)
22773 .n(n)
22774 .k(k)
22775 .iterations(1)
22776 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22777 }
22778 }
22779 }
22780 }
22781
22782 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_div_4) {
22783 TEST_REQUIRES_ARM_NEON_FMA;
22784 for (size_t k = 8; k <= 40; k += 4) {
22785 GemmMicrokernelTester()
22786 .mr(6)
22787 .nr(8)
22788 .kr(1)
22789 .sr(4)
22790 .m(6)
22791 .n(8)
22792 .k(k)
22793 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22794 }
22795 }
22796
22797 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_div_4_strided_a) {
22798 TEST_REQUIRES_ARM_NEON_FMA;
22799 for (size_t k = 8; k <= 40; k += 4) {
22800 GemmMicrokernelTester()
22801 .mr(6)
22802 .nr(8)
22803 .kr(1)
22804 .sr(4)
22805 .m(6)
22806 .n(8)
22807 .k(k)
22808 .a_stride(43)
22809 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22810 }
22811 }
22812
22813 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_div_4_subtile) {
22814 TEST_REQUIRES_ARM_NEON_FMA;
22815 for (size_t k = 8; k <= 40; k += 4) {
22816 for (uint32_t m = 1; m <= 6; m++) {
22817 for (uint32_t n = 1; n <= 8; n++) {
22818 GemmMicrokernelTester()
22819 .mr(6)
22820 .nr(8)
22821 .kr(1)
22822 .sr(4)
22823 .m(m)
22824 .n(n)
22825 .k(k)
22826 .iterations(1)
22827 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22828 }
22829 }
22830 }
22831 }
22832
22833 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8) {
22834 TEST_REQUIRES_ARM_NEON_FMA;
22835 for (uint32_t n = 9; n < 16; n++) {
22836 for (size_t k = 1; k <= 20; k += 5) {
22837 GemmMicrokernelTester()
22838 .mr(6)
22839 .nr(8)
22840 .kr(1)
22841 .sr(4)
22842 .m(6)
22843 .n(8)
22844 .k(k)
22845 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22846 }
22847 }
22848 }
22849
22850 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8_strided_cn) {
22851 TEST_REQUIRES_ARM_NEON_FMA;
22852 for (uint32_t n = 9; n < 16; n++) {
22853 for (size_t k = 1; k <= 20; k += 5) {
22854 GemmMicrokernelTester()
22855 .mr(6)
22856 .nr(8)
22857 .kr(1)
22858 .sr(4)
22859 .m(6)
22860 .n(8)
22861 .k(k)
22862 .cn_stride(11)
22863 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22864 }
22865 }
22866 }
22867
22868 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8_strided_a) {
22869 TEST_REQUIRES_ARM_NEON_FMA;
22870 for (uint32_t n = 9; n < 16; n++) {
22871 for (size_t k = 1; k <= 20; k += 5) {
22872 GemmMicrokernelTester()
22873 .mr(6)
22874 .nr(8)
22875 .kr(1)
22876 .sr(4)
22877 .m(6)
22878 .n(n)
22879 .k(k)
22880 .a_stride(23)
22881 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22882 }
22883 }
22884 }
22885
22886 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8_subtile) {
22887 TEST_REQUIRES_ARM_NEON_FMA;
22888 for (uint32_t n = 9; n < 16; n++) {
22889 for (size_t k = 1; k <= 20; k += 5) {
22890 for (uint32_t m = 1; m <= 6; m++) {
22891 GemmMicrokernelTester()
22892 .mr(6)
22893 .nr(8)
22894 .kr(1)
22895 .sr(4)
22896 .m(m)
22897 .n(n)
22898 .k(k)
22899 .iterations(1)
22900 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22901 }
22902 }
22903 }
22904 }
22905
22906 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8) {
22907 TEST_REQUIRES_ARM_NEON_FMA;
22908 for (uint32_t n = 16; n <= 24; n += 8) {
22909 for (size_t k = 1; k <= 20; k += 5) {
22910 GemmMicrokernelTester()
22911 .mr(6)
22912 .nr(8)
22913 .kr(1)
22914 .sr(4)
22915 .m(6)
22916 .n(8)
22917 .k(k)
22918 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22919 }
22920 }
22921 }
22922
22923 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8_strided_cn) {
22924 TEST_REQUIRES_ARM_NEON_FMA;
22925 for (uint32_t n = 16; n <= 24; n += 8) {
22926 for (size_t k = 1; k <= 20; k += 5) {
22927 GemmMicrokernelTester()
22928 .mr(6)
22929 .nr(8)
22930 .kr(1)
22931 .sr(4)
22932 .m(6)
22933 .n(n)
22934 .k(k)
22935 .cn_stride(11)
22936 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22937 }
22938 }
22939 }
22940
22941 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8_strided_a) {
22942 TEST_REQUIRES_ARM_NEON_FMA;
22943 for (uint32_t n = 16; n <= 24; n += 8) {
22944 for (size_t k = 1; k <= 20; k += 5) {
22945 GemmMicrokernelTester()
22946 .mr(6)
22947 .nr(8)
22948 .kr(1)
22949 .sr(4)
22950 .m(6)
22951 .n(n)
22952 .k(k)
22953 .a_stride(23)
22954 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22955 }
22956 }
22957 }
22958
22959 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8_subtile) {
22960 TEST_REQUIRES_ARM_NEON_FMA;
22961 for (uint32_t n = 16; n <= 24; n += 8) {
22962 for (size_t k = 1; k <= 20; k += 5) {
22963 for (uint32_t m = 1; m <= 6; m++) {
22964 GemmMicrokernelTester()
22965 .mr(6)
22966 .nr(8)
22967 .kr(1)
22968 .sr(4)
22969 .m(m)
22970 .n(n)
22971 .k(k)
22972 .iterations(1)
22973 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22974 }
22975 }
22976 }
22977 }
22978
22979 TEST(F32_GEMMINC_6X8S4__NEONFMA, strided_cm_subtile) {
22980 TEST_REQUIRES_ARM_NEON_FMA;
22981 for (size_t k = 1; k <= 20; k += 5) {
22982 for (uint32_t m = 1; m <= 6; m++) {
22983 for (uint32_t n = 1; n <= 8; n++) {
22984 GemmMicrokernelTester()
22985 .mr(6)
22986 .nr(8)
22987 .kr(1)
22988 .sr(4)
22989 .m(m)
22990 .n(n)
22991 .k(k)
22992 .cm_stride(11)
22993 .iterations(1)
22994 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
22995 }
22996 }
22997 }
22998 }
22999
23000 TEST(F32_GEMMINC_6X8S4__NEONFMA, qmin) {
23001 TEST_REQUIRES_ARM_NEON_FMA;
23002 GemmMicrokernelTester()
23003 .mr(6)
23004 .nr(8)
23005 .kr(1)
23006 .sr(4)
23007 .m(6)
23008 .n(8)
23009 .k(4)
23010 .qmin(128)
23011 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
23012 }
23013
23014 TEST(F32_GEMMINC_6X8S4__NEONFMA, qmax) {
23015 TEST_REQUIRES_ARM_NEON_FMA;
23016 GemmMicrokernelTester()
23017 .mr(6)
23018 .nr(8)
23019 .kr(1)
23020 .sr(4)
23021 .m(6)
23022 .n(8)
23023 .k(4)
23024 .qmax(128)
23025 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
23026 }
23027
23028 TEST(F32_GEMMINC_6X8S4__NEONFMA, strided_cm) {
23029 TEST_REQUIRES_ARM_NEON_FMA;
23030 GemmMicrokernelTester()
23031 .mr(6)
23032 .nr(8)
23033 .kr(1)
23034 .sr(4)
23035 .m(6)
23036 .n(8)
23037 .k(4)
23038 .cm_stride(11)
23039 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
23040 }
23041#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23042
23043
23044#if XNN_ARCH_ARM || XNN_ARCH_ARM64
23045 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4) {
23046 TEST_REQUIRES_ARM_NEON_FMA;
23047 GemmMicrokernelTester()
23048 .mr(8)
23049 .nr(8)
23050 .kr(1)
23051 .sr(4)
23052 .m(8)
23053 .n(8)
23054 .k(4)
23055 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23056 }
23057
23058 TEST(F32_GEMMINC_8X8S4__NEONFMA, strided_cn) {
23059 TEST_REQUIRES_ARM_NEON_FMA;
23060 GemmMicrokernelTester()
23061 .mr(8)
23062 .nr(8)
23063 .kr(1)
23064 .sr(4)
23065 .m(8)
23066 .n(8)
23067 .k(4)
23068 .cn_stride(11)
23069 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23070 }
23071
23072 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_strided_a) {
23073 TEST_REQUIRES_ARM_NEON_FMA;
23074 GemmMicrokernelTester()
23075 .mr(8)
23076 .nr(8)
23077 .kr(1)
23078 .sr(4)
23079 .m(8)
23080 .n(8)
23081 .k(4)
23082 .a_stride(7)
23083 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23084 }
23085
23086 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_subtile) {
23087 TEST_REQUIRES_ARM_NEON_FMA;
23088 for (uint32_t m = 1; m <= 8; m++) {
23089 for (uint32_t n = 1; n <= 8; n++) {
23090 GemmMicrokernelTester()
23091 .mr(8)
23092 .nr(8)
23093 .kr(1)
23094 .sr(4)
23095 .m(m)
23096 .n(n)
23097 .k(4)
23098 .iterations(1)
23099 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23100 }
23101 }
23102 }
23103
23104 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_subtile_m) {
23105 TEST_REQUIRES_ARM_NEON_FMA;
23106 for (uint32_t m = 1; m <= 8; m++) {
23107 GemmMicrokernelTester()
23108 .mr(8)
23109 .nr(8)
23110 .kr(1)
23111 .sr(4)
23112 .m(m)
23113 .n(8)
23114 .k(4)
23115 .iterations(1)
23116 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23117 }
23118 }
23119
23120 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_subtile_n) {
23121 TEST_REQUIRES_ARM_NEON_FMA;
23122 for (uint32_t n = 1; n <= 8; n++) {
23123 GemmMicrokernelTester()
23124 .mr(8)
23125 .nr(8)
23126 .kr(1)
23127 .sr(4)
23128 .m(8)
23129 .n(n)
23130 .k(4)
23131 .iterations(1)
23132 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23133 }
23134 }
23135
23136 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_lt_4) {
23137 TEST_REQUIRES_ARM_NEON_FMA;
23138 for (size_t k = 1; k < 4; k++) {
23139 GemmMicrokernelTester()
23140 .mr(8)
23141 .nr(8)
23142 .kr(1)
23143 .sr(4)
23144 .m(8)
23145 .n(8)
23146 .k(k)
23147 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23148 }
23149 }
23150
23151 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_lt_4_strided_a) {
23152 TEST_REQUIRES_ARM_NEON_FMA;
23153 for (size_t k = 1; k < 4; k++) {
23154 GemmMicrokernelTester()
23155 .mr(8)
23156 .nr(8)
23157 .kr(1)
23158 .sr(4)
23159 .m(8)
23160 .n(8)
23161 .k(k)
23162 .a_stride(7)
23163 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23164 }
23165 }
23166
23167 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_lt_4_subtile) {
23168 TEST_REQUIRES_ARM_NEON_FMA;
23169 for (size_t k = 1; k < 4; k++) {
23170 for (uint32_t m = 1; m <= 8; m++) {
23171 for (uint32_t n = 1; n <= 8; n++) {
23172 GemmMicrokernelTester()
23173 .mr(8)
23174 .nr(8)
23175 .kr(1)
23176 .sr(4)
23177 .m(m)
23178 .n(n)
23179 .k(k)
23180 .iterations(1)
23181 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23182 }
23183 }
23184 }
23185 }
23186
23187 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_gt_4) {
23188 TEST_REQUIRES_ARM_NEON_FMA;
23189 for (size_t k = 5; k < 8; k++) {
23190 GemmMicrokernelTester()
23191 .mr(8)
23192 .nr(8)
23193 .kr(1)
23194 .sr(4)
23195 .m(8)
23196 .n(8)
23197 .k(k)
23198 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23199 }
23200 }
23201
23202 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_gt_4_strided_a) {
23203 TEST_REQUIRES_ARM_NEON_FMA;
23204 for (size_t k = 5; k < 8; k++) {
23205 GemmMicrokernelTester()
23206 .mr(8)
23207 .nr(8)
23208 .kr(1)
23209 .sr(4)
23210 .m(8)
23211 .n(8)
23212 .k(k)
23213 .a_stride(11)
23214 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23215 }
23216 }
23217
23218 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_gt_4_subtile) {
23219 TEST_REQUIRES_ARM_NEON_FMA;
23220 for (size_t k = 5; k < 8; k++) {
23221 for (uint32_t m = 1; m <= 8; m++) {
23222 for (uint32_t n = 1; n <= 8; n++) {
23223 GemmMicrokernelTester()
23224 .mr(8)
23225 .nr(8)
23226 .kr(1)
23227 .sr(4)
23228 .m(m)
23229 .n(n)
23230 .k(k)
23231 .iterations(1)
23232 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23233 }
23234 }
23235 }
23236 }
23237
23238 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_div_4) {
23239 TEST_REQUIRES_ARM_NEON_FMA;
23240 for (size_t k = 8; k <= 40; k += 4) {
23241 GemmMicrokernelTester()
23242 .mr(8)
23243 .nr(8)
23244 .kr(1)
23245 .sr(4)
23246 .m(8)
23247 .n(8)
23248 .k(k)
23249 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23250 }
23251 }
23252
23253 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_div_4_strided_a) {
23254 TEST_REQUIRES_ARM_NEON_FMA;
23255 for (size_t k = 8; k <= 40; k += 4) {
23256 GemmMicrokernelTester()
23257 .mr(8)
23258 .nr(8)
23259 .kr(1)
23260 .sr(4)
23261 .m(8)
23262 .n(8)
23263 .k(k)
23264 .a_stride(43)
23265 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23266 }
23267 }
23268
23269 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_div_4_subtile) {
23270 TEST_REQUIRES_ARM_NEON_FMA;
23271 for (size_t k = 8; k <= 40; k += 4) {
23272 for (uint32_t m = 1; m <= 8; m++) {
23273 for (uint32_t n = 1; n <= 8; n++) {
23274 GemmMicrokernelTester()
23275 .mr(8)
23276 .nr(8)
23277 .kr(1)
23278 .sr(4)
23279 .m(m)
23280 .n(n)
23281 .k(k)
23282 .iterations(1)
23283 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23284 }
23285 }
23286 }
23287 }
23288
23289 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8) {
23290 TEST_REQUIRES_ARM_NEON_FMA;
23291 for (uint32_t n = 9; n < 16; n++) {
23292 for (size_t k = 1; k <= 20; k += 5) {
23293 GemmMicrokernelTester()
23294 .mr(8)
23295 .nr(8)
23296 .kr(1)
23297 .sr(4)
23298 .m(8)
23299 .n(8)
23300 .k(k)
23301 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23302 }
23303 }
23304 }
23305
23306 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8_strided_cn) {
23307 TEST_REQUIRES_ARM_NEON_FMA;
23308 for (uint32_t n = 9; n < 16; n++) {
23309 for (size_t k = 1; k <= 20; k += 5) {
23310 GemmMicrokernelTester()
23311 .mr(8)
23312 .nr(8)
23313 .kr(1)
23314 .sr(4)
23315 .m(8)
23316 .n(8)
23317 .k(k)
23318 .cn_stride(11)
23319 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23320 }
23321 }
23322 }
23323
23324 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8_strided_a) {
23325 TEST_REQUIRES_ARM_NEON_FMA;
23326 for (uint32_t n = 9; n < 16; n++) {
23327 for (size_t k = 1; k <= 20; k += 5) {
23328 GemmMicrokernelTester()
23329 .mr(8)
23330 .nr(8)
23331 .kr(1)
23332 .sr(4)
23333 .m(8)
23334 .n(n)
23335 .k(k)
23336 .a_stride(23)
23337 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23338 }
23339 }
23340 }
23341
23342 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8_subtile) {
23343 TEST_REQUIRES_ARM_NEON_FMA;
23344 for (uint32_t n = 9; n < 16; n++) {
23345 for (size_t k = 1; k <= 20; k += 5) {
23346 for (uint32_t m = 1; m <= 8; m++) {
23347 GemmMicrokernelTester()
23348 .mr(8)
23349 .nr(8)
23350 .kr(1)
23351 .sr(4)
23352 .m(m)
23353 .n(n)
23354 .k(k)
23355 .iterations(1)
23356 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23357 }
23358 }
23359 }
23360 }
23361
23362 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8) {
23363 TEST_REQUIRES_ARM_NEON_FMA;
23364 for (uint32_t n = 16; n <= 24; n += 8) {
23365 for (size_t k = 1; k <= 20; k += 5) {
23366 GemmMicrokernelTester()
23367 .mr(8)
23368 .nr(8)
23369 .kr(1)
23370 .sr(4)
23371 .m(8)
23372 .n(8)
23373 .k(k)
23374 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23375 }
23376 }
23377 }
23378
23379 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8_strided_cn) {
23380 TEST_REQUIRES_ARM_NEON_FMA;
23381 for (uint32_t n = 16; n <= 24; n += 8) {
23382 for (size_t k = 1; k <= 20; k += 5) {
23383 GemmMicrokernelTester()
23384 .mr(8)
23385 .nr(8)
23386 .kr(1)
23387 .sr(4)
23388 .m(8)
23389 .n(n)
23390 .k(k)
23391 .cn_stride(11)
23392 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23393 }
23394 }
23395 }
23396
23397 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8_strided_a) {
23398 TEST_REQUIRES_ARM_NEON_FMA;
23399 for (uint32_t n = 16; n <= 24; n += 8) {
23400 for (size_t k = 1; k <= 20; k += 5) {
23401 GemmMicrokernelTester()
23402 .mr(8)
23403 .nr(8)
23404 .kr(1)
23405 .sr(4)
23406 .m(8)
23407 .n(n)
23408 .k(k)
23409 .a_stride(23)
23410 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23411 }
23412 }
23413 }
23414
23415 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8_subtile) {
23416 TEST_REQUIRES_ARM_NEON_FMA;
23417 for (uint32_t n = 16; n <= 24; n += 8) {
23418 for (size_t k = 1; k <= 20; k += 5) {
23419 for (uint32_t m = 1; m <= 8; m++) {
23420 GemmMicrokernelTester()
23421 .mr(8)
23422 .nr(8)
23423 .kr(1)
23424 .sr(4)
23425 .m(m)
23426 .n(n)
23427 .k(k)
23428 .iterations(1)
23429 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23430 }
23431 }
23432 }
23433 }
23434
23435 TEST(F32_GEMMINC_8X8S4__NEONFMA, strided_cm_subtile) {
23436 TEST_REQUIRES_ARM_NEON_FMA;
23437 for (size_t k = 1; k <= 20; k += 5) {
23438 for (uint32_t m = 1; m <= 8; m++) {
23439 for (uint32_t n = 1; n <= 8; n++) {
23440 GemmMicrokernelTester()
23441 .mr(8)
23442 .nr(8)
23443 .kr(1)
23444 .sr(4)
23445 .m(m)
23446 .n(n)
23447 .k(k)
23448 .cm_stride(11)
23449 .iterations(1)
23450 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23451 }
23452 }
23453 }
23454 }
23455
23456 TEST(F32_GEMMINC_8X8S4__NEONFMA, qmin) {
23457 TEST_REQUIRES_ARM_NEON_FMA;
23458 GemmMicrokernelTester()
23459 .mr(8)
23460 .nr(8)
23461 .kr(1)
23462 .sr(4)
23463 .m(8)
23464 .n(8)
23465 .k(4)
23466 .qmin(128)
23467 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23468 }
23469
23470 TEST(F32_GEMMINC_8X8S4__NEONFMA, qmax) {
23471 TEST_REQUIRES_ARM_NEON_FMA;
23472 GemmMicrokernelTester()
23473 .mr(8)
23474 .nr(8)
23475 .kr(1)
23476 .sr(4)
23477 .m(8)
23478 .n(8)
23479 .k(4)
23480 .qmax(128)
23481 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23482 }
23483
23484 TEST(F32_GEMMINC_8X8S4__NEONFMA, strided_cm) {
23485 TEST_REQUIRES_ARM_NEON_FMA;
23486 GemmMicrokernelTester()
23487 .mr(8)
23488 .nr(8)
23489 .kr(1)
23490 .sr(4)
23491 .m(8)
23492 .n(8)
23493 .k(4)
23494 .cm_stride(11)
23495 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
23496 }
23497#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23498
23499
23500#if XNN_ARCH_X86 || XNN_ARCH_X86_64
23501 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1) {
23502 TEST_REQUIRES_X86_SSE;
23503 GemmMicrokernelTester()
23504 .mr(1)
23505 .nr(8)
23506 .kr(1)
23507 .sr(1)
23508 .m(1)
23509 .n(8)
23510 .k(1)
23511 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23512 }
23513
23514 TEST(F32_GEMMINC_1X8__SSE_LOAD1, strided_cn) {
23515 TEST_REQUIRES_X86_SSE;
23516 GemmMicrokernelTester()
23517 .mr(1)
23518 .nr(8)
23519 .kr(1)
23520 .sr(1)
23521 .m(1)
23522 .n(8)
23523 .k(1)
23524 .cn_stride(11)
23525 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23526 }
23527
23528 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_strided_a) {
23529 TEST_REQUIRES_X86_SSE;
23530 GemmMicrokernelTester()
23531 .mr(1)
23532 .nr(8)
23533 .kr(1)
23534 .sr(1)
23535 .m(1)
23536 .n(8)
23537 .k(1)
23538 .a_stride(3)
23539 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23540 }
23541
23542 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_subtile) {
23543 TEST_REQUIRES_X86_SSE;
23544 for (uint32_t m = 1; m <= 1; m++) {
23545 for (uint32_t n = 1; n <= 8; n++) {
23546 GemmMicrokernelTester()
23547 .mr(1)
23548 .nr(8)
23549 .kr(1)
23550 .sr(1)
23551 .m(m)
23552 .n(n)
23553 .k(1)
23554 .iterations(1)
23555 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23556 }
23557 }
23558 }
23559
23560 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_subtile_m) {
23561 TEST_REQUIRES_X86_SSE;
23562 for (uint32_t m = 1; m <= 1; m++) {
23563 GemmMicrokernelTester()
23564 .mr(1)
23565 .nr(8)
23566 .kr(1)
23567 .sr(1)
23568 .m(m)
23569 .n(8)
23570 .k(1)
23571 .iterations(1)
23572 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23573 }
23574 }
23575
23576 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_subtile_n) {
23577 TEST_REQUIRES_X86_SSE;
23578 for (uint32_t n = 1; n <= 8; n++) {
23579 GemmMicrokernelTester()
23580 .mr(1)
23581 .nr(8)
23582 .kr(1)
23583 .sr(1)
23584 .m(1)
23585 .n(n)
23586 .k(1)
23587 .iterations(1)
23588 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23589 }
23590 }
23591
23592 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_gt_1) {
23593 TEST_REQUIRES_X86_SSE;
23594 for (size_t k = 2; k < 10; k++) {
23595 GemmMicrokernelTester()
23596 .mr(1)
23597 .nr(8)
23598 .kr(1)
23599 .sr(1)
23600 .m(1)
23601 .n(8)
23602 .k(k)
23603 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23604 }
23605 }
23606
23607 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_gt_1_strided_a) {
23608 TEST_REQUIRES_X86_SSE;
23609 for (size_t k = 2; k < 10; k++) {
23610 GemmMicrokernelTester()
23611 .mr(1)
23612 .nr(8)
23613 .kr(1)
23614 .sr(1)
23615 .m(1)
23616 .n(8)
23617 .k(k)
23618 .a_stride(11)
23619 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23620 }
23621 }
23622
23623 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_gt_1_subtile) {
23624 TEST_REQUIRES_X86_SSE;
23625 for (size_t k = 2; k < 10; k++) {
23626 for (uint32_t m = 1; m <= 1; m++) {
23627 for (uint32_t n = 1; n <= 8; n++) {
23628 GemmMicrokernelTester()
23629 .mr(1)
23630 .nr(8)
23631 .kr(1)
23632 .sr(1)
23633 .m(m)
23634 .n(n)
23635 .k(k)
23636 .iterations(1)
23637 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23638 }
23639 }
23640 }
23641 }
23642
23643 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8) {
23644 TEST_REQUIRES_X86_SSE;
23645 for (uint32_t n = 9; n < 16; n++) {
23646 for (size_t k = 1; k <= 5; k += 2) {
23647 GemmMicrokernelTester()
23648 .mr(1)
23649 .nr(8)
23650 .kr(1)
23651 .sr(1)
23652 .m(1)
23653 .n(8)
23654 .k(k)
23655 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23656 }
23657 }
23658 }
23659
23660 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8_strided_cn) {
23661 TEST_REQUIRES_X86_SSE;
23662 for (uint32_t n = 9; n < 16; n++) {
23663 for (size_t k = 1; k <= 5; k += 2) {
23664 GemmMicrokernelTester()
23665 .mr(1)
23666 .nr(8)
23667 .kr(1)
23668 .sr(1)
23669 .m(1)
23670 .n(8)
23671 .k(k)
23672 .cn_stride(11)
23673 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23674 }
23675 }
23676 }
23677
23678 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8_strided_a) {
23679 TEST_REQUIRES_X86_SSE;
23680 for (uint32_t n = 9; n < 16; n++) {
23681 for (size_t k = 1; k <= 5; k += 2) {
23682 GemmMicrokernelTester()
23683 .mr(1)
23684 .nr(8)
23685 .kr(1)
23686 .sr(1)
23687 .m(1)
23688 .n(n)
23689 .k(k)
23690 .a_stride(7)
23691 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23692 }
23693 }
23694 }
23695
23696 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8_subtile) {
23697 TEST_REQUIRES_X86_SSE;
23698 for (uint32_t n = 9; n < 16; n++) {
23699 for (size_t k = 1; k <= 5; k += 2) {
23700 for (uint32_t m = 1; m <= 1; m++) {
23701 GemmMicrokernelTester()
23702 .mr(1)
23703 .nr(8)
23704 .kr(1)
23705 .sr(1)
23706 .m(m)
23707 .n(n)
23708 .k(k)
23709 .iterations(1)
23710 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23711 }
23712 }
23713 }
23714 }
23715
23716 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8) {
23717 TEST_REQUIRES_X86_SSE;
23718 for (uint32_t n = 16; n <= 24; n += 8) {
23719 for (size_t k = 1; k <= 5; k += 2) {
23720 GemmMicrokernelTester()
23721 .mr(1)
23722 .nr(8)
23723 .kr(1)
23724 .sr(1)
23725 .m(1)
23726 .n(8)
23727 .k(k)
23728 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23729 }
23730 }
23731 }
23732
23733 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8_strided_cn) {
23734 TEST_REQUIRES_X86_SSE;
23735 for (uint32_t n = 16; n <= 24; n += 8) {
23736 for (size_t k = 1; k <= 5; k += 2) {
23737 GemmMicrokernelTester()
23738 .mr(1)
23739 .nr(8)
23740 .kr(1)
23741 .sr(1)
23742 .m(1)
23743 .n(n)
23744 .k(k)
23745 .cn_stride(11)
23746 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23747 }
23748 }
23749 }
23750
23751 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8_strided_a) {
23752 TEST_REQUIRES_X86_SSE;
23753 for (uint32_t n = 16; n <= 24; n += 8) {
23754 for (size_t k = 1; k <= 5; k += 2) {
23755 GemmMicrokernelTester()
23756 .mr(1)
23757 .nr(8)
23758 .kr(1)
23759 .sr(1)
23760 .m(1)
23761 .n(n)
23762 .k(k)
23763 .a_stride(7)
23764 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23765 }
23766 }
23767 }
23768
23769 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8_subtile) {
23770 TEST_REQUIRES_X86_SSE;
23771 for (uint32_t n = 16; n <= 24; n += 8) {
23772 for (size_t k = 1; k <= 5; k += 2) {
23773 for (uint32_t m = 1; m <= 1; m++) {
23774 GemmMicrokernelTester()
23775 .mr(1)
23776 .nr(8)
23777 .kr(1)
23778 .sr(1)
23779 .m(m)
23780 .n(n)
23781 .k(k)
23782 .iterations(1)
23783 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23784 }
23785 }
23786 }
23787 }
23788
23789 TEST(F32_GEMMINC_1X8__SSE_LOAD1, strided_cm_subtile) {
23790 TEST_REQUIRES_X86_SSE;
23791 for (size_t k = 1; k <= 5; k += 2) {
23792 for (uint32_t m = 1; m <= 1; m++) {
23793 for (uint32_t n = 1; n <= 8; n++) {
23794 GemmMicrokernelTester()
23795 .mr(1)
23796 .nr(8)
23797 .kr(1)
23798 .sr(1)
23799 .m(m)
23800 .n(n)
23801 .k(k)
23802 .cm_stride(11)
23803 .iterations(1)
23804 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23805 }
23806 }
23807 }
23808 }
23809
23810 TEST(F32_GEMMINC_1X8__SSE_LOAD1, qmin) {
23811 TEST_REQUIRES_X86_SSE;
23812 GemmMicrokernelTester()
23813 .mr(1)
23814 .nr(8)
23815 .kr(1)
23816 .sr(1)
23817 .m(1)
23818 .n(8)
23819 .k(1)
23820 .qmin(128)
23821 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23822 }
23823
23824 TEST(F32_GEMMINC_1X8__SSE_LOAD1, qmax) {
23825 TEST_REQUIRES_X86_SSE;
23826 GemmMicrokernelTester()
23827 .mr(1)
23828 .nr(8)
23829 .kr(1)
23830 .sr(1)
23831 .m(1)
23832 .n(8)
23833 .k(1)
23834 .qmax(128)
23835 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23836 }
23837
23838 TEST(F32_GEMMINC_1X8__SSE_LOAD1, strided_cm) {
23839 TEST_REQUIRES_X86_SSE;
23840 GemmMicrokernelTester()
23841 .mr(1)
23842 .nr(8)
23843 .kr(1)
23844 .sr(1)
23845 .m(1)
23846 .n(8)
23847 .k(1)
23848 .cm_stride(11)
23849 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
23850 }
23851#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
23852
23853
23854#if XNN_ARCH_X86 || XNN_ARCH_X86_64
23855 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1) {
23856 TEST_REQUIRES_X86_SSE;
23857 GemmMicrokernelTester()
23858 .mr(4)
23859 .nr(8)
23860 .kr(1)
23861 .sr(1)
23862 .m(4)
23863 .n(8)
23864 .k(1)
23865 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23866 }
23867
23868 TEST(F32_GEMMINC_4X8__SSE_LOAD1, strided_cn) {
23869 TEST_REQUIRES_X86_SSE;
23870 GemmMicrokernelTester()
23871 .mr(4)
23872 .nr(8)
23873 .kr(1)
23874 .sr(1)
23875 .m(4)
23876 .n(8)
23877 .k(1)
23878 .cn_stride(11)
23879 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23880 }
23881
23882 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_strided_a) {
23883 TEST_REQUIRES_X86_SSE;
23884 GemmMicrokernelTester()
23885 .mr(4)
23886 .nr(8)
23887 .kr(1)
23888 .sr(1)
23889 .m(4)
23890 .n(8)
23891 .k(1)
23892 .a_stride(3)
23893 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23894 }
23895
23896 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_subtile) {
23897 TEST_REQUIRES_X86_SSE;
23898 for (uint32_t m = 1; m <= 4; m++) {
23899 for (uint32_t n = 1; n <= 8; n++) {
23900 GemmMicrokernelTester()
23901 .mr(4)
23902 .nr(8)
23903 .kr(1)
23904 .sr(1)
23905 .m(m)
23906 .n(n)
23907 .k(1)
23908 .iterations(1)
23909 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23910 }
23911 }
23912 }
23913
23914 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
23915 TEST_REQUIRES_X86_SSE;
23916 for (uint32_t m = 1; m <= 4; m++) {
23917 GemmMicrokernelTester()
23918 .mr(4)
23919 .nr(8)
23920 .kr(1)
23921 .sr(1)
23922 .m(m)
23923 .n(8)
23924 .k(1)
23925 .iterations(1)
23926 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23927 }
23928 }
23929
23930 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
23931 TEST_REQUIRES_X86_SSE;
23932 for (uint32_t n = 1; n <= 8; n++) {
23933 GemmMicrokernelTester()
23934 .mr(4)
23935 .nr(8)
23936 .kr(1)
23937 .sr(1)
23938 .m(4)
23939 .n(n)
23940 .k(1)
23941 .iterations(1)
23942 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23943 }
23944 }
23945
23946 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_gt_1) {
23947 TEST_REQUIRES_X86_SSE;
23948 for (size_t k = 2; k < 10; k++) {
23949 GemmMicrokernelTester()
23950 .mr(4)
23951 .nr(8)
23952 .kr(1)
23953 .sr(1)
23954 .m(4)
23955 .n(8)
23956 .k(k)
23957 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23958 }
23959 }
23960
23961 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_gt_1_strided_a) {
23962 TEST_REQUIRES_X86_SSE;
23963 for (size_t k = 2; k < 10; k++) {
23964 GemmMicrokernelTester()
23965 .mr(4)
23966 .nr(8)
23967 .kr(1)
23968 .sr(1)
23969 .m(4)
23970 .n(8)
23971 .k(k)
23972 .a_stride(11)
23973 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23974 }
23975 }
23976
23977 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_gt_1_subtile) {
23978 TEST_REQUIRES_X86_SSE;
23979 for (size_t k = 2; k < 10; k++) {
23980 for (uint32_t m = 1; m <= 4; m++) {
23981 for (uint32_t n = 1; n <= 8; n++) {
23982 GemmMicrokernelTester()
23983 .mr(4)
23984 .nr(8)
23985 .kr(1)
23986 .sr(1)
23987 .m(m)
23988 .n(n)
23989 .k(k)
23990 .iterations(1)
23991 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
23992 }
23993 }
23994 }
23995 }
23996
23997 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8) {
23998 TEST_REQUIRES_X86_SSE;
23999 for (uint32_t n = 9; n < 16; n++) {
24000 for (size_t k = 1; k <= 5; k += 2) {
24001 GemmMicrokernelTester()
24002 .mr(4)
24003 .nr(8)
24004 .kr(1)
24005 .sr(1)
24006 .m(4)
24007 .n(8)
24008 .k(k)
24009 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24010 }
24011 }
24012 }
24013
24014 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
24015 TEST_REQUIRES_X86_SSE;
24016 for (uint32_t n = 9; n < 16; n++) {
24017 for (size_t k = 1; k <= 5; k += 2) {
24018 GemmMicrokernelTester()
24019 .mr(4)
24020 .nr(8)
24021 .kr(1)
24022 .sr(1)
24023 .m(4)
24024 .n(8)
24025 .k(k)
24026 .cn_stride(11)
24027 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24028 }
24029 }
24030 }
24031
24032 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8_strided_a) {
24033 TEST_REQUIRES_X86_SSE;
24034 for (uint32_t n = 9; n < 16; n++) {
24035 for (size_t k = 1; k <= 5; k += 2) {
24036 GemmMicrokernelTester()
24037 .mr(4)
24038 .nr(8)
24039 .kr(1)
24040 .sr(1)
24041 .m(4)
24042 .n(n)
24043 .k(k)
24044 .a_stride(7)
24045 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24046 }
24047 }
24048 }
24049
24050 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8_subtile) {
24051 TEST_REQUIRES_X86_SSE;
24052 for (uint32_t n = 9; n < 16; n++) {
24053 for (size_t k = 1; k <= 5; k += 2) {
24054 for (uint32_t m = 1; m <= 4; m++) {
24055 GemmMicrokernelTester()
24056 .mr(4)
24057 .nr(8)
24058 .kr(1)
24059 .sr(1)
24060 .m(m)
24061 .n(n)
24062 .k(k)
24063 .iterations(1)
24064 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24065 }
24066 }
24067 }
24068 }
24069
24070 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8) {
24071 TEST_REQUIRES_X86_SSE;
24072 for (uint32_t n = 16; n <= 24; n += 8) {
24073 for (size_t k = 1; k <= 5; k += 2) {
24074 GemmMicrokernelTester()
24075 .mr(4)
24076 .nr(8)
24077 .kr(1)
24078 .sr(1)
24079 .m(4)
24080 .n(8)
24081 .k(k)
24082 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24083 }
24084 }
24085 }
24086
24087 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8_strided_cn) {
24088 TEST_REQUIRES_X86_SSE;
24089 for (uint32_t n = 16; n <= 24; n += 8) {
24090 for (size_t k = 1; k <= 5; k += 2) {
24091 GemmMicrokernelTester()
24092 .mr(4)
24093 .nr(8)
24094 .kr(1)
24095 .sr(1)
24096 .m(4)
24097 .n(n)
24098 .k(k)
24099 .cn_stride(11)
24100 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24101 }
24102 }
24103 }
24104
24105 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8_strided_a) {
24106 TEST_REQUIRES_X86_SSE;
24107 for (uint32_t n = 16; n <= 24; n += 8) {
24108 for (size_t k = 1; k <= 5; k += 2) {
24109 GemmMicrokernelTester()
24110 .mr(4)
24111 .nr(8)
24112 .kr(1)
24113 .sr(1)
24114 .m(4)
24115 .n(n)
24116 .k(k)
24117 .a_stride(7)
24118 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24119 }
24120 }
24121 }
24122
24123 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8_subtile) {
24124 TEST_REQUIRES_X86_SSE;
24125 for (uint32_t n = 16; n <= 24; n += 8) {
24126 for (size_t k = 1; k <= 5; k += 2) {
24127 for (uint32_t m = 1; m <= 4; m++) {
24128 GemmMicrokernelTester()
24129 .mr(4)
24130 .nr(8)
24131 .kr(1)
24132 .sr(1)
24133 .m(m)
24134 .n(n)
24135 .k(k)
24136 .iterations(1)
24137 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24138 }
24139 }
24140 }
24141 }
24142
24143 TEST(F32_GEMMINC_4X8__SSE_LOAD1, strided_cm_subtile) {
24144 TEST_REQUIRES_X86_SSE;
24145 for (size_t k = 1; k <= 5; k += 2) {
24146 for (uint32_t m = 1; m <= 4; m++) {
24147 for (uint32_t n = 1; n <= 8; n++) {
24148 GemmMicrokernelTester()
24149 .mr(4)
24150 .nr(8)
24151 .kr(1)
24152 .sr(1)
24153 .m(m)
24154 .n(n)
24155 .k(k)
24156 .cm_stride(11)
24157 .iterations(1)
24158 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24159 }
24160 }
24161 }
24162 }
24163
24164 TEST(F32_GEMMINC_4X8__SSE_LOAD1, qmin) {
24165 TEST_REQUIRES_X86_SSE;
24166 GemmMicrokernelTester()
24167 .mr(4)
24168 .nr(8)
24169 .kr(1)
24170 .sr(1)
24171 .m(4)
24172 .n(8)
24173 .k(1)
24174 .qmin(128)
24175 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24176 }
24177
24178 TEST(F32_GEMMINC_4X8__SSE_LOAD1, qmax) {
24179 TEST_REQUIRES_X86_SSE;
24180 GemmMicrokernelTester()
24181 .mr(4)
24182 .nr(8)
24183 .kr(1)
24184 .sr(1)
24185 .m(4)
24186 .n(8)
24187 .k(1)
24188 .qmax(128)
24189 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24190 }
24191
24192 TEST(F32_GEMMINC_4X8__SSE_LOAD1, strided_cm) {
24193 TEST_REQUIRES_X86_SSE;
24194 GemmMicrokernelTester()
24195 .mr(4)
24196 .nr(8)
24197 .kr(1)
24198 .sr(1)
24199 .m(4)
24200 .n(8)
24201 .k(1)
24202 .cm_stride(11)
24203 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
24204 }
24205#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24206
24207
24208#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24209 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4) {
24210 TEST_REQUIRES_X86_SSE;
24211 GemmMicrokernelTester()
24212 .mr(1)
24213 .nr(8)
24214 .kr(1)
24215 .sr(1)
24216 .m(1)
24217 .n(8)
24218 .k(4)
24219 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24220 }
24221
24222 TEST(F32_GEMMINC_1X8__SSE_DUP, strided_cn) {
24223 TEST_REQUIRES_X86_SSE;
24224 GemmMicrokernelTester()
24225 .mr(1)
24226 .nr(8)
24227 .kr(1)
24228 .sr(1)
24229 .m(1)
24230 .n(8)
24231 .k(4)
24232 .cn_stride(11)
24233 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24234 }
24235
24236 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_strided_a) {
24237 TEST_REQUIRES_X86_SSE;
24238 GemmMicrokernelTester()
24239 .mr(1)
24240 .nr(8)
24241 .kr(1)
24242 .sr(1)
24243 .m(1)
24244 .n(8)
24245 .k(4)
24246 .a_stride(7)
24247 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24248 }
24249
24250 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_subtile) {
24251 TEST_REQUIRES_X86_SSE;
24252 for (uint32_t m = 1; m <= 1; m++) {
24253 for (uint32_t n = 1; n <= 8; n++) {
24254 GemmMicrokernelTester()
24255 .mr(1)
24256 .nr(8)
24257 .kr(1)
24258 .sr(1)
24259 .m(m)
24260 .n(n)
24261 .k(4)
24262 .iterations(1)
24263 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24264 }
24265 }
24266 }
24267
24268 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_subtile_m) {
24269 TEST_REQUIRES_X86_SSE;
24270 for (uint32_t m = 1; m <= 1; m++) {
24271 GemmMicrokernelTester()
24272 .mr(1)
24273 .nr(8)
24274 .kr(1)
24275 .sr(1)
24276 .m(m)
24277 .n(8)
24278 .k(4)
24279 .iterations(1)
24280 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24281 }
24282 }
24283
24284 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_subtile_n) {
24285 TEST_REQUIRES_X86_SSE;
24286 for (uint32_t n = 1; n <= 8; n++) {
24287 GemmMicrokernelTester()
24288 .mr(1)
24289 .nr(8)
24290 .kr(1)
24291 .sr(1)
24292 .m(1)
24293 .n(n)
24294 .k(4)
24295 .iterations(1)
24296 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24297 }
24298 }
24299
24300 TEST(F32_GEMMINC_1X8__SSE_DUP, k_lt_4) {
24301 TEST_REQUIRES_X86_SSE;
24302 for (size_t k = 1; k < 4; k++) {
24303 GemmMicrokernelTester()
24304 .mr(1)
24305 .nr(8)
24306 .kr(1)
24307 .sr(1)
24308 .m(1)
24309 .n(8)
24310 .k(k)
24311 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24312 }
24313 }
24314
24315 TEST(F32_GEMMINC_1X8__SSE_DUP, k_lt_4_strided_a) {
24316 TEST_REQUIRES_X86_SSE;
24317 for (size_t k = 1; k < 4; k++) {
24318 GemmMicrokernelTester()
24319 .mr(1)
24320 .nr(8)
24321 .kr(1)
24322 .sr(1)
24323 .m(1)
24324 .n(8)
24325 .k(k)
24326 .a_stride(7)
24327 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24328 }
24329 }
24330
24331 TEST(F32_GEMMINC_1X8__SSE_DUP, k_lt_4_subtile) {
24332 TEST_REQUIRES_X86_SSE;
24333 for (size_t k = 1; k < 4; k++) {
24334 for (uint32_t m = 1; m <= 1; m++) {
24335 for (uint32_t n = 1; n <= 8; n++) {
24336 GemmMicrokernelTester()
24337 .mr(1)
24338 .nr(8)
24339 .kr(1)
24340 .sr(1)
24341 .m(m)
24342 .n(n)
24343 .k(k)
24344 .iterations(1)
24345 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24346 }
24347 }
24348 }
24349 }
24350
24351 TEST(F32_GEMMINC_1X8__SSE_DUP, k_gt_4) {
24352 TEST_REQUIRES_X86_SSE;
24353 for (size_t k = 5; k < 8; k++) {
24354 GemmMicrokernelTester()
24355 .mr(1)
24356 .nr(8)
24357 .kr(1)
24358 .sr(1)
24359 .m(1)
24360 .n(8)
24361 .k(k)
24362 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24363 }
24364 }
24365
24366 TEST(F32_GEMMINC_1X8__SSE_DUP, k_gt_4_strided_a) {
24367 TEST_REQUIRES_X86_SSE;
24368 for (size_t k = 5; k < 8; k++) {
24369 GemmMicrokernelTester()
24370 .mr(1)
24371 .nr(8)
24372 .kr(1)
24373 .sr(1)
24374 .m(1)
24375 .n(8)
24376 .k(k)
24377 .a_stride(11)
24378 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24379 }
24380 }
24381
24382 TEST(F32_GEMMINC_1X8__SSE_DUP, k_gt_4_subtile) {
24383 TEST_REQUIRES_X86_SSE;
24384 for (size_t k = 5; k < 8; k++) {
24385 for (uint32_t m = 1; m <= 1; m++) {
24386 for (uint32_t n = 1; n <= 8; n++) {
24387 GemmMicrokernelTester()
24388 .mr(1)
24389 .nr(8)
24390 .kr(1)
24391 .sr(1)
24392 .m(m)
24393 .n(n)
24394 .k(k)
24395 .iterations(1)
24396 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24397 }
24398 }
24399 }
24400 }
24401
24402 TEST(F32_GEMMINC_1X8__SSE_DUP, k_div_4) {
24403 TEST_REQUIRES_X86_SSE;
24404 for (size_t k = 8; k <= 40; k += 4) {
24405 GemmMicrokernelTester()
24406 .mr(1)
24407 .nr(8)
24408 .kr(1)
24409 .sr(1)
24410 .m(1)
24411 .n(8)
24412 .k(k)
24413 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24414 }
24415 }
24416
24417 TEST(F32_GEMMINC_1X8__SSE_DUP, k_div_4_strided_a) {
24418 TEST_REQUIRES_X86_SSE;
24419 for (size_t k = 8; k <= 40; k += 4) {
24420 GemmMicrokernelTester()
24421 .mr(1)
24422 .nr(8)
24423 .kr(1)
24424 .sr(1)
24425 .m(1)
24426 .n(8)
24427 .k(k)
24428 .a_stride(43)
24429 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24430 }
24431 }
24432
24433 TEST(F32_GEMMINC_1X8__SSE_DUP, k_div_4_subtile) {
24434 TEST_REQUIRES_X86_SSE;
24435 for (size_t k = 8; k <= 40; k += 4) {
24436 for (uint32_t m = 1; m <= 1; m++) {
24437 for (uint32_t n = 1; n <= 8; n++) {
24438 GemmMicrokernelTester()
24439 .mr(1)
24440 .nr(8)
24441 .kr(1)
24442 .sr(1)
24443 .m(m)
24444 .n(n)
24445 .k(k)
24446 .iterations(1)
24447 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24448 }
24449 }
24450 }
24451 }
24452
24453 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8) {
24454 TEST_REQUIRES_X86_SSE;
24455 for (uint32_t n = 9; n < 16; n++) {
24456 for (size_t k = 1; k <= 20; k += 5) {
24457 GemmMicrokernelTester()
24458 .mr(1)
24459 .nr(8)
24460 .kr(1)
24461 .sr(1)
24462 .m(1)
24463 .n(8)
24464 .k(k)
24465 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24466 }
24467 }
24468 }
24469
24470 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8_strided_cn) {
24471 TEST_REQUIRES_X86_SSE;
24472 for (uint32_t n = 9; n < 16; n++) {
24473 for (size_t k = 1; k <= 20; k += 5) {
24474 GemmMicrokernelTester()
24475 .mr(1)
24476 .nr(8)
24477 .kr(1)
24478 .sr(1)
24479 .m(1)
24480 .n(8)
24481 .k(k)
24482 .cn_stride(11)
24483 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24484 }
24485 }
24486 }
24487
24488 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8_strided_a) {
24489 TEST_REQUIRES_X86_SSE;
24490 for (uint32_t n = 9; n < 16; n++) {
24491 for (size_t k = 1; k <= 20; k += 5) {
24492 GemmMicrokernelTester()
24493 .mr(1)
24494 .nr(8)
24495 .kr(1)
24496 .sr(1)
24497 .m(1)
24498 .n(n)
24499 .k(k)
24500 .a_stride(23)
24501 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24502 }
24503 }
24504 }
24505
24506 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8_subtile) {
24507 TEST_REQUIRES_X86_SSE;
24508 for (uint32_t n = 9; n < 16; n++) {
24509 for (size_t k = 1; k <= 20; k += 5) {
24510 for (uint32_t m = 1; m <= 1; m++) {
24511 GemmMicrokernelTester()
24512 .mr(1)
24513 .nr(8)
24514 .kr(1)
24515 .sr(1)
24516 .m(m)
24517 .n(n)
24518 .k(k)
24519 .iterations(1)
24520 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24521 }
24522 }
24523 }
24524 }
24525
24526 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8) {
24527 TEST_REQUIRES_X86_SSE;
24528 for (uint32_t n = 16; n <= 24; n += 8) {
24529 for (size_t k = 1; k <= 20; k += 5) {
24530 GemmMicrokernelTester()
24531 .mr(1)
24532 .nr(8)
24533 .kr(1)
24534 .sr(1)
24535 .m(1)
24536 .n(8)
24537 .k(k)
24538 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24539 }
24540 }
24541 }
24542
24543 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8_strided_cn) {
24544 TEST_REQUIRES_X86_SSE;
24545 for (uint32_t n = 16; n <= 24; n += 8) {
24546 for (size_t k = 1; k <= 20; k += 5) {
24547 GemmMicrokernelTester()
24548 .mr(1)
24549 .nr(8)
24550 .kr(1)
24551 .sr(1)
24552 .m(1)
24553 .n(n)
24554 .k(k)
24555 .cn_stride(11)
24556 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24557 }
24558 }
24559 }
24560
24561 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8_strided_a) {
24562 TEST_REQUIRES_X86_SSE;
24563 for (uint32_t n = 16; n <= 24; n += 8) {
24564 for (size_t k = 1; k <= 20; k += 5) {
24565 GemmMicrokernelTester()
24566 .mr(1)
24567 .nr(8)
24568 .kr(1)
24569 .sr(1)
24570 .m(1)
24571 .n(n)
24572 .k(k)
24573 .a_stride(23)
24574 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24575 }
24576 }
24577 }
24578
24579 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8_subtile) {
24580 TEST_REQUIRES_X86_SSE;
24581 for (uint32_t n = 16; n <= 24; n += 8) {
24582 for (size_t k = 1; k <= 20; k += 5) {
24583 for (uint32_t m = 1; m <= 1; m++) {
24584 GemmMicrokernelTester()
24585 .mr(1)
24586 .nr(8)
24587 .kr(1)
24588 .sr(1)
24589 .m(m)
24590 .n(n)
24591 .k(k)
24592 .iterations(1)
24593 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24594 }
24595 }
24596 }
24597 }
24598
24599 TEST(F32_GEMMINC_1X8__SSE_DUP, strided_cm_subtile) {
24600 TEST_REQUIRES_X86_SSE;
24601 for (size_t k = 1; k <= 20; k += 5) {
24602 for (uint32_t m = 1; m <= 1; m++) {
24603 for (uint32_t n = 1; n <= 8; n++) {
24604 GemmMicrokernelTester()
24605 .mr(1)
24606 .nr(8)
24607 .kr(1)
24608 .sr(1)
24609 .m(m)
24610 .n(n)
24611 .k(k)
24612 .cm_stride(11)
24613 .iterations(1)
24614 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24615 }
24616 }
24617 }
24618 }
24619
24620 TEST(F32_GEMMINC_1X8__SSE_DUP, qmin) {
24621 TEST_REQUIRES_X86_SSE;
24622 GemmMicrokernelTester()
24623 .mr(1)
24624 .nr(8)
24625 .kr(1)
24626 .sr(1)
24627 .m(1)
24628 .n(8)
24629 .k(4)
24630 .qmin(128)
24631 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24632 }
24633
24634 TEST(F32_GEMMINC_1X8__SSE_DUP, qmax) {
24635 TEST_REQUIRES_X86_SSE;
24636 GemmMicrokernelTester()
24637 .mr(1)
24638 .nr(8)
24639 .kr(1)
24640 .sr(1)
24641 .m(1)
24642 .n(8)
24643 .k(4)
24644 .qmax(128)
24645 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24646 }
24647
24648 TEST(F32_GEMMINC_1X8__SSE_DUP, strided_cm) {
24649 TEST_REQUIRES_X86_SSE;
24650 GemmMicrokernelTester()
24651 .mr(1)
24652 .nr(8)
24653 .kr(1)
24654 .sr(1)
24655 .m(1)
24656 .n(8)
24657 .k(4)
24658 .cm_stride(11)
24659 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
24660 }
24661#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24662
24663
24664#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24665 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4) {
24666 TEST_REQUIRES_X86_SSE;
24667 GemmMicrokernelTester()
24668 .mr(4)
24669 .nr(8)
24670 .kr(1)
24671 .sr(1)
24672 .m(4)
24673 .n(8)
24674 .k(4)
24675 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24676 }
24677
24678 TEST(F32_GEMMINC_4X8__SSE_DUP, strided_cn) {
24679 TEST_REQUIRES_X86_SSE;
24680 GemmMicrokernelTester()
24681 .mr(4)
24682 .nr(8)
24683 .kr(1)
24684 .sr(1)
24685 .m(4)
24686 .n(8)
24687 .k(4)
24688 .cn_stride(11)
24689 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24690 }
24691
24692 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_strided_a) {
24693 TEST_REQUIRES_X86_SSE;
24694 GemmMicrokernelTester()
24695 .mr(4)
24696 .nr(8)
24697 .kr(1)
24698 .sr(1)
24699 .m(4)
24700 .n(8)
24701 .k(4)
24702 .a_stride(7)
24703 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24704 }
24705
24706 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_subtile) {
24707 TEST_REQUIRES_X86_SSE;
24708 for (uint32_t m = 1; m <= 4; m++) {
24709 for (uint32_t n = 1; n <= 8; n++) {
24710 GemmMicrokernelTester()
24711 .mr(4)
24712 .nr(8)
24713 .kr(1)
24714 .sr(1)
24715 .m(m)
24716 .n(n)
24717 .k(4)
24718 .iterations(1)
24719 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24720 }
24721 }
24722 }
24723
24724 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_subtile_m) {
24725 TEST_REQUIRES_X86_SSE;
24726 for (uint32_t m = 1; m <= 4; m++) {
24727 GemmMicrokernelTester()
24728 .mr(4)
24729 .nr(8)
24730 .kr(1)
24731 .sr(1)
24732 .m(m)
24733 .n(8)
24734 .k(4)
24735 .iterations(1)
24736 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24737 }
24738 }
24739
24740 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_subtile_n) {
24741 TEST_REQUIRES_X86_SSE;
24742 for (uint32_t n = 1; n <= 8; n++) {
24743 GemmMicrokernelTester()
24744 .mr(4)
24745 .nr(8)
24746 .kr(1)
24747 .sr(1)
24748 .m(4)
24749 .n(n)
24750 .k(4)
24751 .iterations(1)
24752 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24753 }
24754 }
24755
24756 TEST(F32_GEMMINC_4X8__SSE_DUP, k_lt_4) {
24757 TEST_REQUIRES_X86_SSE;
24758 for (size_t k = 1; k < 4; k++) {
24759 GemmMicrokernelTester()
24760 .mr(4)
24761 .nr(8)
24762 .kr(1)
24763 .sr(1)
24764 .m(4)
24765 .n(8)
24766 .k(k)
24767 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24768 }
24769 }
24770
24771 TEST(F32_GEMMINC_4X8__SSE_DUP, k_lt_4_strided_a) {
24772 TEST_REQUIRES_X86_SSE;
24773 for (size_t k = 1; k < 4; k++) {
24774 GemmMicrokernelTester()
24775 .mr(4)
24776 .nr(8)
24777 .kr(1)
24778 .sr(1)
24779 .m(4)
24780 .n(8)
24781 .k(k)
24782 .a_stride(7)
24783 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24784 }
24785 }
24786
24787 TEST(F32_GEMMINC_4X8__SSE_DUP, k_lt_4_subtile) {
24788 TEST_REQUIRES_X86_SSE;
24789 for (size_t k = 1; k < 4; k++) {
24790 for (uint32_t m = 1; m <= 4; m++) {
24791 for (uint32_t n = 1; n <= 8; n++) {
24792 GemmMicrokernelTester()
24793 .mr(4)
24794 .nr(8)
24795 .kr(1)
24796 .sr(1)
24797 .m(m)
24798 .n(n)
24799 .k(k)
24800 .iterations(1)
24801 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24802 }
24803 }
24804 }
24805 }
24806
24807 TEST(F32_GEMMINC_4X8__SSE_DUP, k_gt_4) {
24808 TEST_REQUIRES_X86_SSE;
24809 for (size_t k = 5; k < 8; k++) {
24810 GemmMicrokernelTester()
24811 .mr(4)
24812 .nr(8)
24813 .kr(1)
24814 .sr(1)
24815 .m(4)
24816 .n(8)
24817 .k(k)
24818 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24819 }
24820 }
24821
24822 TEST(F32_GEMMINC_4X8__SSE_DUP, k_gt_4_strided_a) {
24823 TEST_REQUIRES_X86_SSE;
24824 for (size_t k = 5; k < 8; k++) {
24825 GemmMicrokernelTester()
24826 .mr(4)
24827 .nr(8)
24828 .kr(1)
24829 .sr(1)
24830 .m(4)
24831 .n(8)
24832 .k(k)
24833 .a_stride(11)
24834 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24835 }
24836 }
24837
24838 TEST(F32_GEMMINC_4X8__SSE_DUP, k_gt_4_subtile) {
24839 TEST_REQUIRES_X86_SSE;
24840 for (size_t k = 5; k < 8; k++) {
24841 for (uint32_t m = 1; m <= 4; m++) {
24842 for (uint32_t n = 1; n <= 8; n++) {
24843 GemmMicrokernelTester()
24844 .mr(4)
24845 .nr(8)
24846 .kr(1)
24847 .sr(1)
24848 .m(m)
24849 .n(n)
24850 .k(k)
24851 .iterations(1)
24852 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24853 }
24854 }
24855 }
24856 }
24857
24858 TEST(F32_GEMMINC_4X8__SSE_DUP, k_div_4) {
24859 TEST_REQUIRES_X86_SSE;
24860 for (size_t k = 8; k <= 40; k += 4) {
24861 GemmMicrokernelTester()
24862 .mr(4)
24863 .nr(8)
24864 .kr(1)
24865 .sr(1)
24866 .m(4)
24867 .n(8)
24868 .k(k)
24869 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24870 }
24871 }
24872
24873 TEST(F32_GEMMINC_4X8__SSE_DUP, k_div_4_strided_a) {
24874 TEST_REQUIRES_X86_SSE;
24875 for (size_t k = 8; k <= 40; k += 4) {
24876 GemmMicrokernelTester()
24877 .mr(4)
24878 .nr(8)
24879 .kr(1)
24880 .sr(1)
24881 .m(4)
24882 .n(8)
24883 .k(k)
24884 .a_stride(43)
24885 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24886 }
24887 }
24888
24889 TEST(F32_GEMMINC_4X8__SSE_DUP, k_div_4_subtile) {
24890 TEST_REQUIRES_X86_SSE;
24891 for (size_t k = 8; k <= 40; k += 4) {
24892 for (uint32_t m = 1; m <= 4; m++) {
24893 for (uint32_t n = 1; n <= 8; n++) {
24894 GemmMicrokernelTester()
24895 .mr(4)
24896 .nr(8)
24897 .kr(1)
24898 .sr(1)
24899 .m(m)
24900 .n(n)
24901 .k(k)
24902 .iterations(1)
24903 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24904 }
24905 }
24906 }
24907 }
24908
24909 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8) {
24910 TEST_REQUIRES_X86_SSE;
24911 for (uint32_t n = 9; n < 16; n++) {
24912 for (size_t k = 1; k <= 20; k += 5) {
24913 GemmMicrokernelTester()
24914 .mr(4)
24915 .nr(8)
24916 .kr(1)
24917 .sr(1)
24918 .m(4)
24919 .n(8)
24920 .k(k)
24921 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24922 }
24923 }
24924 }
24925
24926 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8_strided_cn) {
24927 TEST_REQUIRES_X86_SSE;
24928 for (uint32_t n = 9; n < 16; n++) {
24929 for (size_t k = 1; k <= 20; k += 5) {
24930 GemmMicrokernelTester()
24931 .mr(4)
24932 .nr(8)
24933 .kr(1)
24934 .sr(1)
24935 .m(4)
24936 .n(8)
24937 .k(k)
24938 .cn_stride(11)
24939 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24940 }
24941 }
24942 }
24943
24944 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8_strided_a) {
24945 TEST_REQUIRES_X86_SSE;
24946 for (uint32_t n = 9; n < 16; n++) {
24947 for (size_t k = 1; k <= 20; k += 5) {
24948 GemmMicrokernelTester()
24949 .mr(4)
24950 .nr(8)
24951 .kr(1)
24952 .sr(1)
24953 .m(4)
24954 .n(n)
24955 .k(k)
24956 .a_stride(23)
24957 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24958 }
24959 }
24960 }
24961
24962 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8_subtile) {
24963 TEST_REQUIRES_X86_SSE;
24964 for (uint32_t n = 9; n < 16; n++) {
24965 for (size_t k = 1; k <= 20; k += 5) {
24966 for (uint32_t m = 1; m <= 4; m++) {
24967 GemmMicrokernelTester()
24968 .mr(4)
24969 .nr(8)
24970 .kr(1)
24971 .sr(1)
24972 .m(m)
24973 .n(n)
24974 .k(k)
24975 .iterations(1)
24976 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24977 }
24978 }
24979 }
24980 }
24981
24982 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8) {
24983 TEST_REQUIRES_X86_SSE;
24984 for (uint32_t n = 16; n <= 24; n += 8) {
24985 for (size_t k = 1; k <= 20; k += 5) {
24986 GemmMicrokernelTester()
24987 .mr(4)
24988 .nr(8)
24989 .kr(1)
24990 .sr(1)
24991 .m(4)
24992 .n(8)
24993 .k(k)
24994 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
24995 }
24996 }
24997 }
24998
24999 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8_strided_cn) {
25000 TEST_REQUIRES_X86_SSE;
25001 for (uint32_t n = 16; n <= 24; n += 8) {
25002 for (size_t k = 1; k <= 20; k += 5) {
25003 GemmMicrokernelTester()
25004 .mr(4)
25005 .nr(8)
25006 .kr(1)
25007 .sr(1)
25008 .m(4)
25009 .n(n)
25010 .k(k)
25011 .cn_stride(11)
25012 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25013 }
25014 }
25015 }
25016
25017 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8_strided_a) {
25018 TEST_REQUIRES_X86_SSE;
25019 for (uint32_t n = 16; n <= 24; n += 8) {
25020 for (size_t k = 1; k <= 20; k += 5) {
25021 GemmMicrokernelTester()
25022 .mr(4)
25023 .nr(8)
25024 .kr(1)
25025 .sr(1)
25026 .m(4)
25027 .n(n)
25028 .k(k)
25029 .a_stride(23)
25030 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25031 }
25032 }
25033 }
25034
25035 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8_subtile) {
25036 TEST_REQUIRES_X86_SSE;
25037 for (uint32_t n = 16; n <= 24; n += 8) {
25038 for (size_t k = 1; k <= 20; k += 5) {
25039 for (uint32_t m = 1; m <= 4; m++) {
25040 GemmMicrokernelTester()
25041 .mr(4)
25042 .nr(8)
25043 .kr(1)
25044 .sr(1)
25045 .m(m)
25046 .n(n)
25047 .k(k)
25048 .iterations(1)
25049 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25050 }
25051 }
25052 }
25053 }
25054
25055 TEST(F32_GEMMINC_4X8__SSE_DUP, strided_cm_subtile) {
25056 TEST_REQUIRES_X86_SSE;
25057 for (size_t k = 1; k <= 20; k += 5) {
25058 for (uint32_t m = 1; m <= 4; m++) {
25059 for (uint32_t n = 1; n <= 8; n++) {
25060 GemmMicrokernelTester()
25061 .mr(4)
25062 .nr(8)
25063 .kr(1)
25064 .sr(1)
25065 .m(m)
25066 .n(n)
25067 .k(k)
25068 .cm_stride(11)
25069 .iterations(1)
25070 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25071 }
25072 }
25073 }
25074 }
25075
25076 TEST(F32_GEMMINC_4X8__SSE_DUP, qmin) {
25077 TEST_REQUIRES_X86_SSE;
25078 GemmMicrokernelTester()
25079 .mr(4)
25080 .nr(8)
25081 .kr(1)
25082 .sr(1)
25083 .m(4)
25084 .n(8)
25085 .k(4)
25086 .qmin(128)
25087 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25088 }
25089
25090 TEST(F32_GEMMINC_4X8__SSE_DUP, qmax) {
25091 TEST_REQUIRES_X86_SSE;
25092 GemmMicrokernelTester()
25093 .mr(4)
25094 .nr(8)
25095 .kr(1)
25096 .sr(1)
25097 .m(4)
25098 .n(8)
25099 .k(4)
25100 .qmax(128)
25101 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25102 }
25103
25104 TEST(F32_GEMMINC_4X8__SSE_DUP, strided_cm) {
25105 TEST_REQUIRES_X86_SSE;
25106 GemmMicrokernelTester()
25107 .mr(4)
25108 .nr(8)
25109 .kr(1)
25110 .sr(1)
25111 .m(4)
25112 .n(8)
25113 .k(4)
25114 .cm_stride(11)
25115 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
25116 }
25117#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25118
25119
25120#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25121 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4) {
25122 TEST_REQUIRES_X86_SSE;
25123 GemmMicrokernelTester()
25124 .mr(1)
25125 .nr(8)
25126 .kr(1)
25127 .sr(4)
25128 .m(1)
25129 .n(8)
25130 .k(4)
25131 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25132 }
25133
25134 TEST(F32_GEMMINC_1X8S4__SSE, strided_cn) {
25135 TEST_REQUIRES_X86_SSE;
25136 GemmMicrokernelTester()
25137 .mr(1)
25138 .nr(8)
25139 .kr(1)
25140 .sr(4)
25141 .m(1)
25142 .n(8)
25143 .k(4)
25144 .cn_stride(11)
25145 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25146 }
25147
25148 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_strided_a) {
25149 TEST_REQUIRES_X86_SSE;
25150 GemmMicrokernelTester()
25151 .mr(1)
25152 .nr(8)
25153 .kr(1)
25154 .sr(4)
25155 .m(1)
25156 .n(8)
25157 .k(4)
25158 .a_stride(7)
25159 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25160 }
25161
25162 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_subtile) {
25163 TEST_REQUIRES_X86_SSE;
25164 for (uint32_t m = 1; m <= 1; m++) {
25165 for (uint32_t n = 1; n <= 8; n++) {
25166 GemmMicrokernelTester()
25167 .mr(1)
25168 .nr(8)
25169 .kr(1)
25170 .sr(4)
25171 .m(m)
25172 .n(n)
25173 .k(4)
25174 .iterations(1)
25175 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25176 }
25177 }
25178 }
25179
25180 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_subtile_m) {
25181 TEST_REQUIRES_X86_SSE;
25182 for (uint32_t m = 1; m <= 1; m++) {
25183 GemmMicrokernelTester()
25184 .mr(1)
25185 .nr(8)
25186 .kr(1)
25187 .sr(4)
25188 .m(m)
25189 .n(8)
25190 .k(4)
25191 .iterations(1)
25192 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25193 }
25194 }
25195
25196 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_subtile_n) {
25197 TEST_REQUIRES_X86_SSE;
25198 for (uint32_t n = 1; n <= 8; n++) {
25199 GemmMicrokernelTester()
25200 .mr(1)
25201 .nr(8)
25202 .kr(1)
25203 .sr(4)
25204 .m(1)
25205 .n(n)
25206 .k(4)
25207 .iterations(1)
25208 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25209 }
25210 }
25211
25212 TEST(F32_GEMMINC_1X8S4__SSE, k_lt_4) {
25213 TEST_REQUIRES_X86_SSE;
25214 for (size_t k = 1; k < 4; k++) {
25215 GemmMicrokernelTester()
25216 .mr(1)
25217 .nr(8)
25218 .kr(1)
25219 .sr(4)
25220 .m(1)
25221 .n(8)
25222 .k(k)
25223 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25224 }
25225 }
25226
25227 TEST(F32_GEMMINC_1X8S4__SSE, k_lt_4_strided_a) {
25228 TEST_REQUIRES_X86_SSE;
25229 for (size_t k = 1; k < 4; k++) {
25230 GemmMicrokernelTester()
25231 .mr(1)
25232 .nr(8)
25233 .kr(1)
25234 .sr(4)
25235 .m(1)
25236 .n(8)
25237 .k(k)
25238 .a_stride(7)
25239 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25240 }
25241 }
25242
25243 TEST(F32_GEMMINC_1X8S4__SSE, k_lt_4_subtile) {
25244 TEST_REQUIRES_X86_SSE;
25245 for (size_t k = 1; k < 4; k++) {
25246 for (uint32_t m = 1; m <= 1; m++) {
25247 for (uint32_t n = 1; n <= 8; n++) {
25248 GemmMicrokernelTester()
25249 .mr(1)
25250 .nr(8)
25251 .kr(1)
25252 .sr(4)
25253 .m(m)
25254 .n(n)
25255 .k(k)
25256 .iterations(1)
25257 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25258 }
25259 }
25260 }
25261 }
25262
25263 TEST(F32_GEMMINC_1X8S4__SSE, k_gt_4) {
25264 TEST_REQUIRES_X86_SSE;
25265 for (size_t k = 5; k < 8; k++) {
25266 GemmMicrokernelTester()
25267 .mr(1)
25268 .nr(8)
25269 .kr(1)
25270 .sr(4)
25271 .m(1)
25272 .n(8)
25273 .k(k)
25274 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25275 }
25276 }
25277
25278 TEST(F32_GEMMINC_1X8S4__SSE, k_gt_4_strided_a) {
25279 TEST_REQUIRES_X86_SSE;
25280 for (size_t k = 5; k < 8; k++) {
25281 GemmMicrokernelTester()
25282 .mr(1)
25283 .nr(8)
25284 .kr(1)
25285 .sr(4)
25286 .m(1)
25287 .n(8)
25288 .k(k)
25289 .a_stride(11)
25290 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25291 }
25292 }
25293
25294 TEST(F32_GEMMINC_1X8S4__SSE, k_gt_4_subtile) {
25295 TEST_REQUIRES_X86_SSE;
25296 for (size_t k = 5; k < 8; k++) {
25297 for (uint32_t m = 1; m <= 1; m++) {
25298 for (uint32_t n = 1; n <= 8; n++) {
25299 GemmMicrokernelTester()
25300 .mr(1)
25301 .nr(8)
25302 .kr(1)
25303 .sr(4)
25304 .m(m)
25305 .n(n)
25306 .k(k)
25307 .iterations(1)
25308 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25309 }
25310 }
25311 }
25312 }
25313
25314 TEST(F32_GEMMINC_1X8S4__SSE, k_div_4) {
25315 TEST_REQUIRES_X86_SSE;
25316 for (size_t k = 8; k <= 40; k += 4) {
25317 GemmMicrokernelTester()
25318 .mr(1)
25319 .nr(8)
25320 .kr(1)
25321 .sr(4)
25322 .m(1)
25323 .n(8)
25324 .k(k)
25325 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25326 }
25327 }
25328
25329 TEST(F32_GEMMINC_1X8S4__SSE, k_div_4_strided_a) {
25330 TEST_REQUIRES_X86_SSE;
25331 for (size_t k = 8; k <= 40; k += 4) {
25332 GemmMicrokernelTester()
25333 .mr(1)
25334 .nr(8)
25335 .kr(1)
25336 .sr(4)
25337 .m(1)
25338 .n(8)
25339 .k(k)
25340 .a_stride(43)
25341 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25342 }
25343 }
25344
25345 TEST(F32_GEMMINC_1X8S4__SSE, k_div_4_subtile) {
25346 TEST_REQUIRES_X86_SSE;
25347 for (size_t k = 8; k <= 40; k += 4) {
25348 for (uint32_t m = 1; m <= 1; m++) {
25349 for (uint32_t n = 1; n <= 8; n++) {
25350 GemmMicrokernelTester()
25351 .mr(1)
25352 .nr(8)
25353 .kr(1)
25354 .sr(4)
25355 .m(m)
25356 .n(n)
25357 .k(k)
25358 .iterations(1)
25359 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25360 }
25361 }
25362 }
25363 }
25364
25365 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8) {
25366 TEST_REQUIRES_X86_SSE;
25367 for (uint32_t n = 9; n < 16; n++) {
25368 for (size_t k = 1; k <= 20; k += 5) {
25369 GemmMicrokernelTester()
25370 .mr(1)
25371 .nr(8)
25372 .kr(1)
25373 .sr(4)
25374 .m(1)
25375 .n(8)
25376 .k(k)
25377 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25378 }
25379 }
25380 }
25381
25382 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8_strided_cn) {
25383 TEST_REQUIRES_X86_SSE;
25384 for (uint32_t n = 9; n < 16; n++) {
25385 for (size_t k = 1; k <= 20; k += 5) {
25386 GemmMicrokernelTester()
25387 .mr(1)
25388 .nr(8)
25389 .kr(1)
25390 .sr(4)
25391 .m(1)
25392 .n(8)
25393 .k(k)
25394 .cn_stride(11)
25395 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25396 }
25397 }
25398 }
25399
25400 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8_strided_a) {
25401 TEST_REQUIRES_X86_SSE;
25402 for (uint32_t n = 9; n < 16; n++) {
25403 for (size_t k = 1; k <= 20; k += 5) {
25404 GemmMicrokernelTester()
25405 .mr(1)
25406 .nr(8)
25407 .kr(1)
25408 .sr(4)
25409 .m(1)
25410 .n(n)
25411 .k(k)
25412 .a_stride(23)
25413 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25414 }
25415 }
25416 }
25417
25418 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8_subtile) {
25419 TEST_REQUIRES_X86_SSE;
25420 for (uint32_t n = 9; n < 16; n++) {
25421 for (size_t k = 1; k <= 20; k += 5) {
25422 for (uint32_t m = 1; m <= 1; m++) {
25423 GemmMicrokernelTester()
25424 .mr(1)
25425 .nr(8)
25426 .kr(1)
25427 .sr(4)
25428 .m(m)
25429 .n(n)
25430 .k(k)
25431 .iterations(1)
25432 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25433 }
25434 }
25435 }
25436 }
25437
25438 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8) {
25439 TEST_REQUIRES_X86_SSE;
25440 for (uint32_t n = 16; n <= 24; n += 8) {
25441 for (size_t k = 1; k <= 20; k += 5) {
25442 GemmMicrokernelTester()
25443 .mr(1)
25444 .nr(8)
25445 .kr(1)
25446 .sr(4)
25447 .m(1)
25448 .n(8)
25449 .k(k)
25450 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25451 }
25452 }
25453 }
25454
25455 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8_strided_cn) {
25456 TEST_REQUIRES_X86_SSE;
25457 for (uint32_t n = 16; n <= 24; n += 8) {
25458 for (size_t k = 1; k <= 20; k += 5) {
25459 GemmMicrokernelTester()
25460 .mr(1)
25461 .nr(8)
25462 .kr(1)
25463 .sr(4)
25464 .m(1)
25465 .n(n)
25466 .k(k)
25467 .cn_stride(11)
25468 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25469 }
25470 }
25471 }
25472
25473 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8_strided_a) {
25474 TEST_REQUIRES_X86_SSE;
25475 for (uint32_t n = 16; n <= 24; n += 8) {
25476 for (size_t k = 1; k <= 20; k += 5) {
25477 GemmMicrokernelTester()
25478 .mr(1)
25479 .nr(8)
25480 .kr(1)
25481 .sr(4)
25482 .m(1)
25483 .n(n)
25484 .k(k)
25485 .a_stride(23)
25486 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25487 }
25488 }
25489 }
25490
25491 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8_subtile) {
25492 TEST_REQUIRES_X86_SSE;
25493 for (uint32_t n = 16; n <= 24; n += 8) {
25494 for (size_t k = 1; k <= 20; k += 5) {
25495 for (uint32_t m = 1; m <= 1; m++) {
25496 GemmMicrokernelTester()
25497 .mr(1)
25498 .nr(8)
25499 .kr(1)
25500 .sr(4)
25501 .m(m)
25502 .n(n)
25503 .k(k)
25504 .iterations(1)
25505 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25506 }
25507 }
25508 }
25509 }
25510
25511 TEST(F32_GEMMINC_1X8S4__SSE, strided_cm_subtile) {
25512 TEST_REQUIRES_X86_SSE;
25513 for (size_t k = 1; k <= 20; k += 5) {
25514 for (uint32_t m = 1; m <= 1; m++) {
25515 for (uint32_t n = 1; n <= 8; n++) {
25516 GemmMicrokernelTester()
25517 .mr(1)
25518 .nr(8)
25519 .kr(1)
25520 .sr(4)
25521 .m(m)
25522 .n(n)
25523 .k(k)
25524 .cm_stride(11)
25525 .iterations(1)
25526 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25527 }
25528 }
25529 }
25530 }
25531
25532 TEST(F32_GEMMINC_1X8S4__SSE, qmin) {
25533 TEST_REQUIRES_X86_SSE;
25534 GemmMicrokernelTester()
25535 .mr(1)
25536 .nr(8)
25537 .kr(1)
25538 .sr(4)
25539 .m(1)
25540 .n(8)
25541 .k(4)
25542 .qmin(128)
25543 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25544 }
25545
25546 TEST(F32_GEMMINC_1X8S4__SSE, qmax) {
25547 TEST_REQUIRES_X86_SSE;
25548 GemmMicrokernelTester()
25549 .mr(1)
25550 .nr(8)
25551 .kr(1)
25552 .sr(4)
25553 .m(1)
25554 .n(8)
25555 .k(4)
25556 .qmax(128)
25557 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25558 }
25559
25560 TEST(F32_GEMMINC_1X8S4__SSE, strided_cm) {
25561 TEST_REQUIRES_X86_SSE;
25562 GemmMicrokernelTester()
25563 .mr(1)
25564 .nr(8)
25565 .kr(1)
25566 .sr(4)
25567 .m(1)
25568 .n(8)
25569 .k(4)
25570 .cm_stride(11)
25571 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
25572 }
25573#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25574
25575
25576#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25577 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4) {
25578 TEST_REQUIRES_X86_SSE;
25579 GemmMicrokernelTester()
25580 .mr(4)
25581 .nr(8)
25582 .kr(1)
25583 .sr(4)
25584 .m(4)
25585 .n(8)
25586 .k(4)
25587 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25588 }
25589
25590 TEST(F32_GEMMINC_4X8S4__SSE, strided_cn) {
25591 TEST_REQUIRES_X86_SSE;
25592 GemmMicrokernelTester()
25593 .mr(4)
25594 .nr(8)
25595 .kr(1)
25596 .sr(4)
25597 .m(4)
25598 .n(8)
25599 .k(4)
25600 .cn_stride(11)
25601 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25602 }
25603
25604 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_strided_a) {
25605 TEST_REQUIRES_X86_SSE;
25606 GemmMicrokernelTester()
25607 .mr(4)
25608 .nr(8)
25609 .kr(1)
25610 .sr(4)
25611 .m(4)
25612 .n(8)
25613 .k(4)
25614 .a_stride(7)
25615 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25616 }
25617
25618 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_subtile) {
25619 TEST_REQUIRES_X86_SSE;
25620 for (uint32_t m = 1; m <= 4; m++) {
25621 for (uint32_t n = 1; n <= 8; n++) {
25622 GemmMicrokernelTester()
25623 .mr(4)
25624 .nr(8)
25625 .kr(1)
25626 .sr(4)
25627 .m(m)
25628 .n(n)
25629 .k(4)
25630 .iterations(1)
25631 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25632 }
25633 }
25634 }
25635
25636 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_subtile_m) {
25637 TEST_REQUIRES_X86_SSE;
25638 for (uint32_t m = 1; m <= 4; m++) {
25639 GemmMicrokernelTester()
25640 .mr(4)
25641 .nr(8)
25642 .kr(1)
25643 .sr(4)
25644 .m(m)
25645 .n(8)
25646 .k(4)
25647 .iterations(1)
25648 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25649 }
25650 }
25651
25652 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_subtile_n) {
25653 TEST_REQUIRES_X86_SSE;
25654 for (uint32_t n = 1; n <= 8; n++) {
25655 GemmMicrokernelTester()
25656 .mr(4)
25657 .nr(8)
25658 .kr(1)
25659 .sr(4)
25660 .m(4)
25661 .n(n)
25662 .k(4)
25663 .iterations(1)
25664 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25665 }
25666 }
25667
25668 TEST(F32_GEMMINC_4X8S4__SSE, k_lt_4) {
25669 TEST_REQUIRES_X86_SSE;
25670 for (size_t k = 1; k < 4; k++) {
25671 GemmMicrokernelTester()
25672 .mr(4)
25673 .nr(8)
25674 .kr(1)
25675 .sr(4)
25676 .m(4)
25677 .n(8)
25678 .k(k)
25679 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25680 }
25681 }
25682
25683 TEST(F32_GEMMINC_4X8S4__SSE, k_lt_4_strided_a) {
25684 TEST_REQUIRES_X86_SSE;
25685 for (size_t k = 1; k < 4; k++) {
25686 GemmMicrokernelTester()
25687 .mr(4)
25688 .nr(8)
25689 .kr(1)
25690 .sr(4)
25691 .m(4)
25692 .n(8)
25693 .k(k)
25694 .a_stride(7)
25695 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25696 }
25697 }
25698
25699 TEST(F32_GEMMINC_4X8S4__SSE, k_lt_4_subtile) {
25700 TEST_REQUIRES_X86_SSE;
25701 for (size_t k = 1; k < 4; k++) {
25702 for (uint32_t m = 1; m <= 4; m++) {
25703 for (uint32_t n = 1; n <= 8; n++) {
25704 GemmMicrokernelTester()
25705 .mr(4)
25706 .nr(8)
25707 .kr(1)
25708 .sr(4)
25709 .m(m)
25710 .n(n)
25711 .k(k)
25712 .iterations(1)
25713 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25714 }
25715 }
25716 }
25717 }
25718
25719 TEST(F32_GEMMINC_4X8S4__SSE, k_gt_4) {
25720 TEST_REQUIRES_X86_SSE;
25721 for (size_t k = 5; k < 8; k++) {
25722 GemmMicrokernelTester()
25723 .mr(4)
25724 .nr(8)
25725 .kr(1)
25726 .sr(4)
25727 .m(4)
25728 .n(8)
25729 .k(k)
25730 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25731 }
25732 }
25733
25734 TEST(F32_GEMMINC_4X8S4__SSE, k_gt_4_strided_a) {
25735 TEST_REQUIRES_X86_SSE;
25736 for (size_t k = 5; k < 8; k++) {
25737 GemmMicrokernelTester()
25738 .mr(4)
25739 .nr(8)
25740 .kr(1)
25741 .sr(4)
25742 .m(4)
25743 .n(8)
25744 .k(k)
25745 .a_stride(11)
25746 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25747 }
25748 }
25749
25750 TEST(F32_GEMMINC_4X8S4__SSE, k_gt_4_subtile) {
25751 TEST_REQUIRES_X86_SSE;
25752 for (size_t k = 5; k < 8; k++) {
25753 for (uint32_t m = 1; m <= 4; m++) {
25754 for (uint32_t n = 1; n <= 8; n++) {
25755 GemmMicrokernelTester()
25756 .mr(4)
25757 .nr(8)
25758 .kr(1)
25759 .sr(4)
25760 .m(m)
25761 .n(n)
25762 .k(k)
25763 .iterations(1)
25764 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25765 }
25766 }
25767 }
25768 }
25769
25770 TEST(F32_GEMMINC_4X8S4__SSE, k_div_4) {
25771 TEST_REQUIRES_X86_SSE;
25772 for (size_t k = 8; k <= 40; k += 4) {
25773 GemmMicrokernelTester()
25774 .mr(4)
25775 .nr(8)
25776 .kr(1)
25777 .sr(4)
25778 .m(4)
25779 .n(8)
25780 .k(k)
25781 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25782 }
25783 }
25784
25785 TEST(F32_GEMMINC_4X8S4__SSE, k_div_4_strided_a) {
25786 TEST_REQUIRES_X86_SSE;
25787 for (size_t k = 8; k <= 40; k += 4) {
25788 GemmMicrokernelTester()
25789 .mr(4)
25790 .nr(8)
25791 .kr(1)
25792 .sr(4)
25793 .m(4)
25794 .n(8)
25795 .k(k)
25796 .a_stride(43)
25797 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25798 }
25799 }
25800
25801 TEST(F32_GEMMINC_4X8S4__SSE, k_div_4_subtile) {
25802 TEST_REQUIRES_X86_SSE;
25803 for (size_t k = 8; k <= 40; k += 4) {
25804 for (uint32_t m = 1; m <= 4; m++) {
25805 for (uint32_t n = 1; n <= 8; n++) {
25806 GemmMicrokernelTester()
25807 .mr(4)
25808 .nr(8)
25809 .kr(1)
25810 .sr(4)
25811 .m(m)
25812 .n(n)
25813 .k(k)
25814 .iterations(1)
25815 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25816 }
25817 }
25818 }
25819 }
25820
25821 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8) {
25822 TEST_REQUIRES_X86_SSE;
25823 for (uint32_t n = 9; n < 16; n++) {
25824 for (size_t k = 1; k <= 20; k += 5) {
25825 GemmMicrokernelTester()
25826 .mr(4)
25827 .nr(8)
25828 .kr(1)
25829 .sr(4)
25830 .m(4)
25831 .n(8)
25832 .k(k)
25833 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25834 }
25835 }
25836 }
25837
25838 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8_strided_cn) {
25839 TEST_REQUIRES_X86_SSE;
25840 for (uint32_t n = 9; n < 16; n++) {
25841 for (size_t k = 1; k <= 20; k += 5) {
25842 GemmMicrokernelTester()
25843 .mr(4)
25844 .nr(8)
25845 .kr(1)
25846 .sr(4)
25847 .m(4)
25848 .n(8)
25849 .k(k)
25850 .cn_stride(11)
25851 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25852 }
25853 }
25854 }
25855
25856 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8_strided_a) {
25857 TEST_REQUIRES_X86_SSE;
25858 for (uint32_t n = 9; n < 16; n++) {
25859 for (size_t k = 1; k <= 20; k += 5) {
25860 GemmMicrokernelTester()
25861 .mr(4)
25862 .nr(8)
25863 .kr(1)
25864 .sr(4)
25865 .m(4)
25866 .n(n)
25867 .k(k)
25868 .a_stride(23)
25869 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25870 }
25871 }
25872 }
25873
25874 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8_subtile) {
25875 TEST_REQUIRES_X86_SSE;
25876 for (uint32_t n = 9; n < 16; n++) {
25877 for (size_t k = 1; k <= 20; k += 5) {
25878 for (uint32_t m = 1; m <= 4; m++) {
25879 GemmMicrokernelTester()
25880 .mr(4)
25881 .nr(8)
25882 .kr(1)
25883 .sr(4)
25884 .m(m)
25885 .n(n)
25886 .k(k)
25887 .iterations(1)
25888 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25889 }
25890 }
25891 }
25892 }
25893
25894 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8) {
25895 TEST_REQUIRES_X86_SSE;
25896 for (uint32_t n = 16; n <= 24; n += 8) {
25897 for (size_t k = 1; k <= 20; k += 5) {
25898 GemmMicrokernelTester()
25899 .mr(4)
25900 .nr(8)
25901 .kr(1)
25902 .sr(4)
25903 .m(4)
25904 .n(8)
25905 .k(k)
25906 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25907 }
25908 }
25909 }
25910
25911 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8_strided_cn) {
25912 TEST_REQUIRES_X86_SSE;
25913 for (uint32_t n = 16; n <= 24; n += 8) {
25914 for (size_t k = 1; k <= 20; k += 5) {
25915 GemmMicrokernelTester()
25916 .mr(4)
25917 .nr(8)
25918 .kr(1)
25919 .sr(4)
25920 .m(4)
25921 .n(n)
25922 .k(k)
25923 .cn_stride(11)
25924 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25925 }
25926 }
25927 }
25928
25929 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8_strided_a) {
25930 TEST_REQUIRES_X86_SSE;
25931 for (uint32_t n = 16; n <= 24; n += 8) {
25932 for (size_t k = 1; k <= 20; k += 5) {
25933 GemmMicrokernelTester()
25934 .mr(4)
25935 .nr(8)
25936 .kr(1)
25937 .sr(4)
25938 .m(4)
25939 .n(n)
25940 .k(k)
25941 .a_stride(23)
25942 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25943 }
25944 }
25945 }
25946
25947 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8_subtile) {
25948 TEST_REQUIRES_X86_SSE;
25949 for (uint32_t n = 16; n <= 24; n += 8) {
25950 for (size_t k = 1; k <= 20; k += 5) {
25951 for (uint32_t m = 1; m <= 4; m++) {
25952 GemmMicrokernelTester()
25953 .mr(4)
25954 .nr(8)
25955 .kr(1)
25956 .sr(4)
25957 .m(m)
25958 .n(n)
25959 .k(k)
25960 .iterations(1)
25961 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25962 }
25963 }
25964 }
25965 }
25966
25967 TEST(F32_GEMMINC_4X8S4__SSE, strided_cm_subtile) {
25968 TEST_REQUIRES_X86_SSE;
25969 for (size_t k = 1; k <= 20; k += 5) {
25970 for (uint32_t m = 1; m <= 4; m++) {
25971 for (uint32_t n = 1; n <= 8; n++) {
25972 GemmMicrokernelTester()
25973 .mr(4)
25974 .nr(8)
25975 .kr(1)
25976 .sr(4)
25977 .m(m)
25978 .n(n)
25979 .k(k)
25980 .cm_stride(11)
25981 .iterations(1)
25982 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
25983 }
25984 }
25985 }
25986 }
25987
25988 TEST(F32_GEMMINC_4X8S4__SSE, qmin) {
25989 TEST_REQUIRES_X86_SSE;
25990 GemmMicrokernelTester()
25991 .mr(4)
25992 .nr(8)
25993 .kr(1)
25994 .sr(4)
25995 .m(4)
25996 .n(8)
25997 .k(4)
25998 .qmin(128)
25999 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
26000 }
26001
26002 TEST(F32_GEMMINC_4X8S4__SSE, qmax) {
26003 TEST_REQUIRES_X86_SSE;
26004 GemmMicrokernelTester()
26005 .mr(4)
26006 .nr(8)
26007 .kr(1)
26008 .sr(4)
26009 .m(4)
26010 .n(8)
26011 .k(4)
26012 .qmax(128)
26013 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
26014 }
26015
26016 TEST(F32_GEMMINC_4X8S4__SSE, strided_cm) {
26017 TEST_REQUIRES_X86_SSE;
26018 GemmMicrokernelTester()
26019 .mr(4)
26020 .nr(8)
26021 .kr(1)
26022 .sr(4)
26023 .m(4)
26024 .n(8)
26025 .k(4)
26026 .cm_stride(11)
26027 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
26028 }
26029#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26030
26031
26032#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26033 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1) {
26034 TEST_REQUIRES_X86_AVX;
26035 GemmMicrokernelTester()
26036 .mr(1)
26037 .nr(8)
26038 .kr(1)
26039 .sr(1)
26040 .m(1)
26041 .n(8)
26042 .k(1)
26043 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26044 }
26045
26046 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, strided_cn) {
26047 TEST_REQUIRES_X86_AVX;
26048 GemmMicrokernelTester()
26049 .mr(1)
26050 .nr(8)
26051 .kr(1)
26052 .sr(1)
26053 .m(1)
26054 .n(8)
26055 .k(1)
26056 .cn_stride(11)
26057 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26058 }
26059
26060 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_strided_a) {
26061 TEST_REQUIRES_X86_AVX;
26062 GemmMicrokernelTester()
26063 .mr(1)
26064 .nr(8)
26065 .kr(1)
26066 .sr(1)
26067 .m(1)
26068 .n(8)
26069 .k(1)
26070 .a_stride(3)
26071 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26072 }
26073
26074 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_subtile) {
26075 TEST_REQUIRES_X86_AVX;
26076 for (uint32_t m = 1; m <= 1; m++) {
26077 for (uint32_t n = 1; n <= 8; n++) {
26078 GemmMicrokernelTester()
26079 .mr(1)
26080 .nr(8)
26081 .kr(1)
26082 .sr(1)
26083 .m(m)
26084 .n(n)
26085 .k(1)
26086 .iterations(1)
26087 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26088 }
26089 }
26090 }
26091
26092 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
26093 TEST_REQUIRES_X86_AVX;
26094 for (uint32_t m = 1; m <= 1; m++) {
26095 GemmMicrokernelTester()
26096 .mr(1)
26097 .nr(8)
26098 .kr(1)
26099 .sr(1)
26100 .m(m)
26101 .n(8)
26102 .k(1)
26103 .iterations(1)
26104 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26105 }
26106 }
26107
26108 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
26109 TEST_REQUIRES_X86_AVX;
26110 for (uint32_t n = 1; n <= 8; n++) {
26111 GemmMicrokernelTester()
26112 .mr(1)
26113 .nr(8)
26114 .kr(1)
26115 .sr(1)
26116 .m(1)
26117 .n(n)
26118 .k(1)
26119 .iterations(1)
26120 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26121 }
26122 }
26123
26124 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_gt_1) {
26125 TEST_REQUIRES_X86_AVX;
26126 for (size_t k = 2; k < 10; k++) {
26127 GemmMicrokernelTester()
26128 .mr(1)
26129 .nr(8)
26130 .kr(1)
26131 .sr(1)
26132 .m(1)
26133 .n(8)
26134 .k(k)
26135 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26136 }
26137 }
26138
26139 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_gt_1_strided_a) {
26140 TEST_REQUIRES_X86_AVX;
26141 for (size_t k = 2; k < 10; k++) {
26142 GemmMicrokernelTester()
26143 .mr(1)
26144 .nr(8)
26145 .kr(1)
26146 .sr(1)
26147 .m(1)
26148 .n(8)
26149 .k(k)
26150 .a_stride(11)
26151 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26152 }
26153 }
26154
26155 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_gt_1_subtile) {
26156 TEST_REQUIRES_X86_AVX;
26157 for (size_t k = 2; k < 10; k++) {
26158 for (uint32_t m = 1; m <= 1; m++) {
26159 for (uint32_t n = 1; n <= 8; n++) {
26160 GemmMicrokernelTester()
26161 .mr(1)
26162 .nr(8)
26163 .kr(1)
26164 .sr(1)
26165 .m(m)
26166 .n(n)
26167 .k(k)
26168 .iterations(1)
26169 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26170 }
26171 }
26172 }
26173 }
26174
26175 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8) {
26176 TEST_REQUIRES_X86_AVX;
26177 for (uint32_t n = 9; n < 16; n++) {
26178 for (size_t k = 1; k <= 5; k += 2) {
26179 GemmMicrokernelTester()
26180 .mr(1)
26181 .nr(8)
26182 .kr(1)
26183 .sr(1)
26184 .m(1)
26185 .n(8)
26186 .k(k)
26187 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26188 }
26189 }
26190 }
26191
26192 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
26193 TEST_REQUIRES_X86_AVX;
26194 for (uint32_t n = 9; n < 16; n++) {
26195 for (size_t k = 1; k <= 5; k += 2) {
26196 GemmMicrokernelTester()
26197 .mr(1)
26198 .nr(8)
26199 .kr(1)
26200 .sr(1)
26201 .m(1)
26202 .n(8)
26203 .k(k)
26204 .cn_stride(11)
26205 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26206 }
26207 }
26208 }
26209
26210 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8_strided_a) {
26211 TEST_REQUIRES_X86_AVX;
26212 for (uint32_t n = 9; n < 16; n++) {
26213 for (size_t k = 1; k <= 5; k += 2) {
26214 GemmMicrokernelTester()
26215 .mr(1)
26216 .nr(8)
26217 .kr(1)
26218 .sr(1)
26219 .m(1)
26220 .n(n)
26221 .k(k)
26222 .a_stride(7)
26223 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26224 }
26225 }
26226 }
26227
26228 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8_subtile) {
26229 TEST_REQUIRES_X86_AVX;
26230 for (uint32_t n = 9; n < 16; n++) {
26231 for (size_t k = 1; k <= 5; k += 2) {
26232 for (uint32_t m = 1; m <= 1; m++) {
26233 GemmMicrokernelTester()
26234 .mr(1)
26235 .nr(8)
26236 .kr(1)
26237 .sr(1)
26238 .m(m)
26239 .n(n)
26240 .k(k)
26241 .iterations(1)
26242 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26243 }
26244 }
26245 }
26246 }
26247
26248 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8) {
26249 TEST_REQUIRES_X86_AVX;
26250 for (uint32_t n = 16; n <= 24; n += 8) {
26251 for (size_t k = 1; k <= 5; k += 2) {
26252 GemmMicrokernelTester()
26253 .mr(1)
26254 .nr(8)
26255 .kr(1)
26256 .sr(1)
26257 .m(1)
26258 .n(8)
26259 .k(k)
26260 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26261 }
26262 }
26263 }
26264
26265 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
26266 TEST_REQUIRES_X86_AVX;
26267 for (uint32_t n = 16; n <= 24; n += 8) {
26268 for (size_t k = 1; k <= 5; k += 2) {
26269 GemmMicrokernelTester()
26270 .mr(1)
26271 .nr(8)
26272 .kr(1)
26273 .sr(1)
26274 .m(1)
26275 .n(n)
26276 .k(k)
26277 .cn_stride(11)
26278 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26279 }
26280 }
26281 }
26282
26283 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8_strided_a) {
26284 TEST_REQUIRES_X86_AVX;
26285 for (uint32_t n = 16; n <= 24; n += 8) {
26286 for (size_t k = 1; k <= 5; k += 2) {
26287 GemmMicrokernelTester()
26288 .mr(1)
26289 .nr(8)
26290 .kr(1)
26291 .sr(1)
26292 .m(1)
26293 .n(n)
26294 .k(k)
26295 .a_stride(7)
26296 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26297 }
26298 }
26299 }
26300
26301 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8_subtile) {
26302 TEST_REQUIRES_X86_AVX;
26303 for (uint32_t n = 16; n <= 24; n += 8) {
26304 for (size_t k = 1; k <= 5; k += 2) {
26305 for (uint32_t m = 1; m <= 1; m++) {
26306 GemmMicrokernelTester()
26307 .mr(1)
26308 .nr(8)
26309 .kr(1)
26310 .sr(1)
26311 .m(m)
26312 .n(n)
26313 .k(k)
26314 .iterations(1)
26315 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26316 }
26317 }
26318 }
26319 }
26320
26321 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, strided_cm_subtile) {
26322 TEST_REQUIRES_X86_AVX;
26323 for (size_t k = 1; k <= 5; k += 2) {
26324 for (uint32_t m = 1; m <= 1; m++) {
26325 for (uint32_t n = 1; n <= 8; n++) {
26326 GemmMicrokernelTester()
26327 .mr(1)
26328 .nr(8)
26329 .kr(1)
26330 .sr(1)
26331 .m(m)
26332 .n(n)
26333 .k(k)
26334 .cm_stride(11)
26335 .iterations(1)
26336 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26337 }
26338 }
26339 }
26340 }
26341
26342 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, qmin) {
26343 TEST_REQUIRES_X86_AVX;
26344 GemmMicrokernelTester()
26345 .mr(1)
26346 .nr(8)
26347 .kr(1)
26348 .sr(1)
26349 .m(1)
26350 .n(8)
26351 .k(1)
26352 .qmin(128)
26353 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26354 }
26355
26356 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, qmax) {
26357 TEST_REQUIRES_X86_AVX;
26358 GemmMicrokernelTester()
26359 .mr(1)
26360 .nr(8)
26361 .kr(1)
26362 .sr(1)
26363 .m(1)
26364 .n(8)
26365 .k(1)
26366 .qmax(128)
26367 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26368 }
26369
26370 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, strided_cm) {
26371 TEST_REQUIRES_X86_AVX;
26372 GemmMicrokernelTester()
26373 .mr(1)
26374 .nr(8)
26375 .kr(1)
26376 .sr(1)
26377 .m(1)
26378 .n(8)
26379 .k(1)
26380 .cm_stride(11)
26381 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
26382 }
26383#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26384
26385
26386#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26387 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1) {
26388 TEST_REQUIRES_X86_AVX;
26389 GemmMicrokernelTester()
26390 .mr(4)
26391 .nr(8)
26392 .kr(1)
26393 .sr(1)
26394 .m(4)
26395 .n(8)
26396 .k(1)
26397 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26398 }
26399
26400 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, strided_cn) {
26401 TEST_REQUIRES_X86_AVX;
26402 GemmMicrokernelTester()
26403 .mr(4)
26404 .nr(8)
26405 .kr(1)
26406 .sr(1)
26407 .m(4)
26408 .n(8)
26409 .k(1)
26410 .cn_stride(11)
26411 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26412 }
26413
26414 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_strided_a) {
26415 TEST_REQUIRES_X86_AVX;
26416 GemmMicrokernelTester()
26417 .mr(4)
26418 .nr(8)
26419 .kr(1)
26420 .sr(1)
26421 .m(4)
26422 .n(8)
26423 .k(1)
26424 .a_stride(3)
26425 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26426 }
26427
26428 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_subtile) {
26429 TEST_REQUIRES_X86_AVX;
26430 for (uint32_t m = 1; m <= 4; m++) {
26431 for (uint32_t n = 1; n <= 8; n++) {
26432 GemmMicrokernelTester()
26433 .mr(4)
26434 .nr(8)
26435 .kr(1)
26436 .sr(1)
26437 .m(m)
26438 .n(n)
26439 .k(1)
26440 .iterations(1)
26441 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26442 }
26443 }
26444 }
26445
26446 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_subtile_m) {
26447 TEST_REQUIRES_X86_AVX;
26448 for (uint32_t m = 1; m <= 4; m++) {
26449 GemmMicrokernelTester()
26450 .mr(4)
26451 .nr(8)
26452 .kr(1)
26453 .sr(1)
26454 .m(m)
26455 .n(8)
26456 .k(1)
26457 .iterations(1)
26458 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26459 }
26460 }
26461
26462 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_subtile_n) {
26463 TEST_REQUIRES_X86_AVX;
26464 for (uint32_t n = 1; n <= 8; n++) {
26465 GemmMicrokernelTester()
26466 .mr(4)
26467 .nr(8)
26468 .kr(1)
26469 .sr(1)
26470 .m(4)
26471 .n(n)
26472 .k(1)
26473 .iterations(1)
26474 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26475 }
26476 }
26477
26478 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_gt_1) {
26479 TEST_REQUIRES_X86_AVX;
26480 for (size_t k = 2; k < 10; k++) {
26481 GemmMicrokernelTester()
26482 .mr(4)
26483 .nr(8)
26484 .kr(1)
26485 .sr(1)
26486 .m(4)
26487 .n(8)
26488 .k(k)
26489 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26490 }
26491 }
26492
26493 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_gt_1_strided_a) {
26494 TEST_REQUIRES_X86_AVX;
26495 for (size_t k = 2; k < 10; k++) {
26496 GemmMicrokernelTester()
26497 .mr(4)
26498 .nr(8)
26499 .kr(1)
26500 .sr(1)
26501 .m(4)
26502 .n(8)
26503 .k(k)
26504 .a_stride(11)
26505 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26506 }
26507 }
26508
26509 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_gt_1_subtile) {
26510 TEST_REQUIRES_X86_AVX;
26511 for (size_t k = 2; k < 10; k++) {
26512 for (uint32_t m = 1; m <= 4; m++) {
26513 for (uint32_t n = 1; n <= 8; n++) {
26514 GemmMicrokernelTester()
26515 .mr(4)
26516 .nr(8)
26517 .kr(1)
26518 .sr(1)
26519 .m(m)
26520 .n(n)
26521 .k(k)
26522 .iterations(1)
26523 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26524 }
26525 }
26526 }
26527 }
26528
26529 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8) {
26530 TEST_REQUIRES_X86_AVX;
26531 for (uint32_t n = 9; n < 16; n++) {
26532 for (size_t k = 1; k <= 5; k += 2) {
26533 GemmMicrokernelTester()
26534 .mr(4)
26535 .nr(8)
26536 .kr(1)
26537 .sr(1)
26538 .m(4)
26539 .n(8)
26540 .k(k)
26541 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26542 }
26543 }
26544 }
26545
26546 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8_strided_cn) {
26547 TEST_REQUIRES_X86_AVX;
26548 for (uint32_t n = 9; n < 16; n++) {
26549 for (size_t k = 1; k <= 5; k += 2) {
26550 GemmMicrokernelTester()
26551 .mr(4)
26552 .nr(8)
26553 .kr(1)
26554 .sr(1)
26555 .m(4)
26556 .n(8)
26557 .k(k)
26558 .cn_stride(11)
26559 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26560 }
26561 }
26562 }
26563
26564 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8_strided_a) {
26565 TEST_REQUIRES_X86_AVX;
26566 for (uint32_t n = 9; n < 16; n++) {
26567 for (size_t k = 1; k <= 5; k += 2) {
26568 GemmMicrokernelTester()
26569 .mr(4)
26570 .nr(8)
26571 .kr(1)
26572 .sr(1)
26573 .m(4)
26574 .n(n)
26575 .k(k)
26576 .a_stride(7)
26577 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26578 }
26579 }
26580 }
26581
26582 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8_subtile) {
26583 TEST_REQUIRES_X86_AVX;
26584 for (uint32_t n = 9; n < 16; n++) {
26585 for (size_t k = 1; k <= 5; k += 2) {
26586 for (uint32_t m = 1; m <= 4; m++) {
26587 GemmMicrokernelTester()
26588 .mr(4)
26589 .nr(8)
26590 .kr(1)
26591 .sr(1)
26592 .m(m)
26593 .n(n)
26594 .k(k)
26595 .iterations(1)
26596 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26597 }
26598 }
26599 }
26600 }
26601
26602 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8) {
26603 TEST_REQUIRES_X86_AVX;
26604 for (uint32_t n = 16; n <= 24; n += 8) {
26605 for (size_t k = 1; k <= 5; k += 2) {
26606 GemmMicrokernelTester()
26607 .mr(4)
26608 .nr(8)
26609 .kr(1)
26610 .sr(1)
26611 .m(4)
26612 .n(8)
26613 .k(k)
26614 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26615 }
26616 }
26617 }
26618
26619 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8_strided_cn) {
26620 TEST_REQUIRES_X86_AVX;
26621 for (uint32_t n = 16; n <= 24; n += 8) {
26622 for (size_t k = 1; k <= 5; k += 2) {
26623 GemmMicrokernelTester()
26624 .mr(4)
26625 .nr(8)
26626 .kr(1)
26627 .sr(1)
26628 .m(4)
26629 .n(n)
26630 .k(k)
26631 .cn_stride(11)
26632 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26633 }
26634 }
26635 }
26636
26637 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8_strided_a) {
26638 TEST_REQUIRES_X86_AVX;
26639 for (uint32_t n = 16; n <= 24; n += 8) {
26640 for (size_t k = 1; k <= 5; k += 2) {
26641 GemmMicrokernelTester()
26642 .mr(4)
26643 .nr(8)
26644 .kr(1)
26645 .sr(1)
26646 .m(4)
26647 .n(n)
26648 .k(k)
26649 .a_stride(7)
26650 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26651 }
26652 }
26653 }
26654
26655 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8_subtile) {
26656 TEST_REQUIRES_X86_AVX;
26657 for (uint32_t n = 16; n <= 24; n += 8) {
26658 for (size_t k = 1; k <= 5; k += 2) {
26659 for (uint32_t m = 1; m <= 4; m++) {
26660 GemmMicrokernelTester()
26661 .mr(4)
26662 .nr(8)
26663 .kr(1)
26664 .sr(1)
26665 .m(m)
26666 .n(n)
26667 .k(k)
26668 .iterations(1)
26669 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26670 }
26671 }
26672 }
26673 }
26674
26675 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, strided_cm_subtile) {
26676 TEST_REQUIRES_X86_AVX;
26677 for (size_t k = 1; k <= 5; k += 2) {
26678 for (uint32_t m = 1; m <= 4; m++) {
26679 for (uint32_t n = 1; n <= 8; n++) {
26680 GemmMicrokernelTester()
26681 .mr(4)
26682 .nr(8)
26683 .kr(1)
26684 .sr(1)
26685 .m(m)
26686 .n(n)
26687 .k(k)
26688 .cm_stride(11)
26689 .iterations(1)
26690 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26691 }
26692 }
26693 }
26694 }
26695
26696 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, qmin) {
26697 TEST_REQUIRES_X86_AVX;
26698 GemmMicrokernelTester()
26699 .mr(4)
26700 .nr(8)
26701 .kr(1)
26702 .sr(1)
26703 .m(4)
26704 .n(8)
26705 .k(1)
26706 .qmin(128)
26707 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26708 }
26709
26710 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, qmax) {
26711 TEST_REQUIRES_X86_AVX;
26712 GemmMicrokernelTester()
26713 .mr(4)
26714 .nr(8)
26715 .kr(1)
26716 .sr(1)
26717 .m(4)
26718 .n(8)
26719 .k(1)
26720 .qmax(128)
26721 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26722 }
26723
26724 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, strided_cm) {
26725 TEST_REQUIRES_X86_AVX;
26726 GemmMicrokernelTester()
26727 .mr(4)
26728 .nr(8)
26729 .kr(1)
26730 .sr(1)
26731 .m(4)
26732 .n(8)
26733 .k(1)
26734 .cm_stride(11)
26735 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
26736 }
26737#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26738
26739
26740#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26741 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1) {
26742 TEST_REQUIRES_X86_AVX;
26743 GemmMicrokernelTester()
26744 .mr(5)
26745 .nr(8)
26746 .kr(1)
26747 .sr(1)
26748 .m(5)
26749 .n(8)
26750 .k(1)
26751 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26752 }
26753
26754 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, strided_cn) {
26755 TEST_REQUIRES_X86_AVX;
26756 GemmMicrokernelTester()
26757 .mr(5)
26758 .nr(8)
26759 .kr(1)
26760 .sr(1)
26761 .m(5)
26762 .n(8)
26763 .k(1)
26764 .cn_stride(11)
26765 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26766 }
26767
26768 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_strided_a) {
26769 TEST_REQUIRES_X86_AVX;
26770 GemmMicrokernelTester()
26771 .mr(5)
26772 .nr(8)
26773 .kr(1)
26774 .sr(1)
26775 .m(5)
26776 .n(8)
26777 .k(1)
26778 .a_stride(3)
26779 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26780 }
26781
26782 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_subtile) {
26783 TEST_REQUIRES_X86_AVX;
26784 for (uint32_t m = 1; m <= 5; m++) {
26785 for (uint32_t n = 1; n <= 8; n++) {
26786 GemmMicrokernelTester()
26787 .mr(5)
26788 .nr(8)
26789 .kr(1)
26790 .sr(1)
26791 .m(m)
26792 .n(n)
26793 .k(1)
26794 .iterations(1)
26795 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26796 }
26797 }
26798 }
26799
26800 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
26801 TEST_REQUIRES_X86_AVX;
26802 for (uint32_t m = 1; m <= 5; m++) {
26803 GemmMicrokernelTester()
26804 .mr(5)
26805 .nr(8)
26806 .kr(1)
26807 .sr(1)
26808 .m(m)
26809 .n(8)
26810 .k(1)
26811 .iterations(1)
26812 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26813 }
26814 }
26815
26816 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
26817 TEST_REQUIRES_X86_AVX;
26818 for (uint32_t n = 1; n <= 8; n++) {
26819 GemmMicrokernelTester()
26820 .mr(5)
26821 .nr(8)
26822 .kr(1)
26823 .sr(1)
26824 .m(5)
26825 .n(n)
26826 .k(1)
26827 .iterations(1)
26828 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26829 }
26830 }
26831
26832 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_gt_1) {
26833 TEST_REQUIRES_X86_AVX;
26834 for (size_t k = 2; k < 10; k++) {
26835 GemmMicrokernelTester()
26836 .mr(5)
26837 .nr(8)
26838 .kr(1)
26839 .sr(1)
26840 .m(5)
26841 .n(8)
26842 .k(k)
26843 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26844 }
26845 }
26846
26847 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_gt_1_strided_a) {
26848 TEST_REQUIRES_X86_AVX;
26849 for (size_t k = 2; k < 10; k++) {
26850 GemmMicrokernelTester()
26851 .mr(5)
26852 .nr(8)
26853 .kr(1)
26854 .sr(1)
26855 .m(5)
26856 .n(8)
26857 .k(k)
26858 .a_stride(11)
26859 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26860 }
26861 }
26862
26863 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_gt_1_subtile) {
26864 TEST_REQUIRES_X86_AVX;
26865 for (size_t k = 2; k < 10; k++) {
26866 for (uint32_t m = 1; m <= 5; m++) {
26867 for (uint32_t n = 1; n <= 8; n++) {
26868 GemmMicrokernelTester()
26869 .mr(5)
26870 .nr(8)
26871 .kr(1)
26872 .sr(1)
26873 .m(m)
26874 .n(n)
26875 .k(k)
26876 .iterations(1)
26877 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26878 }
26879 }
26880 }
26881 }
26882
26883 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8) {
26884 TEST_REQUIRES_X86_AVX;
26885 for (uint32_t n = 9; n < 16; n++) {
26886 for (size_t k = 1; k <= 5; k += 2) {
26887 GemmMicrokernelTester()
26888 .mr(5)
26889 .nr(8)
26890 .kr(1)
26891 .sr(1)
26892 .m(5)
26893 .n(8)
26894 .k(k)
26895 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26896 }
26897 }
26898 }
26899
26900 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
26901 TEST_REQUIRES_X86_AVX;
26902 for (uint32_t n = 9; n < 16; n++) {
26903 for (size_t k = 1; k <= 5; k += 2) {
26904 GemmMicrokernelTester()
26905 .mr(5)
26906 .nr(8)
26907 .kr(1)
26908 .sr(1)
26909 .m(5)
26910 .n(8)
26911 .k(k)
26912 .cn_stride(11)
26913 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26914 }
26915 }
26916 }
26917
26918 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8_strided_a) {
26919 TEST_REQUIRES_X86_AVX;
26920 for (uint32_t n = 9; n < 16; n++) {
26921 for (size_t k = 1; k <= 5; k += 2) {
26922 GemmMicrokernelTester()
26923 .mr(5)
26924 .nr(8)
26925 .kr(1)
26926 .sr(1)
26927 .m(5)
26928 .n(n)
26929 .k(k)
26930 .a_stride(7)
26931 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26932 }
26933 }
26934 }
26935
26936 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8_subtile) {
26937 TEST_REQUIRES_X86_AVX;
26938 for (uint32_t n = 9; n < 16; n++) {
26939 for (size_t k = 1; k <= 5; k += 2) {
26940 for (uint32_t m = 1; m <= 5; m++) {
26941 GemmMicrokernelTester()
26942 .mr(5)
26943 .nr(8)
26944 .kr(1)
26945 .sr(1)
26946 .m(m)
26947 .n(n)
26948 .k(k)
26949 .iterations(1)
26950 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26951 }
26952 }
26953 }
26954 }
26955
26956 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8) {
26957 TEST_REQUIRES_X86_AVX;
26958 for (uint32_t n = 16; n <= 24; n += 8) {
26959 for (size_t k = 1; k <= 5; k += 2) {
26960 GemmMicrokernelTester()
26961 .mr(5)
26962 .nr(8)
26963 .kr(1)
26964 .sr(1)
26965 .m(5)
26966 .n(8)
26967 .k(k)
26968 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26969 }
26970 }
26971 }
26972
26973 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
26974 TEST_REQUIRES_X86_AVX;
26975 for (uint32_t n = 16; n <= 24; n += 8) {
26976 for (size_t k = 1; k <= 5; k += 2) {
26977 GemmMicrokernelTester()
26978 .mr(5)
26979 .nr(8)
26980 .kr(1)
26981 .sr(1)
26982 .m(5)
26983 .n(n)
26984 .k(k)
26985 .cn_stride(11)
26986 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
26987 }
26988 }
26989 }
26990
26991 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8_strided_a) {
26992 TEST_REQUIRES_X86_AVX;
26993 for (uint32_t n = 16; n <= 24; n += 8) {
26994 for (size_t k = 1; k <= 5; k += 2) {
26995 GemmMicrokernelTester()
26996 .mr(5)
26997 .nr(8)
26998 .kr(1)
26999 .sr(1)
27000 .m(5)
27001 .n(n)
27002 .k(k)
27003 .a_stride(7)
27004 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
27005 }
27006 }
27007 }
27008
27009 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8_subtile) {
27010 TEST_REQUIRES_X86_AVX;
27011 for (uint32_t n = 16; n <= 24; n += 8) {
27012 for (size_t k = 1; k <= 5; k += 2) {
27013 for (uint32_t m = 1; m <= 5; m++) {
27014 GemmMicrokernelTester()
27015 .mr(5)
27016 .nr(8)
27017 .kr(1)
27018 .sr(1)
27019 .m(m)
27020 .n(n)
27021 .k(k)
27022 .iterations(1)
27023 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
27024 }
27025 }
27026 }
27027 }
27028
27029 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, strided_cm_subtile) {
27030 TEST_REQUIRES_X86_AVX;
27031 for (size_t k = 1; k <= 5; k += 2) {
27032 for (uint32_t m = 1; m <= 5; m++) {
27033 for (uint32_t n = 1; n <= 8; n++) {
27034 GemmMicrokernelTester()
27035 .mr(5)
27036 .nr(8)
27037 .kr(1)
27038 .sr(1)
27039 .m(m)
27040 .n(n)
27041 .k(k)
27042 .cm_stride(11)
27043 .iterations(1)
27044 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
27045 }
27046 }
27047 }
27048 }
27049
27050 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, qmin) {
27051 TEST_REQUIRES_X86_AVX;
27052 GemmMicrokernelTester()
27053 .mr(5)
27054 .nr(8)
27055 .kr(1)
27056 .sr(1)
27057 .m(5)
27058 .n(8)
27059 .k(1)
27060 .qmin(128)
27061 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
27062 }
27063
27064 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, qmax) {
27065 TEST_REQUIRES_X86_AVX;
27066 GemmMicrokernelTester()
27067 .mr(5)
27068 .nr(8)
27069 .kr(1)
27070 .sr(1)
27071 .m(5)
27072 .n(8)
27073 .k(1)
27074 .qmax(128)
27075 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
27076 }
27077
27078 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, strided_cm) {
27079 TEST_REQUIRES_X86_AVX;
27080 GemmMicrokernelTester()
27081 .mr(5)
27082 .nr(8)
27083 .kr(1)
27084 .sr(1)
27085 .m(5)
27086 .n(8)
27087 .k(1)
27088 .cm_stride(11)
27089 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
27090 }
27091#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27092
27093
27094#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27095 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1) {
27096 TEST_REQUIRES_X86_AVX;
27097 GemmMicrokernelTester()
27098 .mr(6)
27099 .nr(8)
27100 .kr(1)
27101 .sr(1)
27102 .m(6)
27103 .n(8)
27104 .k(1)
27105 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27106 }
27107
27108 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, strided_cn) {
27109 TEST_REQUIRES_X86_AVX;
27110 GemmMicrokernelTester()
27111 .mr(6)
27112 .nr(8)
27113 .kr(1)
27114 .sr(1)
27115 .m(6)
27116 .n(8)
27117 .k(1)
27118 .cn_stride(11)
27119 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27120 }
27121
27122 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_strided_a) {
27123 TEST_REQUIRES_X86_AVX;
27124 GemmMicrokernelTester()
27125 .mr(6)
27126 .nr(8)
27127 .kr(1)
27128 .sr(1)
27129 .m(6)
27130 .n(8)
27131 .k(1)
27132 .a_stride(3)
27133 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27134 }
27135
27136 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_subtile) {
27137 TEST_REQUIRES_X86_AVX;
27138 for (uint32_t m = 1; m <= 6; m++) {
27139 for (uint32_t n = 1; n <= 8; n++) {
27140 GemmMicrokernelTester()
27141 .mr(6)
27142 .nr(8)
27143 .kr(1)
27144 .sr(1)
27145 .m(m)
27146 .n(n)
27147 .k(1)
27148 .iterations(1)
27149 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27150 }
27151 }
27152 }
27153
27154 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27155 TEST_REQUIRES_X86_AVX;
27156 for (uint32_t m = 1; m <= 6; m++) {
27157 GemmMicrokernelTester()
27158 .mr(6)
27159 .nr(8)
27160 .kr(1)
27161 .sr(1)
27162 .m(m)
27163 .n(8)
27164 .k(1)
27165 .iterations(1)
27166 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27167 }
27168 }
27169
27170 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27171 TEST_REQUIRES_X86_AVX;
27172 for (uint32_t n = 1; n <= 8; n++) {
27173 GemmMicrokernelTester()
27174 .mr(6)
27175 .nr(8)
27176 .kr(1)
27177 .sr(1)
27178 .m(6)
27179 .n(n)
27180 .k(1)
27181 .iterations(1)
27182 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27183 }
27184 }
27185
27186 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_gt_1) {
27187 TEST_REQUIRES_X86_AVX;
27188 for (size_t k = 2; k < 10; k++) {
27189 GemmMicrokernelTester()
27190 .mr(6)
27191 .nr(8)
27192 .kr(1)
27193 .sr(1)
27194 .m(6)
27195 .n(8)
27196 .k(k)
27197 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27198 }
27199 }
27200
27201 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_gt_1_strided_a) {
27202 TEST_REQUIRES_X86_AVX;
27203 for (size_t k = 2; k < 10; k++) {
27204 GemmMicrokernelTester()
27205 .mr(6)
27206 .nr(8)
27207 .kr(1)
27208 .sr(1)
27209 .m(6)
27210 .n(8)
27211 .k(k)
27212 .a_stride(11)
27213 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27214 }
27215 }
27216
27217 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_gt_1_subtile) {
27218 TEST_REQUIRES_X86_AVX;
27219 for (size_t k = 2; k < 10; k++) {
27220 for (uint32_t m = 1; m <= 6; m++) {
27221 for (uint32_t n = 1; n <= 8; n++) {
27222 GemmMicrokernelTester()
27223 .mr(6)
27224 .nr(8)
27225 .kr(1)
27226 .sr(1)
27227 .m(m)
27228 .n(n)
27229 .k(k)
27230 .iterations(1)
27231 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27232 }
27233 }
27234 }
27235 }
27236
27237 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8) {
27238 TEST_REQUIRES_X86_AVX;
27239 for (uint32_t n = 9; n < 16; n++) {
27240 for (size_t k = 1; k <= 5; k += 2) {
27241 GemmMicrokernelTester()
27242 .mr(6)
27243 .nr(8)
27244 .kr(1)
27245 .sr(1)
27246 .m(6)
27247 .n(8)
27248 .k(k)
27249 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27250 }
27251 }
27252 }
27253
27254 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
27255 TEST_REQUIRES_X86_AVX;
27256 for (uint32_t n = 9; n < 16; n++) {
27257 for (size_t k = 1; k <= 5; k += 2) {
27258 GemmMicrokernelTester()
27259 .mr(6)
27260 .nr(8)
27261 .kr(1)
27262 .sr(1)
27263 .m(6)
27264 .n(8)
27265 .k(k)
27266 .cn_stride(11)
27267 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27268 }
27269 }
27270 }
27271
27272 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8_strided_a) {
27273 TEST_REQUIRES_X86_AVX;
27274 for (uint32_t n = 9; n < 16; n++) {
27275 for (size_t k = 1; k <= 5; k += 2) {
27276 GemmMicrokernelTester()
27277 .mr(6)
27278 .nr(8)
27279 .kr(1)
27280 .sr(1)
27281 .m(6)
27282 .n(n)
27283 .k(k)
27284 .a_stride(7)
27285 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27286 }
27287 }
27288 }
27289
27290 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8_subtile) {
27291 TEST_REQUIRES_X86_AVX;
27292 for (uint32_t n = 9; n < 16; n++) {
27293 for (size_t k = 1; k <= 5; k += 2) {
27294 for (uint32_t m = 1; m <= 6; m++) {
27295 GemmMicrokernelTester()
27296 .mr(6)
27297 .nr(8)
27298 .kr(1)
27299 .sr(1)
27300 .m(m)
27301 .n(n)
27302 .k(k)
27303 .iterations(1)
27304 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27305 }
27306 }
27307 }
27308 }
27309
27310 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8) {
27311 TEST_REQUIRES_X86_AVX;
27312 for (uint32_t n = 16; n <= 24; n += 8) {
27313 for (size_t k = 1; k <= 5; k += 2) {
27314 GemmMicrokernelTester()
27315 .mr(6)
27316 .nr(8)
27317 .kr(1)
27318 .sr(1)
27319 .m(6)
27320 .n(8)
27321 .k(k)
27322 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27323 }
27324 }
27325 }
27326
27327 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
27328 TEST_REQUIRES_X86_AVX;
27329 for (uint32_t n = 16; n <= 24; n += 8) {
27330 for (size_t k = 1; k <= 5; k += 2) {
27331 GemmMicrokernelTester()
27332 .mr(6)
27333 .nr(8)
27334 .kr(1)
27335 .sr(1)
27336 .m(6)
27337 .n(n)
27338 .k(k)
27339 .cn_stride(11)
27340 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27341 }
27342 }
27343 }
27344
27345 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8_strided_a) {
27346 TEST_REQUIRES_X86_AVX;
27347 for (uint32_t n = 16; n <= 24; n += 8) {
27348 for (size_t k = 1; k <= 5; k += 2) {
27349 GemmMicrokernelTester()
27350 .mr(6)
27351 .nr(8)
27352 .kr(1)
27353 .sr(1)
27354 .m(6)
27355 .n(n)
27356 .k(k)
27357 .a_stride(7)
27358 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27359 }
27360 }
27361 }
27362
27363 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8_subtile) {
27364 TEST_REQUIRES_X86_AVX;
27365 for (uint32_t n = 16; n <= 24; n += 8) {
27366 for (size_t k = 1; k <= 5; k += 2) {
27367 for (uint32_t m = 1; m <= 6; m++) {
27368 GemmMicrokernelTester()
27369 .mr(6)
27370 .nr(8)
27371 .kr(1)
27372 .sr(1)
27373 .m(m)
27374 .n(n)
27375 .k(k)
27376 .iterations(1)
27377 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27378 }
27379 }
27380 }
27381 }
27382
27383 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, strided_cm_subtile) {
27384 TEST_REQUIRES_X86_AVX;
27385 for (size_t k = 1; k <= 5; k += 2) {
27386 for (uint32_t m = 1; m <= 6; m++) {
27387 for (uint32_t n = 1; n <= 8; n++) {
27388 GemmMicrokernelTester()
27389 .mr(6)
27390 .nr(8)
27391 .kr(1)
27392 .sr(1)
27393 .m(m)
27394 .n(n)
27395 .k(k)
27396 .cm_stride(11)
27397 .iterations(1)
27398 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27399 }
27400 }
27401 }
27402 }
27403
27404 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, qmin) {
27405 TEST_REQUIRES_X86_AVX;
27406 GemmMicrokernelTester()
27407 .mr(6)
27408 .nr(8)
27409 .kr(1)
27410 .sr(1)
27411 .m(6)
27412 .n(8)
27413 .k(1)
27414 .qmin(128)
27415 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27416 }
27417
27418 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, qmax) {
27419 TEST_REQUIRES_X86_AVX;
27420 GemmMicrokernelTester()
27421 .mr(6)
27422 .nr(8)
27423 .kr(1)
27424 .sr(1)
27425 .m(6)
27426 .n(8)
27427 .k(1)
27428 .qmax(128)
27429 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27430 }
27431
27432 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, strided_cm) {
27433 TEST_REQUIRES_X86_AVX;
27434 GemmMicrokernelTester()
27435 .mr(6)
27436 .nr(8)
27437 .kr(1)
27438 .sr(1)
27439 .m(6)
27440 .n(8)
27441 .k(1)
27442 .cm_stride(11)
27443 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
27444 }
27445#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27446
27447
27448#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27449 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1) {
27450 TEST_REQUIRES_X86_AVX;
27451 GemmMicrokernelTester()
27452 .mr(7)
27453 .nr(8)
27454 .kr(1)
27455 .sr(1)
27456 .m(7)
27457 .n(8)
27458 .k(1)
27459 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27460 }
27461
27462 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, strided_cn) {
27463 TEST_REQUIRES_X86_AVX;
27464 GemmMicrokernelTester()
27465 .mr(7)
27466 .nr(8)
27467 .kr(1)
27468 .sr(1)
27469 .m(7)
27470 .n(8)
27471 .k(1)
27472 .cn_stride(11)
27473 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27474 }
27475
27476 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_strided_a) {
27477 TEST_REQUIRES_X86_AVX;
27478 GemmMicrokernelTester()
27479 .mr(7)
27480 .nr(8)
27481 .kr(1)
27482 .sr(1)
27483 .m(7)
27484 .n(8)
27485 .k(1)
27486 .a_stride(3)
27487 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27488 }
27489
27490 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_subtile) {
27491 TEST_REQUIRES_X86_AVX;
27492 for (uint32_t m = 1; m <= 7; m++) {
27493 for (uint32_t n = 1; n <= 8; n++) {
27494 GemmMicrokernelTester()
27495 .mr(7)
27496 .nr(8)
27497 .kr(1)
27498 .sr(1)
27499 .m(m)
27500 .n(n)
27501 .k(1)
27502 .iterations(1)
27503 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27504 }
27505 }
27506 }
27507
27508 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27509 TEST_REQUIRES_X86_AVX;
27510 for (uint32_t m = 1; m <= 7; m++) {
27511 GemmMicrokernelTester()
27512 .mr(7)
27513 .nr(8)
27514 .kr(1)
27515 .sr(1)
27516 .m(m)
27517 .n(8)
27518 .k(1)
27519 .iterations(1)
27520 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27521 }
27522 }
27523
27524 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27525 TEST_REQUIRES_X86_AVX;
27526 for (uint32_t n = 1; n <= 8; n++) {
27527 GemmMicrokernelTester()
27528 .mr(7)
27529 .nr(8)
27530 .kr(1)
27531 .sr(1)
27532 .m(7)
27533 .n(n)
27534 .k(1)
27535 .iterations(1)
27536 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27537 }
27538 }
27539
27540 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_gt_1) {
27541 TEST_REQUIRES_X86_AVX;
27542 for (size_t k = 2; k < 10; k++) {
27543 GemmMicrokernelTester()
27544 .mr(7)
27545 .nr(8)
27546 .kr(1)
27547 .sr(1)
27548 .m(7)
27549 .n(8)
27550 .k(k)
27551 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27552 }
27553 }
27554
27555 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_gt_1_strided_a) {
27556 TEST_REQUIRES_X86_AVX;
27557 for (size_t k = 2; k < 10; k++) {
27558 GemmMicrokernelTester()
27559 .mr(7)
27560 .nr(8)
27561 .kr(1)
27562 .sr(1)
27563 .m(7)
27564 .n(8)
27565 .k(k)
27566 .a_stride(11)
27567 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27568 }
27569 }
27570
27571 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_gt_1_subtile) {
27572 TEST_REQUIRES_X86_AVX;
27573 for (size_t k = 2; k < 10; k++) {
27574 for (uint32_t m = 1; m <= 7; m++) {
27575 for (uint32_t n = 1; n <= 8; n++) {
27576 GemmMicrokernelTester()
27577 .mr(7)
27578 .nr(8)
27579 .kr(1)
27580 .sr(1)
27581 .m(m)
27582 .n(n)
27583 .k(k)
27584 .iterations(1)
27585 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27586 }
27587 }
27588 }
27589 }
27590
27591 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8) {
27592 TEST_REQUIRES_X86_AVX;
27593 for (uint32_t n = 9; n < 16; n++) {
27594 for (size_t k = 1; k <= 5; k += 2) {
27595 GemmMicrokernelTester()
27596 .mr(7)
27597 .nr(8)
27598 .kr(1)
27599 .sr(1)
27600 .m(7)
27601 .n(8)
27602 .k(k)
27603 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27604 }
27605 }
27606 }
27607
27608 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8_strided_cn) {
27609 TEST_REQUIRES_X86_AVX;
27610 for (uint32_t n = 9; n < 16; n++) {
27611 for (size_t k = 1; k <= 5; k += 2) {
27612 GemmMicrokernelTester()
27613 .mr(7)
27614 .nr(8)
27615 .kr(1)
27616 .sr(1)
27617 .m(7)
27618 .n(8)
27619 .k(k)
27620 .cn_stride(11)
27621 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27622 }
27623 }
27624 }
27625
27626 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8_strided_a) {
27627 TEST_REQUIRES_X86_AVX;
27628 for (uint32_t n = 9; n < 16; n++) {
27629 for (size_t k = 1; k <= 5; k += 2) {
27630 GemmMicrokernelTester()
27631 .mr(7)
27632 .nr(8)
27633 .kr(1)
27634 .sr(1)
27635 .m(7)
27636 .n(n)
27637 .k(k)
27638 .a_stride(7)
27639 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27640 }
27641 }
27642 }
27643
27644 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8_subtile) {
27645 TEST_REQUIRES_X86_AVX;
27646 for (uint32_t n = 9; n < 16; n++) {
27647 for (size_t k = 1; k <= 5; k += 2) {
27648 for (uint32_t m = 1; m <= 7; m++) {
27649 GemmMicrokernelTester()
27650 .mr(7)
27651 .nr(8)
27652 .kr(1)
27653 .sr(1)
27654 .m(m)
27655 .n(n)
27656 .k(k)
27657 .iterations(1)
27658 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27659 }
27660 }
27661 }
27662 }
27663
27664 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8) {
27665 TEST_REQUIRES_X86_AVX;
27666 for (uint32_t n = 16; n <= 24; n += 8) {
27667 for (size_t k = 1; k <= 5; k += 2) {
27668 GemmMicrokernelTester()
27669 .mr(7)
27670 .nr(8)
27671 .kr(1)
27672 .sr(1)
27673 .m(7)
27674 .n(8)
27675 .k(k)
27676 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27677 }
27678 }
27679 }
27680
27681 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8_strided_cn) {
27682 TEST_REQUIRES_X86_AVX;
27683 for (uint32_t n = 16; n <= 24; n += 8) {
27684 for (size_t k = 1; k <= 5; k += 2) {
27685 GemmMicrokernelTester()
27686 .mr(7)
27687 .nr(8)
27688 .kr(1)
27689 .sr(1)
27690 .m(7)
27691 .n(n)
27692 .k(k)
27693 .cn_stride(11)
27694 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27695 }
27696 }
27697 }
27698
27699 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8_strided_a) {
27700 TEST_REQUIRES_X86_AVX;
27701 for (uint32_t n = 16; n <= 24; n += 8) {
27702 for (size_t k = 1; k <= 5; k += 2) {
27703 GemmMicrokernelTester()
27704 .mr(7)
27705 .nr(8)
27706 .kr(1)
27707 .sr(1)
27708 .m(7)
27709 .n(n)
27710 .k(k)
27711 .a_stride(7)
27712 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27713 }
27714 }
27715 }
27716
27717 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8_subtile) {
27718 TEST_REQUIRES_X86_AVX;
27719 for (uint32_t n = 16; n <= 24; n += 8) {
27720 for (size_t k = 1; k <= 5; k += 2) {
27721 for (uint32_t m = 1; m <= 7; m++) {
27722 GemmMicrokernelTester()
27723 .mr(7)
27724 .nr(8)
27725 .kr(1)
27726 .sr(1)
27727 .m(m)
27728 .n(n)
27729 .k(k)
27730 .iterations(1)
27731 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27732 }
27733 }
27734 }
27735 }
27736
27737 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, strided_cm_subtile) {
27738 TEST_REQUIRES_X86_AVX;
27739 for (size_t k = 1; k <= 5; k += 2) {
27740 for (uint32_t m = 1; m <= 7; m++) {
27741 for (uint32_t n = 1; n <= 8; n++) {
27742 GemmMicrokernelTester()
27743 .mr(7)
27744 .nr(8)
27745 .kr(1)
27746 .sr(1)
27747 .m(m)
27748 .n(n)
27749 .k(k)
27750 .cm_stride(11)
27751 .iterations(1)
27752 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27753 }
27754 }
27755 }
27756 }
27757
27758 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, qmin) {
27759 TEST_REQUIRES_X86_AVX;
27760 GemmMicrokernelTester()
27761 .mr(7)
27762 .nr(8)
27763 .kr(1)
27764 .sr(1)
27765 .m(7)
27766 .n(8)
27767 .k(1)
27768 .qmin(128)
27769 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27770 }
27771
27772 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, qmax) {
27773 TEST_REQUIRES_X86_AVX;
27774 GemmMicrokernelTester()
27775 .mr(7)
27776 .nr(8)
27777 .kr(1)
27778 .sr(1)
27779 .m(7)
27780 .n(8)
27781 .k(1)
27782 .qmax(128)
27783 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27784 }
27785
27786 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, strided_cm) {
27787 TEST_REQUIRES_X86_AVX;
27788 GemmMicrokernelTester()
27789 .mr(7)
27790 .nr(8)
27791 .kr(1)
27792 .sr(1)
27793 .m(7)
27794 .n(8)
27795 .k(1)
27796 .cm_stride(11)
27797 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
27798 }
27799#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27800
27801
27802#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27803 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1) {
27804 TEST_REQUIRES_X86_AVX;
27805 GemmMicrokernelTester()
27806 .mr(1)
27807 .nr(16)
27808 .kr(1)
27809 .sr(1)
27810 .m(1)
27811 .n(16)
27812 .k(1)
27813 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27814 }
27815
27816 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, strided_cn) {
27817 TEST_REQUIRES_X86_AVX;
27818 GemmMicrokernelTester()
27819 .mr(1)
27820 .nr(16)
27821 .kr(1)
27822 .sr(1)
27823 .m(1)
27824 .n(16)
27825 .k(1)
27826 .cn_stride(19)
27827 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27828 }
27829
27830 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_strided_a) {
27831 TEST_REQUIRES_X86_AVX;
27832 GemmMicrokernelTester()
27833 .mr(1)
27834 .nr(16)
27835 .kr(1)
27836 .sr(1)
27837 .m(1)
27838 .n(16)
27839 .k(1)
27840 .a_stride(3)
27841 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27842 }
27843
27844 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_subtile) {
27845 TEST_REQUIRES_X86_AVX;
27846 for (uint32_t m = 1; m <= 1; m++) {
27847 for (uint32_t n = 1; n <= 16; n++) {
27848 GemmMicrokernelTester()
27849 .mr(1)
27850 .nr(16)
27851 .kr(1)
27852 .sr(1)
27853 .m(m)
27854 .n(n)
27855 .k(1)
27856 .iterations(1)
27857 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27858 }
27859 }
27860 }
27861
27862 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_subtile_m) {
27863 TEST_REQUIRES_X86_AVX;
27864 for (uint32_t m = 1; m <= 1; m++) {
27865 GemmMicrokernelTester()
27866 .mr(1)
27867 .nr(16)
27868 .kr(1)
27869 .sr(1)
27870 .m(m)
27871 .n(16)
27872 .k(1)
27873 .iterations(1)
27874 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27875 }
27876 }
27877
27878 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_subtile_n) {
27879 TEST_REQUIRES_X86_AVX;
27880 for (uint32_t n = 1; n <= 16; n++) {
27881 GemmMicrokernelTester()
27882 .mr(1)
27883 .nr(16)
27884 .kr(1)
27885 .sr(1)
27886 .m(1)
27887 .n(n)
27888 .k(1)
27889 .iterations(1)
27890 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27891 }
27892 }
27893
27894 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_gt_1) {
27895 TEST_REQUIRES_X86_AVX;
27896 for (size_t k = 2; k < 10; k++) {
27897 GemmMicrokernelTester()
27898 .mr(1)
27899 .nr(16)
27900 .kr(1)
27901 .sr(1)
27902 .m(1)
27903 .n(16)
27904 .k(k)
27905 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27906 }
27907 }
27908
27909 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_gt_1_strided_a) {
27910 TEST_REQUIRES_X86_AVX;
27911 for (size_t k = 2; k < 10; k++) {
27912 GemmMicrokernelTester()
27913 .mr(1)
27914 .nr(16)
27915 .kr(1)
27916 .sr(1)
27917 .m(1)
27918 .n(16)
27919 .k(k)
27920 .a_stride(11)
27921 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27922 }
27923 }
27924
27925 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_gt_1_subtile) {
27926 TEST_REQUIRES_X86_AVX;
27927 for (size_t k = 2; k < 10; k++) {
27928 for (uint32_t m = 1; m <= 1; m++) {
27929 for (uint32_t n = 1; n <= 16; n++) {
27930 GemmMicrokernelTester()
27931 .mr(1)
27932 .nr(16)
27933 .kr(1)
27934 .sr(1)
27935 .m(m)
27936 .n(n)
27937 .k(k)
27938 .iterations(1)
27939 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27940 }
27941 }
27942 }
27943 }
27944
27945 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16) {
27946 TEST_REQUIRES_X86_AVX;
27947 for (uint32_t n = 17; n < 32; n++) {
27948 for (size_t k = 1; k <= 5; k += 2) {
27949 GemmMicrokernelTester()
27950 .mr(1)
27951 .nr(16)
27952 .kr(1)
27953 .sr(1)
27954 .m(1)
27955 .n(16)
27956 .k(k)
27957 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27958 }
27959 }
27960 }
27961
27962 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16_strided_cn) {
27963 TEST_REQUIRES_X86_AVX;
27964 for (uint32_t n = 17; n < 32; n++) {
27965 for (size_t k = 1; k <= 5; k += 2) {
27966 GemmMicrokernelTester()
27967 .mr(1)
27968 .nr(16)
27969 .kr(1)
27970 .sr(1)
27971 .m(1)
27972 .n(16)
27973 .k(k)
27974 .cn_stride(19)
27975 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27976 }
27977 }
27978 }
27979
27980 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16_strided_a) {
27981 TEST_REQUIRES_X86_AVX;
27982 for (uint32_t n = 17; n < 32; n++) {
27983 for (size_t k = 1; k <= 5; k += 2) {
27984 GemmMicrokernelTester()
27985 .mr(1)
27986 .nr(16)
27987 .kr(1)
27988 .sr(1)
27989 .m(1)
27990 .n(n)
27991 .k(k)
27992 .a_stride(7)
27993 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
27994 }
27995 }
27996 }
27997
27998 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16_subtile) {
27999 TEST_REQUIRES_X86_AVX;
28000 for (uint32_t n = 17; n < 32; n++) {
28001 for (size_t k = 1; k <= 5; k += 2) {
28002 for (uint32_t m = 1; m <= 1; m++) {
28003 GemmMicrokernelTester()
28004 .mr(1)
28005 .nr(16)
28006 .kr(1)
28007 .sr(1)
28008 .m(m)
28009 .n(n)
28010 .k(k)
28011 .iterations(1)
28012 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28013 }
28014 }
28015 }
28016 }
28017
28018 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16) {
28019 TEST_REQUIRES_X86_AVX;
28020 for (uint32_t n = 32; n <= 48; n += 16) {
28021 for (size_t k = 1; k <= 5; k += 2) {
28022 GemmMicrokernelTester()
28023 .mr(1)
28024 .nr(16)
28025 .kr(1)
28026 .sr(1)
28027 .m(1)
28028 .n(16)
28029 .k(k)
28030 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28031 }
28032 }
28033 }
28034
28035 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16_strided_cn) {
28036 TEST_REQUIRES_X86_AVX;
28037 for (uint32_t n = 32; n <= 48; n += 16) {
28038 for (size_t k = 1; k <= 5; k += 2) {
28039 GemmMicrokernelTester()
28040 .mr(1)
28041 .nr(16)
28042 .kr(1)
28043 .sr(1)
28044 .m(1)
28045 .n(n)
28046 .k(k)
28047 .cn_stride(19)
28048 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28049 }
28050 }
28051 }
28052
28053 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16_strided_a) {
28054 TEST_REQUIRES_X86_AVX;
28055 for (uint32_t n = 32; n <= 48; n += 16) {
28056 for (size_t k = 1; k <= 5; k += 2) {
28057 GemmMicrokernelTester()
28058 .mr(1)
28059 .nr(16)
28060 .kr(1)
28061 .sr(1)
28062 .m(1)
28063 .n(n)
28064 .k(k)
28065 .a_stride(7)
28066 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28067 }
28068 }
28069 }
28070
28071 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16_subtile) {
28072 TEST_REQUIRES_X86_AVX;
28073 for (uint32_t n = 32; n <= 48; n += 16) {
28074 for (size_t k = 1; k <= 5; k += 2) {
28075 for (uint32_t m = 1; m <= 1; m++) {
28076 GemmMicrokernelTester()
28077 .mr(1)
28078 .nr(16)
28079 .kr(1)
28080 .sr(1)
28081 .m(m)
28082 .n(n)
28083 .k(k)
28084 .iterations(1)
28085 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28086 }
28087 }
28088 }
28089 }
28090
28091 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, strided_cm_subtile) {
28092 TEST_REQUIRES_X86_AVX;
28093 for (size_t k = 1; k <= 5; k += 2) {
28094 for (uint32_t m = 1; m <= 1; m++) {
28095 for (uint32_t n = 1; n <= 16; n++) {
28096 GemmMicrokernelTester()
28097 .mr(1)
28098 .nr(16)
28099 .kr(1)
28100 .sr(1)
28101 .m(m)
28102 .n(n)
28103 .k(k)
28104 .cm_stride(19)
28105 .iterations(1)
28106 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28107 }
28108 }
28109 }
28110 }
28111
28112 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, qmin) {
28113 TEST_REQUIRES_X86_AVX;
28114 GemmMicrokernelTester()
28115 .mr(1)
28116 .nr(16)
28117 .kr(1)
28118 .sr(1)
28119 .m(1)
28120 .n(16)
28121 .k(1)
28122 .qmin(128)
28123 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28124 }
28125
28126 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, qmax) {
28127 TEST_REQUIRES_X86_AVX;
28128 GemmMicrokernelTester()
28129 .mr(1)
28130 .nr(16)
28131 .kr(1)
28132 .sr(1)
28133 .m(1)
28134 .n(16)
28135 .k(1)
28136 .qmax(128)
28137 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28138 }
28139
28140 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, strided_cm) {
28141 TEST_REQUIRES_X86_AVX;
28142 GemmMicrokernelTester()
28143 .mr(1)
28144 .nr(16)
28145 .kr(1)
28146 .sr(1)
28147 .m(1)
28148 .n(16)
28149 .k(1)
28150 .cm_stride(19)
28151 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
28152 }
28153#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28154
28155
28156#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28157 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1) {
28158 TEST_REQUIRES_X86_AVX;
28159 GemmMicrokernelTester()
28160 .mr(3)
28161 .nr(16)
28162 .kr(1)
28163 .sr(1)
28164 .m(3)
28165 .n(16)
28166 .k(1)
28167 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28168 }
28169
28170 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, strided_cn) {
28171 TEST_REQUIRES_X86_AVX;
28172 GemmMicrokernelTester()
28173 .mr(3)
28174 .nr(16)
28175 .kr(1)
28176 .sr(1)
28177 .m(3)
28178 .n(16)
28179 .k(1)
28180 .cn_stride(19)
28181 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28182 }
28183
28184 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_strided_a) {
28185 TEST_REQUIRES_X86_AVX;
28186 GemmMicrokernelTester()
28187 .mr(3)
28188 .nr(16)
28189 .kr(1)
28190 .sr(1)
28191 .m(3)
28192 .n(16)
28193 .k(1)
28194 .a_stride(3)
28195 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28196 }
28197
28198 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_subtile) {
28199 TEST_REQUIRES_X86_AVX;
28200 for (uint32_t m = 1; m <= 3; m++) {
28201 for (uint32_t n = 1; n <= 16; n++) {
28202 GemmMicrokernelTester()
28203 .mr(3)
28204 .nr(16)
28205 .kr(1)
28206 .sr(1)
28207 .m(m)
28208 .n(n)
28209 .k(1)
28210 .iterations(1)
28211 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28212 }
28213 }
28214 }
28215
28216 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
28217 TEST_REQUIRES_X86_AVX;
28218 for (uint32_t m = 1; m <= 3; m++) {
28219 GemmMicrokernelTester()
28220 .mr(3)
28221 .nr(16)
28222 .kr(1)
28223 .sr(1)
28224 .m(m)
28225 .n(16)
28226 .k(1)
28227 .iterations(1)
28228 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28229 }
28230 }
28231
28232 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
28233 TEST_REQUIRES_X86_AVX;
28234 for (uint32_t n = 1; n <= 16; n++) {
28235 GemmMicrokernelTester()
28236 .mr(3)
28237 .nr(16)
28238 .kr(1)
28239 .sr(1)
28240 .m(3)
28241 .n(n)
28242 .k(1)
28243 .iterations(1)
28244 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28245 }
28246 }
28247
28248 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_gt_1) {
28249 TEST_REQUIRES_X86_AVX;
28250 for (size_t k = 2; k < 10; k++) {
28251 GemmMicrokernelTester()
28252 .mr(3)
28253 .nr(16)
28254 .kr(1)
28255 .sr(1)
28256 .m(3)
28257 .n(16)
28258 .k(k)
28259 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28260 }
28261 }
28262
28263 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_gt_1_strided_a) {
28264 TEST_REQUIRES_X86_AVX;
28265 for (size_t k = 2; k < 10; k++) {
28266 GemmMicrokernelTester()
28267 .mr(3)
28268 .nr(16)
28269 .kr(1)
28270 .sr(1)
28271 .m(3)
28272 .n(16)
28273 .k(k)
28274 .a_stride(11)
28275 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28276 }
28277 }
28278
28279 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_gt_1_subtile) {
28280 TEST_REQUIRES_X86_AVX;
28281 for (size_t k = 2; k < 10; k++) {
28282 for (uint32_t m = 1; m <= 3; m++) {
28283 for (uint32_t n = 1; n <= 16; n++) {
28284 GemmMicrokernelTester()
28285 .mr(3)
28286 .nr(16)
28287 .kr(1)
28288 .sr(1)
28289 .m(m)
28290 .n(n)
28291 .k(k)
28292 .iterations(1)
28293 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28294 }
28295 }
28296 }
28297 }
28298
28299 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16) {
28300 TEST_REQUIRES_X86_AVX;
28301 for (uint32_t n = 17; n < 32; n++) {
28302 for (size_t k = 1; k <= 5; k += 2) {
28303 GemmMicrokernelTester()
28304 .mr(3)
28305 .nr(16)
28306 .kr(1)
28307 .sr(1)
28308 .m(3)
28309 .n(16)
28310 .k(k)
28311 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28312 }
28313 }
28314 }
28315
28316 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
28317 TEST_REQUIRES_X86_AVX;
28318 for (uint32_t n = 17; n < 32; n++) {
28319 for (size_t k = 1; k <= 5; k += 2) {
28320 GemmMicrokernelTester()
28321 .mr(3)
28322 .nr(16)
28323 .kr(1)
28324 .sr(1)
28325 .m(3)
28326 .n(16)
28327 .k(k)
28328 .cn_stride(19)
28329 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28330 }
28331 }
28332 }
28333
28334 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16_strided_a) {
28335 TEST_REQUIRES_X86_AVX;
28336 for (uint32_t n = 17; n < 32; n++) {
28337 for (size_t k = 1; k <= 5; k += 2) {
28338 GemmMicrokernelTester()
28339 .mr(3)
28340 .nr(16)
28341 .kr(1)
28342 .sr(1)
28343 .m(3)
28344 .n(n)
28345 .k(k)
28346 .a_stride(7)
28347 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28348 }
28349 }
28350 }
28351
28352 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16_subtile) {
28353 TEST_REQUIRES_X86_AVX;
28354 for (uint32_t n = 17; n < 32; n++) {
28355 for (size_t k = 1; k <= 5; k += 2) {
28356 for (uint32_t m = 1; m <= 3; m++) {
28357 GemmMicrokernelTester()
28358 .mr(3)
28359 .nr(16)
28360 .kr(1)
28361 .sr(1)
28362 .m(m)
28363 .n(n)
28364 .k(k)
28365 .iterations(1)
28366 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28367 }
28368 }
28369 }
28370 }
28371
28372 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16) {
28373 TEST_REQUIRES_X86_AVX;
28374 for (uint32_t n = 32; n <= 48; n += 16) {
28375 for (size_t k = 1; k <= 5; k += 2) {
28376 GemmMicrokernelTester()
28377 .mr(3)
28378 .nr(16)
28379 .kr(1)
28380 .sr(1)
28381 .m(3)
28382 .n(16)
28383 .k(k)
28384 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28385 }
28386 }
28387 }
28388
28389 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
28390 TEST_REQUIRES_X86_AVX;
28391 for (uint32_t n = 32; n <= 48; n += 16) {
28392 for (size_t k = 1; k <= 5; k += 2) {
28393 GemmMicrokernelTester()
28394 .mr(3)
28395 .nr(16)
28396 .kr(1)
28397 .sr(1)
28398 .m(3)
28399 .n(n)
28400 .k(k)
28401 .cn_stride(19)
28402 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28403 }
28404 }
28405 }
28406
28407 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16_strided_a) {
28408 TEST_REQUIRES_X86_AVX;
28409 for (uint32_t n = 32; n <= 48; n += 16) {
28410 for (size_t k = 1; k <= 5; k += 2) {
28411 GemmMicrokernelTester()
28412 .mr(3)
28413 .nr(16)
28414 .kr(1)
28415 .sr(1)
28416 .m(3)
28417 .n(n)
28418 .k(k)
28419 .a_stride(7)
28420 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28421 }
28422 }
28423 }
28424
28425 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16_subtile) {
28426 TEST_REQUIRES_X86_AVX;
28427 for (uint32_t n = 32; n <= 48; n += 16) {
28428 for (size_t k = 1; k <= 5; k += 2) {
28429 for (uint32_t m = 1; m <= 3; m++) {
28430 GemmMicrokernelTester()
28431 .mr(3)
28432 .nr(16)
28433 .kr(1)
28434 .sr(1)
28435 .m(m)
28436 .n(n)
28437 .k(k)
28438 .iterations(1)
28439 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28440 }
28441 }
28442 }
28443 }
28444
28445 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, strided_cm_subtile) {
28446 TEST_REQUIRES_X86_AVX;
28447 for (size_t k = 1; k <= 5; k += 2) {
28448 for (uint32_t m = 1; m <= 3; m++) {
28449 for (uint32_t n = 1; n <= 16; n++) {
28450 GemmMicrokernelTester()
28451 .mr(3)
28452 .nr(16)
28453 .kr(1)
28454 .sr(1)
28455 .m(m)
28456 .n(n)
28457 .k(k)
28458 .cm_stride(19)
28459 .iterations(1)
28460 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28461 }
28462 }
28463 }
28464 }
28465
28466 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, qmin) {
28467 TEST_REQUIRES_X86_AVX;
28468 GemmMicrokernelTester()
28469 .mr(3)
28470 .nr(16)
28471 .kr(1)
28472 .sr(1)
28473 .m(3)
28474 .n(16)
28475 .k(1)
28476 .qmin(128)
28477 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28478 }
28479
28480 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, qmax) {
28481 TEST_REQUIRES_X86_AVX;
28482 GemmMicrokernelTester()
28483 .mr(3)
28484 .nr(16)
28485 .kr(1)
28486 .sr(1)
28487 .m(3)
28488 .n(16)
28489 .k(1)
28490 .qmax(128)
28491 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28492 }
28493
28494 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, strided_cm) {
28495 TEST_REQUIRES_X86_AVX;
28496 GemmMicrokernelTester()
28497 .mr(3)
28498 .nr(16)
28499 .kr(1)
28500 .sr(1)
28501 .m(3)
28502 .n(16)
28503 .k(1)
28504 .cm_stride(19)
28505 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
28506 }
28507#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28508
28509
28510#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28511 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1) {
28512 TEST_REQUIRES_X86_AVX;
28513 GemmMicrokernelTester()
28514 .mr(4)
28515 .nr(16)
28516 .kr(1)
28517 .sr(1)
28518 .m(4)
28519 .n(16)
28520 .k(1)
28521 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28522 }
28523
28524 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, strided_cn) {
28525 TEST_REQUIRES_X86_AVX;
28526 GemmMicrokernelTester()
28527 .mr(4)
28528 .nr(16)
28529 .kr(1)
28530 .sr(1)
28531 .m(4)
28532 .n(16)
28533 .k(1)
28534 .cn_stride(19)
28535 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28536 }
28537
28538 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_strided_a) {
28539 TEST_REQUIRES_X86_AVX;
28540 GemmMicrokernelTester()
28541 .mr(4)
28542 .nr(16)
28543 .kr(1)
28544 .sr(1)
28545 .m(4)
28546 .n(16)
28547 .k(1)
28548 .a_stride(3)
28549 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28550 }
28551
28552 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_subtile) {
28553 TEST_REQUIRES_X86_AVX;
28554 for (uint32_t m = 1; m <= 4; m++) {
28555 for (uint32_t n = 1; n <= 16; n++) {
28556 GemmMicrokernelTester()
28557 .mr(4)
28558 .nr(16)
28559 .kr(1)
28560 .sr(1)
28561 .m(m)
28562 .n(n)
28563 .k(1)
28564 .iterations(1)
28565 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28566 }
28567 }
28568 }
28569
28570 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_subtile_m) {
28571 TEST_REQUIRES_X86_AVX;
28572 for (uint32_t m = 1; m <= 4; m++) {
28573 GemmMicrokernelTester()
28574 .mr(4)
28575 .nr(16)
28576 .kr(1)
28577 .sr(1)
28578 .m(m)
28579 .n(16)
28580 .k(1)
28581 .iterations(1)
28582 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28583 }
28584 }
28585
28586 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_subtile_n) {
28587 TEST_REQUIRES_X86_AVX;
28588 for (uint32_t n = 1; n <= 16; n++) {
28589 GemmMicrokernelTester()
28590 .mr(4)
28591 .nr(16)
28592 .kr(1)
28593 .sr(1)
28594 .m(4)
28595 .n(n)
28596 .k(1)
28597 .iterations(1)
28598 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28599 }
28600 }
28601
28602 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_gt_1) {
28603 TEST_REQUIRES_X86_AVX;
28604 for (size_t k = 2; k < 10; k++) {
28605 GemmMicrokernelTester()
28606 .mr(4)
28607 .nr(16)
28608 .kr(1)
28609 .sr(1)
28610 .m(4)
28611 .n(16)
28612 .k(k)
28613 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28614 }
28615 }
28616
28617 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_gt_1_strided_a) {
28618 TEST_REQUIRES_X86_AVX;
28619 for (size_t k = 2; k < 10; k++) {
28620 GemmMicrokernelTester()
28621 .mr(4)
28622 .nr(16)
28623 .kr(1)
28624 .sr(1)
28625 .m(4)
28626 .n(16)
28627 .k(k)
28628 .a_stride(11)
28629 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28630 }
28631 }
28632
28633 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_gt_1_subtile) {
28634 TEST_REQUIRES_X86_AVX;
28635 for (size_t k = 2; k < 10; k++) {
28636 for (uint32_t m = 1; m <= 4; m++) {
28637 for (uint32_t n = 1; n <= 16; n++) {
28638 GemmMicrokernelTester()
28639 .mr(4)
28640 .nr(16)
28641 .kr(1)
28642 .sr(1)
28643 .m(m)
28644 .n(n)
28645 .k(k)
28646 .iterations(1)
28647 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28648 }
28649 }
28650 }
28651 }
28652
28653 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16) {
28654 TEST_REQUIRES_X86_AVX;
28655 for (uint32_t n = 17; n < 32; n++) {
28656 for (size_t k = 1; k <= 5; k += 2) {
28657 GemmMicrokernelTester()
28658 .mr(4)
28659 .nr(16)
28660 .kr(1)
28661 .sr(1)
28662 .m(4)
28663 .n(16)
28664 .k(k)
28665 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28666 }
28667 }
28668 }
28669
28670 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16_strided_cn) {
28671 TEST_REQUIRES_X86_AVX;
28672 for (uint32_t n = 17; n < 32; n++) {
28673 for (size_t k = 1; k <= 5; k += 2) {
28674 GemmMicrokernelTester()
28675 .mr(4)
28676 .nr(16)
28677 .kr(1)
28678 .sr(1)
28679 .m(4)
28680 .n(16)
28681 .k(k)
28682 .cn_stride(19)
28683 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28684 }
28685 }
28686 }
28687
28688 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16_strided_a) {
28689 TEST_REQUIRES_X86_AVX;
28690 for (uint32_t n = 17; n < 32; n++) {
28691 for (size_t k = 1; k <= 5; k += 2) {
28692 GemmMicrokernelTester()
28693 .mr(4)
28694 .nr(16)
28695 .kr(1)
28696 .sr(1)
28697 .m(4)
28698 .n(n)
28699 .k(k)
28700 .a_stride(7)
28701 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28702 }
28703 }
28704 }
28705
28706 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16_subtile) {
28707 TEST_REQUIRES_X86_AVX;
28708 for (uint32_t n = 17; n < 32; n++) {
28709 for (size_t k = 1; k <= 5; k += 2) {
28710 for (uint32_t m = 1; m <= 4; m++) {
28711 GemmMicrokernelTester()
28712 .mr(4)
28713 .nr(16)
28714 .kr(1)
28715 .sr(1)
28716 .m(m)
28717 .n(n)
28718 .k(k)
28719 .iterations(1)
28720 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28721 }
28722 }
28723 }
28724 }
28725
28726 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16) {
28727 TEST_REQUIRES_X86_AVX;
28728 for (uint32_t n = 32; n <= 48; n += 16) {
28729 for (size_t k = 1; k <= 5; k += 2) {
28730 GemmMicrokernelTester()
28731 .mr(4)
28732 .nr(16)
28733 .kr(1)
28734 .sr(1)
28735 .m(4)
28736 .n(16)
28737 .k(k)
28738 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28739 }
28740 }
28741 }
28742
28743 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16_strided_cn) {
28744 TEST_REQUIRES_X86_AVX;
28745 for (uint32_t n = 32; n <= 48; n += 16) {
28746 for (size_t k = 1; k <= 5; k += 2) {
28747 GemmMicrokernelTester()
28748 .mr(4)
28749 .nr(16)
28750 .kr(1)
28751 .sr(1)
28752 .m(4)
28753 .n(n)
28754 .k(k)
28755 .cn_stride(19)
28756 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28757 }
28758 }
28759 }
28760
28761 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16_strided_a) {
28762 TEST_REQUIRES_X86_AVX;
28763 for (uint32_t n = 32; n <= 48; n += 16) {
28764 for (size_t k = 1; k <= 5; k += 2) {
28765 GemmMicrokernelTester()
28766 .mr(4)
28767 .nr(16)
28768 .kr(1)
28769 .sr(1)
28770 .m(4)
28771 .n(n)
28772 .k(k)
28773 .a_stride(7)
28774 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28775 }
28776 }
28777 }
28778
28779 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16_subtile) {
28780 TEST_REQUIRES_X86_AVX;
28781 for (uint32_t n = 32; n <= 48; n += 16) {
28782 for (size_t k = 1; k <= 5; k += 2) {
28783 for (uint32_t m = 1; m <= 4; m++) {
28784 GemmMicrokernelTester()
28785 .mr(4)
28786 .nr(16)
28787 .kr(1)
28788 .sr(1)
28789 .m(m)
28790 .n(n)
28791 .k(k)
28792 .iterations(1)
28793 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28794 }
28795 }
28796 }
28797 }
28798
28799 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, strided_cm_subtile) {
28800 TEST_REQUIRES_X86_AVX;
28801 for (size_t k = 1; k <= 5; k += 2) {
28802 for (uint32_t m = 1; m <= 4; m++) {
28803 for (uint32_t n = 1; n <= 16; n++) {
28804 GemmMicrokernelTester()
28805 .mr(4)
28806 .nr(16)
28807 .kr(1)
28808 .sr(1)
28809 .m(m)
28810 .n(n)
28811 .k(k)
28812 .cm_stride(19)
28813 .iterations(1)
28814 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28815 }
28816 }
28817 }
28818 }
28819
28820 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, qmin) {
28821 TEST_REQUIRES_X86_AVX;
28822 GemmMicrokernelTester()
28823 .mr(4)
28824 .nr(16)
28825 .kr(1)
28826 .sr(1)
28827 .m(4)
28828 .n(16)
28829 .k(1)
28830 .qmin(128)
28831 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28832 }
28833
28834 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, qmax) {
28835 TEST_REQUIRES_X86_AVX;
28836 GemmMicrokernelTester()
28837 .mr(4)
28838 .nr(16)
28839 .kr(1)
28840 .sr(1)
28841 .m(4)
28842 .n(16)
28843 .k(1)
28844 .qmax(128)
28845 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28846 }
28847
28848 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, strided_cm) {
28849 TEST_REQUIRES_X86_AVX;
28850 GemmMicrokernelTester()
28851 .mr(4)
28852 .nr(16)
28853 .kr(1)
28854 .sr(1)
28855 .m(4)
28856 .n(16)
28857 .k(1)
28858 .cm_stride(19)
28859 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
28860 }
28861#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28862
28863
28864#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28865 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1) {
28866 TEST_REQUIRES_X86_AVX;
28867 GemmMicrokernelTester()
28868 .mr(5)
28869 .nr(16)
28870 .kr(1)
28871 .sr(1)
28872 .m(5)
28873 .n(16)
28874 .k(1)
28875 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28876 }
28877
28878 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, strided_cn) {
28879 TEST_REQUIRES_X86_AVX;
28880 GemmMicrokernelTester()
28881 .mr(5)
28882 .nr(16)
28883 .kr(1)
28884 .sr(1)
28885 .m(5)
28886 .n(16)
28887 .k(1)
28888 .cn_stride(19)
28889 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28890 }
28891
28892 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_strided_a) {
28893 TEST_REQUIRES_X86_AVX;
28894 GemmMicrokernelTester()
28895 .mr(5)
28896 .nr(16)
28897 .kr(1)
28898 .sr(1)
28899 .m(5)
28900 .n(16)
28901 .k(1)
28902 .a_stride(3)
28903 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28904 }
28905
28906 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_subtile) {
28907 TEST_REQUIRES_X86_AVX;
28908 for (uint32_t m = 1; m <= 5; m++) {
28909 for (uint32_t n = 1; n <= 16; n++) {
28910 GemmMicrokernelTester()
28911 .mr(5)
28912 .nr(16)
28913 .kr(1)
28914 .sr(1)
28915 .m(m)
28916 .n(n)
28917 .k(1)
28918 .iterations(1)
28919 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28920 }
28921 }
28922 }
28923
28924 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_subtile_m) {
28925 TEST_REQUIRES_X86_AVX;
28926 for (uint32_t m = 1; m <= 5; m++) {
28927 GemmMicrokernelTester()
28928 .mr(5)
28929 .nr(16)
28930 .kr(1)
28931 .sr(1)
28932 .m(m)
28933 .n(16)
28934 .k(1)
28935 .iterations(1)
28936 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28937 }
28938 }
28939
28940 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_subtile_n) {
28941 TEST_REQUIRES_X86_AVX;
28942 for (uint32_t n = 1; n <= 16; n++) {
28943 GemmMicrokernelTester()
28944 .mr(5)
28945 .nr(16)
28946 .kr(1)
28947 .sr(1)
28948 .m(5)
28949 .n(n)
28950 .k(1)
28951 .iterations(1)
28952 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28953 }
28954 }
28955
28956 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_gt_1) {
28957 TEST_REQUIRES_X86_AVX;
28958 for (size_t k = 2; k < 10; k++) {
28959 GemmMicrokernelTester()
28960 .mr(5)
28961 .nr(16)
28962 .kr(1)
28963 .sr(1)
28964 .m(5)
28965 .n(16)
28966 .k(k)
28967 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28968 }
28969 }
28970
28971 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_gt_1_strided_a) {
28972 TEST_REQUIRES_X86_AVX;
28973 for (size_t k = 2; k < 10; k++) {
28974 GemmMicrokernelTester()
28975 .mr(5)
28976 .nr(16)
28977 .kr(1)
28978 .sr(1)
28979 .m(5)
28980 .n(16)
28981 .k(k)
28982 .a_stride(11)
28983 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
28984 }
28985 }
28986
28987 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_gt_1_subtile) {
28988 TEST_REQUIRES_X86_AVX;
28989 for (size_t k = 2; k < 10; k++) {
28990 for (uint32_t m = 1; m <= 5; m++) {
28991 for (uint32_t n = 1; n <= 16; n++) {
28992 GemmMicrokernelTester()
28993 .mr(5)
28994 .nr(16)
28995 .kr(1)
28996 .sr(1)
28997 .m(m)
28998 .n(n)
28999 .k(k)
29000 .iterations(1)
29001 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29002 }
29003 }
29004 }
29005 }
29006
29007 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16) {
29008 TEST_REQUIRES_X86_AVX;
29009 for (uint32_t n = 17; n < 32; n++) {
29010 for (size_t k = 1; k <= 5; k += 2) {
29011 GemmMicrokernelTester()
29012 .mr(5)
29013 .nr(16)
29014 .kr(1)
29015 .sr(1)
29016 .m(5)
29017 .n(16)
29018 .k(k)
29019 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29020 }
29021 }
29022 }
29023
29024 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16_strided_cn) {
29025 TEST_REQUIRES_X86_AVX;
29026 for (uint32_t n = 17; n < 32; n++) {
29027 for (size_t k = 1; k <= 5; k += 2) {
29028 GemmMicrokernelTester()
29029 .mr(5)
29030 .nr(16)
29031 .kr(1)
29032 .sr(1)
29033 .m(5)
29034 .n(16)
29035 .k(k)
29036 .cn_stride(19)
29037 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29038 }
29039 }
29040 }
29041
29042 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16_strided_a) {
29043 TEST_REQUIRES_X86_AVX;
29044 for (uint32_t n = 17; n < 32; n++) {
29045 for (size_t k = 1; k <= 5; k += 2) {
29046 GemmMicrokernelTester()
29047 .mr(5)
29048 .nr(16)
29049 .kr(1)
29050 .sr(1)
29051 .m(5)
29052 .n(n)
29053 .k(k)
29054 .a_stride(7)
29055 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29056 }
29057 }
29058 }
29059
29060 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16_subtile) {
29061 TEST_REQUIRES_X86_AVX;
29062 for (uint32_t n = 17; n < 32; n++) {
29063 for (size_t k = 1; k <= 5; k += 2) {
29064 for (uint32_t m = 1; m <= 5; m++) {
29065 GemmMicrokernelTester()
29066 .mr(5)
29067 .nr(16)
29068 .kr(1)
29069 .sr(1)
29070 .m(m)
29071 .n(n)
29072 .k(k)
29073 .iterations(1)
29074 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29075 }
29076 }
29077 }
29078 }
29079
29080 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16) {
29081 TEST_REQUIRES_X86_AVX;
29082 for (uint32_t n = 32; n <= 48; n += 16) {
29083 for (size_t k = 1; k <= 5; k += 2) {
29084 GemmMicrokernelTester()
29085 .mr(5)
29086 .nr(16)
29087 .kr(1)
29088 .sr(1)
29089 .m(5)
29090 .n(16)
29091 .k(k)
29092 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29093 }
29094 }
29095 }
29096
29097 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16_strided_cn) {
29098 TEST_REQUIRES_X86_AVX;
29099 for (uint32_t n = 32; n <= 48; n += 16) {
29100 for (size_t k = 1; k <= 5; k += 2) {
29101 GemmMicrokernelTester()
29102 .mr(5)
29103 .nr(16)
29104 .kr(1)
29105 .sr(1)
29106 .m(5)
29107 .n(n)
29108 .k(k)
29109 .cn_stride(19)
29110 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29111 }
29112 }
29113 }
29114
29115 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16_strided_a) {
29116 TEST_REQUIRES_X86_AVX;
29117 for (uint32_t n = 32; n <= 48; n += 16) {
29118 for (size_t k = 1; k <= 5; k += 2) {
29119 GemmMicrokernelTester()
29120 .mr(5)
29121 .nr(16)
29122 .kr(1)
29123 .sr(1)
29124 .m(5)
29125 .n(n)
29126 .k(k)
29127 .a_stride(7)
29128 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29129 }
29130 }
29131 }
29132
29133 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16_subtile) {
29134 TEST_REQUIRES_X86_AVX;
29135 for (uint32_t n = 32; n <= 48; n += 16) {
29136 for (size_t k = 1; k <= 5; k += 2) {
29137 for (uint32_t m = 1; m <= 5; m++) {
29138 GemmMicrokernelTester()
29139 .mr(5)
29140 .nr(16)
29141 .kr(1)
29142 .sr(1)
29143 .m(m)
29144 .n(n)
29145 .k(k)
29146 .iterations(1)
29147 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29148 }
29149 }
29150 }
29151 }
29152
29153 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, strided_cm_subtile) {
29154 TEST_REQUIRES_X86_AVX;
29155 for (size_t k = 1; k <= 5; k += 2) {
29156 for (uint32_t m = 1; m <= 5; m++) {
29157 for (uint32_t n = 1; n <= 16; n++) {
29158 GemmMicrokernelTester()
29159 .mr(5)
29160 .nr(16)
29161 .kr(1)
29162 .sr(1)
29163 .m(m)
29164 .n(n)
29165 .k(k)
29166 .cm_stride(19)
29167 .iterations(1)
29168 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29169 }
29170 }
29171 }
29172 }
29173
29174 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, qmin) {
29175 TEST_REQUIRES_X86_AVX;
29176 GemmMicrokernelTester()
29177 .mr(5)
29178 .nr(16)
29179 .kr(1)
29180 .sr(1)
29181 .m(5)
29182 .n(16)
29183 .k(1)
29184 .qmin(128)
29185 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29186 }
29187
29188 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, qmax) {
29189 TEST_REQUIRES_X86_AVX;
29190 GemmMicrokernelTester()
29191 .mr(5)
29192 .nr(16)
29193 .kr(1)
29194 .sr(1)
29195 .m(5)
29196 .n(16)
29197 .k(1)
29198 .qmax(128)
29199 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29200 }
29201
29202 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, strided_cm) {
29203 TEST_REQUIRES_X86_AVX;
29204 GemmMicrokernelTester()
29205 .mr(5)
29206 .nr(16)
29207 .kr(1)
29208 .sr(1)
29209 .m(5)
29210 .n(16)
29211 .k(1)
29212 .cm_stride(19)
29213 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
29214 }
29215#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29216
29217
29218#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29219 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1) {
29220 TEST_REQUIRES_X86_FMA3;
29221 GemmMicrokernelTester()
29222 .mr(1)
29223 .nr(8)
29224 .kr(1)
29225 .sr(1)
29226 .m(1)
29227 .n(8)
29228 .k(1)
29229 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29230 }
29231
29232 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, strided_cn) {
29233 TEST_REQUIRES_X86_FMA3;
29234 GemmMicrokernelTester()
29235 .mr(1)
29236 .nr(8)
29237 .kr(1)
29238 .sr(1)
29239 .m(1)
29240 .n(8)
29241 .k(1)
29242 .cn_stride(11)
29243 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29244 }
29245
29246 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_strided_a) {
29247 TEST_REQUIRES_X86_FMA3;
29248 GemmMicrokernelTester()
29249 .mr(1)
29250 .nr(8)
29251 .kr(1)
29252 .sr(1)
29253 .m(1)
29254 .n(8)
29255 .k(1)
29256 .a_stride(3)
29257 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29258 }
29259
29260 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
29261 TEST_REQUIRES_X86_FMA3;
29262 for (uint32_t m = 1; m <= 1; m++) {
29263 for (uint32_t n = 1; n <= 8; n++) {
29264 GemmMicrokernelTester()
29265 .mr(1)
29266 .nr(8)
29267 .kr(1)
29268 .sr(1)
29269 .m(m)
29270 .n(n)
29271 .k(1)
29272 .iterations(1)
29273 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29274 }
29275 }
29276 }
29277
29278 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
29279 TEST_REQUIRES_X86_FMA3;
29280 for (uint32_t m = 1; m <= 1; m++) {
29281 GemmMicrokernelTester()
29282 .mr(1)
29283 .nr(8)
29284 .kr(1)
29285 .sr(1)
29286 .m(m)
29287 .n(8)
29288 .k(1)
29289 .iterations(1)
29290 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29291 }
29292 }
29293
29294 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
29295 TEST_REQUIRES_X86_FMA3;
29296 for (uint32_t n = 1; n <= 8; n++) {
29297 GemmMicrokernelTester()
29298 .mr(1)
29299 .nr(8)
29300 .kr(1)
29301 .sr(1)
29302 .m(1)
29303 .n(n)
29304 .k(1)
29305 .iterations(1)
29306 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29307 }
29308 }
29309
29310 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_gt_1) {
29311 TEST_REQUIRES_X86_FMA3;
29312 for (size_t k = 2; k < 10; k++) {
29313 GemmMicrokernelTester()
29314 .mr(1)
29315 .nr(8)
29316 .kr(1)
29317 .sr(1)
29318 .m(1)
29319 .n(8)
29320 .k(k)
29321 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29322 }
29323 }
29324
29325 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_gt_1_strided_a) {
29326 TEST_REQUIRES_X86_FMA3;
29327 for (size_t k = 2; k < 10; k++) {
29328 GemmMicrokernelTester()
29329 .mr(1)
29330 .nr(8)
29331 .kr(1)
29332 .sr(1)
29333 .m(1)
29334 .n(8)
29335 .k(k)
29336 .a_stride(11)
29337 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29338 }
29339 }
29340
29341 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
29342 TEST_REQUIRES_X86_FMA3;
29343 for (size_t k = 2; k < 10; k++) {
29344 for (uint32_t m = 1; m <= 1; m++) {
29345 for (uint32_t n = 1; n <= 8; n++) {
29346 GemmMicrokernelTester()
29347 .mr(1)
29348 .nr(8)
29349 .kr(1)
29350 .sr(1)
29351 .m(m)
29352 .n(n)
29353 .k(k)
29354 .iterations(1)
29355 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29356 }
29357 }
29358 }
29359 }
29360
29361 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8) {
29362 TEST_REQUIRES_X86_FMA3;
29363 for (uint32_t n = 9; n < 16; n++) {
29364 for (size_t k = 1; k <= 5; k += 2) {
29365 GemmMicrokernelTester()
29366 .mr(1)
29367 .nr(8)
29368 .kr(1)
29369 .sr(1)
29370 .m(1)
29371 .n(8)
29372 .k(k)
29373 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29374 }
29375 }
29376 }
29377
29378 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
29379 TEST_REQUIRES_X86_FMA3;
29380 for (uint32_t n = 9; n < 16; n++) {
29381 for (size_t k = 1; k <= 5; k += 2) {
29382 GemmMicrokernelTester()
29383 .mr(1)
29384 .nr(8)
29385 .kr(1)
29386 .sr(1)
29387 .m(1)
29388 .n(8)
29389 .k(k)
29390 .cn_stride(11)
29391 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29392 }
29393 }
29394 }
29395
29396 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8_strided_a) {
29397 TEST_REQUIRES_X86_FMA3;
29398 for (uint32_t n = 9; n < 16; n++) {
29399 for (size_t k = 1; k <= 5; k += 2) {
29400 GemmMicrokernelTester()
29401 .mr(1)
29402 .nr(8)
29403 .kr(1)
29404 .sr(1)
29405 .m(1)
29406 .n(n)
29407 .k(k)
29408 .a_stride(7)
29409 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29410 }
29411 }
29412 }
29413
29414 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
29415 TEST_REQUIRES_X86_FMA3;
29416 for (uint32_t n = 9; n < 16; n++) {
29417 for (size_t k = 1; k <= 5; k += 2) {
29418 for (uint32_t m = 1; m <= 1; m++) {
29419 GemmMicrokernelTester()
29420 .mr(1)
29421 .nr(8)
29422 .kr(1)
29423 .sr(1)
29424 .m(m)
29425 .n(n)
29426 .k(k)
29427 .iterations(1)
29428 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29429 }
29430 }
29431 }
29432 }
29433
29434 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8) {
29435 TEST_REQUIRES_X86_FMA3;
29436 for (uint32_t n = 16; n <= 24; n += 8) {
29437 for (size_t k = 1; k <= 5; k += 2) {
29438 GemmMicrokernelTester()
29439 .mr(1)
29440 .nr(8)
29441 .kr(1)
29442 .sr(1)
29443 .m(1)
29444 .n(8)
29445 .k(k)
29446 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29447 }
29448 }
29449 }
29450
29451 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
29452 TEST_REQUIRES_X86_FMA3;
29453 for (uint32_t n = 16; n <= 24; n += 8) {
29454 for (size_t k = 1; k <= 5; k += 2) {
29455 GemmMicrokernelTester()
29456 .mr(1)
29457 .nr(8)
29458 .kr(1)
29459 .sr(1)
29460 .m(1)
29461 .n(n)
29462 .k(k)
29463 .cn_stride(11)
29464 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29465 }
29466 }
29467 }
29468
29469 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8_strided_a) {
29470 TEST_REQUIRES_X86_FMA3;
29471 for (uint32_t n = 16; n <= 24; n += 8) {
29472 for (size_t k = 1; k <= 5; k += 2) {
29473 GemmMicrokernelTester()
29474 .mr(1)
29475 .nr(8)
29476 .kr(1)
29477 .sr(1)
29478 .m(1)
29479 .n(n)
29480 .k(k)
29481 .a_stride(7)
29482 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29483 }
29484 }
29485 }
29486
29487 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8_subtile) {
29488 TEST_REQUIRES_X86_FMA3;
29489 for (uint32_t n = 16; n <= 24; n += 8) {
29490 for (size_t k = 1; k <= 5; k += 2) {
29491 for (uint32_t m = 1; m <= 1; m++) {
29492 GemmMicrokernelTester()
29493 .mr(1)
29494 .nr(8)
29495 .kr(1)
29496 .sr(1)
29497 .m(m)
29498 .n(n)
29499 .k(k)
29500 .iterations(1)
29501 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29502 }
29503 }
29504 }
29505 }
29506
29507 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, strided_cm_subtile) {
29508 TEST_REQUIRES_X86_FMA3;
29509 for (size_t k = 1; k <= 5; k += 2) {
29510 for (uint32_t m = 1; m <= 1; m++) {
29511 for (uint32_t n = 1; n <= 8; n++) {
29512 GemmMicrokernelTester()
29513 .mr(1)
29514 .nr(8)
29515 .kr(1)
29516 .sr(1)
29517 .m(m)
29518 .n(n)
29519 .k(k)
29520 .cm_stride(11)
29521 .iterations(1)
29522 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29523 }
29524 }
29525 }
29526 }
29527
29528 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, qmin) {
29529 TEST_REQUIRES_X86_FMA3;
29530 GemmMicrokernelTester()
29531 .mr(1)
29532 .nr(8)
29533 .kr(1)
29534 .sr(1)
29535 .m(1)
29536 .n(8)
29537 .k(1)
29538 .qmin(128)
29539 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29540 }
29541
29542 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, qmax) {
29543 TEST_REQUIRES_X86_FMA3;
29544 GemmMicrokernelTester()
29545 .mr(1)
29546 .nr(8)
29547 .kr(1)
29548 .sr(1)
29549 .m(1)
29550 .n(8)
29551 .k(1)
29552 .qmax(128)
29553 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29554 }
29555
29556 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, strided_cm) {
29557 TEST_REQUIRES_X86_FMA3;
29558 GemmMicrokernelTester()
29559 .mr(1)
29560 .nr(8)
29561 .kr(1)
29562 .sr(1)
29563 .m(1)
29564 .n(8)
29565 .k(1)
29566 .cm_stride(11)
29567 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
29568 }
29569#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29570
29571
29572#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29573 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1) {
29574 TEST_REQUIRES_X86_FMA3;
29575 GemmMicrokernelTester()
29576 .mr(4)
29577 .nr(8)
29578 .kr(1)
29579 .sr(1)
29580 .m(4)
29581 .n(8)
29582 .k(1)
29583 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29584 }
29585
29586 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, strided_cn) {
29587 TEST_REQUIRES_X86_FMA3;
29588 GemmMicrokernelTester()
29589 .mr(4)
29590 .nr(8)
29591 .kr(1)
29592 .sr(1)
29593 .m(4)
29594 .n(8)
29595 .k(1)
29596 .cn_stride(11)
29597 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29598 }
29599
29600 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_strided_a) {
29601 TEST_REQUIRES_X86_FMA3;
29602 GemmMicrokernelTester()
29603 .mr(4)
29604 .nr(8)
29605 .kr(1)
29606 .sr(1)
29607 .m(4)
29608 .n(8)
29609 .k(1)
29610 .a_stride(3)
29611 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29612 }
29613
29614 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
29615 TEST_REQUIRES_X86_FMA3;
29616 for (uint32_t m = 1; m <= 4; m++) {
29617 for (uint32_t n = 1; n <= 8; n++) {
29618 GemmMicrokernelTester()
29619 .mr(4)
29620 .nr(8)
29621 .kr(1)
29622 .sr(1)
29623 .m(m)
29624 .n(n)
29625 .k(1)
29626 .iterations(1)
29627 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29628 }
29629 }
29630 }
29631
29632 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
29633 TEST_REQUIRES_X86_FMA3;
29634 for (uint32_t m = 1; m <= 4; m++) {
29635 GemmMicrokernelTester()
29636 .mr(4)
29637 .nr(8)
29638 .kr(1)
29639 .sr(1)
29640 .m(m)
29641 .n(8)
29642 .k(1)
29643 .iterations(1)
29644 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29645 }
29646 }
29647
29648 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
29649 TEST_REQUIRES_X86_FMA3;
29650 for (uint32_t n = 1; n <= 8; n++) {
29651 GemmMicrokernelTester()
29652 .mr(4)
29653 .nr(8)
29654 .kr(1)
29655 .sr(1)
29656 .m(4)
29657 .n(n)
29658 .k(1)
29659 .iterations(1)
29660 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29661 }
29662 }
29663
29664 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_gt_1) {
29665 TEST_REQUIRES_X86_FMA3;
29666 for (size_t k = 2; k < 10; k++) {
29667 GemmMicrokernelTester()
29668 .mr(4)
29669 .nr(8)
29670 .kr(1)
29671 .sr(1)
29672 .m(4)
29673 .n(8)
29674 .k(k)
29675 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29676 }
29677 }
29678
29679 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_gt_1_strided_a) {
29680 TEST_REQUIRES_X86_FMA3;
29681 for (size_t k = 2; k < 10; k++) {
29682 GemmMicrokernelTester()
29683 .mr(4)
29684 .nr(8)
29685 .kr(1)
29686 .sr(1)
29687 .m(4)
29688 .n(8)
29689 .k(k)
29690 .a_stride(11)
29691 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29692 }
29693 }
29694
29695 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
29696 TEST_REQUIRES_X86_FMA3;
29697 for (size_t k = 2; k < 10; k++) {
29698 for (uint32_t m = 1; m <= 4; m++) {
29699 for (uint32_t n = 1; n <= 8; n++) {
29700 GemmMicrokernelTester()
29701 .mr(4)
29702 .nr(8)
29703 .kr(1)
29704 .sr(1)
29705 .m(m)
29706 .n(n)
29707 .k(k)
29708 .iterations(1)
29709 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29710 }
29711 }
29712 }
29713 }
29714
29715 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8) {
29716 TEST_REQUIRES_X86_FMA3;
29717 for (uint32_t n = 9; n < 16; n++) {
29718 for (size_t k = 1; k <= 5; k += 2) {
29719 GemmMicrokernelTester()
29720 .mr(4)
29721 .nr(8)
29722 .kr(1)
29723 .sr(1)
29724 .m(4)
29725 .n(8)
29726 .k(k)
29727 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29728 }
29729 }
29730 }
29731
29732 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
29733 TEST_REQUIRES_X86_FMA3;
29734 for (uint32_t n = 9; n < 16; n++) {
29735 for (size_t k = 1; k <= 5; k += 2) {
29736 GemmMicrokernelTester()
29737 .mr(4)
29738 .nr(8)
29739 .kr(1)
29740 .sr(1)
29741 .m(4)
29742 .n(8)
29743 .k(k)
29744 .cn_stride(11)
29745 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29746 }
29747 }
29748 }
29749
29750 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8_strided_a) {
29751 TEST_REQUIRES_X86_FMA3;
29752 for (uint32_t n = 9; n < 16; n++) {
29753 for (size_t k = 1; k <= 5; k += 2) {
29754 GemmMicrokernelTester()
29755 .mr(4)
29756 .nr(8)
29757 .kr(1)
29758 .sr(1)
29759 .m(4)
29760 .n(n)
29761 .k(k)
29762 .a_stride(7)
29763 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29764 }
29765 }
29766 }
29767
29768 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
29769 TEST_REQUIRES_X86_FMA3;
29770 for (uint32_t n = 9; n < 16; n++) {
29771 for (size_t k = 1; k <= 5; k += 2) {
29772 for (uint32_t m = 1; m <= 4; m++) {
29773 GemmMicrokernelTester()
29774 .mr(4)
29775 .nr(8)
29776 .kr(1)
29777 .sr(1)
29778 .m(m)
29779 .n(n)
29780 .k(k)
29781 .iterations(1)
29782 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29783 }
29784 }
29785 }
29786 }
29787
29788 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8) {
29789 TEST_REQUIRES_X86_FMA3;
29790 for (uint32_t n = 16; n <= 24; n += 8) {
29791 for (size_t k = 1; k <= 5; k += 2) {
29792 GemmMicrokernelTester()
29793 .mr(4)
29794 .nr(8)
29795 .kr(1)
29796 .sr(1)
29797 .m(4)
29798 .n(8)
29799 .k(k)
29800 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29801 }
29802 }
29803 }
29804
29805 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
29806 TEST_REQUIRES_X86_FMA3;
29807 for (uint32_t n = 16; n <= 24; n += 8) {
29808 for (size_t k = 1; k <= 5; k += 2) {
29809 GemmMicrokernelTester()
29810 .mr(4)
29811 .nr(8)
29812 .kr(1)
29813 .sr(1)
29814 .m(4)
29815 .n(n)
29816 .k(k)
29817 .cn_stride(11)
29818 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29819 }
29820 }
29821 }
29822
29823 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8_strided_a) {
29824 TEST_REQUIRES_X86_FMA3;
29825 for (uint32_t n = 16; n <= 24; n += 8) {
29826 for (size_t k = 1; k <= 5; k += 2) {
29827 GemmMicrokernelTester()
29828 .mr(4)
29829 .nr(8)
29830 .kr(1)
29831 .sr(1)
29832 .m(4)
29833 .n(n)
29834 .k(k)
29835 .a_stride(7)
29836 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29837 }
29838 }
29839 }
29840
29841 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8_subtile) {
29842 TEST_REQUIRES_X86_FMA3;
29843 for (uint32_t n = 16; n <= 24; n += 8) {
29844 for (size_t k = 1; k <= 5; k += 2) {
29845 for (uint32_t m = 1; m <= 4; m++) {
29846 GemmMicrokernelTester()
29847 .mr(4)
29848 .nr(8)
29849 .kr(1)
29850 .sr(1)
29851 .m(m)
29852 .n(n)
29853 .k(k)
29854 .iterations(1)
29855 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29856 }
29857 }
29858 }
29859 }
29860
29861 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, strided_cm_subtile) {
29862 TEST_REQUIRES_X86_FMA3;
29863 for (size_t k = 1; k <= 5; k += 2) {
29864 for (uint32_t m = 1; m <= 4; m++) {
29865 for (uint32_t n = 1; n <= 8; n++) {
29866 GemmMicrokernelTester()
29867 .mr(4)
29868 .nr(8)
29869 .kr(1)
29870 .sr(1)
29871 .m(m)
29872 .n(n)
29873 .k(k)
29874 .cm_stride(11)
29875 .iterations(1)
29876 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29877 }
29878 }
29879 }
29880 }
29881
29882 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, qmin) {
29883 TEST_REQUIRES_X86_FMA3;
29884 GemmMicrokernelTester()
29885 .mr(4)
29886 .nr(8)
29887 .kr(1)
29888 .sr(1)
29889 .m(4)
29890 .n(8)
29891 .k(1)
29892 .qmin(128)
29893 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29894 }
29895
29896 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, qmax) {
29897 TEST_REQUIRES_X86_FMA3;
29898 GemmMicrokernelTester()
29899 .mr(4)
29900 .nr(8)
29901 .kr(1)
29902 .sr(1)
29903 .m(4)
29904 .n(8)
29905 .k(1)
29906 .qmax(128)
29907 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29908 }
29909
29910 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, strided_cm) {
29911 TEST_REQUIRES_X86_FMA3;
29912 GemmMicrokernelTester()
29913 .mr(4)
29914 .nr(8)
29915 .kr(1)
29916 .sr(1)
29917 .m(4)
29918 .n(8)
29919 .k(1)
29920 .cm_stride(11)
29921 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
29922 }
29923#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29924
29925
29926#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29927 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1) {
29928 TEST_REQUIRES_X86_FMA3;
29929 GemmMicrokernelTester()
29930 .mr(5)
29931 .nr(8)
29932 .kr(1)
29933 .sr(1)
29934 .m(5)
29935 .n(8)
29936 .k(1)
29937 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
29938 }
29939
29940 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, strided_cn) {
29941 TEST_REQUIRES_X86_FMA3;
29942 GemmMicrokernelTester()
29943 .mr(5)
29944 .nr(8)
29945 .kr(1)
29946 .sr(1)
29947 .m(5)
29948 .n(8)
29949 .k(1)
29950 .cn_stride(11)
29951 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
29952 }
29953
29954 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_strided_a) {
29955 TEST_REQUIRES_X86_FMA3;
29956 GemmMicrokernelTester()
29957 .mr(5)
29958 .nr(8)
29959 .kr(1)
29960 .sr(1)
29961 .m(5)
29962 .n(8)
29963 .k(1)
29964 .a_stride(3)
29965 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
29966 }
29967
29968 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
29969 TEST_REQUIRES_X86_FMA3;
29970 for (uint32_t m = 1; m <= 5; m++) {
29971 for (uint32_t n = 1; n <= 8; n++) {
29972 GemmMicrokernelTester()
29973 .mr(5)
29974 .nr(8)
29975 .kr(1)
29976 .sr(1)
29977 .m(m)
29978 .n(n)
29979 .k(1)
29980 .iterations(1)
29981 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
29982 }
29983 }
29984 }
29985
29986 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
29987 TEST_REQUIRES_X86_FMA3;
29988 for (uint32_t m = 1; m <= 5; m++) {
29989 GemmMicrokernelTester()
29990 .mr(5)
29991 .nr(8)
29992 .kr(1)
29993 .sr(1)
29994 .m(m)
29995 .n(8)
29996 .k(1)
29997 .iterations(1)
29998 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
29999 }
30000 }
30001
30002 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
30003 TEST_REQUIRES_X86_FMA3;
30004 for (uint32_t n = 1; n <= 8; n++) {
30005 GemmMicrokernelTester()
30006 .mr(5)
30007 .nr(8)
30008 .kr(1)
30009 .sr(1)
30010 .m(5)
30011 .n(n)
30012 .k(1)
30013 .iterations(1)
30014 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30015 }
30016 }
30017
30018 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_gt_1) {
30019 TEST_REQUIRES_X86_FMA3;
30020 for (size_t k = 2; k < 10; k++) {
30021 GemmMicrokernelTester()
30022 .mr(5)
30023 .nr(8)
30024 .kr(1)
30025 .sr(1)
30026 .m(5)
30027 .n(8)
30028 .k(k)
30029 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30030 }
30031 }
30032
30033 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_gt_1_strided_a) {
30034 TEST_REQUIRES_X86_FMA3;
30035 for (size_t k = 2; k < 10; k++) {
30036 GemmMicrokernelTester()
30037 .mr(5)
30038 .nr(8)
30039 .kr(1)
30040 .sr(1)
30041 .m(5)
30042 .n(8)
30043 .k(k)
30044 .a_stride(11)
30045 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30046 }
30047 }
30048
30049 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
30050 TEST_REQUIRES_X86_FMA3;
30051 for (size_t k = 2; k < 10; k++) {
30052 for (uint32_t m = 1; m <= 5; m++) {
30053 for (uint32_t n = 1; n <= 8; n++) {
30054 GemmMicrokernelTester()
30055 .mr(5)
30056 .nr(8)
30057 .kr(1)
30058 .sr(1)
30059 .m(m)
30060 .n(n)
30061 .k(k)
30062 .iterations(1)
30063 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30064 }
30065 }
30066 }
30067 }
30068
30069 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8) {
30070 TEST_REQUIRES_X86_FMA3;
30071 for (uint32_t n = 9; n < 16; n++) {
30072 for (size_t k = 1; k <= 5; k += 2) {
30073 GemmMicrokernelTester()
30074 .mr(5)
30075 .nr(8)
30076 .kr(1)
30077 .sr(1)
30078 .m(5)
30079 .n(8)
30080 .k(k)
30081 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30082 }
30083 }
30084 }
30085
30086 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
30087 TEST_REQUIRES_X86_FMA3;
30088 for (uint32_t n = 9; n < 16; n++) {
30089 for (size_t k = 1; k <= 5; k += 2) {
30090 GemmMicrokernelTester()
30091 .mr(5)
30092 .nr(8)
30093 .kr(1)
30094 .sr(1)
30095 .m(5)
30096 .n(8)
30097 .k(k)
30098 .cn_stride(11)
30099 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30100 }
30101 }
30102 }
30103
30104 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8_strided_a) {
30105 TEST_REQUIRES_X86_FMA3;
30106 for (uint32_t n = 9; n < 16; n++) {
30107 for (size_t k = 1; k <= 5; k += 2) {
30108 GemmMicrokernelTester()
30109 .mr(5)
30110 .nr(8)
30111 .kr(1)
30112 .sr(1)
30113 .m(5)
30114 .n(n)
30115 .k(k)
30116 .a_stride(7)
30117 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30118 }
30119 }
30120 }
30121
30122 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
30123 TEST_REQUIRES_X86_FMA3;
30124 for (uint32_t n = 9; n < 16; n++) {
30125 for (size_t k = 1; k <= 5; k += 2) {
30126 for (uint32_t m = 1; m <= 5; m++) {
30127 GemmMicrokernelTester()
30128 .mr(5)
30129 .nr(8)
30130 .kr(1)
30131 .sr(1)
30132 .m(m)
30133 .n(n)
30134 .k(k)
30135 .iterations(1)
30136 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30137 }
30138 }
30139 }
30140 }
30141
30142 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8) {
30143 TEST_REQUIRES_X86_FMA3;
30144 for (uint32_t n = 16; n <= 24; n += 8) {
30145 for (size_t k = 1; k <= 5; k += 2) {
30146 GemmMicrokernelTester()
30147 .mr(5)
30148 .nr(8)
30149 .kr(1)
30150 .sr(1)
30151 .m(5)
30152 .n(8)
30153 .k(k)
30154 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30155 }
30156 }
30157 }
30158
30159 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
30160 TEST_REQUIRES_X86_FMA3;
30161 for (uint32_t n = 16; n <= 24; n += 8) {
30162 for (size_t k = 1; k <= 5; k += 2) {
30163 GemmMicrokernelTester()
30164 .mr(5)
30165 .nr(8)
30166 .kr(1)
30167 .sr(1)
30168 .m(5)
30169 .n(n)
30170 .k(k)
30171 .cn_stride(11)
30172 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30173 }
30174 }
30175 }
30176
30177 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8_strided_a) {
30178 TEST_REQUIRES_X86_FMA3;
30179 for (uint32_t n = 16; n <= 24; n += 8) {
30180 for (size_t k = 1; k <= 5; k += 2) {
30181 GemmMicrokernelTester()
30182 .mr(5)
30183 .nr(8)
30184 .kr(1)
30185 .sr(1)
30186 .m(5)
30187 .n(n)
30188 .k(k)
30189 .a_stride(7)
30190 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30191 }
30192 }
30193 }
30194
30195 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8_subtile) {
30196 TEST_REQUIRES_X86_FMA3;
30197 for (uint32_t n = 16; n <= 24; n += 8) {
30198 for (size_t k = 1; k <= 5; k += 2) {
30199 for (uint32_t m = 1; m <= 5; m++) {
30200 GemmMicrokernelTester()
30201 .mr(5)
30202 .nr(8)
30203 .kr(1)
30204 .sr(1)
30205 .m(m)
30206 .n(n)
30207 .k(k)
30208 .iterations(1)
30209 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30210 }
30211 }
30212 }
30213 }
30214
30215 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, strided_cm_subtile) {
30216 TEST_REQUIRES_X86_FMA3;
30217 for (size_t k = 1; k <= 5; k += 2) {
30218 for (uint32_t m = 1; m <= 5; m++) {
30219 for (uint32_t n = 1; n <= 8; n++) {
30220 GemmMicrokernelTester()
30221 .mr(5)
30222 .nr(8)
30223 .kr(1)
30224 .sr(1)
30225 .m(m)
30226 .n(n)
30227 .k(k)
30228 .cm_stride(11)
30229 .iterations(1)
30230 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30231 }
30232 }
30233 }
30234 }
30235
30236 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, qmin) {
30237 TEST_REQUIRES_X86_FMA3;
30238 GemmMicrokernelTester()
30239 .mr(5)
30240 .nr(8)
30241 .kr(1)
30242 .sr(1)
30243 .m(5)
30244 .n(8)
30245 .k(1)
30246 .qmin(128)
30247 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30248 }
30249
30250 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, qmax) {
30251 TEST_REQUIRES_X86_FMA3;
30252 GemmMicrokernelTester()
30253 .mr(5)
30254 .nr(8)
30255 .kr(1)
30256 .sr(1)
30257 .m(5)
30258 .n(8)
30259 .k(1)
30260 .qmax(128)
30261 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30262 }
30263
30264 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, strided_cm) {
30265 TEST_REQUIRES_X86_FMA3;
30266 GemmMicrokernelTester()
30267 .mr(5)
30268 .nr(8)
30269 .kr(1)
30270 .sr(1)
30271 .m(5)
30272 .n(8)
30273 .k(1)
30274 .cm_stride(11)
30275 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
30276 }
30277#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30278
30279
30280#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30281 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1) {
30282 TEST_REQUIRES_X86_FMA3;
30283 GemmMicrokernelTester()
30284 .mr(6)
30285 .nr(8)
30286 .kr(1)
30287 .sr(1)
30288 .m(6)
30289 .n(8)
30290 .k(1)
30291 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30292 }
30293
30294 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, strided_cn) {
30295 TEST_REQUIRES_X86_FMA3;
30296 GemmMicrokernelTester()
30297 .mr(6)
30298 .nr(8)
30299 .kr(1)
30300 .sr(1)
30301 .m(6)
30302 .n(8)
30303 .k(1)
30304 .cn_stride(11)
30305 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30306 }
30307
30308 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_strided_a) {
30309 TEST_REQUIRES_X86_FMA3;
30310 GemmMicrokernelTester()
30311 .mr(6)
30312 .nr(8)
30313 .kr(1)
30314 .sr(1)
30315 .m(6)
30316 .n(8)
30317 .k(1)
30318 .a_stride(3)
30319 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30320 }
30321
30322 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_subtile) {
30323 TEST_REQUIRES_X86_FMA3;
30324 for (uint32_t m = 1; m <= 6; m++) {
30325 for (uint32_t n = 1; n <= 8; n++) {
30326 GemmMicrokernelTester()
30327 .mr(6)
30328 .nr(8)
30329 .kr(1)
30330 .sr(1)
30331 .m(m)
30332 .n(n)
30333 .k(1)
30334 .iterations(1)
30335 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30336 }
30337 }
30338 }
30339
30340 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
30341 TEST_REQUIRES_X86_FMA3;
30342 for (uint32_t m = 1; m <= 6; m++) {
30343 GemmMicrokernelTester()
30344 .mr(6)
30345 .nr(8)
30346 .kr(1)
30347 .sr(1)
30348 .m(m)
30349 .n(8)
30350 .k(1)
30351 .iterations(1)
30352 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30353 }
30354 }
30355
30356 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
30357 TEST_REQUIRES_X86_FMA3;
30358 for (uint32_t n = 1; n <= 8; n++) {
30359 GemmMicrokernelTester()
30360 .mr(6)
30361 .nr(8)
30362 .kr(1)
30363 .sr(1)
30364 .m(6)
30365 .n(n)
30366 .k(1)
30367 .iterations(1)
30368 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30369 }
30370 }
30371
30372 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_gt_1) {
30373 TEST_REQUIRES_X86_FMA3;
30374 for (size_t k = 2; k < 10; k++) {
30375 GemmMicrokernelTester()
30376 .mr(6)
30377 .nr(8)
30378 .kr(1)
30379 .sr(1)
30380 .m(6)
30381 .n(8)
30382 .k(k)
30383 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30384 }
30385 }
30386
30387 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_gt_1_strided_a) {
30388 TEST_REQUIRES_X86_FMA3;
30389 for (size_t k = 2; k < 10; k++) {
30390 GemmMicrokernelTester()
30391 .mr(6)
30392 .nr(8)
30393 .kr(1)
30394 .sr(1)
30395 .m(6)
30396 .n(8)
30397 .k(k)
30398 .a_stride(11)
30399 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30400 }
30401 }
30402
30403 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_gt_1_subtile) {
30404 TEST_REQUIRES_X86_FMA3;
30405 for (size_t k = 2; k < 10; k++) {
30406 for (uint32_t m = 1; m <= 6; m++) {
30407 for (uint32_t n = 1; n <= 8; n++) {
30408 GemmMicrokernelTester()
30409 .mr(6)
30410 .nr(8)
30411 .kr(1)
30412 .sr(1)
30413 .m(m)
30414 .n(n)
30415 .k(k)
30416 .iterations(1)
30417 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30418 }
30419 }
30420 }
30421 }
30422
30423 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8) {
30424 TEST_REQUIRES_X86_FMA3;
30425 for (uint32_t n = 9; n < 16; n++) {
30426 for (size_t k = 1; k <= 5; k += 2) {
30427 GemmMicrokernelTester()
30428 .mr(6)
30429 .nr(8)
30430 .kr(1)
30431 .sr(1)
30432 .m(6)
30433 .n(8)
30434 .k(k)
30435 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30436 }
30437 }
30438 }
30439
30440 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
30441 TEST_REQUIRES_X86_FMA3;
30442 for (uint32_t n = 9; n < 16; n++) {
30443 for (size_t k = 1; k <= 5; k += 2) {
30444 GemmMicrokernelTester()
30445 .mr(6)
30446 .nr(8)
30447 .kr(1)
30448 .sr(1)
30449 .m(6)
30450 .n(8)
30451 .k(k)
30452 .cn_stride(11)
30453 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30454 }
30455 }
30456 }
30457
30458 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8_strided_a) {
30459 TEST_REQUIRES_X86_FMA3;
30460 for (uint32_t n = 9; n < 16; n++) {
30461 for (size_t k = 1; k <= 5; k += 2) {
30462 GemmMicrokernelTester()
30463 .mr(6)
30464 .nr(8)
30465 .kr(1)
30466 .sr(1)
30467 .m(6)
30468 .n(n)
30469 .k(k)
30470 .a_stride(7)
30471 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30472 }
30473 }
30474 }
30475
30476 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8_subtile) {
30477 TEST_REQUIRES_X86_FMA3;
30478 for (uint32_t n = 9; n < 16; n++) {
30479 for (size_t k = 1; k <= 5; k += 2) {
30480 for (uint32_t m = 1; m <= 6; m++) {
30481 GemmMicrokernelTester()
30482 .mr(6)
30483 .nr(8)
30484 .kr(1)
30485 .sr(1)
30486 .m(m)
30487 .n(n)
30488 .k(k)
30489 .iterations(1)
30490 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30491 }
30492 }
30493 }
30494 }
30495
30496 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8) {
30497 TEST_REQUIRES_X86_FMA3;
30498 for (uint32_t n = 16; n <= 24; n += 8) {
30499 for (size_t k = 1; k <= 5; k += 2) {
30500 GemmMicrokernelTester()
30501 .mr(6)
30502 .nr(8)
30503 .kr(1)
30504 .sr(1)
30505 .m(6)
30506 .n(8)
30507 .k(k)
30508 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30509 }
30510 }
30511 }
30512
30513 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8_strided_cn) {
30514 TEST_REQUIRES_X86_FMA3;
30515 for (uint32_t n = 16; n <= 24; n += 8) {
30516 for (size_t k = 1; k <= 5; k += 2) {
30517 GemmMicrokernelTester()
30518 .mr(6)
30519 .nr(8)
30520 .kr(1)
30521 .sr(1)
30522 .m(6)
30523 .n(n)
30524 .k(k)
30525 .cn_stride(11)
30526 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30527 }
30528 }
30529 }
30530
30531 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8_strided_a) {
30532 TEST_REQUIRES_X86_FMA3;
30533 for (uint32_t n = 16; n <= 24; n += 8) {
30534 for (size_t k = 1; k <= 5; k += 2) {
30535 GemmMicrokernelTester()
30536 .mr(6)
30537 .nr(8)
30538 .kr(1)
30539 .sr(1)
30540 .m(6)
30541 .n(n)
30542 .k(k)
30543 .a_stride(7)
30544 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30545 }
30546 }
30547 }
30548
30549 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8_subtile) {
30550 TEST_REQUIRES_X86_FMA3;
30551 for (uint32_t n = 16; n <= 24; n += 8) {
30552 for (size_t k = 1; k <= 5; k += 2) {
30553 for (uint32_t m = 1; m <= 6; m++) {
30554 GemmMicrokernelTester()
30555 .mr(6)
30556 .nr(8)
30557 .kr(1)
30558 .sr(1)
30559 .m(m)
30560 .n(n)
30561 .k(k)
30562 .iterations(1)
30563 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30564 }
30565 }
30566 }
30567 }
30568
30569 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, strided_cm_subtile) {
30570 TEST_REQUIRES_X86_FMA3;
30571 for (size_t k = 1; k <= 5; k += 2) {
30572 for (uint32_t m = 1; m <= 6; m++) {
30573 for (uint32_t n = 1; n <= 8; n++) {
30574 GemmMicrokernelTester()
30575 .mr(6)
30576 .nr(8)
30577 .kr(1)
30578 .sr(1)
30579 .m(m)
30580 .n(n)
30581 .k(k)
30582 .cm_stride(11)
30583 .iterations(1)
30584 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30585 }
30586 }
30587 }
30588 }
30589
30590 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, qmin) {
30591 TEST_REQUIRES_X86_FMA3;
30592 GemmMicrokernelTester()
30593 .mr(6)
30594 .nr(8)
30595 .kr(1)
30596 .sr(1)
30597 .m(6)
30598 .n(8)
30599 .k(1)
30600 .qmin(128)
30601 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30602 }
30603
30604 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, qmax) {
30605 TEST_REQUIRES_X86_FMA3;
30606 GemmMicrokernelTester()
30607 .mr(6)
30608 .nr(8)
30609 .kr(1)
30610 .sr(1)
30611 .m(6)
30612 .n(8)
30613 .k(1)
30614 .qmax(128)
30615 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30616 }
30617
30618 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, strided_cm) {
30619 TEST_REQUIRES_X86_FMA3;
30620 GemmMicrokernelTester()
30621 .mr(6)
30622 .nr(8)
30623 .kr(1)
30624 .sr(1)
30625 .m(6)
30626 .n(8)
30627 .k(1)
30628 .cm_stride(11)
30629 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
30630 }
30631#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30632
30633
30634#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30635 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1) {
30636 TEST_REQUIRES_X86_FMA3;
30637 GemmMicrokernelTester()
30638 .mr(7)
30639 .nr(8)
30640 .kr(1)
30641 .sr(1)
30642 .m(7)
30643 .n(8)
30644 .k(1)
30645 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30646 }
30647
30648 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, strided_cn) {
30649 TEST_REQUIRES_X86_FMA3;
30650 GemmMicrokernelTester()
30651 .mr(7)
30652 .nr(8)
30653 .kr(1)
30654 .sr(1)
30655 .m(7)
30656 .n(8)
30657 .k(1)
30658 .cn_stride(11)
30659 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30660 }
30661
30662 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_strided_a) {
30663 TEST_REQUIRES_X86_FMA3;
30664 GemmMicrokernelTester()
30665 .mr(7)
30666 .nr(8)
30667 .kr(1)
30668 .sr(1)
30669 .m(7)
30670 .n(8)
30671 .k(1)
30672 .a_stride(3)
30673 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30674 }
30675
30676 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_subtile) {
30677 TEST_REQUIRES_X86_FMA3;
30678 for (uint32_t m = 1; m <= 7; m++) {
30679 for (uint32_t n = 1; n <= 8; n++) {
30680 GemmMicrokernelTester()
30681 .mr(7)
30682 .nr(8)
30683 .kr(1)
30684 .sr(1)
30685 .m(m)
30686 .n(n)
30687 .k(1)
30688 .iterations(1)
30689 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30690 }
30691 }
30692 }
30693
30694 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
30695 TEST_REQUIRES_X86_FMA3;
30696 for (uint32_t m = 1; m <= 7; m++) {
30697 GemmMicrokernelTester()
30698 .mr(7)
30699 .nr(8)
30700 .kr(1)
30701 .sr(1)
30702 .m(m)
30703 .n(8)
30704 .k(1)
30705 .iterations(1)
30706 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30707 }
30708 }
30709
30710 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
30711 TEST_REQUIRES_X86_FMA3;
30712 for (uint32_t n = 1; n <= 8; n++) {
30713 GemmMicrokernelTester()
30714 .mr(7)
30715 .nr(8)
30716 .kr(1)
30717 .sr(1)
30718 .m(7)
30719 .n(n)
30720 .k(1)
30721 .iterations(1)
30722 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30723 }
30724 }
30725
30726 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_gt_1) {
30727 TEST_REQUIRES_X86_FMA3;
30728 for (size_t k = 2; k < 10; k++) {
30729 GemmMicrokernelTester()
30730 .mr(7)
30731 .nr(8)
30732 .kr(1)
30733 .sr(1)
30734 .m(7)
30735 .n(8)
30736 .k(k)
30737 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30738 }
30739 }
30740
30741 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_gt_1_strided_a) {
30742 TEST_REQUIRES_X86_FMA3;
30743 for (size_t k = 2; k < 10; k++) {
30744 GemmMicrokernelTester()
30745 .mr(7)
30746 .nr(8)
30747 .kr(1)
30748 .sr(1)
30749 .m(7)
30750 .n(8)
30751 .k(k)
30752 .a_stride(11)
30753 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30754 }
30755 }
30756
30757 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_gt_1_subtile) {
30758 TEST_REQUIRES_X86_FMA3;
30759 for (size_t k = 2; k < 10; k++) {
30760 for (uint32_t m = 1; m <= 7; m++) {
30761 for (uint32_t n = 1; n <= 8; n++) {
30762 GemmMicrokernelTester()
30763 .mr(7)
30764 .nr(8)
30765 .kr(1)
30766 .sr(1)
30767 .m(m)
30768 .n(n)
30769 .k(k)
30770 .iterations(1)
30771 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30772 }
30773 }
30774 }
30775 }
30776
30777 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8) {
30778 TEST_REQUIRES_X86_FMA3;
30779 for (uint32_t n = 9; n < 16; n++) {
30780 for (size_t k = 1; k <= 5; k += 2) {
30781 GemmMicrokernelTester()
30782 .mr(7)
30783 .nr(8)
30784 .kr(1)
30785 .sr(1)
30786 .m(7)
30787 .n(8)
30788 .k(k)
30789 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30790 }
30791 }
30792 }
30793
30794 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
30795 TEST_REQUIRES_X86_FMA3;
30796 for (uint32_t n = 9; n < 16; n++) {
30797 for (size_t k = 1; k <= 5; k += 2) {
30798 GemmMicrokernelTester()
30799 .mr(7)
30800 .nr(8)
30801 .kr(1)
30802 .sr(1)
30803 .m(7)
30804 .n(8)
30805 .k(k)
30806 .cn_stride(11)
30807 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30808 }
30809 }
30810 }
30811
30812 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8_strided_a) {
30813 TEST_REQUIRES_X86_FMA3;
30814 for (uint32_t n = 9; n < 16; n++) {
30815 for (size_t k = 1; k <= 5; k += 2) {
30816 GemmMicrokernelTester()
30817 .mr(7)
30818 .nr(8)
30819 .kr(1)
30820 .sr(1)
30821 .m(7)
30822 .n(n)
30823 .k(k)
30824 .a_stride(7)
30825 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30826 }
30827 }
30828 }
30829
30830 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8_subtile) {
30831 TEST_REQUIRES_X86_FMA3;
30832 for (uint32_t n = 9; n < 16; n++) {
30833 for (size_t k = 1; k <= 5; k += 2) {
30834 for (uint32_t m = 1; m <= 7; m++) {
30835 GemmMicrokernelTester()
30836 .mr(7)
30837 .nr(8)
30838 .kr(1)
30839 .sr(1)
30840 .m(m)
30841 .n(n)
30842 .k(k)
30843 .iterations(1)
30844 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30845 }
30846 }
30847 }
30848 }
30849
30850 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8) {
30851 TEST_REQUIRES_X86_FMA3;
30852 for (uint32_t n = 16; n <= 24; n += 8) {
30853 for (size_t k = 1; k <= 5; k += 2) {
30854 GemmMicrokernelTester()
30855 .mr(7)
30856 .nr(8)
30857 .kr(1)
30858 .sr(1)
30859 .m(7)
30860 .n(8)
30861 .k(k)
30862 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30863 }
30864 }
30865 }
30866
30867 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8_strided_cn) {
30868 TEST_REQUIRES_X86_FMA3;
30869 for (uint32_t n = 16; n <= 24; n += 8) {
30870 for (size_t k = 1; k <= 5; k += 2) {
30871 GemmMicrokernelTester()
30872 .mr(7)
30873 .nr(8)
30874 .kr(1)
30875 .sr(1)
30876 .m(7)
30877 .n(n)
30878 .k(k)
30879 .cn_stride(11)
30880 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30881 }
30882 }
30883 }
30884
30885 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8_strided_a) {
30886 TEST_REQUIRES_X86_FMA3;
30887 for (uint32_t n = 16; n <= 24; n += 8) {
30888 for (size_t k = 1; k <= 5; k += 2) {
30889 GemmMicrokernelTester()
30890 .mr(7)
30891 .nr(8)
30892 .kr(1)
30893 .sr(1)
30894 .m(7)
30895 .n(n)
30896 .k(k)
30897 .a_stride(7)
30898 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30899 }
30900 }
30901 }
30902
30903 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8_subtile) {
30904 TEST_REQUIRES_X86_FMA3;
30905 for (uint32_t n = 16; n <= 24; n += 8) {
30906 for (size_t k = 1; k <= 5; k += 2) {
30907 for (uint32_t m = 1; m <= 7; m++) {
30908 GemmMicrokernelTester()
30909 .mr(7)
30910 .nr(8)
30911 .kr(1)
30912 .sr(1)
30913 .m(m)
30914 .n(n)
30915 .k(k)
30916 .iterations(1)
30917 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30918 }
30919 }
30920 }
30921 }
30922
30923 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, strided_cm_subtile) {
30924 TEST_REQUIRES_X86_FMA3;
30925 for (size_t k = 1; k <= 5; k += 2) {
30926 for (uint32_t m = 1; m <= 7; m++) {
30927 for (uint32_t n = 1; n <= 8; n++) {
30928 GemmMicrokernelTester()
30929 .mr(7)
30930 .nr(8)
30931 .kr(1)
30932 .sr(1)
30933 .m(m)
30934 .n(n)
30935 .k(k)
30936 .cm_stride(11)
30937 .iterations(1)
30938 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30939 }
30940 }
30941 }
30942 }
30943
30944 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, qmin) {
30945 TEST_REQUIRES_X86_FMA3;
30946 GemmMicrokernelTester()
30947 .mr(7)
30948 .nr(8)
30949 .kr(1)
30950 .sr(1)
30951 .m(7)
30952 .n(8)
30953 .k(1)
30954 .qmin(128)
30955 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30956 }
30957
30958 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, qmax) {
30959 TEST_REQUIRES_X86_FMA3;
30960 GemmMicrokernelTester()
30961 .mr(7)
30962 .nr(8)
30963 .kr(1)
30964 .sr(1)
30965 .m(7)
30966 .n(8)
30967 .k(1)
30968 .qmax(128)
30969 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30970 }
30971
30972 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, strided_cm) {
30973 TEST_REQUIRES_X86_FMA3;
30974 GemmMicrokernelTester()
30975 .mr(7)
30976 .nr(8)
30977 .kr(1)
30978 .sr(1)
30979 .m(7)
30980 .n(8)
30981 .k(1)
30982 .cm_stride(11)
30983 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
30984 }
30985#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30986
30987
30988#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30989 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1) {
30990 TEST_REQUIRES_X86_FMA3;
30991 GemmMicrokernelTester()
30992 .mr(8)
30993 .nr(8)
30994 .kr(1)
30995 .sr(1)
30996 .m(8)
30997 .n(8)
30998 .k(1)
30999 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31000 }
31001
31002 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, strided_cn) {
31003 TEST_REQUIRES_X86_FMA3;
31004 GemmMicrokernelTester()
31005 .mr(8)
31006 .nr(8)
31007 .kr(1)
31008 .sr(1)
31009 .m(8)
31010 .n(8)
31011 .k(1)
31012 .cn_stride(11)
31013 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31014 }
31015
31016 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_strided_a) {
31017 TEST_REQUIRES_X86_FMA3;
31018 GemmMicrokernelTester()
31019 .mr(8)
31020 .nr(8)
31021 .kr(1)
31022 .sr(1)
31023 .m(8)
31024 .n(8)
31025 .k(1)
31026 .a_stride(3)
31027 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31028 }
31029
31030 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_subtile) {
31031 TEST_REQUIRES_X86_FMA3;
31032 for (uint32_t m = 1; m <= 8; m++) {
31033 for (uint32_t n = 1; n <= 8; n++) {
31034 GemmMicrokernelTester()
31035 .mr(8)
31036 .nr(8)
31037 .kr(1)
31038 .sr(1)
31039 .m(m)
31040 .n(n)
31041 .k(1)
31042 .iterations(1)
31043 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31044 }
31045 }
31046 }
31047
31048 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31049 TEST_REQUIRES_X86_FMA3;
31050 for (uint32_t m = 1; m <= 8; m++) {
31051 GemmMicrokernelTester()
31052 .mr(8)
31053 .nr(8)
31054 .kr(1)
31055 .sr(1)
31056 .m(m)
31057 .n(8)
31058 .k(1)
31059 .iterations(1)
31060 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31061 }
31062 }
31063
31064 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31065 TEST_REQUIRES_X86_FMA3;
31066 for (uint32_t n = 1; n <= 8; n++) {
31067 GemmMicrokernelTester()
31068 .mr(8)
31069 .nr(8)
31070 .kr(1)
31071 .sr(1)
31072 .m(8)
31073 .n(n)
31074 .k(1)
31075 .iterations(1)
31076 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31077 }
31078 }
31079
31080 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_gt_1) {
31081 TEST_REQUIRES_X86_FMA3;
31082 for (size_t k = 2; k < 10; k++) {
31083 GemmMicrokernelTester()
31084 .mr(8)
31085 .nr(8)
31086 .kr(1)
31087 .sr(1)
31088 .m(8)
31089 .n(8)
31090 .k(k)
31091 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31092 }
31093 }
31094
31095 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_gt_1_strided_a) {
31096 TEST_REQUIRES_X86_FMA3;
31097 for (size_t k = 2; k < 10; k++) {
31098 GemmMicrokernelTester()
31099 .mr(8)
31100 .nr(8)
31101 .kr(1)
31102 .sr(1)
31103 .m(8)
31104 .n(8)
31105 .k(k)
31106 .a_stride(11)
31107 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31108 }
31109 }
31110
31111 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_gt_1_subtile) {
31112 TEST_REQUIRES_X86_FMA3;
31113 for (size_t k = 2; k < 10; k++) {
31114 for (uint32_t m = 1; m <= 8; m++) {
31115 for (uint32_t n = 1; n <= 8; n++) {
31116 GemmMicrokernelTester()
31117 .mr(8)
31118 .nr(8)
31119 .kr(1)
31120 .sr(1)
31121 .m(m)
31122 .n(n)
31123 .k(k)
31124 .iterations(1)
31125 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31126 }
31127 }
31128 }
31129 }
31130
31131 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8) {
31132 TEST_REQUIRES_X86_FMA3;
31133 for (uint32_t n = 9; n < 16; n++) {
31134 for (size_t k = 1; k <= 5; k += 2) {
31135 GemmMicrokernelTester()
31136 .mr(8)
31137 .nr(8)
31138 .kr(1)
31139 .sr(1)
31140 .m(8)
31141 .n(8)
31142 .k(k)
31143 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31144 }
31145 }
31146 }
31147
31148 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31149 TEST_REQUIRES_X86_FMA3;
31150 for (uint32_t n = 9; n < 16; n++) {
31151 for (size_t k = 1; k <= 5; k += 2) {
31152 GemmMicrokernelTester()
31153 .mr(8)
31154 .nr(8)
31155 .kr(1)
31156 .sr(1)
31157 .m(8)
31158 .n(8)
31159 .k(k)
31160 .cn_stride(11)
31161 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31162 }
31163 }
31164 }
31165
31166 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8_strided_a) {
31167 TEST_REQUIRES_X86_FMA3;
31168 for (uint32_t n = 9; n < 16; n++) {
31169 for (size_t k = 1; k <= 5; k += 2) {
31170 GemmMicrokernelTester()
31171 .mr(8)
31172 .nr(8)
31173 .kr(1)
31174 .sr(1)
31175 .m(8)
31176 .n(n)
31177 .k(k)
31178 .a_stride(7)
31179 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31180 }
31181 }
31182 }
31183
31184 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8_subtile) {
31185 TEST_REQUIRES_X86_FMA3;
31186 for (uint32_t n = 9; n < 16; n++) {
31187 for (size_t k = 1; k <= 5; k += 2) {
31188 for (uint32_t m = 1; m <= 8; m++) {
31189 GemmMicrokernelTester()
31190 .mr(8)
31191 .nr(8)
31192 .kr(1)
31193 .sr(1)
31194 .m(m)
31195 .n(n)
31196 .k(k)
31197 .iterations(1)
31198 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31199 }
31200 }
31201 }
31202 }
31203
31204 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8) {
31205 TEST_REQUIRES_X86_FMA3;
31206 for (uint32_t n = 16; n <= 24; n += 8) {
31207 for (size_t k = 1; k <= 5; k += 2) {
31208 GemmMicrokernelTester()
31209 .mr(8)
31210 .nr(8)
31211 .kr(1)
31212 .sr(1)
31213 .m(8)
31214 .n(8)
31215 .k(k)
31216 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31217 }
31218 }
31219 }
31220
31221 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8_strided_cn) {
31222 TEST_REQUIRES_X86_FMA3;
31223 for (uint32_t n = 16; n <= 24; n += 8) {
31224 for (size_t k = 1; k <= 5; k += 2) {
31225 GemmMicrokernelTester()
31226 .mr(8)
31227 .nr(8)
31228 .kr(1)
31229 .sr(1)
31230 .m(8)
31231 .n(n)
31232 .k(k)
31233 .cn_stride(11)
31234 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31235 }
31236 }
31237 }
31238
31239 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8_strided_a) {
31240 TEST_REQUIRES_X86_FMA3;
31241 for (uint32_t n = 16; n <= 24; n += 8) {
31242 for (size_t k = 1; k <= 5; k += 2) {
31243 GemmMicrokernelTester()
31244 .mr(8)
31245 .nr(8)
31246 .kr(1)
31247 .sr(1)
31248 .m(8)
31249 .n(n)
31250 .k(k)
31251 .a_stride(7)
31252 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31253 }
31254 }
31255 }
31256
31257 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8_subtile) {
31258 TEST_REQUIRES_X86_FMA3;
31259 for (uint32_t n = 16; n <= 24; n += 8) {
31260 for (size_t k = 1; k <= 5; k += 2) {
31261 for (uint32_t m = 1; m <= 8; m++) {
31262 GemmMicrokernelTester()
31263 .mr(8)
31264 .nr(8)
31265 .kr(1)
31266 .sr(1)
31267 .m(m)
31268 .n(n)
31269 .k(k)
31270 .iterations(1)
31271 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31272 }
31273 }
31274 }
31275 }
31276
31277 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, strided_cm_subtile) {
31278 TEST_REQUIRES_X86_FMA3;
31279 for (size_t k = 1; k <= 5; k += 2) {
31280 for (uint32_t m = 1; m <= 8; m++) {
31281 for (uint32_t n = 1; n <= 8; n++) {
31282 GemmMicrokernelTester()
31283 .mr(8)
31284 .nr(8)
31285 .kr(1)
31286 .sr(1)
31287 .m(m)
31288 .n(n)
31289 .k(k)
31290 .cm_stride(11)
31291 .iterations(1)
31292 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31293 }
31294 }
31295 }
31296 }
31297
31298 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, qmin) {
31299 TEST_REQUIRES_X86_FMA3;
31300 GemmMicrokernelTester()
31301 .mr(8)
31302 .nr(8)
31303 .kr(1)
31304 .sr(1)
31305 .m(8)
31306 .n(8)
31307 .k(1)
31308 .qmin(128)
31309 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31310 }
31311
31312 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, qmax) {
31313 TEST_REQUIRES_X86_FMA3;
31314 GemmMicrokernelTester()
31315 .mr(8)
31316 .nr(8)
31317 .kr(1)
31318 .sr(1)
31319 .m(8)
31320 .n(8)
31321 .k(1)
31322 .qmax(128)
31323 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31324 }
31325
31326 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, strided_cm) {
31327 TEST_REQUIRES_X86_FMA3;
31328 GemmMicrokernelTester()
31329 .mr(8)
31330 .nr(8)
31331 .kr(1)
31332 .sr(1)
31333 .m(8)
31334 .n(8)
31335 .k(1)
31336 .cm_stride(11)
31337 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
31338 }
31339#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31340
31341
31342#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31343 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1) {
31344 TEST_REQUIRES_X86_FMA3;
31345 GemmMicrokernelTester()
31346 .mr(1)
31347 .nr(16)
31348 .kr(1)
31349 .sr(1)
31350 .m(1)
31351 .n(16)
31352 .k(1)
31353 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31354 }
31355
31356 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, strided_cn) {
31357 TEST_REQUIRES_X86_FMA3;
31358 GemmMicrokernelTester()
31359 .mr(1)
31360 .nr(16)
31361 .kr(1)
31362 .sr(1)
31363 .m(1)
31364 .n(16)
31365 .k(1)
31366 .cn_stride(19)
31367 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31368 }
31369
31370 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_strided_a) {
31371 TEST_REQUIRES_X86_FMA3;
31372 GemmMicrokernelTester()
31373 .mr(1)
31374 .nr(16)
31375 .kr(1)
31376 .sr(1)
31377 .m(1)
31378 .n(16)
31379 .k(1)
31380 .a_stride(3)
31381 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31382 }
31383
31384 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
31385 TEST_REQUIRES_X86_FMA3;
31386 for (uint32_t m = 1; m <= 1; m++) {
31387 for (uint32_t n = 1; n <= 16; n++) {
31388 GemmMicrokernelTester()
31389 .mr(1)
31390 .nr(16)
31391 .kr(1)
31392 .sr(1)
31393 .m(m)
31394 .n(n)
31395 .k(1)
31396 .iterations(1)
31397 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31398 }
31399 }
31400 }
31401
31402 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
31403 TEST_REQUIRES_X86_FMA3;
31404 for (uint32_t m = 1; m <= 1; m++) {
31405 GemmMicrokernelTester()
31406 .mr(1)
31407 .nr(16)
31408 .kr(1)
31409 .sr(1)
31410 .m(m)
31411 .n(16)
31412 .k(1)
31413 .iterations(1)
31414 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31415 }
31416 }
31417
31418 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
31419 TEST_REQUIRES_X86_FMA3;
31420 for (uint32_t n = 1; n <= 16; n++) {
31421 GemmMicrokernelTester()
31422 .mr(1)
31423 .nr(16)
31424 .kr(1)
31425 .sr(1)
31426 .m(1)
31427 .n(n)
31428 .k(1)
31429 .iterations(1)
31430 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31431 }
31432 }
31433
31434 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_gt_1) {
31435 TEST_REQUIRES_X86_FMA3;
31436 for (size_t k = 2; k < 10; k++) {
31437 GemmMicrokernelTester()
31438 .mr(1)
31439 .nr(16)
31440 .kr(1)
31441 .sr(1)
31442 .m(1)
31443 .n(16)
31444 .k(k)
31445 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31446 }
31447 }
31448
31449 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_gt_1_strided_a) {
31450 TEST_REQUIRES_X86_FMA3;
31451 for (size_t k = 2; k < 10; k++) {
31452 GemmMicrokernelTester()
31453 .mr(1)
31454 .nr(16)
31455 .kr(1)
31456 .sr(1)
31457 .m(1)
31458 .n(16)
31459 .k(k)
31460 .a_stride(11)
31461 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31462 }
31463 }
31464
31465 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
31466 TEST_REQUIRES_X86_FMA3;
31467 for (size_t k = 2; k < 10; k++) {
31468 for (uint32_t m = 1; m <= 1; m++) {
31469 for (uint32_t n = 1; n <= 16; n++) {
31470 GemmMicrokernelTester()
31471 .mr(1)
31472 .nr(16)
31473 .kr(1)
31474 .sr(1)
31475 .m(m)
31476 .n(n)
31477 .k(k)
31478 .iterations(1)
31479 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31480 }
31481 }
31482 }
31483 }
31484
31485 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16) {
31486 TEST_REQUIRES_X86_FMA3;
31487 for (uint32_t n = 17; n < 32; n++) {
31488 for (size_t k = 1; k <= 5; k += 2) {
31489 GemmMicrokernelTester()
31490 .mr(1)
31491 .nr(16)
31492 .kr(1)
31493 .sr(1)
31494 .m(1)
31495 .n(16)
31496 .k(k)
31497 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31498 }
31499 }
31500 }
31501
31502 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
31503 TEST_REQUIRES_X86_FMA3;
31504 for (uint32_t n = 17; n < 32; n++) {
31505 for (size_t k = 1; k <= 5; k += 2) {
31506 GemmMicrokernelTester()
31507 .mr(1)
31508 .nr(16)
31509 .kr(1)
31510 .sr(1)
31511 .m(1)
31512 .n(16)
31513 .k(k)
31514 .cn_stride(19)
31515 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31516 }
31517 }
31518 }
31519
31520 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16_strided_a) {
31521 TEST_REQUIRES_X86_FMA3;
31522 for (uint32_t n = 17; n < 32; n++) {
31523 for (size_t k = 1; k <= 5; k += 2) {
31524 GemmMicrokernelTester()
31525 .mr(1)
31526 .nr(16)
31527 .kr(1)
31528 .sr(1)
31529 .m(1)
31530 .n(n)
31531 .k(k)
31532 .a_stride(7)
31533 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31534 }
31535 }
31536 }
31537
31538 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
31539 TEST_REQUIRES_X86_FMA3;
31540 for (uint32_t n = 17; n < 32; n++) {
31541 for (size_t k = 1; k <= 5; k += 2) {
31542 for (uint32_t m = 1; m <= 1; m++) {
31543 GemmMicrokernelTester()
31544 .mr(1)
31545 .nr(16)
31546 .kr(1)
31547 .sr(1)
31548 .m(m)
31549 .n(n)
31550 .k(k)
31551 .iterations(1)
31552 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31553 }
31554 }
31555 }
31556 }
31557
31558 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16) {
31559 TEST_REQUIRES_X86_FMA3;
31560 for (uint32_t n = 32; n <= 48; n += 16) {
31561 for (size_t k = 1; k <= 5; k += 2) {
31562 GemmMicrokernelTester()
31563 .mr(1)
31564 .nr(16)
31565 .kr(1)
31566 .sr(1)
31567 .m(1)
31568 .n(16)
31569 .k(k)
31570 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31571 }
31572 }
31573 }
31574
31575 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
31576 TEST_REQUIRES_X86_FMA3;
31577 for (uint32_t n = 32; n <= 48; n += 16) {
31578 for (size_t k = 1; k <= 5; k += 2) {
31579 GemmMicrokernelTester()
31580 .mr(1)
31581 .nr(16)
31582 .kr(1)
31583 .sr(1)
31584 .m(1)
31585 .n(n)
31586 .k(k)
31587 .cn_stride(19)
31588 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31589 }
31590 }
31591 }
31592
31593 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16_strided_a) {
31594 TEST_REQUIRES_X86_FMA3;
31595 for (uint32_t n = 32; n <= 48; n += 16) {
31596 for (size_t k = 1; k <= 5; k += 2) {
31597 GemmMicrokernelTester()
31598 .mr(1)
31599 .nr(16)
31600 .kr(1)
31601 .sr(1)
31602 .m(1)
31603 .n(n)
31604 .k(k)
31605 .a_stride(7)
31606 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31607 }
31608 }
31609 }
31610
31611 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16_subtile) {
31612 TEST_REQUIRES_X86_FMA3;
31613 for (uint32_t n = 32; n <= 48; n += 16) {
31614 for (size_t k = 1; k <= 5; k += 2) {
31615 for (uint32_t m = 1; m <= 1; m++) {
31616 GemmMicrokernelTester()
31617 .mr(1)
31618 .nr(16)
31619 .kr(1)
31620 .sr(1)
31621 .m(m)
31622 .n(n)
31623 .k(k)
31624 .iterations(1)
31625 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31626 }
31627 }
31628 }
31629 }
31630
31631 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, strided_cm_subtile) {
31632 TEST_REQUIRES_X86_FMA3;
31633 for (size_t k = 1; k <= 5; k += 2) {
31634 for (uint32_t m = 1; m <= 1; m++) {
31635 for (uint32_t n = 1; n <= 16; n++) {
31636 GemmMicrokernelTester()
31637 .mr(1)
31638 .nr(16)
31639 .kr(1)
31640 .sr(1)
31641 .m(m)
31642 .n(n)
31643 .k(k)
31644 .cm_stride(19)
31645 .iterations(1)
31646 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31647 }
31648 }
31649 }
31650 }
31651
31652 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, qmin) {
31653 TEST_REQUIRES_X86_FMA3;
31654 GemmMicrokernelTester()
31655 .mr(1)
31656 .nr(16)
31657 .kr(1)
31658 .sr(1)
31659 .m(1)
31660 .n(16)
31661 .k(1)
31662 .qmin(128)
31663 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31664 }
31665
31666 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, qmax) {
31667 TEST_REQUIRES_X86_FMA3;
31668 GemmMicrokernelTester()
31669 .mr(1)
31670 .nr(16)
31671 .kr(1)
31672 .sr(1)
31673 .m(1)
31674 .n(16)
31675 .k(1)
31676 .qmax(128)
31677 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31678 }
31679
31680 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, strided_cm) {
31681 TEST_REQUIRES_X86_FMA3;
31682 GemmMicrokernelTester()
31683 .mr(1)
31684 .nr(16)
31685 .kr(1)
31686 .sr(1)
31687 .m(1)
31688 .n(16)
31689 .k(1)
31690 .cm_stride(19)
31691 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
31692 }
31693#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31694
31695
31696#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31697 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1) {
31698 TEST_REQUIRES_X86_FMA3;
31699 GemmMicrokernelTester()
31700 .mr(3)
31701 .nr(16)
31702 .kr(1)
31703 .sr(1)
31704 .m(3)
31705 .n(16)
31706 .k(1)
31707 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31708 }
31709
31710 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, strided_cn) {
31711 TEST_REQUIRES_X86_FMA3;
31712 GemmMicrokernelTester()
31713 .mr(3)
31714 .nr(16)
31715 .kr(1)
31716 .sr(1)
31717 .m(3)
31718 .n(16)
31719 .k(1)
31720 .cn_stride(19)
31721 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31722 }
31723
31724 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_strided_a) {
31725 TEST_REQUIRES_X86_FMA3;
31726 GemmMicrokernelTester()
31727 .mr(3)
31728 .nr(16)
31729 .kr(1)
31730 .sr(1)
31731 .m(3)
31732 .n(16)
31733 .k(1)
31734 .a_stride(3)
31735 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31736 }
31737
31738 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
31739 TEST_REQUIRES_X86_FMA3;
31740 for (uint32_t m = 1; m <= 3; m++) {
31741 for (uint32_t n = 1; n <= 16; n++) {
31742 GemmMicrokernelTester()
31743 .mr(3)
31744 .nr(16)
31745 .kr(1)
31746 .sr(1)
31747 .m(m)
31748 .n(n)
31749 .k(1)
31750 .iterations(1)
31751 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31752 }
31753 }
31754 }
31755
31756 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
31757 TEST_REQUIRES_X86_FMA3;
31758 for (uint32_t m = 1; m <= 3; m++) {
31759 GemmMicrokernelTester()
31760 .mr(3)
31761 .nr(16)
31762 .kr(1)
31763 .sr(1)
31764 .m(m)
31765 .n(16)
31766 .k(1)
31767 .iterations(1)
31768 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31769 }
31770 }
31771
31772 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
31773 TEST_REQUIRES_X86_FMA3;
31774 for (uint32_t n = 1; n <= 16; n++) {
31775 GemmMicrokernelTester()
31776 .mr(3)
31777 .nr(16)
31778 .kr(1)
31779 .sr(1)
31780 .m(3)
31781 .n(n)
31782 .k(1)
31783 .iterations(1)
31784 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31785 }
31786 }
31787
31788 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_gt_1) {
31789 TEST_REQUIRES_X86_FMA3;
31790 for (size_t k = 2; k < 10; k++) {
31791 GemmMicrokernelTester()
31792 .mr(3)
31793 .nr(16)
31794 .kr(1)
31795 .sr(1)
31796 .m(3)
31797 .n(16)
31798 .k(k)
31799 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31800 }
31801 }
31802
31803 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_gt_1_strided_a) {
31804 TEST_REQUIRES_X86_FMA3;
31805 for (size_t k = 2; k < 10; k++) {
31806 GemmMicrokernelTester()
31807 .mr(3)
31808 .nr(16)
31809 .kr(1)
31810 .sr(1)
31811 .m(3)
31812 .n(16)
31813 .k(k)
31814 .a_stride(11)
31815 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31816 }
31817 }
31818
31819 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
31820 TEST_REQUIRES_X86_FMA3;
31821 for (size_t k = 2; k < 10; k++) {
31822 for (uint32_t m = 1; m <= 3; m++) {
31823 for (uint32_t n = 1; n <= 16; n++) {
31824 GemmMicrokernelTester()
31825 .mr(3)
31826 .nr(16)
31827 .kr(1)
31828 .sr(1)
31829 .m(m)
31830 .n(n)
31831 .k(k)
31832 .iterations(1)
31833 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31834 }
31835 }
31836 }
31837 }
31838
31839 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16) {
31840 TEST_REQUIRES_X86_FMA3;
31841 for (uint32_t n = 17; n < 32; n++) {
31842 for (size_t k = 1; k <= 5; k += 2) {
31843 GemmMicrokernelTester()
31844 .mr(3)
31845 .nr(16)
31846 .kr(1)
31847 .sr(1)
31848 .m(3)
31849 .n(16)
31850 .k(k)
31851 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31852 }
31853 }
31854 }
31855
31856 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
31857 TEST_REQUIRES_X86_FMA3;
31858 for (uint32_t n = 17; n < 32; n++) {
31859 for (size_t k = 1; k <= 5; k += 2) {
31860 GemmMicrokernelTester()
31861 .mr(3)
31862 .nr(16)
31863 .kr(1)
31864 .sr(1)
31865 .m(3)
31866 .n(16)
31867 .k(k)
31868 .cn_stride(19)
31869 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31870 }
31871 }
31872 }
31873
31874 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16_strided_a) {
31875 TEST_REQUIRES_X86_FMA3;
31876 for (uint32_t n = 17; n < 32; n++) {
31877 for (size_t k = 1; k <= 5; k += 2) {
31878 GemmMicrokernelTester()
31879 .mr(3)
31880 .nr(16)
31881 .kr(1)
31882 .sr(1)
31883 .m(3)
31884 .n(n)
31885 .k(k)
31886 .a_stride(7)
31887 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31888 }
31889 }
31890 }
31891
31892 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
31893 TEST_REQUIRES_X86_FMA3;
31894 for (uint32_t n = 17; n < 32; n++) {
31895 for (size_t k = 1; k <= 5; k += 2) {
31896 for (uint32_t m = 1; m <= 3; m++) {
31897 GemmMicrokernelTester()
31898 .mr(3)
31899 .nr(16)
31900 .kr(1)
31901 .sr(1)
31902 .m(m)
31903 .n(n)
31904 .k(k)
31905 .iterations(1)
31906 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31907 }
31908 }
31909 }
31910 }
31911
31912 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16) {
31913 TEST_REQUIRES_X86_FMA3;
31914 for (uint32_t n = 32; n <= 48; n += 16) {
31915 for (size_t k = 1; k <= 5; k += 2) {
31916 GemmMicrokernelTester()
31917 .mr(3)
31918 .nr(16)
31919 .kr(1)
31920 .sr(1)
31921 .m(3)
31922 .n(16)
31923 .k(k)
31924 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31925 }
31926 }
31927 }
31928
31929 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
31930 TEST_REQUIRES_X86_FMA3;
31931 for (uint32_t n = 32; n <= 48; n += 16) {
31932 for (size_t k = 1; k <= 5; k += 2) {
31933 GemmMicrokernelTester()
31934 .mr(3)
31935 .nr(16)
31936 .kr(1)
31937 .sr(1)
31938 .m(3)
31939 .n(n)
31940 .k(k)
31941 .cn_stride(19)
31942 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31943 }
31944 }
31945 }
31946
31947 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16_strided_a) {
31948 TEST_REQUIRES_X86_FMA3;
31949 for (uint32_t n = 32; n <= 48; n += 16) {
31950 for (size_t k = 1; k <= 5; k += 2) {
31951 GemmMicrokernelTester()
31952 .mr(3)
31953 .nr(16)
31954 .kr(1)
31955 .sr(1)
31956 .m(3)
31957 .n(n)
31958 .k(k)
31959 .a_stride(7)
31960 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31961 }
31962 }
31963 }
31964
31965 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16_subtile) {
31966 TEST_REQUIRES_X86_FMA3;
31967 for (uint32_t n = 32; n <= 48; n += 16) {
31968 for (size_t k = 1; k <= 5; k += 2) {
31969 for (uint32_t m = 1; m <= 3; m++) {
31970 GemmMicrokernelTester()
31971 .mr(3)
31972 .nr(16)
31973 .kr(1)
31974 .sr(1)
31975 .m(m)
31976 .n(n)
31977 .k(k)
31978 .iterations(1)
31979 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
31980 }
31981 }
31982 }
31983 }
31984
31985 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, strided_cm_subtile) {
31986 TEST_REQUIRES_X86_FMA3;
31987 for (size_t k = 1; k <= 5; k += 2) {
31988 for (uint32_t m = 1; m <= 3; m++) {
31989 for (uint32_t n = 1; n <= 16; n++) {
31990 GemmMicrokernelTester()
31991 .mr(3)
31992 .nr(16)
31993 .kr(1)
31994 .sr(1)
31995 .m(m)
31996 .n(n)
31997 .k(k)
31998 .cm_stride(19)
31999 .iterations(1)
32000 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
32001 }
32002 }
32003 }
32004 }
32005
32006 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, qmin) {
32007 TEST_REQUIRES_X86_FMA3;
32008 GemmMicrokernelTester()
32009 .mr(3)
32010 .nr(16)
32011 .kr(1)
32012 .sr(1)
32013 .m(3)
32014 .n(16)
32015 .k(1)
32016 .qmin(128)
32017 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
32018 }
32019
32020 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, qmax) {
32021 TEST_REQUIRES_X86_FMA3;
32022 GemmMicrokernelTester()
32023 .mr(3)
32024 .nr(16)
32025 .kr(1)
32026 .sr(1)
32027 .m(3)
32028 .n(16)
32029 .k(1)
32030 .qmax(128)
32031 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
32032 }
32033
32034 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, strided_cm) {
32035 TEST_REQUIRES_X86_FMA3;
32036 GemmMicrokernelTester()
32037 .mr(3)
32038 .nr(16)
32039 .kr(1)
32040 .sr(1)
32041 .m(3)
32042 .n(16)
32043 .k(1)
32044 .cm_stride(19)
32045 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
32046 }
32047#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32048
32049
32050#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32051 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1) {
32052 TEST_REQUIRES_X86_FMA3;
32053 GemmMicrokernelTester()
32054 .mr(4)
32055 .nr(16)
32056 .kr(1)
32057 .sr(1)
32058 .m(4)
32059 .n(16)
32060 .k(1)
32061 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32062 }
32063
32064 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, strided_cn) {
32065 TEST_REQUIRES_X86_FMA3;
32066 GemmMicrokernelTester()
32067 .mr(4)
32068 .nr(16)
32069 .kr(1)
32070 .sr(1)
32071 .m(4)
32072 .n(16)
32073 .k(1)
32074 .cn_stride(19)
32075 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32076 }
32077
32078 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_strided_a) {
32079 TEST_REQUIRES_X86_FMA3;
32080 GemmMicrokernelTester()
32081 .mr(4)
32082 .nr(16)
32083 .kr(1)
32084 .sr(1)
32085 .m(4)
32086 .n(16)
32087 .k(1)
32088 .a_stride(3)
32089 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32090 }
32091
32092 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_subtile) {
32093 TEST_REQUIRES_X86_FMA3;
32094 for (uint32_t m = 1; m <= 4; m++) {
32095 for (uint32_t n = 1; n <= 16; n++) {
32096 GemmMicrokernelTester()
32097 .mr(4)
32098 .nr(16)
32099 .kr(1)
32100 .sr(1)
32101 .m(m)
32102 .n(n)
32103 .k(1)
32104 .iterations(1)
32105 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32106 }
32107 }
32108 }
32109
32110 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
32111 TEST_REQUIRES_X86_FMA3;
32112 for (uint32_t m = 1; m <= 4; m++) {
32113 GemmMicrokernelTester()
32114 .mr(4)
32115 .nr(16)
32116 .kr(1)
32117 .sr(1)
32118 .m(m)
32119 .n(16)
32120 .k(1)
32121 .iterations(1)
32122 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32123 }
32124 }
32125
32126 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
32127 TEST_REQUIRES_X86_FMA3;
32128 for (uint32_t n = 1; n <= 16; n++) {
32129 GemmMicrokernelTester()
32130 .mr(4)
32131 .nr(16)
32132 .kr(1)
32133 .sr(1)
32134 .m(4)
32135 .n(n)
32136 .k(1)
32137 .iterations(1)
32138 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32139 }
32140 }
32141
32142 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_gt_1) {
32143 TEST_REQUIRES_X86_FMA3;
32144 for (size_t k = 2; k < 10; k++) {
32145 GemmMicrokernelTester()
32146 .mr(4)
32147 .nr(16)
32148 .kr(1)
32149 .sr(1)
32150 .m(4)
32151 .n(16)
32152 .k(k)
32153 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32154 }
32155 }
32156
32157 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_gt_1_strided_a) {
32158 TEST_REQUIRES_X86_FMA3;
32159 for (size_t k = 2; k < 10; k++) {
32160 GemmMicrokernelTester()
32161 .mr(4)
32162 .nr(16)
32163 .kr(1)
32164 .sr(1)
32165 .m(4)
32166 .n(16)
32167 .k(k)
32168 .a_stride(11)
32169 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32170 }
32171 }
32172
32173 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_gt_1_subtile) {
32174 TEST_REQUIRES_X86_FMA3;
32175 for (size_t k = 2; k < 10; k++) {
32176 for (uint32_t m = 1; m <= 4; m++) {
32177 for (uint32_t n = 1; n <= 16; n++) {
32178 GemmMicrokernelTester()
32179 .mr(4)
32180 .nr(16)
32181 .kr(1)
32182 .sr(1)
32183 .m(m)
32184 .n(n)
32185 .k(k)
32186 .iterations(1)
32187 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32188 }
32189 }
32190 }
32191 }
32192
32193 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16) {
32194 TEST_REQUIRES_X86_FMA3;
32195 for (uint32_t n = 17; n < 32; n++) {
32196 for (size_t k = 1; k <= 5; k += 2) {
32197 GemmMicrokernelTester()
32198 .mr(4)
32199 .nr(16)
32200 .kr(1)
32201 .sr(1)
32202 .m(4)
32203 .n(16)
32204 .k(k)
32205 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32206 }
32207 }
32208 }
32209
32210 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
32211 TEST_REQUIRES_X86_FMA3;
32212 for (uint32_t n = 17; n < 32; n++) {
32213 for (size_t k = 1; k <= 5; k += 2) {
32214 GemmMicrokernelTester()
32215 .mr(4)
32216 .nr(16)
32217 .kr(1)
32218 .sr(1)
32219 .m(4)
32220 .n(16)
32221 .k(k)
32222 .cn_stride(19)
32223 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32224 }
32225 }
32226 }
32227
32228 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16_strided_a) {
32229 TEST_REQUIRES_X86_FMA3;
32230 for (uint32_t n = 17; n < 32; n++) {
32231 for (size_t k = 1; k <= 5; k += 2) {
32232 GemmMicrokernelTester()
32233 .mr(4)
32234 .nr(16)
32235 .kr(1)
32236 .sr(1)
32237 .m(4)
32238 .n(n)
32239 .k(k)
32240 .a_stride(7)
32241 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32242 }
32243 }
32244 }
32245
32246 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16_subtile) {
32247 TEST_REQUIRES_X86_FMA3;
32248 for (uint32_t n = 17; n < 32; n++) {
32249 for (size_t k = 1; k <= 5; k += 2) {
32250 for (uint32_t m = 1; m <= 4; m++) {
32251 GemmMicrokernelTester()
32252 .mr(4)
32253 .nr(16)
32254 .kr(1)
32255 .sr(1)
32256 .m(m)
32257 .n(n)
32258 .k(k)
32259 .iterations(1)
32260 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32261 }
32262 }
32263 }
32264 }
32265
32266 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16) {
32267 TEST_REQUIRES_X86_FMA3;
32268 for (uint32_t n = 32; n <= 48; n += 16) {
32269 for (size_t k = 1; k <= 5; k += 2) {
32270 GemmMicrokernelTester()
32271 .mr(4)
32272 .nr(16)
32273 .kr(1)
32274 .sr(1)
32275 .m(4)
32276 .n(16)
32277 .k(k)
32278 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32279 }
32280 }
32281 }
32282
32283 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16_strided_cn) {
32284 TEST_REQUIRES_X86_FMA3;
32285 for (uint32_t n = 32; n <= 48; n += 16) {
32286 for (size_t k = 1; k <= 5; k += 2) {
32287 GemmMicrokernelTester()
32288 .mr(4)
32289 .nr(16)
32290 .kr(1)
32291 .sr(1)
32292 .m(4)
32293 .n(n)
32294 .k(k)
32295 .cn_stride(19)
32296 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32297 }
32298 }
32299 }
32300
32301 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16_strided_a) {
32302 TEST_REQUIRES_X86_FMA3;
32303 for (uint32_t n = 32; n <= 48; n += 16) {
32304 for (size_t k = 1; k <= 5; k += 2) {
32305 GemmMicrokernelTester()
32306 .mr(4)
32307 .nr(16)
32308 .kr(1)
32309 .sr(1)
32310 .m(4)
32311 .n(n)
32312 .k(k)
32313 .a_stride(7)
32314 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32315 }
32316 }
32317 }
32318
32319 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16_subtile) {
32320 TEST_REQUIRES_X86_FMA3;
32321 for (uint32_t n = 32; n <= 48; n += 16) {
32322 for (size_t k = 1; k <= 5; k += 2) {
32323 for (uint32_t m = 1; m <= 4; m++) {
32324 GemmMicrokernelTester()
32325 .mr(4)
32326 .nr(16)
32327 .kr(1)
32328 .sr(1)
32329 .m(m)
32330 .n(n)
32331 .k(k)
32332 .iterations(1)
32333 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32334 }
32335 }
32336 }
32337 }
32338
32339 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, strided_cm_subtile) {
32340 TEST_REQUIRES_X86_FMA3;
32341 for (size_t k = 1; k <= 5; k += 2) {
32342 for (uint32_t m = 1; m <= 4; m++) {
32343 for (uint32_t n = 1; n <= 16; n++) {
32344 GemmMicrokernelTester()
32345 .mr(4)
32346 .nr(16)
32347 .kr(1)
32348 .sr(1)
32349 .m(m)
32350 .n(n)
32351 .k(k)
32352 .cm_stride(19)
32353 .iterations(1)
32354 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32355 }
32356 }
32357 }
32358 }
32359
32360 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, qmin) {
32361 TEST_REQUIRES_X86_FMA3;
32362 GemmMicrokernelTester()
32363 .mr(4)
32364 .nr(16)
32365 .kr(1)
32366 .sr(1)
32367 .m(4)
32368 .n(16)
32369 .k(1)
32370 .qmin(128)
32371 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32372 }
32373
32374 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, qmax) {
32375 TEST_REQUIRES_X86_FMA3;
32376 GemmMicrokernelTester()
32377 .mr(4)
32378 .nr(16)
32379 .kr(1)
32380 .sr(1)
32381 .m(4)
32382 .n(16)
32383 .k(1)
32384 .qmax(128)
32385 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32386 }
32387
32388 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, strided_cm) {
32389 TEST_REQUIRES_X86_FMA3;
32390 GemmMicrokernelTester()
32391 .mr(4)
32392 .nr(16)
32393 .kr(1)
32394 .sr(1)
32395 .m(4)
32396 .n(16)
32397 .k(1)
32398 .cm_stride(19)
32399 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
32400 }
32401#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32402
32403
32404#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32405 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1) {
32406 TEST_REQUIRES_X86_FMA3;
32407 GemmMicrokernelTester()
32408 .mr(5)
32409 .nr(16)
32410 .kr(1)
32411 .sr(1)
32412 .m(5)
32413 .n(16)
32414 .k(1)
32415 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32416 }
32417
32418 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, strided_cn) {
32419 TEST_REQUIRES_X86_FMA3;
32420 GemmMicrokernelTester()
32421 .mr(5)
32422 .nr(16)
32423 .kr(1)
32424 .sr(1)
32425 .m(5)
32426 .n(16)
32427 .k(1)
32428 .cn_stride(19)
32429 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32430 }
32431
32432 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_strided_a) {
32433 TEST_REQUIRES_X86_FMA3;
32434 GemmMicrokernelTester()
32435 .mr(5)
32436 .nr(16)
32437 .kr(1)
32438 .sr(1)
32439 .m(5)
32440 .n(16)
32441 .k(1)
32442 .a_stride(3)
32443 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32444 }
32445
32446 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_subtile) {
32447 TEST_REQUIRES_X86_FMA3;
32448 for (uint32_t m = 1; m <= 5; m++) {
32449 for (uint32_t n = 1; n <= 16; n++) {
32450 GemmMicrokernelTester()
32451 .mr(5)
32452 .nr(16)
32453 .kr(1)
32454 .sr(1)
32455 .m(m)
32456 .n(n)
32457 .k(1)
32458 .iterations(1)
32459 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32460 }
32461 }
32462 }
32463
32464 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
32465 TEST_REQUIRES_X86_FMA3;
32466 for (uint32_t m = 1; m <= 5; m++) {
32467 GemmMicrokernelTester()
32468 .mr(5)
32469 .nr(16)
32470 .kr(1)
32471 .sr(1)
32472 .m(m)
32473 .n(16)
32474 .k(1)
32475 .iterations(1)
32476 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32477 }
32478 }
32479
32480 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
32481 TEST_REQUIRES_X86_FMA3;
32482 for (uint32_t n = 1; n <= 16; n++) {
32483 GemmMicrokernelTester()
32484 .mr(5)
32485 .nr(16)
32486 .kr(1)
32487 .sr(1)
32488 .m(5)
32489 .n(n)
32490 .k(1)
32491 .iterations(1)
32492 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32493 }
32494 }
32495
32496 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_gt_1) {
32497 TEST_REQUIRES_X86_FMA3;
32498 for (size_t k = 2; k < 10; k++) {
32499 GemmMicrokernelTester()
32500 .mr(5)
32501 .nr(16)
32502 .kr(1)
32503 .sr(1)
32504 .m(5)
32505 .n(16)
32506 .k(k)
32507 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32508 }
32509 }
32510
32511 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_gt_1_strided_a) {
32512 TEST_REQUIRES_X86_FMA3;
32513 for (size_t k = 2; k < 10; k++) {
32514 GemmMicrokernelTester()
32515 .mr(5)
32516 .nr(16)
32517 .kr(1)
32518 .sr(1)
32519 .m(5)
32520 .n(16)
32521 .k(k)
32522 .a_stride(11)
32523 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32524 }
32525 }
32526
32527 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_gt_1_subtile) {
32528 TEST_REQUIRES_X86_FMA3;
32529 for (size_t k = 2; k < 10; k++) {
32530 for (uint32_t m = 1; m <= 5; m++) {
32531 for (uint32_t n = 1; n <= 16; n++) {
32532 GemmMicrokernelTester()
32533 .mr(5)
32534 .nr(16)
32535 .kr(1)
32536 .sr(1)
32537 .m(m)
32538 .n(n)
32539 .k(k)
32540 .iterations(1)
32541 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32542 }
32543 }
32544 }
32545 }
32546
32547 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16) {
32548 TEST_REQUIRES_X86_FMA3;
32549 for (uint32_t n = 17; n < 32; n++) {
32550 for (size_t k = 1; k <= 5; k += 2) {
32551 GemmMicrokernelTester()
32552 .mr(5)
32553 .nr(16)
32554 .kr(1)
32555 .sr(1)
32556 .m(5)
32557 .n(16)
32558 .k(k)
32559 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32560 }
32561 }
32562 }
32563
32564 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
32565 TEST_REQUIRES_X86_FMA3;
32566 for (uint32_t n = 17; n < 32; n++) {
32567 for (size_t k = 1; k <= 5; k += 2) {
32568 GemmMicrokernelTester()
32569 .mr(5)
32570 .nr(16)
32571 .kr(1)
32572 .sr(1)
32573 .m(5)
32574 .n(16)
32575 .k(k)
32576 .cn_stride(19)
32577 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32578 }
32579 }
32580 }
32581
32582 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16_strided_a) {
32583 TEST_REQUIRES_X86_FMA3;
32584 for (uint32_t n = 17; n < 32; n++) {
32585 for (size_t k = 1; k <= 5; k += 2) {
32586 GemmMicrokernelTester()
32587 .mr(5)
32588 .nr(16)
32589 .kr(1)
32590 .sr(1)
32591 .m(5)
32592 .n(n)
32593 .k(k)
32594 .a_stride(7)
32595 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32596 }
32597 }
32598 }
32599
32600 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16_subtile) {
32601 TEST_REQUIRES_X86_FMA3;
32602 for (uint32_t n = 17; n < 32; n++) {
32603 for (size_t k = 1; k <= 5; k += 2) {
32604 for (uint32_t m = 1; m <= 5; m++) {
32605 GemmMicrokernelTester()
32606 .mr(5)
32607 .nr(16)
32608 .kr(1)
32609 .sr(1)
32610 .m(m)
32611 .n(n)
32612 .k(k)
32613 .iterations(1)
32614 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32615 }
32616 }
32617 }
32618 }
32619
32620 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16) {
32621 TEST_REQUIRES_X86_FMA3;
32622 for (uint32_t n = 32; n <= 48; n += 16) {
32623 for (size_t k = 1; k <= 5; k += 2) {
32624 GemmMicrokernelTester()
32625 .mr(5)
32626 .nr(16)
32627 .kr(1)
32628 .sr(1)
32629 .m(5)
32630 .n(16)
32631 .k(k)
32632 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32633 }
32634 }
32635 }
32636
32637 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16_strided_cn) {
32638 TEST_REQUIRES_X86_FMA3;
32639 for (uint32_t n = 32; n <= 48; n += 16) {
32640 for (size_t k = 1; k <= 5; k += 2) {
32641 GemmMicrokernelTester()
32642 .mr(5)
32643 .nr(16)
32644 .kr(1)
32645 .sr(1)
32646 .m(5)
32647 .n(n)
32648 .k(k)
32649 .cn_stride(19)
32650 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32651 }
32652 }
32653 }
32654
32655 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16_strided_a) {
32656 TEST_REQUIRES_X86_FMA3;
32657 for (uint32_t n = 32; n <= 48; n += 16) {
32658 for (size_t k = 1; k <= 5; k += 2) {
32659 GemmMicrokernelTester()
32660 .mr(5)
32661 .nr(16)
32662 .kr(1)
32663 .sr(1)
32664 .m(5)
32665 .n(n)
32666 .k(k)
32667 .a_stride(7)
32668 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32669 }
32670 }
32671 }
32672
32673 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16_subtile) {
32674 TEST_REQUIRES_X86_FMA3;
32675 for (uint32_t n = 32; n <= 48; n += 16) {
32676 for (size_t k = 1; k <= 5; k += 2) {
32677 for (uint32_t m = 1; m <= 5; m++) {
32678 GemmMicrokernelTester()
32679 .mr(5)
32680 .nr(16)
32681 .kr(1)
32682 .sr(1)
32683 .m(m)
32684 .n(n)
32685 .k(k)
32686 .iterations(1)
32687 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32688 }
32689 }
32690 }
32691 }
32692
32693 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, strided_cm_subtile) {
32694 TEST_REQUIRES_X86_FMA3;
32695 for (size_t k = 1; k <= 5; k += 2) {
32696 for (uint32_t m = 1; m <= 5; m++) {
32697 for (uint32_t n = 1; n <= 16; n++) {
32698 GemmMicrokernelTester()
32699 .mr(5)
32700 .nr(16)
32701 .kr(1)
32702 .sr(1)
32703 .m(m)
32704 .n(n)
32705 .k(k)
32706 .cm_stride(19)
32707 .iterations(1)
32708 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32709 }
32710 }
32711 }
32712 }
32713
32714 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, qmin) {
32715 TEST_REQUIRES_X86_FMA3;
32716 GemmMicrokernelTester()
32717 .mr(5)
32718 .nr(16)
32719 .kr(1)
32720 .sr(1)
32721 .m(5)
32722 .n(16)
32723 .k(1)
32724 .qmin(128)
32725 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32726 }
32727
32728 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, qmax) {
32729 TEST_REQUIRES_X86_FMA3;
32730 GemmMicrokernelTester()
32731 .mr(5)
32732 .nr(16)
32733 .kr(1)
32734 .sr(1)
32735 .m(5)
32736 .n(16)
32737 .k(1)
32738 .qmax(128)
32739 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32740 }
32741
32742 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, strided_cm) {
32743 TEST_REQUIRES_X86_FMA3;
32744 GemmMicrokernelTester()
32745 .mr(5)
32746 .nr(16)
32747 .kr(1)
32748 .sr(1)
32749 .m(5)
32750 .n(16)
32751 .k(1)
32752 .cm_stride(19)
32753 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
32754 }
32755#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32756
32757
32758#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32759 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4) {
32760 TEST_REQUIRES_X86_FMA3;
32761 GemmMicrokernelTester()
32762 .mr(1)
32763 .nr(16)
32764 .kr(1)
32765 .sr(4)
32766 .m(1)
32767 .n(16)
32768 .k(4)
32769 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32770 }
32771
32772 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, strided_cn) {
32773 TEST_REQUIRES_X86_FMA3;
32774 GemmMicrokernelTester()
32775 .mr(1)
32776 .nr(16)
32777 .kr(1)
32778 .sr(4)
32779 .m(1)
32780 .n(16)
32781 .k(4)
32782 .cn_stride(19)
32783 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32784 }
32785
32786 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
32787 TEST_REQUIRES_X86_FMA3;
32788 GemmMicrokernelTester()
32789 .mr(1)
32790 .nr(16)
32791 .kr(1)
32792 .sr(4)
32793 .m(1)
32794 .n(16)
32795 .k(4)
32796 .a_stride(7)
32797 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32798 }
32799
32800 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
32801 TEST_REQUIRES_X86_FMA3;
32802 for (uint32_t m = 1; m <= 1; m++) {
32803 for (uint32_t n = 1; n <= 16; n++) {
32804 GemmMicrokernelTester()
32805 .mr(1)
32806 .nr(16)
32807 .kr(1)
32808 .sr(4)
32809 .m(m)
32810 .n(n)
32811 .k(4)
32812 .iterations(1)
32813 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32814 }
32815 }
32816 }
32817
32818 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
32819 TEST_REQUIRES_X86_FMA3;
32820 for (uint32_t m = 1; m <= 1; m++) {
32821 GemmMicrokernelTester()
32822 .mr(1)
32823 .nr(16)
32824 .kr(1)
32825 .sr(4)
32826 .m(m)
32827 .n(16)
32828 .k(4)
32829 .iterations(1)
32830 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32831 }
32832 }
32833
32834 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
32835 TEST_REQUIRES_X86_FMA3;
32836 for (uint32_t n = 1; n <= 16; n++) {
32837 GemmMicrokernelTester()
32838 .mr(1)
32839 .nr(16)
32840 .kr(1)
32841 .sr(4)
32842 .m(1)
32843 .n(n)
32844 .k(4)
32845 .iterations(1)
32846 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32847 }
32848 }
32849
32850 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_lt_4) {
32851 TEST_REQUIRES_X86_FMA3;
32852 for (size_t k = 1; k < 4; k++) {
32853 GemmMicrokernelTester()
32854 .mr(1)
32855 .nr(16)
32856 .kr(1)
32857 .sr(4)
32858 .m(1)
32859 .n(16)
32860 .k(k)
32861 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32862 }
32863 }
32864
32865 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
32866 TEST_REQUIRES_X86_FMA3;
32867 for (size_t k = 1; k < 4; k++) {
32868 GemmMicrokernelTester()
32869 .mr(1)
32870 .nr(16)
32871 .kr(1)
32872 .sr(4)
32873 .m(1)
32874 .n(16)
32875 .k(k)
32876 .a_stride(7)
32877 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32878 }
32879 }
32880
32881 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
32882 TEST_REQUIRES_X86_FMA3;
32883 for (size_t k = 1; k < 4; k++) {
32884 for (uint32_t m = 1; m <= 1; m++) {
32885 for (uint32_t n = 1; n <= 16; n++) {
32886 GemmMicrokernelTester()
32887 .mr(1)
32888 .nr(16)
32889 .kr(1)
32890 .sr(4)
32891 .m(m)
32892 .n(n)
32893 .k(k)
32894 .iterations(1)
32895 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32896 }
32897 }
32898 }
32899 }
32900
32901 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_gt_4) {
32902 TEST_REQUIRES_X86_FMA3;
32903 for (size_t k = 5; k < 8; k++) {
32904 GemmMicrokernelTester()
32905 .mr(1)
32906 .nr(16)
32907 .kr(1)
32908 .sr(4)
32909 .m(1)
32910 .n(16)
32911 .k(k)
32912 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32913 }
32914 }
32915
32916 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
32917 TEST_REQUIRES_X86_FMA3;
32918 for (size_t k = 5; k < 8; k++) {
32919 GemmMicrokernelTester()
32920 .mr(1)
32921 .nr(16)
32922 .kr(1)
32923 .sr(4)
32924 .m(1)
32925 .n(16)
32926 .k(k)
32927 .a_stride(11)
32928 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32929 }
32930 }
32931
32932 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
32933 TEST_REQUIRES_X86_FMA3;
32934 for (size_t k = 5; k < 8; k++) {
32935 for (uint32_t m = 1; m <= 1; m++) {
32936 for (uint32_t n = 1; n <= 16; n++) {
32937 GemmMicrokernelTester()
32938 .mr(1)
32939 .nr(16)
32940 .kr(1)
32941 .sr(4)
32942 .m(m)
32943 .n(n)
32944 .k(k)
32945 .iterations(1)
32946 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32947 }
32948 }
32949 }
32950 }
32951
32952 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_div_4) {
32953 TEST_REQUIRES_X86_FMA3;
32954 for (size_t k = 8; k <= 40; k += 4) {
32955 GemmMicrokernelTester()
32956 .mr(1)
32957 .nr(16)
32958 .kr(1)
32959 .sr(4)
32960 .m(1)
32961 .n(16)
32962 .k(k)
32963 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32964 }
32965 }
32966
32967 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
32968 TEST_REQUIRES_X86_FMA3;
32969 for (size_t k = 8; k <= 40; k += 4) {
32970 GemmMicrokernelTester()
32971 .mr(1)
32972 .nr(16)
32973 .kr(1)
32974 .sr(4)
32975 .m(1)
32976 .n(16)
32977 .k(k)
32978 .a_stride(43)
32979 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32980 }
32981 }
32982
32983 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_div_4_subtile) {
32984 TEST_REQUIRES_X86_FMA3;
32985 for (size_t k = 8; k <= 40; k += 4) {
32986 for (uint32_t m = 1; m <= 1; m++) {
32987 for (uint32_t n = 1; n <= 16; n++) {
32988 GemmMicrokernelTester()
32989 .mr(1)
32990 .nr(16)
32991 .kr(1)
32992 .sr(4)
32993 .m(m)
32994 .n(n)
32995 .k(k)
32996 .iterations(1)
32997 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
32998 }
32999 }
33000 }
33001 }
33002
33003 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16) {
33004 TEST_REQUIRES_X86_FMA3;
33005 for (uint32_t n = 17; n < 32; n++) {
33006 for (size_t k = 1; k <= 20; k += 5) {
33007 GemmMicrokernelTester()
33008 .mr(1)
33009 .nr(16)
33010 .kr(1)
33011 .sr(4)
33012 .m(1)
33013 .n(16)
33014 .k(k)
33015 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33016 }
33017 }
33018 }
33019
33020 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
33021 TEST_REQUIRES_X86_FMA3;
33022 for (uint32_t n = 17; n < 32; n++) {
33023 for (size_t k = 1; k <= 20; k += 5) {
33024 GemmMicrokernelTester()
33025 .mr(1)
33026 .nr(16)
33027 .kr(1)
33028 .sr(4)
33029 .m(1)
33030 .n(16)
33031 .k(k)
33032 .cn_stride(19)
33033 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33034 }
33035 }
33036 }
33037
33038 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
33039 TEST_REQUIRES_X86_FMA3;
33040 for (uint32_t n = 17; n < 32; n++) {
33041 for (size_t k = 1; k <= 20; k += 5) {
33042 GemmMicrokernelTester()
33043 .mr(1)
33044 .nr(16)
33045 .kr(1)
33046 .sr(4)
33047 .m(1)
33048 .n(n)
33049 .k(k)
33050 .a_stride(23)
33051 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33052 }
33053 }
33054 }
33055
33056 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
33057 TEST_REQUIRES_X86_FMA3;
33058 for (uint32_t n = 17; n < 32; n++) {
33059 for (size_t k = 1; k <= 20; k += 5) {
33060 for (uint32_t m = 1; m <= 1; m++) {
33061 GemmMicrokernelTester()
33062 .mr(1)
33063 .nr(16)
33064 .kr(1)
33065 .sr(4)
33066 .m(m)
33067 .n(n)
33068 .k(k)
33069 .iterations(1)
33070 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33071 }
33072 }
33073 }
33074 }
33075
33076 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16) {
33077 TEST_REQUIRES_X86_FMA3;
33078 for (uint32_t n = 32; n <= 48; n += 16) {
33079 for (size_t k = 1; k <= 20; k += 5) {
33080 GemmMicrokernelTester()
33081 .mr(1)
33082 .nr(16)
33083 .kr(1)
33084 .sr(4)
33085 .m(1)
33086 .n(16)
33087 .k(k)
33088 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33089 }
33090 }
33091 }
33092
33093 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
33094 TEST_REQUIRES_X86_FMA3;
33095 for (uint32_t n = 32; n <= 48; n += 16) {
33096 for (size_t k = 1; k <= 20; k += 5) {
33097 GemmMicrokernelTester()
33098 .mr(1)
33099 .nr(16)
33100 .kr(1)
33101 .sr(4)
33102 .m(1)
33103 .n(n)
33104 .k(k)
33105 .cn_stride(19)
33106 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33107 }
33108 }
33109 }
33110
33111 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
33112 TEST_REQUIRES_X86_FMA3;
33113 for (uint32_t n = 32; n <= 48; n += 16) {
33114 for (size_t k = 1; k <= 20; k += 5) {
33115 GemmMicrokernelTester()
33116 .mr(1)
33117 .nr(16)
33118 .kr(1)
33119 .sr(4)
33120 .m(1)
33121 .n(n)
33122 .k(k)
33123 .a_stride(23)
33124 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33125 }
33126 }
33127 }
33128
33129 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16_subtile) {
33130 TEST_REQUIRES_X86_FMA3;
33131 for (uint32_t n = 32; n <= 48; n += 16) {
33132 for (size_t k = 1; k <= 20; k += 5) {
33133 for (uint32_t m = 1; m <= 1; m++) {
33134 GemmMicrokernelTester()
33135 .mr(1)
33136 .nr(16)
33137 .kr(1)
33138 .sr(4)
33139 .m(m)
33140 .n(n)
33141 .k(k)
33142 .iterations(1)
33143 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33144 }
33145 }
33146 }
33147 }
33148
33149 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, strided_cm_subtile) {
33150 TEST_REQUIRES_X86_FMA3;
33151 for (size_t k = 1; k <= 20; k += 5) {
33152 for (uint32_t m = 1; m <= 1; m++) {
33153 for (uint32_t n = 1; n <= 16; n++) {
33154 GemmMicrokernelTester()
33155 .mr(1)
33156 .nr(16)
33157 .kr(1)
33158 .sr(4)
33159 .m(m)
33160 .n(n)
33161 .k(k)
33162 .cm_stride(19)
33163 .iterations(1)
33164 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33165 }
33166 }
33167 }
33168 }
33169
33170 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, qmin) {
33171 TEST_REQUIRES_X86_FMA3;
33172 GemmMicrokernelTester()
33173 .mr(1)
33174 .nr(16)
33175 .kr(1)
33176 .sr(4)
33177 .m(1)
33178 .n(16)
33179 .k(4)
33180 .qmin(128)
33181 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33182 }
33183
33184 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, qmax) {
33185 TEST_REQUIRES_X86_FMA3;
33186 GemmMicrokernelTester()
33187 .mr(1)
33188 .nr(16)
33189 .kr(1)
33190 .sr(4)
33191 .m(1)
33192 .n(16)
33193 .k(4)
33194 .qmax(128)
33195 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33196 }
33197
33198 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, strided_cm) {
33199 TEST_REQUIRES_X86_FMA3;
33200 GemmMicrokernelTester()
33201 .mr(1)
33202 .nr(16)
33203 .kr(1)
33204 .sr(4)
33205 .m(1)
33206 .n(16)
33207 .k(4)
33208 .cm_stride(19)
33209 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
33210 }
33211#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33212
33213
33214#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33215 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4) {
33216 TEST_REQUIRES_X86_FMA3;
33217 GemmMicrokernelTester()
33218 .mr(3)
33219 .nr(16)
33220 .kr(1)
33221 .sr(4)
33222 .m(3)
33223 .n(16)
33224 .k(4)
33225 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33226 }
33227
33228 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, strided_cn) {
33229 TEST_REQUIRES_X86_FMA3;
33230 GemmMicrokernelTester()
33231 .mr(3)
33232 .nr(16)
33233 .kr(1)
33234 .sr(4)
33235 .m(3)
33236 .n(16)
33237 .k(4)
33238 .cn_stride(19)
33239 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33240 }
33241
33242 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
33243 TEST_REQUIRES_X86_FMA3;
33244 GemmMicrokernelTester()
33245 .mr(3)
33246 .nr(16)
33247 .kr(1)
33248 .sr(4)
33249 .m(3)
33250 .n(16)
33251 .k(4)
33252 .a_stride(7)
33253 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33254 }
33255
33256 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
33257 TEST_REQUIRES_X86_FMA3;
33258 for (uint32_t m = 1; m <= 3; m++) {
33259 for (uint32_t n = 1; n <= 16; n++) {
33260 GemmMicrokernelTester()
33261 .mr(3)
33262 .nr(16)
33263 .kr(1)
33264 .sr(4)
33265 .m(m)
33266 .n(n)
33267 .k(4)
33268 .iterations(1)
33269 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33270 }
33271 }
33272 }
33273
33274 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
33275 TEST_REQUIRES_X86_FMA3;
33276 for (uint32_t m = 1; m <= 3; m++) {
33277 GemmMicrokernelTester()
33278 .mr(3)
33279 .nr(16)
33280 .kr(1)
33281 .sr(4)
33282 .m(m)
33283 .n(16)
33284 .k(4)
33285 .iterations(1)
33286 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33287 }
33288 }
33289
33290 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
33291 TEST_REQUIRES_X86_FMA3;
33292 for (uint32_t n = 1; n <= 16; n++) {
33293 GemmMicrokernelTester()
33294 .mr(3)
33295 .nr(16)
33296 .kr(1)
33297 .sr(4)
33298 .m(3)
33299 .n(n)
33300 .k(4)
33301 .iterations(1)
33302 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33303 }
33304 }
33305
33306 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_lt_4) {
33307 TEST_REQUIRES_X86_FMA3;
33308 for (size_t k = 1; k < 4; k++) {
33309 GemmMicrokernelTester()
33310 .mr(3)
33311 .nr(16)
33312 .kr(1)
33313 .sr(4)
33314 .m(3)
33315 .n(16)
33316 .k(k)
33317 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33318 }
33319 }
33320
33321 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
33322 TEST_REQUIRES_X86_FMA3;
33323 for (size_t k = 1; k < 4; k++) {
33324 GemmMicrokernelTester()
33325 .mr(3)
33326 .nr(16)
33327 .kr(1)
33328 .sr(4)
33329 .m(3)
33330 .n(16)
33331 .k(k)
33332 .a_stride(7)
33333 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33334 }
33335 }
33336
33337 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
33338 TEST_REQUIRES_X86_FMA3;
33339 for (size_t k = 1; k < 4; k++) {
33340 for (uint32_t m = 1; m <= 3; m++) {
33341 for (uint32_t n = 1; n <= 16; n++) {
33342 GemmMicrokernelTester()
33343 .mr(3)
33344 .nr(16)
33345 .kr(1)
33346 .sr(4)
33347 .m(m)
33348 .n(n)
33349 .k(k)
33350 .iterations(1)
33351 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33352 }
33353 }
33354 }
33355 }
33356
33357 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_gt_4) {
33358 TEST_REQUIRES_X86_FMA3;
33359 for (size_t k = 5; k < 8; k++) {
33360 GemmMicrokernelTester()
33361 .mr(3)
33362 .nr(16)
33363 .kr(1)
33364 .sr(4)
33365 .m(3)
33366 .n(16)
33367 .k(k)
33368 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33369 }
33370 }
33371
33372 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
33373 TEST_REQUIRES_X86_FMA3;
33374 for (size_t k = 5; k < 8; k++) {
33375 GemmMicrokernelTester()
33376 .mr(3)
33377 .nr(16)
33378 .kr(1)
33379 .sr(4)
33380 .m(3)
33381 .n(16)
33382 .k(k)
33383 .a_stride(11)
33384 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33385 }
33386 }
33387
33388 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
33389 TEST_REQUIRES_X86_FMA3;
33390 for (size_t k = 5; k < 8; k++) {
33391 for (uint32_t m = 1; m <= 3; m++) {
33392 for (uint32_t n = 1; n <= 16; n++) {
33393 GemmMicrokernelTester()
33394 .mr(3)
33395 .nr(16)
33396 .kr(1)
33397 .sr(4)
33398 .m(m)
33399 .n(n)
33400 .k(k)
33401 .iterations(1)
33402 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33403 }
33404 }
33405 }
33406 }
33407
33408 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_div_4) {
33409 TEST_REQUIRES_X86_FMA3;
33410 for (size_t k = 8; k <= 40; k += 4) {
33411 GemmMicrokernelTester()
33412 .mr(3)
33413 .nr(16)
33414 .kr(1)
33415 .sr(4)
33416 .m(3)
33417 .n(16)
33418 .k(k)
33419 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33420 }
33421 }
33422
33423 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
33424 TEST_REQUIRES_X86_FMA3;
33425 for (size_t k = 8; k <= 40; k += 4) {
33426 GemmMicrokernelTester()
33427 .mr(3)
33428 .nr(16)
33429 .kr(1)
33430 .sr(4)
33431 .m(3)
33432 .n(16)
33433 .k(k)
33434 .a_stride(43)
33435 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33436 }
33437 }
33438
33439 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_div_4_subtile) {
33440 TEST_REQUIRES_X86_FMA3;
33441 for (size_t k = 8; k <= 40; k += 4) {
33442 for (uint32_t m = 1; m <= 3; m++) {
33443 for (uint32_t n = 1; n <= 16; n++) {
33444 GemmMicrokernelTester()
33445 .mr(3)
33446 .nr(16)
33447 .kr(1)
33448 .sr(4)
33449 .m(m)
33450 .n(n)
33451 .k(k)
33452 .iterations(1)
33453 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33454 }
33455 }
33456 }
33457 }
33458
33459 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16) {
33460 TEST_REQUIRES_X86_FMA3;
33461 for (uint32_t n = 17; n < 32; n++) {
33462 for (size_t k = 1; k <= 20; k += 5) {
33463 GemmMicrokernelTester()
33464 .mr(3)
33465 .nr(16)
33466 .kr(1)
33467 .sr(4)
33468 .m(3)
33469 .n(16)
33470 .k(k)
33471 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33472 }
33473 }
33474 }
33475
33476 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
33477 TEST_REQUIRES_X86_FMA3;
33478 for (uint32_t n = 17; n < 32; n++) {
33479 for (size_t k = 1; k <= 20; k += 5) {
33480 GemmMicrokernelTester()
33481 .mr(3)
33482 .nr(16)
33483 .kr(1)
33484 .sr(4)
33485 .m(3)
33486 .n(16)
33487 .k(k)
33488 .cn_stride(19)
33489 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33490 }
33491 }
33492 }
33493
33494 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
33495 TEST_REQUIRES_X86_FMA3;
33496 for (uint32_t n = 17; n < 32; n++) {
33497 for (size_t k = 1; k <= 20; k += 5) {
33498 GemmMicrokernelTester()
33499 .mr(3)
33500 .nr(16)
33501 .kr(1)
33502 .sr(4)
33503 .m(3)
33504 .n(n)
33505 .k(k)
33506 .a_stride(23)
33507 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33508 }
33509 }
33510 }
33511
33512 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
33513 TEST_REQUIRES_X86_FMA3;
33514 for (uint32_t n = 17; n < 32; n++) {
33515 for (size_t k = 1; k <= 20; k += 5) {
33516 for (uint32_t m = 1; m <= 3; m++) {
33517 GemmMicrokernelTester()
33518 .mr(3)
33519 .nr(16)
33520 .kr(1)
33521 .sr(4)
33522 .m(m)
33523 .n(n)
33524 .k(k)
33525 .iterations(1)
33526 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33527 }
33528 }
33529 }
33530 }
33531
33532 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16) {
33533 TEST_REQUIRES_X86_FMA3;
33534 for (uint32_t n = 32; n <= 48; n += 16) {
33535 for (size_t k = 1; k <= 20; k += 5) {
33536 GemmMicrokernelTester()
33537 .mr(3)
33538 .nr(16)
33539 .kr(1)
33540 .sr(4)
33541 .m(3)
33542 .n(16)
33543 .k(k)
33544 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33545 }
33546 }
33547 }
33548
33549 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
33550 TEST_REQUIRES_X86_FMA3;
33551 for (uint32_t n = 32; n <= 48; n += 16) {
33552 for (size_t k = 1; k <= 20; k += 5) {
33553 GemmMicrokernelTester()
33554 .mr(3)
33555 .nr(16)
33556 .kr(1)
33557 .sr(4)
33558 .m(3)
33559 .n(n)
33560 .k(k)
33561 .cn_stride(19)
33562 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33563 }
33564 }
33565 }
33566
33567 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
33568 TEST_REQUIRES_X86_FMA3;
33569 for (uint32_t n = 32; n <= 48; n += 16) {
33570 for (size_t k = 1; k <= 20; k += 5) {
33571 GemmMicrokernelTester()
33572 .mr(3)
33573 .nr(16)
33574 .kr(1)
33575 .sr(4)
33576 .m(3)
33577 .n(n)
33578 .k(k)
33579 .a_stride(23)
33580 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33581 }
33582 }
33583 }
33584
33585 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16_subtile) {
33586 TEST_REQUIRES_X86_FMA3;
33587 for (uint32_t n = 32; n <= 48; n += 16) {
33588 for (size_t k = 1; k <= 20; k += 5) {
33589 for (uint32_t m = 1; m <= 3; m++) {
33590 GemmMicrokernelTester()
33591 .mr(3)
33592 .nr(16)
33593 .kr(1)
33594 .sr(4)
33595 .m(m)
33596 .n(n)
33597 .k(k)
33598 .iterations(1)
33599 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33600 }
33601 }
33602 }
33603 }
33604
33605 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, strided_cm_subtile) {
33606 TEST_REQUIRES_X86_FMA3;
33607 for (size_t k = 1; k <= 20; k += 5) {
33608 for (uint32_t m = 1; m <= 3; m++) {
33609 for (uint32_t n = 1; n <= 16; n++) {
33610 GemmMicrokernelTester()
33611 .mr(3)
33612 .nr(16)
33613 .kr(1)
33614 .sr(4)
33615 .m(m)
33616 .n(n)
33617 .k(k)
33618 .cm_stride(19)
33619 .iterations(1)
33620 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33621 }
33622 }
33623 }
33624 }
33625
33626 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, qmin) {
33627 TEST_REQUIRES_X86_FMA3;
33628 GemmMicrokernelTester()
33629 .mr(3)
33630 .nr(16)
33631 .kr(1)
33632 .sr(4)
33633 .m(3)
33634 .n(16)
33635 .k(4)
33636 .qmin(128)
33637 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33638 }
33639
33640 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, qmax) {
33641 TEST_REQUIRES_X86_FMA3;
33642 GemmMicrokernelTester()
33643 .mr(3)
33644 .nr(16)
33645 .kr(1)
33646 .sr(4)
33647 .m(3)
33648 .n(16)
33649 .k(4)
33650 .qmax(128)
33651 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33652 }
33653
33654 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, strided_cm) {
33655 TEST_REQUIRES_X86_FMA3;
33656 GemmMicrokernelTester()
33657 .mr(3)
33658 .nr(16)
33659 .kr(1)
33660 .sr(4)
33661 .m(3)
33662 .n(16)
33663 .k(4)
33664 .cm_stride(19)
33665 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
33666 }
33667#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33668
33669
33670#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33671 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4) {
33672 TEST_REQUIRES_X86_FMA3;
33673 GemmMicrokernelTester()
33674 .mr(4)
33675 .nr(16)
33676 .kr(1)
33677 .sr(4)
33678 .m(4)
33679 .n(16)
33680 .k(4)
33681 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33682 }
33683
33684 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, strided_cn) {
33685 TEST_REQUIRES_X86_FMA3;
33686 GemmMicrokernelTester()
33687 .mr(4)
33688 .nr(16)
33689 .kr(1)
33690 .sr(4)
33691 .m(4)
33692 .n(16)
33693 .k(4)
33694 .cn_stride(19)
33695 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33696 }
33697
33698 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
33699 TEST_REQUIRES_X86_FMA3;
33700 GemmMicrokernelTester()
33701 .mr(4)
33702 .nr(16)
33703 .kr(1)
33704 .sr(4)
33705 .m(4)
33706 .n(16)
33707 .k(4)
33708 .a_stride(7)
33709 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33710 }
33711
33712 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
33713 TEST_REQUIRES_X86_FMA3;
33714 for (uint32_t m = 1; m <= 4; m++) {
33715 for (uint32_t n = 1; n <= 16; n++) {
33716 GemmMicrokernelTester()
33717 .mr(4)
33718 .nr(16)
33719 .kr(1)
33720 .sr(4)
33721 .m(m)
33722 .n(n)
33723 .k(4)
33724 .iterations(1)
33725 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33726 }
33727 }
33728 }
33729
33730 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
33731 TEST_REQUIRES_X86_FMA3;
33732 for (uint32_t m = 1; m <= 4; m++) {
33733 GemmMicrokernelTester()
33734 .mr(4)
33735 .nr(16)
33736 .kr(1)
33737 .sr(4)
33738 .m(m)
33739 .n(16)
33740 .k(4)
33741 .iterations(1)
33742 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33743 }
33744 }
33745
33746 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
33747 TEST_REQUIRES_X86_FMA3;
33748 for (uint32_t n = 1; n <= 16; n++) {
33749 GemmMicrokernelTester()
33750 .mr(4)
33751 .nr(16)
33752 .kr(1)
33753 .sr(4)
33754 .m(4)
33755 .n(n)
33756 .k(4)
33757 .iterations(1)
33758 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33759 }
33760 }
33761
33762 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_lt_4) {
33763 TEST_REQUIRES_X86_FMA3;
33764 for (size_t k = 1; k < 4; k++) {
33765 GemmMicrokernelTester()
33766 .mr(4)
33767 .nr(16)
33768 .kr(1)
33769 .sr(4)
33770 .m(4)
33771 .n(16)
33772 .k(k)
33773 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33774 }
33775 }
33776
33777 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
33778 TEST_REQUIRES_X86_FMA3;
33779 for (size_t k = 1; k < 4; k++) {
33780 GemmMicrokernelTester()
33781 .mr(4)
33782 .nr(16)
33783 .kr(1)
33784 .sr(4)
33785 .m(4)
33786 .n(16)
33787 .k(k)
33788 .a_stride(7)
33789 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33790 }
33791 }
33792
33793 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
33794 TEST_REQUIRES_X86_FMA3;
33795 for (size_t k = 1; k < 4; k++) {
33796 for (uint32_t m = 1; m <= 4; m++) {
33797 for (uint32_t n = 1; n <= 16; n++) {
33798 GemmMicrokernelTester()
33799 .mr(4)
33800 .nr(16)
33801 .kr(1)
33802 .sr(4)
33803 .m(m)
33804 .n(n)
33805 .k(k)
33806 .iterations(1)
33807 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33808 }
33809 }
33810 }
33811 }
33812
33813 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_gt_4) {
33814 TEST_REQUIRES_X86_FMA3;
33815 for (size_t k = 5; k < 8; k++) {
33816 GemmMicrokernelTester()
33817 .mr(4)
33818 .nr(16)
33819 .kr(1)
33820 .sr(4)
33821 .m(4)
33822 .n(16)
33823 .k(k)
33824 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33825 }
33826 }
33827
33828 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
33829 TEST_REQUIRES_X86_FMA3;
33830 for (size_t k = 5; k < 8; k++) {
33831 GemmMicrokernelTester()
33832 .mr(4)
33833 .nr(16)
33834 .kr(1)
33835 .sr(4)
33836 .m(4)
33837 .n(16)
33838 .k(k)
33839 .a_stride(11)
33840 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33841 }
33842 }
33843
33844 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
33845 TEST_REQUIRES_X86_FMA3;
33846 for (size_t k = 5; k < 8; k++) {
33847 for (uint32_t m = 1; m <= 4; m++) {
33848 for (uint32_t n = 1; n <= 16; n++) {
33849 GemmMicrokernelTester()
33850 .mr(4)
33851 .nr(16)
33852 .kr(1)
33853 .sr(4)
33854 .m(m)
33855 .n(n)
33856 .k(k)
33857 .iterations(1)
33858 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33859 }
33860 }
33861 }
33862 }
33863
33864 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_div_4) {
33865 TEST_REQUIRES_X86_FMA3;
33866 for (size_t k = 8; k <= 40; k += 4) {
33867 GemmMicrokernelTester()
33868 .mr(4)
33869 .nr(16)
33870 .kr(1)
33871 .sr(4)
33872 .m(4)
33873 .n(16)
33874 .k(k)
33875 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33876 }
33877 }
33878
33879 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
33880 TEST_REQUIRES_X86_FMA3;
33881 for (size_t k = 8; k <= 40; k += 4) {
33882 GemmMicrokernelTester()
33883 .mr(4)
33884 .nr(16)
33885 .kr(1)
33886 .sr(4)
33887 .m(4)
33888 .n(16)
33889 .k(k)
33890 .a_stride(43)
33891 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33892 }
33893 }
33894
33895 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_div_4_subtile) {
33896 TEST_REQUIRES_X86_FMA3;
33897 for (size_t k = 8; k <= 40; k += 4) {
33898 for (uint32_t m = 1; m <= 4; m++) {
33899 for (uint32_t n = 1; n <= 16; n++) {
33900 GemmMicrokernelTester()
33901 .mr(4)
33902 .nr(16)
33903 .kr(1)
33904 .sr(4)
33905 .m(m)
33906 .n(n)
33907 .k(k)
33908 .iterations(1)
33909 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33910 }
33911 }
33912 }
33913 }
33914
33915 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16) {
33916 TEST_REQUIRES_X86_FMA3;
33917 for (uint32_t n = 17; n < 32; n++) {
33918 for (size_t k = 1; k <= 20; k += 5) {
33919 GemmMicrokernelTester()
33920 .mr(4)
33921 .nr(16)
33922 .kr(1)
33923 .sr(4)
33924 .m(4)
33925 .n(16)
33926 .k(k)
33927 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33928 }
33929 }
33930 }
33931
33932 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
33933 TEST_REQUIRES_X86_FMA3;
33934 for (uint32_t n = 17; n < 32; n++) {
33935 for (size_t k = 1; k <= 20; k += 5) {
33936 GemmMicrokernelTester()
33937 .mr(4)
33938 .nr(16)
33939 .kr(1)
33940 .sr(4)
33941 .m(4)
33942 .n(16)
33943 .k(k)
33944 .cn_stride(19)
33945 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33946 }
33947 }
33948 }
33949
33950 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
33951 TEST_REQUIRES_X86_FMA3;
33952 for (uint32_t n = 17; n < 32; n++) {
33953 for (size_t k = 1; k <= 20; k += 5) {
33954 GemmMicrokernelTester()
33955 .mr(4)
33956 .nr(16)
33957 .kr(1)
33958 .sr(4)
33959 .m(4)
33960 .n(n)
33961 .k(k)
33962 .a_stride(23)
33963 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33964 }
33965 }
33966 }
33967
33968 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
33969 TEST_REQUIRES_X86_FMA3;
33970 for (uint32_t n = 17; n < 32; n++) {
33971 for (size_t k = 1; k <= 20; k += 5) {
33972 for (uint32_t m = 1; m <= 4; m++) {
33973 GemmMicrokernelTester()
33974 .mr(4)
33975 .nr(16)
33976 .kr(1)
33977 .sr(4)
33978 .m(m)
33979 .n(n)
33980 .k(k)
33981 .iterations(1)
33982 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
33983 }
33984 }
33985 }
33986 }
33987
33988 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16) {
33989 TEST_REQUIRES_X86_FMA3;
33990 for (uint32_t n = 32; n <= 48; n += 16) {
33991 for (size_t k = 1; k <= 20; k += 5) {
33992 GemmMicrokernelTester()
33993 .mr(4)
33994 .nr(16)
33995 .kr(1)
33996 .sr(4)
33997 .m(4)
33998 .n(16)
33999 .k(k)
34000 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34001 }
34002 }
34003 }
34004
34005 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
34006 TEST_REQUIRES_X86_FMA3;
34007 for (uint32_t n = 32; n <= 48; n += 16) {
34008 for (size_t k = 1; k <= 20; k += 5) {
34009 GemmMicrokernelTester()
34010 .mr(4)
34011 .nr(16)
34012 .kr(1)
34013 .sr(4)
34014 .m(4)
34015 .n(n)
34016 .k(k)
34017 .cn_stride(19)
34018 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34019 }
34020 }
34021 }
34022
34023 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
34024 TEST_REQUIRES_X86_FMA3;
34025 for (uint32_t n = 32; n <= 48; n += 16) {
34026 for (size_t k = 1; k <= 20; k += 5) {
34027 GemmMicrokernelTester()
34028 .mr(4)
34029 .nr(16)
34030 .kr(1)
34031 .sr(4)
34032 .m(4)
34033 .n(n)
34034 .k(k)
34035 .a_stride(23)
34036 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34037 }
34038 }
34039 }
34040
34041 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16_subtile) {
34042 TEST_REQUIRES_X86_FMA3;
34043 for (uint32_t n = 32; n <= 48; n += 16) {
34044 for (size_t k = 1; k <= 20; k += 5) {
34045 for (uint32_t m = 1; m <= 4; m++) {
34046 GemmMicrokernelTester()
34047 .mr(4)
34048 .nr(16)
34049 .kr(1)
34050 .sr(4)
34051 .m(m)
34052 .n(n)
34053 .k(k)
34054 .iterations(1)
34055 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34056 }
34057 }
34058 }
34059 }
34060
34061 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, strided_cm_subtile) {
34062 TEST_REQUIRES_X86_FMA3;
34063 for (size_t k = 1; k <= 20; k += 5) {
34064 for (uint32_t m = 1; m <= 4; m++) {
34065 for (uint32_t n = 1; n <= 16; n++) {
34066 GemmMicrokernelTester()
34067 .mr(4)
34068 .nr(16)
34069 .kr(1)
34070 .sr(4)
34071 .m(m)
34072 .n(n)
34073 .k(k)
34074 .cm_stride(19)
34075 .iterations(1)
34076 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34077 }
34078 }
34079 }
34080 }
34081
34082 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, qmin) {
34083 TEST_REQUIRES_X86_FMA3;
34084 GemmMicrokernelTester()
34085 .mr(4)
34086 .nr(16)
34087 .kr(1)
34088 .sr(4)
34089 .m(4)
34090 .n(16)
34091 .k(4)
34092 .qmin(128)
34093 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34094 }
34095
34096 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, qmax) {
34097 TEST_REQUIRES_X86_FMA3;
34098 GemmMicrokernelTester()
34099 .mr(4)
34100 .nr(16)
34101 .kr(1)
34102 .sr(4)
34103 .m(4)
34104 .n(16)
34105 .k(4)
34106 .qmax(128)
34107 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34108 }
34109
34110 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, strided_cm) {
34111 TEST_REQUIRES_X86_FMA3;
34112 GemmMicrokernelTester()
34113 .mr(4)
34114 .nr(16)
34115 .kr(1)
34116 .sr(4)
34117 .m(4)
34118 .n(16)
34119 .k(4)
34120 .cm_stride(19)
34121 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
34122 }
34123#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34124
34125
34126#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34127 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4) {
34128 TEST_REQUIRES_X86_FMA3;
34129 GemmMicrokernelTester()
34130 .mr(5)
34131 .nr(16)
34132 .kr(1)
34133 .sr(4)
34134 .m(5)
34135 .n(16)
34136 .k(4)
34137 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34138 }
34139
34140 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, strided_cn) {
34141 TEST_REQUIRES_X86_FMA3;
34142 GemmMicrokernelTester()
34143 .mr(5)
34144 .nr(16)
34145 .kr(1)
34146 .sr(4)
34147 .m(5)
34148 .n(16)
34149 .k(4)
34150 .cn_stride(19)
34151 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34152 }
34153
34154 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
34155 TEST_REQUIRES_X86_FMA3;
34156 GemmMicrokernelTester()
34157 .mr(5)
34158 .nr(16)
34159 .kr(1)
34160 .sr(4)
34161 .m(5)
34162 .n(16)
34163 .k(4)
34164 .a_stride(7)
34165 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34166 }
34167
34168 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
34169 TEST_REQUIRES_X86_FMA3;
34170 for (uint32_t m = 1; m <= 5; m++) {
34171 for (uint32_t n = 1; n <= 16; n++) {
34172 GemmMicrokernelTester()
34173 .mr(5)
34174 .nr(16)
34175 .kr(1)
34176 .sr(4)
34177 .m(m)
34178 .n(n)
34179 .k(4)
34180 .iterations(1)
34181 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34182 }
34183 }
34184 }
34185
34186 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
34187 TEST_REQUIRES_X86_FMA3;
34188 for (uint32_t m = 1; m <= 5; m++) {
34189 GemmMicrokernelTester()
34190 .mr(5)
34191 .nr(16)
34192 .kr(1)
34193 .sr(4)
34194 .m(m)
34195 .n(16)
34196 .k(4)
34197 .iterations(1)
34198 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34199 }
34200 }
34201
34202 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
34203 TEST_REQUIRES_X86_FMA3;
34204 for (uint32_t n = 1; n <= 16; n++) {
34205 GemmMicrokernelTester()
34206 .mr(5)
34207 .nr(16)
34208 .kr(1)
34209 .sr(4)
34210 .m(5)
34211 .n(n)
34212 .k(4)
34213 .iterations(1)
34214 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34215 }
34216 }
34217
34218 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_lt_4) {
34219 TEST_REQUIRES_X86_FMA3;
34220 for (size_t k = 1; k < 4; k++) {
34221 GemmMicrokernelTester()
34222 .mr(5)
34223 .nr(16)
34224 .kr(1)
34225 .sr(4)
34226 .m(5)
34227 .n(16)
34228 .k(k)
34229 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34230 }
34231 }
34232
34233 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
34234 TEST_REQUIRES_X86_FMA3;
34235 for (size_t k = 1; k < 4; k++) {
34236 GemmMicrokernelTester()
34237 .mr(5)
34238 .nr(16)
34239 .kr(1)
34240 .sr(4)
34241 .m(5)
34242 .n(16)
34243 .k(k)
34244 .a_stride(7)
34245 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34246 }
34247 }
34248
34249 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
34250 TEST_REQUIRES_X86_FMA3;
34251 for (size_t k = 1; k < 4; k++) {
34252 for (uint32_t m = 1; m <= 5; m++) {
34253 for (uint32_t n = 1; n <= 16; n++) {
34254 GemmMicrokernelTester()
34255 .mr(5)
34256 .nr(16)
34257 .kr(1)
34258 .sr(4)
34259 .m(m)
34260 .n(n)
34261 .k(k)
34262 .iterations(1)
34263 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34264 }
34265 }
34266 }
34267 }
34268
34269 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_gt_4) {
34270 TEST_REQUIRES_X86_FMA3;
34271 for (size_t k = 5; k < 8; k++) {
34272 GemmMicrokernelTester()
34273 .mr(5)
34274 .nr(16)
34275 .kr(1)
34276 .sr(4)
34277 .m(5)
34278 .n(16)
34279 .k(k)
34280 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34281 }
34282 }
34283
34284 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
34285 TEST_REQUIRES_X86_FMA3;
34286 for (size_t k = 5; k < 8; k++) {
34287 GemmMicrokernelTester()
34288 .mr(5)
34289 .nr(16)
34290 .kr(1)
34291 .sr(4)
34292 .m(5)
34293 .n(16)
34294 .k(k)
34295 .a_stride(11)
34296 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34297 }
34298 }
34299
34300 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
34301 TEST_REQUIRES_X86_FMA3;
34302 for (size_t k = 5; k < 8; k++) {
34303 for (uint32_t m = 1; m <= 5; m++) {
34304 for (uint32_t n = 1; n <= 16; n++) {
34305 GemmMicrokernelTester()
34306 .mr(5)
34307 .nr(16)
34308 .kr(1)
34309 .sr(4)
34310 .m(m)
34311 .n(n)
34312 .k(k)
34313 .iterations(1)
34314 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34315 }
34316 }
34317 }
34318 }
34319
34320 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_div_4) {
34321 TEST_REQUIRES_X86_FMA3;
34322 for (size_t k = 8; k <= 40; k += 4) {
34323 GemmMicrokernelTester()
34324 .mr(5)
34325 .nr(16)
34326 .kr(1)
34327 .sr(4)
34328 .m(5)
34329 .n(16)
34330 .k(k)
34331 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34332 }
34333 }
34334
34335 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
34336 TEST_REQUIRES_X86_FMA3;
34337 for (size_t k = 8; k <= 40; k += 4) {
34338 GemmMicrokernelTester()
34339 .mr(5)
34340 .nr(16)
34341 .kr(1)
34342 .sr(4)
34343 .m(5)
34344 .n(16)
34345 .k(k)
34346 .a_stride(43)
34347 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34348 }
34349 }
34350
34351 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_div_4_subtile) {
34352 TEST_REQUIRES_X86_FMA3;
34353 for (size_t k = 8; k <= 40; k += 4) {
34354 for (uint32_t m = 1; m <= 5; m++) {
34355 for (uint32_t n = 1; n <= 16; n++) {
34356 GemmMicrokernelTester()
34357 .mr(5)
34358 .nr(16)
34359 .kr(1)
34360 .sr(4)
34361 .m(m)
34362 .n(n)
34363 .k(k)
34364 .iterations(1)
34365 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34366 }
34367 }
34368 }
34369 }
34370
34371 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16) {
34372 TEST_REQUIRES_X86_FMA3;
34373 for (uint32_t n = 17; n < 32; n++) {
34374 for (size_t k = 1; k <= 20; k += 5) {
34375 GemmMicrokernelTester()
34376 .mr(5)
34377 .nr(16)
34378 .kr(1)
34379 .sr(4)
34380 .m(5)
34381 .n(16)
34382 .k(k)
34383 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34384 }
34385 }
34386 }
34387
34388 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
34389 TEST_REQUIRES_X86_FMA3;
34390 for (uint32_t n = 17; n < 32; n++) {
34391 for (size_t k = 1; k <= 20; k += 5) {
34392 GemmMicrokernelTester()
34393 .mr(5)
34394 .nr(16)
34395 .kr(1)
34396 .sr(4)
34397 .m(5)
34398 .n(16)
34399 .k(k)
34400 .cn_stride(19)
34401 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34402 }
34403 }
34404 }
34405
34406 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
34407 TEST_REQUIRES_X86_FMA3;
34408 for (uint32_t n = 17; n < 32; n++) {
34409 for (size_t k = 1; k <= 20; k += 5) {
34410 GemmMicrokernelTester()
34411 .mr(5)
34412 .nr(16)
34413 .kr(1)
34414 .sr(4)
34415 .m(5)
34416 .n(n)
34417 .k(k)
34418 .a_stride(23)
34419 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34420 }
34421 }
34422 }
34423
34424 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
34425 TEST_REQUIRES_X86_FMA3;
34426 for (uint32_t n = 17; n < 32; n++) {
34427 for (size_t k = 1; k <= 20; k += 5) {
34428 for (uint32_t m = 1; m <= 5; m++) {
34429 GemmMicrokernelTester()
34430 .mr(5)
34431 .nr(16)
34432 .kr(1)
34433 .sr(4)
34434 .m(m)
34435 .n(n)
34436 .k(k)
34437 .iterations(1)
34438 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34439 }
34440 }
34441 }
34442 }
34443
34444 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16) {
34445 TEST_REQUIRES_X86_FMA3;
34446 for (uint32_t n = 32; n <= 48; n += 16) {
34447 for (size_t k = 1; k <= 20; k += 5) {
34448 GemmMicrokernelTester()
34449 .mr(5)
34450 .nr(16)
34451 .kr(1)
34452 .sr(4)
34453 .m(5)
34454 .n(16)
34455 .k(k)
34456 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34457 }
34458 }
34459 }
34460
34461 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
34462 TEST_REQUIRES_X86_FMA3;
34463 for (uint32_t n = 32; n <= 48; n += 16) {
34464 for (size_t k = 1; k <= 20; k += 5) {
34465 GemmMicrokernelTester()
34466 .mr(5)
34467 .nr(16)
34468 .kr(1)
34469 .sr(4)
34470 .m(5)
34471 .n(n)
34472 .k(k)
34473 .cn_stride(19)
34474 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34475 }
34476 }
34477 }
34478
34479 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
34480 TEST_REQUIRES_X86_FMA3;
34481 for (uint32_t n = 32; n <= 48; n += 16) {
34482 for (size_t k = 1; k <= 20; k += 5) {
34483 GemmMicrokernelTester()
34484 .mr(5)
34485 .nr(16)
34486 .kr(1)
34487 .sr(4)
34488 .m(5)
34489 .n(n)
34490 .k(k)
34491 .a_stride(23)
34492 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34493 }
34494 }
34495 }
34496
34497 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16_subtile) {
34498 TEST_REQUIRES_X86_FMA3;
34499 for (uint32_t n = 32; n <= 48; n += 16) {
34500 for (size_t k = 1; k <= 20; k += 5) {
34501 for (uint32_t m = 1; m <= 5; m++) {
34502 GemmMicrokernelTester()
34503 .mr(5)
34504 .nr(16)
34505 .kr(1)
34506 .sr(4)
34507 .m(m)
34508 .n(n)
34509 .k(k)
34510 .iterations(1)
34511 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34512 }
34513 }
34514 }
34515 }
34516
34517 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, strided_cm_subtile) {
34518 TEST_REQUIRES_X86_FMA3;
34519 for (size_t k = 1; k <= 20; k += 5) {
34520 for (uint32_t m = 1; m <= 5; m++) {
34521 for (uint32_t n = 1; n <= 16; n++) {
34522 GemmMicrokernelTester()
34523 .mr(5)
34524 .nr(16)
34525 .kr(1)
34526 .sr(4)
34527 .m(m)
34528 .n(n)
34529 .k(k)
34530 .cm_stride(19)
34531 .iterations(1)
34532 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34533 }
34534 }
34535 }
34536 }
34537
34538 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, qmin) {
34539 TEST_REQUIRES_X86_FMA3;
34540 GemmMicrokernelTester()
34541 .mr(5)
34542 .nr(16)
34543 .kr(1)
34544 .sr(4)
34545 .m(5)
34546 .n(16)
34547 .k(4)
34548 .qmin(128)
34549 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34550 }
34551
34552 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, qmax) {
34553 TEST_REQUIRES_X86_FMA3;
34554 GemmMicrokernelTester()
34555 .mr(5)
34556 .nr(16)
34557 .kr(1)
34558 .sr(4)
34559 .m(5)
34560 .n(16)
34561 .k(4)
34562 .qmax(128)
34563 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34564 }
34565
34566 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, strided_cm) {
34567 TEST_REQUIRES_X86_FMA3;
34568 GemmMicrokernelTester()
34569 .mr(5)
34570 .nr(16)
34571 .kr(1)
34572 .sr(4)
34573 .m(5)
34574 .n(16)
34575 .k(4)
34576 .cm_stride(19)
34577 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
34578 }
34579#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34580
34581
34582#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34583 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1) {
34584 TEST_REQUIRES_X86_AVX512F;
34585 GemmMicrokernelTester()
34586 .mr(1)
34587 .nr(16)
34588 .kr(1)
34589 .sr(1)
34590 .m(1)
34591 .n(16)
34592 .k(1)
34593 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34594 }
34595
34596 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, strided_cn) {
34597 TEST_REQUIRES_X86_AVX512F;
34598 GemmMicrokernelTester()
34599 .mr(1)
34600 .nr(16)
34601 .kr(1)
34602 .sr(1)
34603 .m(1)
34604 .n(16)
34605 .k(1)
34606 .cn_stride(19)
34607 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34608 }
34609
34610 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
34611 TEST_REQUIRES_X86_AVX512F;
34612 GemmMicrokernelTester()
34613 .mr(1)
34614 .nr(16)
34615 .kr(1)
34616 .sr(1)
34617 .m(1)
34618 .n(16)
34619 .k(1)
34620 .a_stride(3)
34621 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34622 }
34623
34624 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
34625 TEST_REQUIRES_X86_AVX512F;
34626 for (uint32_t m = 1; m <= 1; m++) {
34627 for (uint32_t n = 1; n <= 16; n++) {
34628 GemmMicrokernelTester()
34629 .mr(1)
34630 .nr(16)
34631 .kr(1)
34632 .sr(1)
34633 .m(m)
34634 .n(n)
34635 .k(1)
34636 .iterations(1)
34637 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34638 }
34639 }
34640 }
34641
34642 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
34643 TEST_REQUIRES_X86_AVX512F;
34644 for (uint32_t m = 1; m <= 1; m++) {
34645 GemmMicrokernelTester()
34646 .mr(1)
34647 .nr(16)
34648 .kr(1)
34649 .sr(1)
34650 .m(m)
34651 .n(16)
34652 .k(1)
34653 .iterations(1)
34654 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34655 }
34656 }
34657
34658 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
34659 TEST_REQUIRES_X86_AVX512F;
34660 for (uint32_t n = 1; n <= 16; n++) {
34661 GemmMicrokernelTester()
34662 .mr(1)
34663 .nr(16)
34664 .kr(1)
34665 .sr(1)
34666 .m(1)
34667 .n(n)
34668 .k(1)
34669 .iterations(1)
34670 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34671 }
34672 }
34673
34674 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_gt_1) {
34675 TEST_REQUIRES_X86_AVX512F;
34676 for (size_t k = 2; k < 10; k++) {
34677 GemmMicrokernelTester()
34678 .mr(1)
34679 .nr(16)
34680 .kr(1)
34681 .sr(1)
34682 .m(1)
34683 .n(16)
34684 .k(k)
34685 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34686 }
34687 }
34688
34689 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
34690 TEST_REQUIRES_X86_AVX512F;
34691 for (size_t k = 2; k < 10; k++) {
34692 GemmMicrokernelTester()
34693 .mr(1)
34694 .nr(16)
34695 .kr(1)
34696 .sr(1)
34697 .m(1)
34698 .n(16)
34699 .k(k)
34700 .a_stride(11)
34701 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34702 }
34703 }
34704
34705 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
34706 TEST_REQUIRES_X86_AVX512F;
34707 for (size_t k = 2; k < 10; k++) {
34708 for (uint32_t m = 1; m <= 1; m++) {
34709 for (uint32_t n = 1; n <= 16; n++) {
34710 GemmMicrokernelTester()
34711 .mr(1)
34712 .nr(16)
34713 .kr(1)
34714 .sr(1)
34715 .m(m)
34716 .n(n)
34717 .k(k)
34718 .iterations(1)
34719 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34720 }
34721 }
34722 }
34723 }
34724
34725 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16) {
34726 TEST_REQUIRES_X86_AVX512F;
34727 for (uint32_t n = 17; n < 32; n++) {
34728 for (size_t k = 1; k <= 5; k += 2) {
34729 GemmMicrokernelTester()
34730 .mr(1)
34731 .nr(16)
34732 .kr(1)
34733 .sr(1)
34734 .m(1)
34735 .n(16)
34736 .k(k)
34737 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34738 }
34739 }
34740 }
34741
34742 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
34743 TEST_REQUIRES_X86_AVX512F;
34744 for (uint32_t n = 17; n < 32; n++) {
34745 for (size_t k = 1; k <= 5; k += 2) {
34746 GemmMicrokernelTester()
34747 .mr(1)
34748 .nr(16)
34749 .kr(1)
34750 .sr(1)
34751 .m(1)
34752 .n(16)
34753 .k(k)
34754 .cn_stride(19)
34755 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34756 }
34757 }
34758 }
34759
34760 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
34761 TEST_REQUIRES_X86_AVX512F;
34762 for (uint32_t n = 17; n < 32; n++) {
34763 for (size_t k = 1; k <= 5; k += 2) {
34764 GemmMicrokernelTester()
34765 .mr(1)
34766 .nr(16)
34767 .kr(1)
34768 .sr(1)
34769 .m(1)
34770 .n(n)
34771 .k(k)
34772 .a_stride(7)
34773 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34774 }
34775 }
34776 }
34777
34778 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
34779 TEST_REQUIRES_X86_AVX512F;
34780 for (uint32_t n = 17; n < 32; n++) {
34781 for (size_t k = 1; k <= 5; k += 2) {
34782 for (uint32_t m = 1; m <= 1; m++) {
34783 GemmMicrokernelTester()
34784 .mr(1)
34785 .nr(16)
34786 .kr(1)
34787 .sr(1)
34788 .m(m)
34789 .n(n)
34790 .k(k)
34791 .iterations(1)
34792 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34793 }
34794 }
34795 }
34796 }
34797
34798 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16) {
34799 TEST_REQUIRES_X86_AVX512F;
34800 for (uint32_t n = 32; n <= 48; n += 16) {
34801 for (size_t k = 1; k <= 5; k += 2) {
34802 GemmMicrokernelTester()
34803 .mr(1)
34804 .nr(16)
34805 .kr(1)
34806 .sr(1)
34807 .m(1)
34808 .n(16)
34809 .k(k)
34810 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34811 }
34812 }
34813 }
34814
34815 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
34816 TEST_REQUIRES_X86_AVX512F;
34817 for (uint32_t n = 32; n <= 48; n += 16) {
34818 for (size_t k = 1; k <= 5; k += 2) {
34819 GemmMicrokernelTester()
34820 .mr(1)
34821 .nr(16)
34822 .kr(1)
34823 .sr(1)
34824 .m(1)
34825 .n(n)
34826 .k(k)
34827 .cn_stride(19)
34828 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34829 }
34830 }
34831 }
34832
34833 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16_strided_a) {
34834 TEST_REQUIRES_X86_AVX512F;
34835 for (uint32_t n = 32; n <= 48; n += 16) {
34836 for (size_t k = 1; k <= 5; k += 2) {
34837 GemmMicrokernelTester()
34838 .mr(1)
34839 .nr(16)
34840 .kr(1)
34841 .sr(1)
34842 .m(1)
34843 .n(n)
34844 .k(k)
34845 .a_stride(7)
34846 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34847 }
34848 }
34849 }
34850
34851 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
34852 TEST_REQUIRES_X86_AVX512F;
34853 for (uint32_t n = 32; n <= 48; n += 16) {
34854 for (size_t k = 1; k <= 5; k += 2) {
34855 for (uint32_t m = 1; m <= 1; m++) {
34856 GemmMicrokernelTester()
34857 .mr(1)
34858 .nr(16)
34859 .kr(1)
34860 .sr(1)
34861 .m(m)
34862 .n(n)
34863 .k(k)
34864 .iterations(1)
34865 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34866 }
34867 }
34868 }
34869 }
34870
34871 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
34872 TEST_REQUIRES_X86_AVX512F;
34873 for (size_t k = 1; k <= 5; k += 2) {
34874 for (uint32_t m = 1; m <= 1; m++) {
34875 for (uint32_t n = 1; n <= 16; n++) {
34876 GemmMicrokernelTester()
34877 .mr(1)
34878 .nr(16)
34879 .kr(1)
34880 .sr(1)
34881 .m(m)
34882 .n(n)
34883 .k(k)
34884 .cm_stride(19)
34885 .iterations(1)
34886 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34887 }
34888 }
34889 }
34890 }
34891
34892 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, qmin) {
34893 TEST_REQUIRES_X86_AVX512F;
34894 GemmMicrokernelTester()
34895 .mr(1)
34896 .nr(16)
34897 .kr(1)
34898 .sr(1)
34899 .m(1)
34900 .n(16)
34901 .k(1)
34902 .qmin(128)
34903 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34904 }
34905
34906 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, qmax) {
34907 TEST_REQUIRES_X86_AVX512F;
34908 GemmMicrokernelTester()
34909 .mr(1)
34910 .nr(16)
34911 .kr(1)
34912 .sr(1)
34913 .m(1)
34914 .n(16)
34915 .k(1)
34916 .qmax(128)
34917 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34918 }
34919
34920 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, strided_cm) {
34921 TEST_REQUIRES_X86_AVX512F;
34922 GemmMicrokernelTester()
34923 .mr(1)
34924 .nr(16)
34925 .kr(1)
34926 .sr(1)
34927 .m(1)
34928 .n(16)
34929 .k(1)
34930 .cm_stride(19)
34931 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
34932 }
34933#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34934
34935
34936#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34937 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1) {
34938 TEST_REQUIRES_X86_AVX512F;
34939 GemmMicrokernelTester()
34940 .mr(4)
34941 .nr(16)
34942 .kr(1)
34943 .sr(1)
34944 .m(4)
34945 .n(16)
34946 .k(1)
34947 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
34948 }
34949
34950 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, strided_cn) {
34951 TEST_REQUIRES_X86_AVX512F;
34952 GemmMicrokernelTester()
34953 .mr(4)
34954 .nr(16)
34955 .kr(1)
34956 .sr(1)
34957 .m(4)
34958 .n(16)
34959 .k(1)
34960 .cn_stride(19)
34961 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
34962 }
34963
34964 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
34965 TEST_REQUIRES_X86_AVX512F;
34966 GemmMicrokernelTester()
34967 .mr(4)
34968 .nr(16)
34969 .kr(1)
34970 .sr(1)
34971 .m(4)
34972 .n(16)
34973 .k(1)
34974 .a_stride(3)
34975 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
34976 }
34977
34978 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_subtile) {
34979 TEST_REQUIRES_X86_AVX512F;
34980 for (uint32_t m = 1; m <= 4; m++) {
34981 for (uint32_t n = 1; n <= 16; n++) {
34982 GemmMicrokernelTester()
34983 .mr(4)
34984 .nr(16)
34985 .kr(1)
34986 .sr(1)
34987 .m(m)
34988 .n(n)
34989 .k(1)
34990 .iterations(1)
34991 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
34992 }
34993 }
34994 }
34995
34996 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
34997 TEST_REQUIRES_X86_AVX512F;
34998 for (uint32_t m = 1; m <= 4; m++) {
34999 GemmMicrokernelTester()
35000 .mr(4)
35001 .nr(16)
35002 .kr(1)
35003 .sr(1)
35004 .m(m)
35005 .n(16)
35006 .k(1)
35007 .iterations(1)
35008 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35009 }
35010 }
35011
35012 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
35013 TEST_REQUIRES_X86_AVX512F;
35014 for (uint32_t n = 1; n <= 16; n++) {
35015 GemmMicrokernelTester()
35016 .mr(4)
35017 .nr(16)
35018 .kr(1)
35019 .sr(1)
35020 .m(4)
35021 .n(n)
35022 .k(1)
35023 .iterations(1)
35024 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35025 }
35026 }
35027
35028 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_gt_1) {
35029 TEST_REQUIRES_X86_AVX512F;
35030 for (size_t k = 2; k < 10; k++) {
35031 GemmMicrokernelTester()
35032 .mr(4)
35033 .nr(16)
35034 .kr(1)
35035 .sr(1)
35036 .m(4)
35037 .n(16)
35038 .k(k)
35039 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35040 }
35041 }
35042
35043 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
35044 TEST_REQUIRES_X86_AVX512F;
35045 for (size_t k = 2; k < 10; k++) {
35046 GemmMicrokernelTester()
35047 .mr(4)
35048 .nr(16)
35049 .kr(1)
35050 .sr(1)
35051 .m(4)
35052 .n(16)
35053 .k(k)
35054 .a_stride(11)
35055 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35056 }
35057 }
35058
35059 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_gt_1_subtile) {
35060 TEST_REQUIRES_X86_AVX512F;
35061 for (size_t k = 2; k < 10; k++) {
35062 for (uint32_t m = 1; m <= 4; m++) {
35063 for (uint32_t n = 1; n <= 16; n++) {
35064 GemmMicrokernelTester()
35065 .mr(4)
35066 .nr(16)
35067 .kr(1)
35068 .sr(1)
35069 .m(m)
35070 .n(n)
35071 .k(k)
35072 .iterations(1)
35073 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35074 }
35075 }
35076 }
35077 }
35078
35079 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16) {
35080 TEST_REQUIRES_X86_AVX512F;
35081 for (uint32_t n = 17; n < 32; n++) {
35082 for (size_t k = 1; k <= 5; k += 2) {
35083 GemmMicrokernelTester()
35084 .mr(4)
35085 .nr(16)
35086 .kr(1)
35087 .sr(1)
35088 .m(4)
35089 .n(16)
35090 .k(k)
35091 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35092 }
35093 }
35094 }
35095
35096 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
35097 TEST_REQUIRES_X86_AVX512F;
35098 for (uint32_t n = 17; n < 32; n++) {
35099 for (size_t k = 1; k <= 5; k += 2) {
35100 GemmMicrokernelTester()
35101 .mr(4)
35102 .nr(16)
35103 .kr(1)
35104 .sr(1)
35105 .m(4)
35106 .n(16)
35107 .k(k)
35108 .cn_stride(19)
35109 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35110 }
35111 }
35112 }
35113
35114 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
35115 TEST_REQUIRES_X86_AVX512F;
35116 for (uint32_t n = 17; n < 32; n++) {
35117 for (size_t k = 1; k <= 5; k += 2) {
35118 GemmMicrokernelTester()
35119 .mr(4)
35120 .nr(16)
35121 .kr(1)
35122 .sr(1)
35123 .m(4)
35124 .n(n)
35125 .k(k)
35126 .a_stride(7)
35127 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35128 }
35129 }
35130 }
35131
35132 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16_subtile) {
35133 TEST_REQUIRES_X86_AVX512F;
35134 for (uint32_t n = 17; n < 32; n++) {
35135 for (size_t k = 1; k <= 5; k += 2) {
35136 for (uint32_t m = 1; m <= 4; m++) {
35137 GemmMicrokernelTester()
35138 .mr(4)
35139 .nr(16)
35140 .kr(1)
35141 .sr(1)
35142 .m(m)
35143 .n(n)
35144 .k(k)
35145 .iterations(1)
35146 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35147 }
35148 }
35149 }
35150 }
35151
35152 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16) {
35153 TEST_REQUIRES_X86_AVX512F;
35154 for (uint32_t n = 32; n <= 48; n += 16) {
35155 for (size_t k = 1; k <= 5; k += 2) {
35156 GemmMicrokernelTester()
35157 .mr(4)
35158 .nr(16)
35159 .kr(1)
35160 .sr(1)
35161 .m(4)
35162 .n(16)
35163 .k(k)
35164 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35165 }
35166 }
35167 }
35168
35169 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
35170 TEST_REQUIRES_X86_AVX512F;
35171 for (uint32_t n = 32; n <= 48; n += 16) {
35172 for (size_t k = 1; k <= 5; k += 2) {
35173 GemmMicrokernelTester()
35174 .mr(4)
35175 .nr(16)
35176 .kr(1)
35177 .sr(1)
35178 .m(4)
35179 .n(n)
35180 .k(k)
35181 .cn_stride(19)
35182 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35183 }
35184 }
35185 }
35186
35187 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16_strided_a) {
35188 TEST_REQUIRES_X86_AVX512F;
35189 for (uint32_t n = 32; n <= 48; n += 16) {
35190 for (size_t k = 1; k <= 5; k += 2) {
35191 GemmMicrokernelTester()
35192 .mr(4)
35193 .nr(16)
35194 .kr(1)
35195 .sr(1)
35196 .m(4)
35197 .n(n)
35198 .k(k)
35199 .a_stride(7)
35200 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35201 }
35202 }
35203 }
35204
35205 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16_subtile) {
35206 TEST_REQUIRES_X86_AVX512F;
35207 for (uint32_t n = 32; n <= 48; n += 16) {
35208 for (size_t k = 1; k <= 5; k += 2) {
35209 for (uint32_t m = 1; m <= 4; m++) {
35210 GemmMicrokernelTester()
35211 .mr(4)
35212 .nr(16)
35213 .kr(1)
35214 .sr(1)
35215 .m(m)
35216 .n(n)
35217 .k(k)
35218 .iterations(1)
35219 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35220 }
35221 }
35222 }
35223 }
35224
35225 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, strided_cm_subtile) {
35226 TEST_REQUIRES_X86_AVX512F;
35227 for (size_t k = 1; k <= 5; k += 2) {
35228 for (uint32_t m = 1; m <= 4; m++) {
35229 for (uint32_t n = 1; n <= 16; n++) {
35230 GemmMicrokernelTester()
35231 .mr(4)
35232 .nr(16)
35233 .kr(1)
35234 .sr(1)
35235 .m(m)
35236 .n(n)
35237 .k(k)
35238 .cm_stride(19)
35239 .iterations(1)
35240 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35241 }
35242 }
35243 }
35244 }
35245
35246 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, qmin) {
35247 TEST_REQUIRES_X86_AVX512F;
35248 GemmMicrokernelTester()
35249 .mr(4)
35250 .nr(16)
35251 .kr(1)
35252 .sr(1)
35253 .m(4)
35254 .n(16)
35255 .k(1)
35256 .qmin(128)
35257 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35258 }
35259
35260 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, qmax) {
35261 TEST_REQUIRES_X86_AVX512F;
35262 GemmMicrokernelTester()
35263 .mr(4)
35264 .nr(16)
35265 .kr(1)
35266 .sr(1)
35267 .m(4)
35268 .n(16)
35269 .k(1)
35270 .qmax(128)
35271 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35272 }
35273
35274 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, strided_cm) {
35275 TEST_REQUIRES_X86_AVX512F;
35276 GemmMicrokernelTester()
35277 .mr(4)
35278 .nr(16)
35279 .kr(1)
35280 .sr(1)
35281 .m(4)
35282 .n(16)
35283 .k(1)
35284 .cm_stride(19)
35285 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
35286 }
35287#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35288
35289
35290#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35291 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1) {
35292 TEST_REQUIRES_X86_AVX512F;
35293 GemmMicrokernelTester()
35294 .mr(5)
35295 .nr(16)
35296 .kr(1)
35297 .sr(1)
35298 .m(5)
35299 .n(16)
35300 .k(1)
35301 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35302 }
35303
35304 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, strided_cn) {
35305 TEST_REQUIRES_X86_AVX512F;
35306 GemmMicrokernelTester()
35307 .mr(5)
35308 .nr(16)
35309 .kr(1)
35310 .sr(1)
35311 .m(5)
35312 .n(16)
35313 .k(1)
35314 .cn_stride(19)
35315 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35316 }
35317
35318 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
35319 TEST_REQUIRES_X86_AVX512F;
35320 GemmMicrokernelTester()
35321 .mr(5)
35322 .nr(16)
35323 .kr(1)
35324 .sr(1)
35325 .m(5)
35326 .n(16)
35327 .k(1)
35328 .a_stride(3)
35329 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35330 }
35331
35332 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_subtile) {
35333 TEST_REQUIRES_X86_AVX512F;
35334 for (uint32_t m = 1; m <= 5; m++) {
35335 for (uint32_t n = 1; n <= 16; n++) {
35336 GemmMicrokernelTester()
35337 .mr(5)
35338 .nr(16)
35339 .kr(1)
35340 .sr(1)
35341 .m(m)
35342 .n(n)
35343 .k(1)
35344 .iterations(1)
35345 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35346 }
35347 }
35348 }
35349
35350 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
35351 TEST_REQUIRES_X86_AVX512F;
35352 for (uint32_t m = 1; m <= 5; m++) {
35353 GemmMicrokernelTester()
35354 .mr(5)
35355 .nr(16)
35356 .kr(1)
35357 .sr(1)
35358 .m(m)
35359 .n(16)
35360 .k(1)
35361 .iterations(1)
35362 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35363 }
35364 }
35365
35366 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
35367 TEST_REQUIRES_X86_AVX512F;
35368 for (uint32_t n = 1; n <= 16; n++) {
35369 GemmMicrokernelTester()
35370 .mr(5)
35371 .nr(16)
35372 .kr(1)
35373 .sr(1)
35374 .m(5)
35375 .n(n)
35376 .k(1)
35377 .iterations(1)
35378 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35379 }
35380 }
35381
35382 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_gt_1) {
35383 TEST_REQUIRES_X86_AVX512F;
35384 for (size_t k = 2; k < 10; k++) {
35385 GemmMicrokernelTester()
35386 .mr(5)
35387 .nr(16)
35388 .kr(1)
35389 .sr(1)
35390 .m(5)
35391 .n(16)
35392 .k(k)
35393 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35394 }
35395 }
35396
35397 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
35398 TEST_REQUIRES_X86_AVX512F;
35399 for (size_t k = 2; k < 10; k++) {
35400 GemmMicrokernelTester()
35401 .mr(5)
35402 .nr(16)
35403 .kr(1)
35404 .sr(1)
35405 .m(5)
35406 .n(16)
35407 .k(k)
35408 .a_stride(11)
35409 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35410 }
35411 }
35412
35413 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_gt_1_subtile) {
35414 TEST_REQUIRES_X86_AVX512F;
35415 for (size_t k = 2; k < 10; k++) {
35416 for (uint32_t m = 1; m <= 5; m++) {
35417 for (uint32_t n = 1; n <= 16; n++) {
35418 GemmMicrokernelTester()
35419 .mr(5)
35420 .nr(16)
35421 .kr(1)
35422 .sr(1)
35423 .m(m)
35424 .n(n)
35425 .k(k)
35426 .iterations(1)
35427 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35428 }
35429 }
35430 }
35431 }
35432
35433 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16) {
35434 TEST_REQUIRES_X86_AVX512F;
35435 for (uint32_t n = 17; n < 32; n++) {
35436 for (size_t k = 1; k <= 5; k += 2) {
35437 GemmMicrokernelTester()
35438 .mr(5)
35439 .nr(16)
35440 .kr(1)
35441 .sr(1)
35442 .m(5)
35443 .n(16)
35444 .k(k)
35445 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35446 }
35447 }
35448 }
35449
35450 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
35451 TEST_REQUIRES_X86_AVX512F;
35452 for (uint32_t n = 17; n < 32; n++) {
35453 for (size_t k = 1; k <= 5; k += 2) {
35454 GemmMicrokernelTester()
35455 .mr(5)
35456 .nr(16)
35457 .kr(1)
35458 .sr(1)
35459 .m(5)
35460 .n(16)
35461 .k(k)
35462 .cn_stride(19)
35463 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35464 }
35465 }
35466 }
35467
35468 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
35469 TEST_REQUIRES_X86_AVX512F;
35470 for (uint32_t n = 17; n < 32; n++) {
35471 for (size_t k = 1; k <= 5; k += 2) {
35472 GemmMicrokernelTester()
35473 .mr(5)
35474 .nr(16)
35475 .kr(1)
35476 .sr(1)
35477 .m(5)
35478 .n(n)
35479 .k(k)
35480 .a_stride(7)
35481 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35482 }
35483 }
35484 }
35485
35486 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16_subtile) {
35487 TEST_REQUIRES_X86_AVX512F;
35488 for (uint32_t n = 17; n < 32; n++) {
35489 for (size_t k = 1; k <= 5; k += 2) {
35490 for (uint32_t m = 1; m <= 5; m++) {
35491 GemmMicrokernelTester()
35492 .mr(5)
35493 .nr(16)
35494 .kr(1)
35495 .sr(1)
35496 .m(m)
35497 .n(n)
35498 .k(k)
35499 .iterations(1)
35500 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35501 }
35502 }
35503 }
35504 }
35505
35506 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16) {
35507 TEST_REQUIRES_X86_AVX512F;
35508 for (uint32_t n = 32; n <= 48; n += 16) {
35509 for (size_t k = 1; k <= 5; k += 2) {
35510 GemmMicrokernelTester()
35511 .mr(5)
35512 .nr(16)
35513 .kr(1)
35514 .sr(1)
35515 .m(5)
35516 .n(16)
35517 .k(k)
35518 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35519 }
35520 }
35521 }
35522
35523 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
35524 TEST_REQUIRES_X86_AVX512F;
35525 for (uint32_t n = 32; n <= 48; n += 16) {
35526 for (size_t k = 1; k <= 5; k += 2) {
35527 GemmMicrokernelTester()
35528 .mr(5)
35529 .nr(16)
35530 .kr(1)
35531 .sr(1)
35532 .m(5)
35533 .n(n)
35534 .k(k)
35535 .cn_stride(19)
35536 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35537 }
35538 }
35539 }
35540
35541 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16_strided_a) {
35542 TEST_REQUIRES_X86_AVX512F;
35543 for (uint32_t n = 32; n <= 48; n += 16) {
35544 for (size_t k = 1; k <= 5; k += 2) {
35545 GemmMicrokernelTester()
35546 .mr(5)
35547 .nr(16)
35548 .kr(1)
35549 .sr(1)
35550 .m(5)
35551 .n(n)
35552 .k(k)
35553 .a_stride(7)
35554 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35555 }
35556 }
35557 }
35558
35559 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16_subtile) {
35560 TEST_REQUIRES_X86_AVX512F;
35561 for (uint32_t n = 32; n <= 48; n += 16) {
35562 for (size_t k = 1; k <= 5; k += 2) {
35563 for (uint32_t m = 1; m <= 5; m++) {
35564 GemmMicrokernelTester()
35565 .mr(5)
35566 .nr(16)
35567 .kr(1)
35568 .sr(1)
35569 .m(m)
35570 .n(n)
35571 .k(k)
35572 .iterations(1)
35573 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35574 }
35575 }
35576 }
35577 }
35578
35579 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, strided_cm_subtile) {
35580 TEST_REQUIRES_X86_AVX512F;
35581 for (size_t k = 1; k <= 5; k += 2) {
35582 for (uint32_t m = 1; m <= 5; m++) {
35583 for (uint32_t n = 1; n <= 16; n++) {
35584 GemmMicrokernelTester()
35585 .mr(5)
35586 .nr(16)
35587 .kr(1)
35588 .sr(1)
35589 .m(m)
35590 .n(n)
35591 .k(k)
35592 .cm_stride(19)
35593 .iterations(1)
35594 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35595 }
35596 }
35597 }
35598 }
35599
35600 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, qmin) {
35601 TEST_REQUIRES_X86_AVX512F;
35602 GemmMicrokernelTester()
35603 .mr(5)
35604 .nr(16)
35605 .kr(1)
35606 .sr(1)
35607 .m(5)
35608 .n(16)
35609 .k(1)
35610 .qmin(128)
35611 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35612 }
35613
35614 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, qmax) {
35615 TEST_REQUIRES_X86_AVX512F;
35616 GemmMicrokernelTester()
35617 .mr(5)
35618 .nr(16)
35619 .kr(1)
35620 .sr(1)
35621 .m(5)
35622 .n(16)
35623 .k(1)
35624 .qmax(128)
35625 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35626 }
35627
35628 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, strided_cm) {
35629 TEST_REQUIRES_X86_AVX512F;
35630 GemmMicrokernelTester()
35631 .mr(5)
35632 .nr(16)
35633 .kr(1)
35634 .sr(1)
35635 .m(5)
35636 .n(16)
35637 .k(1)
35638 .cm_stride(19)
35639 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
35640 }
35641#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35642
35643
35644#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35645 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1) {
35646 TEST_REQUIRES_X86_AVX512F;
35647 GemmMicrokernelTester()
35648 .mr(6)
35649 .nr(16)
35650 .kr(1)
35651 .sr(1)
35652 .m(6)
35653 .n(16)
35654 .k(1)
35655 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35656 }
35657
35658 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, strided_cn) {
35659 TEST_REQUIRES_X86_AVX512F;
35660 GemmMicrokernelTester()
35661 .mr(6)
35662 .nr(16)
35663 .kr(1)
35664 .sr(1)
35665 .m(6)
35666 .n(16)
35667 .k(1)
35668 .cn_stride(19)
35669 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35670 }
35671
35672 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
35673 TEST_REQUIRES_X86_AVX512F;
35674 GemmMicrokernelTester()
35675 .mr(6)
35676 .nr(16)
35677 .kr(1)
35678 .sr(1)
35679 .m(6)
35680 .n(16)
35681 .k(1)
35682 .a_stride(3)
35683 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35684 }
35685
35686 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
35687 TEST_REQUIRES_X86_AVX512F;
35688 for (uint32_t m = 1; m <= 6; m++) {
35689 for (uint32_t n = 1; n <= 16; n++) {
35690 GemmMicrokernelTester()
35691 .mr(6)
35692 .nr(16)
35693 .kr(1)
35694 .sr(1)
35695 .m(m)
35696 .n(n)
35697 .k(1)
35698 .iterations(1)
35699 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35700 }
35701 }
35702 }
35703
35704 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
35705 TEST_REQUIRES_X86_AVX512F;
35706 for (uint32_t m = 1; m <= 6; m++) {
35707 GemmMicrokernelTester()
35708 .mr(6)
35709 .nr(16)
35710 .kr(1)
35711 .sr(1)
35712 .m(m)
35713 .n(16)
35714 .k(1)
35715 .iterations(1)
35716 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35717 }
35718 }
35719
35720 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
35721 TEST_REQUIRES_X86_AVX512F;
35722 for (uint32_t n = 1; n <= 16; n++) {
35723 GemmMicrokernelTester()
35724 .mr(6)
35725 .nr(16)
35726 .kr(1)
35727 .sr(1)
35728 .m(6)
35729 .n(n)
35730 .k(1)
35731 .iterations(1)
35732 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35733 }
35734 }
35735
35736 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_gt_1) {
35737 TEST_REQUIRES_X86_AVX512F;
35738 for (size_t k = 2; k < 10; k++) {
35739 GemmMicrokernelTester()
35740 .mr(6)
35741 .nr(16)
35742 .kr(1)
35743 .sr(1)
35744 .m(6)
35745 .n(16)
35746 .k(k)
35747 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35748 }
35749 }
35750
35751 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
35752 TEST_REQUIRES_X86_AVX512F;
35753 for (size_t k = 2; k < 10; k++) {
35754 GemmMicrokernelTester()
35755 .mr(6)
35756 .nr(16)
35757 .kr(1)
35758 .sr(1)
35759 .m(6)
35760 .n(16)
35761 .k(k)
35762 .a_stride(11)
35763 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35764 }
35765 }
35766
35767 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
35768 TEST_REQUIRES_X86_AVX512F;
35769 for (size_t k = 2; k < 10; k++) {
35770 for (uint32_t m = 1; m <= 6; m++) {
35771 for (uint32_t n = 1; n <= 16; n++) {
35772 GemmMicrokernelTester()
35773 .mr(6)
35774 .nr(16)
35775 .kr(1)
35776 .sr(1)
35777 .m(m)
35778 .n(n)
35779 .k(k)
35780 .iterations(1)
35781 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35782 }
35783 }
35784 }
35785 }
35786
35787 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16) {
35788 TEST_REQUIRES_X86_AVX512F;
35789 for (uint32_t n = 17; n < 32; n++) {
35790 for (size_t k = 1; k <= 5; k += 2) {
35791 GemmMicrokernelTester()
35792 .mr(6)
35793 .nr(16)
35794 .kr(1)
35795 .sr(1)
35796 .m(6)
35797 .n(16)
35798 .k(k)
35799 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35800 }
35801 }
35802 }
35803
35804 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
35805 TEST_REQUIRES_X86_AVX512F;
35806 for (uint32_t n = 17; n < 32; n++) {
35807 for (size_t k = 1; k <= 5; k += 2) {
35808 GemmMicrokernelTester()
35809 .mr(6)
35810 .nr(16)
35811 .kr(1)
35812 .sr(1)
35813 .m(6)
35814 .n(16)
35815 .k(k)
35816 .cn_stride(19)
35817 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35818 }
35819 }
35820 }
35821
35822 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
35823 TEST_REQUIRES_X86_AVX512F;
35824 for (uint32_t n = 17; n < 32; n++) {
35825 for (size_t k = 1; k <= 5; k += 2) {
35826 GemmMicrokernelTester()
35827 .mr(6)
35828 .nr(16)
35829 .kr(1)
35830 .sr(1)
35831 .m(6)
35832 .n(n)
35833 .k(k)
35834 .a_stride(7)
35835 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35836 }
35837 }
35838 }
35839
35840 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
35841 TEST_REQUIRES_X86_AVX512F;
35842 for (uint32_t n = 17; n < 32; n++) {
35843 for (size_t k = 1; k <= 5; k += 2) {
35844 for (uint32_t m = 1; m <= 6; m++) {
35845 GemmMicrokernelTester()
35846 .mr(6)
35847 .nr(16)
35848 .kr(1)
35849 .sr(1)
35850 .m(m)
35851 .n(n)
35852 .k(k)
35853 .iterations(1)
35854 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35855 }
35856 }
35857 }
35858 }
35859
35860 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16) {
35861 TEST_REQUIRES_X86_AVX512F;
35862 for (uint32_t n = 32; n <= 48; n += 16) {
35863 for (size_t k = 1; k <= 5; k += 2) {
35864 GemmMicrokernelTester()
35865 .mr(6)
35866 .nr(16)
35867 .kr(1)
35868 .sr(1)
35869 .m(6)
35870 .n(16)
35871 .k(k)
35872 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35873 }
35874 }
35875 }
35876
35877 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
35878 TEST_REQUIRES_X86_AVX512F;
35879 for (uint32_t n = 32; n <= 48; n += 16) {
35880 for (size_t k = 1; k <= 5; k += 2) {
35881 GemmMicrokernelTester()
35882 .mr(6)
35883 .nr(16)
35884 .kr(1)
35885 .sr(1)
35886 .m(6)
35887 .n(n)
35888 .k(k)
35889 .cn_stride(19)
35890 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35891 }
35892 }
35893 }
35894
35895 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16_strided_a) {
35896 TEST_REQUIRES_X86_AVX512F;
35897 for (uint32_t n = 32; n <= 48; n += 16) {
35898 for (size_t k = 1; k <= 5; k += 2) {
35899 GemmMicrokernelTester()
35900 .mr(6)
35901 .nr(16)
35902 .kr(1)
35903 .sr(1)
35904 .m(6)
35905 .n(n)
35906 .k(k)
35907 .a_stride(7)
35908 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35909 }
35910 }
35911 }
35912
35913 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
35914 TEST_REQUIRES_X86_AVX512F;
35915 for (uint32_t n = 32; n <= 48; n += 16) {
35916 for (size_t k = 1; k <= 5; k += 2) {
35917 for (uint32_t m = 1; m <= 6; m++) {
35918 GemmMicrokernelTester()
35919 .mr(6)
35920 .nr(16)
35921 .kr(1)
35922 .sr(1)
35923 .m(m)
35924 .n(n)
35925 .k(k)
35926 .iterations(1)
35927 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35928 }
35929 }
35930 }
35931 }
35932
35933 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
35934 TEST_REQUIRES_X86_AVX512F;
35935 for (size_t k = 1; k <= 5; k += 2) {
35936 for (uint32_t m = 1; m <= 6; m++) {
35937 for (uint32_t n = 1; n <= 16; n++) {
35938 GemmMicrokernelTester()
35939 .mr(6)
35940 .nr(16)
35941 .kr(1)
35942 .sr(1)
35943 .m(m)
35944 .n(n)
35945 .k(k)
35946 .cm_stride(19)
35947 .iterations(1)
35948 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35949 }
35950 }
35951 }
35952 }
35953
35954 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, qmin) {
35955 TEST_REQUIRES_X86_AVX512F;
35956 GemmMicrokernelTester()
35957 .mr(6)
35958 .nr(16)
35959 .kr(1)
35960 .sr(1)
35961 .m(6)
35962 .n(16)
35963 .k(1)
35964 .qmin(128)
35965 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35966 }
35967
35968 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, qmax) {
35969 TEST_REQUIRES_X86_AVX512F;
35970 GemmMicrokernelTester()
35971 .mr(6)
35972 .nr(16)
35973 .kr(1)
35974 .sr(1)
35975 .m(6)
35976 .n(16)
35977 .k(1)
35978 .qmax(128)
35979 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35980 }
35981
35982 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, strided_cm) {
35983 TEST_REQUIRES_X86_AVX512F;
35984 GemmMicrokernelTester()
35985 .mr(6)
35986 .nr(16)
35987 .kr(1)
35988 .sr(1)
35989 .m(6)
35990 .n(16)
35991 .k(1)
35992 .cm_stride(19)
35993 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
35994 }
35995#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35996
35997
35998#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35999 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1) {
36000 TEST_REQUIRES_X86_AVX512F;
36001 GemmMicrokernelTester()
36002 .mr(7)
36003 .nr(16)
36004 .kr(1)
36005 .sr(1)
36006 .m(7)
36007 .n(16)
36008 .k(1)
36009 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36010 }
36011
36012 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, strided_cn) {
36013 TEST_REQUIRES_X86_AVX512F;
36014 GemmMicrokernelTester()
36015 .mr(7)
36016 .nr(16)
36017 .kr(1)
36018 .sr(1)
36019 .m(7)
36020 .n(16)
36021 .k(1)
36022 .cn_stride(19)
36023 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36024 }
36025
36026 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
36027 TEST_REQUIRES_X86_AVX512F;
36028 GemmMicrokernelTester()
36029 .mr(7)
36030 .nr(16)
36031 .kr(1)
36032 .sr(1)
36033 .m(7)
36034 .n(16)
36035 .k(1)
36036 .a_stride(3)
36037 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36038 }
36039
36040 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36041 TEST_REQUIRES_X86_AVX512F;
36042 for (uint32_t m = 1; m <= 7; m++) {
36043 for (uint32_t n = 1; n <= 16; n++) {
36044 GemmMicrokernelTester()
36045 .mr(7)
36046 .nr(16)
36047 .kr(1)
36048 .sr(1)
36049 .m(m)
36050 .n(n)
36051 .k(1)
36052 .iterations(1)
36053 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36054 }
36055 }
36056 }
36057
36058 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36059 TEST_REQUIRES_X86_AVX512F;
36060 for (uint32_t m = 1; m <= 7; m++) {
36061 GemmMicrokernelTester()
36062 .mr(7)
36063 .nr(16)
36064 .kr(1)
36065 .sr(1)
36066 .m(m)
36067 .n(16)
36068 .k(1)
36069 .iterations(1)
36070 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36071 }
36072 }
36073
36074 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36075 TEST_REQUIRES_X86_AVX512F;
36076 for (uint32_t n = 1; n <= 16; n++) {
36077 GemmMicrokernelTester()
36078 .mr(7)
36079 .nr(16)
36080 .kr(1)
36081 .sr(1)
36082 .m(7)
36083 .n(n)
36084 .k(1)
36085 .iterations(1)
36086 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36087 }
36088 }
36089
36090 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_gt_1) {
36091 TEST_REQUIRES_X86_AVX512F;
36092 for (size_t k = 2; k < 10; k++) {
36093 GemmMicrokernelTester()
36094 .mr(7)
36095 .nr(16)
36096 .kr(1)
36097 .sr(1)
36098 .m(7)
36099 .n(16)
36100 .k(k)
36101 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36102 }
36103 }
36104
36105 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
36106 TEST_REQUIRES_X86_AVX512F;
36107 for (size_t k = 2; k < 10; k++) {
36108 GemmMicrokernelTester()
36109 .mr(7)
36110 .nr(16)
36111 .kr(1)
36112 .sr(1)
36113 .m(7)
36114 .n(16)
36115 .k(k)
36116 .a_stride(11)
36117 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36118 }
36119 }
36120
36121 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36122 TEST_REQUIRES_X86_AVX512F;
36123 for (size_t k = 2; k < 10; k++) {
36124 for (uint32_t m = 1; m <= 7; m++) {
36125 for (uint32_t n = 1; n <= 16; n++) {
36126 GemmMicrokernelTester()
36127 .mr(7)
36128 .nr(16)
36129 .kr(1)
36130 .sr(1)
36131 .m(m)
36132 .n(n)
36133 .k(k)
36134 .iterations(1)
36135 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36136 }
36137 }
36138 }
36139 }
36140
36141 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16) {
36142 TEST_REQUIRES_X86_AVX512F;
36143 for (uint32_t n = 17; n < 32; n++) {
36144 for (size_t k = 1; k <= 5; k += 2) {
36145 GemmMicrokernelTester()
36146 .mr(7)
36147 .nr(16)
36148 .kr(1)
36149 .sr(1)
36150 .m(7)
36151 .n(16)
36152 .k(k)
36153 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36154 }
36155 }
36156 }
36157
36158 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
36159 TEST_REQUIRES_X86_AVX512F;
36160 for (uint32_t n = 17; n < 32; n++) {
36161 for (size_t k = 1; k <= 5; k += 2) {
36162 GemmMicrokernelTester()
36163 .mr(7)
36164 .nr(16)
36165 .kr(1)
36166 .sr(1)
36167 .m(7)
36168 .n(16)
36169 .k(k)
36170 .cn_stride(19)
36171 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36172 }
36173 }
36174 }
36175
36176 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
36177 TEST_REQUIRES_X86_AVX512F;
36178 for (uint32_t n = 17; n < 32; n++) {
36179 for (size_t k = 1; k <= 5; k += 2) {
36180 GemmMicrokernelTester()
36181 .mr(7)
36182 .nr(16)
36183 .kr(1)
36184 .sr(1)
36185 .m(7)
36186 .n(n)
36187 .k(k)
36188 .a_stride(7)
36189 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36190 }
36191 }
36192 }
36193
36194 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
36195 TEST_REQUIRES_X86_AVX512F;
36196 for (uint32_t n = 17; n < 32; n++) {
36197 for (size_t k = 1; k <= 5; k += 2) {
36198 for (uint32_t m = 1; m <= 7; m++) {
36199 GemmMicrokernelTester()
36200 .mr(7)
36201 .nr(16)
36202 .kr(1)
36203 .sr(1)
36204 .m(m)
36205 .n(n)
36206 .k(k)
36207 .iterations(1)
36208 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36209 }
36210 }
36211 }
36212 }
36213
36214 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16) {
36215 TEST_REQUIRES_X86_AVX512F;
36216 for (uint32_t n = 32; n <= 48; n += 16) {
36217 for (size_t k = 1; k <= 5; k += 2) {
36218 GemmMicrokernelTester()
36219 .mr(7)
36220 .nr(16)
36221 .kr(1)
36222 .sr(1)
36223 .m(7)
36224 .n(16)
36225 .k(k)
36226 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36227 }
36228 }
36229 }
36230
36231 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
36232 TEST_REQUIRES_X86_AVX512F;
36233 for (uint32_t n = 32; n <= 48; n += 16) {
36234 for (size_t k = 1; k <= 5; k += 2) {
36235 GemmMicrokernelTester()
36236 .mr(7)
36237 .nr(16)
36238 .kr(1)
36239 .sr(1)
36240 .m(7)
36241 .n(n)
36242 .k(k)
36243 .cn_stride(19)
36244 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36245 }
36246 }
36247 }
36248
36249 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16_strided_a) {
36250 TEST_REQUIRES_X86_AVX512F;
36251 for (uint32_t n = 32; n <= 48; n += 16) {
36252 for (size_t k = 1; k <= 5; k += 2) {
36253 GemmMicrokernelTester()
36254 .mr(7)
36255 .nr(16)
36256 .kr(1)
36257 .sr(1)
36258 .m(7)
36259 .n(n)
36260 .k(k)
36261 .a_stride(7)
36262 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36263 }
36264 }
36265 }
36266
36267 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
36268 TEST_REQUIRES_X86_AVX512F;
36269 for (uint32_t n = 32; n <= 48; n += 16) {
36270 for (size_t k = 1; k <= 5; k += 2) {
36271 for (uint32_t m = 1; m <= 7; m++) {
36272 GemmMicrokernelTester()
36273 .mr(7)
36274 .nr(16)
36275 .kr(1)
36276 .sr(1)
36277 .m(m)
36278 .n(n)
36279 .k(k)
36280 .iterations(1)
36281 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36282 }
36283 }
36284 }
36285 }
36286
36287 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
36288 TEST_REQUIRES_X86_AVX512F;
36289 for (size_t k = 1; k <= 5; k += 2) {
36290 for (uint32_t m = 1; m <= 7; m++) {
36291 for (uint32_t n = 1; n <= 16; n++) {
36292 GemmMicrokernelTester()
36293 .mr(7)
36294 .nr(16)
36295 .kr(1)
36296 .sr(1)
36297 .m(m)
36298 .n(n)
36299 .k(k)
36300 .cm_stride(19)
36301 .iterations(1)
36302 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36303 }
36304 }
36305 }
36306 }
36307
36308 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, qmin) {
36309 TEST_REQUIRES_X86_AVX512F;
36310 GemmMicrokernelTester()
36311 .mr(7)
36312 .nr(16)
36313 .kr(1)
36314 .sr(1)
36315 .m(7)
36316 .n(16)
36317 .k(1)
36318 .qmin(128)
36319 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36320 }
36321
36322 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, qmax) {
36323 TEST_REQUIRES_X86_AVX512F;
36324 GemmMicrokernelTester()
36325 .mr(7)
36326 .nr(16)
36327 .kr(1)
36328 .sr(1)
36329 .m(7)
36330 .n(16)
36331 .k(1)
36332 .qmax(128)
36333 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36334 }
36335
36336 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, strided_cm) {
36337 TEST_REQUIRES_X86_AVX512F;
36338 GemmMicrokernelTester()
36339 .mr(7)
36340 .nr(16)
36341 .kr(1)
36342 .sr(1)
36343 .m(7)
36344 .n(16)
36345 .k(1)
36346 .cm_stride(19)
36347 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
36348 }
36349#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36350
36351
36352#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36353 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1) {
36354 TEST_REQUIRES_X86_AVX512F;
36355 GemmMicrokernelTester()
36356 .mr(8)
36357 .nr(16)
36358 .kr(1)
36359 .sr(1)
36360 .m(8)
36361 .n(16)
36362 .k(1)
36363 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36364 }
36365
36366 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, strided_cn) {
36367 TEST_REQUIRES_X86_AVX512F;
36368 GemmMicrokernelTester()
36369 .mr(8)
36370 .nr(16)
36371 .kr(1)
36372 .sr(1)
36373 .m(8)
36374 .n(16)
36375 .k(1)
36376 .cn_stride(19)
36377 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36378 }
36379
36380 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
36381 TEST_REQUIRES_X86_AVX512F;
36382 GemmMicrokernelTester()
36383 .mr(8)
36384 .nr(16)
36385 .kr(1)
36386 .sr(1)
36387 .m(8)
36388 .n(16)
36389 .k(1)
36390 .a_stride(3)
36391 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36392 }
36393
36394 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36395 TEST_REQUIRES_X86_AVX512F;
36396 for (uint32_t m = 1; m <= 8; m++) {
36397 for (uint32_t n = 1; n <= 16; n++) {
36398 GemmMicrokernelTester()
36399 .mr(8)
36400 .nr(16)
36401 .kr(1)
36402 .sr(1)
36403 .m(m)
36404 .n(n)
36405 .k(1)
36406 .iterations(1)
36407 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36408 }
36409 }
36410 }
36411
36412 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36413 TEST_REQUIRES_X86_AVX512F;
36414 for (uint32_t m = 1; m <= 8; m++) {
36415 GemmMicrokernelTester()
36416 .mr(8)
36417 .nr(16)
36418 .kr(1)
36419 .sr(1)
36420 .m(m)
36421 .n(16)
36422 .k(1)
36423 .iterations(1)
36424 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36425 }
36426 }
36427
36428 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36429 TEST_REQUIRES_X86_AVX512F;
36430 for (uint32_t n = 1; n <= 16; n++) {
36431 GemmMicrokernelTester()
36432 .mr(8)
36433 .nr(16)
36434 .kr(1)
36435 .sr(1)
36436 .m(8)
36437 .n(n)
36438 .k(1)
36439 .iterations(1)
36440 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36441 }
36442 }
36443
36444 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_gt_1) {
36445 TEST_REQUIRES_X86_AVX512F;
36446 for (size_t k = 2; k < 10; k++) {
36447 GemmMicrokernelTester()
36448 .mr(8)
36449 .nr(16)
36450 .kr(1)
36451 .sr(1)
36452 .m(8)
36453 .n(16)
36454 .k(k)
36455 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36456 }
36457 }
36458
36459 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
36460 TEST_REQUIRES_X86_AVX512F;
36461 for (size_t k = 2; k < 10; k++) {
36462 GemmMicrokernelTester()
36463 .mr(8)
36464 .nr(16)
36465 .kr(1)
36466 .sr(1)
36467 .m(8)
36468 .n(16)
36469 .k(k)
36470 .a_stride(11)
36471 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36472 }
36473 }
36474
36475 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36476 TEST_REQUIRES_X86_AVX512F;
36477 for (size_t k = 2; k < 10; k++) {
36478 for (uint32_t m = 1; m <= 8; m++) {
36479 for (uint32_t n = 1; n <= 16; n++) {
36480 GemmMicrokernelTester()
36481 .mr(8)
36482 .nr(16)
36483 .kr(1)
36484 .sr(1)
36485 .m(m)
36486 .n(n)
36487 .k(k)
36488 .iterations(1)
36489 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36490 }
36491 }
36492 }
36493 }
36494
36495 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16) {
36496 TEST_REQUIRES_X86_AVX512F;
36497 for (uint32_t n = 17; n < 32; n++) {
36498 for (size_t k = 1; k <= 5; k += 2) {
36499 GemmMicrokernelTester()
36500 .mr(8)
36501 .nr(16)
36502 .kr(1)
36503 .sr(1)
36504 .m(8)
36505 .n(16)
36506 .k(k)
36507 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36508 }
36509 }
36510 }
36511
36512 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
36513 TEST_REQUIRES_X86_AVX512F;
36514 for (uint32_t n = 17; n < 32; n++) {
36515 for (size_t k = 1; k <= 5; k += 2) {
36516 GemmMicrokernelTester()
36517 .mr(8)
36518 .nr(16)
36519 .kr(1)
36520 .sr(1)
36521 .m(8)
36522 .n(16)
36523 .k(k)
36524 .cn_stride(19)
36525 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36526 }
36527 }
36528 }
36529
36530 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
36531 TEST_REQUIRES_X86_AVX512F;
36532 for (uint32_t n = 17; n < 32; n++) {
36533 for (size_t k = 1; k <= 5; k += 2) {
36534 GemmMicrokernelTester()
36535 .mr(8)
36536 .nr(16)
36537 .kr(1)
36538 .sr(1)
36539 .m(8)
36540 .n(n)
36541 .k(k)
36542 .a_stride(7)
36543 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36544 }
36545 }
36546 }
36547
36548 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
36549 TEST_REQUIRES_X86_AVX512F;
36550 for (uint32_t n = 17; n < 32; n++) {
36551 for (size_t k = 1; k <= 5; k += 2) {
36552 for (uint32_t m = 1; m <= 8; m++) {
36553 GemmMicrokernelTester()
36554 .mr(8)
36555 .nr(16)
36556 .kr(1)
36557 .sr(1)
36558 .m(m)
36559 .n(n)
36560 .k(k)
36561 .iterations(1)
36562 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36563 }
36564 }
36565 }
36566 }
36567
36568 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16) {
36569 TEST_REQUIRES_X86_AVX512F;
36570 for (uint32_t n = 32; n <= 48; n += 16) {
36571 for (size_t k = 1; k <= 5; k += 2) {
36572 GemmMicrokernelTester()
36573 .mr(8)
36574 .nr(16)
36575 .kr(1)
36576 .sr(1)
36577 .m(8)
36578 .n(16)
36579 .k(k)
36580 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36581 }
36582 }
36583 }
36584
36585 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
36586 TEST_REQUIRES_X86_AVX512F;
36587 for (uint32_t n = 32; n <= 48; n += 16) {
36588 for (size_t k = 1; k <= 5; k += 2) {
36589 GemmMicrokernelTester()
36590 .mr(8)
36591 .nr(16)
36592 .kr(1)
36593 .sr(1)
36594 .m(8)
36595 .n(n)
36596 .k(k)
36597 .cn_stride(19)
36598 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36599 }
36600 }
36601 }
36602
36603 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16_strided_a) {
36604 TEST_REQUIRES_X86_AVX512F;
36605 for (uint32_t n = 32; n <= 48; n += 16) {
36606 for (size_t k = 1; k <= 5; k += 2) {
36607 GemmMicrokernelTester()
36608 .mr(8)
36609 .nr(16)
36610 .kr(1)
36611 .sr(1)
36612 .m(8)
36613 .n(n)
36614 .k(k)
36615 .a_stride(7)
36616 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36617 }
36618 }
36619 }
36620
36621 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
36622 TEST_REQUIRES_X86_AVX512F;
36623 for (uint32_t n = 32; n <= 48; n += 16) {
36624 for (size_t k = 1; k <= 5; k += 2) {
36625 for (uint32_t m = 1; m <= 8; m++) {
36626 GemmMicrokernelTester()
36627 .mr(8)
36628 .nr(16)
36629 .kr(1)
36630 .sr(1)
36631 .m(m)
36632 .n(n)
36633 .k(k)
36634 .iterations(1)
36635 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36636 }
36637 }
36638 }
36639 }
36640
36641 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
36642 TEST_REQUIRES_X86_AVX512F;
36643 for (size_t k = 1; k <= 5; k += 2) {
36644 for (uint32_t m = 1; m <= 8; m++) {
36645 for (uint32_t n = 1; n <= 16; n++) {
36646 GemmMicrokernelTester()
36647 .mr(8)
36648 .nr(16)
36649 .kr(1)
36650 .sr(1)
36651 .m(m)
36652 .n(n)
36653 .k(k)
36654 .cm_stride(19)
36655 .iterations(1)
36656 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36657 }
36658 }
36659 }
36660 }
36661
36662 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, qmin) {
36663 TEST_REQUIRES_X86_AVX512F;
36664 GemmMicrokernelTester()
36665 .mr(8)
36666 .nr(16)
36667 .kr(1)
36668 .sr(1)
36669 .m(8)
36670 .n(16)
36671 .k(1)
36672 .qmin(128)
36673 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36674 }
36675
36676 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, qmax) {
36677 TEST_REQUIRES_X86_AVX512F;
36678 GemmMicrokernelTester()
36679 .mr(8)
36680 .nr(16)
36681 .kr(1)
36682 .sr(1)
36683 .m(8)
36684 .n(16)
36685 .k(1)
36686 .qmax(128)
36687 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36688 }
36689
36690 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, strided_cm) {
36691 TEST_REQUIRES_X86_AVX512F;
36692 GemmMicrokernelTester()
36693 .mr(8)
36694 .nr(16)
36695 .kr(1)
36696 .sr(1)
36697 .m(8)
36698 .n(16)
36699 .k(1)
36700 .cm_stride(19)
36701 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
36702 }
36703#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36704
36705
36706#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
36707 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1) {
36708 TEST_REQUIRES_PSIMD;
36709 GemmMicrokernelTester()
36710 .mr(1)
36711 .nr(8)
36712 .kr(1)
36713 .sr(1)
36714 .m(1)
36715 .n(8)
36716 .k(1)
36717 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36718 }
36719
36720 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, strided_cn) {
36721 TEST_REQUIRES_PSIMD;
36722 GemmMicrokernelTester()
36723 .mr(1)
36724 .nr(8)
36725 .kr(1)
36726 .sr(1)
36727 .m(1)
36728 .n(8)
36729 .k(1)
36730 .cn_stride(11)
36731 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36732 }
36733
36734 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
36735 TEST_REQUIRES_PSIMD;
36736 GemmMicrokernelTester()
36737 .mr(1)
36738 .nr(8)
36739 .kr(1)
36740 .sr(1)
36741 .m(1)
36742 .n(8)
36743 .k(1)
36744 .a_stride(3)
36745 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36746 }
36747
36748 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
36749 TEST_REQUIRES_PSIMD;
36750 for (uint32_t m = 1; m <= 1; m++) {
36751 for (uint32_t n = 1; n <= 8; n++) {
36752 GemmMicrokernelTester()
36753 .mr(1)
36754 .nr(8)
36755 .kr(1)
36756 .sr(1)
36757 .m(m)
36758 .n(n)
36759 .k(1)
36760 .iterations(1)
36761 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36762 }
36763 }
36764 }
36765
36766 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
36767 TEST_REQUIRES_PSIMD;
36768 for (uint32_t m = 1; m <= 1; m++) {
36769 GemmMicrokernelTester()
36770 .mr(1)
36771 .nr(8)
36772 .kr(1)
36773 .sr(1)
36774 .m(m)
36775 .n(8)
36776 .k(1)
36777 .iterations(1)
36778 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36779 }
36780 }
36781
36782 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
36783 TEST_REQUIRES_PSIMD;
36784 for (uint32_t n = 1; n <= 8; n++) {
36785 GemmMicrokernelTester()
36786 .mr(1)
36787 .nr(8)
36788 .kr(1)
36789 .sr(1)
36790 .m(1)
36791 .n(n)
36792 .k(1)
36793 .iterations(1)
36794 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36795 }
36796 }
36797
36798 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_gt_1) {
36799 TEST_REQUIRES_PSIMD;
36800 for (size_t k = 2; k < 10; k++) {
36801 GemmMicrokernelTester()
36802 .mr(1)
36803 .nr(8)
36804 .kr(1)
36805 .sr(1)
36806 .m(1)
36807 .n(8)
36808 .k(k)
36809 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36810 }
36811 }
36812
36813 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
36814 TEST_REQUIRES_PSIMD;
36815 for (size_t k = 2; k < 10; k++) {
36816 GemmMicrokernelTester()
36817 .mr(1)
36818 .nr(8)
36819 .kr(1)
36820 .sr(1)
36821 .m(1)
36822 .n(8)
36823 .k(k)
36824 .a_stride(11)
36825 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36826 }
36827 }
36828
36829 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
36830 TEST_REQUIRES_PSIMD;
36831 for (size_t k = 2; k < 10; k++) {
36832 for (uint32_t m = 1; m <= 1; m++) {
36833 for (uint32_t n = 1; n <= 8; n++) {
36834 GemmMicrokernelTester()
36835 .mr(1)
36836 .nr(8)
36837 .kr(1)
36838 .sr(1)
36839 .m(m)
36840 .n(n)
36841 .k(k)
36842 .iterations(1)
36843 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36844 }
36845 }
36846 }
36847 }
36848
36849 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8) {
36850 TEST_REQUIRES_PSIMD;
36851 for (uint32_t n = 9; n < 16; n++) {
36852 for (size_t k = 1; k <= 5; k += 2) {
36853 GemmMicrokernelTester()
36854 .mr(1)
36855 .nr(8)
36856 .kr(1)
36857 .sr(1)
36858 .m(1)
36859 .n(8)
36860 .k(k)
36861 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36862 }
36863 }
36864 }
36865
36866 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
36867 TEST_REQUIRES_PSIMD;
36868 for (uint32_t n = 9; n < 16; n++) {
36869 for (size_t k = 1; k <= 5; k += 2) {
36870 GemmMicrokernelTester()
36871 .mr(1)
36872 .nr(8)
36873 .kr(1)
36874 .sr(1)
36875 .m(1)
36876 .n(8)
36877 .k(k)
36878 .cn_stride(11)
36879 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36880 }
36881 }
36882 }
36883
36884 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
36885 TEST_REQUIRES_PSIMD;
36886 for (uint32_t n = 9; n < 16; n++) {
36887 for (size_t k = 1; k <= 5; k += 2) {
36888 GemmMicrokernelTester()
36889 .mr(1)
36890 .nr(8)
36891 .kr(1)
36892 .sr(1)
36893 .m(1)
36894 .n(n)
36895 .k(k)
36896 .a_stride(7)
36897 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36898 }
36899 }
36900 }
36901
36902 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
36903 TEST_REQUIRES_PSIMD;
36904 for (uint32_t n = 9; n < 16; n++) {
36905 for (size_t k = 1; k <= 5; k += 2) {
36906 for (uint32_t m = 1; m <= 1; m++) {
36907 GemmMicrokernelTester()
36908 .mr(1)
36909 .nr(8)
36910 .kr(1)
36911 .sr(1)
36912 .m(m)
36913 .n(n)
36914 .k(k)
36915 .iterations(1)
36916 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36917 }
36918 }
36919 }
36920 }
36921
36922 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8) {
36923 TEST_REQUIRES_PSIMD;
36924 for (uint32_t n = 16; n <= 24; n += 8) {
36925 for (size_t k = 1; k <= 5; k += 2) {
36926 GemmMicrokernelTester()
36927 .mr(1)
36928 .nr(8)
36929 .kr(1)
36930 .sr(1)
36931 .m(1)
36932 .n(8)
36933 .k(k)
36934 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36935 }
36936 }
36937 }
36938
36939 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
36940 TEST_REQUIRES_PSIMD;
36941 for (uint32_t n = 16; n <= 24; n += 8) {
36942 for (size_t k = 1; k <= 5; k += 2) {
36943 GemmMicrokernelTester()
36944 .mr(1)
36945 .nr(8)
36946 .kr(1)
36947 .sr(1)
36948 .m(1)
36949 .n(n)
36950 .k(k)
36951 .cn_stride(11)
36952 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36953 }
36954 }
36955 }
36956
36957 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
36958 TEST_REQUIRES_PSIMD;
36959 for (uint32_t n = 16; n <= 24; n += 8) {
36960 for (size_t k = 1; k <= 5; k += 2) {
36961 GemmMicrokernelTester()
36962 .mr(1)
36963 .nr(8)
36964 .kr(1)
36965 .sr(1)
36966 .m(1)
36967 .n(n)
36968 .k(k)
36969 .a_stride(7)
36970 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36971 }
36972 }
36973 }
36974
36975 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
36976 TEST_REQUIRES_PSIMD;
36977 for (uint32_t n = 16; n <= 24; n += 8) {
36978 for (size_t k = 1; k <= 5; k += 2) {
36979 for (uint32_t m = 1; m <= 1; m++) {
36980 GemmMicrokernelTester()
36981 .mr(1)
36982 .nr(8)
36983 .kr(1)
36984 .sr(1)
36985 .m(m)
36986 .n(n)
36987 .k(k)
36988 .iterations(1)
36989 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36990 }
36991 }
36992 }
36993 }
36994
36995 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
36996 TEST_REQUIRES_PSIMD;
36997 for (size_t k = 1; k <= 5; k += 2) {
36998 for (uint32_t m = 1; m <= 1; m++) {
36999 for (uint32_t n = 1; n <= 8; n++) {
37000 GemmMicrokernelTester()
37001 .mr(1)
37002 .nr(8)
37003 .kr(1)
37004 .sr(1)
37005 .m(m)
37006 .n(n)
37007 .k(k)
37008 .cm_stride(11)
37009 .iterations(1)
37010 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37011 }
37012 }
37013 }
37014 }
37015
37016 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, qmin) {
37017 TEST_REQUIRES_PSIMD;
37018 GemmMicrokernelTester()
37019 .mr(1)
37020 .nr(8)
37021 .kr(1)
37022 .sr(1)
37023 .m(1)
37024 .n(8)
37025 .k(1)
37026 .qmin(128)
37027 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37028 }
37029
37030 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, qmax) {
37031 TEST_REQUIRES_PSIMD;
37032 GemmMicrokernelTester()
37033 .mr(1)
37034 .nr(8)
37035 .kr(1)
37036 .sr(1)
37037 .m(1)
37038 .n(8)
37039 .k(1)
37040 .qmax(128)
37041 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37042 }
37043
37044 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, strided_cm) {
37045 TEST_REQUIRES_PSIMD;
37046 GemmMicrokernelTester()
37047 .mr(1)
37048 .nr(8)
37049 .kr(1)
37050 .sr(1)
37051 .m(1)
37052 .n(8)
37053 .k(1)
37054 .cm_stride(11)
37055 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37056 }
37057#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
37058
37059
37060#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
37061 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1) {
37062 TEST_REQUIRES_PSIMD;
37063 GemmMicrokernelTester()
37064 .mr(4)
37065 .nr(8)
37066 .kr(1)
37067 .sr(1)
37068 .m(4)
37069 .n(8)
37070 .k(1)
37071 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37072 }
37073
37074 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, strided_cn) {
37075 TEST_REQUIRES_PSIMD;
37076 GemmMicrokernelTester()
37077 .mr(4)
37078 .nr(8)
37079 .kr(1)
37080 .sr(1)
37081 .m(4)
37082 .n(8)
37083 .k(1)
37084 .cn_stride(11)
37085 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37086 }
37087
37088 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
37089 TEST_REQUIRES_PSIMD;
37090 GemmMicrokernelTester()
37091 .mr(4)
37092 .nr(8)
37093 .kr(1)
37094 .sr(1)
37095 .m(4)
37096 .n(8)
37097 .k(1)
37098 .a_stride(3)
37099 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37100 }
37101
37102 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
37103 TEST_REQUIRES_PSIMD;
37104 for (uint32_t m = 1; m <= 4; m++) {
37105 for (uint32_t n = 1; n <= 8; n++) {
37106 GemmMicrokernelTester()
37107 .mr(4)
37108 .nr(8)
37109 .kr(1)
37110 .sr(1)
37111 .m(m)
37112 .n(n)
37113 .k(1)
37114 .iterations(1)
37115 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37116 }
37117 }
37118 }
37119
37120 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
37121 TEST_REQUIRES_PSIMD;
37122 for (uint32_t m = 1; m <= 4; m++) {
37123 GemmMicrokernelTester()
37124 .mr(4)
37125 .nr(8)
37126 .kr(1)
37127 .sr(1)
37128 .m(m)
37129 .n(8)
37130 .k(1)
37131 .iterations(1)
37132 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37133 }
37134 }
37135
37136 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
37137 TEST_REQUIRES_PSIMD;
37138 for (uint32_t n = 1; n <= 8; n++) {
37139 GemmMicrokernelTester()
37140 .mr(4)
37141 .nr(8)
37142 .kr(1)
37143 .sr(1)
37144 .m(4)
37145 .n(n)
37146 .k(1)
37147 .iterations(1)
37148 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37149 }
37150 }
37151
37152 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_gt_1) {
37153 TEST_REQUIRES_PSIMD;
37154 for (size_t k = 2; k < 10; k++) {
37155 GemmMicrokernelTester()
37156 .mr(4)
37157 .nr(8)
37158 .kr(1)
37159 .sr(1)
37160 .m(4)
37161 .n(8)
37162 .k(k)
37163 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37164 }
37165 }
37166
37167 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
37168 TEST_REQUIRES_PSIMD;
37169 for (size_t k = 2; k < 10; k++) {
37170 GemmMicrokernelTester()
37171 .mr(4)
37172 .nr(8)
37173 .kr(1)
37174 .sr(1)
37175 .m(4)
37176 .n(8)
37177 .k(k)
37178 .a_stride(11)
37179 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37180 }
37181 }
37182
37183 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
37184 TEST_REQUIRES_PSIMD;
37185 for (size_t k = 2; k < 10; k++) {
37186 for (uint32_t m = 1; m <= 4; m++) {
37187 for (uint32_t n = 1; n <= 8; n++) {
37188 GemmMicrokernelTester()
37189 .mr(4)
37190 .nr(8)
37191 .kr(1)
37192 .sr(1)
37193 .m(m)
37194 .n(n)
37195 .k(k)
37196 .iterations(1)
37197 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37198 }
37199 }
37200 }
37201 }
37202
37203 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8) {
37204 TEST_REQUIRES_PSIMD;
37205 for (uint32_t n = 9; n < 16; n++) {
37206 for (size_t k = 1; k <= 5; k += 2) {
37207 GemmMicrokernelTester()
37208 .mr(4)
37209 .nr(8)
37210 .kr(1)
37211 .sr(1)
37212 .m(4)
37213 .n(8)
37214 .k(k)
37215 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37216 }
37217 }
37218 }
37219
37220 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
37221 TEST_REQUIRES_PSIMD;
37222 for (uint32_t n = 9; n < 16; n++) {
37223 for (size_t k = 1; k <= 5; k += 2) {
37224 GemmMicrokernelTester()
37225 .mr(4)
37226 .nr(8)
37227 .kr(1)
37228 .sr(1)
37229 .m(4)
37230 .n(8)
37231 .k(k)
37232 .cn_stride(11)
37233 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37234 }
37235 }
37236 }
37237
37238 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
37239 TEST_REQUIRES_PSIMD;
37240 for (uint32_t n = 9; n < 16; n++) {
37241 for (size_t k = 1; k <= 5; k += 2) {
37242 GemmMicrokernelTester()
37243 .mr(4)
37244 .nr(8)
37245 .kr(1)
37246 .sr(1)
37247 .m(4)
37248 .n(n)
37249 .k(k)
37250 .a_stride(7)
37251 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37252 }
37253 }
37254 }
37255
37256 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
37257 TEST_REQUIRES_PSIMD;
37258 for (uint32_t n = 9; n < 16; n++) {
37259 for (size_t k = 1; k <= 5; k += 2) {
37260 for (uint32_t m = 1; m <= 4; m++) {
37261 GemmMicrokernelTester()
37262 .mr(4)
37263 .nr(8)
37264 .kr(1)
37265 .sr(1)
37266 .m(m)
37267 .n(n)
37268 .k(k)
37269 .iterations(1)
37270 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37271 }
37272 }
37273 }
37274 }
37275
37276 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8) {
37277 TEST_REQUIRES_PSIMD;
37278 for (uint32_t n = 16; n <= 24; n += 8) {
37279 for (size_t k = 1; k <= 5; k += 2) {
37280 GemmMicrokernelTester()
37281 .mr(4)
37282 .nr(8)
37283 .kr(1)
37284 .sr(1)
37285 .m(4)
37286 .n(8)
37287 .k(k)
37288 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37289 }
37290 }
37291 }
37292
37293 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
37294 TEST_REQUIRES_PSIMD;
37295 for (uint32_t n = 16; n <= 24; n += 8) {
37296 for (size_t k = 1; k <= 5; k += 2) {
37297 GemmMicrokernelTester()
37298 .mr(4)
37299 .nr(8)
37300 .kr(1)
37301 .sr(1)
37302 .m(4)
37303 .n(n)
37304 .k(k)
37305 .cn_stride(11)
37306 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37307 }
37308 }
37309 }
37310
37311 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
37312 TEST_REQUIRES_PSIMD;
37313 for (uint32_t n = 16; n <= 24; n += 8) {
37314 for (size_t k = 1; k <= 5; k += 2) {
37315 GemmMicrokernelTester()
37316 .mr(4)
37317 .nr(8)
37318 .kr(1)
37319 .sr(1)
37320 .m(4)
37321 .n(n)
37322 .k(k)
37323 .a_stride(7)
37324 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37325 }
37326 }
37327 }
37328
37329 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
37330 TEST_REQUIRES_PSIMD;
37331 for (uint32_t n = 16; n <= 24; n += 8) {
37332 for (size_t k = 1; k <= 5; k += 2) {
37333 for (uint32_t m = 1; m <= 4; m++) {
37334 GemmMicrokernelTester()
37335 .mr(4)
37336 .nr(8)
37337 .kr(1)
37338 .sr(1)
37339 .m(m)
37340 .n(n)
37341 .k(k)
37342 .iterations(1)
37343 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37344 }
37345 }
37346 }
37347 }
37348
37349 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
37350 TEST_REQUIRES_PSIMD;
37351 for (size_t k = 1; k <= 5; k += 2) {
37352 for (uint32_t m = 1; m <= 4; m++) {
37353 for (uint32_t n = 1; n <= 8; n++) {
37354 GemmMicrokernelTester()
37355 .mr(4)
37356 .nr(8)
37357 .kr(1)
37358 .sr(1)
37359 .m(m)
37360 .n(n)
37361 .k(k)
37362 .cm_stride(11)
37363 .iterations(1)
37364 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37365 }
37366 }
37367 }
37368 }
37369
37370 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, qmin) {
37371 TEST_REQUIRES_PSIMD;
37372 GemmMicrokernelTester()
37373 .mr(4)
37374 .nr(8)
37375 .kr(1)
37376 .sr(1)
37377 .m(4)
37378 .n(8)
37379 .k(1)
37380 .qmin(128)
37381 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37382 }
37383
37384 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, qmax) {
37385 TEST_REQUIRES_PSIMD;
37386 GemmMicrokernelTester()
37387 .mr(4)
37388 .nr(8)
37389 .kr(1)
37390 .sr(1)
37391 .m(4)
37392 .n(8)
37393 .k(1)
37394 .qmax(128)
37395 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37396 }
37397
37398 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, strided_cm) {
37399 TEST_REQUIRES_PSIMD;
37400 GemmMicrokernelTester()
37401 .mr(4)
37402 .nr(8)
37403 .kr(1)
37404 .sr(1)
37405 .m(4)
37406 .n(8)
37407 .k(1)
37408 .cm_stride(11)
37409 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37410 }
37411#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
37412
37413
37414#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
37415 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1) {
37416 TEST_REQUIRES_PSIMD;
37417 GemmMicrokernelTester()
37418 .mr(6)
37419 .nr(8)
37420 .kr(1)
37421 .sr(1)
37422 .m(6)
37423 .n(8)
37424 .k(1)
37425 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37426 }
37427
37428 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, strided_cn) {
37429 TEST_REQUIRES_PSIMD;
37430 GemmMicrokernelTester()
37431 .mr(6)
37432 .nr(8)
37433 .kr(1)
37434 .sr(1)
37435 .m(6)
37436 .n(8)
37437 .k(1)
37438 .cn_stride(11)
37439 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37440 }
37441
37442 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
37443 TEST_REQUIRES_PSIMD;
37444 GemmMicrokernelTester()
37445 .mr(6)
37446 .nr(8)
37447 .kr(1)
37448 .sr(1)
37449 .m(6)
37450 .n(8)
37451 .k(1)
37452 .a_stride(3)
37453 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37454 }
37455
37456 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
37457 TEST_REQUIRES_PSIMD;
37458 for (uint32_t m = 1; m <= 6; m++) {
37459 for (uint32_t n = 1; n <= 8; n++) {
37460 GemmMicrokernelTester()
37461 .mr(6)
37462 .nr(8)
37463 .kr(1)
37464 .sr(1)
37465 .m(m)
37466 .n(n)
37467 .k(1)
37468 .iterations(1)
37469 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37470 }
37471 }
37472 }
37473
37474 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
37475 TEST_REQUIRES_PSIMD;
37476 for (uint32_t m = 1; m <= 6; m++) {
37477 GemmMicrokernelTester()
37478 .mr(6)
37479 .nr(8)
37480 .kr(1)
37481 .sr(1)
37482 .m(m)
37483 .n(8)
37484 .k(1)
37485 .iterations(1)
37486 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37487 }
37488 }
37489
37490 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
37491 TEST_REQUIRES_PSIMD;
37492 for (uint32_t n = 1; n <= 8; n++) {
37493 GemmMicrokernelTester()
37494 .mr(6)
37495 .nr(8)
37496 .kr(1)
37497 .sr(1)
37498 .m(6)
37499 .n(n)
37500 .k(1)
37501 .iterations(1)
37502 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37503 }
37504 }
37505
37506 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_gt_1) {
37507 TEST_REQUIRES_PSIMD;
37508 for (size_t k = 2; k < 10; k++) {
37509 GemmMicrokernelTester()
37510 .mr(6)
37511 .nr(8)
37512 .kr(1)
37513 .sr(1)
37514 .m(6)
37515 .n(8)
37516 .k(k)
37517 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37518 }
37519 }
37520
37521 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
37522 TEST_REQUIRES_PSIMD;
37523 for (size_t k = 2; k < 10; k++) {
37524 GemmMicrokernelTester()
37525 .mr(6)
37526 .nr(8)
37527 .kr(1)
37528 .sr(1)
37529 .m(6)
37530 .n(8)
37531 .k(k)
37532 .a_stride(11)
37533 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37534 }
37535 }
37536
37537 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
37538 TEST_REQUIRES_PSIMD;
37539 for (size_t k = 2; k < 10; k++) {
37540 for (uint32_t m = 1; m <= 6; m++) {
37541 for (uint32_t n = 1; n <= 8; n++) {
37542 GemmMicrokernelTester()
37543 .mr(6)
37544 .nr(8)
37545 .kr(1)
37546 .sr(1)
37547 .m(m)
37548 .n(n)
37549 .k(k)
37550 .iterations(1)
37551 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37552 }
37553 }
37554 }
37555 }
37556
37557 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8) {
37558 TEST_REQUIRES_PSIMD;
37559 for (uint32_t n = 9; n < 16; n++) {
37560 for (size_t k = 1; k <= 5; k += 2) {
37561 GemmMicrokernelTester()
37562 .mr(6)
37563 .nr(8)
37564 .kr(1)
37565 .sr(1)
37566 .m(6)
37567 .n(8)
37568 .k(k)
37569 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37570 }
37571 }
37572 }
37573
37574 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
37575 TEST_REQUIRES_PSIMD;
37576 for (uint32_t n = 9; n < 16; n++) {
37577 for (size_t k = 1; k <= 5; k += 2) {
37578 GemmMicrokernelTester()
37579 .mr(6)
37580 .nr(8)
37581 .kr(1)
37582 .sr(1)
37583 .m(6)
37584 .n(8)
37585 .k(k)
37586 .cn_stride(11)
37587 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37588 }
37589 }
37590 }
37591
37592 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
37593 TEST_REQUIRES_PSIMD;
37594 for (uint32_t n = 9; n < 16; n++) {
37595 for (size_t k = 1; k <= 5; k += 2) {
37596 GemmMicrokernelTester()
37597 .mr(6)
37598 .nr(8)
37599 .kr(1)
37600 .sr(1)
37601 .m(6)
37602 .n(n)
37603 .k(k)
37604 .a_stride(7)
37605 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37606 }
37607 }
37608 }
37609
37610 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
37611 TEST_REQUIRES_PSIMD;
37612 for (uint32_t n = 9; n < 16; n++) {
37613 for (size_t k = 1; k <= 5; k += 2) {
37614 for (uint32_t m = 1; m <= 6; m++) {
37615 GemmMicrokernelTester()
37616 .mr(6)
37617 .nr(8)
37618 .kr(1)
37619 .sr(1)
37620 .m(m)
37621 .n(n)
37622 .k(k)
37623 .iterations(1)
37624 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37625 }
37626 }
37627 }
37628 }
37629
37630 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8) {
37631 TEST_REQUIRES_PSIMD;
37632 for (uint32_t n = 16; n <= 24; n += 8) {
37633 for (size_t k = 1; k <= 5; k += 2) {
37634 GemmMicrokernelTester()
37635 .mr(6)
37636 .nr(8)
37637 .kr(1)
37638 .sr(1)
37639 .m(6)
37640 .n(8)
37641 .k(k)
37642 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37643 }
37644 }
37645 }
37646
37647 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
37648 TEST_REQUIRES_PSIMD;
37649 for (uint32_t n = 16; n <= 24; n += 8) {
37650 for (size_t k = 1; k <= 5; k += 2) {
37651 GemmMicrokernelTester()
37652 .mr(6)
37653 .nr(8)
37654 .kr(1)
37655 .sr(1)
37656 .m(6)
37657 .n(n)
37658 .k(k)
37659 .cn_stride(11)
37660 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37661 }
37662 }
37663 }
37664
37665 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
37666 TEST_REQUIRES_PSIMD;
37667 for (uint32_t n = 16; n <= 24; n += 8) {
37668 for (size_t k = 1; k <= 5; k += 2) {
37669 GemmMicrokernelTester()
37670 .mr(6)
37671 .nr(8)
37672 .kr(1)
37673 .sr(1)
37674 .m(6)
37675 .n(n)
37676 .k(k)
37677 .a_stride(7)
37678 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37679 }
37680 }
37681 }
37682
37683 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
37684 TEST_REQUIRES_PSIMD;
37685 for (uint32_t n = 16; n <= 24; n += 8) {
37686 for (size_t k = 1; k <= 5; k += 2) {
37687 for (uint32_t m = 1; m <= 6; m++) {
37688 GemmMicrokernelTester()
37689 .mr(6)
37690 .nr(8)
37691 .kr(1)
37692 .sr(1)
37693 .m(m)
37694 .n(n)
37695 .k(k)
37696 .iterations(1)
37697 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37698 }
37699 }
37700 }
37701 }
37702
37703 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
37704 TEST_REQUIRES_PSIMD;
37705 for (size_t k = 1; k <= 5; k += 2) {
37706 for (uint32_t m = 1; m <= 6; m++) {
37707 for (uint32_t n = 1; n <= 8; n++) {
37708 GemmMicrokernelTester()
37709 .mr(6)
37710 .nr(8)
37711 .kr(1)
37712 .sr(1)
37713 .m(m)
37714 .n(n)
37715 .k(k)
37716 .cm_stride(11)
37717 .iterations(1)
37718 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37719 }
37720 }
37721 }
37722 }
37723
37724 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, qmin) {
37725 TEST_REQUIRES_PSIMD;
37726 GemmMicrokernelTester()
37727 .mr(6)
37728 .nr(8)
37729 .kr(1)
37730 .sr(1)
37731 .m(6)
37732 .n(8)
37733 .k(1)
37734 .qmin(128)
37735 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37736 }
37737
37738 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, qmax) {
37739 TEST_REQUIRES_PSIMD;
37740 GemmMicrokernelTester()
37741 .mr(6)
37742 .nr(8)
37743 .kr(1)
37744 .sr(1)
37745 .m(6)
37746 .n(8)
37747 .k(1)
37748 .qmax(128)
37749 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37750 }
37751
37752 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, strided_cm) {
37753 TEST_REQUIRES_PSIMD;
37754 GemmMicrokernelTester()
37755 .mr(6)
37756 .nr(8)
37757 .kr(1)
37758 .sr(1)
37759 .m(6)
37760 .n(8)
37761 .k(1)
37762 .cm_stride(11)
37763 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
37764 }
37765#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
37766
37767
37768#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
37769 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4) {
37770 TEST_REQUIRES_PSIMD;
37771 GemmMicrokernelTester()
37772 .mr(1)
37773 .nr(8)
37774 .kr(1)
37775 .sr(1)
37776 .m(1)
37777 .n(8)
37778 .k(4)
37779 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37780 }
37781
37782 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, strided_cn) {
37783 TEST_REQUIRES_PSIMD;
37784 GemmMicrokernelTester()
37785 .mr(1)
37786 .nr(8)
37787 .kr(1)
37788 .sr(1)
37789 .m(1)
37790 .n(8)
37791 .k(4)
37792 .cn_stride(11)
37793 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37794 }
37795
37796 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_strided_a) {
37797 TEST_REQUIRES_PSIMD;
37798 GemmMicrokernelTester()
37799 .mr(1)
37800 .nr(8)
37801 .kr(1)
37802 .sr(1)
37803 .m(1)
37804 .n(8)
37805 .k(4)
37806 .a_stride(7)
37807 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37808 }
37809
37810 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_subtile) {
37811 TEST_REQUIRES_PSIMD;
37812 for (uint32_t m = 1; m <= 1; m++) {
37813 for (uint32_t n = 1; n <= 8; n++) {
37814 GemmMicrokernelTester()
37815 .mr(1)
37816 .nr(8)
37817 .kr(1)
37818 .sr(1)
37819 .m(m)
37820 .n(n)
37821 .k(4)
37822 .iterations(1)
37823 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37824 }
37825 }
37826 }
37827
37828 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
37829 TEST_REQUIRES_PSIMD;
37830 for (uint32_t m = 1; m <= 1; m++) {
37831 GemmMicrokernelTester()
37832 .mr(1)
37833 .nr(8)
37834 .kr(1)
37835 .sr(1)
37836 .m(m)
37837 .n(8)
37838 .k(4)
37839 .iterations(1)
37840 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37841 }
37842 }
37843
37844 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
37845 TEST_REQUIRES_PSIMD;
37846 for (uint32_t n = 1; n <= 8; n++) {
37847 GemmMicrokernelTester()
37848 .mr(1)
37849 .nr(8)
37850 .kr(1)
37851 .sr(1)
37852 .m(1)
37853 .n(n)
37854 .k(4)
37855 .iterations(1)
37856 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37857 }
37858 }
37859
37860 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_lt_4) {
37861 TEST_REQUIRES_PSIMD;
37862 for (size_t k = 1; k < 4; k++) {
37863 GemmMicrokernelTester()
37864 .mr(1)
37865 .nr(8)
37866 .kr(1)
37867 .sr(1)
37868 .m(1)
37869 .n(8)
37870 .k(k)
37871 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37872 }
37873 }
37874
37875 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_lt_4_strided_a) {
37876 TEST_REQUIRES_PSIMD;
37877 for (size_t k = 1; k < 4; k++) {
37878 GemmMicrokernelTester()
37879 .mr(1)
37880 .nr(8)
37881 .kr(1)
37882 .sr(1)
37883 .m(1)
37884 .n(8)
37885 .k(k)
37886 .a_stride(7)
37887 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37888 }
37889 }
37890
37891 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_lt_4_subtile) {
37892 TEST_REQUIRES_PSIMD;
37893 for (size_t k = 1; k < 4; k++) {
37894 for (uint32_t m = 1; m <= 1; m++) {
37895 for (uint32_t n = 1; n <= 8; n++) {
37896 GemmMicrokernelTester()
37897 .mr(1)
37898 .nr(8)
37899 .kr(1)
37900 .sr(1)
37901 .m(m)
37902 .n(n)
37903 .k(k)
37904 .iterations(1)
37905 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37906 }
37907 }
37908 }
37909 }
37910
37911 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_gt_4) {
37912 TEST_REQUIRES_PSIMD;
37913 for (size_t k = 5; k < 8; k++) {
37914 GemmMicrokernelTester()
37915 .mr(1)
37916 .nr(8)
37917 .kr(1)
37918 .sr(1)
37919 .m(1)
37920 .n(8)
37921 .k(k)
37922 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37923 }
37924 }
37925
37926 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_gt_4_strided_a) {
37927 TEST_REQUIRES_PSIMD;
37928 for (size_t k = 5; k < 8; k++) {
37929 GemmMicrokernelTester()
37930 .mr(1)
37931 .nr(8)
37932 .kr(1)
37933 .sr(1)
37934 .m(1)
37935 .n(8)
37936 .k(k)
37937 .a_stride(11)
37938 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37939 }
37940 }
37941
37942 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_gt_4_subtile) {
37943 TEST_REQUIRES_PSIMD;
37944 for (size_t k = 5; k < 8; k++) {
37945 for (uint32_t m = 1; m <= 1; m++) {
37946 for (uint32_t n = 1; n <= 8; n++) {
37947 GemmMicrokernelTester()
37948 .mr(1)
37949 .nr(8)
37950 .kr(1)
37951 .sr(1)
37952 .m(m)
37953 .n(n)
37954 .k(k)
37955 .iterations(1)
37956 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37957 }
37958 }
37959 }
37960 }
37961
37962 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_div_4) {
37963 TEST_REQUIRES_PSIMD;
37964 for (size_t k = 8; k <= 40; k += 4) {
37965 GemmMicrokernelTester()
37966 .mr(1)
37967 .nr(8)
37968 .kr(1)
37969 .sr(1)
37970 .m(1)
37971 .n(8)
37972 .k(k)
37973 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37974 }
37975 }
37976
37977 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_div_4_strided_a) {
37978 TEST_REQUIRES_PSIMD;
37979 for (size_t k = 8; k <= 40; k += 4) {
37980 GemmMicrokernelTester()
37981 .mr(1)
37982 .nr(8)
37983 .kr(1)
37984 .sr(1)
37985 .m(1)
37986 .n(8)
37987 .k(k)
37988 .a_stride(43)
37989 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37990 }
37991 }
37992
37993 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_div_4_subtile) {
37994 TEST_REQUIRES_PSIMD;
37995 for (size_t k = 8; k <= 40; k += 4) {
37996 for (uint32_t m = 1; m <= 1; m++) {
37997 for (uint32_t n = 1; n <= 8; n++) {
37998 GemmMicrokernelTester()
37999 .mr(1)
38000 .nr(8)
38001 .kr(1)
38002 .sr(1)
38003 .m(m)
38004 .n(n)
38005 .k(k)
38006 .iterations(1)
38007 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38008 }
38009 }
38010 }
38011 }
38012
38013 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8) {
38014 TEST_REQUIRES_PSIMD;
38015 for (uint32_t n = 9; n < 16; n++) {
38016 for (size_t k = 1; k <= 20; k += 5) {
38017 GemmMicrokernelTester()
38018 .mr(1)
38019 .nr(8)
38020 .kr(1)
38021 .sr(1)
38022 .m(1)
38023 .n(8)
38024 .k(k)
38025 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38026 }
38027 }
38028 }
38029
38030 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
38031 TEST_REQUIRES_PSIMD;
38032 for (uint32_t n = 9; n < 16; n++) {
38033 for (size_t k = 1; k <= 20; k += 5) {
38034 GemmMicrokernelTester()
38035 .mr(1)
38036 .nr(8)
38037 .kr(1)
38038 .sr(1)
38039 .m(1)
38040 .n(8)
38041 .k(k)
38042 .cn_stride(11)
38043 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38044 }
38045 }
38046 }
38047
38048 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8_strided_a) {
38049 TEST_REQUIRES_PSIMD;
38050 for (uint32_t n = 9; n < 16; n++) {
38051 for (size_t k = 1; k <= 20; k += 5) {
38052 GemmMicrokernelTester()
38053 .mr(1)
38054 .nr(8)
38055 .kr(1)
38056 .sr(1)
38057 .m(1)
38058 .n(n)
38059 .k(k)
38060 .a_stride(23)
38061 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38062 }
38063 }
38064 }
38065
38066 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8_subtile) {
38067 TEST_REQUIRES_PSIMD;
38068 for (uint32_t n = 9; n < 16; n++) {
38069 for (size_t k = 1; k <= 20; k += 5) {
38070 for (uint32_t m = 1; m <= 1; m++) {
38071 GemmMicrokernelTester()
38072 .mr(1)
38073 .nr(8)
38074 .kr(1)
38075 .sr(1)
38076 .m(m)
38077 .n(n)
38078 .k(k)
38079 .iterations(1)
38080 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38081 }
38082 }
38083 }
38084 }
38085
38086 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8) {
38087 TEST_REQUIRES_PSIMD;
38088 for (uint32_t n = 16; n <= 24; n += 8) {
38089 for (size_t k = 1; k <= 20; k += 5) {
38090 GemmMicrokernelTester()
38091 .mr(1)
38092 .nr(8)
38093 .kr(1)
38094 .sr(1)
38095 .m(1)
38096 .n(8)
38097 .k(k)
38098 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38099 }
38100 }
38101 }
38102
38103 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8_strided_cn) {
38104 TEST_REQUIRES_PSIMD;
38105 for (uint32_t n = 16; n <= 24; n += 8) {
38106 for (size_t k = 1; k <= 20; k += 5) {
38107 GemmMicrokernelTester()
38108 .mr(1)
38109 .nr(8)
38110 .kr(1)
38111 .sr(1)
38112 .m(1)
38113 .n(n)
38114 .k(k)
38115 .cn_stride(11)
38116 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38117 }
38118 }
38119 }
38120
38121 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8_strided_a) {
38122 TEST_REQUIRES_PSIMD;
38123 for (uint32_t n = 16; n <= 24; n += 8) {
38124 for (size_t k = 1; k <= 20; k += 5) {
38125 GemmMicrokernelTester()
38126 .mr(1)
38127 .nr(8)
38128 .kr(1)
38129 .sr(1)
38130 .m(1)
38131 .n(n)
38132 .k(k)
38133 .a_stride(23)
38134 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38135 }
38136 }
38137 }
38138
38139 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8_subtile) {
38140 TEST_REQUIRES_PSIMD;
38141 for (uint32_t n = 16; n <= 24; n += 8) {
38142 for (size_t k = 1; k <= 20; k += 5) {
38143 for (uint32_t m = 1; m <= 1; m++) {
38144 GemmMicrokernelTester()
38145 .mr(1)
38146 .nr(8)
38147 .kr(1)
38148 .sr(1)
38149 .m(m)
38150 .n(n)
38151 .k(k)
38152 .iterations(1)
38153 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38154 }
38155 }
38156 }
38157 }
38158
38159 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, strided_cm_subtile) {
38160 TEST_REQUIRES_PSIMD;
38161 for (size_t k = 1; k <= 20; k += 5) {
38162 for (uint32_t m = 1; m <= 1; m++) {
38163 for (uint32_t n = 1; n <= 8; n++) {
38164 GemmMicrokernelTester()
38165 .mr(1)
38166 .nr(8)
38167 .kr(1)
38168 .sr(1)
38169 .m(m)
38170 .n(n)
38171 .k(k)
38172 .cm_stride(11)
38173 .iterations(1)
38174 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38175 }
38176 }
38177 }
38178 }
38179
38180 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, qmin) {
38181 TEST_REQUIRES_PSIMD;
38182 GemmMicrokernelTester()
38183 .mr(1)
38184 .nr(8)
38185 .kr(1)
38186 .sr(1)
38187 .m(1)
38188 .n(8)
38189 .k(4)
38190 .qmin(128)
38191 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38192 }
38193
38194 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, qmax) {
38195 TEST_REQUIRES_PSIMD;
38196 GemmMicrokernelTester()
38197 .mr(1)
38198 .nr(8)
38199 .kr(1)
38200 .sr(1)
38201 .m(1)
38202 .n(8)
38203 .k(4)
38204 .qmax(128)
38205 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38206 }
38207
38208 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, strided_cm) {
38209 TEST_REQUIRES_PSIMD;
38210 GemmMicrokernelTester()
38211 .mr(1)
38212 .nr(8)
38213 .kr(1)
38214 .sr(1)
38215 .m(1)
38216 .n(8)
38217 .k(4)
38218 .cm_stride(11)
38219 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38220 }
38221#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
38222
38223
38224#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
38225 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4) {
38226 TEST_REQUIRES_PSIMD;
38227 GemmMicrokernelTester()
38228 .mr(4)
38229 .nr(8)
38230 .kr(1)
38231 .sr(1)
38232 .m(4)
38233 .n(8)
38234 .k(4)
38235 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38236 }
38237
38238 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, strided_cn) {
38239 TEST_REQUIRES_PSIMD;
38240 GemmMicrokernelTester()
38241 .mr(4)
38242 .nr(8)
38243 .kr(1)
38244 .sr(1)
38245 .m(4)
38246 .n(8)
38247 .k(4)
38248 .cn_stride(11)
38249 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38250 }
38251
38252 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_strided_a) {
38253 TEST_REQUIRES_PSIMD;
38254 GemmMicrokernelTester()
38255 .mr(4)
38256 .nr(8)
38257 .kr(1)
38258 .sr(1)
38259 .m(4)
38260 .n(8)
38261 .k(4)
38262 .a_stride(7)
38263 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38264 }
38265
38266 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_subtile) {
38267 TEST_REQUIRES_PSIMD;
38268 for (uint32_t m = 1; m <= 4; m++) {
38269 for (uint32_t n = 1; n <= 8; n++) {
38270 GemmMicrokernelTester()
38271 .mr(4)
38272 .nr(8)
38273 .kr(1)
38274 .sr(1)
38275 .m(m)
38276 .n(n)
38277 .k(4)
38278 .iterations(1)
38279 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38280 }
38281 }
38282 }
38283
38284 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
38285 TEST_REQUIRES_PSIMD;
38286 for (uint32_t m = 1; m <= 4; m++) {
38287 GemmMicrokernelTester()
38288 .mr(4)
38289 .nr(8)
38290 .kr(1)
38291 .sr(1)
38292 .m(m)
38293 .n(8)
38294 .k(4)
38295 .iterations(1)
38296 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38297 }
38298 }
38299
38300 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
38301 TEST_REQUIRES_PSIMD;
38302 for (uint32_t n = 1; n <= 8; n++) {
38303 GemmMicrokernelTester()
38304 .mr(4)
38305 .nr(8)
38306 .kr(1)
38307 .sr(1)
38308 .m(4)
38309 .n(n)
38310 .k(4)
38311 .iterations(1)
38312 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38313 }
38314 }
38315
38316 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_lt_4) {
38317 TEST_REQUIRES_PSIMD;
38318 for (size_t k = 1; k < 4; k++) {
38319 GemmMicrokernelTester()
38320 .mr(4)
38321 .nr(8)
38322 .kr(1)
38323 .sr(1)
38324 .m(4)
38325 .n(8)
38326 .k(k)
38327 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38328 }
38329 }
38330
38331 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_lt_4_strided_a) {
38332 TEST_REQUIRES_PSIMD;
38333 for (size_t k = 1; k < 4; k++) {
38334 GemmMicrokernelTester()
38335 .mr(4)
38336 .nr(8)
38337 .kr(1)
38338 .sr(1)
38339 .m(4)
38340 .n(8)
38341 .k(k)
38342 .a_stride(7)
38343 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38344 }
38345 }
38346
38347 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_lt_4_subtile) {
38348 TEST_REQUIRES_PSIMD;
38349 for (size_t k = 1; k < 4; k++) {
38350 for (uint32_t m = 1; m <= 4; m++) {
38351 for (uint32_t n = 1; n <= 8; n++) {
38352 GemmMicrokernelTester()
38353 .mr(4)
38354 .nr(8)
38355 .kr(1)
38356 .sr(1)
38357 .m(m)
38358 .n(n)
38359 .k(k)
38360 .iterations(1)
38361 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38362 }
38363 }
38364 }
38365 }
38366
38367 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_gt_4) {
38368 TEST_REQUIRES_PSIMD;
38369 for (size_t k = 5; k < 8; k++) {
38370 GemmMicrokernelTester()
38371 .mr(4)
38372 .nr(8)
38373 .kr(1)
38374 .sr(1)
38375 .m(4)
38376 .n(8)
38377 .k(k)
38378 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38379 }
38380 }
38381
38382 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_gt_4_strided_a) {
38383 TEST_REQUIRES_PSIMD;
38384 for (size_t k = 5; k < 8; k++) {
38385 GemmMicrokernelTester()
38386 .mr(4)
38387 .nr(8)
38388 .kr(1)
38389 .sr(1)
38390 .m(4)
38391 .n(8)
38392 .k(k)
38393 .a_stride(11)
38394 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38395 }
38396 }
38397
38398 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_gt_4_subtile) {
38399 TEST_REQUIRES_PSIMD;
38400 for (size_t k = 5; k < 8; k++) {
38401 for (uint32_t m = 1; m <= 4; m++) {
38402 for (uint32_t n = 1; n <= 8; n++) {
38403 GemmMicrokernelTester()
38404 .mr(4)
38405 .nr(8)
38406 .kr(1)
38407 .sr(1)
38408 .m(m)
38409 .n(n)
38410 .k(k)
38411 .iterations(1)
38412 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38413 }
38414 }
38415 }
38416 }
38417
38418 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_div_4) {
38419 TEST_REQUIRES_PSIMD;
38420 for (size_t k = 8; k <= 40; k += 4) {
38421 GemmMicrokernelTester()
38422 .mr(4)
38423 .nr(8)
38424 .kr(1)
38425 .sr(1)
38426 .m(4)
38427 .n(8)
38428 .k(k)
38429 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38430 }
38431 }
38432
38433 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_div_4_strided_a) {
38434 TEST_REQUIRES_PSIMD;
38435 for (size_t k = 8; k <= 40; k += 4) {
38436 GemmMicrokernelTester()
38437 .mr(4)
38438 .nr(8)
38439 .kr(1)
38440 .sr(1)
38441 .m(4)
38442 .n(8)
38443 .k(k)
38444 .a_stride(43)
38445 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38446 }
38447 }
38448
38449 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_div_4_subtile) {
38450 TEST_REQUIRES_PSIMD;
38451 for (size_t k = 8; k <= 40; k += 4) {
38452 for (uint32_t m = 1; m <= 4; m++) {
38453 for (uint32_t n = 1; n <= 8; n++) {
38454 GemmMicrokernelTester()
38455 .mr(4)
38456 .nr(8)
38457 .kr(1)
38458 .sr(1)
38459 .m(m)
38460 .n(n)
38461 .k(k)
38462 .iterations(1)
38463 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38464 }
38465 }
38466 }
38467 }
38468
38469 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8) {
38470 TEST_REQUIRES_PSIMD;
38471 for (uint32_t n = 9; n < 16; n++) {
38472 for (size_t k = 1; k <= 20; k += 5) {
38473 GemmMicrokernelTester()
38474 .mr(4)
38475 .nr(8)
38476 .kr(1)
38477 .sr(1)
38478 .m(4)
38479 .n(8)
38480 .k(k)
38481 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38482 }
38483 }
38484 }
38485
38486 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
38487 TEST_REQUIRES_PSIMD;
38488 for (uint32_t n = 9; n < 16; n++) {
38489 for (size_t k = 1; k <= 20; k += 5) {
38490 GemmMicrokernelTester()
38491 .mr(4)
38492 .nr(8)
38493 .kr(1)
38494 .sr(1)
38495 .m(4)
38496 .n(8)
38497 .k(k)
38498 .cn_stride(11)
38499 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38500 }
38501 }
38502 }
38503
38504 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8_strided_a) {
38505 TEST_REQUIRES_PSIMD;
38506 for (uint32_t n = 9; n < 16; n++) {
38507 for (size_t k = 1; k <= 20; k += 5) {
38508 GemmMicrokernelTester()
38509 .mr(4)
38510 .nr(8)
38511 .kr(1)
38512 .sr(1)
38513 .m(4)
38514 .n(n)
38515 .k(k)
38516 .a_stride(23)
38517 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38518 }
38519 }
38520 }
38521
38522 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8_subtile) {
38523 TEST_REQUIRES_PSIMD;
38524 for (uint32_t n = 9; n < 16; n++) {
38525 for (size_t k = 1; k <= 20; k += 5) {
38526 for (uint32_t m = 1; m <= 4; m++) {
38527 GemmMicrokernelTester()
38528 .mr(4)
38529 .nr(8)
38530 .kr(1)
38531 .sr(1)
38532 .m(m)
38533 .n(n)
38534 .k(k)
38535 .iterations(1)
38536 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38537 }
38538 }
38539 }
38540 }
38541
38542 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8) {
38543 TEST_REQUIRES_PSIMD;
38544 for (uint32_t n = 16; n <= 24; n += 8) {
38545 for (size_t k = 1; k <= 20; k += 5) {
38546 GemmMicrokernelTester()
38547 .mr(4)
38548 .nr(8)
38549 .kr(1)
38550 .sr(1)
38551 .m(4)
38552 .n(8)
38553 .k(k)
38554 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38555 }
38556 }
38557 }
38558
38559 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8_strided_cn) {
38560 TEST_REQUIRES_PSIMD;
38561 for (uint32_t n = 16; n <= 24; n += 8) {
38562 for (size_t k = 1; k <= 20; k += 5) {
38563 GemmMicrokernelTester()
38564 .mr(4)
38565 .nr(8)
38566 .kr(1)
38567 .sr(1)
38568 .m(4)
38569 .n(n)
38570 .k(k)
38571 .cn_stride(11)
38572 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38573 }
38574 }
38575 }
38576
38577 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8_strided_a) {
38578 TEST_REQUIRES_PSIMD;
38579 for (uint32_t n = 16; n <= 24; n += 8) {
38580 for (size_t k = 1; k <= 20; k += 5) {
38581 GemmMicrokernelTester()
38582 .mr(4)
38583 .nr(8)
38584 .kr(1)
38585 .sr(1)
38586 .m(4)
38587 .n(n)
38588 .k(k)
38589 .a_stride(23)
38590 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38591 }
38592 }
38593 }
38594
38595 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8_subtile) {
38596 TEST_REQUIRES_PSIMD;
38597 for (uint32_t n = 16; n <= 24; n += 8) {
38598 for (size_t k = 1; k <= 20; k += 5) {
38599 for (uint32_t m = 1; m <= 4; m++) {
38600 GemmMicrokernelTester()
38601 .mr(4)
38602 .nr(8)
38603 .kr(1)
38604 .sr(1)
38605 .m(m)
38606 .n(n)
38607 .k(k)
38608 .iterations(1)
38609 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38610 }
38611 }
38612 }
38613 }
38614
38615 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, strided_cm_subtile) {
38616 TEST_REQUIRES_PSIMD;
38617 for (size_t k = 1; k <= 20; k += 5) {
38618 for (uint32_t m = 1; m <= 4; m++) {
38619 for (uint32_t n = 1; n <= 8; n++) {
38620 GemmMicrokernelTester()
38621 .mr(4)
38622 .nr(8)
38623 .kr(1)
38624 .sr(1)
38625 .m(m)
38626 .n(n)
38627 .k(k)
38628 .cm_stride(11)
38629 .iterations(1)
38630 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38631 }
38632 }
38633 }
38634 }
38635
38636 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, qmin) {
38637 TEST_REQUIRES_PSIMD;
38638 GemmMicrokernelTester()
38639 .mr(4)
38640 .nr(8)
38641 .kr(1)
38642 .sr(1)
38643 .m(4)
38644 .n(8)
38645 .k(4)
38646 .qmin(128)
38647 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38648 }
38649
38650 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, qmax) {
38651 TEST_REQUIRES_PSIMD;
38652 GemmMicrokernelTester()
38653 .mr(4)
38654 .nr(8)
38655 .kr(1)
38656 .sr(1)
38657 .m(4)
38658 .n(8)
38659 .k(4)
38660 .qmax(128)
38661 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38662 }
38663
38664 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, strided_cm) {
38665 TEST_REQUIRES_PSIMD;
38666 GemmMicrokernelTester()
38667 .mr(4)
38668 .nr(8)
38669 .kr(1)
38670 .sr(1)
38671 .m(4)
38672 .n(8)
38673 .k(4)
38674 .cm_stride(11)
38675 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38676 }
38677#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
38678
38679
38680#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
38681 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4) {
38682 TEST_REQUIRES_PSIMD;
38683 GemmMicrokernelTester()
38684 .mr(6)
38685 .nr(8)
38686 .kr(1)
38687 .sr(1)
38688 .m(6)
38689 .n(8)
38690 .k(4)
38691 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38692 }
38693
38694 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, strided_cn) {
38695 TEST_REQUIRES_PSIMD;
38696 GemmMicrokernelTester()
38697 .mr(6)
38698 .nr(8)
38699 .kr(1)
38700 .sr(1)
38701 .m(6)
38702 .n(8)
38703 .k(4)
38704 .cn_stride(11)
38705 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38706 }
38707
38708 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_strided_a) {
38709 TEST_REQUIRES_PSIMD;
38710 GemmMicrokernelTester()
38711 .mr(6)
38712 .nr(8)
38713 .kr(1)
38714 .sr(1)
38715 .m(6)
38716 .n(8)
38717 .k(4)
38718 .a_stride(7)
38719 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38720 }
38721
38722 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_subtile) {
38723 TEST_REQUIRES_PSIMD;
38724 for (uint32_t m = 1; m <= 6; m++) {
38725 for (uint32_t n = 1; n <= 8; n++) {
38726 GemmMicrokernelTester()
38727 .mr(6)
38728 .nr(8)
38729 .kr(1)
38730 .sr(1)
38731 .m(m)
38732 .n(n)
38733 .k(4)
38734 .iterations(1)
38735 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38736 }
38737 }
38738 }
38739
38740 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
38741 TEST_REQUIRES_PSIMD;
38742 for (uint32_t m = 1; m <= 6; m++) {
38743 GemmMicrokernelTester()
38744 .mr(6)
38745 .nr(8)
38746 .kr(1)
38747 .sr(1)
38748 .m(m)
38749 .n(8)
38750 .k(4)
38751 .iterations(1)
38752 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38753 }
38754 }
38755
38756 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
38757 TEST_REQUIRES_PSIMD;
38758 for (uint32_t n = 1; n <= 8; n++) {
38759 GemmMicrokernelTester()
38760 .mr(6)
38761 .nr(8)
38762 .kr(1)
38763 .sr(1)
38764 .m(6)
38765 .n(n)
38766 .k(4)
38767 .iterations(1)
38768 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38769 }
38770 }
38771
38772 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_lt_4) {
38773 TEST_REQUIRES_PSIMD;
38774 for (size_t k = 1; k < 4; k++) {
38775 GemmMicrokernelTester()
38776 .mr(6)
38777 .nr(8)
38778 .kr(1)
38779 .sr(1)
38780 .m(6)
38781 .n(8)
38782 .k(k)
38783 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38784 }
38785 }
38786
38787 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_lt_4_strided_a) {
38788 TEST_REQUIRES_PSIMD;
38789 for (size_t k = 1; k < 4; k++) {
38790 GemmMicrokernelTester()
38791 .mr(6)
38792 .nr(8)
38793 .kr(1)
38794 .sr(1)
38795 .m(6)
38796 .n(8)
38797 .k(k)
38798 .a_stride(7)
38799 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38800 }
38801 }
38802
38803 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_lt_4_subtile) {
38804 TEST_REQUIRES_PSIMD;
38805 for (size_t k = 1; k < 4; k++) {
38806 for (uint32_t m = 1; m <= 6; m++) {
38807 for (uint32_t n = 1; n <= 8; n++) {
38808 GemmMicrokernelTester()
38809 .mr(6)
38810 .nr(8)
38811 .kr(1)
38812 .sr(1)
38813 .m(m)
38814 .n(n)
38815 .k(k)
38816 .iterations(1)
38817 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38818 }
38819 }
38820 }
38821 }
38822
38823 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_gt_4) {
38824 TEST_REQUIRES_PSIMD;
38825 for (size_t k = 5; k < 8; k++) {
38826 GemmMicrokernelTester()
38827 .mr(6)
38828 .nr(8)
38829 .kr(1)
38830 .sr(1)
38831 .m(6)
38832 .n(8)
38833 .k(k)
38834 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38835 }
38836 }
38837
38838 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_gt_4_strided_a) {
38839 TEST_REQUIRES_PSIMD;
38840 for (size_t k = 5; k < 8; k++) {
38841 GemmMicrokernelTester()
38842 .mr(6)
38843 .nr(8)
38844 .kr(1)
38845 .sr(1)
38846 .m(6)
38847 .n(8)
38848 .k(k)
38849 .a_stride(11)
38850 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38851 }
38852 }
38853
38854 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_gt_4_subtile) {
38855 TEST_REQUIRES_PSIMD;
38856 for (size_t k = 5; k < 8; k++) {
38857 for (uint32_t m = 1; m <= 6; m++) {
38858 for (uint32_t n = 1; n <= 8; n++) {
38859 GemmMicrokernelTester()
38860 .mr(6)
38861 .nr(8)
38862 .kr(1)
38863 .sr(1)
38864 .m(m)
38865 .n(n)
38866 .k(k)
38867 .iterations(1)
38868 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38869 }
38870 }
38871 }
38872 }
38873
38874 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_div_4) {
38875 TEST_REQUIRES_PSIMD;
38876 for (size_t k = 8; k <= 40; k += 4) {
38877 GemmMicrokernelTester()
38878 .mr(6)
38879 .nr(8)
38880 .kr(1)
38881 .sr(1)
38882 .m(6)
38883 .n(8)
38884 .k(k)
38885 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38886 }
38887 }
38888
38889 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_div_4_strided_a) {
38890 TEST_REQUIRES_PSIMD;
38891 for (size_t k = 8; k <= 40; k += 4) {
38892 GemmMicrokernelTester()
38893 .mr(6)
38894 .nr(8)
38895 .kr(1)
38896 .sr(1)
38897 .m(6)
38898 .n(8)
38899 .k(k)
38900 .a_stride(43)
38901 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38902 }
38903 }
38904
38905 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_div_4_subtile) {
38906 TEST_REQUIRES_PSIMD;
38907 for (size_t k = 8; k <= 40; k += 4) {
38908 for (uint32_t m = 1; m <= 6; m++) {
38909 for (uint32_t n = 1; n <= 8; n++) {
38910 GemmMicrokernelTester()
38911 .mr(6)
38912 .nr(8)
38913 .kr(1)
38914 .sr(1)
38915 .m(m)
38916 .n(n)
38917 .k(k)
38918 .iterations(1)
38919 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38920 }
38921 }
38922 }
38923 }
38924
38925 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8) {
38926 TEST_REQUIRES_PSIMD;
38927 for (uint32_t n = 9; n < 16; n++) {
38928 for (size_t k = 1; k <= 20; k += 5) {
38929 GemmMicrokernelTester()
38930 .mr(6)
38931 .nr(8)
38932 .kr(1)
38933 .sr(1)
38934 .m(6)
38935 .n(8)
38936 .k(k)
38937 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38938 }
38939 }
38940 }
38941
38942 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
38943 TEST_REQUIRES_PSIMD;
38944 for (uint32_t n = 9; n < 16; n++) {
38945 for (size_t k = 1; k <= 20; k += 5) {
38946 GemmMicrokernelTester()
38947 .mr(6)
38948 .nr(8)
38949 .kr(1)
38950 .sr(1)
38951 .m(6)
38952 .n(8)
38953 .k(k)
38954 .cn_stride(11)
38955 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38956 }
38957 }
38958 }
38959
38960 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8_strided_a) {
38961 TEST_REQUIRES_PSIMD;
38962 for (uint32_t n = 9; n < 16; n++) {
38963 for (size_t k = 1; k <= 20; k += 5) {
38964 GemmMicrokernelTester()
38965 .mr(6)
38966 .nr(8)
38967 .kr(1)
38968 .sr(1)
38969 .m(6)
38970 .n(n)
38971 .k(k)
38972 .a_stride(23)
38973 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38974 }
38975 }
38976 }
38977
38978 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8_subtile) {
38979 TEST_REQUIRES_PSIMD;
38980 for (uint32_t n = 9; n < 16; n++) {
38981 for (size_t k = 1; k <= 20; k += 5) {
38982 for (uint32_t m = 1; m <= 6; m++) {
38983 GemmMicrokernelTester()
38984 .mr(6)
38985 .nr(8)
38986 .kr(1)
38987 .sr(1)
38988 .m(m)
38989 .n(n)
38990 .k(k)
38991 .iterations(1)
38992 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
38993 }
38994 }
38995 }
38996 }
38997
38998 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8) {
38999 TEST_REQUIRES_PSIMD;
39000 for (uint32_t n = 16; n <= 24; n += 8) {
39001 for (size_t k = 1; k <= 20; k += 5) {
39002 GemmMicrokernelTester()
39003 .mr(6)
39004 .nr(8)
39005 .kr(1)
39006 .sr(1)
39007 .m(6)
39008 .n(8)
39009 .k(k)
39010 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39011 }
39012 }
39013 }
39014
39015 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8_strided_cn) {
39016 TEST_REQUIRES_PSIMD;
39017 for (uint32_t n = 16; n <= 24; n += 8) {
39018 for (size_t k = 1; k <= 20; k += 5) {
39019 GemmMicrokernelTester()
39020 .mr(6)
39021 .nr(8)
39022 .kr(1)
39023 .sr(1)
39024 .m(6)
39025 .n(n)
39026 .k(k)
39027 .cn_stride(11)
39028 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39029 }
39030 }
39031 }
39032
39033 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8_strided_a) {
39034 TEST_REQUIRES_PSIMD;
39035 for (uint32_t n = 16; n <= 24; n += 8) {
39036 for (size_t k = 1; k <= 20; k += 5) {
39037 GemmMicrokernelTester()
39038 .mr(6)
39039 .nr(8)
39040 .kr(1)
39041 .sr(1)
39042 .m(6)
39043 .n(n)
39044 .k(k)
39045 .a_stride(23)
39046 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39047 }
39048 }
39049 }
39050
39051 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8_subtile) {
39052 TEST_REQUIRES_PSIMD;
39053 for (uint32_t n = 16; n <= 24; n += 8) {
39054 for (size_t k = 1; k <= 20; k += 5) {
39055 for (uint32_t m = 1; m <= 6; m++) {
39056 GemmMicrokernelTester()
39057 .mr(6)
39058 .nr(8)
39059 .kr(1)
39060 .sr(1)
39061 .m(m)
39062 .n(n)
39063 .k(k)
39064 .iterations(1)
39065 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39066 }
39067 }
39068 }
39069 }
39070
39071 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, strided_cm_subtile) {
39072 TEST_REQUIRES_PSIMD;
39073 for (size_t k = 1; k <= 20; k += 5) {
39074 for (uint32_t m = 1; m <= 6; m++) {
39075 for (uint32_t n = 1; n <= 8; n++) {
39076 GemmMicrokernelTester()
39077 .mr(6)
39078 .nr(8)
39079 .kr(1)
39080 .sr(1)
39081 .m(m)
39082 .n(n)
39083 .k(k)
39084 .cm_stride(11)
39085 .iterations(1)
39086 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39087 }
39088 }
39089 }
39090 }
39091
39092 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, qmin) {
39093 TEST_REQUIRES_PSIMD;
39094 GemmMicrokernelTester()
39095 .mr(6)
39096 .nr(8)
39097 .kr(1)
39098 .sr(1)
39099 .m(6)
39100 .n(8)
39101 .k(4)
39102 .qmin(128)
39103 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39104 }
39105
39106 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, qmax) {
39107 TEST_REQUIRES_PSIMD;
39108 GemmMicrokernelTester()
39109 .mr(6)
39110 .nr(8)
39111 .kr(1)
39112 .sr(1)
39113 .m(6)
39114 .n(8)
39115 .k(4)
39116 .qmax(128)
39117 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39118 }
39119
39120 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, strided_cm) {
39121 TEST_REQUIRES_PSIMD;
39122 GemmMicrokernelTester()
39123 .mr(6)
39124 .nr(8)
39125 .kr(1)
39126 .sr(1)
39127 .m(6)
39128 .n(8)
39129 .k(4)
39130 .cm_stride(11)
39131 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39132 }
39133#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
39134
39135
39136#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
39137 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4) {
39138 TEST_REQUIRES_PSIMD;
39139 GemmMicrokernelTester()
39140 .mr(1)
39141 .nr(8)
39142 .kr(1)
39143 .sr(4)
39144 .m(1)
39145 .n(8)
39146 .k(4)
39147 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39148 }
39149
39150 TEST(F32_GEMMINC_1X8S4__PSIMD, strided_cn) {
39151 TEST_REQUIRES_PSIMD;
39152 GemmMicrokernelTester()
39153 .mr(1)
39154 .nr(8)
39155 .kr(1)
39156 .sr(4)
39157 .m(1)
39158 .n(8)
39159 .k(4)
39160 .cn_stride(11)
39161 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39162 }
39163
39164 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_strided_a) {
39165 TEST_REQUIRES_PSIMD;
39166 GemmMicrokernelTester()
39167 .mr(1)
39168 .nr(8)
39169 .kr(1)
39170 .sr(4)
39171 .m(1)
39172 .n(8)
39173 .k(4)
39174 .a_stride(7)
39175 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39176 }
39177
39178 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_subtile) {
39179 TEST_REQUIRES_PSIMD;
39180 for (uint32_t m = 1; m <= 1; m++) {
39181 for (uint32_t n = 1; n <= 8; n++) {
39182 GemmMicrokernelTester()
39183 .mr(1)
39184 .nr(8)
39185 .kr(1)
39186 .sr(4)
39187 .m(m)
39188 .n(n)
39189 .k(4)
39190 .iterations(1)
39191 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39192 }
39193 }
39194 }
39195
39196 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_subtile_m) {
39197 TEST_REQUIRES_PSIMD;
39198 for (uint32_t m = 1; m <= 1; m++) {
39199 GemmMicrokernelTester()
39200 .mr(1)
39201 .nr(8)
39202 .kr(1)
39203 .sr(4)
39204 .m(m)
39205 .n(8)
39206 .k(4)
39207 .iterations(1)
39208 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39209 }
39210 }
39211
39212 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_subtile_n) {
39213 TEST_REQUIRES_PSIMD;
39214 for (uint32_t n = 1; n <= 8; n++) {
39215 GemmMicrokernelTester()
39216 .mr(1)
39217 .nr(8)
39218 .kr(1)
39219 .sr(4)
39220 .m(1)
39221 .n(n)
39222 .k(4)
39223 .iterations(1)
39224 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39225 }
39226 }
39227
39228 TEST(F32_GEMMINC_1X8S4__PSIMD, k_lt_4) {
39229 TEST_REQUIRES_PSIMD;
39230 for (size_t k = 1; k < 4; k++) {
39231 GemmMicrokernelTester()
39232 .mr(1)
39233 .nr(8)
39234 .kr(1)
39235 .sr(4)
39236 .m(1)
39237 .n(8)
39238 .k(k)
39239 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39240 }
39241 }
39242
39243 TEST(F32_GEMMINC_1X8S4__PSIMD, k_lt_4_strided_a) {
39244 TEST_REQUIRES_PSIMD;
39245 for (size_t k = 1; k < 4; k++) {
39246 GemmMicrokernelTester()
39247 .mr(1)
39248 .nr(8)
39249 .kr(1)
39250 .sr(4)
39251 .m(1)
39252 .n(8)
39253 .k(k)
39254 .a_stride(7)
39255 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39256 }
39257 }
39258
39259 TEST(F32_GEMMINC_1X8S4__PSIMD, k_lt_4_subtile) {
39260 TEST_REQUIRES_PSIMD;
39261 for (size_t k = 1; k < 4; k++) {
39262 for (uint32_t m = 1; m <= 1; m++) {
39263 for (uint32_t n = 1; n <= 8; n++) {
39264 GemmMicrokernelTester()
39265 .mr(1)
39266 .nr(8)
39267 .kr(1)
39268 .sr(4)
39269 .m(m)
39270 .n(n)
39271 .k(k)
39272 .iterations(1)
39273 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39274 }
39275 }
39276 }
39277 }
39278
39279 TEST(F32_GEMMINC_1X8S4__PSIMD, k_gt_4) {
39280 TEST_REQUIRES_PSIMD;
39281 for (size_t k = 5; k < 8; k++) {
39282 GemmMicrokernelTester()
39283 .mr(1)
39284 .nr(8)
39285 .kr(1)
39286 .sr(4)
39287 .m(1)
39288 .n(8)
39289 .k(k)
39290 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39291 }
39292 }
39293
39294 TEST(F32_GEMMINC_1X8S4__PSIMD, k_gt_4_strided_a) {
39295 TEST_REQUIRES_PSIMD;
39296 for (size_t k = 5; k < 8; k++) {
39297 GemmMicrokernelTester()
39298 .mr(1)
39299 .nr(8)
39300 .kr(1)
39301 .sr(4)
39302 .m(1)
39303 .n(8)
39304 .k(k)
39305 .a_stride(11)
39306 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39307 }
39308 }
39309
39310 TEST(F32_GEMMINC_1X8S4__PSIMD, k_gt_4_subtile) {
39311 TEST_REQUIRES_PSIMD;
39312 for (size_t k = 5; k < 8; k++) {
39313 for (uint32_t m = 1; m <= 1; m++) {
39314 for (uint32_t n = 1; n <= 8; n++) {
39315 GemmMicrokernelTester()
39316 .mr(1)
39317 .nr(8)
39318 .kr(1)
39319 .sr(4)
39320 .m(m)
39321 .n(n)
39322 .k(k)
39323 .iterations(1)
39324 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39325 }
39326 }
39327 }
39328 }
39329
39330 TEST(F32_GEMMINC_1X8S4__PSIMD, k_div_4) {
39331 TEST_REQUIRES_PSIMD;
39332 for (size_t k = 8; k <= 40; k += 4) {
39333 GemmMicrokernelTester()
39334 .mr(1)
39335 .nr(8)
39336 .kr(1)
39337 .sr(4)
39338 .m(1)
39339 .n(8)
39340 .k(k)
39341 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39342 }
39343 }
39344
39345 TEST(F32_GEMMINC_1X8S4__PSIMD, k_div_4_strided_a) {
39346 TEST_REQUIRES_PSIMD;
39347 for (size_t k = 8; k <= 40; k += 4) {
39348 GemmMicrokernelTester()
39349 .mr(1)
39350 .nr(8)
39351 .kr(1)
39352 .sr(4)
39353 .m(1)
39354 .n(8)
39355 .k(k)
39356 .a_stride(43)
39357 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39358 }
39359 }
39360
39361 TEST(F32_GEMMINC_1X8S4__PSIMD, k_div_4_subtile) {
39362 TEST_REQUIRES_PSIMD;
39363 for (size_t k = 8; k <= 40; k += 4) {
39364 for (uint32_t m = 1; m <= 1; m++) {
39365 for (uint32_t n = 1; n <= 8; n++) {
39366 GemmMicrokernelTester()
39367 .mr(1)
39368 .nr(8)
39369 .kr(1)
39370 .sr(4)
39371 .m(m)
39372 .n(n)
39373 .k(k)
39374 .iterations(1)
39375 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39376 }
39377 }
39378 }
39379 }
39380
39381 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8) {
39382 TEST_REQUIRES_PSIMD;
39383 for (uint32_t n = 9; n < 16; n++) {
39384 for (size_t k = 1; k <= 20; k += 5) {
39385 GemmMicrokernelTester()
39386 .mr(1)
39387 .nr(8)
39388 .kr(1)
39389 .sr(4)
39390 .m(1)
39391 .n(8)
39392 .k(k)
39393 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39394 }
39395 }
39396 }
39397
39398 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8_strided_cn) {
39399 TEST_REQUIRES_PSIMD;
39400 for (uint32_t n = 9; n < 16; n++) {
39401 for (size_t k = 1; k <= 20; k += 5) {
39402 GemmMicrokernelTester()
39403 .mr(1)
39404 .nr(8)
39405 .kr(1)
39406 .sr(4)
39407 .m(1)
39408 .n(8)
39409 .k(k)
39410 .cn_stride(11)
39411 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39412 }
39413 }
39414 }
39415
39416 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8_strided_a) {
39417 TEST_REQUIRES_PSIMD;
39418 for (uint32_t n = 9; n < 16; n++) {
39419 for (size_t k = 1; k <= 20; k += 5) {
39420 GemmMicrokernelTester()
39421 .mr(1)
39422 .nr(8)
39423 .kr(1)
39424 .sr(4)
39425 .m(1)
39426 .n(n)
39427 .k(k)
39428 .a_stride(23)
39429 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39430 }
39431 }
39432 }
39433
39434 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8_subtile) {
39435 TEST_REQUIRES_PSIMD;
39436 for (uint32_t n = 9; n < 16; n++) {
39437 for (size_t k = 1; k <= 20; k += 5) {
39438 for (uint32_t m = 1; m <= 1; m++) {
39439 GemmMicrokernelTester()
39440 .mr(1)
39441 .nr(8)
39442 .kr(1)
39443 .sr(4)
39444 .m(m)
39445 .n(n)
39446 .k(k)
39447 .iterations(1)
39448 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39449 }
39450 }
39451 }
39452 }
39453
39454 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8) {
39455 TEST_REQUIRES_PSIMD;
39456 for (uint32_t n = 16; n <= 24; n += 8) {
39457 for (size_t k = 1; k <= 20; k += 5) {
39458 GemmMicrokernelTester()
39459 .mr(1)
39460 .nr(8)
39461 .kr(1)
39462 .sr(4)
39463 .m(1)
39464 .n(8)
39465 .k(k)
39466 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39467 }
39468 }
39469 }
39470
39471 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8_strided_cn) {
39472 TEST_REQUIRES_PSIMD;
39473 for (uint32_t n = 16; n <= 24; n += 8) {
39474 for (size_t k = 1; k <= 20; k += 5) {
39475 GemmMicrokernelTester()
39476 .mr(1)
39477 .nr(8)
39478 .kr(1)
39479 .sr(4)
39480 .m(1)
39481 .n(n)
39482 .k(k)
39483 .cn_stride(11)
39484 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39485 }
39486 }
39487 }
39488
39489 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8_strided_a) {
39490 TEST_REQUIRES_PSIMD;
39491 for (uint32_t n = 16; n <= 24; n += 8) {
39492 for (size_t k = 1; k <= 20; k += 5) {
39493 GemmMicrokernelTester()
39494 .mr(1)
39495 .nr(8)
39496 .kr(1)
39497 .sr(4)
39498 .m(1)
39499 .n(n)
39500 .k(k)
39501 .a_stride(23)
39502 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39503 }
39504 }
39505 }
39506
39507 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8_subtile) {
39508 TEST_REQUIRES_PSIMD;
39509 for (uint32_t n = 16; n <= 24; n += 8) {
39510 for (size_t k = 1; k <= 20; k += 5) {
39511 for (uint32_t m = 1; m <= 1; m++) {
39512 GemmMicrokernelTester()
39513 .mr(1)
39514 .nr(8)
39515 .kr(1)
39516 .sr(4)
39517 .m(m)
39518 .n(n)
39519 .k(k)
39520 .iterations(1)
39521 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39522 }
39523 }
39524 }
39525 }
39526
39527 TEST(F32_GEMMINC_1X8S4__PSIMD, strided_cm_subtile) {
39528 TEST_REQUIRES_PSIMD;
39529 for (size_t k = 1; k <= 20; k += 5) {
39530 for (uint32_t m = 1; m <= 1; m++) {
39531 for (uint32_t n = 1; n <= 8; n++) {
39532 GemmMicrokernelTester()
39533 .mr(1)
39534 .nr(8)
39535 .kr(1)
39536 .sr(4)
39537 .m(m)
39538 .n(n)
39539 .k(k)
39540 .cm_stride(11)
39541 .iterations(1)
39542 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39543 }
39544 }
39545 }
39546 }
39547
39548 TEST(F32_GEMMINC_1X8S4__PSIMD, qmin) {
39549 TEST_REQUIRES_PSIMD;
39550 GemmMicrokernelTester()
39551 .mr(1)
39552 .nr(8)
39553 .kr(1)
39554 .sr(4)
39555 .m(1)
39556 .n(8)
39557 .k(4)
39558 .qmin(128)
39559 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39560 }
39561
39562 TEST(F32_GEMMINC_1X8S4__PSIMD, qmax) {
39563 TEST_REQUIRES_PSIMD;
39564 GemmMicrokernelTester()
39565 .mr(1)
39566 .nr(8)
39567 .kr(1)
39568 .sr(4)
39569 .m(1)
39570 .n(8)
39571 .k(4)
39572 .qmax(128)
39573 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39574 }
39575
39576 TEST(F32_GEMMINC_1X8S4__PSIMD, strided_cm) {
39577 TEST_REQUIRES_PSIMD;
39578 GemmMicrokernelTester()
39579 .mr(1)
39580 .nr(8)
39581 .kr(1)
39582 .sr(4)
39583 .m(1)
39584 .n(8)
39585 .k(4)
39586 .cm_stride(11)
39587 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39588 }
39589#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
39590
39591
39592#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
39593 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4) {
39594 TEST_REQUIRES_PSIMD;
39595 GemmMicrokernelTester()
39596 .mr(4)
39597 .nr(8)
39598 .kr(1)
39599 .sr(4)
39600 .m(4)
39601 .n(8)
39602 .k(4)
39603 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39604 }
39605
39606 TEST(F32_GEMMINC_4X8S4__PSIMD, strided_cn) {
39607 TEST_REQUIRES_PSIMD;
39608 GemmMicrokernelTester()
39609 .mr(4)
39610 .nr(8)
39611 .kr(1)
39612 .sr(4)
39613 .m(4)
39614 .n(8)
39615 .k(4)
39616 .cn_stride(11)
39617 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39618 }
39619
39620 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_strided_a) {
39621 TEST_REQUIRES_PSIMD;
39622 GemmMicrokernelTester()
39623 .mr(4)
39624 .nr(8)
39625 .kr(1)
39626 .sr(4)
39627 .m(4)
39628 .n(8)
39629 .k(4)
39630 .a_stride(7)
39631 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39632 }
39633
39634 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_subtile) {
39635 TEST_REQUIRES_PSIMD;
39636 for (uint32_t m = 1; m <= 4; m++) {
39637 for (uint32_t n = 1; n <= 8; n++) {
39638 GemmMicrokernelTester()
39639 .mr(4)
39640 .nr(8)
39641 .kr(1)
39642 .sr(4)
39643 .m(m)
39644 .n(n)
39645 .k(4)
39646 .iterations(1)
39647 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39648 }
39649 }
39650 }
39651
39652 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_subtile_m) {
39653 TEST_REQUIRES_PSIMD;
39654 for (uint32_t m = 1; m <= 4; m++) {
39655 GemmMicrokernelTester()
39656 .mr(4)
39657 .nr(8)
39658 .kr(1)
39659 .sr(4)
39660 .m(m)
39661 .n(8)
39662 .k(4)
39663 .iterations(1)
39664 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39665 }
39666 }
39667
39668 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_subtile_n) {
39669 TEST_REQUIRES_PSIMD;
39670 for (uint32_t n = 1; n <= 8; n++) {
39671 GemmMicrokernelTester()
39672 .mr(4)
39673 .nr(8)
39674 .kr(1)
39675 .sr(4)
39676 .m(4)
39677 .n(n)
39678 .k(4)
39679 .iterations(1)
39680 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39681 }
39682 }
39683
39684 TEST(F32_GEMMINC_4X8S4__PSIMD, k_lt_4) {
39685 TEST_REQUIRES_PSIMD;
39686 for (size_t k = 1; k < 4; k++) {
39687 GemmMicrokernelTester()
39688 .mr(4)
39689 .nr(8)
39690 .kr(1)
39691 .sr(4)
39692 .m(4)
39693 .n(8)
39694 .k(k)
39695 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39696 }
39697 }
39698
39699 TEST(F32_GEMMINC_4X8S4__PSIMD, k_lt_4_strided_a) {
39700 TEST_REQUIRES_PSIMD;
39701 for (size_t k = 1; k < 4; k++) {
39702 GemmMicrokernelTester()
39703 .mr(4)
39704 .nr(8)
39705 .kr(1)
39706 .sr(4)
39707 .m(4)
39708 .n(8)
39709 .k(k)
39710 .a_stride(7)
39711 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39712 }
39713 }
39714
39715 TEST(F32_GEMMINC_4X8S4__PSIMD, k_lt_4_subtile) {
39716 TEST_REQUIRES_PSIMD;
39717 for (size_t k = 1; k < 4; k++) {
39718 for (uint32_t m = 1; m <= 4; m++) {
39719 for (uint32_t n = 1; n <= 8; n++) {
39720 GemmMicrokernelTester()
39721 .mr(4)
39722 .nr(8)
39723 .kr(1)
39724 .sr(4)
39725 .m(m)
39726 .n(n)
39727 .k(k)
39728 .iterations(1)
39729 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39730 }
39731 }
39732 }
39733 }
39734
39735 TEST(F32_GEMMINC_4X8S4__PSIMD, k_gt_4) {
39736 TEST_REQUIRES_PSIMD;
39737 for (size_t k = 5; k < 8; k++) {
39738 GemmMicrokernelTester()
39739 .mr(4)
39740 .nr(8)
39741 .kr(1)
39742 .sr(4)
39743 .m(4)
39744 .n(8)
39745 .k(k)
39746 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39747 }
39748 }
39749
39750 TEST(F32_GEMMINC_4X8S4__PSIMD, k_gt_4_strided_a) {
39751 TEST_REQUIRES_PSIMD;
39752 for (size_t k = 5; k < 8; k++) {
39753 GemmMicrokernelTester()
39754 .mr(4)
39755 .nr(8)
39756 .kr(1)
39757 .sr(4)
39758 .m(4)
39759 .n(8)
39760 .k(k)
39761 .a_stride(11)
39762 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39763 }
39764 }
39765
39766 TEST(F32_GEMMINC_4X8S4__PSIMD, k_gt_4_subtile) {
39767 TEST_REQUIRES_PSIMD;
39768 for (size_t k = 5; k < 8; k++) {
39769 for (uint32_t m = 1; m <= 4; m++) {
39770 for (uint32_t n = 1; n <= 8; n++) {
39771 GemmMicrokernelTester()
39772 .mr(4)
39773 .nr(8)
39774 .kr(1)
39775 .sr(4)
39776 .m(m)
39777 .n(n)
39778 .k(k)
39779 .iterations(1)
39780 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39781 }
39782 }
39783 }
39784 }
39785
39786 TEST(F32_GEMMINC_4X8S4__PSIMD, k_div_4) {
39787 TEST_REQUIRES_PSIMD;
39788 for (size_t k = 8; k <= 40; k += 4) {
39789 GemmMicrokernelTester()
39790 .mr(4)
39791 .nr(8)
39792 .kr(1)
39793 .sr(4)
39794 .m(4)
39795 .n(8)
39796 .k(k)
39797 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39798 }
39799 }
39800
39801 TEST(F32_GEMMINC_4X8S4__PSIMD, k_div_4_strided_a) {
39802 TEST_REQUIRES_PSIMD;
39803 for (size_t k = 8; k <= 40; k += 4) {
39804 GemmMicrokernelTester()
39805 .mr(4)
39806 .nr(8)
39807 .kr(1)
39808 .sr(4)
39809 .m(4)
39810 .n(8)
39811 .k(k)
39812 .a_stride(43)
39813 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39814 }
39815 }
39816
39817 TEST(F32_GEMMINC_4X8S4__PSIMD, k_div_4_subtile) {
39818 TEST_REQUIRES_PSIMD;
39819 for (size_t k = 8; k <= 40; k += 4) {
39820 for (uint32_t m = 1; m <= 4; m++) {
39821 for (uint32_t n = 1; n <= 8; n++) {
39822 GemmMicrokernelTester()
39823 .mr(4)
39824 .nr(8)
39825 .kr(1)
39826 .sr(4)
39827 .m(m)
39828 .n(n)
39829 .k(k)
39830 .iterations(1)
39831 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39832 }
39833 }
39834 }
39835 }
39836
39837 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8) {
39838 TEST_REQUIRES_PSIMD;
39839 for (uint32_t n = 9; n < 16; n++) {
39840 for (size_t k = 1; k <= 20; k += 5) {
39841 GemmMicrokernelTester()
39842 .mr(4)
39843 .nr(8)
39844 .kr(1)
39845 .sr(4)
39846 .m(4)
39847 .n(8)
39848 .k(k)
39849 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39850 }
39851 }
39852 }
39853
39854 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8_strided_cn) {
39855 TEST_REQUIRES_PSIMD;
39856 for (uint32_t n = 9; n < 16; n++) {
39857 for (size_t k = 1; k <= 20; k += 5) {
39858 GemmMicrokernelTester()
39859 .mr(4)
39860 .nr(8)
39861 .kr(1)
39862 .sr(4)
39863 .m(4)
39864 .n(8)
39865 .k(k)
39866 .cn_stride(11)
39867 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39868 }
39869 }
39870 }
39871
39872 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8_strided_a) {
39873 TEST_REQUIRES_PSIMD;
39874 for (uint32_t n = 9; n < 16; n++) {
39875 for (size_t k = 1; k <= 20; k += 5) {
39876 GemmMicrokernelTester()
39877 .mr(4)
39878 .nr(8)
39879 .kr(1)
39880 .sr(4)
39881 .m(4)
39882 .n(n)
39883 .k(k)
39884 .a_stride(23)
39885 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39886 }
39887 }
39888 }
39889
39890 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8_subtile) {
39891 TEST_REQUIRES_PSIMD;
39892 for (uint32_t n = 9; n < 16; n++) {
39893 for (size_t k = 1; k <= 20; k += 5) {
39894 for (uint32_t m = 1; m <= 4; m++) {
39895 GemmMicrokernelTester()
39896 .mr(4)
39897 .nr(8)
39898 .kr(1)
39899 .sr(4)
39900 .m(m)
39901 .n(n)
39902 .k(k)
39903 .iterations(1)
39904 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39905 }
39906 }
39907 }
39908 }
39909
39910 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8) {
39911 TEST_REQUIRES_PSIMD;
39912 for (uint32_t n = 16; n <= 24; n += 8) {
39913 for (size_t k = 1; k <= 20; k += 5) {
39914 GemmMicrokernelTester()
39915 .mr(4)
39916 .nr(8)
39917 .kr(1)
39918 .sr(4)
39919 .m(4)
39920 .n(8)
39921 .k(k)
39922 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39923 }
39924 }
39925 }
39926
39927 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8_strided_cn) {
39928 TEST_REQUIRES_PSIMD;
39929 for (uint32_t n = 16; n <= 24; n += 8) {
39930 for (size_t k = 1; k <= 20; k += 5) {
39931 GemmMicrokernelTester()
39932 .mr(4)
39933 .nr(8)
39934 .kr(1)
39935 .sr(4)
39936 .m(4)
39937 .n(n)
39938 .k(k)
39939 .cn_stride(11)
39940 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39941 }
39942 }
39943 }
39944
39945 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8_strided_a) {
39946 TEST_REQUIRES_PSIMD;
39947 for (uint32_t n = 16; n <= 24; n += 8) {
39948 for (size_t k = 1; k <= 20; k += 5) {
39949 GemmMicrokernelTester()
39950 .mr(4)
39951 .nr(8)
39952 .kr(1)
39953 .sr(4)
39954 .m(4)
39955 .n(n)
39956 .k(k)
39957 .a_stride(23)
39958 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39959 }
39960 }
39961 }
39962
39963 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8_subtile) {
39964 TEST_REQUIRES_PSIMD;
39965 for (uint32_t n = 16; n <= 24; n += 8) {
39966 for (size_t k = 1; k <= 20; k += 5) {
39967 for (uint32_t m = 1; m <= 4; m++) {
39968 GemmMicrokernelTester()
39969 .mr(4)
39970 .nr(8)
39971 .kr(1)
39972 .sr(4)
39973 .m(m)
39974 .n(n)
39975 .k(k)
39976 .iterations(1)
39977 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39978 }
39979 }
39980 }
39981 }
39982
39983 TEST(F32_GEMMINC_4X8S4__PSIMD, strided_cm_subtile) {
39984 TEST_REQUIRES_PSIMD;
39985 for (size_t k = 1; k <= 20; k += 5) {
39986 for (uint32_t m = 1; m <= 4; m++) {
39987 for (uint32_t n = 1; n <= 8; n++) {
39988 GemmMicrokernelTester()
39989 .mr(4)
39990 .nr(8)
39991 .kr(1)
39992 .sr(4)
39993 .m(m)
39994 .n(n)
39995 .k(k)
39996 .cm_stride(11)
39997 .iterations(1)
39998 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
39999 }
40000 }
40001 }
40002 }
40003
40004 TEST(F32_GEMMINC_4X8S4__PSIMD, qmin) {
40005 TEST_REQUIRES_PSIMD;
40006 GemmMicrokernelTester()
40007 .mr(4)
40008 .nr(8)
40009 .kr(1)
40010 .sr(4)
40011 .m(4)
40012 .n(8)
40013 .k(4)
40014 .qmin(128)
40015 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40016 }
40017
40018 TEST(F32_GEMMINC_4X8S4__PSIMD, qmax) {
40019 TEST_REQUIRES_PSIMD;
40020 GemmMicrokernelTester()
40021 .mr(4)
40022 .nr(8)
40023 .kr(1)
40024 .sr(4)
40025 .m(4)
40026 .n(8)
40027 .k(4)
40028 .qmax(128)
40029 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40030 }
40031
40032 TEST(F32_GEMMINC_4X8S4__PSIMD, strided_cm) {
40033 TEST_REQUIRES_PSIMD;
40034 GemmMicrokernelTester()
40035 .mr(4)
40036 .nr(8)
40037 .kr(1)
40038 .sr(4)
40039 .m(4)
40040 .n(8)
40041 .k(4)
40042 .cm_stride(11)
40043 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40044 }
40045#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
40046
40047
40048#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
40049 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4) {
40050 TEST_REQUIRES_PSIMD;
40051 GemmMicrokernelTester()
40052 .mr(6)
40053 .nr(8)
40054 .kr(1)
40055 .sr(4)
40056 .m(6)
40057 .n(8)
40058 .k(4)
40059 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40060 }
40061
40062 TEST(F32_GEMMINC_6X8S4__PSIMD, strided_cn) {
40063 TEST_REQUIRES_PSIMD;
40064 GemmMicrokernelTester()
40065 .mr(6)
40066 .nr(8)
40067 .kr(1)
40068 .sr(4)
40069 .m(6)
40070 .n(8)
40071 .k(4)
40072 .cn_stride(11)
40073 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40074 }
40075
40076 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_strided_a) {
40077 TEST_REQUIRES_PSIMD;
40078 GemmMicrokernelTester()
40079 .mr(6)
40080 .nr(8)
40081 .kr(1)
40082 .sr(4)
40083 .m(6)
40084 .n(8)
40085 .k(4)
40086 .a_stride(7)
40087 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40088 }
40089
40090 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_subtile) {
40091 TEST_REQUIRES_PSIMD;
40092 for (uint32_t m = 1; m <= 6; m++) {
40093 for (uint32_t n = 1; n <= 8; n++) {
40094 GemmMicrokernelTester()
40095 .mr(6)
40096 .nr(8)
40097 .kr(1)
40098 .sr(4)
40099 .m(m)
40100 .n(n)
40101 .k(4)
40102 .iterations(1)
40103 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40104 }
40105 }
40106 }
40107
40108 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_subtile_m) {
40109 TEST_REQUIRES_PSIMD;
40110 for (uint32_t m = 1; m <= 6; m++) {
40111 GemmMicrokernelTester()
40112 .mr(6)
40113 .nr(8)
40114 .kr(1)
40115 .sr(4)
40116 .m(m)
40117 .n(8)
40118 .k(4)
40119 .iterations(1)
40120 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40121 }
40122 }
40123
40124 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_subtile_n) {
40125 TEST_REQUIRES_PSIMD;
40126 for (uint32_t n = 1; n <= 8; n++) {
40127 GemmMicrokernelTester()
40128 .mr(6)
40129 .nr(8)
40130 .kr(1)
40131 .sr(4)
40132 .m(6)
40133 .n(n)
40134 .k(4)
40135 .iterations(1)
40136 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40137 }
40138 }
40139
40140 TEST(F32_GEMMINC_6X8S4__PSIMD, k_lt_4) {
40141 TEST_REQUIRES_PSIMD;
40142 for (size_t k = 1; k < 4; k++) {
40143 GemmMicrokernelTester()
40144 .mr(6)
40145 .nr(8)
40146 .kr(1)
40147 .sr(4)
40148 .m(6)
40149 .n(8)
40150 .k(k)
40151 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40152 }
40153 }
40154
40155 TEST(F32_GEMMINC_6X8S4__PSIMD, k_lt_4_strided_a) {
40156 TEST_REQUIRES_PSIMD;
40157 for (size_t k = 1; k < 4; k++) {
40158 GemmMicrokernelTester()
40159 .mr(6)
40160 .nr(8)
40161 .kr(1)
40162 .sr(4)
40163 .m(6)
40164 .n(8)
40165 .k(k)
40166 .a_stride(7)
40167 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40168 }
40169 }
40170
40171 TEST(F32_GEMMINC_6X8S4__PSIMD, k_lt_4_subtile) {
40172 TEST_REQUIRES_PSIMD;
40173 for (size_t k = 1; k < 4; k++) {
40174 for (uint32_t m = 1; m <= 6; m++) {
40175 for (uint32_t n = 1; n <= 8; n++) {
40176 GemmMicrokernelTester()
40177 .mr(6)
40178 .nr(8)
40179 .kr(1)
40180 .sr(4)
40181 .m(m)
40182 .n(n)
40183 .k(k)
40184 .iterations(1)
40185 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40186 }
40187 }
40188 }
40189 }
40190
40191 TEST(F32_GEMMINC_6X8S4__PSIMD, k_gt_4) {
40192 TEST_REQUIRES_PSIMD;
40193 for (size_t k = 5; k < 8; k++) {
40194 GemmMicrokernelTester()
40195 .mr(6)
40196 .nr(8)
40197 .kr(1)
40198 .sr(4)
40199 .m(6)
40200 .n(8)
40201 .k(k)
40202 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40203 }
40204 }
40205
40206 TEST(F32_GEMMINC_6X8S4__PSIMD, k_gt_4_strided_a) {
40207 TEST_REQUIRES_PSIMD;
40208 for (size_t k = 5; k < 8; k++) {
40209 GemmMicrokernelTester()
40210 .mr(6)
40211 .nr(8)
40212 .kr(1)
40213 .sr(4)
40214 .m(6)
40215 .n(8)
40216 .k(k)
40217 .a_stride(11)
40218 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40219 }
40220 }
40221
40222 TEST(F32_GEMMINC_6X8S4__PSIMD, k_gt_4_subtile) {
40223 TEST_REQUIRES_PSIMD;
40224 for (size_t k = 5; k < 8; k++) {
40225 for (uint32_t m = 1; m <= 6; m++) {
40226 for (uint32_t n = 1; n <= 8; n++) {
40227 GemmMicrokernelTester()
40228 .mr(6)
40229 .nr(8)
40230 .kr(1)
40231 .sr(4)
40232 .m(m)
40233 .n(n)
40234 .k(k)
40235 .iterations(1)
40236 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40237 }
40238 }
40239 }
40240 }
40241
40242 TEST(F32_GEMMINC_6X8S4__PSIMD, k_div_4) {
40243 TEST_REQUIRES_PSIMD;
40244 for (size_t k = 8; k <= 40; k += 4) {
40245 GemmMicrokernelTester()
40246 .mr(6)
40247 .nr(8)
40248 .kr(1)
40249 .sr(4)
40250 .m(6)
40251 .n(8)
40252 .k(k)
40253 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40254 }
40255 }
40256
40257 TEST(F32_GEMMINC_6X8S4__PSIMD, k_div_4_strided_a) {
40258 TEST_REQUIRES_PSIMD;
40259 for (size_t k = 8; k <= 40; k += 4) {
40260 GemmMicrokernelTester()
40261 .mr(6)
40262 .nr(8)
40263 .kr(1)
40264 .sr(4)
40265 .m(6)
40266 .n(8)
40267 .k(k)
40268 .a_stride(43)
40269 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40270 }
40271 }
40272
40273 TEST(F32_GEMMINC_6X8S4__PSIMD, k_div_4_subtile) {
40274 TEST_REQUIRES_PSIMD;
40275 for (size_t k = 8; k <= 40; k += 4) {
40276 for (uint32_t m = 1; m <= 6; m++) {
40277 for (uint32_t n = 1; n <= 8; n++) {
40278 GemmMicrokernelTester()
40279 .mr(6)
40280 .nr(8)
40281 .kr(1)
40282 .sr(4)
40283 .m(m)
40284 .n(n)
40285 .k(k)
40286 .iterations(1)
40287 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40288 }
40289 }
40290 }
40291 }
40292
40293 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8) {
40294 TEST_REQUIRES_PSIMD;
40295 for (uint32_t n = 9; n < 16; n++) {
40296 for (size_t k = 1; k <= 20; k += 5) {
40297 GemmMicrokernelTester()
40298 .mr(6)
40299 .nr(8)
40300 .kr(1)
40301 .sr(4)
40302 .m(6)
40303 .n(8)
40304 .k(k)
40305 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40306 }
40307 }
40308 }
40309
40310 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8_strided_cn) {
40311 TEST_REQUIRES_PSIMD;
40312 for (uint32_t n = 9; n < 16; n++) {
40313 for (size_t k = 1; k <= 20; k += 5) {
40314 GemmMicrokernelTester()
40315 .mr(6)
40316 .nr(8)
40317 .kr(1)
40318 .sr(4)
40319 .m(6)
40320 .n(8)
40321 .k(k)
40322 .cn_stride(11)
40323 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40324 }
40325 }
40326 }
40327
40328 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8_strided_a) {
40329 TEST_REQUIRES_PSIMD;
40330 for (uint32_t n = 9; n < 16; n++) {
40331 for (size_t k = 1; k <= 20; k += 5) {
40332 GemmMicrokernelTester()
40333 .mr(6)
40334 .nr(8)
40335 .kr(1)
40336 .sr(4)
40337 .m(6)
40338 .n(n)
40339 .k(k)
40340 .a_stride(23)
40341 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40342 }
40343 }
40344 }
40345
40346 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8_subtile) {
40347 TEST_REQUIRES_PSIMD;
40348 for (uint32_t n = 9; n < 16; n++) {
40349 for (size_t k = 1; k <= 20; k += 5) {
40350 for (uint32_t m = 1; m <= 6; m++) {
40351 GemmMicrokernelTester()
40352 .mr(6)
40353 .nr(8)
40354 .kr(1)
40355 .sr(4)
40356 .m(m)
40357 .n(n)
40358 .k(k)
40359 .iterations(1)
40360 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40361 }
40362 }
40363 }
40364 }
40365
40366 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8) {
40367 TEST_REQUIRES_PSIMD;
40368 for (uint32_t n = 16; n <= 24; n += 8) {
40369 for (size_t k = 1; k <= 20; k += 5) {
40370 GemmMicrokernelTester()
40371 .mr(6)
40372 .nr(8)
40373 .kr(1)
40374 .sr(4)
40375 .m(6)
40376 .n(8)
40377 .k(k)
40378 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40379 }
40380 }
40381 }
40382
40383 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8_strided_cn) {
40384 TEST_REQUIRES_PSIMD;
40385 for (uint32_t n = 16; n <= 24; n += 8) {
40386 for (size_t k = 1; k <= 20; k += 5) {
40387 GemmMicrokernelTester()
40388 .mr(6)
40389 .nr(8)
40390 .kr(1)
40391 .sr(4)
40392 .m(6)
40393 .n(n)
40394 .k(k)
40395 .cn_stride(11)
40396 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40397 }
40398 }
40399 }
40400
40401 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8_strided_a) {
40402 TEST_REQUIRES_PSIMD;
40403 for (uint32_t n = 16; n <= 24; n += 8) {
40404 for (size_t k = 1; k <= 20; k += 5) {
40405 GemmMicrokernelTester()
40406 .mr(6)
40407 .nr(8)
40408 .kr(1)
40409 .sr(4)
40410 .m(6)
40411 .n(n)
40412 .k(k)
40413 .a_stride(23)
40414 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40415 }
40416 }
40417 }
40418
40419 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8_subtile) {
40420 TEST_REQUIRES_PSIMD;
40421 for (uint32_t n = 16; n <= 24; n += 8) {
40422 for (size_t k = 1; k <= 20; k += 5) {
40423 for (uint32_t m = 1; m <= 6; m++) {
40424 GemmMicrokernelTester()
40425 .mr(6)
40426 .nr(8)
40427 .kr(1)
40428 .sr(4)
40429 .m(m)
40430 .n(n)
40431 .k(k)
40432 .iterations(1)
40433 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40434 }
40435 }
40436 }
40437 }
40438
40439 TEST(F32_GEMMINC_6X8S4__PSIMD, strided_cm_subtile) {
40440 TEST_REQUIRES_PSIMD;
40441 for (size_t k = 1; k <= 20; k += 5) {
40442 for (uint32_t m = 1; m <= 6; m++) {
40443 for (uint32_t n = 1; n <= 8; n++) {
40444 GemmMicrokernelTester()
40445 .mr(6)
40446 .nr(8)
40447 .kr(1)
40448 .sr(4)
40449 .m(m)
40450 .n(n)
40451 .k(k)
40452 .cm_stride(11)
40453 .iterations(1)
40454 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40455 }
40456 }
40457 }
40458 }
40459
40460 TEST(F32_GEMMINC_6X8S4__PSIMD, qmin) {
40461 TEST_REQUIRES_PSIMD;
40462 GemmMicrokernelTester()
40463 .mr(6)
40464 .nr(8)
40465 .kr(1)
40466 .sr(4)
40467 .m(6)
40468 .n(8)
40469 .k(4)
40470 .qmin(128)
40471 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40472 }
40473
40474 TEST(F32_GEMMINC_6X8S4__PSIMD, qmax) {
40475 TEST_REQUIRES_PSIMD;
40476 GemmMicrokernelTester()
40477 .mr(6)
40478 .nr(8)
40479 .kr(1)
40480 .sr(4)
40481 .m(6)
40482 .n(8)
40483 .k(4)
40484 .qmax(128)
40485 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40486 }
40487
40488 TEST(F32_GEMMINC_6X8S4__PSIMD, strided_cm) {
40489 TEST_REQUIRES_PSIMD;
40490 GemmMicrokernelTester()
40491 .mr(6)
40492 .nr(8)
40493 .kr(1)
40494 .sr(4)
40495 .m(6)
40496 .n(8)
40497 .k(4)
40498 .cm_stride(11)
40499 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40500 }
40501#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
40502
40503
40504#if XNN_ARCH_WASM
40505 TEST(F32_GEMMINC_1X4__WASM, k_eq_1) {
40506 GemmMicrokernelTester()
40507 .mr(1)
40508 .nr(4)
40509 .kr(1)
40510 .sr(1)
40511 .m(1)
40512 .n(4)
40513 .k(1)
40514 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40515 }
40516
40517 TEST(F32_GEMMINC_1X4__WASM, strided_cn) {
40518 GemmMicrokernelTester()
40519 .mr(1)
40520 .nr(4)
40521 .kr(1)
40522 .sr(1)
40523 .m(1)
40524 .n(4)
40525 .k(1)
40526 .cn_stride(7)
40527 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40528 }
40529
40530 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_strided_a) {
40531 GemmMicrokernelTester()
40532 .mr(1)
40533 .nr(4)
40534 .kr(1)
40535 .sr(1)
40536 .m(1)
40537 .n(4)
40538 .k(1)
40539 .a_stride(3)
40540 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40541 }
40542
40543 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_subtile) {
40544 for (uint32_t m = 1; m <= 1; m++) {
40545 for (uint32_t n = 1; n <= 4; n++) {
40546 GemmMicrokernelTester()
40547 .mr(1)
40548 .nr(4)
40549 .kr(1)
40550 .sr(1)
40551 .m(m)
40552 .n(n)
40553 .k(1)
40554 .iterations(1)
40555 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40556 }
40557 }
40558 }
40559
40560 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_subtile_m) {
40561 for (uint32_t m = 1; m <= 1; m++) {
40562 GemmMicrokernelTester()
40563 .mr(1)
40564 .nr(4)
40565 .kr(1)
40566 .sr(1)
40567 .m(m)
40568 .n(4)
40569 .k(1)
40570 .iterations(1)
40571 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40572 }
40573 }
40574
40575 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_subtile_n) {
40576 for (uint32_t n = 1; n <= 4; n++) {
40577 GemmMicrokernelTester()
40578 .mr(1)
40579 .nr(4)
40580 .kr(1)
40581 .sr(1)
40582 .m(1)
40583 .n(n)
40584 .k(1)
40585 .iterations(1)
40586 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40587 }
40588 }
40589
40590 TEST(F32_GEMMINC_1X4__WASM, k_gt_1) {
40591 for (size_t k = 2; k < 10; k++) {
40592 GemmMicrokernelTester()
40593 .mr(1)
40594 .nr(4)
40595 .kr(1)
40596 .sr(1)
40597 .m(1)
40598 .n(4)
40599 .k(k)
40600 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40601 }
40602 }
40603
40604 TEST(F32_GEMMINC_1X4__WASM, k_gt_1_strided_a) {
40605 for (size_t k = 2; k < 10; k++) {
40606 GemmMicrokernelTester()
40607 .mr(1)
40608 .nr(4)
40609 .kr(1)
40610 .sr(1)
40611 .m(1)
40612 .n(4)
40613 .k(k)
40614 .a_stride(11)
40615 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40616 }
40617 }
40618
40619 TEST(F32_GEMMINC_1X4__WASM, k_gt_1_subtile) {
40620 for (size_t k = 2; k < 10; k++) {
40621 for (uint32_t m = 1; m <= 1; m++) {
40622 for (uint32_t n = 1; n <= 4; n++) {
40623 GemmMicrokernelTester()
40624 .mr(1)
40625 .nr(4)
40626 .kr(1)
40627 .sr(1)
40628 .m(m)
40629 .n(n)
40630 .k(k)
40631 .iterations(1)
40632 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40633 }
40634 }
40635 }
40636 }
40637
40638 TEST(F32_GEMMINC_1X4__WASM, n_gt_4) {
40639 for (uint32_t n = 5; n < 8; n++) {
40640 for (size_t k = 1; k <= 5; k += 2) {
40641 GemmMicrokernelTester()
40642 .mr(1)
40643 .nr(4)
40644 .kr(1)
40645 .sr(1)
40646 .m(1)
40647 .n(4)
40648 .k(k)
40649 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40650 }
40651 }
40652 }
40653
40654 TEST(F32_GEMMINC_1X4__WASM, n_gt_4_strided_cn) {
40655 for (uint32_t n = 5; n < 8; n++) {
40656 for (size_t k = 1; k <= 5; k += 2) {
40657 GemmMicrokernelTester()
40658 .mr(1)
40659 .nr(4)
40660 .kr(1)
40661 .sr(1)
40662 .m(1)
40663 .n(4)
40664 .k(k)
40665 .cn_stride(7)
40666 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40667 }
40668 }
40669 }
40670
40671 TEST(F32_GEMMINC_1X4__WASM, n_gt_4_strided_a) {
40672 for (uint32_t n = 5; n < 8; n++) {
40673 for (size_t k = 1; k <= 5; k += 2) {
40674 GemmMicrokernelTester()
40675 .mr(1)
40676 .nr(4)
40677 .kr(1)
40678 .sr(1)
40679 .m(1)
40680 .n(n)
40681 .k(k)
40682 .a_stride(7)
40683 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40684 }
40685 }
40686 }
40687
40688 TEST(F32_GEMMINC_1X4__WASM, n_gt_4_subtile) {
40689 for (uint32_t n = 5; n < 8; n++) {
40690 for (size_t k = 1; k <= 5; k += 2) {
40691 for (uint32_t m = 1; m <= 1; m++) {
40692 GemmMicrokernelTester()
40693 .mr(1)
40694 .nr(4)
40695 .kr(1)
40696 .sr(1)
40697 .m(m)
40698 .n(n)
40699 .k(k)
40700 .iterations(1)
40701 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40702 }
40703 }
40704 }
40705 }
40706
40707 TEST(F32_GEMMINC_1X4__WASM, n_div_4) {
40708 for (uint32_t n = 8; n <= 12; n += 4) {
40709 for (size_t k = 1; k <= 5; k += 2) {
40710 GemmMicrokernelTester()
40711 .mr(1)
40712 .nr(4)
40713 .kr(1)
40714 .sr(1)
40715 .m(1)
40716 .n(4)
40717 .k(k)
40718 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40719 }
40720 }
40721 }
40722
40723 TEST(F32_GEMMINC_1X4__WASM, n_div_4_strided_cn) {
40724 for (uint32_t n = 8; n <= 12; n += 4) {
40725 for (size_t k = 1; k <= 5; k += 2) {
40726 GemmMicrokernelTester()
40727 .mr(1)
40728 .nr(4)
40729 .kr(1)
40730 .sr(1)
40731 .m(1)
40732 .n(n)
40733 .k(k)
40734 .cn_stride(7)
40735 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40736 }
40737 }
40738 }
40739
40740 TEST(F32_GEMMINC_1X4__WASM, n_div_4_strided_a) {
40741 for (uint32_t n = 8; n <= 12; n += 4) {
40742 for (size_t k = 1; k <= 5; k += 2) {
40743 GemmMicrokernelTester()
40744 .mr(1)
40745 .nr(4)
40746 .kr(1)
40747 .sr(1)
40748 .m(1)
40749 .n(n)
40750 .k(k)
40751 .a_stride(7)
40752 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40753 }
40754 }
40755 }
40756
40757 TEST(F32_GEMMINC_1X4__WASM, n_div_4_subtile) {
40758 for (uint32_t n = 8; n <= 12; n += 4) {
40759 for (size_t k = 1; k <= 5; k += 2) {
40760 for (uint32_t m = 1; m <= 1; m++) {
40761 GemmMicrokernelTester()
40762 .mr(1)
40763 .nr(4)
40764 .kr(1)
40765 .sr(1)
40766 .m(m)
40767 .n(n)
40768 .k(k)
40769 .iterations(1)
40770 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40771 }
40772 }
40773 }
40774 }
40775
40776 TEST(F32_GEMMINC_1X4__WASM, strided_cm_subtile) {
40777 for (size_t k = 1; k <= 5; k += 2) {
40778 for (uint32_t m = 1; m <= 1; m++) {
40779 for (uint32_t n = 1; n <= 4; n++) {
40780 GemmMicrokernelTester()
40781 .mr(1)
40782 .nr(4)
40783 .kr(1)
40784 .sr(1)
40785 .m(m)
40786 .n(n)
40787 .k(k)
40788 .cm_stride(7)
40789 .iterations(1)
40790 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40791 }
40792 }
40793 }
40794 }
40795
40796 TEST(F32_GEMMINC_1X4__WASM, qmin) {
40797 GemmMicrokernelTester()
40798 .mr(1)
40799 .nr(4)
40800 .kr(1)
40801 .sr(1)
40802 .m(1)
40803 .n(4)
40804 .k(1)
40805 .qmin(128)
40806 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40807 }
40808
40809 TEST(F32_GEMMINC_1X4__WASM, qmax) {
40810 GemmMicrokernelTester()
40811 .mr(1)
40812 .nr(4)
40813 .kr(1)
40814 .sr(1)
40815 .m(1)
40816 .n(4)
40817 .k(1)
40818 .qmax(128)
40819 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40820 }
40821
40822 TEST(F32_GEMMINC_1X4__WASM, strided_cm) {
40823 GemmMicrokernelTester()
40824 .mr(1)
40825 .nr(4)
40826 .kr(1)
40827 .sr(1)
40828 .m(1)
40829 .n(4)
40830 .k(1)
40831 .cm_stride(7)
40832 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40833 }
40834#endif // XNN_ARCH_WASM
40835
40836
40837#if XNN_ARCH_WASM
40838 TEST(F32_GEMMINC_2X4__WASM, k_eq_1) {
40839 GemmMicrokernelTester()
40840 .mr(2)
40841 .nr(4)
40842 .kr(1)
40843 .sr(1)
40844 .m(2)
40845 .n(4)
40846 .k(1)
40847 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40848 }
40849
40850 TEST(F32_GEMMINC_2X4__WASM, strided_cn) {
40851 GemmMicrokernelTester()
40852 .mr(2)
40853 .nr(4)
40854 .kr(1)
40855 .sr(1)
40856 .m(2)
40857 .n(4)
40858 .k(1)
40859 .cn_stride(7)
40860 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40861 }
40862
40863 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_strided_a) {
40864 GemmMicrokernelTester()
40865 .mr(2)
40866 .nr(4)
40867 .kr(1)
40868 .sr(1)
40869 .m(2)
40870 .n(4)
40871 .k(1)
40872 .a_stride(3)
40873 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40874 }
40875
40876 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_subtile) {
40877 for (uint32_t m = 1; m <= 2; m++) {
40878 for (uint32_t n = 1; n <= 4; n++) {
40879 GemmMicrokernelTester()
40880 .mr(2)
40881 .nr(4)
40882 .kr(1)
40883 .sr(1)
40884 .m(m)
40885 .n(n)
40886 .k(1)
40887 .iterations(1)
40888 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40889 }
40890 }
40891 }
40892
40893 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_subtile_m) {
40894 for (uint32_t m = 1; m <= 2; m++) {
40895 GemmMicrokernelTester()
40896 .mr(2)
40897 .nr(4)
40898 .kr(1)
40899 .sr(1)
40900 .m(m)
40901 .n(4)
40902 .k(1)
40903 .iterations(1)
40904 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40905 }
40906 }
40907
40908 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_subtile_n) {
40909 for (uint32_t n = 1; n <= 4; n++) {
40910 GemmMicrokernelTester()
40911 .mr(2)
40912 .nr(4)
40913 .kr(1)
40914 .sr(1)
40915 .m(2)
40916 .n(n)
40917 .k(1)
40918 .iterations(1)
40919 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40920 }
40921 }
40922
40923 TEST(F32_GEMMINC_2X4__WASM, k_gt_1) {
40924 for (size_t k = 2; k < 10; k++) {
40925 GemmMicrokernelTester()
40926 .mr(2)
40927 .nr(4)
40928 .kr(1)
40929 .sr(1)
40930 .m(2)
40931 .n(4)
40932 .k(k)
40933 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40934 }
40935 }
40936
40937 TEST(F32_GEMMINC_2X4__WASM, k_gt_1_strided_a) {
40938 for (size_t k = 2; k < 10; k++) {
40939 GemmMicrokernelTester()
40940 .mr(2)
40941 .nr(4)
40942 .kr(1)
40943 .sr(1)
40944 .m(2)
40945 .n(4)
40946 .k(k)
40947 .a_stride(11)
40948 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40949 }
40950 }
40951
40952 TEST(F32_GEMMINC_2X4__WASM, k_gt_1_subtile) {
40953 for (size_t k = 2; k < 10; k++) {
40954 for (uint32_t m = 1; m <= 2; m++) {
40955 for (uint32_t n = 1; n <= 4; n++) {
40956 GemmMicrokernelTester()
40957 .mr(2)
40958 .nr(4)
40959 .kr(1)
40960 .sr(1)
40961 .m(m)
40962 .n(n)
40963 .k(k)
40964 .iterations(1)
40965 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40966 }
40967 }
40968 }
40969 }
40970
40971 TEST(F32_GEMMINC_2X4__WASM, n_gt_4) {
40972 for (uint32_t n = 5; n < 8; n++) {
40973 for (size_t k = 1; k <= 5; k += 2) {
40974 GemmMicrokernelTester()
40975 .mr(2)
40976 .nr(4)
40977 .kr(1)
40978 .sr(1)
40979 .m(2)
40980 .n(4)
40981 .k(k)
40982 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
40983 }
40984 }
40985 }
40986
40987 TEST(F32_GEMMINC_2X4__WASM, n_gt_4_strided_cn) {
40988 for (uint32_t n = 5; n < 8; n++) {
40989 for (size_t k = 1; k <= 5; k += 2) {
40990 GemmMicrokernelTester()
40991 .mr(2)
40992 .nr(4)
40993 .kr(1)
40994 .sr(1)
40995 .m(2)
40996 .n(4)
40997 .k(k)
40998 .cn_stride(7)
40999 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41000 }
41001 }
41002 }
41003
41004 TEST(F32_GEMMINC_2X4__WASM, n_gt_4_strided_a) {
41005 for (uint32_t n = 5; n < 8; n++) {
41006 for (size_t k = 1; k <= 5; k += 2) {
41007 GemmMicrokernelTester()
41008 .mr(2)
41009 .nr(4)
41010 .kr(1)
41011 .sr(1)
41012 .m(2)
41013 .n(n)
41014 .k(k)
41015 .a_stride(7)
41016 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41017 }
41018 }
41019 }
41020
41021 TEST(F32_GEMMINC_2X4__WASM, n_gt_4_subtile) {
41022 for (uint32_t n = 5; n < 8; n++) {
41023 for (size_t k = 1; k <= 5; k += 2) {
41024 for (uint32_t m = 1; m <= 2; m++) {
41025 GemmMicrokernelTester()
41026 .mr(2)
41027 .nr(4)
41028 .kr(1)
41029 .sr(1)
41030 .m(m)
41031 .n(n)
41032 .k(k)
41033 .iterations(1)
41034 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41035 }
41036 }
41037 }
41038 }
41039
41040 TEST(F32_GEMMINC_2X4__WASM, n_div_4) {
41041 for (uint32_t n = 8; n <= 12; n += 4) {
41042 for (size_t k = 1; k <= 5; k += 2) {
41043 GemmMicrokernelTester()
41044 .mr(2)
41045 .nr(4)
41046 .kr(1)
41047 .sr(1)
41048 .m(2)
41049 .n(4)
41050 .k(k)
41051 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41052 }
41053 }
41054 }
41055
41056 TEST(F32_GEMMINC_2X4__WASM, n_div_4_strided_cn) {
41057 for (uint32_t n = 8; n <= 12; n += 4) {
41058 for (size_t k = 1; k <= 5; k += 2) {
41059 GemmMicrokernelTester()
41060 .mr(2)
41061 .nr(4)
41062 .kr(1)
41063 .sr(1)
41064 .m(2)
41065 .n(n)
41066 .k(k)
41067 .cn_stride(7)
41068 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41069 }
41070 }
41071 }
41072
41073 TEST(F32_GEMMINC_2X4__WASM, n_div_4_strided_a) {
41074 for (uint32_t n = 8; n <= 12; n += 4) {
41075 for (size_t k = 1; k <= 5; k += 2) {
41076 GemmMicrokernelTester()
41077 .mr(2)
41078 .nr(4)
41079 .kr(1)
41080 .sr(1)
41081 .m(2)
41082 .n(n)
41083 .k(k)
41084 .a_stride(7)
41085 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41086 }
41087 }
41088 }
41089
41090 TEST(F32_GEMMINC_2X4__WASM, n_div_4_subtile) {
41091 for (uint32_t n = 8; n <= 12; n += 4) {
41092 for (size_t k = 1; k <= 5; k += 2) {
41093 for (uint32_t m = 1; m <= 2; m++) {
41094 GemmMicrokernelTester()
41095 .mr(2)
41096 .nr(4)
41097 .kr(1)
41098 .sr(1)
41099 .m(m)
41100 .n(n)
41101 .k(k)
41102 .iterations(1)
41103 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41104 }
41105 }
41106 }
41107 }
41108
41109 TEST(F32_GEMMINC_2X4__WASM, strided_cm_subtile) {
41110 for (size_t k = 1; k <= 5; k += 2) {
41111 for (uint32_t m = 1; m <= 2; m++) {
41112 for (uint32_t n = 1; n <= 4; n++) {
41113 GemmMicrokernelTester()
41114 .mr(2)
41115 .nr(4)
41116 .kr(1)
41117 .sr(1)
41118 .m(m)
41119 .n(n)
41120 .k(k)
41121 .cm_stride(7)
41122 .iterations(1)
41123 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41124 }
41125 }
41126 }
41127 }
41128
41129 TEST(F32_GEMMINC_2X4__WASM, qmin) {
41130 GemmMicrokernelTester()
41131 .mr(2)
41132 .nr(4)
41133 .kr(1)
41134 .sr(1)
41135 .m(2)
41136 .n(4)
41137 .k(1)
41138 .qmin(128)
41139 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41140 }
41141
41142 TEST(F32_GEMMINC_2X4__WASM, qmax) {
41143 GemmMicrokernelTester()
41144 .mr(2)
41145 .nr(4)
41146 .kr(1)
41147 .sr(1)
41148 .m(2)
41149 .n(4)
41150 .k(1)
41151 .qmax(128)
41152 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41153 }
41154
41155 TEST(F32_GEMMINC_2X4__WASM, strided_cm) {
41156 GemmMicrokernelTester()
41157 .mr(2)
41158 .nr(4)
41159 .kr(1)
41160 .sr(1)
41161 .m(2)
41162 .n(4)
41163 .k(1)
41164 .cm_stride(7)
41165 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41166 }
41167#endif // XNN_ARCH_WASM
41168
41169
41170#if XNN_ARCH_WASM
41171 TEST(F32_GEMMINC_4X4__WASM, k_eq_1) {
41172 GemmMicrokernelTester()
41173 .mr(4)
41174 .nr(4)
41175 .kr(1)
41176 .sr(1)
41177 .m(4)
41178 .n(4)
41179 .k(1)
41180 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41181 }
41182
41183 TEST(F32_GEMMINC_4X4__WASM, strided_cn) {
41184 GemmMicrokernelTester()
41185 .mr(4)
41186 .nr(4)
41187 .kr(1)
41188 .sr(1)
41189 .m(4)
41190 .n(4)
41191 .k(1)
41192 .cn_stride(7)
41193 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41194 }
41195
41196 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_strided_a) {
41197 GemmMicrokernelTester()
41198 .mr(4)
41199 .nr(4)
41200 .kr(1)
41201 .sr(1)
41202 .m(4)
41203 .n(4)
41204 .k(1)
41205 .a_stride(3)
41206 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41207 }
41208
41209 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_subtile) {
41210 for (uint32_t m = 1; m <= 4; m++) {
41211 for (uint32_t n = 1; n <= 4; n++) {
41212 GemmMicrokernelTester()
41213 .mr(4)
41214 .nr(4)
41215 .kr(1)
41216 .sr(1)
41217 .m(m)
41218 .n(n)
41219 .k(1)
41220 .iterations(1)
41221 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41222 }
41223 }
41224 }
41225
41226 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_subtile_m) {
41227 for (uint32_t m = 1; m <= 4; m++) {
41228 GemmMicrokernelTester()
41229 .mr(4)
41230 .nr(4)
41231 .kr(1)
41232 .sr(1)
41233 .m(m)
41234 .n(4)
41235 .k(1)
41236 .iterations(1)
41237 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41238 }
41239 }
41240
41241 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_subtile_n) {
41242 for (uint32_t n = 1; n <= 4; n++) {
41243 GemmMicrokernelTester()
41244 .mr(4)
41245 .nr(4)
41246 .kr(1)
41247 .sr(1)
41248 .m(4)
41249 .n(n)
41250 .k(1)
41251 .iterations(1)
41252 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41253 }
41254 }
41255
41256 TEST(F32_GEMMINC_4X4__WASM, k_gt_1) {
41257 for (size_t k = 2; k < 10; k++) {
41258 GemmMicrokernelTester()
41259 .mr(4)
41260 .nr(4)
41261 .kr(1)
41262 .sr(1)
41263 .m(4)
41264 .n(4)
41265 .k(k)
41266 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41267 }
41268 }
41269
41270 TEST(F32_GEMMINC_4X4__WASM, k_gt_1_strided_a) {
41271 for (size_t k = 2; k < 10; k++) {
41272 GemmMicrokernelTester()
41273 .mr(4)
41274 .nr(4)
41275 .kr(1)
41276 .sr(1)
41277 .m(4)
41278 .n(4)
41279 .k(k)
41280 .a_stride(11)
41281 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41282 }
41283 }
41284
41285 TEST(F32_GEMMINC_4X4__WASM, k_gt_1_subtile) {
41286 for (size_t k = 2; k < 10; k++) {
41287 for (uint32_t m = 1; m <= 4; m++) {
41288 for (uint32_t n = 1; n <= 4; n++) {
41289 GemmMicrokernelTester()
41290 .mr(4)
41291 .nr(4)
41292 .kr(1)
41293 .sr(1)
41294 .m(m)
41295 .n(n)
41296 .k(k)
41297 .iterations(1)
41298 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41299 }
41300 }
41301 }
41302 }
41303
41304 TEST(F32_GEMMINC_4X4__WASM, n_gt_4) {
41305 for (uint32_t n = 5; n < 8; n++) {
41306 for (size_t k = 1; k <= 5; k += 2) {
41307 GemmMicrokernelTester()
41308 .mr(4)
41309 .nr(4)
41310 .kr(1)
41311 .sr(1)
41312 .m(4)
41313 .n(4)
41314 .k(k)
41315 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41316 }
41317 }
41318 }
41319
41320 TEST(F32_GEMMINC_4X4__WASM, n_gt_4_strided_cn) {
41321 for (uint32_t n = 5; n < 8; n++) {
41322 for (size_t k = 1; k <= 5; k += 2) {
41323 GemmMicrokernelTester()
41324 .mr(4)
41325 .nr(4)
41326 .kr(1)
41327 .sr(1)
41328 .m(4)
41329 .n(4)
41330 .k(k)
41331 .cn_stride(7)
41332 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41333 }
41334 }
41335 }
41336
41337 TEST(F32_GEMMINC_4X4__WASM, n_gt_4_strided_a) {
41338 for (uint32_t n = 5; n < 8; n++) {
41339 for (size_t k = 1; k <= 5; k += 2) {
41340 GemmMicrokernelTester()
41341 .mr(4)
41342 .nr(4)
41343 .kr(1)
41344 .sr(1)
41345 .m(4)
41346 .n(n)
41347 .k(k)
41348 .a_stride(7)
41349 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41350 }
41351 }
41352 }
41353
41354 TEST(F32_GEMMINC_4X4__WASM, n_gt_4_subtile) {
41355 for (uint32_t n = 5; n < 8; n++) {
41356 for (size_t k = 1; k <= 5; k += 2) {
41357 for (uint32_t m = 1; m <= 4; m++) {
41358 GemmMicrokernelTester()
41359 .mr(4)
41360 .nr(4)
41361 .kr(1)
41362 .sr(1)
41363 .m(m)
41364 .n(n)
41365 .k(k)
41366 .iterations(1)
41367 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41368 }
41369 }
41370 }
41371 }
41372
41373 TEST(F32_GEMMINC_4X4__WASM, n_div_4) {
41374 for (uint32_t n = 8; n <= 12; n += 4) {
41375 for (size_t k = 1; k <= 5; k += 2) {
41376 GemmMicrokernelTester()
41377 .mr(4)
41378 .nr(4)
41379 .kr(1)
41380 .sr(1)
41381 .m(4)
41382 .n(4)
41383 .k(k)
41384 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41385 }
41386 }
41387 }
41388
41389 TEST(F32_GEMMINC_4X4__WASM, n_div_4_strided_cn) {
41390 for (uint32_t n = 8; n <= 12; n += 4) {
41391 for (size_t k = 1; k <= 5; k += 2) {
41392 GemmMicrokernelTester()
41393 .mr(4)
41394 .nr(4)
41395 .kr(1)
41396 .sr(1)
41397 .m(4)
41398 .n(n)
41399 .k(k)
41400 .cn_stride(7)
41401 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41402 }
41403 }
41404 }
41405
41406 TEST(F32_GEMMINC_4X4__WASM, n_div_4_strided_a) {
41407 for (uint32_t n = 8; n <= 12; n += 4) {
41408 for (size_t k = 1; k <= 5; k += 2) {
41409 GemmMicrokernelTester()
41410 .mr(4)
41411 .nr(4)
41412 .kr(1)
41413 .sr(1)
41414 .m(4)
41415 .n(n)
41416 .k(k)
41417 .a_stride(7)
41418 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41419 }
41420 }
41421 }
41422
41423 TEST(F32_GEMMINC_4X4__WASM, n_div_4_subtile) {
41424 for (uint32_t n = 8; n <= 12; n += 4) {
41425 for (size_t k = 1; k <= 5; k += 2) {
41426 for (uint32_t m = 1; m <= 4; m++) {
41427 GemmMicrokernelTester()
41428 .mr(4)
41429 .nr(4)
41430 .kr(1)
41431 .sr(1)
41432 .m(m)
41433 .n(n)
41434 .k(k)
41435 .iterations(1)
41436 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41437 }
41438 }
41439 }
41440 }
41441
41442 TEST(F32_GEMMINC_4X4__WASM, strided_cm_subtile) {
41443 for (size_t k = 1; k <= 5; k += 2) {
41444 for (uint32_t m = 1; m <= 4; m++) {
41445 for (uint32_t n = 1; n <= 4; n++) {
41446 GemmMicrokernelTester()
41447 .mr(4)
41448 .nr(4)
41449 .kr(1)
41450 .sr(1)
41451 .m(m)
41452 .n(n)
41453 .k(k)
41454 .cm_stride(7)
41455 .iterations(1)
41456 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41457 }
41458 }
41459 }
41460 }
41461
41462 TEST(F32_GEMMINC_4X4__WASM, qmin) {
41463 GemmMicrokernelTester()
41464 .mr(4)
41465 .nr(4)
41466 .kr(1)
41467 .sr(1)
41468 .m(4)
41469 .n(4)
41470 .k(1)
41471 .qmin(128)
41472 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41473 }
41474
41475 TEST(F32_GEMMINC_4X4__WASM, qmax) {
41476 GemmMicrokernelTester()
41477 .mr(4)
41478 .nr(4)
41479 .kr(1)
41480 .sr(1)
41481 .m(4)
41482 .n(4)
41483 .k(1)
41484 .qmax(128)
41485 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41486 }
41487
41488 TEST(F32_GEMMINC_4X4__WASM, strided_cm) {
41489 GemmMicrokernelTester()
41490 .mr(4)
41491 .nr(4)
41492 .kr(1)
41493 .sr(1)
41494 .m(4)
41495 .n(4)
41496 .k(1)
41497 .cm_stride(7)
41498 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41499 }
41500#endif // XNN_ARCH_WASM
41501
41502
41503TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1) {
41504 GemmMicrokernelTester()
41505 .mr(1)
41506 .nr(4)
41507 .kr(1)
41508 .sr(1)
41509 .m(1)
41510 .n(4)
41511 .k(1)
41512 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41513}
41514
41515TEST(F32_GEMMINC_1X4__SCALAR, strided_cn) {
41516 GemmMicrokernelTester()
41517 .mr(1)
41518 .nr(4)
41519 .kr(1)
41520 .sr(1)
41521 .m(1)
41522 .n(4)
41523 .k(1)
41524 .cn_stride(7)
41525 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41526}
41527
41528TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_strided_a) {
41529 GemmMicrokernelTester()
41530 .mr(1)
41531 .nr(4)
41532 .kr(1)
41533 .sr(1)
41534 .m(1)
41535 .n(4)
41536 .k(1)
41537 .a_stride(3)
41538 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41539}
41540
41541TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_subtile) {
41542 for (uint32_t m = 1; m <= 1; m++) {
41543 for (uint32_t n = 1; n <= 4; n++) {
41544 GemmMicrokernelTester()
41545 .mr(1)
41546 .nr(4)
41547 .kr(1)
41548 .sr(1)
41549 .m(m)
41550 .n(n)
41551 .k(1)
41552 .iterations(1)
41553 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41554 }
41555 }
41556}
41557
41558TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_subtile_m) {
41559 for (uint32_t m = 1; m <= 1; m++) {
41560 GemmMicrokernelTester()
41561 .mr(1)
41562 .nr(4)
41563 .kr(1)
41564 .sr(1)
41565 .m(m)
41566 .n(4)
41567 .k(1)
41568 .iterations(1)
41569 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41570 }
41571}
41572
41573TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_subtile_n) {
41574 for (uint32_t n = 1; n <= 4; n++) {
41575 GemmMicrokernelTester()
41576 .mr(1)
41577 .nr(4)
41578 .kr(1)
41579 .sr(1)
41580 .m(1)
41581 .n(n)
41582 .k(1)
41583 .iterations(1)
41584 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41585 }
41586}
41587
41588TEST(F32_GEMMINC_1X4__SCALAR, k_gt_1) {
41589 for (size_t k = 2; k < 10; k++) {
41590 GemmMicrokernelTester()
41591 .mr(1)
41592 .nr(4)
41593 .kr(1)
41594 .sr(1)
41595 .m(1)
41596 .n(4)
41597 .k(k)
41598 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41599 }
41600}
41601
41602TEST(F32_GEMMINC_1X4__SCALAR, k_gt_1_strided_a) {
41603 for (size_t k = 2; k < 10; k++) {
41604 GemmMicrokernelTester()
41605 .mr(1)
41606 .nr(4)
41607 .kr(1)
41608 .sr(1)
41609 .m(1)
41610 .n(4)
41611 .k(k)
41612 .a_stride(11)
41613 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41614 }
41615}
41616
41617TEST(F32_GEMMINC_1X4__SCALAR, k_gt_1_subtile) {
41618 for (size_t k = 2; k < 10; k++) {
41619 for (uint32_t m = 1; m <= 1; m++) {
41620 for (uint32_t n = 1; n <= 4; n++) {
41621 GemmMicrokernelTester()
41622 .mr(1)
41623 .nr(4)
41624 .kr(1)
41625 .sr(1)
41626 .m(m)
41627 .n(n)
41628 .k(k)
41629 .iterations(1)
41630 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41631 }
41632 }
41633 }
41634}
41635
41636TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4) {
41637 for (uint32_t n = 5; n < 8; n++) {
41638 for (size_t k = 1; k <= 5; k += 2) {
41639 GemmMicrokernelTester()
41640 .mr(1)
41641 .nr(4)
41642 .kr(1)
41643 .sr(1)
41644 .m(1)
41645 .n(4)
41646 .k(k)
41647 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41648 }
41649 }
41650}
41651
41652TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4_strided_cn) {
41653 for (uint32_t n = 5; n < 8; n++) {
41654 for (size_t k = 1; k <= 5; k += 2) {
41655 GemmMicrokernelTester()
41656 .mr(1)
41657 .nr(4)
41658 .kr(1)
41659 .sr(1)
41660 .m(1)
41661 .n(4)
41662 .k(k)
41663 .cn_stride(7)
41664 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41665 }
41666 }
41667}
41668
41669TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4_strided_a) {
41670 for (uint32_t n = 5; n < 8; n++) {
41671 for (size_t k = 1; k <= 5; k += 2) {
41672 GemmMicrokernelTester()
41673 .mr(1)
41674 .nr(4)
41675 .kr(1)
41676 .sr(1)
41677 .m(1)
41678 .n(n)
41679 .k(k)
41680 .a_stride(7)
41681 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41682 }
41683 }
41684}
41685
41686TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4_subtile) {
41687 for (uint32_t n = 5; n < 8; n++) {
41688 for (size_t k = 1; k <= 5; k += 2) {
41689 for (uint32_t m = 1; m <= 1; m++) {
41690 GemmMicrokernelTester()
41691 .mr(1)
41692 .nr(4)
41693 .kr(1)
41694 .sr(1)
41695 .m(m)
41696 .n(n)
41697 .k(k)
41698 .iterations(1)
41699 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41700 }
41701 }
41702 }
41703}
41704
41705TEST(F32_GEMMINC_1X4__SCALAR, n_div_4) {
41706 for (uint32_t n = 8; n <= 12; n += 4) {
41707 for (size_t k = 1; k <= 5; k += 2) {
41708 GemmMicrokernelTester()
41709 .mr(1)
41710 .nr(4)
41711 .kr(1)
41712 .sr(1)
41713 .m(1)
41714 .n(4)
41715 .k(k)
41716 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41717 }
41718 }
41719}
41720
41721TEST(F32_GEMMINC_1X4__SCALAR, n_div_4_strided_cn) {
41722 for (uint32_t n = 8; n <= 12; n += 4) {
41723 for (size_t k = 1; k <= 5; k += 2) {
41724 GemmMicrokernelTester()
41725 .mr(1)
41726 .nr(4)
41727 .kr(1)
41728 .sr(1)
41729 .m(1)
41730 .n(n)
41731 .k(k)
41732 .cn_stride(7)
41733 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41734 }
41735 }
41736}
41737
41738TEST(F32_GEMMINC_1X4__SCALAR, n_div_4_strided_a) {
41739 for (uint32_t n = 8; n <= 12; n += 4) {
41740 for (size_t k = 1; k <= 5; k += 2) {
41741 GemmMicrokernelTester()
41742 .mr(1)
41743 .nr(4)
41744 .kr(1)
41745 .sr(1)
41746 .m(1)
41747 .n(n)
41748 .k(k)
41749 .a_stride(7)
41750 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41751 }
41752 }
41753}
41754
41755TEST(F32_GEMMINC_1X4__SCALAR, n_div_4_subtile) {
41756 for (uint32_t n = 8; n <= 12; n += 4) {
41757 for (size_t k = 1; k <= 5; k += 2) {
41758 for (uint32_t m = 1; m <= 1; m++) {
41759 GemmMicrokernelTester()
41760 .mr(1)
41761 .nr(4)
41762 .kr(1)
41763 .sr(1)
41764 .m(m)
41765 .n(n)
41766 .k(k)
41767 .iterations(1)
41768 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41769 }
41770 }
41771 }
41772}
41773
41774TEST(F32_GEMMINC_1X4__SCALAR, strided_cm_subtile) {
41775 for (size_t k = 1; k <= 5; k += 2) {
41776 for (uint32_t m = 1; m <= 1; m++) {
41777 for (uint32_t n = 1; n <= 4; n++) {
41778 GemmMicrokernelTester()
41779 .mr(1)
41780 .nr(4)
41781 .kr(1)
41782 .sr(1)
41783 .m(m)
41784 .n(n)
41785 .k(k)
41786 .cm_stride(7)
41787 .iterations(1)
41788 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41789 }
41790 }
41791 }
41792}
41793
41794TEST(F32_GEMMINC_1X4__SCALAR, qmin) {
41795 GemmMicrokernelTester()
41796 .mr(1)
41797 .nr(4)
41798 .kr(1)
41799 .sr(1)
41800 .m(1)
41801 .n(4)
41802 .k(1)
41803 .qmin(128)
41804 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41805}
41806
41807TEST(F32_GEMMINC_1X4__SCALAR, qmax) {
41808 GemmMicrokernelTester()
41809 .mr(1)
41810 .nr(4)
41811 .kr(1)
41812 .sr(1)
41813 .m(1)
41814 .n(4)
41815 .k(1)
41816 .qmax(128)
41817 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41818}
41819
41820TEST(F32_GEMMINC_1X4__SCALAR, strided_cm) {
41821 GemmMicrokernelTester()
41822 .mr(1)
41823 .nr(4)
41824 .kr(1)
41825 .sr(1)
41826 .m(1)
41827 .n(4)
41828 .k(1)
41829 .cm_stride(7)
41830 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41831}
41832
41833
41834TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1) {
41835 GemmMicrokernelTester()
41836 .mr(2)
41837 .nr(4)
41838 .kr(1)
41839 .sr(1)
41840 .m(2)
41841 .n(4)
41842 .k(1)
41843 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41844}
41845
41846TEST(F32_GEMMINC_2X4__SCALAR, strided_cn) {
41847 GemmMicrokernelTester()
41848 .mr(2)
41849 .nr(4)
41850 .kr(1)
41851 .sr(1)
41852 .m(2)
41853 .n(4)
41854 .k(1)
41855 .cn_stride(7)
41856 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41857}
41858
41859TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_strided_a) {
41860 GemmMicrokernelTester()
41861 .mr(2)
41862 .nr(4)
41863 .kr(1)
41864 .sr(1)
41865 .m(2)
41866 .n(4)
41867 .k(1)
41868 .a_stride(3)
41869 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41870}
41871
41872TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_subtile) {
41873 for (uint32_t m = 1; m <= 2; m++) {
41874 for (uint32_t n = 1; n <= 4; n++) {
41875 GemmMicrokernelTester()
41876 .mr(2)
41877 .nr(4)
41878 .kr(1)
41879 .sr(1)
41880 .m(m)
41881 .n(n)
41882 .k(1)
41883 .iterations(1)
41884 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41885 }
41886 }
41887}
41888
41889TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_subtile_m) {
41890 for (uint32_t m = 1; m <= 2; m++) {
41891 GemmMicrokernelTester()
41892 .mr(2)
41893 .nr(4)
41894 .kr(1)
41895 .sr(1)
41896 .m(m)
41897 .n(4)
41898 .k(1)
41899 .iterations(1)
41900 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41901 }
41902}
41903
41904TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_subtile_n) {
41905 for (uint32_t n = 1; n <= 4; n++) {
41906 GemmMicrokernelTester()
41907 .mr(2)
41908 .nr(4)
41909 .kr(1)
41910 .sr(1)
41911 .m(2)
41912 .n(n)
41913 .k(1)
41914 .iterations(1)
41915 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41916 }
41917}
41918
41919TEST(F32_GEMMINC_2X4__SCALAR, k_gt_1) {
41920 for (size_t k = 2; k < 10; k++) {
41921 GemmMicrokernelTester()
41922 .mr(2)
41923 .nr(4)
41924 .kr(1)
41925 .sr(1)
41926 .m(2)
41927 .n(4)
41928 .k(k)
41929 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41930 }
41931}
41932
41933TEST(F32_GEMMINC_2X4__SCALAR, k_gt_1_strided_a) {
41934 for (size_t k = 2; k < 10; k++) {
41935 GemmMicrokernelTester()
41936 .mr(2)
41937 .nr(4)
41938 .kr(1)
41939 .sr(1)
41940 .m(2)
41941 .n(4)
41942 .k(k)
41943 .a_stride(11)
41944 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41945 }
41946}
41947
41948TEST(F32_GEMMINC_2X4__SCALAR, k_gt_1_subtile) {
41949 for (size_t k = 2; k < 10; k++) {
41950 for (uint32_t m = 1; m <= 2; m++) {
41951 for (uint32_t n = 1; n <= 4; n++) {
41952 GemmMicrokernelTester()
41953 .mr(2)
41954 .nr(4)
41955 .kr(1)
41956 .sr(1)
41957 .m(m)
41958 .n(n)
41959 .k(k)
41960 .iterations(1)
41961 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41962 }
41963 }
41964 }
41965}
41966
41967TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4) {
41968 for (uint32_t n = 5; n < 8; n++) {
41969 for (size_t k = 1; k <= 5; k += 2) {
41970 GemmMicrokernelTester()
41971 .mr(2)
41972 .nr(4)
41973 .kr(1)
41974 .sr(1)
41975 .m(2)
41976 .n(4)
41977 .k(k)
41978 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41979 }
41980 }
41981}
41982
41983TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4_strided_cn) {
41984 for (uint32_t n = 5; n < 8; n++) {
41985 for (size_t k = 1; k <= 5; k += 2) {
41986 GemmMicrokernelTester()
41987 .mr(2)
41988 .nr(4)
41989 .kr(1)
41990 .sr(1)
41991 .m(2)
41992 .n(4)
41993 .k(k)
41994 .cn_stride(7)
41995 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
41996 }
41997 }
41998}
41999
42000TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4_strided_a) {
42001 for (uint32_t n = 5; n < 8; n++) {
42002 for (size_t k = 1; k <= 5; k += 2) {
42003 GemmMicrokernelTester()
42004 .mr(2)
42005 .nr(4)
42006 .kr(1)
42007 .sr(1)
42008 .m(2)
42009 .n(n)
42010 .k(k)
42011 .a_stride(7)
42012 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42013 }
42014 }
42015}
42016
42017TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4_subtile) {
42018 for (uint32_t n = 5; n < 8; n++) {
42019 for (size_t k = 1; k <= 5; k += 2) {
42020 for (uint32_t m = 1; m <= 2; m++) {
42021 GemmMicrokernelTester()
42022 .mr(2)
42023 .nr(4)
42024 .kr(1)
42025 .sr(1)
42026 .m(m)
42027 .n(n)
42028 .k(k)
42029 .iterations(1)
42030 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42031 }
42032 }
42033 }
42034}
42035
42036TEST(F32_GEMMINC_2X4__SCALAR, n_div_4) {
42037 for (uint32_t n = 8; n <= 12; n += 4) {
42038 for (size_t k = 1; k <= 5; k += 2) {
42039 GemmMicrokernelTester()
42040 .mr(2)
42041 .nr(4)
42042 .kr(1)
42043 .sr(1)
42044 .m(2)
42045 .n(4)
42046 .k(k)
42047 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42048 }
42049 }
42050}
42051
42052TEST(F32_GEMMINC_2X4__SCALAR, n_div_4_strided_cn) {
42053 for (uint32_t n = 8; n <= 12; n += 4) {
42054 for (size_t k = 1; k <= 5; k += 2) {
42055 GemmMicrokernelTester()
42056 .mr(2)
42057 .nr(4)
42058 .kr(1)
42059 .sr(1)
42060 .m(2)
42061 .n(n)
42062 .k(k)
42063 .cn_stride(7)
42064 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42065 }
42066 }
42067}
42068
42069TEST(F32_GEMMINC_2X4__SCALAR, n_div_4_strided_a) {
42070 for (uint32_t n = 8; n <= 12; n += 4) {
42071 for (size_t k = 1; k <= 5; k += 2) {
42072 GemmMicrokernelTester()
42073 .mr(2)
42074 .nr(4)
42075 .kr(1)
42076 .sr(1)
42077 .m(2)
42078 .n(n)
42079 .k(k)
42080 .a_stride(7)
42081 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42082 }
42083 }
42084}
42085
42086TEST(F32_GEMMINC_2X4__SCALAR, n_div_4_subtile) {
42087 for (uint32_t n = 8; n <= 12; n += 4) {
42088 for (size_t k = 1; k <= 5; k += 2) {
42089 for (uint32_t m = 1; m <= 2; m++) {
42090 GemmMicrokernelTester()
42091 .mr(2)
42092 .nr(4)
42093 .kr(1)
42094 .sr(1)
42095 .m(m)
42096 .n(n)
42097 .k(k)
42098 .iterations(1)
42099 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42100 }
42101 }
42102 }
42103}
42104
42105TEST(F32_GEMMINC_2X4__SCALAR, strided_cm_subtile) {
42106 for (size_t k = 1; k <= 5; k += 2) {
42107 for (uint32_t m = 1; m <= 2; m++) {
42108 for (uint32_t n = 1; n <= 4; n++) {
42109 GemmMicrokernelTester()
42110 .mr(2)
42111 .nr(4)
42112 .kr(1)
42113 .sr(1)
42114 .m(m)
42115 .n(n)
42116 .k(k)
42117 .cm_stride(7)
42118 .iterations(1)
42119 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42120 }
42121 }
42122 }
42123}
42124
42125TEST(F32_GEMMINC_2X4__SCALAR, qmin) {
42126 GemmMicrokernelTester()
42127 .mr(2)
42128 .nr(4)
42129 .kr(1)
42130 .sr(1)
42131 .m(2)
42132 .n(4)
42133 .k(1)
42134 .qmin(128)
42135 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42136}
42137
42138TEST(F32_GEMMINC_2X4__SCALAR, qmax) {
42139 GemmMicrokernelTester()
42140 .mr(2)
42141 .nr(4)
42142 .kr(1)
42143 .sr(1)
42144 .m(2)
42145 .n(4)
42146 .k(1)
42147 .qmax(128)
42148 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42149}
42150
42151TEST(F32_GEMMINC_2X4__SCALAR, strided_cm) {
42152 GemmMicrokernelTester()
42153 .mr(2)
42154 .nr(4)
42155 .kr(1)
42156 .sr(1)
42157 .m(2)
42158 .n(4)
42159 .k(1)
42160 .cm_stride(7)
42161 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42162}
42163
42164
42165TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1) {
42166 GemmMicrokernelTester()
42167 .mr(4)
42168 .nr(4)
42169 .kr(1)
42170 .sr(1)
42171 .m(4)
42172 .n(4)
42173 .k(1)
42174 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42175}
42176
42177TEST(F32_GEMMINC_4X4__SCALAR, strided_cn) {
42178 GemmMicrokernelTester()
42179 .mr(4)
42180 .nr(4)
42181 .kr(1)
42182 .sr(1)
42183 .m(4)
42184 .n(4)
42185 .k(1)
42186 .cn_stride(7)
42187 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42188}
42189
42190TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_strided_a) {
42191 GemmMicrokernelTester()
42192 .mr(4)
42193 .nr(4)
42194 .kr(1)
42195 .sr(1)
42196 .m(4)
42197 .n(4)
42198 .k(1)
42199 .a_stride(3)
42200 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42201}
42202
42203TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_subtile) {
42204 for (uint32_t m = 1; m <= 4; m++) {
42205 for (uint32_t n = 1; n <= 4; n++) {
42206 GemmMicrokernelTester()
42207 .mr(4)
42208 .nr(4)
42209 .kr(1)
42210 .sr(1)
42211 .m(m)
42212 .n(n)
42213 .k(1)
42214 .iterations(1)
42215 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42216 }
42217 }
42218}
42219
42220TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_subtile_m) {
42221 for (uint32_t m = 1; m <= 4; m++) {
42222 GemmMicrokernelTester()
42223 .mr(4)
42224 .nr(4)
42225 .kr(1)
42226 .sr(1)
42227 .m(m)
42228 .n(4)
42229 .k(1)
42230 .iterations(1)
42231 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42232 }
42233}
42234
42235TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_subtile_n) {
42236 for (uint32_t n = 1; n <= 4; n++) {
42237 GemmMicrokernelTester()
42238 .mr(4)
42239 .nr(4)
42240 .kr(1)
42241 .sr(1)
42242 .m(4)
42243 .n(n)
42244 .k(1)
42245 .iterations(1)
42246 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42247 }
42248}
42249
42250TEST(F32_GEMMINC_4X4__SCALAR, k_gt_1) {
42251 for (size_t k = 2; k < 10; k++) {
42252 GemmMicrokernelTester()
42253 .mr(4)
42254 .nr(4)
42255 .kr(1)
42256 .sr(1)
42257 .m(4)
42258 .n(4)
42259 .k(k)
42260 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42261 }
42262}
42263
42264TEST(F32_GEMMINC_4X4__SCALAR, k_gt_1_strided_a) {
42265 for (size_t k = 2; k < 10; k++) {
42266 GemmMicrokernelTester()
42267 .mr(4)
42268 .nr(4)
42269 .kr(1)
42270 .sr(1)
42271 .m(4)
42272 .n(4)
42273 .k(k)
42274 .a_stride(11)
42275 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42276 }
42277}
42278
42279TEST(F32_GEMMINC_4X4__SCALAR, k_gt_1_subtile) {
42280 for (size_t k = 2; k < 10; k++) {
42281 for (uint32_t m = 1; m <= 4; m++) {
42282 for (uint32_t n = 1; n <= 4; n++) {
42283 GemmMicrokernelTester()
42284 .mr(4)
42285 .nr(4)
42286 .kr(1)
42287 .sr(1)
42288 .m(m)
42289 .n(n)
42290 .k(k)
42291 .iterations(1)
42292 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42293 }
42294 }
42295 }
42296}
42297
42298TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4) {
42299 for (uint32_t n = 5; n < 8; n++) {
42300 for (size_t k = 1; k <= 5; k += 2) {
42301 GemmMicrokernelTester()
42302 .mr(4)
42303 .nr(4)
42304 .kr(1)
42305 .sr(1)
42306 .m(4)
42307 .n(4)
42308 .k(k)
42309 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42310 }
42311 }
42312}
42313
42314TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4_strided_cn) {
42315 for (uint32_t n = 5; n < 8; n++) {
42316 for (size_t k = 1; k <= 5; k += 2) {
42317 GemmMicrokernelTester()
42318 .mr(4)
42319 .nr(4)
42320 .kr(1)
42321 .sr(1)
42322 .m(4)
42323 .n(4)
42324 .k(k)
42325 .cn_stride(7)
42326 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42327 }
42328 }
42329}
42330
42331TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4_strided_a) {
42332 for (uint32_t n = 5; n < 8; n++) {
42333 for (size_t k = 1; k <= 5; k += 2) {
42334 GemmMicrokernelTester()
42335 .mr(4)
42336 .nr(4)
42337 .kr(1)
42338 .sr(1)
42339 .m(4)
42340 .n(n)
42341 .k(k)
42342 .a_stride(7)
42343 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42344 }
42345 }
42346}
42347
42348TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4_subtile) {
42349 for (uint32_t n = 5; n < 8; n++) {
42350 for (size_t k = 1; k <= 5; k += 2) {
42351 for (uint32_t m = 1; m <= 4; m++) {
42352 GemmMicrokernelTester()
42353 .mr(4)
42354 .nr(4)
42355 .kr(1)
42356 .sr(1)
42357 .m(m)
42358 .n(n)
42359 .k(k)
42360 .iterations(1)
42361 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42362 }
42363 }
42364 }
42365}
42366
42367TEST(F32_GEMMINC_4X4__SCALAR, n_div_4) {
42368 for (uint32_t n = 8; n <= 12; n += 4) {
42369 for (size_t k = 1; k <= 5; k += 2) {
42370 GemmMicrokernelTester()
42371 .mr(4)
42372 .nr(4)
42373 .kr(1)
42374 .sr(1)
42375 .m(4)
42376 .n(4)
42377 .k(k)
42378 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42379 }
42380 }
42381}
42382
42383TEST(F32_GEMMINC_4X4__SCALAR, n_div_4_strided_cn) {
42384 for (uint32_t n = 8; n <= 12; n += 4) {
42385 for (size_t k = 1; k <= 5; k += 2) {
42386 GemmMicrokernelTester()
42387 .mr(4)
42388 .nr(4)
42389 .kr(1)
42390 .sr(1)
42391 .m(4)
42392 .n(n)
42393 .k(k)
42394 .cn_stride(7)
42395 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42396 }
42397 }
42398}
42399
42400TEST(F32_GEMMINC_4X4__SCALAR, n_div_4_strided_a) {
42401 for (uint32_t n = 8; n <= 12; n += 4) {
42402 for (size_t k = 1; k <= 5; k += 2) {
42403 GemmMicrokernelTester()
42404 .mr(4)
42405 .nr(4)
42406 .kr(1)
42407 .sr(1)
42408 .m(4)
42409 .n(n)
42410 .k(k)
42411 .a_stride(7)
42412 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42413 }
42414 }
42415}
42416
42417TEST(F32_GEMMINC_4X4__SCALAR, n_div_4_subtile) {
42418 for (uint32_t n = 8; n <= 12; n += 4) {
42419 for (size_t k = 1; k <= 5; k += 2) {
42420 for (uint32_t m = 1; m <= 4; m++) {
42421 GemmMicrokernelTester()
42422 .mr(4)
42423 .nr(4)
42424 .kr(1)
42425 .sr(1)
42426 .m(m)
42427 .n(n)
42428 .k(k)
42429 .iterations(1)
42430 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42431 }
42432 }
42433 }
42434}
42435
42436TEST(F32_GEMMINC_4X4__SCALAR, strided_cm_subtile) {
42437 for (size_t k = 1; k <= 5; k += 2) {
42438 for (uint32_t m = 1; m <= 4; m++) {
42439 for (uint32_t n = 1; n <= 4; n++) {
42440 GemmMicrokernelTester()
42441 .mr(4)
42442 .nr(4)
42443 .kr(1)
42444 .sr(1)
42445 .m(m)
42446 .n(n)
42447 .k(k)
42448 .cm_stride(7)
42449 .iterations(1)
42450 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42451 }
42452 }
42453 }
42454}
42455
42456TEST(F32_GEMMINC_4X4__SCALAR, qmin) {
42457 GemmMicrokernelTester()
42458 .mr(4)
42459 .nr(4)
42460 .kr(1)
42461 .sr(1)
42462 .m(4)
42463 .n(4)
42464 .k(1)
42465 .qmin(128)
42466 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42467}
42468
42469TEST(F32_GEMMINC_4X4__SCALAR, qmax) {
42470 GemmMicrokernelTester()
42471 .mr(4)
42472 .nr(4)
42473 .kr(1)
42474 .sr(1)
42475 .m(4)
42476 .n(4)
42477 .k(1)
42478 .qmax(128)
42479 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42480}
42481
42482TEST(F32_GEMMINC_4X4__SCALAR, strided_cm) {
42483 GemmMicrokernelTester()
42484 .mr(4)
42485 .nr(4)
42486 .kr(1)
42487 .sr(1)
42488 .m(4)
42489 .n(4)
42490 .k(1)
42491 .cm_stride(7)
42492 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
42493}