// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8//
9// Auto-generated file. Do not edit!
10// Specification: test/f32-igemm-minmax.yaml
11// Generator: tools/generate-gemm-test.py
12
13
14#include <gtest/gtest.h>
15
16#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
19#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
22#include "gemm-microkernel-tester.h"
23
24
25#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
26 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
27 TEST_REQUIRES_ARM_NEON_FMA;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(8)
36 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
37 }
38
39 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FMA;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(8)
49 .cn_stride(11)
50 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
51 }
52
53 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
54 TEST_REQUIRES_ARM_NEON_FMA;
55 for (uint32_t m = 1; m <= 1; m++) {
56 for (uint32_t n = 1; n <= 8; n++) {
57 GemmMicrokernelTester()
58 .mr(1)
59 .nr(8)
60 .kr(1)
61 .sr(1)
62 .m(m)
63 .n(n)
64 .k(8)
65 .iterations(1)
66 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
67 }
68 }
69 }
70
71 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
72 TEST_REQUIRES_ARM_NEON_FMA;
73 for (uint32_t m = 1; m <= 1; m++) {
74 GemmMicrokernelTester()
75 .mr(1)
76 .nr(8)
77 .kr(1)
78 .sr(1)
79 .m(m)
80 .n(8)
81 .k(8)
82 .iterations(1)
83 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
84 }
85 }
86
87 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
88 TEST_REQUIRES_ARM_NEON_FMA;
89 for (uint32_t n = 1; n <= 8; n++) {
90 GemmMicrokernelTester()
91 .mr(1)
92 .nr(8)
93 .kr(1)
94 .sr(1)
95 .m(1)
96 .n(n)
97 .k(8)
98 .iterations(1)
99 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
100 }
101 }
102
103 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
104 TEST_REQUIRES_ARM_NEON_FMA;
105 GemmMicrokernelTester()
106 .mr(1)
107 .nr(8)
108 .kr(1)
109 .sr(1)
110 .m(1)
111 .n(8)
112 .k(16)
113 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
114 }
115
116 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
117 TEST_REQUIRES_ARM_NEON_FMA;
118 for (uint32_t m = 1; m <= 1; m++) {
119 for (uint32_t n = 1; n <= 8; n++) {
120 GemmMicrokernelTester()
121 .mr(1)
122 .nr(8)
123 .kr(1)
124 .sr(1)
125 .m(m)
126 .n(n)
127 .k(16)
128 .iterations(1)
129 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
130 }
131 }
132 }
133
134 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
135 TEST_REQUIRES_ARM_NEON_FMA;
136 for (size_t k = 1; k < 16; k++) {
137 GemmMicrokernelTester()
138 .mr(1)
139 .nr(8)
140 .kr(1)
141 .sr(1)
142 .m(1)
143 .n(8)
144 .k(k)
145 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
146 }
147 }
148
149 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
150 TEST_REQUIRES_ARM_NEON_FMA;
151 for (size_t k = 1; k < 16; k++) {
152 for (uint32_t m = 1; m <= 1; m++) {
153 for (uint32_t n = 1; n <= 8; n++) {
154 GemmMicrokernelTester()
155 .mr(1)
156 .nr(8)
157 .kr(1)
158 .sr(1)
159 .m(m)
160 .n(n)
161 .k(k)
162 .iterations(1)
163 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
164 }
165 }
166 }
167 }
168
169 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
170 TEST_REQUIRES_ARM_NEON_FMA;
171 for (size_t k = 17; k < 16; k++) {
172 GemmMicrokernelTester()
173 .mr(1)
174 .nr(8)
175 .kr(1)
176 .sr(1)
177 .m(1)
178 .n(8)
179 .k(k)
180 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
181 }
182 }
183
184 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
185 TEST_REQUIRES_ARM_NEON_FMA;
186 for (size_t k = 17; k < 16; k++) {
187 for (uint32_t m = 1; m <= 1; m++) {
188 for (uint32_t n = 1; n <= 8; n++) {
189 GemmMicrokernelTester()
190 .mr(1)
191 .nr(8)
192 .kr(1)
193 .sr(1)
194 .m(m)
195 .n(n)
196 .k(k)
197 .iterations(1)
198 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
199 }
200 }
201 }
202 }
203
204 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
205 TEST_REQUIRES_ARM_NEON_FMA;
206 for (size_t k = 24; k <= 80; k += 8) {
207 GemmMicrokernelTester()
208 .mr(1)
209 .nr(8)
210 .kr(1)
211 .sr(1)
212 .m(1)
213 .n(8)
214 .k(k)
215 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
216 }
217 }
218
219 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
220 TEST_REQUIRES_ARM_NEON_FMA;
221 for (size_t k = 24; k <= 80; k += 8) {
222 for (uint32_t m = 1; m <= 1; m++) {
223 for (uint32_t n = 1; n <= 8; n++) {
224 GemmMicrokernelTester()
225 .mr(1)
226 .nr(8)
227 .kr(1)
228 .sr(1)
229 .m(m)
230 .n(n)
231 .k(k)
232 .iterations(1)
233 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
234 }
235 }
236 }
237 }
238
239 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
240 TEST_REQUIRES_ARM_NEON_FMA;
241 for (uint32_t n = 9; n < 16; n++) {
242 for (size_t k = 1; k <= 40; k += 9) {
243 GemmMicrokernelTester()
244 .mr(1)
245 .nr(8)
246 .kr(1)
247 .sr(1)
248 .m(1)
249 .n(8)
250 .k(k)
251 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
252 }
253 }
254 }
255
256 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
257 TEST_REQUIRES_ARM_NEON_FMA;
258 for (uint32_t n = 9; n < 16; n++) {
259 for (size_t k = 1; k <= 40; k += 9) {
260 GemmMicrokernelTester()
261 .mr(1)
262 .nr(8)
263 .kr(1)
264 .sr(1)
265 .m(1)
266 .n(8)
267 .k(k)
268 .cn_stride(11)
269 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
270 }
271 }
272 }
273
274 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
275 TEST_REQUIRES_ARM_NEON_FMA;
276 for (uint32_t n = 9; n < 16; n++) {
277 for (size_t k = 1; k <= 40; k += 9) {
278 for (uint32_t m = 1; m <= 1; m++) {
279 GemmMicrokernelTester()
280 .mr(1)
281 .nr(8)
282 .kr(1)
283 .sr(1)
284 .m(m)
285 .n(n)
286 .k(k)
287 .iterations(1)
288 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
289 }
290 }
291 }
292 }
293
294 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
295 TEST_REQUIRES_ARM_NEON_FMA;
296 for (uint32_t n = 16; n <= 24; n += 8) {
297 for (size_t k = 1; k <= 40; k += 9) {
298 GemmMicrokernelTester()
299 .mr(1)
300 .nr(8)
301 .kr(1)
302 .sr(1)
303 .m(1)
304 .n(8)
305 .k(k)
306 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
307 }
308 }
309 }
310
311 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
312 TEST_REQUIRES_ARM_NEON_FMA;
313 for (uint32_t n = 16; n <= 24; n += 8) {
314 for (size_t k = 1; k <= 40; k += 9) {
315 GemmMicrokernelTester()
316 .mr(1)
317 .nr(8)
318 .kr(1)
319 .sr(1)
320 .m(1)
321 .n(n)
322 .k(k)
323 .cn_stride(11)
324 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
325 }
326 }
327 }
328
329 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
330 TEST_REQUIRES_ARM_NEON_FMA;
331 for (uint32_t n = 16; n <= 24; n += 8) {
332 for (size_t k = 1; k <= 40; k += 9) {
333 for (uint32_t m = 1; m <= 1; m++) {
334 GemmMicrokernelTester()
335 .mr(1)
336 .nr(8)
337 .kr(1)
338 .sr(1)
339 .m(m)
340 .n(n)
341 .k(k)
342 .iterations(1)
343 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
344 }
345 }
346 }
347 }
348
349 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
350 TEST_REQUIRES_ARM_NEON_FMA;
351 for (size_t k = 1; k <= 40; k += 9) {
352 GemmMicrokernelTester()
353 .mr(1)
354 .nr(8)
355 .kr(1)
356 .sr(1)
357 .m(1)
358 .n(8)
359 .k(k)
360 .ks(3)
361 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
362 }
363 }
364
365 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
366 TEST_REQUIRES_ARM_NEON_FMA;
367 for (size_t k = 1; k <= 40; k += 9) {
368 for (uint32_t m = 1; m <= 1; m++) {
369 for (uint32_t n = 1; n <= 8; n++) {
370 GemmMicrokernelTester()
371 .mr(1)
372 .nr(8)
373 .kr(1)
374 .sr(1)
375 .m(m)
376 .n(n)
377 .k(k)
378 .ks(3)
379 .iterations(1)
380 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
381 }
382 }
383 }
384 }
385
386 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_small_kernel) {
387 TEST_REQUIRES_ARM_NEON_FMA;
388 for (uint32_t n = 9; n < 16; n++) {
389 for (size_t k = 1; k <= 40; k += 9) {
390 GemmMicrokernelTester()
391 .mr(1)
392 .nr(8)
393 .kr(1)
394 .sr(1)
395 .m(1)
396 .n(8)
397 .k(k)
398 .ks(3)
399 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
400 }
401 }
402 }
403
404 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_small_kernel) {
405 TEST_REQUIRES_ARM_NEON_FMA;
406 for (uint32_t n = 16; n <= 24; n += 8) {
407 for (size_t k = 1; k <= 40; k += 9) {
408 GemmMicrokernelTester()
409 .mr(1)
410 .nr(8)
411 .kr(1)
412 .sr(1)
413 .m(1)
414 .n(8)
415 .k(k)
416 .ks(3)
417 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
418 }
419 }
420 }
421
422 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
423 TEST_REQUIRES_ARM_NEON_FMA;
424 for (size_t k = 1; k <= 40; k += 9) {
425 for (uint32_t m = 1; m <= 1; m++) {
426 for (uint32_t n = 1; n <= 8; n++) {
427 GemmMicrokernelTester()
428 .mr(1)
429 .nr(8)
430 .kr(1)
431 .sr(1)
432 .m(m)
433 .n(n)
434 .k(k)
435 .cm_stride(11)
436 .iterations(1)
437 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
438 }
439 }
440 }
441 }
442
443 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
444 TEST_REQUIRES_ARM_NEON_FMA;
445 for (size_t k = 1; k <= 40; k += 9) {
446 GemmMicrokernelTester()
447 .mr(1)
448 .nr(8)
449 .kr(1)
450 .sr(1)
451 .m(1)
452 .n(8)
453 .k(k)
454 .ks(3)
455 .a_offset(43)
456 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
457 }
458 }
459
460 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, zero) {
461 TEST_REQUIRES_ARM_NEON_FMA;
462 for (uint32_t mz = 0; mz < 1; mz++) {
463 for (size_t k = 1; k <= 40; k += 9) {
464 GemmMicrokernelTester()
465 .mr(1)
466 .nr(8)
467 .kr(1)
468 .sr(1)
469 .m(1)
470 .n(8)
471 .k(k)
472 .ks(3)
473 .a_offset(43)
474 .zero_index(mz)
475 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
476 }
477 }
478 }
479
480 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
481 TEST_REQUIRES_ARM_NEON_FMA;
482 GemmMicrokernelTester()
483 .mr(1)
484 .nr(8)
485 .kr(1)
486 .sr(1)
487 .m(1)
488 .n(8)
489 .k(8)
490 .qmin(128)
491 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
492 }
493
494 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
495 TEST_REQUIRES_ARM_NEON_FMA;
496 GemmMicrokernelTester()
497 .mr(1)
498 .nr(8)
499 .kr(1)
500 .sr(1)
501 .m(1)
502 .n(8)
503 .k(8)
504 .qmax(128)
505 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
506 }
507
508 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
509 TEST_REQUIRES_ARM_NEON_FMA;
510 GemmMicrokernelTester()
511 .mr(1)
512 .nr(8)
513 .kr(1)
514 .sr(1)
515 .m(1)
516 .n(8)
517 .k(8)
518 .cm_stride(11)
519 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
520 }
521#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
522
523
524#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
525 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
526 TEST_REQUIRES_ARM_NEON_FMA;
527 GemmMicrokernelTester()
528 .mr(1)
529 .nr(8)
530 .kr(1)
531 .sr(1)
532 .m(1)
533 .n(8)
534 .k(8)
535 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
536 }
537
538 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
539 TEST_REQUIRES_ARM_NEON_FMA;
540 GemmMicrokernelTester()
541 .mr(1)
542 .nr(8)
543 .kr(1)
544 .sr(1)
545 .m(1)
546 .n(8)
547 .k(8)
548 .cn_stride(11)
549 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
550 }
551
552 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
553 TEST_REQUIRES_ARM_NEON_FMA;
554 for (uint32_t m = 1; m <= 1; m++) {
555 for (uint32_t n = 1; n <= 8; n++) {
556 GemmMicrokernelTester()
557 .mr(1)
558 .nr(8)
559 .kr(1)
560 .sr(1)
561 .m(m)
562 .n(n)
563 .k(8)
564 .iterations(1)
565 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
566 }
567 }
568 }
569
570 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
571 TEST_REQUIRES_ARM_NEON_FMA;
572 for (uint32_t m = 1; m <= 1; m++) {
573 GemmMicrokernelTester()
574 .mr(1)
575 .nr(8)
576 .kr(1)
577 .sr(1)
578 .m(m)
579 .n(8)
580 .k(8)
581 .iterations(1)
582 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
583 }
584 }
585
586 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
587 TEST_REQUIRES_ARM_NEON_FMA;
588 for (uint32_t n = 1; n <= 8; n++) {
589 GemmMicrokernelTester()
590 .mr(1)
591 .nr(8)
592 .kr(1)
593 .sr(1)
594 .m(1)
595 .n(n)
596 .k(8)
597 .iterations(1)
598 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
599 }
600 }
601
602 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
603 TEST_REQUIRES_ARM_NEON_FMA;
604 GemmMicrokernelTester()
605 .mr(1)
606 .nr(8)
607 .kr(1)
608 .sr(1)
609 .m(1)
610 .n(8)
611 .k(16)
612 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
613 }
614
615 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
616 TEST_REQUIRES_ARM_NEON_FMA;
617 for (uint32_t m = 1; m <= 1; m++) {
618 for (uint32_t n = 1; n <= 8; n++) {
619 GemmMicrokernelTester()
620 .mr(1)
621 .nr(8)
622 .kr(1)
623 .sr(1)
624 .m(m)
625 .n(n)
626 .k(16)
627 .iterations(1)
628 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
629 }
630 }
631 }
632
633 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
634 TEST_REQUIRES_ARM_NEON_FMA;
635 for (size_t k = 1; k < 16; k++) {
636 GemmMicrokernelTester()
637 .mr(1)
638 .nr(8)
639 .kr(1)
640 .sr(1)
641 .m(1)
642 .n(8)
643 .k(k)
644 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
645 }
646 }
647
648 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
649 TEST_REQUIRES_ARM_NEON_FMA;
650 for (size_t k = 1; k < 16; k++) {
651 for (uint32_t m = 1; m <= 1; m++) {
652 for (uint32_t n = 1; n <= 8; n++) {
653 GemmMicrokernelTester()
654 .mr(1)
655 .nr(8)
656 .kr(1)
657 .sr(1)
658 .m(m)
659 .n(n)
660 .k(k)
661 .iterations(1)
662 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
663 }
664 }
665 }
666 }
667
668 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
669 TEST_REQUIRES_ARM_NEON_FMA;
670 for (size_t k = 17; k < 16; k++) {
671 GemmMicrokernelTester()
672 .mr(1)
673 .nr(8)
674 .kr(1)
675 .sr(1)
676 .m(1)
677 .n(8)
678 .k(k)
679 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
680 }
681 }
682
683 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
684 TEST_REQUIRES_ARM_NEON_FMA;
685 for (size_t k = 17; k < 16; k++) {
686 for (uint32_t m = 1; m <= 1; m++) {
687 for (uint32_t n = 1; n <= 8; n++) {
688 GemmMicrokernelTester()
689 .mr(1)
690 .nr(8)
691 .kr(1)
692 .sr(1)
693 .m(m)
694 .n(n)
695 .k(k)
696 .iterations(1)
697 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
698 }
699 }
700 }
701 }
702
703 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
704 TEST_REQUIRES_ARM_NEON_FMA;
705 for (size_t k = 24; k <= 80; k += 8) {
706 GemmMicrokernelTester()
707 .mr(1)
708 .nr(8)
709 .kr(1)
710 .sr(1)
711 .m(1)
712 .n(8)
713 .k(k)
714 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
715 }
716 }
717
718 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
719 TEST_REQUIRES_ARM_NEON_FMA;
720 for (size_t k = 24; k <= 80; k += 8) {
721 for (uint32_t m = 1; m <= 1; m++) {
722 for (uint32_t n = 1; n <= 8; n++) {
723 GemmMicrokernelTester()
724 .mr(1)
725 .nr(8)
726 .kr(1)
727 .sr(1)
728 .m(m)
729 .n(n)
730 .k(k)
731 .iterations(1)
732 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
733 }
734 }
735 }
736 }
737
738 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
739 TEST_REQUIRES_ARM_NEON_FMA;
740 for (uint32_t n = 9; n < 16; n++) {
741 for (size_t k = 1; k <= 40; k += 9) {
742 GemmMicrokernelTester()
743 .mr(1)
744 .nr(8)
745 .kr(1)
746 .sr(1)
747 .m(1)
748 .n(8)
749 .k(k)
750 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
751 }
752 }
753 }
754
755 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
756 TEST_REQUIRES_ARM_NEON_FMA;
757 for (uint32_t n = 9; n < 16; n++) {
758 for (size_t k = 1; k <= 40; k += 9) {
759 GemmMicrokernelTester()
760 .mr(1)
761 .nr(8)
762 .kr(1)
763 .sr(1)
764 .m(1)
765 .n(8)
766 .k(k)
767 .cn_stride(11)
768 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
769 }
770 }
771 }
772
773 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
774 TEST_REQUIRES_ARM_NEON_FMA;
775 for (uint32_t n = 9; n < 16; n++) {
776 for (size_t k = 1; k <= 40; k += 9) {
777 for (uint32_t m = 1; m <= 1; m++) {
778 GemmMicrokernelTester()
779 .mr(1)
780 .nr(8)
781 .kr(1)
782 .sr(1)
783 .m(m)
784 .n(n)
785 .k(k)
786 .iterations(1)
787 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
788 }
789 }
790 }
791 }
792
793 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
794 TEST_REQUIRES_ARM_NEON_FMA;
795 for (uint32_t n = 16; n <= 24; n += 8) {
796 for (size_t k = 1; k <= 40; k += 9) {
797 GemmMicrokernelTester()
798 .mr(1)
799 .nr(8)
800 .kr(1)
801 .sr(1)
802 .m(1)
803 .n(8)
804 .k(k)
805 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
806 }
807 }
808 }
809
810 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
811 TEST_REQUIRES_ARM_NEON_FMA;
812 for (uint32_t n = 16; n <= 24; n += 8) {
813 for (size_t k = 1; k <= 40; k += 9) {
814 GemmMicrokernelTester()
815 .mr(1)
816 .nr(8)
817 .kr(1)
818 .sr(1)
819 .m(1)
820 .n(n)
821 .k(k)
822 .cn_stride(11)
823 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
824 }
825 }
826 }
827
828 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
829 TEST_REQUIRES_ARM_NEON_FMA;
830 for (uint32_t n = 16; n <= 24; n += 8) {
831 for (size_t k = 1; k <= 40; k += 9) {
832 for (uint32_t m = 1; m <= 1; m++) {
833 GemmMicrokernelTester()
834 .mr(1)
835 .nr(8)
836 .kr(1)
837 .sr(1)
838 .m(m)
839 .n(n)
840 .k(k)
841 .iterations(1)
842 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
843 }
844 }
845 }
846 }
847
848 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
849 TEST_REQUIRES_ARM_NEON_FMA;
850 for (size_t k = 1; k <= 40; k += 9) {
851 GemmMicrokernelTester()
852 .mr(1)
853 .nr(8)
854 .kr(1)
855 .sr(1)
856 .m(1)
857 .n(8)
858 .k(k)
859 .ks(3)
860 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
861 }
862 }
863
864 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
865 TEST_REQUIRES_ARM_NEON_FMA;
866 for (size_t k = 1; k <= 40; k += 9) {
867 for (uint32_t m = 1; m <= 1; m++) {
868 for (uint32_t n = 1; n <= 8; n++) {
869 GemmMicrokernelTester()
870 .mr(1)
871 .nr(8)
872 .kr(1)
873 .sr(1)
874 .m(m)
875 .n(n)
876 .k(k)
877 .ks(3)
878 .iterations(1)
879 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
880 }
881 }
882 }
883 }
884
885 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
886 TEST_REQUIRES_ARM_NEON_FMA;
887 for (uint32_t n = 9; n < 16; n++) {
888 for (size_t k = 1; k <= 40; k += 9) {
889 GemmMicrokernelTester()
890 .mr(1)
891 .nr(8)
892 .kr(1)
893 .sr(1)
894 .m(1)
895 .n(8)
896 .k(k)
897 .ks(3)
898 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
899 }
900 }
901 }
902
903 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
904 TEST_REQUIRES_ARM_NEON_FMA;
905 for (uint32_t n = 16; n <= 24; n += 8) {
906 for (size_t k = 1; k <= 40; k += 9) {
907 GemmMicrokernelTester()
908 .mr(1)
909 .nr(8)
910 .kr(1)
911 .sr(1)
912 .m(1)
913 .n(8)
914 .k(k)
915 .ks(3)
916 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
917 }
918 }
919 }
920
921 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
922 TEST_REQUIRES_ARM_NEON_FMA;
923 for (size_t k = 1; k <= 40; k += 9) {
924 for (uint32_t m = 1; m <= 1; m++) {
925 for (uint32_t n = 1; n <= 8; n++) {
926 GemmMicrokernelTester()
927 .mr(1)
928 .nr(8)
929 .kr(1)
930 .sr(1)
931 .m(m)
932 .n(n)
933 .k(k)
934 .cm_stride(11)
935 .iterations(1)
936 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
937 }
938 }
939 }
940 }
941
942 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
943 TEST_REQUIRES_ARM_NEON_FMA;
944 for (size_t k = 1; k <= 40; k += 9) {
945 GemmMicrokernelTester()
946 .mr(1)
947 .nr(8)
948 .kr(1)
949 .sr(1)
950 .m(1)
951 .n(8)
952 .k(k)
953 .ks(3)
954 .a_offset(43)
955 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
956 }
957 }
958
959 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
960 TEST_REQUIRES_ARM_NEON_FMA;
961 for (uint32_t mz = 0; mz < 1; mz++) {
962 for (size_t k = 1; k <= 40; k += 9) {
963 GemmMicrokernelTester()
964 .mr(1)
965 .nr(8)
966 .kr(1)
967 .sr(1)
968 .m(1)
969 .n(8)
970 .k(k)
971 .ks(3)
972 .a_offset(43)
973 .zero_index(mz)
974 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
975 }
976 }
977 }
978
979 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
980 TEST_REQUIRES_ARM_NEON_FMA;
981 GemmMicrokernelTester()
982 .mr(1)
983 .nr(8)
984 .kr(1)
985 .sr(1)
986 .m(1)
987 .n(8)
988 .k(8)
989 .qmin(128)
990 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
991 }
992
993 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
994 TEST_REQUIRES_ARM_NEON_FMA;
995 GemmMicrokernelTester()
996 .mr(1)
997 .nr(8)
998 .kr(1)
999 .sr(1)
1000 .m(1)
1001 .n(8)
1002 .k(8)
1003 .qmax(128)
1004 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1005 }
1006
1007 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
1008 TEST_REQUIRES_ARM_NEON_FMA;
1009 GemmMicrokernelTester()
1010 .mr(1)
1011 .nr(8)
1012 .kr(1)
1013 .sr(1)
1014 .m(1)
1015 .n(8)
1016 .k(8)
1017 .cm_stride(11)
1018 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1019 }
1020#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1021
1022
1023#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1024 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
1025 TEST_REQUIRES_ARM_NEON_FMA;
1026 GemmMicrokernelTester()
1027 .mr(1)
1028 .nr(8)
1029 .kr(1)
1030 .sr(1)
1031 .m(1)
1032 .n(8)
1033 .k(8)
1034 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1035 }
1036
1037 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
1038 TEST_REQUIRES_ARM_NEON_FMA;
1039 GemmMicrokernelTester()
1040 .mr(1)
1041 .nr(8)
1042 .kr(1)
1043 .sr(1)
1044 .m(1)
1045 .n(8)
1046 .k(8)
1047 .cn_stride(11)
1048 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1049 }
1050
1051 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
1052 TEST_REQUIRES_ARM_NEON_FMA;
1053 for (uint32_t m = 1; m <= 1; m++) {
1054 for (uint32_t n = 1; n <= 8; n++) {
1055 GemmMicrokernelTester()
1056 .mr(1)
1057 .nr(8)
1058 .kr(1)
1059 .sr(1)
1060 .m(m)
1061 .n(n)
1062 .k(8)
1063 .iterations(1)
1064 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1065 }
1066 }
1067 }
1068
1069 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
1070 TEST_REQUIRES_ARM_NEON_FMA;
1071 for (uint32_t m = 1; m <= 1; m++) {
1072 GemmMicrokernelTester()
1073 .mr(1)
1074 .nr(8)
1075 .kr(1)
1076 .sr(1)
1077 .m(m)
1078 .n(8)
1079 .k(8)
1080 .iterations(1)
1081 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1082 }
1083 }
1084
1085 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
1086 TEST_REQUIRES_ARM_NEON_FMA;
1087 for (uint32_t n = 1; n <= 8; n++) {
1088 GemmMicrokernelTester()
1089 .mr(1)
1090 .nr(8)
1091 .kr(1)
1092 .sr(1)
1093 .m(1)
1094 .n(n)
1095 .k(8)
1096 .iterations(1)
1097 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1098 }
1099 }
1100
1101 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
1102 TEST_REQUIRES_ARM_NEON_FMA;
1103 GemmMicrokernelTester()
1104 .mr(1)
1105 .nr(8)
1106 .kr(1)
1107 .sr(1)
1108 .m(1)
1109 .n(8)
1110 .k(16)
1111 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1112 }
1113
1114 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
1115 TEST_REQUIRES_ARM_NEON_FMA;
1116 for (uint32_t m = 1; m <= 1; m++) {
1117 for (uint32_t n = 1; n <= 8; n++) {
1118 GemmMicrokernelTester()
1119 .mr(1)
1120 .nr(8)
1121 .kr(1)
1122 .sr(1)
1123 .m(m)
1124 .n(n)
1125 .k(16)
1126 .iterations(1)
1127 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1128 }
1129 }
1130 }
1131
1132 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
1133 TEST_REQUIRES_ARM_NEON_FMA;
1134 for (size_t k = 1; k < 16; k++) {
1135 GemmMicrokernelTester()
1136 .mr(1)
1137 .nr(8)
1138 .kr(1)
1139 .sr(1)
1140 .m(1)
1141 .n(8)
1142 .k(k)
1143 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1144 }
1145 }
1146
1147 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
1148 TEST_REQUIRES_ARM_NEON_FMA;
1149 for (size_t k = 1; k < 16; k++) {
1150 for (uint32_t m = 1; m <= 1; m++) {
1151 for (uint32_t n = 1; n <= 8; n++) {
1152 GemmMicrokernelTester()
1153 .mr(1)
1154 .nr(8)
1155 .kr(1)
1156 .sr(1)
1157 .m(m)
1158 .n(n)
1159 .k(k)
1160 .iterations(1)
1161 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1162 }
1163 }
1164 }
1165 }
1166
1167 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
1168 TEST_REQUIRES_ARM_NEON_FMA;
1169 for (size_t k = 17; k < 16; k++) {
1170 GemmMicrokernelTester()
1171 .mr(1)
1172 .nr(8)
1173 .kr(1)
1174 .sr(1)
1175 .m(1)
1176 .n(8)
1177 .k(k)
1178 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1179 }
1180 }
1181
1182 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
1183 TEST_REQUIRES_ARM_NEON_FMA;
1184 for (size_t k = 17; k < 16; k++) {
1185 for (uint32_t m = 1; m <= 1; m++) {
1186 for (uint32_t n = 1; n <= 8; n++) {
1187 GemmMicrokernelTester()
1188 .mr(1)
1189 .nr(8)
1190 .kr(1)
1191 .sr(1)
1192 .m(m)
1193 .n(n)
1194 .k(k)
1195 .iterations(1)
1196 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1197 }
1198 }
1199 }
1200 }
1201
1202 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
1203 TEST_REQUIRES_ARM_NEON_FMA;
1204 for (size_t k = 24; k <= 80; k += 8) {
1205 GemmMicrokernelTester()
1206 .mr(1)
1207 .nr(8)
1208 .kr(1)
1209 .sr(1)
1210 .m(1)
1211 .n(8)
1212 .k(k)
1213 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1214 }
1215 }
1216
1217 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
1218 TEST_REQUIRES_ARM_NEON_FMA;
1219 for (size_t k = 24; k <= 80; k += 8) {
1220 for (uint32_t m = 1; m <= 1; m++) {
1221 for (uint32_t n = 1; n <= 8; n++) {
1222 GemmMicrokernelTester()
1223 .mr(1)
1224 .nr(8)
1225 .kr(1)
1226 .sr(1)
1227 .m(m)
1228 .n(n)
1229 .k(k)
1230 .iterations(1)
1231 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1232 }
1233 }
1234 }
1235 }
1236
1237 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
1238 TEST_REQUIRES_ARM_NEON_FMA;
1239 for (uint32_t n = 9; n < 16; n++) {
1240 for (size_t k = 1; k <= 40; k += 9) {
1241 GemmMicrokernelTester()
1242 .mr(1)
1243 .nr(8)
1244 .kr(1)
1245 .sr(1)
1246 .m(1)
1247 .n(8)
1248 .k(k)
1249 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1250 }
1251 }
1252 }
1253
1254 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
1255 TEST_REQUIRES_ARM_NEON_FMA;
1256 for (uint32_t n = 9; n < 16; n++) {
1257 for (size_t k = 1; k <= 40; k += 9) {
1258 GemmMicrokernelTester()
1259 .mr(1)
1260 .nr(8)
1261 .kr(1)
1262 .sr(1)
1263 .m(1)
1264 .n(8)
1265 .k(k)
1266 .cn_stride(11)
1267 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1268 }
1269 }
1270 }
1271
1272 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
1273 TEST_REQUIRES_ARM_NEON_FMA;
1274 for (uint32_t n = 9; n < 16; n++) {
1275 for (size_t k = 1; k <= 40; k += 9) {
1276 for (uint32_t m = 1; m <= 1; m++) {
1277 GemmMicrokernelTester()
1278 .mr(1)
1279 .nr(8)
1280 .kr(1)
1281 .sr(1)
1282 .m(m)
1283 .n(n)
1284 .k(k)
1285 .iterations(1)
1286 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1287 }
1288 }
1289 }
1290 }
1291
1292 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
1293 TEST_REQUIRES_ARM_NEON_FMA;
1294 for (uint32_t n = 16; n <= 24; n += 8) {
1295 for (size_t k = 1; k <= 40; k += 9) {
1296 GemmMicrokernelTester()
1297 .mr(1)
1298 .nr(8)
1299 .kr(1)
1300 .sr(1)
1301 .m(1)
1302 .n(8)
1303 .k(k)
1304 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1305 }
1306 }
1307 }
1308
1309 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
1310 TEST_REQUIRES_ARM_NEON_FMA;
1311 for (uint32_t n = 16; n <= 24; n += 8) {
1312 for (size_t k = 1; k <= 40; k += 9) {
1313 GemmMicrokernelTester()
1314 .mr(1)
1315 .nr(8)
1316 .kr(1)
1317 .sr(1)
1318 .m(1)
1319 .n(n)
1320 .k(k)
1321 .cn_stride(11)
1322 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1323 }
1324 }
1325 }
1326
1327 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
1328 TEST_REQUIRES_ARM_NEON_FMA;
1329 for (uint32_t n = 16; n <= 24; n += 8) {
1330 for (size_t k = 1; k <= 40; k += 9) {
1331 for (uint32_t m = 1; m <= 1; m++) {
1332 GemmMicrokernelTester()
1333 .mr(1)
1334 .nr(8)
1335 .kr(1)
1336 .sr(1)
1337 .m(m)
1338 .n(n)
1339 .k(k)
1340 .iterations(1)
1341 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1342 }
1343 }
1344 }
1345 }
1346
1347 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
1348 TEST_REQUIRES_ARM_NEON_FMA;
1349 for (size_t k = 1; k <= 40; k += 9) {
1350 GemmMicrokernelTester()
1351 .mr(1)
1352 .nr(8)
1353 .kr(1)
1354 .sr(1)
1355 .m(1)
1356 .n(8)
1357 .k(k)
1358 .ks(3)
1359 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1360 }
1361 }
1362
1363 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
1364 TEST_REQUIRES_ARM_NEON_FMA;
1365 for (size_t k = 1; k <= 40; k += 9) {
1366 for (uint32_t m = 1; m <= 1; m++) {
1367 for (uint32_t n = 1; n <= 8; n++) {
1368 GemmMicrokernelTester()
1369 .mr(1)
1370 .nr(8)
1371 .kr(1)
1372 .sr(1)
1373 .m(m)
1374 .n(n)
1375 .k(k)
1376 .ks(3)
1377 .iterations(1)
1378 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1379 }
1380 }
1381 }
1382 }
1383
1384 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
1385 TEST_REQUIRES_ARM_NEON_FMA;
1386 for (uint32_t n = 9; n < 16; n++) {
1387 for (size_t k = 1; k <= 40; k += 9) {
1388 GemmMicrokernelTester()
1389 .mr(1)
1390 .nr(8)
1391 .kr(1)
1392 .sr(1)
1393 .m(1)
1394 .n(8)
1395 .k(k)
1396 .ks(3)
1397 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1398 }
1399 }
1400 }
1401
1402 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
1403 TEST_REQUIRES_ARM_NEON_FMA;
1404 for (uint32_t n = 16; n <= 24; n += 8) {
1405 for (size_t k = 1; k <= 40; k += 9) {
1406 GemmMicrokernelTester()
1407 .mr(1)
1408 .nr(8)
1409 .kr(1)
1410 .sr(1)
1411 .m(1)
1412 .n(8)
1413 .k(k)
1414 .ks(3)
1415 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1416 }
1417 }
1418 }
1419
1420 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
1421 TEST_REQUIRES_ARM_NEON_FMA;
1422 for (size_t k = 1; k <= 40; k += 9) {
1423 for (uint32_t m = 1; m <= 1; m++) {
1424 for (uint32_t n = 1; n <= 8; n++) {
1425 GemmMicrokernelTester()
1426 .mr(1)
1427 .nr(8)
1428 .kr(1)
1429 .sr(1)
1430 .m(m)
1431 .n(n)
1432 .k(k)
1433 .cm_stride(11)
1434 .iterations(1)
1435 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1436 }
1437 }
1438 }
1439 }
1440
1441 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
1442 TEST_REQUIRES_ARM_NEON_FMA;
1443 for (size_t k = 1; k <= 40; k += 9) {
1444 GemmMicrokernelTester()
1445 .mr(1)
1446 .nr(8)
1447 .kr(1)
1448 .sr(1)
1449 .m(1)
1450 .n(8)
1451 .k(k)
1452 .ks(3)
1453 .a_offset(43)
1454 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1455 }
1456 }
1457
1458 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
1459 TEST_REQUIRES_ARM_NEON_FMA;
1460 for (uint32_t mz = 0; mz < 1; mz++) {
1461 for (size_t k = 1; k <= 40; k += 9) {
1462 GemmMicrokernelTester()
1463 .mr(1)
1464 .nr(8)
1465 .kr(1)
1466 .sr(1)
1467 .m(1)
1468 .n(8)
1469 .k(k)
1470 .ks(3)
1471 .a_offset(43)
1472 .zero_index(mz)
1473 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1474 }
1475 }
1476 }
1477
1478 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
1479 TEST_REQUIRES_ARM_NEON_FMA;
1480 GemmMicrokernelTester()
1481 .mr(1)
1482 .nr(8)
1483 .kr(1)
1484 .sr(1)
1485 .m(1)
1486 .n(8)
1487 .k(8)
1488 .qmin(128)
1489 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1490 }
1491
1492 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
1493 TEST_REQUIRES_ARM_NEON_FMA;
1494 GemmMicrokernelTester()
1495 .mr(1)
1496 .nr(8)
1497 .kr(1)
1498 .sr(1)
1499 .m(1)
1500 .n(8)
1501 .k(8)
1502 .qmax(128)
1503 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1504 }
1505
1506 TEST(F32_IGEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
1507 TEST_REQUIRES_ARM_NEON_FMA;
1508 GemmMicrokernelTester()
1509 .mr(1)
1510 .nr(8)
1511 .kr(1)
1512 .sr(1)
1513 .m(1)
1514 .n(8)
1515 .k(8)
1516 .cm_stride(11)
1517 .Test(xnn_f32_igemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1518 }
1519#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1520
1521
1522#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
1523 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
1524 TEST_REQUIRES_ARM_NEON_FMA;
1525 GemmMicrokernelTester()
1526 .mr(4)
1527 .nr(8)
1528 .kr(1)
1529 .sr(1)
1530 .m(4)
1531 .n(8)
1532 .k(4)
1533 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1534 }
1535
1536 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
1537 TEST_REQUIRES_ARM_NEON_FMA;
1538 GemmMicrokernelTester()
1539 .mr(4)
1540 .nr(8)
1541 .kr(1)
1542 .sr(1)
1543 .m(4)
1544 .n(8)
1545 .k(4)
1546 .cn_stride(11)
1547 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1548 }
1549
1550 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
1551 TEST_REQUIRES_ARM_NEON_FMA;
1552 for (uint32_t m = 1; m <= 4; m++) {
1553 for (uint32_t n = 1; n <= 8; n++) {
1554 GemmMicrokernelTester()
1555 .mr(4)
1556 .nr(8)
1557 .kr(1)
1558 .sr(1)
1559 .m(m)
1560 .n(n)
1561 .k(4)
1562 .iterations(1)
1563 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1564 }
1565 }
1566 }
1567
1568 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
1569 TEST_REQUIRES_ARM_NEON_FMA;
1570 for (uint32_t m = 1; m <= 4; m++) {
1571 GemmMicrokernelTester()
1572 .mr(4)
1573 .nr(8)
1574 .kr(1)
1575 .sr(1)
1576 .m(m)
1577 .n(8)
1578 .k(4)
1579 .iterations(1)
1580 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1581 }
1582 }
1583
1584 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
1585 TEST_REQUIRES_ARM_NEON_FMA;
1586 for (uint32_t n = 1; n <= 8; n++) {
1587 GemmMicrokernelTester()
1588 .mr(4)
1589 .nr(8)
1590 .kr(1)
1591 .sr(1)
1592 .m(4)
1593 .n(n)
1594 .k(4)
1595 .iterations(1)
1596 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1597 }
1598 }
1599
1600 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
1601 TEST_REQUIRES_ARM_NEON_FMA;
1602 GemmMicrokernelTester()
1603 .mr(4)
1604 .nr(8)
1605 .kr(1)
1606 .sr(1)
1607 .m(4)
1608 .n(8)
1609 .k(8)
1610 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1611 }
1612
1613 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
1614 TEST_REQUIRES_ARM_NEON_FMA;
1615 for (uint32_t m = 1; m <= 4; m++) {
1616 for (uint32_t n = 1; n <= 8; n++) {
1617 GemmMicrokernelTester()
1618 .mr(4)
1619 .nr(8)
1620 .kr(1)
1621 .sr(1)
1622 .m(m)
1623 .n(n)
1624 .k(8)
1625 .iterations(1)
1626 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1627 }
1628 }
1629 }
1630
1631 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
1632 TEST_REQUIRES_ARM_NEON_FMA;
1633 for (size_t k = 1; k < 8; k++) {
1634 GemmMicrokernelTester()
1635 .mr(4)
1636 .nr(8)
1637 .kr(1)
1638 .sr(1)
1639 .m(4)
1640 .n(8)
1641 .k(k)
1642 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1643 }
1644 }
1645
1646 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
1647 TEST_REQUIRES_ARM_NEON_FMA;
1648 for (size_t k = 1; k < 8; k++) {
1649 for (uint32_t m = 1; m <= 4; m++) {
1650 for (uint32_t n = 1; n <= 8; n++) {
1651 GemmMicrokernelTester()
1652 .mr(4)
1653 .nr(8)
1654 .kr(1)
1655 .sr(1)
1656 .m(m)
1657 .n(n)
1658 .k(k)
1659 .iterations(1)
1660 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1661 }
1662 }
1663 }
1664 }
1665
1666 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
1667 TEST_REQUIRES_ARM_NEON_FMA;
1668 for (size_t k = 9; k < 8; k++) {
1669 GemmMicrokernelTester()
1670 .mr(4)
1671 .nr(8)
1672 .kr(1)
1673 .sr(1)
1674 .m(4)
1675 .n(8)
1676 .k(k)
1677 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1678 }
1679 }
1680
1681 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
1682 TEST_REQUIRES_ARM_NEON_FMA;
1683 for (size_t k = 9; k < 8; k++) {
1684 for (uint32_t m = 1; m <= 4; m++) {
1685 for (uint32_t n = 1; n <= 8; n++) {
1686 GemmMicrokernelTester()
1687 .mr(4)
1688 .nr(8)
1689 .kr(1)
1690 .sr(1)
1691 .m(m)
1692 .n(n)
1693 .k(k)
1694 .iterations(1)
1695 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1696 }
1697 }
1698 }
1699 }
1700
1701 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
1702 TEST_REQUIRES_ARM_NEON_FMA;
1703 for (size_t k = 12; k <= 40; k += 4) {
1704 GemmMicrokernelTester()
1705 .mr(4)
1706 .nr(8)
1707 .kr(1)
1708 .sr(1)
1709 .m(4)
1710 .n(8)
1711 .k(k)
1712 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1713 }
1714 }
1715
1716 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
1717 TEST_REQUIRES_ARM_NEON_FMA;
1718 for (size_t k = 12; k <= 40; k += 4) {
1719 for (uint32_t m = 1; m <= 4; m++) {
1720 for (uint32_t n = 1; n <= 8; n++) {
1721 GemmMicrokernelTester()
1722 .mr(4)
1723 .nr(8)
1724 .kr(1)
1725 .sr(1)
1726 .m(m)
1727 .n(n)
1728 .k(k)
1729 .iterations(1)
1730 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1731 }
1732 }
1733 }
1734 }
1735
1736 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
1737 TEST_REQUIRES_ARM_NEON_FMA;
1738 for (uint32_t n = 9; n < 16; n++) {
1739 for (size_t k = 1; k <= 20; k += 5) {
1740 GemmMicrokernelTester()
1741 .mr(4)
1742 .nr(8)
1743 .kr(1)
1744 .sr(1)
1745 .m(4)
1746 .n(8)
1747 .k(k)
1748 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1749 }
1750 }
1751 }
1752
1753 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
1754 TEST_REQUIRES_ARM_NEON_FMA;
1755 for (uint32_t n = 9; n < 16; n++) {
1756 for (size_t k = 1; k <= 20; k += 5) {
1757 GemmMicrokernelTester()
1758 .mr(4)
1759 .nr(8)
1760 .kr(1)
1761 .sr(1)
1762 .m(4)
1763 .n(8)
1764 .k(k)
1765 .cn_stride(11)
1766 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1767 }
1768 }
1769 }
1770
1771 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
1772 TEST_REQUIRES_ARM_NEON_FMA;
1773 for (uint32_t n = 9; n < 16; n++) {
1774 for (size_t k = 1; k <= 20; k += 5) {
1775 for (uint32_t m = 1; m <= 4; m++) {
1776 GemmMicrokernelTester()
1777 .mr(4)
1778 .nr(8)
1779 .kr(1)
1780 .sr(1)
1781 .m(m)
1782 .n(n)
1783 .k(k)
1784 .iterations(1)
1785 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1786 }
1787 }
1788 }
1789 }
1790
1791 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
1792 TEST_REQUIRES_ARM_NEON_FMA;
1793 for (uint32_t n = 16; n <= 24; n += 8) {
1794 for (size_t k = 1; k <= 20; k += 5) {
1795 GemmMicrokernelTester()
1796 .mr(4)
1797 .nr(8)
1798 .kr(1)
1799 .sr(1)
1800 .m(4)
1801 .n(8)
1802 .k(k)
1803 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1804 }
1805 }
1806 }
1807
1808 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
1809 TEST_REQUIRES_ARM_NEON_FMA;
1810 for (uint32_t n = 16; n <= 24; n += 8) {
1811 for (size_t k = 1; k <= 20; k += 5) {
1812 GemmMicrokernelTester()
1813 .mr(4)
1814 .nr(8)
1815 .kr(1)
1816 .sr(1)
1817 .m(4)
1818 .n(n)
1819 .k(k)
1820 .cn_stride(11)
1821 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1822 }
1823 }
1824 }
1825
1826 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
1827 TEST_REQUIRES_ARM_NEON_FMA;
1828 for (uint32_t n = 16; n <= 24; n += 8) {
1829 for (size_t k = 1; k <= 20; k += 5) {
1830 for (uint32_t m = 1; m <= 4; m++) {
1831 GemmMicrokernelTester()
1832 .mr(4)
1833 .nr(8)
1834 .kr(1)
1835 .sr(1)
1836 .m(m)
1837 .n(n)
1838 .k(k)
1839 .iterations(1)
1840 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1841 }
1842 }
1843 }
1844 }
1845
1846 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
1847 TEST_REQUIRES_ARM_NEON_FMA;
1848 for (size_t k = 1; k <= 20; k += 5) {
1849 GemmMicrokernelTester()
1850 .mr(4)
1851 .nr(8)
1852 .kr(1)
1853 .sr(1)
1854 .m(4)
1855 .n(8)
1856 .k(k)
1857 .ks(3)
1858 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1859 }
1860 }
1861
1862 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
1863 TEST_REQUIRES_ARM_NEON_FMA;
1864 for (size_t k = 1; k <= 20; k += 5) {
1865 for (uint32_t m = 1; m <= 4; m++) {
1866 for (uint32_t n = 1; n <= 8; n++) {
1867 GemmMicrokernelTester()
1868 .mr(4)
1869 .nr(8)
1870 .kr(1)
1871 .sr(1)
1872 .m(m)
1873 .n(n)
1874 .k(k)
1875 .ks(3)
1876 .iterations(1)
1877 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1878 }
1879 }
1880 }
1881 }
1882
1883 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_small_kernel) {
1884 TEST_REQUIRES_ARM_NEON_FMA;
1885 for (uint32_t n = 9; n < 16; n++) {
1886 for (size_t k = 1; k <= 20; k += 5) {
1887 GemmMicrokernelTester()
1888 .mr(4)
1889 .nr(8)
1890 .kr(1)
1891 .sr(1)
1892 .m(4)
1893 .n(8)
1894 .k(k)
1895 .ks(3)
1896 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1897 }
1898 }
1899 }
1900
1901 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_small_kernel) {
1902 TEST_REQUIRES_ARM_NEON_FMA;
1903 for (uint32_t n = 16; n <= 24; n += 8) {
1904 for (size_t k = 1; k <= 20; k += 5) {
1905 GemmMicrokernelTester()
1906 .mr(4)
1907 .nr(8)
1908 .kr(1)
1909 .sr(1)
1910 .m(4)
1911 .n(8)
1912 .k(k)
1913 .ks(3)
1914 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1915 }
1916 }
1917 }
1918
1919 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
1920 TEST_REQUIRES_ARM_NEON_FMA;
1921 for (size_t k = 1; k <= 20; k += 5) {
1922 for (uint32_t m = 1; m <= 4; m++) {
1923 for (uint32_t n = 1; n <= 8; n++) {
1924 GemmMicrokernelTester()
1925 .mr(4)
1926 .nr(8)
1927 .kr(1)
1928 .sr(1)
1929 .m(m)
1930 .n(n)
1931 .k(k)
1932 .cm_stride(11)
1933 .iterations(1)
1934 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1935 }
1936 }
1937 }
1938 }
1939
1940 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
1941 TEST_REQUIRES_ARM_NEON_FMA;
1942 for (size_t k = 1; k <= 20; k += 5) {
1943 GemmMicrokernelTester()
1944 .mr(4)
1945 .nr(8)
1946 .kr(1)
1947 .sr(1)
1948 .m(4)
1949 .n(8)
1950 .k(k)
1951 .ks(3)
1952 .a_offset(83)
1953 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1954 }
1955 }
1956
1957 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, zero) {
1958 TEST_REQUIRES_ARM_NEON_FMA;
1959 for (uint32_t mz = 0; mz < 4; mz++) {
1960 for (size_t k = 1; k <= 20; k += 5) {
1961 GemmMicrokernelTester()
1962 .mr(4)
1963 .nr(8)
1964 .kr(1)
1965 .sr(1)
1966 .m(4)
1967 .n(8)
1968 .k(k)
1969 .ks(3)
1970 .a_offset(83)
1971 .zero_index(mz)
1972 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1973 }
1974 }
1975 }
1976
1977 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
1978 TEST_REQUIRES_ARM_NEON_FMA;
1979 GemmMicrokernelTester()
1980 .mr(4)
1981 .nr(8)
1982 .kr(1)
1983 .sr(1)
1984 .m(4)
1985 .n(8)
1986 .k(4)
1987 .qmin(128)
1988 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1989 }
1990
1991 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
1992 TEST_REQUIRES_ARM_NEON_FMA;
1993 GemmMicrokernelTester()
1994 .mr(4)
1995 .nr(8)
1996 .kr(1)
1997 .sr(1)
1998 .m(4)
1999 .n(8)
2000 .k(4)
2001 .qmax(128)
2002 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2003 }
2004
2005 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
2006 TEST_REQUIRES_ARM_NEON_FMA;
2007 GemmMicrokernelTester()
2008 .mr(4)
2009 .nr(8)
2010 .kr(1)
2011 .sr(1)
2012 .m(4)
2013 .n(8)
2014 .k(4)
2015 .cm_stride(11)
2016 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2017 }
2018#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2019
2020
2021#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2022 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
2023 TEST_REQUIRES_ARM_NEON_FMA;
2024 GemmMicrokernelTester()
2025 .mr(4)
2026 .nr(8)
2027 .kr(1)
2028 .sr(1)
2029 .m(4)
2030 .n(8)
2031 .k(4)
2032 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2033 }
2034
2035 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
2036 TEST_REQUIRES_ARM_NEON_FMA;
2037 GemmMicrokernelTester()
2038 .mr(4)
2039 .nr(8)
2040 .kr(1)
2041 .sr(1)
2042 .m(4)
2043 .n(8)
2044 .k(4)
2045 .cn_stride(11)
2046 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2047 }
2048
2049 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
2050 TEST_REQUIRES_ARM_NEON_FMA;
2051 for (uint32_t m = 1; m <= 4; m++) {
2052 for (uint32_t n = 1; n <= 8; n++) {
2053 GemmMicrokernelTester()
2054 .mr(4)
2055 .nr(8)
2056 .kr(1)
2057 .sr(1)
2058 .m(m)
2059 .n(n)
2060 .k(4)
2061 .iterations(1)
2062 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2063 }
2064 }
2065 }
2066
2067 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
2068 TEST_REQUIRES_ARM_NEON_FMA;
2069 for (uint32_t m = 1; m <= 4; m++) {
2070 GemmMicrokernelTester()
2071 .mr(4)
2072 .nr(8)
2073 .kr(1)
2074 .sr(1)
2075 .m(m)
2076 .n(8)
2077 .k(4)
2078 .iterations(1)
2079 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2080 }
2081 }
2082
2083 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
2084 TEST_REQUIRES_ARM_NEON_FMA;
2085 for (uint32_t n = 1; n <= 8; n++) {
2086 GemmMicrokernelTester()
2087 .mr(4)
2088 .nr(8)
2089 .kr(1)
2090 .sr(1)
2091 .m(4)
2092 .n(n)
2093 .k(4)
2094 .iterations(1)
2095 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2096 }
2097 }
2098
2099 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
2100 TEST_REQUIRES_ARM_NEON_FMA;
2101 GemmMicrokernelTester()
2102 .mr(4)
2103 .nr(8)
2104 .kr(1)
2105 .sr(1)
2106 .m(4)
2107 .n(8)
2108 .k(8)
2109 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2110 }
2111
2112 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
2113 TEST_REQUIRES_ARM_NEON_FMA;
2114 for (uint32_t m = 1; m <= 4; m++) {
2115 for (uint32_t n = 1; n <= 8; n++) {
2116 GemmMicrokernelTester()
2117 .mr(4)
2118 .nr(8)
2119 .kr(1)
2120 .sr(1)
2121 .m(m)
2122 .n(n)
2123 .k(8)
2124 .iterations(1)
2125 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2126 }
2127 }
2128 }
2129
2130 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
2131 TEST_REQUIRES_ARM_NEON_FMA;
2132 for (size_t k = 1; k < 8; k++) {
2133 GemmMicrokernelTester()
2134 .mr(4)
2135 .nr(8)
2136 .kr(1)
2137 .sr(1)
2138 .m(4)
2139 .n(8)
2140 .k(k)
2141 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2142 }
2143 }
2144
2145 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
2146 TEST_REQUIRES_ARM_NEON_FMA;
2147 for (size_t k = 1; k < 8; k++) {
2148 for (uint32_t m = 1; m <= 4; m++) {
2149 for (uint32_t n = 1; n <= 8; n++) {
2150 GemmMicrokernelTester()
2151 .mr(4)
2152 .nr(8)
2153 .kr(1)
2154 .sr(1)
2155 .m(m)
2156 .n(n)
2157 .k(k)
2158 .iterations(1)
2159 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2160 }
2161 }
2162 }
2163 }
2164
2165 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
2166 TEST_REQUIRES_ARM_NEON_FMA;
2167 for (size_t k = 9; k < 8; k++) {
2168 GemmMicrokernelTester()
2169 .mr(4)
2170 .nr(8)
2171 .kr(1)
2172 .sr(1)
2173 .m(4)
2174 .n(8)
2175 .k(k)
2176 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2177 }
2178 }
2179
2180 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_subtile) {
2181 TEST_REQUIRES_ARM_NEON_FMA;
2182 for (size_t k = 9; k < 8; k++) {
2183 for (uint32_t m = 1; m <= 4; m++) {
2184 for (uint32_t n = 1; n <= 8; n++) {
2185 GemmMicrokernelTester()
2186 .mr(4)
2187 .nr(8)
2188 .kr(1)
2189 .sr(1)
2190 .m(m)
2191 .n(n)
2192 .k(k)
2193 .iterations(1)
2194 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2195 }
2196 }
2197 }
2198 }
2199
2200 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
2201 TEST_REQUIRES_ARM_NEON_FMA;
2202 for (size_t k = 12; k <= 40; k += 4) {
2203 GemmMicrokernelTester()
2204 .mr(4)
2205 .nr(8)
2206 .kr(1)
2207 .sr(1)
2208 .m(4)
2209 .n(8)
2210 .k(k)
2211 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2212 }
2213 }
2214
2215 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
2216 TEST_REQUIRES_ARM_NEON_FMA;
2217 for (size_t k = 12; k <= 40; k += 4) {
2218 for (uint32_t m = 1; m <= 4; m++) {
2219 for (uint32_t n = 1; n <= 8; n++) {
2220 GemmMicrokernelTester()
2221 .mr(4)
2222 .nr(8)
2223 .kr(1)
2224 .sr(1)
2225 .m(m)
2226 .n(n)
2227 .k(k)
2228 .iterations(1)
2229 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2230 }
2231 }
2232 }
2233 }
2234
2235 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
2236 TEST_REQUIRES_ARM_NEON_FMA;
2237 for (uint32_t n = 9; n < 16; n++) {
2238 for (size_t k = 1; k <= 20; k += 5) {
2239 GemmMicrokernelTester()
2240 .mr(4)
2241 .nr(8)
2242 .kr(1)
2243 .sr(1)
2244 .m(4)
2245 .n(8)
2246 .k(k)
2247 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2248 }
2249 }
2250 }
2251
2252 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
2253 TEST_REQUIRES_ARM_NEON_FMA;
2254 for (uint32_t n = 9; n < 16; n++) {
2255 for (size_t k = 1; k <= 20; k += 5) {
2256 GemmMicrokernelTester()
2257 .mr(4)
2258 .nr(8)
2259 .kr(1)
2260 .sr(1)
2261 .m(4)
2262 .n(8)
2263 .k(k)
2264 .cn_stride(11)
2265 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2266 }
2267 }
2268 }
2269
2270 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
2271 TEST_REQUIRES_ARM_NEON_FMA;
2272 for (uint32_t n = 9; n < 16; n++) {
2273 for (size_t k = 1; k <= 20; k += 5) {
2274 for (uint32_t m = 1; m <= 4; m++) {
2275 GemmMicrokernelTester()
2276 .mr(4)
2277 .nr(8)
2278 .kr(1)
2279 .sr(1)
2280 .m(m)
2281 .n(n)
2282 .k(k)
2283 .iterations(1)
2284 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2285 }
2286 }
2287 }
2288 }
2289
2290 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
2291 TEST_REQUIRES_ARM_NEON_FMA;
2292 for (uint32_t n = 16; n <= 24; n += 8) {
2293 for (size_t k = 1; k <= 20; k += 5) {
2294 GemmMicrokernelTester()
2295 .mr(4)
2296 .nr(8)
2297 .kr(1)
2298 .sr(1)
2299 .m(4)
2300 .n(8)
2301 .k(k)
2302 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2303 }
2304 }
2305 }
2306
2307 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
2308 TEST_REQUIRES_ARM_NEON_FMA;
2309 for (uint32_t n = 16; n <= 24; n += 8) {
2310 for (size_t k = 1; k <= 20; k += 5) {
2311 GemmMicrokernelTester()
2312 .mr(4)
2313 .nr(8)
2314 .kr(1)
2315 .sr(1)
2316 .m(4)
2317 .n(n)
2318 .k(k)
2319 .cn_stride(11)
2320 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2321 }
2322 }
2323 }
2324
2325 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
2326 TEST_REQUIRES_ARM_NEON_FMA;
2327 for (uint32_t n = 16; n <= 24; n += 8) {
2328 for (size_t k = 1; k <= 20; k += 5) {
2329 for (uint32_t m = 1; m <= 4; m++) {
2330 GemmMicrokernelTester()
2331 .mr(4)
2332 .nr(8)
2333 .kr(1)
2334 .sr(1)
2335 .m(m)
2336 .n(n)
2337 .k(k)
2338 .iterations(1)
2339 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2340 }
2341 }
2342 }
2343 }
2344
2345 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, small_kernel) {
2346 TEST_REQUIRES_ARM_NEON_FMA;
2347 for (size_t k = 1; k <= 20; k += 5) {
2348 GemmMicrokernelTester()
2349 .mr(4)
2350 .nr(8)
2351 .kr(1)
2352 .sr(1)
2353 .m(4)
2354 .n(8)
2355 .k(k)
2356 .ks(3)
2357 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2358 }
2359 }
2360
2361 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, small_kernel_subtile) {
2362 TEST_REQUIRES_ARM_NEON_FMA;
2363 for (size_t k = 1; k <= 20; k += 5) {
2364 for (uint32_t m = 1; m <= 4; m++) {
2365 for (uint32_t n = 1; n <= 8; n++) {
2366 GemmMicrokernelTester()
2367 .mr(4)
2368 .nr(8)
2369 .kr(1)
2370 .sr(1)
2371 .m(m)
2372 .n(n)
2373 .k(k)
2374 .ks(3)
2375 .iterations(1)
2376 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2377 }
2378 }
2379 }
2380 }
2381
2382 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_small_kernel) {
2383 TEST_REQUIRES_ARM_NEON_FMA;
2384 for (uint32_t n = 9; n < 16; n++) {
2385 for (size_t k = 1; k <= 20; k += 5) {
2386 GemmMicrokernelTester()
2387 .mr(4)
2388 .nr(8)
2389 .kr(1)
2390 .sr(1)
2391 .m(4)
2392 .n(8)
2393 .k(k)
2394 .ks(3)
2395 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2396 }
2397 }
2398 }
2399
2400 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_small_kernel) {
2401 TEST_REQUIRES_ARM_NEON_FMA;
2402 for (uint32_t n = 16; n <= 24; n += 8) {
2403 for (size_t k = 1; k <= 20; k += 5) {
2404 GemmMicrokernelTester()
2405 .mr(4)
2406 .nr(8)
2407 .kr(1)
2408 .sr(1)
2409 .m(4)
2410 .n(8)
2411 .k(k)
2412 .ks(3)
2413 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2414 }
2415 }
2416 }
2417
2418 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
2419 TEST_REQUIRES_ARM_NEON_FMA;
2420 for (size_t k = 1; k <= 20; k += 5) {
2421 for (uint32_t m = 1; m <= 4; m++) {
2422 for (uint32_t n = 1; n <= 8; n++) {
2423 GemmMicrokernelTester()
2424 .mr(4)
2425 .nr(8)
2426 .kr(1)
2427 .sr(1)
2428 .m(m)
2429 .n(n)
2430 .k(k)
2431 .cm_stride(11)
2432 .iterations(1)
2433 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2434 }
2435 }
2436 }
2437 }
2438
2439 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, a_offset) {
2440 TEST_REQUIRES_ARM_NEON_FMA;
2441 for (size_t k = 1; k <= 20; k += 5) {
2442 GemmMicrokernelTester()
2443 .mr(4)
2444 .nr(8)
2445 .kr(1)
2446 .sr(1)
2447 .m(4)
2448 .n(8)
2449 .k(k)
2450 .ks(3)
2451 .a_offset(83)
2452 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2453 }
2454 }
2455
2456 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, zero) {
2457 TEST_REQUIRES_ARM_NEON_FMA;
2458 for (uint32_t mz = 0; mz < 4; mz++) {
2459 for (size_t k = 1; k <= 20; k += 5) {
2460 GemmMicrokernelTester()
2461 .mr(4)
2462 .nr(8)
2463 .kr(1)
2464 .sr(1)
2465 .m(4)
2466 .n(8)
2467 .k(k)
2468 .ks(3)
2469 .a_offset(83)
2470 .zero_index(mz)
2471 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2472 }
2473 }
2474 }
2475
2476 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
2477 TEST_REQUIRES_ARM_NEON_FMA;
2478 GemmMicrokernelTester()
2479 .mr(4)
2480 .nr(8)
2481 .kr(1)
2482 .sr(1)
2483 .m(4)
2484 .n(8)
2485 .k(4)
2486 .qmin(128)
2487 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2488 }
2489
2490 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
2491 TEST_REQUIRES_ARM_NEON_FMA;
2492 GemmMicrokernelTester()
2493 .mr(4)
2494 .nr(8)
2495 .kr(1)
2496 .sr(1)
2497 .m(4)
2498 .n(8)
2499 .k(4)
2500 .qmax(128)
2501 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2502 }
2503
2504 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
2505 TEST_REQUIRES_ARM_NEON_FMA;
2506 GemmMicrokernelTester()
2507 .mr(4)
2508 .nr(8)
2509 .kr(1)
2510 .sr(1)
2511 .m(4)
2512 .n(8)
2513 .k(4)
2514 .cm_stride(11)
2515 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a55);
2516 }
2517#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2518
2519
2520#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2521 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
2522 TEST_REQUIRES_ARM_NEON_FMA;
2523 GemmMicrokernelTester()
2524 .mr(4)
2525 .nr(8)
2526 .kr(1)
2527 .sr(1)
2528 .m(4)
2529 .n(8)
2530 .k(8)
2531 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2532 }
2533
2534 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
2535 TEST_REQUIRES_ARM_NEON_FMA;
2536 GemmMicrokernelTester()
2537 .mr(4)
2538 .nr(8)
2539 .kr(1)
2540 .sr(1)
2541 .m(4)
2542 .n(8)
2543 .k(8)
2544 .cn_stride(11)
2545 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2546 }
2547
2548 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
2549 TEST_REQUIRES_ARM_NEON_FMA;
2550 for (uint32_t m = 1; m <= 4; m++) {
2551 for (uint32_t n = 1; n <= 8; n++) {
2552 GemmMicrokernelTester()
2553 .mr(4)
2554 .nr(8)
2555 .kr(1)
2556 .sr(1)
2557 .m(m)
2558 .n(n)
2559 .k(8)
2560 .iterations(1)
2561 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2562 }
2563 }
2564 }
2565
2566 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
2567 TEST_REQUIRES_ARM_NEON_FMA;
2568 for (uint32_t m = 1; m <= 4; m++) {
2569 GemmMicrokernelTester()
2570 .mr(4)
2571 .nr(8)
2572 .kr(1)
2573 .sr(1)
2574 .m(m)
2575 .n(8)
2576 .k(8)
2577 .iterations(1)
2578 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2579 }
2580 }
2581
2582 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
2583 TEST_REQUIRES_ARM_NEON_FMA;
2584 for (uint32_t n = 1; n <= 8; n++) {
2585 GemmMicrokernelTester()
2586 .mr(4)
2587 .nr(8)
2588 .kr(1)
2589 .sr(1)
2590 .m(4)
2591 .n(n)
2592 .k(8)
2593 .iterations(1)
2594 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2595 }
2596 }
2597
2598 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
2599 TEST_REQUIRES_ARM_NEON_FMA;
2600 GemmMicrokernelTester()
2601 .mr(4)
2602 .nr(8)
2603 .kr(1)
2604 .sr(1)
2605 .m(4)
2606 .n(8)
2607 .k(16)
2608 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2609 }
2610
2611 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
2612 TEST_REQUIRES_ARM_NEON_FMA;
2613 for (uint32_t m = 1; m <= 4; m++) {
2614 for (uint32_t n = 1; n <= 8; n++) {
2615 GemmMicrokernelTester()
2616 .mr(4)
2617 .nr(8)
2618 .kr(1)
2619 .sr(1)
2620 .m(m)
2621 .n(n)
2622 .k(16)
2623 .iterations(1)
2624 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2625 }
2626 }
2627 }
2628
2629 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
2630 TEST_REQUIRES_ARM_NEON_FMA;
2631 for (size_t k = 1; k < 16; k++) {
2632 GemmMicrokernelTester()
2633 .mr(4)
2634 .nr(8)
2635 .kr(1)
2636 .sr(1)
2637 .m(4)
2638 .n(8)
2639 .k(k)
2640 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2641 }
2642 }
2643
2644 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
2645 TEST_REQUIRES_ARM_NEON_FMA;
2646 for (size_t k = 1; k < 16; k++) {
2647 for (uint32_t m = 1; m <= 4; m++) {
2648 for (uint32_t n = 1; n <= 8; n++) {
2649 GemmMicrokernelTester()
2650 .mr(4)
2651 .nr(8)
2652 .kr(1)
2653 .sr(1)
2654 .m(m)
2655 .n(n)
2656 .k(k)
2657 .iterations(1)
2658 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2659 }
2660 }
2661 }
2662 }
2663
2664 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
2665 TEST_REQUIRES_ARM_NEON_FMA;
2666 for (size_t k = 17; k < 16; k++) {
2667 GemmMicrokernelTester()
2668 .mr(4)
2669 .nr(8)
2670 .kr(1)
2671 .sr(1)
2672 .m(4)
2673 .n(8)
2674 .k(k)
2675 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2676 }
2677 }
2678
2679 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
2680 TEST_REQUIRES_ARM_NEON_FMA;
2681 for (size_t k = 17; k < 16; k++) {
2682 for (uint32_t m = 1; m <= 4; m++) {
2683 for (uint32_t n = 1; n <= 8; n++) {
2684 GemmMicrokernelTester()
2685 .mr(4)
2686 .nr(8)
2687 .kr(1)
2688 .sr(1)
2689 .m(m)
2690 .n(n)
2691 .k(k)
2692 .iterations(1)
2693 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2694 }
2695 }
2696 }
2697 }
2698
2699 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
2700 TEST_REQUIRES_ARM_NEON_FMA;
2701 for (size_t k = 24; k <= 80; k += 8) {
2702 GemmMicrokernelTester()
2703 .mr(4)
2704 .nr(8)
2705 .kr(1)
2706 .sr(1)
2707 .m(4)
2708 .n(8)
2709 .k(k)
2710 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2711 }
2712 }
2713
2714 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
2715 TEST_REQUIRES_ARM_NEON_FMA;
2716 for (size_t k = 24; k <= 80; k += 8) {
2717 for (uint32_t m = 1; m <= 4; m++) {
2718 for (uint32_t n = 1; n <= 8; n++) {
2719 GemmMicrokernelTester()
2720 .mr(4)
2721 .nr(8)
2722 .kr(1)
2723 .sr(1)
2724 .m(m)
2725 .n(n)
2726 .k(k)
2727 .iterations(1)
2728 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2729 }
2730 }
2731 }
2732 }
2733
2734 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
2735 TEST_REQUIRES_ARM_NEON_FMA;
2736 for (uint32_t n = 9; n < 16; n++) {
2737 for (size_t k = 1; k <= 40; k += 9) {
2738 GemmMicrokernelTester()
2739 .mr(4)
2740 .nr(8)
2741 .kr(1)
2742 .sr(1)
2743 .m(4)
2744 .n(8)
2745 .k(k)
2746 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2747 }
2748 }
2749 }
2750
2751 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
2752 TEST_REQUIRES_ARM_NEON_FMA;
2753 for (uint32_t n = 9; n < 16; n++) {
2754 for (size_t k = 1; k <= 40; k += 9) {
2755 GemmMicrokernelTester()
2756 .mr(4)
2757 .nr(8)
2758 .kr(1)
2759 .sr(1)
2760 .m(4)
2761 .n(8)
2762 .k(k)
2763 .cn_stride(11)
2764 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2765 }
2766 }
2767 }
2768
2769 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
2770 TEST_REQUIRES_ARM_NEON_FMA;
2771 for (uint32_t n = 9; n < 16; n++) {
2772 for (size_t k = 1; k <= 40; k += 9) {
2773 for (uint32_t m = 1; m <= 4; m++) {
2774 GemmMicrokernelTester()
2775 .mr(4)
2776 .nr(8)
2777 .kr(1)
2778 .sr(1)
2779 .m(m)
2780 .n(n)
2781 .k(k)
2782 .iterations(1)
2783 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2784 }
2785 }
2786 }
2787 }
2788
2789 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
2790 TEST_REQUIRES_ARM_NEON_FMA;
2791 for (uint32_t n = 16; n <= 24; n += 8) {
2792 for (size_t k = 1; k <= 40; k += 9) {
2793 GemmMicrokernelTester()
2794 .mr(4)
2795 .nr(8)
2796 .kr(1)
2797 .sr(1)
2798 .m(4)
2799 .n(8)
2800 .k(k)
2801 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2802 }
2803 }
2804 }
2805
2806 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
2807 TEST_REQUIRES_ARM_NEON_FMA;
2808 for (uint32_t n = 16; n <= 24; n += 8) {
2809 for (size_t k = 1; k <= 40; k += 9) {
2810 GemmMicrokernelTester()
2811 .mr(4)
2812 .nr(8)
2813 .kr(1)
2814 .sr(1)
2815 .m(4)
2816 .n(n)
2817 .k(k)
2818 .cn_stride(11)
2819 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2820 }
2821 }
2822 }
2823
2824 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
2825 TEST_REQUIRES_ARM_NEON_FMA;
2826 for (uint32_t n = 16; n <= 24; n += 8) {
2827 for (size_t k = 1; k <= 40; k += 9) {
2828 for (uint32_t m = 1; m <= 4; m++) {
2829 GemmMicrokernelTester()
2830 .mr(4)
2831 .nr(8)
2832 .kr(1)
2833 .sr(1)
2834 .m(m)
2835 .n(n)
2836 .k(k)
2837 .iterations(1)
2838 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2839 }
2840 }
2841 }
2842 }
2843
2844 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
2845 TEST_REQUIRES_ARM_NEON_FMA;
2846 for (size_t k = 1; k <= 40; k += 9) {
2847 GemmMicrokernelTester()
2848 .mr(4)
2849 .nr(8)
2850 .kr(1)
2851 .sr(1)
2852 .m(4)
2853 .n(8)
2854 .k(k)
2855 .ks(3)
2856 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2857 }
2858 }
2859
2860 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
2861 TEST_REQUIRES_ARM_NEON_FMA;
2862 for (size_t k = 1; k <= 40; k += 9) {
2863 for (uint32_t m = 1; m <= 4; m++) {
2864 for (uint32_t n = 1; n <= 8; n++) {
2865 GemmMicrokernelTester()
2866 .mr(4)
2867 .nr(8)
2868 .kr(1)
2869 .sr(1)
2870 .m(m)
2871 .n(n)
2872 .k(k)
2873 .ks(3)
2874 .iterations(1)
2875 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2876 }
2877 }
2878 }
2879 }
2880
2881 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
2882 TEST_REQUIRES_ARM_NEON_FMA;
2883 for (uint32_t n = 9; n < 16; n++) {
2884 for (size_t k = 1; k <= 40; k += 9) {
2885 GemmMicrokernelTester()
2886 .mr(4)
2887 .nr(8)
2888 .kr(1)
2889 .sr(1)
2890 .m(4)
2891 .n(8)
2892 .k(k)
2893 .ks(3)
2894 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2895 }
2896 }
2897 }
2898
2899 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
2900 TEST_REQUIRES_ARM_NEON_FMA;
2901 for (uint32_t n = 16; n <= 24; n += 8) {
2902 for (size_t k = 1; k <= 40; k += 9) {
2903 GemmMicrokernelTester()
2904 .mr(4)
2905 .nr(8)
2906 .kr(1)
2907 .sr(1)
2908 .m(4)
2909 .n(8)
2910 .k(k)
2911 .ks(3)
2912 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2913 }
2914 }
2915 }
2916
2917 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
2918 TEST_REQUIRES_ARM_NEON_FMA;
2919 for (size_t k = 1; k <= 40; k += 9) {
2920 for (uint32_t m = 1; m <= 4; m++) {
2921 for (uint32_t n = 1; n <= 8; n++) {
2922 GemmMicrokernelTester()
2923 .mr(4)
2924 .nr(8)
2925 .kr(1)
2926 .sr(1)
2927 .m(m)
2928 .n(n)
2929 .k(k)
2930 .cm_stride(11)
2931 .iterations(1)
2932 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2933 }
2934 }
2935 }
2936 }
2937
2938 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
2939 TEST_REQUIRES_ARM_NEON_FMA;
2940 for (size_t k = 1; k <= 40; k += 9) {
2941 GemmMicrokernelTester()
2942 .mr(4)
2943 .nr(8)
2944 .kr(1)
2945 .sr(1)
2946 .m(4)
2947 .n(8)
2948 .k(k)
2949 .ks(3)
2950 .a_offset(163)
2951 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2952 }
2953 }
2954
2955 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
2956 TEST_REQUIRES_ARM_NEON_FMA;
2957 for (uint32_t mz = 0; mz < 4; mz++) {
2958 for (size_t k = 1; k <= 40; k += 9) {
2959 GemmMicrokernelTester()
2960 .mr(4)
2961 .nr(8)
2962 .kr(1)
2963 .sr(1)
2964 .m(4)
2965 .n(8)
2966 .k(k)
2967 .ks(3)
2968 .a_offset(163)
2969 .zero_index(mz)
2970 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2971 }
2972 }
2973 }
2974
2975 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
2976 TEST_REQUIRES_ARM_NEON_FMA;
2977 GemmMicrokernelTester()
2978 .mr(4)
2979 .nr(8)
2980 .kr(1)
2981 .sr(1)
2982 .m(4)
2983 .n(8)
2984 .k(8)
2985 .qmin(128)
2986 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2987 }
2988
2989 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
2990 TEST_REQUIRES_ARM_NEON_FMA;
2991 GemmMicrokernelTester()
2992 .mr(4)
2993 .nr(8)
2994 .kr(1)
2995 .sr(1)
2996 .m(4)
2997 .n(8)
2998 .k(8)
2999 .qmax(128)
3000 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
3001 }
3002
3003 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
3004 TEST_REQUIRES_ARM_NEON_FMA;
3005 GemmMicrokernelTester()
3006 .mr(4)
3007 .nr(8)
3008 .kr(1)
3009 .sr(1)
3010 .m(4)
3011 .n(8)
3012 .k(8)
3013 .cm_stride(11)
3014 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
3015 }
3016#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3017
3018
3019#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3020 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
3021 TEST_REQUIRES_ARM_NEON_FMA;
3022 GemmMicrokernelTester()
3023 .mr(4)
3024 .nr(8)
3025 .kr(1)
3026 .sr(1)
3027 .m(4)
3028 .n(8)
3029 .k(8)
3030 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3031 }
3032
3033 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
3034 TEST_REQUIRES_ARM_NEON_FMA;
3035 GemmMicrokernelTester()
3036 .mr(4)
3037 .nr(8)
3038 .kr(1)
3039 .sr(1)
3040 .m(4)
3041 .n(8)
3042 .k(8)
3043 .cn_stride(11)
3044 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3045 }
3046
3047 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
3048 TEST_REQUIRES_ARM_NEON_FMA;
3049 for (uint32_t m = 1; m <= 4; m++) {
3050 for (uint32_t n = 1; n <= 8; n++) {
3051 GemmMicrokernelTester()
3052 .mr(4)
3053 .nr(8)
3054 .kr(1)
3055 .sr(1)
3056 .m(m)
3057 .n(n)
3058 .k(8)
3059 .iterations(1)
3060 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3061 }
3062 }
3063 }
3064
3065 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
3066 TEST_REQUIRES_ARM_NEON_FMA;
3067 for (uint32_t m = 1; m <= 4; m++) {
3068 GemmMicrokernelTester()
3069 .mr(4)
3070 .nr(8)
3071 .kr(1)
3072 .sr(1)
3073 .m(m)
3074 .n(8)
3075 .k(8)
3076 .iterations(1)
3077 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3078 }
3079 }
3080
3081 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
3082 TEST_REQUIRES_ARM_NEON_FMA;
3083 for (uint32_t n = 1; n <= 8; n++) {
3084 GemmMicrokernelTester()
3085 .mr(4)
3086 .nr(8)
3087 .kr(1)
3088 .sr(1)
3089 .m(4)
3090 .n(n)
3091 .k(8)
3092 .iterations(1)
3093 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3094 }
3095 }
3096
3097 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
3098 TEST_REQUIRES_ARM_NEON_FMA;
3099 GemmMicrokernelTester()
3100 .mr(4)
3101 .nr(8)
3102 .kr(1)
3103 .sr(1)
3104 .m(4)
3105 .n(8)
3106 .k(16)
3107 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3108 }
3109
3110 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
3111 TEST_REQUIRES_ARM_NEON_FMA;
3112 for (uint32_t m = 1; m <= 4; m++) {
3113 for (uint32_t n = 1; n <= 8; n++) {
3114 GemmMicrokernelTester()
3115 .mr(4)
3116 .nr(8)
3117 .kr(1)
3118 .sr(1)
3119 .m(m)
3120 .n(n)
3121 .k(16)
3122 .iterations(1)
3123 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3124 }
3125 }
3126 }
3127
3128 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
3129 TEST_REQUIRES_ARM_NEON_FMA;
3130 for (size_t k = 1; k < 16; k++) {
3131 GemmMicrokernelTester()
3132 .mr(4)
3133 .nr(8)
3134 .kr(1)
3135 .sr(1)
3136 .m(4)
3137 .n(8)
3138 .k(k)
3139 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3140 }
3141 }
3142
3143 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
3144 TEST_REQUIRES_ARM_NEON_FMA;
3145 for (size_t k = 1; k < 16; k++) {
3146 for (uint32_t m = 1; m <= 4; m++) {
3147 for (uint32_t n = 1; n <= 8; n++) {
3148 GemmMicrokernelTester()
3149 .mr(4)
3150 .nr(8)
3151 .kr(1)
3152 .sr(1)
3153 .m(m)
3154 .n(n)
3155 .k(k)
3156 .iterations(1)
3157 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3158 }
3159 }
3160 }
3161 }
3162
3163 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
3164 TEST_REQUIRES_ARM_NEON_FMA;
3165 for (size_t k = 17; k < 16; k++) {
3166 GemmMicrokernelTester()
3167 .mr(4)
3168 .nr(8)
3169 .kr(1)
3170 .sr(1)
3171 .m(4)
3172 .n(8)
3173 .k(k)
3174 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3175 }
3176 }
3177
3178 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
3179 TEST_REQUIRES_ARM_NEON_FMA;
3180 for (size_t k = 17; k < 16; k++) {
3181 for (uint32_t m = 1; m <= 4; m++) {
3182 for (uint32_t n = 1; n <= 8; n++) {
3183 GemmMicrokernelTester()
3184 .mr(4)
3185 .nr(8)
3186 .kr(1)
3187 .sr(1)
3188 .m(m)
3189 .n(n)
3190 .k(k)
3191 .iterations(1)
3192 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3193 }
3194 }
3195 }
3196 }
3197
3198 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
3199 TEST_REQUIRES_ARM_NEON_FMA;
3200 for (size_t k = 24; k <= 80; k += 8) {
3201 GemmMicrokernelTester()
3202 .mr(4)
3203 .nr(8)
3204 .kr(1)
3205 .sr(1)
3206 .m(4)
3207 .n(8)
3208 .k(k)
3209 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3210 }
3211 }
3212
3213 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
3214 TEST_REQUIRES_ARM_NEON_FMA;
3215 for (size_t k = 24; k <= 80; k += 8) {
3216 for (uint32_t m = 1; m <= 4; m++) {
3217 for (uint32_t n = 1; n <= 8; n++) {
3218 GemmMicrokernelTester()
3219 .mr(4)
3220 .nr(8)
3221 .kr(1)
3222 .sr(1)
3223 .m(m)
3224 .n(n)
3225 .k(k)
3226 .iterations(1)
3227 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3228 }
3229 }
3230 }
3231 }
3232
3233 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
3234 TEST_REQUIRES_ARM_NEON_FMA;
3235 for (uint32_t n = 9; n < 16; n++) {
3236 for (size_t k = 1; k <= 40; k += 9) {
3237 GemmMicrokernelTester()
3238 .mr(4)
3239 .nr(8)
3240 .kr(1)
3241 .sr(1)
3242 .m(4)
3243 .n(8)
3244 .k(k)
3245 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3246 }
3247 }
3248 }
3249
3250 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
3251 TEST_REQUIRES_ARM_NEON_FMA;
3252 for (uint32_t n = 9; n < 16; n++) {
3253 for (size_t k = 1; k <= 40; k += 9) {
3254 GemmMicrokernelTester()
3255 .mr(4)
3256 .nr(8)
3257 .kr(1)
3258 .sr(1)
3259 .m(4)
3260 .n(8)
3261 .k(k)
3262 .cn_stride(11)
3263 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3264 }
3265 }
3266 }
3267
3268 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
3269 TEST_REQUIRES_ARM_NEON_FMA;
3270 for (uint32_t n = 9; n < 16; n++) {
3271 for (size_t k = 1; k <= 40; k += 9) {
3272 for (uint32_t m = 1; m <= 4; m++) {
3273 GemmMicrokernelTester()
3274 .mr(4)
3275 .nr(8)
3276 .kr(1)
3277 .sr(1)
3278 .m(m)
3279 .n(n)
3280 .k(k)
3281 .iterations(1)
3282 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3283 }
3284 }
3285 }
3286 }
3287
3288 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
3289 TEST_REQUIRES_ARM_NEON_FMA;
3290 for (uint32_t n = 16; n <= 24; n += 8) {
3291 for (size_t k = 1; k <= 40; k += 9) {
3292 GemmMicrokernelTester()
3293 .mr(4)
3294 .nr(8)
3295 .kr(1)
3296 .sr(1)
3297 .m(4)
3298 .n(8)
3299 .k(k)
3300 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3301 }
3302 }
3303 }
3304
3305 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
3306 TEST_REQUIRES_ARM_NEON_FMA;
3307 for (uint32_t n = 16; n <= 24; n += 8) {
3308 for (size_t k = 1; k <= 40; k += 9) {
3309 GemmMicrokernelTester()
3310 .mr(4)
3311 .nr(8)
3312 .kr(1)
3313 .sr(1)
3314 .m(4)
3315 .n(n)
3316 .k(k)
3317 .cn_stride(11)
3318 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3319 }
3320 }
3321 }
3322
3323 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
3324 TEST_REQUIRES_ARM_NEON_FMA;
3325 for (uint32_t n = 16; n <= 24; n += 8) {
3326 for (size_t k = 1; k <= 40; k += 9) {
3327 for (uint32_t m = 1; m <= 4; m++) {
3328 GemmMicrokernelTester()
3329 .mr(4)
3330 .nr(8)
3331 .kr(1)
3332 .sr(1)
3333 .m(m)
3334 .n(n)
3335 .k(k)
3336 .iterations(1)
3337 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3338 }
3339 }
3340 }
3341 }
3342
3343 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
3344 TEST_REQUIRES_ARM_NEON_FMA;
3345 for (size_t k = 1; k <= 40; k += 9) {
3346 GemmMicrokernelTester()
3347 .mr(4)
3348 .nr(8)
3349 .kr(1)
3350 .sr(1)
3351 .m(4)
3352 .n(8)
3353 .k(k)
3354 .ks(3)
3355 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3356 }
3357 }
3358
3359 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
3360 TEST_REQUIRES_ARM_NEON_FMA;
3361 for (size_t k = 1; k <= 40; k += 9) {
3362 for (uint32_t m = 1; m <= 4; m++) {
3363 for (uint32_t n = 1; n <= 8; n++) {
3364 GemmMicrokernelTester()
3365 .mr(4)
3366 .nr(8)
3367 .kr(1)
3368 .sr(1)
3369 .m(m)
3370 .n(n)
3371 .k(k)
3372 .ks(3)
3373 .iterations(1)
3374 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3375 }
3376 }
3377 }
3378 }
3379
3380 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
3381 TEST_REQUIRES_ARM_NEON_FMA;
3382 for (uint32_t n = 9; n < 16; n++) {
3383 for (size_t k = 1; k <= 40; k += 9) {
3384 GemmMicrokernelTester()
3385 .mr(4)
3386 .nr(8)
3387 .kr(1)
3388 .sr(1)
3389 .m(4)
3390 .n(8)
3391 .k(k)
3392 .ks(3)
3393 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3394 }
3395 }
3396 }
3397
3398 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
3399 TEST_REQUIRES_ARM_NEON_FMA;
3400 for (uint32_t n = 16; n <= 24; n += 8) {
3401 for (size_t k = 1; k <= 40; k += 9) {
3402 GemmMicrokernelTester()
3403 .mr(4)
3404 .nr(8)
3405 .kr(1)
3406 .sr(1)
3407 .m(4)
3408 .n(8)
3409 .k(k)
3410 .ks(3)
3411 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3412 }
3413 }
3414 }
3415
3416 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
3417 TEST_REQUIRES_ARM_NEON_FMA;
3418 for (size_t k = 1; k <= 40; k += 9) {
3419 for (uint32_t m = 1; m <= 4; m++) {
3420 for (uint32_t n = 1; n <= 8; n++) {
3421 GemmMicrokernelTester()
3422 .mr(4)
3423 .nr(8)
3424 .kr(1)
3425 .sr(1)
3426 .m(m)
3427 .n(n)
3428 .k(k)
3429 .cm_stride(11)
3430 .iterations(1)
3431 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3432 }
3433 }
3434 }
3435 }
3436
3437 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
3438 TEST_REQUIRES_ARM_NEON_FMA;
3439 for (size_t k = 1; k <= 40; k += 9) {
3440 GemmMicrokernelTester()
3441 .mr(4)
3442 .nr(8)
3443 .kr(1)
3444 .sr(1)
3445 .m(4)
3446 .n(8)
3447 .k(k)
3448 .ks(3)
3449 .a_offset(163)
3450 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3451 }
3452 }
3453
3454 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
3455 TEST_REQUIRES_ARM_NEON_FMA;
3456 for (uint32_t mz = 0; mz < 4; mz++) {
3457 for (size_t k = 1; k <= 40; k += 9) {
3458 GemmMicrokernelTester()
3459 .mr(4)
3460 .nr(8)
3461 .kr(1)
3462 .sr(1)
3463 .m(4)
3464 .n(8)
3465 .k(k)
3466 .ks(3)
3467 .a_offset(163)
3468 .zero_index(mz)
3469 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3470 }
3471 }
3472 }
3473
3474 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
3475 TEST_REQUIRES_ARM_NEON_FMA;
3476 GemmMicrokernelTester()
3477 .mr(4)
3478 .nr(8)
3479 .kr(1)
3480 .sr(1)
3481 .m(4)
3482 .n(8)
3483 .k(8)
3484 .qmin(128)
3485 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3486 }
3487
3488 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
3489 TEST_REQUIRES_ARM_NEON_FMA;
3490 GemmMicrokernelTester()
3491 .mr(4)
3492 .nr(8)
3493 .kr(1)
3494 .sr(1)
3495 .m(4)
3496 .n(8)
3497 .k(8)
3498 .qmax(128)
3499 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3500 }
3501
3502 TEST(F32_IGEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3503 TEST_REQUIRES_ARM_NEON_FMA;
3504 GemmMicrokernelTester()
3505 .mr(4)
3506 .nr(8)
3507 .kr(1)
3508 .sr(1)
3509 .m(4)
3510 .n(8)
3511 .k(8)
3512 .cm_stride(11)
3513 .Test(xnn_f32_igemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3514 }
3515#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3516
3517
3518#if XNN_ARCH_ARM
3519 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2) {
3520 TEST_REQUIRES_ARM_NEON;
3521 GemmMicrokernelTester()
3522 .mr(4)
3523 .nr(8)
3524 .kr(1)
3525 .sr(1)
3526 .m(4)
3527 .n(8)
3528 .k(2)
3529 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3530 }
3531
3532 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, strided_cn) {
3533 TEST_REQUIRES_ARM_NEON;
3534 GemmMicrokernelTester()
3535 .mr(4)
3536 .nr(8)
3537 .kr(1)
3538 .sr(1)
3539 .m(4)
3540 .n(8)
3541 .k(2)
3542 .cn_stride(11)
3543 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3544 }
3545
3546 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile) {
3547 TEST_REQUIRES_ARM_NEON;
3548 for (uint32_t m = 1; m <= 4; m++) {
3549 for (uint32_t n = 1; n <= 8; n++) {
3550 GemmMicrokernelTester()
3551 .mr(4)
3552 .nr(8)
3553 .kr(1)
3554 .sr(1)
3555 .m(m)
3556 .n(n)
3557 .k(2)
3558 .iterations(1)
3559 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3560 }
3561 }
3562 }
3563
3564 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_m) {
3565 TEST_REQUIRES_ARM_NEON;
3566 for (uint32_t m = 1; m <= 4; m++) {
3567 GemmMicrokernelTester()
3568 .mr(4)
3569 .nr(8)
3570 .kr(1)
3571 .sr(1)
3572 .m(m)
3573 .n(8)
3574 .k(2)
3575 .iterations(1)
3576 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3577 }
3578 }
3579
3580 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_n) {
3581 TEST_REQUIRES_ARM_NEON;
3582 for (uint32_t n = 1; n <= 8; n++) {
3583 GemmMicrokernelTester()
3584 .mr(4)
3585 .nr(8)
3586 .kr(1)
3587 .sr(1)
3588 .m(4)
3589 .n(n)
3590 .k(2)
3591 .iterations(1)
3592 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3593 }
3594 }
3595
3596 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_lt_2) {
3597 TEST_REQUIRES_ARM_NEON;
3598 for (size_t k = 1; k < 2; k++) {
3599 GemmMicrokernelTester()
3600 .mr(4)
3601 .nr(8)
3602 .kr(1)
3603 .sr(1)
3604 .m(4)
3605 .n(8)
3606 .k(k)
3607 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3608 }
3609 }
3610
3611 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_lt_2_subtile) {
3612 TEST_REQUIRES_ARM_NEON;
3613 for (size_t k = 1; k < 2; k++) {
3614 for (uint32_t m = 1; m <= 4; m++) {
3615 for (uint32_t n = 1; n <= 8; n++) {
3616 GemmMicrokernelTester()
3617 .mr(4)
3618 .nr(8)
3619 .kr(1)
3620 .sr(1)
3621 .m(m)
3622 .n(n)
3623 .k(k)
3624 .iterations(1)
3625 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3626 }
3627 }
3628 }
3629 }
3630
3631 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_gt_2) {
3632 TEST_REQUIRES_ARM_NEON;
3633 for (size_t k = 3; k < 4; k++) {
3634 GemmMicrokernelTester()
3635 .mr(4)
3636 .nr(8)
3637 .kr(1)
3638 .sr(1)
3639 .m(4)
3640 .n(8)
3641 .k(k)
3642 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3643 }
3644 }
3645
3646 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_gt_2_subtile) {
3647 TEST_REQUIRES_ARM_NEON;
3648 for (size_t k = 3; k < 4; k++) {
3649 for (uint32_t m = 1; m <= 4; m++) {
3650 for (uint32_t n = 1; n <= 8; n++) {
3651 GemmMicrokernelTester()
3652 .mr(4)
3653 .nr(8)
3654 .kr(1)
3655 .sr(1)
3656 .m(m)
3657 .n(n)
3658 .k(k)
3659 .iterations(1)
3660 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3661 }
3662 }
3663 }
3664 }
3665
3666 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_div_2) {
3667 TEST_REQUIRES_ARM_NEON;
3668 for (size_t k = 4; k <= 20; k += 2) {
3669 GemmMicrokernelTester()
3670 .mr(4)
3671 .nr(8)
3672 .kr(1)
3673 .sr(1)
3674 .m(4)
3675 .n(8)
3676 .k(k)
3677 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3678 }
3679 }
3680
3681 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, k_div_2_subtile) {
3682 TEST_REQUIRES_ARM_NEON;
3683 for (size_t k = 4; k <= 20; k += 2) {
3684 for (uint32_t m = 1; m <= 4; m++) {
3685 for (uint32_t n = 1; n <= 8; n++) {
3686 GemmMicrokernelTester()
3687 .mr(4)
3688 .nr(8)
3689 .kr(1)
3690 .sr(1)
3691 .m(m)
3692 .n(n)
3693 .k(k)
3694 .iterations(1)
3695 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3696 }
3697 }
3698 }
3699 }
3700
3701 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8) {
3702 TEST_REQUIRES_ARM_NEON;
3703 for (uint32_t n = 9; n < 16; n++) {
3704 for (size_t k = 1; k <= 10; k += 3) {
3705 GemmMicrokernelTester()
3706 .mr(4)
3707 .nr(8)
3708 .kr(1)
3709 .sr(1)
3710 .m(4)
3711 .n(8)
3712 .k(k)
3713 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3714 }
3715 }
3716 }
3717
3718 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8_strided_cn) {
3719 TEST_REQUIRES_ARM_NEON;
3720 for (uint32_t n = 9; n < 16; n++) {
3721 for (size_t k = 1; k <= 10; k += 3) {
3722 GemmMicrokernelTester()
3723 .mr(4)
3724 .nr(8)
3725 .kr(1)
3726 .sr(1)
3727 .m(4)
3728 .n(8)
3729 .k(k)
3730 .cn_stride(11)
3731 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3732 }
3733 }
3734 }
3735
3736 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8_subtile) {
3737 TEST_REQUIRES_ARM_NEON;
3738 for (uint32_t n = 9; n < 16; n++) {
3739 for (size_t k = 1; k <= 10; k += 3) {
3740 for (uint32_t m = 1; m <= 4; m++) {
3741 GemmMicrokernelTester()
3742 .mr(4)
3743 .nr(8)
3744 .kr(1)
3745 .sr(1)
3746 .m(m)
3747 .n(n)
3748 .k(k)
3749 .iterations(1)
3750 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3751 }
3752 }
3753 }
3754 }
3755
3756 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8) {
3757 TEST_REQUIRES_ARM_NEON;
3758 for (uint32_t n = 16; n <= 24; n += 8) {
3759 for (size_t k = 1; k <= 10; k += 3) {
3760 GemmMicrokernelTester()
3761 .mr(4)
3762 .nr(8)
3763 .kr(1)
3764 .sr(1)
3765 .m(4)
3766 .n(8)
3767 .k(k)
3768 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3769 }
3770 }
3771 }
3772
3773 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8_strided_cn) {
3774 TEST_REQUIRES_ARM_NEON;
3775 for (uint32_t n = 16; n <= 24; n += 8) {
3776 for (size_t k = 1; k <= 10; k += 3) {
3777 GemmMicrokernelTester()
3778 .mr(4)
3779 .nr(8)
3780 .kr(1)
3781 .sr(1)
3782 .m(4)
3783 .n(n)
3784 .k(k)
3785 .cn_stride(11)
3786 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3787 }
3788 }
3789 }
3790
3791 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8_subtile) {
3792 TEST_REQUIRES_ARM_NEON;
3793 for (uint32_t n = 16; n <= 24; n += 8) {
3794 for (size_t k = 1; k <= 10; k += 3) {
3795 for (uint32_t m = 1; m <= 4; m++) {
3796 GemmMicrokernelTester()
3797 .mr(4)
3798 .nr(8)
3799 .kr(1)
3800 .sr(1)
3801 .m(m)
3802 .n(n)
3803 .k(k)
3804 .iterations(1)
3805 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3806 }
3807 }
3808 }
3809 }
3810
3811 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, small_kernel) {
3812 TEST_REQUIRES_ARM_NEON;
3813 for (size_t k = 1; k <= 10; k += 3) {
3814 GemmMicrokernelTester()
3815 .mr(4)
3816 .nr(8)
3817 .kr(1)
3818 .sr(1)
3819 .m(4)
3820 .n(8)
3821 .k(k)
3822 .ks(3)
3823 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3824 }
3825 }
3826
3827 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, small_kernel_subtile) {
3828 TEST_REQUIRES_ARM_NEON;
3829 for (size_t k = 1; k <= 10; k += 3) {
3830 for (uint32_t m = 1; m <= 4; m++) {
3831 for (uint32_t n = 1; n <= 8; n++) {
3832 GemmMicrokernelTester()
3833 .mr(4)
3834 .nr(8)
3835 .kr(1)
3836 .sr(1)
3837 .m(m)
3838 .n(n)
3839 .k(k)
3840 .ks(3)
3841 .iterations(1)
3842 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3843 }
3844 }
3845 }
3846 }
3847
3848 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_gt_8_small_kernel) {
3849 TEST_REQUIRES_ARM_NEON;
3850 for (uint32_t n = 9; n < 16; n++) {
3851 for (size_t k = 1; k <= 10; k += 3) {
3852 GemmMicrokernelTester()
3853 .mr(4)
3854 .nr(8)
3855 .kr(1)
3856 .sr(1)
3857 .m(4)
3858 .n(8)
3859 .k(k)
3860 .ks(3)
3861 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3862 }
3863 }
3864 }
3865
3866 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, n_div_8_small_kernel) {
3867 TEST_REQUIRES_ARM_NEON;
3868 for (uint32_t n = 16; n <= 24; n += 8) {
3869 for (size_t k = 1; k <= 10; k += 3) {
3870 GemmMicrokernelTester()
3871 .mr(4)
3872 .nr(8)
3873 .kr(1)
3874 .sr(1)
3875 .m(4)
3876 .n(8)
3877 .k(k)
3878 .ks(3)
3879 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3880 }
3881 }
3882 }
3883
3884 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, strided_cm_subtile) {
3885 TEST_REQUIRES_ARM_NEON;
3886 for (size_t k = 1; k <= 10; k += 3) {
3887 for (uint32_t m = 1; m <= 4; m++) {
3888 for (uint32_t n = 1; n <= 8; n++) {
3889 GemmMicrokernelTester()
3890 .mr(4)
3891 .nr(8)
3892 .kr(1)
3893 .sr(1)
3894 .m(m)
3895 .n(n)
3896 .k(k)
3897 .cm_stride(11)
3898 .iterations(1)
3899 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3900 }
3901 }
3902 }
3903 }
3904
3905 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, a_offset) {
3906 TEST_REQUIRES_ARM_NEON;
3907 for (size_t k = 1; k <= 10; k += 3) {
3908 GemmMicrokernelTester()
3909 .mr(4)
3910 .nr(8)
3911 .kr(1)
3912 .sr(1)
3913 .m(4)
3914 .n(8)
3915 .k(k)
3916 .ks(3)
3917 .a_offset(43)
3918 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3919 }
3920 }
3921
3922 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, zero) {
3923 TEST_REQUIRES_ARM_NEON;
3924 for (uint32_t mz = 0; mz < 4; mz++) {
3925 for (size_t k = 1; k <= 10; k += 3) {
3926 GemmMicrokernelTester()
3927 .mr(4)
3928 .nr(8)
3929 .kr(1)
3930 .sr(1)
3931 .m(4)
3932 .n(8)
3933 .k(k)
3934 .ks(3)
3935 .a_offset(43)
3936 .zero_index(mz)
3937 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3938 }
3939 }
3940 }
3941
3942 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, qmin) {
3943 TEST_REQUIRES_ARM_NEON;
3944 GemmMicrokernelTester()
3945 .mr(4)
3946 .nr(8)
3947 .kr(1)
3948 .sr(1)
3949 .m(4)
3950 .n(8)
3951 .k(2)
3952 .qmin(128)
3953 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3954 }
3955
3956 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, qmax) {
3957 TEST_REQUIRES_ARM_NEON;
3958 GemmMicrokernelTester()
3959 .mr(4)
3960 .nr(8)
3961 .kr(1)
3962 .sr(1)
3963 .m(4)
3964 .n(8)
3965 .k(2)
3966 .qmax(128)
3967 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3968 }
3969
3970 TEST(F32_IGEMM_4X8__AARCH32_NEON_LD64, strided_cm) {
3971 TEST_REQUIRES_ARM_NEON;
3972 GemmMicrokernelTester()
3973 .mr(4)
3974 .nr(8)
3975 .kr(1)
3976 .sr(1)
3977 .m(4)
3978 .n(8)
3979 .k(2)
3980 .cm_stride(11)
3981 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_ld64);
3982 }
3983#endif // XNN_ARCH_ARM
3984
3985
3986#if XNN_ARCH_ARM
3987 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
3988 TEST_REQUIRES_ARM_NEON;
3989 GemmMicrokernelTester()
3990 .mr(4)
3991 .nr(8)
3992 .kr(1)
3993 .sr(1)
3994 .m(4)
3995 .n(8)
3996 .k(4)
3997 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
3998 }
3999
4000 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cn) {
4001 TEST_REQUIRES_ARM_NEON;
4002 GemmMicrokernelTester()
4003 .mr(4)
4004 .nr(8)
4005 .kr(1)
4006 .sr(1)
4007 .m(4)
4008 .n(8)
4009 .k(4)
4010 .cn_stride(11)
4011 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4012 }
4013
4014 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile) {
4015 TEST_REQUIRES_ARM_NEON;
4016 for (uint32_t m = 1; m <= 4; m++) {
4017 for (uint32_t n = 1; n <= 8; n++) {
4018 GemmMicrokernelTester()
4019 .mr(4)
4020 .nr(8)
4021 .kr(1)
4022 .sr(1)
4023 .m(m)
4024 .n(n)
4025 .k(4)
4026 .iterations(1)
4027 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4028 }
4029 }
4030 }
4031
4032 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_m) {
4033 TEST_REQUIRES_ARM_NEON;
4034 for (uint32_t m = 1; m <= 4; m++) {
4035 GemmMicrokernelTester()
4036 .mr(4)
4037 .nr(8)
4038 .kr(1)
4039 .sr(1)
4040 .m(m)
4041 .n(8)
4042 .k(4)
4043 .iterations(1)
4044 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4045 }
4046 }
4047
4048 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_n) {
4049 TEST_REQUIRES_ARM_NEON;
4050 for (uint32_t n = 1; n <= 8; n++) {
4051 GemmMicrokernelTester()
4052 .mr(4)
4053 .nr(8)
4054 .kr(1)
4055 .sr(1)
4056 .m(4)
4057 .n(n)
4058 .k(4)
4059 .iterations(1)
4060 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4061 }
4062 }
4063
4064 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8) {
4065 TEST_REQUIRES_ARM_NEON;
4066 GemmMicrokernelTester()
4067 .mr(4)
4068 .nr(8)
4069 .kr(1)
4070 .sr(1)
4071 .m(4)
4072 .n(8)
4073 .k(8)
4074 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4075 }
4076
4077 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_subtile) {
4078 TEST_REQUIRES_ARM_NEON;
4079 for (uint32_t m = 1; m <= 4; m++) {
4080 for (uint32_t n = 1; n <= 8; n++) {
4081 GemmMicrokernelTester()
4082 .mr(4)
4083 .nr(8)
4084 .kr(1)
4085 .sr(1)
4086 .m(m)
4087 .n(n)
4088 .k(8)
4089 .iterations(1)
4090 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4091 }
4092 }
4093 }
4094
4095 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8) {
4096 TEST_REQUIRES_ARM_NEON;
4097 for (size_t k = 1; k < 8; k++) {
4098 GemmMicrokernelTester()
4099 .mr(4)
4100 .nr(8)
4101 .kr(1)
4102 .sr(1)
4103 .m(4)
4104 .n(8)
4105 .k(k)
4106 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4107 }
4108 }
4109
4110 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_subtile) {
4111 TEST_REQUIRES_ARM_NEON;
4112 for (size_t k = 1; k < 8; k++) {
4113 for (uint32_t m = 1; m <= 4; m++) {
4114 for (uint32_t n = 1; n <= 8; n++) {
4115 GemmMicrokernelTester()
4116 .mr(4)
4117 .nr(8)
4118 .kr(1)
4119 .sr(1)
4120 .m(m)
4121 .n(n)
4122 .k(k)
4123 .iterations(1)
4124 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4125 }
4126 }
4127 }
4128 }
4129
4130 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8) {
4131 TEST_REQUIRES_ARM_NEON;
4132 for (size_t k = 9; k < 8; k++) {
4133 GemmMicrokernelTester()
4134 .mr(4)
4135 .nr(8)
4136 .kr(1)
4137 .sr(1)
4138 .m(4)
4139 .n(8)
4140 .k(k)
4141 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4142 }
4143 }
4144
4145 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_4_subtile) {
4146 TEST_REQUIRES_ARM_NEON;
4147 for (size_t k = 9; k < 8; k++) {
4148 for (uint32_t m = 1; m <= 4; m++) {
4149 for (uint32_t n = 1; n <= 8; n++) {
4150 GemmMicrokernelTester()
4151 .mr(4)
4152 .nr(8)
4153 .kr(1)
4154 .sr(1)
4155 .m(m)
4156 .n(n)
4157 .k(k)
4158 .iterations(1)
4159 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4160 }
4161 }
4162 }
4163 }
4164
4165 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4) {
4166 TEST_REQUIRES_ARM_NEON;
4167 for (size_t k = 12; k <= 40; k += 4) {
4168 GemmMicrokernelTester()
4169 .mr(4)
4170 .nr(8)
4171 .kr(1)
4172 .sr(1)
4173 .m(4)
4174 .n(8)
4175 .k(k)
4176 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4177 }
4178 }
4179
4180 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_subtile) {
4181 TEST_REQUIRES_ARM_NEON;
4182 for (size_t k = 12; k <= 40; k += 4) {
4183 for (uint32_t m = 1; m <= 4; m++) {
4184 for (uint32_t n = 1; n <= 8; n++) {
4185 GemmMicrokernelTester()
4186 .mr(4)
4187 .nr(8)
4188 .kr(1)
4189 .sr(1)
4190 .m(m)
4191 .n(n)
4192 .k(k)
4193 .iterations(1)
4194 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4195 }
4196 }
4197 }
4198 }
4199
4200 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8) {
4201 TEST_REQUIRES_ARM_NEON;
4202 for (uint32_t n = 9; n < 16; n++) {
4203 for (size_t k = 1; k <= 20; k += 5) {
4204 GemmMicrokernelTester()
4205 .mr(4)
4206 .nr(8)
4207 .kr(1)
4208 .sr(1)
4209 .m(4)
4210 .n(8)
4211 .k(k)
4212 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4213 }
4214 }
4215 }
4216
4217 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_cn) {
4218 TEST_REQUIRES_ARM_NEON;
4219 for (uint32_t n = 9; n < 16; n++) {
4220 for (size_t k = 1; k <= 20; k += 5) {
4221 GemmMicrokernelTester()
4222 .mr(4)
4223 .nr(8)
4224 .kr(1)
4225 .sr(1)
4226 .m(4)
4227 .n(8)
4228 .k(k)
4229 .cn_stride(11)
4230 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4231 }
4232 }
4233 }
4234
4235 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_subtile) {
4236 TEST_REQUIRES_ARM_NEON;
4237 for (uint32_t n = 9; n < 16; n++) {
4238 for (size_t k = 1; k <= 20; k += 5) {
4239 for (uint32_t m = 1; m <= 4; m++) {
4240 GemmMicrokernelTester()
4241 .mr(4)
4242 .nr(8)
4243 .kr(1)
4244 .sr(1)
4245 .m(m)
4246 .n(n)
4247 .k(k)
4248 .iterations(1)
4249 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4250 }
4251 }
4252 }
4253 }
4254
4255 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8) {
4256 TEST_REQUIRES_ARM_NEON;
4257 for (uint32_t n = 16; n <= 24; n += 8) {
4258 for (size_t k = 1; k <= 20; k += 5) {
4259 GemmMicrokernelTester()
4260 .mr(4)
4261 .nr(8)
4262 .kr(1)
4263 .sr(1)
4264 .m(4)
4265 .n(8)
4266 .k(k)
4267 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4268 }
4269 }
4270 }
4271
4272 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_cn) {
4273 TEST_REQUIRES_ARM_NEON;
4274 for (uint32_t n = 16; n <= 24; n += 8) {
4275 for (size_t k = 1; k <= 20; k += 5) {
4276 GemmMicrokernelTester()
4277 .mr(4)
4278 .nr(8)
4279 .kr(1)
4280 .sr(1)
4281 .m(4)
4282 .n(n)
4283 .k(k)
4284 .cn_stride(11)
4285 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4286 }
4287 }
4288 }
4289
4290 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_subtile) {
4291 TEST_REQUIRES_ARM_NEON;
4292 for (uint32_t n = 16; n <= 24; n += 8) {
4293 for (size_t k = 1; k <= 20; k += 5) {
4294 for (uint32_t m = 1; m <= 4; m++) {
4295 GemmMicrokernelTester()
4296 .mr(4)
4297 .nr(8)
4298 .kr(1)
4299 .sr(1)
4300 .m(m)
4301 .n(n)
4302 .k(k)
4303 .iterations(1)
4304 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4305 }
4306 }
4307 }
4308 }
4309
4310 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, small_kernel) {
4311 TEST_REQUIRES_ARM_NEON;
4312 for (size_t k = 1; k <= 20; k += 5) {
4313 GemmMicrokernelTester()
4314 .mr(4)
4315 .nr(8)
4316 .kr(1)
4317 .sr(1)
4318 .m(4)
4319 .n(8)
4320 .k(k)
4321 .ks(3)
4322 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4323 }
4324 }
4325
4326 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, small_kernel_subtile) {
4327 TEST_REQUIRES_ARM_NEON;
4328 for (size_t k = 1; k <= 20; k += 5) {
4329 for (uint32_t m = 1; m <= 4; m++) {
4330 for (uint32_t n = 1; n <= 8; n++) {
4331 GemmMicrokernelTester()
4332 .mr(4)
4333 .nr(8)
4334 .kr(1)
4335 .sr(1)
4336 .m(m)
4337 .n(n)
4338 .k(k)
4339 .ks(3)
4340 .iterations(1)
4341 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4342 }
4343 }
4344 }
4345 }
4346
4347 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_small_kernel) {
4348 TEST_REQUIRES_ARM_NEON;
4349 for (uint32_t n = 9; n < 16; n++) {
4350 for (size_t k = 1; k <= 20; k += 5) {
4351 GemmMicrokernelTester()
4352 .mr(4)
4353 .nr(8)
4354 .kr(1)
4355 .sr(1)
4356 .m(4)
4357 .n(8)
4358 .k(k)
4359 .ks(3)
4360 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4361 }
4362 }
4363 }
4364
4365 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_small_kernel) {
4366 TEST_REQUIRES_ARM_NEON;
4367 for (uint32_t n = 16; n <= 24; n += 8) {
4368 for (size_t k = 1; k <= 20; k += 5) {
4369 GemmMicrokernelTester()
4370 .mr(4)
4371 .nr(8)
4372 .kr(1)
4373 .sr(1)
4374 .m(4)
4375 .n(8)
4376 .k(k)
4377 .ks(3)
4378 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4379 }
4380 }
4381 }
4382
4383 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm_subtile) {
4384 TEST_REQUIRES_ARM_NEON;
4385 for (size_t k = 1; k <= 20; k += 5) {
4386 for (uint32_t m = 1; m <= 4; m++) {
4387 for (uint32_t n = 1; n <= 8; n++) {
4388 GemmMicrokernelTester()
4389 .mr(4)
4390 .nr(8)
4391 .kr(1)
4392 .sr(1)
4393 .m(m)
4394 .n(n)
4395 .k(k)
4396 .cm_stride(11)
4397 .iterations(1)
4398 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4399 }
4400 }
4401 }
4402 }
4403
4404 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, a_offset) {
4405 TEST_REQUIRES_ARM_NEON;
4406 for (size_t k = 1; k <= 20; k += 5) {
4407 GemmMicrokernelTester()
4408 .mr(4)
4409 .nr(8)
4410 .kr(1)
4411 .sr(1)
4412 .m(4)
4413 .n(8)
4414 .k(k)
4415 .ks(3)
4416 .a_offset(83)
4417 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4418 }
4419 }
4420
4421 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, zero) {
4422 TEST_REQUIRES_ARM_NEON;
4423 for (uint32_t mz = 0; mz < 4; mz++) {
4424 for (size_t k = 1; k <= 20; k += 5) {
4425 GemmMicrokernelTester()
4426 .mr(4)
4427 .nr(8)
4428 .kr(1)
4429 .sr(1)
4430 .m(4)
4431 .n(8)
4432 .k(k)
4433 .ks(3)
4434 .a_offset(83)
4435 .zero_index(mz)
4436 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4437 }
4438 }
4439 }
4440
4441 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, qmin) {
4442 TEST_REQUIRES_ARM_NEON;
4443 GemmMicrokernelTester()
4444 .mr(4)
4445 .nr(8)
4446 .kr(1)
4447 .sr(1)
4448 .m(4)
4449 .n(8)
4450 .k(4)
4451 .qmin(128)
4452 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4453 }
4454
4455 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, qmax) {
4456 TEST_REQUIRES_ARM_NEON;
4457 GemmMicrokernelTester()
4458 .mr(4)
4459 .nr(8)
4460 .kr(1)
4461 .sr(1)
4462 .m(4)
4463 .n(8)
4464 .k(4)
4465 .qmax(128)
4466 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4467 }
4468
4469 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm) {
4470 TEST_REQUIRES_ARM_NEON;
4471 GemmMicrokernelTester()
4472 .mr(4)
4473 .nr(8)
4474 .kr(1)
4475 .sr(1)
4476 .m(4)
4477 .n(8)
4478 .k(4)
4479 .cm_stride(11)
4480 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a75);
4481 }
4482#endif // XNN_ARCH_ARM
4483
4484
4485#if XNN_ARCH_ARM
4486 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4) {
4487 TEST_REQUIRES_ARM_NEON;
4488 GemmMicrokernelTester()
4489 .mr(4)
4490 .nr(8)
4491 .kr(1)
4492 .sr(1)
4493 .m(4)
4494 .n(8)
4495 .k(4)
4496 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4497 }
4498
4499 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cn) {
4500 TEST_REQUIRES_ARM_NEON;
4501 GemmMicrokernelTester()
4502 .mr(4)
4503 .nr(8)
4504 .kr(1)
4505 .sr(1)
4506 .m(4)
4507 .n(8)
4508 .k(4)
4509 .cn_stride(11)
4510 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4511 }
4512
4513 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile) {
4514 TEST_REQUIRES_ARM_NEON;
4515 for (uint32_t m = 1; m <= 4; m++) {
4516 for (uint32_t n = 1; n <= 8; n++) {
4517 GemmMicrokernelTester()
4518 .mr(4)
4519 .nr(8)
4520 .kr(1)
4521 .sr(1)
4522 .m(m)
4523 .n(n)
4524 .k(4)
4525 .iterations(1)
4526 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4527 }
4528 }
4529 }
4530
4531 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_m) {
4532 TEST_REQUIRES_ARM_NEON;
4533 for (uint32_t m = 1; m <= 4; m++) {
4534 GemmMicrokernelTester()
4535 .mr(4)
4536 .nr(8)
4537 .kr(1)
4538 .sr(1)
4539 .m(m)
4540 .n(8)
4541 .k(4)
4542 .iterations(1)
4543 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4544 }
4545 }
4546
4547 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_n) {
4548 TEST_REQUIRES_ARM_NEON;
4549 for (uint32_t n = 1; n <= 8; n++) {
4550 GemmMicrokernelTester()
4551 .mr(4)
4552 .nr(8)
4553 .kr(1)
4554 .sr(1)
4555 .m(4)
4556 .n(n)
4557 .k(4)
4558 .iterations(1)
4559 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4560 }
4561 }
4562
4563 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8) {
4564 TEST_REQUIRES_ARM_NEON;
4565 GemmMicrokernelTester()
4566 .mr(4)
4567 .nr(8)
4568 .kr(1)
4569 .sr(1)
4570 .m(4)
4571 .n(8)
4572 .k(8)
4573 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4574 }
4575
4576 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8_subtile) {
4577 TEST_REQUIRES_ARM_NEON;
4578 for (uint32_t m = 1; m <= 4; m++) {
4579 for (uint32_t n = 1; n <= 8; n++) {
4580 GemmMicrokernelTester()
4581 .mr(4)
4582 .nr(8)
4583 .kr(1)
4584 .sr(1)
4585 .m(m)
4586 .n(n)
4587 .k(8)
4588 .iterations(1)
4589 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4590 }
4591 }
4592 }
4593
4594 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8) {
4595 TEST_REQUIRES_ARM_NEON;
4596 for (size_t k = 1; k < 8; k++) {
4597 GemmMicrokernelTester()
4598 .mr(4)
4599 .nr(8)
4600 .kr(1)
4601 .sr(1)
4602 .m(4)
4603 .n(8)
4604 .k(k)
4605 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4606 }
4607 }
4608
4609 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8_subtile) {
4610 TEST_REQUIRES_ARM_NEON;
4611 for (size_t k = 1; k < 8; k++) {
4612 for (uint32_t m = 1; m <= 4; m++) {
4613 for (uint32_t n = 1; n <= 8; n++) {
4614 GemmMicrokernelTester()
4615 .mr(4)
4616 .nr(8)
4617 .kr(1)
4618 .sr(1)
4619 .m(m)
4620 .n(n)
4621 .k(k)
4622 .iterations(1)
4623 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4624 }
4625 }
4626 }
4627 }
4628
4629 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_8) {
4630 TEST_REQUIRES_ARM_NEON;
4631 for (size_t k = 9; k < 8; k++) {
4632 GemmMicrokernelTester()
4633 .mr(4)
4634 .nr(8)
4635 .kr(1)
4636 .sr(1)
4637 .m(4)
4638 .n(8)
4639 .k(k)
4640 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4641 }
4642 }
4643
4644 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_4_subtile) {
4645 TEST_REQUIRES_ARM_NEON;
4646 for (size_t k = 9; k < 8; k++) {
4647 for (uint32_t m = 1; m <= 4; m++) {
4648 for (uint32_t n = 1; n <= 8; n++) {
4649 GemmMicrokernelTester()
4650 .mr(4)
4651 .nr(8)
4652 .kr(1)
4653 .sr(1)
4654 .m(m)
4655 .n(n)
4656 .k(k)
4657 .iterations(1)
4658 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4659 }
4660 }
4661 }
4662 }
4663
4664 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4) {
4665 TEST_REQUIRES_ARM_NEON;
4666 for (size_t k = 12; k <= 40; k += 4) {
4667 GemmMicrokernelTester()
4668 .mr(4)
4669 .nr(8)
4670 .kr(1)
4671 .sr(1)
4672 .m(4)
4673 .n(8)
4674 .k(k)
4675 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4676 }
4677 }
4678
4679 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4_subtile) {
4680 TEST_REQUIRES_ARM_NEON;
4681 for (size_t k = 12; k <= 40; k += 4) {
4682 for (uint32_t m = 1; m <= 4; m++) {
4683 for (uint32_t n = 1; n <= 8; n++) {
4684 GemmMicrokernelTester()
4685 .mr(4)
4686 .nr(8)
4687 .kr(1)
4688 .sr(1)
4689 .m(m)
4690 .n(n)
4691 .k(k)
4692 .iterations(1)
4693 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4694 }
4695 }
4696 }
4697 }
4698
4699 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8) {
4700 TEST_REQUIRES_ARM_NEON;
4701 for (uint32_t n = 9; n < 16; n++) {
4702 for (size_t k = 1; k <= 20; k += 5) {
4703 GemmMicrokernelTester()
4704 .mr(4)
4705 .nr(8)
4706 .kr(1)
4707 .sr(1)
4708 .m(4)
4709 .n(8)
4710 .k(k)
4711 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4712 }
4713 }
4714 }
4715
4716 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_strided_cn) {
4717 TEST_REQUIRES_ARM_NEON;
4718 for (uint32_t n = 9; n < 16; n++) {
4719 for (size_t k = 1; k <= 20; k += 5) {
4720 GemmMicrokernelTester()
4721 .mr(4)
4722 .nr(8)
4723 .kr(1)
4724 .sr(1)
4725 .m(4)
4726 .n(8)
4727 .k(k)
4728 .cn_stride(11)
4729 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4730 }
4731 }
4732 }
4733
4734 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_subtile) {
4735 TEST_REQUIRES_ARM_NEON;
4736 for (uint32_t n = 9; n < 16; n++) {
4737 for (size_t k = 1; k <= 20; k += 5) {
4738 for (uint32_t m = 1; m <= 4; m++) {
4739 GemmMicrokernelTester()
4740 .mr(4)
4741 .nr(8)
4742 .kr(1)
4743 .sr(1)
4744 .m(m)
4745 .n(n)
4746 .k(k)
4747 .iterations(1)
4748 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4749 }
4750 }
4751 }
4752 }
4753
4754 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8) {
4755 TEST_REQUIRES_ARM_NEON;
4756 for (uint32_t n = 16; n <= 24; n += 8) {
4757 for (size_t k = 1; k <= 20; k += 5) {
4758 GemmMicrokernelTester()
4759 .mr(4)
4760 .nr(8)
4761 .kr(1)
4762 .sr(1)
4763 .m(4)
4764 .n(8)
4765 .k(k)
4766 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4767 }
4768 }
4769 }
4770
4771 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_strided_cn) {
4772 TEST_REQUIRES_ARM_NEON;
4773 for (uint32_t n = 16; n <= 24; n += 8) {
4774 for (size_t k = 1; k <= 20; k += 5) {
4775 GemmMicrokernelTester()
4776 .mr(4)
4777 .nr(8)
4778 .kr(1)
4779 .sr(1)
4780 .m(4)
4781 .n(n)
4782 .k(k)
4783 .cn_stride(11)
4784 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4785 }
4786 }
4787 }
4788
4789 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_subtile) {
4790 TEST_REQUIRES_ARM_NEON;
4791 for (uint32_t n = 16; n <= 24; n += 8) {
4792 for (size_t k = 1; k <= 20; k += 5) {
4793 for (uint32_t m = 1; m <= 4; m++) {
4794 GemmMicrokernelTester()
4795 .mr(4)
4796 .nr(8)
4797 .kr(1)
4798 .sr(1)
4799 .m(m)
4800 .n(n)
4801 .k(k)
4802 .iterations(1)
4803 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4804 }
4805 }
4806 }
4807 }
4808
4809 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, small_kernel) {
4810 TEST_REQUIRES_ARM_NEON;
4811 for (size_t k = 1; k <= 20; k += 5) {
4812 GemmMicrokernelTester()
4813 .mr(4)
4814 .nr(8)
4815 .kr(1)
4816 .sr(1)
4817 .m(4)
4818 .n(8)
4819 .k(k)
4820 .ks(3)
4821 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4822 }
4823 }
4824
4825 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, small_kernel_subtile) {
4826 TEST_REQUIRES_ARM_NEON;
4827 for (size_t k = 1; k <= 20; k += 5) {
4828 for (uint32_t m = 1; m <= 4; m++) {
4829 for (uint32_t n = 1; n <= 8; n++) {
4830 GemmMicrokernelTester()
4831 .mr(4)
4832 .nr(8)
4833 .kr(1)
4834 .sr(1)
4835 .m(m)
4836 .n(n)
4837 .k(k)
4838 .ks(3)
4839 .iterations(1)
4840 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4841 }
4842 }
4843 }
4844 }
4845
4846 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_small_kernel) {
4847 TEST_REQUIRES_ARM_NEON;
4848 for (uint32_t n = 9; n < 16; n++) {
4849 for (size_t k = 1; k <= 20; k += 5) {
4850 GemmMicrokernelTester()
4851 .mr(4)
4852 .nr(8)
4853 .kr(1)
4854 .sr(1)
4855 .m(4)
4856 .n(8)
4857 .k(k)
4858 .ks(3)
4859 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4860 }
4861 }
4862 }
4863
4864 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_small_kernel) {
4865 TEST_REQUIRES_ARM_NEON;
4866 for (uint32_t n = 16; n <= 24; n += 8) {
4867 for (size_t k = 1; k <= 20; k += 5) {
4868 GemmMicrokernelTester()
4869 .mr(4)
4870 .nr(8)
4871 .kr(1)
4872 .sr(1)
4873 .m(4)
4874 .n(8)
4875 .k(k)
4876 .ks(3)
4877 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4878 }
4879 }
4880 }
4881
4882 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm_subtile) {
4883 TEST_REQUIRES_ARM_NEON;
4884 for (size_t k = 1; k <= 20; k += 5) {
4885 for (uint32_t m = 1; m <= 4; m++) {
4886 for (uint32_t n = 1; n <= 8; n++) {
4887 GemmMicrokernelTester()
4888 .mr(4)
4889 .nr(8)
4890 .kr(1)
4891 .sr(1)
4892 .m(m)
4893 .n(n)
4894 .k(k)
4895 .cm_stride(11)
4896 .iterations(1)
4897 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4898 }
4899 }
4900 }
4901 }
4902
4903 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, a_offset) {
4904 TEST_REQUIRES_ARM_NEON;
4905 for (size_t k = 1; k <= 20; k += 5) {
4906 GemmMicrokernelTester()
4907 .mr(4)
4908 .nr(8)
4909 .kr(1)
4910 .sr(1)
4911 .m(4)
4912 .n(8)
4913 .k(k)
4914 .ks(3)
4915 .a_offset(83)
4916 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4917 }
4918 }
4919
4920 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, zero) {
4921 TEST_REQUIRES_ARM_NEON;
4922 for (uint32_t mz = 0; mz < 4; mz++) {
4923 for (size_t k = 1; k <= 20; k += 5) {
4924 GemmMicrokernelTester()
4925 .mr(4)
4926 .nr(8)
4927 .kr(1)
4928 .sr(1)
4929 .m(4)
4930 .n(8)
4931 .k(k)
4932 .ks(3)
4933 .a_offset(83)
4934 .zero_index(mz)
4935 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4936 }
4937 }
4938 }
4939
4940 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmin) {
4941 TEST_REQUIRES_ARM_NEON;
4942 GemmMicrokernelTester()
4943 .mr(4)
4944 .nr(8)
4945 .kr(1)
4946 .sr(1)
4947 .m(4)
4948 .n(8)
4949 .k(4)
4950 .qmin(128)
4951 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4952 }
4953
4954 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmax) {
4955 TEST_REQUIRES_ARM_NEON;
4956 GemmMicrokernelTester()
4957 .mr(4)
4958 .nr(8)
4959 .kr(1)
4960 .sr(1)
4961 .m(4)
4962 .n(8)
4963 .k(4)
4964 .qmax(128)
4965 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4966 }
4967
4968 TEST(F32_IGEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm) {
4969 TEST_REQUIRES_ARM_NEON;
4970 GemmMicrokernelTester()
4971 .mr(4)
4972 .nr(8)
4973 .kr(1)
4974 .sr(1)
4975 .m(4)
4976 .n(8)
4977 .k(4)
4978 .cm_stride(11)
4979 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
4980 }
4981#endif // XNN_ARCH_ARM
4982
4983
4984#if XNN_ARCH_ARM
4985 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4) {
4986 TEST_REQUIRES_ARM_NEON;
4987 GemmMicrokernelTester()
4988 .mr(4)
4989 .nr(8)
4990 .kr(1)
4991 .sr(1)
4992 .m(4)
4993 .n(8)
4994 .k(4)
4995 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
4996 }
4997
4998 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cn) {
4999 TEST_REQUIRES_ARM_NEON;
5000 GemmMicrokernelTester()
5001 .mr(4)
5002 .nr(8)
5003 .kr(1)
5004 .sr(1)
5005 .m(4)
5006 .n(8)
5007 .k(4)
5008 .cn_stride(11)
5009 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5010 }
5011
5012 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile) {
5013 TEST_REQUIRES_ARM_NEON;
5014 for (uint32_t m = 1; m <= 4; m++) {
5015 for (uint32_t n = 1; n <= 8; n++) {
5016 GemmMicrokernelTester()
5017 .mr(4)
5018 .nr(8)
5019 .kr(1)
5020 .sr(1)
5021 .m(m)
5022 .n(n)
5023 .k(4)
5024 .iterations(1)
5025 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5026 }
5027 }
5028 }
5029
5030 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile_m) {
5031 TEST_REQUIRES_ARM_NEON;
5032 for (uint32_t m = 1; m <= 4; m++) {
5033 GemmMicrokernelTester()
5034 .mr(4)
5035 .nr(8)
5036 .kr(1)
5037 .sr(1)
5038 .m(m)
5039 .n(8)
5040 .k(4)
5041 .iterations(1)
5042 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5043 }
5044 }
5045
5046 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile_n) {
5047 TEST_REQUIRES_ARM_NEON;
5048 for (uint32_t n = 1; n <= 8; n++) {
5049 GemmMicrokernelTester()
5050 .mr(4)
5051 .nr(8)
5052 .kr(1)
5053 .sr(1)
5054 .m(4)
5055 .n(n)
5056 .k(4)
5057 .iterations(1)
5058 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5059 }
5060 }
5061
5062 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8) {
5063 TEST_REQUIRES_ARM_NEON;
5064 GemmMicrokernelTester()
5065 .mr(4)
5066 .nr(8)
5067 .kr(1)
5068 .sr(1)
5069 .m(4)
5070 .n(8)
5071 .k(8)
5072 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5073 }
5074
5075 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8_subtile) {
5076 TEST_REQUIRES_ARM_NEON;
5077 for (uint32_t m = 1; m <= 4; m++) {
5078 for (uint32_t n = 1; n <= 8; n++) {
5079 GemmMicrokernelTester()
5080 .mr(4)
5081 .nr(8)
5082 .kr(1)
5083 .sr(1)
5084 .m(m)
5085 .n(n)
5086 .k(8)
5087 .iterations(1)
5088 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5089 }
5090 }
5091 }
5092
5093 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8) {
5094 TEST_REQUIRES_ARM_NEON;
5095 for (size_t k = 1; k < 8; k++) {
5096 GemmMicrokernelTester()
5097 .mr(4)
5098 .nr(8)
5099 .kr(1)
5100 .sr(1)
5101 .m(4)
5102 .n(8)
5103 .k(k)
5104 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5105 }
5106 }
5107
5108 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8_subtile) {
5109 TEST_REQUIRES_ARM_NEON;
5110 for (size_t k = 1; k < 8; k++) {
5111 for (uint32_t m = 1; m <= 4; m++) {
5112 for (uint32_t n = 1; n <= 8; n++) {
5113 GemmMicrokernelTester()
5114 .mr(4)
5115 .nr(8)
5116 .kr(1)
5117 .sr(1)
5118 .m(m)
5119 .n(n)
5120 .k(k)
5121 .iterations(1)
5122 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5123 }
5124 }
5125 }
5126 }
5127
5128 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_8) {
5129 TEST_REQUIRES_ARM_NEON;
5130 for (size_t k = 9; k < 8; k++) {
5131 GemmMicrokernelTester()
5132 .mr(4)
5133 .nr(8)
5134 .kr(1)
5135 .sr(1)
5136 .m(4)
5137 .n(8)
5138 .k(k)
5139 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5140 }
5141 }
5142
5143 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_4_subtile) {
5144 TEST_REQUIRES_ARM_NEON;
5145 for (size_t k = 9; k < 8; k++) {
5146 for (uint32_t m = 1; m <= 4; m++) {
5147 for (uint32_t n = 1; n <= 8; n++) {
5148 GemmMicrokernelTester()
5149 .mr(4)
5150 .nr(8)
5151 .kr(1)
5152 .sr(1)
5153 .m(m)
5154 .n(n)
5155 .k(k)
5156 .iterations(1)
5157 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5158 }
5159 }
5160 }
5161 }
5162
5163 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4) {
5164 TEST_REQUIRES_ARM_NEON;
5165 for (size_t k = 12; k <= 40; k += 4) {
5166 GemmMicrokernelTester()
5167 .mr(4)
5168 .nr(8)
5169 .kr(1)
5170 .sr(1)
5171 .m(4)
5172 .n(8)
5173 .k(k)
5174 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5175 }
5176 }
5177
5178 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4_subtile) {
5179 TEST_REQUIRES_ARM_NEON;
5180 for (size_t k = 12; k <= 40; k += 4) {
5181 for (uint32_t m = 1; m <= 4; m++) {
5182 for (uint32_t n = 1; n <= 8; n++) {
5183 GemmMicrokernelTester()
5184 .mr(4)
5185 .nr(8)
5186 .kr(1)
5187 .sr(1)
5188 .m(m)
5189 .n(n)
5190 .k(k)
5191 .iterations(1)
5192 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5193 }
5194 }
5195 }
5196 }
5197
5198 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8) {
5199 TEST_REQUIRES_ARM_NEON;
5200 for (uint32_t n = 9; n < 16; n++) {
5201 for (size_t k = 1; k <= 20; k += 5) {
5202 GemmMicrokernelTester()
5203 .mr(4)
5204 .nr(8)
5205 .kr(1)
5206 .sr(1)
5207 .m(4)
5208 .n(8)
5209 .k(k)
5210 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5211 }
5212 }
5213 }
5214
5215 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_strided_cn) {
5216 TEST_REQUIRES_ARM_NEON;
5217 for (uint32_t n = 9; n < 16; n++) {
5218 for (size_t k = 1; k <= 20; k += 5) {
5219 GemmMicrokernelTester()
5220 .mr(4)
5221 .nr(8)
5222 .kr(1)
5223 .sr(1)
5224 .m(4)
5225 .n(8)
5226 .k(k)
5227 .cn_stride(11)
5228 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5229 }
5230 }
5231 }
5232
5233 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_subtile) {
5234 TEST_REQUIRES_ARM_NEON;
5235 for (uint32_t n = 9; n < 16; n++) {
5236 for (size_t k = 1; k <= 20; k += 5) {
5237 for (uint32_t m = 1; m <= 4; m++) {
5238 GemmMicrokernelTester()
5239 .mr(4)
5240 .nr(8)
5241 .kr(1)
5242 .sr(1)
5243 .m(m)
5244 .n(n)
5245 .k(k)
5246 .iterations(1)
5247 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5248 }
5249 }
5250 }
5251 }
5252
5253 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8) {
5254 TEST_REQUIRES_ARM_NEON;
5255 for (uint32_t n = 16; n <= 24; n += 8) {
5256 for (size_t k = 1; k <= 20; k += 5) {
5257 GemmMicrokernelTester()
5258 .mr(4)
5259 .nr(8)
5260 .kr(1)
5261 .sr(1)
5262 .m(4)
5263 .n(8)
5264 .k(k)
5265 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5266 }
5267 }
5268 }
5269
5270 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_strided_cn) {
5271 TEST_REQUIRES_ARM_NEON;
5272 for (uint32_t n = 16; n <= 24; n += 8) {
5273 for (size_t k = 1; k <= 20; k += 5) {
5274 GemmMicrokernelTester()
5275 .mr(4)
5276 .nr(8)
5277 .kr(1)
5278 .sr(1)
5279 .m(4)
5280 .n(n)
5281 .k(k)
5282 .cn_stride(11)
5283 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5284 }
5285 }
5286 }
5287
5288 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_subtile) {
5289 TEST_REQUIRES_ARM_NEON;
5290 for (uint32_t n = 16; n <= 24; n += 8) {
5291 for (size_t k = 1; k <= 20; k += 5) {
5292 for (uint32_t m = 1; m <= 4; m++) {
5293 GemmMicrokernelTester()
5294 .mr(4)
5295 .nr(8)
5296 .kr(1)
5297 .sr(1)
5298 .m(m)
5299 .n(n)
5300 .k(k)
5301 .iterations(1)
5302 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5303 }
5304 }
5305 }
5306 }
5307
5308 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, small_kernel) {
5309 TEST_REQUIRES_ARM_NEON;
5310 for (size_t k = 1; k <= 20; k += 5) {
5311 GemmMicrokernelTester()
5312 .mr(4)
5313 .nr(8)
5314 .kr(1)
5315 .sr(1)
5316 .m(4)
5317 .n(8)
5318 .k(k)
5319 .ks(3)
5320 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5321 }
5322 }
5323
5324 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, small_kernel_subtile) {
5325 TEST_REQUIRES_ARM_NEON;
5326 for (size_t k = 1; k <= 20; k += 5) {
5327 for (uint32_t m = 1; m <= 4; m++) {
5328 for (uint32_t n = 1; n <= 8; n++) {
5329 GemmMicrokernelTester()
5330 .mr(4)
5331 .nr(8)
5332 .kr(1)
5333 .sr(1)
5334 .m(m)
5335 .n(n)
5336 .k(k)
5337 .ks(3)
5338 .iterations(1)
5339 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5340 }
5341 }
5342 }
5343 }
5344
5345 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_small_kernel) {
5346 TEST_REQUIRES_ARM_NEON;
5347 for (uint32_t n = 9; n < 16; n++) {
5348 for (size_t k = 1; k <= 20; k += 5) {
5349 GemmMicrokernelTester()
5350 .mr(4)
5351 .nr(8)
5352 .kr(1)
5353 .sr(1)
5354 .m(4)
5355 .n(8)
5356 .k(k)
5357 .ks(3)
5358 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5359 }
5360 }
5361 }
5362
5363 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_small_kernel) {
5364 TEST_REQUIRES_ARM_NEON;
5365 for (uint32_t n = 16; n <= 24; n += 8) {
5366 for (size_t k = 1; k <= 20; k += 5) {
5367 GemmMicrokernelTester()
5368 .mr(4)
5369 .nr(8)
5370 .kr(1)
5371 .sr(1)
5372 .m(4)
5373 .n(8)
5374 .k(k)
5375 .ks(3)
5376 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5377 }
5378 }
5379 }
5380
5381 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cm_subtile) {
5382 TEST_REQUIRES_ARM_NEON;
5383 for (size_t k = 1; k <= 20; k += 5) {
5384 for (uint32_t m = 1; m <= 4; m++) {
5385 for (uint32_t n = 1; n <= 8; n++) {
5386 GemmMicrokernelTester()
5387 .mr(4)
5388 .nr(8)
5389 .kr(1)
5390 .sr(1)
5391 .m(m)
5392 .n(n)
5393 .k(k)
5394 .cm_stride(11)
5395 .iterations(1)
5396 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5397 }
5398 }
5399 }
5400 }
5401
5402 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, a_offset) {
5403 TEST_REQUIRES_ARM_NEON;
5404 for (size_t k = 1; k <= 20; k += 5) {
5405 GemmMicrokernelTester()
5406 .mr(4)
5407 .nr(8)
5408 .kr(1)
5409 .sr(1)
5410 .m(4)
5411 .n(8)
5412 .k(k)
5413 .ks(3)
5414 .a_offset(83)
5415 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5416 }
5417 }
5418
5419 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, zero) {
5420 TEST_REQUIRES_ARM_NEON;
5421 for (uint32_t mz = 0; mz < 4; mz++) {
5422 for (size_t k = 1; k <= 20; k += 5) {
5423 GemmMicrokernelTester()
5424 .mr(4)
5425 .nr(8)
5426 .kr(1)
5427 .sr(1)
5428 .m(4)
5429 .n(8)
5430 .k(k)
5431 .ks(3)
5432 .a_offset(83)
5433 .zero_index(mz)
5434 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5435 }
5436 }
5437 }
5438
5439 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, qmin) {
5440 TEST_REQUIRES_ARM_NEON;
5441 GemmMicrokernelTester()
5442 .mr(4)
5443 .nr(8)
5444 .kr(1)
5445 .sr(1)
5446 .m(4)
5447 .n(8)
5448 .k(4)
5449 .qmin(128)
5450 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5451 }
5452
5453 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, qmax) {
5454 TEST_REQUIRES_ARM_NEON;
5455 GemmMicrokernelTester()
5456 .mr(4)
5457 .nr(8)
5458 .kr(1)
5459 .sr(1)
5460 .m(4)
5461 .n(8)
5462 .k(4)
5463 .qmax(128)
5464 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5465 }
5466
5467 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cm) {
5468 TEST_REQUIRES_ARM_NEON;
5469 GemmMicrokernelTester()
5470 .mr(4)
5471 .nr(8)
5472 .kr(1)
5473 .sr(1)
5474 .m(4)
5475 .n(8)
5476 .k(4)
5477 .cm_stride(11)
5478 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a53);
5479 }
5480#endif // XNN_ARCH_ARM
5481
5482
5483#if XNN_ARCH_ARM
5484 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4) {
5485 TEST_REQUIRES_ARM_NEON;
5486 GemmMicrokernelTester()
5487 .mr(4)
5488 .nr(8)
5489 .kr(1)
5490 .sr(1)
5491 .m(4)
5492 .n(8)
5493 .k(4)
5494 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5495 }
5496
5497 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cn) {
5498 TEST_REQUIRES_ARM_NEON;
5499 GemmMicrokernelTester()
5500 .mr(4)
5501 .nr(8)
5502 .kr(1)
5503 .sr(1)
5504 .m(4)
5505 .n(8)
5506 .k(4)
5507 .cn_stride(11)
5508 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5509 }
5510
5511 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile) {
5512 TEST_REQUIRES_ARM_NEON;
5513 for (uint32_t m = 1; m <= 4; m++) {
5514 for (uint32_t n = 1; n <= 8; n++) {
5515 GemmMicrokernelTester()
5516 .mr(4)
5517 .nr(8)
5518 .kr(1)
5519 .sr(1)
5520 .m(m)
5521 .n(n)
5522 .k(4)
5523 .iterations(1)
5524 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5525 }
5526 }
5527 }
5528
5529 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_m) {
5530 TEST_REQUIRES_ARM_NEON;
5531 for (uint32_t m = 1; m <= 4; m++) {
5532 GemmMicrokernelTester()
5533 .mr(4)
5534 .nr(8)
5535 .kr(1)
5536 .sr(1)
5537 .m(m)
5538 .n(8)
5539 .k(4)
5540 .iterations(1)
5541 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5542 }
5543 }
5544
5545 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_4_subtile_n) {
5546 TEST_REQUIRES_ARM_NEON;
5547 for (uint32_t n = 1; n <= 8; n++) {
5548 GemmMicrokernelTester()
5549 .mr(4)
5550 .nr(8)
5551 .kr(1)
5552 .sr(1)
5553 .m(4)
5554 .n(n)
5555 .k(4)
5556 .iterations(1)
5557 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5558 }
5559 }
5560
5561 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8) {
5562 TEST_REQUIRES_ARM_NEON;
5563 GemmMicrokernelTester()
5564 .mr(4)
5565 .nr(8)
5566 .kr(1)
5567 .sr(1)
5568 .m(4)
5569 .n(8)
5570 .k(8)
5571 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5572 }
5573
5574 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_eq_8_subtile) {
5575 TEST_REQUIRES_ARM_NEON;
5576 for (uint32_t m = 1; m <= 4; m++) {
5577 for (uint32_t n = 1; n <= 8; n++) {
5578 GemmMicrokernelTester()
5579 .mr(4)
5580 .nr(8)
5581 .kr(1)
5582 .sr(1)
5583 .m(m)
5584 .n(n)
5585 .k(8)
5586 .iterations(1)
5587 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5588 }
5589 }
5590 }
5591
5592 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8) {
5593 TEST_REQUIRES_ARM_NEON;
5594 for (size_t k = 1; k < 8; k++) {
5595 GemmMicrokernelTester()
5596 .mr(4)
5597 .nr(8)
5598 .kr(1)
5599 .sr(1)
5600 .m(4)
5601 .n(8)
5602 .k(k)
5603 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5604 }
5605 }
5606
5607 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_lt_8_subtile) {
5608 TEST_REQUIRES_ARM_NEON;
5609 for (size_t k = 1; k < 8; k++) {
5610 for (uint32_t m = 1; m <= 4; m++) {
5611 for (uint32_t n = 1; n <= 8; n++) {
5612 GemmMicrokernelTester()
5613 .mr(4)
5614 .nr(8)
5615 .kr(1)
5616 .sr(1)
5617 .m(m)
5618 .n(n)
5619 .k(k)
5620 .iterations(1)
5621 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5622 }
5623 }
5624 }
5625 }
5626
5627 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_8) {
5628 TEST_REQUIRES_ARM_NEON;
5629 for (size_t k = 9; k < 8; k++) {
5630 GemmMicrokernelTester()
5631 .mr(4)
5632 .nr(8)
5633 .kr(1)
5634 .sr(1)
5635 .m(4)
5636 .n(8)
5637 .k(k)
5638 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5639 }
5640 }
5641
5642 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_gt_4_subtile) {
5643 TEST_REQUIRES_ARM_NEON;
5644 for (size_t k = 9; k < 8; k++) {
5645 for (uint32_t m = 1; m <= 4; m++) {
5646 for (uint32_t n = 1; n <= 8; n++) {
5647 GemmMicrokernelTester()
5648 .mr(4)
5649 .nr(8)
5650 .kr(1)
5651 .sr(1)
5652 .m(m)
5653 .n(n)
5654 .k(k)
5655 .iterations(1)
5656 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5657 }
5658 }
5659 }
5660 }
5661
5662 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4) {
5663 TEST_REQUIRES_ARM_NEON;
5664 for (size_t k = 12; k <= 40; k += 4) {
5665 GemmMicrokernelTester()
5666 .mr(4)
5667 .nr(8)
5668 .kr(1)
5669 .sr(1)
5670 .m(4)
5671 .n(8)
5672 .k(k)
5673 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5674 }
5675 }
5676
5677 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, k_div_4_subtile) {
5678 TEST_REQUIRES_ARM_NEON;
5679 for (size_t k = 12; k <= 40; k += 4) {
5680 for (uint32_t m = 1; m <= 4; m++) {
5681 for (uint32_t n = 1; n <= 8; n++) {
5682 GemmMicrokernelTester()
5683 .mr(4)
5684 .nr(8)
5685 .kr(1)
5686 .sr(1)
5687 .m(m)
5688 .n(n)
5689 .k(k)
5690 .iterations(1)
5691 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5692 }
5693 }
5694 }
5695 }
5696
5697 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8) {
5698 TEST_REQUIRES_ARM_NEON;
5699 for (uint32_t n = 9; n < 16; n++) {
5700 for (size_t k = 1; k <= 20; k += 5) {
5701 GemmMicrokernelTester()
5702 .mr(4)
5703 .nr(8)
5704 .kr(1)
5705 .sr(1)
5706 .m(4)
5707 .n(8)
5708 .k(k)
5709 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5710 }
5711 }
5712 }
5713
5714 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_strided_cn) {
5715 TEST_REQUIRES_ARM_NEON;
5716 for (uint32_t n = 9; n < 16; n++) {
5717 for (size_t k = 1; k <= 20; k += 5) {
5718 GemmMicrokernelTester()
5719 .mr(4)
5720 .nr(8)
5721 .kr(1)
5722 .sr(1)
5723 .m(4)
5724 .n(8)
5725 .k(k)
5726 .cn_stride(11)
5727 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5728 }
5729 }
5730 }
5731
5732 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_subtile) {
5733 TEST_REQUIRES_ARM_NEON;
5734 for (uint32_t n = 9; n < 16; n++) {
5735 for (size_t k = 1; k <= 20; k += 5) {
5736 for (uint32_t m = 1; m <= 4; m++) {
5737 GemmMicrokernelTester()
5738 .mr(4)
5739 .nr(8)
5740 .kr(1)
5741 .sr(1)
5742 .m(m)
5743 .n(n)
5744 .k(k)
5745 .iterations(1)
5746 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5747 }
5748 }
5749 }
5750 }
5751
5752 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8) {
5753 TEST_REQUIRES_ARM_NEON;
5754 for (uint32_t n = 16; n <= 24; n += 8) {
5755 for (size_t k = 1; k <= 20; k += 5) {
5756 GemmMicrokernelTester()
5757 .mr(4)
5758 .nr(8)
5759 .kr(1)
5760 .sr(1)
5761 .m(4)
5762 .n(8)
5763 .k(k)
5764 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5765 }
5766 }
5767 }
5768
5769 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_strided_cn) {
5770 TEST_REQUIRES_ARM_NEON;
5771 for (uint32_t n = 16; n <= 24; n += 8) {
5772 for (size_t k = 1; k <= 20; k += 5) {
5773 GemmMicrokernelTester()
5774 .mr(4)
5775 .nr(8)
5776 .kr(1)
5777 .sr(1)
5778 .m(4)
5779 .n(n)
5780 .k(k)
5781 .cn_stride(11)
5782 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5783 }
5784 }
5785 }
5786
5787 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_subtile) {
5788 TEST_REQUIRES_ARM_NEON;
5789 for (uint32_t n = 16; n <= 24; n += 8) {
5790 for (size_t k = 1; k <= 20; k += 5) {
5791 for (uint32_t m = 1; m <= 4; m++) {
5792 GemmMicrokernelTester()
5793 .mr(4)
5794 .nr(8)
5795 .kr(1)
5796 .sr(1)
5797 .m(m)
5798 .n(n)
5799 .k(k)
5800 .iterations(1)
5801 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5802 }
5803 }
5804 }
5805 }
5806
5807 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, small_kernel) {
5808 TEST_REQUIRES_ARM_NEON;
5809 for (size_t k = 1; k <= 20; k += 5) {
5810 GemmMicrokernelTester()
5811 .mr(4)
5812 .nr(8)
5813 .kr(1)
5814 .sr(1)
5815 .m(4)
5816 .n(8)
5817 .k(k)
5818 .ks(3)
5819 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5820 }
5821 }
5822
5823 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, small_kernel_subtile) {
5824 TEST_REQUIRES_ARM_NEON;
5825 for (size_t k = 1; k <= 20; k += 5) {
5826 for (uint32_t m = 1; m <= 4; m++) {
5827 for (uint32_t n = 1; n <= 8; n++) {
5828 GemmMicrokernelTester()
5829 .mr(4)
5830 .nr(8)
5831 .kr(1)
5832 .sr(1)
5833 .m(m)
5834 .n(n)
5835 .k(k)
5836 .ks(3)
5837 .iterations(1)
5838 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5839 }
5840 }
5841 }
5842 }
5843
5844 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_gt_8_small_kernel) {
5845 TEST_REQUIRES_ARM_NEON;
5846 for (uint32_t n = 9; n < 16; n++) {
5847 for (size_t k = 1; k <= 20; k += 5) {
5848 GemmMicrokernelTester()
5849 .mr(4)
5850 .nr(8)
5851 .kr(1)
5852 .sr(1)
5853 .m(4)
5854 .n(8)
5855 .k(k)
5856 .ks(3)
5857 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5858 }
5859 }
5860 }
5861
5862 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, n_div_8_small_kernel) {
5863 TEST_REQUIRES_ARM_NEON;
5864 for (uint32_t n = 16; n <= 24; n += 8) {
5865 for (size_t k = 1; k <= 20; k += 5) {
5866 GemmMicrokernelTester()
5867 .mr(4)
5868 .nr(8)
5869 .kr(1)
5870 .sr(1)
5871 .m(4)
5872 .n(8)
5873 .k(k)
5874 .ks(3)
5875 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5876 }
5877 }
5878 }
5879
5880 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm_subtile) {
5881 TEST_REQUIRES_ARM_NEON;
5882 for (size_t k = 1; k <= 20; k += 5) {
5883 for (uint32_t m = 1; m <= 4; m++) {
5884 for (uint32_t n = 1; n <= 8; n++) {
5885 GemmMicrokernelTester()
5886 .mr(4)
5887 .nr(8)
5888 .kr(1)
5889 .sr(1)
5890 .m(m)
5891 .n(n)
5892 .k(k)
5893 .cm_stride(11)
5894 .iterations(1)
5895 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5896 }
5897 }
5898 }
5899 }
5900
5901 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, a_offset) {
5902 TEST_REQUIRES_ARM_NEON;
5903 for (size_t k = 1; k <= 20; k += 5) {
5904 GemmMicrokernelTester()
5905 .mr(4)
5906 .nr(8)
5907 .kr(1)
5908 .sr(1)
5909 .m(4)
5910 .n(8)
5911 .k(k)
5912 .ks(3)
5913 .a_offset(83)
5914 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5915 }
5916 }
5917
5918 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, zero) {
5919 TEST_REQUIRES_ARM_NEON;
5920 for (uint32_t mz = 0; mz < 4; mz++) {
5921 for (size_t k = 1; k <= 20; k += 5) {
5922 GemmMicrokernelTester()
5923 .mr(4)
5924 .nr(8)
5925 .kr(1)
5926 .sr(1)
5927 .m(4)
5928 .n(8)
5929 .k(k)
5930 .ks(3)
5931 .a_offset(83)
5932 .zero_index(mz)
5933 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5934 }
5935 }
5936 }
5937
5938 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, qmin) {
5939 TEST_REQUIRES_ARM_NEON;
5940 GemmMicrokernelTester()
5941 .mr(4)
5942 .nr(8)
5943 .kr(1)
5944 .sr(1)
5945 .m(4)
5946 .n(8)
5947 .k(4)
5948 .qmin(128)
5949 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5950 }
5951
5952 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, qmax) {
5953 TEST_REQUIRES_ARM_NEON;
5954 GemmMicrokernelTester()
5955 .mr(4)
5956 .nr(8)
5957 .kr(1)
5958 .sr(1)
5959 .m(4)
5960 .n(8)
5961 .k(4)
5962 .qmax(128)
5963 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5964 }
5965
5966 TEST(F32_IGEMM_4X8__AARCH32_NEON_CORTEX_A55, strided_cm) {
5967 TEST_REQUIRES_ARM_NEON;
5968 GemmMicrokernelTester()
5969 .mr(4)
5970 .nr(8)
5971 .kr(1)
5972 .sr(1)
5973 .m(4)
5974 .n(8)
5975 .k(4)
5976 .cm_stride(11)
5977 .Test(xnn_f32_igemm_ukernel_4x8__aarch32_neon_cortex_a55);
5978 }
5979#endif // XNN_ARCH_ARM
5980
5981
5982#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5983 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
5984 TEST_REQUIRES_ARM_NEON_FMA;
5985 GemmMicrokernelTester()
5986 .mr(5)
5987 .nr(8)
5988 .kr(1)
5989 .sr(1)
5990 .m(5)
5991 .n(8)
5992 .k(8)
5993 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
5994 }
5995
5996 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
5997 TEST_REQUIRES_ARM_NEON_FMA;
5998 GemmMicrokernelTester()
5999 .mr(5)
6000 .nr(8)
6001 .kr(1)
6002 .sr(1)
6003 .m(5)
6004 .n(8)
6005 .k(8)
6006 .cn_stride(11)
6007 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6008 }
6009
6010 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
6011 TEST_REQUIRES_ARM_NEON_FMA;
6012 for (uint32_t m = 1; m <= 5; m++) {
6013 for (uint32_t n = 1; n <= 8; n++) {
6014 GemmMicrokernelTester()
6015 .mr(5)
6016 .nr(8)
6017 .kr(1)
6018 .sr(1)
6019 .m(m)
6020 .n(n)
6021 .k(8)
6022 .iterations(1)
6023 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6024 }
6025 }
6026 }
6027
6028 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
6029 TEST_REQUIRES_ARM_NEON_FMA;
6030 for (uint32_t m = 1; m <= 5; m++) {
6031 GemmMicrokernelTester()
6032 .mr(5)
6033 .nr(8)
6034 .kr(1)
6035 .sr(1)
6036 .m(m)
6037 .n(8)
6038 .k(8)
6039 .iterations(1)
6040 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6041 }
6042 }
6043
6044 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
6045 TEST_REQUIRES_ARM_NEON_FMA;
6046 for (uint32_t n = 1; n <= 8; n++) {
6047 GemmMicrokernelTester()
6048 .mr(5)
6049 .nr(8)
6050 .kr(1)
6051 .sr(1)
6052 .m(5)
6053 .n(n)
6054 .k(8)
6055 .iterations(1)
6056 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6057 }
6058 }
6059
6060 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
6061 TEST_REQUIRES_ARM_NEON_FMA;
6062 GemmMicrokernelTester()
6063 .mr(5)
6064 .nr(8)
6065 .kr(1)
6066 .sr(1)
6067 .m(5)
6068 .n(8)
6069 .k(16)
6070 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6071 }
6072
6073 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
6074 TEST_REQUIRES_ARM_NEON_FMA;
6075 for (uint32_t m = 1; m <= 5; m++) {
6076 for (uint32_t n = 1; n <= 8; n++) {
6077 GemmMicrokernelTester()
6078 .mr(5)
6079 .nr(8)
6080 .kr(1)
6081 .sr(1)
6082 .m(m)
6083 .n(n)
6084 .k(16)
6085 .iterations(1)
6086 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6087 }
6088 }
6089 }
6090
6091 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
6092 TEST_REQUIRES_ARM_NEON_FMA;
6093 for (size_t k = 1; k < 16; k++) {
6094 GemmMicrokernelTester()
6095 .mr(5)
6096 .nr(8)
6097 .kr(1)
6098 .sr(1)
6099 .m(5)
6100 .n(8)
6101 .k(k)
6102 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6103 }
6104 }
6105
6106 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
6107 TEST_REQUIRES_ARM_NEON_FMA;
6108 for (size_t k = 1; k < 16; k++) {
6109 for (uint32_t m = 1; m <= 5; m++) {
6110 for (uint32_t n = 1; n <= 8; n++) {
6111 GemmMicrokernelTester()
6112 .mr(5)
6113 .nr(8)
6114 .kr(1)
6115 .sr(1)
6116 .m(m)
6117 .n(n)
6118 .k(k)
6119 .iterations(1)
6120 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6121 }
6122 }
6123 }
6124 }
6125
6126 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
6127 TEST_REQUIRES_ARM_NEON_FMA;
6128 for (size_t k = 17; k < 16; k++) {
6129 GemmMicrokernelTester()
6130 .mr(5)
6131 .nr(8)
6132 .kr(1)
6133 .sr(1)
6134 .m(5)
6135 .n(8)
6136 .k(k)
6137 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6138 }
6139 }
6140
6141 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
6142 TEST_REQUIRES_ARM_NEON_FMA;
6143 for (size_t k = 17; k < 16; k++) {
6144 for (uint32_t m = 1; m <= 5; m++) {
6145 for (uint32_t n = 1; n <= 8; n++) {
6146 GemmMicrokernelTester()
6147 .mr(5)
6148 .nr(8)
6149 .kr(1)
6150 .sr(1)
6151 .m(m)
6152 .n(n)
6153 .k(k)
6154 .iterations(1)
6155 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6156 }
6157 }
6158 }
6159 }
6160
6161 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
6162 TEST_REQUIRES_ARM_NEON_FMA;
6163 for (size_t k = 24; k <= 80; k += 8) {
6164 GemmMicrokernelTester()
6165 .mr(5)
6166 .nr(8)
6167 .kr(1)
6168 .sr(1)
6169 .m(5)
6170 .n(8)
6171 .k(k)
6172 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6173 }
6174 }
6175
6176 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
6177 TEST_REQUIRES_ARM_NEON_FMA;
6178 for (size_t k = 24; k <= 80; k += 8) {
6179 for (uint32_t m = 1; m <= 5; m++) {
6180 for (uint32_t n = 1; n <= 8; n++) {
6181 GemmMicrokernelTester()
6182 .mr(5)
6183 .nr(8)
6184 .kr(1)
6185 .sr(1)
6186 .m(m)
6187 .n(n)
6188 .k(k)
6189 .iterations(1)
6190 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6191 }
6192 }
6193 }
6194 }
6195
6196 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
6197 TEST_REQUIRES_ARM_NEON_FMA;
6198 for (uint32_t n = 9; n < 16; n++) {
6199 for (size_t k = 1; k <= 40; k += 9) {
6200 GemmMicrokernelTester()
6201 .mr(5)
6202 .nr(8)
6203 .kr(1)
6204 .sr(1)
6205 .m(5)
6206 .n(8)
6207 .k(k)
6208 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6209 }
6210 }
6211 }
6212
6213 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
6214 TEST_REQUIRES_ARM_NEON_FMA;
6215 for (uint32_t n = 9; n < 16; n++) {
6216 for (size_t k = 1; k <= 40; k += 9) {
6217 GemmMicrokernelTester()
6218 .mr(5)
6219 .nr(8)
6220 .kr(1)
6221 .sr(1)
6222 .m(5)
6223 .n(8)
6224 .k(k)
6225 .cn_stride(11)
6226 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6227 }
6228 }
6229 }
6230
6231 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
6232 TEST_REQUIRES_ARM_NEON_FMA;
6233 for (uint32_t n = 9; n < 16; n++) {
6234 for (size_t k = 1; k <= 40; k += 9) {
6235 for (uint32_t m = 1; m <= 5; m++) {
6236 GemmMicrokernelTester()
6237 .mr(5)
6238 .nr(8)
6239 .kr(1)
6240 .sr(1)
6241 .m(m)
6242 .n(n)
6243 .k(k)
6244 .iterations(1)
6245 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6246 }
6247 }
6248 }
6249 }
6250
6251 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
6252 TEST_REQUIRES_ARM_NEON_FMA;
6253 for (uint32_t n = 16; n <= 24; n += 8) {
6254 for (size_t k = 1; k <= 40; k += 9) {
6255 GemmMicrokernelTester()
6256 .mr(5)
6257 .nr(8)
6258 .kr(1)
6259 .sr(1)
6260 .m(5)
6261 .n(8)
6262 .k(k)
6263 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6264 }
6265 }
6266 }
6267
6268 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
6269 TEST_REQUIRES_ARM_NEON_FMA;
6270 for (uint32_t n = 16; n <= 24; n += 8) {
6271 for (size_t k = 1; k <= 40; k += 9) {
6272 GemmMicrokernelTester()
6273 .mr(5)
6274 .nr(8)
6275 .kr(1)
6276 .sr(1)
6277 .m(5)
6278 .n(n)
6279 .k(k)
6280 .cn_stride(11)
6281 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6282 }
6283 }
6284 }
6285
6286 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
6287 TEST_REQUIRES_ARM_NEON_FMA;
6288 for (uint32_t n = 16; n <= 24; n += 8) {
6289 for (size_t k = 1; k <= 40; k += 9) {
6290 for (uint32_t m = 1; m <= 5; m++) {
6291 GemmMicrokernelTester()
6292 .mr(5)
6293 .nr(8)
6294 .kr(1)
6295 .sr(1)
6296 .m(m)
6297 .n(n)
6298 .k(k)
6299 .iterations(1)
6300 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6301 }
6302 }
6303 }
6304 }
6305
6306 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
6307 TEST_REQUIRES_ARM_NEON_FMA;
6308 for (size_t k = 1; k <= 40; k += 9) {
6309 GemmMicrokernelTester()
6310 .mr(5)
6311 .nr(8)
6312 .kr(1)
6313 .sr(1)
6314 .m(5)
6315 .n(8)
6316 .k(k)
6317 .ks(3)
6318 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6319 }
6320 }
6321
6322 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
6323 TEST_REQUIRES_ARM_NEON_FMA;
6324 for (size_t k = 1; k <= 40; k += 9) {
6325 for (uint32_t m = 1; m <= 5; m++) {
6326 for (uint32_t n = 1; n <= 8; n++) {
6327 GemmMicrokernelTester()
6328 .mr(5)
6329 .nr(8)
6330 .kr(1)
6331 .sr(1)
6332 .m(m)
6333 .n(n)
6334 .k(k)
6335 .ks(3)
6336 .iterations(1)
6337 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6338 }
6339 }
6340 }
6341 }
6342
6343 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
6344 TEST_REQUIRES_ARM_NEON_FMA;
6345 for (uint32_t n = 9; n < 16; n++) {
6346 for (size_t k = 1; k <= 40; k += 9) {
6347 GemmMicrokernelTester()
6348 .mr(5)
6349 .nr(8)
6350 .kr(1)
6351 .sr(1)
6352 .m(5)
6353 .n(8)
6354 .k(k)
6355 .ks(3)
6356 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6357 }
6358 }
6359 }
6360
6361 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
6362 TEST_REQUIRES_ARM_NEON_FMA;
6363 for (uint32_t n = 16; n <= 24; n += 8) {
6364 for (size_t k = 1; k <= 40; k += 9) {
6365 GemmMicrokernelTester()
6366 .mr(5)
6367 .nr(8)
6368 .kr(1)
6369 .sr(1)
6370 .m(5)
6371 .n(8)
6372 .k(k)
6373 .ks(3)
6374 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6375 }
6376 }
6377 }
6378
6379 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
6380 TEST_REQUIRES_ARM_NEON_FMA;
6381 for (size_t k = 1; k <= 40; k += 9) {
6382 for (uint32_t m = 1; m <= 5; m++) {
6383 for (uint32_t n = 1; n <= 8; n++) {
6384 GemmMicrokernelTester()
6385 .mr(5)
6386 .nr(8)
6387 .kr(1)
6388 .sr(1)
6389 .m(m)
6390 .n(n)
6391 .k(k)
6392 .cm_stride(11)
6393 .iterations(1)
6394 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6395 }
6396 }
6397 }
6398 }
6399
6400 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
6401 TEST_REQUIRES_ARM_NEON_FMA;
6402 for (size_t k = 1; k <= 40; k += 9) {
6403 GemmMicrokernelTester()
6404 .mr(5)
6405 .nr(8)
6406 .kr(1)
6407 .sr(1)
6408 .m(5)
6409 .n(8)
6410 .k(k)
6411 .ks(3)
6412 .a_offset(211)
6413 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6414 }
6415 }
6416
6417 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
6418 TEST_REQUIRES_ARM_NEON_FMA;
6419 for (uint32_t mz = 0; mz < 5; mz++) {
6420 for (size_t k = 1; k <= 40; k += 9) {
6421 GemmMicrokernelTester()
6422 .mr(5)
6423 .nr(8)
6424 .kr(1)
6425 .sr(1)
6426 .m(5)
6427 .n(8)
6428 .k(k)
6429 .ks(3)
6430 .a_offset(211)
6431 .zero_index(mz)
6432 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6433 }
6434 }
6435 }
6436
6437 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
6438 TEST_REQUIRES_ARM_NEON_FMA;
6439 GemmMicrokernelTester()
6440 .mr(5)
6441 .nr(8)
6442 .kr(1)
6443 .sr(1)
6444 .m(5)
6445 .n(8)
6446 .k(8)
6447 .qmin(128)
6448 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6449 }
6450
6451 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
6452 TEST_REQUIRES_ARM_NEON_FMA;
6453 GemmMicrokernelTester()
6454 .mr(5)
6455 .nr(8)
6456 .kr(1)
6457 .sr(1)
6458 .m(5)
6459 .n(8)
6460 .k(8)
6461 .qmax(128)
6462 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6463 }
6464
6465 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
6466 TEST_REQUIRES_ARM_NEON_FMA;
6467 GemmMicrokernelTester()
6468 .mr(5)
6469 .nr(8)
6470 .kr(1)
6471 .sr(1)
6472 .m(5)
6473 .n(8)
6474 .k(8)
6475 .cm_stride(11)
6476 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
6477 }
6478#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6479
6480
6481#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6482 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
6483 TEST_REQUIRES_ARM_NEON_FMA;
6484 GemmMicrokernelTester()
6485 .mr(5)
6486 .nr(8)
6487 .kr(1)
6488 .sr(1)
6489 .m(5)
6490 .n(8)
6491 .k(8)
6492 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6493 }
6494
6495 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
6496 TEST_REQUIRES_ARM_NEON_FMA;
6497 GemmMicrokernelTester()
6498 .mr(5)
6499 .nr(8)
6500 .kr(1)
6501 .sr(1)
6502 .m(5)
6503 .n(8)
6504 .k(8)
6505 .cn_stride(11)
6506 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6507 }
6508
6509 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
6510 TEST_REQUIRES_ARM_NEON_FMA;
6511 for (uint32_t m = 1; m <= 5; m++) {
6512 for (uint32_t n = 1; n <= 8; n++) {
6513 GemmMicrokernelTester()
6514 .mr(5)
6515 .nr(8)
6516 .kr(1)
6517 .sr(1)
6518 .m(m)
6519 .n(n)
6520 .k(8)
6521 .iterations(1)
6522 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6523 }
6524 }
6525 }
6526
6527 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
6528 TEST_REQUIRES_ARM_NEON_FMA;
6529 for (uint32_t m = 1; m <= 5; m++) {
6530 GemmMicrokernelTester()
6531 .mr(5)
6532 .nr(8)
6533 .kr(1)
6534 .sr(1)
6535 .m(m)
6536 .n(8)
6537 .k(8)
6538 .iterations(1)
6539 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6540 }
6541 }
6542
6543 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
6544 TEST_REQUIRES_ARM_NEON_FMA;
6545 for (uint32_t n = 1; n <= 8; n++) {
6546 GemmMicrokernelTester()
6547 .mr(5)
6548 .nr(8)
6549 .kr(1)
6550 .sr(1)
6551 .m(5)
6552 .n(n)
6553 .k(8)
6554 .iterations(1)
6555 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6556 }
6557 }
6558
6559 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
6560 TEST_REQUIRES_ARM_NEON_FMA;
6561 GemmMicrokernelTester()
6562 .mr(5)
6563 .nr(8)
6564 .kr(1)
6565 .sr(1)
6566 .m(5)
6567 .n(8)
6568 .k(16)
6569 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6570 }
6571
6572 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
6573 TEST_REQUIRES_ARM_NEON_FMA;
6574 for (uint32_t m = 1; m <= 5; m++) {
6575 for (uint32_t n = 1; n <= 8; n++) {
6576 GemmMicrokernelTester()
6577 .mr(5)
6578 .nr(8)
6579 .kr(1)
6580 .sr(1)
6581 .m(m)
6582 .n(n)
6583 .k(16)
6584 .iterations(1)
6585 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6586 }
6587 }
6588 }
6589
6590 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
6591 TEST_REQUIRES_ARM_NEON_FMA;
6592 for (size_t k = 1; k < 16; k++) {
6593 GemmMicrokernelTester()
6594 .mr(5)
6595 .nr(8)
6596 .kr(1)
6597 .sr(1)
6598 .m(5)
6599 .n(8)
6600 .k(k)
6601 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6602 }
6603 }
6604
6605 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
6606 TEST_REQUIRES_ARM_NEON_FMA;
6607 for (size_t k = 1; k < 16; k++) {
6608 for (uint32_t m = 1; m <= 5; m++) {
6609 for (uint32_t n = 1; n <= 8; n++) {
6610 GemmMicrokernelTester()
6611 .mr(5)
6612 .nr(8)
6613 .kr(1)
6614 .sr(1)
6615 .m(m)
6616 .n(n)
6617 .k(k)
6618 .iterations(1)
6619 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6620 }
6621 }
6622 }
6623 }
6624
6625 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
6626 TEST_REQUIRES_ARM_NEON_FMA;
6627 for (size_t k = 17; k < 16; k++) {
6628 GemmMicrokernelTester()
6629 .mr(5)
6630 .nr(8)
6631 .kr(1)
6632 .sr(1)
6633 .m(5)
6634 .n(8)
6635 .k(k)
6636 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6637 }
6638 }
6639
6640 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
6641 TEST_REQUIRES_ARM_NEON_FMA;
6642 for (size_t k = 17; k < 16; k++) {
6643 for (uint32_t m = 1; m <= 5; m++) {
6644 for (uint32_t n = 1; n <= 8; n++) {
6645 GemmMicrokernelTester()
6646 .mr(5)
6647 .nr(8)
6648 .kr(1)
6649 .sr(1)
6650 .m(m)
6651 .n(n)
6652 .k(k)
6653 .iterations(1)
6654 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6655 }
6656 }
6657 }
6658 }
6659
6660 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
6661 TEST_REQUIRES_ARM_NEON_FMA;
6662 for (size_t k = 24; k <= 80; k += 8) {
6663 GemmMicrokernelTester()
6664 .mr(5)
6665 .nr(8)
6666 .kr(1)
6667 .sr(1)
6668 .m(5)
6669 .n(8)
6670 .k(k)
6671 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6672 }
6673 }
6674
6675 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
6676 TEST_REQUIRES_ARM_NEON_FMA;
6677 for (size_t k = 24; k <= 80; k += 8) {
6678 for (uint32_t m = 1; m <= 5; m++) {
6679 for (uint32_t n = 1; n <= 8; n++) {
6680 GemmMicrokernelTester()
6681 .mr(5)
6682 .nr(8)
6683 .kr(1)
6684 .sr(1)
6685 .m(m)
6686 .n(n)
6687 .k(k)
6688 .iterations(1)
6689 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6690 }
6691 }
6692 }
6693 }
6694
6695 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
6696 TEST_REQUIRES_ARM_NEON_FMA;
6697 for (uint32_t n = 9; n < 16; n++) {
6698 for (size_t k = 1; k <= 40; k += 9) {
6699 GemmMicrokernelTester()
6700 .mr(5)
6701 .nr(8)
6702 .kr(1)
6703 .sr(1)
6704 .m(5)
6705 .n(8)
6706 .k(k)
6707 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6708 }
6709 }
6710 }
6711
6712 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
6713 TEST_REQUIRES_ARM_NEON_FMA;
6714 for (uint32_t n = 9; n < 16; n++) {
6715 for (size_t k = 1; k <= 40; k += 9) {
6716 GemmMicrokernelTester()
6717 .mr(5)
6718 .nr(8)
6719 .kr(1)
6720 .sr(1)
6721 .m(5)
6722 .n(8)
6723 .k(k)
6724 .cn_stride(11)
6725 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6726 }
6727 }
6728 }
6729
6730 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
6731 TEST_REQUIRES_ARM_NEON_FMA;
6732 for (uint32_t n = 9; n < 16; n++) {
6733 for (size_t k = 1; k <= 40; k += 9) {
6734 for (uint32_t m = 1; m <= 5; m++) {
6735 GemmMicrokernelTester()
6736 .mr(5)
6737 .nr(8)
6738 .kr(1)
6739 .sr(1)
6740 .m(m)
6741 .n(n)
6742 .k(k)
6743 .iterations(1)
6744 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6745 }
6746 }
6747 }
6748 }
6749
6750 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
6751 TEST_REQUIRES_ARM_NEON_FMA;
6752 for (uint32_t n = 16; n <= 24; n += 8) {
6753 for (size_t k = 1; k <= 40; k += 9) {
6754 GemmMicrokernelTester()
6755 .mr(5)
6756 .nr(8)
6757 .kr(1)
6758 .sr(1)
6759 .m(5)
6760 .n(8)
6761 .k(k)
6762 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6763 }
6764 }
6765 }
6766
6767 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
6768 TEST_REQUIRES_ARM_NEON_FMA;
6769 for (uint32_t n = 16; n <= 24; n += 8) {
6770 for (size_t k = 1; k <= 40; k += 9) {
6771 GemmMicrokernelTester()
6772 .mr(5)
6773 .nr(8)
6774 .kr(1)
6775 .sr(1)
6776 .m(5)
6777 .n(n)
6778 .k(k)
6779 .cn_stride(11)
6780 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6781 }
6782 }
6783 }
6784
6785 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
6786 TEST_REQUIRES_ARM_NEON_FMA;
6787 for (uint32_t n = 16; n <= 24; n += 8) {
6788 for (size_t k = 1; k <= 40; k += 9) {
6789 for (uint32_t m = 1; m <= 5; m++) {
6790 GemmMicrokernelTester()
6791 .mr(5)
6792 .nr(8)
6793 .kr(1)
6794 .sr(1)
6795 .m(m)
6796 .n(n)
6797 .k(k)
6798 .iterations(1)
6799 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6800 }
6801 }
6802 }
6803 }
6804
6805 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
6806 TEST_REQUIRES_ARM_NEON_FMA;
6807 for (size_t k = 1; k <= 40; k += 9) {
6808 GemmMicrokernelTester()
6809 .mr(5)
6810 .nr(8)
6811 .kr(1)
6812 .sr(1)
6813 .m(5)
6814 .n(8)
6815 .k(k)
6816 .ks(3)
6817 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6818 }
6819 }
6820
6821 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
6822 TEST_REQUIRES_ARM_NEON_FMA;
6823 for (size_t k = 1; k <= 40; k += 9) {
6824 for (uint32_t m = 1; m <= 5; m++) {
6825 for (uint32_t n = 1; n <= 8; n++) {
6826 GemmMicrokernelTester()
6827 .mr(5)
6828 .nr(8)
6829 .kr(1)
6830 .sr(1)
6831 .m(m)
6832 .n(n)
6833 .k(k)
6834 .ks(3)
6835 .iterations(1)
6836 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6837 }
6838 }
6839 }
6840 }
6841
6842 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
6843 TEST_REQUIRES_ARM_NEON_FMA;
6844 for (uint32_t n = 9; n < 16; n++) {
6845 for (size_t k = 1; k <= 40; k += 9) {
6846 GemmMicrokernelTester()
6847 .mr(5)
6848 .nr(8)
6849 .kr(1)
6850 .sr(1)
6851 .m(5)
6852 .n(8)
6853 .k(k)
6854 .ks(3)
6855 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6856 }
6857 }
6858 }
6859
6860 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
6861 TEST_REQUIRES_ARM_NEON_FMA;
6862 for (uint32_t n = 16; n <= 24; n += 8) {
6863 for (size_t k = 1; k <= 40; k += 9) {
6864 GemmMicrokernelTester()
6865 .mr(5)
6866 .nr(8)
6867 .kr(1)
6868 .sr(1)
6869 .m(5)
6870 .n(8)
6871 .k(k)
6872 .ks(3)
6873 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6874 }
6875 }
6876 }
6877
6878 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
6879 TEST_REQUIRES_ARM_NEON_FMA;
6880 for (size_t k = 1; k <= 40; k += 9) {
6881 for (uint32_t m = 1; m <= 5; m++) {
6882 for (uint32_t n = 1; n <= 8; n++) {
6883 GemmMicrokernelTester()
6884 .mr(5)
6885 .nr(8)
6886 .kr(1)
6887 .sr(1)
6888 .m(m)
6889 .n(n)
6890 .k(k)
6891 .cm_stride(11)
6892 .iterations(1)
6893 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6894 }
6895 }
6896 }
6897 }
6898
6899 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
6900 TEST_REQUIRES_ARM_NEON_FMA;
6901 for (size_t k = 1; k <= 40; k += 9) {
6902 GemmMicrokernelTester()
6903 .mr(5)
6904 .nr(8)
6905 .kr(1)
6906 .sr(1)
6907 .m(5)
6908 .n(8)
6909 .k(k)
6910 .ks(3)
6911 .a_offset(211)
6912 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6913 }
6914 }
6915
6916 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
6917 TEST_REQUIRES_ARM_NEON_FMA;
6918 for (uint32_t mz = 0; mz < 5; mz++) {
6919 for (size_t k = 1; k <= 40; k += 9) {
6920 GemmMicrokernelTester()
6921 .mr(5)
6922 .nr(8)
6923 .kr(1)
6924 .sr(1)
6925 .m(5)
6926 .n(8)
6927 .k(k)
6928 .ks(3)
6929 .a_offset(211)
6930 .zero_index(mz)
6931 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6932 }
6933 }
6934 }
6935
6936 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
6937 TEST_REQUIRES_ARM_NEON_FMA;
6938 GemmMicrokernelTester()
6939 .mr(5)
6940 .nr(8)
6941 .kr(1)
6942 .sr(1)
6943 .m(5)
6944 .n(8)
6945 .k(8)
6946 .qmin(128)
6947 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6948 }
6949
6950 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
6951 TEST_REQUIRES_ARM_NEON_FMA;
6952 GemmMicrokernelTester()
6953 .mr(5)
6954 .nr(8)
6955 .kr(1)
6956 .sr(1)
6957 .m(5)
6958 .n(8)
6959 .k(8)
6960 .qmax(128)
6961 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6962 }
6963
6964 TEST(F32_IGEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
6965 TEST_REQUIRES_ARM_NEON_FMA;
6966 GemmMicrokernelTester()
6967 .mr(5)
6968 .nr(8)
6969 .kr(1)
6970 .sr(1)
6971 .m(5)
6972 .n(8)
6973 .k(8)
6974 .cm_stride(11)
6975 .Test(xnn_f32_igemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
6976 }
6977#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6978
6979
6980#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
6981 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
6982 TEST_REQUIRES_ARM_NEON_FMA;
6983 GemmMicrokernelTester()
6984 .mr(6)
6985 .nr(8)
6986 .kr(1)
6987 .sr(1)
6988 .m(6)
6989 .n(8)
6990 .k(4)
6991 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
6992 }
6993
6994 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
6995 TEST_REQUIRES_ARM_NEON_FMA;
6996 GemmMicrokernelTester()
6997 .mr(6)
6998 .nr(8)
6999 .kr(1)
7000 .sr(1)
7001 .m(6)
7002 .n(8)
7003 .k(4)
7004 .cn_stride(11)
7005 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7006 }
7007
7008 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
7009 TEST_REQUIRES_ARM_NEON_FMA;
7010 for (uint32_t m = 1; m <= 6; m++) {
7011 for (uint32_t n = 1; n <= 8; n++) {
7012 GemmMicrokernelTester()
7013 .mr(6)
7014 .nr(8)
7015 .kr(1)
7016 .sr(1)
7017 .m(m)
7018 .n(n)
7019 .k(4)
7020 .iterations(1)
7021 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7022 }
7023 }
7024 }
7025
7026 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
7027 TEST_REQUIRES_ARM_NEON_FMA;
7028 for (uint32_t m = 1; m <= 6; m++) {
7029 GemmMicrokernelTester()
7030 .mr(6)
7031 .nr(8)
7032 .kr(1)
7033 .sr(1)
7034 .m(m)
7035 .n(8)
7036 .k(4)
7037 .iterations(1)
7038 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7039 }
7040 }
7041
7042 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
7043 TEST_REQUIRES_ARM_NEON_FMA;
7044 for (uint32_t n = 1; n <= 8; n++) {
7045 GemmMicrokernelTester()
7046 .mr(6)
7047 .nr(8)
7048 .kr(1)
7049 .sr(1)
7050 .m(6)
7051 .n(n)
7052 .k(4)
7053 .iterations(1)
7054 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7055 }
7056 }
7057
7058 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
7059 TEST_REQUIRES_ARM_NEON_FMA;
7060 GemmMicrokernelTester()
7061 .mr(6)
7062 .nr(8)
7063 .kr(1)
7064 .sr(1)
7065 .m(6)
7066 .n(8)
7067 .k(8)
7068 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7069 }
7070
7071 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
7072 TEST_REQUIRES_ARM_NEON_FMA;
7073 for (uint32_t m = 1; m <= 6; m++) {
7074 for (uint32_t n = 1; n <= 8; n++) {
7075 GemmMicrokernelTester()
7076 .mr(6)
7077 .nr(8)
7078 .kr(1)
7079 .sr(1)
7080 .m(m)
7081 .n(n)
7082 .k(8)
7083 .iterations(1)
7084 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7085 }
7086 }
7087 }
7088
7089 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
7090 TEST_REQUIRES_ARM_NEON_FMA;
7091 for (size_t k = 1; k < 8; k++) {
7092 GemmMicrokernelTester()
7093 .mr(6)
7094 .nr(8)
7095 .kr(1)
7096 .sr(1)
7097 .m(6)
7098 .n(8)
7099 .k(k)
7100 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7101 }
7102 }
7103
7104 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
7105 TEST_REQUIRES_ARM_NEON_FMA;
7106 for (size_t k = 1; k < 8; k++) {
7107 for (uint32_t m = 1; m <= 6; m++) {
7108 for (uint32_t n = 1; n <= 8; n++) {
7109 GemmMicrokernelTester()
7110 .mr(6)
7111 .nr(8)
7112 .kr(1)
7113 .sr(1)
7114 .m(m)
7115 .n(n)
7116 .k(k)
7117 .iterations(1)
7118 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7119 }
7120 }
7121 }
7122 }
7123
7124 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
7125 TEST_REQUIRES_ARM_NEON_FMA;
7126 for (size_t k = 9; k < 8; k++) {
7127 GemmMicrokernelTester()
7128 .mr(6)
7129 .nr(8)
7130 .kr(1)
7131 .sr(1)
7132 .m(6)
7133 .n(8)
7134 .k(k)
7135 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7136 }
7137 }
7138
7139 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
7140 TEST_REQUIRES_ARM_NEON_FMA;
7141 for (size_t k = 9; k < 8; k++) {
7142 for (uint32_t m = 1; m <= 6; m++) {
7143 for (uint32_t n = 1; n <= 8; n++) {
7144 GemmMicrokernelTester()
7145 .mr(6)
7146 .nr(8)
7147 .kr(1)
7148 .sr(1)
7149 .m(m)
7150 .n(n)
7151 .k(k)
7152 .iterations(1)
7153 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7154 }
7155 }
7156 }
7157 }
7158
7159 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
7160 TEST_REQUIRES_ARM_NEON_FMA;
7161 for (size_t k = 12; k <= 40; k += 4) {
7162 GemmMicrokernelTester()
7163 .mr(6)
7164 .nr(8)
7165 .kr(1)
7166 .sr(1)
7167 .m(6)
7168 .n(8)
7169 .k(k)
7170 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7171 }
7172 }
7173
7174 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
7175 TEST_REQUIRES_ARM_NEON_FMA;
7176 for (size_t k = 12; k <= 40; k += 4) {
7177 for (uint32_t m = 1; m <= 6; m++) {
7178 for (uint32_t n = 1; n <= 8; n++) {
7179 GemmMicrokernelTester()
7180 .mr(6)
7181 .nr(8)
7182 .kr(1)
7183 .sr(1)
7184 .m(m)
7185 .n(n)
7186 .k(k)
7187 .iterations(1)
7188 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7189 }
7190 }
7191 }
7192 }
7193
7194 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
7195 TEST_REQUIRES_ARM_NEON_FMA;
7196 for (uint32_t n = 9; n < 16; n++) {
7197 for (size_t k = 1; k <= 20; k += 5) {
7198 GemmMicrokernelTester()
7199 .mr(6)
7200 .nr(8)
7201 .kr(1)
7202 .sr(1)
7203 .m(6)
7204 .n(8)
7205 .k(k)
7206 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7207 }
7208 }
7209 }
7210
7211 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
7212 TEST_REQUIRES_ARM_NEON_FMA;
7213 for (uint32_t n = 9; n < 16; n++) {
7214 for (size_t k = 1; k <= 20; k += 5) {
7215 GemmMicrokernelTester()
7216 .mr(6)
7217 .nr(8)
7218 .kr(1)
7219 .sr(1)
7220 .m(6)
7221 .n(8)
7222 .k(k)
7223 .cn_stride(11)
7224 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7225 }
7226 }
7227 }
7228
7229 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
7230 TEST_REQUIRES_ARM_NEON_FMA;
7231 for (uint32_t n = 9; n < 16; n++) {
7232 for (size_t k = 1; k <= 20; k += 5) {
7233 for (uint32_t m = 1; m <= 6; m++) {
7234 GemmMicrokernelTester()
7235 .mr(6)
7236 .nr(8)
7237 .kr(1)
7238 .sr(1)
7239 .m(m)
7240 .n(n)
7241 .k(k)
7242 .iterations(1)
7243 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7244 }
7245 }
7246 }
7247 }
7248
7249 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
7250 TEST_REQUIRES_ARM_NEON_FMA;
7251 for (uint32_t n = 16; n <= 24; n += 8) {
7252 for (size_t k = 1; k <= 20; k += 5) {
7253 GemmMicrokernelTester()
7254 .mr(6)
7255 .nr(8)
7256 .kr(1)
7257 .sr(1)
7258 .m(6)
7259 .n(8)
7260 .k(k)
7261 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7262 }
7263 }
7264 }
7265
7266 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
7267 TEST_REQUIRES_ARM_NEON_FMA;
7268 for (uint32_t n = 16; n <= 24; n += 8) {
7269 for (size_t k = 1; k <= 20; k += 5) {
7270 GemmMicrokernelTester()
7271 .mr(6)
7272 .nr(8)
7273 .kr(1)
7274 .sr(1)
7275 .m(6)
7276 .n(n)
7277 .k(k)
7278 .cn_stride(11)
7279 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7280 }
7281 }
7282 }
7283
7284 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
7285 TEST_REQUIRES_ARM_NEON_FMA;
7286 for (uint32_t n = 16; n <= 24; n += 8) {
7287 for (size_t k = 1; k <= 20; k += 5) {
7288 for (uint32_t m = 1; m <= 6; m++) {
7289 GemmMicrokernelTester()
7290 .mr(6)
7291 .nr(8)
7292 .kr(1)
7293 .sr(1)
7294 .m(m)
7295 .n(n)
7296 .k(k)
7297 .iterations(1)
7298 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7299 }
7300 }
7301 }
7302 }
7303
7304 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
7305 TEST_REQUIRES_ARM_NEON_FMA;
7306 for (size_t k = 1; k <= 20; k += 5) {
7307 GemmMicrokernelTester()
7308 .mr(6)
7309 .nr(8)
7310 .kr(1)
7311 .sr(1)
7312 .m(6)
7313 .n(8)
7314 .k(k)
7315 .ks(3)
7316 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7317 }
7318 }
7319
7320 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
7321 TEST_REQUIRES_ARM_NEON_FMA;
7322 for (size_t k = 1; k <= 20; k += 5) {
7323 for (uint32_t m = 1; m <= 6; m++) {
7324 for (uint32_t n = 1; n <= 8; n++) {
7325 GemmMicrokernelTester()
7326 .mr(6)
7327 .nr(8)
7328 .kr(1)
7329 .sr(1)
7330 .m(m)
7331 .n(n)
7332 .k(k)
7333 .ks(3)
7334 .iterations(1)
7335 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7336 }
7337 }
7338 }
7339 }
7340
7341 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_small_kernel) {
7342 TEST_REQUIRES_ARM_NEON_FMA;
7343 for (uint32_t n = 9; n < 16; n++) {
7344 for (size_t k = 1; k <= 20; k += 5) {
7345 GemmMicrokernelTester()
7346 .mr(6)
7347 .nr(8)
7348 .kr(1)
7349 .sr(1)
7350 .m(6)
7351 .n(8)
7352 .k(k)
7353 .ks(3)
7354 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7355 }
7356 }
7357 }
7358
7359 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_small_kernel) {
7360 TEST_REQUIRES_ARM_NEON_FMA;
7361 for (uint32_t n = 16; n <= 24; n += 8) {
7362 for (size_t k = 1; k <= 20; k += 5) {
7363 GemmMicrokernelTester()
7364 .mr(6)
7365 .nr(8)
7366 .kr(1)
7367 .sr(1)
7368 .m(6)
7369 .n(8)
7370 .k(k)
7371 .ks(3)
7372 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7373 }
7374 }
7375 }
7376
7377 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
7378 TEST_REQUIRES_ARM_NEON_FMA;
7379 for (size_t k = 1; k <= 20; k += 5) {
7380 for (uint32_t m = 1; m <= 6; m++) {
7381 for (uint32_t n = 1; n <= 8; n++) {
7382 GemmMicrokernelTester()
7383 .mr(6)
7384 .nr(8)
7385 .kr(1)
7386 .sr(1)
7387 .m(m)
7388 .n(n)
7389 .k(k)
7390 .cm_stride(11)
7391 .iterations(1)
7392 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7393 }
7394 }
7395 }
7396 }
7397
7398 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
7399 TEST_REQUIRES_ARM_NEON_FMA;
7400 for (size_t k = 1; k <= 20; k += 5) {
7401 GemmMicrokernelTester()
7402 .mr(6)
7403 .nr(8)
7404 .kr(1)
7405 .sr(1)
7406 .m(6)
7407 .n(8)
7408 .k(k)
7409 .ks(3)
7410 .a_offset(127)
7411 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7412 }
7413 }
7414
7415 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, zero) {
7416 TEST_REQUIRES_ARM_NEON_FMA;
7417 for (uint32_t mz = 0; mz < 6; mz++) {
7418 for (size_t k = 1; k <= 20; k += 5) {
7419 GemmMicrokernelTester()
7420 .mr(6)
7421 .nr(8)
7422 .kr(1)
7423 .sr(1)
7424 .m(6)
7425 .n(8)
7426 .k(k)
7427 .ks(3)
7428 .a_offset(127)
7429 .zero_index(mz)
7430 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7431 }
7432 }
7433 }
7434
7435 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
7436 TEST_REQUIRES_ARM_NEON_FMA;
7437 GemmMicrokernelTester()
7438 .mr(6)
7439 .nr(8)
7440 .kr(1)
7441 .sr(1)
7442 .m(6)
7443 .n(8)
7444 .k(4)
7445 .qmin(128)
7446 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7447 }
7448
7449 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
7450 TEST_REQUIRES_ARM_NEON_FMA;
7451 GemmMicrokernelTester()
7452 .mr(6)
7453 .nr(8)
7454 .kr(1)
7455 .sr(1)
7456 .m(6)
7457 .n(8)
7458 .k(4)
7459 .qmax(128)
7460 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7461 }
7462
7463 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
7464 TEST_REQUIRES_ARM_NEON_FMA;
7465 GemmMicrokernelTester()
7466 .mr(6)
7467 .nr(8)
7468 .kr(1)
7469 .sr(1)
7470 .m(6)
7471 .n(8)
7472 .k(4)
7473 .cm_stride(11)
7474 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
7475 }
7476#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7477
7478
7479#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7480 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4) {
7481 TEST_REQUIRES_ARM_NEON_FMA;
7482 GemmMicrokernelTester()
7483 .mr(6)
7484 .nr(8)
7485 .kr(1)
7486 .sr(1)
7487 .m(6)
7488 .n(8)
7489 .k(4)
7490 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7491 }
7492
7493 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cn) {
7494 TEST_REQUIRES_ARM_NEON_FMA;
7495 GemmMicrokernelTester()
7496 .mr(6)
7497 .nr(8)
7498 .kr(1)
7499 .sr(1)
7500 .m(6)
7501 .n(8)
7502 .k(4)
7503 .cn_stride(11)
7504 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7505 }
7506
7507 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile) {
7508 TEST_REQUIRES_ARM_NEON_FMA;
7509 for (uint32_t m = 1; m <= 6; m++) {
7510 for (uint32_t n = 1; n <= 8; n++) {
7511 GemmMicrokernelTester()
7512 .mr(6)
7513 .nr(8)
7514 .kr(1)
7515 .sr(1)
7516 .m(m)
7517 .n(n)
7518 .k(4)
7519 .iterations(1)
7520 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7521 }
7522 }
7523 }
7524
7525 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_m) {
7526 TEST_REQUIRES_ARM_NEON_FMA;
7527 for (uint32_t m = 1; m <= 6; m++) {
7528 GemmMicrokernelTester()
7529 .mr(6)
7530 .nr(8)
7531 .kr(1)
7532 .sr(1)
7533 .m(m)
7534 .n(8)
7535 .k(4)
7536 .iterations(1)
7537 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7538 }
7539 }
7540
7541 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_4_subtile_n) {
7542 TEST_REQUIRES_ARM_NEON_FMA;
7543 for (uint32_t n = 1; n <= 8; n++) {
7544 GemmMicrokernelTester()
7545 .mr(6)
7546 .nr(8)
7547 .kr(1)
7548 .sr(1)
7549 .m(6)
7550 .n(n)
7551 .k(4)
7552 .iterations(1)
7553 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7554 }
7555 }
7556
7557 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8) {
7558 TEST_REQUIRES_ARM_NEON_FMA;
7559 GemmMicrokernelTester()
7560 .mr(6)
7561 .nr(8)
7562 .kr(1)
7563 .sr(1)
7564 .m(6)
7565 .n(8)
7566 .k(8)
7567 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7568 }
7569
7570 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_eq_8_subtile) {
7571 TEST_REQUIRES_ARM_NEON_FMA;
7572 for (uint32_t m = 1; m <= 6; m++) {
7573 for (uint32_t n = 1; n <= 8; n++) {
7574 GemmMicrokernelTester()
7575 .mr(6)
7576 .nr(8)
7577 .kr(1)
7578 .sr(1)
7579 .m(m)
7580 .n(n)
7581 .k(8)
7582 .iterations(1)
7583 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7584 }
7585 }
7586 }
7587
7588 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8) {
7589 TEST_REQUIRES_ARM_NEON_FMA;
7590 for (size_t k = 1; k < 8; k++) {
7591 GemmMicrokernelTester()
7592 .mr(6)
7593 .nr(8)
7594 .kr(1)
7595 .sr(1)
7596 .m(6)
7597 .n(8)
7598 .k(k)
7599 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7600 }
7601 }
7602
7603 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_lt_8_subtile) {
7604 TEST_REQUIRES_ARM_NEON_FMA;
7605 for (size_t k = 1; k < 8; k++) {
7606 for (uint32_t m = 1; m <= 6; m++) {
7607 for (uint32_t n = 1; n <= 8; n++) {
7608 GemmMicrokernelTester()
7609 .mr(6)
7610 .nr(8)
7611 .kr(1)
7612 .sr(1)
7613 .m(m)
7614 .n(n)
7615 .k(k)
7616 .iterations(1)
7617 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7618 }
7619 }
7620 }
7621 }
7622
7623 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_8) {
7624 TEST_REQUIRES_ARM_NEON_FMA;
7625 for (size_t k = 9; k < 8; k++) {
7626 GemmMicrokernelTester()
7627 .mr(6)
7628 .nr(8)
7629 .kr(1)
7630 .sr(1)
7631 .m(6)
7632 .n(8)
7633 .k(k)
7634 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7635 }
7636 }
7637
7638 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_gt_4_subtile) {
7639 TEST_REQUIRES_ARM_NEON_FMA;
7640 for (size_t k = 9; k < 8; k++) {
7641 for (uint32_t m = 1; m <= 6; m++) {
7642 for (uint32_t n = 1; n <= 8; n++) {
7643 GemmMicrokernelTester()
7644 .mr(6)
7645 .nr(8)
7646 .kr(1)
7647 .sr(1)
7648 .m(m)
7649 .n(n)
7650 .k(k)
7651 .iterations(1)
7652 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7653 }
7654 }
7655 }
7656 }
7657
7658 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4) {
7659 TEST_REQUIRES_ARM_NEON_FMA;
7660 for (size_t k = 12; k <= 40; k += 4) {
7661 GemmMicrokernelTester()
7662 .mr(6)
7663 .nr(8)
7664 .kr(1)
7665 .sr(1)
7666 .m(6)
7667 .n(8)
7668 .k(k)
7669 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7670 }
7671 }
7672
7673 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, k_div_4_subtile) {
7674 TEST_REQUIRES_ARM_NEON_FMA;
7675 for (size_t k = 12; k <= 40; k += 4) {
7676 for (uint32_t m = 1; m <= 6; m++) {
7677 for (uint32_t n = 1; n <= 8; n++) {
7678 GemmMicrokernelTester()
7679 .mr(6)
7680 .nr(8)
7681 .kr(1)
7682 .sr(1)
7683 .m(m)
7684 .n(n)
7685 .k(k)
7686 .iterations(1)
7687 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7688 }
7689 }
7690 }
7691 }
7692
7693 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8) {
7694 TEST_REQUIRES_ARM_NEON_FMA;
7695 for (uint32_t n = 9; n < 16; n++) {
7696 for (size_t k = 1; k <= 20; k += 5) {
7697 GemmMicrokernelTester()
7698 .mr(6)
7699 .nr(8)
7700 .kr(1)
7701 .sr(1)
7702 .m(6)
7703 .n(8)
7704 .k(k)
7705 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7706 }
7707 }
7708 }
7709
7710 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_strided_cn) {
7711 TEST_REQUIRES_ARM_NEON_FMA;
7712 for (uint32_t n = 9; n < 16; n++) {
7713 for (size_t k = 1; k <= 20; k += 5) {
7714 GemmMicrokernelTester()
7715 .mr(6)
7716 .nr(8)
7717 .kr(1)
7718 .sr(1)
7719 .m(6)
7720 .n(8)
7721 .k(k)
7722 .cn_stride(11)
7723 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7724 }
7725 }
7726 }
7727
7728 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_subtile) {
7729 TEST_REQUIRES_ARM_NEON_FMA;
7730 for (uint32_t n = 9; n < 16; n++) {
7731 for (size_t k = 1; k <= 20; k += 5) {
7732 for (uint32_t m = 1; m <= 6; m++) {
7733 GemmMicrokernelTester()
7734 .mr(6)
7735 .nr(8)
7736 .kr(1)
7737 .sr(1)
7738 .m(m)
7739 .n(n)
7740 .k(k)
7741 .iterations(1)
7742 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7743 }
7744 }
7745 }
7746 }
7747
7748 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8) {
7749 TEST_REQUIRES_ARM_NEON_FMA;
7750 for (uint32_t n = 16; n <= 24; n += 8) {
7751 for (size_t k = 1; k <= 20; k += 5) {
7752 GemmMicrokernelTester()
7753 .mr(6)
7754 .nr(8)
7755 .kr(1)
7756 .sr(1)
7757 .m(6)
7758 .n(8)
7759 .k(k)
7760 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7761 }
7762 }
7763 }
7764
7765 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_strided_cn) {
7766 TEST_REQUIRES_ARM_NEON_FMA;
7767 for (uint32_t n = 16; n <= 24; n += 8) {
7768 for (size_t k = 1; k <= 20; k += 5) {
7769 GemmMicrokernelTester()
7770 .mr(6)
7771 .nr(8)
7772 .kr(1)
7773 .sr(1)
7774 .m(6)
7775 .n(n)
7776 .k(k)
7777 .cn_stride(11)
7778 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7779 }
7780 }
7781 }
7782
7783 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_subtile) {
7784 TEST_REQUIRES_ARM_NEON_FMA;
7785 for (uint32_t n = 16; n <= 24; n += 8) {
7786 for (size_t k = 1; k <= 20; k += 5) {
7787 for (uint32_t m = 1; m <= 6; m++) {
7788 GemmMicrokernelTester()
7789 .mr(6)
7790 .nr(8)
7791 .kr(1)
7792 .sr(1)
7793 .m(m)
7794 .n(n)
7795 .k(k)
7796 .iterations(1)
7797 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7798 }
7799 }
7800 }
7801 }
7802
7803 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, small_kernel) {
7804 TEST_REQUIRES_ARM_NEON_FMA;
7805 for (size_t k = 1; k <= 20; k += 5) {
7806 GemmMicrokernelTester()
7807 .mr(6)
7808 .nr(8)
7809 .kr(1)
7810 .sr(1)
7811 .m(6)
7812 .n(8)
7813 .k(k)
7814 .ks(3)
7815 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7816 }
7817 }
7818
7819 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, small_kernel_subtile) {
7820 TEST_REQUIRES_ARM_NEON_FMA;
7821 for (size_t k = 1; k <= 20; k += 5) {
7822 for (uint32_t m = 1; m <= 6; m++) {
7823 for (uint32_t n = 1; n <= 8; n++) {
7824 GemmMicrokernelTester()
7825 .mr(6)
7826 .nr(8)
7827 .kr(1)
7828 .sr(1)
7829 .m(m)
7830 .n(n)
7831 .k(k)
7832 .ks(3)
7833 .iterations(1)
7834 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7835 }
7836 }
7837 }
7838 }
7839
7840 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_gt_8_small_kernel) {
7841 TEST_REQUIRES_ARM_NEON_FMA;
7842 for (uint32_t n = 9; n < 16; n++) {
7843 for (size_t k = 1; k <= 20; k += 5) {
7844 GemmMicrokernelTester()
7845 .mr(6)
7846 .nr(8)
7847 .kr(1)
7848 .sr(1)
7849 .m(6)
7850 .n(8)
7851 .k(k)
7852 .ks(3)
7853 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7854 }
7855 }
7856 }
7857
7858 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, n_div_8_small_kernel) {
7859 TEST_REQUIRES_ARM_NEON_FMA;
7860 for (uint32_t n = 16; n <= 24; n += 8) {
7861 for (size_t k = 1; k <= 20; k += 5) {
7862 GemmMicrokernelTester()
7863 .mr(6)
7864 .nr(8)
7865 .kr(1)
7866 .sr(1)
7867 .m(6)
7868 .n(8)
7869 .k(k)
7870 .ks(3)
7871 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7872 }
7873 }
7874 }
7875
7876 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm_subtile) {
7877 TEST_REQUIRES_ARM_NEON_FMA;
7878 for (size_t k = 1; k <= 20; k += 5) {
7879 for (uint32_t m = 1; m <= 6; m++) {
7880 for (uint32_t n = 1; n <= 8; n++) {
7881 GemmMicrokernelTester()
7882 .mr(6)
7883 .nr(8)
7884 .kr(1)
7885 .sr(1)
7886 .m(m)
7887 .n(n)
7888 .k(k)
7889 .cm_stride(11)
7890 .iterations(1)
7891 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7892 }
7893 }
7894 }
7895 }
7896
7897 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, a_offset) {
7898 TEST_REQUIRES_ARM_NEON_FMA;
7899 for (size_t k = 1; k <= 20; k += 5) {
7900 GemmMicrokernelTester()
7901 .mr(6)
7902 .nr(8)
7903 .kr(1)
7904 .sr(1)
7905 .m(6)
7906 .n(8)
7907 .k(k)
7908 .ks(3)
7909 .a_offset(127)
7910 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7911 }
7912 }
7913
7914 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, zero) {
7915 TEST_REQUIRES_ARM_NEON_FMA;
7916 for (uint32_t mz = 0; mz < 6; mz++) {
7917 for (size_t k = 1; k <= 20; k += 5) {
7918 GemmMicrokernelTester()
7919 .mr(6)
7920 .nr(8)
7921 .kr(1)
7922 .sr(1)
7923 .m(6)
7924 .n(8)
7925 .k(k)
7926 .ks(3)
7927 .a_offset(127)
7928 .zero_index(mz)
7929 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7930 }
7931 }
7932 }
7933
7934 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, qmin) {
7935 TEST_REQUIRES_ARM_NEON_FMA;
7936 GemmMicrokernelTester()
7937 .mr(6)
7938 .nr(8)
7939 .kr(1)
7940 .sr(1)
7941 .m(6)
7942 .n(8)
7943 .k(4)
7944 .qmin(128)
7945 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7946 }
7947
7948 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, qmax) {
7949 TEST_REQUIRES_ARM_NEON_FMA;
7950 GemmMicrokernelTester()
7951 .mr(6)
7952 .nr(8)
7953 .kr(1)
7954 .sr(1)
7955 .m(6)
7956 .n(8)
7957 .k(4)
7958 .qmax(128)
7959 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7960 }
7961
7962 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A55, strided_cm) {
7963 TEST_REQUIRES_ARM_NEON_FMA;
7964 GemmMicrokernelTester()
7965 .mr(6)
7966 .nr(8)
7967 .kr(1)
7968 .sr(1)
7969 .m(6)
7970 .n(8)
7971 .k(4)
7972 .cm_stride(11)
7973 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a55);
7974 }
7975#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7976
7977
7978#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
7979 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
7980 TEST_REQUIRES_ARM_NEON_FMA;
7981 GemmMicrokernelTester()
7982 .mr(6)
7983 .nr(8)
7984 .kr(1)
7985 .sr(1)
7986 .m(6)
7987 .n(8)
7988 .k(8)
7989 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
7990 }
7991
7992 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cn) {
7993 TEST_REQUIRES_ARM_NEON_FMA;
7994 GemmMicrokernelTester()
7995 .mr(6)
7996 .nr(8)
7997 .kr(1)
7998 .sr(1)
7999 .m(6)
8000 .n(8)
8001 .k(8)
8002 .cn_stride(11)
8003 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8004 }
8005
8006 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile) {
8007 TEST_REQUIRES_ARM_NEON_FMA;
8008 for (uint32_t m = 1; m <= 6; m++) {
8009 for (uint32_t n = 1; n <= 8; n++) {
8010 GemmMicrokernelTester()
8011 .mr(6)
8012 .nr(8)
8013 .kr(1)
8014 .sr(1)
8015 .m(m)
8016 .n(n)
8017 .k(8)
8018 .iterations(1)
8019 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8020 }
8021 }
8022 }
8023
8024 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_m) {
8025 TEST_REQUIRES_ARM_NEON_FMA;
8026 for (uint32_t m = 1; m <= 6; m++) {
8027 GemmMicrokernelTester()
8028 .mr(6)
8029 .nr(8)
8030 .kr(1)
8031 .sr(1)
8032 .m(m)
8033 .n(8)
8034 .k(8)
8035 .iterations(1)
8036 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8037 }
8038 }
8039
8040 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_n) {
8041 TEST_REQUIRES_ARM_NEON_FMA;
8042 for (uint32_t n = 1; n <= 8; n++) {
8043 GemmMicrokernelTester()
8044 .mr(6)
8045 .nr(8)
8046 .kr(1)
8047 .sr(1)
8048 .m(6)
8049 .n(n)
8050 .k(8)
8051 .iterations(1)
8052 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8053 }
8054 }
8055
8056 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16) {
8057 TEST_REQUIRES_ARM_NEON_FMA;
8058 GemmMicrokernelTester()
8059 .mr(6)
8060 .nr(8)
8061 .kr(1)
8062 .sr(1)
8063 .m(6)
8064 .n(8)
8065 .k(16)
8066 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8067 }
8068
8069 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_subtile) {
8070 TEST_REQUIRES_ARM_NEON_FMA;
8071 for (uint32_t m = 1; m <= 6; m++) {
8072 for (uint32_t n = 1; n <= 8; n++) {
8073 GemmMicrokernelTester()
8074 .mr(6)
8075 .nr(8)
8076 .kr(1)
8077 .sr(1)
8078 .m(m)
8079 .n(n)
8080 .k(16)
8081 .iterations(1)
8082 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8083 }
8084 }
8085 }
8086
8087 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16) {
8088 TEST_REQUIRES_ARM_NEON_FMA;
8089 for (size_t k = 1; k < 16; k++) {
8090 GemmMicrokernelTester()
8091 .mr(6)
8092 .nr(8)
8093 .kr(1)
8094 .sr(1)
8095 .m(6)
8096 .n(8)
8097 .k(k)
8098 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8099 }
8100 }
8101
8102 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_subtile) {
8103 TEST_REQUIRES_ARM_NEON_FMA;
8104 for (size_t k = 1; k < 16; k++) {
8105 for (uint32_t m = 1; m <= 6; m++) {
8106 for (uint32_t n = 1; n <= 8; n++) {
8107 GemmMicrokernelTester()
8108 .mr(6)
8109 .nr(8)
8110 .kr(1)
8111 .sr(1)
8112 .m(m)
8113 .n(n)
8114 .k(k)
8115 .iterations(1)
8116 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8117 }
8118 }
8119 }
8120 }
8121
8122 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_16) {
8123 TEST_REQUIRES_ARM_NEON_FMA;
8124 for (size_t k = 17; k < 16; k++) {
8125 GemmMicrokernelTester()
8126 .mr(6)
8127 .nr(8)
8128 .kr(1)
8129 .sr(1)
8130 .m(6)
8131 .n(8)
8132 .k(k)
8133 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8134 }
8135 }
8136
8137 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_subtile) {
8138 TEST_REQUIRES_ARM_NEON_FMA;
8139 for (size_t k = 17; k < 16; k++) {
8140 for (uint32_t m = 1; m <= 6; m++) {
8141 for (uint32_t n = 1; n <= 8; n++) {
8142 GemmMicrokernelTester()
8143 .mr(6)
8144 .nr(8)
8145 .kr(1)
8146 .sr(1)
8147 .m(m)
8148 .n(n)
8149 .k(k)
8150 .iterations(1)
8151 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8152 }
8153 }
8154 }
8155 }
8156
8157 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8) {
8158 TEST_REQUIRES_ARM_NEON_FMA;
8159 for (size_t k = 24; k <= 80; k += 8) {
8160 GemmMicrokernelTester()
8161 .mr(6)
8162 .nr(8)
8163 .kr(1)
8164 .sr(1)
8165 .m(6)
8166 .n(8)
8167 .k(k)
8168 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8169 }
8170 }
8171
8172 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_subtile) {
8173 TEST_REQUIRES_ARM_NEON_FMA;
8174 for (size_t k = 24; k <= 80; k += 8) {
8175 for (uint32_t m = 1; m <= 6; m++) {
8176 for (uint32_t n = 1; n <= 8; n++) {
8177 GemmMicrokernelTester()
8178 .mr(6)
8179 .nr(8)
8180 .kr(1)
8181 .sr(1)
8182 .m(m)
8183 .n(n)
8184 .k(k)
8185 .iterations(1)
8186 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8187 }
8188 }
8189 }
8190 }
8191
8192 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8) {
8193 TEST_REQUIRES_ARM_NEON_FMA;
8194 for (uint32_t n = 9; n < 16; n++) {
8195 for (size_t k = 1; k <= 40; k += 9) {
8196 GemmMicrokernelTester()
8197 .mr(6)
8198 .nr(8)
8199 .kr(1)
8200 .sr(1)
8201 .m(6)
8202 .n(8)
8203 .k(k)
8204 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8205 }
8206 }
8207 }
8208
8209 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_cn) {
8210 TEST_REQUIRES_ARM_NEON_FMA;
8211 for (uint32_t n = 9; n < 16; n++) {
8212 for (size_t k = 1; k <= 40; k += 9) {
8213 GemmMicrokernelTester()
8214 .mr(6)
8215 .nr(8)
8216 .kr(1)
8217 .sr(1)
8218 .m(6)
8219 .n(8)
8220 .k(k)
8221 .cn_stride(11)
8222 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8223 }
8224 }
8225 }
8226
8227 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_subtile) {
8228 TEST_REQUIRES_ARM_NEON_FMA;
8229 for (uint32_t n = 9; n < 16; n++) {
8230 for (size_t k = 1; k <= 40; k += 9) {
8231 for (uint32_t m = 1; m <= 6; m++) {
8232 GemmMicrokernelTester()
8233 .mr(6)
8234 .nr(8)
8235 .kr(1)
8236 .sr(1)
8237 .m(m)
8238 .n(n)
8239 .k(k)
8240 .iterations(1)
8241 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8242 }
8243 }
8244 }
8245 }
8246
8247 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8) {
8248 TEST_REQUIRES_ARM_NEON_FMA;
8249 for (uint32_t n = 16; n <= 24; n += 8) {
8250 for (size_t k = 1; k <= 40; k += 9) {
8251 GemmMicrokernelTester()
8252 .mr(6)
8253 .nr(8)
8254 .kr(1)
8255 .sr(1)
8256 .m(6)
8257 .n(8)
8258 .k(k)
8259 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8260 }
8261 }
8262 }
8263
8264 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_cn) {
8265 TEST_REQUIRES_ARM_NEON_FMA;
8266 for (uint32_t n = 16; n <= 24; n += 8) {
8267 for (size_t k = 1; k <= 40; k += 9) {
8268 GemmMicrokernelTester()
8269 .mr(6)
8270 .nr(8)
8271 .kr(1)
8272 .sr(1)
8273 .m(6)
8274 .n(n)
8275 .k(k)
8276 .cn_stride(11)
8277 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8278 }
8279 }
8280 }
8281
8282 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_subtile) {
8283 TEST_REQUIRES_ARM_NEON_FMA;
8284 for (uint32_t n = 16; n <= 24; n += 8) {
8285 for (size_t k = 1; k <= 40; k += 9) {
8286 for (uint32_t m = 1; m <= 6; m++) {
8287 GemmMicrokernelTester()
8288 .mr(6)
8289 .nr(8)
8290 .kr(1)
8291 .sr(1)
8292 .m(m)
8293 .n(n)
8294 .k(k)
8295 .iterations(1)
8296 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8297 }
8298 }
8299 }
8300 }
8301
8302 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, small_kernel) {
8303 TEST_REQUIRES_ARM_NEON_FMA;
8304 for (size_t k = 1; k <= 40; k += 9) {
8305 GemmMicrokernelTester()
8306 .mr(6)
8307 .nr(8)
8308 .kr(1)
8309 .sr(1)
8310 .m(6)
8311 .n(8)
8312 .k(k)
8313 .ks(3)
8314 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8315 }
8316 }
8317
8318 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, small_kernel_subtile) {
8319 TEST_REQUIRES_ARM_NEON_FMA;
8320 for (size_t k = 1; k <= 40; k += 9) {
8321 for (uint32_t m = 1; m <= 6; m++) {
8322 for (uint32_t n = 1; n <= 8; n++) {
8323 GemmMicrokernelTester()
8324 .mr(6)
8325 .nr(8)
8326 .kr(1)
8327 .sr(1)
8328 .m(m)
8329 .n(n)
8330 .k(k)
8331 .ks(3)
8332 .iterations(1)
8333 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8334 }
8335 }
8336 }
8337 }
8338
8339 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_small_kernel) {
8340 TEST_REQUIRES_ARM_NEON_FMA;
8341 for (uint32_t n = 9; n < 16; n++) {
8342 for (size_t k = 1; k <= 40; k += 9) {
8343 GemmMicrokernelTester()
8344 .mr(6)
8345 .nr(8)
8346 .kr(1)
8347 .sr(1)
8348 .m(6)
8349 .n(8)
8350 .k(k)
8351 .ks(3)
8352 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8353 }
8354 }
8355 }
8356
8357 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_small_kernel) {
8358 TEST_REQUIRES_ARM_NEON_FMA;
8359 for (uint32_t n = 16; n <= 24; n += 8) {
8360 for (size_t k = 1; k <= 40; k += 9) {
8361 GemmMicrokernelTester()
8362 .mr(6)
8363 .nr(8)
8364 .kr(1)
8365 .sr(1)
8366 .m(6)
8367 .n(8)
8368 .k(k)
8369 .ks(3)
8370 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8371 }
8372 }
8373 }
8374
8375 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm_subtile) {
8376 TEST_REQUIRES_ARM_NEON_FMA;
8377 for (size_t k = 1; k <= 40; k += 9) {
8378 for (uint32_t m = 1; m <= 6; m++) {
8379 for (uint32_t n = 1; n <= 8; n++) {
8380 GemmMicrokernelTester()
8381 .mr(6)
8382 .nr(8)
8383 .kr(1)
8384 .sr(1)
8385 .m(m)
8386 .n(n)
8387 .k(k)
8388 .cm_stride(11)
8389 .iterations(1)
8390 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8391 }
8392 }
8393 }
8394 }
8395
8396 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, a_offset) {
8397 TEST_REQUIRES_ARM_NEON_FMA;
8398 for (size_t k = 1; k <= 40; k += 9) {
8399 GemmMicrokernelTester()
8400 .mr(6)
8401 .nr(8)
8402 .kr(1)
8403 .sr(1)
8404 .m(6)
8405 .n(8)
8406 .k(k)
8407 .ks(3)
8408 .a_offset(251)
8409 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8410 }
8411 }
8412
8413 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, zero) {
8414 TEST_REQUIRES_ARM_NEON_FMA;
8415 for (uint32_t mz = 0; mz < 6; mz++) {
8416 for (size_t k = 1; k <= 40; k += 9) {
8417 GemmMicrokernelTester()
8418 .mr(6)
8419 .nr(8)
8420 .kr(1)
8421 .sr(1)
8422 .m(6)
8423 .n(8)
8424 .k(k)
8425 .ks(3)
8426 .a_offset(251)
8427 .zero_index(mz)
8428 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8429 }
8430 }
8431 }
8432
8433 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmin) {
8434 TEST_REQUIRES_ARM_NEON_FMA;
8435 GemmMicrokernelTester()
8436 .mr(6)
8437 .nr(8)
8438 .kr(1)
8439 .sr(1)
8440 .m(6)
8441 .n(8)
8442 .k(8)
8443 .qmin(128)
8444 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8445 }
8446
8447 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmax) {
8448 TEST_REQUIRES_ARM_NEON_FMA;
8449 GemmMicrokernelTester()
8450 .mr(6)
8451 .nr(8)
8452 .kr(1)
8453 .sr(1)
8454 .m(6)
8455 .n(8)
8456 .k(8)
8457 .qmax(128)
8458 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8459 }
8460
8461 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm) {
8462 TEST_REQUIRES_ARM_NEON_FMA;
8463 GemmMicrokernelTester()
8464 .mr(6)
8465 .nr(8)
8466 .kr(1)
8467 .sr(1)
8468 .m(6)
8469 .n(8)
8470 .k(8)
8471 .cm_stride(11)
8472 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
8473 }
8474#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8475
8476
8477#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8478 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
8479 TEST_REQUIRES_ARM_NEON_FMA;
8480 GemmMicrokernelTester()
8481 .mr(6)
8482 .nr(8)
8483 .kr(1)
8484 .sr(1)
8485 .m(6)
8486 .n(8)
8487 .k(8)
8488 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8489 }
8490
8491 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
8492 TEST_REQUIRES_ARM_NEON_FMA;
8493 GemmMicrokernelTester()
8494 .mr(6)
8495 .nr(8)
8496 .kr(1)
8497 .sr(1)
8498 .m(6)
8499 .n(8)
8500 .k(8)
8501 .cn_stride(11)
8502 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8503 }
8504
8505 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
8506 TEST_REQUIRES_ARM_NEON_FMA;
8507 for (uint32_t m = 1; m <= 6; m++) {
8508 for (uint32_t n = 1; n <= 8; n++) {
8509 GemmMicrokernelTester()
8510 .mr(6)
8511 .nr(8)
8512 .kr(1)
8513 .sr(1)
8514 .m(m)
8515 .n(n)
8516 .k(8)
8517 .iterations(1)
8518 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8519 }
8520 }
8521 }
8522
8523 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
8524 TEST_REQUIRES_ARM_NEON_FMA;
8525 for (uint32_t m = 1; m <= 6; m++) {
8526 GemmMicrokernelTester()
8527 .mr(6)
8528 .nr(8)
8529 .kr(1)
8530 .sr(1)
8531 .m(m)
8532 .n(8)
8533 .k(8)
8534 .iterations(1)
8535 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8536 }
8537 }
8538
8539 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
8540 TEST_REQUIRES_ARM_NEON_FMA;
8541 for (uint32_t n = 1; n <= 8; n++) {
8542 GemmMicrokernelTester()
8543 .mr(6)
8544 .nr(8)
8545 .kr(1)
8546 .sr(1)
8547 .m(6)
8548 .n(n)
8549 .k(8)
8550 .iterations(1)
8551 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8552 }
8553 }
8554
8555 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
8556 TEST_REQUIRES_ARM_NEON_FMA;
8557 GemmMicrokernelTester()
8558 .mr(6)
8559 .nr(8)
8560 .kr(1)
8561 .sr(1)
8562 .m(6)
8563 .n(8)
8564 .k(16)
8565 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8566 }
8567
8568 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
8569 TEST_REQUIRES_ARM_NEON_FMA;
8570 for (uint32_t m = 1; m <= 6; m++) {
8571 for (uint32_t n = 1; n <= 8; n++) {
8572 GemmMicrokernelTester()
8573 .mr(6)
8574 .nr(8)
8575 .kr(1)
8576 .sr(1)
8577 .m(m)
8578 .n(n)
8579 .k(16)
8580 .iterations(1)
8581 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8582 }
8583 }
8584 }
8585
8586 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
8587 TEST_REQUIRES_ARM_NEON_FMA;
8588 for (size_t k = 1; k < 16; k++) {
8589 GemmMicrokernelTester()
8590 .mr(6)
8591 .nr(8)
8592 .kr(1)
8593 .sr(1)
8594 .m(6)
8595 .n(8)
8596 .k(k)
8597 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8598 }
8599 }
8600
8601 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
8602 TEST_REQUIRES_ARM_NEON_FMA;
8603 for (size_t k = 1; k < 16; k++) {
8604 for (uint32_t m = 1; m <= 6; m++) {
8605 for (uint32_t n = 1; n <= 8; n++) {
8606 GemmMicrokernelTester()
8607 .mr(6)
8608 .nr(8)
8609 .kr(1)
8610 .sr(1)
8611 .m(m)
8612 .n(n)
8613 .k(k)
8614 .iterations(1)
8615 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8616 }
8617 }
8618 }
8619 }
8620
8621 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
8622 TEST_REQUIRES_ARM_NEON_FMA;
8623 for (size_t k = 17; k < 16; k++) {
8624 GemmMicrokernelTester()
8625 .mr(6)
8626 .nr(8)
8627 .kr(1)
8628 .sr(1)
8629 .m(6)
8630 .n(8)
8631 .k(k)
8632 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8633 }
8634 }
8635
8636 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
8637 TEST_REQUIRES_ARM_NEON_FMA;
8638 for (size_t k = 17; k < 16; k++) {
8639 for (uint32_t m = 1; m <= 6; m++) {
8640 for (uint32_t n = 1; n <= 8; n++) {
8641 GemmMicrokernelTester()
8642 .mr(6)
8643 .nr(8)
8644 .kr(1)
8645 .sr(1)
8646 .m(m)
8647 .n(n)
8648 .k(k)
8649 .iterations(1)
8650 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8651 }
8652 }
8653 }
8654 }
8655
8656 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
8657 TEST_REQUIRES_ARM_NEON_FMA;
8658 for (size_t k = 24; k <= 80; k += 8) {
8659 GemmMicrokernelTester()
8660 .mr(6)
8661 .nr(8)
8662 .kr(1)
8663 .sr(1)
8664 .m(6)
8665 .n(8)
8666 .k(k)
8667 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8668 }
8669 }
8670
8671 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
8672 TEST_REQUIRES_ARM_NEON_FMA;
8673 for (size_t k = 24; k <= 80; k += 8) {
8674 for (uint32_t m = 1; m <= 6; m++) {
8675 for (uint32_t n = 1; n <= 8; n++) {
8676 GemmMicrokernelTester()
8677 .mr(6)
8678 .nr(8)
8679 .kr(1)
8680 .sr(1)
8681 .m(m)
8682 .n(n)
8683 .k(k)
8684 .iterations(1)
8685 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8686 }
8687 }
8688 }
8689 }
8690
8691 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
8692 TEST_REQUIRES_ARM_NEON_FMA;
8693 for (uint32_t n = 9; n < 16; n++) {
8694 for (size_t k = 1; k <= 40; k += 9) {
8695 GemmMicrokernelTester()
8696 .mr(6)
8697 .nr(8)
8698 .kr(1)
8699 .sr(1)
8700 .m(6)
8701 .n(8)
8702 .k(k)
8703 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8704 }
8705 }
8706 }
8707
8708 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
8709 TEST_REQUIRES_ARM_NEON_FMA;
8710 for (uint32_t n = 9; n < 16; n++) {
8711 for (size_t k = 1; k <= 40; k += 9) {
8712 GemmMicrokernelTester()
8713 .mr(6)
8714 .nr(8)
8715 .kr(1)
8716 .sr(1)
8717 .m(6)
8718 .n(8)
8719 .k(k)
8720 .cn_stride(11)
8721 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8722 }
8723 }
8724 }
8725
8726 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
8727 TEST_REQUIRES_ARM_NEON_FMA;
8728 for (uint32_t n = 9; n < 16; n++) {
8729 for (size_t k = 1; k <= 40; k += 9) {
8730 for (uint32_t m = 1; m <= 6; m++) {
8731 GemmMicrokernelTester()
8732 .mr(6)
8733 .nr(8)
8734 .kr(1)
8735 .sr(1)
8736 .m(m)
8737 .n(n)
8738 .k(k)
8739 .iterations(1)
8740 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8741 }
8742 }
8743 }
8744 }
8745
8746 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
8747 TEST_REQUIRES_ARM_NEON_FMA;
8748 for (uint32_t n = 16; n <= 24; n += 8) {
8749 for (size_t k = 1; k <= 40; k += 9) {
8750 GemmMicrokernelTester()
8751 .mr(6)
8752 .nr(8)
8753 .kr(1)
8754 .sr(1)
8755 .m(6)
8756 .n(8)
8757 .k(k)
8758 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8759 }
8760 }
8761 }
8762
8763 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
8764 TEST_REQUIRES_ARM_NEON_FMA;
8765 for (uint32_t n = 16; n <= 24; n += 8) {
8766 for (size_t k = 1; k <= 40; k += 9) {
8767 GemmMicrokernelTester()
8768 .mr(6)
8769 .nr(8)
8770 .kr(1)
8771 .sr(1)
8772 .m(6)
8773 .n(n)
8774 .k(k)
8775 .cn_stride(11)
8776 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8777 }
8778 }
8779 }
8780
8781 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
8782 TEST_REQUIRES_ARM_NEON_FMA;
8783 for (uint32_t n = 16; n <= 24; n += 8) {
8784 for (size_t k = 1; k <= 40; k += 9) {
8785 for (uint32_t m = 1; m <= 6; m++) {
8786 GemmMicrokernelTester()
8787 .mr(6)
8788 .nr(8)
8789 .kr(1)
8790 .sr(1)
8791 .m(m)
8792 .n(n)
8793 .k(k)
8794 .iterations(1)
8795 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8796 }
8797 }
8798 }
8799 }
8800
8801 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel) {
8802 TEST_REQUIRES_ARM_NEON_FMA;
8803 for (size_t k = 1; k <= 40; k += 9) {
8804 GemmMicrokernelTester()
8805 .mr(6)
8806 .nr(8)
8807 .kr(1)
8808 .sr(1)
8809 .m(6)
8810 .n(8)
8811 .k(k)
8812 .ks(3)
8813 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8814 }
8815 }
8816
8817 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, small_kernel_subtile) {
8818 TEST_REQUIRES_ARM_NEON_FMA;
8819 for (size_t k = 1; k <= 40; k += 9) {
8820 for (uint32_t m = 1; m <= 6; m++) {
8821 for (uint32_t n = 1; n <= 8; n++) {
8822 GemmMicrokernelTester()
8823 .mr(6)
8824 .nr(8)
8825 .kr(1)
8826 .sr(1)
8827 .m(m)
8828 .n(n)
8829 .k(k)
8830 .ks(3)
8831 .iterations(1)
8832 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8833 }
8834 }
8835 }
8836 }
8837
8838 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_small_kernel) {
8839 TEST_REQUIRES_ARM_NEON_FMA;
8840 for (uint32_t n = 9; n < 16; n++) {
8841 for (size_t k = 1; k <= 40; k += 9) {
8842 GemmMicrokernelTester()
8843 .mr(6)
8844 .nr(8)
8845 .kr(1)
8846 .sr(1)
8847 .m(6)
8848 .n(8)
8849 .k(k)
8850 .ks(3)
8851 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8852 }
8853 }
8854 }
8855
8856 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_small_kernel) {
8857 TEST_REQUIRES_ARM_NEON_FMA;
8858 for (uint32_t n = 16; n <= 24; n += 8) {
8859 for (size_t k = 1; k <= 40; k += 9) {
8860 GemmMicrokernelTester()
8861 .mr(6)
8862 .nr(8)
8863 .kr(1)
8864 .sr(1)
8865 .m(6)
8866 .n(8)
8867 .k(k)
8868 .ks(3)
8869 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8870 }
8871 }
8872 }
8873
8874 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
8875 TEST_REQUIRES_ARM_NEON_FMA;
8876 for (size_t k = 1; k <= 40; k += 9) {
8877 for (uint32_t m = 1; m <= 6; m++) {
8878 for (uint32_t n = 1; n <= 8; n++) {
8879 GemmMicrokernelTester()
8880 .mr(6)
8881 .nr(8)
8882 .kr(1)
8883 .sr(1)
8884 .m(m)
8885 .n(n)
8886 .k(k)
8887 .cm_stride(11)
8888 .iterations(1)
8889 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8890 }
8891 }
8892 }
8893 }
8894
8895 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, a_offset) {
8896 TEST_REQUIRES_ARM_NEON_FMA;
8897 for (size_t k = 1; k <= 40; k += 9) {
8898 GemmMicrokernelTester()
8899 .mr(6)
8900 .nr(8)
8901 .kr(1)
8902 .sr(1)
8903 .m(6)
8904 .n(8)
8905 .k(k)
8906 .ks(3)
8907 .a_offset(251)
8908 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8909 }
8910 }
8911
8912 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, zero) {
8913 TEST_REQUIRES_ARM_NEON_FMA;
8914 for (uint32_t mz = 0; mz < 6; mz++) {
8915 for (size_t k = 1; k <= 40; k += 9) {
8916 GemmMicrokernelTester()
8917 .mr(6)
8918 .nr(8)
8919 .kr(1)
8920 .sr(1)
8921 .m(6)
8922 .n(8)
8923 .k(k)
8924 .ks(3)
8925 .a_offset(251)
8926 .zero_index(mz)
8927 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8928 }
8929 }
8930 }
8931
8932 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
8933 TEST_REQUIRES_ARM_NEON_FMA;
8934 GemmMicrokernelTester()
8935 .mr(6)
8936 .nr(8)
8937 .kr(1)
8938 .sr(1)
8939 .m(6)
8940 .n(8)
8941 .k(8)
8942 .qmin(128)
8943 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8944 }
8945
8946 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
8947 TEST_REQUIRES_ARM_NEON_FMA;
8948 GemmMicrokernelTester()
8949 .mr(6)
8950 .nr(8)
8951 .kr(1)
8952 .sr(1)
8953 .m(6)
8954 .n(8)
8955 .k(8)
8956 .qmax(128)
8957 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8958 }
8959
8960 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
8961 TEST_REQUIRES_ARM_NEON_FMA;
8962 GemmMicrokernelTester()
8963 .mr(6)
8964 .nr(8)
8965 .kr(1)
8966 .sr(1)
8967 .m(6)
8968 .n(8)
8969 .k(8)
8970 .cm_stride(11)
8971 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
8972 }
8973#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8974
8975
8976#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
8977 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
8978 TEST_REQUIRES_ARM_NEON_FMA;
8979 GemmMicrokernelTester()
8980 .mr(6)
8981 .nr(8)
8982 .kr(1)
8983 .sr(1)
8984 .m(6)
8985 .n(8)
8986 .k(8)
8987 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
8988 }
8989
8990 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
8991 TEST_REQUIRES_ARM_NEON_FMA;
8992 GemmMicrokernelTester()
8993 .mr(6)
8994 .nr(8)
8995 .kr(1)
8996 .sr(1)
8997 .m(6)
8998 .n(8)
8999 .k(8)
9000 .cn_stride(11)
9001 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9002 }
9003
9004 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
9005 TEST_REQUIRES_ARM_NEON_FMA;
9006 for (uint32_t m = 1; m <= 6; m++) {
9007 for (uint32_t n = 1; n <= 8; n++) {
9008 GemmMicrokernelTester()
9009 .mr(6)
9010 .nr(8)
9011 .kr(1)
9012 .sr(1)
9013 .m(m)
9014 .n(n)
9015 .k(8)
9016 .iterations(1)
9017 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9018 }
9019 }
9020 }
9021
9022 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
9023 TEST_REQUIRES_ARM_NEON_FMA;
9024 for (uint32_t m = 1; m <= 6; m++) {
9025 GemmMicrokernelTester()
9026 .mr(6)
9027 .nr(8)
9028 .kr(1)
9029 .sr(1)
9030 .m(m)
9031 .n(8)
9032 .k(8)
9033 .iterations(1)
9034 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9035 }
9036 }
9037
9038 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
9039 TEST_REQUIRES_ARM_NEON_FMA;
9040 for (uint32_t n = 1; n <= 8; n++) {
9041 GemmMicrokernelTester()
9042 .mr(6)
9043 .nr(8)
9044 .kr(1)
9045 .sr(1)
9046 .m(6)
9047 .n(n)
9048 .k(8)
9049 .iterations(1)
9050 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9051 }
9052 }
9053
9054 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
9055 TEST_REQUIRES_ARM_NEON_FMA;
9056 GemmMicrokernelTester()
9057 .mr(6)
9058 .nr(8)
9059 .kr(1)
9060 .sr(1)
9061 .m(6)
9062 .n(8)
9063 .k(16)
9064 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9065 }
9066
9067 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
9068 TEST_REQUIRES_ARM_NEON_FMA;
9069 for (uint32_t m = 1; m <= 6; m++) {
9070 for (uint32_t n = 1; n <= 8; n++) {
9071 GemmMicrokernelTester()
9072 .mr(6)
9073 .nr(8)
9074 .kr(1)
9075 .sr(1)
9076 .m(m)
9077 .n(n)
9078 .k(16)
9079 .iterations(1)
9080 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9081 }
9082 }
9083 }
9084
9085 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
9086 TEST_REQUIRES_ARM_NEON_FMA;
9087 for (size_t k = 1; k < 16; k++) {
9088 GemmMicrokernelTester()
9089 .mr(6)
9090 .nr(8)
9091 .kr(1)
9092 .sr(1)
9093 .m(6)
9094 .n(8)
9095 .k(k)
9096 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9097 }
9098 }
9099
9100 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
9101 TEST_REQUIRES_ARM_NEON_FMA;
9102 for (size_t k = 1; k < 16; k++) {
9103 for (uint32_t m = 1; m <= 6; m++) {
9104 for (uint32_t n = 1; n <= 8; n++) {
9105 GemmMicrokernelTester()
9106 .mr(6)
9107 .nr(8)
9108 .kr(1)
9109 .sr(1)
9110 .m(m)
9111 .n(n)
9112 .k(k)
9113 .iterations(1)
9114 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9115 }
9116 }
9117 }
9118 }
9119
9120 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
9121 TEST_REQUIRES_ARM_NEON_FMA;
9122 for (size_t k = 17; k < 16; k++) {
9123 GemmMicrokernelTester()
9124 .mr(6)
9125 .nr(8)
9126 .kr(1)
9127 .sr(1)
9128 .m(6)
9129 .n(8)
9130 .k(k)
9131 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9132 }
9133 }
9134
9135 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
9136 TEST_REQUIRES_ARM_NEON_FMA;
9137 for (size_t k = 17; k < 16; k++) {
9138 for (uint32_t m = 1; m <= 6; m++) {
9139 for (uint32_t n = 1; n <= 8; n++) {
9140 GemmMicrokernelTester()
9141 .mr(6)
9142 .nr(8)
9143 .kr(1)
9144 .sr(1)
9145 .m(m)
9146 .n(n)
9147 .k(k)
9148 .iterations(1)
9149 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9150 }
9151 }
9152 }
9153 }
9154
9155 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
9156 TEST_REQUIRES_ARM_NEON_FMA;
9157 for (size_t k = 24; k <= 80; k += 8) {
9158 GemmMicrokernelTester()
9159 .mr(6)
9160 .nr(8)
9161 .kr(1)
9162 .sr(1)
9163 .m(6)
9164 .n(8)
9165 .k(k)
9166 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9167 }
9168 }
9169
9170 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
9171 TEST_REQUIRES_ARM_NEON_FMA;
9172 for (size_t k = 24; k <= 80; k += 8) {
9173 for (uint32_t m = 1; m <= 6; m++) {
9174 for (uint32_t n = 1; n <= 8; n++) {
9175 GemmMicrokernelTester()
9176 .mr(6)
9177 .nr(8)
9178 .kr(1)
9179 .sr(1)
9180 .m(m)
9181 .n(n)
9182 .k(k)
9183 .iterations(1)
9184 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9185 }
9186 }
9187 }
9188 }
9189
9190 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
9191 TEST_REQUIRES_ARM_NEON_FMA;
9192 for (uint32_t n = 9; n < 16; n++) {
9193 for (size_t k = 1; k <= 40; k += 9) {
9194 GemmMicrokernelTester()
9195 .mr(6)
9196 .nr(8)
9197 .kr(1)
9198 .sr(1)
9199 .m(6)
9200 .n(8)
9201 .k(k)
9202 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9203 }
9204 }
9205 }
9206
9207 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
9208 TEST_REQUIRES_ARM_NEON_FMA;
9209 for (uint32_t n = 9; n < 16; n++) {
9210 for (size_t k = 1; k <= 40; k += 9) {
9211 GemmMicrokernelTester()
9212 .mr(6)
9213 .nr(8)
9214 .kr(1)
9215 .sr(1)
9216 .m(6)
9217 .n(8)
9218 .k(k)
9219 .cn_stride(11)
9220 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9221 }
9222 }
9223 }
9224
9225 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
9226 TEST_REQUIRES_ARM_NEON_FMA;
9227 for (uint32_t n = 9; n < 16; n++) {
9228 for (size_t k = 1; k <= 40; k += 9) {
9229 for (uint32_t m = 1; m <= 6; m++) {
9230 GemmMicrokernelTester()
9231 .mr(6)
9232 .nr(8)
9233 .kr(1)
9234 .sr(1)
9235 .m(m)
9236 .n(n)
9237 .k(k)
9238 .iterations(1)
9239 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9240 }
9241 }
9242 }
9243 }
9244
9245 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
9246 TEST_REQUIRES_ARM_NEON_FMA;
9247 for (uint32_t n = 16; n <= 24; n += 8) {
9248 for (size_t k = 1; k <= 40; k += 9) {
9249 GemmMicrokernelTester()
9250 .mr(6)
9251 .nr(8)
9252 .kr(1)
9253 .sr(1)
9254 .m(6)
9255 .n(8)
9256 .k(k)
9257 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9258 }
9259 }
9260 }
9261
9262 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
9263 TEST_REQUIRES_ARM_NEON_FMA;
9264 for (uint32_t n = 16; n <= 24; n += 8) {
9265 for (size_t k = 1; k <= 40; k += 9) {
9266 GemmMicrokernelTester()
9267 .mr(6)
9268 .nr(8)
9269 .kr(1)
9270 .sr(1)
9271 .m(6)
9272 .n(n)
9273 .k(k)
9274 .cn_stride(11)
9275 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9276 }
9277 }
9278 }
9279
9280 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
9281 TEST_REQUIRES_ARM_NEON_FMA;
9282 for (uint32_t n = 16; n <= 24; n += 8) {
9283 for (size_t k = 1; k <= 40; k += 9) {
9284 for (uint32_t m = 1; m <= 6; m++) {
9285 GemmMicrokernelTester()
9286 .mr(6)
9287 .nr(8)
9288 .kr(1)
9289 .sr(1)
9290 .m(m)
9291 .n(n)
9292 .k(k)
9293 .iterations(1)
9294 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9295 }
9296 }
9297 }
9298 }
9299
9300 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel) {
9301 TEST_REQUIRES_ARM_NEON_FMA;
9302 for (size_t k = 1; k <= 40; k += 9) {
9303 GemmMicrokernelTester()
9304 .mr(6)
9305 .nr(8)
9306 .kr(1)
9307 .sr(1)
9308 .m(6)
9309 .n(8)
9310 .k(k)
9311 .ks(3)
9312 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9313 }
9314 }
9315
9316 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, small_kernel_subtile) {
9317 TEST_REQUIRES_ARM_NEON_FMA;
9318 for (size_t k = 1; k <= 40; k += 9) {
9319 for (uint32_t m = 1; m <= 6; m++) {
9320 for (uint32_t n = 1; n <= 8; n++) {
9321 GemmMicrokernelTester()
9322 .mr(6)
9323 .nr(8)
9324 .kr(1)
9325 .sr(1)
9326 .m(m)
9327 .n(n)
9328 .k(k)
9329 .ks(3)
9330 .iterations(1)
9331 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9332 }
9333 }
9334 }
9335 }
9336
9337 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_small_kernel) {
9338 TEST_REQUIRES_ARM_NEON_FMA;
9339 for (uint32_t n = 9; n < 16; n++) {
9340 for (size_t k = 1; k <= 40; k += 9) {
9341 GemmMicrokernelTester()
9342 .mr(6)
9343 .nr(8)
9344 .kr(1)
9345 .sr(1)
9346 .m(6)
9347 .n(8)
9348 .k(k)
9349 .ks(3)
9350 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9351 }
9352 }
9353 }
9354
9355 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_small_kernel) {
9356 TEST_REQUIRES_ARM_NEON_FMA;
9357 for (uint32_t n = 16; n <= 24; n += 8) {
9358 for (size_t k = 1; k <= 40; k += 9) {
9359 GemmMicrokernelTester()
9360 .mr(6)
9361 .nr(8)
9362 .kr(1)
9363 .sr(1)
9364 .m(6)
9365 .n(8)
9366 .k(k)
9367 .ks(3)
9368 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9369 }
9370 }
9371 }
9372
9373 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
9374 TEST_REQUIRES_ARM_NEON_FMA;
9375 for (size_t k = 1; k <= 40; k += 9) {
9376 for (uint32_t m = 1; m <= 6; m++) {
9377 for (uint32_t n = 1; n <= 8; n++) {
9378 GemmMicrokernelTester()
9379 .mr(6)
9380 .nr(8)
9381 .kr(1)
9382 .sr(1)
9383 .m(m)
9384 .n(n)
9385 .k(k)
9386 .cm_stride(11)
9387 .iterations(1)
9388 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9389 }
9390 }
9391 }
9392 }
9393
9394 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, a_offset) {
9395 TEST_REQUIRES_ARM_NEON_FMA;
9396 for (size_t k = 1; k <= 40; k += 9) {
9397 GemmMicrokernelTester()
9398 .mr(6)
9399 .nr(8)
9400 .kr(1)
9401 .sr(1)
9402 .m(6)
9403 .n(8)
9404 .k(k)
9405 .ks(3)
9406 .a_offset(251)
9407 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9408 }
9409 }
9410
9411 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, zero) {
9412 TEST_REQUIRES_ARM_NEON_FMA;
9413 for (uint32_t mz = 0; mz < 6; mz++) {
9414 for (size_t k = 1; k <= 40; k += 9) {
9415 GemmMicrokernelTester()
9416 .mr(6)
9417 .nr(8)
9418 .kr(1)
9419 .sr(1)
9420 .m(6)
9421 .n(8)
9422 .k(k)
9423 .ks(3)
9424 .a_offset(251)
9425 .zero_index(mz)
9426 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9427 }
9428 }
9429 }
9430
9431 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
9432 TEST_REQUIRES_ARM_NEON_FMA;
9433 GemmMicrokernelTester()
9434 .mr(6)
9435 .nr(8)
9436 .kr(1)
9437 .sr(1)
9438 .m(6)
9439 .n(8)
9440 .k(8)
9441 .qmin(128)
9442 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9443 }
9444
9445 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
9446 TEST_REQUIRES_ARM_NEON_FMA;
9447 GemmMicrokernelTester()
9448 .mr(6)
9449 .nr(8)
9450 .kr(1)
9451 .sr(1)
9452 .m(6)
9453 .n(8)
9454 .k(8)
9455 .qmax(128)
9456 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9457 }
9458
9459 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
9460 TEST_REQUIRES_ARM_NEON_FMA;
9461 GemmMicrokernelTester()
9462 .mr(6)
9463 .nr(8)
9464 .kr(1)
9465 .sr(1)
9466 .m(6)
9467 .n(8)
9468 .k(8)
9469 .cm_stride(11)
9470 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
9471 }
9472#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9473
9474
9475#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9476 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8) {
9477 TEST_REQUIRES_ARM_NEON_FMA;
9478 GemmMicrokernelTester()
9479 .mr(6)
9480 .nr(8)
9481 .kr(1)
9482 .sr(1)
9483 .m(6)
9484 .n(8)
9485 .k(8)
9486 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9487 }
9488
9489 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, strided_cn) {
9490 TEST_REQUIRES_ARM_NEON_FMA;
9491 GemmMicrokernelTester()
9492 .mr(6)
9493 .nr(8)
9494 .kr(1)
9495 .sr(1)
9496 .m(6)
9497 .n(8)
9498 .k(8)
9499 .cn_stride(11)
9500 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9501 }
9502
9503 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile) {
9504 TEST_REQUIRES_ARM_NEON_FMA;
9505 for (uint32_t m = 1; m <= 6; m++) {
9506 for (uint32_t n = 1; n <= 8; n++) {
9507 GemmMicrokernelTester()
9508 .mr(6)
9509 .nr(8)
9510 .kr(1)
9511 .sr(1)
9512 .m(m)
9513 .n(n)
9514 .k(8)
9515 .iterations(1)
9516 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9517 }
9518 }
9519 }
9520
9521 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_m) {
9522 TEST_REQUIRES_ARM_NEON_FMA;
9523 for (uint32_t m = 1; m <= 6; m++) {
9524 GemmMicrokernelTester()
9525 .mr(6)
9526 .nr(8)
9527 .kr(1)
9528 .sr(1)
9529 .m(m)
9530 .n(8)
9531 .k(8)
9532 .iterations(1)
9533 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9534 }
9535 }
9536
9537 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_8_subtile_n) {
9538 TEST_REQUIRES_ARM_NEON_FMA;
9539 for (uint32_t n = 1; n <= 8; n++) {
9540 GemmMicrokernelTester()
9541 .mr(6)
9542 .nr(8)
9543 .kr(1)
9544 .sr(1)
9545 .m(6)
9546 .n(n)
9547 .k(8)
9548 .iterations(1)
9549 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9550 }
9551 }
9552
9553 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16) {
9554 TEST_REQUIRES_ARM_NEON_FMA;
9555 GemmMicrokernelTester()
9556 .mr(6)
9557 .nr(8)
9558 .kr(1)
9559 .sr(1)
9560 .m(6)
9561 .n(8)
9562 .k(16)
9563 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9564 }
9565
9566 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_eq_16_subtile) {
9567 TEST_REQUIRES_ARM_NEON_FMA;
9568 for (uint32_t m = 1; m <= 6; m++) {
9569 for (uint32_t n = 1; n <= 8; n++) {
9570 GemmMicrokernelTester()
9571 .mr(6)
9572 .nr(8)
9573 .kr(1)
9574 .sr(1)
9575 .m(m)
9576 .n(n)
9577 .k(16)
9578 .iterations(1)
9579 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9580 }
9581 }
9582 }
9583
9584 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16) {
9585 TEST_REQUIRES_ARM_NEON_FMA;
9586 for (size_t k = 1; k < 16; k++) {
9587 GemmMicrokernelTester()
9588 .mr(6)
9589 .nr(8)
9590 .kr(1)
9591 .sr(1)
9592 .m(6)
9593 .n(8)
9594 .k(k)
9595 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9596 }
9597 }
9598
9599 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_lt_16_subtile) {
9600 TEST_REQUIRES_ARM_NEON_FMA;
9601 for (size_t k = 1; k < 16; k++) {
9602 for (uint32_t m = 1; m <= 6; m++) {
9603 for (uint32_t n = 1; n <= 8; n++) {
9604 GemmMicrokernelTester()
9605 .mr(6)
9606 .nr(8)
9607 .kr(1)
9608 .sr(1)
9609 .m(m)
9610 .n(n)
9611 .k(k)
9612 .iterations(1)
9613 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9614 }
9615 }
9616 }
9617 }
9618
9619 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_16) {
9620 TEST_REQUIRES_ARM_NEON_FMA;
9621 for (size_t k = 17; k < 16; k++) {
9622 GemmMicrokernelTester()
9623 .mr(6)
9624 .nr(8)
9625 .kr(1)
9626 .sr(1)
9627 .m(6)
9628 .n(8)
9629 .k(k)
9630 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9631 }
9632 }
9633
9634 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_gt_8_subtile) {
9635 TEST_REQUIRES_ARM_NEON_FMA;
9636 for (size_t k = 17; k < 16; k++) {
9637 for (uint32_t m = 1; m <= 6; m++) {
9638 for (uint32_t n = 1; n <= 8; n++) {
9639 GemmMicrokernelTester()
9640 .mr(6)
9641 .nr(8)
9642 .kr(1)
9643 .sr(1)
9644 .m(m)
9645 .n(n)
9646 .k(k)
9647 .iterations(1)
9648 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9649 }
9650 }
9651 }
9652 }
9653
9654 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8) {
9655 TEST_REQUIRES_ARM_NEON_FMA;
9656 for (size_t k = 24; k <= 80; k += 8) {
9657 GemmMicrokernelTester()
9658 .mr(6)
9659 .nr(8)
9660 .kr(1)
9661 .sr(1)
9662 .m(6)
9663 .n(8)
9664 .k(k)
9665 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9666 }
9667 }
9668
9669 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, k_div_8_subtile) {
9670 TEST_REQUIRES_ARM_NEON_FMA;
9671 for (size_t k = 24; k <= 80; k += 8) {
9672 for (uint32_t m = 1; m <= 6; m++) {
9673 for (uint32_t n = 1; n <= 8; n++) {
9674 GemmMicrokernelTester()
9675 .mr(6)
9676 .nr(8)
9677 .kr(1)
9678 .sr(1)
9679 .m(m)
9680 .n(n)
9681 .k(k)
9682 .iterations(1)
9683 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9684 }
9685 }
9686 }
9687 }
9688
9689 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8) {
9690 TEST_REQUIRES_ARM_NEON_FMA;
9691 for (uint32_t n = 9; n < 16; n++) {
9692 for (size_t k = 1; k <= 40; k += 9) {
9693 GemmMicrokernelTester()
9694 .mr(6)
9695 .nr(8)
9696 .kr(1)
9697 .sr(1)
9698 .m(6)
9699 .n(8)
9700 .k(k)
9701 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9702 }
9703 }
9704 }
9705
9706 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_strided_cn) {
9707 TEST_REQUIRES_ARM_NEON_FMA;
9708 for (uint32_t n = 9; n < 16; n++) {
9709 for (size_t k = 1; k <= 40; k += 9) {
9710 GemmMicrokernelTester()
9711 .mr(6)
9712 .nr(8)
9713 .kr(1)
9714 .sr(1)
9715 .m(6)
9716 .n(8)
9717 .k(k)
9718 .cn_stride(11)
9719 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9720 }
9721 }
9722 }
9723
9724 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_subtile) {
9725 TEST_REQUIRES_ARM_NEON_FMA;
9726 for (uint32_t n = 9; n < 16; n++) {
9727 for (size_t k = 1; k <= 40; k += 9) {
9728 for (uint32_t m = 1; m <= 6; m++) {
9729 GemmMicrokernelTester()
9730 .mr(6)
9731 .nr(8)
9732 .kr(1)
9733 .sr(1)
9734 .m(m)
9735 .n(n)
9736 .k(k)
9737 .iterations(1)
9738 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9739 }
9740 }
9741 }
9742 }
9743
9744 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8) {
9745 TEST_REQUIRES_ARM_NEON_FMA;
9746 for (uint32_t n = 16; n <= 24; n += 8) {
9747 for (size_t k = 1; k <= 40; k += 9) {
9748 GemmMicrokernelTester()
9749 .mr(6)
9750 .nr(8)
9751 .kr(1)
9752 .sr(1)
9753 .m(6)
9754 .n(8)
9755 .k(k)
9756 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9757 }
9758 }
9759 }
9760
9761 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_strided_cn) {
9762 TEST_REQUIRES_ARM_NEON_FMA;
9763 for (uint32_t n = 16; n <= 24; n += 8) {
9764 for (size_t k = 1; k <= 40; k += 9) {
9765 GemmMicrokernelTester()
9766 .mr(6)
9767 .nr(8)
9768 .kr(1)
9769 .sr(1)
9770 .m(6)
9771 .n(n)
9772 .k(k)
9773 .cn_stride(11)
9774 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9775 }
9776 }
9777 }
9778
9779 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_subtile) {
9780 TEST_REQUIRES_ARM_NEON_FMA;
9781 for (uint32_t n = 16; n <= 24; n += 8) {
9782 for (size_t k = 1; k <= 40; k += 9) {
9783 for (uint32_t m = 1; m <= 6; m++) {
9784 GemmMicrokernelTester()
9785 .mr(6)
9786 .nr(8)
9787 .kr(1)
9788 .sr(1)
9789 .m(m)
9790 .n(n)
9791 .k(k)
9792 .iterations(1)
9793 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9794 }
9795 }
9796 }
9797 }
9798
9799 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, small_kernel) {
9800 TEST_REQUIRES_ARM_NEON_FMA;
9801 for (size_t k = 1; k <= 40; k += 9) {
9802 GemmMicrokernelTester()
9803 .mr(6)
9804 .nr(8)
9805 .kr(1)
9806 .sr(1)
9807 .m(6)
9808 .n(8)
9809 .k(k)
9810 .ks(3)
9811 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9812 }
9813 }
9814
9815 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, small_kernel_subtile) {
9816 TEST_REQUIRES_ARM_NEON_FMA;
9817 for (size_t k = 1; k <= 40; k += 9) {
9818 for (uint32_t m = 1; m <= 6; m++) {
9819 for (uint32_t n = 1; n <= 8; n++) {
9820 GemmMicrokernelTester()
9821 .mr(6)
9822 .nr(8)
9823 .kr(1)
9824 .sr(1)
9825 .m(m)
9826 .n(n)
9827 .k(k)
9828 .ks(3)
9829 .iterations(1)
9830 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9831 }
9832 }
9833 }
9834 }
9835
9836 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_gt_8_small_kernel) {
9837 TEST_REQUIRES_ARM_NEON_FMA;
9838 for (uint32_t n = 9; n < 16; n++) {
9839 for (size_t k = 1; k <= 40; k += 9) {
9840 GemmMicrokernelTester()
9841 .mr(6)
9842 .nr(8)
9843 .kr(1)
9844 .sr(1)
9845 .m(6)
9846 .n(8)
9847 .k(k)
9848 .ks(3)
9849 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9850 }
9851 }
9852 }
9853
9854 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, n_div_8_small_kernel) {
9855 TEST_REQUIRES_ARM_NEON_FMA;
9856 for (uint32_t n = 16; n <= 24; n += 8) {
9857 for (size_t k = 1; k <= 40; k += 9) {
9858 GemmMicrokernelTester()
9859 .mr(6)
9860 .nr(8)
9861 .kr(1)
9862 .sr(1)
9863 .m(6)
9864 .n(8)
9865 .k(k)
9866 .ks(3)
9867 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9868 }
9869 }
9870 }
9871
9872 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm_subtile) {
9873 TEST_REQUIRES_ARM_NEON_FMA;
9874 for (size_t k = 1; k <= 40; k += 9) {
9875 for (uint32_t m = 1; m <= 6; m++) {
9876 for (uint32_t n = 1; n <= 8; n++) {
9877 GemmMicrokernelTester()
9878 .mr(6)
9879 .nr(8)
9880 .kr(1)
9881 .sr(1)
9882 .m(m)
9883 .n(n)
9884 .k(k)
9885 .cm_stride(11)
9886 .iterations(1)
9887 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9888 }
9889 }
9890 }
9891 }
9892
9893 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, a_offset) {
9894 TEST_REQUIRES_ARM_NEON_FMA;
9895 for (size_t k = 1; k <= 40; k += 9) {
9896 GemmMicrokernelTester()
9897 .mr(6)
9898 .nr(8)
9899 .kr(1)
9900 .sr(1)
9901 .m(6)
9902 .n(8)
9903 .k(k)
9904 .ks(3)
9905 .a_offset(251)
9906 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9907 }
9908 }
9909
9910 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, zero) {
9911 TEST_REQUIRES_ARM_NEON_FMA;
9912 for (uint32_t mz = 0; mz < 6; mz++) {
9913 for (size_t k = 1; k <= 40; k += 9) {
9914 GemmMicrokernelTester()
9915 .mr(6)
9916 .nr(8)
9917 .kr(1)
9918 .sr(1)
9919 .m(6)
9920 .n(8)
9921 .k(k)
9922 .ks(3)
9923 .a_offset(251)
9924 .zero_index(mz)
9925 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9926 }
9927 }
9928 }
9929
9930 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, qmin) {
9931 TEST_REQUIRES_ARM_NEON_FMA;
9932 GemmMicrokernelTester()
9933 .mr(6)
9934 .nr(8)
9935 .kr(1)
9936 .sr(1)
9937 .m(6)
9938 .n(8)
9939 .k(8)
9940 .qmin(128)
9941 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9942 }
9943
9944 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, qmax) {
9945 TEST_REQUIRES_ARM_NEON_FMA;
9946 GemmMicrokernelTester()
9947 .mr(6)
9948 .nr(8)
9949 .kr(1)
9950 .sr(1)
9951 .m(6)
9952 .n(8)
9953 .k(8)
9954 .qmax(128)
9955 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9956 }
9957
9958 TEST(F32_IGEMM_6X8__AARCH64_NEONFMA_IOS, strided_cm) {
9959 TEST_REQUIRES_ARM_NEON_FMA;
9960 GemmMicrokernelTester()
9961 .mr(6)
9962 .nr(8)
9963 .kr(1)
9964 .sr(1)
9965 .m(6)
9966 .n(8)
9967 .k(8)
9968 .cm_stride(11)
9969 .Test(xnn_f32_igemm_ukernel_6x8__aarch64_neonfma_ios);
9970 }
9971#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9972
9973
9974#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
9975 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
9976 TEST_REQUIRES_ARM_NEON_FMA;
9977 GemmMicrokernelTester()
9978 .mr(1)
9979 .nr(12)
9980 .kr(1)
9981 .sr(1)
9982 .m(1)
9983 .n(12)
9984 .k(4)
9985 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
9986 }
9987
9988 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
9989 TEST_REQUIRES_ARM_NEON_FMA;
9990 GemmMicrokernelTester()
9991 .mr(1)
9992 .nr(12)
9993 .kr(1)
9994 .sr(1)
9995 .m(1)
9996 .n(12)
9997 .k(4)
9998 .cn_stride(17)
9999 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10000 }
10001
10002 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
10003 TEST_REQUIRES_ARM_NEON_FMA;
10004 for (uint32_t m = 1; m <= 1; m++) {
10005 for (uint32_t n = 1; n <= 12; n++) {
10006 GemmMicrokernelTester()
10007 .mr(1)
10008 .nr(12)
10009 .kr(1)
10010 .sr(1)
10011 .m(m)
10012 .n(n)
10013 .k(4)
10014 .iterations(1)
10015 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10016 }
10017 }
10018 }
10019
10020 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
10021 TEST_REQUIRES_ARM_NEON_FMA;
10022 for (uint32_t m = 1; m <= 1; m++) {
10023 GemmMicrokernelTester()
10024 .mr(1)
10025 .nr(12)
10026 .kr(1)
10027 .sr(1)
10028 .m(m)
10029 .n(12)
10030 .k(4)
10031 .iterations(1)
10032 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10033 }
10034 }
10035
10036 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
10037 TEST_REQUIRES_ARM_NEON_FMA;
10038 for (uint32_t n = 1; n <= 12; n++) {
10039 GemmMicrokernelTester()
10040 .mr(1)
10041 .nr(12)
10042 .kr(1)
10043 .sr(1)
10044 .m(1)
10045 .n(n)
10046 .k(4)
10047 .iterations(1)
10048 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10049 }
10050 }
10051
10052 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
10053 TEST_REQUIRES_ARM_NEON_FMA;
10054 GemmMicrokernelTester()
10055 .mr(1)
10056 .nr(12)
10057 .kr(1)
10058 .sr(1)
10059 .m(1)
10060 .n(12)
10061 .k(8)
10062 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10063 }
10064
10065 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
10066 TEST_REQUIRES_ARM_NEON_FMA;
10067 for (uint32_t m = 1; m <= 1; m++) {
10068 for (uint32_t n = 1; n <= 12; n++) {
10069 GemmMicrokernelTester()
10070 .mr(1)
10071 .nr(12)
10072 .kr(1)
10073 .sr(1)
10074 .m(m)
10075 .n(n)
10076 .k(8)
10077 .iterations(1)
10078 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10079 }
10080 }
10081 }
10082
10083 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
10084 TEST_REQUIRES_ARM_NEON_FMA;
10085 for (size_t k = 1; k < 8; k++) {
10086 GemmMicrokernelTester()
10087 .mr(1)
10088 .nr(12)
10089 .kr(1)
10090 .sr(1)
10091 .m(1)
10092 .n(12)
10093 .k(k)
10094 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10095 }
10096 }
10097
10098 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
10099 TEST_REQUIRES_ARM_NEON_FMA;
10100 for (size_t k = 1; k < 8; k++) {
10101 for (uint32_t m = 1; m <= 1; m++) {
10102 for (uint32_t n = 1; n <= 12; n++) {
10103 GemmMicrokernelTester()
10104 .mr(1)
10105 .nr(12)
10106 .kr(1)
10107 .sr(1)
10108 .m(m)
10109 .n(n)
10110 .k(k)
10111 .iterations(1)
10112 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10113 }
10114 }
10115 }
10116 }
10117
10118 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
10119 TEST_REQUIRES_ARM_NEON_FMA;
10120 for (size_t k = 9; k < 8; k++) {
10121 GemmMicrokernelTester()
10122 .mr(1)
10123 .nr(12)
10124 .kr(1)
10125 .sr(1)
10126 .m(1)
10127 .n(12)
10128 .k(k)
10129 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10130 }
10131 }
10132
10133 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
10134 TEST_REQUIRES_ARM_NEON_FMA;
10135 for (size_t k = 9; k < 8; k++) {
10136 for (uint32_t m = 1; m <= 1; m++) {
10137 for (uint32_t n = 1; n <= 12; n++) {
10138 GemmMicrokernelTester()
10139 .mr(1)
10140 .nr(12)
10141 .kr(1)
10142 .sr(1)
10143 .m(m)
10144 .n(n)
10145 .k(k)
10146 .iterations(1)
10147 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10148 }
10149 }
10150 }
10151 }
10152
10153 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
10154 TEST_REQUIRES_ARM_NEON_FMA;
10155 for (size_t k = 12; k <= 40; k += 4) {
10156 GemmMicrokernelTester()
10157 .mr(1)
10158 .nr(12)
10159 .kr(1)
10160 .sr(1)
10161 .m(1)
10162 .n(12)
10163 .k(k)
10164 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10165 }
10166 }
10167
10168 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
10169 TEST_REQUIRES_ARM_NEON_FMA;
10170 for (size_t k = 12; k <= 40; k += 4) {
10171 for (uint32_t m = 1; m <= 1; m++) {
10172 for (uint32_t n = 1; n <= 12; n++) {
10173 GemmMicrokernelTester()
10174 .mr(1)
10175 .nr(12)
10176 .kr(1)
10177 .sr(1)
10178 .m(m)
10179 .n(n)
10180 .k(k)
10181 .iterations(1)
10182 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10183 }
10184 }
10185 }
10186 }
10187
10188 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
10189 TEST_REQUIRES_ARM_NEON_FMA;
10190 for (uint32_t n = 13; n < 24; n++) {
10191 for (size_t k = 1; k <= 20; k += 5) {
10192 GemmMicrokernelTester()
10193 .mr(1)
10194 .nr(12)
10195 .kr(1)
10196 .sr(1)
10197 .m(1)
10198 .n(12)
10199 .k(k)
10200 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10201 }
10202 }
10203 }
10204
10205 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
10206 TEST_REQUIRES_ARM_NEON_FMA;
10207 for (uint32_t n = 13; n < 24; n++) {
10208 for (size_t k = 1; k <= 20; k += 5) {
10209 GemmMicrokernelTester()
10210 .mr(1)
10211 .nr(12)
10212 .kr(1)
10213 .sr(1)
10214 .m(1)
10215 .n(12)
10216 .k(k)
10217 .cn_stride(17)
10218 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10219 }
10220 }
10221 }
10222
10223 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
10224 TEST_REQUIRES_ARM_NEON_FMA;
10225 for (uint32_t n = 13; n < 24; n++) {
10226 for (size_t k = 1; k <= 20; k += 5) {
10227 for (uint32_t m = 1; m <= 1; m++) {
10228 GemmMicrokernelTester()
10229 .mr(1)
10230 .nr(12)
10231 .kr(1)
10232 .sr(1)
10233 .m(m)
10234 .n(n)
10235 .k(k)
10236 .iterations(1)
10237 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10238 }
10239 }
10240 }
10241 }
10242
10243 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
10244 TEST_REQUIRES_ARM_NEON_FMA;
10245 for (uint32_t n = 24; n <= 36; n += 12) {
10246 for (size_t k = 1; k <= 20; k += 5) {
10247 GemmMicrokernelTester()
10248 .mr(1)
10249 .nr(12)
10250 .kr(1)
10251 .sr(1)
10252 .m(1)
10253 .n(12)
10254 .k(k)
10255 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10256 }
10257 }
10258 }
10259
10260 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
10261 TEST_REQUIRES_ARM_NEON_FMA;
10262 for (uint32_t n = 24; n <= 36; n += 12) {
10263 for (size_t k = 1; k <= 20; k += 5) {
10264 GemmMicrokernelTester()
10265 .mr(1)
10266 .nr(12)
10267 .kr(1)
10268 .sr(1)
10269 .m(1)
10270 .n(n)
10271 .k(k)
10272 .cn_stride(17)
10273 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10274 }
10275 }
10276 }
10277
10278 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
10279 TEST_REQUIRES_ARM_NEON_FMA;
10280 for (uint32_t n = 24; n <= 36; n += 12) {
10281 for (size_t k = 1; k <= 20; k += 5) {
10282 for (uint32_t m = 1; m <= 1; m++) {
10283 GemmMicrokernelTester()
10284 .mr(1)
10285 .nr(12)
10286 .kr(1)
10287 .sr(1)
10288 .m(m)
10289 .n(n)
10290 .k(k)
10291 .iterations(1)
10292 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10293 }
10294 }
10295 }
10296 }
10297
10298 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
10299 TEST_REQUIRES_ARM_NEON_FMA;
10300 for (size_t k = 1; k <= 20; k += 5) {
10301 GemmMicrokernelTester()
10302 .mr(1)
10303 .nr(12)
10304 .kr(1)
10305 .sr(1)
10306 .m(1)
10307 .n(12)
10308 .k(k)
10309 .ks(3)
10310 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10311 }
10312 }
10313
10314 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
10315 TEST_REQUIRES_ARM_NEON_FMA;
10316 for (size_t k = 1; k <= 20; k += 5) {
10317 for (uint32_t m = 1; m <= 1; m++) {
10318 for (uint32_t n = 1; n <= 12; n++) {
10319 GemmMicrokernelTester()
10320 .mr(1)
10321 .nr(12)
10322 .kr(1)
10323 .sr(1)
10324 .m(m)
10325 .n(n)
10326 .k(k)
10327 .ks(3)
10328 .iterations(1)
10329 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10330 }
10331 }
10332 }
10333 }
10334
10335 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_small_kernel) {
10336 TEST_REQUIRES_ARM_NEON_FMA;
10337 for (uint32_t n = 13; n < 24; n++) {
10338 for (size_t k = 1; k <= 20; k += 5) {
10339 GemmMicrokernelTester()
10340 .mr(1)
10341 .nr(12)
10342 .kr(1)
10343 .sr(1)
10344 .m(1)
10345 .n(12)
10346 .k(k)
10347 .ks(3)
10348 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10349 }
10350 }
10351 }
10352
10353 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_small_kernel) {
10354 TEST_REQUIRES_ARM_NEON_FMA;
10355 for (uint32_t n = 24; n <= 36; n += 12) {
10356 for (size_t k = 1; k <= 20; k += 5) {
10357 GemmMicrokernelTester()
10358 .mr(1)
10359 .nr(12)
10360 .kr(1)
10361 .sr(1)
10362 .m(1)
10363 .n(12)
10364 .k(k)
10365 .ks(3)
10366 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10367 }
10368 }
10369 }
10370
10371 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
10372 TEST_REQUIRES_ARM_NEON_FMA;
10373 for (size_t k = 1; k <= 20; k += 5) {
10374 for (uint32_t m = 1; m <= 1; m++) {
10375 for (uint32_t n = 1; n <= 12; n++) {
10376 GemmMicrokernelTester()
10377 .mr(1)
10378 .nr(12)
10379 .kr(1)
10380 .sr(1)
10381 .m(m)
10382 .n(n)
10383 .k(k)
10384 .cm_stride(17)
10385 .iterations(1)
10386 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10387 }
10388 }
10389 }
10390 }
10391
10392 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
10393 TEST_REQUIRES_ARM_NEON_FMA;
10394 for (size_t k = 1; k <= 20; k += 5) {
10395 GemmMicrokernelTester()
10396 .mr(1)
10397 .nr(12)
10398 .kr(1)
10399 .sr(1)
10400 .m(1)
10401 .n(12)
10402 .k(k)
10403 .ks(3)
10404 .a_offset(23)
10405 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10406 }
10407 }
10408
10409 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, zero) {
10410 TEST_REQUIRES_ARM_NEON_FMA;
10411 for (uint32_t mz = 0; mz < 1; mz++) {
10412 for (size_t k = 1; k <= 20; k += 5) {
10413 GemmMicrokernelTester()
10414 .mr(1)
10415 .nr(12)
10416 .kr(1)
10417 .sr(1)
10418 .m(1)
10419 .n(12)
10420 .k(k)
10421 .ks(3)
10422 .a_offset(23)
10423 .zero_index(mz)
10424 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10425 }
10426 }
10427 }
10428
10429 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
10430 TEST_REQUIRES_ARM_NEON_FMA;
10431 GemmMicrokernelTester()
10432 .mr(1)
10433 .nr(12)
10434 .kr(1)
10435 .sr(1)
10436 .m(1)
10437 .n(12)
10438 .k(4)
10439 .qmin(128)
10440 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10441 }
10442
10443 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
10444 TEST_REQUIRES_ARM_NEON_FMA;
10445 GemmMicrokernelTester()
10446 .mr(1)
10447 .nr(12)
10448 .kr(1)
10449 .sr(1)
10450 .m(1)
10451 .n(12)
10452 .k(4)
10453 .qmax(128)
10454 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10455 }
10456
10457 TEST(F32_IGEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
10458 TEST_REQUIRES_ARM_NEON_FMA;
10459 GemmMicrokernelTester()
10460 .mr(1)
10461 .nr(12)
10462 .kr(1)
10463 .sr(1)
10464 .m(1)
10465 .n(12)
10466 .k(4)
10467 .cm_stride(17)
10468 .Test(xnn_f32_igemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
10469 }
10470#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
10471
10472
10473#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
10474 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
10475 TEST_REQUIRES_ARM_NEON_FMA;
10476 GemmMicrokernelTester()
10477 .mr(4)
10478 .nr(12)
10479 .kr(1)
10480 .sr(1)
10481 .m(4)
10482 .n(12)
10483 .k(4)
10484 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10485 }
10486
10487 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
10488 TEST_REQUIRES_ARM_NEON_FMA;
10489 GemmMicrokernelTester()
10490 .mr(4)
10491 .nr(12)
10492 .kr(1)
10493 .sr(1)
10494 .m(4)
10495 .n(12)
10496 .k(4)
10497 .cn_stride(17)
10498 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10499 }
10500
10501 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
10502 TEST_REQUIRES_ARM_NEON_FMA;
10503 for (uint32_t m = 1; m <= 4; m++) {
10504 for (uint32_t n = 1; n <= 12; n++) {
10505 GemmMicrokernelTester()
10506 .mr(4)
10507 .nr(12)
10508 .kr(1)
10509 .sr(1)
10510 .m(m)
10511 .n(n)
10512 .k(4)
10513 .iterations(1)
10514 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10515 }
10516 }
10517 }
10518
10519 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
10520 TEST_REQUIRES_ARM_NEON_FMA;
10521 for (uint32_t m = 1; m <= 4; m++) {
10522 GemmMicrokernelTester()
10523 .mr(4)
10524 .nr(12)
10525 .kr(1)
10526 .sr(1)
10527 .m(m)
10528 .n(12)
10529 .k(4)
10530 .iterations(1)
10531 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10532 }
10533 }
10534
10535 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
10536 TEST_REQUIRES_ARM_NEON_FMA;
10537 for (uint32_t n = 1; n <= 12; n++) {
10538 GemmMicrokernelTester()
10539 .mr(4)
10540 .nr(12)
10541 .kr(1)
10542 .sr(1)
10543 .m(4)
10544 .n(n)
10545 .k(4)
10546 .iterations(1)
10547 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10548 }
10549 }
10550
10551 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
10552 TEST_REQUIRES_ARM_NEON_FMA;
10553 GemmMicrokernelTester()
10554 .mr(4)
10555 .nr(12)
10556 .kr(1)
10557 .sr(1)
10558 .m(4)
10559 .n(12)
10560 .k(8)
10561 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10562 }
10563
10564 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
10565 TEST_REQUIRES_ARM_NEON_FMA;
10566 for (uint32_t m = 1; m <= 4; m++) {
10567 for (uint32_t n = 1; n <= 12; n++) {
10568 GemmMicrokernelTester()
10569 .mr(4)
10570 .nr(12)
10571 .kr(1)
10572 .sr(1)
10573 .m(m)
10574 .n(n)
10575 .k(8)
10576 .iterations(1)
10577 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10578 }
10579 }
10580 }
10581
10582 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
10583 TEST_REQUIRES_ARM_NEON_FMA;
10584 for (size_t k = 1; k < 8; k++) {
10585 GemmMicrokernelTester()
10586 .mr(4)
10587 .nr(12)
10588 .kr(1)
10589 .sr(1)
10590 .m(4)
10591 .n(12)
10592 .k(k)
10593 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10594 }
10595 }
10596
10597 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
10598 TEST_REQUIRES_ARM_NEON_FMA;
10599 for (size_t k = 1; k < 8; k++) {
10600 for (uint32_t m = 1; m <= 4; m++) {
10601 for (uint32_t n = 1; n <= 12; n++) {
10602 GemmMicrokernelTester()
10603 .mr(4)
10604 .nr(12)
10605 .kr(1)
10606 .sr(1)
10607 .m(m)
10608 .n(n)
10609 .k(k)
10610 .iterations(1)
10611 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10612 }
10613 }
10614 }
10615 }
10616
10617 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
10618 TEST_REQUIRES_ARM_NEON_FMA;
10619 for (size_t k = 9; k < 8; k++) {
10620 GemmMicrokernelTester()
10621 .mr(4)
10622 .nr(12)
10623 .kr(1)
10624 .sr(1)
10625 .m(4)
10626 .n(12)
10627 .k(k)
10628 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10629 }
10630 }
10631
10632 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
10633 TEST_REQUIRES_ARM_NEON_FMA;
10634 for (size_t k = 9; k < 8; k++) {
10635 for (uint32_t m = 1; m <= 4; m++) {
10636 for (uint32_t n = 1; n <= 12; n++) {
10637 GemmMicrokernelTester()
10638 .mr(4)
10639 .nr(12)
10640 .kr(1)
10641 .sr(1)
10642 .m(m)
10643 .n(n)
10644 .k(k)
10645 .iterations(1)
10646 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10647 }
10648 }
10649 }
10650 }
10651
10652 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
10653 TEST_REQUIRES_ARM_NEON_FMA;
10654 for (size_t k = 12; k <= 40; k += 4) {
10655 GemmMicrokernelTester()
10656 .mr(4)
10657 .nr(12)
10658 .kr(1)
10659 .sr(1)
10660 .m(4)
10661 .n(12)
10662 .k(k)
10663 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10664 }
10665 }
10666
10667 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
10668 TEST_REQUIRES_ARM_NEON_FMA;
10669 for (size_t k = 12; k <= 40; k += 4) {
10670 for (uint32_t m = 1; m <= 4; m++) {
10671 for (uint32_t n = 1; n <= 12; n++) {
10672 GemmMicrokernelTester()
10673 .mr(4)
10674 .nr(12)
10675 .kr(1)
10676 .sr(1)
10677 .m(m)
10678 .n(n)
10679 .k(k)
10680 .iterations(1)
10681 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10682 }
10683 }
10684 }
10685 }
10686
10687 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
10688 TEST_REQUIRES_ARM_NEON_FMA;
10689 for (uint32_t n = 13; n < 24; n++) {
10690 for (size_t k = 1; k <= 20; k += 5) {
10691 GemmMicrokernelTester()
10692 .mr(4)
10693 .nr(12)
10694 .kr(1)
10695 .sr(1)
10696 .m(4)
10697 .n(12)
10698 .k(k)
10699 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10700 }
10701 }
10702 }
10703
10704 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
10705 TEST_REQUIRES_ARM_NEON_FMA;
10706 for (uint32_t n = 13; n < 24; n++) {
10707 for (size_t k = 1; k <= 20; k += 5) {
10708 GemmMicrokernelTester()
10709 .mr(4)
10710 .nr(12)
10711 .kr(1)
10712 .sr(1)
10713 .m(4)
10714 .n(12)
10715 .k(k)
10716 .cn_stride(17)
10717 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10718 }
10719 }
10720 }
10721
10722 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
10723 TEST_REQUIRES_ARM_NEON_FMA;
10724 for (uint32_t n = 13; n < 24; n++) {
10725 for (size_t k = 1; k <= 20; k += 5) {
10726 for (uint32_t m = 1; m <= 4; m++) {
10727 GemmMicrokernelTester()
10728 .mr(4)
10729 .nr(12)
10730 .kr(1)
10731 .sr(1)
10732 .m(m)
10733 .n(n)
10734 .k(k)
10735 .iterations(1)
10736 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10737 }
10738 }
10739 }
10740 }
10741
10742 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
10743 TEST_REQUIRES_ARM_NEON_FMA;
10744 for (uint32_t n = 24; n <= 36; n += 12) {
10745 for (size_t k = 1; k <= 20; k += 5) {
10746 GemmMicrokernelTester()
10747 .mr(4)
10748 .nr(12)
10749 .kr(1)
10750 .sr(1)
10751 .m(4)
10752 .n(12)
10753 .k(k)
10754 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10755 }
10756 }
10757 }
10758
10759 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
10760 TEST_REQUIRES_ARM_NEON_FMA;
10761 for (uint32_t n = 24; n <= 36; n += 12) {
10762 for (size_t k = 1; k <= 20; k += 5) {
10763 GemmMicrokernelTester()
10764 .mr(4)
10765 .nr(12)
10766 .kr(1)
10767 .sr(1)
10768 .m(4)
10769 .n(n)
10770 .k(k)
10771 .cn_stride(17)
10772 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10773 }
10774 }
10775 }
10776
10777 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
10778 TEST_REQUIRES_ARM_NEON_FMA;
10779 for (uint32_t n = 24; n <= 36; n += 12) {
10780 for (size_t k = 1; k <= 20; k += 5) {
10781 for (uint32_t m = 1; m <= 4; m++) {
10782 GemmMicrokernelTester()
10783 .mr(4)
10784 .nr(12)
10785 .kr(1)
10786 .sr(1)
10787 .m(m)
10788 .n(n)
10789 .k(k)
10790 .iterations(1)
10791 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10792 }
10793 }
10794 }
10795 }
10796
10797 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel) {
10798 TEST_REQUIRES_ARM_NEON_FMA;
10799 for (size_t k = 1; k <= 20; k += 5) {
10800 GemmMicrokernelTester()
10801 .mr(4)
10802 .nr(12)
10803 .kr(1)
10804 .sr(1)
10805 .m(4)
10806 .n(12)
10807 .k(k)
10808 .ks(3)
10809 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10810 }
10811 }
10812
10813 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, small_kernel_subtile) {
10814 TEST_REQUIRES_ARM_NEON_FMA;
10815 for (size_t k = 1; k <= 20; k += 5) {
10816 for (uint32_t m = 1; m <= 4; m++) {
10817 for (uint32_t n = 1; n <= 12; n++) {
10818 GemmMicrokernelTester()
10819 .mr(4)
10820 .nr(12)
10821 .kr(1)
10822 .sr(1)
10823 .m(m)
10824 .n(n)
10825 .k(k)
10826 .ks(3)
10827 .iterations(1)
10828 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10829 }
10830 }
10831 }
10832 }
10833
10834 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_small_kernel) {
10835 TEST_REQUIRES_ARM_NEON_FMA;
10836 for (uint32_t n = 13; n < 24; n++) {
10837 for (size_t k = 1; k <= 20; k += 5) {
10838 GemmMicrokernelTester()
10839 .mr(4)
10840 .nr(12)
10841 .kr(1)
10842 .sr(1)
10843 .m(4)
10844 .n(12)
10845 .k(k)
10846 .ks(3)
10847 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10848 }
10849 }
10850 }
10851
10852 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_small_kernel) {
10853 TEST_REQUIRES_ARM_NEON_FMA;
10854 for (uint32_t n = 24; n <= 36; n += 12) {
10855 for (size_t k = 1; k <= 20; k += 5) {
10856 GemmMicrokernelTester()
10857 .mr(4)
10858 .nr(12)
10859 .kr(1)
10860 .sr(1)
10861 .m(4)
10862 .n(12)
10863 .k(k)
10864 .ks(3)
10865 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10866 }
10867 }
10868 }
10869
10870 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
10871 TEST_REQUIRES_ARM_NEON_FMA;
10872 for (size_t k = 1; k <= 20; k += 5) {
10873 for (uint32_t m = 1; m <= 4; m++) {
10874 for (uint32_t n = 1; n <= 12; n++) {
10875 GemmMicrokernelTester()
10876 .mr(4)
10877 .nr(12)
10878 .kr(1)
10879 .sr(1)
10880 .m(m)
10881 .n(n)
10882 .k(k)
10883 .cm_stride(17)
10884 .iterations(1)
10885 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10886 }
10887 }
10888 }
10889 }
10890
10891 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, a_offset) {
10892 TEST_REQUIRES_ARM_NEON_FMA;
10893 for (size_t k = 1; k <= 20; k += 5) {
10894 GemmMicrokernelTester()
10895 .mr(4)
10896 .nr(12)
10897 .kr(1)
10898 .sr(1)
10899 .m(4)
10900 .n(12)
10901 .k(k)
10902 .ks(3)
10903 .a_offset(83)
10904 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10905 }
10906 }
10907
10908 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, zero) {
10909 TEST_REQUIRES_ARM_NEON_FMA;
10910 for (uint32_t mz = 0; mz < 4; mz++) {
10911 for (size_t k = 1; k <= 20; k += 5) {
10912 GemmMicrokernelTester()
10913 .mr(4)
10914 .nr(12)
10915 .kr(1)
10916 .sr(1)
10917 .m(4)
10918 .n(12)
10919 .k(k)
10920 .ks(3)
10921 .a_offset(83)
10922 .zero_index(mz)
10923 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10924 }
10925 }
10926 }
10927
10928 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
10929 TEST_REQUIRES_ARM_NEON_FMA;
10930 GemmMicrokernelTester()
10931 .mr(4)
10932 .nr(12)
10933 .kr(1)
10934 .sr(1)
10935 .m(4)
10936 .n(12)
10937 .k(4)
10938 .qmin(128)
10939 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10940 }
10941
10942 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
10943 TEST_REQUIRES_ARM_NEON_FMA;
10944 GemmMicrokernelTester()
10945 .mr(4)
10946 .nr(12)
10947 .kr(1)
10948 .sr(1)
10949 .m(4)
10950 .n(12)
10951 .k(4)
10952 .qmax(128)
10953 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10954 }
10955
10956 TEST(F32_IGEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
10957 TEST_REQUIRES_ARM_NEON_FMA;
10958 GemmMicrokernelTester()
10959 .mr(4)
10960 .nr(12)
10961 .kr(1)
10962 .sr(1)
10963 .m(4)
10964 .n(12)
10965 .k(4)
10966 .cm_stride(17)
10967 .Test(xnn_f32_igemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
10968 }
10969#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
10970
10971
10972#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10973 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2) {
10974 TEST_REQUIRES_ARM_NEON;
10975 GemmMicrokernelTester()
10976 .mr(1)
10977 .nr(8)
10978 .kr(1)
10979 .sr(1)
10980 .m(1)
10981 .n(8)
10982 .k(2)
10983 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
10984 }
10985
10986 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cn) {
10987 TEST_REQUIRES_ARM_NEON;
10988 GemmMicrokernelTester()
10989 .mr(1)
10990 .nr(8)
10991 .kr(1)
10992 .sr(1)
10993 .m(1)
10994 .n(8)
10995 .k(2)
10996 .cn_stride(11)
10997 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
10998 }
10999
11000 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
11001 TEST_REQUIRES_ARM_NEON;
11002 for (uint32_t m = 1; m <= 1; m++) {
11003 for (uint32_t n = 1; n <= 8; n++) {
11004 GemmMicrokernelTester()
11005 .mr(1)
11006 .nr(8)
11007 .kr(1)
11008 .sr(1)
11009 .m(m)
11010 .n(n)
11011 .k(2)
11012 .iterations(1)
11013 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11014 }
11015 }
11016 }
11017
11018 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
11019 TEST_REQUIRES_ARM_NEON;
11020 for (uint32_t m = 1; m <= 1; m++) {
11021 GemmMicrokernelTester()
11022 .mr(1)
11023 .nr(8)
11024 .kr(1)
11025 .sr(1)
11026 .m(m)
11027 .n(8)
11028 .k(2)
11029 .iterations(1)
11030 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11031 }
11032 }
11033
11034 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
11035 TEST_REQUIRES_ARM_NEON;
11036 for (uint32_t n = 1; n <= 8; n++) {
11037 GemmMicrokernelTester()
11038 .mr(1)
11039 .nr(8)
11040 .kr(1)
11041 .sr(1)
11042 .m(1)
11043 .n(n)
11044 .k(2)
11045 .iterations(1)
11046 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11047 }
11048 }
11049
11050 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_lt_2) {
11051 TEST_REQUIRES_ARM_NEON;
11052 for (size_t k = 1; k < 2; k++) {
11053 GemmMicrokernelTester()
11054 .mr(1)
11055 .nr(8)
11056 .kr(1)
11057 .sr(1)
11058 .m(1)
11059 .n(8)
11060 .k(k)
11061 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11062 }
11063 }
11064
11065 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
11066 TEST_REQUIRES_ARM_NEON;
11067 for (size_t k = 1; k < 2; k++) {
11068 for (uint32_t m = 1; m <= 1; m++) {
11069 for (uint32_t n = 1; n <= 8; n++) {
11070 GemmMicrokernelTester()
11071 .mr(1)
11072 .nr(8)
11073 .kr(1)
11074 .sr(1)
11075 .m(m)
11076 .n(n)
11077 .k(k)
11078 .iterations(1)
11079 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11080 }
11081 }
11082 }
11083 }
11084
11085 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_gt_2) {
11086 TEST_REQUIRES_ARM_NEON;
11087 for (size_t k = 3; k < 4; k++) {
11088 GemmMicrokernelTester()
11089 .mr(1)
11090 .nr(8)
11091 .kr(1)
11092 .sr(1)
11093 .m(1)
11094 .n(8)
11095 .k(k)
11096 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11097 }
11098 }
11099
11100 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
11101 TEST_REQUIRES_ARM_NEON;
11102 for (size_t k = 3; k < 4; k++) {
11103 for (uint32_t m = 1; m <= 1; m++) {
11104 for (uint32_t n = 1; n <= 8; n++) {
11105 GemmMicrokernelTester()
11106 .mr(1)
11107 .nr(8)
11108 .kr(1)
11109 .sr(1)
11110 .m(m)
11111 .n(n)
11112 .k(k)
11113 .iterations(1)
11114 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11115 }
11116 }
11117 }
11118 }
11119
11120 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_div_2) {
11121 TEST_REQUIRES_ARM_NEON;
11122 for (size_t k = 4; k <= 20; k += 2) {
11123 GemmMicrokernelTester()
11124 .mr(1)
11125 .nr(8)
11126 .kr(1)
11127 .sr(1)
11128 .m(1)
11129 .n(8)
11130 .k(k)
11131 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11132 }
11133 }
11134
11135 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, k_div_2_subtile) {
11136 TEST_REQUIRES_ARM_NEON;
11137 for (size_t k = 4; k <= 20; k += 2) {
11138 for (uint32_t m = 1; m <= 1; m++) {
11139 for (uint32_t n = 1; n <= 8; n++) {
11140 GemmMicrokernelTester()
11141 .mr(1)
11142 .nr(8)
11143 .kr(1)
11144 .sr(1)
11145 .m(m)
11146 .n(n)
11147 .k(k)
11148 .iterations(1)
11149 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11150 }
11151 }
11152 }
11153 }
11154
11155 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8) {
11156 TEST_REQUIRES_ARM_NEON;
11157 for (uint32_t n = 9; n < 16; n++) {
11158 for (size_t k = 1; k <= 10; k += 3) {
11159 GemmMicrokernelTester()
11160 .mr(1)
11161 .nr(8)
11162 .kr(1)
11163 .sr(1)
11164 .m(1)
11165 .n(8)
11166 .k(k)
11167 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11168 }
11169 }
11170 }
11171
11172 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
11173 TEST_REQUIRES_ARM_NEON;
11174 for (uint32_t n = 9; n < 16; n++) {
11175 for (size_t k = 1; k <= 10; k += 3) {
11176 GemmMicrokernelTester()
11177 .mr(1)
11178 .nr(8)
11179 .kr(1)
11180 .sr(1)
11181 .m(1)
11182 .n(8)
11183 .k(k)
11184 .cn_stride(11)
11185 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11186 }
11187 }
11188 }
11189
11190 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
11191 TEST_REQUIRES_ARM_NEON;
11192 for (uint32_t n = 9; n < 16; n++) {
11193 for (size_t k = 1; k <= 10; k += 3) {
11194 for (uint32_t m = 1; m <= 1; m++) {
11195 GemmMicrokernelTester()
11196 .mr(1)
11197 .nr(8)
11198 .kr(1)
11199 .sr(1)
11200 .m(m)
11201 .n(n)
11202 .k(k)
11203 .iterations(1)
11204 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11205 }
11206 }
11207 }
11208 }
11209
11210 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8) {
11211 TEST_REQUIRES_ARM_NEON;
11212 for (uint32_t n = 16; n <= 24; n += 8) {
11213 for (size_t k = 1; k <= 10; k += 3) {
11214 GemmMicrokernelTester()
11215 .mr(1)
11216 .nr(8)
11217 .kr(1)
11218 .sr(1)
11219 .m(1)
11220 .n(8)
11221 .k(k)
11222 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11223 }
11224 }
11225 }
11226
11227 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
11228 TEST_REQUIRES_ARM_NEON;
11229 for (uint32_t n = 16; n <= 24; n += 8) {
11230 for (size_t k = 1; k <= 10; k += 3) {
11231 GemmMicrokernelTester()
11232 .mr(1)
11233 .nr(8)
11234 .kr(1)
11235 .sr(1)
11236 .m(1)
11237 .n(n)
11238 .k(k)
11239 .cn_stride(11)
11240 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11241 }
11242 }
11243 }
11244
11245 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_subtile) {
11246 TEST_REQUIRES_ARM_NEON;
11247 for (uint32_t n = 16; n <= 24; n += 8) {
11248 for (size_t k = 1; k <= 10; k += 3) {
11249 for (uint32_t m = 1; m <= 1; m++) {
11250 GemmMicrokernelTester()
11251 .mr(1)
11252 .nr(8)
11253 .kr(1)
11254 .sr(1)
11255 .m(m)
11256 .n(n)
11257 .k(k)
11258 .iterations(1)
11259 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11260 }
11261 }
11262 }
11263 }
11264
11265 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, small_kernel) {
11266 TEST_REQUIRES_ARM_NEON;
11267 for (size_t k = 1; k <= 10; k += 3) {
11268 GemmMicrokernelTester()
11269 .mr(1)
11270 .nr(8)
11271 .kr(1)
11272 .sr(1)
11273 .m(1)
11274 .n(8)
11275 .k(k)
11276 .ks(3)
11277 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11278 }
11279 }
11280
11281 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, small_kernel_subtile) {
11282 TEST_REQUIRES_ARM_NEON;
11283 for (size_t k = 1; k <= 10; k += 3) {
11284 for (uint32_t m = 1; m <= 1; m++) {
11285 for (uint32_t n = 1; n <= 8; n++) {
11286 GemmMicrokernelTester()
11287 .mr(1)
11288 .nr(8)
11289 .kr(1)
11290 .sr(1)
11291 .m(m)
11292 .n(n)
11293 .k(k)
11294 .ks(3)
11295 .iterations(1)
11296 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11297 }
11298 }
11299 }
11300 }
11301
11302 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
11303 TEST_REQUIRES_ARM_NEON;
11304 for (uint32_t n = 9; n < 16; n++) {
11305 for (size_t k = 1; k <= 10; k += 3) {
11306 GemmMicrokernelTester()
11307 .mr(1)
11308 .nr(8)
11309 .kr(1)
11310 .sr(1)
11311 .m(1)
11312 .n(8)
11313 .k(k)
11314 .ks(3)
11315 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11316 }
11317 }
11318 }
11319
11320 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, n_div_8_small_kernel) {
11321 TEST_REQUIRES_ARM_NEON;
11322 for (uint32_t n = 16; n <= 24; n += 8) {
11323 for (size_t k = 1; k <= 10; k += 3) {
11324 GemmMicrokernelTester()
11325 .mr(1)
11326 .nr(8)
11327 .kr(1)
11328 .sr(1)
11329 .m(1)
11330 .n(8)
11331 .k(k)
11332 .ks(3)
11333 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11334 }
11335 }
11336 }
11337
11338 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cm_subtile) {
11339 TEST_REQUIRES_ARM_NEON;
11340 for (size_t k = 1; k <= 10; k += 3) {
11341 for (uint32_t m = 1; m <= 1; m++) {
11342 for (uint32_t n = 1; n <= 8; n++) {
11343 GemmMicrokernelTester()
11344 .mr(1)
11345 .nr(8)
11346 .kr(1)
11347 .sr(1)
11348 .m(m)
11349 .n(n)
11350 .k(k)
11351 .cm_stride(11)
11352 .iterations(1)
11353 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11354 }
11355 }
11356 }
11357 }
11358
11359 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, a_offset) {
11360 TEST_REQUIRES_ARM_NEON;
11361 for (size_t k = 1; k <= 10; k += 3) {
11362 GemmMicrokernelTester()
11363 .mr(1)
11364 .nr(8)
11365 .kr(1)
11366 .sr(1)
11367 .m(1)
11368 .n(8)
11369 .k(k)
11370 .ks(3)
11371 .a_offset(13)
11372 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11373 }
11374 }
11375
11376 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, zero) {
11377 TEST_REQUIRES_ARM_NEON;
11378 for (uint32_t mz = 0; mz < 1; mz++) {
11379 for (size_t k = 1; k <= 10; k += 3) {
11380 GemmMicrokernelTester()
11381 .mr(1)
11382 .nr(8)
11383 .kr(1)
11384 .sr(1)
11385 .m(1)
11386 .n(8)
11387 .k(k)
11388 .ks(3)
11389 .a_offset(13)
11390 .zero_index(mz)
11391 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11392 }
11393 }
11394 }
11395
11396 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, qmin) {
11397 TEST_REQUIRES_ARM_NEON;
11398 GemmMicrokernelTester()
11399 .mr(1)
11400 .nr(8)
11401 .kr(1)
11402 .sr(1)
11403 .m(1)
11404 .n(8)
11405 .k(2)
11406 .qmin(128)
11407 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11408 }
11409
11410 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, qmax) {
11411 TEST_REQUIRES_ARM_NEON;
11412 GemmMicrokernelTester()
11413 .mr(1)
11414 .nr(8)
11415 .kr(1)
11416 .sr(1)
11417 .m(1)
11418 .n(8)
11419 .k(2)
11420 .qmax(128)
11421 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11422 }
11423
11424 TEST(F32_IGEMM_1X8__NEON_LANE_LD64, strided_cm) {
11425 TEST_REQUIRES_ARM_NEON;
11426 GemmMicrokernelTester()
11427 .mr(1)
11428 .nr(8)
11429 .kr(1)
11430 .sr(1)
11431 .m(1)
11432 .n(8)
11433 .k(2)
11434 .cm_stride(11)
11435 .Test(xnn_f32_igemm_ukernel_1x8__neon_lane_ld64);
11436 }
11437#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11438
11439
11440#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11441 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2) {
11442 TEST_REQUIRES_ARM_NEON;
11443 GemmMicrokernelTester()
11444 .mr(4)
11445 .nr(2)
11446 .kr(1)
11447 .sr(1)
11448 .m(4)
11449 .n(2)
11450 .k(2)
11451 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11452 }
11453
11454 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cn) {
11455 TEST_REQUIRES_ARM_NEON;
11456 GemmMicrokernelTester()
11457 .mr(4)
11458 .nr(2)
11459 .kr(1)
11460 .sr(1)
11461 .m(4)
11462 .n(2)
11463 .k(2)
11464 .cn_stride(5)
11465 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11466 }
11467
11468 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile) {
11469 TEST_REQUIRES_ARM_NEON;
11470 for (uint32_t m = 1; m <= 4; m++) {
11471 for (uint32_t n = 1; n <= 2; n++) {
11472 GemmMicrokernelTester()
11473 .mr(4)
11474 .nr(2)
11475 .kr(1)
11476 .sr(1)
11477 .m(m)
11478 .n(n)
11479 .k(2)
11480 .iterations(1)
11481 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11482 }
11483 }
11484 }
11485
11486 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
11487 TEST_REQUIRES_ARM_NEON;
11488 for (uint32_t m = 1; m <= 4; m++) {
11489 GemmMicrokernelTester()
11490 .mr(4)
11491 .nr(2)
11492 .kr(1)
11493 .sr(1)
11494 .m(m)
11495 .n(2)
11496 .k(2)
11497 .iterations(1)
11498 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11499 }
11500 }
11501
11502 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
11503 TEST_REQUIRES_ARM_NEON;
11504 for (uint32_t n = 1; n <= 2; n++) {
11505 GemmMicrokernelTester()
11506 .mr(4)
11507 .nr(2)
11508 .kr(1)
11509 .sr(1)
11510 .m(4)
11511 .n(n)
11512 .k(2)
11513 .iterations(1)
11514 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11515 }
11516 }
11517
11518 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_lt_2) {
11519 TEST_REQUIRES_ARM_NEON;
11520 for (size_t k = 1; k < 2; k++) {
11521 GemmMicrokernelTester()
11522 .mr(4)
11523 .nr(2)
11524 .kr(1)
11525 .sr(1)
11526 .m(4)
11527 .n(2)
11528 .k(k)
11529 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11530 }
11531 }
11532
11533 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_lt_2_subtile) {
11534 TEST_REQUIRES_ARM_NEON;
11535 for (size_t k = 1; k < 2; k++) {
11536 for (uint32_t m = 1; m <= 4; m++) {
11537 for (uint32_t n = 1; n <= 2; n++) {
11538 GemmMicrokernelTester()
11539 .mr(4)
11540 .nr(2)
11541 .kr(1)
11542 .sr(1)
11543 .m(m)
11544 .n(n)
11545 .k(k)
11546 .iterations(1)
11547 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11548 }
11549 }
11550 }
11551 }
11552
11553 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_gt_2) {
11554 TEST_REQUIRES_ARM_NEON;
11555 for (size_t k = 3; k < 4; k++) {
11556 GemmMicrokernelTester()
11557 .mr(4)
11558 .nr(2)
11559 .kr(1)
11560 .sr(1)
11561 .m(4)
11562 .n(2)
11563 .k(k)
11564 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11565 }
11566 }
11567
11568 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_gt_2_subtile) {
11569 TEST_REQUIRES_ARM_NEON;
11570 for (size_t k = 3; k < 4; k++) {
11571 for (uint32_t m = 1; m <= 4; m++) {
11572 for (uint32_t n = 1; n <= 2; n++) {
11573 GemmMicrokernelTester()
11574 .mr(4)
11575 .nr(2)
11576 .kr(1)
11577 .sr(1)
11578 .m(m)
11579 .n(n)
11580 .k(k)
11581 .iterations(1)
11582 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11583 }
11584 }
11585 }
11586 }
11587
11588 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_div_2) {
11589 TEST_REQUIRES_ARM_NEON;
11590 for (size_t k = 4; k <= 20; k += 2) {
11591 GemmMicrokernelTester()
11592 .mr(4)
11593 .nr(2)
11594 .kr(1)
11595 .sr(1)
11596 .m(4)
11597 .n(2)
11598 .k(k)
11599 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11600 }
11601 }
11602
11603 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, k_div_2_subtile) {
11604 TEST_REQUIRES_ARM_NEON;
11605 for (size_t k = 4; k <= 20; k += 2) {
11606 for (uint32_t m = 1; m <= 4; m++) {
11607 for (uint32_t n = 1; n <= 2; n++) {
11608 GemmMicrokernelTester()
11609 .mr(4)
11610 .nr(2)
11611 .kr(1)
11612 .sr(1)
11613 .m(m)
11614 .n(n)
11615 .k(k)
11616 .iterations(1)
11617 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11618 }
11619 }
11620 }
11621 }
11622
11623 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2) {
11624 TEST_REQUIRES_ARM_NEON;
11625 for (uint32_t n = 3; n < 4; n++) {
11626 for (size_t k = 1; k <= 10; k += 3) {
11627 GemmMicrokernelTester()
11628 .mr(4)
11629 .nr(2)
11630 .kr(1)
11631 .sr(1)
11632 .m(4)
11633 .n(2)
11634 .k(k)
11635 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11636 }
11637 }
11638 }
11639
11640 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
11641 TEST_REQUIRES_ARM_NEON;
11642 for (uint32_t n = 3; n < 4; n++) {
11643 for (size_t k = 1; k <= 10; k += 3) {
11644 GemmMicrokernelTester()
11645 .mr(4)
11646 .nr(2)
11647 .kr(1)
11648 .sr(1)
11649 .m(4)
11650 .n(2)
11651 .k(k)
11652 .cn_stride(5)
11653 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11654 }
11655 }
11656 }
11657
11658 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_subtile) {
11659 TEST_REQUIRES_ARM_NEON;
11660 for (uint32_t n = 3; n < 4; n++) {
11661 for (size_t k = 1; k <= 10; k += 3) {
11662 for (uint32_t m = 1; m <= 4; m++) {
11663 GemmMicrokernelTester()
11664 .mr(4)
11665 .nr(2)
11666 .kr(1)
11667 .sr(1)
11668 .m(m)
11669 .n(n)
11670 .k(k)
11671 .iterations(1)
11672 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11673 }
11674 }
11675 }
11676 }
11677
11678 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2) {
11679 TEST_REQUIRES_ARM_NEON;
11680 for (uint32_t n = 4; n <= 6; n += 2) {
11681 for (size_t k = 1; k <= 10; k += 3) {
11682 GemmMicrokernelTester()
11683 .mr(4)
11684 .nr(2)
11685 .kr(1)
11686 .sr(1)
11687 .m(4)
11688 .n(2)
11689 .k(k)
11690 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11691 }
11692 }
11693 }
11694
11695 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_strided_cn) {
11696 TEST_REQUIRES_ARM_NEON;
11697 for (uint32_t n = 4; n <= 6; n += 2) {
11698 for (size_t k = 1; k <= 10; k += 3) {
11699 GemmMicrokernelTester()
11700 .mr(4)
11701 .nr(2)
11702 .kr(1)
11703 .sr(1)
11704 .m(4)
11705 .n(n)
11706 .k(k)
11707 .cn_stride(5)
11708 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11709 }
11710 }
11711 }
11712
11713 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_subtile) {
11714 TEST_REQUIRES_ARM_NEON;
11715 for (uint32_t n = 4; n <= 6; n += 2) {
11716 for (size_t k = 1; k <= 10; k += 3) {
11717 for (uint32_t m = 1; m <= 4; m++) {
11718 GemmMicrokernelTester()
11719 .mr(4)
11720 .nr(2)
11721 .kr(1)
11722 .sr(1)
11723 .m(m)
11724 .n(n)
11725 .k(k)
11726 .iterations(1)
11727 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11728 }
11729 }
11730 }
11731 }
11732
11733 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, small_kernel) {
11734 TEST_REQUIRES_ARM_NEON;
11735 for (size_t k = 1; k <= 10; k += 3) {
11736 GemmMicrokernelTester()
11737 .mr(4)
11738 .nr(2)
11739 .kr(1)
11740 .sr(1)
11741 .m(4)
11742 .n(2)
11743 .k(k)
11744 .ks(3)
11745 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11746 }
11747 }
11748
11749 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, small_kernel_subtile) {
11750 TEST_REQUIRES_ARM_NEON;
11751 for (size_t k = 1; k <= 10; k += 3) {
11752 for (uint32_t m = 1; m <= 4; m++) {
11753 for (uint32_t n = 1; n <= 2; n++) {
11754 GemmMicrokernelTester()
11755 .mr(4)
11756 .nr(2)
11757 .kr(1)
11758 .sr(1)
11759 .m(m)
11760 .n(n)
11761 .k(k)
11762 .ks(3)
11763 .iterations(1)
11764 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11765 }
11766 }
11767 }
11768 }
11769
11770 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_gt_2_small_kernel) {
11771 TEST_REQUIRES_ARM_NEON;
11772 for (uint32_t n = 3; n < 4; n++) {
11773 for (size_t k = 1; k <= 10; k += 3) {
11774 GemmMicrokernelTester()
11775 .mr(4)
11776 .nr(2)
11777 .kr(1)
11778 .sr(1)
11779 .m(4)
11780 .n(2)
11781 .k(k)
11782 .ks(3)
11783 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11784 }
11785 }
11786 }
11787
11788 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, n_div_2_small_kernel) {
11789 TEST_REQUIRES_ARM_NEON;
11790 for (uint32_t n = 4; n <= 6; n += 2) {
11791 for (size_t k = 1; k <= 10; k += 3) {
11792 GemmMicrokernelTester()
11793 .mr(4)
11794 .nr(2)
11795 .kr(1)
11796 .sr(1)
11797 .m(4)
11798 .n(2)
11799 .k(k)
11800 .ks(3)
11801 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11802 }
11803 }
11804 }
11805
11806 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cm_subtile) {
11807 TEST_REQUIRES_ARM_NEON;
11808 for (size_t k = 1; k <= 10; k += 3) {
11809 for (uint32_t m = 1; m <= 4; m++) {
11810 for (uint32_t n = 1; n <= 2; n++) {
11811 GemmMicrokernelTester()
11812 .mr(4)
11813 .nr(2)
11814 .kr(1)
11815 .sr(1)
11816 .m(m)
11817 .n(n)
11818 .k(k)
11819 .cm_stride(5)
11820 .iterations(1)
11821 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11822 }
11823 }
11824 }
11825 }
11826
11827 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, a_offset) {
11828 TEST_REQUIRES_ARM_NEON;
11829 for (size_t k = 1; k <= 10; k += 3) {
11830 GemmMicrokernelTester()
11831 .mr(4)
11832 .nr(2)
11833 .kr(1)
11834 .sr(1)
11835 .m(4)
11836 .n(2)
11837 .k(k)
11838 .ks(3)
11839 .a_offset(43)
11840 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11841 }
11842 }
11843
11844 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, zero) {
11845 TEST_REQUIRES_ARM_NEON;
11846 for (uint32_t mz = 0; mz < 4; mz++) {
11847 for (size_t k = 1; k <= 10; k += 3) {
11848 GemmMicrokernelTester()
11849 .mr(4)
11850 .nr(2)
11851 .kr(1)
11852 .sr(1)
11853 .m(4)
11854 .n(2)
11855 .k(k)
11856 .ks(3)
11857 .a_offset(43)
11858 .zero_index(mz)
11859 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11860 }
11861 }
11862 }
11863
11864 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, qmin) {
11865 TEST_REQUIRES_ARM_NEON;
11866 GemmMicrokernelTester()
11867 .mr(4)
11868 .nr(2)
11869 .kr(1)
11870 .sr(1)
11871 .m(4)
11872 .n(2)
11873 .k(2)
11874 .qmin(128)
11875 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11876 }
11877
11878 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, qmax) {
11879 TEST_REQUIRES_ARM_NEON;
11880 GemmMicrokernelTester()
11881 .mr(4)
11882 .nr(2)
11883 .kr(1)
11884 .sr(1)
11885 .m(4)
11886 .n(2)
11887 .k(2)
11888 .qmax(128)
11889 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11890 }
11891
11892 TEST(F32_IGEMM_4X2__NEON_LANE_LD64, strided_cm) {
11893 TEST_REQUIRES_ARM_NEON;
11894 GemmMicrokernelTester()
11895 .mr(4)
11896 .nr(2)
11897 .kr(1)
11898 .sr(1)
11899 .m(4)
11900 .n(2)
11901 .k(2)
11902 .cm_stride(5)
11903 .Test(xnn_f32_igemm_ukernel_4x2__neon_lane_ld64);
11904 }
11905#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11906
11907
11908#if XNN_ARCH_ARM || XNN_ARCH_ARM64
11909 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2) {
11910 TEST_REQUIRES_ARM_NEON;
11911 GemmMicrokernelTester()
11912 .mr(4)
11913 .nr(4)
11914 .kr(1)
11915 .sr(1)
11916 .m(4)
11917 .n(4)
11918 .k(2)
11919 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
11920 }
11921
11922 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cn) {
11923 TEST_REQUIRES_ARM_NEON;
11924 GemmMicrokernelTester()
11925 .mr(4)
11926 .nr(4)
11927 .kr(1)
11928 .sr(1)
11929 .m(4)
11930 .n(4)
11931 .k(2)
11932 .cn_stride(7)
11933 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
11934 }
11935
11936 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile) {
11937 TEST_REQUIRES_ARM_NEON;
11938 for (uint32_t m = 1; m <= 4; m++) {
11939 for (uint32_t n = 1; n <= 4; n++) {
11940 GemmMicrokernelTester()
11941 .mr(4)
11942 .nr(4)
11943 .kr(1)
11944 .sr(1)
11945 .m(m)
11946 .n(n)
11947 .k(2)
11948 .iterations(1)
11949 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
11950 }
11951 }
11952 }
11953
11954 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile_m) {
11955 TEST_REQUIRES_ARM_NEON;
11956 for (uint32_t m = 1; m <= 4; m++) {
11957 GemmMicrokernelTester()
11958 .mr(4)
11959 .nr(4)
11960 .kr(1)
11961 .sr(1)
11962 .m(m)
11963 .n(4)
11964 .k(2)
11965 .iterations(1)
11966 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
11967 }
11968 }
11969
11970 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_eq_2_subtile_n) {
11971 TEST_REQUIRES_ARM_NEON;
11972 for (uint32_t n = 1; n <= 4; n++) {
11973 GemmMicrokernelTester()
11974 .mr(4)
11975 .nr(4)
11976 .kr(1)
11977 .sr(1)
11978 .m(4)
11979 .n(n)
11980 .k(2)
11981 .iterations(1)
11982 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
11983 }
11984 }
11985
11986 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_lt_2) {
11987 TEST_REQUIRES_ARM_NEON;
11988 for (size_t k = 1; k < 2; k++) {
11989 GemmMicrokernelTester()
11990 .mr(4)
11991 .nr(4)
11992 .kr(1)
11993 .sr(1)
11994 .m(4)
11995 .n(4)
11996 .k(k)
11997 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
11998 }
11999 }
12000
12001 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_lt_2_subtile) {
12002 TEST_REQUIRES_ARM_NEON;
12003 for (size_t k = 1; k < 2; k++) {
12004 for (uint32_t m = 1; m <= 4; m++) {
12005 for (uint32_t n = 1; n <= 4; n++) {
12006 GemmMicrokernelTester()
12007 .mr(4)
12008 .nr(4)
12009 .kr(1)
12010 .sr(1)
12011 .m(m)
12012 .n(n)
12013 .k(k)
12014 .iterations(1)
12015 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12016 }
12017 }
12018 }
12019 }
12020
12021 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_gt_2) {
12022 TEST_REQUIRES_ARM_NEON;
12023 for (size_t k = 3; k < 4; k++) {
12024 GemmMicrokernelTester()
12025 .mr(4)
12026 .nr(4)
12027 .kr(1)
12028 .sr(1)
12029 .m(4)
12030 .n(4)
12031 .k(k)
12032 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12033 }
12034 }
12035
12036 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_gt_2_subtile) {
12037 TEST_REQUIRES_ARM_NEON;
12038 for (size_t k = 3; k < 4; k++) {
12039 for (uint32_t m = 1; m <= 4; m++) {
12040 for (uint32_t n = 1; n <= 4; n++) {
12041 GemmMicrokernelTester()
12042 .mr(4)
12043 .nr(4)
12044 .kr(1)
12045 .sr(1)
12046 .m(m)
12047 .n(n)
12048 .k(k)
12049 .iterations(1)
12050 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12051 }
12052 }
12053 }
12054 }
12055
12056 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_div_2) {
12057 TEST_REQUIRES_ARM_NEON;
12058 for (size_t k = 4; k <= 20; k += 2) {
12059 GemmMicrokernelTester()
12060 .mr(4)
12061 .nr(4)
12062 .kr(1)
12063 .sr(1)
12064 .m(4)
12065 .n(4)
12066 .k(k)
12067 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12068 }
12069 }
12070
12071 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, k_div_2_subtile) {
12072 TEST_REQUIRES_ARM_NEON;
12073 for (size_t k = 4; k <= 20; k += 2) {
12074 for (uint32_t m = 1; m <= 4; m++) {
12075 for (uint32_t n = 1; n <= 4; n++) {
12076 GemmMicrokernelTester()
12077 .mr(4)
12078 .nr(4)
12079 .kr(1)
12080 .sr(1)
12081 .m(m)
12082 .n(n)
12083 .k(k)
12084 .iterations(1)
12085 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12086 }
12087 }
12088 }
12089 }
12090
12091 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4) {
12092 TEST_REQUIRES_ARM_NEON;
12093 for (uint32_t n = 5; n < 8; n++) {
12094 for (size_t k = 1; k <= 10; k += 3) {
12095 GemmMicrokernelTester()
12096 .mr(4)
12097 .nr(4)
12098 .kr(1)
12099 .sr(1)
12100 .m(4)
12101 .n(4)
12102 .k(k)
12103 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12104 }
12105 }
12106 }
12107
12108 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_strided_cn) {
12109 TEST_REQUIRES_ARM_NEON;
12110 for (uint32_t n = 5; n < 8; n++) {
12111 for (size_t k = 1; k <= 10; k += 3) {
12112 GemmMicrokernelTester()
12113 .mr(4)
12114 .nr(4)
12115 .kr(1)
12116 .sr(1)
12117 .m(4)
12118 .n(4)
12119 .k(k)
12120 .cn_stride(7)
12121 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12122 }
12123 }
12124 }
12125
12126 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_subtile) {
12127 TEST_REQUIRES_ARM_NEON;
12128 for (uint32_t n = 5; n < 8; n++) {
12129 for (size_t k = 1; k <= 10; k += 3) {
12130 for (uint32_t m = 1; m <= 4; m++) {
12131 GemmMicrokernelTester()
12132 .mr(4)
12133 .nr(4)
12134 .kr(1)
12135 .sr(1)
12136 .m(m)
12137 .n(n)
12138 .k(k)
12139 .iterations(1)
12140 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12141 }
12142 }
12143 }
12144 }
12145
12146 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4) {
12147 TEST_REQUIRES_ARM_NEON;
12148 for (uint32_t n = 8; n <= 12; n += 4) {
12149 for (size_t k = 1; k <= 10; k += 3) {
12150 GemmMicrokernelTester()
12151 .mr(4)
12152 .nr(4)
12153 .kr(1)
12154 .sr(1)
12155 .m(4)
12156 .n(4)
12157 .k(k)
12158 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12159 }
12160 }
12161 }
12162
12163 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_strided_cn) {
12164 TEST_REQUIRES_ARM_NEON;
12165 for (uint32_t n = 8; n <= 12; n += 4) {
12166 for (size_t k = 1; k <= 10; k += 3) {
12167 GemmMicrokernelTester()
12168 .mr(4)
12169 .nr(4)
12170 .kr(1)
12171 .sr(1)
12172 .m(4)
12173 .n(n)
12174 .k(k)
12175 .cn_stride(7)
12176 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12177 }
12178 }
12179 }
12180
12181 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_subtile) {
12182 TEST_REQUIRES_ARM_NEON;
12183 for (uint32_t n = 8; n <= 12; n += 4) {
12184 for (size_t k = 1; k <= 10; k += 3) {
12185 for (uint32_t m = 1; m <= 4; m++) {
12186 GemmMicrokernelTester()
12187 .mr(4)
12188 .nr(4)
12189 .kr(1)
12190 .sr(1)
12191 .m(m)
12192 .n(n)
12193 .k(k)
12194 .iterations(1)
12195 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12196 }
12197 }
12198 }
12199 }
12200
12201 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, small_kernel) {
12202 TEST_REQUIRES_ARM_NEON;
12203 for (size_t k = 1; k <= 10; k += 3) {
12204 GemmMicrokernelTester()
12205 .mr(4)
12206 .nr(4)
12207 .kr(1)
12208 .sr(1)
12209 .m(4)
12210 .n(4)
12211 .k(k)
12212 .ks(3)
12213 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12214 }
12215 }
12216
12217 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, small_kernel_subtile) {
12218 TEST_REQUIRES_ARM_NEON;
12219 for (size_t k = 1; k <= 10; k += 3) {
12220 for (uint32_t m = 1; m <= 4; m++) {
12221 for (uint32_t n = 1; n <= 4; n++) {
12222 GemmMicrokernelTester()
12223 .mr(4)
12224 .nr(4)
12225 .kr(1)
12226 .sr(1)
12227 .m(m)
12228 .n(n)
12229 .k(k)
12230 .ks(3)
12231 .iterations(1)
12232 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12233 }
12234 }
12235 }
12236 }
12237
12238 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_gt_4_small_kernel) {
12239 TEST_REQUIRES_ARM_NEON;
12240 for (uint32_t n = 5; n < 8; n++) {
12241 for (size_t k = 1; k <= 10; k += 3) {
12242 GemmMicrokernelTester()
12243 .mr(4)
12244 .nr(4)
12245 .kr(1)
12246 .sr(1)
12247 .m(4)
12248 .n(4)
12249 .k(k)
12250 .ks(3)
12251 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12252 }
12253 }
12254 }
12255
12256 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, n_div_4_small_kernel) {
12257 TEST_REQUIRES_ARM_NEON;
12258 for (uint32_t n = 8; n <= 12; n += 4) {
12259 for (size_t k = 1; k <= 10; k += 3) {
12260 GemmMicrokernelTester()
12261 .mr(4)
12262 .nr(4)
12263 .kr(1)
12264 .sr(1)
12265 .m(4)
12266 .n(4)
12267 .k(k)
12268 .ks(3)
12269 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12270 }
12271 }
12272 }
12273
12274 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cm_subtile) {
12275 TEST_REQUIRES_ARM_NEON;
12276 for (size_t k = 1; k <= 10; k += 3) {
12277 for (uint32_t m = 1; m <= 4; m++) {
12278 for (uint32_t n = 1; n <= 4; n++) {
12279 GemmMicrokernelTester()
12280 .mr(4)
12281 .nr(4)
12282 .kr(1)
12283 .sr(1)
12284 .m(m)
12285 .n(n)
12286 .k(k)
12287 .cm_stride(7)
12288 .iterations(1)
12289 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12290 }
12291 }
12292 }
12293 }
12294
12295 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, a_offset) {
12296 TEST_REQUIRES_ARM_NEON;
12297 for (size_t k = 1; k <= 10; k += 3) {
12298 GemmMicrokernelTester()
12299 .mr(4)
12300 .nr(4)
12301 .kr(1)
12302 .sr(1)
12303 .m(4)
12304 .n(4)
12305 .k(k)
12306 .ks(3)
12307 .a_offset(43)
12308 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12309 }
12310 }
12311
12312 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, zero) {
12313 TEST_REQUIRES_ARM_NEON;
12314 for (uint32_t mz = 0; mz < 4; mz++) {
12315 for (size_t k = 1; k <= 10; k += 3) {
12316 GemmMicrokernelTester()
12317 .mr(4)
12318 .nr(4)
12319 .kr(1)
12320 .sr(1)
12321 .m(4)
12322 .n(4)
12323 .k(k)
12324 .ks(3)
12325 .a_offset(43)
12326 .zero_index(mz)
12327 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12328 }
12329 }
12330 }
12331
12332 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, qmin) {
12333 TEST_REQUIRES_ARM_NEON;
12334 GemmMicrokernelTester()
12335 .mr(4)
12336 .nr(4)
12337 .kr(1)
12338 .sr(1)
12339 .m(4)
12340 .n(4)
12341 .k(2)
12342 .qmin(128)
12343 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12344 }
12345
12346 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, qmax) {
12347 TEST_REQUIRES_ARM_NEON;
12348 GemmMicrokernelTester()
12349 .mr(4)
12350 .nr(4)
12351 .kr(1)
12352 .sr(1)
12353 .m(4)
12354 .n(4)
12355 .k(2)
12356 .qmax(128)
12357 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12358 }
12359
12360 TEST(F32_IGEMM_4X4__NEON_LANE_LD64, strided_cm) {
12361 TEST_REQUIRES_ARM_NEON;
12362 GemmMicrokernelTester()
12363 .mr(4)
12364 .nr(4)
12365 .kr(1)
12366 .sr(1)
12367 .m(4)
12368 .n(4)
12369 .k(2)
12370 .cm_stride(7)
12371 .Test(xnn_f32_igemm_ukernel_4x4__neon_lane_ld64);
12372 }
12373#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12374
12375
12376#if XNN_ARCH_ARM || XNN_ARCH_ARM64
12377 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4) {
12378 TEST_REQUIRES_ARM_NEON;
12379 GemmMicrokernelTester()
12380 .mr(4)
12381 .nr(8)
12382 .kr(1)
12383 .sr(1)
12384 .m(4)
12385 .n(8)
12386 .k(4)
12387 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12388 }
12389
12390 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cn) {
12391 TEST_REQUIRES_ARM_NEON;
12392 GemmMicrokernelTester()
12393 .mr(4)
12394 .nr(8)
12395 .kr(1)
12396 .sr(1)
12397 .m(4)
12398 .n(8)
12399 .k(4)
12400 .cn_stride(11)
12401 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12402 }
12403
12404 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
12405 TEST_REQUIRES_ARM_NEON;
12406 for (uint32_t m = 1; m <= 4; m++) {
12407 for (uint32_t n = 1; n <= 8; n++) {
12408 GemmMicrokernelTester()
12409 .mr(4)
12410 .nr(8)
12411 .kr(1)
12412 .sr(1)
12413 .m(m)
12414 .n(n)
12415 .k(4)
12416 .iterations(1)
12417 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12418 }
12419 }
12420 }
12421
12422 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
12423 TEST_REQUIRES_ARM_NEON;
12424 for (uint32_t m = 1; m <= 4; m++) {
12425 GemmMicrokernelTester()
12426 .mr(4)
12427 .nr(8)
12428 .kr(1)
12429 .sr(1)
12430 .m(m)
12431 .n(8)
12432 .k(4)
12433 .iterations(1)
12434 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12435 }
12436 }
12437
12438 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
12439 TEST_REQUIRES_ARM_NEON;
12440 for (uint32_t n = 1; n <= 8; n++) {
12441 GemmMicrokernelTester()
12442 .mr(4)
12443 .nr(8)
12444 .kr(1)
12445 .sr(1)
12446 .m(4)
12447 .n(n)
12448 .k(4)
12449 .iterations(1)
12450 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12451 }
12452 }
12453
12454 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_lt_4) {
12455 TEST_REQUIRES_ARM_NEON;
12456 for (size_t k = 1; k < 4; k++) {
12457 GemmMicrokernelTester()
12458 .mr(4)
12459 .nr(8)
12460 .kr(1)
12461 .sr(1)
12462 .m(4)
12463 .n(8)
12464 .k(k)
12465 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12466 }
12467 }
12468
12469 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
12470 TEST_REQUIRES_ARM_NEON;
12471 for (size_t k = 1; k < 4; k++) {
12472 for (uint32_t m = 1; m <= 4; m++) {
12473 for (uint32_t n = 1; n <= 8; n++) {
12474 GemmMicrokernelTester()
12475 .mr(4)
12476 .nr(8)
12477 .kr(1)
12478 .sr(1)
12479 .m(m)
12480 .n(n)
12481 .k(k)
12482 .iterations(1)
12483 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12484 }
12485 }
12486 }
12487 }
12488
12489 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_gt_4) {
12490 TEST_REQUIRES_ARM_NEON;
12491 for (size_t k = 5; k < 8; k++) {
12492 GemmMicrokernelTester()
12493 .mr(4)
12494 .nr(8)
12495 .kr(1)
12496 .sr(1)
12497 .m(4)
12498 .n(8)
12499 .k(k)
12500 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12501 }
12502 }
12503
12504 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
12505 TEST_REQUIRES_ARM_NEON;
12506 for (size_t k = 5; k < 8; k++) {
12507 for (uint32_t m = 1; m <= 4; m++) {
12508 for (uint32_t n = 1; n <= 8; n++) {
12509 GemmMicrokernelTester()
12510 .mr(4)
12511 .nr(8)
12512 .kr(1)
12513 .sr(1)
12514 .m(m)
12515 .n(n)
12516 .k(k)
12517 .iterations(1)
12518 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12519 }
12520 }
12521 }
12522 }
12523
12524 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_div_4) {
12525 TEST_REQUIRES_ARM_NEON;
12526 for (size_t k = 8; k <= 40; k += 4) {
12527 GemmMicrokernelTester()
12528 .mr(4)
12529 .nr(8)
12530 .kr(1)
12531 .sr(1)
12532 .m(4)
12533 .n(8)
12534 .k(k)
12535 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12536 }
12537 }
12538
12539 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, k_div_4_subtile) {
12540 TEST_REQUIRES_ARM_NEON;
12541 for (size_t k = 8; k <= 40; k += 4) {
12542 for (uint32_t m = 1; m <= 4; m++) {
12543 for (uint32_t n = 1; n <= 8; n++) {
12544 GemmMicrokernelTester()
12545 .mr(4)
12546 .nr(8)
12547 .kr(1)
12548 .sr(1)
12549 .m(m)
12550 .n(n)
12551 .k(k)
12552 .iterations(1)
12553 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12554 }
12555 }
12556 }
12557 }
12558
12559 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8) {
12560 TEST_REQUIRES_ARM_NEON;
12561 for (uint32_t n = 9; n < 16; n++) {
12562 for (size_t k = 1; k <= 20; k += 5) {
12563 GemmMicrokernelTester()
12564 .mr(4)
12565 .nr(8)
12566 .kr(1)
12567 .sr(1)
12568 .m(4)
12569 .n(8)
12570 .k(k)
12571 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12572 }
12573 }
12574 }
12575
12576 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
12577 TEST_REQUIRES_ARM_NEON;
12578 for (uint32_t n = 9; n < 16; n++) {
12579 for (size_t k = 1; k <= 20; k += 5) {
12580 GemmMicrokernelTester()
12581 .mr(4)
12582 .nr(8)
12583 .kr(1)
12584 .sr(1)
12585 .m(4)
12586 .n(8)
12587 .k(k)
12588 .cn_stride(11)
12589 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12590 }
12591 }
12592 }
12593
12594 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
12595 TEST_REQUIRES_ARM_NEON;
12596 for (uint32_t n = 9; n < 16; n++) {
12597 for (size_t k = 1; k <= 20; k += 5) {
12598 for (uint32_t m = 1; m <= 4; m++) {
12599 GemmMicrokernelTester()
12600 .mr(4)
12601 .nr(8)
12602 .kr(1)
12603 .sr(1)
12604 .m(m)
12605 .n(n)
12606 .k(k)
12607 .iterations(1)
12608 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12609 }
12610 }
12611 }
12612 }
12613
12614 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8) {
12615 TEST_REQUIRES_ARM_NEON;
12616 for (uint32_t n = 16; n <= 24; n += 8) {
12617 for (size_t k = 1; k <= 20; k += 5) {
12618 GemmMicrokernelTester()
12619 .mr(4)
12620 .nr(8)
12621 .kr(1)
12622 .sr(1)
12623 .m(4)
12624 .n(8)
12625 .k(k)
12626 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12627 }
12628 }
12629 }
12630
12631 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
12632 TEST_REQUIRES_ARM_NEON;
12633 for (uint32_t n = 16; n <= 24; n += 8) {
12634 for (size_t k = 1; k <= 20; k += 5) {
12635 GemmMicrokernelTester()
12636 .mr(4)
12637 .nr(8)
12638 .kr(1)
12639 .sr(1)
12640 .m(4)
12641 .n(n)
12642 .k(k)
12643 .cn_stride(11)
12644 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12645 }
12646 }
12647 }
12648
12649 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_subtile) {
12650 TEST_REQUIRES_ARM_NEON;
12651 for (uint32_t n = 16; n <= 24; n += 8) {
12652 for (size_t k = 1; k <= 20; k += 5) {
12653 for (uint32_t m = 1; m <= 4; m++) {
12654 GemmMicrokernelTester()
12655 .mr(4)
12656 .nr(8)
12657 .kr(1)
12658 .sr(1)
12659 .m(m)
12660 .n(n)
12661 .k(k)
12662 .iterations(1)
12663 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12664 }
12665 }
12666 }
12667 }
12668
12669 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, small_kernel) {
12670 TEST_REQUIRES_ARM_NEON;
12671 for (size_t k = 1; k <= 20; k += 5) {
12672 GemmMicrokernelTester()
12673 .mr(4)
12674 .nr(8)
12675 .kr(1)
12676 .sr(1)
12677 .m(4)
12678 .n(8)
12679 .k(k)
12680 .ks(3)
12681 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12682 }
12683 }
12684
12685 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, small_kernel_subtile) {
12686 TEST_REQUIRES_ARM_NEON;
12687 for (size_t k = 1; k <= 20; k += 5) {
12688 for (uint32_t m = 1; m <= 4; m++) {
12689 for (uint32_t n = 1; n <= 8; n++) {
12690 GemmMicrokernelTester()
12691 .mr(4)
12692 .nr(8)
12693 .kr(1)
12694 .sr(1)
12695 .m(m)
12696 .n(n)
12697 .k(k)
12698 .ks(3)
12699 .iterations(1)
12700 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12701 }
12702 }
12703 }
12704 }
12705
12706 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_gt_8_small_kernel) {
12707 TEST_REQUIRES_ARM_NEON;
12708 for (uint32_t n = 9; n < 16; n++) {
12709 for (size_t k = 1; k <= 20; k += 5) {
12710 GemmMicrokernelTester()
12711 .mr(4)
12712 .nr(8)
12713 .kr(1)
12714 .sr(1)
12715 .m(4)
12716 .n(8)
12717 .k(k)
12718 .ks(3)
12719 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12720 }
12721 }
12722 }
12723
12724 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, n_div_8_small_kernel) {
12725 TEST_REQUIRES_ARM_NEON;
12726 for (uint32_t n = 16; n <= 24; n += 8) {
12727 for (size_t k = 1; k <= 20; k += 5) {
12728 GemmMicrokernelTester()
12729 .mr(4)
12730 .nr(8)
12731 .kr(1)
12732 .sr(1)
12733 .m(4)
12734 .n(8)
12735 .k(k)
12736 .ks(3)
12737 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12738 }
12739 }
12740 }
12741
12742 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cm_subtile) {
12743 TEST_REQUIRES_ARM_NEON;
12744 for (size_t k = 1; k <= 20; k += 5) {
12745 for (uint32_t m = 1; m <= 4; m++) {
12746 for (uint32_t n = 1; n <= 8; n++) {
12747 GemmMicrokernelTester()
12748 .mr(4)
12749 .nr(8)
12750 .kr(1)
12751 .sr(1)
12752 .m(m)
12753 .n(n)
12754 .k(k)
12755 .cm_stride(11)
12756 .iterations(1)
12757 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12758 }
12759 }
12760 }
12761 }
12762
12763 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, a_offset) {
12764 TEST_REQUIRES_ARM_NEON;
12765 for (size_t k = 1; k <= 20; k += 5) {
12766 GemmMicrokernelTester()
12767 .mr(4)
12768 .nr(8)
12769 .kr(1)
12770 .sr(1)
12771 .m(4)
12772 .n(8)
12773 .k(k)
12774 .ks(3)
12775 .a_offset(83)
12776 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12777 }
12778 }
12779
12780 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, zero) {
12781 TEST_REQUIRES_ARM_NEON;
12782 for (uint32_t mz = 0; mz < 4; mz++) {
12783 for (size_t k = 1; k <= 20; k += 5) {
12784 GemmMicrokernelTester()
12785 .mr(4)
12786 .nr(8)
12787 .kr(1)
12788 .sr(1)
12789 .m(4)
12790 .n(8)
12791 .k(k)
12792 .ks(3)
12793 .a_offset(83)
12794 .zero_index(mz)
12795 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12796 }
12797 }
12798 }
12799
12800 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, qmin) {
12801 TEST_REQUIRES_ARM_NEON;
12802 GemmMicrokernelTester()
12803 .mr(4)
12804 .nr(8)
12805 .kr(1)
12806 .sr(1)
12807 .m(4)
12808 .n(8)
12809 .k(4)
12810 .qmin(128)
12811 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12812 }
12813
12814 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, qmax) {
12815 TEST_REQUIRES_ARM_NEON;
12816 GemmMicrokernelTester()
12817 .mr(4)
12818 .nr(8)
12819 .kr(1)
12820 .sr(1)
12821 .m(4)
12822 .n(8)
12823 .k(4)
12824 .qmax(128)
12825 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12826 }
12827
12828 TEST(F32_IGEMM_4X8__NEON_LANE_LD128, strided_cm) {
12829 TEST_REQUIRES_ARM_NEON;
12830 GemmMicrokernelTester()
12831 .mr(4)
12832 .nr(8)
12833 .kr(1)
12834 .sr(1)
12835 .m(4)
12836 .n(8)
12837 .k(4)
12838 .cm_stride(11)
12839 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld128);
12840 }
12841#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
12842
12843
12844#if XNN_ARCH_ARM || XNN_ARCH_ARM64
12845 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2) {
12846 TEST_REQUIRES_ARM_NEON;
12847 GemmMicrokernelTester()
12848 .mr(4)
12849 .nr(8)
12850 .kr(1)
12851 .sr(1)
12852 .m(4)
12853 .n(8)
12854 .k(2)
12855 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12856 }
12857
12858 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cn) {
12859 TEST_REQUIRES_ARM_NEON;
12860 GemmMicrokernelTester()
12861 .mr(4)
12862 .nr(8)
12863 .kr(1)
12864 .sr(1)
12865 .m(4)
12866 .n(8)
12867 .k(2)
12868 .cn_stride(11)
12869 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12870 }
12871
12872 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
12873 TEST_REQUIRES_ARM_NEON;
12874 for (uint32_t m = 1; m <= 4; m++) {
12875 for (uint32_t n = 1; n <= 8; n++) {
12876 GemmMicrokernelTester()
12877 .mr(4)
12878 .nr(8)
12879 .kr(1)
12880 .sr(1)
12881 .m(m)
12882 .n(n)
12883 .k(2)
12884 .iterations(1)
12885 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12886 }
12887 }
12888 }
12889
12890 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
12891 TEST_REQUIRES_ARM_NEON;
12892 for (uint32_t m = 1; m <= 4; m++) {
12893 GemmMicrokernelTester()
12894 .mr(4)
12895 .nr(8)
12896 .kr(1)
12897 .sr(1)
12898 .m(m)
12899 .n(8)
12900 .k(2)
12901 .iterations(1)
12902 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12903 }
12904 }
12905
12906 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
12907 TEST_REQUIRES_ARM_NEON;
12908 for (uint32_t n = 1; n <= 8; n++) {
12909 GemmMicrokernelTester()
12910 .mr(4)
12911 .nr(8)
12912 .kr(1)
12913 .sr(1)
12914 .m(4)
12915 .n(n)
12916 .k(2)
12917 .iterations(1)
12918 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12919 }
12920 }
12921
12922 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_lt_2) {
12923 TEST_REQUIRES_ARM_NEON;
12924 for (size_t k = 1; k < 2; k++) {
12925 GemmMicrokernelTester()
12926 .mr(4)
12927 .nr(8)
12928 .kr(1)
12929 .sr(1)
12930 .m(4)
12931 .n(8)
12932 .k(k)
12933 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12934 }
12935 }
12936
12937 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
12938 TEST_REQUIRES_ARM_NEON;
12939 for (size_t k = 1; k < 2; k++) {
12940 for (uint32_t m = 1; m <= 4; m++) {
12941 for (uint32_t n = 1; n <= 8; n++) {
12942 GemmMicrokernelTester()
12943 .mr(4)
12944 .nr(8)
12945 .kr(1)
12946 .sr(1)
12947 .m(m)
12948 .n(n)
12949 .k(k)
12950 .iterations(1)
12951 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12952 }
12953 }
12954 }
12955 }
12956
12957 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_gt_2) {
12958 TEST_REQUIRES_ARM_NEON;
12959 for (size_t k = 3; k < 4; k++) {
12960 GemmMicrokernelTester()
12961 .mr(4)
12962 .nr(8)
12963 .kr(1)
12964 .sr(1)
12965 .m(4)
12966 .n(8)
12967 .k(k)
12968 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12969 }
12970 }
12971
12972 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
12973 TEST_REQUIRES_ARM_NEON;
12974 for (size_t k = 3; k < 4; k++) {
12975 for (uint32_t m = 1; m <= 4; m++) {
12976 for (uint32_t n = 1; n <= 8; n++) {
12977 GemmMicrokernelTester()
12978 .mr(4)
12979 .nr(8)
12980 .kr(1)
12981 .sr(1)
12982 .m(m)
12983 .n(n)
12984 .k(k)
12985 .iterations(1)
12986 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
12987 }
12988 }
12989 }
12990 }
12991
12992 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_div_2) {
12993 TEST_REQUIRES_ARM_NEON;
12994 for (size_t k = 4; k <= 20; k += 2) {
12995 GemmMicrokernelTester()
12996 .mr(4)
12997 .nr(8)
12998 .kr(1)
12999 .sr(1)
13000 .m(4)
13001 .n(8)
13002 .k(k)
13003 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13004 }
13005 }
13006
13007 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, k_div_2_subtile) {
13008 TEST_REQUIRES_ARM_NEON;
13009 for (size_t k = 4; k <= 20; k += 2) {
13010 for (uint32_t m = 1; m <= 4; m++) {
13011 for (uint32_t n = 1; n <= 8; n++) {
13012 GemmMicrokernelTester()
13013 .mr(4)
13014 .nr(8)
13015 .kr(1)
13016 .sr(1)
13017 .m(m)
13018 .n(n)
13019 .k(k)
13020 .iterations(1)
13021 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13022 }
13023 }
13024 }
13025 }
13026
13027 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8) {
13028 TEST_REQUIRES_ARM_NEON;
13029 for (uint32_t n = 9; n < 16; n++) {
13030 for (size_t k = 1; k <= 10; k += 3) {
13031 GemmMicrokernelTester()
13032 .mr(4)
13033 .nr(8)
13034 .kr(1)
13035 .sr(1)
13036 .m(4)
13037 .n(8)
13038 .k(k)
13039 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13040 }
13041 }
13042 }
13043
13044 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
13045 TEST_REQUIRES_ARM_NEON;
13046 for (uint32_t n = 9; n < 16; n++) {
13047 for (size_t k = 1; k <= 10; k += 3) {
13048 GemmMicrokernelTester()
13049 .mr(4)
13050 .nr(8)
13051 .kr(1)
13052 .sr(1)
13053 .m(4)
13054 .n(8)
13055 .k(k)
13056 .cn_stride(11)
13057 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13058 }
13059 }
13060 }
13061
13062 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
13063 TEST_REQUIRES_ARM_NEON;
13064 for (uint32_t n = 9; n < 16; n++) {
13065 for (size_t k = 1; k <= 10; k += 3) {
13066 for (uint32_t m = 1; m <= 4; m++) {
13067 GemmMicrokernelTester()
13068 .mr(4)
13069 .nr(8)
13070 .kr(1)
13071 .sr(1)
13072 .m(m)
13073 .n(n)
13074 .k(k)
13075 .iterations(1)
13076 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13077 }
13078 }
13079 }
13080 }
13081
13082 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8) {
13083 TEST_REQUIRES_ARM_NEON;
13084 for (uint32_t n = 16; n <= 24; n += 8) {
13085 for (size_t k = 1; k <= 10; k += 3) {
13086 GemmMicrokernelTester()
13087 .mr(4)
13088 .nr(8)
13089 .kr(1)
13090 .sr(1)
13091 .m(4)
13092 .n(8)
13093 .k(k)
13094 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13095 }
13096 }
13097 }
13098
13099 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
13100 TEST_REQUIRES_ARM_NEON;
13101 for (uint32_t n = 16; n <= 24; n += 8) {
13102 for (size_t k = 1; k <= 10; k += 3) {
13103 GemmMicrokernelTester()
13104 .mr(4)
13105 .nr(8)
13106 .kr(1)
13107 .sr(1)
13108 .m(4)
13109 .n(n)
13110 .k(k)
13111 .cn_stride(11)
13112 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13113 }
13114 }
13115 }
13116
13117 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_subtile) {
13118 TEST_REQUIRES_ARM_NEON;
13119 for (uint32_t n = 16; n <= 24; n += 8) {
13120 for (size_t k = 1; k <= 10; k += 3) {
13121 for (uint32_t m = 1; m <= 4; m++) {
13122 GemmMicrokernelTester()
13123 .mr(4)
13124 .nr(8)
13125 .kr(1)
13126 .sr(1)
13127 .m(m)
13128 .n(n)
13129 .k(k)
13130 .iterations(1)
13131 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13132 }
13133 }
13134 }
13135 }
13136
13137 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, small_kernel) {
13138 TEST_REQUIRES_ARM_NEON;
13139 for (size_t k = 1; k <= 10; k += 3) {
13140 GemmMicrokernelTester()
13141 .mr(4)
13142 .nr(8)
13143 .kr(1)
13144 .sr(1)
13145 .m(4)
13146 .n(8)
13147 .k(k)
13148 .ks(3)
13149 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13150 }
13151 }
13152
13153 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, small_kernel_subtile) {
13154 TEST_REQUIRES_ARM_NEON;
13155 for (size_t k = 1; k <= 10; k += 3) {
13156 for (uint32_t m = 1; m <= 4; m++) {
13157 for (uint32_t n = 1; n <= 8; n++) {
13158 GemmMicrokernelTester()
13159 .mr(4)
13160 .nr(8)
13161 .kr(1)
13162 .sr(1)
13163 .m(m)
13164 .n(n)
13165 .k(k)
13166 .ks(3)
13167 .iterations(1)
13168 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13169 }
13170 }
13171 }
13172 }
13173
13174 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
13175 TEST_REQUIRES_ARM_NEON;
13176 for (uint32_t n = 9; n < 16; n++) {
13177 for (size_t k = 1; k <= 10; k += 3) {
13178 GemmMicrokernelTester()
13179 .mr(4)
13180 .nr(8)
13181 .kr(1)
13182 .sr(1)
13183 .m(4)
13184 .n(8)
13185 .k(k)
13186 .ks(3)
13187 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13188 }
13189 }
13190 }
13191
13192 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, n_div_8_small_kernel) {
13193 TEST_REQUIRES_ARM_NEON;
13194 for (uint32_t n = 16; n <= 24; n += 8) {
13195 for (size_t k = 1; k <= 10; k += 3) {
13196 GemmMicrokernelTester()
13197 .mr(4)
13198 .nr(8)
13199 .kr(1)
13200 .sr(1)
13201 .m(4)
13202 .n(8)
13203 .k(k)
13204 .ks(3)
13205 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13206 }
13207 }
13208 }
13209
13210 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cm_subtile) {
13211 TEST_REQUIRES_ARM_NEON;
13212 for (size_t k = 1; k <= 10; k += 3) {
13213 for (uint32_t m = 1; m <= 4; m++) {
13214 for (uint32_t n = 1; n <= 8; n++) {
13215 GemmMicrokernelTester()
13216 .mr(4)
13217 .nr(8)
13218 .kr(1)
13219 .sr(1)
13220 .m(m)
13221 .n(n)
13222 .k(k)
13223 .cm_stride(11)
13224 .iterations(1)
13225 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13226 }
13227 }
13228 }
13229 }
13230
13231 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, a_offset) {
13232 TEST_REQUIRES_ARM_NEON;
13233 for (size_t k = 1; k <= 10; k += 3) {
13234 GemmMicrokernelTester()
13235 .mr(4)
13236 .nr(8)
13237 .kr(1)
13238 .sr(1)
13239 .m(4)
13240 .n(8)
13241 .k(k)
13242 .ks(3)
13243 .a_offset(43)
13244 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13245 }
13246 }
13247
13248 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, zero) {
13249 TEST_REQUIRES_ARM_NEON;
13250 for (uint32_t mz = 0; mz < 4; mz++) {
13251 for (size_t k = 1; k <= 10; k += 3) {
13252 GemmMicrokernelTester()
13253 .mr(4)
13254 .nr(8)
13255 .kr(1)
13256 .sr(1)
13257 .m(4)
13258 .n(8)
13259 .k(k)
13260 .ks(3)
13261 .a_offset(43)
13262 .zero_index(mz)
13263 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13264 }
13265 }
13266 }
13267
13268 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, qmin) {
13269 TEST_REQUIRES_ARM_NEON;
13270 GemmMicrokernelTester()
13271 .mr(4)
13272 .nr(8)
13273 .kr(1)
13274 .sr(1)
13275 .m(4)
13276 .n(8)
13277 .k(2)
13278 .qmin(128)
13279 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13280 }
13281
13282 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, qmax) {
13283 TEST_REQUIRES_ARM_NEON;
13284 GemmMicrokernelTester()
13285 .mr(4)
13286 .nr(8)
13287 .kr(1)
13288 .sr(1)
13289 .m(4)
13290 .n(8)
13291 .k(2)
13292 .qmax(128)
13293 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13294 }
13295
13296 TEST(F32_IGEMM_4X8__NEON_LANE_LD64, strided_cm) {
13297 TEST_REQUIRES_ARM_NEON;
13298 GemmMicrokernelTester()
13299 .mr(4)
13300 .nr(8)
13301 .kr(1)
13302 .sr(1)
13303 .m(4)
13304 .n(8)
13305 .k(2)
13306 .cm_stride(11)
13307 .Test(xnn_f32_igemm_ukernel_4x8__neon_lane_ld64);
13308 }
13309#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13310
13311
13312#if XNN_ARCH_ARM || XNN_ARCH_ARM64
13313 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2) {
13314 TEST_REQUIRES_ARM_NEON;
13315 GemmMicrokernelTester()
13316 .mr(6)
13317 .nr(8)
13318 .kr(1)
13319 .sr(1)
13320 .m(6)
13321 .n(8)
13322 .k(2)
13323 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13324 }
13325
13326 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cn) {
13327 TEST_REQUIRES_ARM_NEON;
13328 GemmMicrokernelTester()
13329 .mr(6)
13330 .nr(8)
13331 .kr(1)
13332 .sr(1)
13333 .m(6)
13334 .n(8)
13335 .k(2)
13336 .cn_stride(11)
13337 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13338 }
13339
13340 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
13341 TEST_REQUIRES_ARM_NEON;
13342 for (uint32_t m = 1; m <= 6; m++) {
13343 for (uint32_t n = 1; n <= 8; n++) {
13344 GemmMicrokernelTester()
13345 .mr(6)
13346 .nr(8)
13347 .kr(1)
13348 .sr(1)
13349 .m(m)
13350 .n(n)
13351 .k(2)
13352 .iterations(1)
13353 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13354 }
13355 }
13356 }
13357
13358 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
13359 TEST_REQUIRES_ARM_NEON;
13360 for (uint32_t m = 1; m <= 6; m++) {
13361 GemmMicrokernelTester()
13362 .mr(6)
13363 .nr(8)
13364 .kr(1)
13365 .sr(1)
13366 .m(m)
13367 .n(8)
13368 .k(2)
13369 .iterations(1)
13370 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13371 }
13372 }
13373
13374 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
13375 TEST_REQUIRES_ARM_NEON;
13376 for (uint32_t n = 1; n <= 8; n++) {
13377 GemmMicrokernelTester()
13378 .mr(6)
13379 .nr(8)
13380 .kr(1)
13381 .sr(1)
13382 .m(6)
13383 .n(n)
13384 .k(2)
13385 .iterations(1)
13386 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13387 }
13388 }
13389
13390 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_lt_2) {
13391 TEST_REQUIRES_ARM_NEON;
13392 for (size_t k = 1; k < 2; k++) {
13393 GemmMicrokernelTester()
13394 .mr(6)
13395 .nr(8)
13396 .kr(1)
13397 .sr(1)
13398 .m(6)
13399 .n(8)
13400 .k(k)
13401 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13402 }
13403 }
13404
13405 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
13406 TEST_REQUIRES_ARM_NEON;
13407 for (size_t k = 1; k < 2; k++) {
13408 for (uint32_t m = 1; m <= 6; m++) {
13409 for (uint32_t n = 1; n <= 8; n++) {
13410 GemmMicrokernelTester()
13411 .mr(6)
13412 .nr(8)
13413 .kr(1)
13414 .sr(1)
13415 .m(m)
13416 .n(n)
13417 .k(k)
13418 .iterations(1)
13419 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13420 }
13421 }
13422 }
13423 }
13424
13425 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_gt_2) {
13426 TEST_REQUIRES_ARM_NEON;
13427 for (size_t k = 3; k < 4; k++) {
13428 GemmMicrokernelTester()
13429 .mr(6)
13430 .nr(8)
13431 .kr(1)
13432 .sr(1)
13433 .m(6)
13434 .n(8)
13435 .k(k)
13436 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13437 }
13438 }
13439
13440 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
13441 TEST_REQUIRES_ARM_NEON;
13442 for (size_t k = 3; k < 4; k++) {
13443 for (uint32_t m = 1; m <= 6; m++) {
13444 for (uint32_t n = 1; n <= 8; n++) {
13445 GemmMicrokernelTester()
13446 .mr(6)
13447 .nr(8)
13448 .kr(1)
13449 .sr(1)
13450 .m(m)
13451 .n(n)
13452 .k(k)
13453 .iterations(1)
13454 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13455 }
13456 }
13457 }
13458 }
13459
13460 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_div_2) {
13461 TEST_REQUIRES_ARM_NEON;
13462 for (size_t k = 4; k <= 20; k += 2) {
13463 GemmMicrokernelTester()
13464 .mr(6)
13465 .nr(8)
13466 .kr(1)
13467 .sr(1)
13468 .m(6)
13469 .n(8)
13470 .k(k)
13471 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13472 }
13473 }
13474
13475 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, k_div_2_subtile) {
13476 TEST_REQUIRES_ARM_NEON;
13477 for (size_t k = 4; k <= 20; k += 2) {
13478 for (uint32_t m = 1; m <= 6; m++) {
13479 for (uint32_t n = 1; n <= 8; n++) {
13480 GemmMicrokernelTester()
13481 .mr(6)
13482 .nr(8)
13483 .kr(1)
13484 .sr(1)
13485 .m(m)
13486 .n(n)
13487 .k(k)
13488 .iterations(1)
13489 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13490 }
13491 }
13492 }
13493 }
13494
13495 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8) {
13496 TEST_REQUIRES_ARM_NEON;
13497 for (uint32_t n = 9; n < 16; n++) {
13498 for (size_t k = 1; k <= 10; k += 3) {
13499 GemmMicrokernelTester()
13500 .mr(6)
13501 .nr(8)
13502 .kr(1)
13503 .sr(1)
13504 .m(6)
13505 .n(8)
13506 .k(k)
13507 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13508 }
13509 }
13510 }
13511
13512 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
13513 TEST_REQUIRES_ARM_NEON;
13514 for (uint32_t n = 9; n < 16; n++) {
13515 for (size_t k = 1; k <= 10; k += 3) {
13516 GemmMicrokernelTester()
13517 .mr(6)
13518 .nr(8)
13519 .kr(1)
13520 .sr(1)
13521 .m(6)
13522 .n(8)
13523 .k(k)
13524 .cn_stride(11)
13525 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13526 }
13527 }
13528 }
13529
13530 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
13531 TEST_REQUIRES_ARM_NEON;
13532 for (uint32_t n = 9; n < 16; n++) {
13533 for (size_t k = 1; k <= 10; k += 3) {
13534 for (uint32_t m = 1; m <= 6; m++) {
13535 GemmMicrokernelTester()
13536 .mr(6)
13537 .nr(8)
13538 .kr(1)
13539 .sr(1)
13540 .m(m)
13541 .n(n)
13542 .k(k)
13543 .iterations(1)
13544 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13545 }
13546 }
13547 }
13548 }
13549
13550 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8) {
13551 TEST_REQUIRES_ARM_NEON;
13552 for (uint32_t n = 16; n <= 24; n += 8) {
13553 for (size_t k = 1; k <= 10; k += 3) {
13554 GemmMicrokernelTester()
13555 .mr(6)
13556 .nr(8)
13557 .kr(1)
13558 .sr(1)
13559 .m(6)
13560 .n(8)
13561 .k(k)
13562 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13563 }
13564 }
13565 }
13566
13567 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
13568 TEST_REQUIRES_ARM_NEON;
13569 for (uint32_t n = 16; n <= 24; n += 8) {
13570 for (size_t k = 1; k <= 10; k += 3) {
13571 GemmMicrokernelTester()
13572 .mr(6)
13573 .nr(8)
13574 .kr(1)
13575 .sr(1)
13576 .m(6)
13577 .n(n)
13578 .k(k)
13579 .cn_stride(11)
13580 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13581 }
13582 }
13583 }
13584
13585 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_subtile) {
13586 TEST_REQUIRES_ARM_NEON;
13587 for (uint32_t n = 16; n <= 24; n += 8) {
13588 for (size_t k = 1; k <= 10; k += 3) {
13589 for (uint32_t m = 1; m <= 6; m++) {
13590 GemmMicrokernelTester()
13591 .mr(6)
13592 .nr(8)
13593 .kr(1)
13594 .sr(1)
13595 .m(m)
13596 .n(n)
13597 .k(k)
13598 .iterations(1)
13599 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13600 }
13601 }
13602 }
13603 }
13604
13605 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, small_kernel) {
13606 TEST_REQUIRES_ARM_NEON;
13607 for (size_t k = 1; k <= 10; k += 3) {
13608 GemmMicrokernelTester()
13609 .mr(6)
13610 .nr(8)
13611 .kr(1)
13612 .sr(1)
13613 .m(6)
13614 .n(8)
13615 .k(k)
13616 .ks(3)
13617 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13618 }
13619 }
13620
13621 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, small_kernel_subtile) {
13622 TEST_REQUIRES_ARM_NEON;
13623 for (size_t k = 1; k <= 10; k += 3) {
13624 for (uint32_t m = 1; m <= 6; m++) {
13625 for (uint32_t n = 1; n <= 8; n++) {
13626 GemmMicrokernelTester()
13627 .mr(6)
13628 .nr(8)
13629 .kr(1)
13630 .sr(1)
13631 .m(m)
13632 .n(n)
13633 .k(k)
13634 .ks(3)
13635 .iterations(1)
13636 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13637 }
13638 }
13639 }
13640 }
13641
13642 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_gt_8_small_kernel) {
13643 TEST_REQUIRES_ARM_NEON;
13644 for (uint32_t n = 9; n < 16; n++) {
13645 for (size_t k = 1; k <= 10; k += 3) {
13646 GemmMicrokernelTester()
13647 .mr(6)
13648 .nr(8)
13649 .kr(1)
13650 .sr(1)
13651 .m(6)
13652 .n(8)
13653 .k(k)
13654 .ks(3)
13655 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13656 }
13657 }
13658 }
13659
13660 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, n_div_8_small_kernel) {
13661 TEST_REQUIRES_ARM_NEON;
13662 for (uint32_t n = 16; n <= 24; n += 8) {
13663 for (size_t k = 1; k <= 10; k += 3) {
13664 GemmMicrokernelTester()
13665 .mr(6)
13666 .nr(8)
13667 .kr(1)
13668 .sr(1)
13669 .m(6)
13670 .n(8)
13671 .k(k)
13672 .ks(3)
13673 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13674 }
13675 }
13676 }
13677
13678 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cm_subtile) {
13679 TEST_REQUIRES_ARM_NEON;
13680 for (size_t k = 1; k <= 10; k += 3) {
13681 for (uint32_t m = 1; m <= 6; m++) {
13682 for (uint32_t n = 1; n <= 8; n++) {
13683 GemmMicrokernelTester()
13684 .mr(6)
13685 .nr(8)
13686 .kr(1)
13687 .sr(1)
13688 .m(m)
13689 .n(n)
13690 .k(k)
13691 .cm_stride(11)
13692 .iterations(1)
13693 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13694 }
13695 }
13696 }
13697 }
13698
13699 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, a_offset) {
13700 TEST_REQUIRES_ARM_NEON;
13701 for (size_t k = 1; k <= 10; k += 3) {
13702 GemmMicrokernelTester()
13703 .mr(6)
13704 .nr(8)
13705 .kr(1)
13706 .sr(1)
13707 .m(6)
13708 .n(8)
13709 .k(k)
13710 .ks(3)
13711 .a_offset(67)
13712 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13713 }
13714 }
13715
13716 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, zero) {
13717 TEST_REQUIRES_ARM_NEON;
13718 for (uint32_t mz = 0; mz < 6; mz++) {
13719 for (size_t k = 1; k <= 10; k += 3) {
13720 GemmMicrokernelTester()
13721 .mr(6)
13722 .nr(8)
13723 .kr(1)
13724 .sr(1)
13725 .m(6)
13726 .n(8)
13727 .k(k)
13728 .ks(3)
13729 .a_offset(67)
13730 .zero_index(mz)
13731 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13732 }
13733 }
13734 }
13735
13736 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, qmin) {
13737 TEST_REQUIRES_ARM_NEON;
13738 GemmMicrokernelTester()
13739 .mr(6)
13740 .nr(8)
13741 .kr(1)
13742 .sr(1)
13743 .m(6)
13744 .n(8)
13745 .k(2)
13746 .qmin(128)
13747 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13748 }
13749
13750 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, qmax) {
13751 TEST_REQUIRES_ARM_NEON;
13752 GemmMicrokernelTester()
13753 .mr(6)
13754 .nr(8)
13755 .kr(1)
13756 .sr(1)
13757 .m(6)
13758 .n(8)
13759 .k(2)
13760 .qmax(128)
13761 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13762 }
13763
13764 TEST(F32_IGEMM_6X8__NEON_LANE_LD64, strided_cm) {
13765 TEST_REQUIRES_ARM_NEON;
13766 GemmMicrokernelTester()
13767 .mr(6)
13768 .nr(8)
13769 .kr(1)
13770 .sr(1)
13771 .m(6)
13772 .n(8)
13773 .k(2)
13774 .cm_stride(11)
13775 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld64);
13776 }
13777#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
13778
13779
13780#if XNN_ARCH_ARM || XNN_ARCH_ARM64
13781 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4) {
13782 TEST_REQUIRES_ARM_NEON;
13783 GemmMicrokernelTester()
13784 .mr(6)
13785 .nr(8)
13786 .kr(1)
13787 .sr(1)
13788 .m(6)
13789 .n(8)
13790 .k(4)
13791 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13792 }
13793
13794 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, strided_cn) {
13795 TEST_REQUIRES_ARM_NEON;
13796 GemmMicrokernelTester()
13797 .mr(6)
13798 .nr(8)
13799 .kr(1)
13800 .sr(1)
13801 .m(6)
13802 .n(8)
13803 .k(4)
13804 .cn_stride(11)
13805 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13806 }
13807
13808 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
13809 TEST_REQUIRES_ARM_NEON;
13810 for (uint32_t m = 1; m <= 6; m++) {
13811 for (uint32_t n = 1; n <= 8; n++) {
13812 GemmMicrokernelTester()
13813 .mr(6)
13814 .nr(8)
13815 .kr(1)
13816 .sr(1)
13817 .m(m)
13818 .n(n)
13819 .k(4)
13820 .iterations(1)
13821 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13822 }
13823 }
13824 }
13825
13826 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
13827 TEST_REQUIRES_ARM_NEON;
13828 for (uint32_t m = 1; m <= 6; m++) {
13829 GemmMicrokernelTester()
13830 .mr(6)
13831 .nr(8)
13832 .kr(1)
13833 .sr(1)
13834 .m(m)
13835 .n(8)
13836 .k(4)
13837 .iterations(1)
13838 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13839 }
13840 }
13841
13842 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
13843 TEST_REQUIRES_ARM_NEON;
13844 for (uint32_t n = 1; n <= 8; n++) {
13845 GemmMicrokernelTester()
13846 .mr(6)
13847 .nr(8)
13848 .kr(1)
13849 .sr(1)
13850 .m(6)
13851 .n(n)
13852 .k(4)
13853 .iterations(1)
13854 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13855 }
13856 }
13857
13858 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_lt_4) {
13859 TEST_REQUIRES_ARM_NEON;
13860 for (size_t k = 1; k < 4; k++) {
13861 GemmMicrokernelTester()
13862 .mr(6)
13863 .nr(8)
13864 .kr(1)
13865 .sr(1)
13866 .m(6)
13867 .n(8)
13868 .k(k)
13869 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13870 }
13871 }
13872
13873 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
13874 TEST_REQUIRES_ARM_NEON;
13875 for (size_t k = 1; k < 4; k++) {
13876 for (uint32_t m = 1; m <= 6; m++) {
13877 for (uint32_t n = 1; n <= 8; n++) {
13878 GemmMicrokernelTester()
13879 .mr(6)
13880 .nr(8)
13881 .kr(1)
13882 .sr(1)
13883 .m(m)
13884 .n(n)
13885 .k(k)
13886 .iterations(1)
13887 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13888 }
13889 }
13890 }
13891 }
13892
13893 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_gt_4) {
13894 TEST_REQUIRES_ARM_NEON;
13895 for (size_t k = 5; k < 8; k++) {
13896 GemmMicrokernelTester()
13897 .mr(6)
13898 .nr(8)
13899 .kr(1)
13900 .sr(1)
13901 .m(6)
13902 .n(8)
13903 .k(k)
13904 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13905 }
13906 }
13907
13908 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
13909 TEST_REQUIRES_ARM_NEON;
13910 for (size_t k = 5; k < 8; k++) {
13911 for (uint32_t m = 1; m <= 6; m++) {
13912 for (uint32_t n = 1; n <= 8; n++) {
13913 GemmMicrokernelTester()
13914 .mr(6)
13915 .nr(8)
13916 .kr(1)
13917 .sr(1)
13918 .m(m)
13919 .n(n)
13920 .k(k)
13921 .iterations(1)
13922 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13923 }
13924 }
13925 }
13926 }
13927
13928 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_div_4) {
13929 TEST_REQUIRES_ARM_NEON;
13930 for (size_t k = 8; k <= 40; k += 4) {
13931 GemmMicrokernelTester()
13932 .mr(6)
13933 .nr(8)
13934 .kr(1)
13935 .sr(1)
13936 .m(6)
13937 .n(8)
13938 .k(k)
13939 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13940 }
13941 }
13942
13943 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, k_div_4_subtile) {
13944 TEST_REQUIRES_ARM_NEON;
13945 for (size_t k = 8; k <= 40; k += 4) {
13946 for (uint32_t m = 1; m <= 6; m++) {
13947 for (uint32_t n = 1; n <= 8; n++) {
13948 GemmMicrokernelTester()
13949 .mr(6)
13950 .nr(8)
13951 .kr(1)
13952 .sr(1)
13953 .m(m)
13954 .n(n)
13955 .k(k)
13956 .iterations(1)
13957 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13958 }
13959 }
13960 }
13961 }
13962
13963 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8) {
13964 TEST_REQUIRES_ARM_NEON;
13965 for (uint32_t n = 9; n < 16; n++) {
13966 for (size_t k = 1; k <= 20; k += 5) {
13967 GemmMicrokernelTester()
13968 .mr(6)
13969 .nr(8)
13970 .kr(1)
13971 .sr(1)
13972 .m(6)
13973 .n(8)
13974 .k(k)
13975 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13976 }
13977 }
13978 }
13979
13980 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
13981 TEST_REQUIRES_ARM_NEON;
13982 for (uint32_t n = 9; n < 16; n++) {
13983 for (size_t k = 1; k <= 20; k += 5) {
13984 GemmMicrokernelTester()
13985 .mr(6)
13986 .nr(8)
13987 .kr(1)
13988 .sr(1)
13989 .m(6)
13990 .n(8)
13991 .k(k)
13992 .cn_stride(11)
13993 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
13994 }
13995 }
13996 }
13997
13998 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
13999 TEST_REQUIRES_ARM_NEON;
14000 for (uint32_t n = 9; n < 16; n++) {
14001 for (size_t k = 1; k <= 20; k += 5) {
14002 for (uint32_t m = 1; m <= 6; m++) {
14003 GemmMicrokernelTester()
14004 .mr(6)
14005 .nr(8)
14006 .kr(1)
14007 .sr(1)
14008 .m(m)
14009 .n(n)
14010 .k(k)
14011 .iterations(1)
14012 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14013 }
14014 }
14015 }
14016 }
14017
14018 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8) {
14019 TEST_REQUIRES_ARM_NEON;
14020 for (uint32_t n = 16; n <= 24; n += 8) {
14021 for (size_t k = 1; k <= 20; k += 5) {
14022 GemmMicrokernelTester()
14023 .mr(6)
14024 .nr(8)
14025 .kr(1)
14026 .sr(1)
14027 .m(6)
14028 .n(8)
14029 .k(k)
14030 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14031 }
14032 }
14033 }
14034
14035 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
14036 TEST_REQUIRES_ARM_NEON;
14037 for (uint32_t n = 16; n <= 24; n += 8) {
14038 for (size_t k = 1; k <= 20; k += 5) {
14039 GemmMicrokernelTester()
14040 .mr(6)
14041 .nr(8)
14042 .kr(1)
14043 .sr(1)
14044 .m(6)
14045 .n(n)
14046 .k(k)
14047 .cn_stride(11)
14048 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14049 }
14050 }
14051 }
14052
14053 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8_subtile) {
14054 TEST_REQUIRES_ARM_NEON;
14055 for (uint32_t n = 16; n <= 24; n += 8) {
14056 for (size_t k = 1; k <= 20; k += 5) {
14057 for (uint32_t m = 1; m <= 6; m++) {
14058 GemmMicrokernelTester()
14059 .mr(6)
14060 .nr(8)
14061 .kr(1)
14062 .sr(1)
14063 .m(m)
14064 .n(n)
14065 .k(k)
14066 .iterations(1)
14067 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14068 }
14069 }
14070 }
14071 }
14072
14073 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, small_kernel) {
14074 TEST_REQUIRES_ARM_NEON;
14075 for (size_t k = 1; k <= 20; k += 5) {
14076 GemmMicrokernelTester()
14077 .mr(6)
14078 .nr(8)
14079 .kr(1)
14080 .sr(1)
14081 .m(6)
14082 .n(8)
14083 .k(k)
14084 .ks(3)
14085 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14086 }
14087 }
14088
14089 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, small_kernel_subtile) {
14090 TEST_REQUIRES_ARM_NEON;
14091 for (size_t k = 1; k <= 20; k += 5) {
14092 for (uint32_t m = 1; m <= 6; m++) {
14093 for (uint32_t n = 1; n <= 8; n++) {
14094 GemmMicrokernelTester()
14095 .mr(6)
14096 .nr(8)
14097 .kr(1)
14098 .sr(1)
14099 .m(m)
14100 .n(n)
14101 .k(k)
14102 .ks(3)
14103 .iterations(1)
14104 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14105 }
14106 }
14107 }
14108 }
14109
14110 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_gt_8_small_kernel) {
14111 TEST_REQUIRES_ARM_NEON;
14112 for (uint32_t n = 9; n < 16; n++) {
14113 for (size_t k = 1; k <= 20; k += 5) {
14114 GemmMicrokernelTester()
14115 .mr(6)
14116 .nr(8)
14117 .kr(1)
14118 .sr(1)
14119 .m(6)
14120 .n(8)
14121 .k(k)
14122 .ks(3)
14123 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14124 }
14125 }
14126 }
14127
14128 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, n_div_8_small_kernel) {
14129 TEST_REQUIRES_ARM_NEON;
14130 for (uint32_t n = 16; n <= 24; n += 8) {
14131 for (size_t k = 1; k <= 20; k += 5) {
14132 GemmMicrokernelTester()
14133 .mr(6)
14134 .nr(8)
14135 .kr(1)
14136 .sr(1)
14137 .m(6)
14138 .n(8)
14139 .k(k)
14140 .ks(3)
14141 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14142 }
14143 }
14144 }
14145
14146 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, strided_cm_subtile) {
14147 TEST_REQUIRES_ARM_NEON;
14148 for (size_t k = 1; k <= 20; k += 5) {
14149 for (uint32_t m = 1; m <= 6; m++) {
14150 for (uint32_t n = 1; n <= 8; n++) {
14151 GemmMicrokernelTester()
14152 .mr(6)
14153 .nr(8)
14154 .kr(1)
14155 .sr(1)
14156 .m(m)
14157 .n(n)
14158 .k(k)
14159 .cm_stride(11)
14160 .iterations(1)
14161 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14162 }
14163 }
14164 }
14165 }
14166
14167 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, a_offset) {
14168 TEST_REQUIRES_ARM_NEON;
14169 for (size_t k = 1; k <= 20; k += 5) {
14170 GemmMicrokernelTester()
14171 .mr(6)
14172 .nr(8)
14173 .kr(1)
14174 .sr(1)
14175 .m(6)
14176 .n(8)
14177 .k(k)
14178 .ks(3)
14179 .a_offset(127)
14180 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14181 }
14182 }
14183
14184 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, zero) {
14185 TEST_REQUIRES_ARM_NEON;
14186 for (uint32_t mz = 0; mz < 6; mz++) {
14187 for (size_t k = 1; k <= 20; k += 5) {
14188 GemmMicrokernelTester()
14189 .mr(6)
14190 .nr(8)
14191 .kr(1)
14192 .sr(1)
14193 .m(6)
14194 .n(8)
14195 .k(k)
14196 .ks(3)
14197 .a_offset(127)
14198 .zero_index(mz)
14199 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14200 }
14201 }
14202 }
14203
14204 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, qmin) {
14205 TEST_REQUIRES_ARM_NEON;
14206 GemmMicrokernelTester()
14207 .mr(6)
14208 .nr(8)
14209 .kr(1)
14210 .sr(1)
14211 .m(6)
14212 .n(8)
14213 .k(4)
14214 .qmin(128)
14215 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14216 }
14217
14218 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, qmax) {
14219 TEST_REQUIRES_ARM_NEON;
14220 GemmMicrokernelTester()
14221 .mr(6)
14222 .nr(8)
14223 .kr(1)
14224 .sr(1)
14225 .m(6)
14226 .n(8)
14227 .k(4)
14228 .qmax(128)
14229 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14230 }
14231
14232 TEST(F32_IGEMM_6X8__NEON_LANE_LD128, strided_cm) {
14233 TEST_REQUIRES_ARM_NEON;
14234 GemmMicrokernelTester()
14235 .mr(6)
14236 .nr(8)
14237 .kr(1)
14238 .sr(1)
14239 .m(6)
14240 .n(8)
14241 .k(4)
14242 .cm_stride(11)
14243 .Test(xnn_f32_igemm_ukernel_6x8__neon_lane_ld128);
14244 }
14245#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14246
14247
14248#if XNN_ARCH_ARM64
14249 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2) {
14250 TEST_REQUIRES_ARM_NEON_FMA;
14251 GemmMicrokernelTester()
14252 .mr(1)
14253 .nr(8)
14254 .kr(1)
14255 .sr(1)
14256 .m(1)
14257 .n(8)
14258 .k(2)
14259 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14260 }
14261
14262 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, strided_cn) {
14263 TEST_REQUIRES_ARM_NEON_FMA;
14264 GemmMicrokernelTester()
14265 .mr(1)
14266 .nr(8)
14267 .kr(1)
14268 .sr(1)
14269 .m(1)
14270 .n(8)
14271 .k(2)
14272 .cn_stride(11)
14273 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14274 }
14275
14276 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
14277 TEST_REQUIRES_ARM_NEON_FMA;
14278 for (uint32_t m = 1; m <= 1; m++) {
14279 for (uint32_t n = 1; n <= 8; n++) {
14280 GemmMicrokernelTester()
14281 .mr(1)
14282 .nr(8)
14283 .kr(1)
14284 .sr(1)
14285 .m(m)
14286 .n(n)
14287 .k(2)
14288 .iterations(1)
14289 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14290 }
14291 }
14292 }
14293
14294 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
14295 TEST_REQUIRES_ARM_NEON_FMA;
14296 for (uint32_t m = 1; m <= 1; m++) {
14297 GemmMicrokernelTester()
14298 .mr(1)
14299 .nr(8)
14300 .kr(1)
14301 .sr(1)
14302 .m(m)
14303 .n(8)
14304 .k(2)
14305 .iterations(1)
14306 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14307 }
14308 }
14309
14310 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
14311 TEST_REQUIRES_ARM_NEON_FMA;
14312 for (uint32_t n = 1; n <= 8; n++) {
14313 GemmMicrokernelTester()
14314 .mr(1)
14315 .nr(8)
14316 .kr(1)
14317 .sr(1)
14318 .m(1)
14319 .n(n)
14320 .k(2)
14321 .iterations(1)
14322 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14323 }
14324 }
14325
14326 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_lt_2) {
14327 TEST_REQUIRES_ARM_NEON_FMA;
14328 for (size_t k = 1; k < 2; k++) {
14329 GemmMicrokernelTester()
14330 .mr(1)
14331 .nr(8)
14332 .kr(1)
14333 .sr(1)
14334 .m(1)
14335 .n(8)
14336 .k(k)
14337 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14338 }
14339 }
14340
14341 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
14342 TEST_REQUIRES_ARM_NEON_FMA;
14343 for (size_t k = 1; k < 2; k++) {
14344 for (uint32_t m = 1; m <= 1; m++) {
14345 for (uint32_t n = 1; n <= 8; n++) {
14346 GemmMicrokernelTester()
14347 .mr(1)
14348 .nr(8)
14349 .kr(1)
14350 .sr(1)
14351 .m(m)
14352 .n(n)
14353 .k(k)
14354 .iterations(1)
14355 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14356 }
14357 }
14358 }
14359 }
14360
14361 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_gt_2) {
14362 TEST_REQUIRES_ARM_NEON_FMA;
14363 for (size_t k = 3; k < 4; k++) {
14364 GemmMicrokernelTester()
14365 .mr(1)
14366 .nr(8)
14367 .kr(1)
14368 .sr(1)
14369 .m(1)
14370 .n(8)
14371 .k(k)
14372 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14373 }
14374 }
14375
14376 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
14377 TEST_REQUIRES_ARM_NEON_FMA;
14378 for (size_t k = 3; k < 4; k++) {
14379 for (uint32_t m = 1; m <= 1; m++) {
14380 for (uint32_t n = 1; n <= 8; n++) {
14381 GemmMicrokernelTester()
14382 .mr(1)
14383 .nr(8)
14384 .kr(1)
14385 .sr(1)
14386 .m(m)
14387 .n(n)
14388 .k(k)
14389 .iterations(1)
14390 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14391 }
14392 }
14393 }
14394 }
14395
14396 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_div_2) {
14397 TEST_REQUIRES_ARM_NEON_FMA;
14398 for (size_t k = 4; k <= 20; k += 2) {
14399 GemmMicrokernelTester()
14400 .mr(1)
14401 .nr(8)
14402 .kr(1)
14403 .sr(1)
14404 .m(1)
14405 .n(8)
14406 .k(k)
14407 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14408 }
14409 }
14410
14411 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
14412 TEST_REQUIRES_ARM_NEON_FMA;
14413 for (size_t k = 4; k <= 20; k += 2) {
14414 for (uint32_t m = 1; m <= 1; m++) {
14415 for (uint32_t n = 1; n <= 8; n++) {
14416 GemmMicrokernelTester()
14417 .mr(1)
14418 .nr(8)
14419 .kr(1)
14420 .sr(1)
14421 .m(m)
14422 .n(n)
14423 .k(k)
14424 .iterations(1)
14425 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14426 }
14427 }
14428 }
14429 }
14430
14431 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8) {
14432 TEST_REQUIRES_ARM_NEON_FMA;
14433 for (uint32_t n = 9; n < 16; n++) {
14434 for (size_t k = 1; k <= 10; k += 3) {
14435 GemmMicrokernelTester()
14436 .mr(1)
14437 .nr(8)
14438 .kr(1)
14439 .sr(1)
14440 .m(1)
14441 .n(8)
14442 .k(k)
14443 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14444 }
14445 }
14446 }
14447
14448 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
14449 TEST_REQUIRES_ARM_NEON_FMA;
14450 for (uint32_t n = 9; n < 16; n++) {
14451 for (size_t k = 1; k <= 10; k += 3) {
14452 GemmMicrokernelTester()
14453 .mr(1)
14454 .nr(8)
14455 .kr(1)
14456 .sr(1)
14457 .m(1)
14458 .n(8)
14459 .k(k)
14460 .cn_stride(11)
14461 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14462 }
14463 }
14464 }
14465
14466 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
14467 TEST_REQUIRES_ARM_NEON_FMA;
14468 for (uint32_t n = 9; n < 16; n++) {
14469 for (size_t k = 1; k <= 10; k += 3) {
14470 for (uint32_t m = 1; m <= 1; m++) {
14471 GemmMicrokernelTester()
14472 .mr(1)
14473 .nr(8)
14474 .kr(1)
14475 .sr(1)
14476 .m(m)
14477 .n(n)
14478 .k(k)
14479 .iterations(1)
14480 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14481 }
14482 }
14483 }
14484 }
14485
14486 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8) {
14487 TEST_REQUIRES_ARM_NEON_FMA;
14488 for (uint32_t n = 16; n <= 24; n += 8) {
14489 for (size_t k = 1; k <= 10; k += 3) {
14490 GemmMicrokernelTester()
14491 .mr(1)
14492 .nr(8)
14493 .kr(1)
14494 .sr(1)
14495 .m(1)
14496 .n(8)
14497 .k(k)
14498 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14499 }
14500 }
14501 }
14502
14503 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
14504 TEST_REQUIRES_ARM_NEON_FMA;
14505 for (uint32_t n = 16; n <= 24; n += 8) {
14506 for (size_t k = 1; k <= 10; k += 3) {
14507 GemmMicrokernelTester()
14508 .mr(1)
14509 .nr(8)
14510 .kr(1)
14511 .sr(1)
14512 .m(1)
14513 .n(n)
14514 .k(k)
14515 .cn_stride(11)
14516 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14517 }
14518 }
14519 }
14520
14521 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
14522 TEST_REQUIRES_ARM_NEON_FMA;
14523 for (uint32_t n = 16; n <= 24; n += 8) {
14524 for (size_t k = 1; k <= 10; k += 3) {
14525 for (uint32_t m = 1; m <= 1; m++) {
14526 GemmMicrokernelTester()
14527 .mr(1)
14528 .nr(8)
14529 .kr(1)
14530 .sr(1)
14531 .m(m)
14532 .n(n)
14533 .k(k)
14534 .iterations(1)
14535 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14536 }
14537 }
14538 }
14539 }
14540
14541 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, small_kernel) {
14542 TEST_REQUIRES_ARM_NEON_FMA;
14543 for (size_t k = 1; k <= 10; k += 3) {
14544 GemmMicrokernelTester()
14545 .mr(1)
14546 .nr(8)
14547 .kr(1)
14548 .sr(1)
14549 .m(1)
14550 .n(8)
14551 .k(k)
14552 .ks(3)
14553 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14554 }
14555 }
14556
14557 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
14558 TEST_REQUIRES_ARM_NEON_FMA;
14559 for (size_t k = 1; k <= 10; k += 3) {
14560 for (uint32_t m = 1; m <= 1; m++) {
14561 for (uint32_t n = 1; n <= 8; n++) {
14562 GemmMicrokernelTester()
14563 .mr(1)
14564 .nr(8)
14565 .kr(1)
14566 .sr(1)
14567 .m(m)
14568 .n(n)
14569 .k(k)
14570 .ks(3)
14571 .iterations(1)
14572 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14573 }
14574 }
14575 }
14576 }
14577
14578 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
14579 TEST_REQUIRES_ARM_NEON_FMA;
14580 for (uint32_t n = 9; n < 16; n++) {
14581 for (size_t k = 1; k <= 10; k += 3) {
14582 GemmMicrokernelTester()
14583 .mr(1)
14584 .nr(8)
14585 .kr(1)
14586 .sr(1)
14587 .m(1)
14588 .n(8)
14589 .k(k)
14590 .ks(3)
14591 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14592 }
14593 }
14594 }
14595
14596 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
14597 TEST_REQUIRES_ARM_NEON_FMA;
14598 for (uint32_t n = 16; n <= 24; n += 8) {
14599 for (size_t k = 1; k <= 10; k += 3) {
14600 GemmMicrokernelTester()
14601 .mr(1)
14602 .nr(8)
14603 .kr(1)
14604 .sr(1)
14605 .m(1)
14606 .n(8)
14607 .k(k)
14608 .ks(3)
14609 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14610 }
14611 }
14612 }
14613
14614 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
14615 TEST_REQUIRES_ARM_NEON_FMA;
14616 for (size_t k = 1; k <= 10; k += 3) {
14617 for (uint32_t m = 1; m <= 1; m++) {
14618 for (uint32_t n = 1; n <= 8; n++) {
14619 GemmMicrokernelTester()
14620 .mr(1)
14621 .nr(8)
14622 .kr(1)
14623 .sr(1)
14624 .m(m)
14625 .n(n)
14626 .k(k)
14627 .cm_stride(11)
14628 .iterations(1)
14629 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14630 }
14631 }
14632 }
14633 }
14634
14635 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, a_offset) {
14636 TEST_REQUIRES_ARM_NEON_FMA;
14637 for (size_t k = 1; k <= 10; k += 3) {
14638 GemmMicrokernelTester()
14639 .mr(1)
14640 .nr(8)
14641 .kr(1)
14642 .sr(1)
14643 .m(1)
14644 .n(8)
14645 .k(k)
14646 .ks(3)
14647 .a_offset(13)
14648 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14649 }
14650 }
14651
14652 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, zero) {
14653 TEST_REQUIRES_ARM_NEON_FMA;
14654 for (uint32_t mz = 0; mz < 1; mz++) {
14655 for (size_t k = 1; k <= 10; k += 3) {
14656 GemmMicrokernelTester()
14657 .mr(1)
14658 .nr(8)
14659 .kr(1)
14660 .sr(1)
14661 .m(1)
14662 .n(8)
14663 .k(k)
14664 .ks(3)
14665 .a_offset(13)
14666 .zero_index(mz)
14667 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14668 }
14669 }
14670 }
14671
14672 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, qmin) {
14673 TEST_REQUIRES_ARM_NEON_FMA;
14674 GemmMicrokernelTester()
14675 .mr(1)
14676 .nr(8)
14677 .kr(1)
14678 .sr(1)
14679 .m(1)
14680 .n(8)
14681 .k(2)
14682 .qmin(128)
14683 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14684 }
14685
14686 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, qmax) {
14687 TEST_REQUIRES_ARM_NEON_FMA;
14688 GemmMicrokernelTester()
14689 .mr(1)
14690 .nr(8)
14691 .kr(1)
14692 .sr(1)
14693 .m(1)
14694 .n(8)
14695 .k(2)
14696 .qmax(128)
14697 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14698 }
14699
14700 TEST(F32_IGEMM_1X8__NEONFMA_LANE_LD64, strided_cm) {
14701 TEST_REQUIRES_ARM_NEON_FMA;
14702 GemmMicrokernelTester()
14703 .mr(1)
14704 .nr(8)
14705 .kr(1)
14706 .sr(1)
14707 .m(1)
14708 .n(8)
14709 .k(2)
14710 .cm_stride(11)
14711 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_lane_ld64);
14712 }
14713#endif // XNN_ARCH_ARM64
14714
14715
14716#if XNN_ARCH_ARM64
14717 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2) {
14718 TEST_REQUIRES_ARM_NEON_FMA;
14719 GemmMicrokernelTester()
14720 .mr(4)
14721 .nr(2)
14722 .kr(1)
14723 .sr(1)
14724 .m(4)
14725 .n(2)
14726 .k(2)
14727 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14728 }
14729
14730 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cn) {
14731 TEST_REQUIRES_ARM_NEON_FMA;
14732 GemmMicrokernelTester()
14733 .mr(4)
14734 .nr(2)
14735 .kr(1)
14736 .sr(1)
14737 .m(4)
14738 .n(2)
14739 .k(2)
14740 .cn_stride(5)
14741 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14742 }
14743
14744 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile) {
14745 TEST_REQUIRES_ARM_NEON_FMA;
14746 for (uint32_t m = 1; m <= 4; m++) {
14747 for (uint32_t n = 1; n <= 2; n++) {
14748 GemmMicrokernelTester()
14749 .mr(4)
14750 .nr(2)
14751 .kr(1)
14752 .sr(1)
14753 .m(m)
14754 .n(n)
14755 .k(2)
14756 .iterations(1)
14757 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14758 }
14759 }
14760 }
14761
14762 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
14763 TEST_REQUIRES_ARM_NEON_FMA;
14764 for (uint32_t m = 1; m <= 4; m++) {
14765 GemmMicrokernelTester()
14766 .mr(4)
14767 .nr(2)
14768 .kr(1)
14769 .sr(1)
14770 .m(m)
14771 .n(2)
14772 .k(2)
14773 .iterations(1)
14774 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14775 }
14776 }
14777
14778 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
14779 TEST_REQUIRES_ARM_NEON_FMA;
14780 for (uint32_t n = 1; n <= 2; n++) {
14781 GemmMicrokernelTester()
14782 .mr(4)
14783 .nr(2)
14784 .kr(1)
14785 .sr(1)
14786 .m(4)
14787 .n(n)
14788 .k(2)
14789 .iterations(1)
14790 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14791 }
14792 }
14793
14794 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_lt_2) {
14795 TEST_REQUIRES_ARM_NEON_FMA;
14796 for (size_t k = 1; k < 2; k++) {
14797 GemmMicrokernelTester()
14798 .mr(4)
14799 .nr(2)
14800 .kr(1)
14801 .sr(1)
14802 .m(4)
14803 .n(2)
14804 .k(k)
14805 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14806 }
14807 }
14808
14809 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_lt_2_subtile) {
14810 TEST_REQUIRES_ARM_NEON_FMA;
14811 for (size_t k = 1; k < 2; k++) {
14812 for (uint32_t m = 1; m <= 4; m++) {
14813 for (uint32_t n = 1; n <= 2; n++) {
14814 GemmMicrokernelTester()
14815 .mr(4)
14816 .nr(2)
14817 .kr(1)
14818 .sr(1)
14819 .m(m)
14820 .n(n)
14821 .k(k)
14822 .iterations(1)
14823 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14824 }
14825 }
14826 }
14827 }
14828
14829 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_gt_2) {
14830 TEST_REQUIRES_ARM_NEON_FMA;
14831 for (size_t k = 3; k < 4; k++) {
14832 GemmMicrokernelTester()
14833 .mr(4)
14834 .nr(2)
14835 .kr(1)
14836 .sr(1)
14837 .m(4)
14838 .n(2)
14839 .k(k)
14840 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14841 }
14842 }
14843
14844 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_gt_2_subtile) {
14845 TEST_REQUIRES_ARM_NEON_FMA;
14846 for (size_t k = 3; k < 4; k++) {
14847 for (uint32_t m = 1; m <= 4; m++) {
14848 for (uint32_t n = 1; n <= 2; n++) {
14849 GemmMicrokernelTester()
14850 .mr(4)
14851 .nr(2)
14852 .kr(1)
14853 .sr(1)
14854 .m(m)
14855 .n(n)
14856 .k(k)
14857 .iterations(1)
14858 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14859 }
14860 }
14861 }
14862 }
14863
14864 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_div_2) {
14865 TEST_REQUIRES_ARM_NEON_FMA;
14866 for (size_t k = 4; k <= 20; k += 2) {
14867 GemmMicrokernelTester()
14868 .mr(4)
14869 .nr(2)
14870 .kr(1)
14871 .sr(1)
14872 .m(4)
14873 .n(2)
14874 .k(k)
14875 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14876 }
14877 }
14878
14879 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, k_div_2_subtile) {
14880 TEST_REQUIRES_ARM_NEON_FMA;
14881 for (size_t k = 4; k <= 20; k += 2) {
14882 for (uint32_t m = 1; m <= 4; m++) {
14883 for (uint32_t n = 1; n <= 2; n++) {
14884 GemmMicrokernelTester()
14885 .mr(4)
14886 .nr(2)
14887 .kr(1)
14888 .sr(1)
14889 .m(m)
14890 .n(n)
14891 .k(k)
14892 .iterations(1)
14893 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14894 }
14895 }
14896 }
14897 }
14898
14899 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2) {
14900 TEST_REQUIRES_ARM_NEON_FMA;
14901 for (uint32_t n = 3; n < 4; n++) {
14902 for (size_t k = 1; k <= 10; k += 3) {
14903 GemmMicrokernelTester()
14904 .mr(4)
14905 .nr(2)
14906 .kr(1)
14907 .sr(1)
14908 .m(4)
14909 .n(2)
14910 .k(k)
14911 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14912 }
14913 }
14914 }
14915
14916 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_strided_cn) {
14917 TEST_REQUIRES_ARM_NEON_FMA;
14918 for (uint32_t n = 3; n < 4; n++) {
14919 for (size_t k = 1; k <= 10; k += 3) {
14920 GemmMicrokernelTester()
14921 .mr(4)
14922 .nr(2)
14923 .kr(1)
14924 .sr(1)
14925 .m(4)
14926 .n(2)
14927 .k(k)
14928 .cn_stride(5)
14929 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14930 }
14931 }
14932 }
14933
14934 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_subtile) {
14935 TEST_REQUIRES_ARM_NEON_FMA;
14936 for (uint32_t n = 3; n < 4; n++) {
14937 for (size_t k = 1; k <= 10; k += 3) {
14938 for (uint32_t m = 1; m <= 4; m++) {
14939 GemmMicrokernelTester()
14940 .mr(4)
14941 .nr(2)
14942 .kr(1)
14943 .sr(1)
14944 .m(m)
14945 .n(n)
14946 .k(k)
14947 .iterations(1)
14948 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14949 }
14950 }
14951 }
14952 }
14953
14954 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2) {
14955 TEST_REQUIRES_ARM_NEON_FMA;
14956 for (uint32_t n = 4; n <= 6; n += 2) {
14957 for (size_t k = 1; k <= 10; k += 3) {
14958 GemmMicrokernelTester()
14959 .mr(4)
14960 .nr(2)
14961 .kr(1)
14962 .sr(1)
14963 .m(4)
14964 .n(2)
14965 .k(k)
14966 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14967 }
14968 }
14969 }
14970
14971 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_strided_cn) {
14972 TEST_REQUIRES_ARM_NEON_FMA;
14973 for (uint32_t n = 4; n <= 6; n += 2) {
14974 for (size_t k = 1; k <= 10; k += 3) {
14975 GemmMicrokernelTester()
14976 .mr(4)
14977 .nr(2)
14978 .kr(1)
14979 .sr(1)
14980 .m(4)
14981 .n(n)
14982 .k(k)
14983 .cn_stride(5)
14984 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
14985 }
14986 }
14987 }
14988
14989 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_subtile) {
14990 TEST_REQUIRES_ARM_NEON_FMA;
14991 for (uint32_t n = 4; n <= 6; n += 2) {
14992 for (size_t k = 1; k <= 10; k += 3) {
14993 for (uint32_t m = 1; m <= 4; m++) {
14994 GemmMicrokernelTester()
14995 .mr(4)
14996 .nr(2)
14997 .kr(1)
14998 .sr(1)
14999 .m(m)
15000 .n(n)
15001 .k(k)
15002 .iterations(1)
15003 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15004 }
15005 }
15006 }
15007 }
15008
15009 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, small_kernel) {
15010 TEST_REQUIRES_ARM_NEON_FMA;
15011 for (size_t k = 1; k <= 10; k += 3) {
15012 GemmMicrokernelTester()
15013 .mr(4)
15014 .nr(2)
15015 .kr(1)
15016 .sr(1)
15017 .m(4)
15018 .n(2)
15019 .k(k)
15020 .ks(3)
15021 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15022 }
15023 }
15024
15025 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, small_kernel_subtile) {
15026 TEST_REQUIRES_ARM_NEON_FMA;
15027 for (size_t k = 1; k <= 10; k += 3) {
15028 for (uint32_t m = 1; m <= 4; m++) {
15029 for (uint32_t n = 1; n <= 2; n++) {
15030 GemmMicrokernelTester()
15031 .mr(4)
15032 .nr(2)
15033 .kr(1)
15034 .sr(1)
15035 .m(m)
15036 .n(n)
15037 .k(k)
15038 .ks(3)
15039 .iterations(1)
15040 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15041 }
15042 }
15043 }
15044 }
15045
15046 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_gt_2_small_kernel) {
15047 TEST_REQUIRES_ARM_NEON_FMA;
15048 for (uint32_t n = 3; n < 4; n++) {
15049 for (size_t k = 1; k <= 10; k += 3) {
15050 GemmMicrokernelTester()
15051 .mr(4)
15052 .nr(2)
15053 .kr(1)
15054 .sr(1)
15055 .m(4)
15056 .n(2)
15057 .k(k)
15058 .ks(3)
15059 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15060 }
15061 }
15062 }
15063
15064 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, n_div_2_small_kernel) {
15065 TEST_REQUIRES_ARM_NEON_FMA;
15066 for (uint32_t n = 4; n <= 6; n += 2) {
15067 for (size_t k = 1; k <= 10; k += 3) {
15068 GemmMicrokernelTester()
15069 .mr(4)
15070 .nr(2)
15071 .kr(1)
15072 .sr(1)
15073 .m(4)
15074 .n(2)
15075 .k(k)
15076 .ks(3)
15077 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15078 }
15079 }
15080 }
15081
15082 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cm_subtile) {
15083 TEST_REQUIRES_ARM_NEON_FMA;
15084 for (size_t k = 1; k <= 10; k += 3) {
15085 for (uint32_t m = 1; m <= 4; m++) {
15086 for (uint32_t n = 1; n <= 2; n++) {
15087 GemmMicrokernelTester()
15088 .mr(4)
15089 .nr(2)
15090 .kr(1)
15091 .sr(1)
15092 .m(m)
15093 .n(n)
15094 .k(k)
15095 .cm_stride(5)
15096 .iterations(1)
15097 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15098 }
15099 }
15100 }
15101 }
15102
15103 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, a_offset) {
15104 TEST_REQUIRES_ARM_NEON_FMA;
15105 for (size_t k = 1; k <= 10; k += 3) {
15106 GemmMicrokernelTester()
15107 .mr(4)
15108 .nr(2)
15109 .kr(1)
15110 .sr(1)
15111 .m(4)
15112 .n(2)
15113 .k(k)
15114 .ks(3)
15115 .a_offset(43)
15116 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15117 }
15118 }
15119
15120 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, zero) {
15121 TEST_REQUIRES_ARM_NEON_FMA;
15122 for (uint32_t mz = 0; mz < 4; mz++) {
15123 for (size_t k = 1; k <= 10; k += 3) {
15124 GemmMicrokernelTester()
15125 .mr(4)
15126 .nr(2)
15127 .kr(1)
15128 .sr(1)
15129 .m(4)
15130 .n(2)
15131 .k(k)
15132 .ks(3)
15133 .a_offset(43)
15134 .zero_index(mz)
15135 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15136 }
15137 }
15138 }
15139
15140 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, qmin) {
15141 TEST_REQUIRES_ARM_NEON_FMA;
15142 GemmMicrokernelTester()
15143 .mr(4)
15144 .nr(2)
15145 .kr(1)
15146 .sr(1)
15147 .m(4)
15148 .n(2)
15149 .k(2)
15150 .qmin(128)
15151 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15152 }
15153
15154 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, qmax) {
15155 TEST_REQUIRES_ARM_NEON_FMA;
15156 GemmMicrokernelTester()
15157 .mr(4)
15158 .nr(2)
15159 .kr(1)
15160 .sr(1)
15161 .m(4)
15162 .n(2)
15163 .k(2)
15164 .qmax(128)
15165 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15166 }
15167
15168 TEST(F32_IGEMM_4X2__NEONFMA_LANE_LD64, strided_cm) {
15169 TEST_REQUIRES_ARM_NEON_FMA;
15170 GemmMicrokernelTester()
15171 .mr(4)
15172 .nr(2)
15173 .kr(1)
15174 .sr(1)
15175 .m(4)
15176 .n(2)
15177 .k(2)
15178 .cm_stride(5)
15179 .Test(xnn_f32_igemm_ukernel_4x2__neonfma_lane_ld64);
15180 }
15181#endif // XNN_ARCH_ARM64
15182
15183
15184#if XNN_ARCH_ARM64
15185 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2) {
15186 TEST_REQUIRES_ARM_NEON_FMA;
15187 GemmMicrokernelTester()
15188 .mr(4)
15189 .nr(4)
15190 .kr(1)
15191 .sr(1)
15192 .m(4)
15193 .n(4)
15194 .k(2)
15195 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15196 }
15197
15198 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cn) {
15199 TEST_REQUIRES_ARM_NEON_FMA;
15200 GemmMicrokernelTester()
15201 .mr(4)
15202 .nr(4)
15203 .kr(1)
15204 .sr(1)
15205 .m(4)
15206 .n(4)
15207 .k(2)
15208 .cn_stride(7)
15209 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15210 }
15211
15212 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile) {
15213 TEST_REQUIRES_ARM_NEON_FMA;
15214 for (uint32_t m = 1; m <= 4; m++) {
15215 for (uint32_t n = 1; n <= 4; n++) {
15216 GemmMicrokernelTester()
15217 .mr(4)
15218 .nr(4)
15219 .kr(1)
15220 .sr(1)
15221 .m(m)
15222 .n(n)
15223 .k(2)
15224 .iterations(1)
15225 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15226 }
15227 }
15228 }
15229
15230 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
15231 TEST_REQUIRES_ARM_NEON_FMA;
15232 for (uint32_t m = 1; m <= 4; m++) {
15233 GemmMicrokernelTester()
15234 .mr(4)
15235 .nr(4)
15236 .kr(1)
15237 .sr(1)
15238 .m(m)
15239 .n(4)
15240 .k(2)
15241 .iterations(1)
15242 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15243 }
15244 }
15245
15246 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
15247 TEST_REQUIRES_ARM_NEON_FMA;
15248 for (uint32_t n = 1; n <= 4; n++) {
15249 GemmMicrokernelTester()
15250 .mr(4)
15251 .nr(4)
15252 .kr(1)
15253 .sr(1)
15254 .m(4)
15255 .n(n)
15256 .k(2)
15257 .iterations(1)
15258 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15259 }
15260 }
15261
15262 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_lt_2) {
15263 TEST_REQUIRES_ARM_NEON_FMA;
15264 for (size_t k = 1; k < 2; k++) {
15265 GemmMicrokernelTester()
15266 .mr(4)
15267 .nr(4)
15268 .kr(1)
15269 .sr(1)
15270 .m(4)
15271 .n(4)
15272 .k(k)
15273 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15274 }
15275 }
15276
15277 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_lt_2_subtile) {
15278 TEST_REQUIRES_ARM_NEON_FMA;
15279 for (size_t k = 1; k < 2; k++) {
15280 for (uint32_t m = 1; m <= 4; m++) {
15281 for (uint32_t n = 1; n <= 4; n++) {
15282 GemmMicrokernelTester()
15283 .mr(4)
15284 .nr(4)
15285 .kr(1)
15286 .sr(1)
15287 .m(m)
15288 .n(n)
15289 .k(k)
15290 .iterations(1)
15291 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15292 }
15293 }
15294 }
15295 }
15296
15297 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_gt_2) {
15298 TEST_REQUIRES_ARM_NEON_FMA;
15299 for (size_t k = 3; k < 4; k++) {
15300 GemmMicrokernelTester()
15301 .mr(4)
15302 .nr(4)
15303 .kr(1)
15304 .sr(1)
15305 .m(4)
15306 .n(4)
15307 .k(k)
15308 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15309 }
15310 }
15311
15312 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_gt_2_subtile) {
15313 TEST_REQUIRES_ARM_NEON_FMA;
15314 for (size_t k = 3; k < 4; k++) {
15315 for (uint32_t m = 1; m <= 4; m++) {
15316 for (uint32_t n = 1; n <= 4; n++) {
15317 GemmMicrokernelTester()
15318 .mr(4)
15319 .nr(4)
15320 .kr(1)
15321 .sr(1)
15322 .m(m)
15323 .n(n)
15324 .k(k)
15325 .iterations(1)
15326 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15327 }
15328 }
15329 }
15330 }
15331
15332 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_div_2) {
15333 TEST_REQUIRES_ARM_NEON_FMA;
15334 for (size_t k = 4; k <= 20; k += 2) {
15335 GemmMicrokernelTester()
15336 .mr(4)
15337 .nr(4)
15338 .kr(1)
15339 .sr(1)
15340 .m(4)
15341 .n(4)
15342 .k(k)
15343 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15344 }
15345 }
15346
15347 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, k_div_2_subtile) {
15348 TEST_REQUIRES_ARM_NEON_FMA;
15349 for (size_t k = 4; k <= 20; k += 2) {
15350 for (uint32_t m = 1; m <= 4; m++) {
15351 for (uint32_t n = 1; n <= 4; n++) {
15352 GemmMicrokernelTester()
15353 .mr(4)
15354 .nr(4)
15355 .kr(1)
15356 .sr(1)
15357 .m(m)
15358 .n(n)
15359 .k(k)
15360 .iterations(1)
15361 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15362 }
15363 }
15364 }
15365 }
15366
15367 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4) {
15368 TEST_REQUIRES_ARM_NEON_FMA;
15369 for (uint32_t n = 5; n < 8; n++) {
15370 for (size_t k = 1; k <= 10; k += 3) {
15371 GemmMicrokernelTester()
15372 .mr(4)
15373 .nr(4)
15374 .kr(1)
15375 .sr(1)
15376 .m(4)
15377 .n(4)
15378 .k(k)
15379 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15380 }
15381 }
15382 }
15383
15384 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_strided_cn) {
15385 TEST_REQUIRES_ARM_NEON_FMA;
15386 for (uint32_t n = 5; n < 8; n++) {
15387 for (size_t k = 1; k <= 10; k += 3) {
15388 GemmMicrokernelTester()
15389 .mr(4)
15390 .nr(4)
15391 .kr(1)
15392 .sr(1)
15393 .m(4)
15394 .n(4)
15395 .k(k)
15396 .cn_stride(7)
15397 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15398 }
15399 }
15400 }
15401
15402 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_subtile) {
15403 TEST_REQUIRES_ARM_NEON_FMA;
15404 for (uint32_t n = 5; n < 8; n++) {
15405 for (size_t k = 1; k <= 10; k += 3) {
15406 for (uint32_t m = 1; m <= 4; m++) {
15407 GemmMicrokernelTester()
15408 .mr(4)
15409 .nr(4)
15410 .kr(1)
15411 .sr(1)
15412 .m(m)
15413 .n(n)
15414 .k(k)
15415 .iterations(1)
15416 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15417 }
15418 }
15419 }
15420 }
15421
15422 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4) {
15423 TEST_REQUIRES_ARM_NEON_FMA;
15424 for (uint32_t n = 8; n <= 12; n += 4) {
15425 for (size_t k = 1; k <= 10; k += 3) {
15426 GemmMicrokernelTester()
15427 .mr(4)
15428 .nr(4)
15429 .kr(1)
15430 .sr(1)
15431 .m(4)
15432 .n(4)
15433 .k(k)
15434 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15435 }
15436 }
15437 }
15438
15439 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_strided_cn) {
15440 TEST_REQUIRES_ARM_NEON_FMA;
15441 for (uint32_t n = 8; n <= 12; n += 4) {
15442 for (size_t k = 1; k <= 10; k += 3) {
15443 GemmMicrokernelTester()
15444 .mr(4)
15445 .nr(4)
15446 .kr(1)
15447 .sr(1)
15448 .m(4)
15449 .n(n)
15450 .k(k)
15451 .cn_stride(7)
15452 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15453 }
15454 }
15455 }
15456
15457 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_subtile) {
15458 TEST_REQUIRES_ARM_NEON_FMA;
15459 for (uint32_t n = 8; n <= 12; n += 4) {
15460 for (size_t k = 1; k <= 10; k += 3) {
15461 for (uint32_t m = 1; m <= 4; m++) {
15462 GemmMicrokernelTester()
15463 .mr(4)
15464 .nr(4)
15465 .kr(1)
15466 .sr(1)
15467 .m(m)
15468 .n(n)
15469 .k(k)
15470 .iterations(1)
15471 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15472 }
15473 }
15474 }
15475 }
15476
15477 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, small_kernel) {
15478 TEST_REQUIRES_ARM_NEON_FMA;
15479 for (size_t k = 1; k <= 10; k += 3) {
15480 GemmMicrokernelTester()
15481 .mr(4)
15482 .nr(4)
15483 .kr(1)
15484 .sr(1)
15485 .m(4)
15486 .n(4)
15487 .k(k)
15488 .ks(3)
15489 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15490 }
15491 }
15492
15493 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, small_kernel_subtile) {
15494 TEST_REQUIRES_ARM_NEON_FMA;
15495 for (size_t k = 1; k <= 10; k += 3) {
15496 for (uint32_t m = 1; m <= 4; m++) {
15497 for (uint32_t n = 1; n <= 4; n++) {
15498 GemmMicrokernelTester()
15499 .mr(4)
15500 .nr(4)
15501 .kr(1)
15502 .sr(1)
15503 .m(m)
15504 .n(n)
15505 .k(k)
15506 .ks(3)
15507 .iterations(1)
15508 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15509 }
15510 }
15511 }
15512 }
15513
15514 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_gt_4_small_kernel) {
15515 TEST_REQUIRES_ARM_NEON_FMA;
15516 for (uint32_t n = 5; n < 8; n++) {
15517 for (size_t k = 1; k <= 10; k += 3) {
15518 GemmMicrokernelTester()
15519 .mr(4)
15520 .nr(4)
15521 .kr(1)
15522 .sr(1)
15523 .m(4)
15524 .n(4)
15525 .k(k)
15526 .ks(3)
15527 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15528 }
15529 }
15530 }
15531
15532 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, n_div_4_small_kernel) {
15533 TEST_REQUIRES_ARM_NEON_FMA;
15534 for (uint32_t n = 8; n <= 12; n += 4) {
15535 for (size_t k = 1; k <= 10; k += 3) {
15536 GemmMicrokernelTester()
15537 .mr(4)
15538 .nr(4)
15539 .kr(1)
15540 .sr(1)
15541 .m(4)
15542 .n(4)
15543 .k(k)
15544 .ks(3)
15545 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15546 }
15547 }
15548 }
15549
15550 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cm_subtile) {
15551 TEST_REQUIRES_ARM_NEON_FMA;
15552 for (size_t k = 1; k <= 10; k += 3) {
15553 for (uint32_t m = 1; m <= 4; m++) {
15554 for (uint32_t n = 1; n <= 4; n++) {
15555 GemmMicrokernelTester()
15556 .mr(4)
15557 .nr(4)
15558 .kr(1)
15559 .sr(1)
15560 .m(m)
15561 .n(n)
15562 .k(k)
15563 .cm_stride(7)
15564 .iterations(1)
15565 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15566 }
15567 }
15568 }
15569 }
15570
15571 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, a_offset) {
15572 TEST_REQUIRES_ARM_NEON_FMA;
15573 for (size_t k = 1; k <= 10; k += 3) {
15574 GemmMicrokernelTester()
15575 .mr(4)
15576 .nr(4)
15577 .kr(1)
15578 .sr(1)
15579 .m(4)
15580 .n(4)
15581 .k(k)
15582 .ks(3)
15583 .a_offset(43)
15584 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15585 }
15586 }
15587
15588 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, zero) {
15589 TEST_REQUIRES_ARM_NEON_FMA;
15590 for (uint32_t mz = 0; mz < 4; mz++) {
15591 for (size_t k = 1; k <= 10; k += 3) {
15592 GemmMicrokernelTester()
15593 .mr(4)
15594 .nr(4)
15595 .kr(1)
15596 .sr(1)
15597 .m(4)
15598 .n(4)
15599 .k(k)
15600 .ks(3)
15601 .a_offset(43)
15602 .zero_index(mz)
15603 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15604 }
15605 }
15606 }
15607
15608 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, qmin) {
15609 TEST_REQUIRES_ARM_NEON_FMA;
15610 GemmMicrokernelTester()
15611 .mr(4)
15612 .nr(4)
15613 .kr(1)
15614 .sr(1)
15615 .m(4)
15616 .n(4)
15617 .k(2)
15618 .qmin(128)
15619 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15620 }
15621
15622 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, qmax) {
15623 TEST_REQUIRES_ARM_NEON_FMA;
15624 GemmMicrokernelTester()
15625 .mr(4)
15626 .nr(4)
15627 .kr(1)
15628 .sr(1)
15629 .m(4)
15630 .n(4)
15631 .k(2)
15632 .qmax(128)
15633 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15634 }
15635
15636 TEST(F32_IGEMM_4X4__NEONFMA_LANE_LD64, strided_cm) {
15637 TEST_REQUIRES_ARM_NEON_FMA;
15638 GemmMicrokernelTester()
15639 .mr(4)
15640 .nr(4)
15641 .kr(1)
15642 .sr(1)
15643 .m(4)
15644 .n(4)
15645 .k(2)
15646 .cm_stride(7)
15647 .Test(xnn_f32_igemm_ukernel_4x4__neonfma_lane_ld64);
15648 }
15649#endif // XNN_ARCH_ARM64
15650
15651
15652#if XNN_ARCH_ARM64
15653 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4) {
15654 TEST_REQUIRES_ARM_NEON_FMA;
15655 GemmMicrokernelTester()
15656 .mr(4)
15657 .nr(8)
15658 .kr(1)
15659 .sr(1)
15660 .m(4)
15661 .n(8)
15662 .k(4)
15663 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15664 }
15665
15666 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cn) {
15667 TEST_REQUIRES_ARM_NEON_FMA;
15668 GemmMicrokernelTester()
15669 .mr(4)
15670 .nr(8)
15671 .kr(1)
15672 .sr(1)
15673 .m(4)
15674 .n(8)
15675 .k(4)
15676 .cn_stride(11)
15677 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15678 }
15679
15680 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
15681 TEST_REQUIRES_ARM_NEON_FMA;
15682 for (uint32_t m = 1; m <= 4; m++) {
15683 for (uint32_t n = 1; n <= 8; n++) {
15684 GemmMicrokernelTester()
15685 .mr(4)
15686 .nr(8)
15687 .kr(1)
15688 .sr(1)
15689 .m(m)
15690 .n(n)
15691 .k(4)
15692 .iterations(1)
15693 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15694 }
15695 }
15696 }
15697
15698 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
15699 TEST_REQUIRES_ARM_NEON_FMA;
15700 for (uint32_t m = 1; m <= 4; m++) {
15701 GemmMicrokernelTester()
15702 .mr(4)
15703 .nr(8)
15704 .kr(1)
15705 .sr(1)
15706 .m(m)
15707 .n(8)
15708 .k(4)
15709 .iterations(1)
15710 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15711 }
15712 }
15713
15714 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
15715 TEST_REQUIRES_ARM_NEON_FMA;
15716 for (uint32_t n = 1; n <= 8; n++) {
15717 GemmMicrokernelTester()
15718 .mr(4)
15719 .nr(8)
15720 .kr(1)
15721 .sr(1)
15722 .m(4)
15723 .n(n)
15724 .k(4)
15725 .iterations(1)
15726 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15727 }
15728 }
15729
15730 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_lt_4) {
15731 TEST_REQUIRES_ARM_NEON_FMA;
15732 for (size_t k = 1; k < 4; k++) {
15733 GemmMicrokernelTester()
15734 .mr(4)
15735 .nr(8)
15736 .kr(1)
15737 .sr(1)
15738 .m(4)
15739 .n(8)
15740 .k(k)
15741 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15742 }
15743 }
15744
15745 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
15746 TEST_REQUIRES_ARM_NEON_FMA;
15747 for (size_t k = 1; k < 4; k++) {
15748 for (uint32_t m = 1; m <= 4; m++) {
15749 for (uint32_t n = 1; n <= 8; n++) {
15750 GemmMicrokernelTester()
15751 .mr(4)
15752 .nr(8)
15753 .kr(1)
15754 .sr(1)
15755 .m(m)
15756 .n(n)
15757 .k(k)
15758 .iterations(1)
15759 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15760 }
15761 }
15762 }
15763 }
15764
15765 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_gt_4) {
15766 TEST_REQUIRES_ARM_NEON_FMA;
15767 for (size_t k = 5; k < 8; k++) {
15768 GemmMicrokernelTester()
15769 .mr(4)
15770 .nr(8)
15771 .kr(1)
15772 .sr(1)
15773 .m(4)
15774 .n(8)
15775 .k(k)
15776 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15777 }
15778 }
15779
15780 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
15781 TEST_REQUIRES_ARM_NEON_FMA;
15782 for (size_t k = 5; k < 8; k++) {
15783 for (uint32_t m = 1; m <= 4; m++) {
15784 for (uint32_t n = 1; n <= 8; n++) {
15785 GemmMicrokernelTester()
15786 .mr(4)
15787 .nr(8)
15788 .kr(1)
15789 .sr(1)
15790 .m(m)
15791 .n(n)
15792 .k(k)
15793 .iterations(1)
15794 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15795 }
15796 }
15797 }
15798 }
15799
15800 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_div_4) {
15801 TEST_REQUIRES_ARM_NEON_FMA;
15802 for (size_t k = 8; k <= 40; k += 4) {
15803 GemmMicrokernelTester()
15804 .mr(4)
15805 .nr(8)
15806 .kr(1)
15807 .sr(1)
15808 .m(4)
15809 .n(8)
15810 .k(k)
15811 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15812 }
15813 }
15814
15815 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
15816 TEST_REQUIRES_ARM_NEON_FMA;
15817 for (size_t k = 8; k <= 40; k += 4) {
15818 for (uint32_t m = 1; m <= 4; m++) {
15819 for (uint32_t n = 1; n <= 8; n++) {
15820 GemmMicrokernelTester()
15821 .mr(4)
15822 .nr(8)
15823 .kr(1)
15824 .sr(1)
15825 .m(m)
15826 .n(n)
15827 .k(k)
15828 .iterations(1)
15829 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15830 }
15831 }
15832 }
15833 }
15834
15835 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8) {
15836 TEST_REQUIRES_ARM_NEON_FMA;
15837 for (uint32_t n = 9; n < 16; n++) {
15838 for (size_t k = 1; k <= 20; k += 5) {
15839 GemmMicrokernelTester()
15840 .mr(4)
15841 .nr(8)
15842 .kr(1)
15843 .sr(1)
15844 .m(4)
15845 .n(8)
15846 .k(k)
15847 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15848 }
15849 }
15850 }
15851
15852 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
15853 TEST_REQUIRES_ARM_NEON_FMA;
15854 for (uint32_t n = 9; n < 16; n++) {
15855 for (size_t k = 1; k <= 20; k += 5) {
15856 GemmMicrokernelTester()
15857 .mr(4)
15858 .nr(8)
15859 .kr(1)
15860 .sr(1)
15861 .m(4)
15862 .n(8)
15863 .k(k)
15864 .cn_stride(11)
15865 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15866 }
15867 }
15868 }
15869
15870 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
15871 TEST_REQUIRES_ARM_NEON_FMA;
15872 for (uint32_t n = 9; n < 16; n++) {
15873 for (size_t k = 1; k <= 20; k += 5) {
15874 for (uint32_t m = 1; m <= 4; m++) {
15875 GemmMicrokernelTester()
15876 .mr(4)
15877 .nr(8)
15878 .kr(1)
15879 .sr(1)
15880 .m(m)
15881 .n(n)
15882 .k(k)
15883 .iterations(1)
15884 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15885 }
15886 }
15887 }
15888 }
15889
15890 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8) {
15891 TEST_REQUIRES_ARM_NEON_FMA;
15892 for (uint32_t n = 16; n <= 24; n += 8) {
15893 for (size_t k = 1; k <= 20; k += 5) {
15894 GemmMicrokernelTester()
15895 .mr(4)
15896 .nr(8)
15897 .kr(1)
15898 .sr(1)
15899 .m(4)
15900 .n(8)
15901 .k(k)
15902 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15903 }
15904 }
15905 }
15906
15907 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
15908 TEST_REQUIRES_ARM_NEON_FMA;
15909 for (uint32_t n = 16; n <= 24; n += 8) {
15910 for (size_t k = 1; k <= 20; k += 5) {
15911 GemmMicrokernelTester()
15912 .mr(4)
15913 .nr(8)
15914 .kr(1)
15915 .sr(1)
15916 .m(4)
15917 .n(n)
15918 .k(k)
15919 .cn_stride(11)
15920 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15921 }
15922 }
15923 }
15924
15925 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
15926 TEST_REQUIRES_ARM_NEON_FMA;
15927 for (uint32_t n = 16; n <= 24; n += 8) {
15928 for (size_t k = 1; k <= 20; k += 5) {
15929 for (uint32_t m = 1; m <= 4; m++) {
15930 GemmMicrokernelTester()
15931 .mr(4)
15932 .nr(8)
15933 .kr(1)
15934 .sr(1)
15935 .m(m)
15936 .n(n)
15937 .k(k)
15938 .iterations(1)
15939 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15940 }
15941 }
15942 }
15943 }
15944
15945 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, small_kernel) {
15946 TEST_REQUIRES_ARM_NEON_FMA;
15947 for (size_t k = 1; k <= 20; k += 5) {
15948 GemmMicrokernelTester()
15949 .mr(4)
15950 .nr(8)
15951 .kr(1)
15952 .sr(1)
15953 .m(4)
15954 .n(8)
15955 .k(k)
15956 .ks(3)
15957 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15958 }
15959 }
15960
15961 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, small_kernel_subtile) {
15962 TEST_REQUIRES_ARM_NEON_FMA;
15963 for (size_t k = 1; k <= 20; k += 5) {
15964 for (uint32_t m = 1; m <= 4; m++) {
15965 for (uint32_t n = 1; n <= 8; n++) {
15966 GemmMicrokernelTester()
15967 .mr(4)
15968 .nr(8)
15969 .kr(1)
15970 .sr(1)
15971 .m(m)
15972 .n(n)
15973 .k(k)
15974 .ks(3)
15975 .iterations(1)
15976 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15977 }
15978 }
15979 }
15980 }
15981
15982 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_small_kernel) {
15983 TEST_REQUIRES_ARM_NEON_FMA;
15984 for (uint32_t n = 9; n < 16; n++) {
15985 for (size_t k = 1; k <= 20; k += 5) {
15986 GemmMicrokernelTester()
15987 .mr(4)
15988 .nr(8)
15989 .kr(1)
15990 .sr(1)
15991 .m(4)
15992 .n(8)
15993 .k(k)
15994 .ks(3)
15995 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
15996 }
15997 }
15998 }
15999
16000 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, n_div_8_small_kernel) {
16001 TEST_REQUIRES_ARM_NEON_FMA;
16002 for (uint32_t n = 16; n <= 24; n += 8) {
16003 for (size_t k = 1; k <= 20; k += 5) {
16004 GemmMicrokernelTester()
16005 .mr(4)
16006 .nr(8)
16007 .kr(1)
16008 .sr(1)
16009 .m(4)
16010 .n(8)
16011 .k(k)
16012 .ks(3)
16013 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16014 }
16015 }
16016 }
16017
16018 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
16019 TEST_REQUIRES_ARM_NEON_FMA;
16020 for (size_t k = 1; k <= 20; k += 5) {
16021 for (uint32_t m = 1; m <= 4; m++) {
16022 for (uint32_t n = 1; n <= 8; n++) {
16023 GemmMicrokernelTester()
16024 .mr(4)
16025 .nr(8)
16026 .kr(1)
16027 .sr(1)
16028 .m(m)
16029 .n(n)
16030 .k(k)
16031 .cm_stride(11)
16032 .iterations(1)
16033 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16034 }
16035 }
16036 }
16037 }
16038
16039 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, a_offset) {
16040 TEST_REQUIRES_ARM_NEON_FMA;
16041 for (size_t k = 1; k <= 20; k += 5) {
16042 GemmMicrokernelTester()
16043 .mr(4)
16044 .nr(8)
16045 .kr(1)
16046 .sr(1)
16047 .m(4)
16048 .n(8)
16049 .k(k)
16050 .ks(3)
16051 .a_offset(83)
16052 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16053 }
16054 }
16055
16056 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, zero) {
16057 TEST_REQUIRES_ARM_NEON_FMA;
16058 for (uint32_t mz = 0; mz < 4; mz++) {
16059 for (size_t k = 1; k <= 20; k += 5) {
16060 GemmMicrokernelTester()
16061 .mr(4)
16062 .nr(8)
16063 .kr(1)
16064 .sr(1)
16065 .m(4)
16066 .n(8)
16067 .k(k)
16068 .ks(3)
16069 .a_offset(83)
16070 .zero_index(mz)
16071 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16072 }
16073 }
16074 }
16075
16076 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, qmin) {
16077 TEST_REQUIRES_ARM_NEON_FMA;
16078 GemmMicrokernelTester()
16079 .mr(4)
16080 .nr(8)
16081 .kr(1)
16082 .sr(1)
16083 .m(4)
16084 .n(8)
16085 .k(4)
16086 .qmin(128)
16087 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16088 }
16089
16090 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, qmax) {
16091 TEST_REQUIRES_ARM_NEON_FMA;
16092 GemmMicrokernelTester()
16093 .mr(4)
16094 .nr(8)
16095 .kr(1)
16096 .sr(1)
16097 .m(4)
16098 .n(8)
16099 .k(4)
16100 .qmax(128)
16101 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16102 }
16103
16104 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD128, strided_cm) {
16105 TEST_REQUIRES_ARM_NEON_FMA;
16106 GemmMicrokernelTester()
16107 .mr(4)
16108 .nr(8)
16109 .kr(1)
16110 .sr(1)
16111 .m(4)
16112 .n(8)
16113 .k(4)
16114 .cm_stride(11)
16115 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld128);
16116 }
16117#endif // XNN_ARCH_ARM64
16118
16119
16120#if XNN_ARCH_ARM64
16121 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2) {
16122 TEST_REQUIRES_ARM_NEON_FMA;
16123 GemmMicrokernelTester()
16124 .mr(4)
16125 .nr(8)
16126 .kr(1)
16127 .sr(1)
16128 .m(4)
16129 .n(8)
16130 .k(2)
16131 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16132 }
16133
16134 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cn) {
16135 TEST_REQUIRES_ARM_NEON_FMA;
16136 GemmMicrokernelTester()
16137 .mr(4)
16138 .nr(8)
16139 .kr(1)
16140 .sr(1)
16141 .m(4)
16142 .n(8)
16143 .k(2)
16144 .cn_stride(11)
16145 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16146 }
16147
16148 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
16149 TEST_REQUIRES_ARM_NEON_FMA;
16150 for (uint32_t m = 1; m <= 4; m++) {
16151 for (uint32_t n = 1; n <= 8; n++) {
16152 GemmMicrokernelTester()
16153 .mr(4)
16154 .nr(8)
16155 .kr(1)
16156 .sr(1)
16157 .m(m)
16158 .n(n)
16159 .k(2)
16160 .iterations(1)
16161 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16162 }
16163 }
16164 }
16165
16166 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
16167 TEST_REQUIRES_ARM_NEON_FMA;
16168 for (uint32_t m = 1; m <= 4; m++) {
16169 GemmMicrokernelTester()
16170 .mr(4)
16171 .nr(8)
16172 .kr(1)
16173 .sr(1)
16174 .m(m)
16175 .n(8)
16176 .k(2)
16177 .iterations(1)
16178 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16179 }
16180 }
16181
16182 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
16183 TEST_REQUIRES_ARM_NEON_FMA;
16184 for (uint32_t n = 1; n <= 8; n++) {
16185 GemmMicrokernelTester()
16186 .mr(4)
16187 .nr(8)
16188 .kr(1)
16189 .sr(1)
16190 .m(4)
16191 .n(n)
16192 .k(2)
16193 .iterations(1)
16194 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16195 }
16196 }
16197
16198 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_lt_2) {
16199 TEST_REQUIRES_ARM_NEON_FMA;
16200 for (size_t k = 1; k < 2; k++) {
16201 GemmMicrokernelTester()
16202 .mr(4)
16203 .nr(8)
16204 .kr(1)
16205 .sr(1)
16206 .m(4)
16207 .n(8)
16208 .k(k)
16209 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16210 }
16211 }
16212
16213 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
16214 TEST_REQUIRES_ARM_NEON_FMA;
16215 for (size_t k = 1; k < 2; k++) {
16216 for (uint32_t m = 1; m <= 4; m++) {
16217 for (uint32_t n = 1; n <= 8; n++) {
16218 GemmMicrokernelTester()
16219 .mr(4)
16220 .nr(8)
16221 .kr(1)
16222 .sr(1)
16223 .m(m)
16224 .n(n)
16225 .k(k)
16226 .iterations(1)
16227 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16228 }
16229 }
16230 }
16231 }
16232
16233 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_gt_2) {
16234 TEST_REQUIRES_ARM_NEON_FMA;
16235 for (size_t k = 3; k < 4; k++) {
16236 GemmMicrokernelTester()
16237 .mr(4)
16238 .nr(8)
16239 .kr(1)
16240 .sr(1)
16241 .m(4)
16242 .n(8)
16243 .k(k)
16244 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16245 }
16246 }
16247
16248 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
16249 TEST_REQUIRES_ARM_NEON_FMA;
16250 for (size_t k = 3; k < 4; k++) {
16251 for (uint32_t m = 1; m <= 4; m++) {
16252 for (uint32_t n = 1; n <= 8; n++) {
16253 GemmMicrokernelTester()
16254 .mr(4)
16255 .nr(8)
16256 .kr(1)
16257 .sr(1)
16258 .m(m)
16259 .n(n)
16260 .k(k)
16261 .iterations(1)
16262 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16263 }
16264 }
16265 }
16266 }
16267
16268 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_div_2) {
16269 TEST_REQUIRES_ARM_NEON_FMA;
16270 for (size_t k = 4; k <= 20; k += 2) {
16271 GemmMicrokernelTester()
16272 .mr(4)
16273 .nr(8)
16274 .kr(1)
16275 .sr(1)
16276 .m(4)
16277 .n(8)
16278 .k(k)
16279 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16280 }
16281 }
16282
16283 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
16284 TEST_REQUIRES_ARM_NEON_FMA;
16285 for (size_t k = 4; k <= 20; k += 2) {
16286 for (uint32_t m = 1; m <= 4; m++) {
16287 for (uint32_t n = 1; n <= 8; n++) {
16288 GemmMicrokernelTester()
16289 .mr(4)
16290 .nr(8)
16291 .kr(1)
16292 .sr(1)
16293 .m(m)
16294 .n(n)
16295 .k(k)
16296 .iterations(1)
16297 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16298 }
16299 }
16300 }
16301 }
16302
16303 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8) {
16304 TEST_REQUIRES_ARM_NEON_FMA;
16305 for (uint32_t n = 9; n < 16; n++) {
16306 for (size_t k = 1; k <= 10; k += 3) {
16307 GemmMicrokernelTester()
16308 .mr(4)
16309 .nr(8)
16310 .kr(1)
16311 .sr(1)
16312 .m(4)
16313 .n(8)
16314 .k(k)
16315 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16316 }
16317 }
16318 }
16319
16320 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
16321 TEST_REQUIRES_ARM_NEON_FMA;
16322 for (uint32_t n = 9; n < 16; n++) {
16323 for (size_t k = 1; k <= 10; k += 3) {
16324 GemmMicrokernelTester()
16325 .mr(4)
16326 .nr(8)
16327 .kr(1)
16328 .sr(1)
16329 .m(4)
16330 .n(8)
16331 .k(k)
16332 .cn_stride(11)
16333 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16334 }
16335 }
16336 }
16337
16338 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
16339 TEST_REQUIRES_ARM_NEON_FMA;
16340 for (uint32_t n = 9; n < 16; n++) {
16341 for (size_t k = 1; k <= 10; k += 3) {
16342 for (uint32_t m = 1; m <= 4; m++) {
16343 GemmMicrokernelTester()
16344 .mr(4)
16345 .nr(8)
16346 .kr(1)
16347 .sr(1)
16348 .m(m)
16349 .n(n)
16350 .k(k)
16351 .iterations(1)
16352 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16353 }
16354 }
16355 }
16356 }
16357
16358 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8) {
16359 TEST_REQUIRES_ARM_NEON_FMA;
16360 for (uint32_t n = 16; n <= 24; n += 8) {
16361 for (size_t k = 1; k <= 10; k += 3) {
16362 GemmMicrokernelTester()
16363 .mr(4)
16364 .nr(8)
16365 .kr(1)
16366 .sr(1)
16367 .m(4)
16368 .n(8)
16369 .k(k)
16370 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16371 }
16372 }
16373 }
16374
16375 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
16376 TEST_REQUIRES_ARM_NEON_FMA;
16377 for (uint32_t n = 16; n <= 24; n += 8) {
16378 for (size_t k = 1; k <= 10; k += 3) {
16379 GemmMicrokernelTester()
16380 .mr(4)
16381 .nr(8)
16382 .kr(1)
16383 .sr(1)
16384 .m(4)
16385 .n(n)
16386 .k(k)
16387 .cn_stride(11)
16388 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16389 }
16390 }
16391 }
16392
16393 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
16394 TEST_REQUIRES_ARM_NEON_FMA;
16395 for (uint32_t n = 16; n <= 24; n += 8) {
16396 for (size_t k = 1; k <= 10; k += 3) {
16397 for (uint32_t m = 1; m <= 4; m++) {
16398 GemmMicrokernelTester()
16399 .mr(4)
16400 .nr(8)
16401 .kr(1)
16402 .sr(1)
16403 .m(m)
16404 .n(n)
16405 .k(k)
16406 .iterations(1)
16407 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16408 }
16409 }
16410 }
16411 }
16412
16413 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, small_kernel) {
16414 TEST_REQUIRES_ARM_NEON_FMA;
16415 for (size_t k = 1; k <= 10; k += 3) {
16416 GemmMicrokernelTester()
16417 .mr(4)
16418 .nr(8)
16419 .kr(1)
16420 .sr(1)
16421 .m(4)
16422 .n(8)
16423 .k(k)
16424 .ks(3)
16425 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16426 }
16427 }
16428
16429 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
16430 TEST_REQUIRES_ARM_NEON_FMA;
16431 for (size_t k = 1; k <= 10; k += 3) {
16432 for (uint32_t m = 1; m <= 4; m++) {
16433 for (uint32_t n = 1; n <= 8; n++) {
16434 GemmMicrokernelTester()
16435 .mr(4)
16436 .nr(8)
16437 .kr(1)
16438 .sr(1)
16439 .m(m)
16440 .n(n)
16441 .k(k)
16442 .ks(3)
16443 .iterations(1)
16444 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16445 }
16446 }
16447 }
16448 }
16449
16450 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
16451 TEST_REQUIRES_ARM_NEON_FMA;
16452 for (uint32_t n = 9; n < 16; n++) {
16453 for (size_t k = 1; k <= 10; k += 3) {
16454 GemmMicrokernelTester()
16455 .mr(4)
16456 .nr(8)
16457 .kr(1)
16458 .sr(1)
16459 .m(4)
16460 .n(8)
16461 .k(k)
16462 .ks(3)
16463 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16464 }
16465 }
16466 }
16467
16468 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
16469 TEST_REQUIRES_ARM_NEON_FMA;
16470 for (uint32_t n = 16; n <= 24; n += 8) {
16471 for (size_t k = 1; k <= 10; k += 3) {
16472 GemmMicrokernelTester()
16473 .mr(4)
16474 .nr(8)
16475 .kr(1)
16476 .sr(1)
16477 .m(4)
16478 .n(8)
16479 .k(k)
16480 .ks(3)
16481 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16482 }
16483 }
16484 }
16485
16486 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
16487 TEST_REQUIRES_ARM_NEON_FMA;
16488 for (size_t k = 1; k <= 10; k += 3) {
16489 for (uint32_t m = 1; m <= 4; m++) {
16490 for (uint32_t n = 1; n <= 8; n++) {
16491 GemmMicrokernelTester()
16492 .mr(4)
16493 .nr(8)
16494 .kr(1)
16495 .sr(1)
16496 .m(m)
16497 .n(n)
16498 .k(k)
16499 .cm_stride(11)
16500 .iterations(1)
16501 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16502 }
16503 }
16504 }
16505 }
16506
16507 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, a_offset) {
16508 TEST_REQUIRES_ARM_NEON_FMA;
16509 for (size_t k = 1; k <= 10; k += 3) {
16510 GemmMicrokernelTester()
16511 .mr(4)
16512 .nr(8)
16513 .kr(1)
16514 .sr(1)
16515 .m(4)
16516 .n(8)
16517 .k(k)
16518 .ks(3)
16519 .a_offset(43)
16520 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16521 }
16522 }
16523
16524 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, zero) {
16525 TEST_REQUIRES_ARM_NEON_FMA;
16526 for (uint32_t mz = 0; mz < 4; mz++) {
16527 for (size_t k = 1; k <= 10; k += 3) {
16528 GemmMicrokernelTester()
16529 .mr(4)
16530 .nr(8)
16531 .kr(1)
16532 .sr(1)
16533 .m(4)
16534 .n(8)
16535 .k(k)
16536 .ks(3)
16537 .a_offset(43)
16538 .zero_index(mz)
16539 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16540 }
16541 }
16542 }
16543
16544 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, qmin) {
16545 TEST_REQUIRES_ARM_NEON_FMA;
16546 GemmMicrokernelTester()
16547 .mr(4)
16548 .nr(8)
16549 .kr(1)
16550 .sr(1)
16551 .m(4)
16552 .n(8)
16553 .k(2)
16554 .qmin(128)
16555 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16556 }
16557
16558 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, qmax) {
16559 TEST_REQUIRES_ARM_NEON_FMA;
16560 GemmMicrokernelTester()
16561 .mr(4)
16562 .nr(8)
16563 .kr(1)
16564 .sr(1)
16565 .m(4)
16566 .n(8)
16567 .k(2)
16568 .qmax(128)
16569 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16570 }
16571
16572 TEST(F32_IGEMM_4X8__NEONFMA_LANE_LD64, strided_cm) {
16573 TEST_REQUIRES_ARM_NEON_FMA;
16574 GemmMicrokernelTester()
16575 .mr(4)
16576 .nr(8)
16577 .kr(1)
16578 .sr(1)
16579 .m(4)
16580 .n(8)
16581 .k(2)
16582 .cm_stride(11)
16583 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_lane_ld64);
16584 }
16585#endif // XNN_ARCH_ARM64
16586
16587
16588#if XNN_ARCH_ARM64
16589 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2) {
16590 TEST_REQUIRES_ARM_NEON_FMA;
16591 GemmMicrokernelTester()
16592 .mr(6)
16593 .nr(8)
16594 .kr(1)
16595 .sr(1)
16596 .m(6)
16597 .n(8)
16598 .k(2)
16599 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16600 }
16601
16602 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cn) {
16603 TEST_REQUIRES_ARM_NEON_FMA;
16604 GemmMicrokernelTester()
16605 .mr(6)
16606 .nr(8)
16607 .kr(1)
16608 .sr(1)
16609 .m(6)
16610 .n(8)
16611 .k(2)
16612 .cn_stride(11)
16613 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16614 }
16615
16616 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
16617 TEST_REQUIRES_ARM_NEON_FMA;
16618 for (uint32_t m = 1; m <= 6; m++) {
16619 for (uint32_t n = 1; n <= 8; n++) {
16620 GemmMicrokernelTester()
16621 .mr(6)
16622 .nr(8)
16623 .kr(1)
16624 .sr(1)
16625 .m(m)
16626 .n(n)
16627 .k(2)
16628 .iterations(1)
16629 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16630 }
16631 }
16632 }
16633
16634 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
16635 TEST_REQUIRES_ARM_NEON_FMA;
16636 for (uint32_t m = 1; m <= 6; m++) {
16637 GemmMicrokernelTester()
16638 .mr(6)
16639 .nr(8)
16640 .kr(1)
16641 .sr(1)
16642 .m(m)
16643 .n(8)
16644 .k(2)
16645 .iterations(1)
16646 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16647 }
16648 }
16649
16650 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
16651 TEST_REQUIRES_ARM_NEON_FMA;
16652 for (uint32_t n = 1; n <= 8; n++) {
16653 GemmMicrokernelTester()
16654 .mr(6)
16655 .nr(8)
16656 .kr(1)
16657 .sr(1)
16658 .m(6)
16659 .n(n)
16660 .k(2)
16661 .iterations(1)
16662 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16663 }
16664 }
16665
16666 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_lt_2) {
16667 TEST_REQUIRES_ARM_NEON_FMA;
16668 for (size_t k = 1; k < 2; k++) {
16669 GemmMicrokernelTester()
16670 .mr(6)
16671 .nr(8)
16672 .kr(1)
16673 .sr(1)
16674 .m(6)
16675 .n(8)
16676 .k(k)
16677 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16678 }
16679 }
16680
16681 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
16682 TEST_REQUIRES_ARM_NEON_FMA;
16683 for (size_t k = 1; k < 2; k++) {
16684 for (uint32_t m = 1; m <= 6; m++) {
16685 for (uint32_t n = 1; n <= 8; n++) {
16686 GemmMicrokernelTester()
16687 .mr(6)
16688 .nr(8)
16689 .kr(1)
16690 .sr(1)
16691 .m(m)
16692 .n(n)
16693 .k(k)
16694 .iterations(1)
16695 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16696 }
16697 }
16698 }
16699 }
16700
16701 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_gt_2) {
16702 TEST_REQUIRES_ARM_NEON_FMA;
16703 for (size_t k = 3; k < 4; k++) {
16704 GemmMicrokernelTester()
16705 .mr(6)
16706 .nr(8)
16707 .kr(1)
16708 .sr(1)
16709 .m(6)
16710 .n(8)
16711 .k(k)
16712 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16713 }
16714 }
16715
16716 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
16717 TEST_REQUIRES_ARM_NEON_FMA;
16718 for (size_t k = 3; k < 4; k++) {
16719 for (uint32_t m = 1; m <= 6; m++) {
16720 for (uint32_t n = 1; n <= 8; n++) {
16721 GemmMicrokernelTester()
16722 .mr(6)
16723 .nr(8)
16724 .kr(1)
16725 .sr(1)
16726 .m(m)
16727 .n(n)
16728 .k(k)
16729 .iterations(1)
16730 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16731 }
16732 }
16733 }
16734 }
16735
16736 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_div_2) {
16737 TEST_REQUIRES_ARM_NEON_FMA;
16738 for (size_t k = 4; k <= 20; k += 2) {
16739 GemmMicrokernelTester()
16740 .mr(6)
16741 .nr(8)
16742 .kr(1)
16743 .sr(1)
16744 .m(6)
16745 .n(8)
16746 .k(k)
16747 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16748 }
16749 }
16750
16751 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
16752 TEST_REQUIRES_ARM_NEON_FMA;
16753 for (size_t k = 4; k <= 20; k += 2) {
16754 for (uint32_t m = 1; m <= 6; m++) {
16755 for (uint32_t n = 1; n <= 8; n++) {
16756 GemmMicrokernelTester()
16757 .mr(6)
16758 .nr(8)
16759 .kr(1)
16760 .sr(1)
16761 .m(m)
16762 .n(n)
16763 .k(k)
16764 .iterations(1)
16765 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16766 }
16767 }
16768 }
16769 }
16770
16771 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8) {
16772 TEST_REQUIRES_ARM_NEON_FMA;
16773 for (uint32_t n = 9; n < 16; n++) {
16774 for (size_t k = 1; k <= 10; k += 3) {
16775 GemmMicrokernelTester()
16776 .mr(6)
16777 .nr(8)
16778 .kr(1)
16779 .sr(1)
16780 .m(6)
16781 .n(8)
16782 .k(k)
16783 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16784 }
16785 }
16786 }
16787
16788 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
16789 TEST_REQUIRES_ARM_NEON_FMA;
16790 for (uint32_t n = 9; n < 16; n++) {
16791 for (size_t k = 1; k <= 10; k += 3) {
16792 GemmMicrokernelTester()
16793 .mr(6)
16794 .nr(8)
16795 .kr(1)
16796 .sr(1)
16797 .m(6)
16798 .n(8)
16799 .k(k)
16800 .cn_stride(11)
16801 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16802 }
16803 }
16804 }
16805
16806 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
16807 TEST_REQUIRES_ARM_NEON_FMA;
16808 for (uint32_t n = 9; n < 16; n++) {
16809 for (size_t k = 1; k <= 10; k += 3) {
16810 for (uint32_t m = 1; m <= 6; m++) {
16811 GemmMicrokernelTester()
16812 .mr(6)
16813 .nr(8)
16814 .kr(1)
16815 .sr(1)
16816 .m(m)
16817 .n(n)
16818 .k(k)
16819 .iterations(1)
16820 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16821 }
16822 }
16823 }
16824 }
16825
16826 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8) {
16827 TEST_REQUIRES_ARM_NEON_FMA;
16828 for (uint32_t n = 16; n <= 24; n += 8) {
16829 for (size_t k = 1; k <= 10; k += 3) {
16830 GemmMicrokernelTester()
16831 .mr(6)
16832 .nr(8)
16833 .kr(1)
16834 .sr(1)
16835 .m(6)
16836 .n(8)
16837 .k(k)
16838 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16839 }
16840 }
16841 }
16842
16843 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
16844 TEST_REQUIRES_ARM_NEON_FMA;
16845 for (uint32_t n = 16; n <= 24; n += 8) {
16846 for (size_t k = 1; k <= 10; k += 3) {
16847 GemmMicrokernelTester()
16848 .mr(6)
16849 .nr(8)
16850 .kr(1)
16851 .sr(1)
16852 .m(6)
16853 .n(n)
16854 .k(k)
16855 .cn_stride(11)
16856 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16857 }
16858 }
16859 }
16860
16861 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
16862 TEST_REQUIRES_ARM_NEON_FMA;
16863 for (uint32_t n = 16; n <= 24; n += 8) {
16864 for (size_t k = 1; k <= 10; k += 3) {
16865 for (uint32_t m = 1; m <= 6; m++) {
16866 GemmMicrokernelTester()
16867 .mr(6)
16868 .nr(8)
16869 .kr(1)
16870 .sr(1)
16871 .m(m)
16872 .n(n)
16873 .k(k)
16874 .iterations(1)
16875 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16876 }
16877 }
16878 }
16879 }
16880
16881 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, small_kernel) {
16882 TEST_REQUIRES_ARM_NEON_FMA;
16883 for (size_t k = 1; k <= 10; k += 3) {
16884 GemmMicrokernelTester()
16885 .mr(6)
16886 .nr(8)
16887 .kr(1)
16888 .sr(1)
16889 .m(6)
16890 .n(8)
16891 .k(k)
16892 .ks(3)
16893 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16894 }
16895 }
16896
16897 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, small_kernel_subtile) {
16898 TEST_REQUIRES_ARM_NEON_FMA;
16899 for (size_t k = 1; k <= 10; k += 3) {
16900 for (uint32_t m = 1; m <= 6; m++) {
16901 for (uint32_t n = 1; n <= 8; n++) {
16902 GemmMicrokernelTester()
16903 .mr(6)
16904 .nr(8)
16905 .kr(1)
16906 .sr(1)
16907 .m(m)
16908 .n(n)
16909 .k(k)
16910 .ks(3)
16911 .iterations(1)
16912 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16913 }
16914 }
16915 }
16916 }
16917
16918 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_small_kernel) {
16919 TEST_REQUIRES_ARM_NEON_FMA;
16920 for (uint32_t n = 9; n < 16; n++) {
16921 for (size_t k = 1; k <= 10; k += 3) {
16922 GemmMicrokernelTester()
16923 .mr(6)
16924 .nr(8)
16925 .kr(1)
16926 .sr(1)
16927 .m(6)
16928 .n(8)
16929 .k(k)
16930 .ks(3)
16931 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16932 }
16933 }
16934 }
16935
16936 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, n_div_8_small_kernel) {
16937 TEST_REQUIRES_ARM_NEON_FMA;
16938 for (uint32_t n = 16; n <= 24; n += 8) {
16939 for (size_t k = 1; k <= 10; k += 3) {
16940 GemmMicrokernelTester()
16941 .mr(6)
16942 .nr(8)
16943 .kr(1)
16944 .sr(1)
16945 .m(6)
16946 .n(8)
16947 .k(k)
16948 .ks(3)
16949 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16950 }
16951 }
16952 }
16953
16954 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
16955 TEST_REQUIRES_ARM_NEON_FMA;
16956 for (size_t k = 1; k <= 10; k += 3) {
16957 for (uint32_t m = 1; m <= 6; m++) {
16958 for (uint32_t n = 1; n <= 8; n++) {
16959 GemmMicrokernelTester()
16960 .mr(6)
16961 .nr(8)
16962 .kr(1)
16963 .sr(1)
16964 .m(m)
16965 .n(n)
16966 .k(k)
16967 .cm_stride(11)
16968 .iterations(1)
16969 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16970 }
16971 }
16972 }
16973 }
16974
16975 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, a_offset) {
16976 TEST_REQUIRES_ARM_NEON_FMA;
16977 for (size_t k = 1; k <= 10; k += 3) {
16978 GemmMicrokernelTester()
16979 .mr(6)
16980 .nr(8)
16981 .kr(1)
16982 .sr(1)
16983 .m(6)
16984 .n(8)
16985 .k(k)
16986 .ks(3)
16987 .a_offset(67)
16988 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
16989 }
16990 }
16991
16992 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, zero) {
16993 TEST_REQUIRES_ARM_NEON_FMA;
16994 for (uint32_t mz = 0; mz < 6; mz++) {
16995 for (size_t k = 1; k <= 10; k += 3) {
16996 GemmMicrokernelTester()
16997 .mr(6)
16998 .nr(8)
16999 .kr(1)
17000 .sr(1)
17001 .m(6)
17002 .n(8)
17003 .k(k)
17004 .ks(3)
17005 .a_offset(67)
17006 .zero_index(mz)
17007 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
17008 }
17009 }
17010 }
17011
17012 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, qmin) {
17013 TEST_REQUIRES_ARM_NEON_FMA;
17014 GemmMicrokernelTester()
17015 .mr(6)
17016 .nr(8)
17017 .kr(1)
17018 .sr(1)
17019 .m(6)
17020 .n(8)
17021 .k(2)
17022 .qmin(128)
17023 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
17024 }
17025
17026 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, qmax) {
17027 TEST_REQUIRES_ARM_NEON_FMA;
17028 GemmMicrokernelTester()
17029 .mr(6)
17030 .nr(8)
17031 .kr(1)
17032 .sr(1)
17033 .m(6)
17034 .n(8)
17035 .k(2)
17036 .qmax(128)
17037 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
17038 }
17039
17040 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD64, strided_cm) {
17041 TEST_REQUIRES_ARM_NEON_FMA;
17042 GemmMicrokernelTester()
17043 .mr(6)
17044 .nr(8)
17045 .kr(1)
17046 .sr(1)
17047 .m(6)
17048 .n(8)
17049 .k(2)
17050 .cm_stride(11)
17051 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld64);
17052 }
17053#endif // XNN_ARCH_ARM64
17054
17055
17056#if XNN_ARCH_ARM64
17057 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4) {
17058 TEST_REQUIRES_ARM_NEON_FMA;
17059 GemmMicrokernelTester()
17060 .mr(6)
17061 .nr(8)
17062 .kr(1)
17063 .sr(1)
17064 .m(6)
17065 .n(8)
17066 .k(4)
17067 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17068 }
17069
17070 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, strided_cn) {
17071 TEST_REQUIRES_ARM_NEON_FMA;
17072 GemmMicrokernelTester()
17073 .mr(6)
17074 .nr(8)
17075 .kr(1)
17076 .sr(1)
17077 .m(6)
17078 .n(8)
17079 .k(4)
17080 .cn_stride(11)
17081 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17082 }
17083
17084 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
17085 TEST_REQUIRES_ARM_NEON_FMA;
17086 for (uint32_t m = 1; m <= 6; m++) {
17087 for (uint32_t n = 1; n <= 8; n++) {
17088 GemmMicrokernelTester()
17089 .mr(6)
17090 .nr(8)
17091 .kr(1)
17092 .sr(1)
17093 .m(m)
17094 .n(n)
17095 .k(4)
17096 .iterations(1)
17097 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17098 }
17099 }
17100 }
17101
17102 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
17103 TEST_REQUIRES_ARM_NEON_FMA;
17104 for (uint32_t m = 1; m <= 6; m++) {
17105 GemmMicrokernelTester()
17106 .mr(6)
17107 .nr(8)
17108 .kr(1)
17109 .sr(1)
17110 .m(m)
17111 .n(8)
17112 .k(4)
17113 .iterations(1)
17114 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17115 }
17116 }
17117
17118 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
17119 TEST_REQUIRES_ARM_NEON_FMA;
17120 for (uint32_t n = 1; n <= 8; n++) {
17121 GemmMicrokernelTester()
17122 .mr(6)
17123 .nr(8)
17124 .kr(1)
17125 .sr(1)
17126 .m(6)
17127 .n(n)
17128 .k(4)
17129 .iterations(1)
17130 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17131 }
17132 }
17133
17134 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_lt_4) {
17135 TEST_REQUIRES_ARM_NEON_FMA;
17136 for (size_t k = 1; k < 4; k++) {
17137 GemmMicrokernelTester()
17138 .mr(6)
17139 .nr(8)
17140 .kr(1)
17141 .sr(1)
17142 .m(6)
17143 .n(8)
17144 .k(k)
17145 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17146 }
17147 }
17148
17149 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
17150 TEST_REQUIRES_ARM_NEON_FMA;
17151 for (size_t k = 1; k < 4; k++) {
17152 for (uint32_t m = 1; m <= 6; m++) {
17153 for (uint32_t n = 1; n <= 8; n++) {
17154 GemmMicrokernelTester()
17155 .mr(6)
17156 .nr(8)
17157 .kr(1)
17158 .sr(1)
17159 .m(m)
17160 .n(n)
17161 .k(k)
17162 .iterations(1)
17163 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17164 }
17165 }
17166 }
17167 }
17168
17169 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_gt_4) {
17170 TEST_REQUIRES_ARM_NEON_FMA;
17171 for (size_t k = 5; k < 8; k++) {
17172 GemmMicrokernelTester()
17173 .mr(6)
17174 .nr(8)
17175 .kr(1)
17176 .sr(1)
17177 .m(6)
17178 .n(8)
17179 .k(k)
17180 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17181 }
17182 }
17183
17184 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
17185 TEST_REQUIRES_ARM_NEON_FMA;
17186 for (size_t k = 5; k < 8; k++) {
17187 for (uint32_t m = 1; m <= 6; m++) {
17188 for (uint32_t n = 1; n <= 8; n++) {
17189 GemmMicrokernelTester()
17190 .mr(6)
17191 .nr(8)
17192 .kr(1)
17193 .sr(1)
17194 .m(m)
17195 .n(n)
17196 .k(k)
17197 .iterations(1)
17198 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17199 }
17200 }
17201 }
17202 }
17203
17204 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_div_4) {
17205 TEST_REQUIRES_ARM_NEON_FMA;
17206 for (size_t k = 8; k <= 40; k += 4) {
17207 GemmMicrokernelTester()
17208 .mr(6)
17209 .nr(8)
17210 .kr(1)
17211 .sr(1)
17212 .m(6)
17213 .n(8)
17214 .k(k)
17215 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17216 }
17217 }
17218
17219 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
17220 TEST_REQUIRES_ARM_NEON_FMA;
17221 for (size_t k = 8; k <= 40; k += 4) {
17222 for (uint32_t m = 1; m <= 6; m++) {
17223 for (uint32_t n = 1; n <= 8; n++) {
17224 GemmMicrokernelTester()
17225 .mr(6)
17226 .nr(8)
17227 .kr(1)
17228 .sr(1)
17229 .m(m)
17230 .n(n)
17231 .k(k)
17232 .iterations(1)
17233 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17234 }
17235 }
17236 }
17237 }
17238
17239 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8) {
17240 TEST_REQUIRES_ARM_NEON_FMA;
17241 for (uint32_t n = 9; n < 16; n++) {
17242 for (size_t k = 1; k <= 20; k += 5) {
17243 GemmMicrokernelTester()
17244 .mr(6)
17245 .nr(8)
17246 .kr(1)
17247 .sr(1)
17248 .m(6)
17249 .n(8)
17250 .k(k)
17251 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17252 }
17253 }
17254 }
17255
17256 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
17257 TEST_REQUIRES_ARM_NEON_FMA;
17258 for (uint32_t n = 9; n < 16; n++) {
17259 for (size_t k = 1; k <= 20; k += 5) {
17260 GemmMicrokernelTester()
17261 .mr(6)
17262 .nr(8)
17263 .kr(1)
17264 .sr(1)
17265 .m(6)
17266 .n(8)
17267 .k(k)
17268 .cn_stride(11)
17269 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17270 }
17271 }
17272 }
17273
17274 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
17275 TEST_REQUIRES_ARM_NEON_FMA;
17276 for (uint32_t n = 9; n < 16; n++) {
17277 for (size_t k = 1; k <= 20; k += 5) {
17278 for (uint32_t m = 1; m <= 6; m++) {
17279 GemmMicrokernelTester()
17280 .mr(6)
17281 .nr(8)
17282 .kr(1)
17283 .sr(1)
17284 .m(m)
17285 .n(n)
17286 .k(k)
17287 .iterations(1)
17288 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17289 }
17290 }
17291 }
17292 }
17293
17294 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8) {
17295 TEST_REQUIRES_ARM_NEON_FMA;
17296 for (uint32_t n = 16; n <= 24; n += 8) {
17297 for (size_t k = 1; k <= 20; k += 5) {
17298 GemmMicrokernelTester()
17299 .mr(6)
17300 .nr(8)
17301 .kr(1)
17302 .sr(1)
17303 .m(6)
17304 .n(8)
17305 .k(k)
17306 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17307 }
17308 }
17309 }
17310
17311 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
17312 TEST_REQUIRES_ARM_NEON_FMA;
17313 for (uint32_t n = 16; n <= 24; n += 8) {
17314 for (size_t k = 1; k <= 20; k += 5) {
17315 GemmMicrokernelTester()
17316 .mr(6)
17317 .nr(8)
17318 .kr(1)
17319 .sr(1)
17320 .m(6)
17321 .n(n)
17322 .k(k)
17323 .cn_stride(11)
17324 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17325 }
17326 }
17327 }
17328
17329 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
17330 TEST_REQUIRES_ARM_NEON_FMA;
17331 for (uint32_t n = 16; n <= 24; n += 8) {
17332 for (size_t k = 1; k <= 20; k += 5) {
17333 for (uint32_t m = 1; m <= 6; m++) {
17334 GemmMicrokernelTester()
17335 .mr(6)
17336 .nr(8)
17337 .kr(1)
17338 .sr(1)
17339 .m(m)
17340 .n(n)
17341 .k(k)
17342 .iterations(1)
17343 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17344 }
17345 }
17346 }
17347 }
17348
17349 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, small_kernel) {
17350 TEST_REQUIRES_ARM_NEON_FMA;
17351 for (size_t k = 1; k <= 20; k += 5) {
17352 GemmMicrokernelTester()
17353 .mr(6)
17354 .nr(8)
17355 .kr(1)
17356 .sr(1)
17357 .m(6)
17358 .n(8)
17359 .k(k)
17360 .ks(3)
17361 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17362 }
17363 }
17364
17365 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, small_kernel_subtile) {
17366 TEST_REQUIRES_ARM_NEON_FMA;
17367 for (size_t k = 1; k <= 20; k += 5) {
17368 for (uint32_t m = 1; m <= 6; m++) {
17369 for (uint32_t n = 1; n <= 8; n++) {
17370 GemmMicrokernelTester()
17371 .mr(6)
17372 .nr(8)
17373 .kr(1)
17374 .sr(1)
17375 .m(m)
17376 .n(n)
17377 .k(k)
17378 .ks(3)
17379 .iterations(1)
17380 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17381 }
17382 }
17383 }
17384 }
17385
17386 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_small_kernel) {
17387 TEST_REQUIRES_ARM_NEON_FMA;
17388 for (uint32_t n = 9; n < 16; n++) {
17389 for (size_t k = 1; k <= 20; k += 5) {
17390 GemmMicrokernelTester()
17391 .mr(6)
17392 .nr(8)
17393 .kr(1)
17394 .sr(1)
17395 .m(6)
17396 .n(8)
17397 .k(k)
17398 .ks(3)
17399 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17400 }
17401 }
17402 }
17403
17404 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, n_div_8_small_kernel) {
17405 TEST_REQUIRES_ARM_NEON_FMA;
17406 for (uint32_t n = 16; n <= 24; n += 8) {
17407 for (size_t k = 1; k <= 20; k += 5) {
17408 GemmMicrokernelTester()
17409 .mr(6)
17410 .nr(8)
17411 .kr(1)
17412 .sr(1)
17413 .m(6)
17414 .n(8)
17415 .k(k)
17416 .ks(3)
17417 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17418 }
17419 }
17420 }
17421
17422 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
17423 TEST_REQUIRES_ARM_NEON_FMA;
17424 for (size_t k = 1; k <= 20; k += 5) {
17425 for (uint32_t m = 1; m <= 6; m++) {
17426 for (uint32_t n = 1; n <= 8; n++) {
17427 GemmMicrokernelTester()
17428 .mr(6)
17429 .nr(8)
17430 .kr(1)
17431 .sr(1)
17432 .m(m)
17433 .n(n)
17434 .k(k)
17435 .cm_stride(11)
17436 .iterations(1)
17437 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17438 }
17439 }
17440 }
17441 }
17442
17443 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, a_offset) {
17444 TEST_REQUIRES_ARM_NEON_FMA;
17445 for (size_t k = 1; k <= 20; k += 5) {
17446 GemmMicrokernelTester()
17447 .mr(6)
17448 .nr(8)
17449 .kr(1)
17450 .sr(1)
17451 .m(6)
17452 .n(8)
17453 .k(k)
17454 .ks(3)
17455 .a_offset(127)
17456 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17457 }
17458 }
17459
17460 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, zero) {
17461 TEST_REQUIRES_ARM_NEON_FMA;
17462 for (uint32_t mz = 0; mz < 6; mz++) {
17463 for (size_t k = 1; k <= 20; k += 5) {
17464 GemmMicrokernelTester()
17465 .mr(6)
17466 .nr(8)
17467 .kr(1)
17468 .sr(1)
17469 .m(6)
17470 .n(8)
17471 .k(k)
17472 .ks(3)
17473 .a_offset(127)
17474 .zero_index(mz)
17475 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17476 }
17477 }
17478 }
17479
17480 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, qmin) {
17481 TEST_REQUIRES_ARM_NEON_FMA;
17482 GemmMicrokernelTester()
17483 .mr(6)
17484 .nr(8)
17485 .kr(1)
17486 .sr(1)
17487 .m(6)
17488 .n(8)
17489 .k(4)
17490 .qmin(128)
17491 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17492 }
17493
17494 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, qmax) {
17495 TEST_REQUIRES_ARM_NEON_FMA;
17496 GemmMicrokernelTester()
17497 .mr(6)
17498 .nr(8)
17499 .kr(1)
17500 .sr(1)
17501 .m(6)
17502 .n(8)
17503 .k(4)
17504 .qmax(128)
17505 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17506 }
17507
17508 TEST(F32_IGEMM_6X8__NEONFMA_LANE_LD128, strided_cm) {
17509 TEST_REQUIRES_ARM_NEON_FMA;
17510 GemmMicrokernelTester()
17511 .mr(6)
17512 .nr(8)
17513 .kr(1)
17514 .sr(1)
17515 .m(6)
17516 .n(8)
17517 .k(4)
17518 .cm_stride(11)
17519 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_lane_ld128);
17520 }
17521#endif // XNN_ARCH_ARM64
17522
17523
17524#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17525 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2) {
17526 TEST_REQUIRES_ARM_NEON;
17527 GemmMicrokernelTester()
17528 .mr(1)
17529 .nr(8)
17530 .kr(1)
17531 .sr(1)
17532 .m(1)
17533 .n(8)
17534 .k(2)
17535 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17536 }
17537
17538 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, strided_cn) {
17539 TEST_REQUIRES_ARM_NEON;
17540 GemmMicrokernelTester()
17541 .mr(1)
17542 .nr(8)
17543 .kr(1)
17544 .sr(1)
17545 .m(1)
17546 .n(8)
17547 .k(2)
17548 .cn_stride(11)
17549 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17550 }
17551
17552 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
17553 TEST_REQUIRES_ARM_NEON;
17554 for (uint32_t m = 1; m <= 1; m++) {
17555 for (uint32_t n = 1; n <= 8; n++) {
17556 GemmMicrokernelTester()
17557 .mr(1)
17558 .nr(8)
17559 .kr(1)
17560 .sr(1)
17561 .m(m)
17562 .n(n)
17563 .k(2)
17564 .iterations(1)
17565 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17566 }
17567 }
17568 }
17569
17570 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
17571 TEST_REQUIRES_ARM_NEON;
17572 for (uint32_t m = 1; m <= 1; m++) {
17573 GemmMicrokernelTester()
17574 .mr(1)
17575 .nr(8)
17576 .kr(1)
17577 .sr(1)
17578 .m(m)
17579 .n(8)
17580 .k(2)
17581 .iterations(1)
17582 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17583 }
17584 }
17585
17586 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
17587 TEST_REQUIRES_ARM_NEON;
17588 for (uint32_t n = 1; n <= 8; n++) {
17589 GemmMicrokernelTester()
17590 .mr(1)
17591 .nr(8)
17592 .kr(1)
17593 .sr(1)
17594 .m(1)
17595 .n(n)
17596 .k(2)
17597 .iterations(1)
17598 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17599 }
17600 }
17601
17602 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_lt_2) {
17603 TEST_REQUIRES_ARM_NEON;
17604 for (size_t k = 1; k < 2; k++) {
17605 GemmMicrokernelTester()
17606 .mr(1)
17607 .nr(8)
17608 .kr(1)
17609 .sr(1)
17610 .m(1)
17611 .n(8)
17612 .k(k)
17613 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17614 }
17615 }
17616
17617 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
17618 TEST_REQUIRES_ARM_NEON;
17619 for (size_t k = 1; k < 2; k++) {
17620 for (uint32_t m = 1; m <= 1; m++) {
17621 for (uint32_t n = 1; n <= 8; n++) {
17622 GemmMicrokernelTester()
17623 .mr(1)
17624 .nr(8)
17625 .kr(1)
17626 .sr(1)
17627 .m(m)
17628 .n(n)
17629 .k(k)
17630 .iterations(1)
17631 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17632 }
17633 }
17634 }
17635 }
17636
17637 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_gt_2) {
17638 TEST_REQUIRES_ARM_NEON;
17639 for (size_t k = 3; k < 4; k++) {
17640 GemmMicrokernelTester()
17641 .mr(1)
17642 .nr(8)
17643 .kr(1)
17644 .sr(1)
17645 .m(1)
17646 .n(8)
17647 .k(k)
17648 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17649 }
17650 }
17651
17652 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
17653 TEST_REQUIRES_ARM_NEON;
17654 for (size_t k = 3; k < 4; k++) {
17655 for (uint32_t m = 1; m <= 1; m++) {
17656 for (uint32_t n = 1; n <= 8; n++) {
17657 GemmMicrokernelTester()
17658 .mr(1)
17659 .nr(8)
17660 .kr(1)
17661 .sr(1)
17662 .m(m)
17663 .n(n)
17664 .k(k)
17665 .iterations(1)
17666 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17667 }
17668 }
17669 }
17670 }
17671
17672 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_div_2) {
17673 TEST_REQUIRES_ARM_NEON;
17674 for (size_t k = 4; k <= 20; k += 2) {
17675 GemmMicrokernelTester()
17676 .mr(1)
17677 .nr(8)
17678 .kr(1)
17679 .sr(1)
17680 .m(1)
17681 .n(8)
17682 .k(k)
17683 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17684 }
17685 }
17686
17687 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, k_div_2_subtile) {
17688 TEST_REQUIRES_ARM_NEON;
17689 for (size_t k = 4; k <= 20; k += 2) {
17690 for (uint32_t m = 1; m <= 1; m++) {
17691 for (uint32_t n = 1; n <= 8; n++) {
17692 GemmMicrokernelTester()
17693 .mr(1)
17694 .nr(8)
17695 .kr(1)
17696 .sr(1)
17697 .m(m)
17698 .n(n)
17699 .k(k)
17700 .iterations(1)
17701 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17702 }
17703 }
17704 }
17705 }
17706
17707 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8) {
17708 TEST_REQUIRES_ARM_NEON;
17709 for (uint32_t n = 9; n < 16; n++) {
17710 for (size_t k = 1; k <= 10; k += 3) {
17711 GemmMicrokernelTester()
17712 .mr(1)
17713 .nr(8)
17714 .kr(1)
17715 .sr(1)
17716 .m(1)
17717 .n(8)
17718 .k(k)
17719 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17720 }
17721 }
17722 }
17723
17724 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
17725 TEST_REQUIRES_ARM_NEON;
17726 for (uint32_t n = 9; n < 16; n++) {
17727 for (size_t k = 1; k <= 10; k += 3) {
17728 GemmMicrokernelTester()
17729 .mr(1)
17730 .nr(8)
17731 .kr(1)
17732 .sr(1)
17733 .m(1)
17734 .n(8)
17735 .k(k)
17736 .cn_stride(11)
17737 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17738 }
17739 }
17740 }
17741
17742 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
17743 TEST_REQUIRES_ARM_NEON;
17744 for (uint32_t n = 9; n < 16; n++) {
17745 for (size_t k = 1; k <= 10; k += 3) {
17746 for (uint32_t m = 1; m <= 1; m++) {
17747 GemmMicrokernelTester()
17748 .mr(1)
17749 .nr(8)
17750 .kr(1)
17751 .sr(1)
17752 .m(m)
17753 .n(n)
17754 .k(k)
17755 .iterations(1)
17756 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17757 }
17758 }
17759 }
17760 }
17761
17762 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8) {
17763 TEST_REQUIRES_ARM_NEON;
17764 for (uint32_t n = 16; n <= 24; n += 8) {
17765 for (size_t k = 1; k <= 10; k += 3) {
17766 GemmMicrokernelTester()
17767 .mr(1)
17768 .nr(8)
17769 .kr(1)
17770 .sr(1)
17771 .m(1)
17772 .n(8)
17773 .k(k)
17774 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17775 }
17776 }
17777 }
17778
17779 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
17780 TEST_REQUIRES_ARM_NEON;
17781 for (uint32_t n = 16; n <= 24; n += 8) {
17782 for (size_t k = 1; k <= 10; k += 3) {
17783 GemmMicrokernelTester()
17784 .mr(1)
17785 .nr(8)
17786 .kr(1)
17787 .sr(1)
17788 .m(1)
17789 .n(n)
17790 .k(k)
17791 .cn_stride(11)
17792 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17793 }
17794 }
17795 }
17796
17797 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8_subtile) {
17798 TEST_REQUIRES_ARM_NEON;
17799 for (uint32_t n = 16; n <= 24; n += 8) {
17800 for (size_t k = 1; k <= 10; k += 3) {
17801 for (uint32_t m = 1; m <= 1; m++) {
17802 GemmMicrokernelTester()
17803 .mr(1)
17804 .nr(8)
17805 .kr(1)
17806 .sr(1)
17807 .m(m)
17808 .n(n)
17809 .k(k)
17810 .iterations(1)
17811 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17812 }
17813 }
17814 }
17815 }
17816
17817 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, small_kernel) {
17818 TEST_REQUIRES_ARM_NEON;
17819 for (size_t k = 1; k <= 10; k += 3) {
17820 GemmMicrokernelTester()
17821 .mr(1)
17822 .nr(8)
17823 .kr(1)
17824 .sr(1)
17825 .m(1)
17826 .n(8)
17827 .k(k)
17828 .ks(3)
17829 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17830 }
17831 }
17832
17833 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, small_kernel_subtile) {
17834 TEST_REQUIRES_ARM_NEON;
17835 for (size_t k = 1; k <= 10; k += 3) {
17836 for (uint32_t m = 1; m <= 1; m++) {
17837 for (uint32_t n = 1; n <= 8; n++) {
17838 GemmMicrokernelTester()
17839 .mr(1)
17840 .nr(8)
17841 .kr(1)
17842 .sr(1)
17843 .m(m)
17844 .n(n)
17845 .k(k)
17846 .ks(3)
17847 .iterations(1)
17848 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17849 }
17850 }
17851 }
17852 }
17853
17854 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_gt_8_small_kernel) {
17855 TEST_REQUIRES_ARM_NEON;
17856 for (uint32_t n = 9; n < 16; n++) {
17857 for (size_t k = 1; k <= 10; k += 3) {
17858 GemmMicrokernelTester()
17859 .mr(1)
17860 .nr(8)
17861 .kr(1)
17862 .sr(1)
17863 .m(1)
17864 .n(8)
17865 .k(k)
17866 .ks(3)
17867 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17868 }
17869 }
17870 }
17871
17872 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, n_div_8_small_kernel) {
17873 TEST_REQUIRES_ARM_NEON;
17874 for (uint32_t n = 16; n <= 24; n += 8) {
17875 for (size_t k = 1; k <= 10; k += 3) {
17876 GemmMicrokernelTester()
17877 .mr(1)
17878 .nr(8)
17879 .kr(1)
17880 .sr(1)
17881 .m(1)
17882 .n(8)
17883 .k(k)
17884 .ks(3)
17885 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17886 }
17887 }
17888 }
17889
17890 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, strided_cm_subtile) {
17891 TEST_REQUIRES_ARM_NEON;
17892 for (size_t k = 1; k <= 10; k += 3) {
17893 for (uint32_t m = 1; m <= 1; m++) {
17894 for (uint32_t n = 1; n <= 8; n++) {
17895 GemmMicrokernelTester()
17896 .mr(1)
17897 .nr(8)
17898 .kr(1)
17899 .sr(1)
17900 .m(m)
17901 .n(n)
17902 .k(k)
17903 .cm_stride(11)
17904 .iterations(1)
17905 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17906 }
17907 }
17908 }
17909 }
17910
17911 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, a_offset) {
17912 TEST_REQUIRES_ARM_NEON;
17913 for (size_t k = 1; k <= 10; k += 3) {
17914 GemmMicrokernelTester()
17915 .mr(1)
17916 .nr(8)
17917 .kr(1)
17918 .sr(1)
17919 .m(1)
17920 .n(8)
17921 .k(k)
17922 .ks(3)
17923 .a_offset(13)
17924 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17925 }
17926 }
17927
17928 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, zero) {
17929 TEST_REQUIRES_ARM_NEON;
17930 for (uint32_t mz = 0; mz < 1; mz++) {
17931 for (size_t k = 1; k <= 10; k += 3) {
17932 GemmMicrokernelTester()
17933 .mr(1)
17934 .nr(8)
17935 .kr(1)
17936 .sr(1)
17937 .m(1)
17938 .n(8)
17939 .k(k)
17940 .ks(3)
17941 .a_offset(13)
17942 .zero_index(mz)
17943 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17944 }
17945 }
17946 }
17947
17948 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, qmin) {
17949 TEST_REQUIRES_ARM_NEON;
17950 GemmMicrokernelTester()
17951 .mr(1)
17952 .nr(8)
17953 .kr(1)
17954 .sr(1)
17955 .m(1)
17956 .n(8)
17957 .k(2)
17958 .qmin(128)
17959 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17960 }
17961
17962 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, qmax) {
17963 TEST_REQUIRES_ARM_NEON;
17964 GemmMicrokernelTester()
17965 .mr(1)
17966 .nr(8)
17967 .kr(1)
17968 .sr(1)
17969 .m(1)
17970 .n(8)
17971 .k(2)
17972 .qmax(128)
17973 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17974 }
17975
17976 TEST(F32_IGEMM_1X8__NEON_DUP_LD64, strided_cm) {
17977 TEST_REQUIRES_ARM_NEON;
17978 GemmMicrokernelTester()
17979 .mr(1)
17980 .nr(8)
17981 .kr(1)
17982 .sr(1)
17983 .m(1)
17984 .n(8)
17985 .k(2)
17986 .cm_stride(11)
17987 .Test(xnn_f32_igemm_ukernel_1x8__neon_dup_ld64);
17988 }
17989#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17990
17991
17992#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17993 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4) {
17994 TEST_REQUIRES_ARM_NEON;
17995 GemmMicrokernelTester()
17996 .mr(4)
17997 .nr(8)
17998 .kr(1)
17999 .sr(1)
18000 .m(4)
18001 .n(8)
18002 .k(4)
18003 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18004 }
18005
18006 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, strided_cn) {
18007 TEST_REQUIRES_ARM_NEON;
18008 GemmMicrokernelTester()
18009 .mr(4)
18010 .nr(8)
18011 .kr(1)
18012 .sr(1)
18013 .m(4)
18014 .n(8)
18015 .k(4)
18016 .cn_stride(11)
18017 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18018 }
18019
18020 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
18021 TEST_REQUIRES_ARM_NEON;
18022 for (uint32_t m = 1; m <= 4; m++) {
18023 for (uint32_t n = 1; n <= 8; n++) {
18024 GemmMicrokernelTester()
18025 .mr(4)
18026 .nr(8)
18027 .kr(1)
18028 .sr(1)
18029 .m(m)
18030 .n(n)
18031 .k(4)
18032 .iterations(1)
18033 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18034 }
18035 }
18036 }
18037
18038 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
18039 TEST_REQUIRES_ARM_NEON;
18040 for (uint32_t m = 1; m <= 4; m++) {
18041 GemmMicrokernelTester()
18042 .mr(4)
18043 .nr(8)
18044 .kr(1)
18045 .sr(1)
18046 .m(m)
18047 .n(8)
18048 .k(4)
18049 .iterations(1)
18050 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18051 }
18052 }
18053
18054 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
18055 TEST_REQUIRES_ARM_NEON;
18056 for (uint32_t n = 1; n <= 8; n++) {
18057 GemmMicrokernelTester()
18058 .mr(4)
18059 .nr(8)
18060 .kr(1)
18061 .sr(1)
18062 .m(4)
18063 .n(n)
18064 .k(4)
18065 .iterations(1)
18066 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18067 }
18068 }
18069
18070 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_lt_4) {
18071 TEST_REQUIRES_ARM_NEON;
18072 for (size_t k = 1; k < 4; k++) {
18073 GemmMicrokernelTester()
18074 .mr(4)
18075 .nr(8)
18076 .kr(1)
18077 .sr(1)
18078 .m(4)
18079 .n(8)
18080 .k(k)
18081 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18082 }
18083 }
18084
18085 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
18086 TEST_REQUIRES_ARM_NEON;
18087 for (size_t k = 1; k < 4; k++) {
18088 for (uint32_t m = 1; m <= 4; m++) {
18089 for (uint32_t n = 1; n <= 8; n++) {
18090 GemmMicrokernelTester()
18091 .mr(4)
18092 .nr(8)
18093 .kr(1)
18094 .sr(1)
18095 .m(m)
18096 .n(n)
18097 .k(k)
18098 .iterations(1)
18099 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18100 }
18101 }
18102 }
18103 }
18104
18105 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_gt_4) {
18106 TEST_REQUIRES_ARM_NEON;
18107 for (size_t k = 5; k < 8; k++) {
18108 GemmMicrokernelTester()
18109 .mr(4)
18110 .nr(8)
18111 .kr(1)
18112 .sr(1)
18113 .m(4)
18114 .n(8)
18115 .k(k)
18116 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18117 }
18118 }
18119
18120 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
18121 TEST_REQUIRES_ARM_NEON;
18122 for (size_t k = 5; k < 8; k++) {
18123 for (uint32_t m = 1; m <= 4; m++) {
18124 for (uint32_t n = 1; n <= 8; n++) {
18125 GemmMicrokernelTester()
18126 .mr(4)
18127 .nr(8)
18128 .kr(1)
18129 .sr(1)
18130 .m(m)
18131 .n(n)
18132 .k(k)
18133 .iterations(1)
18134 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18135 }
18136 }
18137 }
18138 }
18139
18140 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_div_4) {
18141 TEST_REQUIRES_ARM_NEON;
18142 for (size_t k = 8; k <= 40; k += 4) {
18143 GemmMicrokernelTester()
18144 .mr(4)
18145 .nr(8)
18146 .kr(1)
18147 .sr(1)
18148 .m(4)
18149 .n(8)
18150 .k(k)
18151 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18152 }
18153 }
18154
18155 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, k_div_4_subtile) {
18156 TEST_REQUIRES_ARM_NEON;
18157 for (size_t k = 8; k <= 40; k += 4) {
18158 for (uint32_t m = 1; m <= 4; m++) {
18159 for (uint32_t n = 1; n <= 8; n++) {
18160 GemmMicrokernelTester()
18161 .mr(4)
18162 .nr(8)
18163 .kr(1)
18164 .sr(1)
18165 .m(m)
18166 .n(n)
18167 .k(k)
18168 .iterations(1)
18169 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18170 }
18171 }
18172 }
18173 }
18174
18175 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8) {
18176 TEST_REQUIRES_ARM_NEON;
18177 for (uint32_t n = 9; n < 16; n++) {
18178 for (size_t k = 1; k <= 20; k += 5) {
18179 GemmMicrokernelTester()
18180 .mr(4)
18181 .nr(8)
18182 .kr(1)
18183 .sr(1)
18184 .m(4)
18185 .n(8)
18186 .k(k)
18187 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18188 }
18189 }
18190 }
18191
18192 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
18193 TEST_REQUIRES_ARM_NEON;
18194 for (uint32_t n = 9; n < 16; n++) {
18195 for (size_t k = 1; k <= 20; k += 5) {
18196 GemmMicrokernelTester()
18197 .mr(4)
18198 .nr(8)
18199 .kr(1)
18200 .sr(1)
18201 .m(4)
18202 .n(8)
18203 .k(k)
18204 .cn_stride(11)
18205 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18206 }
18207 }
18208 }
18209
18210 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
18211 TEST_REQUIRES_ARM_NEON;
18212 for (uint32_t n = 9; n < 16; n++) {
18213 for (size_t k = 1; k <= 20; k += 5) {
18214 for (uint32_t m = 1; m <= 4; m++) {
18215 GemmMicrokernelTester()
18216 .mr(4)
18217 .nr(8)
18218 .kr(1)
18219 .sr(1)
18220 .m(m)
18221 .n(n)
18222 .k(k)
18223 .iterations(1)
18224 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18225 }
18226 }
18227 }
18228 }
18229
18230 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8) {
18231 TEST_REQUIRES_ARM_NEON;
18232 for (uint32_t n = 16; n <= 24; n += 8) {
18233 for (size_t k = 1; k <= 20; k += 5) {
18234 GemmMicrokernelTester()
18235 .mr(4)
18236 .nr(8)
18237 .kr(1)
18238 .sr(1)
18239 .m(4)
18240 .n(8)
18241 .k(k)
18242 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18243 }
18244 }
18245 }
18246
18247 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
18248 TEST_REQUIRES_ARM_NEON;
18249 for (uint32_t n = 16; n <= 24; n += 8) {
18250 for (size_t k = 1; k <= 20; k += 5) {
18251 GemmMicrokernelTester()
18252 .mr(4)
18253 .nr(8)
18254 .kr(1)
18255 .sr(1)
18256 .m(4)
18257 .n(n)
18258 .k(k)
18259 .cn_stride(11)
18260 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18261 }
18262 }
18263 }
18264
18265 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8_subtile) {
18266 TEST_REQUIRES_ARM_NEON;
18267 for (uint32_t n = 16; n <= 24; n += 8) {
18268 for (size_t k = 1; k <= 20; k += 5) {
18269 for (uint32_t m = 1; m <= 4; m++) {
18270 GemmMicrokernelTester()
18271 .mr(4)
18272 .nr(8)
18273 .kr(1)
18274 .sr(1)
18275 .m(m)
18276 .n(n)
18277 .k(k)
18278 .iterations(1)
18279 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18280 }
18281 }
18282 }
18283 }
18284
18285 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, small_kernel) {
18286 TEST_REQUIRES_ARM_NEON;
18287 for (size_t k = 1; k <= 20; k += 5) {
18288 GemmMicrokernelTester()
18289 .mr(4)
18290 .nr(8)
18291 .kr(1)
18292 .sr(1)
18293 .m(4)
18294 .n(8)
18295 .k(k)
18296 .ks(3)
18297 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18298 }
18299 }
18300
18301 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, small_kernel_subtile) {
18302 TEST_REQUIRES_ARM_NEON;
18303 for (size_t k = 1; k <= 20; k += 5) {
18304 for (uint32_t m = 1; m <= 4; m++) {
18305 for (uint32_t n = 1; n <= 8; n++) {
18306 GemmMicrokernelTester()
18307 .mr(4)
18308 .nr(8)
18309 .kr(1)
18310 .sr(1)
18311 .m(m)
18312 .n(n)
18313 .k(k)
18314 .ks(3)
18315 .iterations(1)
18316 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18317 }
18318 }
18319 }
18320 }
18321
18322 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_gt_8_small_kernel) {
18323 TEST_REQUIRES_ARM_NEON;
18324 for (uint32_t n = 9; n < 16; n++) {
18325 for (size_t k = 1; k <= 20; k += 5) {
18326 GemmMicrokernelTester()
18327 .mr(4)
18328 .nr(8)
18329 .kr(1)
18330 .sr(1)
18331 .m(4)
18332 .n(8)
18333 .k(k)
18334 .ks(3)
18335 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18336 }
18337 }
18338 }
18339
18340 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, n_div_8_small_kernel) {
18341 TEST_REQUIRES_ARM_NEON;
18342 for (uint32_t n = 16; n <= 24; n += 8) {
18343 for (size_t k = 1; k <= 20; k += 5) {
18344 GemmMicrokernelTester()
18345 .mr(4)
18346 .nr(8)
18347 .kr(1)
18348 .sr(1)
18349 .m(4)
18350 .n(8)
18351 .k(k)
18352 .ks(3)
18353 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18354 }
18355 }
18356 }
18357
18358 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, strided_cm_subtile) {
18359 TEST_REQUIRES_ARM_NEON;
18360 for (size_t k = 1; k <= 20; k += 5) {
18361 for (uint32_t m = 1; m <= 4; m++) {
18362 for (uint32_t n = 1; n <= 8; n++) {
18363 GemmMicrokernelTester()
18364 .mr(4)
18365 .nr(8)
18366 .kr(1)
18367 .sr(1)
18368 .m(m)
18369 .n(n)
18370 .k(k)
18371 .cm_stride(11)
18372 .iterations(1)
18373 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18374 }
18375 }
18376 }
18377 }
18378
18379 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, a_offset) {
18380 TEST_REQUIRES_ARM_NEON;
18381 for (size_t k = 1; k <= 20; k += 5) {
18382 GemmMicrokernelTester()
18383 .mr(4)
18384 .nr(8)
18385 .kr(1)
18386 .sr(1)
18387 .m(4)
18388 .n(8)
18389 .k(k)
18390 .ks(3)
18391 .a_offset(83)
18392 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18393 }
18394 }
18395
18396 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, zero) {
18397 TEST_REQUIRES_ARM_NEON;
18398 for (uint32_t mz = 0; mz < 4; mz++) {
18399 for (size_t k = 1; k <= 20; k += 5) {
18400 GemmMicrokernelTester()
18401 .mr(4)
18402 .nr(8)
18403 .kr(1)
18404 .sr(1)
18405 .m(4)
18406 .n(8)
18407 .k(k)
18408 .ks(3)
18409 .a_offset(83)
18410 .zero_index(mz)
18411 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18412 }
18413 }
18414 }
18415
18416 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, qmin) {
18417 TEST_REQUIRES_ARM_NEON;
18418 GemmMicrokernelTester()
18419 .mr(4)
18420 .nr(8)
18421 .kr(1)
18422 .sr(1)
18423 .m(4)
18424 .n(8)
18425 .k(4)
18426 .qmin(128)
18427 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18428 }
18429
18430 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, qmax) {
18431 TEST_REQUIRES_ARM_NEON;
18432 GemmMicrokernelTester()
18433 .mr(4)
18434 .nr(8)
18435 .kr(1)
18436 .sr(1)
18437 .m(4)
18438 .n(8)
18439 .k(4)
18440 .qmax(128)
18441 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18442 }
18443
18444 TEST(F32_IGEMM_4X8__NEON_DUP_LD128, strided_cm) {
18445 TEST_REQUIRES_ARM_NEON;
18446 GemmMicrokernelTester()
18447 .mr(4)
18448 .nr(8)
18449 .kr(1)
18450 .sr(1)
18451 .m(4)
18452 .n(8)
18453 .k(4)
18454 .cm_stride(11)
18455 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld128);
18456 }
18457#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18458
18459
18460#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18461 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2) {
18462 TEST_REQUIRES_ARM_NEON;
18463 GemmMicrokernelTester()
18464 .mr(4)
18465 .nr(8)
18466 .kr(1)
18467 .sr(1)
18468 .m(4)
18469 .n(8)
18470 .k(2)
18471 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18472 }
18473
18474 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, strided_cn) {
18475 TEST_REQUIRES_ARM_NEON;
18476 GemmMicrokernelTester()
18477 .mr(4)
18478 .nr(8)
18479 .kr(1)
18480 .sr(1)
18481 .m(4)
18482 .n(8)
18483 .k(2)
18484 .cn_stride(11)
18485 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18486 }
18487
18488 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile) {
18489 TEST_REQUIRES_ARM_NEON;
18490 for (uint32_t m = 1; m <= 4; m++) {
18491 for (uint32_t n = 1; n <= 8; n++) {
18492 GemmMicrokernelTester()
18493 .mr(4)
18494 .nr(8)
18495 .kr(1)
18496 .sr(1)
18497 .m(m)
18498 .n(n)
18499 .k(2)
18500 .iterations(1)
18501 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18502 }
18503 }
18504 }
18505
18506 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
18507 TEST_REQUIRES_ARM_NEON;
18508 for (uint32_t m = 1; m <= 4; m++) {
18509 GemmMicrokernelTester()
18510 .mr(4)
18511 .nr(8)
18512 .kr(1)
18513 .sr(1)
18514 .m(m)
18515 .n(8)
18516 .k(2)
18517 .iterations(1)
18518 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18519 }
18520 }
18521
18522 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
18523 TEST_REQUIRES_ARM_NEON;
18524 for (uint32_t n = 1; n <= 8; n++) {
18525 GemmMicrokernelTester()
18526 .mr(4)
18527 .nr(8)
18528 .kr(1)
18529 .sr(1)
18530 .m(4)
18531 .n(n)
18532 .k(2)
18533 .iterations(1)
18534 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18535 }
18536 }
18537
18538 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_lt_2) {
18539 TEST_REQUIRES_ARM_NEON;
18540 for (size_t k = 1; k < 2; k++) {
18541 GemmMicrokernelTester()
18542 .mr(4)
18543 .nr(8)
18544 .kr(1)
18545 .sr(1)
18546 .m(4)
18547 .n(8)
18548 .k(k)
18549 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18550 }
18551 }
18552
18553 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_lt_2_subtile) {
18554 TEST_REQUIRES_ARM_NEON;
18555 for (size_t k = 1; k < 2; k++) {
18556 for (uint32_t m = 1; m <= 4; m++) {
18557 for (uint32_t n = 1; n <= 8; n++) {
18558 GemmMicrokernelTester()
18559 .mr(4)
18560 .nr(8)
18561 .kr(1)
18562 .sr(1)
18563 .m(m)
18564 .n(n)
18565 .k(k)
18566 .iterations(1)
18567 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18568 }
18569 }
18570 }
18571 }
18572
18573 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_gt_2) {
18574 TEST_REQUIRES_ARM_NEON;
18575 for (size_t k = 3; k < 4; k++) {
18576 GemmMicrokernelTester()
18577 .mr(4)
18578 .nr(8)
18579 .kr(1)
18580 .sr(1)
18581 .m(4)
18582 .n(8)
18583 .k(k)
18584 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18585 }
18586 }
18587
18588 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_gt_2_subtile) {
18589 TEST_REQUIRES_ARM_NEON;
18590 for (size_t k = 3; k < 4; k++) {
18591 for (uint32_t m = 1; m <= 4; m++) {
18592 for (uint32_t n = 1; n <= 8; n++) {
18593 GemmMicrokernelTester()
18594 .mr(4)
18595 .nr(8)
18596 .kr(1)
18597 .sr(1)
18598 .m(m)
18599 .n(n)
18600 .k(k)
18601 .iterations(1)
18602 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18603 }
18604 }
18605 }
18606 }
18607
18608 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_div_2) {
18609 TEST_REQUIRES_ARM_NEON;
18610 for (size_t k = 4; k <= 20; k += 2) {
18611 GemmMicrokernelTester()
18612 .mr(4)
18613 .nr(8)
18614 .kr(1)
18615 .sr(1)
18616 .m(4)
18617 .n(8)
18618 .k(k)
18619 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18620 }
18621 }
18622
18623 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, k_div_2_subtile) {
18624 TEST_REQUIRES_ARM_NEON;
18625 for (size_t k = 4; k <= 20; k += 2) {
18626 for (uint32_t m = 1; m <= 4; m++) {
18627 for (uint32_t n = 1; n <= 8; n++) {
18628 GemmMicrokernelTester()
18629 .mr(4)
18630 .nr(8)
18631 .kr(1)
18632 .sr(1)
18633 .m(m)
18634 .n(n)
18635 .k(k)
18636 .iterations(1)
18637 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18638 }
18639 }
18640 }
18641 }
18642
18643 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8) {
18644 TEST_REQUIRES_ARM_NEON;
18645 for (uint32_t n = 9; n < 16; n++) {
18646 for (size_t k = 1; k <= 10; k += 3) {
18647 GemmMicrokernelTester()
18648 .mr(4)
18649 .nr(8)
18650 .kr(1)
18651 .sr(1)
18652 .m(4)
18653 .n(8)
18654 .k(k)
18655 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18656 }
18657 }
18658 }
18659
18660 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
18661 TEST_REQUIRES_ARM_NEON;
18662 for (uint32_t n = 9; n < 16; n++) {
18663 for (size_t k = 1; k <= 10; k += 3) {
18664 GemmMicrokernelTester()
18665 .mr(4)
18666 .nr(8)
18667 .kr(1)
18668 .sr(1)
18669 .m(4)
18670 .n(8)
18671 .k(k)
18672 .cn_stride(11)
18673 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18674 }
18675 }
18676 }
18677
18678 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8_subtile) {
18679 TEST_REQUIRES_ARM_NEON;
18680 for (uint32_t n = 9; n < 16; n++) {
18681 for (size_t k = 1; k <= 10; k += 3) {
18682 for (uint32_t m = 1; m <= 4; m++) {
18683 GemmMicrokernelTester()
18684 .mr(4)
18685 .nr(8)
18686 .kr(1)
18687 .sr(1)
18688 .m(m)
18689 .n(n)
18690 .k(k)
18691 .iterations(1)
18692 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18693 }
18694 }
18695 }
18696 }
18697
18698 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8) {
18699 TEST_REQUIRES_ARM_NEON;
18700 for (uint32_t n = 16; n <= 24; n += 8) {
18701 for (size_t k = 1; k <= 10; k += 3) {
18702 GemmMicrokernelTester()
18703 .mr(4)
18704 .nr(8)
18705 .kr(1)
18706 .sr(1)
18707 .m(4)
18708 .n(8)
18709 .k(k)
18710 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18711 }
18712 }
18713 }
18714
18715 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8_strided_cn) {
18716 TEST_REQUIRES_ARM_NEON;
18717 for (uint32_t n = 16; n <= 24; n += 8) {
18718 for (size_t k = 1; k <= 10; k += 3) {
18719 GemmMicrokernelTester()
18720 .mr(4)
18721 .nr(8)
18722 .kr(1)
18723 .sr(1)
18724 .m(4)
18725 .n(n)
18726 .k(k)
18727 .cn_stride(11)
18728 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18729 }
18730 }
18731 }
18732
18733 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8_subtile) {
18734 TEST_REQUIRES_ARM_NEON;
18735 for (uint32_t n = 16; n <= 24; n += 8) {
18736 for (size_t k = 1; k <= 10; k += 3) {
18737 for (uint32_t m = 1; m <= 4; m++) {
18738 GemmMicrokernelTester()
18739 .mr(4)
18740 .nr(8)
18741 .kr(1)
18742 .sr(1)
18743 .m(m)
18744 .n(n)
18745 .k(k)
18746 .iterations(1)
18747 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18748 }
18749 }
18750 }
18751 }
18752
18753 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, small_kernel) {
18754 TEST_REQUIRES_ARM_NEON;
18755 for (size_t k = 1; k <= 10; k += 3) {
18756 GemmMicrokernelTester()
18757 .mr(4)
18758 .nr(8)
18759 .kr(1)
18760 .sr(1)
18761 .m(4)
18762 .n(8)
18763 .k(k)
18764 .ks(3)
18765 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18766 }
18767 }
18768
18769 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, small_kernel_subtile) {
18770 TEST_REQUIRES_ARM_NEON;
18771 for (size_t k = 1; k <= 10; k += 3) {
18772 for (uint32_t m = 1; m <= 4; m++) {
18773 for (uint32_t n = 1; n <= 8; n++) {
18774 GemmMicrokernelTester()
18775 .mr(4)
18776 .nr(8)
18777 .kr(1)
18778 .sr(1)
18779 .m(m)
18780 .n(n)
18781 .k(k)
18782 .ks(3)
18783 .iterations(1)
18784 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18785 }
18786 }
18787 }
18788 }
18789
18790 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_gt_8_small_kernel) {
18791 TEST_REQUIRES_ARM_NEON;
18792 for (uint32_t n = 9; n < 16; n++) {
18793 for (size_t k = 1; k <= 10; k += 3) {
18794 GemmMicrokernelTester()
18795 .mr(4)
18796 .nr(8)
18797 .kr(1)
18798 .sr(1)
18799 .m(4)
18800 .n(8)
18801 .k(k)
18802 .ks(3)
18803 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18804 }
18805 }
18806 }
18807
18808 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, n_div_8_small_kernel) {
18809 TEST_REQUIRES_ARM_NEON;
18810 for (uint32_t n = 16; n <= 24; n += 8) {
18811 for (size_t k = 1; k <= 10; k += 3) {
18812 GemmMicrokernelTester()
18813 .mr(4)
18814 .nr(8)
18815 .kr(1)
18816 .sr(1)
18817 .m(4)
18818 .n(8)
18819 .k(k)
18820 .ks(3)
18821 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18822 }
18823 }
18824 }
18825
18826 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, strided_cm_subtile) {
18827 TEST_REQUIRES_ARM_NEON;
18828 for (size_t k = 1; k <= 10; k += 3) {
18829 for (uint32_t m = 1; m <= 4; m++) {
18830 for (uint32_t n = 1; n <= 8; n++) {
18831 GemmMicrokernelTester()
18832 .mr(4)
18833 .nr(8)
18834 .kr(1)
18835 .sr(1)
18836 .m(m)
18837 .n(n)
18838 .k(k)
18839 .cm_stride(11)
18840 .iterations(1)
18841 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18842 }
18843 }
18844 }
18845 }
18846
18847 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, a_offset) {
18848 TEST_REQUIRES_ARM_NEON;
18849 for (size_t k = 1; k <= 10; k += 3) {
18850 GemmMicrokernelTester()
18851 .mr(4)
18852 .nr(8)
18853 .kr(1)
18854 .sr(1)
18855 .m(4)
18856 .n(8)
18857 .k(k)
18858 .ks(3)
18859 .a_offset(43)
18860 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18861 }
18862 }
18863
18864 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, zero) {
18865 TEST_REQUIRES_ARM_NEON;
18866 for (uint32_t mz = 0; mz < 4; mz++) {
18867 for (size_t k = 1; k <= 10; k += 3) {
18868 GemmMicrokernelTester()
18869 .mr(4)
18870 .nr(8)
18871 .kr(1)
18872 .sr(1)
18873 .m(4)
18874 .n(8)
18875 .k(k)
18876 .ks(3)
18877 .a_offset(43)
18878 .zero_index(mz)
18879 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18880 }
18881 }
18882 }
18883
18884 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, qmin) {
18885 TEST_REQUIRES_ARM_NEON;
18886 GemmMicrokernelTester()
18887 .mr(4)
18888 .nr(8)
18889 .kr(1)
18890 .sr(1)
18891 .m(4)
18892 .n(8)
18893 .k(2)
18894 .qmin(128)
18895 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18896 }
18897
18898 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, qmax) {
18899 TEST_REQUIRES_ARM_NEON;
18900 GemmMicrokernelTester()
18901 .mr(4)
18902 .nr(8)
18903 .kr(1)
18904 .sr(1)
18905 .m(4)
18906 .n(8)
18907 .k(2)
18908 .qmax(128)
18909 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18910 }
18911
18912 TEST(F32_IGEMM_4X8__NEON_DUP_LD64, strided_cm) {
18913 TEST_REQUIRES_ARM_NEON;
18914 GemmMicrokernelTester()
18915 .mr(4)
18916 .nr(8)
18917 .kr(1)
18918 .sr(1)
18919 .m(4)
18920 .n(8)
18921 .k(2)
18922 .cm_stride(11)
18923 .Test(xnn_f32_igemm_ukernel_4x8__neon_dup_ld64);
18924 }
18925#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18926
18927
18928#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18929 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2) {
18930 TEST_REQUIRES_ARM_NEON;
18931 GemmMicrokernelTester()
18932 .mr(6)
18933 .nr(8)
18934 .kr(1)
18935 .sr(1)
18936 .m(6)
18937 .n(8)
18938 .k(2)
18939 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
18940 }
18941
18942 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, strided_cn) {
18943 TEST_REQUIRES_ARM_NEON;
18944 GemmMicrokernelTester()
18945 .mr(6)
18946 .nr(8)
18947 .kr(1)
18948 .sr(1)
18949 .m(6)
18950 .n(8)
18951 .k(2)
18952 .cn_stride(11)
18953 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
18954 }
18955
18956 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
18957 TEST_REQUIRES_ARM_NEON;
18958 for (uint32_t m = 1; m <= 6; m++) {
18959 for (uint32_t n = 1; n <= 8; n++) {
18960 GemmMicrokernelTester()
18961 .mr(6)
18962 .nr(8)
18963 .kr(1)
18964 .sr(1)
18965 .m(m)
18966 .n(n)
18967 .k(2)
18968 .iterations(1)
18969 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
18970 }
18971 }
18972 }
18973
18974 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
18975 TEST_REQUIRES_ARM_NEON;
18976 for (uint32_t m = 1; m <= 6; m++) {
18977 GemmMicrokernelTester()
18978 .mr(6)
18979 .nr(8)
18980 .kr(1)
18981 .sr(1)
18982 .m(m)
18983 .n(8)
18984 .k(2)
18985 .iterations(1)
18986 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
18987 }
18988 }
18989
18990 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
18991 TEST_REQUIRES_ARM_NEON;
18992 for (uint32_t n = 1; n <= 8; n++) {
18993 GemmMicrokernelTester()
18994 .mr(6)
18995 .nr(8)
18996 .kr(1)
18997 .sr(1)
18998 .m(6)
18999 .n(n)
19000 .k(2)
19001 .iterations(1)
19002 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19003 }
19004 }
19005
19006 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_lt_2) {
19007 TEST_REQUIRES_ARM_NEON;
19008 for (size_t k = 1; k < 2; k++) {
19009 GemmMicrokernelTester()
19010 .mr(6)
19011 .nr(8)
19012 .kr(1)
19013 .sr(1)
19014 .m(6)
19015 .n(8)
19016 .k(k)
19017 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19018 }
19019 }
19020
19021 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
19022 TEST_REQUIRES_ARM_NEON;
19023 for (size_t k = 1; k < 2; k++) {
19024 for (uint32_t m = 1; m <= 6; m++) {
19025 for (uint32_t n = 1; n <= 8; n++) {
19026 GemmMicrokernelTester()
19027 .mr(6)
19028 .nr(8)
19029 .kr(1)
19030 .sr(1)
19031 .m(m)
19032 .n(n)
19033 .k(k)
19034 .iterations(1)
19035 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19036 }
19037 }
19038 }
19039 }
19040
19041 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_gt_2) {
19042 TEST_REQUIRES_ARM_NEON;
19043 for (size_t k = 3; k < 4; k++) {
19044 GemmMicrokernelTester()
19045 .mr(6)
19046 .nr(8)
19047 .kr(1)
19048 .sr(1)
19049 .m(6)
19050 .n(8)
19051 .k(k)
19052 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19053 }
19054 }
19055
19056 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
19057 TEST_REQUIRES_ARM_NEON;
19058 for (size_t k = 3; k < 4; k++) {
19059 for (uint32_t m = 1; m <= 6; m++) {
19060 for (uint32_t n = 1; n <= 8; n++) {
19061 GemmMicrokernelTester()
19062 .mr(6)
19063 .nr(8)
19064 .kr(1)
19065 .sr(1)
19066 .m(m)
19067 .n(n)
19068 .k(k)
19069 .iterations(1)
19070 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19071 }
19072 }
19073 }
19074 }
19075
19076 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_div_2) {
19077 TEST_REQUIRES_ARM_NEON;
19078 for (size_t k = 4; k <= 20; k += 2) {
19079 GemmMicrokernelTester()
19080 .mr(6)
19081 .nr(8)
19082 .kr(1)
19083 .sr(1)
19084 .m(6)
19085 .n(8)
19086 .k(k)
19087 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19088 }
19089 }
19090
19091 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, k_div_2_subtile) {
19092 TEST_REQUIRES_ARM_NEON;
19093 for (size_t k = 4; k <= 20; k += 2) {
19094 for (uint32_t m = 1; m <= 6; m++) {
19095 for (uint32_t n = 1; n <= 8; n++) {
19096 GemmMicrokernelTester()
19097 .mr(6)
19098 .nr(8)
19099 .kr(1)
19100 .sr(1)
19101 .m(m)
19102 .n(n)
19103 .k(k)
19104 .iterations(1)
19105 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19106 }
19107 }
19108 }
19109 }
19110
19111 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8) {
19112 TEST_REQUIRES_ARM_NEON;
19113 for (uint32_t n = 9; n < 16; n++) {
19114 for (size_t k = 1; k <= 10; k += 3) {
19115 GemmMicrokernelTester()
19116 .mr(6)
19117 .nr(8)
19118 .kr(1)
19119 .sr(1)
19120 .m(6)
19121 .n(8)
19122 .k(k)
19123 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19124 }
19125 }
19126 }
19127
19128 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
19129 TEST_REQUIRES_ARM_NEON;
19130 for (uint32_t n = 9; n < 16; n++) {
19131 for (size_t k = 1; k <= 10; k += 3) {
19132 GemmMicrokernelTester()
19133 .mr(6)
19134 .nr(8)
19135 .kr(1)
19136 .sr(1)
19137 .m(6)
19138 .n(8)
19139 .k(k)
19140 .cn_stride(11)
19141 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19142 }
19143 }
19144 }
19145
19146 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
19147 TEST_REQUIRES_ARM_NEON;
19148 for (uint32_t n = 9; n < 16; n++) {
19149 for (size_t k = 1; k <= 10; k += 3) {
19150 for (uint32_t m = 1; m <= 6; m++) {
19151 GemmMicrokernelTester()
19152 .mr(6)
19153 .nr(8)
19154 .kr(1)
19155 .sr(1)
19156 .m(m)
19157 .n(n)
19158 .k(k)
19159 .iterations(1)
19160 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19161 }
19162 }
19163 }
19164 }
19165
19166 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8) {
19167 TEST_REQUIRES_ARM_NEON;
19168 for (uint32_t n = 16; n <= 24; n += 8) {
19169 for (size_t k = 1; k <= 10; k += 3) {
19170 GemmMicrokernelTester()
19171 .mr(6)
19172 .nr(8)
19173 .kr(1)
19174 .sr(1)
19175 .m(6)
19176 .n(8)
19177 .k(k)
19178 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19179 }
19180 }
19181 }
19182
19183 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
19184 TEST_REQUIRES_ARM_NEON;
19185 for (uint32_t n = 16; n <= 24; n += 8) {
19186 for (size_t k = 1; k <= 10; k += 3) {
19187 GemmMicrokernelTester()
19188 .mr(6)
19189 .nr(8)
19190 .kr(1)
19191 .sr(1)
19192 .m(6)
19193 .n(n)
19194 .k(k)
19195 .cn_stride(11)
19196 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19197 }
19198 }
19199 }
19200
19201 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8_subtile) {
19202 TEST_REQUIRES_ARM_NEON;
19203 for (uint32_t n = 16; n <= 24; n += 8) {
19204 for (size_t k = 1; k <= 10; k += 3) {
19205 for (uint32_t m = 1; m <= 6; m++) {
19206 GemmMicrokernelTester()
19207 .mr(6)
19208 .nr(8)
19209 .kr(1)
19210 .sr(1)
19211 .m(m)
19212 .n(n)
19213 .k(k)
19214 .iterations(1)
19215 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19216 }
19217 }
19218 }
19219 }
19220
19221 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, small_kernel) {
19222 TEST_REQUIRES_ARM_NEON;
19223 for (size_t k = 1; k <= 10; k += 3) {
19224 GemmMicrokernelTester()
19225 .mr(6)
19226 .nr(8)
19227 .kr(1)
19228 .sr(1)
19229 .m(6)
19230 .n(8)
19231 .k(k)
19232 .ks(3)
19233 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19234 }
19235 }
19236
19237 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, small_kernel_subtile) {
19238 TEST_REQUIRES_ARM_NEON;
19239 for (size_t k = 1; k <= 10; k += 3) {
19240 for (uint32_t m = 1; m <= 6; m++) {
19241 for (uint32_t n = 1; n <= 8; n++) {
19242 GemmMicrokernelTester()
19243 .mr(6)
19244 .nr(8)
19245 .kr(1)
19246 .sr(1)
19247 .m(m)
19248 .n(n)
19249 .k(k)
19250 .ks(3)
19251 .iterations(1)
19252 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19253 }
19254 }
19255 }
19256 }
19257
19258 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_gt_8_small_kernel) {
19259 TEST_REQUIRES_ARM_NEON;
19260 for (uint32_t n = 9; n < 16; n++) {
19261 for (size_t k = 1; k <= 10; k += 3) {
19262 GemmMicrokernelTester()
19263 .mr(6)
19264 .nr(8)
19265 .kr(1)
19266 .sr(1)
19267 .m(6)
19268 .n(8)
19269 .k(k)
19270 .ks(3)
19271 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19272 }
19273 }
19274 }
19275
19276 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, n_div_8_small_kernel) {
19277 TEST_REQUIRES_ARM_NEON;
19278 for (uint32_t n = 16; n <= 24; n += 8) {
19279 for (size_t k = 1; k <= 10; k += 3) {
19280 GemmMicrokernelTester()
19281 .mr(6)
19282 .nr(8)
19283 .kr(1)
19284 .sr(1)
19285 .m(6)
19286 .n(8)
19287 .k(k)
19288 .ks(3)
19289 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19290 }
19291 }
19292 }
19293
19294 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, strided_cm_subtile) {
19295 TEST_REQUIRES_ARM_NEON;
19296 for (size_t k = 1; k <= 10; k += 3) {
19297 for (uint32_t m = 1; m <= 6; m++) {
19298 for (uint32_t n = 1; n <= 8; n++) {
19299 GemmMicrokernelTester()
19300 .mr(6)
19301 .nr(8)
19302 .kr(1)
19303 .sr(1)
19304 .m(m)
19305 .n(n)
19306 .k(k)
19307 .cm_stride(11)
19308 .iterations(1)
19309 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19310 }
19311 }
19312 }
19313 }
19314
19315 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, a_offset) {
19316 TEST_REQUIRES_ARM_NEON;
19317 for (size_t k = 1; k <= 10; k += 3) {
19318 GemmMicrokernelTester()
19319 .mr(6)
19320 .nr(8)
19321 .kr(1)
19322 .sr(1)
19323 .m(6)
19324 .n(8)
19325 .k(k)
19326 .ks(3)
19327 .a_offset(67)
19328 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19329 }
19330 }
19331
19332 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, zero) {
19333 TEST_REQUIRES_ARM_NEON;
19334 for (uint32_t mz = 0; mz < 6; mz++) {
19335 for (size_t k = 1; k <= 10; k += 3) {
19336 GemmMicrokernelTester()
19337 .mr(6)
19338 .nr(8)
19339 .kr(1)
19340 .sr(1)
19341 .m(6)
19342 .n(8)
19343 .k(k)
19344 .ks(3)
19345 .a_offset(67)
19346 .zero_index(mz)
19347 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19348 }
19349 }
19350 }
19351
19352 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, qmin) {
19353 TEST_REQUIRES_ARM_NEON;
19354 GemmMicrokernelTester()
19355 .mr(6)
19356 .nr(8)
19357 .kr(1)
19358 .sr(1)
19359 .m(6)
19360 .n(8)
19361 .k(2)
19362 .qmin(128)
19363 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19364 }
19365
19366 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, qmax) {
19367 TEST_REQUIRES_ARM_NEON;
19368 GemmMicrokernelTester()
19369 .mr(6)
19370 .nr(8)
19371 .kr(1)
19372 .sr(1)
19373 .m(6)
19374 .n(8)
19375 .k(2)
19376 .qmax(128)
19377 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19378 }
19379
19380 TEST(F32_IGEMM_6X8__NEON_DUP_LD64, strided_cm) {
19381 TEST_REQUIRES_ARM_NEON;
19382 GemmMicrokernelTester()
19383 .mr(6)
19384 .nr(8)
19385 .kr(1)
19386 .sr(1)
19387 .m(6)
19388 .n(8)
19389 .k(2)
19390 .cm_stride(11)
19391 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld64);
19392 }
19393#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19394
19395
19396#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19397 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4) {
19398 TEST_REQUIRES_ARM_NEON;
19399 GemmMicrokernelTester()
19400 .mr(6)
19401 .nr(8)
19402 .kr(1)
19403 .sr(1)
19404 .m(6)
19405 .n(8)
19406 .k(4)
19407 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19408 }
19409
19410 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, strided_cn) {
19411 TEST_REQUIRES_ARM_NEON;
19412 GemmMicrokernelTester()
19413 .mr(6)
19414 .nr(8)
19415 .kr(1)
19416 .sr(1)
19417 .m(6)
19418 .n(8)
19419 .k(4)
19420 .cn_stride(11)
19421 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19422 }
19423
19424 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile) {
19425 TEST_REQUIRES_ARM_NEON;
19426 for (uint32_t m = 1; m <= 6; m++) {
19427 for (uint32_t n = 1; n <= 8; n++) {
19428 GemmMicrokernelTester()
19429 .mr(6)
19430 .nr(8)
19431 .kr(1)
19432 .sr(1)
19433 .m(m)
19434 .n(n)
19435 .k(4)
19436 .iterations(1)
19437 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19438 }
19439 }
19440 }
19441
19442 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
19443 TEST_REQUIRES_ARM_NEON;
19444 for (uint32_t m = 1; m <= 6; m++) {
19445 GemmMicrokernelTester()
19446 .mr(6)
19447 .nr(8)
19448 .kr(1)
19449 .sr(1)
19450 .m(m)
19451 .n(8)
19452 .k(4)
19453 .iterations(1)
19454 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19455 }
19456 }
19457
19458 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
19459 TEST_REQUIRES_ARM_NEON;
19460 for (uint32_t n = 1; n <= 8; n++) {
19461 GemmMicrokernelTester()
19462 .mr(6)
19463 .nr(8)
19464 .kr(1)
19465 .sr(1)
19466 .m(6)
19467 .n(n)
19468 .k(4)
19469 .iterations(1)
19470 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19471 }
19472 }
19473
19474 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_lt_4) {
19475 TEST_REQUIRES_ARM_NEON;
19476 for (size_t k = 1; k < 4; k++) {
19477 GemmMicrokernelTester()
19478 .mr(6)
19479 .nr(8)
19480 .kr(1)
19481 .sr(1)
19482 .m(6)
19483 .n(8)
19484 .k(k)
19485 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19486 }
19487 }
19488
19489 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_lt_4_subtile) {
19490 TEST_REQUIRES_ARM_NEON;
19491 for (size_t k = 1; k < 4; k++) {
19492 for (uint32_t m = 1; m <= 6; m++) {
19493 for (uint32_t n = 1; n <= 8; n++) {
19494 GemmMicrokernelTester()
19495 .mr(6)
19496 .nr(8)
19497 .kr(1)
19498 .sr(1)
19499 .m(m)
19500 .n(n)
19501 .k(k)
19502 .iterations(1)
19503 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19504 }
19505 }
19506 }
19507 }
19508
19509 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_gt_4) {
19510 TEST_REQUIRES_ARM_NEON;
19511 for (size_t k = 5; k < 8; k++) {
19512 GemmMicrokernelTester()
19513 .mr(6)
19514 .nr(8)
19515 .kr(1)
19516 .sr(1)
19517 .m(6)
19518 .n(8)
19519 .k(k)
19520 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19521 }
19522 }
19523
19524 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_gt_4_subtile) {
19525 TEST_REQUIRES_ARM_NEON;
19526 for (size_t k = 5; k < 8; k++) {
19527 for (uint32_t m = 1; m <= 6; m++) {
19528 for (uint32_t n = 1; n <= 8; n++) {
19529 GemmMicrokernelTester()
19530 .mr(6)
19531 .nr(8)
19532 .kr(1)
19533 .sr(1)
19534 .m(m)
19535 .n(n)
19536 .k(k)
19537 .iterations(1)
19538 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19539 }
19540 }
19541 }
19542 }
19543
19544 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_div_4) {
19545 TEST_REQUIRES_ARM_NEON;
19546 for (size_t k = 8; k <= 40; k += 4) {
19547 GemmMicrokernelTester()
19548 .mr(6)
19549 .nr(8)
19550 .kr(1)
19551 .sr(1)
19552 .m(6)
19553 .n(8)
19554 .k(k)
19555 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19556 }
19557 }
19558
19559 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, k_div_4_subtile) {
19560 TEST_REQUIRES_ARM_NEON;
19561 for (size_t k = 8; k <= 40; k += 4) {
19562 for (uint32_t m = 1; m <= 6; m++) {
19563 for (uint32_t n = 1; n <= 8; n++) {
19564 GemmMicrokernelTester()
19565 .mr(6)
19566 .nr(8)
19567 .kr(1)
19568 .sr(1)
19569 .m(m)
19570 .n(n)
19571 .k(k)
19572 .iterations(1)
19573 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19574 }
19575 }
19576 }
19577 }
19578
19579 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8) {
19580 TEST_REQUIRES_ARM_NEON;
19581 for (uint32_t n = 9; n < 16; n++) {
19582 for (size_t k = 1; k <= 20; k += 5) {
19583 GemmMicrokernelTester()
19584 .mr(6)
19585 .nr(8)
19586 .kr(1)
19587 .sr(1)
19588 .m(6)
19589 .n(8)
19590 .k(k)
19591 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19592 }
19593 }
19594 }
19595
19596 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
19597 TEST_REQUIRES_ARM_NEON;
19598 for (uint32_t n = 9; n < 16; n++) {
19599 for (size_t k = 1; k <= 20; k += 5) {
19600 GemmMicrokernelTester()
19601 .mr(6)
19602 .nr(8)
19603 .kr(1)
19604 .sr(1)
19605 .m(6)
19606 .n(8)
19607 .k(k)
19608 .cn_stride(11)
19609 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19610 }
19611 }
19612 }
19613
19614 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8_subtile) {
19615 TEST_REQUIRES_ARM_NEON;
19616 for (uint32_t n = 9; n < 16; n++) {
19617 for (size_t k = 1; k <= 20; k += 5) {
19618 for (uint32_t m = 1; m <= 6; m++) {
19619 GemmMicrokernelTester()
19620 .mr(6)
19621 .nr(8)
19622 .kr(1)
19623 .sr(1)
19624 .m(m)
19625 .n(n)
19626 .k(k)
19627 .iterations(1)
19628 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19629 }
19630 }
19631 }
19632 }
19633
19634 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8) {
19635 TEST_REQUIRES_ARM_NEON;
19636 for (uint32_t n = 16; n <= 24; n += 8) {
19637 for (size_t k = 1; k <= 20; k += 5) {
19638 GemmMicrokernelTester()
19639 .mr(6)
19640 .nr(8)
19641 .kr(1)
19642 .sr(1)
19643 .m(6)
19644 .n(8)
19645 .k(k)
19646 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19647 }
19648 }
19649 }
19650
19651 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8_strided_cn) {
19652 TEST_REQUIRES_ARM_NEON;
19653 for (uint32_t n = 16; n <= 24; n += 8) {
19654 for (size_t k = 1; k <= 20; k += 5) {
19655 GemmMicrokernelTester()
19656 .mr(6)
19657 .nr(8)
19658 .kr(1)
19659 .sr(1)
19660 .m(6)
19661 .n(n)
19662 .k(k)
19663 .cn_stride(11)
19664 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19665 }
19666 }
19667 }
19668
19669 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8_subtile) {
19670 TEST_REQUIRES_ARM_NEON;
19671 for (uint32_t n = 16; n <= 24; n += 8) {
19672 for (size_t k = 1; k <= 20; k += 5) {
19673 for (uint32_t m = 1; m <= 6; m++) {
19674 GemmMicrokernelTester()
19675 .mr(6)
19676 .nr(8)
19677 .kr(1)
19678 .sr(1)
19679 .m(m)
19680 .n(n)
19681 .k(k)
19682 .iterations(1)
19683 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19684 }
19685 }
19686 }
19687 }
19688
19689 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, small_kernel) {
19690 TEST_REQUIRES_ARM_NEON;
19691 for (size_t k = 1; k <= 20; k += 5) {
19692 GemmMicrokernelTester()
19693 .mr(6)
19694 .nr(8)
19695 .kr(1)
19696 .sr(1)
19697 .m(6)
19698 .n(8)
19699 .k(k)
19700 .ks(3)
19701 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19702 }
19703 }
19704
19705 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, small_kernel_subtile) {
19706 TEST_REQUIRES_ARM_NEON;
19707 for (size_t k = 1; k <= 20; k += 5) {
19708 for (uint32_t m = 1; m <= 6; m++) {
19709 for (uint32_t n = 1; n <= 8; n++) {
19710 GemmMicrokernelTester()
19711 .mr(6)
19712 .nr(8)
19713 .kr(1)
19714 .sr(1)
19715 .m(m)
19716 .n(n)
19717 .k(k)
19718 .ks(3)
19719 .iterations(1)
19720 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19721 }
19722 }
19723 }
19724 }
19725
19726 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_gt_8_small_kernel) {
19727 TEST_REQUIRES_ARM_NEON;
19728 for (uint32_t n = 9; n < 16; n++) {
19729 for (size_t k = 1; k <= 20; k += 5) {
19730 GemmMicrokernelTester()
19731 .mr(6)
19732 .nr(8)
19733 .kr(1)
19734 .sr(1)
19735 .m(6)
19736 .n(8)
19737 .k(k)
19738 .ks(3)
19739 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19740 }
19741 }
19742 }
19743
19744 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, n_div_8_small_kernel) {
19745 TEST_REQUIRES_ARM_NEON;
19746 for (uint32_t n = 16; n <= 24; n += 8) {
19747 for (size_t k = 1; k <= 20; k += 5) {
19748 GemmMicrokernelTester()
19749 .mr(6)
19750 .nr(8)
19751 .kr(1)
19752 .sr(1)
19753 .m(6)
19754 .n(8)
19755 .k(k)
19756 .ks(3)
19757 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19758 }
19759 }
19760 }
19761
19762 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, strided_cm_subtile) {
19763 TEST_REQUIRES_ARM_NEON;
19764 for (size_t k = 1; k <= 20; k += 5) {
19765 for (uint32_t m = 1; m <= 6; m++) {
19766 for (uint32_t n = 1; n <= 8; n++) {
19767 GemmMicrokernelTester()
19768 .mr(6)
19769 .nr(8)
19770 .kr(1)
19771 .sr(1)
19772 .m(m)
19773 .n(n)
19774 .k(k)
19775 .cm_stride(11)
19776 .iterations(1)
19777 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19778 }
19779 }
19780 }
19781 }
19782
19783 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, a_offset) {
19784 TEST_REQUIRES_ARM_NEON;
19785 for (size_t k = 1; k <= 20; k += 5) {
19786 GemmMicrokernelTester()
19787 .mr(6)
19788 .nr(8)
19789 .kr(1)
19790 .sr(1)
19791 .m(6)
19792 .n(8)
19793 .k(k)
19794 .ks(3)
19795 .a_offset(127)
19796 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19797 }
19798 }
19799
19800 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, zero) {
19801 TEST_REQUIRES_ARM_NEON;
19802 for (uint32_t mz = 0; mz < 6; mz++) {
19803 for (size_t k = 1; k <= 20; k += 5) {
19804 GemmMicrokernelTester()
19805 .mr(6)
19806 .nr(8)
19807 .kr(1)
19808 .sr(1)
19809 .m(6)
19810 .n(8)
19811 .k(k)
19812 .ks(3)
19813 .a_offset(127)
19814 .zero_index(mz)
19815 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19816 }
19817 }
19818 }
19819
19820 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, qmin) {
19821 TEST_REQUIRES_ARM_NEON;
19822 GemmMicrokernelTester()
19823 .mr(6)
19824 .nr(8)
19825 .kr(1)
19826 .sr(1)
19827 .m(6)
19828 .n(8)
19829 .k(4)
19830 .qmin(128)
19831 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19832 }
19833
19834 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, qmax) {
19835 TEST_REQUIRES_ARM_NEON;
19836 GemmMicrokernelTester()
19837 .mr(6)
19838 .nr(8)
19839 .kr(1)
19840 .sr(1)
19841 .m(6)
19842 .n(8)
19843 .k(4)
19844 .qmax(128)
19845 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19846 }
19847
19848 TEST(F32_IGEMM_6X8__NEON_DUP_LD128, strided_cm) {
19849 TEST_REQUIRES_ARM_NEON;
19850 GemmMicrokernelTester()
19851 .mr(6)
19852 .nr(8)
19853 .kr(1)
19854 .sr(1)
19855 .m(6)
19856 .n(8)
19857 .k(4)
19858 .cm_stride(11)
19859 .Test(xnn_f32_igemm_ukernel_6x8__neon_dup_ld128);
19860 }
19861#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19862
19863
19864#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19865 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2) {
19866 TEST_REQUIRES_ARM_NEON_FMA;
19867 GemmMicrokernelTester()
19868 .mr(1)
19869 .nr(8)
19870 .kr(1)
19871 .sr(1)
19872 .m(1)
19873 .n(8)
19874 .k(2)
19875 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19876 }
19877
19878 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, strided_cn) {
19879 TEST_REQUIRES_ARM_NEON_FMA;
19880 GemmMicrokernelTester()
19881 .mr(1)
19882 .nr(8)
19883 .kr(1)
19884 .sr(1)
19885 .m(1)
19886 .n(8)
19887 .k(2)
19888 .cn_stride(11)
19889 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19890 }
19891
19892 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
19893 TEST_REQUIRES_ARM_NEON_FMA;
19894 for (uint32_t m = 1; m <= 1; m++) {
19895 for (uint32_t n = 1; n <= 8; n++) {
19896 GemmMicrokernelTester()
19897 .mr(1)
19898 .nr(8)
19899 .kr(1)
19900 .sr(1)
19901 .m(m)
19902 .n(n)
19903 .k(2)
19904 .iterations(1)
19905 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19906 }
19907 }
19908 }
19909
19910 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
19911 TEST_REQUIRES_ARM_NEON_FMA;
19912 for (uint32_t m = 1; m <= 1; m++) {
19913 GemmMicrokernelTester()
19914 .mr(1)
19915 .nr(8)
19916 .kr(1)
19917 .sr(1)
19918 .m(m)
19919 .n(8)
19920 .k(2)
19921 .iterations(1)
19922 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19923 }
19924 }
19925
19926 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
19927 TEST_REQUIRES_ARM_NEON_FMA;
19928 for (uint32_t n = 1; n <= 8; n++) {
19929 GemmMicrokernelTester()
19930 .mr(1)
19931 .nr(8)
19932 .kr(1)
19933 .sr(1)
19934 .m(1)
19935 .n(n)
19936 .k(2)
19937 .iterations(1)
19938 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19939 }
19940 }
19941
19942 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_lt_2) {
19943 TEST_REQUIRES_ARM_NEON_FMA;
19944 for (size_t k = 1; k < 2; k++) {
19945 GemmMicrokernelTester()
19946 .mr(1)
19947 .nr(8)
19948 .kr(1)
19949 .sr(1)
19950 .m(1)
19951 .n(8)
19952 .k(k)
19953 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19954 }
19955 }
19956
19957 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
19958 TEST_REQUIRES_ARM_NEON_FMA;
19959 for (size_t k = 1; k < 2; k++) {
19960 for (uint32_t m = 1; m <= 1; m++) {
19961 for (uint32_t n = 1; n <= 8; n++) {
19962 GemmMicrokernelTester()
19963 .mr(1)
19964 .nr(8)
19965 .kr(1)
19966 .sr(1)
19967 .m(m)
19968 .n(n)
19969 .k(k)
19970 .iterations(1)
19971 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19972 }
19973 }
19974 }
19975 }
19976
19977 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_gt_2) {
19978 TEST_REQUIRES_ARM_NEON_FMA;
19979 for (size_t k = 3; k < 4; k++) {
19980 GemmMicrokernelTester()
19981 .mr(1)
19982 .nr(8)
19983 .kr(1)
19984 .sr(1)
19985 .m(1)
19986 .n(8)
19987 .k(k)
19988 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
19989 }
19990 }
19991
19992 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
19993 TEST_REQUIRES_ARM_NEON_FMA;
19994 for (size_t k = 3; k < 4; k++) {
19995 for (uint32_t m = 1; m <= 1; m++) {
19996 for (uint32_t n = 1; n <= 8; n++) {
19997 GemmMicrokernelTester()
19998 .mr(1)
19999 .nr(8)
20000 .kr(1)
20001 .sr(1)
20002 .m(m)
20003 .n(n)
20004 .k(k)
20005 .iterations(1)
20006 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20007 }
20008 }
20009 }
20010 }
20011
20012 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_div_2) {
20013 TEST_REQUIRES_ARM_NEON_FMA;
20014 for (size_t k = 4; k <= 20; k += 2) {
20015 GemmMicrokernelTester()
20016 .mr(1)
20017 .nr(8)
20018 .kr(1)
20019 .sr(1)
20020 .m(1)
20021 .n(8)
20022 .k(k)
20023 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20024 }
20025 }
20026
20027 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
20028 TEST_REQUIRES_ARM_NEON_FMA;
20029 for (size_t k = 4; k <= 20; k += 2) {
20030 for (uint32_t m = 1; m <= 1; m++) {
20031 for (uint32_t n = 1; n <= 8; n++) {
20032 GemmMicrokernelTester()
20033 .mr(1)
20034 .nr(8)
20035 .kr(1)
20036 .sr(1)
20037 .m(m)
20038 .n(n)
20039 .k(k)
20040 .iterations(1)
20041 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20042 }
20043 }
20044 }
20045 }
20046
20047 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8) {
20048 TEST_REQUIRES_ARM_NEON_FMA;
20049 for (uint32_t n = 9; n < 16; n++) {
20050 for (size_t k = 1; k <= 10; k += 3) {
20051 GemmMicrokernelTester()
20052 .mr(1)
20053 .nr(8)
20054 .kr(1)
20055 .sr(1)
20056 .m(1)
20057 .n(8)
20058 .k(k)
20059 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20060 }
20061 }
20062 }
20063
20064 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
20065 TEST_REQUIRES_ARM_NEON_FMA;
20066 for (uint32_t n = 9; n < 16; n++) {
20067 for (size_t k = 1; k <= 10; k += 3) {
20068 GemmMicrokernelTester()
20069 .mr(1)
20070 .nr(8)
20071 .kr(1)
20072 .sr(1)
20073 .m(1)
20074 .n(8)
20075 .k(k)
20076 .cn_stride(11)
20077 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20078 }
20079 }
20080 }
20081
20082 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
20083 TEST_REQUIRES_ARM_NEON_FMA;
20084 for (uint32_t n = 9; n < 16; n++) {
20085 for (size_t k = 1; k <= 10; k += 3) {
20086 for (uint32_t m = 1; m <= 1; m++) {
20087 GemmMicrokernelTester()
20088 .mr(1)
20089 .nr(8)
20090 .kr(1)
20091 .sr(1)
20092 .m(m)
20093 .n(n)
20094 .k(k)
20095 .iterations(1)
20096 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20097 }
20098 }
20099 }
20100 }
20101
20102 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8) {
20103 TEST_REQUIRES_ARM_NEON_FMA;
20104 for (uint32_t n = 16; n <= 24; n += 8) {
20105 for (size_t k = 1; k <= 10; k += 3) {
20106 GemmMicrokernelTester()
20107 .mr(1)
20108 .nr(8)
20109 .kr(1)
20110 .sr(1)
20111 .m(1)
20112 .n(8)
20113 .k(k)
20114 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20115 }
20116 }
20117 }
20118
20119 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
20120 TEST_REQUIRES_ARM_NEON_FMA;
20121 for (uint32_t n = 16; n <= 24; n += 8) {
20122 for (size_t k = 1; k <= 10; k += 3) {
20123 GemmMicrokernelTester()
20124 .mr(1)
20125 .nr(8)
20126 .kr(1)
20127 .sr(1)
20128 .m(1)
20129 .n(n)
20130 .k(k)
20131 .cn_stride(11)
20132 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20133 }
20134 }
20135 }
20136
20137 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
20138 TEST_REQUIRES_ARM_NEON_FMA;
20139 for (uint32_t n = 16; n <= 24; n += 8) {
20140 for (size_t k = 1; k <= 10; k += 3) {
20141 for (uint32_t m = 1; m <= 1; m++) {
20142 GemmMicrokernelTester()
20143 .mr(1)
20144 .nr(8)
20145 .kr(1)
20146 .sr(1)
20147 .m(m)
20148 .n(n)
20149 .k(k)
20150 .iterations(1)
20151 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20152 }
20153 }
20154 }
20155 }
20156
20157 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, small_kernel) {
20158 TEST_REQUIRES_ARM_NEON_FMA;
20159 for (size_t k = 1; k <= 10; k += 3) {
20160 GemmMicrokernelTester()
20161 .mr(1)
20162 .nr(8)
20163 .kr(1)
20164 .sr(1)
20165 .m(1)
20166 .n(8)
20167 .k(k)
20168 .ks(3)
20169 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20170 }
20171 }
20172
20173 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, small_kernel_subtile) {
20174 TEST_REQUIRES_ARM_NEON_FMA;
20175 for (size_t k = 1; k <= 10; k += 3) {
20176 for (uint32_t m = 1; m <= 1; m++) {
20177 for (uint32_t n = 1; n <= 8; n++) {
20178 GemmMicrokernelTester()
20179 .mr(1)
20180 .nr(8)
20181 .kr(1)
20182 .sr(1)
20183 .m(m)
20184 .n(n)
20185 .k(k)
20186 .ks(3)
20187 .iterations(1)
20188 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20189 }
20190 }
20191 }
20192 }
20193
20194 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_small_kernel) {
20195 TEST_REQUIRES_ARM_NEON_FMA;
20196 for (uint32_t n = 9; n < 16; n++) {
20197 for (size_t k = 1; k <= 10; k += 3) {
20198 GemmMicrokernelTester()
20199 .mr(1)
20200 .nr(8)
20201 .kr(1)
20202 .sr(1)
20203 .m(1)
20204 .n(8)
20205 .k(k)
20206 .ks(3)
20207 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20208 }
20209 }
20210 }
20211
20212 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, n_div_8_small_kernel) {
20213 TEST_REQUIRES_ARM_NEON_FMA;
20214 for (uint32_t n = 16; n <= 24; n += 8) {
20215 for (size_t k = 1; k <= 10; k += 3) {
20216 GemmMicrokernelTester()
20217 .mr(1)
20218 .nr(8)
20219 .kr(1)
20220 .sr(1)
20221 .m(1)
20222 .n(8)
20223 .k(k)
20224 .ks(3)
20225 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20226 }
20227 }
20228 }
20229
20230 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
20231 TEST_REQUIRES_ARM_NEON_FMA;
20232 for (size_t k = 1; k <= 10; k += 3) {
20233 for (uint32_t m = 1; m <= 1; m++) {
20234 for (uint32_t n = 1; n <= 8; n++) {
20235 GemmMicrokernelTester()
20236 .mr(1)
20237 .nr(8)
20238 .kr(1)
20239 .sr(1)
20240 .m(m)
20241 .n(n)
20242 .k(k)
20243 .cm_stride(11)
20244 .iterations(1)
20245 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20246 }
20247 }
20248 }
20249 }
20250
20251 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, a_offset) {
20252 TEST_REQUIRES_ARM_NEON_FMA;
20253 for (size_t k = 1; k <= 10; k += 3) {
20254 GemmMicrokernelTester()
20255 .mr(1)
20256 .nr(8)
20257 .kr(1)
20258 .sr(1)
20259 .m(1)
20260 .n(8)
20261 .k(k)
20262 .ks(3)
20263 .a_offset(13)
20264 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20265 }
20266 }
20267
20268 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, zero) {
20269 TEST_REQUIRES_ARM_NEON_FMA;
20270 for (uint32_t mz = 0; mz < 1; mz++) {
20271 for (size_t k = 1; k <= 10; k += 3) {
20272 GemmMicrokernelTester()
20273 .mr(1)
20274 .nr(8)
20275 .kr(1)
20276 .sr(1)
20277 .m(1)
20278 .n(8)
20279 .k(k)
20280 .ks(3)
20281 .a_offset(13)
20282 .zero_index(mz)
20283 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20284 }
20285 }
20286 }
20287
20288 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, qmin) {
20289 TEST_REQUIRES_ARM_NEON_FMA;
20290 GemmMicrokernelTester()
20291 .mr(1)
20292 .nr(8)
20293 .kr(1)
20294 .sr(1)
20295 .m(1)
20296 .n(8)
20297 .k(2)
20298 .qmin(128)
20299 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20300 }
20301
20302 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, qmax) {
20303 TEST_REQUIRES_ARM_NEON_FMA;
20304 GemmMicrokernelTester()
20305 .mr(1)
20306 .nr(8)
20307 .kr(1)
20308 .sr(1)
20309 .m(1)
20310 .n(8)
20311 .k(2)
20312 .qmax(128)
20313 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20314 }
20315
20316 TEST(F32_IGEMM_1X8__NEONFMA_DUP_LD64, strided_cm) {
20317 TEST_REQUIRES_ARM_NEON_FMA;
20318 GemmMicrokernelTester()
20319 .mr(1)
20320 .nr(8)
20321 .kr(1)
20322 .sr(1)
20323 .m(1)
20324 .n(8)
20325 .k(2)
20326 .cm_stride(11)
20327 .Test(xnn_f32_igemm_ukernel_1x8__neonfma_dup_ld64);
20328 }
20329#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20330
20331
20332#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20333 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4) {
20334 TEST_REQUIRES_ARM_NEON_FMA;
20335 GemmMicrokernelTester()
20336 .mr(4)
20337 .nr(8)
20338 .kr(1)
20339 .sr(1)
20340 .m(4)
20341 .n(8)
20342 .k(4)
20343 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20344 }
20345
20346 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, strided_cn) {
20347 TEST_REQUIRES_ARM_NEON_FMA;
20348 GemmMicrokernelTester()
20349 .mr(4)
20350 .nr(8)
20351 .kr(1)
20352 .sr(1)
20353 .m(4)
20354 .n(8)
20355 .k(4)
20356 .cn_stride(11)
20357 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20358 }
20359
20360 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
20361 TEST_REQUIRES_ARM_NEON_FMA;
20362 for (uint32_t m = 1; m <= 4; m++) {
20363 for (uint32_t n = 1; n <= 8; n++) {
20364 GemmMicrokernelTester()
20365 .mr(4)
20366 .nr(8)
20367 .kr(1)
20368 .sr(1)
20369 .m(m)
20370 .n(n)
20371 .k(4)
20372 .iterations(1)
20373 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20374 }
20375 }
20376 }
20377
20378 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
20379 TEST_REQUIRES_ARM_NEON_FMA;
20380 for (uint32_t m = 1; m <= 4; m++) {
20381 GemmMicrokernelTester()
20382 .mr(4)
20383 .nr(8)
20384 .kr(1)
20385 .sr(1)
20386 .m(m)
20387 .n(8)
20388 .k(4)
20389 .iterations(1)
20390 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20391 }
20392 }
20393
20394 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
20395 TEST_REQUIRES_ARM_NEON_FMA;
20396 for (uint32_t n = 1; n <= 8; n++) {
20397 GemmMicrokernelTester()
20398 .mr(4)
20399 .nr(8)
20400 .kr(1)
20401 .sr(1)
20402 .m(4)
20403 .n(n)
20404 .k(4)
20405 .iterations(1)
20406 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20407 }
20408 }
20409
20410 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_lt_4) {
20411 TEST_REQUIRES_ARM_NEON_FMA;
20412 for (size_t k = 1; k < 4; k++) {
20413 GemmMicrokernelTester()
20414 .mr(4)
20415 .nr(8)
20416 .kr(1)
20417 .sr(1)
20418 .m(4)
20419 .n(8)
20420 .k(k)
20421 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20422 }
20423 }
20424
20425 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
20426 TEST_REQUIRES_ARM_NEON_FMA;
20427 for (size_t k = 1; k < 4; k++) {
20428 for (uint32_t m = 1; m <= 4; m++) {
20429 for (uint32_t n = 1; n <= 8; n++) {
20430 GemmMicrokernelTester()
20431 .mr(4)
20432 .nr(8)
20433 .kr(1)
20434 .sr(1)
20435 .m(m)
20436 .n(n)
20437 .k(k)
20438 .iterations(1)
20439 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20440 }
20441 }
20442 }
20443 }
20444
20445 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_gt_4) {
20446 TEST_REQUIRES_ARM_NEON_FMA;
20447 for (size_t k = 5; k < 8; k++) {
20448 GemmMicrokernelTester()
20449 .mr(4)
20450 .nr(8)
20451 .kr(1)
20452 .sr(1)
20453 .m(4)
20454 .n(8)
20455 .k(k)
20456 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20457 }
20458 }
20459
20460 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
20461 TEST_REQUIRES_ARM_NEON_FMA;
20462 for (size_t k = 5; k < 8; k++) {
20463 for (uint32_t m = 1; m <= 4; m++) {
20464 for (uint32_t n = 1; n <= 8; n++) {
20465 GemmMicrokernelTester()
20466 .mr(4)
20467 .nr(8)
20468 .kr(1)
20469 .sr(1)
20470 .m(m)
20471 .n(n)
20472 .k(k)
20473 .iterations(1)
20474 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20475 }
20476 }
20477 }
20478 }
20479
20480 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_div_4) {
20481 TEST_REQUIRES_ARM_NEON_FMA;
20482 for (size_t k = 8; k <= 40; k += 4) {
20483 GemmMicrokernelTester()
20484 .mr(4)
20485 .nr(8)
20486 .kr(1)
20487 .sr(1)
20488 .m(4)
20489 .n(8)
20490 .k(k)
20491 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20492 }
20493 }
20494
20495 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
20496 TEST_REQUIRES_ARM_NEON_FMA;
20497 for (size_t k = 8; k <= 40; k += 4) {
20498 for (uint32_t m = 1; m <= 4; m++) {
20499 for (uint32_t n = 1; n <= 8; n++) {
20500 GemmMicrokernelTester()
20501 .mr(4)
20502 .nr(8)
20503 .kr(1)
20504 .sr(1)
20505 .m(m)
20506 .n(n)
20507 .k(k)
20508 .iterations(1)
20509 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20510 }
20511 }
20512 }
20513 }
20514
20515 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8) {
20516 TEST_REQUIRES_ARM_NEON_FMA;
20517 for (uint32_t n = 9; n < 16; n++) {
20518 for (size_t k = 1; k <= 20; k += 5) {
20519 GemmMicrokernelTester()
20520 .mr(4)
20521 .nr(8)
20522 .kr(1)
20523 .sr(1)
20524 .m(4)
20525 .n(8)
20526 .k(k)
20527 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20528 }
20529 }
20530 }
20531
20532 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
20533 TEST_REQUIRES_ARM_NEON_FMA;
20534 for (uint32_t n = 9; n < 16; n++) {
20535 for (size_t k = 1; k <= 20; k += 5) {
20536 GemmMicrokernelTester()
20537 .mr(4)
20538 .nr(8)
20539 .kr(1)
20540 .sr(1)
20541 .m(4)
20542 .n(8)
20543 .k(k)
20544 .cn_stride(11)
20545 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20546 }
20547 }
20548 }
20549
20550 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
20551 TEST_REQUIRES_ARM_NEON_FMA;
20552 for (uint32_t n = 9; n < 16; n++) {
20553 for (size_t k = 1; k <= 20; k += 5) {
20554 for (uint32_t m = 1; m <= 4; m++) {
20555 GemmMicrokernelTester()
20556 .mr(4)
20557 .nr(8)
20558 .kr(1)
20559 .sr(1)
20560 .m(m)
20561 .n(n)
20562 .k(k)
20563 .iterations(1)
20564 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20565 }
20566 }
20567 }
20568 }
20569
20570 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8) {
20571 TEST_REQUIRES_ARM_NEON_FMA;
20572 for (uint32_t n = 16; n <= 24; n += 8) {
20573 for (size_t k = 1; k <= 20; k += 5) {
20574 GemmMicrokernelTester()
20575 .mr(4)
20576 .nr(8)
20577 .kr(1)
20578 .sr(1)
20579 .m(4)
20580 .n(8)
20581 .k(k)
20582 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20583 }
20584 }
20585 }
20586
20587 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
20588 TEST_REQUIRES_ARM_NEON_FMA;
20589 for (uint32_t n = 16; n <= 24; n += 8) {
20590 for (size_t k = 1; k <= 20; k += 5) {
20591 GemmMicrokernelTester()
20592 .mr(4)
20593 .nr(8)
20594 .kr(1)
20595 .sr(1)
20596 .m(4)
20597 .n(n)
20598 .k(k)
20599 .cn_stride(11)
20600 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20601 }
20602 }
20603 }
20604
20605 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
20606 TEST_REQUIRES_ARM_NEON_FMA;
20607 for (uint32_t n = 16; n <= 24; n += 8) {
20608 for (size_t k = 1; k <= 20; k += 5) {
20609 for (uint32_t m = 1; m <= 4; m++) {
20610 GemmMicrokernelTester()
20611 .mr(4)
20612 .nr(8)
20613 .kr(1)
20614 .sr(1)
20615 .m(m)
20616 .n(n)
20617 .k(k)
20618 .iterations(1)
20619 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20620 }
20621 }
20622 }
20623 }
20624
20625 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, small_kernel) {
20626 TEST_REQUIRES_ARM_NEON_FMA;
20627 for (size_t k = 1; k <= 20; k += 5) {
20628 GemmMicrokernelTester()
20629 .mr(4)
20630 .nr(8)
20631 .kr(1)
20632 .sr(1)
20633 .m(4)
20634 .n(8)
20635 .k(k)
20636 .ks(3)
20637 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20638 }
20639 }
20640
20641 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, small_kernel_subtile) {
20642 TEST_REQUIRES_ARM_NEON_FMA;
20643 for (size_t k = 1; k <= 20; k += 5) {
20644 for (uint32_t m = 1; m <= 4; m++) {
20645 for (uint32_t n = 1; n <= 8; n++) {
20646 GemmMicrokernelTester()
20647 .mr(4)
20648 .nr(8)
20649 .kr(1)
20650 .sr(1)
20651 .m(m)
20652 .n(n)
20653 .k(k)
20654 .ks(3)
20655 .iterations(1)
20656 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20657 }
20658 }
20659 }
20660 }
20661
20662 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_small_kernel) {
20663 TEST_REQUIRES_ARM_NEON_FMA;
20664 for (uint32_t n = 9; n < 16; n++) {
20665 for (size_t k = 1; k <= 20; k += 5) {
20666 GemmMicrokernelTester()
20667 .mr(4)
20668 .nr(8)
20669 .kr(1)
20670 .sr(1)
20671 .m(4)
20672 .n(8)
20673 .k(k)
20674 .ks(3)
20675 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20676 }
20677 }
20678 }
20679
20680 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, n_div_8_small_kernel) {
20681 TEST_REQUIRES_ARM_NEON_FMA;
20682 for (uint32_t n = 16; n <= 24; n += 8) {
20683 for (size_t k = 1; k <= 20; k += 5) {
20684 GemmMicrokernelTester()
20685 .mr(4)
20686 .nr(8)
20687 .kr(1)
20688 .sr(1)
20689 .m(4)
20690 .n(8)
20691 .k(k)
20692 .ks(3)
20693 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20694 }
20695 }
20696 }
20697
20698 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
20699 TEST_REQUIRES_ARM_NEON_FMA;
20700 for (size_t k = 1; k <= 20; k += 5) {
20701 for (uint32_t m = 1; m <= 4; m++) {
20702 for (uint32_t n = 1; n <= 8; n++) {
20703 GemmMicrokernelTester()
20704 .mr(4)
20705 .nr(8)
20706 .kr(1)
20707 .sr(1)
20708 .m(m)
20709 .n(n)
20710 .k(k)
20711 .cm_stride(11)
20712 .iterations(1)
20713 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20714 }
20715 }
20716 }
20717 }
20718
20719 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, a_offset) {
20720 TEST_REQUIRES_ARM_NEON_FMA;
20721 for (size_t k = 1; k <= 20; k += 5) {
20722 GemmMicrokernelTester()
20723 .mr(4)
20724 .nr(8)
20725 .kr(1)
20726 .sr(1)
20727 .m(4)
20728 .n(8)
20729 .k(k)
20730 .ks(3)
20731 .a_offset(83)
20732 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20733 }
20734 }
20735
20736 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, zero) {
20737 TEST_REQUIRES_ARM_NEON_FMA;
20738 for (uint32_t mz = 0; mz < 4; mz++) {
20739 for (size_t k = 1; k <= 20; k += 5) {
20740 GemmMicrokernelTester()
20741 .mr(4)
20742 .nr(8)
20743 .kr(1)
20744 .sr(1)
20745 .m(4)
20746 .n(8)
20747 .k(k)
20748 .ks(3)
20749 .a_offset(83)
20750 .zero_index(mz)
20751 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20752 }
20753 }
20754 }
20755
20756 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, qmin) {
20757 TEST_REQUIRES_ARM_NEON_FMA;
20758 GemmMicrokernelTester()
20759 .mr(4)
20760 .nr(8)
20761 .kr(1)
20762 .sr(1)
20763 .m(4)
20764 .n(8)
20765 .k(4)
20766 .qmin(128)
20767 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20768 }
20769
20770 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, qmax) {
20771 TEST_REQUIRES_ARM_NEON_FMA;
20772 GemmMicrokernelTester()
20773 .mr(4)
20774 .nr(8)
20775 .kr(1)
20776 .sr(1)
20777 .m(4)
20778 .n(8)
20779 .k(4)
20780 .qmax(128)
20781 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20782 }
20783
20784 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD128, strided_cm) {
20785 TEST_REQUIRES_ARM_NEON_FMA;
20786 GemmMicrokernelTester()
20787 .mr(4)
20788 .nr(8)
20789 .kr(1)
20790 .sr(1)
20791 .m(4)
20792 .n(8)
20793 .k(4)
20794 .cm_stride(11)
20795 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld128);
20796 }
20797#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20798
20799
20800#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20801 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2) {
20802 TEST_REQUIRES_ARM_NEON_FMA;
20803 GemmMicrokernelTester()
20804 .mr(4)
20805 .nr(8)
20806 .kr(1)
20807 .sr(1)
20808 .m(4)
20809 .n(8)
20810 .k(2)
20811 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20812 }
20813
20814 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, strided_cn) {
20815 TEST_REQUIRES_ARM_NEON_FMA;
20816 GemmMicrokernelTester()
20817 .mr(4)
20818 .nr(8)
20819 .kr(1)
20820 .sr(1)
20821 .m(4)
20822 .n(8)
20823 .k(2)
20824 .cn_stride(11)
20825 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20826 }
20827
20828 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
20829 TEST_REQUIRES_ARM_NEON_FMA;
20830 for (uint32_t m = 1; m <= 4; m++) {
20831 for (uint32_t n = 1; n <= 8; n++) {
20832 GemmMicrokernelTester()
20833 .mr(4)
20834 .nr(8)
20835 .kr(1)
20836 .sr(1)
20837 .m(m)
20838 .n(n)
20839 .k(2)
20840 .iterations(1)
20841 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20842 }
20843 }
20844 }
20845
20846 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
20847 TEST_REQUIRES_ARM_NEON_FMA;
20848 for (uint32_t m = 1; m <= 4; m++) {
20849 GemmMicrokernelTester()
20850 .mr(4)
20851 .nr(8)
20852 .kr(1)
20853 .sr(1)
20854 .m(m)
20855 .n(8)
20856 .k(2)
20857 .iterations(1)
20858 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20859 }
20860 }
20861
20862 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
20863 TEST_REQUIRES_ARM_NEON_FMA;
20864 for (uint32_t n = 1; n <= 8; n++) {
20865 GemmMicrokernelTester()
20866 .mr(4)
20867 .nr(8)
20868 .kr(1)
20869 .sr(1)
20870 .m(4)
20871 .n(n)
20872 .k(2)
20873 .iterations(1)
20874 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20875 }
20876 }
20877
20878 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_lt_2) {
20879 TEST_REQUIRES_ARM_NEON_FMA;
20880 for (size_t k = 1; k < 2; k++) {
20881 GemmMicrokernelTester()
20882 .mr(4)
20883 .nr(8)
20884 .kr(1)
20885 .sr(1)
20886 .m(4)
20887 .n(8)
20888 .k(k)
20889 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20890 }
20891 }
20892
20893 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
20894 TEST_REQUIRES_ARM_NEON_FMA;
20895 for (size_t k = 1; k < 2; k++) {
20896 for (uint32_t m = 1; m <= 4; m++) {
20897 for (uint32_t n = 1; n <= 8; n++) {
20898 GemmMicrokernelTester()
20899 .mr(4)
20900 .nr(8)
20901 .kr(1)
20902 .sr(1)
20903 .m(m)
20904 .n(n)
20905 .k(k)
20906 .iterations(1)
20907 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20908 }
20909 }
20910 }
20911 }
20912
20913 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_gt_2) {
20914 TEST_REQUIRES_ARM_NEON_FMA;
20915 for (size_t k = 3; k < 4; k++) {
20916 GemmMicrokernelTester()
20917 .mr(4)
20918 .nr(8)
20919 .kr(1)
20920 .sr(1)
20921 .m(4)
20922 .n(8)
20923 .k(k)
20924 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20925 }
20926 }
20927
20928 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
20929 TEST_REQUIRES_ARM_NEON_FMA;
20930 for (size_t k = 3; k < 4; k++) {
20931 for (uint32_t m = 1; m <= 4; m++) {
20932 for (uint32_t n = 1; n <= 8; n++) {
20933 GemmMicrokernelTester()
20934 .mr(4)
20935 .nr(8)
20936 .kr(1)
20937 .sr(1)
20938 .m(m)
20939 .n(n)
20940 .k(k)
20941 .iterations(1)
20942 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20943 }
20944 }
20945 }
20946 }
20947
20948 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_div_2) {
20949 TEST_REQUIRES_ARM_NEON_FMA;
20950 for (size_t k = 4; k <= 20; k += 2) {
20951 GemmMicrokernelTester()
20952 .mr(4)
20953 .nr(8)
20954 .kr(1)
20955 .sr(1)
20956 .m(4)
20957 .n(8)
20958 .k(k)
20959 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20960 }
20961 }
20962
20963 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
20964 TEST_REQUIRES_ARM_NEON_FMA;
20965 for (size_t k = 4; k <= 20; k += 2) {
20966 for (uint32_t m = 1; m <= 4; m++) {
20967 for (uint32_t n = 1; n <= 8; n++) {
20968 GemmMicrokernelTester()
20969 .mr(4)
20970 .nr(8)
20971 .kr(1)
20972 .sr(1)
20973 .m(m)
20974 .n(n)
20975 .k(k)
20976 .iterations(1)
20977 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20978 }
20979 }
20980 }
20981 }
20982
20983 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8) {
20984 TEST_REQUIRES_ARM_NEON_FMA;
20985 for (uint32_t n = 9; n < 16; n++) {
20986 for (size_t k = 1; k <= 10; k += 3) {
20987 GemmMicrokernelTester()
20988 .mr(4)
20989 .nr(8)
20990 .kr(1)
20991 .sr(1)
20992 .m(4)
20993 .n(8)
20994 .k(k)
20995 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
20996 }
20997 }
20998 }
20999
21000 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
21001 TEST_REQUIRES_ARM_NEON_FMA;
21002 for (uint32_t n = 9; n < 16; n++) {
21003 for (size_t k = 1; k <= 10; k += 3) {
21004 GemmMicrokernelTester()
21005 .mr(4)
21006 .nr(8)
21007 .kr(1)
21008 .sr(1)
21009 .m(4)
21010 .n(8)
21011 .k(k)
21012 .cn_stride(11)
21013 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21014 }
21015 }
21016 }
21017
21018 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
21019 TEST_REQUIRES_ARM_NEON_FMA;
21020 for (uint32_t n = 9; n < 16; n++) {
21021 for (size_t k = 1; k <= 10; k += 3) {
21022 for (uint32_t m = 1; m <= 4; m++) {
21023 GemmMicrokernelTester()
21024 .mr(4)
21025 .nr(8)
21026 .kr(1)
21027 .sr(1)
21028 .m(m)
21029 .n(n)
21030 .k(k)
21031 .iterations(1)
21032 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21033 }
21034 }
21035 }
21036 }
21037
21038 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8) {
21039 TEST_REQUIRES_ARM_NEON_FMA;
21040 for (uint32_t n = 16; n <= 24; n += 8) {
21041 for (size_t k = 1; k <= 10; k += 3) {
21042 GemmMicrokernelTester()
21043 .mr(4)
21044 .nr(8)
21045 .kr(1)
21046 .sr(1)
21047 .m(4)
21048 .n(8)
21049 .k(k)
21050 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21051 }
21052 }
21053 }
21054
21055 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
21056 TEST_REQUIRES_ARM_NEON_FMA;
21057 for (uint32_t n = 16; n <= 24; n += 8) {
21058 for (size_t k = 1; k <= 10; k += 3) {
21059 GemmMicrokernelTester()
21060 .mr(4)
21061 .nr(8)
21062 .kr(1)
21063 .sr(1)
21064 .m(4)
21065 .n(n)
21066 .k(k)
21067 .cn_stride(11)
21068 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21069 }
21070 }
21071 }
21072
21073 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
21074 TEST_REQUIRES_ARM_NEON_FMA;
21075 for (uint32_t n = 16; n <= 24; n += 8) {
21076 for (size_t k = 1; k <= 10; k += 3) {
21077 for (uint32_t m = 1; m <= 4; m++) {
21078 GemmMicrokernelTester()
21079 .mr(4)
21080 .nr(8)
21081 .kr(1)
21082 .sr(1)
21083 .m(m)
21084 .n(n)
21085 .k(k)
21086 .iterations(1)
21087 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21088 }
21089 }
21090 }
21091 }
21092
21093 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, small_kernel) {
21094 TEST_REQUIRES_ARM_NEON_FMA;
21095 for (size_t k = 1; k <= 10; k += 3) {
21096 GemmMicrokernelTester()
21097 .mr(4)
21098 .nr(8)
21099 .kr(1)
21100 .sr(1)
21101 .m(4)
21102 .n(8)
21103 .k(k)
21104 .ks(3)
21105 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21106 }
21107 }
21108
21109 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, small_kernel_subtile) {
21110 TEST_REQUIRES_ARM_NEON_FMA;
21111 for (size_t k = 1; k <= 10; k += 3) {
21112 for (uint32_t m = 1; m <= 4; m++) {
21113 for (uint32_t n = 1; n <= 8; n++) {
21114 GemmMicrokernelTester()
21115 .mr(4)
21116 .nr(8)
21117 .kr(1)
21118 .sr(1)
21119 .m(m)
21120 .n(n)
21121 .k(k)
21122 .ks(3)
21123 .iterations(1)
21124 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21125 }
21126 }
21127 }
21128 }
21129
21130 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_small_kernel) {
21131 TEST_REQUIRES_ARM_NEON_FMA;
21132 for (uint32_t n = 9; n < 16; n++) {
21133 for (size_t k = 1; k <= 10; k += 3) {
21134 GemmMicrokernelTester()
21135 .mr(4)
21136 .nr(8)
21137 .kr(1)
21138 .sr(1)
21139 .m(4)
21140 .n(8)
21141 .k(k)
21142 .ks(3)
21143 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21144 }
21145 }
21146 }
21147
21148 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, n_div_8_small_kernel) {
21149 TEST_REQUIRES_ARM_NEON_FMA;
21150 for (uint32_t n = 16; n <= 24; n += 8) {
21151 for (size_t k = 1; k <= 10; k += 3) {
21152 GemmMicrokernelTester()
21153 .mr(4)
21154 .nr(8)
21155 .kr(1)
21156 .sr(1)
21157 .m(4)
21158 .n(8)
21159 .k(k)
21160 .ks(3)
21161 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21162 }
21163 }
21164 }
21165
21166 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
21167 TEST_REQUIRES_ARM_NEON_FMA;
21168 for (size_t k = 1; k <= 10; k += 3) {
21169 for (uint32_t m = 1; m <= 4; m++) {
21170 for (uint32_t n = 1; n <= 8; n++) {
21171 GemmMicrokernelTester()
21172 .mr(4)
21173 .nr(8)
21174 .kr(1)
21175 .sr(1)
21176 .m(m)
21177 .n(n)
21178 .k(k)
21179 .cm_stride(11)
21180 .iterations(1)
21181 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21182 }
21183 }
21184 }
21185 }
21186
21187 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, a_offset) {
21188 TEST_REQUIRES_ARM_NEON_FMA;
21189 for (size_t k = 1; k <= 10; k += 3) {
21190 GemmMicrokernelTester()
21191 .mr(4)
21192 .nr(8)
21193 .kr(1)
21194 .sr(1)
21195 .m(4)
21196 .n(8)
21197 .k(k)
21198 .ks(3)
21199 .a_offset(43)
21200 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21201 }
21202 }
21203
21204 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, zero) {
21205 TEST_REQUIRES_ARM_NEON_FMA;
21206 for (uint32_t mz = 0; mz < 4; mz++) {
21207 for (size_t k = 1; k <= 10; k += 3) {
21208 GemmMicrokernelTester()
21209 .mr(4)
21210 .nr(8)
21211 .kr(1)
21212 .sr(1)
21213 .m(4)
21214 .n(8)
21215 .k(k)
21216 .ks(3)
21217 .a_offset(43)
21218 .zero_index(mz)
21219 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21220 }
21221 }
21222 }
21223
21224 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, qmin) {
21225 TEST_REQUIRES_ARM_NEON_FMA;
21226 GemmMicrokernelTester()
21227 .mr(4)
21228 .nr(8)
21229 .kr(1)
21230 .sr(1)
21231 .m(4)
21232 .n(8)
21233 .k(2)
21234 .qmin(128)
21235 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21236 }
21237
21238 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, qmax) {
21239 TEST_REQUIRES_ARM_NEON_FMA;
21240 GemmMicrokernelTester()
21241 .mr(4)
21242 .nr(8)
21243 .kr(1)
21244 .sr(1)
21245 .m(4)
21246 .n(8)
21247 .k(2)
21248 .qmax(128)
21249 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21250 }
21251
21252 TEST(F32_IGEMM_4X8__NEONFMA_DUP_LD64, strided_cm) {
21253 TEST_REQUIRES_ARM_NEON_FMA;
21254 GemmMicrokernelTester()
21255 .mr(4)
21256 .nr(8)
21257 .kr(1)
21258 .sr(1)
21259 .m(4)
21260 .n(8)
21261 .k(2)
21262 .cm_stride(11)
21263 .Test(xnn_f32_igemm_ukernel_4x8__neonfma_dup_ld64);
21264 }
21265#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21266
21267
21268#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21269 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2) {
21270 TEST_REQUIRES_ARM_NEON_FMA;
21271 GemmMicrokernelTester()
21272 .mr(6)
21273 .nr(8)
21274 .kr(1)
21275 .sr(1)
21276 .m(6)
21277 .n(8)
21278 .k(2)
21279 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21280 }
21281
21282 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, strided_cn) {
21283 TEST_REQUIRES_ARM_NEON_FMA;
21284 GemmMicrokernelTester()
21285 .mr(6)
21286 .nr(8)
21287 .kr(1)
21288 .sr(1)
21289 .m(6)
21290 .n(8)
21291 .k(2)
21292 .cn_stride(11)
21293 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21294 }
21295
21296 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
21297 TEST_REQUIRES_ARM_NEON_FMA;
21298 for (uint32_t m = 1; m <= 6; m++) {
21299 for (uint32_t n = 1; n <= 8; n++) {
21300 GemmMicrokernelTester()
21301 .mr(6)
21302 .nr(8)
21303 .kr(1)
21304 .sr(1)
21305 .m(m)
21306 .n(n)
21307 .k(2)
21308 .iterations(1)
21309 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21310 }
21311 }
21312 }
21313
21314 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
21315 TEST_REQUIRES_ARM_NEON_FMA;
21316 for (uint32_t m = 1; m <= 6; m++) {
21317 GemmMicrokernelTester()
21318 .mr(6)
21319 .nr(8)
21320 .kr(1)
21321 .sr(1)
21322 .m(m)
21323 .n(8)
21324 .k(2)
21325 .iterations(1)
21326 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21327 }
21328 }
21329
21330 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
21331 TEST_REQUIRES_ARM_NEON_FMA;
21332 for (uint32_t n = 1; n <= 8; n++) {
21333 GemmMicrokernelTester()
21334 .mr(6)
21335 .nr(8)
21336 .kr(1)
21337 .sr(1)
21338 .m(6)
21339 .n(n)
21340 .k(2)
21341 .iterations(1)
21342 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21343 }
21344 }
21345
21346 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_lt_2) {
21347 TEST_REQUIRES_ARM_NEON_FMA;
21348 for (size_t k = 1; k < 2; k++) {
21349 GemmMicrokernelTester()
21350 .mr(6)
21351 .nr(8)
21352 .kr(1)
21353 .sr(1)
21354 .m(6)
21355 .n(8)
21356 .k(k)
21357 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21358 }
21359 }
21360
21361 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
21362 TEST_REQUIRES_ARM_NEON_FMA;
21363 for (size_t k = 1; k < 2; k++) {
21364 for (uint32_t m = 1; m <= 6; m++) {
21365 for (uint32_t n = 1; n <= 8; n++) {
21366 GemmMicrokernelTester()
21367 .mr(6)
21368 .nr(8)
21369 .kr(1)
21370 .sr(1)
21371 .m(m)
21372 .n(n)
21373 .k(k)
21374 .iterations(1)
21375 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21376 }
21377 }
21378 }
21379 }
21380
21381 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_gt_2) {
21382 TEST_REQUIRES_ARM_NEON_FMA;
21383 for (size_t k = 3; k < 4; k++) {
21384 GemmMicrokernelTester()
21385 .mr(6)
21386 .nr(8)
21387 .kr(1)
21388 .sr(1)
21389 .m(6)
21390 .n(8)
21391 .k(k)
21392 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21393 }
21394 }
21395
21396 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
21397 TEST_REQUIRES_ARM_NEON_FMA;
21398 for (size_t k = 3; k < 4; k++) {
21399 for (uint32_t m = 1; m <= 6; m++) {
21400 for (uint32_t n = 1; n <= 8; n++) {
21401 GemmMicrokernelTester()
21402 .mr(6)
21403 .nr(8)
21404 .kr(1)
21405 .sr(1)
21406 .m(m)
21407 .n(n)
21408 .k(k)
21409 .iterations(1)
21410 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21411 }
21412 }
21413 }
21414 }
21415
21416 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_div_2) {
21417 TEST_REQUIRES_ARM_NEON_FMA;
21418 for (size_t k = 4; k <= 20; k += 2) {
21419 GemmMicrokernelTester()
21420 .mr(6)
21421 .nr(8)
21422 .kr(1)
21423 .sr(1)
21424 .m(6)
21425 .n(8)
21426 .k(k)
21427 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21428 }
21429 }
21430
21431 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
21432 TEST_REQUIRES_ARM_NEON_FMA;
21433 for (size_t k = 4; k <= 20; k += 2) {
21434 for (uint32_t m = 1; m <= 6; m++) {
21435 for (uint32_t n = 1; n <= 8; n++) {
21436 GemmMicrokernelTester()
21437 .mr(6)
21438 .nr(8)
21439 .kr(1)
21440 .sr(1)
21441 .m(m)
21442 .n(n)
21443 .k(k)
21444 .iterations(1)
21445 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21446 }
21447 }
21448 }
21449 }
21450
21451 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8) {
21452 TEST_REQUIRES_ARM_NEON_FMA;
21453 for (uint32_t n = 9; n < 16; n++) {
21454 for (size_t k = 1; k <= 10; k += 3) {
21455 GemmMicrokernelTester()
21456 .mr(6)
21457 .nr(8)
21458 .kr(1)
21459 .sr(1)
21460 .m(6)
21461 .n(8)
21462 .k(k)
21463 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21464 }
21465 }
21466 }
21467
21468 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
21469 TEST_REQUIRES_ARM_NEON_FMA;
21470 for (uint32_t n = 9; n < 16; n++) {
21471 for (size_t k = 1; k <= 10; k += 3) {
21472 GemmMicrokernelTester()
21473 .mr(6)
21474 .nr(8)
21475 .kr(1)
21476 .sr(1)
21477 .m(6)
21478 .n(8)
21479 .k(k)
21480 .cn_stride(11)
21481 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21482 }
21483 }
21484 }
21485
21486 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
21487 TEST_REQUIRES_ARM_NEON_FMA;
21488 for (uint32_t n = 9; n < 16; n++) {
21489 for (size_t k = 1; k <= 10; k += 3) {
21490 for (uint32_t m = 1; m <= 6; m++) {
21491 GemmMicrokernelTester()
21492 .mr(6)
21493 .nr(8)
21494 .kr(1)
21495 .sr(1)
21496 .m(m)
21497 .n(n)
21498 .k(k)
21499 .iterations(1)
21500 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21501 }
21502 }
21503 }
21504 }
21505
21506 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8) {
21507 TEST_REQUIRES_ARM_NEON_FMA;
21508 for (uint32_t n = 16; n <= 24; n += 8) {
21509 for (size_t k = 1; k <= 10; k += 3) {
21510 GemmMicrokernelTester()
21511 .mr(6)
21512 .nr(8)
21513 .kr(1)
21514 .sr(1)
21515 .m(6)
21516 .n(8)
21517 .k(k)
21518 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21519 }
21520 }
21521 }
21522
21523 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
21524 TEST_REQUIRES_ARM_NEON_FMA;
21525 for (uint32_t n = 16; n <= 24; n += 8) {
21526 for (size_t k = 1; k <= 10; k += 3) {
21527 GemmMicrokernelTester()
21528 .mr(6)
21529 .nr(8)
21530 .kr(1)
21531 .sr(1)
21532 .m(6)
21533 .n(n)
21534 .k(k)
21535 .cn_stride(11)
21536 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21537 }
21538 }
21539 }
21540
21541 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
21542 TEST_REQUIRES_ARM_NEON_FMA;
21543 for (uint32_t n = 16; n <= 24; n += 8) {
21544 for (size_t k = 1; k <= 10; k += 3) {
21545 for (uint32_t m = 1; m <= 6; m++) {
21546 GemmMicrokernelTester()
21547 .mr(6)
21548 .nr(8)
21549 .kr(1)
21550 .sr(1)
21551 .m(m)
21552 .n(n)
21553 .k(k)
21554 .iterations(1)
21555 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21556 }
21557 }
21558 }
21559 }
21560
21561 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, small_kernel) {
21562 TEST_REQUIRES_ARM_NEON_FMA;
21563 for (size_t k = 1; k <= 10; k += 3) {
21564 GemmMicrokernelTester()
21565 .mr(6)
21566 .nr(8)
21567 .kr(1)
21568 .sr(1)
21569 .m(6)
21570 .n(8)
21571 .k(k)
21572 .ks(3)
21573 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21574 }
21575 }
21576
21577 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, small_kernel_subtile) {
21578 TEST_REQUIRES_ARM_NEON_FMA;
21579 for (size_t k = 1; k <= 10; k += 3) {
21580 for (uint32_t m = 1; m <= 6; m++) {
21581 for (uint32_t n = 1; n <= 8; n++) {
21582 GemmMicrokernelTester()
21583 .mr(6)
21584 .nr(8)
21585 .kr(1)
21586 .sr(1)
21587 .m(m)
21588 .n(n)
21589 .k(k)
21590 .ks(3)
21591 .iterations(1)
21592 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21593 }
21594 }
21595 }
21596 }
21597
21598 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_small_kernel) {
21599 TEST_REQUIRES_ARM_NEON_FMA;
21600 for (uint32_t n = 9; n < 16; n++) {
21601 for (size_t k = 1; k <= 10; k += 3) {
21602 GemmMicrokernelTester()
21603 .mr(6)
21604 .nr(8)
21605 .kr(1)
21606 .sr(1)
21607 .m(6)
21608 .n(8)
21609 .k(k)
21610 .ks(3)
21611 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21612 }
21613 }
21614 }
21615
21616 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, n_div_8_small_kernel) {
21617 TEST_REQUIRES_ARM_NEON_FMA;
21618 for (uint32_t n = 16; n <= 24; n += 8) {
21619 for (size_t k = 1; k <= 10; k += 3) {
21620 GemmMicrokernelTester()
21621 .mr(6)
21622 .nr(8)
21623 .kr(1)
21624 .sr(1)
21625 .m(6)
21626 .n(8)
21627 .k(k)
21628 .ks(3)
21629 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21630 }
21631 }
21632 }
21633
21634 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
21635 TEST_REQUIRES_ARM_NEON_FMA;
21636 for (size_t k = 1; k <= 10; k += 3) {
21637 for (uint32_t m = 1; m <= 6; m++) {
21638 for (uint32_t n = 1; n <= 8; n++) {
21639 GemmMicrokernelTester()
21640 .mr(6)
21641 .nr(8)
21642 .kr(1)
21643 .sr(1)
21644 .m(m)
21645 .n(n)
21646 .k(k)
21647 .cm_stride(11)
21648 .iterations(1)
21649 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21650 }
21651 }
21652 }
21653 }
21654
21655 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, a_offset) {
21656 TEST_REQUIRES_ARM_NEON_FMA;
21657 for (size_t k = 1; k <= 10; k += 3) {
21658 GemmMicrokernelTester()
21659 .mr(6)
21660 .nr(8)
21661 .kr(1)
21662 .sr(1)
21663 .m(6)
21664 .n(8)
21665 .k(k)
21666 .ks(3)
21667 .a_offset(67)
21668 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21669 }
21670 }
21671
21672 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, zero) {
21673 TEST_REQUIRES_ARM_NEON_FMA;
21674 for (uint32_t mz = 0; mz < 6; mz++) {
21675 for (size_t k = 1; k <= 10; k += 3) {
21676 GemmMicrokernelTester()
21677 .mr(6)
21678 .nr(8)
21679 .kr(1)
21680 .sr(1)
21681 .m(6)
21682 .n(8)
21683 .k(k)
21684 .ks(3)
21685 .a_offset(67)
21686 .zero_index(mz)
21687 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21688 }
21689 }
21690 }
21691
21692 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, qmin) {
21693 TEST_REQUIRES_ARM_NEON_FMA;
21694 GemmMicrokernelTester()
21695 .mr(6)
21696 .nr(8)
21697 .kr(1)
21698 .sr(1)
21699 .m(6)
21700 .n(8)
21701 .k(2)
21702 .qmin(128)
21703 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21704 }
21705
21706 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, qmax) {
21707 TEST_REQUIRES_ARM_NEON_FMA;
21708 GemmMicrokernelTester()
21709 .mr(6)
21710 .nr(8)
21711 .kr(1)
21712 .sr(1)
21713 .m(6)
21714 .n(8)
21715 .k(2)
21716 .qmax(128)
21717 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21718 }
21719
21720 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD64, strided_cm) {
21721 TEST_REQUIRES_ARM_NEON_FMA;
21722 GemmMicrokernelTester()
21723 .mr(6)
21724 .nr(8)
21725 .kr(1)
21726 .sr(1)
21727 .m(6)
21728 .n(8)
21729 .k(2)
21730 .cm_stride(11)
21731 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld64);
21732 }
21733#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21734
21735
21736#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21737 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4) {
21738 TEST_REQUIRES_ARM_NEON_FMA;
21739 GemmMicrokernelTester()
21740 .mr(6)
21741 .nr(8)
21742 .kr(1)
21743 .sr(1)
21744 .m(6)
21745 .n(8)
21746 .k(4)
21747 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21748 }
21749
21750 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, strided_cn) {
21751 TEST_REQUIRES_ARM_NEON_FMA;
21752 GemmMicrokernelTester()
21753 .mr(6)
21754 .nr(8)
21755 .kr(1)
21756 .sr(1)
21757 .m(6)
21758 .n(8)
21759 .k(4)
21760 .cn_stride(11)
21761 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21762 }
21763
21764 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
21765 TEST_REQUIRES_ARM_NEON_FMA;
21766 for (uint32_t m = 1; m <= 6; m++) {
21767 for (uint32_t n = 1; n <= 8; n++) {
21768 GemmMicrokernelTester()
21769 .mr(6)
21770 .nr(8)
21771 .kr(1)
21772 .sr(1)
21773 .m(m)
21774 .n(n)
21775 .k(4)
21776 .iterations(1)
21777 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21778 }
21779 }
21780 }
21781
21782 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
21783 TEST_REQUIRES_ARM_NEON_FMA;
21784 for (uint32_t m = 1; m <= 6; m++) {
21785 GemmMicrokernelTester()
21786 .mr(6)
21787 .nr(8)
21788 .kr(1)
21789 .sr(1)
21790 .m(m)
21791 .n(8)
21792 .k(4)
21793 .iterations(1)
21794 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21795 }
21796 }
21797
21798 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
21799 TEST_REQUIRES_ARM_NEON_FMA;
21800 for (uint32_t n = 1; n <= 8; n++) {
21801 GemmMicrokernelTester()
21802 .mr(6)
21803 .nr(8)
21804 .kr(1)
21805 .sr(1)
21806 .m(6)
21807 .n(n)
21808 .k(4)
21809 .iterations(1)
21810 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21811 }
21812 }
21813
21814 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_lt_4) {
21815 TEST_REQUIRES_ARM_NEON_FMA;
21816 for (size_t k = 1; k < 4; k++) {
21817 GemmMicrokernelTester()
21818 .mr(6)
21819 .nr(8)
21820 .kr(1)
21821 .sr(1)
21822 .m(6)
21823 .n(8)
21824 .k(k)
21825 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21826 }
21827 }
21828
21829 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
21830 TEST_REQUIRES_ARM_NEON_FMA;
21831 for (size_t k = 1; k < 4; k++) {
21832 for (uint32_t m = 1; m <= 6; m++) {
21833 for (uint32_t n = 1; n <= 8; n++) {
21834 GemmMicrokernelTester()
21835 .mr(6)
21836 .nr(8)
21837 .kr(1)
21838 .sr(1)
21839 .m(m)
21840 .n(n)
21841 .k(k)
21842 .iterations(1)
21843 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21844 }
21845 }
21846 }
21847 }
21848
21849 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_gt_4) {
21850 TEST_REQUIRES_ARM_NEON_FMA;
21851 for (size_t k = 5; k < 8; k++) {
21852 GemmMicrokernelTester()
21853 .mr(6)
21854 .nr(8)
21855 .kr(1)
21856 .sr(1)
21857 .m(6)
21858 .n(8)
21859 .k(k)
21860 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21861 }
21862 }
21863
21864 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
21865 TEST_REQUIRES_ARM_NEON_FMA;
21866 for (size_t k = 5; k < 8; k++) {
21867 for (uint32_t m = 1; m <= 6; m++) {
21868 for (uint32_t n = 1; n <= 8; n++) {
21869 GemmMicrokernelTester()
21870 .mr(6)
21871 .nr(8)
21872 .kr(1)
21873 .sr(1)
21874 .m(m)
21875 .n(n)
21876 .k(k)
21877 .iterations(1)
21878 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21879 }
21880 }
21881 }
21882 }
21883
21884 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_div_4) {
21885 TEST_REQUIRES_ARM_NEON_FMA;
21886 for (size_t k = 8; k <= 40; k += 4) {
21887 GemmMicrokernelTester()
21888 .mr(6)
21889 .nr(8)
21890 .kr(1)
21891 .sr(1)
21892 .m(6)
21893 .n(8)
21894 .k(k)
21895 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21896 }
21897 }
21898
21899 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
21900 TEST_REQUIRES_ARM_NEON_FMA;
21901 for (size_t k = 8; k <= 40; k += 4) {
21902 for (uint32_t m = 1; m <= 6; m++) {
21903 for (uint32_t n = 1; n <= 8; n++) {
21904 GemmMicrokernelTester()
21905 .mr(6)
21906 .nr(8)
21907 .kr(1)
21908 .sr(1)
21909 .m(m)
21910 .n(n)
21911 .k(k)
21912 .iterations(1)
21913 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21914 }
21915 }
21916 }
21917 }
21918
21919 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8) {
21920 TEST_REQUIRES_ARM_NEON_FMA;
21921 for (uint32_t n = 9; n < 16; n++) {
21922 for (size_t k = 1; k <= 20; k += 5) {
21923 GemmMicrokernelTester()
21924 .mr(6)
21925 .nr(8)
21926 .kr(1)
21927 .sr(1)
21928 .m(6)
21929 .n(8)
21930 .k(k)
21931 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21932 }
21933 }
21934 }
21935
21936 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
21937 TEST_REQUIRES_ARM_NEON_FMA;
21938 for (uint32_t n = 9; n < 16; n++) {
21939 for (size_t k = 1; k <= 20; k += 5) {
21940 GemmMicrokernelTester()
21941 .mr(6)
21942 .nr(8)
21943 .kr(1)
21944 .sr(1)
21945 .m(6)
21946 .n(8)
21947 .k(k)
21948 .cn_stride(11)
21949 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21950 }
21951 }
21952 }
21953
21954 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
21955 TEST_REQUIRES_ARM_NEON_FMA;
21956 for (uint32_t n = 9; n < 16; n++) {
21957 for (size_t k = 1; k <= 20; k += 5) {
21958 for (uint32_t m = 1; m <= 6; m++) {
21959 GemmMicrokernelTester()
21960 .mr(6)
21961 .nr(8)
21962 .kr(1)
21963 .sr(1)
21964 .m(m)
21965 .n(n)
21966 .k(k)
21967 .iterations(1)
21968 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21969 }
21970 }
21971 }
21972 }
21973
21974 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8) {
21975 TEST_REQUIRES_ARM_NEON_FMA;
21976 for (uint32_t n = 16; n <= 24; n += 8) {
21977 for (size_t k = 1; k <= 20; k += 5) {
21978 GemmMicrokernelTester()
21979 .mr(6)
21980 .nr(8)
21981 .kr(1)
21982 .sr(1)
21983 .m(6)
21984 .n(8)
21985 .k(k)
21986 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
21987 }
21988 }
21989 }
21990
21991 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
21992 TEST_REQUIRES_ARM_NEON_FMA;
21993 for (uint32_t n = 16; n <= 24; n += 8) {
21994 for (size_t k = 1; k <= 20; k += 5) {
21995 GemmMicrokernelTester()
21996 .mr(6)
21997 .nr(8)
21998 .kr(1)
21999 .sr(1)
22000 .m(6)
22001 .n(n)
22002 .k(k)
22003 .cn_stride(11)
22004 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22005 }
22006 }
22007 }
22008
22009 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
22010 TEST_REQUIRES_ARM_NEON_FMA;
22011 for (uint32_t n = 16; n <= 24; n += 8) {
22012 for (size_t k = 1; k <= 20; k += 5) {
22013 for (uint32_t m = 1; m <= 6; m++) {
22014 GemmMicrokernelTester()
22015 .mr(6)
22016 .nr(8)
22017 .kr(1)
22018 .sr(1)
22019 .m(m)
22020 .n(n)
22021 .k(k)
22022 .iterations(1)
22023 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22024 }
22025 }
22026 }
22027 }
22028
22029 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, small_kernel) {
22030 TEST_REQUIRES_ARM_NEON_FMA;
22031 for (size_t k = 1; k <= 20; k += 5) {
22032 GemmMicrokernelTester()
22033 .mr(6)
22034 .nr(8)
22035 .kr(1)
22036 .sr(1)
22037 .m(6)
22038 .n(8)
22039 .k(k)
22040 .ks(3)
22041 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22042 }
22043 }
22044
22045 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, small_kernel_subtile) {
22046 TEST_REQUIRES_ARM_NEON_FMA;
22047 for (size_t k = 1; k <= 20; k += 5) {
22048 for (uint32_t m = 1; m <= 6; m++) {
22049 for (uint32_t n = 1; n <= 8; n++) {
22050 GemmMicrokernelTester()
22051 .mr(6)
22052 .nr(8)
22053 .kr(1)
22054 .sr(1)
22055 .m(m)
22056 .n(n)
22057 .k(k)
22058 .ks(3)
22059 .iterations(1)
22060 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22061 }
22062 }
22063 }
22064 }
22065
22066 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_small_kernel) {
22067 TEST_REQUIRES_ARM_NEON_FMA;
22068 for (uint32_t n = 9; n < 16; n++) {
22069 for (size_t k = 1; k <= 20; k += 5) {
22070 GemmMicrokernelTester()
22071 .mr(6)
22072 .nr(8)
22073 .kr(1)
22074 .sr(1)
22075 .m(6)
22076 .n(8)
22077 .k(k)
22078 .ks(3)
22079 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22080 }
22081 }
22082 }
22083
22084 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, n_div_8_small_kernel) {
22085 TEST_REQUIRES_ARM_NEON_FMA;
22086 for (uint32_t n = 16; n <= 24; n += 8) {
22087 for (size_t k = 1; k <= 20; k += 5) {
22088 GemmMicrokernelTester()
22089 .mr(6)
22090 .nr(8)
22091 .kr(1)
22092 .sr(1)
22093 .m(6)
22094 .n(8)
22095 .k(k)
22096 .ks(3)
22097 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22098 }
22099 }
22100 }
22101
22102 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
22103 TEST_REQUIRES_ARM_NEON_FMA;
22104 for (size_t k = 1; k <= 20; k += 5) {
22105 for (uint32_t m = 1; m <= 6; m++) {
22106 for (uint32_t n = 1; n <= 8; n++) {
22107 GemmMicrokernelTester()
22108 .mr(6)
22109 .nr(8)
22110 .kr(1)
22111 .sr(1)
22112 .m(m)
22113 .n(n)
22114 .k(k)
22115 .cm_stride(11)
22116 .iterations(1)
22117 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22118 }
22119 }
22120 }
22121 }
22122
22123 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, a_offset) {
22124 TEST_REQUIRES_ARM_NEON_FMA;
22125 for (size_t k = 1; k <= 20; k += 5) {
22126 GemmMicrokernelTester()
22127 .mr(6)
22128 .nr(8)
22129 .kr(1)
22130 .sr(1)
22131 .m(6)
22132 .n(8)
22133 .k(k)
22134 .ks(3)
22135 .a_offset(127)
22136 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22137 }
22138 }
22139
22140 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, zero) {
22141 TEST_REQUIRES_ARM_NEON_FMA;
22142 for (uint32_t mz = 0; mz < 6; mz++) {
22143 for (size_t k = 1; k <= 20; k += 5) {
22144 GemmMicrokernelTester()
22145 .mr(6)
22146 .nr(8)
22147 .kr(1)
22148 .sr(1)
22149 .m(6)
22150 .n(8)
22151 .k(k)
22152 .ks(3)
22153 .a_offset(127)
22154 .zero_index(mz)
22155 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22156 }
22157 }
22158 }
22159
22160 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, qmin) {
22161 TEST_REQUIRES_ARM_NEON_FMA;
22162 GemmMicrokernelTester()
22163 .mr(6)
22164 .nr(8)
22165 .kr(1)
22166 .sr(1)
22167 .m(6)
22168 .n(8)
22169 .k(4)
22170 .qmin(128)
22171 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22172 }
22173
22174 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, qmax) {
22175 TEST_REQUIRES_ARM_NEON_FMA;
22176 GemmMicrokernelTester()
22177 .mr(6)
22178 .nr(8)
22179 .kr(1)
22180 .sr(1)
22181 .m(6)
22182 .n(8)
22183 .k(4)
22184 .qmax(128)
22185 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22186 }
22187
22188 TEST(F32_IGEMM_6X8__NEONFMA_DUP_LD128, strided_cm) {
22189 TEST_REQUIRES_ARM_NEON_FMA;
22190 GemmMicrokernelTester()
22191 .mr(6)
22192 .nr(8)
22193 .kr(1)
22194 .sr(1)
22195 .m(6)
22196 .n(8)
22197 .k(4)
22198 .cm_stride(11)
22199 .Test(xnn_f32_igemm_ukernel_6x8__neonfma_dup_ld128);
22200 }
22201#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22202
22203
22204#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22205 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4) {
22206 TEST_REQUIRES_ARM_NEON;
22207 GemmMicrokernelTester()
22208 .mr(1)
22209 .nr(8)
22210 .kr(1)
22211 .sr(4)
22212 .m(1)
22213 .n(8)
22214 .k(4)
22215 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22216 }
22217
22218 TEST(F32_IGEMM_1X8S4__NEON, strided_cn) {
22219 TEST_REQUIRES_ARM_NEON;
22220 GemmMicrokernelTester()
22221 .mr(1)
22222 .nr(8)
22223 .kr(1)
22224 .sr(4)
22225 .m(1)
22226 .n(8)
22227 .k(4)
22228 .cn_stride(11)
22229 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22230 }
22231
22232 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4_subtile) {
22233 TEST_REQUIRES_ARM_NEON;
22234 for (uint32_t m = 1; m <= 1; m++) {
22235 for (uint32_t n = 1; n <= 8; n++) {
22236 GemmMicrokernelTester()
22237 .mr(1)
22238 .nr(8)
22239 .kr(1)
22240 .sr(4)
22241 .m(m)
22242 .n(n)
22243 .k(4)
22244 .iterations(1)
22245 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22246 }
22247 }
22248 }
22249
22250 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4_subtile_m) {
22251 TEST_REQUIRES_ARM_NEON;
22252 for (uint32_t m = 1; m <= 1; m++) {
22253 GemmMicrokernelTester()
22254 .mr(1)
22255 .nr(8)
22256 .kr(1)
22257 .sr(4)
22258 .m(m)
22259 .n(8)
22260 .k(4)
22261 .iterations(1)
22262 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22263 }
22264 }
22265
22266 TEST(F32_IGEMM_1X8S4__NEON, k_eq_4_subtile_n) {
22267 TEST_REQUIRES_ARM_NEON;
22268 for (uint32_t n = 1; n <= 8; n++) {
22269 GemmMicrokernelTester()
22270 .mr(1)
22271 .nr(8)
22272 .kr(1)
22273 .sr(4)
22274 .m(1)
22275 .n(n)
22276 .k(4)
22277 .iterations(1)
22278 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22279 }
22280 }
22281
22282 TEST(F32_IGEMM_1X8S4__NEON, k_lt_4) {
22283 TEST_REQUIRES_ARM_NEON;
22284 for (size_t k = 1; k < 4; k++) {
22285 GemmMicrokernelTester()
22286 .mr(1)
22287 .nr(8)
22288 .kr(1)
22289 .sr(4)
22290 .m(1)
22291 .n(8)
22292 .k(k)
22293 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22294 }
22295 }
22296
22297 TEST(F32_IGEMM_1X8S4__NEON, k_lt_4_subtile) {
22298 TEST_REQUIRES_ARM_NEON;
22299 for (size_t k = 1; k < 4; k++) {
22300 for (uint32_t m = 1; m <= 1; m++) {
22301 for (uint32_t n = 1; n <= 8; n++) {
22302 GemmMicrokernelTester()
22303 .mr(1)
22304 .nr(8)
22305 .kr(1)
22306 .sr(4)
22307 .m(m)
22308 .n(n)
22309 .k(k)
22310 .iterations(1)
22311 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22312 }
22313 }
22314 }
22315 }
22316
22317 TEST(F32_IGEMM_1X8S4__NEON, k_gt_4) {
22318 TEST_REQUIRES_ARM_NEON;
22319 for (size_t k = 5; k < 8; k++) {
22320 GemmMicrokernelTester()
22321 .mr(1)
22322 .nr(8)
22323 .kr(1)
22324 .sr(4)
22325 .m(1)
22326 .n(8)
22327 .k(k)
22328 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22329 }
22330 }
22331
22332 TEST(F32_IGEMM_1X8S4__NEON, k_gt_4_subtile) {
22333 TEST_REQUIRES_ARM_NEON;
22334 for (size_t k = 5; k < 8; k++) {
22335 for (uint32_t m = 1; m <= 1; m++) {
22336 for (uint32_t n = 1; n <= 8; n++) {
22337 GemmMicrokernelTester()
22338 .mr(1)
22339 .nr(8)
22340 .kr(1)
22341 .sr(4)
22342 .m(m)
22343 .n(n)
22344 .k(k)
22345 .iterations(1)
22346 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22347 }
22348 }
22349 }
22350 }
22351
22352 TEST(F32_IGEMM_1X8S4__NEON, k_div_4) {
22353 TEST_REQUIRES_ARM_NEON;
22354 for (size_t k = 8; k <= 40; k += 4) {
22355 GemmMicrokernelTester()
22356 .mr(1)
22357 .nr(8)
22358 .kr(1)
22359 .sr(4)
22360 .m(1)
22361 .n(8)
22362 .k(k)
22363 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22364 }
22365 }
22366
22367 TEST(F32_IGEMM_1X8S4__NEON, k_div_4_subtile) {
22368 TEST_REQUIRES_ARM_NEON;
22369 for (size_t k = 8; k <= 40; k += 4) {
22370 for (uint32_t m = 1; m <= 1; m++) {
22371 for (uint32_t n = 1; n <= 8; n++) {
22372 GemmMicrokernelTester()
22373 .mr(1)
22374 .nr(8)
22375 .kr(1)
22376 .sr(4)
22377 .m(m)
22378 .n(n)
22379 .k(k)
22380 .iterations(1)
22381 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22382 }
22383 }
22384 }
22385 }
22386
22387 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8) {
22388 TEST_REQUIRES_ARM_NEON;
22389 for (uint32_t n = 9; n < 16; n++) {
22390 for (size_t k = 1; k <= 20; k += 5) {
22391 GemmMicrokernelTester()
22392 .mr(1)
22393 .nr(8)
22394 .kr(1)
22395 .sr(4)
22396 .m(1)
22397 .n(8)
22398 .k(k)
22399 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22400 }
22401 }
22402 }
22403
22404 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8_strided_cn) {
22405 TEST_REQUIRES_ARM_NEON;
22406 for (uint32_t n = 9; n < 16; n++) {
22407 for (size_t k = 1; k <= 20; k += 5) {
22408 GemmMicrokernelTester()
22409 .mr(1)
22410 .nr(8)
22411 .kr(1)
22412 .sr(4)
22413 .m(1)
22414 .n(8)
22415 .k(k)
22416 .cn_stride(11)
22417 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22418 }
22419 }
22420 }
22421
22422 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8_subtile) {
22423 TEST_REQUIRES_ARM_NEON;
22424 for (uint32_t n = 9; n < 16; n++) {
22425 for (size_t k = 1; k <= 20; k += 5) {
22426 for (uint32_t m = 1; m <= 1; m++) {
22427 GemmMicrokernelTester()
22428 .mr(1)
22429 .nr(8)
22430 .kr(1)
22431 .sr(4)
22432 .m(m)
22433 .n(n)
22434 .k(k)
22435 .iterations(1)
22436 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22437 }
22438 }
22439 }
22440 }
22441
22442 TEST(F32_IGEMM_1X8S4__NEON, n_div_8) {
22443 TEST_REQUIRES_ARM_NEON;
22444 for (uint32_t n = 16; n <= 24; n += 8) {
22445 for (size_t k = 1; k <= 20; k += 5) {
22446 GemmMicrokernelTester()
22447 .mr(1)
22448 .nr(8)
22449 .kr(1)
22450 .sr(4)
22451 .m(1)
22452 .n(8)
22453 .k(k)
22454 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22455 }
22456 }
22457 }
22458
22459 TEST(F32_IGEMM_1X8S4__NEON, n_div_8_strided_cn) {
22460 TEST_REQUIRES_ARM_NEON;
22461 for (uint32_t n = 16; n <= 24; n += 8) {
22462 for (size_t k = 1; k <= 20; k += 5) {
22463 GemmMicrokernelTester()
22464 .mr(1)
22465 .nr(8)
22466 .kr(1)
22467 .sr(4)
22468 .m(1)
22469 .n(n)
22470 .k(k)
22471 .cn_stride(11)
22472 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22473 }
22474 }
22475 }
22476
22477 TEST(F32_IGEMM_1X8S4__NEON, n_div_8_subtile) {
22478 TEST_REQUIRES_ARM_NEON;
22479 for (uint32_t n = 16; n <= 24; n += 8) {
22480 for (size_t k = 1; k <= 20; k += 5) {
22481 for (uint32_t m = 1; m <= 1; m++) {
22482 GemmMicrokernelTester()
22483 .mr(1)
22484 .nr(8)
22485 .kr(1)
22486 .sr(4)
22487 .m(m)
22488 .n(n)
22489 .k(k)
22490 .iterations(1)
22491 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22492 }
22493 }
22494 }
22495 }
22496
22497 TEST(F32_IGEMM_1X8S4__NEON, small_kernel) {
22498 TEST_REQUIRES_ARM_NEON;
22499 for (size_t k = 1; k <= 20; k += 5) {
22500 GemmMicrokernelTester()
22501 .mr(1)
22502 .nr(8)
22503 .kr(1)
22504 .sr(4)
22505 .m(1)
22506 .n(8)
22507 .k(k)
22508 .ks(3)
22509 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22510 }
22511 }
22512
22513 TEST(F32_IGEMM_1X8S4__NEON, small_kernel_subtile) {
22514 TEST_REQUIRES_ARM_NEON;
22515 for (size_t k = 1; k <= 20; k += 5) {
22516 for (uint32_t m = 1; m <= 1; m++) {
22517 for (uint32_t n = 1; n <= 8; n++) {
22518 GemmMicrokernelTester()
22519 .mr(1)
22520 .nr(8)
22521 .kr(1)
22522 .sr(4)
22523 .m(m)
22524 .n(n)
22525 .k(k)
22526 .ks(3)
22527 .iterations(1)
22528 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22529 }
22530 }
22531 }
22532 }
22533
22534 TEST(F32_IGEMM_1X8S4__NEON, n_gt_8_small_kernel) {
22535 TEST_REQUIRES_ARM_NEON;
22536 for (uint32_t n = 9; n < 16; n++) {
22537 for (size_t k = 1; k <= 20; k += 5) {
22538 GemmMicrokernelTester()
22539 .mr(1)
22540 .nr(8)
22541 .kr(1)
22542 .sr(4)
22543 .m(1)
22544 .n(8)
22545 .k(k)
22546 .ks(3)
22547 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22548 }
22549 }
22550 }
22551
22552 TEST(F32_IGEMM_1X8S4__NEON, n_div_8_small_kernel) {
22553 TEST_REQUIRES_ARM_NEON;
22554 for (uint32_t n = 16; n <= 24; n += 8) {
22555 for (size_t k = 1; k <= 20; k += 5) {
22556 GemmMicrokernelTester()
22557 .mr(1)
22558 .nr(8)
22559 .kr(1)
22560 .sr(4)
22561 .m(1)
22562 .n(8)
22563 .k(k)
22564 .ks(3)
22565 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22566 }
22567 }
22568 }
22569
22570 TEST(F32_IGEMM_1X8S4__NEON, strided_cm_subtile) {
22571 TEST_REQUIRES_ARM_NEON;
22572 for (size_t k = 1; k <= 20; k += 5) {
22573 for (uint32_t m = 1; m <= 1; m++) {
22574 for (uint32_t n = 1; n <= 8; n++) {
22575 GemmMicrokernelTester()
22576 .mr(1)
22577 .nr(8)
22578 .kr(1)
22579 .sr(4)
22580 .m(m)
22581 .n(n)
22582 .k(k)
22583 .cm_stride(11)
22584 .iterations(1)
22585 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22586 }
22587 }
22588 }
22589 }
22590
22591 TEST(F32_IGEMM_1X8S4__NEON, a_offset) {
22592 TEST_REQUIRES_ARM_NEON;
22593 for (size_t k = 1; k <= 20; k += 5) {
22594 GemmMicrokernelTester()
22595 .mr(1)
22596 .nr(8)
22597 .kr(1)
22598 .sr(4)
22599 .m(1)
22600 .n(8)
22601 .k(k)
22602 .ks(3)
22603 .a_offset(23)
22604 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22605 }
22606 }
22607
22608 TEST(F32_IGEMM_1X8S4__NEON, zero) {
22609 TEST_REQUIRES_ARM_NEON;
22610 for (uint32_t mz = 0; mz < 1; mz++) {
22611 for (size_t k = 1; k <= 20; k += 5) {
22612 GemmMicrokernelTester()
22613 .mr(1)
22614 .nr(8)
22615 .kr(1)
22616 .sr(4)
22617 .m(1)
22618 .n(8)
22619 .k(k)
22620 .ks(3)
22621 .a_offset(23)
22622 .zero_index(mz)
22623 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22624 }
22625 }
22626 }
22627
22628 TEST(F32_IGEMM_1X8S4__NEON, qmin) {
22629 TEST_REQUIRES_ARM_NEON;
22630 GemmMicrokernelTester()
22631 .mr(1)
22632 .nr(8)
22633 .kr(1)
22634 .sr(4)
22635 .m(1)
22636 .n(8)
22637 .k(4)
22638 .qmin(128)
22639 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22640 }
22641
22642 TEST(F32_IGEMM_1X8S4__NEON, qmax) {
22643 TEST_REQUIRES_ARM_NEON;
22644 GemmMicrokernelTester()
22645 .mr(1)
22646 .nr(8)
22647 .kr(1)
22648 .sr(4)
22649 .m(1)
22650 .n(8)
22651 .k(4)
22652 .qmax(128)
22653 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22654 }
22655
22656 TEST(F32_IGEMM_1X8S4__NEON, strided_cm) {
22657 TEST_REQUIRES_ARM_NEON;
22658 GemmMicrokernelTester()
22659 .mr(1)
22660 .nr(8)
22661 .kr(1)
22662 .sr(4)
22663 .m(1)
22664 .n(8)
22665 .k(4)
22666 .cm_stride(11)
22667 .Test(xnn_f32_igemm_ukernel_1x8s4__neon);
22668 }
22669#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22670
22671
22672#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22673 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4) {
22674 TEST_REQUIRES_ARM_NEON;
22675 GemmMicrokernelTester()
22676 .mr(4)
22677 .nr(8)
22678 .kr(1)
22679 .sr(4)
22680 .m(4)
22681 .n(8)
22682 .k(4)
22683 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22684 }
22685
22686 TEST(F32_IGEMM_4X8S4__NEON, strided_cn) {
22687 TEST_REQUIRES_ARM_NEON;
22688 GemmMicrokernelTester()
22689 .mr(4)
22690 .nr(8)
22691 .kr(1)
22692 .sr(4)
22693 .m(4)
22694 .n(8)
22695 .k(4)
22696 .cn_stride(11)
22697 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22698 }
22699
22700 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4_subtile) {
22701 TEST_REQUIRES_ARM_NEON;
22702 for (uint32_t m = 1; m <= 4; m++) {
22703 for (uint32_t n = 1; n <= 8; n++) {
22704 GemmMicrokernelTester()
22705 .mr(4)
22706 .nr(8)
22707 .kr(1)
22708 .sr(4)
22709 .m(m)
22710 .n(n)
22711 .k(4)
22712 .iterations(1)
22713 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22714 }
22715 }
22716 }
22717
22718 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4_subtile_m) {
22719 TEST_REQUIRES_ARM_NEON;
22720 for (uint32_t m = 1; m <= 4; m++) {
22721 GemmMicrokernelTester()
22722 .mr(4)
22723 .nr(8)
22724 .kr(1)
22725 .sr(4)
22726 .m(m)
22727 .n(8)
22728 .k(4)
22729 .iterations(1)
22730 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22731 }
22732 }
22733
22734 TEST(F32_IGEMM_4X8S4__NEON, k_eq_4_subtile_n) {
22735 TEST_REQUIRES_ARM_NEON;
22736 for (uint32_t n = 1; n <= 8; n++) {
22737 GemmMicrokernelTester()
22738 .mr(4)
22739 .nr(8)
22740 .kr(1)
22741 .sr(4)
22742 .m(4)
22743 .n(n)
22744 .k(4)
22745 .iterations(1)
22746 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22747 }
22748 }
22749
22750 TEST(F32_IGEMM_4X8S4__NEON, k_lt_4) {
22751 TEST_REQUIRES_ARM_NEON;
22752 for (size_t k = 1; k < 4; k++) {
22753 GemmMicrokernelTester()
22754 .mr(4)
22755 .nr(8)
22756 .kr(1)
22757 .sr(4)
22758 .m(4)
22759 .n(8)
22760 .k(k)
22761 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22762 }
22763 }
22764
22765 TEST(F32_IGEMM_4X8S4__NEON, k_lt_4_subtile) {
22766 TEST_REQUIRES_ARM_NEON;
22767 for (size_t k = 1; k < 4; k++) {
22768 for (uint32_t m = 1; m <= 4; m++) {
22769 for (uint32_t n = 1; n <= 8; n++) {
22770 GemmMicrokernelTester()
22771 .mr(4)
22772 .nr(8)
22773 .kr(1)
22774 .sr(4)
22775 .m(m)
22776 .n(n)
22777 .k(k)
22778 .iterations(1)
22779 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22780 }
22781 }
22782 }
22783 }
22784
22785 TEST(F32_IGEMM_4X8S4__NEON, k_gt_4) {
22786 TEST_REQUIRES_ARM_NEON;
22787 for (size_t k = 5; k < 8; k++) {
22788 GemmMicrokernelTester()
22789 .mr(4)
22790 .nr(8)
22791 .kr(1)
22792 .sr(4)
22793 .m(4)
22794 .n(8)
22795 .k(k)
22796 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22797 }
22798 }
22799
22800 TEST(F32_IGEMM_4X8S4__NEON, k_gt_4_subtile) {
22801 TEST_REQUIRES_ARM_NEON;
22802 for (size_t k = 5; k < 8; k++) {
22803 for (uint32_t m = 1; m <= 4; m++) {
22804 for (uint32_t n = 1; n <= 8; n++) {
22805 GemmMicrokernelTester()
22806 .mr(4)
22807 .nr(8)
22808 .kr(1)
22809 .sr(4)
22810 .m(m)
22811 .n(n)
22812 .k(k)
22813 .iterations(1)
22814 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22815 }
22816 }
22817 }
22818 }
22819
22820 TEST(F32_IGEMM_4X8S4__NEON, k_div_4) {
22821 TEST_REQUIRES_ARM_NEON;
22822 for (size_t k = 8; k <= 40; k += 4) {
22823 GemmMicrokernelTester()
22824 .mr(4)
22825 .nr(8)
22826 .kr(1)
22827 .sr(4)
22828 .m(4)
22829 .n(8)
22830 .k(k)
22831 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22832 }
22833 }
22834
22835 TEST(F32_IGEMM_4X8S4__NEON, k_div_4_subtile) {
22836 TEST_REQUIRES_ARM_NEON;
22837 for (size_t k = 8; k <= 40; k += 4) {
22838 for (uint32_t m = 1; m <= 4; m++) {
22839 for (uint32_t n = 1; n <= 8; n++) {
22840 GemmMicrokernelTester()
22841 .mr(4)
22842 .nr(8)
22843 .kr(1)
22844 .sr(4)
22845 .m(m)
22846 .n(n)
22847 .k(k)
22848 .iterations(1)
22849 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22850 }
22851 }
22852 }
22853 }
22854
22855 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8) {
22856 TEST_REQUIRES_ARM_NEON;
22857 for (uint32_t n = 9; n < 16; n++) {
22858 for (size_t k = 1; k <= 20; k += 5) {
22859 GemmMicrokernelTester()
22860 .mr(4)
22861 .nr(8)
22862 .kr(1)
22863 .sr(4)
22864 .m(4)
22865 .n(8)
22866 .k(k)
22867 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22868 }
22869 }
22870 }
22871
22872 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8_strided_cn) {
22873 TEST_REQUIRES_ARM_NEON;
22874 for (uint32_t n = 9; n < 16; n++) {
22875 for (size_t k = 1; k <= 20; k += 5) {
22876 GemmMicrokernelTester()
22877 .mr(4)
22878 .nr(8)
22879 .kr(1)
22880 .sr(4)
22881 .m(4)
22882 .n(8)
22883 .k(k)
22884 .cn_stride(11)
22885 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22886 }
22887 }
22888 }
22889
22890 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8_subtile) {
22891 TEST_REQUIRES_ARM_NEON;
22892 for (uint32_t n = 9; n < 16; n++) {
22893 for (size_t k = 1; k <= 20; k += 5) {
22894 for (uint32_t m = 1; m <= 4; m++) {
22895 GemmMicrokernelTester()
22896 .mr(4)
22897 .nr(8)
22898 .kr(1)
22899 .sr(4)
22900 .m(m)
22901 .n(n)
22902 .k(k)
22903 .iterations(1)
22904 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22905 }
22906 }
22907 }
22908 }
22909
22910 TEST(F32_IGEMM_4X8S4__NEON, n_div_8) {
22911 TEST_REQUIRES_ARM_NEON;
22912 for (uint32_t n = 16; n <= 24; n += 8) {
22913 for (size_t k = 1; k <= 20; k += 5) {
22914 GemmMicrokernelTester()
22915 .mr(4)
22916 .nr(8)
22917 .kr(1)
22918 .sr(4)
22919 .m(4)
22920 .n(8)
22921 .k(k)
22922 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22923 }
22924 }
22925 }
22926
22927 TEST(F32_IGEMM_4X8S4__NEON, n_div_8_strided_cn) {
22928 TEST_REQUIRES_ARM_NEON;
22929 for (uint32_t n = 16; n <= 24; n += 8) {
22930 for (size_t k = 1; k <= 20; k += 5) {
22931 GemmMicrokernelTester()
22932 .mr(4)
22933 .nr(8)
22934 .kr(1)
22935 .sr(4)
22936 .m(4)
22937 .n(n)
22938 .k(k)
22939 .cn_stride(11)
22940 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22941 }
22942 }
22943 }
22944
22945 TEST(F32_IGEMM_4X8S4__NEON, n_div_8_subtile) {
22946 TEST_REQUIRES_ARM_NEON;
22947 for (uint32_t n = 16; n <= 24; n += 8) {
22948 for (size_t k = 1; k <= 20; k += 5) {
22949 for (uint32_t m = 1; m <= 4; m++) {
22950 GemmMicrokernelTester()
22951 .mr(4)
22952 .nr(8)
22953 .kr(1)
22954 .sr(4)
22955 .m(m)
22956 .n(n)
22957 .k(k)
22958 .iterations(1)
22959 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22960 }
22961 }
22962 }
22963 }
22964
22965 TEST(F32_IGEMM_4X8S4__NEON, small_kernel) {
22966 TEST_REQUIRES_ARM_NEON;
22967 for (size_t k = 1; k <= 20; k += 5) {
22968 GemmMicrokernelTester()
22969 .mr(4)
22970 .nr(8)
22971 .kr(1)
22972 .sr(4)
22973 .m(4)
22974 .n(8)
22975 .k(k)
22976 .ks(3)
22977 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22978 }
22979 }
22980
22981 TEST(F32_IGEMM_4X8S4__NEON, small_kernel_subtile) {
22982 TEST_REQUIRES_ARM_NEON;
22983 for (size_t k = 1; k <= 20; k += 5) {
22984 for (uint32_t m = 1; m <= 4; m++) {
22985 for (uint32_t n = 1; n <= 8; n++) {
22986 GemmMicrokernelTester()
22987 .mr(4)
22988 .nr(8)
22989 .kr(1)
22990 .sr(4)
22991 .m(m)
22992 .n(n)
22993 .k(k)
22994 .ks(3)
22995 .iterations(1)
22996 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
22997 }
22998 }
22999 }
23000 }
23001
23002 TEST(F32_IGEMM_4X8S4__NEON, n_gt_8_small_kernel) {
23003 TEST_REQUIRES_ARM_NEON;
23004 for (uint32_t n = 9; n < 16; n++) {
23005 for (size_t k = 1; k <= 20; k += 5) {
23006 GemmMicrokernelTester()
23007 .mr(4)
23008 .nr(8)
23009 .kr(1)
23010 .sr(4)
23011 .m(4)
23012 .n(8)
23013 .k(k)
23014 .ks(3)
23015 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23016 }
23017 }
23018 }
23019
23020 TEST(F32_IGEMM_4X8S4__NEON, n_div_8_small_kernel) {
23021 TEST_REQUIRES_ARM_NEON;
23022 for (uint32_t n = 16; n <= 24; n += 8) {
23023 for (size_t k = 1; k <= 20; k += 5) {
23024 GemmMicrokernelTester()
23025 .mr(4)
23026 .nr(8)
23027 .kr(1)
23028 .sr(4)
23029 .m(4)
23030 .n(8)
23031 .k(k)
23032 .ks(3)
23033 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23034 }
23035 }
23036 }
23037
23038 TEST(F32_IGEMM_4X8S4__NEON, strided_cm_subtile) {
23039 TEST_REQUIRES_ARM_NEON;
23040 for (size_t k = 1; k <= 20; k += 5) {
23041 for (uint32_t m = 1; m <= 4; m++) {
23042 for (uint32_t n = 1; n <= 8; n++) {
23043 GemmMicrokernelTester()
23044 .mr(4)
23045 .nr(8)
23046 .kr(1)
23047 .sr(4)
23048 .m(m)
23049 .n(n)
23050 .k(k)
23051 .cm_stride(11)
23052 .iterations(1)
23053 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23054 }
23055 }
23056 }
23057 }
23058
23059 TEST(F32_IGEMM_4X8S4__NEON, a_offset) {
23060 TEST_REQUIRES_ARM_NEON;
23061 for (size_t k = 1; k <= 20; k += 5) {
23062 GemmMicrokernelTester()
23063 .mr(4)
23064 .nr(8)
23065 .kr(1)
23066 .sr(4)
23067 .m(4)
23068 .n(8)
23069 .k(k)
23070 .ks(3)
23071 .a_offset(83)
23072 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23073 }
23074 }
23075
23076 TEST(F32_IGEMM_4X8S4__NEON, zero) {
23077 TEST_REQUIRES_ARM_NEON;
23078 for (uint32_t mz = 0; mz < 4; mz++) {
23079 for (size_t k = 1; k <= 20; k += 5) {
23080 GemmMicrokernelTester()
23081 .mr(4)
23082 .nr(8)
23083 .kr(1)
23084 .sr(4)
23085 .m(4)
23086 .n(8)
23087 .k(k)
23088 .ks(3)
23089 .a_offset(83)
23090 .zero_index(mz)
23091 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23092 }
23093 }
23094 }
23095
23096 TEST(F32_IGEMM_4X8S4__NEON, qmin) {
23097 TEST_REQUIRES_ARM_NEON;
23098 GemmMicrokernelTester()
23099 .mr(4)
23100 .nr(8)
23101 .kr(1)
23102 .sr(4)
23103 .m(4)
23104 .n(8)
23105 .k(4)
23106 .qmin(128)
23107 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23108 }
23109
23110 TEST(F32_IGEMM_4X8S4__NEON, qmax) {
23111 TEST_REQUIRES_ARM_NEON;
23112 GemmMicrokernelTester()
23113 .mr(4)
23114 .nr(8)
23115 .kr(1)
23116 .sr(4)
23117 .m(4)
23118 .n(8)
23119 .k(4)
23120 .qmax(128)
23121 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23122 }
23123
23124 TEST(F32_IGEMM_4X8S4__NEON, strided_cm) {
23125 TEST_REQUIRES_ARM_NEON;
23126 GemmMicrokernelTester()
23127 .mr(4)
23128 .nr(8)
23129 .kr(1)
23130 .sr(4)
23131 .m(4)
23132 .n(8)
23133 .k(4)
23134 .cm_stride(11)
23135 .Test(xnn_f32_igemm_ukernel_4x8s4__neon);
23136 }
23137#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23138
23139
23140#if XNN_ARCH_ARM || XNN_ARCH_ARM64
23141 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4) {
23142 TEST_REQUIRES_ARM_NEON;
23143 GemmMicrokernelTester()
23144 .mr(6)
23145 .nr(8)
23146 .kr(1)
23147 .sr(4)
23148 .m(6)
23149 .n(8)
23150 .k(4)
23151 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23152 }
23153
23154 TEST(F32_IGEMM_6X8S4__NEON, strided_cn) {
23155 TEST_REQUIRES_ARM_NEON;
23156 GemmMicrokernelTester()
23157 .mr(6)
23158 .nr(8)
23159 .kr(1)
23160 .sr(4)
23161 .m(6)
23162 .n(8)
23163 .k(4)
23164 .cn_stride(11)
23165 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23166 }
23167
23168 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4_subtile) {
23169 TEST_REQUIRES_ARM_NEON;
23170 for (uint32_t m = 1; m <= 6; m++) {
23171 for (uint32_t n = 1; n <= 8; n++) {
23172 GemmMicrokernelTester()
23173 .mr(6)
23174 .nr(8)
23175 .kr(1)
23176 .sr(4)
23177 .m(m)
23178 .n(n)
23179 .k(4)
23180 .iterations(1)
23181 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23182 }
23183 }
23184 }
23185
23186 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4_subtile_m) {
23187 TEST_REQUIRES_ARM_NEON;
23188 for (uint32_t m = 1; m <= 6; m++) {
23189 GemmMicrokernelTester()
23190 .mr(6)
23191 .nr(8)
23192 .kr(1)
23193 .sr(4)
23194 .m(m)
23195 .n(8)
23196 .k(4)
23197 .iterations(1)
23198 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23199 }
23200 }
23201
23202 TEST(F32_IGEMM_6X8S4__NEON, k_eq_4_subtile_n) {
23203 TEST_REQUIRES_ARM_NEON;
23204 for (uint32_t n = 1; n <= 8; n++) {
23205 GemmMicrokernelTester()
23206 .mr(6)
23207 .nr(8)
23208 .kr(1)
23209 .sr(4)
23210 .m(6)
23211 .n(n)
23212 .k(4)
23213 .iterations(1)
23214 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23215 }
23216 }
23217
23218 TEST(F32_IGEMM_6X8S4__NEON, k_lt_4) {
23219 TEST_REQUIRES_ARM_NEON;
23220 for (size_t k = 1; k < 4; k++) {
23221 GemmMicrokernelTester()
23222 .mr(6)
23223 .nr(8)
23224 .kr(1)
23225 .sr(4)
23226 .m(6)
23227 .n(8)
23228 .k(k)
23229 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23230 }
23231 }
23232
23233 TEST(F32_IGEMM_6X8S4__NEON, k_lt_4_subtile) {
23234 TEST_REQUIRES_ARM_NEON;
23235 for (size_t k = 1; k < 4; k++) {
23236 for (uint32_t m = 1; m <= 6; m++) {
23237 for (uint32_t n = 1; n <= 8; n++) {
23238 GemmMicrokernelTester()
23239 .mr(6)
23240 .nr(8)
23241 .kr(1)
23242 .sr(4)
23243 .m(m)
23244 .n(n)
23245 .k(k)
23246 .iterations(1)
23247 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23248 }
23249 }
23250 }
23251 }
23252
23253 TEST(F32_IGEMM_6X8S4__NEON, k_gt_4) {
23254 TEST_REQUIRES_ARM_NEON;
23255 for (size_t k = 5; k < 8; k++) {
23256 GemmMicrokernelTester()
23257 .mr(6)
23258 .nr(8)
23259 .kr(1)
23260 .sr(4)
23261 .m(6)
23262 .n(8)
23263 .k(k)
23264 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23265 }
23266 }
23267
23268 TEST(F32_IGEMM_6X8S4__NEON, k_gt_4_subtile) {
23269 TEST_REQUIRES_ARM_NEON;
23270 for (size_t k = 5; k < 8; k++) {
23271 for (uint32_t m = 1; m <= 6; m++) {
23272 for (uint32_t n = 1; n <= 8; n++) {
23273 GemmMicrokernelTester()
23274 .mr(6)
23275 .nr(8)
23276 .kr(1)
23277 .sr(4)
23278 .m(m)
23279 .n(n)
23280 .k(k)
23281 .iterations(1)
23282 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23283 }
23284 }
23285 }
23286 }
23287
23288 TEST(F32_IGEMM_6X8S4__NEON, k_div_4) {
23289 TEST_REQUIRES_ARM_NEON;
23290 for (size_t k = 8; k <= 40; k += 4) {
23291 GemmMicrokernelTester()
23292 .mr(6)
23293 .nr(8)
23294 .kr(1)
23295 .sr(4)
23296 .m(6)
23297 .n(8)
23298 .k(k)
23299 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23300 }
23301 }
23302
23303 TEST(F32_IGEMM_6X8S4__NEON, k_div_4_subtile) {
23304 TEST_REQUIRES_ARM_NEON;
23305 for (size_t k = 8; k <= 40; k += 4) {
23306 for (uint32_t m = 1; m <= 6; m++) {
23307 for (uint32_t n = 1; n <= 8; n++) {
23308 GemmMicrokernelTester()
23309 .mr(6)
23310 .nr(8)
23311 .kr(1)
23312 .sr(4)
23313 .m(m)
23314 .n(n)
23315 .k(k)
23316 .iterations(1)
23317 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23318 }
23319 }
23320 }
23321 }
23322
23323 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8) {
23324 TEST_REQUIRES_ARM_NEON;
23325 for (uint32_t n = 9; n < 16; n++) {
23326 for (size_t k = 1; k <= 20; k += 5) {
23327 GemmMicrokernelTester()
23328 .mr(6)
23329 .nr(8)
23330 .kr(1)
23331 .sr(4)
23332 .m(6)
23333 .n(8)
23334 .k(k)
23335 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23336 }
23337 }
23338 }
23339
23340 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8_strided_cn) {
23341 TEST_REQUIRES_ARM_NEON;
23342 for (uint32_t n = 9; n < 16; n++) {
23343 for (size_t k = 1; k <= 20; k += 5) {
23344 GemmMicrokernelTester()
23345 .mr(6)
23346 .nr(8)
23347 .kr(1)
23348 .sr(4)
23349 .m(6)
23350 .n(8)
23351 .k(k)
23352 .cn_stride(11)
23353 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23354 }
23355 }
23356 }
23357
23358 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8_subtile) {
23359 TEST_REQUIRES_ARM_NEON;
23360 for (uint32_t n = 9; n < 16; n++) {
23361 for (size_t k = 1; k <= 20; k += 5) {
23362 for (uint32_t m = 1; m <= 6; m++) {
23363 GemmMicrokernelTester()
23364 .mr(6)
23365 .nr(8)
23366 .kr(1)
23367 .sr(4)
23368 .m(m)
23369 .n(n)
23370 .k(k)
23371 .iterations(1)
23372 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23373 }
23374 }
23375 }
23376 }
23377
23378 TEST(F32_IGEMM_6X8S4__NEON, n_div_8) {
23379 TEST_REQUIRES_ARM_NEON;
23380 for (uint32_t n = 16; n <= 24; n += 8) {
23381 for (size_t k = 1; k <= 20; k += 5) {
23382 GemmMicrokernelTester()
23383 .mr(6)
23384 .nr(8)
23385 .kr(1)
23386 .sr(4)
23387 .m(6)
23388 .n(8)
23389 .k(k)
23390 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23391 }
23392 }
23393 }
23394
23395 TEST(F32_IGEMM_6X8S4__NEON, n_div_8_strided_cn) {
23396 TEST_REQUIRES_ARM_NEON;
23397 for (uint32_t n = 16; n <= 24; n += 8) {
23398 for (size_t k = 1; k <= 20; k += 5) {
23399 GemmMicrokernelTester()
23400 .mr(6)
23401 .nr(8)
23402 .kr(1)
23403 .sr(4)
23404 .m(6)
23405 .n(n)
23406 .k(k)
23407 .cn_stride(11)
23408 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23409 }
23410 }
23411 }
23412
23413 TEST(F32_IGEMM_6X8S4__NEON, n_div_8_subtile) {
23414 TEST_REQUIRES_ARM_NEON;
23415 for (uint32_t n = 16; n <= 24; n += 8) {
23416 for (size_t k = 1; k <= 20; k += 5) {
23417 for (uint32_t m = 1; m <= 6; m++) {
23418 GemmMicrokernelTester()
23419 .mr(6)
23420 .nr(8)
23421 .kr(1)
23422 .sr(4)
23423 .m(m)
23424 .n(n)
23425 .k(k)
23426 .iterations(1)
23427 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23428 }
23429 }
23430 }
23431 }
23432
23433 TEST(F32_IGEMM_6X8S4__NEON, small_kernel) {
23434 TEST_REQUIRES_ARM_NEON;
23435 for (size_t k = 1; k <= 20; k += 5) {
23436 GemmMicrokernelTester()
23437 .mr(6)
23438 .nr(8)
23439 .kr(1)
23440 .sr(4)
23441 .m(6)
23442 .n(8)
23443 .k(k)
23444 .ks(3)
23445 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23446 }
23447 }
23448
23449 TEST(F32_IGEMM_6X8S4__NEON, small_kernel_subtile) {
23450 TEST_REQUIRES_ARM_NEON;
23451 for (size_t k = 1; k <= 20; k += 5) {
23452 for (uint32_t m = 1; m <= 6; m++) {
23453 for (uint32_t n = 1; n <= 8; n++) {
23454 GemmMicrokernelTester()
23455 .mr(6)
23456 .nr(8)
23457 .kr(1)
23458 .sr(4)
23459 .m(m)
23460 .n(n)
23461 .k(k)
23462 .ks(3)
23463 .iterations(1)
23464 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23465 }
23466 }
23467 }
23468 }
23469
23470 TEST(F32_IGEMM_6X8S4__NEON, n_gt_8_small_kernel) {
23471 TEST_REQUIRES_ARM_NEON;
23472 for (uint32_t n = 9; n < 16; n++) {
23473 for (size_t k = 1; k <= 20; k += 5) {
23474 GemmMicrokernelTester()
23475 .mr(6)
23476 .nr(8)
23477 .kr(1)
23478 .sr(4)
23479 .m(6)
23480 .n(8)
23481 .k(k)
23482 .ks(3)
23483 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23484 }
23485 }
23486 }
23487
23488 TEST(F32_IGEMM_6X8S4__NEON, n_div_8_small_kernel) {
23489 TEST_REQUIRES_ARM_NEON;
23490 for (uint32_t n = 16; n <= 24; n += 8) {
23491 for (size_t k = 1; k <= 20; k += 5) {
23492 GemmMicrokernelTester()
23493 .mr(6)
23494 .nr(8)
23495 .kr(1)
23496 .sr(4)
23497 .m(6)
23498 .n(8)
23499 .k(k)
23500 .ks(3)
23501 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23502 }
23503 }
23504 }
23505
23506 TEST(F32_IGEMM_6X8S4__NEON, strided_cm_subtile) {
23507 TEST_REQUIRES_ARM_NEON;
23508 for (size_t k = 1; k <= 20; k += 5) {
23509 for (uint32_t m = 1; m <= 6; m++) {
23510 for (uint32_t n = 1; n <= 8; n++) {
23511 GemmMicrokernelTester()
23512 .mr(6)
23513 .nr(8)
23514 .kr(1)
23515 .sr(4)
23516 .m(m)
23517 .n(n)
23518 .k(k)
23519 .cm_stride(11)
23520 .iterations(1)
23521 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23522 }
23523 }
23524 }
23525 }
23526
23527 TEST(F32_IGEMM_6X8S4__NEON, a_offset) {
23528 TEST_REQUIRES_ARM_NEON;
23529 for (size_t k = 1; k <= 20; k += 5) {
23530 GemmMicrokernelTester()
23531 .mr(6)
23532 .nr(8)
23533 .kr(1)
23534 .sr(4)
23535 .m(6)
23536 .n(8)
23537 .k(k)
23538 .ks(3)
23539 .a_offset(127)
23540 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23541 }
23542 }
23543
23544 TEST(F32_IGEMM_6X8S4__NEON, zero) {
23545 TEST_REQUIRES_ARM_NEON;
23546 for (uint32_t mz = 0; mz < 6; mz++) {
23547 for (size_t k = 1; k <= 20; k += 5) {
23548 GemmMicrokernelTester()
23549 .mr(6)
23550 .nr(8)
23551 .kr(1)
23552 .sr(4)
23553 .m(6)
23554 .n(8)
23555 .k(k)
23556 .ks(3)
23557 .a_offset(127)
23558 .zero_index(mz)
23559 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23560 }
23561 }
23562 }
23563
23564 TEST(F32_IGEMM_6X8S4__NEON, qmin) {
23565 TEST_REQUIRES_ARM_NEON;
23566 GemmMicrokernelTester()
23567 .mr(6)
23568 .nr(8)
23569 .kr(1)
23570 .sr(4)
23571 .m(6)
23572 .n(8)
23573 .k(4)
23574 .qmin(128)
23575 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23576 }
23577
23578 TEST(F32_IGEMM_6X8S4__NEON, qmax) {
23579 TEST_REQUIRES_ARM_NEON;
23580 GemmMicrokernelTester()
23581 .mr(6)
23582 .nr(8)
23583 .kr(1)
23584 .sr(4)
23585 .m(6)
23586 .n(8)
23587 .k(4)
23588 .qmax(128)
23589 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23590 }
23591
23592 TEST(F32_IGEMM_6X8S4__NEON, strided_cm) {
23593 TEST_REQUIRES_ARM_NEON;
23594 GemmMicrokernelTester()
23595 .mr(6)
23596 .nr(8)
23597 .kr(1)
23598 .sr(4)
23599 .m(6)
23600 .n(8)
23601 .k(4)
23602 .cm_stride(11)
23603 .Test(xnn_f32_igemm_ukernel_6x8s4__neon);
23604 }
23605#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23606
23607
23608#if XNN_ARCH_ARM || XNN_ARCH_ARM64
23609 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4) {
23610 TEST_REQUIRES_ARM_NEON;
23611 GemmMicrokernelTester()
23612 .mr(8)
23613 .nr(8)
23614 .kr(1)
23615 .sr(4)
23616 .m(8)
23617 .n(8)
23618 .k(4)
23619 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23620 }
23621
23622 TEST(F32_IGEMM_8X8S4__NEON, strided_cn) {
23623 TEST_REQUIRES_ARM_NEON;
23624 GemmMicrokernelTester()
23625 .mr(8)
23626 .nr(8)
23627 .kr(1)
23628 .sr(4)
23629 .m(8)
23630 .n(8)
23631 .k(4)
23632 .cn_stride(11)
23633 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23634 }
23635
23636 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4_subtile) {
23637 TEST_REQUIRES_ARM_NEON;
23638 for (uint32_t m = 1; m <= 8; m++) {
23639 for (uint32_t n = 1; n <= 8; n++) {
23640 GemmMicrokernelTester()
23641 .mr(8)
23642 .nr(8)
23643 .kr(1)
23644 .sr(4)
23645 .m(m)
23646 .n(n)
23647 .k(4)
23648 .iterations(1)
23649 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23650 }
23651 }
23652 }
23653
23654 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4_subtile_m) {
23655 TEST_REQUIRES_ARM_NEON;
23656 for (uint32_t m = 1; m <= 8; m++) {
23657 GemmMicrokernelTester()
23658 .mr(8)
23659 .nr(8)
23660 .kr(1)
23661 .sr(4)
23662 .m(m)
23663 .n(8)
23664 .k(4)
23665 .iterations(1)
23666 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23667 }
23668 }
23669
23670 TEST(F32_IGEMM_8X8S4__NEON, k_eq_4_subtile_n) {
23671 TEST_REQUIRES_ARM_NEON;
23672 for (uint32_t n = 1; n <= 8; n++) {
23673 GemmMicrokernelTester()
23674 .mr(8)
23675 .nr(8)
23676 .kr(1)
23677 .sr(4)
23678 .m(8)
23679 .n(n)
23680 .k(4)
23681 .iterations(1)
23682 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23683 }
23684 }
23685
23686 TEST(F32_IGEMM_8X8S4__NEON, k_lt_4) {
23687 TEST_REQUIRES_ARM_NEON;
23688 for (size_t k = 1; k < 4; k++) {
23689 GemmMicrokernelTester()
23690 .mr(8)
23691 .nr(8)
23692 .kr(1)
23693 .sr(4)
23694 .m(8)
23695 .n(8)
23696 .k(k)
23697 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23698 }
23699 }
23700
23701 TEST(F32_IGEMM_8X8S4__NEON, k_lt_4_subtile) {
23702 TEST_REQUIRES_ARM_NEON;
23703 for (size_t k = 1; k < 4; k++) {
23704 for (uint32_t m = 1; m <= 8; m++) {
23705 for (uint32_t n = 1; n <= 8; n++) {
23706 GemmMicrokernelTester()
23707 .mr(8)
23708 .nr(8)
23709 .kr(1)
23710 .sr(4)
23711 .m(m)
23712 .n(n)
23713 .k(k)
23714 .iterations(1)
23715 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23716 }
23717 }
23718 }
23719 }
23720
23721 TEST(F32_IGEMM_8X8S4__NEON, k_gt_4) {
23722 TEST_REQUIRES_ARM_NEON;
23723 for (size_t k = 5; k < 8; k++) {
23724 GemmMicrokernelTester()
23725 .mr(8)
23726 .nr(8)
23727 .kr(1)
23728 .sr(4)
23729 .m(8)
23730 .n(8)
23731 .k(k)
23732 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23733 }
23734 }
23735
23736 TEST(F32_IGEMM_8X8S4__NEON, k_gt_4_subtile) {
23737 TEST_REQUIRES_ARM_NEON;
23738 for (size_t k = 5; k < 8; k++) {
23739 for (uint32_t m = 1; m <= 8; m++) {
23740 for (uint32_t n = 1; n <= 8; n++) {
23741 GemmMicrokernelTester()
23742 .mr(8)
23743 .nr(8)
23744 .kr(1)
23745 .sr(4)
23746 .m(m)
23747 .n(n)
23748 .k(k)
23749 .iterations(1)
23750 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23751 }
23752 }
23753 }
23754 }
23755
23756 TEST(F32_IGEMM_8X8S4__NEON, k_div_4) {
23757 TEST_REQUIRES_ARM_NEON;
23758 for (size_t k = 8; k <= 40; k += 4) {
23759 GemmMicrokernelTester()
23760 .mr(8)
23761 .nr(8)
23762 .kr(1)
23763 .sr(4)
23764 .m(8)
23765 .n(8)
23766 .k(k)
23767 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23768 }
23769 }
23770
23771 TEST(F32_IGEMM_8X8S4__NEON, k_div_4_subtile) {
23772 TEST_REQUIRES_ARM_NEON;
23773 for (size_t k = 8; k <= 40; k += 4) {
23774 for (uint32_t m = 1; m <= 8; m++) {
23775 for (uint32_t n = 1; n <= 8; n++) {
23776 GemmMicrokernelTester()
23777 .mr(8)
23778 .nr(8)
23779 .kr(1)
23780 .sr(4)
23781 .m(m)
23782 .n(n)
23783 .k(k)
23784 .iterations(1)
23785 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23786 }
23787 }
23788 }
23789 }
23790
23791 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8) {
23792 TEST_REQUIRES_ARM_NEON;
23793 for (uint32_t n = 9; n < 16; n++) {
23794 for (size_t k = 1; k <= 20; k += 5) {
23795 GemmMicrokernelTester()
23796 .mr(8)
23797 .nr(8)
23798 .kr(1)
23799 .sr(4)
23800 .m(8)
23801 .n(8)
23802 .k(k)
23803 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23804 }
23805 }
23806 }
23807
23808 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8_strided_cn) {
23809 TEST_REQUIRES_ARM_NEON;
23810 for (uint32_t n = 9; n < 16; n++) {
23811 for (size_t k = 1; k <= 20; k += 5) {
23812 GemmMicrokernelTester()
23813 .mr(8)
23814 .nr(8)
23815 .kr(1)
23816 .sr(4)
23817 .m(8)
23818 .n(8)
23819 .k(k)
23820 .cn_stride(11)
23821 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23822 }
23823 }
23824 }
23825
23826 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8_subtile) {
23827 TEST_REQUIRES_ARM_NEON;
23828 for (uint32_t n = 9; n < 16; n++) {
23829 for (size_t k = 1; k <= 20; k += 5) {
23830 for (uint32_t m = 1; m <= 8; m++) {
23831 GemmMicrokernelTester()
23832 .mr(8)
23833 .nr(8)
23834 .kr(1)
23835 .sr(4)
23836 .m(m)
23837 .n(n)
23838 .k(k)
23839 .iterations(1)
23840 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23841 }
23842 }
23843 }
23844 }
23845
23846 TEST(F32_IGEMM_8X8S4__NEON, n_div_8) {
23847 TEST_REQUIRES_ARM_NEON;
23848 for (uint32_t n = 16; n <= 24; n += 8) {
23849 for (size_t k = 1; k <= 20; k += 5) {
23850 GemmMicrokernelTester()
23851 .mr(8)
23852 .nr(8)
23853 .kr(1)
23854 .sr(4)
23855 .m(8)
23856 .n(8)
23857 .k(k)
23858 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23859 }
23860 }
23861 }
23862
23863 TEST(F32_IGEMM_8X8S4__NEON, n_div_8_strided_cn) {
23864 TEST_REQUIRES_ARM_NEON;
23865 for (uint32_t n = 16; n <= 24; n += 8) {
23866 for (size_t k = 1; k <= 20; k += 5) {
23867 GemmMicrokernelTester()
23868 .mr(8)
23869 .nr(8)
23870 .kr(1)
23871 .sr(4)
23872 .m(8)
23873 .n(n)
23874 .k(k)
23875 .cn_stride(11)
23876 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23877 }
23878 }
23879 }
23880
23881 TEST(F32_IGEMM_8X8S4__NEON, n_div_8_subtile) {
23882 TEST_REQUIRES_ARM_NEON;
23883 for (uint32_t n = 16; n <= 24; n += 8) {
23884 for (size_t k = 1; k <= 20; k += 5) {
23885 for (uint32_t m = 1; m <= 8; m++) {
23886 GemmMicrokernelTester()
23887 .mr(8)
23888 .nr(8)
23889 .kr(1)
23890 .sr(4)
23891 .m(m)
23892 .n(n)
23893 .k(k)
23894 .iterations(1)
23895 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23896 }
23897 }
23898 }
23899 }
23900
23901 TEST(F32_IGEMM_8X8S4__NEON, small_kernel) {
23902 TEST_REQUIRES_ARM_NEON;
23903 for (size_t k = 1; k <= 20; k += 5) {
23904 GemmMicrokernelTester()
23905 .mr(8)
23906 .nr(8)
23907 .kr(1)
23908 .sr(4)
23909 .m(8)
23910 .n(8)
23911 .k(k)
23912 .ks(3)
23913 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23914 }
23915 }
23916
23917 TEST(F32_IGEMM_8X8S4__NEON, small_kernel_subtile) {
23918 TEST_REQUIRES_ARM_NEON;
23919 for (size_t k = 1; k <= 20; k += 5) {
23920 for (uint32_t m = 1; m <= 8; m++) {
23921 for (uint32_t n = 1; n <= 8; n++) {
23922 GemmMicrokernelTester()
23923 .mr(8)
23924 .nr(8)
23925 .kr(1)
23926 .sr(4)
23927 .m(m)
23928 .n(n)
23929 .k(k)
23930 .ks(3)
23931 .iterations(1)
23932 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23933 }
23934 }
23935 }
23936 }
23937
23938 TEST(F32_IGEMM_8X8S4__NEON, n_gt_8_small_kernel) {
23939 TEST_REQUIRES_ARM_NEON;
23940 for (uint32_t n = 9; n < 16; n++) {
23941 for (size_t k = 1; k <= 20; k += 5) {
23942 GemmMicrokernelTester()
23943 .mr(8)
23944 .nr(8)
23945 .kr(1)
23946 .sr(4)
23947 .m(8)
23948 .n(8)
23949 .k(k)
23950 .ks(3)
23951 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23952 }
23953 }
23954 }
23955
23956 TEST(F32_IGEMM_8X8S4__NEON, n_div_8_small_kernel) {
23957 TEST_REQUIRES_ARM_NEON;
23958 for (uint32_t n = 16; n <= 24; n += 8) {
23959 for (size_t k = 1; k <= 20; k += 5) {
23960 GemmMicrokernelTester()
23961 .mr(8)
23962 .nr(8)
23963 .kr(1)
23964 .sr(4)
23965 .m(8)
23966 .n(8)
23967 .k(k)
23968 .ks(3)
23969 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23970 }
23971 }
23972 }
23973
23974 TEST(F32_IGEMM_8X8S4__NEON, strided_cm_subtile) {
23975 TEST_REQUIRES_ARM_NEON;
23976 for (size_t k = 1; k <= 20; k += 5) {
23977 for (uint32_t m = 1; m <= 8; m++) {
23978 for (uint32_t n = 1; n <= 8; n++) {
23979 GemmMicrokernelTester()
23980 .mr(8)
23981 .nr(8)
23982 .kr(1)
23983 .sr(4)
23984 .m(m)
23985 .n(n)
23986 .k(k)
23987 .cm_stride(11)
23988 .iterations(1)
23989 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
23990 }
23991 }
23992 }
23993 }
23994
23995 TEST(F32_IGEMM_8X8S4__NEON, a_offset) {
23996 TEST_REQUIRES_ARM_NEON;
23997 for (size_t k = 1; k <= 20; k += 5) {
23998 GemmMicrokernelTester()
23999 .mr(8)
24000 .nr(8)
24001 .kr(1)
24002 .sr(4)
24003 .m(8)
24004 .n(8)
24005 .k(k)
24006 .ks(3)
24007 .a_offset(163)
24008 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
24009 }
24010 }
24011
24012 TEST(F32_IGEMM_8X8S4__NEON, zero) {
24013 TEST_REQUIRES_ARM_NEON;
24014 for (uint32_t mz = 0; mz < 8; mz++) {
24015 for (size_t k = 1; k <= 20; k += 5) {
24016 GemmMicrokernelTester()
24017 .mr(8)
24018 .nr(8)
24019 .kr(1)
24020 .sr(4)
24021 .m(8)
24022 .n(8)
24023 .k(k)
24024 .ks(3)
24025 .a_offset(163)
24026 .zero_index(mz)
24027 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
24028 }
24029 }
24030 }
24031
24032 TEST(F32_IGEMM_8X8S4__NEON, qmin) {
24033 TEST_REQUIRES_ARM_NEON;
24034 GemmMicrokernelTester()
24035 .mr(8)
24036 .nr(8)
24037 .kr(1)
24038 .sr(4)
24039 .m(8)
24040 .n(8)
24041 .k(4)
24042 .qmin(128)
24043 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
24044 }
24045
24046 TEST(F32_IGEMM_8X8S4__NEON, qmax) {
24047 TEST_REQUIRES_ARM_NEON;
24048 GemmMicrokernelTester()
24049 .mr(8)
24050 .nr(8)
24051 .kr(1)
24052 .sr(4)
24053 .m(8)
24054 .n(8)
24055 .k(4)
24056 .qmax(128)
24057 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
24058 }
24059
24060 TEST(F32_IGEMM_8X8S4__NEON, strided_cm) {
24061 TEST_REQUIRES_ARM_NEON;
24062 GemmMicrokernelTester()
24063 .mr(8)
24064 .nr(8)
24065 .kr(1)
24066 .sr(4)
24067 .m(8)
24068 .n(8)
24069 .k(4)
24070 .cm_stride(11)
24071 .Test(xnn_f32_igemm_ukernel_8x8s4__neon);
24072 }
24073#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24074
24075
24076#if XNN_ARCH_ARM || XNN_ARCH_ARM64
24077 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4) {
24078 TEST_REQUIRES_ARM_NEON_FMA;
24079 GemmMicrokernelTester()
24080 .mr(1)
24081 .nr(8)
24082 .kr(1)
24083 .sr(4)
24084 .m(1)
24085 .n(8)
24086 .k(4)
24087 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24088 }
24089
24090 TEST(F32_IGEMM_1X8S4__NEONFMA, strided_cn) {
24091 TEST_REQUIRES_ARM_NEON_FMA;
24092 GemmMicrokernelTester()
24093 .mr(1)
24094 .nr(8)
24095 .kr(1)
24096 .sr(4)
24097 .m(1)
24098 .n(8)
24099 .k(4)
24100 .cn_stride(11)
24101 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24102 }
24103
24104 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4_subtile) {
24105 TEST_REQUIRES_ARM_NEON_FMA;
24106 for (uint32_t m = 1; m <= 1; m++) {
24107 for (uint32_t n = 1; n <= 8; n++) {
24108 GemmMicrokernelTester()
24109 .mr(1)
24110 .nr(8)
24111 .kr(1)
24112 .sr(4)
24113 .m(m)
24114 .n(n)
24115 .k(4)
24116 .iterations(1)
24117 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24118 }
24119 }
24120 }
24121
24122 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4_subtile_m) {
24123 TEST_REQUIRES_ARM_NEON_FMA;
24124 for (uint32_t m = 1; m <= 1; m++) {
24125 GemmMicrokernelTester()
24126 .mr(1)
24127 .nr(8)
24128 .kr(1)
24129 .sr(4)
24130 .m(m)
24131 .n(8)
24132 .k(4)
24133 .iterations(1)
24134 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24135 }
24136 }
24137
24138 TEST(F32_IGEMM_1X8S4__NEONFMA, k_eq_4_subtile_n) {
24139 TEST_REQUIRES_ARM_NEON_FMA;
24140 for (uint32_t n = 1; n <= 8; n++) {
24141 GemmMicrokernelTester()
24142 .mr(1)
24143 .nr(8)
24144 .kr(1)
24145 .sr(4)
24146 .m(1)
24147 .n(n)
24148 .k(4)
24149 .iterations(1)
24150 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24151 }
24152 }
24153
24154 TEST(F32_IGEMM_1X8S4__NEONFMA, k_lt_4) {
24155 TEST_REQUIRES_ARM_NEON_FMA;
24156 for (size_t k = 1; k < 4; k++) {
24157 GemmMicrokernelTester()
24158 .mr(1)
24159 .nr(8)
24160 .kr(1)
24161 .sr(4)
24162 .m(1)
24163 .n(8)
24164 .k(k)
24165 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24166 }
24167 }
24168
24169 TEST(F32_IGEMM_1X8S4__NEONFMA, k_lt_4_subtile) {
24170 TEST_REQUIRES_ARM_NEON_FMA;
24171 for (size_t k = 1; k < 4; k++) {
24172 for (uint32_t m = 1; m <= 1; m++) {
24173 for (uint32_t n = 1; n <= 8; n++) {
24174 GemmMicrokernelTester()
24175 .mr(1)
24176 .nr(8)
24177 .kr(1)
24178 .sr(4)
24179 .m(m)
24180 .n(n)
24181 .k(k)
24182 .iterations(1)
24183 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24184 }
24185 }
24186 }
24187 }
24188
24189 TEST(F32_IGEMM_1X8S4__NEONFMA, k_gt_4) {
24190 TEST_REQUIRES_ARM_NEON_FMA;
24191 for (size_t k = 5; k < 8; k++) {
24192 GemmMicrokernelTester()
24193 .mr(1)
24194 .nr(8)
24195 .kr(1)
24196 .sr(4)
24197 .m(1)
24198 .n(8)
24199 .k(k)
24200 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24201 }
24202 }
24203
24204 TEST(F32_IGEMM_1X8S4__NEONFMA, k_gt_4_subtile) {
24205 TEST_REQUIRES_ARM_NEON_FMA;
24206 for (size_t k = 5; k < 8; k++) {
24207 for (uint32_t m = 1; m <= 1; m++) {
24208 for (uint32_t n = 1; n <= 8; n++) {
24209 GemmMicrokernelTester()
24210 .mr(1)
24211 .nr(8)
24212 .kr(1)
24213 .sr(4)
24214 .m(m)
24215 .n(n)
24216 .k(k)
24217 .iterations(1)
24218 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24219 }
24220 }
24221 }
24222 }
24223
24224 TEST(F32_IGEMM_1X8S4__NEONFMA, k_div_4) {
24225 TEST_REQUIRES_ARM_NEON_FMA;
24226 for (size_t k = 8; k <= 40; k += 4) {
24227 GemmMicrokernelTester()
24228 .mr(1)
24229 .nr(8)
24230 .kr(1)
24231 .sr(4)
24232 .m(1)
24233 .n(8)
24234 .k(k)
24235 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24236 }
24237 }
24238
24239 TEST(F32_IGEMM_1X8S4__NEONFMA, k_div_4_subtile) {
24240 TEST_REQUIRES_ARM_NEON_FMA;
24241 for (size_t k = 8; k <= 40; k += 4) {
24242 for (uint32_t m = 1; m <= 1; m++) {
24243 for (uint32_t n = 1; n <= 8; n++) {
24244 GemmMicrokernelTester()
24245 .mr(1)
24246 .nr(8)
24247 .kr(1)
24248 .sr(4)
24249 .m(m)
24250 .n(n)
24251 .k(k)
24252 .iterations(1)
24253 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24254 }
24255 }
24256 }
24257 }
24258
24259 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8) {
24260 TEST_REQUIRES_ARM_NEON_FMA;
24261 for (uint32_t n = 9; n < 16; n++) {
24262 for (size_t k = 1; k <= 20; k += 5) {
24263 GemmMicrokernelTester()
24264 .mr(1)
24265 .nr(8)
24266 .kr(1)
24267 .sr(4)
24268 .m(1)
24269 .n(8)
24270 .k(k)
24271 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24272 }
24273 }
24274 }
24275
24276 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8_strided_cn) {
24277 TEST_REQUIRES_ARM_NEON_FMA;
24278 for (uint32_t n = 9; n < 16; n++) {
24279 for (size_t k = 1; k <= 20; k += 5) {
24280 GemmMicrokernelTester()
24281 .mr(1)
24282 .nr(8)
24283 .kr(1)
24284 .sr(4)
24285 .m(1)
24286 .n(8)
24287 .k(k)
24288 .cn_stride(11)
24289 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24290 }
24291 }
24292 }
24293
24294 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8_subtile) {
24295 TEST_REQUIRES_ARM_NEON_FMA;
24296 for (uint32_t n = 9; n < 16; n++) {
24297 for (size_t k = 1; k <= 20; k += 5) {
24298 for (uint32_t m = 1; m <= 1; m++) {
24299 GemmMicrokernelTester()
24300 .mr(1)
24301 .nr(8)
24302 .kr(1)
24303 .sr(4)
24304 .m(m)
24305 .n(n)
24306 .k(k)
24307 .iterations(1)
24308 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24309 }
24310 }
24311 }
24312 }
24313
24314 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8) {
24315 TEST_REQUIRES_ARM_NEON_FMA;
24316 for (uint32_t n = 16; n <= 24; n += 8) {
24317 for (size_t k = 1; k <= 20; k += 5) {
24318 GemmMicrokernelTester()
24319 .mr(1)
24320 .nr(8)
24321 .kr(1)
24322 .sr(4)
24323 .m(1)
24324 .n(8)
24325 .k(k)
24326 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24327 }
24328 }
24329 }
24330
24331 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8_strided_cn) {
24332 TEST_REQUIRES_ARM_NEON_FMA;
24333 for (uint32_t n = 16; n <= 24; n += 8) {
24334 for (size_t k = 1; k <= 20; k += 5) {
24335 GemmMicrokernelTester()
24336 .mr(1)
24337 .nr(8)
24338 .kr(1)
24339 .sr(4)
24340 .m(1)
24341 .n(n)
24342 .k(k)
24343 .cn_stride(11)
24344 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24345 }
24346 }
24347 }
24348
24349 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8_subtile) {
24350 TEST_REQUIRES_ARM_NEON_FMA;
24351 for (uint32_t n = 16; n <= 24; n += 8) {
24352 for (size_t k = 1; k <= 20; k += 5) {
24353 for (uint32_t m = 1; m <= 1; m++) {
24354 GemmMicrokernelTester()
24355 .mr(1)
24356 .nr(8)
24357 .kr(1)
24358 .sr(4)
24359 .m(m)
24360 .n(n)
24361 .k(k)
24362 .iterations(1)
24363 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24364 }
24365 }
24366 }
24367 }
24368
24369 TEST(F32_IGEMM_1X8S4__NEONFMA, small_kernel) {
24370 TEST_REQUIRES_ARM_NEON_FMA;
24371 for (size_t k = 1; k <= 20; k += 5) {
24372 GemmMicrokernelTester()
24373 .mr(1)
24374 .nr(8)
24375 .kr(1)
24376 .sr(4)
24377 .m(1)
24378 .n(8)
24379 .k(k)
24380 .ks(3)
24381 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24382 }
24383 }
24384
24385 TEST(F32_IGEMM_1X8S4__NEONFMA, small_kernel_subtile) {
24386 TEST_REQUIRES_ARM_NEON_FMA;
24387 for (size_t k = 1; k <= 20; k += 5) {
24388 for (uint32_t m = 1; m <= 1; m++) {
24389 for (uint32_t n = 1; n <= 8; n++) {
24390 GemmMicrokernelTester()
24391 .mr(1)
24392 .nr(8)
24393 .kr(1)
24394 .sr(4)
24395 .m(m)
24396 .n(n)
24397 .k(k)
24398 .ks(3)
24399 .iterations(1)
24400 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24401 }
24402 }
24403 }
24404 }
24405
24406 TEST(F32_IGEMM_1X8S4__NEONFMA, n_gt_8_small_kernel) {
24407 TEST_REQUIRES_ARM_NEON_FMA;
24408 for (uint32_t n = 9; n < 16; n++) {
24409 for (size_t k = 1; k <= 20; k += 5) {
24410 GemmMicrokernelTester()
24411 .mr(1)
24412 .nr(8)
24413 .kr(1)
24414 .sr(4)
24415 .m(1)
24416 .n(8)
24417 .k(k)
24418 .ks(3)
24419 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24420 }
24421 }
24422 }
24423
24424 TEST(F32_IGEMM_1X8S4__NEONFMA, n_div_8_small_kernel) {
24425 TEST_REQUIRES_ARM_NEON_FMA;
24426 for (uint32_t n = 16; n <= 24; n += 8) {
24427 for (size_t k = 1; k <= 20; k += 5) {
24428 GemmMicrokernelTester()
24429 .mr(1)
24430 .nr(8)
24431 .kr(1)
24432 .sr(4)
24433 .m(1)
24434 .n(8)
24435 .k(k)
24436 .ks(3)
24437 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24438 }
24439 }
24440 }
24441
24442 TEST(F32_IGEMM_1X8S4__NEONFMA, strided_cm_subtile) {
24443 TEST_REQUIRES_ARM_NEON_FMA;
24444 for (size_t k = 1; k <= 20; k += 5) {
24445 for (uint32_t m = 1; m <= 1; m++) {
24446 for (uint32_t n = 1; n <= 8; n++) {
24447 GemmMicrokernelTester()
24448 .mr(1)
24449 .nr(8)
24450 .kr(1)
24451 .sr(4)
24452 .m(m)
24453 .n(n)
24454 .k(k)
24455 .cm_stride(11)
24456 .iterations(1)
24457 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24458 }
24459 }
24460 }
24461 }
24462
24463 TEST(F32_IGEMM_1X8S4__NEONFMA, a_offset) {
24464 TEST_REQUIRES_ARM_NEON_FMA;
24465 for (size_t k = 1; k <= 20; k += 5) {
24466 GemmMicrokernelTester()
24467 .mr(1)
24468 .nr(8)
24469 .kr(1)
24470 .sr(4)
24471 .m(1)
24472 .n(8)
24473 .k(k)
24474 .ks(3)
24475 .a_offset(23)
24476 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24477 }
24478 }
24479
24480 TEST(F32_IGEMM_1X8S4__NEONFMA, zero) {
24481 TEST_REQUIRES_ARM_NEON_FMA;
24482 for (uint32_t mz = 0; mz < 1; mz++) {
24483 for (size_t k = 1; k <= 20; k += 5) {
24484 GemmMicrokernelTester()
24485 .mr(1)
24486 .nr(8)
24487 .kr(1)
24488 .sr(4)
24489 .m(1)
24490 .n(8)
24491 .k(k)
24492 .ks(3)
24493 .a_offset(23)
24494 .zero_index(mz)
24495 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24496 }
24497 }
24498 }
24499
24500 TEST(F32_IGEMM_1X8S4__NEONFMA, qmin) {
24501 TEST_REQUIRES_ARM_NEON_FMA;
24502 GemmMicrokernelTester()
24503 .mr(1)
24504 .nr(8)
24505 .kr(1)
24506 .sr(4)
24507 .m(1)
24508 .n(8)
24509 .k(4)
24510 .qmin(128)
24511 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24512 }
24513
24514 TEST(F32_IGEMM_1X8S4__NEONFMA, qmax) {
24515 TEST_REQUIRES_ARM_NEON_FMA;
24516 GemmMicrokernelTester()
24517 .mr(1)
24518 .nr(8)
24519 .kr(1)
24520 .sr(4)
24521 .m(1)
24522 .n(8)
24523 .k(4)
24524 .qmax(128)
24525 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24526 }
24527
24528 TEST(F32_IGEMM_1X8S4__NEONFMA, strided_cm) {
24529 TEST_REQUIRES_ARM_NEON_FMA;
24530 GemmMicrokernelTester()
24531 .mr(1)
24532 .nr(8)
24533 .kr(1)
24534 .sr(4)
24535 .m(1)
24536 .n(8)
24537 .k(4)
24538 .cm_stride(11)
24539 .Test(xnn_f32_igemm_ukernel_1x8s4__neonfma);
24540 }
24541#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24542
24543
24544#if XNN_ARCH_ARM || XNN_ARCH_ARM64
24545 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4) {
24546 TEST_REQUIRES_ARM_NEON_FMA;
24547 GemmMicrokernelTester()
24548 .mr(4)
24549 .nr(8)
24550 .kr(1)
24551 .sr(4)
24552 .m(4)
24553 .n(8)
24554 .k(4)
24555 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24556 }
24557
24558 TEST(F32_IGEMM_4X8S4__NEONFMA, strided_cn) {
24559 TEST_REQUIRES_ARM_NEON_FMA;
24560 GemmMicrokernelTester()
24561 .mr(4)
24562 .nr(8)
24563 .kr(1)
24564 .sr(4)
24565 .m(4)
24566 .n(8)
24567 .k(4)
24568 .cn_stride(11)
24569 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24570 }
24571
24572 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4_subtile) {
24573 TEST_REQUIRES_ARM_NEON_FMA;
24574 for (uint32_t m = 1; m <= 4; m++) {
24575 for (uint32_t n = 1; n <= 8; n++) {
24576 GemmMicrokernelTester()
24577 .mr(4)
24578 .nr(8)
24579 .kr(1)
24580 .sr(4)
24581 .m(m)
24582 .n(n)
24583 .k(4)
24584 .iterations(1)
24585 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24586 }
24587 }
24588 }
24589
24590 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4_subtile_m) {
24591 TEST_REQUIRES_ARM_NEON_FMA;
24592 for (uint32_t m = 1; m <= 4; m++) {
24593 GemmMicrokernelTester()
24594 .mr(4)
24595 .nr(8)
24596 .kr(1)
24597 .sr(4)
24598 .m(m)
24599 .n(8)
24600 .k(4)
24601 .iterations(1)
24602 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24603 }
24604 }
24605
24606 TEST(F32_IGEMM_4X8S4__NEONFMA, k_eq_4_subtile_n) {
24607 TEST_REQUIRES_ARM_NEON_FMA;
24608 for (uint32_t n = 1; n <= 8; n++) {
24609 GemmMicrokernelTester()
24610 .mr(4)
24611 .nr(8)
24612 .kr(1)
24613 .sr(4)
24614 .m(4)
24615 .n(n)
24616 .k(4)
24617 .iterations(1)
24618 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24619 }
24620 }
24621
24622 TEST(F32_IGEMM_4X8S4__NEONFMA, k_lt_4) {
24623 TEST_REQUIRES_ARM_NEON_FMA;
24624 for (size_t k = 1; k < 4; k++) {
24625 GemmMicrokernelTester()
24626 .mr(4)
24627 .nr(8)
24628 .kr(1)
24629 .sr(4)
24630 .m(4)
24631 .n(8)
24632 .k(k)
24633 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24634 }
24635 }
24636
24637 TEST(F32_IGEMM_4X8S4__NEONFMA, k_lt_4_subtile) {
24638 TEST_REQUIRES_ARM_NEON_FMA;
24639 for (size_t k = 1; k < 4; k++) {
24640 for (uint32_t m = 1; m <= 4; m++) {
24641 for (uint32_t n = 1; n <= 8; n++) {
24642 GemmMicrokernelTester()
24643 .mr(4)
24644 .nr(8)
24645 .kr(1)
24646 .sr(4)
24647 .m(m)
24648 .n(n)
24649 .k(k)
24650 .iterations(1)
24651 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24652 }
24653 }
24654 }
24655 }
24656
24657 TEST(F32_IGEMM_4X8S4__NEONFMA, k_gt_4) {
24658 TEST_REQUIRES_ARM_NEON_FMA;
24659 for (size_t k = 5; k < 8; k++) {
24660 GemmMicrokernelTester()
24661 .mr(4)
24662 .nr(8)
24663 .kr(1)
24664 .sr(4)
24665 .m(4)
24666 .n(8)
24667 .k(k)
24668 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24669 }
24670 }
24671
24672 TEST(F32_IGEMM_4X8S4__NEONFMA, k_gt_4_subtile) {
24673 TEST_REQUIRES_ARM_NEON_FMA;
24674 for (size_t k = 5; k < 8; k++) {
24675 for (uint32_t m = 1; m <= 4; m++) {
24676 for (uint32_t n = 1; n <= 8; n++) {
24677 GemmMicrokernelTester()
24678 .mr(4)
24679 .nr(8)
24680 .kr(1)
24681 .sr(4)
24682 .m(m)
24683 .n(n)
24684 .k(k)
24685 .iterations(1)
24686 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24687 }
24688 }
24689 }
24690 }
24691
24692 TEST(F32_IGEMM_4X8S4__NEONFMA, k_div_4) {
24693 TEST_REQUIRES_ARM_NEON_FMA;
24694 for (size_t k = 8; k <= 40; k += 4) {
24695 GemmMicrokernelTester()
24696 .mr(4)
24697 .nr(8)
24698 .kr(1)
24699 .sr(4)
24700 .m(4)
24701 .n(8)
24702 .k(k)
24703 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24704 }
24705 }
24706
24707 TEST(F32_IGEMM_4X8S4__NEONFMA, k_div_4_subtile) {
24708 TEST_REQUIRES_ARM_NEON_FMA;
24709 for (size_t k = 8; k <= 40; k += 4) {
24710 for (uint32_t m = 1; m <= 4; m++) {
24711 for (uint32_t n = 1; n <= 8; n++) {
24712 GemmMicrokernelTester()
24713 .mr(4)
24714 .nr(8)
24715 .kr(1)
24716 .sr(4)
24717 .m(m)
24718 .n(n)
24719 .k(k)
24720 .iterations(1)
24721 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24722 }
24723 }
24724 }
24725 }
24726
24727 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8) {
24728 TEST_REQUIRES_ARM_NEON_FMA;
24729 for (uint32_t n = 9; n < 16; n++) {
24730 for (size_t k = 1; k <= 20; k += 5) {
24731 GemmMicrokernelTester()
24732 .mr(4)
24733 .nr(8)
24734 .kr(1)
24735 .sr(4)
24736 .m(4)
24737 .n(8)
24738 .k(k)
24739 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24740 }
24741 }
24742 }
24743
24744 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8_strided_cn) {
24745 TEST_REQUIRES_ARM_NEON_FMA;
24746 for (uint32_t n = 9; n < 16; n++) {
24747 for (size_t k = 1; k <= 20; k += 5) {
24748 GemmMicrokernelTester()
24749 .mr(4)
24750 .nr(8)
24751 .kr(1)
24752 .sr(4)
24753 .m(4)
24754 .n(8)
24755 .k(k)
24756 .cn_stride(11)
24757 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24758 }
24759 }
24760 }
24761
24762 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8_subtile) {
24763 TEST_REQUIRES_ARM_NEON_FMA;
24764 for (uint32_t n = 9; n < 16; n++) {
24765 for (size_t k = 1; k <= 20; k += 5) {
24766 for (uint32_t m = 1; m <= 4; m++) {
24767 GemmMicrokernelTester()
24768 .mr(4)
24769 .nr(8)
24770 .kr(1)
24771 .sr(4)
24772 .m(m)
24773 .n(n)
24774 .k(k)
24775 .iterations(1)
24776 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24777 }
24778 }
24779 }
24780 }
24781
24782 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8) {
24783 TEST_REQUIRES_ARM_NEON_FMA;
24784 for (uint32_t n = 16; n <= 24; n += 8) {
24785 for (size_t k = 1; k <= 20; k += 5) {
24786 GemmMicrokernelTester()
24787 .mr(4)
24788 .nr(8)
24789 .kr(1)
24790 .sr(4)
24791 .m(4)
24792 .n(8)
24793 .k(k)
24794 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24795 }
24796 }
24797 }
24798
24799 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8_strided_cn) {
24800 TEST_REQUIRES_ARM_NEON_FMA;
24801 for (uint32_t n = 16; n <= 24; n += 8) {
24802 for (size_t k = 1; k <= 20; k += 5) {
24803 GemmMicrokernelTester()
24804 .mr(4)
24805 .nr(8)
24806 .kr(1)
24807 .sr(4)
24808 .m(4)
24809 .n(n)
24810 .k(k)
24811 .cn_stride(11)
24812 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24813 }
24814 }
24815 }
24816
24817 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8_subtile) {
24818 TEST_REQUIRES_ARM_NEON_FMA;
24819 for (uint32_t n = 16; n <= 24; n += 8) {
24820 for (size_t k = 1; k <= 20; k += 5) {
24821 for (uint32_t m = 1; m <= 4; m++) {
24822 GemmMicrokernelTester()
24823 .mr(4)
24824 .nr(8)
24825 .kr(1)
24826 .sr(4)
24827 .m(m)
24828 .n(n)
24829 .k(k)
24830 .iterations(1)
24831 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24832 }
24833 }
24834 }
24835 }
24836
24837 TEST(F32_IGEMM_4X8S4__NEONFMA, small_kernel) {
24838 TEST_REQUIRES_ARM_NEON_FMA;
24839 for (size_t k = 1; k <= 20; k += 5) {
24840 GemmMicrokernelTester()
24841 .mr(4)
24842 .nr(8)
24843 .kr(1)
24844 .sr(4)
24845 .m(4)
24846 .n(8)
24847 .k(k)
24848 .ks(3)
24849 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24850 }
24851 }
24852
24853 TEST(F32_IGEMM_4X8S4__NEONFMA, small_kernel_subtile) {
24854 TEST_REQUIRES_ARM_NEON_FMA;
24855 for (size_t k = 1; k <= 20; k += 5) {
24856 for (uint32_t m = 1; m <= 4; m++) {
24857 for (uint32_t n = 1; n <= 8; n++) {
24858 GemmMicrokernelTester()
24859 .mr(4)
24860 .nr(8)
24861 .kr(1)
24862 .sr(4)
24863 .m(m)
24864 .n(n)
24865 .k(k)
24866 .ks(3)
24867 .iterations(1)
24868 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24869 }
24870 }
24871 }
24872 }
24873
24874 TEST(F32_IGEMM_4X8S4__NEONFMA, n_gt_8_small_kernel) {
24875 TEST_REQUIRES_ARM_NEON_FMA;
24876 for (uint32_t n = 9; n < 16; n++) {
24877 for (size_t k = 1; k <= 20; k += 5) {
24878 GemmMicrokernelTester()
24879 .mr(4)
24880 .nr(8)
24881 .kr(1)
24882 .sr(4)
24883 .m(4)
24884 .n(8)
24885 .k(k)
24886 .ks(3)
24887 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24888 }
24889 }
24890 }
24891
24892 TEST(F32_IGEMM_4X8S4__NEONFMA, n_div_8_small_kernel) {
24893 TEST_REQUIRES_ARM_NEON_FMA;
24894 for (uint32_t n = 16; n <= 24; n += 8) {
24895 for (size_t k = 1; k <= 20; k += 5) {
24896 GemmMicrokernelTester()
24897 .mr(4)
24898 .nr(8)
24899 .kr(1)
24900 .sr(4)
24901 .m(4)
24902 .n(8)
24903 .k(k)
24904 .ks(3)
24905 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24906 }
24907 }
24908 }
24909
24910 TEST(F32_IGEMM_4X8S4__NEONFMA, strided_cm_subtile) {
24911 TEST_REQUIRES_ARM_NEON_FMA;
24912 for (size_t k = 1; k <= 20; k += 5) {
24913 for (uint32_t m = 1; m <= 4; m++) {
24914 for (uint32_t n = 1; n <= 8; n++) {
24915 GemmMicrokernelTester()
24916 .mr(4)
24917 .nr(8)
24918 .kr(1)
24919 .sr(4)
24920 .m(m)
24921 .n(n)
24922 .k(k)
24923 .cm_stride(11)
24924 .iterations(1)
24925 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24926 }
24927 }
24928 }
24929 }
24930
24931 TEST(F32_IGEMM_4X8S4__NEONFMA, a_offset) {
24932 TEST_REQUIRES_ARM_NEON_FMA;
24933 for (size_t k = 1; k <= 20; k += 5) {
24934 GemmMicrokernelTester()
24935 .mr(4)
24936 .nr(8)
24937 .kr(1)
24938 .sr(4)
24939 .m(4)
24940 .n(8)
24941 .k(k)
24942 .ks(3)
24943 .a_offset(83)
24944 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24945 }
24946 }
24947
24948 TEST(F32_IGEMM_4X8S4__NEONFMA, zero) {
24949 TEST_REQUIRES_ARM_NEON_FMA;
24950 for (uint32_t mz = 0; mz < 4; mz++) {
24951 for (size_t k = 1; k <= 20; k += 5) {
24952 GemmMicrokernelTester()
24953 .mr(4)
24954 .nr(8)
24955 .kr(1)
24956 .sr(4)
24957 .m(4)
24958 .n(8)
24959 .k(k)
24960 .ks(3)
24961 .a_offset(83)
24962 .zero_index(mz)
24963 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24964 }
24965 }
24966 }
24967
24968 TEST(F32_IGEMM_4X8S4__NEONFMA, qmin) {
24969 TEST_REQUIRES_ARM_NEON_FMA;
24970 GemmMicrokernelTester()
24971 .mr(4)
24972 .nr(8)
24973 .kr(1)
24974 .sr(4)
24975 .m(4)
24976 .n(8)
24977 .k(4)
24978 .qmin(128)
24979 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24980 }
24981
24982 TEST(F32_IGEMM_4X8S4__NEONFMA, qmax) {
24983 TEST_REQUIRES_ARM_NEON_FMA;
24984 GemmMicrokernelTester()
24985 .mr(4)
24986 .nr(8)
24987 .kr(1)
24988 .sr(4)
24989 .m(4)
24990 .n(8)
24991 .k(4)
24992 .qmax(128)
24993 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
24994 }
24995
24996 TEST(F32_IGEMM_4X8S4__NEONFMA, strided_cm) {
24997 TEST_REQUIRES_ARM_NEON_FMA;
24998 GemmMicrokernelTester()
24999 .mr(4)
25000 .nr(8)
25001 .kr(1)
25002 .sr(4)
25003 .m(4)
25004 .n(8)
25005 .k(4)
25006 .cm_stride(11)
25007 .Test(xnn_f32_igemm_ukernel_4x8s4__neonfma);
25008 }
25009#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
25010
25011
25012#if XNN_ARCH_ARM || XNN_ARCH_ARM64
25013 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4) {
25014 TEST_REQUIRES_ARM_NEON_FMA;
25015 GemmMicrokernelTester()
25016 .mr(6)
25017 .nr(8)
25018 .kr(1)
25019 .sr(4)
25020 .m(6)
25021 .n(8)
25022 .k(4)
25023 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25024 }
25025
25026 TEST(F32_IGEMM_6X8S4__NEONFMA, strided_cn) {
25027 TEST_REQUIRES_ARM_NEON_FMA;
25028 GemmMicrokernelTester()
25029 .mr(6)
25030 .nr(8)
25031 .kr(1)
25032 .sr(4)
25033 .m(6)
25034 .n(8)
25035 .k(4)
25036 .cn_stride(11)
25037 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25038 }
25039
25040 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4_subtile) {
25041 TEST_REQUIRES_ARM_NEON_FMA;
25042 for (uint32_t m = 1; m <= 6; m++) {
25043 for (uint32_t n = 1; n <= 8; n++) {
25044 GemmMicrokernelTester()
25045 .mr(6)
25046 .nr(8)
25047 .kr(1)
25048 .sr(4)
25049 .m(m)
25050 .n(n)
25051 .k(4)
25052 .iterations(1)
25053 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25054 }
25055 }
25056 }
25057
25058 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4_subtile_m) {
25059 TEST_REQUIRES_ARM_NEON_FMA;
25060 for (uint32_t m = 1; m <= 6; m++) {
25061 GemmMicrokernelTester()
25062 .mr(6)
25063 .nr(8)
25064 .kr(1)
25065 .sr(4)
25066 .m(m)
25067 .n(8)
25068 .k(4)
25069 .iterations(1)
25070 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25071 }
25072 }
25073
25074 TEST(F32_IGEMM_6X8S4__NEONFMA, k_eq_4_subtile_n) {
25075 TEST_REQUIRES_ARM_NEON_FMA;
25076 for (uint32_t n = 1; n <= 8; n++) {
25077 GemmMicrokernelTester()
25078 .mr(6)
25079 .nr(8)
25080 .kr(1)
25081 .sr(4)
25082 .m(6)
25083 .n(n)
25084 .k(4)
25085 .iterations(1)
25086 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25087 }
25088 }
25089
25090 TEST(F32_IGEMM_6X8S4__NEONFMA, k_lt_4) {
25091 TEST_REQUIRES_ARM_NEON_FMA;
25092 for (size_t k = 1; k < 4; k++) {
25093 GemmMicrokernelTester()
25094 .mr(6)
25095 .nr(8)
25096 .kr(1)
25097 .sr(4)
25098 .m(6)
25099 .n(8)
25100 .k(k)
25101 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25102 }
25103 }
25104
25105 TEST(F32_IGEMM_6X8S4__NEONFMA, k_lt_4_subtile) {
25106 TEST_REQUIRES_ARM_NEON_FMA;
25107 for (size_t k = 1; k < 4; k++) {
25108 for (uint32_t m = 1; m <= 6; m++) {
25109 for (uint32_t n = 1; n <= 8; n++) {
25110 GemmMicrokernelTester()
25111 .mr(6)
25112 .nr(8)
25113 .kr(1)
25114 .sr(4)
25115 .m(m)
25116 .n(n)
25117 .k(k)
25118 .iterations(1)
25119 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25120 }
25121 }
25122 }
25123 }
25124
25125 TEST(F32_IGEMM_6X8S4__NEONFMA, k_gt_4) {
25126 TEST_REQUIRES_ARM_NEON_FMA;
25127 for (size_t k = 5; k < 8; k++) {
25128 GemmMicrokernelTester()
25129 .mr(6)
25130 .nr(8)
25131 .kr(1)
25132 .sr(4)
25133 .m(6)
25134 .n(8)
25135 .k(k)
25136 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25137 }
25138 }
25139
25140 TEST(F32_IGEMM_6X8S4__NEONFMA, k_gt_4_subtile) {
25141 TEST_REQUIRES_ARM_NEON_FMA;
25142 for (size_t k = 5; k < 8; k++) {
25143 for (uint32_t m = 1; m <= 6; m++) {
25144 for (uint32_t n = 1; n <= 8; n++) {
25145 GemmMicrokernelTester()
25146 .mr(6)
25147 .nr(8)
25148 .kr(1)
25149 .sr(4)
25150 .m(m)
25151 .n(n)
25152 .k(k)
25153 .iterations(1)
25154 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25155 }
25156 }
25157 }
25158 }
25159
25160 TEST(F32_IGEMM_6X8S4__NEONFMA, k_div_4) {
25161 TEST_REQUIRES_ARM_NEON_FMA;
25162 for (size_t k = 8; k <= 40; k += 4) {
25163 GemmMicrokernelTester()
25164 .mr(6)
25165 .nr(8)
25166 .kr(1)
25167 .sr(4)
25168 .m(6)
25169 .n(8)
25170 .k(k)
25171 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25172 }
25173 }
25174
25175 TEST(F32_IGEMM_6X8S4__NEONFMA, k_div_4_subtile) {
25176 TEST_REQUIRES_ARM_NEON_FMA;
25177 for (size_t k = 8; k <= 40; k += 4) {
25178 for (uint32_t m = 1; m <= 6; m++) {
25179 for (uint32_t n = 1; n <= 8; n++) {
25180 GemmMicrokernelTester()
25181 .mr(6)
25182 .nr(8)
25183 .kr(1)
25184 .sr(4)
25185 .m(m)
25186 .n(n)
25187 .k(k)
25188 .iterations(1)
25189 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25190 }
25191 }
25192 }
25193 }
25194
25195 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8) {
25196 TEST_REQUIRES_ARM_NEON_FMA;
25197 for (uint32_t n = 9; n < 16; n++) {
25198 for (size_t k = 1; k <= 20; k += 5) {
25199 GemmMicrokernelTester()
25200 .mr(6)
25201 .nr(8)
25202 .kr(1)
25203 .sr(4)
25204 .m(6)
25205 .n(8)
25206 .k(k)
25207 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25208 }
25209 }
25210 }
25211
25212 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8_strided_cn) {
25213 TEST_REQUIRES_ARM_NEON_FMA;
25214 for (uint32_t n = 9; n < 16; n++) {
25215 for (size_t k = 1; k <= 20; k += 5) {
25216 GemmMicrokernelTester()
25217 .mr(6)
25218 .nr(8)
25219 .kr(1)
25220 .sr(4)
25221 .m(6)
25222 .n(8)
25223 .k(k)
25224 .cn_stride(11)
25225 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25226 }
25227 }
25228 }
25229
25230 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8_subtile) {
25231 TEST_REQUIRES_ARM_NEON_FMA;
25232 for (uint32_t n = 9; n < 16; n++) {
25233 for (size_t k = 1; k <= 20; k += 5) {
25234 for (uint32_t m = 1; m <= 6; m++) {
25235 GemmMicrokernelTester()
25236 .mr(6)
25237 .nr(8)
25238 .kr(1)
25239 .sr(4)
25240 .m(m)
25241 .n(n)
25242 .k(k)
25243 .iterations(1)
25244 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25245 }
25246 }
25247 }
25248 }
25249
25250 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8) {
25251 TEST_REQUIRES_ARM_NEON_FMA;
25252 for (uint32_t n = 16; n <= 24; n += 8) {
25253 for (size_t k = 1; k <= 20; k += 5) {
25254 GemmMicrokernelTester()
25255 .mr(6)
25256 .nr(8)
25257 .kr(1)
25258 .sr(4)
25259 .m(6)
25260 .n(8)
25261 .k(k)
25262 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25263 }
25264 }
25265 }
25266
25267 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8_strided_cn) {
25268 TEST_REQUIRES_ARM_NEON_FMA;
25269 for (uint32_t n = 16; n <= 24; n += 8) {
25270 for (size_t k = 1; k <= 20; k += 5) {
25271 GemmMicrokernelTester()
25272 .mr(6)
25273 .nr(8)
25274 .kr(1)
25275 .sr(4)
25276 .m(6)
25277 .n(n)
25278 .k(k)
25279 .cn_stride(11)
25280 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25281 }
25282 }
25283 }
25284
25285 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8_subtile) {
25286 TEST_REQUIRES_ARM_NEON_FMA;
25287 for (uint32_t n = 16; n <= 24; n += 8) {
25288 for (size_t k = 1; k <= 20; k += 5) {
25289 for (uint32_t m = 1; m <= 6; m++) {
25290 GemmMicrokernelTester()
25291 .mr(6)
25292 .nr(8)
25293 .kr(1)
25294 .sr(4)
25295 .m(m)
25296 .n(n)
25297 .k(k)
25298 .iterations(1)
25299 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25300 }
25301 }
25302 }
25303 }
25304
25305 TEST(F32_IGEMM_6X8S4__NEONFMA, small_kernel) {
25306 TEST_REQUIRES_ARM_NEON_FMA;
25307 for (size_t k = 1; k <= 20; k += 5) {
25308 GemmMicrokernelTester()
25309 .mr(6)
25310 .nr(8)
25311 .kr(1)
25312 .sr(4)
25313 .m(6)
25314 .n(8)
25315 .k(k)
25316 .ks(3)
25317 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25318 }
25319 }
25320
25321 TEST(F32_IGEMM_6X8S4__NEONFMA, small_kernel_subtile) {
25322 TEST_REQUIRES_ARM_NEON_FMA;
25323 for (size_t k = 1; k <= 20; k += 5) {
25324 for (uint32_t m = 1; m <= 6; m++) {
25325 for (uint32_t n = 1; n <= 8; n++) {
25326 GemmMicrokernelTester()
25327 .mr(6)
25328 .nr(8)
25329 .kr(1)
25330 .sr(4)
25331 .m(m)
25332 .n(n)
25333 .k(k)
25334 .ks(3)
25335 .iterations(1)
25336 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25337 }
25338 }
25339 }
25340 }
25341
25342 TEST(F32_IGEMM_6X8S4__NEONFMA, n_gt_8_small_kernel) {
25343 TEST_REQUIRES_ARM_NEON_FMA;
25344 for (uint32_t n = 9; n < 16; n++) {
25345 for (size_t k = 1; k <= 20; k += 5) {
25346 GemmMicrokernelTester()
25347 .mr(6)
25348 .nr(8)
25349 .kr(1)
25350 .sr(4)
25351 .m(6)
25352 .n(8)
25353 .k(k)
25354 .ks(3)
25355 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25356 }
25357 }
25358 }
25359
25360 TEST(F32_IGEMM_6X8S4__NEONFMA, n_div_8_small_kernel) {
25361 TEST_REQUIRES_ARM_NEON_FMA;
25362 for (uint32_t n = 16; n <= 24; n += 8) {
25363 for (size_t k = 1; k <= 20; k += 5) {
25364 GemmMicrokernelTester()
25365 .mr(6)
25366 .nr(8)
25367 .kr(1)
25368 .sr(4)
25369 .m(6)
25370 .n(8)
25371 .k(k)
25372 .ks(3)
25373 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25374 }
25375 }
25376 }
25377
25378 TEST(F32_IGEMM_6X8S4__NEONFMA, strided_cm_subtile) {
25379 TEST_REQUIRES_ARM_NEON_FMA;
25380 for (size_t k = 1; k <= 20; k += 5) {
25381 for (uint32_t m = 1; m <= 6; m++) {
25382 for (uint32_t n = 1; n <= 8; n++) {
25383 GemmMicrokernelTester()
25384 .mr(6)
25385 .nr(8)
25386 .kr(1)
25387 .sr(4)
25388 .m(m)
25389 .n(n)
25390 .k(k)
25391 .cm_stride(11)
25392 .iterations(1)
25393 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25394 }
25395 }
25396 }
25397 }
25398
25399 TEST(F32_IGEMM_6X8S4__NEONFMA, a_offset) {
25400 TEST_REQUIRES_ARM_NEON_FMA;
25401 for (size_t k = 1; k <= 20; k += 5) {
25402 GemmMicrokernelTester()
25403 .mr(6)
25404 .nr(8)
25405 .kr(1)
25406 .sr(4)
25407 .m(6)
25408 .n(8)
25409 .k(k)
25410 .ks(3)
25411 .a_offset(127)
25412 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25413 }
25414 }
25415
25416 TEST(F32_IGEMM_6X8S4__NEONFMA, zero) {
25417 TEST_REQUIRES_ARM_NEON_FMA;
25418 for (uint32_t mz = 0; mz < 6; mz++) {
25419 for (size_t k = 1; k <= 20; k += 5) {
25420 GemmMicrokernelTester()
25421 .mr(6)
25422 .nr(8)
25423 .kr(1)
25424 .sr(4)
25425 .m(6)
25426 .n(8)
25427 .k(k)
25428 .ks(3)
25429 .a_offset(127)
25430 .zero_index(mz)
25431 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25432 }
25433 }
25434 }
25435
25436 TEST(F32_IGEMM_6X8S4__NEONFMA, qmin) {
25437 TEST_REQUIRES_ARM_NEON_FMA;
25438 GemmMicrokernelTester()
25439 .mr(6)
25440 .nr(8)
25441 .kr(1)
25442 .sr(4)
25443 .m(6)
25444 .n(8)
25445 .k(4)
25446 .qmin(128)
25447 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25448 }
25449
25450 TEST(F32_IGEMM_6X8S4__NEONFMA, qmax) {
25451 TEST_REQUIRES_ARM_NEON_FMA;
25452 GemmMicrokernelTester()
25453 .mr(6)
25454 .nr(8)
25455 .kr(1)
25456 .sr(4)
25457 .m(6)
25458 .n(8)
25459 .k(4)
25460 .qmax(128)
25461 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25462 }
25463
25464 TEST(F32_IGEMM_6X8S4__NEONFMA, strided_cm) {
25465 TEST_REQUIRES_ARM_NEON_FMA;
25466 GemmMicrokernelTester()
25467 .mr(6)
25468 .nr(8)
25469 .kr(1)
25470 .sr(4)
25471 .m(6)
25472 .n(8)
25473 .k(4)
25474 .cm_stride(11)
25475 .Test(xnn_f32_igemm_ukernel_6x8s4__neonfma);
25476 }
25477#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
25478
25479
25480#if XNN_ARCH_ARM || XNN_ARCH_ARM64
25481 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4) {
25482 TEST_REQUIRES_ARM_NEON_FMA;
25483 GemmMicrokernelTester()
25484 .mr(8)
25485 .nr(8)
25486 .kr(1)
25487 .sr(4)
25488 .m(8)
25489 .n(8)
25490 .k(4)
25491 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25492 }
25493
25494 TEST(F32_IGEMM_8X8S4__NEONFMA, strided_cn) {
25495 TEST_REQUIRES_ARM_NEON_FMA;
25496 GemmMicrokernelTester()
25497 .mr(8)
25498 .nr(8)
25499 .kr(1)
25500 .sr(4)
25501 .m(8)
25502 .n(8)
25503 .k(4)
25504 .cn_stride(11)
25505 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25506 }
25507
25508 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4_subtile) {
25509 TEST_REQUIRES_ARM_NEON_FMA;
25510 for (uint32_t m = 1; m <= 8; m++) {
25511 for (uint32_t n = 1; n <= 8; n++) {
25512 GemmMicrokernelTester()
25513 .mr(8)
25514 .nr(8)
25515 .kr(1)
25516 .sr(4)
25517 .m(m)
25518 .n(n)
25519 .k(4)
25520 .iterations(1)
25521 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25522 }
25523 }
25524 }
25525
25526 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4_subtile_m) {
25527 TEST_REQUIRES_ARM_NEON_FMA;
25528 for (uint32_t m = 1; m <= 8; m++) {
25529 GemmMicrokernelTester()
25530 .mr(8)
25531 .nr(8)
25532 .kr(1)
25533 .sr(4)
25534 .m(m)
25535 .n(8)
25536 .k(4)
25537 .iterations(1)
25538 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25539 }
25540 }
25541
25542 TEST(F32_IGEMM_8X8S4__NEONFMA, k_eq_4_subtile_n) {
25543 TEST_REQUIRES_ARM_NEON_FMA;
25544 for (uint32_t n = 1; n <= 8; n++) {
25545 GemmMicrokernelTester()
25546 .mr(8)
25547 .nr(8)
25548 .kr(1)
25549 .sr(4)
25550 .m(8)
25551 .n(n)
25552 .k(4)
25553 .iterations(1)
25554 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25555 }
25556 }
25557
25558 TEST(F32_IGEMM_8X8S4__NEONFMA, k_lt_4) {
25559 TEST_REQUIRES_ARM_NEON_FMA;
25560 for (size_t k = 1; k < 4; k++) {
25561 GemmMicrokernelTester()
25562 .mr(8)
25563 .nr(8)
25564 .kr(1)
25565 .sr(4)
25566 .m(8)
25567 .n(8)
25568 .k(k)
25569 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25570 }
25571 }
25572
25573 TEST(F32_IGEMM_8X8S4__NEONFMA, k_lt_4_subtile) {
25574 TEST_REQUIRES_ARM_NEON_FMA;
25575 for (size_t k = 1; k < 4; k++) {
25576 for (uint32_t m = 1; m <= 8; m++) {
25577 for (uint32_t n = 1; n <= 8; n++) {
25578 GemmMicrokernelTester()
25579 .mr(8)
25580 .nr(8)
25581 .kr(1)
25582 .sr(4)
25583 .m(m)
25584 .n(n)
25585 .k(k)
25586 .iterations(1)
25587 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25588 }
25589 }
25590 }
25591 }
25592
25593 TEST(F32_IGEMM_8X8S4__NEONFMA, k_gt_4) {
25594 TEST_REQUIRES_ARM_NEON_FMA;
25595 for (size_t k = 5; k < 8; k++) {
25596 GemmMicrokernelTester()
25597 .mr(8)
25598 .nr(8)
25599 .kr(1)
25600 .sr(4)
25601 .m(8)
25602 .n(8)
25603 .k(k)
25604 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25605 }
25606 }
25607
25608 TEST(F32_IGEMM_8X8S4__NEONFMA, k_gt_4_subtile) {
25609 TEST_REQUIRES_ARM_NEON_FMA;
25610 for (size_t k = 5; k < 8; k++) {
25611 for (uint32_t m = 1; m <= 8; m++) {
25612 for (uint32_t n = 1; n <= 8; n++) {
25613 GemmMicrokernelTester()
25614 .mr(8)
25615 .nr(8)
25616 .kr(1)
25617 .sr(4)
25618 .m(m)
25619 .n(n)
25620 .k(k)
25621 .iterations(1)
25622 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25623 }
25624 }
25625 }
25626 }
25627
25628 TEST(F32_IGEMM_8X8S4__NEONFMA, k_div_4) {
25629 TEST_REQUIRES_ARM_NEON_FMA;
25630 for (size_t k = 8; k <= 40; k += 4) {
25631 GemmMicrokernelTester()
25632 .mr(8)
25633 .nr(8)
25634 .kr(1)
25635 .sr(4)
25636 .m(8)
25637 .n(8)
25638 .k(k)
25639 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25640 }
25641 }
25642
25643 TEST(F32_IGEMM_8X8S4__NEONFMA, k_div_4_subtile) {
25644 TEST_REQUIRES_ARM_NEON_FMA;
25645 for (size_t k = 8; k <= 40; k += 4) {
25646 for (uint32_t m = 1; m <= 8; m++) {
25647 for (uint32_t n = 1; n <= 8; n++) {
25648 GemmMicrokernelTester()
25649 .mr(8)
25650 .nr(8)
25651 .kr(1)
25652 .sr(4)
25653 .m(m)
25654 .n(n)
25655 .k(k)
25656 .iterations(1)
25657 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25658 }
25659 }
25660 }
25661 }
25662
25663 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8) {
25664 TEST_REQUIRES_ARM_NEON_FMA;
25665 for (uint32_t n = 9; n < 16; n++) {
25666 for (size_t k = 1; k <= 20; k += 5) {
25667 GemmMicrokernelTester()
25668 .mr(8)
25669 .nr(8)
25670 .kr(1)
25671 .sr(4)
25672 .m(8)
25673 .n(8)
25674 .k(k)
25675 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25676 }
25677 }
25678 }
25679
25680 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8_strided_cn) {
25681 TEST_REQUIRES_ARM_NEON_FMA;
25682 for (uint32_t n = 9; n < 16; n++) {
25683 for (size_t k = 1; k <= 20; k += 5) {
25684 GemmMicrokernelTester()
25685 .mr(8)
25686 .nr(8)
25687 .kr(1)
25688 .sr(4)
25689 .m(8)
25690 .n(8)
25691 .k(k)
25692 .cn_stride(11)
25693 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25694 }
25695 }
25696 }
25697
25698 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8_subtile) {
25699 TEST_REQUIRES_ARM_NEON_FMA;
25700 for (uint32_t n = 9; n < 16; n++) {
25701 for (size_t k = 1; k <= 20; k += 5) {
25702 for (uint32_t m = 1; m <= 8; m++) {
25703 GemmMicrokernelTester()
25704 .mr(8)
25705 .nr(8)
25706 .kr(1)
25707 .sr(4)
25708 .m(m)
25709 .n(n)
25710 .k(k)
25711 .iterations(1)
25712 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25713 }
25714 }
25715 }
25716 }
25717
25718 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8) {
25719 TEST_REQUIRES_ARM_NEON_FMA;
25720 for (uint32_t n = 16; n <= 24; n += 8) {
25721 for (size_t k = 1; k <= 20; k += 5) {
25722 GemmMicrokernelTester()
25723 .mr(8)
25724 .nr(8)
25725 .kr(1)
25726 .sr(4)
25727 .m(8)
25728 .n(8)
25729 .k(k)
25730 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25731 }
25732 }
25733 }
25734
25735 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8_strided_cn) {
25736 TEST_REQUIRES_ARM_NEON_FMA;
25737 for (uint32_t n = 16; n <= 24; n += 8) {
25738 for (size_t k = 1; k <= 20; k += 5) {
25739 GemmMicrokernelTester()
25740 .mr(8)
25741 .nr(8)
25742 .kr(1)
25743 .sr(4)
25744 .m(8)
25745 .n(n)
25746 .k(k)
25747 .cn_stride(11)
25748 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25749 }
25750 }
25751 }
25752
25753 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8_subtile) {
25754 TEST_REQUIRES_ARM_NEON_FMA;
25755 for (uint32_t n = 16; n <= 24; n += 8) {
25756 for (size_t k = 1; k <= 20; k += 5) {
25757 for (uint32_t m = 1; m <= 8; m++) {
25758 GemmMicrokernelTester()
25759 .mr(8)
25760 .nr(8)
25761 .kr(1)
25762 .sr(4)
25763 .m(m)
25764 .n(n)
25765 .k(k)
25766 .iterations(1)
25767 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25768 }
25769 }
25770 }
25771 }
25772
25773 TEST(F32_IGEMM_8X8S4__NEONFMA, small_kernel) {
25774 TEST_REQUIRES_ARM_NEON_FMA;
25775 for (size_t k = 1; k <= 20; k += 5) {
25776 GemmMicrokernelTester()
25777 .mr(8)
25778 .nr(8)
25779 .kr(1)
25780 .sr(4)
25781 .m(8)
25782 .n(8)
25783 .k(k)
25784 .ks(3)
25785 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25786 }
25787 }
25788
25789 TEST(F32_IGEMM_8X8S4__NEONFMA, small_kernel_subtile) {
25790 TEST_REQUIRES_ARM_NEON_FMA;
25791 for (size_t k = 1; k <= 20; k += 5) {
25792 for (uint32_t m = 1; m <= 8; m++) {
25793 for (uint32_t n = 1; n <= 8; n++) {
25794 GemmMicrokernelTester()
25795 .mr(8)
25796 .nr(8)
25797 .kr(1)
25798 .sr(4)
25799 .m(m)
25800 .n(n)
25801 .k(k)
25802 .ks(3)
25803 .iterations(1)
25804 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25805 }
25806 }
25807 }
25808 }
25809
25810 TEST(F32_IGEMM_8X8S4__NEONFMA, n_gt_8_small_kernel) {
25811 TEST_REQUIRES_ARM_NEON_FMA;
25812 for (uint32_t n = 9; n < 16; n++) {
25813 for (size_t k = 1; k <= 20; k += 5) {
25814 GemmMicrokernelTester()
25815 .mr(8)
25816 .nr(8)
25817 .kr(1)
25818 .sr(4)
25819 .m(8)
25820 .n(8)
25821 .k(k)
25822 .ks(3)
25823 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25824 }
25825 }
25826 }
25827
25828 TEST(F32_IGEMM_8X8S4__NEONFMA, n_div_8_small_kernel) {
25829 TEST_REQUIRES_ARM_NEON_FMA;
25830 for (uint32_t n = 16; n <= 24; n += 8) {
25831 for (size_t k = 1; k <= 20; k += 5) {
25832 GemmMicrokernelTester()
25833 .mr(8)
25834 .nr(8)
25835 .kr(1)
25836 .sr(4)
25837 .m(8)
25838 .n(8)
25839 .k(k)
25840 .ks(3)
25841 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25842 }
25843 }
25844 }
25845
25846 TEST(F32_IGEMM_8X8S4__NEONFMA, strided_cm_subtile) {
25847 TEST_REQUIRES_ARM_NEON_FMA;
25848 for (size_t k = 1; k <= 20; k += 5) {
25849 for (uint32_t m = 1; m <= 8; m++) {
25850 for (uint32_t n = 1; n <= 8; n++) {
25851 GemmMicrokernelTester()
25852 .mr(8)
25853 .nr(8)
25854 .kr(1)
25855 .sr(4)
25856 .m(m)
25857 .n(n)
25858 .k(k)
25859 .cm_stride(11)
25860 .iterations(1)
25861 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25862 }
25863 }
25864 }
25865 }
25866
25867 TEST(F32_IGEMM_8X8S4__NEONFMA, a_offset) {
25868 TEST_REQUIRES_ARM_NEON_FMA;
25869 for (size_t k = 1; k <= 20; k += 5) {
25870 GemmMicrokernelTester()
25871 .mr(8)
25872 .nr(8)
25873 .kr(1)
25874 .sr(4)
25875 .m(8)
25876 .n(8)
25877 .k(k)
25878 .ks(3)
25879 .a_offset(163)
25880 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25881 }
25882 }
25883
25884 TEST(F32_IGEMM_8X8S4__NEONFMA, zero) {
25885 TEST_REQUIRES_ARM_NEON_FMA;
25886 for (uint32_t mz = 0; mz < 8; mz++) {
25887 for (size_t k = 1; k <= 20; k += 5) {
25888 GemmMicrokernelTester()
25889 .mr(8)
25890 .nr(8)
25891 .kr(1)
25892 .sr(4)
25893 .m(8)
25894 .n(8)
25895 .k(k)
25896 .ks(3)
25897 .a_offset(163)
25898 .zero_index(mz)
25899 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25900 }
25901 }
25902 }
25903
25904 TEST(F32_IGEMM_8X8S4__NEONFMA, qmin) {
25905 TEST_REQUIRES_ARM_NEON_FMA;
25906 GemmMicrokernelTester()
25907 .mr(8)
25908 .nr(8)
25909 .kr(1)
25910 .sr(4)
25911 .m(8)
25912 .n(8)
25913 .k(4)
25914 .qmin(128)
25915 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25916 }
25917
25918 TEST(F32_IGEMM_8X8S4__NEONFMA, qmax) {
25919 TEST_REQUIRES_ARM_NEON_FMA;
25920 GemmMicrokernelTester()
25921 .mr(8)
25922 .nr(8)
25923 .kr(1)
25924 .sr(4)
25925 .m(8)
25926 .n(8)
25927 .k(4)
25928 .qmax(128)
25929 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25930 }
25931
25932 TEST(F32_IGEMM_8X8S4__NEONFMA, strided_cm) {
25933 TEST_REQUIRES_ARM_NEON_FMA;
25934 GemmMicrokernelTester()
25935 .mr(8)
25936 .nr(8)
25937 .kr(1)
25938 .sr(4)
25939 .m(8)
25940 .n(8)
25941 .k(4)
25942 .cm_stride(11)
25943 .Test(xnn_f32_igemm_ukernel_8x8s4__neonfma);
25944 }
25945#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
25946
25947
25948#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25949 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1) {
25950 TEST_REQUIRES_X86_SSE;
25951 GemmMicrokernelTester()
25952 .mr(1)
25953 .nr(8)
25954 .kr(1)
25955 .sr(1)
25956 .m(1)
25957 .n(8)
25958 .k(1)
25959 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
25960 }
25961
25962 TEST(F32_IGEMM_1X8__SSE_LOAD1, strided_cn) {
25963 TEST_REQUIRES_X86_SSE;
25964 GemmMicrokernelTester()
25965 .mr(1)
25966 .nr(8)
25967 .kr(1)
25968 .sr(1)
25969 .m(1)
25970 .n(8)
25971 .k(1)
25972 .cn_stride(11)
25973 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
25974 }
25975
25976 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1_subtile) {
25977 TEST_REQUIRES_X86_SSE;
25978 for (uint32_t m = 1; m <= 1; m++) {
25979 for (uint32_t n = 1; n <= 8; n++) {
25980 GemmMicrokernelTester()
25981 .mr(1)
25982 .nr(8)
25983 .kr(1)
25984 .sr(1)
25985 .m(m)
25986 .n(n)
25987 .k(1)
25988 .iterations(1)
25989 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
25990 }
25991 }
25992 }
25993
25994 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1_subtile_m) {
25995 TEST_REQUIRES_X86_SSE;
25996 for (uint32_t m = 1; m <= 1; m++) {
25997 GemmMicrokernelTester()
25998 .mr(1)
25999 .nr(8)
26000 .kr(1)
26001 .sr(1)
26002 .m(m)
26003 .n(8)
26004 .k(1)
26005 .iterations(1)
26006 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26007 }
26008 }
26009
26010 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_eq_1_subtile_n) {
26011 TEST_REQUIRES_X86_SSE;
26012 for (uint32_t n = 1; n <= 8; n++) {
26013 GemmMicrokernelTester()
26014 .mr(1)
26015 .nr(8)
26016 .kr(1)
26017 .sr(1)
26018 .m(1)
26019 .n(n)
26020 .k(1)
26021 .iterations(1)
26022 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26023 }
26024 }
26025
26026 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_gt_1) {
26027 TEST_REQUIRES_X86_SSE;
26028 for (size_t k = 2; k < 10; k++) {
26029 GemmMicrokernelTester()
26030 .mr(1)
26031 .nr(8)
26032 .kr(1)
26033 .sr(1)
26034 .m(1)
26035 .n(8)
26036 .k(k)
26037 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26038 }
26039 }
26040
26041 TEST(F32_IGEMM_1X8__SSE_LOAD1, k_gt_1_subtile) {
26042 TEST_REQUIRES_X86_SSE;
26043 for (size_t k = 2; k < 10; k++) {
26044 for (uint32_t m = 1; m <= 1; m++) {
26045 for (uint32_t n = 1; n <= 8; n++) {
26046 GemmMicrokernelTester()
26047 .mr(1)
26048 .nr(8)
26049 .kr(1)
26050 .sr(1)
26051 .m(m)
26052 .n(n)
26053 .k(k)
26054 .iterations(1)
26055 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26056 }
26057 }
26058 }
26059 }
26060
26061 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8) {
26062 TEST_REQUIRES_X86_SSE;
26063 for (uint32_t n = 9; n < 16; n++) {
26064 for (size_t k = 1; k <= 5; k += 2) {
26065 GemmMicrokernelTester()
26066 .mr(1)
26067 .nr(8)
26068 .kr(1)
26069 .sr(1)
26070 .m(1)
26071 .n(8)
26072 .k(k)
26073 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26074 }
26075 }
26076 }
26077
26078 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8_strided_cn) {
26079 TEST_REQUIRES_X86_SSE;
26080 for (uint32_t n = 9; n < 16; n++) {
26081 for (size_t k = 1; k <= 5; k += 2) {
26082 GemmMicrokernelTester()
26083 .mr(1)
26084 .nr(8)
26085 .kr(1)
26086 .sr(1)
26087 .m(1)
26088 .n(8)
26089 .k(k)
26090 .cn_stride(11)
26091 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26092 }
26093 }
26094 }
26095
26096 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8_subtile) {
26097 TEST_REQUIRES_X86_SSE;
26098 for (uint32_t n = 9; n < 16; n++) {
26099 for (size_t k = 1; k <= 5; k += 2) {
26100 for (uint32_t m = 1; m <= 1; m++) {
26101 GemmMicrokernelTester()
26102 .mr(1)
26103 .nr(8)
26104 .kr(1)
26105 .sr(1)
26106 .m(m)
26107 .n(n)
26108 .k(k)
26109 .iterations(1)
26110 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26111 }
26112 }
26113 }
26114 }
26115
26116 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8) {
26117 TEST_REQUIRES_X86_SSE;
26118 for (uint32_t n = 16; n <= 24; n += 8) {
26119 for (size_t k = 1; k <= 5; k += 2) {
26120 GemmMicrokernelTester()
26121 .mr(1)
26122 .nr(8)
26123 .kr(1)
26124 .sr(1)
26125 .m(1)
26126 .n(8)
26127 .k(k)
26128 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26129 }
26130 }
26131 }
26132
26133 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8_strided_cn) {
26134 TEST_REQUIRES_X86_SSE;
26135 for (uint32_t n = 16; n <= 24; n += 8) {
26136 for (size_t k = 1; k <= 5; k += 2) {
26137 GemmMicrokernelTester()
26138 .mr(1)
26139 .nr(8)
26140 .kr(1)
26141 .sr(1)
26142 .m(1)
26143 .n(n)
26144 .k(k)
26145 .cn_stride(11)
26146 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26147 }
26148 }
26149 }
26150
26151 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8_subtile) {
26152 TEST_REQUIRES_X86_SSE;
26153 for (uint32_t n = 16; n <= 24; n += 8) {
26154 for (size_t k = 1; k <= 5; k += 2) {
26155 for (uint32_t m = 1; m <= 1; m++) {
26156 GemmMicrokernelTester()
26157 .mr(1)
26158 .nr(8)
26159 .kr(1)
26160 .sr(1)
26161 .m(m)
26162 .n(n)
26163 .k(k)
26164 .iterations(1)
26165 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26166 }
26167 }
26168 }
26169 }
26170
26171 TEST(F32_IGEMM_1X8__SSE_LOAD1, small_kernel) {
26172 TEST_REQUIRES_X86_SSE;
26173 for (size_t k = 1; k <= 5; k += 2) {
26174 GemmMicrokernelTester()
26175 .mr(1)
26176 .nr(8)
26177 .kr(1)
26178 .sr(1)
26179 .m(1)
26180 .n(8)
26181 .k(k)
26182 .ks(3)
26183 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26184 }
26185 }
26186
26187 TEST(F32_IGEMM_1X8__SSE_LOAD1, small_kernel_subtile) {
26188 TEST_REQUIRES_X86_SSE;
26189 for (size_t k = 1; k <= 5; k += 2) {
26190 for (uint32_t m = 1; m <= 1; m++) {
26191 for (uint32_t n = 1; n <= 8; n++) {
26192 GemmMicrokernelTester()
26193 .mr(1)
26194 .nr(8)
26195 .kr(1)
26196 .sr(1)
26197 .m(m)
26198 .n(n)
26199 .k(k)
26200 .ks(3)
26201 .iterations(1)
26202 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26203 }
26204 }
26205 }
26206 }
26207
26208 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_gt_8_small_kernel) {
26209 TEST_REQUIRES_X86_SSE;
26210 for (uint32_t n = 9; n < 16; n++) {
26211 for (size_t k = 1; k <= 5; k += 2) {
26212 GemmMicrokernelTester()
26213 .mr(1)
26214 .nr(8)
26215 .kr(1)
26216 .sr(1)
26217 .m(1)
26218 .n(8)
26219 .k(k)
26220 .ks(3)
26221 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26222 }
26223 }
26224 }
26225
26226 TEST(F32_IGEMM_1X8__SSE_LOAD1, n_div_8_small_kernel) {
26227 TEST_REQUIRES_X86_SSE;
26228 for (uint32_t n = 16; n <= 24; n += 8) {
26229 for (size_t k = 1; k <= 5; k += 2) {
26230 GemmMicrokernelTester()
26231 .mr(1)
26232 .nr(8)
26233 .kr(1)
26234 .sr(1)
26235 .m(1)
26236 .n(8)
26237 .k(k)
26238 .ks(3)
26239 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26240 }
26241 }
26242 }
26243
26244 TEST(F32_IGEMM_1X8__SSE_LOAD1, strided_cm_subtile) {
26245 TEST_REQUIRES_X86_SSE;
26246 for (size_t k = 1; k <= 5; k += 2) {
26247 for (uint32_t m = 1; m <= 1; m++) {
26248 for (uint32_t n = 1; n <= 8; n++) {
26249 GemmMicrokernelTester()
26250 .mr(1)
26251 .nr(8)
26252 .kr(1)
26253 .sr(1)
26254 .m(m)
26255 .n(n)
26256 .k(k)
26257 .cm_stride(11)
26258 .iterations(1)
26259 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26260 }
26261 }
26262 }
26263 }
26264
26265 TEST(F32_IGEMM_1X8__SSE_LOAD1, a_offset) {
26266 TEST_REQUIRES_X86_SSE;
26267 for (size_t k = 1; k <= 5; k += 2) {
26268 GemmMicrokernelTester()
26269 .mr(1)
26270 .nr(8)
26271 .kr(1)
26272 .sr(1)
26273 .m(1)
26274 .n(8)
26275 .k(k)
26276 .ks(3)
26277 .a_offset(7)
26278 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26279 }
26280 }
26281
26282 TEST(F32_IGEMM_1X8__SSE_LOAD1, zero) {
26283 TEST_REQUIRES_X86_SSE;
26284 for (uint32_t mz = 0; mz < 1; mz++) {
26285 for (size_t k = 1; k <= 5; k += 2) {
26286 GemmMicrokernelTester()
26287 .mr(1)
26288 .nr(8)
26289 .kr(1)
26290 .sr(1)
26291 .m(1)
26292 .n(8)
26293 .k(k)
26294 .ks(3)
26295 .a_offset(7)
26296 .zero_index(mz)
26297 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26298 }
26299 }
26300 }
26301
26302 TEST(F32_IGEMM_1X8__SSE_LOAD1, qmin) {
26303 TEST_REQUIRES_X86_SSE;
26304 GemmMicrokernelTester()
26305 .mr(1)
26306 .nr(8)
26307 .kr(1)
26308 .sr(1)
26309 .m(1)
26310 .n(8)
26311 .k(1)
26312 .qmin(128)
26313 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26314 }
26315
26316 TEST(F32_IGEMM_1X8__SSE_LOAD1, qmax) {
26317 TEST_REQUIRES_X86_SSE;
26318 GemmMicrokernelTester()
26319 .mr(1)
26320 .nr(8)
26321 .kr(1)
26322 .sr(1)
26323 .m(1)
26324 .n(8)
26325 .k(1)
26326 .qmax(128)
26327 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26328 }
26329
26330 TEST(F32_IGEMM_1X8__SSE_LOAD1, strided_cm) {
26331 TEST_REQUIRES_X86_SSE;
26332 GemmMicrokernelTester()
26333 .mr(1)
26334 .nr(8)
26335 .kr(1)
26336 .sr(1)
26337 .m(1)
26338 .n(8)
26339 .k(1)
26340 .cm_stride(11)
26341 .Test(xnn_f32_igemm_ukernel_1x8__sse_load1);
26342 }
26343#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26344
26345
26346#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26347 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1) {
26348 TEST_REQUIRES_X86_SSE;
26349 GemmMicrokernelTester()
26350 .mr(4)
26351 .nr(8)
26352 .kr(1)
26353 .sr(1)
26354 .m(4)
26355 .n(8)
26356 .k(1)
26357 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26358 }
26359
26360 TEST(F32_IGEMM_4X8__SSE_LOAD1, strided_cn) {
26361 TEST_REQUIRES_X86_SSE;
26362 GemmMicrokernelTester()
26363 .mr(4)
26364 .nr(8)
26365 .kr(1)
26366 .sr(1)
26367 .m(4)
26368 .n(8)
26369 .k(1)
26370 .cn_stride(11)
26371 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26372 }
26373
26374 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1_subtile) {
26375 TEST_REQUIRES_X86_SSE;
26376 for (uint32_t m = 1; m <= 4; m++) {
26377 for (uint32_t n = 1; n <= 8; n++) {
26378 GemmMicrokernelTester()
26379 .mr(4)
26380 .nr(8)
26381 .kr(1)
26382 .sr(1)
26383 .m(m)
26384 .n(n)
26385 .k(1)
26386 .iterations(1)
26387 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26388 }
26389 }
26390 }
26391
26392 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
26393 TEST_REQUIRES_X86_SSE;
26394 for (uint32_t m = 1; m <= 4; m++) {
26395 GemmMicrokernelTester()
26396 .mr(4)
26397 .nr(8)
26398 .kr(1)
26399 .sr(1)
26400 .m(m)
26401 .n(8)
26402 .k(1)
26403 .iterations(1)
26404 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26405 }
26406 }
26407
26408 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
26409 TEST_REQUIRES_X86_SSE;
26410 for (uint32_t n = 1; n <= 8; n++) {
26411 GemmMicrokernelTester()
26412 .mr(4)
26413 .nr(8)
26414 .kr(1)
26415 .sr(1)
26416 .m(4)
26417 .n(n)
26418 .k(1)
26419 .iterations(1)
26420 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26421 }
26422 }
26423
26424 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_gt_1) {
26425 TEST_REQUIRES_X86_SSE;
26426 for (size_t k = 2; k < 10; k++) {
26427 GemmMicrokernelTester()
26428 .mr(4)
26429 .nr(8)
26430 .kr(1)
26431 .sr(1)
26432 .m(4)
26433 .n(8)
26434 .k(k)
26435 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26436 }
26437 }
26438
26439 TEST(F32_IGEMM_4X8__SSE_LOAD1, k_gt_1_subtile) {
26440 TEST_REQUIRES_X86_SSE;
26441 for (size_t k = 2; k < 10; k++) {
26442 for (uint32_t m = 1; m <= 4; m++) {
26443 for (uint32_t n = 1; n <= 8; n++) {
26444 GemmMicrokernelTester()
26445 .mr(4)
26446 .nr(8)
26447 .kr(1)
26448 .sr(1)
26449 .m(m)
26450 .n(n)
26451 .k(k)
26452 .iterations(1)
26453 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26454 }
26455 }
26456 }
26457 }
26458
26459 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8) {
26460 TEST_REQUIRES_X86_SSE;
26461 for (uint32_t n = 9; n < 16; n++) {
26462 for (size_t k = 1; k <= 5; k += 2) {
26463 GemmMicrokernelTester()
26464 .mr(4)
26465 .nr(8)
26466 .kr(1)
26467 .sr(1)
26468 .m(4)
26469 .n(8)
26470 .k(k)
26471 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26472 }
26473 }
26474 }
26475
26476 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
26477 TEST_REQUIRES_X86_SSE;
26478 for (uint32_t n = 9; n < 16; n++) {
26479 for (size_t k = 1; k <= 5; k += 2) {
26480 GemmMicrokernelTester()
26481 .mr(4)
26482 .nr(8)
26483 .kr(1)
26484 .sr(1)
26485 .m(4)
26486 .n(8)
26487 .k(k)
26488 .cn_stride(11)
26489 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26490 }
26491 }
26492 }
26493
26494 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8_subtile) {
26495 TEST_REQUIRES_X86_SSE;
26496 for (uint32_t n = 9; n < 16; n++) {
26497 for (size_t k = 1; k <= 5; k += 2) {
26498 for (uint32_t m = 1; m <= 4; m++) {
26499 GemmMicrokernelTester()
26500 .mr(4)
26501 .nr(8)
26502 .kr(1)
26503 .sr(1)
26504 .m(m)
26505 .n(n)
26506 .k(k)
26507 .iterations(1)
26508 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26509 }
26510 }
26511 }
26512 }
26513
26514 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8) {
26515 TEST_REQUIRES_X86_SSE;
26516 for (uint32_t n = 16; n <= 24; n += 8) {
26517 for (size_t k = 1; k <= 5; k += 2) {
26518 GemmMicrokernelTester()
26519 .mr(4)
26520 .nr(8)
26521 .kr(1)
26522 .sr(1)
26523 .m(4)
26524 .n(8)
26525 .k(k)
26526 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26527 }
26528 }
26529 }
26530
26531 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8_strided_cn) {
26532 TEST_REQUIRES_X86_SSE;
26533 for (uint32_t n = 16; n <= 24; n += 8) {
26534 for (size_t k = 1; k <= 5; k += 2) {
26535 GemmMicrokernelTester()
26536 .mr(4)
26537 .nr(8)
26538 .kr(1)
26539 .sr(1)
26540 .m(4)
26541 .n(n)
26542 .k(k)
26543 .cn_stride(11)
26544 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26545 }
26546 }
26547 }
26548
26549 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8_subtile) {
26550 TEST_REQUIRES_X86_SSE;
26551 for (uint32_t n = 16; n <= 24; n += 8) {
26552 for (size_t k = 1; k <= 5; k += 2) {
26553 for (uint32_t m = 1; m <= 4; m++) {
26554 GemmMicrokernelTester()
26555 .mr(4)
26556 .nr(8)
26557 .kr(1)
26558 .sr(1)
26559 .m(m)
26560 .n(n)
26561 .k(k)
26562 .iterations(1)
26563 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26564 }
26565 }
26566 }
26567 }
26568
26569 TEST(F32_IGEMM_4X8__SSE_LOAD1, small_kernel) {
26570 TEST_REQUIRES_X86_SSE;
26571 for (size_t k = 1; k <= 5; k += 2) {
26572 GemmMicrokernelTester()
26573 .mr(4)
26574 .nr(8)
26575 .kr(1)
26576 .sr(1)
26577 .m(4)
26578 .n(8)
26579 .k(k)
26580 .ks(3)
26581 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26582 }
26583 }
26584
26585 TEST(F32_IGEMM_4X8__SSE_LOAD1, small_kernel_subtile) {
26586 TEST_REQUIRES_X86_SSE;
26587 for (size_t k = 1; k <= 5; k += 2) {
26588 for (uint32_t m = 1; m <= 4; m++) {
26589 for (uint32_t n = 1; n <= 8; n++) {
26590 GemmMicrokernelTester()
26591 .mr(4)
26592 .nr(8)
26593 .kr(1)
26594 .sr(1)
26595 .m(m)
26596 .n(n)
26597 .k(k)
26598 .ks(3)
26599 .iterations(1)
26600 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26601 }
26602 }
26603 }
26604 }
26605
26606 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_gt_8_small_kernel) {
26607 TEST_REQUIRES_X86_SSE;
26608 for (uint32_t n = 9; n < 16; n++) {
26609 for (size_t k = 1; k <= 5; k += 2) {
26610 GemmMicrokernelTester()
26611 .mr(4)
26612 .nr(8)
26613 .kr(1)
26614 .sr(1)
26615 .m(4)
26616 .n(8)
26617 .k(k)
26618 .ks(3)
26619 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26620 }
26621 }
26622 }
26623
26624 TEST(F32_IGEMM_4X8__SSE_LOAD1, n_div_8_small_kernel) {
26625 TEST_REQUIRES_X86_SSE;
26626 for (uint32_t n = 16; n <= 24; n += 8) {
26627 for (size_t k = 1; k <= 5; k += 2) {
26628 GemmMicrokernelTester()
26629 .mr(4)
26630 .nr(8)
26631 .kr(1)
26632 .sr(1)
26633 .m(4)
26634 .n(8)
26635 .k(k)
26636 .ks(3)
26637 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26638 }
26639 }
26640 }
26641
26642 TEST(F32_IGEMM_4X8__SSE_LOAD1, strided_cm_subtile) {
26643 TEST_REQUIRES_X86_SSE;
26644 for (size_t k = 1; k <= 5; k += 2) {
26645 for (uint32_t m = 1; m <= 4; m++) {
26646 for (uint32_t n = 1; n <= 8; n++) {
26647 GemmMicrokernelTester()
26648 .mr(4)
26649 .nr(8)
26650 .kr(1)
26651 .sr(1)
26652 .m(m)
26653 .n(n)
26654 .k(k)
26655 .cm_stride(11)
26656 .iterations(1)
26657 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26658 }
26659 }
26660 }
26661 }
26662
26663 TEST(F32_IGEMM_4X8__SSE_LOAD1, a_offset) {
26664 TEST_REQUIRES_X86_SSE;
26665 for (size_t k = 1; k <= 5; k += 2) {
26666 GemmMicrokernelTester()
26667 .mr(4)
26668 .nr(8)
26669 .kr(1)
26670 .sr(1)
26671 .m(4)
26672 .n(8)
26673 .k(k)
26674 .ks(3)
26675 .a_offset(23)
26676 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26677 }
26678 }
26679
26680 TEST(F32_IGEMM_4X8__SSE_LOAD1, zero) {
26681 TEST_REQUIRES_X86_SSE;
26682 for (uint32_t mz = 0; mz < 4; mz++) {
26683 for (size_t k = 1; k <= 5; k += 2) {
26684 GemmMicrokernelTester()
26685 .mr(4)
26686 .nr(8)
26687 .kr(1)
26688 .sr(1)
26689 .m(4)
26690 .n(8)
26691 .k(k)
26692 .ks(3)
26693 .a_offset(23)
26694 .zero_index(mz)
26695 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26696 }
26697 }
26698 }
26699
26700 TEST(F32_IGEMM_4X8__SSE_LOAD1, qmin) {
26701 TEST_REQUIRES_X86_SSE;
26702 GemmMicrokernelTester()
26703 .mr(4)
26704 .nr(8)
26705 .kr(1)
26706 .sr(1)
26707 .m(4)
26708 .n(8)
26709 .k(1)
26710 .qmin(128)
26711 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26712 }
26713
26714 TEST(F32_IGEMM_4X8__SSE_LOAD1, qmax) {
26715 TEST_REQUIRES_X86_SSE;
26716 GemmMicrokernelTester()
26717 .mr(4)
26718 .nr(8)
26719 .kr(1)
26720 .sr(1)
26721 .m(4)
26722 .n(8)
26723 .k(1)
26724 .qmax(128)
26725 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26726 }
26727
26728 TEST(F32_IGEMM_4X8__SSE_LOAD1, strided_cm) {
26729 TEST_REQUIRES_X86_SSE;
26730 GemmMicrokernelTester()
26731 .mr(4)
26732 .nr(8)
26733 .kr(1)
26734 .sr(1)
26735 .m(4)
26736 .n(8)
26737 .k(1)
26738 .cm_stride(11)
26739 .Test(xnn_f32_igemm_ukernel_4x8__sse_load1);
26740 }
26741#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26742
26743
26744#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26745 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4) {
26746 TEST_REQUIRES_X86_SSE;
26747 GemmMicrokernelTester()
26748 .mr(1)
26749 .nr(8)
26750 .kr(1)
26751 .sr(1)
26752 .m(1)
26753 .n(8)
26754 .k(4)
26755 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26756 }
26757
26758 TEST(F32_IGEMM_1X8__SSE_DUP, strided_cn) {
26759 TEST_REQUIRES_X86_SSE;
26760 GemmMicrokernelTester()
26761 .mr(1)
26762 .nr(8)
26763 .kr(1)
26764 .sr(1)
26765 .m(1)
26766 .n(8)
26767 .k(4)
26768 .cn_stride(11)
26769 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26770 }
26771
26772 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4_subtile) {
26773 TEST_REQUIRES_X86_SSE;
26774 for (uint32_t m = 1; m <= 1; m++) {
26775 for (uint32_t n = 1; n <= 8; n++) {
26776 GemmMicrokernelTester()
26777 .mr(1)
26778 .nr(8)
26779 .kr(1)
26780 .sr(1)
26781 .m(m)
26782 .n(n)
26783 .k(4)
26784 .iterations(1)
26785 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26786 }
26787 }
26788 }
26789
26790 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4_subtile_m) {
26791 TEST_REQUIRES_X86_SSE;
26792 for (uint32_t m = 1; m <= 1; m++) {
26793 GemmMicrokernelTester()
26794 .mr(1)
26795 .nr(8)
26796 .kr(1)
26797 .sr(1)
26798 .m(m)
26799 .n(8)
26800 .k(4)
26801 .iterations(1)
26802 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26803 }
26804 }
26805
26806 TEST(F32_IGEMM_1X8__SSE_DUP, k_eq_4_subtile_n) {
26807 TEST_REQUIRES_X86_SSE;
26808 for (uint32_t n = 1; n <= 8; n++) {
26809 GemmMicrokernelTester()
26810 .mr(1)
26811 .nr(8)
26812 .kr(1)
26813 .sr(1)
26814 .m(1)
26815 .n(n)
26816 .k(4)
26817 .iterations(1)
26818 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26819 }
26820 }
26821
26822 TEST(F32_IGEMM_1X8__SSE_DUP, k_lt_4) {
26823 TEST_REQUIRES_X86_SSE;
26824 for (size_t k = 1; k < 4; k++) {
26825 GemmMicrokernelTester()
26826 .mr(1)
26827 .nr(8)
26828 .kr(1)
26829 .sr(1)
26830 .m(1)
26831 .n(8)
26832 .k(k)
26833 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26834 }
26835 }
26836
26837 TEST(F32_IGEMM_1X8__SSE_DUP, k_lt_4_subtile) {
26838 TEST_REQUIRES_X86_SSE;
26839 for (size_t k = 1; k < 4; k++) {
26840 for (uint32_t m = 1; m <= 1; m++) {
26841 for (uint32_t n = 1; n <= 8; n++) {
26842 GemmMicrokernelTester()
26843 .mr(1)
26844 .nr(8)
26845 .kr(1)
26846 .sr(1)
26847 .m(m)
26848 .n(n)
26849 .k(k)
26850 .iterations(1)
26851 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26852 }
26853 }
26854 }
26855 }
26856
26857 TEST(F32_IGEMM_1X8__SSE_DUP, k_gt_4) {
26858 TEST_REQUIRES_X86_SSE;
26859 for (size_t k = 5; k < 8; k++) {
26860 GemmMicrokernelTester()
26861 .mr(1)
26862 .nr(8)
26863 .kr(1)
26864 .sr(1)
26865 .m(1)
26866 .n(8)
26867 .k(k)
26868 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26869 }
26870 }
26871
26872 TEST(F32_IGEMM_1X8__SSE_DUP, k_gt_4_subtile) {
26873 TEST_REQUIRES_X86_SSE;
26874 for (size_t k = 5; k < 8; k++) {
26875 for (uint32_t m = 1; m <= 1; m++) {
26876 for (uint32_t n = 1; n <= 8; n++) {
26877 GemmMicrokernelTester()
26878 .mr(1)
26879 .nr(8)
26880 .kr(1)
26881 .sr(1)
26882 .m(m)
26883 .n(n)
26884 .k(k)
26885 .iterations(1)
26886 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26887 }
26888 }
26889 }
26890 }
26891
26892 TEST(F32_IGEMM_1X8__SSE_DUP, k_div_4) {
26893 TEST_REQUIRES_X86_SSE;
26894 for (size_t k = 8; k <= 40; k += 4) {
26895 GemmMicrokernelTester()
26896 .mr(1)
26897 .nr(8)
26898 .kr(1)
26899 .sr(1)
26900 .m(1)
26901 .n(8)
26902 .k(k)
26903 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26904 }
26905 }
26906
26907 TEST(F32_IGEMM_1X8__SSE_DUP, k_div_4_subtile) {
26908 TEST_REQUIRES_X86_SSE;
26909 for (size_t k = 8; k <= 40; k += 4) {
26910 for (uint32_t m = 1; m <= 1; m++) {
26911 for (uint32_t n = 1; n <= 8; n++) {
26912 GemmMicrokernelTester()
26913 .mr(1)
26914 .nr(8)
26915 .kr(1)
26916 .sr(1)
26917 .m(m)
26918 .n(n)
26919 .k(k)
26920 .iterations(1)
26921 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26922 }
26923 }
26924 }
26925 }
26926
26927 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8) {
26928 TEST_REQUIRES_X86_SSE;
26929 for (uint32_t n = 9; n < 16; n++) {
26930 for (size_t k = 1; k <= 20; k += 5) {
26931 GemmMicrokernelTester()
26932 .mr(1)
26933 .nr(8)
26934 .kr(1)
26935 .sr(1)
26936 .m(1)
26937 .n(8)
26938 .k(k)
26939 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26940 }
26941 }
26942 }
26943
26944 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8_strided_cn) {
26945 TEST_REQUIRES_X86_SSE;
26946 for (uint32_t n = 9; n < 16; n++) {
26947 for (size_t k = 1; k <= 20; k += 5) {
26948 GemmMicrokernelTester()
26949 .mr(1)
26950 .nr(8)
26951 .kr(1)
26952 .sr(1)
26953 .m(1)
26954 .n(8)
26955 .k(k)
26956 .cn_stride(11)
26957 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26958 }
26959 }
26960 }
26961
26962 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8_subtile) {
26963 TEST_REQUIRES_X86_SSE;
26964 for (uint32_t n = 9; n < 16; n++) {
26965 for (size_t k = 1; k <= 20; k += 5) {
26966 for (uint32_t m = 1; m <= 1; m++) {
26967 GemmMicrokernelTester()
26968 .mr(1)
26969 .nr(8)
26970 .kr(1)
26971 .sr(1)
26972 .m(m)
26973 .n(n)
26974 .k(k)
26975 .iterations(1)
26976 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26977 }
26978 }
26979 }
26980 }
26981
26982 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8) {
26983 TEST_REQUIRES_X86_SSE;
26984 for (uint32_t n = 16; n <= 24; n += 8) {
26985 for (size_t k = 1; k <= 20; k += 5) {
26986 GemmMicrokernelTester()
26987 .mr(1)
26988 .nr(8)
26989 .kr(1)
26990 .sr(1)
26991 .m(1)
26992 .n(8)
26993 .k(k)
26994 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
26995 }
26996 }
26997 }
26998
26999 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8_strided_cn) {
27000 TEST_REQUIRES_X86_SSE;
27001 for (uint32_t n = 16; n <= 24; n += 8) {
27002 for (size_t k = 1; k <= 20; k += 5) {
27003 GemmMicrokernelTester()
27004 .mr(1)
27005 .nr(8)
27006 .kr(1)
27007 .sr(1)
27008 .m(1)
27009 .n(n)
27010 .k(k)
27011 .cn_stride(11)
27012 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27013 }
27014 }
27015 }
27016
27017 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8_subtile) {
27018 TEST_REQUIRES_X86_SSE;
27019 for (uint32_t n = 16; n <= 24; n += 8) {
27020 for (size_t k = 1; k <= 20; k += 5) {
27021 for (uint32_t m = 1; m <= 1; m++) {
27022 GemmMicrokernelTester()
27023 .mr(1)
27024 .nr(8)
27025 .kr(1)
27026 .sr(1)
27027 .m(m)
27028 .n(n)
27029 .k(k)
27030 .iterations(1)
27031 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27032 }
27033 }
27034 }
27035 }
27036
27037 TEST(F32_IGEMM_1X8__SSE_DUP, small_kernel) {
27038 TEST_REQUIRES_X86_SSE;
27039 for (size_t k = 1; k <= 20; k += 5) {
27040 GemmMicrokernelTester()
27041 .mr(1)
27042 .nr(8)
27043 .kr(1)
27044 .sr(1)
27045 .m(1)
27046 .n(8)
27047 .k(k)
27048 .ks(3)
27049 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27050 }
27051 }
27052
27053 TEST(F32_IGEMM_1X8__SSE_DUP, small_kernel_subtile) {
27054 TEST_REQUIRES_X86_SSE;
27055 for (size_t k = 1; k <= 20; k += 5) {
27056 for (uint32_t m = 1; m <= 1; m++) {
27057 for (uint32_t n = 1; n <= 8; n++) {
27058 GemmMicrokernelTester()
27059 .mr(1)
27060 .nr(8)
27061 .kr(1)
27062 .sr(1)
27063 .m(m)
27064 .n(n)
27065 .k(k)
27066 .ks(3)
27067 .iterations(1)
27068 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27069 }
27070 }
27071 }
27072 }
27073
27074 TEST(F32_IGEMM_1X8__SSE_DUP, n_gt_8_small_kernel) {
27075 TEST_REQUIRES_X86_SSE;
27076 for (uint32_t n = 9; n < 16; n++) {
27077 for (size_t k = 1; k <= 20; k += 5) {
27078 GemmMicrokernelTester()
27079 .mr(1)
27080 .nr(8)
27081 .kr(1)
27082 .sr(1)
27083 .m(1)
27084 .n(8)
27085 .k(k)
27086 .ks(3)
27087 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27088 }
27089 }
27090 }
27091
27092 TEST(F32_IGEMM_1X8__SSE_DUP, n_div_8_small_kernel) {
27093 TEST_REQUIRES_X86_SSE;
27094 for (uint32_t n = 16; n <= 24; n += 8) {
27095 for (size_t k = 1; k <= 20; k += 5) {
27096 GemmMicrokernelTester()
27097 .mr(1)
27098 .nr(8)
27099 .kr(1)
27100 .sr(1)
27101 .m(1)
27102 .n(8)
27103 .k(k)
27104 .ks(3)
27105 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27106 }
27107 }
27108 }
27109
27110 TEST(F32_IGEMM_1X8__SSE_DUP, strided_cm_subtile) {
27111 TEST_REQUIRES_X86_SSE;
27112 for (size_t k = 1; k <= 20; k += 5) {
27113 for (uint32_t m = 1; m <= 1; m++) {
27114 for (uint32_t n = 1; n <= 8; n++) {
27115 GemmMicrokernelTester()
27116 .mr(1)
27117 .nr(8)
27118 .kr(1)
27119 .sr(1)
27120 .m(m)
27121 .n(n)
27122 .k(k)
27123 .cm_stride(11)
27124 .iterations(1)
27125 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27126 }
27127 }
27128 }
27129 }
27130
27131 TEST(F32_IGEMM_1X8__SSE_DUP, a_offset) {
27132 TEST_REQUIRES_X86_SSE;
27133 for (size_t k = 1; k <= 20; k += 5) {
27134 GemmMicrokernelTester()
27135 .mr(1)
27136 .nr(8)
27137 .kr(1)
27138 .sr(1)
27139 .m(1)
27140 .n(8)
27141 .k(k)
27142 .ks(3)
27143 .a_offset(23)
27144 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27145 }
27146 }
27147
27148 TEST(F32_IGEMM_1X8__SSE_DUP, zero) {
27149 TEST_REQUIRES_X86_SSE;
27150 for (uint32_t mz = 0; mz < 1; mz++) {
27151 for (size_t k = 1; k <= 20; k += 5) {
27152 GemmMicrokernelTester()
27153 .mr(1)
27154 .nr(8)
27155 .kr(1)
27156 .sr(1)
27157 .m(1)
27158 .n(8)
27159 .k(k)
27160 .ks(3)
27161 .a_offset(23)
27162 .zero_index(mz)
27163 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27164 }
27165 }
27166 }
27167
27168 TEST(F32_IGEMM_1X8__SSE_DUP, qmin) {
27169 TEST_REQUIRES_X86_SSE;
27170 GemmMicrokernelTester()
27171 .mr(1)
27172 .nr(8)
27173 .kr(1)
27174 .sr(1)
27175 .m(1)
27176 .n(8)
27177 .k(4)
27178 .qmin(128)
27179 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27180 }
27181
27182 TEST(F32_IGEMM_1X8__SSE_DUP, qmax) {
27183 TEST_REQUIRES_X86_SSE;
27184 GemmMicrokernelTester()
27185 .mr(1)
27186 .nr(8)
27187 .kr(1)
27188 .sr(1)
27189 .m(1)
27190 .n(8)
27191 .k(4)
27192 .qmax(128)
27193 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27194 }
27195
27196 TEST(F32_IGEMM_1X8__SSE_DUP, strided_cm) {
27197 TEST_REQUIRES_X86_SSE;
27198 GemmMicrokernelTester()
27199 .mr(1)
27200 .nr(8)
27201 .kr(1)
27202 .sr(1)
27203 .m(1)
27204 .n(8)
27205 .k(4)
27206 .cm_stride(11)
27207 .Test(xnn_f32_igemm_ukernel_1x8__sse_dup);
27208 }
27209#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27210
27211
27212#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27213 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4) {
27214 TEST_REQUIRES_X86_SSE;
27215 GemmMicrokernelTester()
27216 .mr(4)
27217 .nr(8)
27218 .kr(1)
27219 .sr(1)
27220 .m(4)
27221 .n(8)
27222 .k(4)
27223 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27224 }
27225
27226 TEST(F32_IGEMM_4X8__SSE_DUP, strided_cn) {
27227 TEST_REQUIRES_X86_SSE;
27228 GemmMicrokernelTester()
27229 .mr(4)
27230 .nr(8)
27231 .kr(1)
27232 .sr(1)
27233 .m(4)
27234 .n(8)
27235 .k(4)
27236 .cn_stride(11)
27237 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27238 }
27239
27240 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4_subtile) {
27241 TEST_REQUIRES_X86_SSE;
27242 for (uint32_t m = 1; m <= 4; m++) {
27243 for (uint32_t n = 1; n <= 8; n++) {
27244 GemmMicrokernelTester()
27245 .mr(4)
27246 .nr(8)
27247 .kr(1)
27248 .sr(1)
27249 .m(m)
27250 .n(n)
27251 .k(4)
27252 .iterations(1)
27253 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27254 }
27255 }
27256 }
27257
27258 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4_subtile_m) {
27259 TEST_REQUIRES_X86_SSE;
27260 for (uint32_t m = 1; m <= 4; m++) {
27261 GemmMicrokernelTester()
27262 .mr(4)
27263 .nr(8)
27264 .kr(1)
27265 .sr(1)
27266 .m(m)
27267 .n(8)
27268 .k(4)
27269 .iterations(1)
27270 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27271 }
27272 }
27273
27274 TEST(F32_IGEMM_4X8__SSE_DUP, k_eq_4_subtile_n) {
27275 TEST_REQUIRES_X86_SSE;
27276 for (uint32_t n = 1; n <= 8; n++) {
27277 GemmMicrokernelTester()
27278 .mr(4)
27279 .nr(8)
27280 .kr(1)
27281 .sr(1)
27282 .m(4)
27283 .n(n)
27284 .k(4)
27285 .iterations(1)
27286 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27287 }
27288 }
27289
27290 TEST(F32_IGEMM_4X8__SSE_DUP, k_lt_4) {
27291 TEST_REQUIRES_X86_SSE;
27292 for (size_t k = 1; k < 4; k++) {
27293 GemmMicrokernelTester()
27294 .mr(4)
27295 .nr(8)
27296 .kr(1)
27297 .sr(1)
27298 .m(4)
27299 .n(8)
27300 .k(k)
27301 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27302 }
27303 }
27304
27305 TEST(F32_IGEMM_4X8__SSE_DUP, k_lt_4_subtile) {
27306 TEST_REQUIRES_X86_SSE;
27307 for (size_t k = 1; k < 4; k++) {
27308 for (uint32_t m = 1; m <= 4; m++) {
27309 for (uint32_t n = 1; n <= 8; n++) {
27310 GemmMicrokernelTester()
27311 .mr(4)
27312 .nr(8)
27313 .kr(1)
27314 .sr(1)
27315 .m(m)
27316 .n(n)
27317 .k(k)
27318 .iterations(1)
27319 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27320 }
27321 }
27322 }
27323 }
27324
27325 TEST(F32_IGEMM_4X8__SSE_DUP, k_gt_4) {
27326 TEST_REQUIRES_X86_SSE;
27327 for (size_t k = 5; k < 8; k++) {
27328 GemmMicrokernelTester()
27329 .mr(4)
27330 .nr(8)
27331 .kr(1)
27332 .sr(1)
27333 .m(4)
27334 .n(8)
27335 .k(k)
27336 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27337 }
27338 }
27339
27340 TEST(F32_IGEMM_4X8__SSE_DUP, k_gt_4_subtile) {
27341 TEST_REQUIRES_X86_SSE;
27342 for (size_t k = 5; k < 8; k++) {
27343 for (uint32_t m = 1; m <= 4; m++) {
27344 for (uint32_t n = 1; n <= 8; n++) {
27345 GemmMicrokernelTester()
27346 .mr(4)
27347 .nr(8)
27348 .kr(1)
27349 .sr(1)
27350 .m(m)
27351 .n(n)
27352 .k(k)
27353 .iterations(1)
27354 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27355 }
27356 }
27357 }
27358 }
27359
27360 TEST(F32_IGEMM_4X8__SSE_DUP, k_div_4) {
27361 TEST_REQUIRES_X86_SSE;
27362 for (size_t k = 8; k <= 40; k += 4) {
27363 GemmMicrokernelTester()
27364 .mr(4)
27365 .nr(8)
27366 .kr(1)
27367 .sr(1)
27368 .m(4)
27369 .n(8)
27370 .k(k)
27371 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27372 }
27373 }
27374
27375 TEST(F32_IGEMM_4X8__SSE_DUP, k_div_4_subtile) {
27376 TEST_REQUIRES_X86_SSE;
27377 for (size_t k = 8; k <= 40; k += 4) {
27378 for (uint32_t m = 1; m <= 4; m++) {
27379 for (uint32_t n = 1; n <= 8; n++) {
27380 GemmMicrokernelTester()
27381 .mr(4)
27382 .nr(8)
27383 .kr(1)
27384 .sr(1)
27385 .m(m)
27386 .n(n)
27387 .k(k)
27388 .iterations(1)
27389 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27390 }
27391 }
27392 }
27393 }
27394
27395 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8) {
27396 TEST_REQUIRES_X86_SSE;
27397 for (uint32_t n = 9; n < 16; n++) {
27398 for (size_t k = 1; k <= 20; k += 5) {
27399 GemmMicrokernelTester()
27400 .mr(4)
27401 .nr(8)
27402 .kr(1)
27403 .sr(1)
27404 .m(4)
27405 .n(8)
27406 .k(k)
27407 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27408 }
27409 }
27410 }
27411
27412 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8_strided_cn) {
27413 TEST_REQUIRES_X86_SSE;
27414 for (uint32_t n = 9; n < 16; n++) {
27415 for (size_t k = 1; k <= 20; k += 5) {
27416 GemmMicrokernelTester()
27417 .mr(4)
27418 .nr(8)
27419 .kr(1)
27420 .sr(1)
27421 .m(4)
27422 .n(8)
27423 .k(k)
27424 .cn_stride(11)
27425 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27426 }
27427 }
27428 }
27429
27430 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8_subtile) {
27431 TEST_REQUIRES_X86_SSE;
27432 for (uint32_t n = 9; n < 16; n++) {
27433 for (size_t k = 1; k <= 20; k += 5) {
27434 for (uint32_t m = 1; m <= 4; m++) {
27435 GemmMicrokernelTester()
27436 .mr(4)
27437 .nr(8)
27438 .kr(1)
27439 .sr(1)
27440 .m(m)
27441 .n(n)
27442 .k(k)
27443 .iterations(1)
27444 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27445 }
27446 }
27447 }
27448 }
27449
27450 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8) {
27451 TEST_REQUIRES_X86_SSE;
27452 for (uint32_t n = 16; n <= 24; n += 8) {
27453 for (size_t k = 1; k <= 20; k += 5) {
27454 GemmMicrokernelTester()
27455 .mr(4)
27456 .nr(8)
27457 .kr(1)
27458 .sr(1)
27459 .m(4)
27460 .n(8)
27461 .k(k)
27462 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27463 }
27464 }
27465 }
27466
27467 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8_strided_cn) {
27468 TEST_REQUIRES_X86_SSE;
27469 for (uint32_t n = 16; n <= 24; n += 8) {
27470 for (size_t k = 1; k <= 20; k += 5) {
27471 GemmMicrokernelTester()
27472 .mr(4)
27473 .nr(8)
27474 .kr(1)
27475 .sr(1)
27476 .m(4)
27477 .n(n)
27478 .k(k)
27479 .cn_stride(11)
27480 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27481 }
27482 }
27483 }
27484
27485 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8_subtile) {
27486 TEST_REQUIRES_X86_SSE;
27487 for (uint32_t n = 16; n <= 24; n += 8) {
27488 for (size_t k = 1; k <= 20; k += 5) {
27489 for (uint32_t m = 1; m <= 4; m++) {
27490 GemmMicrokernelTester()
27491 .mr(4)
27492 .nr(8)
27493 .kr(1)
27494 .sr(1)
27495 .m(m)
27496 .n(n)
27497 .k(k)
27498 .iterations(1)
27499 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27500 }
27501 }
27502 }
27503 }
27504
27505 TEST(F32_IGEMM_4X8__SSE_DUP, small_kernel) {
27506 TEST_REQUIRES_X86_SSE;
27507 for (size_t k = 1; k <= 20; k += 5) {
27508 GemmMicrokernelTester()
27509 .mr(4)
27510 .nr(8)
27511 .kr(1)
27512 .sr(1)
27513 .m(4)
27514 .n(8)
27515 .k(k)
27516 .ks(3)
27517 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27518 }
27519 }
27520
27521 TEST(F32_IGEMM_4X8__SSE_DUP, small_kernel_subtile) {
27522 TEST_REQUIRES_X86_SSE;
27523 for (size_t k = 1; k <= 20; k += 5) {
27524 for (uint32_t m = 1; m <= 4; m++) {
27525 for (uint32_t n = 1; n <= 8; n++) {
27526 GemmMicrokernelTester()
27527 .mr(4)
27528 .nr(8)
27529 .kr(1)
27530 .sr(1)
27531 .m(m)
27532 .n(n)
27533 .k(k)
27534 .ks(3)
27535 .iterations(1)
27536 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27537 }
27538 }
27539 }
27540 }
27541
27542 TEST(F32_IGEMM_4X8__SSE_DUP, n_gt_8_small_kernel) {
27543 TEST_REQUIRES_X86_SSE;
27544 for (uint32_t n = 9; n < 16; n++) {
27545 for (size_t k = 1; k <= 20; k += 5) {
27546 GemmMicrokernelTester()
27547 .mr(4)
27548 .nr(8)
27549 .kr(1)
27550 .sr(1)
27551 .m(4)
27552 .n(8)
27553 .k(k)
27554 .ks(3)
27555 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27556 }
27557 }
27558 }
27559
27560 TEST(F32_IGEMM_4X8__SSE_DUP, n_div_8_small_kernel) {
27561 TEST_REQUIRES_X86_SSE;
27562 for (uint32_t n = 16; n <= 24; n += 8) {
27563 for (size_t k = 1; k <= 20; k += 5) {
27564 GemmMicrokernelTester()
27565 .mr(4)
27566 .nr(8)
27567 .kr(1)
27568 .sr(1)
27569 .m(4)
27570 .n(8)
27571 .k(k)
27572 .ks(3)
27573 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27574 }
27575 }
27576 }
27577
27578 TEST(F32_IGEMM_4X8__SSE_DUP, strided_cm_subtile) {
27579 TEST_REQUIRES_X86_SSE;
27580 for (size_t k = 1; k <= 20; k += 5) {
27581 for (uint32_t m = 1; m <= 4; m++) {
27582 for (uint32_t n = 1; n <= 8; n++) {
27583 GemmMicrokernelTester()
27584 .mr(4)
27585 .nr(8)
27586 .kr(1)
27587 .sr(1)
27588 .m(m)
27589 .n(n)
27590 .k(k)
27591 .cm_stride(11)
27592 .iterations(1)
27593 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27594 }
27595 }
27596 }
27597 }
27598
27599 TEST(F32_IGEMM_4X8__SSE_DUP, a_offset) {
27600 TEST_REQUIRES_X86_SSE;
27601 for (size_t k = 1; k <= 20; k += 5) {
27602 GemmMicrokernelTester()
27603 .mr(4)
27604 .nr(8)
27605 .kr(1)
27606 .sr(1)
27607 .m(4)
27608 .n(8)
27609 .k(k)
27610 .ks(3)
27611 .a_offset(83)
27612 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27613 }
27614 }
27615
27616 TEST(F32_IGEMM_4X8__SSE_DUP, zero) {
27617 TEST_REQUIRES_X86_SSE;
27618 for (uint32_t mz = 0; mz < 4; mz++) {
27619 for (size_t k = 1; k <= 20; k += 5) {
27620 GemmMicrokernelTester()
27621 .mr(4)
27622 .nr(8)
27623 .kr(1)
27624 .sr(1)
27625 .m(4)
27626 .n(8)
27627 .k(k)
27628 .ks(3)
27629 .a_offset(83)
27630 .zero_index(mz)
27631 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27632 }
27633 }
27634 }
27635
27636 TEST(F32_IGEMM_4X8__SSE_DUP, qmin) {
27637 TEST_REQUIRES_X86_SSE;
27638 GemmMicrokernelTester()
27639 .mr(4)
27640 .nr(8)
27641 .kr(1)
27642 .sr(1)
27643 .m(4)
27644 .n(8)
27645 .k(4)
27646 .qmin(128)
27647 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27648 }
27649
27650 TEST(F32_IGEMM_4X8__SSE_DUP, qmax) {
27651 TEST_REQUIRES_X86_SSE;
27652 GemmMicrokernelTester()
27653 .mr(4)
27654 .nr(8)
27655 .kr(1)
27656 .sr(1)
27657 .m(4)
27658 .n(8)
27659 .k(4)
27660 .qmax(128)
27661 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27662 }
27663
27664 TEST(F32_IGEMM_4X8__SSE_DUP, strided_cm) {
27665 TEST_REQUIRES_X86_SSE;
27666 GemmMicrokernelTester()
27667 .mr(4)
27668 .nr(8)
27669 .kr(1)
27670 .sr(1)
27671 .m(4)
27672 .n(8)
27673 .k(4)
27674 .cm_stride(11)
27675 .Test(xnn_f32_igemm_ukernel_4x8__sse_dup);
27676 }
27677#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27678
27679
27680#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27681 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4) {
27682 TEST_REQUIRES_X86_SSE;
27683 GemmMicrokernelTester()
27684 .mr(1)
27685 .nr(8)
27686 .kr(1)
27687 .sr(4)
27688 .m(1)
27689 .n(8)
27690 .k(4)
27691 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27692 }
27693
27694 TEST(F32_IGEMM_1X8S4__SSE, strided_cn) {
27695 TEST_REQUIRES_X86_SSE;
27696 GemmMicrokernelTester()
27697 .mr(1)
27698 .nr(8)
27699 .kr(1)
27700 .sr(4)
27701 .m(1)
27702 .n(8)
27703 .k(4)
27704 .cn_stride(11)
27705 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27706 }
27707
27708 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4_subtile) {
27709 TEST_REQUIRES_X86_SSE;
27710 for (uint32_t m = 1; m <= 1; m++) {
27711 for (uint32_t n = 1; n <= 8; n++) {
27712 GemmMicrokernelTester()
27713 .mr(1)
27714 .nr(8)
27715 .kr(1)
27716 .sr(4)
27717 .m(m)
27718 .n(n)
27719 .k(4)
27720 .iterations(1)
27721 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27722 }
27723 }
27724 }
27725
27726 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4_subtile_m) {
27727 TEST_REQUIRES_X86_SSE;
27728 for (uint32_t m = 1; m <= 1; m++) {
27729 GemmMicrokernelTester()
27730 .mr(1)
27731 .nr(8)
27732 .kr(1)
27733 .sr(4)
27734 .m(m)
27735 .n(8)
27736 .k(4)
27737 .iterations(1)
27738 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27739 }
27740 }
27741
27742 TEST(F32_IGEMM_1X8S4__SSE, k_eq_4_subtile_n) {
27743 TEST_REQUIRES_X86_SSE;
27744 for (uint32_t n = 1; n <= 8; n++) {
27745 GemmMicrokernelTester()
27746 .mr(1)
27747 .nr(8)
27748 .kr(1)
27749 .sr(4)
27750 .m(1)
27751 .n(n)
27752 .k(4)
27753 .iterations(1)
27754 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27755 }
27756 }
27757
27758 TEST(F32_IGEMM_1X8S4__SSE, k_lt_4) {
27759 TEST_REQUIRES_X86_SSE;
27760 for (size_t k = 1; k < 4; k++) {
27761 GemmMicrokernelTester()
27762 .mr(1)
27763 .nr(8)
27764 .kr(1)
27765 .sr(4)
27766 .m(1)
27767 .n(8)
27768 .k(k)
27769 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27770 }
27771 }
27772
27773 TEST(F32_IGEMM_1X8S4__SSE, k_lt_4_subtile) {
27774 TEST_REQUIRES_X86_SSE;
27775 for (size_t k = 1; k < 4; k++) {
27776 for (uint32_t m = 1; m <= 1; m++) {
27777 for (uint32_t n = 1; n <= 8; n++) {
27778 GemmMicrokernelTester()
27779 .mr(1)
27780 .nr(8)
27781 .kr(1)
27782 .sr(4)
27783 .m(m)
27784 .n(n)
27785 .k(k)
27786 .iterations(1)
27787 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27788 }
27789 }
27790 }
27791 }
27792
27793 TEST(F32_IGEMM_1X8S4__SSE, k_gt_4) {
27794 TEST_REQUIRES_X86_SSE;
27795 for (size_t k = 5; k < 8; k++) {
27796 GemmMicrokernelTester()
27797 .mr(1)
27798 .nr(8)
27799 .kr(1)
27800 .sr(4)
27801 .m(1)
27802 .n(8)
27803 .k(k)
27804 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27805 }
27806 }
27807
27808 TEST(F32_IGEMM_1X8S4__SSE, k_gt_4_subtile) {
27809 TEST_REQUIRES_X86_SSE;
27810 for (size_t k = 5; k < 8; k++) {
27811 for (uint32_t m = 1; m <= 1; m++) {
27812 for (uint32_t n = 1; n <= 8; n++) {
27813 GemmMicrokernelTester()
27814 .mr(1)
27815 .nr(8)
27816 .kr(1)
27817 .sr(4)
27818 .m(m)
27819 .n(n)
27820 .k(k)
27821 .iterations(1)
27822 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27823 }
27824 }
27825 }
27826 }
27827
27828 TEST(F32_IGEMM_1X8S4__SSE, k_div_4) {
27829 TEST_REQUIRES_X86_SSE;
27830 for (size_t k = 8; k <= 40; k += 4) {
27831 GemmMicrokernelTester()
27832 .mr(1)
27833 .nr(8)
27834 .kr(1)
27835 .sr(4)
27836 .m(1)
27837 .n(8)
27838 .k(k)
27839 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27840 }
27841 }
27842
27843 TEST(F32_IGEMM_1X8S4__SSE, k_div_4_subtile) {
27844 TEST_REQUIRES_X86_SSE;
27845 for (size_t k = 8; k <= 40; k += 4) {
27846 for (uint32_t m = 1; m <= 1; m++) {
27847 for (uint32_t n = 1; n <= 8; n++) {
27848 GemmMicrokernelTester()
27849 .mr(1)
27850 .nr(8)
27851 .kr(1)
27852 .sr(4)
27853 .m(m)
27854 .n(n)
27855 .k(k)
27856 .iterations(1)
27857 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27858 }
27859 }
27860 }
27861 }
27862
27863 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8) {
27864 TEST_REQUIRES_X86_SSE;
27865 for (uint32_t n = 9; n < 16; n++) {
27866 for (size_t k = 1; k <= 20; k += 5) {
27867 GemmMicrokernelTester()
27868 .mr(1)
27869 .nr(8)
27870 .kr(1)
27871 .sr(4)
27872 .m(1)
27873 .n(8)
27874 .k(k)
27875 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27876 }
27877 }
27878 }
27879
27880 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8_strided_cn) {
27881 TEST_REQUIRES_X86_SSE;
27882 for (uint32_t n = 9; n < 16; n++) {
27883 for (size_t k = 1; k <= 20; k += 5) {
27884 GemmMicrokernelTester()
27885 .mr(1)
27886 .nr(8)
27887 .kr(1)
27888 .sr(4)
27889 .m(1)
27890 .n(8)
27891 .k(k)
27892 .cn_stride(11)
27893 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27894 }
27895 }
27896 }
27897
27898 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8_subtile) {
27899 TEST_REQUIRES_X86_SSE;
27900 for (uint32_t n = 9; n < 16; n++) {
27901 for (size_t k = 1; k <= 20; k += 5) {
27902 for (uint32_t m = 1; m <= 1; m++) {
27903 GemmMicrokernelTester()
27904 .mr(1)
27905 .nr(8)
27906 .kr(1)
27907 .sr(4)
27908 .m(m)
27909 .n(n)
27910 .k(k)
27911 .iterations(1)
27912 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27913 }
27914 }
27915 }
27916 }
27917
27918 TEST(F32_IGEMM_1X8S4__SSE, n_div_8) {
27919 TEST_REQUIRES_X86_SSE;
27920 for (uint32_t n = 16; n <= 24; n += 8) {
27921 for (size_t k = 1; k <= 20; k += 5) {
27922 GemmMicrokernelTester()
27923 .mr(1)
27924 .nr(8)
27925 .kr(1)
27926 .sr(4)
27927 .m(1)
27928 .n(8)
27929 .k(k)
27930 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27931 }
27932 }
27933 }
27934
27935 TEST(F32_IGEMM_1X8S4__SSE, n_div_8_strided_cn) {
27936 TEST_REQUIRES_X86_SSE;
27937 for (uint32_t n = 16; n <= 24; n += 8) {
27938 for (size_t k = 1; k <= 20; k += 5) {
27939 GemmMicrokernelTester()
27940 .mr(1)
27941 .nr(8)
27942 .kr(1)
27943 .sr(4)
27944 .m(1)
27945 .n(n)
27946 .k(k)
27947 .cn_stride(11)
27948 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27949 }
27950 }
27951 }
27952
27953 TEST(F32_IGEMM_1X8S4__SSE, n_div_8_subtile) {
27954 TEST_REQUIRES_X86_SSE;
27955 for (uint32_t n = 16; n <= 24; n += 8) {
27956 for (size_t k = 1; k <= 20; k += 5) {
27957 for (uint32_t m = 1; m <= 1; m++) {
27958 GemmMicrokernelTester()
27959 .mr(1)
27960 .nr(8)
27961 .kr(1)
27962 .sr(4)
27963 .m(m)
27964 .n(n)
27965 .k(k)
27966 .iterations(1)
27967 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27968 }
27969 }
27970 }
27971 }
27972
27973 TEST(F32_IGEMM_1X8S4__SSE, small_kernel) {
27974 TEST_REQUIRES_X86_SSE;
27975 for (size_t k = 1; k <= 20; k += 5) {
27976 GemmMicrokernelTester()
27977 .mr(1)
27978 .nr(8)
27979 .kr(1)
27980 .sr(4)
27981 .m(1)
27982 .n(8)
27983 .k(k)
27984 .ks(3)
27985 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
27986 }
27987 }
27988
27989 TEST(F32_IGEMM_1X8S4__SSE, small_kernel_subtile) {
27990 TEST_REQUIRES_X86_SSE;
27991 for (size_t k = 1; k <= 20; k += 5) {
27992 for (uint32_t m = 1; m <= 1; m++) {
27993 for (uint32_t n = 1; n <= 8; n++) {
27994 GemmMicrokernelTester()
27995 .mr(1)
27996 .nr(8)
27997 .kr(1)
27998 .sr(4)
27999 .m(m)
28000 .n(n)
28001 .k(k)
28002 .ks(3)
28003 .iterations(1)
28004 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28005 }
28006 }
28007 }
28008 }
28009
28010 TEST(F32_IGEMM_1X8S4__SSE, n_gt_8_small_kernel) {
28011 TEST_REQUIRES_X86_SSE;
28012 for (uint32_t n = 9; n < 16; n++) {
28013 for (size_t k = 1; k <= 20; k += 5) {
28014 GemmMicrokernelTester()
28015 .mr(1)
28016 .nr(8)
28017 .kr(1)
28018 .sr(4)
28019 .m(1)
28020 .n(8)
28021 .k(k)
28022 .ks(3)
28023 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28024 }
28025 }
28026 }
28027
28028 TEST(F32_IGEMM_1X8S4__SSE, n_div_8_small_kernel) {
28029 TEST_REQUIRES_X86_SSE;
28030 for (uint32_t n = 16; n <= 24; n += 8) {
28031 for (size_t k = 1; k <= 20; k += 5) {
28032 GemmMicrokernelTester()
28033 .mr(1)
28034 .nr(8)
28035 .kr(1)
28036 .sr(4)
28037 .m(1)
28038 .n(8)
28039 .k(k)
28040 .ks(3)
28041 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28042 }
28043 }
28044 }
28045
28046 TEST(F32_IGEMM_1X8S4__SSE, strided_cm_subtile) {
28047 TEST_REQUIRES_X86_SSE;
28048 for (size_t k = 1; k <= 20; k += 5) {
28049 for (uint32_t m = 1; m <= 1; m++) {
28050 for (uint32_t n = 1; n <= 8; n++) {
28051 GemmMicrokernelTester()
28052 .mr(1)
28053 .nr(8)
28054 .kr(1)
28055 .sr(4)
28056 .m(m)
28057 .n(n)
28058 .k(k)
28059 .cm_stride(11)
28060 .iterations(1)
28061 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28062 }
28063 }
28064 }
28065 }
28066
28067 TEST(F32_IGEMM_1X8S4__SSE, a_offset) {
28068 TEST_REQUIRES_X86_SSE;
28069 for (size_t k = 1; k <= 20; k += 5) {
28070 GemmMicrokernelTester()
28071 .mr(1)
28072 .nr(8)
28073 .kr(1)
28074 .sr(4)
28075 .m(1)
28076 .n(8)
28077 .k(k)
28078 .ks(3)
28079 .a_offset(23)
28080 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28081 }
28082 }
28083
28084 TEST(F32_IGEMM_1X8S4__SSE, zero) {
28085 TEST_REQUIRES_X86_SSE;
28086 for (uint32_t mz = 0; mz < 1; mz++) {
28087 for (size_t k = 1; k <= 20; k += 5) {
28088 GemmMicrokernelTester()
28089 .mr(1)
28090 .nr(8)
28091 .kr(1)
28092 .sr(4)
28093 .m(1)
28094 .n(8)
28095 .k(k)
28096 .ks(3)
28097 .a_offset(23)
28098 .zero_index(mz)
28099 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28100 }
28101 }
28102 }
28103
28104 TEST(F32_IGEMM_1X8S4__SSE, qmin) {
28105 TEST_REQUIRES_X86_SSE;
28106 GemmMicrokernelTester()
28107 .mr(1)
28108 .nr(8)
28109 .kr(1)
28110 .sr(4)
28111 .m(1)
28112 .n(8)
28113 .k(4)
28114 .qmin(128)
28115 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28116 }
28117
28118 TEST(F32_IGEMM_1X8S4__SSE, qmax) {
28119 TEST_REQUIRES_X86_SSE;
28120 GemmMicrokernelTester()
28121 .mr(1)
28122 .nr(8)
28123 .kr(1)
28124 .sr(4)
28125 .m(1)
28126 .n(8)
28127 .k(4)
28128 .qmax(128)
28129 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28130 }
28131
28132 TEST(F32_IGEMM_1X8S4__SSE, strided_cm) {
28133 TEST_REQUIRES_X86_SSE;
28134 GemmMicrokernelTester()
28135 .mr(1)
28136 .nr(8)
28137 .kr(1)
28138 .sr(4)
28139 .m(1)
28140 .n(8)
28141 .k(4)
28142 .cm_stride(11)
28143 .Test(xnn_f32_igemm_ukernel_1x8s4__sse);
28144 }
28145#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28146
28147
28148#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28149 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4) {
28150 TEST_REQUIRES_X86_SSE;
28151 GemmMicrokernelTester()
28152 .mr(4)
28153 .nr(8)
28154 .kr(1)
28155 .sr(4)
28156 .m(4)
28157 .n(8)
28158 .k(4)
28159 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28160 }
28161
28162 TEST(F32_IGEMM_4X8S4__SSE, strided_cn) {
28163 TEST_REQUIRES_X86_SSE;
28164 GemmMicrokernelTester()
28165 .mr(4)
28166 .nr(8)
28167 .kr(1)
28168 .sr(4)
28169 .m(4)
28170 .n(8)
28171 .k(4)
28172 .cn_stride(11)
28173 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28174 }
28175
28176 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4_subtile) {
28177 TEST_REQUIRES_X86_SSE;
28178 for (uint32_t m = 1; m <= 4; m++) {
28179 for (uint32_t n = 1; n <= 8; n++) {
28180 GemmMicrokernelTester()
28181 .mr(4)
28182 .nr(8)
28183 .kr(1)
28184 .sr(4)
28185 .m(m)
28186 .n(n)
28187 .k(4)
28188 .iterations(1)
28189 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28190 }
28191 }
28192 }
28193
28194 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4_subtile_m) {
28195 TEST_REQUIRES_X86_SSE;
28196 for (uint32_t m = 1; m <= 4; m++) {
28197 GemmMicrokernelTester()
28198 .mr(4)
28199 .nr(8)
28200 .kr(1)
28201 .sr(4)
28202 .m(m)
28203 .n(8)
28204 .k(4)
28205 .iterations(1)
28206 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28207 }
28208 }
28209
28210 TEST(F32_IGEMM_4X8S4__SSE, k_eq_4_subtile_n) {
28211 TEST_REQUIRES_X86_SSE;
28212 for (uint32_t n = 1; n <= 8; n++) {
28213 GemmMicrokernelTester()
28214 .mr(4)
28215 .nr(8)
28216 .kr(1)
28217 .sr(4)
28218 .m(4)
28219 .n(n)
28220 .k(4)
28221 .iterations(1)
28222 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28223 }
28224 }
28225
28226 TEST(F32_IGEMM_4X8S4__SSE, k_lt_4) {
28227 TEST_REQUIRES_X86_SSE;
28228 for (size_t k = 1; k < 4; k++) {
28229 GemmMicrokernelTester()
28230 .mr(4)
28231 .nr(8)
28232 .kr(1)
28233 .sr(4)
28234 .m(4)
28235 .n(8)
28236 .k(k)
28237 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28238 }
28239 }
28240
28241 TEST(F32_IGEMM_4X8S4__SSE, k_lt_4_subtile) {
28242 TEST_REQUIRES_X86_SSE;
28243 for (size_t k = 1; k < 4; k++) {
28244 for (uint32_t m = 1; m <= 4; m++) {
28245 for (uint32_t n = 1; n <= 8; n++) {
28246 GemmMicrokernelTester()
28247 .mr(4)
28248 .nr(8)
28249 .kr(1)
28250 .sr(4)
28251 .m(m)
28252 .n(n)
28253 .k(k)
28254 .iterations(1)
28255 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28256 }
28257 }
28258 }
28259 }
28260
28261 TEST(F32_IGEMM_4X8S4__SSE, k_gt_4) {
28262 TEST_REQUIRES_X86_SSE;
28263 for (size_t k = 5; k < 8; k++) {
28264 GemmMicrokernelTester()
28265 .mr(4)
28266 .nr(8)
28267 .kr(1)
28268 .sr(4)
28269 .m(4)
28270 .n(8)
28271 .k(k)
28272 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28273 }
28274 }
28275
28276 TEST(F32_IGEMM_4X8S4__SSE, k_gt_4_subtile) {
28277 TEST_REQUIRES_X86_SSE;
28278 for (size_t k = 5; k < 8; k++) {
28279 for (uint32_t m = 1; m <= 4; m++) {
28280 for (uint32_t n = 1; n <= 8; n++) {
28281 GemmMicrokernelTester()
28282 .mr(4)
28283 .nr(8)
28284 .kr(1)
28285 .sr(4)
28286 .m(m)
28287 .n(n)
28288 .k(k)
28289 .iterations(1)
28290 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28291 }
28292 }
28293 }
28294 }
28295
28296 TEST(F32_IGEMM_4X8S4__SSE, k_div_4) {
28297 TEST_REQUIRES_X86_SSE;
28298 for (size_t k = 8; k <= 40; k += 4) {
28299 GemmMicrokernelTester()
28300 .mr(4)
28301 .nr(8)
28302 .kr(1)
28303 .sr(4)
28304 .m(4)
28305 .n(8)
28306 .k(k)
28307 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28308 }
28309 }
28310
28311 TEST(F32_IGEMM_4X8S4__SSE, k_div_4_subtile) {
28312 TEST_REQUIRES_X86_SSE;
28313 for (size_t k = 8; k <= 40; k += 4) {
28314 for (uint32_t m = 1; m <= 4; m++) {
28315 for (uint32_t n = 1; n <= 8; n++) {
28316 GemmMicrokernelTester()
28317 .mr(4)
28318 .nr(8)
28319 .kr(1)
28320 .sr(4)
28321 .m(m)
28322 .n(n)
28323 .k(k)
28324 .iterations(1)
28325 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28326 }
28327 }
28328 }
28329 }
28330
28331 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8) {
28332 TEST_REQUIRES_X86_SSE;
28333 for (uint32_t n = 9; n < 16; n++) {
28334 for (size_t k = 1; k <= 20; k += 5) {
28335 GemmMicrokernelTester()
28336 .mr(4)
28337 .nr(8)
28338 .kr(1)
28339 .sr(4)
28340 .m(4)
28341 .n(8)
28342 .k(k)
28343 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28344 }
28345 }
28346 }
28347
28348 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8_strided_cn) {
28349 TEST_REQUIRES_X86_SSE;
28350 for (uint32_t n = 9; n < 16; n++) {
28351 for (size_t k = 1; k <= 20; k += 5) {
28352 GemmMicrokernelTester()
28353 .mr(4)
28354 .nr(8)
28355 .kr(1)
28356 .sr(4)
28357 .m(4)
28358 .n(8)
28359 .k(k)
28360 .cn_stride(11)
28361 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28362 }
28363 }
28364 }
28365
28366 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8_subtile) {
28367 TEST_REQUIRES_X86_SSE;
28368 for (uint32_t n = 9; n < 16; n++) {
28369 for (size_t k = 1; k <= 20; k += 5) {
28370 for (uint32_t m = 1; m <= 4; m++) {
28371 GemmMicrokernelTester()
28372 .mr(4)
28373 .nr(8)
28374 .kr(1)
28375 .sr(4)
28376 .m(m)
28377 .n(n)
28378 .k(k)
28379 .iterations(1)
28380 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28381 }
28382 }
28383 }
28384 }
28385
28386 TEST(F32_IGEMM_4X8S4__SSE, n_div_8) {
28387 TEST_REQUIRES_X86_SSE;
28388 for (uint32_t n = 16; n <= 24; n += 8) {
28389 for (size_t k = 1; k <= 20; k += 5) {
28390 GemmMicrokernelTester()
28391 .mr(4)
28392 .nr(8)
28393 .kr(1)
28394 .sr(4)
28395 .m(4)
28396 .n(8)
28397 .k(k)
28398 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28399 }
28400 }
28401 }
28402
28403 TEST(F32_IGEMM_4X8S4__SSE, n_div_8_strided_cn) {
28404 TEST_REQUIRES_X86_SSE;
28405 for (uint32_t n = 16; n <= 24; n += 8) {
28406 for (size_t k = 1; k <= 20; k += 5) {
28407 GemmMicrokernelTester()
28408 .mr(4)
28409 .nr(8)
28410 .kr(1)
28411 .sr(4)
28412 .m(4)
28413 .n(n)
28414 .k(k)
28415 .cn_stride(11)
28416 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28417 }
28418 }
28419 }
28420
28421 TEST(F32_IGEMM_4X8S4__SSE, n_div_8_subtile) {
28422 TEST_REQUIRES_X86_SSE;
28423 for (uint32_t n = 16; n <= 24; n += 8) {
28424 for (size_t k = 1; k <= 20; k += 5) {
28425 for (uint32_t m = 1; m <= 4; m++) {
28426 GemmMicrokernelTester()
28427 .mr(4)
28428 .nr(8)
28429 .kr(1)
28430 .sr(4)
28431 .m(m)
28432 .n(n)
28433 .k(k)
28434 .iterations(1)
28435 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28436 }
28437 }
28438 }
28439 }
28440
28441 TEST(F32_IGEMM_4X8S4__SSE, small_kernel) {
28442 TEST_REQUIRES_X86_SSE;
28443 for (size_t k = 1; k <= 20; k += 5) {
28444 GemmMicrokernelTester()
28445 .mr(4)
28446 .nr(8)
28447 .kr(1)
28448 .sr(4)
28449 .m(4)
28450 .n(8)
28451 .k(k)
28452 .ks(3)
28453 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28454 }
28455 }
28456
28457 TEST(F32_IGEMM_4X8S4__SSE, small_kernel_subtile) {
28458 TEST_REQUIRES_X86_SSE;
28459 for (size_t k = 1; k <= 20; k += 5) {
28460 for (uint32_t m = 1; m <= 4; m++) {
28461 for (uint32_t n = 1; n <= 8; n++) {
28462 GemmMicrokernelTester()
28463 .mr(4)
28464 .nr(8)
28465 .kr(1)
28466 .sr(4)
28467 .m(m)
28468 .n(n)
28469 .k(k)
28470 .ks(3)
28471 .iterations(1)
28472 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28473 }
28474 }
28475 }
28476 }
28477
28478 TEST(F32_IGEMM_4X8S4__SSE, n_gt_8_small_kernel) {
28479 TEST_REQUIRES_X86_SSE;
28480 for (uint32_t n = 9; n < 16; n++) {
28481 for (size_t k = 1; k <= 20; k += 5) {
28482 GemmMicrokernelTester()
28483 .mr(4)
28484 .nr(8)
28485 .kr(1)
28486 .sr(4)
28487 .m(4)
28488 .n(8)
28489 .k(k)
28490 .ks(3)
28491 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28492 }
28493 }
28494 }
28495
28496 TEST(F32_IGEMM_4X8S4__SSE, n_div_8_small_kernel) {
28497 TEST_REQUIRES_X86_SSE;
28498 for (uint32_t n = 16; n <= 24; n += 8) {
28499 for (size_t k = 1; k <= 20; k += 5) {
28500 GemmMicrokernelTester()
28501 .mr(4)
28502 .nr(8)
28503 .kr(1)
28504 .sr(4)
28505 .m(4)
28506 .n(8)
28507 .k(k)
28508 .ks(3)
28509 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28510 }
28511 }
28512 }
28513
28514 TEST(F32_IGEMM_4X8S4__SSE, strided_cm_subtile) {
28515 TEST_REQUIRES_X86_SSE;
28516 for (size_t k = 1; k <= 20; k += 5) {
28517 for (uint32_t m = 1; m <= 4; m++) {
28518 for (uint32_t n = 1; n <= 8; n++) {
28519 GemmMicrokernelTester()
28520 .mr(4)
28521 .nr(8)
28522 .kr(1)
28523 .sr(4)
28524 .m(m)
28525 .n(n)
28526 .k(k)
28527 .cm_stride(11)
28528 .iterations(1)
28529 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28530 }
28531 }
28532 }
28533 }
28534
28535 TEST(F32_IGEMM_4X8S4__SSE, a_offset) {
28536 TEST_REQUIRES_X86_SSE;
28537 for (size_t k = 1; k <= 20; k += 5) {
28538 GemmMicrokernelTester()
28539 .mr(4)
28540 .nr(8)
28541 .kr(1)
28542 .sr(4)
28543 .m(4)
28544 .n(8)
28545 .k(k)
28546 .ks(3)
28547 .a_offset(83)
28548 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28549 }
28550 }
28551
28552 TEST(F32_IGEMM_4X8S4__SSE, zero) {
28553 TEST_REQUIRES_X86_SSE;
28554 for (uint32_t mz = 0; mz < 4; mz++) {
28555 for (size_t k = 1; k <= 20; k += 5) {
28556 GemmMicrokernelTester()
28557 .mr(4)
28558 .nr(8)
28559 .kr(1)
28560 .sr(4)
28561 .m(4)
28562 .n(8)
28563 .k(k)
28564 .ks(3)
28565 .a_offset(83)
28566 .zero_index(mz)
28567 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28568 }
28569 }
28570 }
28571
28572 TEST(F32_IGEMM_4X8S4__SSE, qmin) {
28573 TEST_REQUIRES_X86_SSE;
28574 GemmMicrokernelTester()
28575 .mr(4)
28576 .nr(8)
28577 .kr(1)
28578 .sr(4)
28579 .m(4)
28580 .n(8)
28581 .k(4)
28582 .qmin(128)
28583 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28584 }
28585
28586 TEST(F32_IGEMM_4X8S4__SSE, qmax) {
28587 TEST_REQUIRES_X86_SSE;
28588 GemmMicrokernelTester()
28589 .mr(4)
28590 .nr(8)
28591 .kr(1)
28592 .sr(4)
28593 .m(4)
28594 .n(8)
28595 .k(4)
28596 .qmax(128)
28597 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28598 }
28599
28600 TEST(F32_IGEMM_4X8S4__SSE, strided_cm) {
28601 TEST_REQUIRES_X86_SSE;
28602 GemmMicrokernelTester()
28603 .mr(4)
28604 .nr(8)
28605 .kr(1)
28606 .sr(4)
28607 .m(4)
28608 .n(8)
28609 .k(4)
28610 .cm_stride(11)
28611 .Test(xnn_f32_igemm_ukernel_4x8s4__sse);
28612 }
28613#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28614
28615
28616#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28617 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4) {
28618 TEST_REQUIRES_X86_SSE;
28619 GemmMicrokernelTester()
28620 .mr(4)
28621 .nr(2)
28622 .kr(4)
28623 .sr(1)
28624 .m(4)
28625 .n(2)
28626 .k(4)
28627 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28628 }
28629
28630 TEST(F32_IGEMM_4X2C4__SSE, strided_cn) {
28631 TEST_REQUIRES_X86_SSE;
28632 GemmMicrokernelTester()
28633 .mr(4)
28634 .nr(2)
28635 .kr(4)
28636 .sr(1)
28637 .m(4)
28638 .n(2)
28639 .k(4)
28640 .cn_stride(5)
28641 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28642 }
28643
28644 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4_subtile) {
28645 TEST_REQUIRES_X86_SSE;
28646 for (uint32_t m = 1; m <= 4; m++) {
28647 for (uint32_t n = 1; n <= 2; n++) {
28648 GemmMicrokernelTester()
28649 .mr(4)
28650 .nr(2)
28651 .kr(4)
28652 .sr(1)
28653 .m(m)
28654 .n(n)
28655 .k(4)
28656 .iterations(1)
28657 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28658 }
28659 }
28660 }
28661
28662 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4_subtile_m) {
28663 TEST_REQUIRES_X86_SSE;
28664 for (uint32_t m = 1; m <= 4; m++) {
28665 GemmMicrokernelTester()
28666 .mr(4)
28667 .nr(2)
28668 .kr(4)
28669 .sr(1)
28670 .m(m)
28671 .n(2)
28672 .k(4)
28673 .iterations(1)
28674 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28675 }
28676 }
28677
28678 TEST(F32_IGEMM_4X2C4__SSE, k_eq_4_subtile_n) {
28679 TEST_REQUIRES_X86_SSE;
28680 for (uint32_t n = 1; n <= 2; n++) {
28681 GemmMicrokernelTester()
28682 .mr(4)
28683 .nr(2)
28684 .kr(4)
28685 .sr(1)
28686 .m(4)
28687 .n(n)
28688 .k(4)
28689 .iterations(1)
28690 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28691 }
28692 }
28693
28694 TEST(F32_IGEMM_4X2C4__SSE, k_lt_4) {
28695 TEST_REQUIRES_X86_SSE;
28696 for (size_t k = 1; k < 4; k++) {
28697 GemmMicrokernelTester()
28698 .mr(4)
28699 .nr(2)
28700 .kr(4)
28701 .sr(1)
28702 .m(4)
28703 .n(2)
28704 .k(k)
28705 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28706 }
28707 }
28708
28709 TEST(F32_IGEMM_4X2C4__SSE, k_lt_4_subtile) {
28710 TEST_REQUIRES_X86_SSE;
28711 for (size_t k = 1; k < 4; k++) {
28712 for (uint32_t m = 1; m <= 4; m++) {
28713 for (uint32_t n = 1; n <= 2; n++) {
28714 GemmMicrokernelTester()
28715 .mr(4)
28716 .nr(2)
28717 .kr(4)
28718 .sr(1)
28719 .m(m)
28720 .n(n)
28721 .k(k)
28722 .iterations(1)
28723 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28724 }
28725 }
28726 }
28727 }
28728
28729 TEST(F32_IGEMM_4X2C4__SSE, k_gt_4) {
28730 TEST_REQUIRES_X86_SSE;
28731 for (size_t k = 5; k < 8; k++) {
28732 GemmMicrokernelTester()
28733 .mr(4)
28734 .nr(2)
28735 .kr(4)
28736 .sr(1)
28737 .m(4)
28738 .n(2)
28739 .k(k)
28740 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28741 }
28742 }
28743
28744 TEST(F32_IGEMM_4X2C4__SSE, k_gt_4_subtile) {
28745 TEST_REQUIRES_X86_SSE;
28746 for (size_t k = 5; k < 8; k++) {
28747 for (uint32_t m = 1; m <= 4; m++) {
28748 for (uint32_t n = 1; n <= 2; n++) {
28749 GemmMicrokernelTester()
28750 .mr(4)
28751 .nr(2)
28752 .kr(4)
28753 .sr(1)
28754 .m(m)
28755 .n(n)
28756 .k(k)
28757 .iterations(1)
28758 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28759 }
28760 }
28761 }
28762 }
28763
28764 TEST(F32_IGEMM_4X2C4__SSE, k_div_4) {
28765 TEST_REQUIRES_X86_SSE;
28766 for (size_t k = 8; k <= 40; k += 4) {
28767 GemmMicrokernelTester()
28768 .mr(4)
28769 .nr(2)
28770 .kr(4)
28771 .sr(1)
28772 .m(4)
28773 .n(2)
28774 .k(k)
28775 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28776 }
28777 }
28778
28779 TEST(F32_IGEMM_4X2C4__SSE, k_div_4_subtile) {
28780 TEST_REQUIRES_X86_SSE;
28781 for (size_t k = 8; k <= 40; k += 4) {
28782 for (uint32_t m = 1; m <= 4; m++) {
28783 for (uint32_t n = 1; n <= 2; n++) {
28784 GemmMicrokernelTester()
28785 .mr(4)
28786 .nr(2)
28787 .kr(4)
28788 .sr(1)
28789 .m(m)
28790 .n(n)
28791 .k(k)
28792 .iterations(1)
28793 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28794 }
28795 }
28796 }
28797 }
28798
28799 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2) {
28800 TEST_REQUIRES_X86_SSE;
28801 for (uint32_t n = 3; n < 4; n++) {
28802 for (size_t k = 1; k <= 20; k += 5) {
28803 GemmMicrokernelTester()
28804 .mr(4)
28805 .nr(2)
28806 .kr(4)
28807 .sr(1)
28808 .m(4)
28809 .n(2)
28810 .k(k)
28811 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28812 }
28813 }
28814 }
28815
28816 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2_strided_cn) {
28817 TEST_REQUIRES_X86_SSE;
28818 for (uint32_t n = 3; n < 4; n++) {
28819 for (size_t k = 1; k <= 20; k += 5) {
28820 GemmMicrokernelTester()
28821 .mr(4)
28822 .nr(2)
28823 .kr(4)
28824 .sr(1)
28825 .m(4)
28826 .n(2)
28827 .k(k)
28828 .cn_stride(5)
28829 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28830 }
28831 }
28832 }
28833
28834 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2_subtile) {
28835 TEST_REQUIRES_X86_SSE;
28836 for (uint32_t n = 3; n < 4; n++) {
28837 for (size_t k = 1; k <= 20; k += 5) {
28838 for (uint32_t m = 1; m <= 4; m++) {
28839 GemmMicrokernelTester()
28840 .mr(4)
28841 .nr(2)
28842 .kr(4)
28843 .sr(1)
28844 .m(m)
28845 .n(n)
28846 .k(k)
28847 .iterations(1)
28848 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28849 }
28850 }
28851 }
28852 }
28853
28854 TEST(F32_IGEMM_4X2C4__SSE, n_div_2) {
28855 TEST_REQUIRES_X86_SSE;
28856 for (uint32_t n = 4; n <= 6; n += 2) {
28857 for (size_t k = 1; k <= 20; k += 5) {
28858 GemmMicrokernelTester()
28859 .mr(4)
28860 .nr(2)
28861 .kr(4)
28862 .sr(1)
28863 .m(4)
28864 .n(2)
28865 .k(k)
28866 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28867 }
28868 }
28869 }
28870
28871 TEST(F32_IGEMM_4X2C4__SSE, n_div_2_strided_cn) {
28872 TEST_REQUIRES_X86_SSE;
28873 for (uint32_t n = 4; n <= 6; n += 2) {
28874 for (size_t k = 1; k <= 20; k += 5) {
28875 GemmMicrokernelTester()
28876 .mr(4)
28877 .nr(2)
28878 .kr(4)
28879 .sr(1)
28880 .m(4)
28881 .n(n)
28882 .k(k)
28883 .cn_stride(5)
28884 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28885 }
28886 }
28887 }
28888
28889 TEST(F32_IGEMM_4X2C4__SSE, n_div_2_subtile) {
28890 TEST_REQUIRES_X86_SSE;
28891 for (uint32_t n = 4; n <= 6; n += 2) {
28892 for (size_t k = 1; k <= 20; k += 5) {
28893 for (uint32_t m = 1; m <= 4; m++) {
28894 GemmMicrokernelTester()
28895 .mr(4)
28896 .nr(2)
28897 .kr(4)
28898 .sr(1)
28899 .m(m)
28900 .n(n)
28901 .k(k)
28902 .iterations(1)
28903 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28904 }
28905 }
28906 }
28907 }
28908
28909 TEST(F32_IGEMM_4X2C4__SSE, small_kernel) {
28910 TEST_REQUIRES_X86_SSE;
28911 for (size_t k = 1; k <= 20; k += 5) {
28912 GemmMicrokernelTester()
28913 .mr(4)
28914 .nr(2)
28915 .kr(4)
28916 .sr(1)
28917 .m(4)
28918 .n(2)
28919 .k(k)
28920 .ks(3)
28921 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28922 }
28923 }
28924
28925 TEST(F32_IGEMM_4X2C4__SSE, small_kernel_subtile) {
28926 TEST_REQUIRES_X86_SSE;
28927 for (size_t k = 1; k <= 20; k += 5) {
28928 for (uint32_t m = 1; m <= 4; m++) {
28929 for (uint32_t n = 1; n <= 2; n++) {
28930 GemmMicrokernelTester()
28931 .mr(4)
28932 .nr(2)
28933 .kr(4)
28934 .sr(1)
28935 .m(m)
28936 .n(n)
28937 .k(k)
28938 .ks(3)
28939 .iterations(1)
28940 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28941 }
28942 }
28943 }
28944 }
28945
28946 TEST(F32_IGEMM_4X2C4__SSE, n_gt_2_small_kernel) {
28947 TEST_REQUIRES_X86_SSE;
28948 for (uint32_t n = 3; n < 4; n++) {
28949 for (size_t k = 1; k <= 20; k += 5) {
28950 GemmMicrokernelTester()
28951 .mr(4)
28952 .nr(2)
28953 .kr(4)
28954 .sr(1)
28955 .m(4)
28956 .n(2)
28957 .k(k)
28958 .ks(3)
28959 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28960 }
28961 }
28962 }
28963
28964 TEST(F32_IGEMM_4X2C4__SSE, n_div_2_small_kernel) {
28965 TEST_REQUIRES_X86_SSE;
28966 for (uint32_t n = 4; n <= 6; n += 2) {
28967 for (size_t k = 1; k <= 20; k += 5) {
28968 GemmMicrokernelTester()
28969 .mr(4)
28970 .nr(2)
28971 .kr(4)
28972 .sr(1)
28973 .m(4)
28974 .n(2)
28975 .k(k)
28976 .ks(3)
28977 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28978 }
28979 }
28980 }
28981
28982 TEST(F32_IGEMM_4X2C4__SSE, strided_cm_subtile) {
28983 TEST_REQUIRES_X86_SSE;
28984 for (size_t k = 1; k <= 20; k += 5) {
28985 for (uint32_t m = 1; m <= 4; m++) {
28986 for (uint32_t n = 1; n <= 2; n++) {
28987 GemmMicrokernelTester()
28988 .mr(4)
28989 .nr(2)
28990 .kr(4)
28991 .sr(1)
28992 .m(m)
28993 .n(n)
28994 .k(k)
28995 .cm_stride(5)
28996 .iterations(1)
28997 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
28998 }
28999 }
29000 }
29001 }
29002
29003 TEST(F32_IGEMM_4X2C4__SSE, a_offset) {
29004 TEST_REQUIRES_X86_SSE;
29005 for (size_t k = 1; k <= 20; k += 5) {
29006 GemmMicrokernelTester()
29007 .mr(4)
29008 .nr(2)
29009 .kr(4)
29010 .sr(1)
29011 .m(4)
29012 .n(2)
29013 .k(k)
29014 .ks(3)
29015 .a_offset(83)
29016 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
29017 }
29018 }
29019
29020 TEST(F32_IGEMM_4X2C4__SSE, zero) {
29021 TEST_REQUIRES_X86_SSE;
29022 for (uint32_t mz = 0; mz < 4; mz++) {
29023 for (size_t k = 1; k <= 20; k += 5) {
29024 GemmMicrokernelTester()
29025 .mr(4)
29026 .nr(2)
29027 .kr(4)
29028 .sr(1)
29029 .m(4)
29030 .n(2)
29031 .k(k)
29032 .ks(3)
29033 .a_offset(83)
29034 .zero_index(mz)
29035 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
29036 }
29037 }
29038 }
29039
29040 TEST(F32_IGEMM_4X2C4__SSE, qmin) {
29041 TEST_REQUIRES_X86_SSE;
29042 GemmMicrokernelTester()
29043 .mr(4)
29044 .nr(2)
29045 .kr(4)
29046 .sr(1)
29047 .m(4)
29048 .n(2)
29049 .k(4)
29050 .qmin(128)
29051 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
29052 }
29053
29054 TEST(F32_IGEMM_4X2C4__SSE, qmax) {
29055 TEST_REQUIRES_X86_SSE;
29056 GemmMicrokernelTester()
29057 .mr(4)
29058 .nr(2)
29059 .kr(4)
29060 .sr(1)
29061 .m(4)
29062 .n(2)
29063 .k(4)
29064 .qmax(128)
29065 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
29066 }
29067
29068 TEST(F32_IGEMM_4X2C4__SSE, strided_cm) {
29069 TEST_REQUIRES_X86_SSE;
29070 GemmMicrokernelTester()
29071 .mr(4)
29072 .nr(2)
29073 .kr(4)
29074 .sr(1)
29075 .m(4)
29076 .n(2)
29077 .k(4)
29078 .cm_stride(5)
29079 .Test(xnn_f32_igemm_ukernel_4x2c4__sse);
29080 }
29081#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29082
29083
29084#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
29085 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4) {
29086 TEST_REQUIRES_PSIMD;
29087 GemmMicrokernelTester()
29088 .mr(4)
29089 .nr(2)
29090 .kr(4)
29091 .sr(1)
29092 .m(4)
29093 .n(2)
29094 .k(4)
29095 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29096 }
29097
29098 TEST(F32_IGEMM_4X2C4__PSIMD, strided_cn) {
29099 TEST_REQUIRES_PSIMD;
29100 GemmMicrokernelTester()
29101 .mr(4)
29102 .nr(2)
29103 .kr(4)
29104 .sr(1)
29105 .m(4)
29106 .n(2)
29107 .k(4)
29108 .cn_stride(5)
29109 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29110 }
29111
29112 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4_subtile) {
29113 TEST_REQUIRES_PSIMD;
29114 for (uint32_t m = 1; m <= 4; m++) {
29115 for (uint32_t n = 1; n <= 2; n++) {
29116 GemmMicrokernelTester()
29117 .mr(4)
29118 .nr(2)
29119 .kr(4)
29120 .sr(1)
29121 .m(m)
29122 .n(n)
29123 .k(4)
29124 .iterations(1)
29125 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29126 }
29127 }
29128 }
29129
29130 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4_subtile_m) {
29131 TEST_REQUIRES_PSIMD;
29132 for (uint32_t m = 1; m <= 4; m++) {
29133 GemmMicrokernelTester()
29134 .mr(4)
29135 .nr(2)
29136 .kr(4)
29137 .sr(1)
29138 .m(m)
29139 .n(2)
29140 .k(4)
29141 .iterations(1)
29142 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29143 }
29144 }
29145
29146 TEST(F32_IGEMM_4X2C4__PSIMD, k_eq_4_subtile_n) {
29147 TEST_REQUIRES_PSIMD;
29148 for (uint32_t n = 1; n <= 2; n++) {
29149 GemmMicrokernelTester()
29150 .mr(4)
29151 .nr(2)
29152 .kr(4)
29153 .sr(1)
29154 .m(4)
29155 .n(n)
29156 .k(4)
29157 .iterations(1)
29158 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29159 }
29160 }
29161
29162 TEST(F32_IGEMM_4X2C4__PSIMD, k_lt_4) {
29163 TEST_REQUIRES_PSIMD;
29164 for (size_t k = 1; k < 4; k++) {
29165 GemmMicrokernelTester()
29166 .mr(4)
29167 .nr(2)
29168 .kr(4)
29169 .sr(1)
29170 .m(4)
29171 .n(2)
29172 .k(k)
29173 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29174 }
29175 }
29176
29177 TEST(F32_IGEMM_4X2C4__PSIMD, k_lt_4_subtile) {
29178 TEST_REQUIRES_PSIMD;
29179 for (size_t k = 1; k < 4; k++) {
29180 for (uint32_t m = 1; m <= 4; m++) {
29181 for (uint32_t n = 1; n <= 2; n++) {
29182 GemmMicrokernelTester()
29183 .mr(4)
29184 .nr(2)
29185 .kr(4)
29186 .sr(1)
29187 .m(m)
29188 .n(n)
29189 .k(k)
29190 .iterations(1)
29191 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29192 }
29193 }
29194 }
29195 }
29196
29197 TEST(F32_IGEMM_4X2C4__PSIMD, k_gt_4) {
29198 TEST_REQUIRES_PSIMD;
29199 for (size_t k = 5; k < 8; k++) {
29200 GemmMicrokernelTester()
29201 .mr(4)
29202 .nr(2)
29203 .kr(4)
29204 .sr(1)
29205 .m(4)
29206 .n(2)
29207 .k(k)
29208 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29209 }
29210 }
29211
29212 TEST(F32_IGEMM_4X2C4__PSIMD, k_gt_4_subtile) {
29213 TEST_REQUIRES_PSIMD;
29214 for (size_t k = 5; k < 8; k++) {
29215 for (uint32_t m = 1; m <= 4; m++) {
29216 for (uint32_t n = 1; n <= 2; n++) {
29217 GemmMicrokernelTester()
29218 .mr(4)
29219 .nr(2)
29220 .kr(4)
29221 .sr(1)
29222 .m(m)
29223 .n(n)
29224 .k(k)
29225 .iterations(1)
29226 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29227 }
29228 }
29229 }
29230 }
29231
29232 TEST(F32_IGEMM_4X2C4__PSIMD, k_div_4) {
29233 TEST_REQUIRES_PSIMD;
29234 for (size_t k = 8; k <= 40; k += 4) {
29235 GemmMicrokernelTester()
29236 .mr(4)
29237 .nr(2)
29238 .kr(4)
29239 .sr(1)
29240 .m(4)
29241 .n(2)
29242 .k(k)
29243 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29244 }
29245 }
29246
29247 TEST(F32_IGEMM_4X2C4__PSIMD, k_div_4_subtile) {
29248 TEST_REQUIRES_PSIMD;
29249 for (size_t k = 8; k <= 40; k += 4) {
29250 for (uint32_t m = 1; m <= 4; m++) {
29251 for (uint32_t n = 1; n <= 2; n++) {
29252 GemmMicrokernelTester()
29253 .mr(4)
29254 .nr(2)
29255 .kr(4)
29256 .sr(1)
29257 .m(m)
29258 .n(n)
29259 .k(k)
29260 .iterations(1)
29261 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29262 }
29263 }
29264 }
29265 }
29266
29267 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2) {
29268 TEST_REQUIRES_PSIMD;
29269 for (uint32_t n = 3; n < 4; n++) {
29270 for (size_t k = 1; k <= 20; k += 5) {
29271 GemmMicrokernelTester()
29272 .mr(4)
29273 .nr(2)
29274 .kr(4)
29275 .sr(1)
29276 .m(4)
29277 .n(2)
29278 .k(k)
29279 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29280 }
29281 }
29282 }
29283
29284 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2_strided_cn) {
29285 TEST_REQUIRES_PSIMD;
29286 for (uint32_t n = 3; n < 4; n++) {
29287 for (size_t k = 1; k <= 20; k += 5) {
29288 GemmMicrokernelTester()
29289 .mr(4)
29290 .nr(2)
29291 .kr(4)
29292 .sr(1)
29293 .m(4)
29294 .n(2)
29295 .k(k)
29296 .cn_stride(5)
29297 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29298 }
29299 }
29300 }
29301
29302 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2_subtile) {
29303 TEST_REQUIRES_PSIMD;
29304 for (uint32_t n = 3; n < 4; n++) {
29305 for (size_t k = 1; k <= 20; k += 5) {
29306 for (uint32_t m = 1; m <= 4; m++) {
29307 GemmMicrokernelTester()
29308 .mr(4)
29309 .nr(2)
29310 .kr(4)
29311 .sr(1)
29312 .m(m)
29313 .n(n)
29314 .k(k)
29315 .iterations(1)
29316 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29317 }
29318 }
29319 }
29320 }
29321
29322 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2) {
29323 TEST_REQUIRES_PSIMD;
29324 for (uint32_t n = 4; n <= 6; n += 2) {
29325 for (size_t k = 1; k <= 20; k += 5) {
29326 GemmMicrokernelTester()
29327 .mr(4)
29328 .nr(2)
29329 .kr(4)
29330 .sr(1)
29331 .m(4)
29332 .n(2)
29333 .k(k)
29334 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29335 }
29336 }
29337 }
29338
29339 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2_strided_cn) {
29340 TEST_REQUIRES_PSIMD;
29341 for (uint32_t n = 4; n <= 6; n += 2) {
29342 for (size_t k = 1; k <= 20; k += 5) {
29343 GemmMicrokernelTester()
29344 .mr(4)
29345 .nr(2)
29346 .kr(4)
29347 .sr(1)
29348 .m(4)
29349 .n(n)
29350 .k(k)
29351 .cn_stride(5)
29352 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29353 }
29354 }
29355 }
29356
29357 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2_subtile) {
29358 TEST_REQUIRES_PSIMD;
29359 for (uint32_t n = 4; n <= 6; n += 2) {
29360 for (size_t k = 1; k <= 20; k += 5) {
29361 for (uint32_t m = 1; m <= 4; m++) {
29362 GemmMicrokernelTester()
29363 .mr(4)
29364 .nr(2)
29365 .kr(4)
29366 .sr(1)
29367 .m(m)
29368 .n(n)
29369 .k(k)
29370 .iterations(1)
29371 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29372 }
29373 }
29374 }
29375 }
29376
29377 TEST(F32_IGEMM_4X2C4__PSIMD, small_kernel) {
29378 TEST_REQUIRES_PSIMD;
29379 for (size_t k = 1; k <= 20; k += 5) {
29380 GemmMicrokernelTester()
29381 .mr(4)
29382 .nr(2)
29383 .kr(4)
29384 .sr(1)
29385 .m(4)
29386 .n(2)
29387 .k(k)
29388 .ks(3)
29389 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29390 }
29391 }
29392
29393 TEST(F32_IGEMM_4X2C4__PSIMD, small_kernel_subtile) {
29394 TEST_REQUIRES_PSIMD;
29395 for (size_t k = 1; k <= 20; k += 5) {
29396 for (uint32_t m = 1; m <= 4; m++) {
29397 for (uint32_t n = 1; n <= 2; n++) {
29398 GemmMicrokernelTester()
29399 .mr(4)
29400 .nr(2)
29401 .kr(4)
29402 .sr(1)
29403 .m(m)
29404 .n(n)
29405 .k(k)
29406 .ks(3)
29407 .iterations(1)
29408 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29409 }
29410 }
29411 }
29412 }
29413
29414 TEST(F32_IGEMM_4X2C4__PSIMD, n_gt_2_small_kernel) {
29415 TEST_REQUIRES_PSIMD;
29416 for (uint32_t n = 3; n < 4; n++) {
29417 for (size_t k = 1; k <= 20; k += 5) {
29418 GemmMicrokernelTester()
29419 .mr(4)
29420 .nr(2)
29421 .kr(4)
29422 .sr(1)
29423 .m(4)
29424 .n(2)
29425 .k(k)
29426 .ks(3)
29427 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29428 }
29429 }
29430 }
29431
29432 TEST(F32_IGEMM_4X2C4__PSIMD, n_div_2_small_kernel) {
29433 TEST_REQUIRES_PSIMD;
29434 for (uint32_t n = 4; n <= 6; n += 2) {
29435 for (size_t k = 1; k <= 20; k += 5) {
29436 GemmMicrokernelTester()
29437 .mr(4)
29438 .nr(2)
29439 .kr(4)
29440 .sr(1)
29441 .m(4)
29442 .n(2)
29443 .k(k)
29444 .ks(3)
29445 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29446 }
29447 }
29448 }
29449
29450 TEST(F32_IGEMM_4X2C4__PSIMD, strided_cm_subtile) {
29451 TEST_REQUIRES_PSIMD;
29452 for (size_t k = 1; k <= 20; k += 5) {
29453 for (uint32_t m = 1; m <= 4; m++) {
29454 for (uint32_t n = 1; n <= 2; n++) {
29455 GemmMicrokernelTester()
29456 .mr(4)
29457 .nr(2)
29458 .kr(4)
29459 .sr(1)
29460 .m(m)
29461 .n(n)
29462 .k(k)
29463 .cm_stride(5)
29464 .iterations(1)
29465 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29466 }
29467 }
29468 }
29469 }
29470
29471 TEST(F32_IGEMM_4X2C4__PSIMD, a_offset) {
29472 TEST_REQUIRES_PSIMD;
29473 for (size_t k = 1; k <= 20; k += 5) {
29474 GemmMicrokernelTester()
29475 .mr(4)
29476 .nr(2)
29477 .kr(4)
29478 .sr(1)
29479 .m(4)
29480 .n(2)
29481 .k(k)
29482 .ks(3)
29483 .a_offset(83)
29484 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29485 }
29486 }
29487
29488 TEST(F32_IGEMM_4X2C4__PSIMD, zero) {
29489 TEST_REQUIRES_PSIMD;
29490 for (uint32_t mz = 0; mz < 4; mz++) {
29491 for (size_t k = 1; k <= 20; k += 5) {
29492 GemmMicrokernelTester()
29493 .mr(4)
29494 .nr(2)
29495 .kr(4)
29496 .sr(1)
29497 .m(4)
29498 .n(2)
29499 .k(k)
29500 .ks(3)
29501 .a_offset(83)
29502 .zero_index(mz)
29503 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29504 }
29505 }
29506 }
29507
29508 TEST(F32_IGEMM_4X2C4__PSIMD, qmin) {
29509 TEST_REQUIRES_PSIMD;
29510 GemmMicrokernelTester()
29511 .mr(4)
29512 .nr(2)
29513 .kr(4)
29514 .sr(1)
29515 .m(4)
29516 .n(2)
29517 .k(4)
29518 .qmin(128)
29519 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29520 }
29521
29522 TEST(F32_IGEMM_4X2C4__PSIMD, qmax) {
29523 TEST_REQUIRES_PSIMD;
29524 GemmMicrokernelTester()
29525 .mr(4)
29526 .nr(2)
29527 .kr(4)
29528 .sr(1)
29529 .m(4)
29530 .n(2)
29531 .k(4)
29532 .qmax(128)
29533 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29534 }
29535
29536 TEST(F32_IGEMM_4X2C4__PSIMD, strided_cm) {
29537 TEST_REQUIRES_PSIMD;
29538 GemmMicrokernelTester()
29539 .mr(4)
29540 .nr(2)
29541 .kr(4)
29542 .sr(1)
29543 .m(4)
29544 .n(2)
29545 .k(4)
29546 .cm_stride(5)
29547 .Test(xnn_f32_igemm_ukernel_4x2c4__psimd, GemmMicrokernelTester::Variant::Scalar);
29548 }
29549#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
29550
29551
29552#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29553 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1) {
29554 TEST_REQUIRES_X86_AVX;
29555 GemmMicrokernelTester()
29556 .mr(1)
29557 .nr(8)
29558 .kr(1)
29559 .sr(1)
29560 .m(1)
29561 .n(8)
29562 .k(1)
29563 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29564 }
29565
29566 TEST(F32_IGEMM_1X8__AVX_BROADCAST, strided_cn) {
29567 TEST_REQUIRES_X86_AVX;
29568 GemmMicrokernelTester()
29569 .mr(1)
29570 .nr(8)
29571 .kr(1)
29572 .sr(1)
29573 .m(1)
29574 .n(8)
29575 .k(1)
29576 .cn_stride(11)
29577 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29578 }
29579
29580 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1_subtile) {
29581 TEST_REQUIRES_X86_AVX;
29582 for (uint32_t m = 1; m <= 1; m++) {
29583 for (uint32_t n = 1; n <= 8; n++) {
29584 GemmMicrokernelTester()
29585 .mr(1)
29586 .nr(8)
29587 .kr(1)
29588 .sr(1)
29589 .m(m)
29590 .n(n)
29591 .k(1)
29592 .iterations(1)
29593 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29594 }
29595 }
29596 }
29597
29598 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
29599 TEST_REQUIRES_X86_AVX;
29600 for (uint32_t m = 1; m <= 1; m++) {
29601 GemmMicrokernelTester()
29602 .mr(1)
29603 .nr(8)
29604 .kr(1)
29605 .sr(1)
29606 .m(m)
29607 .n(8)
29608 .k(1)
29609 .iterations(1)
29610 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29611 }
29612 }
29613
29614 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
29615 TEST_REQUIRES_X86_AVX;
29616 for (uint32_t n = 1; n <= 8; n++) {
29617 GemmMicrokernelTester()
29618 .mr(1)
29619 .nr(8)
29620 .kr(1)
29621 .sr(1)
29622 .m(1)
29623 .n(n)
29624 .k(1)
29625 .iterations(1)
29626 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29627 }
29628 }
29629
29630 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_gt_1) {
29631 TEST_REQUIRES_X86_AVX;
29632 for (size_t k = 2; k < 10; k++) {
29633 GemmMicrokernelTester()
29634 .mr(1)
29635 .nr(8)
29636 .kr(1)
29637 .sr(1)
29638 .m(1)
29639 .n(8)
29640 .k(k)
29641 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29642 }
29643 }
29644
29645 TEST(F32_IGEMM_1X8__AVX_BROADCAST, k_gt_1_subtile) {
29646 TEST_REQUIRES_X86_AVX;
29647 for (size_t k = 2; k < 10; k++) {
29648 for (uint32_t m = 1; m <= 1; m++) {
29649 for (uint32_t n = 1; n <= 8; n++) {
29650 GemmMicrokernelTester()
29651 .mr(1)
29652 .nr(8)
29653 .kr(1)
29654 .sr(1)
29655 .m(m)
29656 .n(n)
29657 .k(k)
29658 .iterations(1)
29659 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29660 }
29661 }
29662 }
29663 }
29664
29665 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8) {
29666 TEST_REQUIRES_X86_AVX;
29667 for (uint32_t n = 9; n < 16; n++) {
29668 for (size_t k = 1; k <= 5; k += 2) {
29669 GemmMicrokernelTester()
29670 .mr(1)
29671 .nr(8)
29672 .kr(1)
29673 .sr(1)
29674 .m(1)
29675 .n(8)
29676 .k(k)
29677 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29678 }
29679 }
29680 }
29681
29682 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
29683 TEST_REQUIRES_X86_AVX;
29684 for (uint32_t n = 9; n < 16; n++) {
29685 for (size_t k = 1; k <= 5; k += 2) {
29686 GemmMicrokernelTester()
29687 .mr(1)
29688 .nr(8)
29689 .kr(1)
29690 .sr(1)
29691 .m(1)
29692 .n(8)
29693 .k(k)
29694 .cn_stride(11)
29695 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29696 }
29697 }
29698 }
29699
29700 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8_subtile) {
29701 TEST_REQUIRES_X86_AVX;
29702 for (uint32_t n = 9; n < 16; n++) {
29703 for (size_t k = 1; k <= 5; k += 2) {
29704 for (uint32_t m = 1; m <= 1; m++) {
29705 GemmMicrokernelTester()
29706 .mr(1)
29707 .nr(8)
29708 .kr(1)
29709 .sr(1)
29710 .m(m)
29711 .n(n)
29712 .k(k)
29713 .iterations(1)
29714 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29715 }
29716 }
29717 }
29718 }
29719
29720 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8) {
29721 TEST_REQUIRES_X86_AVX;
29722 for (uint32_t n = 16; n <= 24; n += 8) {
29723 for (size_t k = 1; k <= 5; k += 2) {
29724 GemmMicrokernelTester()
29725 .mr(1)
29726 .nr(8)
29727 .kr(1)
29728 .sr(1)
29729 .m(1)
29730 .n(8)
29731 .k(k)
29732 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29733 }
29734 }
29735 }
29736
29737 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
29738 TEST_REQUIRES_X86_AVX;
29739 for (uint32_t n = 16; n <= 24; n += 8) {
29740 for (size_t k = 1; k <= 5; k += 2) {
29741 GemmMicrokernelTester()
29742 .mr(1)
29743 .nr(8)
29744 .kr(1)
29745 .sr(1)
29746 .m(1)
29747 .n(n)
29748 .k(k)
29749 .cn_stride(11)
29750 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29751 }
29752 }
29753 }
29754
29755 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8_subtile) {
29756 TEST_REQUIRES_X86_AVX;
29757 for (uint32_t n = 16; n <= 24; n += 8) {
29758 for (size_t k = 1; k <= 5; k += 2) {
29759 for (uint32_t m = 1; m <= 1; m++) {
29760 GemmMicrokernelTester()
29761 .mr(1)
29762 .nr(8)
29763 .kr(1)
29764 .sr(1)
29765 .m(m)
29766 .n(n)
29767 .k(k)
29768 .iterations(1)
29769 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29770 }
29771 }
29772 }
29773 }
29774
29775 TEST(F32_IGEMM_1X8__AVX_BROADCAST, small_kernel) {
29776 TEST_REQUIRES_X86_AVX;
29777 for (size_t k = 1; k <= 5; k += 2) {
29778 GemmMicrokernelTester()
29779 .mr(1)
29780 .nr(8)
29781 .kr(1)
29782 .sr(1)
29783 .m(1)
29784 .n(8)
29785 .k(k)
29786 .ks(3)
29787 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29788 }
29789 }
29790
29791 TEST(F32_IGEMM_1X8__AVX_BROADCAST, small_kernel_subtile) {
29792 TEST_REQUIRES_X86_AVX;
29793 for (size_t k = 1; k <= 5; k += 2) {
29794 for (uint32_t m = 1; m <= 1; m++) {
29795 for (uint32_t n = 1; n <= 8; n++) {
29796 GemmMicrokernelTester()
29797 .mr(1)
29798 .nr(8)
29799 .kr(1)
29800 .sr(1)
29801 .m(m)
29802 .n(n)
29803 .k(k)
29804 .ks(3)
29805 .iterations(1)
29806 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29807 }
29808 }
29809 }
29810 }
29811
29812 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_gt_8_small_kernel) {
29813 TEST_REQUIRES_X86_AVX;
29814 for (uint32_t n = 9; n < 16; n++) {
29815 for (size_t k = 1; k <= 5; k += 2) {
29816 GemmMicrokernelTester()
29817 .mr(1)
29818 .nr(8)
29819 .kr(1)
29820 .sr(1)
29821 .m(1)
29822 .n(8)
29823 .k(k)
29824 .ks(3)
29825 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29826 }
29827 }
29828 }
29829
29830 TEST(F32_IGEMM_1X8__AVX_BROADCAST, n_div_8_small_kernel) {
29831 TEST_REQUIRES_X86_AVX;
29832 for (uint32_t n = 16; n <= 24; n += 8) {
29833 for (size_t k = 1; k <= 5; k += 2) {
29834 GemmMicrokernelTester()
29835 .mr(1)
29836 .nr(8)
29837 .kr(1)
29838 .sr(1)
29839 .m(1)
29840 .n(8)
29841 .k(k)
29842 .ks(3)
29843 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29844 }
29845 }
29846 }
29847
29848 TEST(F32_IGEMM_1X8__AVX_BROADCAST, strided_cm_subtile) {
29849 TEST_REQUIRES_X86_AVX;
29850 for (size_t k = 1; k <= 5; k += 2) {
29851 for (uint32_t m = 1; m <= 1; m++) {
29852 for (uint32_t n = 1; n <= 8; n++) {
29853 GemmMicrokernelTester()
29854 .mr(1)
29855 .nr(8)
29856 .kr(1)
29857 .sr(1)
29858 .m(m)
29859 .n(n)
29860 .k(k)
29861 .cm_stride(11)
29862 .iterations(1)
29863 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29864 }
29865 }
29866 }
29867 }
29868
29869 TEST(F32_IGEMM_1X8__AVX_BROADCAST, a_offset) {
29870 TEST_REQUIRES_X86_AVX;
29871 for (size_t k = 1; k <= 5; k += 2) {
29872 GemmMicrokernelTester()
29873 .mr(1)
29874 .nr(8)
29875 .kr(1)
29876 .sr(1)
29877 .m(1)
29878 .n(8)
29879 .k(k)
29880 .ks(3)
29881 .a_offset(7)
29882 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29883 }
29884 }
29885
29886 TEST(F32_IGEMM_1X8__AVX_BROADCAST, zero) {
29887 TEST_REQUIRES_X86_AVX;
29888 for (uint32_t mz = 0; mz < 1; mz++) {
29889 for (size_t k = 1; k <= 5; k += 2) {
29890 GemmMicrokernelTester()
29891 .mr(1)
29892 .nr(8)
29893 .kr(1)
29894 .sr(1)
29895 .m(1)
29896 .n(8)
29897 .k(k)
29898 .ks(3)
29899 .a_offset(7)
29900 .zero_index(mz)
29901 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29902 }
29903 }
29904 }
29905
29906 TEST(F32_IGEMM_1X8__AVX_BROADCAST, qmin) {
29907 TEST_REQUIRES_X86_AVX;
29908 GemmMicrokernelTester()
29909 .mr(1)
29910 .nr(8)
29911 .kr(1)
29912 .sr(1)
29913 .m(1)
29914 .n(8)
29915 .k(1)
29916 .qmin(128)
29917 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29918 }
29919
29920 TEST(F32_IGEMM_1X8__AVX_BROADCAST, qmax) {
29921 TEST_REQUIRES_X86_AVX;
29922 GemmMicrokernelTester()
29923 .mr(1)
29924 .nr(8)
29925 .kr(1)
29926 .sr(1)
29927 .m(1)
29928 .n(8)
29929 .k(1)
29930 .qmax(128)
29931 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29932 }
29933
29934 TEST(F32_IGEMM_1X8__AVX_BROADCAST, strided_cm) {
29935 TEST_REQUIRES_X86_AVX;
29936 GemmMicrokernelTester()
29937 .mr(1)
29938 .nr(8)
29939 .kr(1)
29940 .sr(1)
29941 .m(1)
29942 .n(8)
29943 .k(1)
29944 .cm_stride(11)
29945 .Test(xnn_f32_igemm_ukernel_1x8__avx_broadcast);
29946 }
29947#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29948
29949
29950#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29951 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1) {
29952 TEST_REQUIRES_X86_AVX;
29953 GemmMicrokernelTester()
29954 .mr(4)
29955 .nr(8)
29956 .kr(1)
29957 .sr(1)
29958 .m(4)
29959 .n(8)
29960 .k(1)
29961 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
29962 }
29963
29964 TEST(F32_IGEMM_4X8__AVX_BROADCAST, strided_cn) {
29965 TEST_REQUIRES_X86_AVX;
29966 GemmMicrokernelTester()
29967 .mr(4)
29968 .nr(8)
29969 .kr(1)
29970 .sr(1)
29971 .m(4)
29972 .n(8)
29973 .k(1)
29974 .cn_stride(11)
29975 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
29976 }
29977
29978 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1_subtile) {
29979 TEST_REQUIRES_X86_AVX;
29980 for (uint32_t m = 1; m <= 4; m++) {
29981 for (uint32_t n = 1; n <= 8; n++) {
29982 GemmMicrokernelTester()
29983 .mr(4)
29984 .nr(8)
29985 .kr(1)
29986 .sr(1)
29987 .m(m)
29988 .n(n)
29989 .k(1)
29990 .iterations(1)
29991 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
29992 }
29993 }
29994 }
29995
29996 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_m) {
29997 TEST_REQUIRES_X86_AVX;
29998 for (uint32_t m = 1; m <= 4; m++) {
29999 GemmMicrokernelTester()
30000 .mr(4)
30001 .nr(8)
30002 .kr(1)
30003 .sr(1)
30004 .m(m)
30005 .n(8)
30006 .k(1)
30007 .iterations(1)
30008 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30009 }
30010 }
30011
30012 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_n) {
30013 TEST_REQUIRES_X86_AVX;
30014 for (uint32_t n = 1; n <= 8; n++) {
30015 GemmMicrokernelTester()
30016 .mr(4)
30017 .nr(8)
30018 .kr(1)
30019 .sr(1)
30020 .m(4)
30021 .n(n)
30022 .k(1)
30023 .iterations(1)
30024 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30025 }
30026 }
30027
30028 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_gt_1) {
30029 TEST_REQUIRES_X86_AVX;
30030 for (size_t k = 2; k < 10; k++) {
30031 GemmMicrokernelTester()
30032 .mr(4)
30033 .nr(8)
30034 .kr(1)
30035 .sr(1)
30036 .m(4)
30037 .n(8)
30038 .k(k)
30039 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30040 }
30041 }
30042
30043 TEST(F32_IGEMM_4X8__AVX_BROADCAST, k_gt_1_subtile) {
30044 TEST_REQUIRES_X86_AVX;
30045 for (size_t k = 2; k < 10; k++) {
30046 for (uint32_t m = 1; m <= 4; m++) {
30047 for (uint32_t n = 1; n <= 8; n++) {
30048 GemmMicrokernelTester()
30049 .mr(4)
30050 .nr(8)
30051 .kr(1)
30052 .sr(1)
30053 .m(m)
30054 .n(n)
30055 .k(k)
30056 .iterations(1)
30057 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30058 }
30059 }
30060 }
30061 }
30062
30063 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8) {
30064 TEST_REQUIRES_X86_AVX;
30065 for (uint32_t n = 9; n < 16; n++) {
30066 for (size_t k = 1; k <= 5; k += 2) {
30067 GemmMicrokernelTester()
30068 .mr(4)
30069 .nr(8)
30070 .kr(1)
30071 .sr(1)
30072 .m(4)
30073 .n(8)
30074 .k(k)
30075 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30076 }
30077 }
30078 }
30079
30080 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8_strided_cn) {
30081 TEST_REQUIRES_X86_AVX;
30082 for (uint32_t n = 9; n < 16; n++) {
30083 for (size_t k = 1; k <= 5; k += 2) {
30084 GemmMicrokernelTester()
30085 .mr(4)
30086 .nr(8)
30087 .kr(1)
30088 .sr(1)
30089 .m(4)
30090 .n(8)
30091 .k(k)
30092 .cn_stride(11)
30093 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30094 }
30095 }
30096 }
30097
30098 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8_subtile) {
30099 TEST_REQUIRES_X86_AVX;
30100 for (uint32_t n = 9; n < 16; n++) {
30101 for (size_t k = 1; k <= 5; k += 2) {
30102 for (uint32_t m = 1; m <= 4; m++) {
30103 GemmMicrokernelTester()
30104 .mr(4)
30105 .nr(8)
30106 .kr(1)
30107 .sr(1)
30108 .m(m)
30109 .n(n)
30110 .k(k)
30111 .iterations(1)
30112 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30113 }
30114 }
30115 }
30116 }
30117
30118 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8) {
30119 TEST_REQUIRES_X86_AVX;
30120 for (uint32_t n = 16; n <= 24; n += 8) {
30121 for (size_t k = 1; k <= 5; k += 2) {
30122 GemmMicrokernelTester()
30123 .mr(4)
30124 .nr(8)
30125 .kr(1)
30126 .sr(1)
30127 .m(4)
30128 .n(8)
30129 .k(k)
30130 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30131 }
30132 }
30133 }
30134
30135 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8_strided_cn) {
30136 TEST_REQUIRES_X86_AVX;
30137 for (uint32_t n = 16; n <= 24; n += 8) {
30138 for (size_t k = 1; k <= 5; k += 2) {
30139 GemmMicrokernelTester()
30140 .mr(4)
30141 .nr(8)
30142 .kr(1)
30143 .sr(1)
30144 .m(4)
30145 .n(n)
30146 .k(k)
30147 .cn_stride(11)
30148 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30149 }
30150 }
30151 }
30152
30153 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8_subtile) {
30154 TEST_REQUIRES_X86_AVX;
30155 for (uint32_t n = 16; n <= 24; n += 8) {
30156 for (size_t k = 1; k <= 5; k += 2) {
30157 for (uint32_t m = 1; m <= 4; m++) {
30158 GemmMicrokernelTester()
30159 .mr(4)
30160 .nr(8)
30161 .kr(1)
30162 .sr(1)
30163 .m(m)
30164 .n(n)
30165 .k(k)
30166 .iterations(1)
30167 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30168 }
30169 }
30170 }
30171 }
30172
30173 TEST(F32_IGEMM_4X8__AVX_BROADCAST, small_kernel) {
30174 TEST_REQUIRES_X86_AVX;
30175 for (size_t k = 1; k <= 5; k += 2) {
30176 GemmMicrokernelTester()
30177 .mr(4)
30178 .nr(8)
30179 .kr(1)
30180 .sr(1)
30181 .m(4)
30182 .n(8)
30183 .k(k)
30184 .ks(3)
30185 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30186 }
30187 }
30188
30189 TEST(F32_IGEMM_4X8__AVX_BROADCAST, small_kernel_subtile) {
30190 TEST_REQUIRES_X86_AVX;
30191 for (size_t k = 1; k <= 5; k += 2) {
30192 for (uint32_t m = 1; m <= 4; m++) {
30193 for (uint32_t n = 1; n <= 8; n++) {
30194 GemmMicrokernelTester()
30195 .mr(4)
30196 .nr(8)
30197 .kr(1)
30198 .sr(1)
30199 .m(m)
30200 .n(n)
30201 .k(k)
30202 .ks(3)
30203 .iterations(1)
30204 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30205 }
30206 }
30207 }
30208 }
30209
30210 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_gt_8_small_kernel) {
30211 TEST_REQUIRES_X86_AVX;
30212 for (uint32_t n = 9; n < 16; n++) {
30213 for (size_t k = 1; k <= 5; k += 2) {
30214 GemmMicrokernelTester()
30215 .mr(4)
30216 .nr(8)
30217 .kr(1)
30218 .sr(1)
30219 .m(4)
30220 .n(8)
30221 .k(k)
30222 .ks(3)
30223 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30224 }
30225 }
30226 }
30227
30228 TEST(F32_IGEMM_4X8__AVX_BROADCAST, n_div_8_small_kernel) {
30229 TEST_REQUIRES_X86_AVX;
30230 for (uint32_t n = 16; n <= 24; n += 8) {
30231 for (size_t k = 1; k <= 5; k += 2) {
30232 GemmMicrokernelTester()
30233 .mr(4)
30234 .nr(8)
30235 .kr(1)
30236 .sr(1)
30237 .m(4)
30238 .n(8)
30239 .k(k)
30240 .ks(3)
30241 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30242 }
30243 }
30244 }
30245
30246 TEST(F32_IGEMM_4X8__AVX_BROADCAST, strided_cm_subtile) {
30247 TEST_REQUIRES_X86_AVX;
30248 for (size_t k = 1; k <= 5; k += 2) {
30249 for (uint32_t m = 1; m <= 4; m++) {
30250 for (uint32_t n = 1; n <= 8; n++) {
30251 GemmMicrokernelTester()
30252 .mr(4)
30253 .nr(8)
30254 .kr(1)
30255 .sr(1)
30256 .m(m)
30257 .n(n)
30258 .k(k)
30259 .cm_stride(11)
30260 .iterations(1)
30261 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30262 }
30263 }
30264 }
30265 }
30266
30267 TEST(F32_IGEMM_4X8__AVX_BROADCAST, a_offset) {
30268 TEST_REQUIRES_X86_AVX;
30269 for (size_t k = 1; k <= 5; k += 2) {
30270 GemmMicrokernelTester()
30271 .mr(4)
30272 .nr(8)
30273 .kr(1)
30274 .sr(1)
30275 .m(4)
30276 .n(8)
30277 .k(k)
30278 .ks(3)
30279 .a_offset(23)
30280 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30281 }
30282 }
30283
30284 TEST(F32_IGEMM_4X8__AVX_BROADCAST, zero) {
30285 TEST_REQUIRES_X86_AVX;
30286 for (uint32_t mz = 0; mz < 4; mz++) {
30287 for (size_t k = 1; k <= 5; k += 2) {
30288 GemmMicrokernelTester()
30289 .mr(4)
30290 .nr(8)
30291 .kr(1)
30292 .sr(1)
30293 .m(4)
30294 .n(8)
30295 .k(k)
30296 .ks(3)
30297 .a_offset(23)
30298 .zero_index(mz)
30299 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30300 }
30301 }
30302 }
30303
30304 TEST(F32_IGEMM_4X8__AVX_BROADCAST, qmin) {
30305 TEST_REQUIRES_X86_AVX;
30306 GemmMicrokernelTester()
30307 .mr(4)
30308 .nr(8)
30309 .kr(1)
30310 .sr(1)
30311 .m(4)
30312 .n(8)
30313 .k(1)
30314 .qmin(128)
30315 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30316 }
30317
30318 TEST(F32_IGEMM_4X8__AVX_BROADCAST, qmax) {
30319 TEST_REQUIRES_X86_AVX;
30320 GemmMicrokernelTester()
30321 .mr(4)
30322 .nr(8)
30323 .kr(1)
30324 .sr(1)
30325 .m(4)
30326 .n(8)
30327 .k(1)
30328 .qmax(128)
30329 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30330 }
30331
30332 TEST(F32_IGEMM_4X8__AVX_BROADCAST, strided_cm) {
30333 TEST_REQUIRES_X86_AVX;
30334 GemmMicrokernelTester()
30335 .mr(4)
30336 .nr(8)
30337 .kr(1)
30338 .sr(1)
30339 .m(4)
30340 .n(8)
30341 .k(1)
30342 .cm_stride(11)
30343 .Test(xnn_f32_igemm_ukernel_4x8__avx_broadcast);
30344 }
30345#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30346
30347
30348#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30349 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1) {
30350 TEST_REQUIRES_X86_AVX;
30351 GemmMicrokernelTester()
30352 .mr(5)
30353 .nr(8)
30354 .kr(1)
30355 .sr(1)
30356 .m(5)
30357 .n(8)
30358 .k(1)
30359 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30360 }
30361
30362 TEST(F32_IGEMM_5X8__AVX_BROADCAST, strided_cn) {
30363 TEST_REQUIRES_X86_AVX;
30364 GemmMicrokernelTester()
30365 .mr(5)
30366 .nr(8)
30367 .kr(1)
30368 .sr(1)
30369 .m(5)
30370 .n(8)
30371 .k(1)
30372 .cn_stride(11)
30373 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30374 }
30375
30376 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1_subtile) {
30377 TEST_REQUIRES_X86_AVX;
30378 for (uint32_t m = 1; m <= 5; m++) {
30379 for (uint32_t n = 1; n <= 8; n++) {
30380 GemmMicrokernelTester()
30381 .mr(5)
30382 .nr(8)
30383 .kr(1)
30384 .sr(1)
30385 .m(m)
30386 .n(n)
30387 .k(1)
30388 .iterations(1)
30389 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30390 }
30391 }
30392 }
30393
30394 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
30395 TEST_REQUIRES_X86_AVX;
30396 for (uint32_t m = 1; m <= 5; m++) {
30397 GemmMicrokernelTester()
30398 .mr(5)
30399 .nr(8)
30400 .kr(1)
30401 .sr(1)
30402 .m(m)
30403 .n(8)
30404 .k(1)
30405 .iterations(1)
30406 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30407 }
30408 }
30409
30410 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
30411 TEST_REQUIRES_X86_AVX;
30412 for (uint32_t n = 1; n <= 8; n++) {
30413 GemmMicrokernelTester()
30414 .mr(5)
30415 .nr(8)
30416 .kr(1)
30417 .sr(1)
30418 .m(5)
30419 .n(n)
30420 .k(1)
30421 .iterations(1)
30422 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30423 }
30424 }
30425
30426 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_gt_1) {
30427 TEST_REQUIRES_X86_AVX;
30428 for (size_t k = 2; k < 10; k++) {
30429 GemmMicrokernelTester()
30430 .mr(5)
30431 .nr(8)
30432 .kr(1)
30433 .sr(1)
30434 .m(5)
30435 .n(8)
30436 .k(k)
30437 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30438 }
30439 }
30440
30441 TEST(F32_IGEMM_5X8__AVX_BROADCAST, k_gt_1_subtile) {
30442 TEST_REQUIRES_X86_AVX;
30443 for (size_t k = 2; k < 10; k++) {
30444 for (uint32_t m = 1; m <= 5; m++) {
30445 for (uint32_t n = 1; n <= 8; n++) {
30446 GemmMicrokernelTester()
30447 .mr(5)
30448 .nr(8)
30449 .kr(1)
30450 .sr(1)
30451 .m(m)
30452 .n(n)
30453 .k(k)
30454 .iterations(1)
30455 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30456 }
30457 }
30458 }
30459 }
30460
30461 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8) {
30462 TEST_REQUIRES_X86_AVX;
30463 for (uint32_t n = 9; n < 16; n++) {
30464 for (size_t k = 1; k <= 5; k += 2) {
30465 GemmMicrokernelTester()
30466 .mr(5)
30467 .nr(8)
30468 .kr(1)
30469 .sr(1)
30470 .m(5)
30471 .n(8)
30472 .k(k)
30473 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30474 }
30475 }
30476 }
30477
30478 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
30479 TEST_REQUIRES_X86_AVX;
30480 for (uint32_t n = 9; n < 16; n++) {
30481 for (size_t k = 1; k <= 5; k += 2) {
30482 GemmMicrokernelTester()
30483 .mr(5)
30484 .nr(8)
30485 .kr(1)
30486 .sr(1)
30487 .m(5)
30488 .n(8)
30489 .k(k)
30490 .cn_stride(11)
30491 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30492 }
30493 }
30494 }
30495
30496 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8_subtile) {
30497 TEST_REQUIRES_X86_AVX;
30498 for (uint32_t n = 9; n < 16; n++) {
30499 for (size_t k = 1; k <= 5; k += 2) {
30500 for (uint32_t m = 1; m <= 5; m++) {
30501 GemmMicrokernelTester()
30502 .mr(5)
30503 .nr(8)
30504 .kr(1)
30505 .sr(1)
30506 .m(m)
30507 .n(n)
30508 .k(k)
30509 .iterations(1)
30510 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30511 }
30512 }
30513 }
30514 }
30515
30516 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8) {
30517 TEST_REQUIRES_X86_AVX;
30518 for (uint32_t n = 16; n <= 24; n += 8) {
30519 for (size_t k = 1; k <= 5; k += 2) {
30520 GemmMicrokernelTester()
30521 .mr(5)
30522 .nr(8)
30523 .kr(1)
30524 .sr(1)
30525 .m(5)
30526 .n(8)
30527 .k(k)
30528 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30529 }
30530 }
30531 }
30532
30533 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
30534 TEST_REQUIRES_X86_AVX;
30535 for (uint32_t n = 16; n <= 24; n += 8) {
30536 for (size_t k = 1; k <= 5; k += 2) {
30537 GemmMicrokernelTester()
30538 .mr(5)
30539 .nr(8)
30540 .kr(1)
30541 .sr(1)
30542 .m(5)
30543 .n(n)
30544 .k(k)
30545 .cn_stride(11)
30546 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30547 }
30548 }
30549 }
30550
30551 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8_subtile) {
30552 TEST_REQUIRES_X86_AVX;
30553 for (uint32_t n = 16; n <= 24; n += 8) {
30554 for (size_t k = 1; k <= 5; k += 2) {
30555 for (uint32_t m = 1; m <= 5; m++) {
30556 GemmMicrokernelTester()
30557 .mr(5)
30558 .nr(8)
30559 .kr(1)
30560 .sr(1)
30561 .m(m)
30562 .n(n)
30563 .k(k)
30564 .iterations(1)
30565 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30566 }
30567 }
30568 }
30569 }
30570
30571 TEST(F32_IGEMM_5X8__AVX_BROADCAST, small_kernel) {
30572 TEST_REQUIRES_X86_AVX;
30573 for (size_t k = 1; k <= 5; k += 2) {
30574 GemmMicrokernelTester()
30575 .mr(5)
30576 .nr(8)
30577 .kr(1)
30578 .sr(1)
30579 .m(5)
30580 .n(8)
30581 .k(k)
30582 .ks(3)
30583 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30584 }
30585 }
30586
30587 TEST(F32_IGEMM_5X8__AVX_BROADCAST, small_kernel_subtile) {
30588 TEST_REQUIRES_X86_AVX;
30589 for (size_t k = 1; k <= 5; k += 2) {
30590 for (uint32_t m = 1; m <= 5; m++) {
30591 for (uint32_t n = 1; n <= 8; n++) {
30592 GemmMicrokernelTester()
30593 .mr(5)
30594 .nr(8)
30595 .kr(1)
30596 .sr(1)
30597 .m(m)
30598 .n(n)
30599 .k(k)
30600 .ks(3)
30601 .iterations(1)
30602 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30603 }
30604 }
30605 }
30606 }
30607
30608 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_gt_8_small_kernel) {
30609 TEST_REQUIRES_X86_AVX;
30610 for (uint32_t n = 9; n < 16; n++) {
30611 for (size_t k = 1; k <= 5; k += 2) {
30612 GemmMicrokernelTester()
30613 .mr(5)
30614 .nr(8)
30615 .kr(1)
30616 .sr(1)
30617 .m(5)
30618 .n(8)
30619 .k(k)
30620 .ks(3)
30621 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30622 }
30623 }
30624 }
30625
30626 TEST(F32_IGEMM_5X8__AVX_BROADCAST, n_div_8_small_kernel) {
30627 TEST_REQUIRES_X86_AVX;
30628 for (uint32_t n = 16; n <= 24; n += 8) {
30629 for (size_t k = 1; k <= 5; k += 2) {
30630 GemmMicrokernelTester()
30631 .mr(5)
30632 .nr(8)
30633 .kr(1)
30634 .sr(1)
30635 .m(5)
30636 .n(8)
30637 .k(k)
30638 .ks(3)
30639 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30640 }
30641 }
30642 }
30643
30644 TEST(F32_IGEMM_5X8__AVX_BROADCAST, strided_cm_subtile) {
30645 TEST_REQUIRES_X86_AVX;
30646 for (size_t k = 1; k <= 5; k += 2) {
30647 for (uint32_t m = 1; m <= 5; m++) {
30648 for (uint32_t n = 1; n <= 8; n++) {
30649 GemmMicrokernelTester()
30650 .mr(5)
30651 .nr(8)
30652 .kr(1)
30653 .sr(1)
30654 .m(m)
30655 .n(n)
30656 .k(k)
30657 .cm_stride(11)
30658 .iterations(1)
30659 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30660 }
30661 }
30662 }
30663 }
30664
30665 TEST(F32_IGEMM_5X8__AVX_BROADCAST, a_offset) {
30666 TEST_REQUIRES_X86_AVX;
30667 for (size_t k = 1; k <= 5; k += 2) {
30668 GemmMicrokernelTester()
30669 .mr(5)
30670 .nr(8)
30671 .kr(1)
30672 .sr(1)
30673 .m(5)
30674 .n(8)
30675 .k(k)
30676 .ks(3)
30677 .a_offset(29)
30678 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30679 }
30680 }
30681
30682 TEST(F32_IGEMM_5X8__AVX_BROADCAST, zero) {
30683 TEST_REQUIRES_X86_AVX;
30684 for (uint32_t mz = 0; mz < 5; mz++) {
30685 for (size_t k = 1; k <= 5; k += 2) {
30686 GemmMicrokernelTester()
30687 .mr(5)
30688 .nr(8)
30689 .kr(1)
30690 .sr(1)
30691 .m(5)
30692 .n(8)
30693 .k(k)
30694 .ks(3)
30695 .a_offset(29)
30696 .zero_index(mz)
30697 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30698 }
30699 }
30700 }
30701
30702 TEST(F32_IGEMM_5X8__AVX_BROADCAST, qmin) {
30703 TEST_REQUIRES_X86_AVX;
30704 GemmMicrokernelTester()
30705 .mr(5)
30706 .nr(8)
30707 .kr(1)
30708 .sr(1)
30709 .m(5)
30710 .n(8)
30711 .k(1)
30712 .qmin(128)
30713 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30714 }
30715
30716 TEST(F32_IGEMM_5X8__AVX_BROADCAST, qmax) {
30717 TEST_REQUIRES_X86_AVX;
30718 GemmMicrokernelTester()
30719 .mr(5)
30720 .nr(8)
30721 .kr(1)
30722 .sr(1)
30723 .m(5)
30724 .n(8)
30725 .k(1)
30726 .qmax(128)
30727 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30728 }
30729
30730 TEST(F32_IGEMM_5X8__AVX_BROADCAST, strided_cm) {
30731 TEST_REQUIRES_X86_AVX;
30732 GemmMicrokernelTester()
30733 .mr(5)
30734 .nr(8)
30735 .kr(1)
30736 .sr(1)
30737 .m(5)
30738 .n(8)
30739 .k(1)
30740 .cm_stride(11)
30741 .Test(xnn_f32_igemm_ukernel_5x8__avx_broadcast);
30742 }
30743#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30744
30745
30746#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30747 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1) {
30748 TEST_REQUIRES_X86_AVX;
30749 GemmMicrokernelTester()
30750 .mr(6)
30751 .nr(8)
30752 .kr(1)
30753 .sr(1)
30754 .m(6)
30755 .n(8)
30756 .k(1)
30757 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30758 }
30759
30760 TEST(F32_IGEMM_6X8__AVX_BROADCAST, strided_cn) {
30761 TEST_REQUIRES_X86_AVX;
30762 GemmMicrokernelTester()
30763 .mr(6)
30764 .nr(8)
30765 .kr(1)
30766 .sr(1)
30767 .m(6)
30768 .n(8)
30769 .k(1)
30770 .cn_stride(11)
30771 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30772 }
30773
30774 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1_subtile) {
30775 TEST_REQUIRES_X86_AVX;
30776 for (uint32_t m = 1; m <= 6; m++) {
30777 for (uint32_t n = 1; n <= 8; n++) {
30778 GemmMicrokernelTester()
30779 .mr(6)
30780 .nr(8)
30781 .kr(1)
30782 .sr(1)
30783 .m(m)
30784 .n(n)
30785 .k(1)
30786 .iterations(1)
30787 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30788 }
30789 }
30790 }
30791
30792 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
30793 TEST_REQUIRES_X86_AVX;
30794 for (uint32_t m = 1; m <= 6; m++) {
30795 GemmMicrokernelTester()
30796 .mr(6)
30797 .nr(8)
30798 .kr(1)
30799 .sr(1)
30800 .m(m)
30801 .n(8)
30802 .k(1)
30803 .iterations(1)
30804 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30805 }
30806 }
30807
30808 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
30809 TEST_REQUIRES_X86_AVX;
30810 for (uint32_t n = 1; n <= 8; n++) {
30811 GemmMicrokernelTester()
30812 .mr(6)
30813 .nr(8)
30814 .kr(1)
30815 .sr(1)
30816 .m(6)
30817 .n(n)
30818 .k(1)
30819 .iterations(1)
30820 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30821 }
30822 }
30823
30824 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_gt_1) {
30825 TEST_REQUIRES_X86_AVX;
30826 for (size_t k = 2; k < 10; k++) {
30827 GemmMicrokernelTester()
30828 .mr(6)
30829 .nr(8)
30830 .kr(1)
30831 .sr(1)
30832 .m(6)
30833 .n(8)
30834 .k(k)
30835 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30836 }
30837 }
30838
30839 TEST(F32_IGEMM_6X8__AVX_BROADCAST, k_gt_1_subtile) {
30840 TEST_REQUIRES_X86_AVX;
30841 for (size_t k = 2; k < 10; k++) {
30842 for (uint32_t m = 1; m <= 6; m++) {
30843 for (uint32_t n = 1; n <= 8; n++) {
30844 GemmMicrokernelTester()
30845 .mr(6)
30846 .nr(8)
30847 .kr(1)
30848 .sr(1)
30849 .m(m)
30850 .n(n)
30851 .k(k)
30852 .iterations(1)
30853 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30854 }
30855 }
30856 }
30857 }
30858
30859 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8) {
30860 TEST_REQUIRES_X86_AVX;
30861 for (uint32_t n = 9; n < 16; n++) {
30862 for (size_t k = 1; k <= 5; k += 2) {
30863 GemmMicrokernelTester()
30864 .mr(6)
30865 .nr(8)
30866 .kr(1)
30867 .sr(1)
30868 .m(6)
30869 .n(8)
30870 .k(k)
30871 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30872 }
30873 }
30874 }
30875
30876 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
30877 TEST_REQUIRES_X86_AVX;
30878 for (uint32_t n = 9; n < 16; n++) {
30879 for (size_t k = 1; k <= 5; k += 2) {
30880 GemmMicrokernelTester()
30881 .mr(6)
30882 .nr(8)
30883 .kr(1)
30884 .sr(1)
30885 .m(6)
30886 .n(8)
30887 .k(k)
30888 .cn_stride(11)
30889 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30890 }
30891 }
30892 }
30893
30894 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8_subtile) {
30895 TEST_REQUIRES_X86_AVX;
30896 for (uint32_t n = 9; n < 16; n++) {
30897 for (size_t k = 1; k <= 5; k += 2) {
30898 for (uint32_t m = 1; m <= 6; m++) {
30899 GemmMicrokernelTester()
30900 .mr(6)
30901 .nr(8)
30902 .kr(1)
30903 .sr(1)
30904 .m(m)
30905 .n(n)
30906 .k(k)
30907 .iterations(1)
30908 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30909 }
30910 }
30911 }
30912 }
30913
30914 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8) {
30915 TEST_REQUIRES_X86_AVX;
30916 for (uint32_t n = 16; n <= 24; n += 8) {
30917 for (size_t k = 1; k <= 5; k += 2) {
30918 GemmMicrokernelTester()
30919 .mr(6)
30920 .nr(8)
30921 .kr(1)
30922 .sr(1)
30923 .m(6)
30924 .n(8)
30925 .k(k)
30926 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30927 }
30928 }
30929 }
30930
30931 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
30932 TEST_REQUIRES_X86_AVX;
30933 for (uint32_t n = 16; n <= 24; n += 8) {
30934 for (size_t k = 1; k <= 5; k += 2) {
30935 GemmMicrokernelTester()
30936 .mr(6)
30937 .nr(8)
30938 .kr(1)
30939 .sr(1)
30940 .m(6)
30941 .n(n)
30942 .k(k)
30943 .cn_stride(11)
30944 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30945 }
30946 }
30947 }
30948
30949 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8_subtile) {
30950 TEST_REQUIRES_X86_AVX;
30951 for (uint32_t n = 16; n <= 24; n += 8) {
30952 for (size_t k = 1; k <= 5; k += 2) {
30953 for (uint32_t m = 1; m <= 6; m++) {
30954 GemmMicrokernelTester()
30955 .mr(6)
30956 .nr(8)
30957 .kr(1)
30958 .sr(1)
30959 .m(m)
30960 .n(n)
30961 .k(k)
30962 .iterations(1)
30963 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30964 }
30965 }
30966 }
30967 }
30968
30969 TEST(F32_IGEMM_6X8__AVX_BROADCAST, small_kernel) {
30970 TEST_REQUIRES_X86_AVX;
30971 for (size_t k = 1; k <= 5; k += 2) {
30972 GemmMicrokernelTester()
30973 .mr(6)
30974 .nr(8)
30975 .kr(1)
30976 .sr(1)
30977 .m(6)
30978 .n(8)
30979 .k(k)
30980 .ks(3)
30981 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
30982 }
30983 }
30984
30985 TEST(F32_IGEMM_6X8__AVX_BROADCAST, small_kernel_subtile) {
30986 TEST_REQUIRES_X86_AVX;
30987 for (size_t k = 1; k <= 5; k += 2) {
30988 for (uint32_t m = 1; m <= 6; m++) {
30989 for (uint32_t n = 1; n <= 8; n++) {
30990 GemmMicrokernelTester()
30991 .mr(6)
30992 .nr(8)
30993 .kr(1)
30994 .sr(1)
30995 .m(m)
30996 .n(n)
30997 .k(k)
30998 .ks(3)
30999 .iterations(1)
31000 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31001 }
31002 }
31003 }
31004 }
31005
31006 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_gt_8_small_kernel) {
31007 TEST_REQUIRES_X86_AVX;
31008 for (uint32_t n = 9; n < 16; n++) {
31009 for (size_t k = 1; k <= 5; k += 2) {
31010 GemmMicrokernelTester()
31011 .mr(6)
31012 .nr(8)
31013 .kr(1)
31014 .sr(1)
31015 .m(6)
31016 .n(8)
31017 .k(k)
31018 .ks(3)
31019 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31020 }
31021 }
31022 }
31023
31024 TEST(F32_IGEMM_6X8__AVX_BROADCAST, n_div_8_small_kernel) {
31025 TEST_REQUIRES_X86_AVX;
31026 for (uint32_t n = 16; n <= 24; n += 8) {
31027 for (size_t k = 1; k <= 5; k += 2) {
31028 GemmMicrokernelTester()
31029 .mr(6)
31030 .nr(8)
31031 .kr(1)
31032 .sr(1)
31033 .m(6)
31034 .n(8)
31035 .k(k)
31036 .ks(3)
31037 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31038 }
31039 }
31040 }
31041
31042 TEST(F32_IGEMM_6X8__AVX_BROADCAST, strided_cm_subtile) {
31043 TEST_REQUIRES_X86_AVX;
31044 for (size_t k = 1; k <= 5; k += 2) {
31045 for (uint32_t m = 1; m <= 6; m++) {
31046 for (uint32_t n = 1; n <= 8; n++) {
31047 GemmMicrokernelTester()
31048 .mr(6)
31049 .nr(8)
31050 .kr(1)
31051 .sr(1)
31052 .m(m)
31053 .n(n)
31054 .k(k)
31055 .cm_stride(11)
31056 .iterations(1)
31057 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31058 }
31059 }
31060 }
31061 }
31062
31063 TEST(F32_IGEMM_6X8__AVX_BROADCAST, a_offset) {
31064 TEST_REQUIRES_X86_AVX;
31065 for (size_t k = 1; k <= 5; k += 2) {
31066 GemmMicrokernelTester()
31067 .mr(6)
31068 .nr(8)
31069 .kr(1)
31070 .sr(1)
31071 .m(6)
31072 .n(8)
31073 .k(k)
31074 .ks(3)
31075 .a_offset(37)
31076 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31077 }
31078 }
31079
31080 TEST(F32_IGEMM_6X8__AVX_BROADCAST, zero) {
31081 TEST_REQUIRES_X86_AVX;
31082 for (uint32_t mz = 0; mz < 6; mz++) {
31083 for (size_t k = 1; k <= 5; k += 2) {
31084 GemmMicrokernelTester()
31085 .mr(6)
31086 .nr(8)
31087 .kr(1)
31088 .sr(1)
31089 .m(6)
31090 .n(8)
31091 .k(k)
31092 .ks(3)
31093 .a_offset(37)
31094 .zero_index(mz)
31095 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31096 }
31097 }
31098 }
31099
31100 TEST(F32_IGEMM_6X8__AVX_BROADCAST, qmin) {
31101 TEST_REQUIRES_X86_AVX;
31102 GemmMicrokernelTester()
31103 .mr(6)
31104 .nr(8)
31105 .kr(1)
31106 .sr(1)
31107 .m(6)
31108 .n(8)
31109 .k(1)
31110 .qmin(128)
31111 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31112 }
31113
31114 TEST(F32_IGEMM_6X8__AVX_BROADCAST, qmax) {
31115 TEST_REQUIRES_X86_AVX;
31116 GemmMicrokernelTester()
31117 .mr(6)
31118 .nr(8)
31119 .kr(1)
31120 .sr(1)
31121 .m(6)
31122 .n(8)
31123 .k(1)
31124 .qmax(128)
31125 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31126 }
31127
31128 TEST(F32_IGEMM_6X8__AVX_BROADCAST, strided_cm) {
31129 TEST_REQUIRES_X86_AVX;
31130 GemmMicrokernelTester()
31131 .mr(6)
31132 .nr(8)
31133 .kr(1)
31134 .sr(1)
31135 .m(6)
31136 .n(8)
31137 .k(1)
31138 .cm_stride(11)
31139 .Test(xnn_f32_igemm_ukernel_6x8__avx_broadcast);
31140 }
31141#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31142
31143
31144#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31145 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1) {
31146 TEST_REQUIRES_X86_AVX;
31147 GemmMicrokernelTester()
31148 .mr(7)
31149 .nr(8)
31150 .kr(1)
31151 .sr(1)
31152 .m(7)
31153 .n(8)
31154 .k(1)
31155 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31156 }
31157
31158 TEST(F32_IGEMM_7X8__AVX_BROADCAST, strided_cn) {
31159 TEST_REQUIRES_X86_AVX;
31160 GemmMicrokernelTester()
31161 .mr(7)
31162 .nr(8)
31163 .kr(1)
31164 .sr(1)
31165 .m(7)
31166 .n(8)
31167 .k(1)
31168 .cn_stride(11)
31169 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31170 }
31171
31172 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1_subtile) {
31173 TEST_REQUIRES_X86_AVX;
31174 for (uint32_t m = 1; m <= 7; m++) {
31175 for (uint32_t n = 1; n <= 8; n++) {
31176 GemmMicrokernelTester()
31177 .mr(7)
31178 .nr(8)
31179 .kr(1)
31180 .sr(1)
31181 .m(m)
31182 .n(n)
31183 .k(1)
31184 .iterations(1)
31185 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31186 }
31187 }
31188 }
31189
31190 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_m) {
31191 TEST_REQUIRES_X86_AVX;
31192 for (uint32_t m = 1; m <= 7; m++) {
31193 GemmMicrokernelTester()
31194 .mr(7)
31195 .nr(8)
31196 .kr(1)
31197 .sr(1)
31198 .m(m)
31199 .n(8)
31200 .k(1)
31201 .iterations(1)
31202 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31203 }
31204 }
31205
31206 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_n) {
31207 TEST_REQUIRES_X86_AVX;
31208 for (uint32_t n = 1; n <= 8; n++) {
31209 GemmMicrokernelTester()
31210 .mr(7)
31211 .nr(8)
31212 .kr(1)
31213 .sr(1)
31214 .m(7)
31215 .n(n)
31216 .k(1)
31217 .iterations(1)
31218 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31219 }
31220 }
31221
31222 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_gt_1) {
31223 TEST_REQUIRES_X86_AVX;
31224 for (size_t k = 2; k < 10; k++) {
31225 GemmMicrokernelTester()
31226 .mr(7)
31227 .nr(8)
31228 .kr(1)
31229 .sr(1)
31230 .m(7)
31231 .n(8)
31232 .k(k)
31233 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31234 }
31235 }
31236
31237 TEST(F32_IGEMM_7X8__AVX_BROADCAST, k_gt_1_subtile) {
31238 TEST_REQUIRES_X86_AVX;
31239 for (size_t k = 2; k < 10; k++) {
31240 for (uint32_t m = 1; m <= 7; m++) {
31241 for (uint32_t n = 1; n <= 8; n++) {
31242 GemmMicrokernelTester()
31243 .mr(7)
31244 .nr(8)
31245 .kr(1)
31246 .sr(1)
31247 .m(m)
31248 .n(n)
31249 .k(k)
31250 .iterations(1)
31251 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31252 }
31253 }
31254 }
31255 }
31256
31257 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8) {
31258 TEST_REQUIRES_X86_AVX;
31259 for (uint32_t n = 9; n < 16; n++) {
31260 for (size_t k = 1; k <= 5; k += 2) {
31261 GemmMicrokernelTester()
31262 .mr(7)
31263 .nr(8)
31264 .kr(1)
31265 .sr(1)
31266 .m(7)
31267 .n(8)
31268 .k(k)
31269 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31270 }
31271 }
31272 }
31273
31274 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8_strided_cn) {
31275 TEST_REQUIRES_X86_AVX;
31276 for (uint32_t n = 9; n < 16; n++) {
31277 for (size_t k = 1; k <= 5; k += 2) {
31278 GemmMicrokernelTester()
31279 .mr(7)
31280 .nr(8)
31281 .kr(1)
31282 .sr(1)
31283 .m(7)
31284 .n(8)
31285 .k(k)
31286 .cn_stride(11)
31287 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31288 }
31289 }
31290 }
31291
31292 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8_subtile) {
31293 TEST_REQUIRES_X86_AVX;
31294 for (uint32_t n = 9; n < 16; n++) {
31295 for (size_t k = 1; k <= 5; k += 2) {
31296 for (uint32_t m = 1; m <= 7; m++) {
31297 GemmMicrokernelTester()
31298 .mr(7)
31299 .nr(8)
31300 .kr(1)
31301 .sr(1)
31302 .m(m)
31303 .n(n)
31304 .k(k)
31305 .iterations(1)
31306 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31307 }
31308 }
31309 }
31310 }
31311
31312 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8) {
31313 TEST_REQUIRES_X86_AVX;
31314 for (uint32_t n = 16; n <= 24; n += 8) {
31315 for (size_t k = 1; k <= 5; k += 2) {
31316 GemmMicrokernelTester()
31317 .mr(7)
31318 .nr(8)
31319 .kr(1)
31320 .sr(1)
31321 .m(7)
31322 .n(8)
31323 .k(k)
31324 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31325 }
31326 }
31327 }
31328
31329 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8_strided_cn) {
31330 TEST_REQUIRES_X86_AVX;
31331 for (uint32_t n = 16; n <= 24; n += 8) {
31332 for (size_t k = 1; k <= 5; k += 2) {
31333 GemmMicrokernelTester()
31334 .mr(7)
31335 .nr(8)
31336 .kr(1)
31337 .sr(1)
31338 .m(7)
31339 .n(n)
31340 .k(k)
31341 .cn_stride(11)
31342 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31343 }
31344 }
31345 }
31346
31347 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8_subtile) {
31348 TEST_REQUIRES_X86_AVX;
31349 for (uint32_t n = 16; n <= 24; n += 8) {
31350 for (size_t k = 1; k <= 5; k += 2) {
31351 for (uint32_t m = 1; m <= 7; m++) {
31352 GemmMicrokernelTester()
31353 .mr(7)
31354 .nr(8)
31355 .kr(1)
31356 .sr(1)
31357 .m(m)
31358 .n(n)
31359 .k(k)
31360 .iterations(1)
31361 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31362 }
31363 }
31364 }
31365 }
31366
31367 TEST(F32_IGEMM_7X8__AVX_BROADCAST, small_kernel) {
31368 TEST_REQUIRES_X86_AVX;
31369 for (size_t k = 1; k <= 5; k += 2) {
31370 GemmMicrokernelTester()
31371 .mr(7)
31372 .nr(8)
31373 .kr(1)
31374 .sr(1)
31375 .m(7)
31376 .n(8)
31377 .k(k)
31378 .ks(3)
31379 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31380 }
31381 }
31382
31383 TEST(F32_IGEMM_7X8__AVX_BROADCAST, small_kernel_subtile) {
31384 TEST_REQUIRES_X86_AVX;
31385 for (size_t k = 1; k <= 5; k += 2) {
31386 for (uint32_t m = 1; m <= 7; m++) {
31387 for (uint32_t n = 1; n <= 8; n++) {
31388 GemmMicrokernelTester()
31389 .mr(7)
31390 .nr(8)
31391 .kr(1)
31392 .sr(1)
31393 .m(m)
31394 .n(n)
31395 .k(k)
31396 .ks(3)
31397 .iterations(1)
31398 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31399 }
31400 }
31401 }
31402 }
31403
31404 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_gt_8_small_kernel) {
31405 TEST_REQUIRES_X86_AVX;
31406 for (uint32_t n = 9; n < 16; n++) {
31407 for (size_t k = 1; k <= 5; k += 2) {
31408 GemmMicrokernelTester()
31409 .mr(7)
31410 .nr(8)
31411 .kr(1)
31412 .sr(1)
31413 .m(7)
31414 .n(8)
31415 .k(k)
31416 .ks(3)
31417 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31418 }
31419 }
31420 }
31421
31422 TEST(F32_IGEMM_7X8__AVX_BROADCAST, n_div_8_small_kernel) {
31423 TEST_REQUIRES_X86_AVX;
31424 for (uint32_t n = 16; n <= 24; n += 8) {
31425 for (size_t k = 1; k <= 5; k += 2) {
31426 GemmMicrokernelTester()
31427 .mr(7)
31428 .nr(8)
31429 .kr(1)
31430 .sr(1)
31431 .m(7)
31432 .n(8)
31433 .k(k)
31434 .ks(3)
31435 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31436 }
31437 }
31438 }
31439
31440 TEST(F32_IGEMM_7X8__AVX_BROADCAST, strided_cm_subtile) {
31441 TEST_REQUIRES_X86_AVX;
31442 for (size_t k = 1; k <= 5; k += 2) {
31443 for (uint32_t m = 1; m <= 7; m++) {
31444 for (uint32_t n = 1; n <= 8; n++) {
31445 GemmMicrokernelTester()
31446 .mr(7)
31447 .nr(8)
31448 .kr(1)
31449 .sr(1)
31450 .m(m)
31451 .n(n)
31452 .k(k)
31453 .cm_stride(11)
31454 .iterations(1)
31455 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31456 }
31457 }
31458 }
31459 }
31460
31461 TEST(F32_IGEMM_7X8__AVX_BROADCAST, a_offset) {
31462 TEST_REQUIRES_X86_AVX;
31463 for (size_t k = 1; k <= 5; k += 2) {
31464 GemmMicrokernelTester()
31465 .mr(7)
31466 .nr(8)
31467 .kr(1)
31468 .sr(1)
31469 .m(7)
31470 .n(8)
31471 .k(k)
31472 .ks(3)
31473 .a_offset(37)
31474 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31475 }
31476 }
31477
31478 TEST(F32_IGEMM_7X8__AVX_BROADCAST, zero) {
31479 TEST_REQUIRES_X86_AVX;
31480 for (uint32_t mz = 0; mz < 7; mz++) {
31481 for (size_t k = 1; k <= 5; k += 2) {
31482 GemmMicrokernelTester()
31483 .mr(7)
31484 .nr(8)
31485 .kr(1)
31486 .sr(1)
31487 .m(7)
31488 .n(8)
31489 .k(k)
31490 .ks(3)
31491 .a_offset(37)
31492 .zero_index(mz)
31493 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31494 }
31495 }
31496 }
31497
31498 TEST(F32_IGEMM_7X8__AVX_BROADCAST, qmin) {
31499 TEST_REQUIRES_X86_AVX;
31500 GemmMicrokernelTester()
31501 .mr(7)
31502 .nr(8)
31503 .kr(1)
31504 .sr(1)
31505 .m(7)
31506 .n(8)
31507 .k(1)
31508 .qmin(128)
31509 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31510 }
31511
31512 TEST(F32_IGEMM_7X8__AVX_BROADCAST, qmax) {
31513 TEST_REQUIRES_X86_AVX;
31514 GemmMicrokernelTester()
31515 .mr(7)
31516 .nr(8)
31517 .kr(1)
31518 .sr(1)
31519 .m(7)
31520 .n(8)
31521 .k(1)
31522 .qmax(128)
31523 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31524 }
31525
31526 TEST(F32_IGEMM_7X8__AVX_BROADCAST, strided_cm) {
31527 TEST_REQUIRES_X86_AVX;
31528 GemmMicrokernelTester()
31529 .mr(7)
31530 .nr(8)
31531 .kr(1)
31532 .sr(1)
31533 .m(7)
31534 .n(8)
31535 .k(1)
31536 .cm_stride(11)
31537 .Test(xnn_f32_igemm_ukernel_7x8__avx_broadcast);
31538 }
31539#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31540
31541
31542#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31543 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1) {
31544 TEST_REQUIRES_X86_AVX;
31545 GemmMicrokernelTester()
31546 .mr(1)
31547 .nr(16)
31548 .kr(1)
31549 .sr(1)
31550 .m(1)
31551 .n(16)
31552 .k(1)
31553 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31554 }
31555
31556 TEST(F32_IGEMM_1X16__AVX_BROADCAST, strided_cn) {
31557 TEST_REQUIRES_X86_AVX;
31558 GemmMicrokernelTester()
31559 .mr(1)
31560 .nr(16)
31561 .kr(1)
31562 .sr(1)
31563 .m(1)
31564 .n(16)
31565 .k(1)
31566 .cn_stride(19)
31567 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31568 }
31569
31570 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1_subtile) {
31571 TEST_REQUIRES_X86_AVX;
31572 for (uint32_t m = 1; m <= 1; m++) {
31573 for (uint32_t n = 1; n <= 16; n++) {
31574 GemmMicrokernelTester()
31575 .mr(1)
31576 .nr(16)
31577 .kr(1)
31578 .sr(1)
31579 .m(m)
31580 .n(n)
31581 .k(1)
31582 .iterations(1)
31583 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31584 }
31585 }
31586 }
31587
31588 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_m) {
31589 TEST_REQUIRES_X86_AVX;
31590 for (uint32_t m = 1; m <= 1; m++) {
31591 GemmMicrokernelTester()
31592 .mr(1)
31593 .nr(16)
31594 .kr(1)
31595 .sr(1)
31596 .m(m)
31597 .n(16)
31598 .k(1)
31599 .iterations(1)
31600 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31601 }
31602 }
31603
31604 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_n) {
31605 TEST_REQUIRES_X86_AVX;
31606 for (uint32_t n = 1; n <= 16; n++) {
31607 GemmMicrokernelTester()
31608 .mr(1)
31609 .nr(16)
31610 .kr(1)
31611 .sr(1)
31612 .m(1)
31613 .n(n)
31614 .k(1)
31615 .iterations(1)
31616 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31617 }
31618 }
31619
31620 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_gt_1) {
31621 TEST_REQUIRES_X86_AVX;
31622 for (size_t k = 2; k < 10; k++) {
31623 GemmMicrokernelTester()
31624 .mr(1)
31625 .nr(16)
31626 .kr(1)
31627 .sr(1)
31628 .m(1)
31629 .n(16)
31630 .k(k)
31631 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31632 }
31633 }
31634
31635 TEST(F32_IGEMM_1X16__AVX_BROADCAST, k_gt_1_subtile) {
31636 TEST_REQUIRES_X86_AVX;
31637 for (size_t k = 2; k < 10; k++) {
31638 for (uint32_t m = 1; m <= 1; m++) {
31639 for (uint32_t n = 1; n <= 16; n++) {
31640 GemmMicrokernelTester()
31641 .mr(1)
31642 .nr(16)
31643 .kr(1)
31644 .sr(1)
31645 .m(m)
31646 .n(n)
31647 .k(k)
31648 .iterations(1)
31649 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31650 }
31651 }
31652 }
31653 }
31654
31655 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16) {
31656 TEST_REQUIRES_X86_AVX;
31657 for (uint32_t n = 17; n < 32; n++) {
31658 for (size_t k = 1; k <= 5; k += 2) {
31659 GemmMicrokernelTester()
31660 .mr(1)
31661 .nr(16)
31662 .kr(1)
31663 .sr(1)
31664 .m(1)
31665 .n(16)
31666 .k(k)
31667 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31668 }
31669 }
31670 }
31671
31672 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16_strided_cn) {
31673 TEST_REQUIRES_X86_AVX;
31674 for (uint32_t n = 17; n < 32; n++) {
31675 for (size_t k = 1; k <= 5; k += 2) {
31676 GemmMicrokernelTester()
31677 .mr(1)
31678 .nr(16)
31679 .kr(1)
31680 .sr(1)
31681 .m(1)
31682 .n(16)
31683 .k(k)
31684 .cn_stride(19)
31685 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31686 }
31687 }
31688 }
31689
31690 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16_subtile) {
31691 TEST_REQUIRES_X86_AVX;
31692 for (uint32_t n = 17; n < 32; n++) {
31693 for (size_t k = 1; k <= 5; k += 2) {
31694 for (uint32_t m = 1; m <= 1; m++) {
31695 GemmMicrokernelTester()
31696 .mr(1)
31697 .nr(16)
31698 .kr(1)
31699 .sr(1)
31700 .m(m)
31701 .n(n)
31702 .k(k)
31703 .iterations(1)
31704 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31705 }
31706 }
31707 }
31708 }
31709
31710 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16) {
31711 TEST_REQUIRES_X86_AVX;
31712 for (uint32_t n = 32; n <= 48; n += 16) {
31713 for (size_t k = 1; k <= 5; k += 2) {
31714 GemmMicrokernelTester()
31715 .mr(1)
31716 .nr(16)
31717 .kr(1)
31718 .sr(1)
31719 .m(1)
31720 .n(16)
31721 .k(k)
31722 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31723 }
31724 }
31725 }
31726
31727 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16_strided_cn) {
31728 TEST_REQUIRES_X86_AVX;
31729 for (uint32_t n = 32; n <= 48; n += 16) {
31730 for (size_t k = 1; k <= 5; k += 2) {
31731 GemmMicrokernelTester()
31732 .mr(1)
31733 .nr(16)
31734 .kr(1)
31735 .sr(1)
31736 .m(1)
31737 .n(n)
31738 .k(k)
31739 .cn_stride(19)
31740 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31741 }
31742 }
31743 }
31744
31745 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16_subtile) {
31746 TEST_REQUIRES_X86_AVX;
31747 for (uint32_t n = 32; n <= 48; n += 16) {
31748 for (size_t k = 1; k <= 5; k += 2) {
31749 for (uint32_t m = 1; m <= 1; m++) {
31750 GemmMicrokernelTester()
31751 .mr(1)
31752 .nr(16)
31753 .kr(1)
31754 .sr(1)
31755 .m(m)
31756 .n(n)
31757 .k(k)
31758 .iterations(1)
31759 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31760 }
31761 }
31762 }
31763 }
31764
31765 TEST(F32_IGEMM_1X16__AVX_BROADCAST, small_kernel) {
31766 TEST_REQUIRES_X86_AVX;
31767 for (size_t k = 1; k <= 5; k += 2) {
31768 GemmMicrokernelTester()
31769 .mr(1)
31770 .nr(16)
31771 .kr(1)
31772 .sr(1)
31773 .m(1)
31774 .n(16)
31775 .k(k)
31776 .ks(3)
31777 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31778 }
31779 }
31780
31781 TEST(F32_IGEMM_1X16__AVX_BROADCAST, small_kernel_subtile) {
31782 TEST_REQUIRES_X86_AVX;
31783 for (size_t k = 1; k <= 5; k += 2) {
31784 for (uint32_t m = 1; m <= 1; m++) {
31785 for (uint32_t n = 1; n <= 16; n++) {
31786 GemmMicrokernelTester()
31787 .mr(1)
31788 .nr(16)
31789 .kr(1)
31790 .sr(1)
31791 .m(m)
31792 .n(n)
31793 .k(k)
31794 .ks(3)
31795 .iterations(1)
31796 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31797 }
31798 }
31799 }
31800 }
31801
31802 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_gt_16_small_kernel) {
31803 TEST_REQUIRES_X86_AVX;
31804 for (uint32_t n = 17; n < 32; n++) {
31805 for (size_t k = 1; k <= 5; k += 2) {
31806 GemmMicrokernelTester()
31807 .mr(1)
31808 .nr(16)
31809 .kr(1)
31810 .sr(1)
31811 .m(1)
31812 .n(16)
31813 .k(k)
31814 .ks(3)
31815 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31816 }
31817 }
31818 }
31819
31820 TEST(F32_IGEMM_1X16__AVX_BROADCAST, n_div_16_small_kernel) {
31821 TEST_REQUIRES_X86_AVX;
31822 for (uint32_t n = 32; n <= 48; n += 16) {
31823 for (size_t k = 1; k <= 5; k += 2) {
31824 GemmMicrokernelTester()
31825 .mr(1)
31826 .nr(16)
31827 .kr(1)
31828 .sr(1)
31829 .m(1)
31830 .n(16)
31831 .k(k)
31832 .ks(3)
31833 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31834 }
31835 }
31836 }
31837
31838 TEST(F32_IGEMM_1X16__AVX_BROADCAST, strided_cm_subtile) {
31839 TEST_REQUIRES_X86_AVX;
31840 for (size_t k = 1; k <= 5; k += 2) {
31841 for (uint32_t m = 1; m <= 1; m++) {
31842 for (uint32_t n = 1; n <= 16; n++) {
31843 GemmMicrokernelTester()
31844 .mr(1)
31845 .nr(16)
31846 .kr(1)
31847 .sr(1)
31848 .m(m)
31849 .n(n)
31850 .k(k)
31851 .cm_stride(19)
31852 .iterations(1)
31853 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31854 }
31855 }
31856 }
31857 }
31858
31859 TEST(F32_IGEMM_1X16__AVX_BROADCAST, a_offset) {
31860 TEST_REQUIRES_X86_AVX;
31861 for (size_t k = 1; k <= 5; k += 2) {
31862 GemmMicrokernelTester()
31863 .mr(1)
31864 .nr(16)
31865 .kr(1)
31866 .sr(1)
31867 .m(1)
31868 .n(16)
31869 .k(k)
31870 .ks(3)
31871 .a_offset(7)
31872 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31873 }
31874 }
31875
31876 TEST(F32_IGEMM_1X16__AVX_BROADCAST, zero) {
31877 TEST_REQUIRES_X86_AVX;
31878 for (uint32_t mz = 0; mz < 1; mz++) {
31879 for (size_t k = 1; k <= 5; k += 2) {
31880 GemmMicrokernelTester()
31881 .mr(1)
31882 .nr(16)
31883 .kr(1)
31884 .sr(1)
31885 .m(1)
31886 .n(16)
31887 .k(k)
31888 .ks(3)
31889 .a_offset(7)
31890 .zero_index(mz)
31891 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31892 }
31893 }
31894 }
31895
31896 TEST(F32_IGEMM_1X16__AVX_BROADCAST, qmin) {
31897 TEST_REQUIRES_X86_AVX;
31898 GemmMicrokernelTester()
31899 .mr(1)
31900 .nr(16)
31901 .kr(1)
31902 .sr(1)
31903 .m(1)
31904 .n(16)
31905 .k(1)
31906 .qmin(128)
31907 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31908 }
31909
31910 TEST(F32_IGEMM_1X16__AVX_BROADCAST, qmax) {
31911 TEST_REQUIRES_X86_AVX;
31912 GemmMicrokernelTester()
31913 .mr(1)
31914 .nr(16)
31915 .kr(1)
31916 .sr(1)
31917 .m(1)
31918 .n(16)
31919 .k(1)
31920 .qmax(128)
31921 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31922 }
31923
31924 TEST(F32_IGEMM_1X16__AVX_BROADCAST, strided_cm) {
31925 TEST_REQUIRES_X86_AVX;
31926 GemmMicrokernelTester()
31927 .mr(1)
31928 .nr(16)
31929 .kr(1)
31930 .sr(1)
31931 .m(1)
31932 .n(16)
31933 .k(1)
31934 .cm_stride(19)
31935 .Test(xnn_f32_igemm_ukernel_1x16__avx_broadcast);
31936 }
31937#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31938
31939
31940#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31941 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1) {
31942 TEST_REQUIRES_X86_AVX;
31943 GemmMicrokernelTester()
31944 .mr(3)
31945 .nr(16)
31946 .kr(1)
31947 .sr(1)
31948 .m(3)
31949 .n(16)
31950 .k(1)
31951 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
31952 }
31953
31954 TEST(F32_IGEMM_3X16__AVX_BROADCAST, strided_cn) {
31955 TEST_REQUIRES_X86_AVX;
31956 GemmMicrokernelTester()
31957 .mr(3)
31958 .nr(16)
31959 .kr(1)
31960 .sr(1)
31961 .m(3)
31962 .n(16)
31963 .k(1)
31964 .cn_stride(19)
31965 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
31966 }
31967
31968 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1_subtile) {
31969 TEST_REQUIRES_X86_AVX;
31970 for (uint32_t m = 1; m <= 3; m++) {
31971 for (uint32_t n = 1; n <= 16; n++) {
31972 GemmMicrokernelTester()
31973 .mr(3)
31974 .nr(16)
31975 .kr(1)
31976 .sr(1)
31977 .m(m)
31978 .n(n)
31979 .k(1)
31980 .iterations(1)
31981 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
31982 }
31983 }
31984 }
31985
31986 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
31987 TEST_REQUIRES_X86_AVX;
31988 for (uint32_t m = 1; m <= 3; m++) {
31989 GemmMicrokernelTester()
31990 .mr(3)
31991 .nr(16)
31992 .kr(1)
31993 .sr(1)
31994 .m(m)
31995 .n(16)
31996 .k(1)
31997 .iterations(1)
31998 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
31999 }
32000 }
32001
32002 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
32003 TEST_REQUIRES_X86_AVX;
32004 for (uint32_t n = 1; n <= 16; n++) {
32005 GemmMicrokernelTester()
32006 .mr(3)
32007 .nr(16)
32008 .kr(1)
32009 .sr(1)
32010 .m(3)
32011 .n(n)
32012 .k(1)
32013 .iterations(1)
32014 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32015 }
32016 }
32017
32018 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_gt_1) {
32019 TEST_REQUIRES_X86_AVX;
32020 for (size_t k = 2; k < 10; k++) {
32021 GemmMicrokernelTester()
32022 .mr(3)
32023 .nr(16)
32024 .kr(1)
32025 .sr(1)
32026 .m(3)
32027 .n(16)
32028 .k(k)
32029 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32030 }
32031 }
32032
32033 TEST(F32_IGEMM_3X16__AVX_BROADCAST, k_gt_1_subtile) {
32034 TEST_REQUIRES_X86_AVX;
32035 for (size_t k = 2; k < 10; k++) {
32036 for (uint32_t m = 1; m <= 3; m++) {
32037 for (uint32_t n = 1; n <= 16; n++) {
32038 GemmMicrokernelTester()
32039 .mr(3)
32040 .nr(16)
32041 .kr(1)
32042 .sr(1)
32043 .m(m)
32044 .n(n)
32045 .k(k)
32046 .iterations(1)
32047 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32048 }
32049 }
32050 }
32051 }
32052
32053 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16) {
32054 TEST_REQUIRES_X86_AVX;
32055 for (uint32_t n = 17; n < 32; n++) {
32056 for (size_t k = 1; k <= 5; k += 2) {
32057 GemmMicrokernelTester()
32058 .mr(3)
32059 .nr(16)
32060 .kr(1)
32061 .sr(1)
32062 .m(3)
32063 .n(16)
32064 .k(k)
32065 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32066 }
32067 }
32068 }
32069
32070 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
32071 TEST_REQUIRES_X86_AVX;
32072 for (uint32_t n = 17; n < 32; n++) {
32073 for (size_t k = 1; k <= 5; k += 2) {
32074 GemmMicrokernelTester()
32075 .mr(3)
32076 .nr(16)
32077 .kr(1)
32078 .sr(1)
32079 .m(3)
32080 .n(16)
32081 .k(k)
32082 .cn_stride(19)
32083 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32084 }
32085 }
32086 }
32087
32088 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16_subtile) {
32089 TEST_REQUIRES_X86_AVX;
32090 for (uint32_t n = 17; n < 32; n++) {
32091 for (size_t k = 1; k <= 5; k += 2) {
32092 for (uint32_t m = 1; m <= 3; m++) {
32093 GemmMicrokernelTester()
32094 .mr(3)
32095 .nr(16)
32096 .kr(1)
32097 .sr(1)
32098 .m(m)
32099 .n(n)
32100 .k(k)
32101 .iterations(1)
32102 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32103 }
32104 }
32105 }
32106 }
32107
32108 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16) {
32109 TEST_REQUIRES_X86_AVX;
32110 for (uint32_t n = 32; n <= 48; n += 16) {
32111 for (size_t k = 1; k <= 5; k += 2) {
32112 GemmMicrokernelTester()
32113 .mr(3)
32114 .nr(16)
32115 .kr(1)
32116 .sr(1)
32117 .m(3)
32118 .n(16)
32119 .k(k)
32120 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32121 }
32122 }
32123 }
32124
32125 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
32126 TEST_REQUIRES_X86_AVX;
32127 for (uint32_t n = 32; n <= 48; n += 16) {
32128 for (size_t k = 1; k <= 5; k += 2) {
32129 GemmMicrokernelTester()
32130 .mr(3)
32131 .nr(16)
32132 .kr(1)
32133 .sr(1)
32134 .m(3)
32135 .n(n)
32136 .k(k)
32137 .cn_stride(19)
32138 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32139 }
32140 }
32141 }
32142
32143 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16_subtile) {
32144 TEST_REQUIRES_X86_AVX;
32145 for (uint32_t n = 32; n <= 48; n += 16) {
32146 for (size_t k = 1; k <= 5; k += 2) {
32147 for (uint32_t m = 1; m <= 3; m++) {
32148 GemmMicrokernelTester()
32149 .mr(3)
32150 .nr(16)
32151 .kr(1)
32152 .sr(1)
32153 .m(m)
32154 .n(n)
32155 .k(k)
32156 .iterations(1)
32157 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32158 }
32159 }
32160 }
32161 }
32162
32163 TEST(F32_IGEMM_3X16__AVX_BROADCAST, small_kernel) {
32164 TEST_REQUIRES_X86_AVX;
32165 for (size_t k = 1; k <= 5; k += 2) {
32166 GemmMicrokernelTester()
32167 .mr(3)
32168 .nr(16)
32169 .kr(1)
32170 .sr(1)
32171 .m(3)
32172 .n(16)
32173 .k(k)
32174 .ks(3)
32175 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32176 }
32177 }
32178
32179 TEST(F32_IGEMM_3X16__AVX_BROADCAST, small_kernel_subtile) {
32180 TEST_REQUIRES_X86_AVX;
32181 for (size_t k = 1; k <= 5; k += 2) {
32182 for (uint32_t m = 1; m <= 3; m++) {
32183 for (uint32_t n = 1; n <= 16; n++) {
32184 GemmMicrokernelTester()
32185 .mr(3)
32186 .nr(16)
32187 .kr(1)
32188 .sr(1)
32189 .m(m)
32190 .n(n)
32191 .k(k)
32192 .ks(3)
32193 .iterations(1)
32194 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32195 }
32196 }
32197 }
32198 }
32199
32200 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_gt_16_small_kernel) {
32201 TEST_REQUIRES_X86_AVX;
32202 for (uint32_t n = 17; n < 32; n++) {
32203 for (size_t k = 1; k <= 5; k += 2) {
32204 GemmMicrokernelTester()
32205 .mr(3)
32206 .nr(16)
32207 .kr(1)
32208 .sr(1)
32209 .m(3)
32210 .n(16)
32211 .k(k)
32212 .ks(3)
32213 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32214 }
32215 }
32216 }
32217
32218 TEST(F32_IGEMM_3X16__AVX_BROADCAST, n_div_16_small_kernel) {
32219 TEST_REQUIRES_X86_AVX;
32220 for (uint32_t n = 32; n <= 48; n += 16) {
32221 for (size_t k = 1; k <= 5; k += 2) {
32222 GemmMicrokernelTester()
32223 .mr(3)
32224 .nr(16)
32225 .kr(1)
32226 .sr(1)
32227 .m(3)
32228 .n(16)
32229 .k(k)
32230 .ks(3)
32231 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32232 }
32233 }
32234 }
32235
32236 TEST(F32_IGEMM_3X16__AVX_BROADCAST, strided_cm_subtile) {
32237 TEST_REQUIRES_X86_AVX;
32238 for (size_t k = 1; k <= 5; k += 2) {
32239 for (uint32_t m = 1; m <= 3; m++) {
32240 for (uint32_t n = 1; n <= 16; n++) {
32241 GemmMicrokernelTester()
32242 .mr(3)
32243 .nr(16)
32244 .kr(1)
32245 .sr(1)
32246 .m(m)
32247 .n(n)
32248 .k(k)
32249 .cm_stride(19)
32250 .iterations(1)
32251 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32252 }
32253 }
32254 }
32255 }
32256
32257 TEST(F32_IGEMM_3X16__AVX_BROADCAST, a_offset) {
32258 TEST_REQUIRES_X86_AVX;
32259 for (size_t k = 1; k <= 5; k += 2) {
32260 GemmMicrokernelTester()
32261 .mr(3)
32262 .nr(16)
32263 .kr(1)
32264 .sr(1)
32265 .m(3)
32266 .n(16)
32267 .k(k)
32268 .ks(3)
32269 .a_offset(17)
32270 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32271 }
32272 }
32273
32274 TEST(F32_IGEMM_3X16__AVX_BROADCAST, zero) {
32275 TEST_REQUIRES_X86_AVX;
32276 for (uint32_t mz = 0; mz < 3; mz++) {
32277 for (size_t k = 1; k <= 5; k += 2) {
32278 GemmMicrokernelTester()
32279 .mr(3)
32280 .nr(16)
32281 .kr(1)
32282 .sr(1)
32283 .m(3)
32284 .n(16)
32285 .k(k)
32286 .ks(3)
32287 .a_offset(17)
32288 .zero_index(mz)
32289 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32290 }
32291 }
32292 }
32293
32294 TEST(F32_IGEMM_3X16__AVX_BROADCAST, qmin) {
32295 TEST_REQUIRES_X86_AVX;
32296 GemmMicrokernelTester()
32297 .mr(3)
32298 .nr(16)
32299 .kr(1)
32300 .sr(1)
32301 .m(3)
32302 .n(16)
32303 .k(1)
32304 .qmin(128)
32305 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32306 }
32307
32308 TEST(F32_IGEMM_3X16__AVX_BROADCAST, qmax) {
32309 TEST_REQUIRES_X86_AVX;
32310 GemmMicrokernelTester()
32311 .mr(3)
32312 .nr(16)
32313 .kr(1)
32314 .sr(1)
32315 .m(3)
32316 .n(16)
32317 .k(1)
32318 .qmax(128)
32319 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32320 }
32321
32322 TEST(F32_IGEMM_3X16__AVX_BROADCAST, strided_cm) {
32323 TEST_REQUIRES_X86_AVX;
32324 GemmMicrokernelTester()
32325 .mr(3)
32326 .nr(16)
32327 .kr(1)
32328 .sr(1)
32329 .m(3)
32330 .n(16)
32331 .k(1)
32332 .cm_stride(19)
32333 .Test(xnn_f32_igemm_ukernel_3x16__avx_broadcast);
32334 }
32335#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32336
32337
32338#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32339 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1) {
32340 TEST_REQUIRES_X86_AVX;
32341 GemmMicrokernelTester()
32342 .mr(4)
32343 .nr(16)
32344 .kr(1)
32345 .sr(1)
32346 .m(4)
32347 .n(16)
32348 .k(1)
32349 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32350 }
32351
32352 TEST(F32_IGEMM_4X16__AVX_BROADCAST, strided_cn) {
32353 TEST_REQUIRES_X86_AVX;
32354 GemmMicrokernelTester()
32355 .mr(4)
32356 .nr(16)
32357 .kr(1)
32358 .sr(1)
32359 .m(4)
32360 .n(16)
32361 .k(1)
32362 .cn_stride(19)
32363 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32364 }
32365
32366 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1_subtile) {
32367 TEST_REQUIRES_X86_AVX;
32368 for (uint32_t m = 1; m <= 4; m++) {
32369 for (uint32_t n = 1; n <= 16; n++) {
32370 GemmMicrokernelTester()
32371 .mr(4)
32372 .nr(16)
32373 .kr(1)
32374 .sr(1)
32375 .m(m)
32376 .n(n)
32377 .k(1)
32378 .iterations(1)
32379 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32380 }
32381 }
32382 }
32383
32384 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_m) {
32385 TEST_REQUIRES_X86_AVX;
32386 for (uint32_t m = 1; m <= 4; m++) {
32387 GemmMicrokernelTester()
32388 .mr(4)
32389 .nr(16)
32390 .kr(1)
32391 .sr(1)
32392 .m(m)
32393 .n(16)
32394 .k(1)
32395 .iterations(1)
32396 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32397 }
32398 }
32399
32400 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_n) {
32401 TEST_REQUIRES_X86_AVX;
32402 for (uint32_t n = 1; n <= 16; n++) {
32403 GemmMicrokernelTester()
32404 .mr(4)
32405 .nr(16)
32406 .kr(1)
32407 .sr(1)
32408 .m(4)
32409 .n(n)
32410 .k(1)
32411 .iterations(1)
32412 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32413 }
32414 }
32415
32416 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_gt_1) {
32417 TEST_REQUIRES_X86_AVX;
32418 for (size_t k = 2; k < 10; k++) {
32419 GemmMicrokernelTester()
32420 .mr(4)
32421 .nr(16)
32422 .kr(1)
32423 .sr(1)
32424 .m(4)
32425 .n(16)
32426 .k(k)
32427 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32428 }
32429 }
32430
32431 TEST(F32_IGEMM_4X16__AVX_BROADCAST, k_gt_1_subtile) {
32432 TEST_REQUIRES_X86_AVX;
32433 for (size_t k = 2; k < 10; k++) {
32434 for (uint32_t m = 1; m <= 4; m++) {
32435 for (uint32_t n = 1; n <= 16; n++) {
32436 GemmMicrokernelTester()
32437 .mr(4)
32438 .nr(16)
32439 .kr(1)
32440 .sr(1)
32441 .m(m)
32442 .n(n)
32443 .k(k)
32444 .iterations(1)
32445 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32446 }
32447 }
32448 }
32449 }
32450
32451 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16) {
32452 TEST_REQUIRES_X86_AVX;
32453 for (uint32_t n = 17; n < 32; n++) {
32454 for (size_t k = 1; k <= 5; k += 2) {
32455 GemmMicrokernelTester()
32456 .mr(4)
32457 .nr(16)
32458 .kr(1)
32459 .sr(1)
32460 .m(4)
32461 .n(16)
32462 .k(k)
32463 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32464 }
32465 }
32466 }
32467
32468 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16_strided_cn) {
32469 TEST_REQUIRES_X86_AVX;
32470 for (uint32_t n = 17; n < 32; n++) {
32471 for (size_t k = 1; k <= 5; k += 2) {
32472 GemmMicrokernelTester()
32473 .mr(4)
32474 .nr(16)
32475 .kr(1)
32476 .sr(1)
32477 .m(4)
32478 .n(16)
32479 .k(k)
32480 .cn_stride(19)
32481 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32482 }
32483 }
32484 }
32485
32486 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16_subtile) {
32487 TEST_REQUIRES_X86_AVX;
32488 for (uint32_t n = 17; n < 32; n++) {
32489 for (size_t k = 1; k <= 5; k += 2) {
32490 for (uint32_t m = 1; m <= 4; m++) {
32491 GemmMicrokernelTester()
32492 .mr(4)
32493 .nr(16)
32494 .kr(1)
32495 .sr(1)
32496 .m(m)
32497 .n(n)
32498 .k(k)
32499 .iterations(1)
32500 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32501 }
32502 }
32503 }
32504 }
32505
32506 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16) {
32507 TEST_REQUIRES_X86_AVX;
32508 for (uint32_t n = 32; n <= 48; n += 16) {
32509 for (size_t k = 1; k <= 5; k += 2) {
32510 GemmMicrokernelTester()
32511 .mr(4)
32512 .nr(16)
32513 .kr(1)
32514 .sr(1)
32515 .m(4)
32516 .n(16)
32517 .k(k)
32518 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32519 }
32520 }
32521 }
32522
32523 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16_strided_cn) {
32524 TEST_REQUIRES_X86_AVX;
32525 for (uint32_t n = 32; n <= 48; n += 16) {
32526 for (size_t k = 1; k <= 5; k += 2) {
32527 GemmMicrokernelTester()
32528 .mr(4)
32529 .nr(16)
32530 .kr(1)
32531 .sr(1)
32532 .m(4)
32533 .n(n)
32534 .k(k)
32535 .cn_stride(19)
32536 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32537 }
32538 }
32539 }
32540
32541 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16_subtile) {
32542 TEST_REQUIRES_X86_AVX;
32543 for (uint32_t n = 32; n <= 48; n += 16) {
32544 for (size_t k = 1; k <= 5; k += 2) {
32545 for (uint32_t m = 1; m <= 4; m++) {
32546 GemmMicrokernelTester()
32547 .mr(4)
32548 .nr(16)
32549 .kr(1)
32550 .sr(1)
32551 .m(m)
32552 .n(n)
32553 .k(k)
32554 .iterations(1)
32555 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32556 }
32557 }
32558 }
32559 }
32560
32561 TEST(F32_IGEMM_4X16__AVX_BROADCAST, small_kernel) {
32562 TEST_REQUIRES_X86_AVX;
32563 for (size_t k = 1; k <= 5; k += 2) {
32564 GemmMicrokernelTester()
32565 .mr(4)
32566 .nr(16)
32567 .kr(1)
32568 .sr(1)
32569 .m(4)
32570 .n(16)
32571 .k(k)
32572 .ks(3)
32573 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32574 }
32575 }
32576
32577 TEST(F32_IGEMM_4X16__AVX_BROADCAST, small_kernel_subtile) {
32578 TEST_REQUIRES_X86_AVX;
32579 for (size_t k = 1; k <= 5; k += 2) {
32580 for (uint32_t m = 1; m <= 4; m++) {
32581 for (uint32_t n = 1; n <= 16; n++) {
32582 GemmMicrokernelTester()
32583 .mr(4)
32584 .nr(16)
32585 .kr(1)
32586 .sr(1)
32587 .m(m)
32588 .n(n)
32589 .k(k)
32590 .ks(3)
32591 .iterations(1)
32592 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32593 }
32594 }
32595 }
32596 }
32597
32598 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_gt_16_small_kernel) {
32599 TEST_REQUIRES_X86_AVX;
32600 for (uint32_t n = 17; n < 32; n++) {
32601 for (size_t k = 1; k <= 5; k += 2) {
32602 GemmMicrokernelTester()
32603 .mr(4)
32604 .nr(16)
32605 .kr(1)
32606 .sr(1)
32607 .m(4)
32608 .n(16)
32609 .k(k)
32610 .ks(3)
32611 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32612 }
32613 }
32614 }
32615
32616 TEST(F32_IGEMM_4X16__AVX_BROADCAST, n_div_16_small_kernel) {
32617 TEST_REQUIRES_X86_AVX;
32618 for (uint32_t n = 32; n <= 48; n += 16) {
32619 for (size_t k = 1; k <= 5; k += 2) {
32620 GemmMicrokernelTester()
32621 .mr(4)
32622 .nr(16)
32623 .kr(1)
32624 .sr(1)
32625 .m(4)
32626 .n(16)
32627 .k(k)
32628 .ks(3)
32629 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32630 }
32631 }
32632 }
32633
32634 TEST(F32_IGEMM_4X16__AVX_BROADCAST, strided_cm_subtile) {
32635 TEST_REQUIRES_X86_AVX;
32636 for (size_t k = 1; k <= 5; k += 2) {
32637 for (uint32_t m = 1; m <= 4; m++) {
32638 for (uint32_t n = 1; n <= 16; n++) {
32639 GemmMicrokernelTester()
32640 .mr(4)
32641 .nr(16)
32642 .kr(1)
32643 .sr(1)
32644 .m(m)
32645 .n(n)
32646 .k(k)
32647 .cm_stride(19)
32648 .iterations(1)
32649 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32650 }
32651 }
32652 }
32653 }
32654
32655 TEST(F32_IGEMM_4X16__AVX_BROADCAST, a_offset) {
32656 TEST_REQUIRES_X86_AVX;
32657 for (size_t k = 1; k <= 5; k += 2) {
32658 GemmMicrokernelTester()
32659 .mr(4)
32660 .nr(16)
32661 .kr(1)
32662 .sr(1)
32663 .m(4)
32664 .n(16)
32665 .k(k)
32666 .ks(3)
32667 .a_offset(23)
32668 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32669 }
32670 }
32671
32672 TEST(F32_IGEMM_4X16__AVX_BROADCAST, zero) {
32673 TEST_REQUIRES_X86_AVX;
32674 for (uint32_t mz = 0; mz < 4; mz++) {
32675 for (size_t k = 1; k <= 5; k += 2) {
32676 GemmMicrokernelTester()
32677 .mr(4)
32678 .nr(16)
32679 .kr(1)
32680 .sr(1)
32681 .m(4)
32682 .n(16)
32683 .k(k)
32684 .ks(3)
32685 .a_offset(23)
32686 .zero_index(mz)
32687 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32688 }
32689 }
32690 }
32691
32692 TEST(F32_IGEMM_4X16__AVX_BROADCAST, qmin) {
32693 TEST_REQUIRES_X86_AVX;
32694 GemmMicrokernelTester()
32695 .mr(4)
32696 .nr(16)
32697 .kr(1)
32698 .sr(1)
32699 .m(4)
32700 .n(16)
32701 .k(1)
32702 .qmin(128)
32703 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32704 }
32705
32706 TEST(F32_IGEMM_4X16__AVX_BROADCAST, qmax) {
32707 TEST_REQUIRES_X86_AVX;
32708 GemmMicrokernelTester()
32709 .mr(4)
32710 .nr(16)
32711 .kr(1)
32712 .sr(1)
32713 .m(4)
32714 .n(16)
32715 .k(1)
32716 .qmax(128)
32717 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32718 }
32719
32720 TEST(F32_IGEMM_4X16__AVX_BROADCAST, strided_cm) {
32721 TEST_REQUIRES_X86_AVX;
32722 GemmMicrokernelTester()
32723 .mr(4)
32724 .nr(16)
32725 .kr(1)
32726 .sr(1)
32727 .m(4)
32728 .n(16)
32729 .k(1)
32730 .cm_stride(19)
32731 .Test(xnn_f32_igemm_ukernel_4x16__avx_broadcast);
32732 }
32733#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32734
32735
32736#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32737 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1) {
32738 TEST_REQUIRES_X86_AVX;
32739 GemmMicrokernelTester()
32740 .mr(5)
32741 .nr(16)
32742 .kr(1)
32743 .sr(1)
32744 .m(5)
32745 .n(16)
32746 .k(1)
32747 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32748 }
32749
32750 TEST(F32_IGEMM_5X16__AVX_BROADCAST, strided_cn) {
32751 TEST_REQUIRES_X86_AVX;
32752 GemmMicrokernelTester()
32753 .mr(5)
32754 .nr(16)
32755 .kr(1)
32756 .sr(1)
32757 .m(5)
32758 .n(16)
32759 .k(1)
32760 .cn_stride(19)
32761 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32762 }
32763
32764 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1_subtile) {
32765 TEST_REQUIRES_X86_AVX;
32766 for (uint32_t m = 1; m <= 5; m++) {
32767 for (uint32_t n = 1; n <= 16; n++) {
32768 GemmMicrokernelTester()
32769 .mr(5)
32770 .nr(16)
32771 .kr(1)
32772 .sr(1)
32773 .m(m)
32774 .n(n)
32775 .k(1)
32776 .iterations(1)
32777 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32778 }
32779 }
32780 }
32781
32782 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_m) {
32783 TEST_REQUIRES_X86_AVX;
32784 for (uint32_t m = 1; m <= 5; m++) {
32785 GemmMicrokernelTester()
32786 .mr(5)
32787 .nr(16)
32788 .kr(1)
32789 .sr(1)
32790 .m(m)
32791 .n(16)
32792 .k(1)
32793 .iterations(1)
32794 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32795 }
32796 }
32797
32798 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_n) {
32799 TEST_REQUIRES_X86_AVX;
32800 for (uint32_t n = 1; n <= 16; n++) {
32801 GemmMicrokernelTester()
32802 .mr(5)
32803 .nr(16)
32804 .kr(1)
32805 .sr(1)
32806 .m(5)
32807 .n(n)
32808 .k(1)
32809 .iterations(1)
32810 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32811 }
32812 }
32813
32814 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_gt_1) {
32815 TEST_REQUIRES_X86_AVX;
32816 for (size_t k = 2; k < 10; k++) {
32817 GemmMicrokernelTester()
32818 .mr(5)
32819 .nr(16)
32820 .kr(1)
32821 .sr(1)
32822 .m(5)
32823 .n(16)
32824 .k(k)
32825 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32826 }
32827 }
32828
32829 TEST(F32_IGEMM_5X16__AVX_BROADCAST, k_gt_1_subtile) {
32830 TEST_REQUIRES_X86_AVX;
32831 for (size_t k = 2; k < 10; k++) {
32832 for (uint32_t m = 1; m <= 5; m++) {
32833 for (uint32_t n = 1; n <= 16; n++) {
32834 GemmMicrokernelTester()
32835 .mr(5)
32836 .nr(16)
32837 .kr(1)
32838 .sr(1)
32839 .m(m)
32840 .n(n)
32841 .k(k)
32842 .iterations(1)
32843 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32844 }
32845 }
32846 }
32847 }
32848
32849 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16) {
32850 TEST_REQUIRES_X86_AVX;
32851 for (uint32_t n = 17; n < 32; n++) {
32852 for (size_t k = 1; k <= 5; k += 2) {
32853 GemmMicrokernelTester()
32854 .mr(5)
32855 .nr(16)
32856 .kr(1)
32857 .sr(1)
32858 .m(5)
32859 .n(16)
32860 .k(k)
32861 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32862 }
32863 }
32864 }
32865
32866 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16_strided_cn) {
32867 TEST_REQUIRES_X86_AVX;
32868 for (uint32_t n = 17; n < 32; n++) {
32869 for (size_t k = 1; k <= 5; k += 2) {
32870 GemmMicrokernelTester()
32871 .mr(5)
32872 .nr(16)
32873 .kr(1)
32874 .sr(1)
32875 .m(5)
32876 .n(16)
32877 .k(k)
32878 .cn_stride(19)
32879 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32880 }
32881 }
32882 }
32883
32884 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16_subtile) {
32885 TEST_REQUIRES_X86_AVX;
32886 for (uint32_t n = 17; n < 32; n++) {
32887 for (size_t k = 1; k <= 5; k += 2) {
32888 for (uint32_t m = 1; m <= 5; m++) {
32889 GemmMicrokernelTester()
32890 .mr(5)
32891 .nr(16)
32892 .kr(1)
32893 .sr(1)
32894 .m(m)
32895 .n(n)
32896 .k(k)
32897 .iterations(1)
32898 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32899 }
32900 }
32901 }
32902 }
32903
32904 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16) {
32905 TEST_REQUIRES_X86_AVX;
32906 for (uint32_t n = 32; n <= 48; n += 16) {
32907 for (size_t k = 1; k <= 5; k += 2) {
32908 GemmMicrokernelTester()
32909 .mr(5)
32910 .nr(16)
32911 .kr(1)
32912 .sr(1)
32913 .m(5)
32914 .n(16)
32915 .k(k)
32916 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32917 }
32918 }
32919 }
32920
32921 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16_strided_cn) {
32922 TEST_REQUIRES_X86_AVX;
32923 for (uint32_t n = 32; n <= 48; n += 16) {
32924 for (size_t k = 1; k <= 5; k += 2) {
32925 GemmMicrokernelTester()
32926 .mr(5)
32927 .nr(16)
32928 .kr(1)
32929 .sr(1)
32930 .m(5)
32931 .n(n)
32932 .k(k)
32933 .cn_stride(19)
32934 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32935 }
32936 }
32937 }
32938
32939 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16_subtile) {
32940 TEST_REQUIRES_X86_AVX;
32941 for (uint32_t n = 32; n <= 48; n += 16) {
32942 for (size_t k = 1; k <= 5; k += 2) {
32943 for (uint32_t m = 1; m <= 5; m++) {
32944 GemmMicrokernelTester()
32945 .mr(5)
32946 .nr(16)
32947 .kr(1)
32948 .sr(1)
32949 .m(m)
32950 .n(n)
32951 .k(k)
32952 .iterations(1)
32953 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32954 }
32955 }
32956 }
32957 }
32958
32959 TEST(F32_IGEMM_5X16__AVX_BROADCAST, small_kernel) {
32960 TEST_REQUIRES_X86_AVX;
32961 for (size_t k = 1; k <= 5; k += 2) {
32962 GemmMicrokernelTester()
32963 .mr(5)
32964 .nr(16)
32965 .kr(1)
32966 .sr(1)
32967 .m(5)
32968 .n(16)
32969 .k(k)
32970 .ks(3)
32971 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32972 }
32973 }
32974
32975 TEST(F32_IGEMM_5X16__AVX_BROADCAST, small_kernel_subtile) {
32976 TEST_REQUIRES_X86_AVX;
32977 for (size_t k = 1; k <= 5; k += 2) {
32978 for (uint32_t m = 1; m <= 5; m++) {
32979 for (uint32_t n = 1; n <= 16; n++) {
32980 GemmMicrokernelTester()
32981 .mr(5)
32982 .nr(16)
32983 .kr(1)
32984 .sr(1)
32985 .m(m)
32986 .n(n)
32987 .k(k)
32988 .ks(3)
32989 .iterations(1)
32990 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
32991 }
32992 }
32993 }
32994 }
32995
32996 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_gt_16_small_kernel) {
32997 TEST_REQUIRES_X86_AVX;
32998 for (uint32_t n = 17; n < 32; n++) {
32999 for (size_t k = 1; k <= 5; k += 2) {
33000 GemmMicrokernelTester()
33001 .mr(5)
33002 .nr(16)
33003 .kr(1)
33004 .sr(1)
33005 .m(5)
33006 .n(16)
33007 .k(k)
33008 .ks(3)
33009 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33010 }
33011 }
33012 }
33013
33014 TEST(F32_IGEMM_5X16__AVX_BROADCAST, n_div_16_small_kernel) {
33015 TEST_REQUIRES_X86_AVX;
33016 for (uint32_t n = 32; n <= 48; n += 16) {
33017 for (size_t k = 1; k <= 5; k += 2) {
33018 GemmMicrokernelTester()
33019 .mr(5)
33020 .nr(16)
33021 .kr(1)
33022 .sr(1)
33023 .m(5)
33024 .n(16)
33025 .k(k)
33026 .ks(3)
33027 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33028 }
33029 }
33030 }
33031
33032 TEST(F32_IGEMM_5X16__AVX_BROADCAST, strided_cm_subtile) {
33033 TEST_REQUIRES_X86_AVX;
33034 for (size_t k = 1; k <= 5; k += 2) {
33035 for (uint32_t m = 1; m <= 5; m++) {
33036 for (uint32_t n = 1; n <= 16; n++) {
33037 GemmMicrokernelTester()
33038 .mr(5)
33039 .nr(16)
33040 .kr(1)
33041 .sr(1)
33042 .m(m)
33043 .n(n)
33044 .k(k)
33045 .cm_stride(19)
33046 .iterations(1)
33047 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33048 }
33049 }
33050 }
33051 }
33052
33053 TEST(F32_IGEMM_5X16__AVX_BROADCAST, a_offset) {
33054 TEST_REQUIRES_X86_AVX;
33055 for (size_t k = 1; k <= 5; k += 2) {
33056 GemmMicrokernelTester()
33057 .mr(5)
33058 .nr(16)
33059 .kr(1)
33060 .sr(1)
33061 .m(5)
33062 .n(16)
33063 .k(k)
33064 .ks(3)
33065 .a_offset(29)
33066 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33067 }
33068 }
33069
33070 TEST(F32_IGEMM_5X16__AVX_BROADCAST, zero) {
33071 TEST_REQUIRES_X86_AVX;
33072 for (uint32_t mz = 0; mz < 5; mz++) {
33073 for (size_t k = 1; k <= 5; k += 2) {
33074 GemmMicrokernelTester()
33075 .mr(5)
33076 .nr(16)
33077 .kr(1)
33078 .sr(1)
33079 .m(5)
33080 .n(16)
33081 .k(k)
33082 .ks(3)
33083 .a_offset(29)
33084 .zero_index(mz)
33085 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33086 }
33087 }
33088 }
33089
33090 TEST(F32_IGEMM_5X16__AVX_BROADCAST, qmin) {
33091 TEST_REQUIRES_X86_AVX;
33092 GemmMicrokernelTester()
33093 .mr(5)
33094 .nr(16)
33095 .kr(1)
33096 .sr(1)
33097 .m(5)
33098 .n(16)
33099 .k(1)
33100 .qmin(128)
33101 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33102 }
33103
33104 TEST(F32_IGEMM_5X16__AVX_BROADCAST, qmax) {
33105 TEST_REQUIRES_X86_AVX;
33106 GemmMicrokernelTester()
33107 .mr(5)
33108 .nr(16)
33109 .kr(1)
33110 .sr(1)
33111 .m(5)
33112 .n(16)
33113 .k(1)
33114 .qmax(128)
33115 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33116 }
33117
33118 TEST(F32_IGEMM_5X16__AVX_BROADCAST, strided_cm) {
33119 TEST_REQUIRES_X86_AVX;
33120 GemmMicrokernelTester()
33121 .mr(5)
33122 .nr(16)
33123 .kr(1)
33124 .sr(1)
33125 .m(5)
33126 .n(16)
33127 .k(1)
33128 .cm_stride(19)
33129 .Test(xnn_f32_igemm_ukernel_5x16__avx_broadcast);
33130 }
33131#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33132
33133
33134#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33135 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1) {
33136 TEST_REQUIRES_X86_FMA3;
33137 GemmMicrokernelTester()
33138 .mr(1)
33139 .nr(8)
33140 .kr(1)
33141 .sr(1)
33142 .m(1)
33143 .n(8)
33144 .k(1)
33145 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33146 }
33147
33148 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, strided_cn) {
33149 TEST_REQUIRES_X86_FMA3;
33150 GemmMicrokernelTester()
33151 .mr(1)
33152 .nr(8)
33153 .kr(1)
33154 .sr(1)
33155 .m(1)
33156 .n(8)
33157 .k(1)
33158 .cn_stride(11)
33159 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33160 }
33161
33162 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
33163 TEST_REQUIRES_X86_FMA3;
33164 for (uint32_t m = 1; m <= 1; m++) {
33165 for (uint32_t n = 1; n <= 8; n++) {
33166 GemmMicrokernelTester()
33167 .mr(1)
33168 .nr(8)
33169 .kr(1)
33170 .sr(1)
33171 .m(m)
33172 .n(n)
33173 .k(1)
33174 .iterations(1)
33175 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33176 }
33177 }
33178 }
33179
33180 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
33181 TEST_REQUIRES_X86_FMA3;
33182 for (uint32_t m = 1; m <= 1; m++) {
33183 GemmMicrokernelTester()
33184 .mr(1)
33185 .nr(8)
33186 .kr(1)
33187 .sr(1)
33188 .m(m)
33189 .n(8)
33190 .k(1)
33191 .iterations(1)
33192 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33193 }
33194 }
33195
33196 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
33197 TEST_REQUIRES_X86_FMA3;
33198 for (uint32_t n = 1; n <= 8; n++) {
33199 GemmMicrokernelTester()
33200 .mr(1)
33201 .nr(8)
33202 .kr(1)
33203 .sr(1)
33204 .m(1)
33205 .n(n)
33206 .k(1)
33207 .iterations(1)
33208 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33209 }
33210 }
33211
33212 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_gt_1) {
33213 TEST_REQUIRES_X86_FMA3;
33214 for (size_t k = 2; k < 10; k++) {
33215 GemmMicrokernelTester()
33216 .mr(1)
33217 .nr(8)
33218 .kr(1)
33219 .sr(1)
33220 .m(1)
33221 .n(8)
33222 .k(k)
33223 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33224 }
33225 }
33226
33227 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
33228 TEST_REQUIRES_X86_FMA3;
33229 for (size_t k = 2; k < 10; k++) {
33230 for (uint32_t m = 1; m <= 1; m++) {
33231 for (uint32_t n = 1; n <= 8; n++) {
33232 GemmMicrokernelTester()
33233 .mr(1)
33234 .nr(8)
33235 .kr(1)
33236 .sr(1)
33237 .m(m)
33238 .n(n)
33239 .k(k)
33240 .iterations(1)
33241 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33242 }
33243 }
33244 }
33245 }
33246
33247 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8) {
33248 TEST_REQUIRES_X86_FMA3;
33249 for (uint32_t n = 9; n < 16; n++) {
33250 for (size_t k = 1; k <= 5; k += 2) {
33251 GemmMicrokernelTester()
33252 .mr(1)
33253 .nr(8)
33254 .kr(1)
33255 .sr(1)
33256 .m(1)
33257 .n(8)
33258 .k(k)
33259 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33260 }
33261 }
33262 }
33263
33264 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
33265 TEST_REQUIRES_X86_FMA3;
33266 for (uint32_t n = 9; n < 16; n++) {
33267 for (size_t k = 1; k <= 5; k += 2) {
33268 GemmMicrokernelTester()
33269 .mr(1)
33270 .nr(8)
33271 .kr(1)
33272 .sr(1)
33273 .m(1)
33274 .n(8)
33275 .k(k)
33276 .cn_stride(11)
33277 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33278 }
33279 }
33280 }
33281
33282 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
33283 TEST_REQUIRES_X86_FMA3;
33284 for (uint32_t n = 9; n < 16; n++) {
33285 for (size_t k = 1; k <= 5; k += 2) {
33286 for (uint32_t m = 1; m <= 1; m++) {
33287 GemmMicrokernelTester()
33288 .mr(1)
33289 .nr(8)
33290 .kr(1)
33291 .sr(1)
33292 .m(m)
33293 .n(n)
33294 .k(k)
33295 .iterations(1)
33296 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33297 }
33298 }
33299 }
33300 }
33301
33302 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8) {
33303 TEST_REQUIRES_X86_FMA3;
33304 for (uint32_t n = 16; n <= 24; n += 8) {
33305 for (size_t k = 1; k <= 5; k += 2) {
33306 GemmMicrokernelTester()
33307 .mr(1)
33308 .nr(8)
33309 .kr(1)
33310 .sr(1)
33311 .m(1)
33312 .n(8)
33313 .k(k)
33314 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33315 }
33316 }
33317 }
33318
33319 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
33320 TEST_REQUIRES_X86_FMA3;
33321 for (uint32_t n = 16; n <= 24; n += 8) {
33322 for (size_t k = 1; k <= 5; k += 2) {
33323 GemmMicrokernelTester()
33324 .mr(1)
33325 .nr(8)
33326 .kr(1)
33327 .sr(1)
33328 .m(1)
33329 .n(n)
33330 .k(k)
33331 .cn_stride(11)
33332 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33333 }
33334 }
33335 }
33336
33337 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8_subtile) {
33338 TEST_REQUIRES_X86_FMA3;
33339 for (uint32_t n = 16; n <= 24; n += 8) {
33340 for (size_t k = 1; k <= 5; k += 2) {
33341 for (uint32_t m = 1; m <= 1; m++) {
33342 GemmMicrokernelTester()
33343 .mr(1)
33344 .nr(8)
33345 .kr(1)
33346 .sr(1)
33347 .m(m)
33348 .n(n)
33349 .k(k)
33350 .iterations(1)
33351 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33352 }
33353 }
33354 }
33355 }
33356
33357 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, small_kernel) {
33358 TEST_REQUIRES_X86_FMA3;
33359 for (size_t k = 1; k <= 5; k += 2) {
33360 GemmMicrokernelTester()
33361 .mr(1)
33362 .nr(8)
33363 .kr(1)
33364 .sr(1)
33365 .m(1)
33366 .n(8)
33367 .k(k)
33368 .ks(3)
33369 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33370 }
33371 }
33372
33373 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, small_kernel_subtile) {
33374 TEST_REQUIRES_X86_FMA3;
33375 for (size_t k = 1; k <= 5; k += 2) {
33376 for (uint32_t m = 1; m <= 1; m++) {
33377 for (uint32_t n = 1; n <= 8; n++) {
33378 GemmMicrokernelTester()
33379 .mr(1)
33380 .nr(8)
33381 .kr(1)
33382 .sr(1)
33383 .m(m)
33384 .n(n)
33385 .k(k)
33386 .ks(3)
33387 .iterations(1)
33388 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33389 }
33390 }
33391 }
33392 }
33393
33394 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
33395 TEST_REQUIRES_X86_FMA3;
33396 for (uint32_t n = 9; n < 16; n++) {
33397 for (size_t k = 1; k <= 5; k += 2) {
33398 GemmMicrokernelTester()
33399 .mr(1)
33400 .nr(8)
33401 .kr(1)
33402 .sr(1)
33403 .m(1)
33404 .n(8)
33405 .k(k)
33406 .ks(3)
33407 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33408 }
33409 }
33410 }
33411
33412 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, n_div_8_small_kernel) {
33413 TEST_REQUIRES_X86_FMA3;
33414 for (uint32_t n = 16; n <= 24; n += 8) {
33415 for (size_t k = 1; k <= 5; k += 2) {
33416 GemmMicrokernelTester()
33417 .mr(1)
33418 .nr(8)
33419 .kr(1)
33420 .sr(1)
33421 .m(1)
33422 .n(8)
33423 .k(k)
33424 .ks(3)
33425 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33426 }
33427 }
33428 }
33429
33430 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, strided_cm_subtile) {
33431 TEST_REQUIRES_X86_FMA3;
33432 for (size_t k = 1; k <= 5; k += 2) {
33433 for (uint32_t m = 1; m <= 1; m++) {
33434 for (uint32_t n = 1; n <= 8; n++) {
33435 GemmMicrokernelTester()
33436 .mr(1)
33437 .nr(8)
33438 .kr(1)
33439 .sr(1)
33440 .m(m)
33441 .n(n)
33442 .k(k)
33443 .cm_stride(11)
33444 .iterations(1)
33445 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33446 }
33447 }
33448 }
33449 }
33450
33451 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, a_offset) {
33452 TEST_REQUIRES_X86_FMA3;
33453 for (size_t k = 1; k <= 5; k += 2) {
33454 GemmMicrokernelTester()
33455 .mr(1)
33456 .nr(8)
33457 .kr(1)
33458 .sr(1)
33459 .m(1)
33460 .n(8)
33461 .k(k)
33462 .ks(3)
33463 .a_offset(7)
33464 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33465 }
33466 }
33467
33468 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, zero) {
33469 TEST_REQUIRES_X86_FMA3;
33470 for (uint32_t mz = 0; mz < 1; mz++) {
33471 for (size_t k = 1; k <= 5; k += 2) {
33472 GemmMicrokernelTester()
33473 .mr(1)
33474 .nr(8)
33475 .kr(1)
33476 .sr(1)
33477 .m(1)
33478 .n(8)
33479 .k(k)
33480 .ks(3)
33481 .a_offset(7)
33482 .zero_index(mz)
33483 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33484 }
33485 }
33486 }
33487
33488 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, qmin) {
33489 TEST_REQUIRES_X86_FMA3;
33490 GemmMicrokernelTester()
33491 .mr(1)
33492 .nr(8)
33493 .kr(1)
33494 .sr(1)
33495 .m(1)
33496 .n(8)
33497 .k(1)
33498 .qmin(128)
33499 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33500 }
33501
33502 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, qmax) {
33503 TEST_REQUIRES_X86_FMA3;
33504 GemmMicrokernelTester()
33505 .mr(1)
33506 .nr(8)
33507 .kr(1)
33508 .sr(1)
33509 .m(1)
33510 .n(8)
33511 .k(1)
33512 .qmax(128)
33513 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33514 }
33515
33516 TEST(F32_IGEMM_1X8__FMA3_BROADCAST, strided_cm) {
33517 TEST_REQUIRES_X86_FMA3;
33518 GemmMicrokernelTester()
33519 .mr(1)
33520 .nr(8)
33521 .kr(1)
33522 .sr(1)
33523 .m(1)
33524 .n(8)
33525 .k(1)
33526 .cm_stride(11)
33527 .Test(xnn_f32_igemm_ukernel_1x8__fma3_broadcast);
33528 }
33529#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33530
33531
33532#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33533 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1) {
33534 TEST_REQUIRES_X86_FMA3;
33535 GemmMicrokernelTester()
33536 .mr(4)
33537 .nr(8)
33538 .kr(1)
33539 .sr(1)
33540 .m(4)
33541 .n(8)
33542 .k(1)
33543 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33544 }
33545
33546 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, strided_cn) {
33547 TEST_REQUIRES_X86_FMA3;
33548 GemmMicrokernelTester()
33549 .mr(4)
33550 .nr(8)
33551 .kr(1)
33552 .sr(1)
33553 .m(4)
33554 .n(8)
33555 .k(1)
33556 .cn_stride(11)
33557 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33558 }
33559
33560 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
33561 TEST_REQUIRES_X86_FMA3;
33562 for (uint32_t m = 1; m <= 4; m++) {
33563 for (uint32_t n = 1; n <= 8; n++) {
33564 GemmMicrokernelTester()
33565 .mr(4)
33566 .nr(8)
33567 .kr(1)
33568 .sr(1)
33569 .m(m)
33570 .n(n)
33571 .k(1)
33572 .iterations(1)
33573 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33574 }
33575 }
33576 }
33577
33578 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
33579 TEST_REQUIRES_X86_FMA3;
33580 for (uint32_t m = 1; m <= 4; m++) {
33581 GemmMicrokernelTester()
33582 .mr(4)
33583 .nr(8)
33584 .kr(1)
33585 .sr(1)
33586 .m(m)
33587 .n(8)
33588 .k(1)
33589 .iterations(1)
33590 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33591 }
33592 }
33593
33594 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
33595 TEST_REQUIRES_X86_FMA3;
33596 for (uint32_t n = 1; n <= 8; n++) {
33597 GemmMicrokernelTester()
33598 .mr(4)
33599 .nr(8)
33600 .kr(1)
33601 .sr(1)
33602 .m(4)
33603 .n(n)
33604 .k(1)
33605 .iterations(1)
33606 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33607 }
33608 }
33609
33610 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_gt_1) {
33611 TEST_REQUIRES_X86_FMA3;
33612 for (size_t k = 2; k < 10; k++) {
33613 GemmMicrokernelTester()
33614 .mr(4)
33615 .nr(8)
33616 .kr(1)
33617 .sr(1)
33618 .m(4)
33619 .n(8)
33620 .k(k)
33621 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33622 }
33623 }
33624
33625 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
33626 TEST_REQUIRES_X86_FMA3;
33627 for (size_t k = 2; k < 10; k++) {
33628 for (uint32_t m = 1; m <= 4; m++) {
33629 for (uint32_t n = 1; n <= 8; n++) {
33630 GemmMicrokernelTester()
33631 .mr(4)
33632 .nr(8)
33633 .kr(1)
33634 .sr(1)
33635 .m(m)
33636 .n(n)
33637 .k(k)
33638 .iterations(1)
33639 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33640 }
33641 }
33642 }
33643 }
33644
33645 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8) {
33646 TEST_REQUIRES_X86_FMA3;
33647 for (uint32_t n = 9; n < 16; n++) {
33648 for (size_t k = 1; k <= 5; k += 2) {
33649 GemmMicrokernelTester()
33650 .mr(4)
33651 .nr(8)
33652 .kr(1)
33653 .sr(1)
33654 .m(4)
33655 .n(8)
33656 .k(k)
33657 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33658 }
33659 }
33660 }
33661
33662 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
33663 TEST_REQUIRES_X86_FMA3;
33664 for (uint32_t n = 9; n < 16; n++) {
33665 for (size_t k = 1; k <= 5; k += 2) {
33666 GemmMicrokernelTester()
33667 .mr(4)
33668 .nr(8)
33669 .kr(1)
33670 .sr(1)
33671 .m(4)
33672 .n(8)
33673 .k(k)
33674 .cn_stride(11)
33675 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33676 }
33677 }
33678 }
33679
33680 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
33681 TEST_REQUIRES_X86_FMA3;
33682 for (uint32_t n = 9; n < 16; n++) {
33683 for (size_t k = 1; k <= 5; k += 2) {
33684 for (uint32_t m = 1; m <= 4; m++) {
33685 GemmMicrokernelTester()
33686 .mr(4)
33687 .nr(8)
33688 .kr(1)
33689 .sr(1)
33690 .m(m)
33691 .n(n)
33692 .k(k)
33693 .iterations(1)
33694 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33695 }
33696 }
33697 }
33698 }
33699
33700 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8) {
33701 TEST_REQUIRES_X86_FMA3;
33702 for (uint32_t n = 16; n <= 24; n += 8) {
33703 for (size_t k = 1; k <= 5; k += 2) {
33704 GemmMicrokernelTester()
33705 .mr(4)
33706 .nr(8)
33707 .kr(1)
33708 .sr(1)
33709 .m(4)
33710 .n(8)
33711 .k(k)
33712 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33713 }
33714 }
33715 }
33716
33717 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
33718 TEST_REQUIRES_X86_FMA3;
33719 for (uint32_t n = 16; n <= 24; n += 8) {
33720 for (size_t k = 1; k <= 5; k += 2) {
33721 GemmMicrokernelTester()
33722 .mr(4)
33723 .nr(8)
33724 .kr(1)
33725 .sr(1)
33726 .m(4)
33727 .n(n)
33728 .k(k)
33729 .cn_stride(11)
33730 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33731 }
33732 }
33733 }
33734
33735 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8_subtile) {
33736 TEST_REQUIRES_X86_FMA3;
33737 for (uint32_t n = 16; n <= 24; n += 8) {
33738 for (size_t k = 1; k <= 5; k += 2) {
33739 for (uint32_t m = 1; m <= 4; m++) {
33740 GemmMicrokernelTester()
33741 .mr(4)
33742 .nr(8)
33743 .kr(1)
33744 .sr(1)
33745 .m(m)
33746 .n(n)
33747 .k(k)
33748 .iterations(1)
33749 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33750 }
33751 }
33752 }
33753 }
33754
33755 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, small_kernel) {
33756 TEST_REQUIRES_X86_FMA3;
33757 for (size_t k = 1; k <= 5; k += 2) {
33758 GemmMicrokernelTester()
33759 .mr(4)
33760 .nr(8)
33761 .kr(1)
33762 .sr(1)
33763 .m(4)
33764 .n(8)
33765 .k(k)
33766 .ks(3)
33767 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33768 }
33769 }
33770
33771 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, small_kernel_subtile) {
33772 TEST_REQUIRES_X86_FMA3;
33773 for (size_t k = 1; k <= 5; k += 2) {
33774 for (uint32_t m = 1; m <= 4; m++) {
33775 for (uint32_t n = 1; n <= 8; n++) {
33776 GemmMicrokernelTester()
33777 .mr(4)
33778 .nr(8)
33779 .kr(1)
33780 .sr(1)
33781 .m(m)
33782 .n(n)
33783 .k(k)
33784 .ks(3)
33785 .iterations(1)
33786 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33787 }
33788 }
33789 }
33790 }
33791
33792 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
33793 TEST_REQUIRES_X86_FMA3;
33794 for (uint32_t n = 9; n < 16; n++) {
33795 for (size_t k = 1; k <= 5; k += 2) {
33796 GemmMicrokernelTester()
33797 .mr(4)
33798 .nr(8)
33799 .kr(1)
33800 .sr(1)
33801 .m(4)
33802 .n(8)
33803 .k(k)
33804 .ks(3)
33805 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33806 }
33807 }
33808 }
33809
33810 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, n_div_8_small_kernel) {
33811 TEST_REQUIRES_X86_FMA3;
33812 for (uint32_t n = 16; n <= 24; n += 8) {
33813 for (size_t k = 1; k <= 5; k += 2) {
33814 GemmMicrokernelTester()
33815 .mr(4)
33816 .nr(8)
33817 .kr(1)
33818 .sr(1)
33819 .m(4)
33820 .n(8)
33821 .k(k)
33822 .ks(3)
33823 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33824 }
33825 }
33826 }
33827
33828 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, strided_cm_subtile) {
33829 TEST_REQUIRES_X86_FMA3;
33830 for (size_t k = 1; k <= 5; k += 2) {
33831 for (uint32_t m = 1; m <= 4; m++) {
33832 for (uint32_t n = 1; n <= 8; n++) {
33833 GemmMicrokernelTester()
33834 .mr(4)
33835 .nr(8)
33836 .kr(1)
33837 .sr(1)
33838 .m(m)
33839 .n(n)
33840 .k(k)
33841 .cm_stride(11)
33842 .iterations(1)
33843 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33844 }
33845 }
33846 }
33847 }
33848
33849 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, a_offset) {
33850 TEST_REQUIRES_X86_FMA3;
33851 for (size_t k = 1; k <= 5; k += 2) {
33852 GemmMicrokernelTester()
33853 .mr(4)
33854 .nr(8)
33855 .kr(1)
33856 .sr(1)
33857 .m(4)
33858 .n(8)
33859 .k(k)
33860 .ks(3)
33861 .a_offset(23)
33862 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33863 }
33864 }
33865
33866 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, zero) {
33867 TEST_REQUIRES_X86_FMA3;
33868 for (uint32_t mz = 0; mz < 4; mz++) {
33869 for (size_t k = 1; k <= 5; k += 2) {
33870 GemmMicrokernelTester()
33871 .mr(4)
33872 .nr(8)
33873 .kr(1)
33874 .sr(1)
33875 .m(4)
33876 .n(8)
33877 .k(k)
33878 .ks(3)
33879 .a_offset(23)
33880 .zero_index(mz)
33881 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33882 }
33883 }
33884 }
33885
33886 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, qmin) {
33887 TEST_REQUIRES_X86_FMA3;
33888 GemmMicrokernelTester()
33889 .mr(4)
33890 .nr(8)
33891 .kr(1)
33892 .sr(1)
33893 .m(4)
33894 .n(8)
33895 .k(1)
33896 .qmin(128)
33897 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33898 }
33899
33900 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, qmax) {
33901 TEST_REQUIRES_X86_FMA3;
33902 GemmMicrokernelTester()
33903 .mr(4)
33904 .nr(8)
33905 .kr(1)
33906 .sr(1)
33907 .m(4)
33908 .n(8)
33909 .k(1)
33910 .qmax(128)
33911 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33912 }
33913
33914 TEST(F32_IGEMM_4X8__FMA3_BROADCAST, strided_cm) {
33915 TEST_REQUIRES_X86_FMA3;
33916 GemmMicrokernelTester()
33917 .mr(4)
33918 .nr(8)
33919 .kr(1)
33920 .sr(1)
33921 .m(4)
33922 .n(8)
33923 .k(1)
33924 .cm_stride(11)
33925 .Test(xnn_f32_igemm_ukernel_4x8__fma3_broadcast);
33926 }
33927#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33928
33929
33930#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33931 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1) {
33932 TEST_REQUIRES_X86_FMA3;
33933 GemmMicrokernelTester()
33934 .mr(5)
33935 .nr(8)
33936 .kr(1)
33937 .sr(1)
33938 .m(5)
33939 .n(8)
33940 .k(1)
33941 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
33942 }
33943
33944 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, strided_cn) {
33945 TEST_REQUIRES_X86_FMA3;
33946 GemmMicrokernelTester()
33947 .mr(5)
33948 .nr(8)
33949 .kr(1)
33950 .sr(1)
33951 .m(5)
33952 .n(8)
33953 .k(1)
33954 .cn_stride(11)
33955 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
33956 }
33957
33958 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
33959 TEST_REQUIRES_X86_FMA3;
33960 for (uint32_t m = 1; m <= 5; m++) {
33961 for (uint32_t n = 1; n <= 8; n++) {
33962 GemmMicrokernelTester()
33963 .mr(5)
33964 .nr(8)
33965 .kr(1)
33966 .sr(1)
33967 .m(m)
33968 .n(n)
33969 .k(1)
33970 .iterations(1)
33971 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
33972 }
33973 }
33974 }
33975
33976 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
33977 TEST_REQUIRES_X86_FMA3;
33978 for (uint32_t m = 1; m <= 5; m++) {
33979 GemmMicrokernelTester()
33980 .mr(5)
33981 .nr(8)
33982 .kr(1)
33983 .sr(1)
33984 .m(m)
33985 .n(8)
33986 .k(1)
33987 .iterations(1)
33988 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
33989 }
33990 }
33991
33992 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
33993 TEST_REQUIRES_X86_FMA3;
33994 for (uint32_t n = 1; n <= 8; n++) {
33995 GemmMicrokernelTester()
33996 .mr(5)
33997 .nr(8)
33998 .kr(1)
33999 .sr(1)
34000 .m(5)
34001 .n(n)
34002 .k(1)
34003 .iterations(1)
34004 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34005 }
34006 }
34007
34008 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_gt_1) {
34009 TEST_REQUIRES_X86_FMA3;
34010 for (size_t k = 2; k < 10; k++) {
34011 GemmMicrokernelTester()
34012 .mr(5)
34013 .nr(8)
34014 .kr(1)
34015 .sr(1)
34016 .m(5)
34017 .n(8)
34018 .k(k)
34019 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34020 }
34021 }
34022
34023 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
34024 TEST_REQUIRES_X86_FMA3;
34025 for (size_t k = 2; k < 10; k++) {
34026 for (uint32_t m = 1; m <= 5; m++) {
34027 for (uint32_t n = 1; n <= 8; n++) {
34028 GemmMicrokernelTester()
34029 .mr(5)
34030 .nr(8)
34031 .kr(1)
34032 .sr(1)
34033 .m(m)
34034 .n(n)
34035 .k(k)
34036 .iterations(1)
34037 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34038 }
34039 }
34040 }
34041 }
34042
34043 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8) {
34044 TEST_REQUIRES_X86_FMA3;
34045 for (uint32_t n = 9; n < 16; n++) {
34046 for (size_t k = 1; k <= 5; k += 2) {
34047 GemmMicrokernelTester()
34048 .mr(5)
34049 .nr(8)
34050 .kr(1)
34051 .sr(1)
34052 .m(5)
34053 .n(8)
34054 .k(k)
34055 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34056 }
34057 }
34058 }
34059
34060 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
34061 TEST_REQUIRES_X86_FMA3;
34062 for (uint32_t n = 9; n < 16; n++) {
34063 for (size_t k = 1; k <= 5; k += 2) {
34064 GemmMicrokernelTester()
34065 .mr(5)
34066 .nr(8)
34067 .kr(1)
34068 .sr(1)
34069 .m(5)
34070 .n(8)
34071 .k(k)
34072 .cn_stride(11)
34073 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34074 }
34075 }
34076 }
34077
34078 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
34079 TEST_REQUIRES_X86_FMA3;
34080 for (uint32_t n = 9; n < 16; n++) {
34081 for (size_t k = 1; k <= 5; k += 2) {
34082 for (uint32_t m = 1; m <= 5; m++) {
34083 GemmMicrokernelTester()
34084 .mr(5)
34085 .nr(8)
34086 .kr(1)
34087 .sr(1)
34088 .m(m)
34089 .n(n)
34090 .k(k)
34091 .iterations(1)
34092 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34093 }
34094 }
34095 }
34096 }
34097
34098 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8) {
34099 TEST_REQUIRES_X86_FMA3;
34100 for (uint32_t n = 16; n <= 24; n += 8) {
34101 for (size_t k = 1; k <= 5; k += 2) {
34102 GemmMicrokernelTester()
34103 .mr(5)
34104 .nr(8)
34105 .kr(1)
34106 .sr(1)
34107 .m(5)
34108 .n(8)
34109 .k(k)
34110 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34111 }
34112 }
34113 }
34114
34115 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
34116 TEST_REQUIRES_X86_FMA3;
34117 for (uint32_t n = 16; n <= 24; n += 8) {
34118 for (size_t k = 1; k <= 5; k += 2) {
34119 GemmMicrokernelTester()
34120 .mr(5)
34121 .nr(8)
34122 .kr(1)
34123 .sr(1)
34124 .m(5)
34125 .n(n)
34126 .k(k)
34127 .cn_stride(11)
34128 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34129 }
34130 }
34131 }
34132
34133 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8_subtile) {
34134 TEST_REQUIRES_X86_FMA3;
34135 for (uint32_t n = 16; n <= 24; n += 8) {
34136 for (size_t k = 1; k <= 5; k += 2) {
34137 for (uint32_t m = 1; m <= 5; m++) {
34138 GemmMicrokernelTester()
34139 .mr(5)
34140 .nr(8)
34141 .kr(1)
34142 .sr(1)
34143 .m(m)
34144 .n(n)
34145 .k(k)
34146 .iterations(1)
34147 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34148 }
34149 }
34150 }
34151 }
34152
34153 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, small_kernel) {
34154 TEST_REQUIRES_X86_FMA3;
34155 for (size_t k = 1; k <= 5; k += 2) {
34156 GemmMicrokernelTester()
34157 .mr(5)
34158 .nr(8)
34159 .kr(1)
34160 .sr(1)
34161 .m(5)
34162 .n(8)
34163 .k(k)
34164 .ks(3)
34165 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34166 }
34167 }
34168
34169 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, small_kernel_subtile) {
34170 TEST_REQUIRES_X86_FMA3;
34171 for (size_t k = 1; k <= 5; k += 2) {
34172 for (uint32_t m = 1; m <= 5; m++) {
34173 for (uint32_t n = 1; n <= 8; n++) {
34174 GemmMicrokernelTester()
34175 .mr(5)
34176 .nr(8)
34177 .kr(1)
34178 .sr(1)
34179 .m(m)
34180 .n(n)
34181 .k(k)
34182 .ks(3)
34183 .iterations(1)
34184 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34185 }
34186 }
34187 }
34188 }
34189
34190 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
34191 TEST_REQUIRES_X86_FMA3;
34192 for (uint32_t n = 9; n < 16; n++) {
34193 for (size_t k = 1; k <= 5; k += 2) {
34194 GemmMicrokernelTester()
34195 .mr(5)
34196 .nr(8)
34197 .kr(1)
34198 .sr(1)
34199 .m(5)
34200 .n(8)
34201 .k(k)
34202 .ks(3)
34203 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34204 }
34205 }
34206 }
34207
34208 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, n_div_8_small_kernel) {
34209 TEST_REQUIRES_X86_FMA3;
34210 for (uint32_t n = 16; n <= 24; n += 8) {
34211 for (size_t k = 1; k <= 5; k += 2) {
34212 GemmMicrokernelTester()
34213 .mr(5)
34214 .nr(8)
34215 .kr(1)
34216 .sr(1)
34217 .m(5)
34218 .n(8)
34219 .k(k)
34220 .ks(3)
34221 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34222 }
34223 }
34224 }
34225
34226 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, strided_cm_subtile) {
34227 TEST_REQUIRES_X86_FMA3;
34228 for (size_t k = 1; k <= 5; k += 2) {
34229 for (uint32_t m = 1; m <= 5; m++) {
34230 for (uint32_t n = 1; n <= 8; n++) {
34231 GemmMicrokernelTester()
34232 .mr(5)
34233 .nr(8)
34234 .kr(1)
34235 .sr(1)
34236 .m(m)
34237 .n(n)
34238 .k(k)
34239 .cm_stride(11)
34240 .iterations(1)
34241 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34242 }
34243 }
34244 }
34245 }
34246
34247 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, a_offset) {
34248 TEST_REQUIRES_X86_FMA3;
34249 for (size_t k = 1; k <= 5; k += 2) {
34250 GemmMicrokernelTester()
34251 .mr(5)
34252 .nr(8)
34253 .kr(1)
34254 .sr(1)
34255 .m(5)
34256 .n(8)
34257 .k(k)
34258 .ks(3)
34259 .a_offset(29)
34260 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34261 }
34262 }
34263
34264 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, zero) {
34265 TEST_REQUIRES_X86_FMA3;
34266 for (uint32_t mz = 0; mz < 5; mz++) {
34267 for (size_t k = 1; k <= 5; k += 2) {
34268 GemmMicrokernelTester()
34269 .mr(5)
34270 .nr(8)
34271 .kr(1)
34272 .sr(1)
34273 .m(5)
34274 .n(8)
34275 .k(k)
34276 .ks(3)
34277 .a_offset(29)
34278 .zero_index(mz)
34279 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34280 }
34281 }
34282 }
34283
34284 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, qmin) {
34285 TEST_REQUIRES_X86_FMA3;
34286 GemmMicrokernelTester()
34287 .mr(5)
34288 .nr(8)
34289 .kr(1)
34290 .sr(1)
34291 .m(5)
34292 .n(8)
34293 .k(1)
34294 .qmin(128)
34295 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34296 }
34297
34298 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, qmax) {
34299 TEST_REQUIRES_X86_FMA3;
34300 GemmMicrokernelTester()
34301 .mr(5)
34302 .nr(8)
34303 .kr(1)
34304 .sr(1)
34305 .m(5)
34306 .n(8)
34307 .k(1)
34308 .qmax(128)
34309 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34310 }
34311
34312 TEST(F32_IGEMM_5X8__FMA3_BROADCAST, strided_cm) {
34313 TEST_REQUIRES_X86_FMA3;
34314 GemmMicrokernelTester()
34315 .mr(5)
34316 .nr(8)
34317 .kr(1)
34318 .sr(1)
34319 .m(5)
34320 .n(8)
34321 .k(1)
34322 .cm_stride(11)
34323 .Test(xnn_f32_igemm_ukernel_5x8__fma3_broadcast);
34324 }
34325#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34326
34327
34328#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34329 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1) {
34330 TEST_REQUIRES_X86_FMA3;
34331 GemmMicrokernelTester()
34332 .mr(6)
34333 .nr(8)
34334 .kr(1)
34335 .sr(1)
34336 .m(6)
34337 .n(8)
34338 .k(1)
34339 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34340 }
34341
34342 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, strided_cn) {
34343 TEST_REQUIRES_X86_FMA3;
34344 GemmMicrokernelTester()
34345 .mr(6)
34346 .nr(8)
34347 .kr(1)
34348 .sr(1)
34349 .m(6)
34350 .n(8)
34351 .k(1)
34352 .cn_stride(11)
34353 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34354 }
34355
34356 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile) {
34357 TEST_REQUIRES_X86_FMA3;
34358 for (uint32_t m = 1; m <= 6; m++) {
34359 for (uint32_t n = 1; n <= 8; n++) {
34360 GemmMicrokernelTester()
34361 .mr(6)
34362 .nr(8)
34363 .kr(1)
34364 .sr(1)
34365 .m(m)
34366 .n(n)
34367 .k(1)
34368 .iterations(1)
34369 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34370 }
34371 }
34372 }
34373
34374 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
34375 TEST_REQUIRES_X86_FMA3;
34376 for (uint32_t m = 1; m <= 6; m++) {
34377 GemmMicrokernelTester()
34378 .mr(6)
34379 .nr(8)
34380 .kr(1)
34381 .sr(1)
34382 .m(m)
34383 .n(8)
34384 .k(1)
34385 .iterations(1)
34386 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34387 }
34388 }
34389
34390 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
34391 TEST_REQUIRES_X86_FMA3;
34392 for (uint32_t n = 1; n <= 8; n++) {
34393 GemmMicrokernelTester()
34394 .mr(6)
34395 .nr(8)
34396 .kr(1)
34397 .sr(1)
34398 .m(6)
34399 .n(n)
34400 .k(1)
34401 .iterations(1)
34402 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34403 }
34404 }
34405
34406 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_gt_1) {
34407 TEST_REQUIRES_X86_FMA3;
34408 for (size_t k = 2; k < 10; k++) {
34409 GemmMicrokernelTester()
34410 .mr(6)
34411 .nr(8)
34412 .kr(1)
34413 .sr(1)
34414 .m(6)
34415 .n(8)
34416 .k(k)
34417 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34418 }
34419 }
34420
34421 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, k_gt_1_subtile) {
34422 TEST_REQUIRES_X86_FMA3;
34423 for (size_t k = 2; k < 10; k++) {
34424 for (uint32_t m = 1; m <= 6; m++) {
34425 for (uint32_t n = 1; n <= 8; n++) {
34426 GemmMicrokernelTester()
34427 .mr(6)
34428 .nr(8)
34429 .kr(1)
34430 .sr(1)
34431 .m(m)
34432 .n(n)
34433 .k(k)
34434 .iterations(1)
34435 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34436 }
34437 }
34438 }
34439 }
34440
34441 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8) {
34442 TEST_REQUIRES_X86_FMA3;
34443 for (uint32_t n = 9; n < 16; n++) {
34444 for (size_t k = 1; k <= 5; k += 2) {
34445 GemmMicrokernelTester()
34446 .mr(6)
34447 .nr(8)
34448 .kr(1)
34449 .sr(1)
34450 .m(6)
34451 .n(8)
34452 .k(k)
34453 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34454 }
34455 }
34456 }
34457
34458 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
34459 TEST_REQUIRES_X86_FMA3;
34460 for (uint32_t n = 9; n < 16; n++) {
34461 for (size_t k = 1; k <= 5; k += 2) {
34462 GemmMicrokernelTester()
34463 .mr(6)
34464 .nr(8)
34465 .kr(1)
34466 .sr(1)
34467 .m(6)
34468 .n(8)
34469 .k(k)
34470 .cn_stride(11)
34471 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34472 }
34473 }
34474 }
34475
34476 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8_subtile) {
34477 TEST_REQUIRES_X86_FMA3;
34478 for (uint32_t n = 9; n < 16; n++) {
34479 for (size_t k = 1; k <= 5; k += 2) {
34480 for (uint32_t m = 1; m <= 6; m++) {
34481 GemmMicrokernelTester()
34482 .mr(6)
34483 .nr(8)
34484 .kr(1)
34485 .sr(1)
34486 .m(m)
34487 .n(n)
34488 .k(k)
34489 .iterations(1)
34490 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34491 }
34492 }
34493 }
34494 }
34495
34496 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8) {
34497 TEST_REQUIRES_X86_FMA3;
34498 for (uint32_t n = 16; n <= 24; n += 8) {
34499 for (size_t k = 1; k <= 5; k += 2) {
34500 GemmMicrokernelTester()
34501 .mr(6)
34502 .nr(8)
34503 .kr(1)
34504 .sr(1)
34505 .m(6)
34506 .n(8)
34507 .k(k)
34508 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34509 }
34510 }
34511 }
34512
34513 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8_strided_cn) {
34514 TEST_REQUIRES_X86_FMA3;
34515 for (uint32_t n = 16; n <= 24; n += 8) {
34516 for (size_t k = 1; k <= 5; k += 2) {
34517 GemmMicrokernelTester()
34518 .mr(6)
34519 .nr(8)
34520 .kr(1)
34521 .sr(1)
34522 .m(6)
34523 .n(n)
34524 .k(k)
34525 .cn_stride(11)
34526 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34527 }
34528 }
34529 }
34530
34531 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8_subtile) {
34532 TEST_REQUIRES_X86_FMA3;
34533 for (uint32_t n = 16; n <= 24; n += 8) {
34534 for (size_t k = 1; k <= 5; k += 2) {
34535 for (uint32_t m = 1; m <= 6; m++) {
34536 GemmMicrokernelTester()
34537 .mr(6)
34538 .nr(8)
34539 .kr(1)
34540 .sr(1)
34541 .m(m)
34542 .n(n)
34543 .k(k)
34544 .iterations(1)
34545 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34546 }
34547 }
34548 }
34549 }
34550
34551 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, small_kernel) {
34552 TEST_REQUIRES_X86_FMA3;
34553 for (size_t k = 1; k <= 5; k += 2) {
34554 GemmMicrokernelTester()
34555 .mr(6)
34556 .nr(8)
34557 .kr(1)
34558 .sr(1)
34559 .m(6)
34560 .n(8)
34561 .k(k)
34562 .ks(3)
34563 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34564 }
34565 }
34566
34567 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, small_kernel_subtile) {
34568 TEST_REQUIRES_X86_FMA3;
34569 for (size_t k = 1; k <= 5; k += 2) {
34570 for (uint32_t m = 1; m <= 6; m++) {
34571 for (uint32_t n = 1; n <= 8; n++) {
34572 GemmMicrokernelTester()
34573 .mr(6)
34574 .nr(8)
34575 .kr(1)
34576 .sr(1)
34577 .m(m)
34578 .n(n)
34579 .k(k)
34580 .ks(3)
34581 .iterations(1)
34582 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34583 }
34584 }
34585 }
34586 }
34587
34588 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
34589 TEST_REQUIRES_X86_FMA3;
34590 for (uint32_t n = 9; n < 16; n++) {
34591 for (size_t k = 1; k <= 5; k += 2) {
34592 GemmMicrokernelTester()
34593 .mr(6)
34594 .nr(8)
34595 .kr(1)
34596 .sr(1)
34597 .m(6)
34598 .n(8)
34599 .k(k)
34600 .ks(3)
34601 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34602 }
34603 }
34604 }
34605
34606 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, n_div_8_small_kernel) {
34607 TEST_REQUIRES_X86_FMA3;
34608 for (uint32_t n = 16; n <= 24; n += 8) {
34609 for (size_t k = 1; k <= 5; k += 2) {
34610 GemmMicrokernelTester()
34611 .mr(6)
34612 .nr(8)
34613 .kr(1)
34614 .sr(1)
34615 .m(6)
34616 .n(8)
34617 .k(k)
34618 .ks(3)
34619 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34620 }
34621 }
34622 }
34623
34624 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, strided_cm_subtile) {
34625 TEST_REQUIRES_X86_FMA3;
34626 for (size_t k = 1; k <= 5; k += 2) {
34627 for (uint32_t m = 1; m <= 6; m++) {
34628 for (uint32_t n = 1; n <= 8; n++) {
34629 GemmMicrokernelTester()
34630 .mr(6)
34631 .nr(8)
34632 .kr(1)
34633 .sr(1)
34634 .m(m)
34635 .n(n)
34636 .k(k)
34637 .cm_stride(11)
34638 .iterations(1)
34639 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34640 }
34641 }
34642 }
34643 }
34644
34645 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, a_offset) {
34646 TEST_REQUIRES_X86_FMA3;
34647 for (size_t k = 1; k <= 5; k += 2) {
34648 GemmMicrokernelTester()
34649 .mr(6)
34650 .nr(8)
34651 .kr(1)
34652 .sr(1)
34653 .m(6)
34654 .n(8)
34655 .k(k)
34656 .ks(3)
34657 .a_offset(37)
34658 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34659 }
34660 }
34661
34662 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, zero) {
34663 TEST_REQUIRES_X86_FMA3;
34664 for (uint32_t mz = 0; mz < 6; mz++) {
34665 for (size_t k = 1; k <= 5; k += 2) {
34666 GemmMicrokernelTester()
34667 .mr(6)
34668 .nr(8)
34669 .kr(1)
34670 .sr(1)
34671 .m(6)
34672 .n(8)
34673 .k(k)
34674 .ks(3)
34675 .a_offset(37)
34676 .zero_index(mz)
34677 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34678 }
34679 }
34680 }
34681
34682 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, qmin) {
34683 TEST_REQUIRES_X86_FMA3;
34684 GemmMicrokernelTester()
34685 .mr(6)
34686 .nr(8)
34687 .kr(1)
34688 .sr(1)
34689 .m(6)
34690 .n(8)
34691 .k(1)
34692 .qmin(128)
34693 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34694 }
34695
34696 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, qmax) {
34697 TEST_REQUIRES_X86_FMA3;
34698 GemmMicrokernelTester()
34699 .mr(6)
34700 .nr(8)
34701 .kr(1)
34702 .sr(1)
34703 .m(6)
34704 .n(8)
34705 .k(1)
34706 .qmax(128)
34707 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34708 }
34709
34710 TEST(F32_IGEMM_6X8__FMA3_BROADCAST, strided_cm) {
34711 TEST_REQUIRES_X86_FMA3;
34712 GemmMicrokernelTester()
34713 .mr(6)
34714 .nr(8)
34715 .kr(1)
34716 .sr(1)
34717 .m(6)
34718 .n(8)
34719 .k(1)
34720 .cm_stride(11)
34721 .Test(xnn_f32_igemm_ukernel_6x8__fma3_broadcast);
34722 }
34723#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34724
34725
34726#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34727 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1) {
34728 TEST_REQUIRES_X86_FMA3;
34729 GemmMicrokernelTester()
34730 .mr(7)
34731 .nr(8)
34732 .kr(1)
34733 .sr(1)
34734 .m(7)
34735 .n(8)
34736 .k(1)
34737 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34738 }
34739
34740 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, strided_cn) {
34741 TEST_REQUIRES_X86_FMA3;
34742 GemmMicrokernelTester()
34743 .mr(7)
34744 .nr(8)
34745 .kr(1)
34746 .sr(1)
34747 .m(7)
34748 .n(8)
34749 .k(1)
34750 .cn_stride(11)
34751 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34752 }
34753
34754 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile) {
34755 TEST_REQUIRES_X86_FMA3;
34756 for (uint32_t m = 1; m <= 7; m++) {
34757 for (uint32_t n = 1; n <= 8; n++) {
34758 GemmMicrokernelTester()
34759 .mr(7)
34760 .nr(8)
34761 .kr(1)
34762 .sr(1)
34763 .m(m)
34764 .n(n)
34765 .k(1)
34766 .iterations(1)
34767 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34768 }
34769 }
34770 }
34771
34772 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
34773 TEST_REQUIRES_X86_FMA3;
34774 for (uint32_t m = 1; m <= 7; m++) {
34775 GemmMicrokernelTester()
34776 .mr(7)
34777 .nr(8)
34778 .kr(1)
34779 .sr(1)
34780 .m(m)
34781 .n(8)
34782 .k(1)
34783 .iterations(1)
34784 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34785 }
34786 }
34787
34788 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
34789 TEST_REQUIRES_X86_FMA3;
34790 for (uint32_t n = 1; n <= 8; n++) {
34791 GemmMicrokernelTester()
34792 .mr(7)
34793 .nr(8)
34794 .kr(1)
34795 .sr(1)
34796 .m(7)
34797 .n(n)
34798 .k(1)
34799 .iterations(1)
34800 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34801 }
34802 }
34803
34804 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_gt_1) {
34805 TEST_REQUIRES_X86_FMA3;
34806 for (size_t k = 2; k < 10; k++) {
34807 GemmMicrokernelTester()
34808 .mr(7)
34809 .nr(8)
34810 .kr(1)
34811 .sr(1)
34812 .m(7)
34813 .n(8)
34814 .k(k)
34815 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34816 }
34817 }
34818
34819 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, k_gt_1_subtile) {
34820 TEST_REQUIRES_X86_FMA3;
34821 for (size_t k = 2; k < 10; k++) {
34822 for (uint32_t m = 1; m <= 7; m++) {
34823 for (uint32_t n = 1; n <= 8; n++) {
34824 GemmMicrokernelTester()
34825 .mr(7)
34826 .nr(8)
34827 .kr(1)
34828 .sr(1)
34829 .m(m)
34830 .n(n)
34831 .k(k)
34832 .iterations(1)
34833 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34834 }
34835 }
34836 }
34837 }
34838
34839 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8) {
34840 TEST_REQUIRES_X86_FMA3;
34841 for (uint32_t n = 9; n < 16; n++) {
34842 for (size_t k = 1; k <= 5; k += 2) {
34843 GemmMicrokernelTester()
34844 .mr(7)
34845 .nr(8)
34846 .kr(1)
34847 .sr(1)
34848 .m(7)
34849 .n(8)
34850 .k(k)
34851 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34852 }
34853 }
34854 }
34855
34856 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
34857 TEST_REQUIRES_X86_FMA3;
34858 for (uint32_t n = 9; n < 16; n++) {
34859 for (size_t k = 1; k <= 5; k += 2) {
34860 GemmMicrokernelTester()
34861 .mr(7)
34862 .nr(8)
34863 .kr(1)
34864 .sr(1)
34865 .m(7)
34866 .n(8)
34867 .k(k)
34868 .cn_stride(11)
34869 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34870 }
34871 }
34872 }
34873
34874 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8_subtile) {
34875 TEST_REQUIRES_X86_FMA3;
34876 for (uint32_t n = 9; n < 16; n++) {
34877 for (size_t k = 1; k <= 5; k += 2) {
34878 for (uint32_t m = 1; m <= 7; m++) {
34879 GemmMicrokernelTester()
34880 .mr(7)
34881 .nr(8)
34882 .kr(1)
34883 .sr(1)
34884 .m(m)
34885 .n(n)
34886 .k(k)
34887 .iterations(1)
34888 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34889 }
34890 }
34891 }
34892 }
34893
34894 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8) {
34895 TEST_REQUIRES_X86_FMA3;
34896 for (uint32_t n = 16; n <= 24; n += 8) {
34897 for (size_t k = 1; k <= 5; k += 2) {
34898 GemmMicrokernelTester()
34899 .mr(7)
34900 .nr(8)
34901 .kr(1)
34902 .sr(1)
34903 .m(7)
34904 .n(8)
34905 .k(k)
34906 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34907 }
34908 }
34909 }
34910
34911 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8_strided_cn) {
34912 TEST_REQUIRES_X86_FMA3;
34913 for (uint32_t n = 16; n <= 24; n += 8) {
34914 for (size_t k = 1; k <= 5; k += 2) {
34915 GemmMicrokernelTester()
34916 .mr(7)
34917 .nr(8)
34918 .kr(1)
34919 .sr(1)
34920 .m(7)
34921 .n(n)
34922 .k(k)
34923 .cn_stride(11)
34924 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34925 }
34926 }
34927 }
34928
34929 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8_subtile) {
34930 TEST_REQUIRES_X86_FMA3;
34931 for (uint32_t n = 16; n <= 24; n += 8) {
34932 for (size_t k = 1; k <= 5; k += 2) {
34933 for (uint32_t m = 1; m <= 7; m++) {
34934 GemmMicrokernelTester()
34935 .mr(7)
34936 .nr(8)
34937 .kr(1)
34938 .sr(1)
34939 .m(m)
34940 .n(n)
34941 .k(k)
34942 .iterations(1)
34943 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34944 }
34945 }
34946 }
34947 }
34948
34949 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, small_kernel) {
34950 TEST_REQUIRES_X86_FMA3;
34951 for (size_t k = 1; k <= 5; k += 2) {
34952 GemmMicrokernelTester()
34953 .mr(7)
34954 .nr(8)
34955 .kr(1)
34956 .sr(1)
34957 .m(7)
34958 .n(8)
34959 .k(k)
34960 .ks(3)
34961 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34962 }
34963 }
34964
34965 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, small_kernel_subtile) {
34966 TEST_REQUIRES_X86_FMA3;
34967 for (size_t k = 1; k <= 5; k += 2) {
34968 for (uint32_t m = 1; m <= 7; m++) {
34969 for (uint32_t n = 1; n <= 8; n++) {
34970 GemmMicrokernelTester()
34971 .mr(7)
34972 .nr(8)
34973 .kr(1)
34974 .sr(1)
34975 .m(m)
34976 .n(n)
34977 .k(k)
34978 .ks(3)
34979 .iterations(1)
34980 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
34981 }
34982 }
34983 }
34984 }
34985
34986 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
34987 TEST_REQUIRES_X86_FMA3;
34988 for (uint32_t n = 9; n < 16; n++) {
34989 for (size_t k = 1; k <= 5; k += 2) {
34990 GemmMicrokernelTester()
34991 .mr(7)
34992 .nr(8)
34993 .kr(1)
34994 .sr(1)
34995 .m(7)
34996 .n(8)
34997 .k(k)
34998 .ks(3)
34999 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35000 }
35001 }
35002 }
35003
35004 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, n_div_8_small_kernel) {
35005 TEST_REQUIRES_X86_FMA3;
35006 for (uint32_t n = 16; n <= 24; n += 8) {
35007 for (size_t k = 1; k <= 5; k += 2) {
35008 GemmMicrokernelTester()
35009 .mr(7)
35010 .nr(8)
35011 .kr(1)
35012 .sr(1)
35013 .m(7)
35014 .n(8)
35015 .k(k)
35016 .ks(3)
35017 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35018 }
35019 }
35020 }
35021
35022 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, strided_cm_subtile) {
35023 TEST_REQUIRES_X86_FMA3;
35024 for (size_t k = 1; k <= 5; k += 2) {
35025 for (uint32_t m = 1; m <= 7; m++) {
35026 for (uint32_t n = 1; n <= 8; n++) {
35027 GemmMicrokernelTester()
35028 .mr(7)
35029 .nr(8)
35030 .kr(1)
35031 .sr(1)
35032 .m(m)
35033 .n(n)
35034 .k(k)
35035 .cm_stride(11)
35036 .iterations(1)
35037 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35038 }
35039 }
35040 }
35041 }
35042
35043 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, a_offset) {
35044 TEST_REQUIRES_X86_FMA3;
35045 for (size_t k = 1; k <= 5; k += 2) {
35046 GemmMicrokernelTester()
35047 .mr(7)
35048 .nr(8)
35049 .kr(1)
35050 .sr(1)
35051 .m(7)
35052 .n(8)
35053 .k(k)
35054 .ks(3)
35055 .a_offset(37)
35056 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35057 }
35058 }
35059
35060 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, zero) {
35061 TEST_REQUIRES_X86_FMA3;
35062 for (uint32_t mz = 0; mz < 7; mz++) {
35063 for (size_t k = 1; k <= 5; k += 2) {
35064 GemmMicrokernelTester()
35065 .mr(7)
35066 .nr(8)
35067 .kr(1)
35068 .sr(1)
35069 .m(7)
35070 .n(8)
35071 .k(k)
35072 .ks(3)
35073 .a_offset(37)
35074 .zero_index(mz)
35075 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35076 }
35077 }
35078 }
35079
35080 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, qmin) {
35081 TEST_REQUIRES_X86_FMA3;
35082 GemmMicrokernelTester()
35083 .mr(7)
35084 .nr(8)
35085 .kr(1)
35086 .sr(1)
35087 .m(7)
35088 .n(8)
35089 .k(1)
35090 .qmin(128)
35091 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35092 }
35093
35094 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, qmax) {
35095 TEST_REQUIRES_X86_FMA3;
35096 GemmMicrokernelTester()
35097 .mr(7)
35098 .nr(8)
35099 .kr(1)
35100 .sr(1)
35101 .m(7)
35102 .n(8)
35103 .k(1)
35104 .qmax(128)
35105 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35106 }
35107
35108 TEST(F32_IGEMM_7X8__FMA3_BROADCAST, strided_cm) {
35109 TEST_REQUIRES_X86_FMA3;
35110 GemmMicrokernelTester()
35111 .mr(7)
35112 .nr(8)
35113 .kr(1)
35114 .sr(1)
35115 .m(7)
35116 .n(8)
35117 .k(1)
35118 .cm_stride(11)
35119 .Test(xnn_f32_igemm_ukernel_7x8__fma3_broadcast);
35120 }
35121#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35122
35123
35124#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35125 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1) {
35126 TEST_REQUIRES_X86_FMA3;
35127 GemmMicrokernelTester()
35128 .mr(8)
35129 .nr(8)
35130 .kr(1)
35131 .sr(1)
35132 .m(8)
35133 .n(8)
35134 .k(1)
35135 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35136 }
35137
35138 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, strided_cn) {
35139 TEST_REQUIRES_X86_FMA3;
35140 GemmMicrokernelTester()
35141 .mr(8)
35142 .nr(8)
35143 .kr(1)
35144 .sr(1)
35145 .m(8)
35146 .n(8)
35147 .k(1)
35148 .cn_stride(11)
35149 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35150 }
35151
35152 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile) {
35153 TEST_REQUIRES_X86_FMA3;
35154 for (uint32_t m = 1; m <= 8; m++) {
35155 for (uint32_t n = 1; n <= 8; n++) {
35156 GemmMicrokernelTester()
35157 .mr(8)
35158 .nr(8)
35159 .kr(1)
35160 .sr(1)
35161 .m(m)
35162 .n(n)
35163 .k(1)
35164 .iterations(1)
35165 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35166 }
35167 }
35168 }
35169
35170 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
35171 TEST_REQUIRES_X86_FMA3;
35172 for (uint32_t m = 1; m <= 8; m++) {
35173 GemmMicrokernelTester()
35174 .mr(8)
35175 .nr(8)
35176 .kr(1)
35177 .sr(1)
35178 .m(m)
35179 .n(8)
35180 .k(1)
35181 .iterations(1)
35182 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35183 }
35184 }
35185
35186 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
35187 TEST_REQUIRES_X86_FMA3;
35188 for (uint32_t n = 1; n <= 8; n++) {
35189 GemmMicrokernelTester()
35190 .mr(8)
35191 .nr(8)
35192 .kr(1)
35193 .sr(1)
35194 .m(8)
35195 .n(n)
35196 .k(1)
35197 .iterations(1)
35198 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35199 }
35200 }
35201
35202 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_gt_1) {
35203 TEST_REQUIRES_X86_FMA3;
35204 for (size_t k = 2; k < 10; k++) {
35205 GemmMicrokernelTester()
35206 .mr(8)
35207 .nr(8)
35208 .kr(1)
35209 .sr(1)
35210 .m(8)
35211 .n(8)
35212 .k(k)
35213 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35214 }
35215 }
35216
35217 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, k_gt_1_subtile) {
35218 TEST_REQUIRES_X86_FMA3;
35219 for (size_t k = 2; k < 10; k++) {
35220 for (uint32_t m = 1; m <= 8; m++) {
35221 for (uint32_t n = 1; n <= 8; n++) {
35222 GemmMicrokernelTester()
35223 .mr(8)
35224 .nr(8)
35225 .kr(1)
35226 .sr(1)
35227 .m(m)
35228 .n(n)
35229 .k(k)
35230 .iterations(1)
35231 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35232 }
35233 }
35234 }
35235 }
35236
35237 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8) {
35238 TEST_REQUIRES_X86_FMA3;
35239 for (uint32_t n = 9; n < 16; n++) {
35240 for (size_t k = 1; k <= 5; k += 2) {
35241 GemmMicrokernelTester()
35242 .mr(8)
35243 .nr(8)
35244 .kr(1)
35245 .sr(1)
35246 .m(8)
35247 .n(8)
35248 .k(k)
35249 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35250 }
35251 }
35252 }
35253
35254 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
35255 TEST_REQUIRES_X86_FMA3;
35256 for (uint32_t n = 9; n < 16; n++) {
35257 for (size_t k = 1; k <= 5; k += 2) {
35258 GemmMicrokernelTester()
35259 .mr(8)
35260 .nr(8)
35261 .kr(1)
35262 .sr(1)
35263 .m(8)
35264 .n(8)
35265 .k(k)
35266 .cn_stride(11)
35267 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35268 }
35269 }
35270 }
35271
35272 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8_subtile) {
35273 TEST_REQUIRES_X86_FMA3;
35274 for (uint32_t n = 9; n < 16; n++) {
35275 for (size_t k = 1; k <= 5; k += 2) {
35276 for (uint32_t m = 1; m <= 8; m++) {
35277 GemmMicrokernelTester()
35278 .mr(8)
35279 .nr(8)
35280 .kr(1)
35281 .sr(1)
35282 .m(m)
35283 .n(n)
35284 .k(k)
35285 .iterations(1)
35286 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35287 }
35288 }
35289 }
35290 }
35291
35292 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8) {
35293 TEST_REQUIRES_X86_FMA3;
35294 for (uint32_t n = 16; n <= 24; n += 8) {
35295 for (size_t k = 1; k <= 5; k += 2) {
35296 GemmMicrokernelTester()
35297 .mr(8)
35298 .nr(8)
35299 .kr(1)
35300 .sr(1)
35301 .m(8)
35302 .n(8)
35303 .k(k)
35304 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35305 }
35306 }
35307 }
35308
35309 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8_strided_cn) {
35310 TEST_REQUIRES_X86_FMA3;
35311 for (uint32_t n = 16; n <= 24; n += 8) {
35312 for (size_t k = 1; k <= 5; k += 2) {
35313 GemmMicrokernelTester()
35314 .mr(8)
35315 .nr(8)
35316 .kr(1)
35317 .sr(1)
35318 .m(8)
35319 .n(n)
35320 .k(k)
35321 .cn_stride(11)
35322 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35323 }
35324 }
35325 }
35326
35327 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8_subtile) {
35328 TEST_REQUIRES_X86_FMA3;
35329 for (uint32_t n = 16; n <= 24; n += 8) {
35330 for (size_t k = 1; k <= 5; k += 2) {
35331 for (uint32_t m = 1; m <= 8; m++) {
35332 GemmMicrokernelTester()
35333 .mr(8)
35334 .nr(8)
35335 .kr(1)
35336 .sr(1)
35337 .m(m)
35338 .n(n)
35339 .k(k)
35340 .iterations(1)
35341 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35342 }
35343 }
35344 }
35345 }
35346
35347 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, small_kernel) {
35348 TEST_REQUIRES_X86_FMA3;
35349 for (size_t k = 1; k <= 5; k += 2) {
35350 GemmMicrokernelTester()
35351 .mr(8)
35352 .nr(8)
35353 .kr(1)
35354 .sr(1)
35355 .m(8)
35356 .n(8)
35357 .k(k)
35358 .ks(3)
35359 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35360 }
35361 }
35362
35363 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, small_kernel_subtile) {
35364 TEST_REQUIRES_X86_FMA3;
35365 for (size_t k = 1; k <= 5; k += 2) {
35366 for (uint32_t m = 1; m <= 8; m++) {
35367 for (uint32_t n = 1; n <= 8; n++) {
35368 GemmMicrokernelTester()
35369 .mr(8)
35370 .nr(8)
35371 .kr(1)
35372 .sr(1)
35373 .m(m)
35374 .n(n)
35375 .k(k)
35376 .ks(3)
35377 .iterations(1)
35378 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35379 }
35380 }
35381 }
35382 }
35383
35384 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_gt_8_small_kernel) {
35385 TEST_REQUIRES_X86_FMA3;
35386 for (uint32_t n = 9; n < 16; n++) {
35387 for (size_t k = 1; k <= 5; k += 2) {
35388 GemmMicrokernelTester()
35389 .mr(8)
35390 .nr(8)
35391 .kr(1)
35392 .sr(1)
35393 .m(8)
35394 .n(8)
35395 .k(k)
35396 .ks(3)
35397 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35398 }
35399 }
35400 }
35401
35402 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, n_div_8_small_kernel) {
35403 TEST_REQUIRES_X86_FMA3;
35404 for (uint32_t n = 16; n <= 24; n += 8) {
35405 for (size_t k = 1; k <= 5; k += 2) {
35406 GemmMicrokernelTester()
35407 .mr(8)
35408 .nr(8)
35409 .kr(1)
35410 .sr(1)
35411 .m(8)
35412 .n(8)
35413 .k(k)
35414 .ks(3)
35415 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35416 }
35417 }
35418 }
35419
35420 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, strided_cm_subtile) {
35421 TEST_REQUIRES_X86_FMA3;
35422 for (size_t k = 1; k <= 5; k += 2) {
35423 for (uint32_t m = 1; m <= 8; m++) {
35424 for (uint32_t n = 1; n <= 8; n++) {
35425 GemmMicrokernelTester()
35426 .mr(8)
35427 .nr(8)
35428 .kr(1)
35429 .sr(1)
35430 .m(m)
35431 .n(n)
35432 .k(k)
35433 .cm_stride(11)
35434 .iterations(1)
35435 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35436 }
35437 }
35438 }
35439 }
35440
35441 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, a_offset) {
35442 TEST_REQUIRES_X86_FMA3;
35443 for (size_t k = 1; k <= 5; k += 2) {
35444 GemmMicrokernelTester()
35445 .mr(8)
35446 .nr(8)
35447 .kr(1)
35448 .sr(1)
35449 .m(8)
35450 .n(8)
35451 .k(k)
35452 .ks(3)
35453 .a_offset(43)
35454 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35455 }
35456 }
35457
35458 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, zero) {
35459 TEST_REQUIRES_X86_FMA3;
35460 for (uint32_t mz = 0; mz < 8; mz++) {
35461 for (size_t k = 1; k <= 5; k += 2) {
35462 GemmMicrokernelTester()
35463 .mr(8)
35464 .nr(8)
35465 .kr(1)
35466 .sr(1)
35467 .m(8)
35468 .n(8)
35469 .k(k)
35470 .ks(3)
35471 .a_offset(43)
35472 .zero_index(mz)
35473 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35474 }
35475 }
35476 }
35477
35478 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, qmin) {
35479 TEST_REQUIRES_X86_FMA3;
35480 GemmMicrokernelTester()
35481 .mr(8)
35482 .nr(8)
35483 .kr(1)
35484 .sr(1)
35485 .m(8)
35486 .n(8)
35487 .k(1)
35488 .qmin(128)
35489 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35490 }
35491
35492 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, qmax) {
35493 TEST_REQUIRES_X86_FMA3;
35494 GemmMicrokernelTester()
35495 .mr(8)
35496 .nr(8)
35497 .kr(1)
35498 .sr(1)
35499 .m(8)
35500 .n(8)
35501 .k(1)
35502 .qmax(128)
35503 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35504 }
35505
35506 TEST(F32_IGEMM_8X8__FMA3_BROADCAST, strided_cm) {
35507 TEST_REQUIRES_X86_FMA3;
35508 GemmMicrokernelTester()
35509 .mr(8)
35510 .nr(8)
35511 .kr(1)
35512 .sr(1)
35513 .m(8)
35514 .n(8)
35515 .k(1)
35516 .cm_stride(11)
35517 .Test(xnn_f32_igemm_ukernel_8x8__fma3_broadcast);
35518 }
35519#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35520
35521
35522#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35523 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1) {
35524 TEST_REQUIRES_X86_FMA3;
35525 GemmMicrokernelTester()
35526 .mr(1)
35527 .nr(16)
35528 .kr(1)
35529 .sr(1)
35530 .m(1)
35531 .n(16)
35532 .k(1)
35533 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35534 }
35535
35536 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, strided_cn) {
35537 TEST_REQUIRES_X86_FMA3;
35538 GemmMicrokernelTester()
35539 .mr(1)
35540 .nr(16)
35541 .kr(1)
35542 .sr(1)
35543 .m(1)
35544 .n(16)
35545 .k(1)
35546 .cn_stride(19)
35547 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35548 }
35549
35550 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
35551 TEST_REQUIRES_X86_FMA3;
35552 for (uint32_t m = 1; m <= 1; m++) {
35553 for (uint32_t n = 1; n <= 16; n++) {
35554 GemmMicrokernelTester()
35555 .mr(1)
35556 .nr(16)
35557 .kr(1)
35558 .sr(1)
35559 .m(m)
35560 .n(n)
35561 .k(1)
35562 .iterations(1)
35563 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35564 }
35565 }
35566 }
35567
35568 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
35569 TEST_REQUIRES_X86_FMA3;
35570 for (uint32_t m = 1; m <= 1; m++) {
35571 GemmMicrokernelTester()
35572 .mr(1)
35573 .nr(16)
35574 .kr(1)
35575 .sr(1)
35576 .m(m)
35577 .n(16)
35578 .k(1)
35579 .iterations(1)
35580 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35581 }
35582 }
35583
35584 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
35585 TEST_REQUIRES_X86_FMA3;
35586 for (uint32_t n = 1; n <= 16; n++) {
35587 GemmMicrokernelTester()
35588 .mr(1)
35589 .nr(16)
35590 .kr(1)
35591 .sr(1)
35592 .m(1)
35593 .n(n)
35594 .k(1)
35595 .iterations(1)
35596 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35597 }
35598 }
35599
35600 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_gt_1) {
35601 TEST_REQUIRES_X86_FMA3;
35602 for (size_t k = 2; k < 10; k++) {
35603 GemmMicrokernelTester()
35604 .mr(1)
35605 .nr(16)
35606 .kr(1)
35607 .sr(1)
35608 .m(1)
35609 .n(16)
35610 .k(k)
35611 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35612 }
35613 }
35614
35615 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
35616 TEST_REQUIRES_X86_FMA3;
35617 for (size_t k = 2; k < 10; k++) {
35618 for (uint32_t m = 1; m <= 1; m++) {
35619 for (uint32_t n = 1; n <= 16; n++) {
35620 GemmMicrokernelTester()
35621 .mr(1)
35622 .nr(16)
35623 .kr(1)
35624 .sr(1)
35625 .m(m)
35626 .n(n)
35627 .k(k)
35628 .iterations(1)
35629 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35630 }
35631 }
35632 }
35633 }
35634
35635 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16) {
35636 TEST_REQUIRES_X86_FMA3;
35637 for (uint32_t n = 17; n < 32; n++) {
35638 for (size_t k = 1; k <= 5; k += 2) {
35639 GemmMicrokernelTester()
35640 .mr(1)
35641 .nr(16)
35642 .kr(1)
35643 .sr(1)
35644 .m(1)
35645 .n(16)
35646 .k(k)
35647 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35648 }
35649 }
35650 }
35651
35652 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
35653 TEST_REQUIRES_X86_FMA3;
35654 for (uint32_t n = 17; n < 32; n++) {
35655 for (size_t k = 1; k <= 5; k += 2) {
35656 GemmMicrokernelTester()
35657 .mr(1)
35658 .nr(16)
35659 .kr(1)
35660 .sr(1)
35661 .m(1)
35662 .n(16)
35663 .k(k)
35664 .cn_stride(19)
35665 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35666 }
35667 }
35668 }
35669
35670 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
35671 TEST_REQUIRES_X86_FMA3;
35672 for (uint32_t n = 17; n < 32; n++) {
35673 for (size_t k = 1; k <= 5; k += 2) {
35674 for (uint32_t m = 1; m <= 1; m++) {
35675 GemmMicrokernelTester()
35676 .mr(1)
35677 .nr(16)
35678 .kr(1)
35679 .sr(1)
35680 .m(m)
35681 .n(n)
35682 .k(k)
35683 .iterations(1)
35684 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35685 }
35686 }
35687 }
35688 }
35689
35690 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16) {
35691 TEST_REQUIRES_X86_FMA3;
35692 for (uint32_t n = 32; n <= 48; n += 16) {
35693 for (size_t k = 1; k <= 5; k += 2) {
35694 GemmMicrokernelTester()
35695 .mr(1)
35696 .nr(16)
35697 .kr(1)
35698 .sr(1)
35699 .m(1)
35700 .n(16)
35701 .k(k)
35702 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35703 }
35704 }
35705 }
35706
35707 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
35708 TEST_REQUIRES_X86_FMA3;
35709 for (uint32_t n = 32; n <= 48; n += 16) {
35710 for (size_t k = 1; k <= 5; k += 2) {
35711 GemmMicrokernelTester()
35712 .mr(1)
35713 .nr(16)
35714 .kr(1)
35715 .sr(1)
35716 .m(1)
35717 .n(n)
35718 .k(k)
35719 .cn_stride(19)
35720 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35721 }
35722 }
35723 }
35724
35725 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16_subtile) {
35726 TEST_REQUIRES_X86_FMA3;
35727 for (uint32_t n = 32; n <= 48; n += 16) {
35728 for (size_t k = 1; k <= 5; k += 2) {
35729 for (uint32_t m = 1; m <= 1; m++) {
35730 GemmMicrokernelTester()
35731 .mr(1)
35732 .nr(16)
35733 .kr(1)
35734 .sr(1)
35735 .m(m)
35736 .n(n)
35737 .k(k)
35738 .iterations(1)
35739 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35740 }
35741 }
35742 }
35743 }
35744
35745 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, small_kernel) {
35746 TEST_REQUIRES_X86_FMA3;
35747 for (size_t k = 1; k <= 5; k += 2) {
35748 GemmMicrokernelTester()
35749 .mr(1)
35750 .nr(16)
35751 .kr(1)
35752 .sr(1)
35753 .m(1)
35754 .n(16)
35755 .k(k)
35756 .ks(3)
35757 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35758 }
35759 }
35760
35761 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, small_kernel_subtile) {
35762 TEST_REQUIRES_X86_FMA3;
35763 for (size_t k = 1; k <= 5; k += 2) {
35764 for (uint32_t m = 1; m <= 1; m++) {
35765 for (uint32_t n = 1; n <= 16; n++) {
35766 GemmMicrokernelTester()
35767 .mr(1)
35768 .nr(16)
35769 .kr(1)
35770 .sr(1)
35771 .m(m)
35772 .n(n)
35773 .k(k)
35774 .ks(3)
35775 .iterations(1)
35776 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35777 }
35778 }
35779 }
35780 }
35781
35782 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
35783 TEST_REQUIRES_X86_FMA3;
35784 for (uint32_t n = 17; n < 32; n++) {
35785 for (size_t k = 1; k <= 5; k += 2) {
35786 GemmMicrokernelTester()
35787 .mr(1)
35788 .nr(16)
35789 .kr(1)
35790 .sr(1)
35791 .m(1)
35792 .n(16)
35793 .k(k)
35794 .ks(3)
35795 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35796 }
35797 }
35798 }
35799
35800 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, n_div_16_small_kernel) {
35801 TEST_REQUIRES_X86_FMA3;
35802 for (uint32_t n = 32; n <= 48; n += 16) {
35803 for (size_t k = 1; k <= 5; k += 2) {
35804 GemmMicrokernelTester()
35805 .mr(1)
35806 .nr(16)
35807 .kr(1)
35808 .sr(1)
35809 .m(1)
35810 .n(16)
35811 .k(k)
35812 .ks(3)
35813 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35814 }
35815 }
35816 }
35817
35818 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, strided_cm_subtile) {
35819 TEST_REQUIRES_X86_FMA3;
35820 for (size_t k = 1; k <= 5; k += 2) {
35821 for (uint32_t m = 1; m <= 1; m++) {
35822 for (uint32_t n = 1; n <= 16; n++) {
35823 GemmMicrokernelTester()
35824 .mr(1)
35825 .nr(16)
35826 .kr(1)
35827 .sr(1)
35828 .m(m)
35829 .n(n)
35830 .k(k)
35831 .cm_stride(19)
35832 .iterations(1)
35833 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35834 }
35835 }
35836 }
35837 }
35838
35839 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, a_offset) {
35840 TEST_REQUIRES_X86_FMA3;
35841 for (size_t k = 1; k <= 5; k += 2) {
35842 GemmMicrokernelTester()
35843 .mr(1)
35844 .nr(16)
35845 .kr(1)
35846 .sr(1)
35847 .m(1)
35848 .n(16)
35849 .k(k)
35850 .ks(3)
35851 .a_offset(7)
35852 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35853 }
35854 }
35855
35856 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, zero) {
35857 TEST_REQUIRES_X86_FMA3;
35858 for (uint32_t mz = 0; mz < 1; mz++) {
35859 for (size_t k = 1; k <= 5; k += 2) {
35860 GemmMicrokernelTester()
35861 .mr(1)
35862 .nr(16)
35863 .kr(1)
35864 .sr(1)
35865 .m(1)
35866 .n(16)
35867 .k(k)
35868 .ks(3)
35869 .a_offset(7)
35870 .zero_index(mz)
35871 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35872 }
35873 }
35874 }
35875
35876 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, qmin) {
35877 TEST_REQUIRES_X86_FMA3;
35878 GemmMicrokernelTester()
35879 .mr(1)
35880 .nr(16)
35881 .kr(1)
35882 .sr(1)
35883 .m(1)
35884 .n(16)
35885 .k(1)
35886 .qmin(128)
35887 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35888 }
35889
35890 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, qmax) {
35891 TEST_REQUIRES_X86_FMA3;
35892 GemmMicrokernelTester()
35893 .mr(1)
35894 .nr(16)
35895 .kr(1)
35896 .sr(1)
35897 .m(1)
35898 .n(16)
35899 .k(1)
35900 .qmax(128)
35901 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35902 }
35903
35904 TEST(F32_IGEMM_1X16__FMA3_BROADCAST, strided_cm) {
35905 TEST_REQUIRES_X86_FMA3;
35906 GemmMicrokernelTester()
35907 .mr(1)
35908 .nr(16)
35909 .kr(1)
35910 .sr(1)
35911 .m(1)
35912 .n(16)
35913 .k(1)
35914 .cm_stride(19)
35915 .Test(xnn_f32_igemm_ukernel_1x16__fma3_broadcast);
35916 }
35917#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35918
35919
35920#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35921 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1) {
35922 TEST_REQUIRES_X86_FMA3;
35923 GemmMicrokernelTester()
35924 .mr(3)
35925 .nr(16)
35926 .kr(1)
35927 .sr(1)
35928 .m(3)
35929 .n(16)
35930 .k(1)
35931 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
35932 }
35933
35934 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, strided_cn) {
35935 TEST_REQUIRES_X86_FMA3;
35936 GemmMicrokernelTester()
35937 .mr(3)
35938 .nr(16)
35939 .kr(1)
35940 .sr(1)
35941 .m(3)
35942 .n(16)
35943 .k(1)
35944 .cn_stride(19)
35945 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
35946 }
35947
35948 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
35949 TEST_REQUIRES_X86_FMA3;
35950 for (uint32_t m = 1; m <= 3; m++) {
35951 for (uint32_t n = 1; n <= 16; n++) {
35952 GemmMicrokernelTester()
35953 .mr(3)
35954 .nr(16)
35955 .kr(1)
35956 .sr(1)
35957 .m(m)
35958 .n(n)
35959 .k(1)
35960 .iterations(1)
35961 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
35962 }
35963 }
35964 }
35965
35966 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
35967 TEST_REQUIRES_X86_FMA3;
35968 for (uint32_t m = 1; m <= 3; m++) {
35969 GemmMicrokernelTester()
35970 .mr(3)
35971 .nr(16)
35972 .kr(1)
35973 .sr(1)
35974 .m(m)
35975 .n(16)
35976 .k(1)
35977 .iterations(1)
35978 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
35979 }
35980 }
35981
35982 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
35983 TEST_REQUIRES_X86_FMA3;
35984 for (uint32_t n = 1; n <= 16; n++) {
35985 GemmMicrokernelTester()
35986 .mr(3)
35987 .nr(16)
35988 .kr(1)
35989 .sr(1)
35990 .m(3)
35991 .n(n)
35992 .k(1)
35993 .iterations(1)
35994 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
35995 }
35996 }
35997
35998 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_gt_1) {
35999 TEST_REQUIRES_X86_FMA3;
36000 for (size_t k = 2; k < 10; k++) {
36001 GemmMicrokernelTester()
36002 .mr(3)
36003 .nr(16)
36004 .kr(1)
36005 .sr(1)
36006 .m(3)
36007 .n(16)
36008 .k(k)
36009 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36010 }
36011 }
36012
36013 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
36014 TEST_REQUIRES_X86_FMA3;
36015 for (size_t k = 2; k < 10; k++) {
36016 for (uint32_t m = 1; m <= 3; m++) {
36017 for (uint32_t n = 1; n <= 16; n++) {
36018 GemmMicrokernelTester()
36019 .mr(3)
36020 .nr(16)
36021 .kr(1)
36022 .sr(1)
36023 .m(m)
36024 .n(n)
36025 .k(k)
36026 .iterations(1)
36027 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36028 }
36029 }
36030 }
36031 }
36032
36033 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16) {
36034 TEST_REQUIRES_X86_FMA3;
36035 for (uint32_t n = 17; n < 32; n++) {
36036 for (size_t k = 1; k <= 5; k += 2) {
36037 GemmMicrokernelTester()
36038 .mr(3)
36039 .nr(16)
36040 .kr(1)
36041 .sr(1)
36042 .m(3)
36043 .n(16)
36044 .k(k)
36045 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36046 }
36047 }
36048 }
36049
36050 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
36051 TEST_REQUIRES_X86_FMA3;
36052 for (uint32_t n = 17; n < 32; n++) {
36053 for (size_t k = 1; k <= 5; k += 2) {
36054 GemmMicrokernelTester()
36055 .mr(3)
36056 .nr(16)
36057 .kr(1)
36058 .sr(1)
36059 .m(3)
36060 .n(16)
36061 .k(k)
36062 .cn_stride(19)
36063 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36064 }
36065 }
36066 }
36067
36068 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
36069 TEST_REQUIRES_X86_FMA3;
36070 for (uint32_t n = 17; n < 32; n++) {
36071 for (size_t k = 1; k <= 5; k += 2) {
36072 for (uint32_t m = 1; m <= 3; m++) {
36073 GemmMicrokernelTester()
36074 .mr(3)
36075 .nr(16)
36076 .kr(1)
36077 .sr(1)
36078 .m(m)
36079 .n(n)
36080 .k(k)
36081 .iterations(1)
36082 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36083 }
36084 }
36085 }
36086 }
36087
36088 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16) {
36089 TEST_REQUIRES_X86_FMA3;
36090 for (uint32_t n = 32; n <= 48; n += 16) {
36091 for (size_t k = 1; k <= 5; k += 2) {
36092 GemmMicrokernelTester()
36093 .mr(3)
36094 .nr(16)
36095 .kr(1)
36096 .sr(1)
36097 .m(3)
36098 .n(16)
36099 .k(k)
36100 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36101 }
36102 }
36103 }
36104
36105 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
36106 TEST_REQUIRES_X86_FMA3;
36107 for (uint32_t n = 32; n <= 48; n += 16) {
36108 for (size_t k = 1; k <= 5; k += 2) {
36109 GemmMicrokernelTester()
36110 .mr(3)
36111 .nr(16)
36112 .kr(1)
36113 .sr(1)
36114 .m(3)
36115 .n(n)
36116 .k(k)
36117 .cn_stride(19)
36118 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36119 }
36120 }
36121 }
36122
36123 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16_subtile) {
36124 TEST_REQUIRES_X86_FMA3;
36125 for (uint32_t n = 32; n <= 48; n += 16) {
36126 for (size_t k = 1; k <= 5; k += 2) {
36127 for (uint32_t m = 1; m <= 3; m++) {
36128 GemmMicrokernelTester()
36129 .mr(3)
36130 .nr(16)
36131 .kr(1)
36132 .sr(1)
36133 .m(m)
36134 .n(n)
36135 .k(k)
36136 .iterations(1)
36137 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36138 }
36139 }
36140 }
36141 }
36142
36143 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, small_kernel) {
36144 TEST_REQUIRES_X86_FMA3;
36145 for (size_t k = 1; k <= 5; k += 2) {
36146 GemmMicrokernelTester()
36147 .mr(3)
36148 .nr(16)
36149 .kr(1)
36150 .sr(1)
36151 .m(3)
36152 .n(16)
36153 .k(k)
36154 .ks(3)
36155 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36156 }
36157 }
36158
36159 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, small_kernel_subtile) {
36160 TEST_REQUIRES_X86_FMA3;
36161 for (size_t k = 1; k <= 5; k += 2) {
36162 for (uint32_t m = 1; m <= 3; m++) {
36163 for (uint32_t n = 1; n <= 16; n++) {
36164 GemmMicrokernelTester()
36165 .mr(3)
36166 .nr(16)
36167 .kr(1)
36168 .sr(1)
36169 .m(m)
36170 .n(n)
36171 .k(k)
36172 .ks(3)
36173 .iterations(1)
36174 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36175 }
36176 }
36177 }
36178 }
36179
36180 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
36181 TEST_REQUIRES_X86_FMA3;
36182 for (uint32_t n = 17; n < 32; n++) {
36183 for (size_t k = 1; k <= 5; k += 2) {
36184 GemmMicrokernelTester()
36185 .mr(3)
36186 .nr(16)
36187 .kr(1)
36188 .sr(1)
36189 .m(3)
36190 .n(16)
36191 .k(k)
36192 .ks(3)
36193 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36194 }
36195 }
36196 }
36197
36198 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, n_div_16_small_kernel) {
36199 TEST_REQUIRES_X86_FMA3;
36200 for (uint32_t n = 32; n <= 48; n += 16) {
36201 for (size_t k = 1; k <= 5; k += 2) {
36202 GemmMicrokernelTester()
36203 .mr(3)
36204 .nr(16)
36205 .kr(1)
36206 .sr(1)
36207 .m(3)
36208 .n(16)
36209 .k(k)
36210 .ks(3)
36211 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36212 }
36213 }
36214 }
36215
36216 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, strided_cm_subtile) {
36217 TEST_REQUIRES_X86_FMA3;
36218 for (size_t k = 1; k <= 5; k += 2) {
36219 for (uint32_t m = 1; m <= 3; m++) {
36220 for (uint32_t n = 1; n <= 16; n++) {
36221 GemmMicrokernelTester()
36222 .mr(3)
36223 .nr(16)
36224 .kr(1)
36225 .sr(1)
36226 .m(m)
36227 .n(n)
36228 .k(k)
36229 .cm_stride(19)
36230 .iterations(1)
36231 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36232 }
36233 }
36234 }
36235 }
36236
36237 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, a_offset) {
36238 TEST_REQUIRES_X86_FMA3;
36239 for (size_t k = 1; k <= 5; k += 2) {
36240 GemmMicrokernelTester()
36241 .mr(3)
36242 .nr(16)
36243 .kr(1)
36244 .sr(1)
36245 .m(3)
36246 .n(16)
36247 .k(k)
36248 .ks(3)
36249 .a_offset(17)
36250 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36251 }
36252 }
36253
36254 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, zero) {
36255 TEST_REQUIRES_X86_FMA3;
36256 for (uint32_t mz = 0; mz < 3; mz++) {
36257 for (size_t k = 1; k <= 5; k += 2) {
36258 GemmMicrokernelTester()
36259 .mr(3)
36260 .nr(16)
36261 .kr(1)
36262 .sr(1)
36263 .m(3)
36264 .n(16)
36265 .k(k)
36266 .ks(3)
36267 .a_offset(17)
36268 .zero_index(mz)
36269 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36270 }
36271 }
36272 }
36273
36274 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, qmin) {
36275 TEST_REQUIRES_X86_FMA3;
36276 GemmMicrokernelTester()
36277 .mr(3)
36278 .nr(16)
36279 .kr(1)
36280 .sr(1)
36281 .m(3)
36282 .n(16)
36283 .k(1)
36284 .qmin(128)
36285 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36286 }
36287
36288 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, qmax) {
36289 TEST_REQUIRES_X86_FMA3;
36290 GemmMicrokernelTester()
36291 .mr(3)
36292 .nr(16)
36293 .kr(1)
36294 .sr(1)
36295 .m(3)
36296 .n(16)
36297 .k(1)
36298 .qmax(128)
36299 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36300 }
36301
36302 TEST(F32_IGEMM_3X16__FMA3_BROADCAST, strided_cm) {
36303 TEST_REQUIRES_X86_FMA3;
36304 GemmMicrokernelTester()
36305 .mr(3)
36306 .nr(16)
36307 .kr(1)
36308 .sr(1)
36309 .m(3)
36310 .n(16)
36311 .k(1)
36312 .cm_stride(19)
36313 .Test(xnn_f32_igemm_ukernel_3x16__fma3_broadcast);
36314 }
36315#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36316
36317
36318#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36319 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1) {
36320 TEST_REQUIRES_X86_FMA3;
36321 GemmMicrokernelTester()
36322 .mr(4)
36323 .nr(16)
36324 .kr(1)
36325 .sr(1)
36326 .m(4)
36327 .n(16)
36328 .k(1)
36329 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36330 }
36331
36332 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, strided_cn) {
36333 TEST_REQUIRES_X86_FMA3;
36334 GemmMicrokernelTester()
36335 .mr(4)
36336 .nr(16)
36337 .kr(1)
36338 .sr(1)
36339 .m(4)
36340 .n(16)
36341 .k(1)
36342 .cn_stride(19)
36343 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36344 }
36345
36346 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile) {
36347 TEST_REQUIRES_X86_FMA3;
36348 for (uint32_t m = 1; m <= 4; m++) {
36349 for (uint32_t n = 1; n <= 16; n++) {
36350 GemmMicrokernelTester()
36351 .mr(4)
36352 .nr(16)
36353 .kr(1)
36354 .sr(1)
36355 .m(m)
36356 .n(n)
36357 .k(1)
36358 .iterations(1)
36359 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36360 }
36361 }
36362 }
36363
36364 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
36365 TEST_REQUIRES_X86_FMA3;
36366 for (uint32_t m = 1; m <= 4; m++) {
36367 GemmMicrokernelTester()
36368 .mr(4)
36369 .nr(16)
36370 .kr(1)
36371 .sr(1)
36372 .m(m)
36373 .n(16)
36374 .k(1)
36375 .iterations(1)
36376 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36377 }
36378 }
36379
36380 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
36381 TEST_REQUIRES_X86_FMA3;
36382 for (uint32_t n = 1; n <= 16; n++) {
36383 GemmMicrokernelTester()
36384 .mr(4)
36385 .nr(16)
36386 .kr(1)
36387 .sr(1)
36388 .m(4)
36389 .n(n)
36390 .k(1)
36391 .iterations(1)
36392 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36393 }
36394 }
36395
36396 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_gt_1) {
36397 TEST_REQUIRES_X86_FMA3;
36398 for (size_t k = 2; k < 10; k++) {
36399 GemmMicrokernelTester()
36400 .mr(4)
36401 .nr(16)
36402 .kr(1)
36403 .sr(1)
36404 .m(4)
36405 .n(16)
36406 .k(k)
36407 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36408 }
36409 }
36410
36411 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, k_gt_1_subtile) {
36412 TEST_REQUIRES_X86_FMA3;
36413 for (size_t k = 2; k < 10; k++) {
36414 for (uint32_t m = 1; m <= 4; m++) {
36415 for (uint32_t n = 1; n <= 16; n++) {
36416 GemmMicrokernelTester()
36417 .mr(4)
36418 .nr(16)
36419 .kr(1)
36420 .sr(1)
36421 .m(m)
36422 .n(n)
36423 .k(k)
36424 .iterations(1)
36425 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36426 }
36427 }
36428 }
36429 }
36430
36431 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16) {
36432 TEST_REQUIRES_X86_FMA3;
36433 for (uint32_t n = 17; n < 32; n++) {
36434 for (size_t k = 1; k <= 5; k += 2) {
36435 GemmMicrokernelTester()
36436 .mr(4)
36437 .nr(16)
36438 .kr(1)
36439 .sr(1)
36440 .m(4)
36441 .n(16)
36442 .k(k)
36443 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36444 }
36445 }
36446 }
36447
36448 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
36449 TEST_REQUIRES_X86_FMA3;
36450 for (uint32_t n = 17; n < 32; n++) {
36451 for (size_t k = 1; k <= 5; k += 2) {
36452 GemmMicrokernelTester()
36453 .mr(4)
36454 .nr(16)
36455 .kr(1)
36456 .sr(1)
36457 .m(4)
36458 .n(16)
36459 .k(k)
36460 .cn_stride(19)
36461 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36462 }
36463 }
36464 }
36465
36466 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16_subtile) {
36467 TEST_REQUIRES_X86_FMA3;
36468 for (uint32_t n = 17; n < 32; n++) {
36469 for (size_t k = 1; k <= 5; k += 2) {
36470 for (uint32_t m = 1; m <= 4; m++) {
36471 GemmMicrokernelTester()
36472 .mr(4)
36473 .nr(16)
36474 .kr(1)
36475 .sr(1)
36476 .m(m)
36477 .n(n)
36478 .k(k)
36479 .iterations(1)
36480 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36481 }
36482 }
36483 }
36484 }
36485
36486 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16) {
36487 TEST_REQUIRES_X86_FMA3;
36488 for (uint32_t n = 32; n <= 48; n += 16) {
36489 for (size_t k = 1; k <= 5; k += 2) {
36490 GemmMicrokernelTester()
36491 .mr(4)
36492 .nr(16)
36493 .kr(1)
36494 .sr(1)
36495 .m(4)
36496 .n(16)
36497 .k(k)
36498 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36499 }
36500 }
36501 }
36502
36503 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16_strided_cn) {
36504 TEST_REQUIRES_X86_FMA3;
36505 for (uint32_t n = 32; n <= 48; n += 16) {
36506 for (size_t k = 1; k <= 5; k += 2) {
36507 GemmMicrokernelTester()
36508 .mr(4)
36509 .nr(16)
36510 .kr(1)
36511 .sr(1)
36512 .m(4)
36513 .n(n)
36514 .k(k)
36515 .cn_stride(19)
36516 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36517 }
36518 }
36519 }
36520
36521 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16_subtile) {
36522 TEST_REQUIRES_X86_FMA3;
36523 for (uint32_t n = 32; n <= 48; n += 16) {
36524 for (size_t k = 1; k <= 5; k += 2) {
36525 for (uint32_t m = 1; m <= 4; m++) {
36526 GemmMicrokernelTester()
36527 .mr(4)
36528 .nr(16)
36529 .kr(1)
36530 .sr(1)
36531 .m(m)
36532 .n(n)
36533 .k(k)
36534 .iterations(1)
36535 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36536 }
36537 }
36538 }
36539 }
36540
36541 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, small_kernel) {
36542 TEST_REQUIRES_X86_FMA3;
36543 for (size_t k = 1; k <= 5; k += 2) {
36544 GemmMicrokernelTester()
36545 .mr(4)
36546 .nr(16)
36547 .kr(1)
36548 .sr(1)
36549 .m(4)
36550 .n(16)
36551 .k(k)
36552 .ks(3)
36553 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36554 }
36555 }
36556
36557 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, small_kernel_subtile) {
36558 TEST_REQUIRES_X86_FMA3;
36559 for (size_t k = 1; k <= 5; k += 2) {
36560 for (uint32_t m = 1; m <= 4; m++) {
36561 for (uint32_t n = 1; n <= 16; n++) {
36562 GemmMicrokernelTester()
36563 .mr(4)
36564 .nr(16)
36565 .kr(1)
36566 .sr(1)
36567 .m(m)
36568 .n(n)
36569 .k(k)
36570 .ks(3)
36571 .iterations(1)
36572 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36573 }
36574 }
36575 }
36576 }
36577
36578 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
36579 TEST_REQUIRES_X86_FMA3;
36580 for (uint32_t n = 17; n < 32; n++) {
36581 for (size_t k = 1; k <= 5; k += 2) {
36582 GemmMicrokernelTester()
36583 .mr(4)
36584 .nr(16)
36585 .kr(1)
36586 .sr(1)
36587 .m(4)
36588 .n(16)
36589 .k(k)
36590 .ks(3)
36591 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36592 }
36593 }
36594 }
36595
36596 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, n_div_16_small_kernel) {
36597 TEST_REQUIRES_X86_FMA3;
36598 for (uint32_t n = 32; n <= 48; n += 16) {
36599 for (size_t k = 1; k <= 5; k += 2) {
36600 GemmMicrokernelTester()
36601 .mr(4)
36602 .nr(16)
36603 .kr(1)
36604 .sr(1)
36605 .m(4)
36606 .n(16)
36607 .k(k)
36608 .ks(3)
36609 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36610 }
36611 }
36612 }
36613
36614 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, strided_cm_subtile) {
36615 TEST_REQUIRES_X86_FMA3;
36616 for (size_t k = 1; k <= 5; k += 2) {
36617 for (uint32_t m = 1; m <= 4; m++) {
36618 for (uint32_t n = 1; n <= 16; n++) {
36619 GemmMicrokernelTester()
36620 .mr(4)
36621 .nr(16)
36622 .kr(1)
36623 .sr(1)
36624 .m(m)
36625 .n(n)
36626 .k(k)
36627 .cm_stride(19)
36628 .iterations(1)
36629 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36630 }
36631 }
36632 }
36633 }
36634
36635 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, a_offset) {
36636 TEST_REQUIRES_X86_FMA3;
36637 for (size_t k = 1; k <= 5; k += 2) {
36638 GemmMicrokernelTester()
36639 .mr(4)
36640 .nr(16)
36641 .kr(1)
36642 .sr(1)
36643 .m(4)
36644 .n(16)
36645 .k(k)
36646 .ks(3)
36647 .a_offset(23)
36648 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36649 }
36650 }
36651
36652 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, zero) {
36653 TEST_REQUIRES_X86_FMA3;
36654 for (uint32_t mz = 0; mz < 4; mz++) {
36655 for (size_t k = 1; k <= 5; k += 2) {
36656 GemmMicrokernelTester()
36657 .mr(4)
36658 .nr(16)
36659 .kr(1)
36660 .sr(1)
36661 .m(4)
36662 .n(16)
36663 .k(k)
36664 .ks(3)
36665 .a_offset(23)
36666 .zero_index(mz)
36667 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36668 }
36669 }
36670 }
36671
36672 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, qmin) {
36673 TEST_REQUIRES_X86_FMA3;
36674 GemmMicrokernelTester()
36675 .mr(4)
36676 .nr(16)
36677 .kr(1)
36678 .sr(1)
36679 .m(4)
36680 .n(16)
36681 .k(1)
36682 .qmin(128)
36683 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36684 }
36685
36686 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, qmax) {
36687 TEST_REQUIRES_X86_FMA3;
36688 GemmMicrokernelTester()
36689 .mr(4)
36690 .nr(16)
36691 .kr(1)
36692 .sr(1)
36693 .m(4)
36694 .n(16)
36695 .k(1)
36696 .qmax(128)
36697 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36698 }
36699
36700 TEST(F32_IGEMM_4X16__FMA3_BROADCAST, strided_cm) {
36701 TEST_REQUIRES_X86_FMA3;
36702 GemmMicrokernelTester()
36703 .mr(4)
36704 .nr(16)
36705 .kr(1)
36706 .sr(1)
36707 .m(4)
36708 .n(16)
36709 .k(1)
36710 .cm_stride(19)
36711 .Test(xnn_f32_igemm_ukernel_4x16__fma3_broadcast);
36712 }
36713#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36714
36715
36716#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36717 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1) {
36718 TEST_REQUIRES_X86_FMA3;
36719 GemmMicrokernelTester()
36720 .mr(5)
36721 .nr(16)
36722 .kr(1)
36723 .sr(1)
36724 .m(5)
36725 .n(16)
36726 .k(1)
36727 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36728 }
36729
36730 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, strided_cn) {
36731 TEST_REQUIRES_X86_FMA3;
36732 GemmMicrokernelTester()
36733 .mr(5)
36734 .nr(16)
36735 .kr(1)
36736 .sr(1)
36737 .m(5)
36738 .n(16)
36739 .k(1)
36740 .cn_stride(19)
36741 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36742 }
36743
36744 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile) {
36745 TEST_REQUIRES_X86_FMA3;
36746 for (uint32_t m = 1; m <= 5; m++) {
36747 for (uint32_t n = 1; n <= 16; n++) {
36748 GemmMicrokernelTester()
36749 .mr(5)
36750 .nr(16)
36751 .kr(1)
36752 .sr(1)
36753 .m(m)
36754 .n(n)
36755 .k(1)
36756 .iterations(1)
36757 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36758 }
36759 }
36760 }
36761
36762 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
36763 TEST_REQUIRES_X86_FMA3;
36764 for (uint32_t m = 1; m <= 5; m++) {
36765 GemmMicrokernelTester()
36766 .mr(5)
36767 .nr(16)
36768 .kr(1)
36769 .sr(1)
36770 .m(m)
36771 .n(16)
36772 .k(1)
36773 .iterations(1)
36774 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36775 }
36776 }
36777
36778 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
36779 TEST_REQUIRES_X86_FMA3;
36780 for (uint32_t n = 1; n <= 16; n++) {
36781 GemmMicrokernelTester()
36782 .mr(5)
36783 .nr(16)
36784 .kr(1)
36785 .sr(1)
36786 .m(5)
36787 .n(n)
36788 .k(1)
36789 .iterations(1)
36790 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36791 }
36792 }
36793
36794 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_gt_1) {
36795 TEST_REQUIRES_X86_FMA3;
36796 for (size_t k = 2; k < 10; k++) {
36797 GemmMicrokernelTester()
36798 .mr(5)
36799 .nr(16)
36800 .kr(1)
36801 .sr(1)
36802 .m(5)
36803 .n(16)
36804 .k(k)
36805 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36806 }
36807 }
36808
36809 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, k_gt_1_subtile) {
36810 TEST_REQUIRES_X86_FMA3;
36811 for (size_t k = 2; k < 10; k++) {
36812 for (uint32_t m = 1; m <= 5; m++) {
36813 for (uint32_t n = 1; n <= 16; n++) {
36814 GemmMicrokernelTester()
36815 .mr(5)
36816 .nr(16)
36817 .kr(1)
36818 .sr(1)
36819 .m(m)
36820 .n(n)
36821 .k(k)
36822 .iterations(1)
36823 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36824 }
36825 }
36826 }
36827 }
36828
36829 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16) {
36830 TEST_REQUIRES_X86_FMA3;
36831 for (uint32_t n = 17; n < 32; n++) {
36832 for (size_t k = 1; k <= 5; k += 2) {
36833 GemmMicrokernelTester()
36834 .mr(5)
36835 .nr(16)
36836 .kr(1)
36837 .sr(1)
36838 .m(5)
36839 .n(16)
36840 .k(k)
36841 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36842 }
36843 }
36844 }
36845
36846 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
36847 TEST_REQUIRES_X86_FMA3;
36848 for (uint32_t n = 17; n < 32; n++) {
36849 for (size_t k = 1; k <= 5; k += 2) {
36850 GemmMicrokernelTester()
36851 .mr(5)
36852 .nr(16)
36853 .kr(1)
36854 .sr(1)
36855 .m(5)
36856 .n(16)
36857 .k(k)
36858 .cn_stride(19)
36859 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36860 }
36861 }
36862 }
36863
36864 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16_subtile) {
36865 TEST_REQUIRES_X86_FMA3;
36866 for (uint32_t n = 17; n < 32; n++) {
36867 for (size_t k = 1; k <= 5; k += 2) {
36868 for (uint32_t m = 1; m <= 5; m++) {
36869 GemmMicrokernelTester()
36870 .mr(5)
36871 .nr(16)
36872 .kr(1)
36873 .sr(1)
36874 .m(m)
36875 .n(n)
36876 .k(k)
36877 .iterations(1)
36878 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36879 }
36880 }
36881 }
36882 }
36883
36884 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16) {
36885 TEST_REQUIRES_X86_FMA3;
36886 for (uint32_t n = 32; n <= 48; n += 16) {
36887 for (size_t k = 1; k <= 5; k += 2) {
36888 GemmMicrokernelTester()
36889 .mr(5)
36890 .nr(16)
36891 .kr(1)
36892 .sr(1)
36893 .m(5)
36894 .n(16)
36895 .k(k)
36896 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36897 }
36898 }
36899 }
36900
36901 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16_strided_cn) {
36902 TEST_REQUIRES_X86_FMA3;
36903 for (uint32_t n = 32; n <= 48; n += 16) {
36904 for (size_t k = 1; k <= 5; k += 2) {
36905 GemmMicrokernelTester()
36906 .mr(5)
36907 .nr(16)
36908 .kr(1)
36909 .sr(1)
36910 .m(5)
36911 .n(n)
36912 .k(k)
36913 .cn_stride(19)
36914 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36915 }
36916 }
36917 }
36918
36919 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16_subtile) {
36920 TEST_REQUIRES_X86_FMA3;
36921 for (uint32_t n = 32; n <= 48; n += 16) {
36922 for (size_t k = 1; k <= 5; k += 2) {
36923 for (uint32_t m = 1; m <= 5; m++) {
36924 GemmMicrokernelTester()
36925 .mr(5)
36926 .nr(16)
36927 .kr(1)
36928 .sr(1)
36929 .m(m)
36930 .n(n)
36931 .k(k)
36932 .iterations(1)
36933 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36934 }
36935 }
36936 }
36937 }
36938
36939 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, small_kernel) {
36940 TEST_REQUIRES_X86_FMA3;
36941 for (size_t k = 1; k <= 5; k += 2) {
36942 GemmMicrokernelTester()
36943 .mr(5)
36944 .nr(16)
36945 .kr(1)
36946 .sr(1)
36947 .m(5)
36948 .n(16)
36949 .k(k)
36950 .ks(3)
36951 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36952 }
36953 }
36954
36955 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, small_kernel_subtile) {
36956 TEST_REQUIRES_X86_FMA3;
36957 for (size_t k = 1; k <= 5; k += 2) {
36958 for (uint32_t m = 1; m <= 5; m++) {
36959 for (uint32_t n = 1; n <= 16; n++) {
36960 GemmMicrokernelTester()
36961 .mr(5)
36962 .nr(16)
36963 .kr(1)
36964 .sr(1)
36965 .m(m)
36966 .n(n)
36967 .k(k)
36968 .ks(3)
36969 .iterations(1)
36970 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36971 }
36972 }
36973 }
36974 }
36975
36976 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_gt_16_small_kernel) {
36977 TEST_REQUIRES_X86_FMA3;
36978 for (uint32_t n = 17; n < 32; n++) {
36979 for (size_t k = 1; k <= 5; k += 2) {
36980 GemmMicrokernelTester()
36981 .mr(5)
36982 .nr(16)
36983 .kr(1)
36984 .sr(1)
36985 .m(5)
36986 .n(16)
36987 .k(k)
36988 .ks(3)
36989 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
36990 }
36991 }
36992 }
36993
36994 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, n_div_16_small_kernel) {
36995 TEST_REQUIRES_X86_FMA3;
36996 for (uint32_t n = 32; n <= 48; n += 16) {
36997 for (size_t k = 1; k <= 5; k += 2) {
36998 GemmMicrokernelTester()
36999 .mr(5)
37000 .nr(16)
37001 .kr(1)
37002 .sr(1)
37003 .m(5)
37004 .n(16)
37005 .k(k)
37006 .ks(3)
37007 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37008 }
37009 }
37010 }
37011
37012 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, strided_cm_subtile) {
37013 TEST_REQUIRES_X86_FMA3;
37014 for (size_t k = 1; k <= 5; k += 2) {
37015 for (uint32_t m = 1; m <= 5; m++) {
37016 for (uint32_t n = 1; n <= 16; n++) {
37017 GemmMicrokernelTester()
37018 .mr(5)
37019 .nr(16)
37020 .kr(1)
37021 .sr(1)
37022 .m(m)
37023 .n(n)
37024 .k(k)
37025 .cm_stride(19)
37026 .iterations(1)
37027 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37028 }
37029 }
37030 }
37031 }
37032
37033 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, a_offset) {
37034 TEST_REQUIRES_X86_FMA3;
37035 for (size_t k = 1; k <= 5; k += 2) {
37036 GemmMicrokernelTester()
37037 .mr(5)
37038 .nr(16)
37039 .kr(1)
37040 .sr(1)
37041 .m(5)
37042 .n(16)
37043 .k(k)
37044 .ks(3)
37045 .a_offset(29)
37046 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37047 }
37048 }
37049
37050 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, zero) {
37051 TEST_REQUIRES_X86_FMA3;
37052 for (uint32_t mz = 0; mz < 5; mz++) {
37053 for (size_t k = 1; k <= 5; k += 2) {
37054 GemmMicrokernelTester()
37055 .mr(5)
37056 .nr(16)
37057 .kr(1)
37058 .sr(1)
37059 .m(5)
37060 .n(16)
37061 .k(k)
37062 .ks(3)
37063 .a_offset(29)
37064 .zero_index(mz)
37065 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37066 }
37067 }
37068 }
37069
37070 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, qmin) {
37071 TEST_REQUIRES_X86_FMA3;
37072 GemmMicrokernelTester()
37073 .mr(5)
37074 .nr(16)
37075 .kr(1)
37076 .sr(1)
37077 .m(5)
37078 .n(16)
37079 .k(1)
37080 .qmin(128)
37081 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37082 }
37083
37084 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, qmax) {
37085 TEST_REQUIRES_X86_FMA3;
37086 GemmMicrokernelTester()
37087 .mr(5)
37088 .nr(16)
37089 .kr(1)
37090 .sr(1)
37091 .m(5)
37092 .n(16)
37093 .k(1)
37094 .qmax(128)
37095 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37096 }
37097
37098 TEST(F32_IGEMM_5X16__FMA3_BROADCAST, strided_cm) {
37099 TEST_REQUIRES_X86_FMA3;
37100 GemmMicrokernelTester()
37101 .mr(5)
37102 .nr(16)
37103 .kr(1)
37104 .sr(1)
37105 .m(5)
37106 .n(16)
37107 .k(1)
37108 .cm_stride(19)
37109 .Test(xnn_f32_igemm_ukernel_5x16__fma3_broadcast);
37110 }
37111#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37112
37113
37114#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37115 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4) {
37116 TEST_REQUIRES_X86_FMA3;
37117 GemmMicrokernelTester()
37118 .mr(1)
37119 .nr(16)
37120 .kr(1)
37121 .sr(4)
37122 .m(1)
37123 .n(16)
37124 .k(4)
37125 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37126 }
37127
37128 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, strided_cn) {
37129 TEST_REQUIRES_X86_FMA3;
37130 GemmMicrokernelTester()
37131 .mr(1)
37132 .nr(16)
37133 .kr(1)
37134 .sr(4)
37135 .m(1)
37136 .n(16)
37137 .k(4)
37138 .cn_stride(19)
37139 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37140 }
37141
37142 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
37143 TEST_REQUIRES_X86_FMA3;
37144 for (uint32_t m = 1; m <= 1; m++) {
37145 for (uint32_t n = 1; n <= 16; n++) {
37146 GemmMicrokernelTester()
37147 .mr(1)
37148 .nr(16)
37149 .kr(1)
37150 .sr(4)
37151 .m(m)
37152 .n(n)
37153 .k(4)
37154 .iterations(1)
37155 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37156 }
37157 }
37158 }
37159
37160 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
37161 TEST_REQUIRES_X86_FMA3;
37162 for (uint32_t m = 1; m <= 1; m++) {
37163 GemmMicrokernelTester()
37164 .mr(1)
37165 .nr(16)
37166 .kr(1)
37167 .sr(4)
37168 .m(m)
37169 .n(16)
37170 .k(4)
37171 .iterations(1)
37172 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37173 }
37174 }
37175
37176 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
37177 TEST_REQUIRES_X86_FMA3;
37178 for (uint32_t n = 1; n <= 16; n++) {
37179 GemmMicrokernelTester()
37180 .mr(1)
37181 .nr(16)
37182 .kr(1)
37183 .sr(4)
37184 .m(1)
37185 .n(n)
37186 .k(4)
37187 .iterations(1)
37188 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37189 }
37190 }
37191
37192 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_lt_4) {
37193 TEST_REQUIRES_X86_FMA3;
37194 for (size_t k = 1; k < 4; k++) {
37195 GemmMicrokernelTester()
37196 .mr(1)
37197 .nr(16)
37198 .kr(1)
37199 .sr(4)
37200 .m(1)
37201 .n(16)
37202 .k(k)
37203 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37204 }
37205 }
37206
37207 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
37208 TEST_REQUIRES_X86_FMA3;
37209 for (size_t k = 1; k < 4; k++) {
37210 for (uint32_t m = 1; m <= 1; m++) {
37211 for (uint32_t n = 1; n <= 16; n++) {
37212 GemmMicrokernelTester()
37213 .mr(1)
37214 .nr(16)
37215 .kr(1)
37216 .sr(4)
37217 .m(m)
37218 .n(n)
37219 .k(k)
37220 .iterations(1)
37221 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37222 }
37223 }
37224 }
37225 }
37226
37227 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_gt_4) {
37228 TEST_REQUIRES_X86_FMA3;
37229 for (size_t k = 5; k < 8; k++) {
37230 GemmMicrokernelTester()
37231 .mr(1)
37232 .nr(16)
37233 .kr(1)
37234 .sr(4)
37235 .m(1)
37236 .n(16)
37237 .k(k)
37238 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37239 }
37240 }
37241
37242 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
37243 TEST_REQUIRES_X86_FMA3;
37244 for (size_t k = 5; k < 8; k++) {
37245 for (uint32_t m = 1; m <= 1; m++) {
37246 for (uint32_t n = 1; n <= 16; n++) {
37247 GemmMicrokernelTester()
37248 .mr(1)
37249 .nr(16)
37250 .kr(1)
37251 .sr(4)
37252 .m(m)
37253 .n(n)
37254 .k(k)
37255 .iterations(1)
37256 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37257 }
37258 }
37259 }
37260 }
37261
37262 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_div_4) {
37263 TEST_REQUIRES_X86_FMA3;
37264 for (size_t k = 8; k <= 40; k += 4) {
37265 GemmMicrokernelTester()
37266 .mr(1)
37267 .nr(16)
37268 .kr(1)
37269 .sr(4)
37270 .m(1)
37271 .n(16)
37272 .k(k)
37273 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37274 }
37275 }
37276
37277 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, k_div_4_subtile) {
37278 TEST_REQUIRES_X86_FMA3;
37279 for (size_t k = 8; k <= 40; k += 4) {
37280 for (uint32_t m = 1; m <= 1; m++) {
37281 for (uint32_t n = 1; n <= 16; n++) {
37282 GemmMicrokernelTester()
37283 .mr(1)
37284 .nr(16)
37285 .kr(1)
37286 .sr(4)
37287 .m(m)
37288 .n(n)
37289 .k(k)
37290 .iterations(1)
37291 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37292 }
37293 }
37294 }
37295 }
37296
37297 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16) {
37298 TEST_REQUIRES_X86_FMA3;
37299 for (uint32_t n = 17; n < 32; n++) {
37300 for (size_t k = 1; k <= 20; k += 5) {
37301 GemmMicrokernelTester()
37302 .mr(1)
37303 .nr(16)
37304 .kr(1)
37305 .sr(4)
37306 .m(1)
37307 .n(16)
37308 .k(k)
37309 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37310 }
37311 }
37312 }
37313
37314 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
37315 TEST_REQUIRES_X86_FMA3;
37316 for (uint32_t n = 17; n < 32; n++) {
37317 for (size_t k = 1; k <= 20; k += 5) {
37318 GemmMicrokernelTester()
37319 .mr(1)
37320 .nr(16)
37321 .kr(1)
37322 .sr(4)
37323 .m(1)
37324 .n(16)
37325 .k(k)
37326 .cn_stride(19)
37327 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37328 }
37329 }
37330 }
37331
37332 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
37333 TEST_REQUIRES_X86_FMA3;
37334 for (uint32_t n = 17; n < 32; n++) {
37335 for (size_t k = 1; k <= 20; k += 5) {
37336 for (uint32_t m = 1; m <= 1; m++) {
37337 GemmMicrokernelTester()
37338 .mr(1)
37339 .nr(16)
37340 .kr(1)
37341 .sr(4)
37342 .m(m)
37343 .n(n)
37344 .k(k)
37345 .iterations(1)
37346 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37347 }
37348 }
37349 }
37350 }
37351
37352 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16) {
37353 TEST_REQUIRES_X86_FMA3;
37354 for (uint32_t n = 32; n <= 48; n += 16) {
37355 for (size_t k = 1; k <= 20; k += 5) {
37356 GemmMicrokernelTester()
37357 .mr(1)
37358 .nr(16)
37359 .kr(1)
37360 .sr(4)
37361 .m(1)
37362 .n(16)
37363 .k(k)
37364 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37365 }
37366 }
37367 }
37368
37369 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
37370 TEST_REQUIRES_X86_FMA3;
37371 for (uint32_t n = 32; n <= 48; n += 16) {
37372 for (size_t k = 1; k <= 20; k += 5) {
37373 GemmMicrokernelTester()
37374 .mr(1)
37375 .nr(16)
37376 .kr(1)
37377 .sr(4)
37378 .m(1)
37379 .n(n)
37380 .k(k)
37381 .cn_stride(19)
37382 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37383 }
37384 }
37385 }
37386
37387 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16_subtile) {
37388 TEST_REQUIRES_X86_FMA3;
37389 for (uint32_t n = 32; n <= 48; n += 16) {
37390 for (size_t k = 1; k <= 20; k += 5) {
37391 for (uint32_t m = 1; m <= 1; m++) {
37392 GemmMicrokernelTester()
37393 .mr(1)
37394 .nr(16)
37395 .kr(1)
37396 .sr(4)
37397 .m(m)
37398 .n(n)
37399 .k(k)
37400 .iterations(1)
37401 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37402 }
37403 }
37404 }
37405 }
37406
37407 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, small_kernel) {
37408 TEST_REQUIRES_X86_FMA3;
37409 for (size_t k = 1; k <= 20; k += 5) {
37410 GemmMicrokernelTester()
37411 .mr(1)
37412 .nr(16)
37413 .kr(1)
37414 .sr(4)
37415 .m(1)
37416 .n(16)
37417 .k(k)
37418 .ks(3)
37419 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37420 }
37421 }
37422
37423 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, small_kernel_subtile) {
37424 TEST_REQUIRES_X86_FMA3;
37425 for (size_t k = 1; k <= 20; k += 5) {
37426 for (uint32_t m = 1; m <= 1; m++) {
37427 for (uint32_t n = 1; n <= 16; n++) {
37428 GemmMicrokernelTester()
37429 .mr(1)
37430 .nr(16)
37431 .kr(1)
37432 .sr(4)
37433 .m(m)
37434 .n(n)
37435 .k(k)
37436 .ks(3)
37437 .iterations(1)
37438 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37439 }
37440 }
37441 }
37442 }
37443
37444 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
37445 TEST_REQUIRES_X86_FMA3;
37446 for (uint32_t n = 17; n < 32; n++) {
37447 for (size_t k = 1; k <= 20; k += 5) {
37448 GemmMicrokernelTester()
37449 .mr(1)
37450 .nr(16)
37451 .kr(1)
37452 .sr(4)
37453 .m(1)
37454 .n(16)
37455 .k(k)
37456 .ks(3)
37457 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37458 }
37459 }
37460 }
37461
37462 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
37463 TEST_REQUIRES_X86_FMA3;
37464 for (uint32_t n = 32; n <= 48; n += 16) {
37465 for (size_t k = 1; k <= 20; k += 5) {
37466 GemmMicrokernelTester()
37467 .mr(1)
37468 .nr(16)
37469 .kr(1)
37470 .sr(4)
37471 .m(1)
37472 .n(16)
37473 .k(k)
37474 .ks(3)
37475 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37476 }
37477 }
37478 }
37479
37480 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, strided_cm_subtile) {
37481 TEST_REQUIRES_X86_FMA3;
37482 for (size_t k = 1; k <= 20; k += 5) {
37483 for (uint32_t m = 1; m <= 1; m++) {
37484 for (uint32_t n = 1; n <= 16; n++) {
37485 GemmMicrokernelTester()
37486 .mr(1)
37487 .nr(16)
37488 .kr(1)
37489 .sr(4)
37490 .m(m)
37491 .n(n)
37492 .k(k)
37493 .cm_stride(19)
37494 .iterations(1)
37495 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37496 }
37497 }
37498 }
37499 }
37500
37501 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, a_offset) {
37502 TEST_REQUIRES_X86_FMA3;
37503 for (size_t k = 1; k <= 20; k += 5) {
37504 GemmMicrokernelTester()
37505 .mr(1)
37506 .nr(16)
37507 .kr(1)
37508 .sr(4)
37509 .m(1)
37510 .n(16)
37511 .k(k)
37512 .ks(3)
37513 .a_offset(23)
37514 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37515 }
37516 }
37517
37518 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, zero) {
37519 TEST_REQUIRES_X86_FMA3;
37520 for (uint32_t mz = 0; mz < 1; mz++) {
37521 for (size_t k = 1; k <= 20; k += 5) {
37522 GemmMicrokernelTester()
37523 .mr(1)
37524 .nr(16)
37525 .kr(1)
37526 .sr(4)
37527 .m(1)
37528 .n(16)
37529 .k(k)
37530 .ks(3)
37531 .a_offset(23)
37532 .zero_index(mz)
37533 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37534 }
37535 }
37536 }
37537
37538 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, qmin) {
37539 TEST_REQUIRES_X86_FMA3;
37540 GemmMicrokernelTester()
37541 .mr(1)
37542 .nr(16)
37543 .kr(1)
37544 .sr(4)
37545 .m(1)
37546 .n(16)
37547 .k(4)
37548 .qmin(128)
37549 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37550 }
37551
37552 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, qmax) {
37553 TEST_REQUIRES_X86_FMA3;
37554 GemmMicrokernelTester()
37555 .mr(1)
37556 .nr(16)
37557 .kr(1)
37558 .sr(4)
37559 .m(1)
37560 .n(16)
37561 .k(4)
37562 .qmax(128)
37563 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37564 }
37565
37566 TEST(F32_IGEMM_1X16S4__FMA3_BROADCAST, strided_cm) {
37567 TEST_REQUIRES_X86_FMA3;
37568 GemmMicrokernelTester()
37569 .mr(1)
37570 .nr(16)
37571 .kr(1)
37572 .sr(4)
37573 .m(1)
37574 .n(16)
37575 .k(4)
37576 .cm_stride(19)
37577 .Test(xnn_f32_igemm_ukernel_1x16s4__fma3_broadcast);
37578 }
37579#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37580
37581
37582#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37583 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4) {
37584 TEST_REQUIRES_X86_FMA3;
37585 GemmMicrokernelTester()
37586 .mr(3)
37587 .nr(16)
37588 .kr(1)
37589 .sr(4)
37590 .m(3)
37591 .n(16)
37592 .k(4)
37593 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37594 }
37595
37596 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, strided_cn) {
37597 TEST_REQUIRES_X86_FMA3;
37598 GemmMicrokernelTester()
37599 .mr(3)
37600 .nr(16)
37601 .kr(1)
37602 .sr(4)
37603 .m(3)
37604 .n(16)
37605 .k(4)
37606 .cn_stride(19)
37607 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37608 }
37609
37610 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
37611 TEST_REQUIRES_X86_FMA3;
37612 for (uint32_t m = 1; m <= 3; m++) {
37613 for (uint32_t n = 1; n <= 16; n++) {
37614 GemmMicrokernelTester()
37615 .mr(3)
37616 .nr(16)
37617 .kr(1)
37618 .sr(4)
37619 .m(m)
37620 .n(n)
37621 .k(4)
37622 .iterations(1)
37623 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37624 }
37625 }
37626 }
37627
37628 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
37629 TEST_REQUIRES_X86_FMA3;
37630 for (uint32_t m = 1; m <= 3; m++) {
37631 GemmMicrokernelTester()
37632 .mr(3)
37633 .nr(16)
37634 .kr(1)
37635 .sr(4)
37636 .m(m)
37637 .n(16)
37638 .k(4)
37639 .iterations(1)
37640 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37641 }
37642 }
37643
37644 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
37645 TEST_REQUIRES_X86_FMA3;
37646 for (uint32_t n = 1; n <= 16; n++) {
37647 GemmMicrokernelTester()
37648 .mr(3)
37649 .nr(16)
37650 .kr(1)
37651 .sr(4)
37652 .m(3)
37653 .n(n)
37654 .k(4)
37655 .iterations(1)
37656 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37657 }
37658 }
37659
37660 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_lt_4) {
37661 TEST_REQUIRES_X86_FMA3;
37662 for (size_t k = 1; k < 4; k++) {
37663 GemmMicrokernelTester()
37664 .mr(3)
37665 .nr(16)
37666 .kr(1)
37667 .sr(4)
37668 .m(3)
37669 .n(16)
37670 .k(k)
37671 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37672 }
37673 }
37674
37675 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
37676 TEST_REQUIRES_X86_FMA3;
37677 for (size_t k = 1; k < 4; k++) {
37678 for (uint32_t m = 1; m <= 3; m++) {
37679 for (uint32_t n = 1; n <= 16; n++) {
37680 GemmMicrokernelTester()
37681 .mr(3)
37682 .nr(16)
37683 .kr(1)
37684 .sr(4)
37685 .m(m)
37686 .n(n)
37687 .k(k)
37688 .iterations(1)
37689 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37690 }
37691 }
37692 }
37693 }
37694
37695 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_gt_4) {
37696 TEST_REQUIRES_X86_FMA3;
37697 for (size_t k = 5; k < 8; k++) {
37698 GemmMicrokernelTester()
37699 .mr(3)
37700 .nr(16)
37701 .kr(1)
37702 .sr(4)
37703 .m(3)
37704 .n(16)
37705 .k(k)
37706 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37707 }
37708 }
37709
37710 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
37711 TEST_REQUIRES_X86_FMA3;
37712 for (size_t k = 5; k < 8; k++) {
37713 for (uint32_t m = 1; m <= 3; m++) {
37714 for (uint32_t n = 1; n <= 16; n++) {
37715 GemmMicrokernelTester()
37716 .mr(3)
37717 .nr(16)
37718 .kr(1)
37719 .sr(4)
37720 .m(m)
37721 .n(n)
37722 .k(k)
37723 .iterations(1)
37724 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37725 }
37726 }
37727 }
37728 }
37729
37730 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_div_4) {
37731 TEST_REQUIRES_X86_FMA3;
37732 for (size_t k = 8; k <= 40; k += 4) {
37733 GemmMicrokernelTester()
37734 .mr(3)
37735 .nr(16)
37736 .kr(1)
37737 .sr(4)
37738 .m(3)
37739 .n(16)
37740 .k(k)
37741 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37742 }
37743 }
37744
37745 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, k_div_4_subtile) {
37746 TEST_REQUIRES_X86_FMA3;
37747 for (size_t k = 8; k <= 40; k += 4) {
37748 for (uint32_t m = 1; m <= 3; m++) {
37749 for (uint32_t n = 1; n <= 16; n++) {
37750 GemmMicrokernelTester()
37751 .mr(3)
37752 .nr(16)
37753 .kr(1)
37754 .sr(4)
37755 .m(m)
37756 .n(n)
37757 .k(k)
37758 .iterations(1)
37759 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37760 }
37761 }
37762 }
37763 }
37764
37765 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16) {
37766 TEST_REQUIRES_X86_FMA3;
37767 for (uint32_t n = 17; n < 32; n++) {
37768 for (size_t k = 1; k <= 20; k += 5) {
37769 GemmMicrokernelTester()
37770 .mr(3)
37771 .nr(16)
37772 .kr(1)
37773 .sr(4)
37774 .m(3)
37775 .n(16)
37776 .k(k)
37777 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37778 }
37779 }
37780 }
37781
37782 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
37783 TEST_REQUIRES_X86_FMA3;
37784 for (uint32_t n = 17; n < 32; n++) {
37785 for (size_t k = 1; k <= 20; k += 5) {
37786 GemmMicrokernelTester()
37787 .mr(3)
37788 .nr(16)
37789 .kr(1)
37790 .sr(4)
37791 .m(3)
37792 .n(16)
37793 .k(k)
37794 .cn_stride(19)
37795 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37796 }
37797 }
37798 }
37799
37800 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
37801 TEST_REQUIRES_X86_FMA3;
37802 for (uint32_t n = 17; n < 32; n++) {
37803 for (size_t k = 1; k <= 20; k += 5) {
37804 for (uint32_t m = 1; m <= 3; m++) {
37805 GemmMicrokernelTester()
37806 .mr(3)
37807 .nr(16)
37808 .kr(1)
37809 .sr(4)
37810 .m(m)
37811 .n(n)
37812 .k(k)
37813 .iterations(1)
37814 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37815 }
37816 }
37817 }
37818 }
37819
37820 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16) {
37821 TEST_REQUIRES_X86_FMA3;
37822 for (uint32_t n = 32; n <= 48; n += 16) {
37823 for (size_t k = 1; k <= 20; k += 5) {
37824 GemmMicrokernelTester()
37825 .mr(3)
37826 .nr(16)
37827 .kr(1)
37828 .sr(4)
37829 .m(3)
37830 .n(16)
37831 .k(k)
37832 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37833 }
37834 }
37835 }
37836
37837 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
37838 TEST_REQUIRES_X86_FMA3;
37839 for (uint32_t n = 32; n <= 48; n += 16) {
37840 for (size_t k = 1; k <= 20; k += 5) {
37841 GemmMicrokernelTester()
37842 .mr(3)
37843 .nr(16)
37844 .kr(1)
37845 .sr(4)
37846 .m(3)
37847 .n(n)
37848 .k(k)
37849 .cn_stride(19)
37850 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37851 }
37852 }
37853 }
37854
37855 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16_subtile) {
37856 TEST_REQUIRES_X86_FMA3;
37857 for (uint32_t n = 32; n <= 48; n += 16) {
37858 for (size_t k = 1; k <= 20; k += 5) {
37859 for (uint32_t m = 1; m <= 3; m++) {
37860 GemmMicrokernelTester()
37861 .mr(3)
37862 .nr(16)
37863 .kr(1)
37864 .sr(4)
37865 .m(m)
37866 .n(n)
37867 .k(k)
37868 .iterations(1)
37869 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37870 }
37871 }
37872 }
37873 }
37874
37875 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, small_kernel) {
37876 TEST_REQUIRES_X86_FMA3;
37877 for (size_t k = 1; k <= 20; k += 5) {
37878 GemmMicrokernelTester()
37879 .mr(3)
37880 .nr(16)
37881 .kr(1)
37882 .sr(4)
37883 .m(3)
37884 .n(16)
37885 .k(k)
37886 .ks(3)
37887 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37888 }
37889 }
37890
37891 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, small_kernel_subtile) {
37892 TEST_REQUIRES_X86_FMA3;
37893 for (size_t k = 1; k <= 20; k += 5) {
37894 for (uint32_t m = 1; m <= 3; m++) {
37895 for (uint32_t n = 1; n <= 16; n++) {
37896 GemmMicrokernelTester()
37897 .mr(3)
37898 .nr(16)
37899 .kr(1)
37900 .sr(4)
37901 .m(m)
37902 .n(n)
37903 .k(k)
37904 .ks(3)
37905 .iterations(1)
37906 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37907 }
37908 }
37909 }
37910 }
37911
37912 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
37913 TEST_REQUIRES_X86_FMA3;
37914 for (uint32_t n = 17; n < 32; n++) {
37915 for (size_t k = 1; k <= 20; k += 5) {
37916 GemmMicrokernelTester()
37917 .mr(3)
37918 .nr(16)
37919 .kr(1)
37920 .sr(4)
37921 .m(3)
37922 .n(16)
37923 .k(k)
37924 .ks(3)
37925 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37926 }
37927 }
37928 }
37929
37930 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
37931 TEST_REQUIRES_X86_FMA3;
37932 for (uint32_t n = 32; n <= 48; n += 16) {
37933 for (size_t k = 1; k <= 20; k += 5) {
37934 GemmMicrokernelTester()
37935 .mr(3)
37936 .nr(16)
37937 .kr(1)
37938 .sr(4)
37939 .m(3)
37940 .n(16)
37941 .k(k)
37942 .ks(3)
37943 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37944 }
37945 }
37946 }
37947
37948 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, strided_cm_subtile) {
37949 TEST_REQUIRES_X86_FMA3;
37950 for (size_t k = 1; k <= 20; k += 5) {
37951 for (uint32_t m = 1; m <= 3; m++) {
37952 for (uint32_t n = 1; n <= 16; n++) {
37953 GemmMicrokernelTester()
37954 .mr(3)
37955 .nr(16)
37956 .kr(1)
37957 .sr(4)
37958 .m(m)
37959 .n(n)
37960 .k(k)
37961 .cm_stride(19)
37962 .iterations(1)
37963 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37964 }
37965 }
37966 }
37967 }
37968
37969 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, a_offset) {
37970 TEST_REQUIRES_X86_FMA3;
37971 for (size_t k = 1; k <= 20; k += 5) {
37972 GemmMicrokernelTester()
37973 .mr(3)
37974 .nr(16)
37975 .kr(1)
37976 .sr(4)
37977 .m(3)
37978 .n(16)
37979 .k(k)
37980 .ks(3)
37981 .a_offset(67)
37982 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
37983 }
37984 }
37985
37986 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, zero) {
37987 TEST_REQUIRES_X86_FMA3;
37988 for (uint32_t mz = 0; mz < 3; mz++) {
37989 for (size_t k = 1; k <= 20; k += 5) {
37990 GemmMicrokernelTester()
37991 .mr(3)
37992 .nr(16)
37993 .kr(1)
37994 .sr(4)
37995 .m(3)
37996 .n(16)
37997 .k(k)
37998 .ks(3)
37999 .a_offset(67)
38000 .zero_index(mz)
38001 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
38002 }
38003 }
38004 }
38005
38006 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, qmin) {
38007 TEST_REQUIRES_X86_FMA3;
38008 GemmMicrokernelTester()
38009 .mr(3)
38010 .nr(16)
38011 .kr(1)
38012 .sr(4)
38013 .m(3)
38014 .n(16)
38015 .k(4)
38016 .qmin(128)
38017 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
38018 }
38019
38020 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, qmax) {
38021 TEST_REQUIRES_X86_FMA3;
38022 GemmMicrokernelTester()
38023 .mr(3)
38024 .nr(16)
38025 .kr(1)
38026 .sr(4)
38027 .m(3)
38028 .n(16)
38029 .k(4)
38030 .qmax(128)
38031 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
38032 }
38033
38034 TEST(F32_IGEMM_3X16S4__FMA3_BROADCAST, strided_cm) {
38035 TEST_REQUIRES_X86_FMA3;
38036 GemmMicrokernelTester()
38037 .mr(3)
38038 .nr(16)
38039 .kr(1)
38040 .sr(4)
38041 .m(3)
38042 .n(16)
38043 .k(4)
38044 .cm_stride(19)
38045 .Test(xnn_f32_igemm_ukernel_3x16s4__fma3_broadcast);
38046 }
38047#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38048
38049
38050#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38051 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4) {
38052 TEST_REQUIRES_X86_FMA3;
38053 GemmMicrokernelTester()
38054 .mr(4)
38055 .nr(16)
38056 .kr(1)
38057 .sr(4)
38058 .m(4)
38059 .n(16)
38060 .k(4)
38061 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38062 }
38063
38064 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, strided_cn) {
38065 TEST_REQUIRES_X86_FMA3;
38066 GemmMicrokernelTester()
38067 .mr(4)
38068 .nr(16)
38069 .kr(1)
38070 .sr(4)
38071 .m(4)
38072 .n(16)
38073 .k(4)
38074 .cn_stride(19)
38075 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38076 }
38077
38078 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
38079 TEST_REQUIRES_X86_FMA3;
38080 for (uint32_t m = 1; m <= 4; m++) {
38081 for (uint32_t n = 1; n <= 16; n++) {
38082 GemmMicrokernelTester()
38083 .mr(4)
38084 .nr(16)
38085 .kr(1)
38086 .sr(4)
38087 .m(m)
38088 .n(n)
38089 .k(4)
38090 .iterations(1)
38091 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38092 }
38093 }
38094 }
38095
38096 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
38097 TEST_REQUIRES_X86_FMA3;
38098 for (uint32_t m = 1; m <= 4; m++) {
38099 GemmMicrokernelTester()
38100 .mr(4)
38101 .nr(16)
38102 .kr(1)
38103 .sr(4)
38104 .m(m)
38105 .n(16)
38106 .k(4)
38107 .iterations(1)
38108 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38109 }
38110 }
38111
38112 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
38113 TEST_REQUIRES_X86_FMA3;
38114 for (uint32_t n = 1; n <= 16; n++) {
38115 GemmMicrokernelTester()
38116 .mr(4)
38117 .nr(16)
38118 .kr(1)
38119 .sr(4)
38120 .m(4)
38121 .n(n)
38122 .k(4)
38123 .iterations(1)
38124 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38125 }
38126 }
38127
38128 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_lt_4) {
38129 TEST_REQUIRES_X86_FMA3;
38130 for (size_t k = 1; k < 4; k++) {
38131 GemmMicrokernelTester()
38132 .mr(4)
38133 .nr(16)
38134 .kr(1)
38135 .sr(4)
38136 .m(4)
38137 .n(16)
38138 .k(k)
38139 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38140 }
38141 }
38142
38143 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
38144 TEST_REQUIRES_X86_FMA3;
38145 for (size_t k = 1; k < 4; k++) {
38146 for (uint32_t m = 1; m <= 4; m++) {
38147 for (uint32_t n = 1; n <= 16; n++) {
38148 GemmMicrokernelTester()
38149 .mr(4)
38150 .nr(16)
38151 .kr(1)
38152 .sr(4)
38153 .m(m)
38154 .n(n)
38155 .k(k)
38156 .iterations(1)
38157 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38158 }
38159 }
38160 }
38161 }
38162
38163 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_gt_4) {
38164 TEST_REQUIRES_X86_FMA3;
38165 for (size_t k = 5; k < 8; k++) {
38166 GemmMicrokernelTester()
38167 .mr(4)
38168 .nr(16)
38169 .kr(1)
38170 .sr(4)
38171 .m(4)
38172 .n(16)
38173 .k(k)
38174 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38175 }
38176 }
38177
38178 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
38179 TEST_REQUIRES_X86_FMA3;
38180 for (size_t k = 5; k < 8; k++) {
38181 for (uint32_t m = 1; m <= 4; m++) {
38182 for (uint32_t n = 1; n <= 16; n++) {
38183 GemmMicrokernelTester()
38184 .mr(4)
38185 .nr(16)
38186 .kr(1)
38187 .sr(4)
38188 .m(m)
38189 .n(n)
38190 .k(k)
38191 .iterations(1)
38192 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38193 }
38194 }
38195 }
38196 }
38197
38198 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_div_4) {
38199 TEST_REQUIRES_X86_FMA3;
38200 for (size_t k = 8; k <= 40; k += 4) {
38201 GemmMicrokernelTester()
38202 .mr(4)
38203 .nr(16)
38204 .kr(1)
38205 .sr(4)
38206 .m(4)
38207 .n(16)
38208 .k(k)
38209 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38210 }
38211 }
38212
38213 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, k_div_4_subtile) {
38214 TEST_REQUIRES_X86_FMA3;
38215 for (size_t k = 8; k <= 40; k += 4) {
38216 for (uint32_t m = 1; m <= 4; m++) {
38217 for (uint32_t n = 1; n <= 16; n++) {
38218 GemmMicrokernelTester()
38219 .mr(4)
38220 .nr(16)
38221 .kr(1)
38222 .sr(4)
38223 .m(m)
38224 .n(n)
38225 .k(k)
38226 .iterations(1)
38227 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38228 }
38229 }
38230 }
38231 }
38232
38233 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16) {
38234 TEST_REQUIRES_X86_FMA3;
38235 for (uint32_t n = 17; n < 32; n++) {
38236 for (size_t k = 1; k <= 20; k += 5) {
38237 GemmMicrokernelTester()
38238 .mr(4)
38239 .nr(16)
38240 .kr(1)
38241 .sr(4)
38242 .m(4)
38243 .n(16)
38244 .k(k)
38245 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38246 }
38247 }
38248 }
38249
38250 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
38251 TEST_REQUIRES_X86_FMA3;
38252 for (uint32_t n = 17; n < 32; n++) {
38253 for (size_t k = 1; k <= 20; k += 5) {
38254 GemmMicrokernelTester()
38255 .mr(4)
38256 .nr(16)
38257 .kr(1)
38258 .sr(4)
38259 .m(4)
38260 .n(16)
38261 .k(k)
38262 .cn_stride(19)
38263 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38264 }
38265 }
38266 }
38267
38268 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
38269 TEST_REQUIRES_X86_FMA3;
38270 for (uint32_t n = 17; n < 32; n++) {
38271 for (size_t k = 1; k <= 20; k += 5) {
38272 for (uint32_t m = 1; m <= 4; m++) {
38273 GemmMicrokernelTester()
38274 .mr(4)
38275 .nr(16)
38276 .kr(1)
38277 .sr(4)
38278 .m(m)
38279 .n(n)
38280 .k(k)
38281 .iterations(1)
38282 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38283 }
38284 }
38285 }
38286 }
38287
38288 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16) {
38289 TEST_REQUIRES_X86_FMA3;
38290 for (uint32_t n = 32; n <= 48; n += 16) {
38291 for (size_t k = 1; k <= 20; k += 5) {
38292 GemmMicrokernelTester()
38293 .mr(4)
38294 .nr(16)
38295 .kr(1)
38296 .sr(4)
38297 .m(4)
38298 .n(16)
38299 .k(k)
38300 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38301 }
38302 }
38303 }
38304
38305 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
38306 TEST_REQUIRES_X86_FMA3;
38307 for (uint32_t n = 32; n <= 48; n += 16) {
38308 for (size_t k = 1; k <= 20; k += 5) {
38309 GemmMicrokernelTester()
38310 .mr(4)
38311 .nr(16)
38312 .kr(1)
38313 .sr(4)
38314 .m(4)
38315 .n(n)
38316 .k(k)
38317 .cn_stride(19)
38318 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38319 }
38320 }
38321 }
38322
38323 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16_subtile) {
38324 TEST_REQUIRES_X86_FMA3;
38325 for (uint32_t n = 32; n <= 48; n += 16) {
38326 for (size_t k = 1; k <= 20; k += 5) {
38327 for (uint32_t m = 1; m <= 4; m++) {
38328 GemmMicrokernelTester()
38329 .mr(4)
38330 .nr(16)
38331 .kr(1)
38332 .sr(4)
38333 .m(m)
38334 .n(n)
38335 .k(k)
38336 .iterations(1)
38337 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38338 }
38339 }
38340 }
38341 }
38342
38343 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, small_kernel) {
38344 TEST_REQUIRES_X86_FMA3;
38345 for (size_t k = 1; k <= 20; k += 5) {
38346 GemmMicrokernelTester()
38347 .mr(4)
38348 .nr(16)
38349 .kr(1)
38350 .sr(4)
38351 .m(4)
38352 .n(16)
38353 .k(k)
38354 .ks(3)
38355 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38356 }
38357 }
38358
38359 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, small_kernel_subtile) {
38360 TEST_REQUIRES_X86_FMA3;
38361 for (size_t k = 1; k <= 20; k += 5) {
38362 for (uint32_t m = 1; m <= 4; m++) {
38363 for (uint32_t n = 1; n <= 16; n++) {
38364 GemmMicrokernelTester()
38365 .mr(4)
38366 .nr(16)
38367 .kr(1)
38368 .sr(4)
38369 .m(m)
38370 .n(n)
38371 .k(k)
38372 .ks(3)
38373 .iterations(1)
38374 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38375 }
38376 }
38377 }
38378 }
38379
38380 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
38381 TEST_REQUIRES_X86_FMA3;
38382 for (uint32_t n = 17; n < 32; n++) {
38383 for (size_t k = 1; k <= 20; k += 5) {
38384 GemmMicrokernelTester()
38385 .mr(4)
38386 .nr(16)
38387 .kr(1)
38388 .sr(4)
38389 .m(4)
38390 .n(16)
38391 .k(k)
38392 .ks(3)
38393 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38394 }
38395 }
38396 }
38397
38398 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
38399 TEST_REQUIRES_X86_FMA3;
38400 for (uint32_t n = 32; n <= 48; n += 16) {
38401 for (size_t k = 1; k <= 20; k += 5) {
38402 GemmMicrokernelTester()
38403 .mr(4)
38404 .nr(16)
38405 .kr(1)
38406 .sr(4)
38407 .m(4)
38408 .n(16)
38409 .k(k)
38410 .ks(3)
38411 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38412 }
38413 }
38414 }
38415
38416 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, strided_cm_subtile) {
38417 TEST_REQUIRES_X86_FMA3;
38418 for (size_t k = 1; k <= 20; k += 5) {
38419 for (uint32_t m = 1; m <= 4; m++) {
38420 for (uint32_t n = 1; n <= 16; n++) {
38421 GemmMicrokernelTester()
38422 .mr(4)
38423 .nr(16)
38424 .kr(1)
38425 .sr(4)
38426 .m(m)
38427 .n(n)
38428 .k(k)
38429 .cm_stride(19)
38430 .iterations(1)
38431 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38432 }
38433 }
38434 }
38435 }
38436
38437 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, a_offset) {
38438 TEST_REQUIRES_X86_FMA3;
38439 for (size_t k = 1; k <= 20; k += 5) {
38440 GemmMicrokernelTester()
38441 .mr(4)
38442 .nr(16)
38443 .kr(1)
38444 .sr(4)
38445 .m(4)
38446 .n(16)
38447 .k(k)
38448 .ks(3)
38449 .a_offset(83)
38450 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38451 }
38452 }
38453
38454 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, zero) {
38455 TEST_REQUIRES_X86_FMA3;
38456 for (uint32_t mz = 0; mz < 4; mz++) {
38457 for (size_t k = 1; k <= 20; k += 5) {
38458 GemmMicrokernelTester()
38459 .mr(4)
38460 .nr(16)
38461 .kr(1)
38462 .sr(4)
38463 .m(4)
38464 .n(16)
38465 .k(k)
38466 .ks(3)
38467 .a_offset(83)
38468 .zero_index(mz)
38469 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38470 }
38471 }
38472 }
38473
38474 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, qmin) {
38475 TEST_REQUIRES_X86_FMA3;
38476 GemmMicrokernelTester()
38477 .mr(4)
38478 .nr(16)
38479 .kr(1)
38480 .sr(4)
38481 .m(4)
38482 .n(16)
38483 .k(4)
38484 .qmin(128)
38485 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38486 }
38487
38488 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, qmax) {
38489 TEST_REQUIRES_X86_FMA3;
38490 GemmMicrokernelTester()
38491 .mr(4)
38492 .nr(16)
38493 .kr(1)
38494 .sr(4)
38495 .m(4)
38496 .n(16)
38497 .k(4)
38498 .qmax(128)
38499 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38500 }
38501
38502 TEST(F32_IGEMM_4X16S4__FMA3_BROADCAST, strided_cm) {
38503 TEST_REQUIRES_X86_FMA3;
38504 GemmMicrokernelTester()
38505 .mr(4)
38506 .nr(16)
38507 .kr(1)
38508 .sr(4)
38509 .m(4)
38510 .n(16)
38511 .k(4)
38512 .cm_stride(19)
38513 .Test(xnn_f32_igemm_ukernel_4x16s4__fma3_broadcast);
38514 }
38515#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38516
38517
38518#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38519 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4) {
38520 TEST_REQUIRES_X86_FMA3;
38521 GemmMicrokernelTester()
38522 .mr(5)
38523 .nr(16)
38524 .kr(1)
38525 .sr(4)
38526 .m(5)
38527 .n(16)
38528 .k(4)
38529 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38530 }
38531
38532 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, strided_cn) {
38533 TEST_REQUIRES_X86_FMA3;
38534 GemmMicrokernelTester()
38535 .mr(5)
38536 .nr(16)
38537 .kr(1)
38538 .sr(4)
38539 .m(5)
38540 .n(16)
38541 .k(4)
38542 .cn_stride(19)
38543 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38544 }
38545
38546 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
38547 TEST_REQUIRES_X86_FMA3;
38548 for (uint32_t m = 1; m <= 5; m++) {
38549 for (uint32_t n = 1; n <= 16; n++) {
38550 GemmMicrokernelTester()
38551 .mr(5)
38552 .nr(16)
38553 .kr(1)
38554 .sr(4)
38555 .m(m)
38556 .n(n)
38557 .k(4)
38558 .iterations(1)
38559 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38560 }
38561 }
38562 }
38563
38564 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
38565 TEST_REQUIRES_X86_FMA3;
38566 for (uint32_t m = 1; m <= 5; m++) {
38567 GemmMicrokernelTester()
38568 .mr(5)
38569 .nr(16)
38570 .kr(1)
38571 .sr(4)
38572 .m(m)
38573 .n(16)
38574 .k(4)
38575 .iterations(1)
38576 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38577 }
38578 }
38579
38580 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
38581 TEST_REQUIRES_X86_FMA3;
38582 for (uint32_t n = 1; n <= 16; n++) {
38583 GemmMicrokernelTester()
38584 .mr(5)
38585 .nr(16)
38586 .kr(1)
38587 .sr(4)
38588 .m(5)
38589 .n(n)
38590 .k(4)
38591 .iterations(1)
38592 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38593 }
38594 }
38595
38596 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_lt_4) {
38597 TEST_REQUIRES_X86_FMA3;
38598 for (size_t k = 1; k < 4; k++) {
38599 GemmMicrokernelTester()
38600 .mr(5)
38601 .nr(16)
38602 .kr(1)
38603 .sr(4)
38604 .m(5)
38605 .n(16)
38606 .k(k)
38607 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38608 }
38609 }
38610
38611 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
38612 TEST_REQUIRES_X86_FMA3;
38613 for (size_t k = 1; k < 4; k++) {
38614 for (uint32_t m = 1; m <= 5; m++) {
38615 for (uint32_t n = 1; n <= 16; n++) {
38616 GemmMicrokernelTester()
38617 .mr(5)
38618 .nr(16)
38619 .kr(1)
38620 .sr(4)
38621 .m(m)
38622 .n(n)
38623 .k(k)
38624 .iterations(1)
38625 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38626 }
38627 }
38628 }
38629 }
38630
38631 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_gt_4) {
38632 TEST_REQUIRES_X86_FMA3;
38633 for (size_t k = 5; k < 8; k++) {
38634 GemmMicrokernelTester()
38635 .mr(5)
38636 .nr(16)
38637 .kr(1)
38638 .sr(4)
38639 .m(5)
38640 .n(16)
38641 .k(k)
38642 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38643 }
38644 }
38645
38646 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
38647 TEST_REQUIRES_X86_FMA3;
38648 for (size_t k = 5; k < 8; k++) {
38649 for (uint32_t m = 1; m <= 5; m++) {
38650 for (uint32_t n = 1; n <= 16; n++) {
38651 GemmMicrokernelTester()
38652 .mr(5)
38653 .nr(16)
38654 .kr(1)
38655 .sr(4)
38656 .m(m)
38657 .n(n)
38658 .k(k)
38659 .iterations(1)
38660 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38661 }
38662 }
38663 }
38664 }
38665
38666 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_div_4) {
38667 TEST_REQUIRES_X86_FMA3;
38668 for (size_t k = 8; k <= 40; k += 4) {
38669 GemmMicrokernelTester()
38670 .mr(5)
38671 .nr(16)
38672 .kr(1)
38673 .sr(4)
38674 .m(5)
38675 .n(16)
38676 .k(k)
38677 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38678 }
38679 }
38680
38681 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, k_div_4_subtile) {
38682 TEST_REQUIRES_X86_FMA3;
38683 for (size_t k = 8; k <= 40; k += 4) {
38684 for (uint32_t m = 1; m <= 5; m++) {
38685 for (uint32_t n = 1; n <= 16; n++) {
38686 GemmMicrokernelTester()
38687 .mr(5)
38688 .nr(16)
38689 .kr(1)
38690 .sr(4)
38691 .m(m)
38692 .n(n)
38693 .k(k)
38694 .iterations(1)
38695 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38696 }
38697 }
38698 }
38699 }
38700
38701 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16) {
38702 TEST_REQUIRES_X86_FMA3;
38703 for (uint32_t n = 17; n < 32; n++) {
38704 for (size_t k = 1; k <= 20; k += 5) {
38705 GemmMicrokernelTester()
38706 .mr(5)
38707 .nr(16)
38708 .kr(1)
38709 .sr(4)
38710 .m(5)
38711 .n(16)
38712 .k(k)
38713 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38714 }
38715 }
38716 }
38717
38718 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
38719 TEST_REQUIRES_X86_FMA3;
38720 for (uint32_t n = 17; n < 32; n++) {
38721 for (size_t k = 1; k <= 20; k += 5) {
38722 GemmMicrokernelTester()
38723 .mr(5)
38724 .nr(16)
38725 .kr(1)
38726 .sr(4)
38727 .m(5)
38728 .n(16)
38729 .k(k)
38730 .cn_stride(19)
38731 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38732 }
38733 }
38734 }
38735
38736 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
38737 TEST_REQUIRES_X86_FMA3;
38738 for (uint32_t n = 17; n < 32; n++) {
38739 for (size_t k = 1; k <= 20; k += 5) {
38740 for (uint32_t m = 1; m <= 5; m++) {
38741 GemmMicrokernelTester()
38742 .mr(5)
38743 .nr(16)
38744 .kr(1)
38745 .sr(4)
38746 .m(m)
38747 .n(n)
38748 .k(k)
38749 .iterations(1)
38750 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38751 }
38752 }
38753 }
38754 }
38755
38756 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16) {
38757 TEST_REQUIRES_X86_FMA3;
38758 for (uint32_t n = 32; n <= 48; n += 16) {
38759 for (size_t k = 1; k <= 20; k += 5) {
38760 GemmMicrokernelTester()
38761 .mr(5)
38762 .nr(16)
38763 .kr(1)
38764 .sr(4)
38765 .m(5)
38766 .n(16)
38767 .k(k)
38768 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38769 }
38770 }
38771 }
38772
38773 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
38774 TEST_REQUIRES_X86_FMA3;
38775 for (uint32_t n = 32; n <= 48; n += 16) {
38776 for (size_t k = 1; k <= 20; k += 5) {
38777 GemmMicrokernelTester()
38778 .mr(5)
38779 .nr(16)
38780 .kr(1)
38781 .sr(4)
38782 .m(5)
38783 .n(n)
38784 .k(k)
38785 .cn_stride(19)
38786 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38787 }
38788 }
38789 }
38790
38791 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16_subtile) {
38792 TEST_REQUIRES_X86_FMA3;
38793 for (uint32_t n = 32; n <= 48; n += 16) {
38794 for (size_t k = 1; k <= 20; k += 5) {
38795 for (uint32_t m = 1; m <= 5; m++) {
38796 GemmMicrokernelTester()
38797 .mr(5)
38798 .nr(16)
38799 .kr(1)
38800 .sr(4)
38801 .m(m)
38802 .n(n)
38803 .k(k)
38804 .iterations(1)
38805 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38806 }
38807 }
38808 }
38809 }
38810
38811 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, small_kernel) {
38812 TEST_REQUIRES_X86_FMA3;
38813 for (size_t k = 1; k <= 20; k += 5) {
38814 GemmMicrokernelTester()
38815 .mr(5)
38816 .nr(16)
38817 .kr(1)
38818 .sr(4)
38819 .m(5)
38820 .n(16)
38821 .k(k)
38822 .ks(3)
38823 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38824 }
38825 }
38826
38827 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, small_kernel_subtile) {
38828 TEST_REQUIRES_X86_FMA3;
38829 for (size_t k = 1; k <= 20; k += 5) {
38830 for (uint32_t m = 1; m <= 5; m++) {
38831 for (uint32_t n = 1; n <= 16; n++) {
38832 GemmMicrokernelTester()
38833 .mr(5)
38834 .nr(16)
38835 .kr(1)
38836 .sr(4)
38837 .m(m)
38838 .n(n)
38839 .k(k)
38840 .ks(3)
38841 .iterations(1)
38842 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38843 }
38844 }
38845 }
38846 }
38847
38848 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_gt_16_small_kernel) {
38849 TEST_REQUIRES_X86_FMA3;
38850 for (uint32_t n = 17; n < 32; n++) {
38851 for (size_t k = 1; k <= 20; k += 5) {
38852 GemmMicrokernelTester()
38853 .mr(5)
38854 .nr(16)
38855 .kr(1)
38856 .sr(4)
38857 .m(5)
38858 .n(16)
38859 .k(k)
38860 .ks(3)
38861 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38862 }
38863 }
38864 }
38865
38866 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, n_div_16_small_kernel) {
38867 TEST_REQUIRES_X86_FMA3;
38868 for (uint32_t n = 32; n <= 48; n += 16) {
38869 for (size_t k = 1; k <= 20; k += 5) {
38870 GemmMicrokernelTester()
38871 .mr(5)
38872 .nr(16)
38873 .kr(1)
38874 .sr(4)
38875 .m(5)
38876 .n(16)
38877 .k(k)
38878 .ks(3)
38879 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38880 }
38881 }
38882 }
38883
38884 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, strided_cm_subtile) {
38885 TEST_REQUIRES_X86_FMA3;
38886 for (size_t k = 1; k <= 20; k += 5) {
38887 for (uint32_t m = 1; m <= 5; m++) {
38888 for (uint32_t n = 1; n <= 16; n++) {
38889 GemmMicrokernelTester()
38890 .mr(5)
38891 .nr(16)
38892 .kr(1)
38893 .sr(4)
38894 .m(m)
38895 .n(n)
38896 .k(k)
38897 .cm_stride(19)
38898 .iterations(1)
38899 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38900 }
38901 }
38902 }
38903 }
38904
38905 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, a_offset) {
38906 TEST_REQUIRES_X86_FMA3;
38907 for (size_t k = 1; k <= 20; k += 5) {
38908 GemmMicrokernelTester()
38909 .mr(5)
38910 .nr(16)
38911 .kr(1)
38912 .sr(4)
38913 .m(5)
38914 .n(16)
38915 .k(k)
38916 .ks(3)
38917 .a_offset(103)
38918 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38919 }
38920 }
38921
38922 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, zero) {
38923 TEST_REQUIRES_X86_FMA3;
38924 for (uint32_t mz = 0; mz < 5; mz++) {
38925 for (size_t k = 1; k <= 20; k += 5) {
38926 GemmMicrokernelTester()
38927 .mr(5)
38928 .nr(16)
38929 .kr(1)
38930 .sr(4)
38931 .m(5)
38932 .n(16)
38933 .k(k)
38934 .ks(3)
38935 .a_offset(103)
38936 .zero_index(mz)
38937 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38938 }
38939 }
38940 }
38941
38942 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, qmin) {
38943 TEST_REQUIRES_X86_FMA3;
38944 GemmMicrokernelTester()
38945 .mr(5)
38946 .nr(16)
38947 .kr(1)
38948 .sr(4)
38949 .m(5)
38950 .n(16)
38951 .k(4)
38952 .qmin(128)
38953 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38954 }
38955
38956 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, qmax) {
38957 TEST_REQUIRES_X86_FMA3;
38958 GemmMicrokernelTester()
38959 .mr(5)
38960 .nr(16)
38961 .kr(1)
38962 .sr(4)
38963 .m(5)
38964 .n(16)
38965 .k(4)
38966 .qmax(128)
38967 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38968 }
38969
38970 TEST(F32_IGEMM_5X16S4__FMA3_BROADCAST, strided_cm) {
38971 TEST_REQUIRES_X86_FMA3;
38972 GemmMicrokernelTester()
38973 .mr(5)
38974 .nr(16)
38975 .kr(1)
38976 .sr(4)
38977 .m(5)
38978 .n(16)
38979 .k(4)
38980 .cm_stride(19)
38981 .Test(xnn_f32_igemm_ukernel_5x16s4__fma3_broadcast);
38982 }
38983#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38984
38985
38986#if XNN_ARCH_X86 || XNN_ARCH_X86_64
38987 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1) {
38988 TEST_REQUIRES_X86_AVX512F;
38989 GemmMicrokernelTester()
38990 .mr(1)
38991 .nr(16)
38992 .kr(1)
38993 .sr(1)
38994 .m(1)
38995 .n(16)
38996 .k(1)
38997 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
38998 }
38999
39000 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, strided_cn) {
39001 TEST_REQUIRES_X86_AVX512F;
39002 GemmMicrokernelTester()
39003 .mr(1)
39004 .nr(16)
39005 .kr(1)
39006 .sr(1)
39007 .m(1)
39008 .n(16)
39009 .k(1)
39010 .cn_stride(19)
39011 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39012 }
39013
39014 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
39015 TEST_REQUIRES_X86_AVX512F;
39016 for (uint32_t m = 1; m <= 1; m++) {
39017 for (uint32_t n = 1; n <= 16; n++) {
39018 GemmMicrokernelTester()
39019 .mr(1)
39020 .nr(16)
39021 .kr(1)
39022 .sr(1)
39023 .m(m)
39024 .n(n)
39025 .k(1)
39026 .iterations(1)
39027 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39028 }
39029 }
39030 }
39031
39032 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
39033 TEST_REQUIRES_X86_AVX512F;
39034 for (uint32_t m = 1; m <= 1; m++) {
39035 GemmMicrokernelTester()
39036 .mr(1)
39037 .nr(16)
39038 .kr(1)
39039 .sr(1)
39040 .m(m)
39041 .n(16)
39042 .k(1)
39043 .iterations(1)
39044 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39045 }
39046 }
39047
39048 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
39049 TEST_REQUIRES_X86_AVX512F;
39050 for (uint32_t n = 1; n <= 16; n++) {
39051 GemmMicrokernelTester()
39052 .mr(1)
39053 .nr(16)
39054 .kr(1)
39055 .sr(1)
39056 .m(1)
39057 .n(n)
39058 .k(1)
39059 .iterations(1)
39060 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39061 }
39062 }
39063
39064 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_gt_1) {
39065 TEST_REQUIRES_X86_AVX512F;
39066 for (size_t k = 2; k < 10; k++) {
39067 GemmMicrokernelTester()
39068 .mr(1)
39069 .nr(16)
39070 .kr(1)
39071 .sr(1)
39072 .m(1)
39073 .n(16)
39074 .k(k)
39075 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39076 }
39077 }
39078
39079 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
39080 TEST_REQUIRES_X86_AVX512F;
39081 for (size_t k = 2; k < 10; k++) {
39082 for (uint32_t m = 1; m <= 1; m++) {
39083 for (uint32_t n = 1; n <= 16; n++) {
39084 GemmMicrokernelTester()
39085 .mr(1)
39086 .nr(16)
39087 .kr(1)
39088 .sr(1)
39089 .m(m)
39090 .n(n)
39091 .k(k)
39092 .iterations(1)
39093 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39094 }
39095 }
39096 }
39097 }
39098
39099 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16) {
39100 TEST_REQUIRES_X86_AVX512F;
39101 for (uint32_t n = 17; n < 32; n++) {
39102 for (size_t k = 1; k <= 5; k += 2) {
39103 GemmMicrokernelTester()
39104 .mr(1)
39105 .nr(16)
39106 .kr(1)
39107 .sr(1)
39108 .m(1)
39109 .n(16)
39110 .k(k)
39111 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39112 }
39113 }
39114 }
39115
39116 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
39117 TEST_REQUIRES_X86_AVX512F;
39118 for (uint32_t n = 17; n < 32; n++) {
39119 for (size_t k = 1; k <= 5; k += 2) {
39120 GemmMicrokernelTester()
39121 .mr(1)
39122 .nr(16)
39123 .kr(1)
39124 .sr(1)
39125 .m(1)
39126 .n(16)
39127 .k(k)
39128 .cn_stride(19)
39129 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39130 }
39131 }
39132 }
39133
39134 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
39135 TEST_REQUIRES_X86_AVX512F;
39136 for (uint32_t n = 17; n < 32; n++) {
39137 for (size_t k = 1; k <= 5; k += 2) {
39138 for (uint32_t m = 1; m <= 1; m++) {
39139 GemmMicrokernelTester()
39140 .mr(1)
39141 .nr(16)
39142 .kr(1)
39143 .sr(1)
39144 .m(m)
39145 .n(n)
39146 .k(k)
39147 .iterations(1)
39148 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39149 }
39150 }
39151 }
39152 }
39153
39154 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16) {
39155 TEST_REQUIRES_X86_AVX512F;
39156 for (uint32_t n = 32; n <= 48; n += 16) {
39157 for (size_t k = 1; k <= 5; k += 2) {
39158 GemmMicrokernelTester()
39159 .mr(1)
39160 .nr(16)
39161 .kr(1)
39162 .sr(1)
39163 .m(1)
39164 .n(16)
39165 .k(k)
39166 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39167 }
39168 }
39169 }
39170
39171 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
39172 TEST_REQUIRES_X86_AVX512F;
39173 for (uint32_t n = 32; n <= 48; n += 16) {
39174 for (size_t k = 1; k <= 5; k += 2) {
39175 GemmMicrokernelTester()
39176 .mr(1)
39177 .nr(16)
39178 .kr(1)
39179 .sr(1)
39180 .m(1)
39181 .n(n)
39182 .k(k)
39183 .cn_stride(19)
39184 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39185 }
39186 }
39187 }
39188
39189 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
39190 TEST_REQUIRES_X86_AVX512F;
39191 for (uint32_t n = 32; n <= 48; n += 16) {
39192 for (size_t k = 1; k <= 5; k += 2) {
39193 for (uint32_t m = 1; m <= 1; m++) {
39194 GemmMicrokernelTester()
39195 .mr(1)
39196 .nr(16)
39197 .kr(1)
39198 .sr(1)
39199 .m(m)
39200 .n(n)
39201 .k(k)
39202 .iterations(1)
39203 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39204 }
39205 }
39206 }
39207 }
39208
39209 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, small_kernel) {
39210 TEST_REQUIRES_X86_AVX512F;
39211 for (size_t k = 1; k <= 5; k += 2) {
39212 GemmMicrokernelTester()
39213 .mr(1)
39214 .nr(16)
39215 .kr(1)
39216 .sr(1)
39217 .m(1)
39218 .n(16)
39219 .k(k)
39220 .ks(3)
39221 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39222 }
39223 }
39224
39225 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, small_kernel_subtile) {
39226 TEST_REQUIRES_X86_AVX512F;
39227 for (size_t k = 1; k <= 5; k += 2) {
39228 for (uint32_t m = 1; m <= 1; m++) {
39229 for (uint32_t n = 1; n <= 16; n++) {
39230 GemmMicrokernelTester()
39231 .mr(1)
39232 .nr(16)
39233 .kr(1)
39234 .sr(1)
39235 .m(m)
39236 .n(n)
39237 .k(k)
39238 .ks(3)
39239 .iterations(1)
39240 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39241 }
39242 }
39243 }
39244 }
39245
39246 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
39247 TEST_REQUIRES_X86_AVX512F;
39248 for (uint32_t n = 17; n < 32; n++) {
39249 for (size_t k = 1; k <= 5; k += 2) {
39250 GemmMicrokernelTester()
39251 .mr(1)
39252 .nr(16)
39253 .kr(1)
39254 .sr(1)
39255 .m(1)
39256 .n(16)
39257 .k(k)
39258 .ks(3)
39259 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39260 }
39261 }
39262 }
39263
39264 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
39265 TEST_REQUIRES_X86_AVX512F;
39266 for (uint32_t n = 32; n <= 48; n += 16) {
39267 for (size_t k = 1; k <= 5; k += 2) {
39268 GemmMicrokernelTester()
39269 .mr(1)
39270 .nr(16)
39271 .kr(1)
39272 .sr(1)
39273 .m(1)
39274 .n(16)
39275 .k(k)
39276 .ks(3)
39277 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39278 }
39279 }
39280 }
39281
39282 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
39283 TEST_REQUIRES_X86_AVX512F;
39284 for (size_t k = 1; k <= 5; k += 2) {
39285 for (uint32_t m = 1; m <= 1; m++) {
39286 for (uint32_t n = 1; n <= 16; n++) {
39287 GemmMicrokernelTester()
39288 .mr(1)
39289 .nr(16)
39290 .kr(1)
39291 .sr(1)
39292 .m(m)
39293 .n(n)
39294 .k(k)
39295 .cm_stride(19)
39296 .iterations(1)
39297 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39298 }
39299 }
39300 }
39301 }
39302
39303 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, a_offset) {
39304 TEST_REQUIRES_X86_AVX512F;
39305 for (size_t k = 1; k <= 5; k += 2) {
39306 GemmMicrokernelTester()
39307 .mr(1)
39308 .nr(16)
39309 .kr(1)
39310 .sr(1)
39311 .m(1)
39312 .n(16)
39313 .k(k)
39314 .ks(3)
39315 .a_offset(7)
39316 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39317 }
39318 }
39319
39320 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, zero) {
39321 TEST_REQUIRES_X86_AVX512F;
39322 for (uint32_t mz = 0; mz < 1; mz++) {
39323 for (size_t k = 1; k <= 5; k += 2) {
39324 GemmMicrokernelTester()
39325 .mr(1)
39326 .nr(16)
39327 .kr(1)
39328 .sr(1)
39329 .m(1)
39330 .n(16)
39331 .k(k)
39332 .ks(3)
39333 .a_offset(7)
39334 .zero_index(mz)
39335 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39336 }
39337 }
39338 }
39339
39340 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, qmin) {
39341 TEST_REQUIRES_X86_AVX512F;
39342 GemmMicrokernelTester()
39343 .mr(1)
39344 .nr(16)
39345 .kr(1)
39346 .sr(1)
39347 .m(1)
39348 .n(16)
39349 .k(1)
39350 .qmin(128)
39351 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39352 }
39353
39354 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, qmax) {
39355 TEST_REQUIRES_X86_AVX512F;
39356 GemmMicrokernelTester()
39357 .mr(1)
39358 .nr(16)
39359 .kr(1)
39360 .sr(1)
39361 .m(1)
39362 .n(16)
39363 .k(1)
39364 .qmax(128)
39365 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39366 }
39367
39368 TEST(F32_IGEMM_1X16__AVX512F_BROADCAST, strided_cm) {
39369 TEST_REQUIRES_X86_AVX512F;
39370 GemmMicrokernelTester()
39371 .mr(1)
39372 .nr(16)
39373 .kr(1)
39374 .sr(1)
39375 .m(1)
39376 .n(16)
39377 .k(1)
39378 .cm_stride(19)
39379 .Test(xnn_f32_igemm_ukernel_1x16__avx512f_broadcast);
39380 }
39381#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
39382
39383
39384#if XNN_ARCH_X86 || XNN_ARCH_X86_64
39385 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1) {
39386 TEST_REQUIRES_X86_AVX512F;
39387 GemmMicrokernelTester()
39388 .mr(4)
39389 .nr(16)
39390 .kr(1)
39391 .sr(1)
39392 .m(4)
39393 .n(16)
39394 .k(1)
39395 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39396 }
39397
39398 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, strided_cn) {
39399 TEST_REQUIRES_X86_AVX512F;
39400 GemmMicrokernelTester()
39401 .mr(4)
39402 .nr(16)
39403 .kr(1)
39404 .sr(1)
39405 .m(4)
39406 .n(16)
39407 .k(1)
39408 .cn_stride(19)
39409 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39410 }
39411
39412 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile) {
39413 TEST_REQUIRES_X86_AVX512F;
39414 for (uint32_t m = 1; m <= 4; m++) {
39415 for (uint32_t n = 1; n <= 16; n++) {
39416 GemmMicrokernelTester()
39417 .mr(4)
39418 .nr(16)
39419 .kr(1)
39420 .sr(1)
39421 .m(m)
39422 .n(n)
39423 .k(1)
39424 .iterations(1)
39425 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39426 }
39427 }
39428 }
39429
39430 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
39431 TEST_REQUIRES_X86_AVX512F;
39432 for (uint32_t m = 1; m <= 4; m++) {
39433 GemmMicrokernelTester()
39434 .mr(4)
39435 .nr(16)
39436 .kr(1)
39437 .sr(1)
39438 .m(m)
39439 .n(16)
39440 .k(1)
39441 .iterations(1)
39442 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39443 }
39444 }
39445
39446 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
39447 TEST_REQUIRES_X86_AVX512F;
39448 for (uint32_t n = 1; n <= 16; n++) {
39449 GemmMicrokernelTester()
39450 .mr(4)
39451 .nr(16)
39452 .kr(1)
39453 .sr(1)
39454 .m(4)
39455 .n(n)
39456 .k(1)
39457 .iterations(1)
39458 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39459 }
39460 }
39461
39462 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_gt_1) {
39463 TEST_REQUIRES_X86_AVX512F;
39464 for (size_t k = 2; k < 10; k++) {
39465 GemmMicrokernelTester()
39466 .mr(4)
39467 .nr(16)
39468 .kr(1)
39469 .sr(1)
39470 .m(4)
39471 .n(16)
39472 .k(k)
39473 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39474 }
39475 }
39476
39477 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, k_gt_1_subtile) {
39478 TEST_REQUIRES_X86_AVX512F;
39479 for (size_t k = 2; k < 10; k++) {
39480 for (uint32_t m = 1; m <= 4; m++) {
39481 for (uint32_t n = 1; n <= 16; n++) {
39482 GemmMicrokernelTester()
39483 .mr(4)
39484 .nr(16)
39485 .kr(1)
39486 .sr(1)
39487 .m(m)
39488 .n(n)
39489 .k(k)
39490 .iterations(1)
39491 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39492 }
39493 }
39494 }
39495 }
39496
39497 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16) {
39498 TEST_REQUIRES_X86_AVX512F;
39499 for (uint32_t n = 17; n < 32; n++) {
39500 for (size_t k = 1; k <= 5; k += 2) {
39501 GemmMicrokernelTester()
39502 .mr(4)
39503 .nr(16)
39504 .kr(1)
39505 .sr(1)
39506 .m(4)
39507 .n(16)
39508 .k(k)
39509 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39510 }
39511 }
39512 }
39513
39514 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
39515 TEST_REQUIRES_X86_AVX512F;
39516 for (uint32_t n = 17; n < 32; n++) {
39517 for (size_t k = 1; k <= 5; k += 2) {
39518 GemmMicrokernelTester()
39519 .mr(4)
39520 .nr(16)
39521 .kr(1)
39522 .sr(1)
39523 .m(4)
39524 .n(16)
39525 .k(k)
39526 .cn_stride(19)
39527 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39528 }
39529 }
39530 }
39531
39532 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16_subtile) {
39533 TEST_REQUIRES_X86_AVX512F;
39534 for (uint32_t n = 17; n < 32; n++) {
39535 for (size_t k = 1; k <= 5; k += 2) {
39536 for (uint32_t m = 1; m <= 4; m++) {
39537 GemmMicrokernelTester()
39538 .mr(4)
39539 .nr(16)
39540 .kr(1)
39541 .sr(1)
39542 .m(m)
39543 .n(n)
39544 .k(k)
39545 .iterations(1)
39546 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39547 }
39548 }
39549 }
39550 }
39551
39552 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16) {
39553 TEST_REQUIRES_X86_AVX512F;
39554 for (uint32_t n = 32; n <= 48; n += 16) {
39555 for (size_t k = 1; k <= 5; k += 2) {
39556 GemmMicrokernelTester()
39557 .mr(4)
39558 .nr(16)
39559 .kr(1)
39560 .sr(1)
39561 .m(4)
39562 .n(16)
39563 .k(k)
39564 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39565 }
39566 }
39567 }
39568
39569 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
39570 TEST_REQUIRES_X86_AVX512F;
39571 for (uint32_t n = 32; n <= 48; n += 16) {
39572 for (size_t k = 1; k <= 5; k += 2) {
39573 GemmMicrokernelTester()
39574 .mr(4)
39575 .nr(16)
39576 .kr(1)
39577 .sr(1)
39578 .m(4)
39579 .n(n)
39580 .k(k)
39581 .cn_stride(19)
39582 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39583 }
39584 }
39585 }
39586
39587 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16_subtile) {
39588 TEST_REQUIRES_X86_AVX512F;
39589 for (uint32_t n = 32; n <= 48; n += 16) {
39590 for (size_t k = 1; k <= 5; k += 2) {
39591 for (uint32_t m = 1; m <= 4; m++) {
39592 GemmMicrokernelTester()
39593 .mr(4)
39594 .nr(16)
39595 .kr(1)
39596 .sr(1)
39597 .m(m)
39598 .n(n)
39599 .k(k)
39600 .iterations(1)
39601 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39602 }
39603 }
39604 }
39605 }
39606
39607 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, small_kernel) {
39608 TEST_REQUIRES_X86_AVX512F;
39609 for (size_t k = 1; k <= 5; k += 2) {
39610 GemmMicrokernelTester()
39611 .mr(4)
39612 .nr(16)
39613 .kr(1)
39614 .sr(1)
39615 .m(4)
39616 .n(16)
39617 .k(k)
39618 .ks(3)
39619 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39620 }
39621 }
39622
39623 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, small_kernel_subtile) {
39624 TEST_REQUIRES_X86_AVX512F;
39625 for (size_t k = 1; k <= 5; k += 2) {
39626 for (uint32_t m = 1; m <= 4; m++) {
39627 for (uint32_t n = 1; n <= 16; n++) {
39628 GemmMicrokernelTester()
39629 .mr(4)
39630 .nr(16)
39631 .kr(1)
39632 .sr(1)
39633 .m(m)
39634 .n(n)
39635 .k(k)
39636 .ks(3)
39637 .iterations(1)
39638 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39639 }
39640 }
39641 }
39642 }
39643
39644 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
39645 TEST_REQUIRES_X86_AVX512F;
39646 for (uint32_t n = 17; n < 32; n++) {
39647 for (size_t k = 1; k <= 5; k += 2) {
39648 GemmMicrokernelTester()
39649 .mr(4)
39650 .nr(16)
39651 .kr(1)
39652 .sr(1)
39653 .m(4)
39654 .n(16)
39655 .k(k)
39656 .ks(3)
39657 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39658 }
39659 }
39660 }
39661
39662 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
39663 TEST_REQUIRES_X86_AVX512F;
39664 for (uint32_t n = 32; n <= 48; n += 16) {
39665 for (size_t k = 1; k <= 5; k += 2) {
39666 GemmMicrokernelTester()
39667 .mr(4)
39668 .nr(16)
39669 .kr(1)
39670 .sr(1)
39671 .m(4)
39672 .n(16)
39673 .k(k)
39674 .ks(3)
39675 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39676 }
39677 }
39678 }
39679
39680 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, strided_cm_subtile) {
39681 TEST_REQUIRES_X86_AVX512F;
39682 for (size_t k = 1; k <= 5; k += 2) {
39683 for (uint32_t m = 1; m <= 4; m++) {
39684 for (uint32_t n = 1; n <= 16; n++) {
39685 GemmMicrokernelTester()
39686 .mr(4)
39687 .nr(16)
39688 .kr(1)
39689 .sr(1)
39690 .m(m)
39691 .n(n)
39692 .k(k)
39693 .cm_stride(19)
39694 .iterations(1)
39695 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39696 }
39697 }
39698 }
39699 }
39700
39701 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, a_offset) {
39702 TEST_REQUIRES_X86_AVX512F;
39703 for (size_t k = 1; k <= 5; k += 2) {
39704 GemmMicrokernelTester()
39705 .mr(4)
39706 .nr(16)
39707 .kr(1)
39708 .sr(1)
39709 .m(4)
39710 .n(16)
39711 .k(k)
39712 .ks(3)
39713 .a_offset(23)
39714 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39715 }
39716 }
39717
39718 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, zero) {
39719 TEST_REQUIRES_X86_AVX512F;
39720 for (uint32_t mz = 0; mz < 4; mz++) {
39721 for (size_t k = 1; k <= 5; k += 2) {
39722 GemmMicrokernelTester()
39723 .mr(4)
39724 .nr(16)
39725 .kr(1)
39726 .sr(1)
39727 .m(4)
39728 .n(16)
39729 .k(k)
39730 .ks(3)
39731 .a_offset(23)
39732 .zero_index(mz)
39733 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39734 }
39735 }
39736 }
39737
39738 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, qmin) {
39739 TEST_REQUIRES_X86_AVX512F;
39740 GemmMicrokernelTester()
39741 .mr(4)
39742 .nr(16)
39743 .kr(1)
39744 .sr(1)
39745 .m(4)
39746 .n(16)
39747 .k(1)
39748 .qmin(128)
39749 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39750 }
39751
39752 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, qmax) {
39753 TEST_REQUIRES_X86_AVX512F;
39754 GemmMicrokernelTester()
39755 .mr(4)
39756 .nr(16)
39757 .kr(1)
39758 .sr(1)
39759 .m(4)
39760 .n(16)
39761 .k(1)
39762 .qmax(128)
39763 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39764 }
39765
39766 TEST(F32_IGEMM_4X16__AVX512F_BROADCAST, strided_cm) {
39767 TEST_REQUIRES_X86_AVX512F;
39768 GemmMicrokernelTester()
39769 .mr(4)
39770 .nr(16)
39771 .kr(1)
39772 .sr(1)
39773 .m(4)
39774 .n(16)
39775 .k(1)
39776 .cm_stride(19)
39777 .Test(xnn_f32_igemm_ukernel_4x16__avx512f_broadcast);
39778 }
39779#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
39780
39781
39782#if XNN_ARCH_X86 || XNN_ARCH_X86_64
39783 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1) {
39784 TEST_REQUIRES_X86_AVX512F;
39785 GemmMicrokernelTester()
39786 .mr(5)
39787 .nr(16)
39788 .kr(1)
39789 .sr(1)
39790 .m(5)
39791 .n(16)
39792 .k(1)
39793 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39794 }
39795
39796 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, strided_cn) {
39797 TEST_REQUIRES_X86_AVX512F;
39798 GemmMicrokernelTester()
39799 .mr(5)
39800 .nr(16)
39801 .kr(1)
39802 .sr(1)
39803 .m(5)
39804 .n(16)
39805 .k(1)
39806 .cn_stride(19)
39807 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39808 }
39809
39810 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile) {
39811 TEST_REQUIRES_X86_AVX512F;
39812 for (uint32_t m = 1; m <= 5; m++) {
39813 for (uint32_t n = 1; n <= 16; n++) {
39814 GemmMicrokernelTester()
39815 .mr(5)
39816 .nr(16)
39817 .kr(1)
39818 .sr(1)
39819 .m(m)
39820 .n(n)
39821 .k(1)
39822 .iterations(1)
39823 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39824 }
39825 }
39826 }
39827
39828 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
39829 TEST_REQUIRES_X86_AVX512F;
39830 for (uint32_t m = 1; m <= 5; m++) {
39831 GemmMicrokernelTester()
39832 .mr(5)
39833 .nr(16)
39834 .kr(1)
39835 .sr(1)
39836 .m(m)
39837 .n(16)
39838 .k(1)
39839 .iterations(1)
39840 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39841 }
39842 }
39843
39844 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
39845 TEST_REQUIRES_X86_AVX512F;
39846 for (uint32_t n = 1; n <= 16; n++) {
39847 GemmMicrokernelTester()
39848 .mr(5)
39849 .nr(16)
39850 .kr(1)
39851 .sr(1)
39852 .m(5)
39853 .n(n)
39854 .k(1)
39855 .iterations(1)
39856 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39857 }
39858 }
39859
39860 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_gt_1) {
39861 TEST_REQUIRES_X86_AVX512F;
39862 for (size_t k = 2; k < 10; k++) {
39863 GemmMicrokernelTester()
39864 .mr(5)
39865 .nr(16)
39866 .kr(1)
39867 .sr(1)
39868 .m(5)
39869 .n(16)
39870 .k(k)
39871 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39872 }
39873 }
39874
39875 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, k_gt_1_subtile) {
39876 TEST_REQUIRES_X86_AVX512F;
39877 for (size_t k = 2; k < 10; k++) {
39878 for (uint32_t m = 1; m <= 5; m++) {
39879 for (uint32_t n = 1; n <= 16; n++) {
39880 GemmMicrokernelTester()
39881 .mr(5)
39882 .nr(16)
39883 .kr(1)
39884 .sr(1)
39885 .m(m)
39886 .n(n)
39887 .k(k)
39888 .iterations(1)
39889 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39890 }
39891 }
39892 }
39893 }
39894
39895 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16) {
39896 TEST_REQUIRES_X86_AVX512F;
39897 for (uint32_t n = 17; n < 32; n++) {
39898 for (size_t k = 1; k <= 5; k += 2) {
39899 GemmMicrokernelTester()
39900 .mr(5)
39901 .nr(16)
39902 .kr(1)
39903 .sr(1)
39904 .m(5)
39905 .n(16)
39906 .k(k)
39907 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39908 }
39909 }
39910 }
39911
39912 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
39913 TEST_REQUIRES_X86_AVX512F;
39914 for (uint32_t n = 17; n < 32; n++) {
39915 for (size_t k = 1; k <= 5; k += 2) {
39916 GemmMicrokernelTester()
39917 .mr(5)
39918 .nr(16)
39919 .kr(1)
39920 .sr(1)
39921 .m(5)
39922 .n(16)
39923 .k(k)
39924 .cn_stride(19)
39925 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39926 }
39927 }
39928 }
39929
39930 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16_subtile) {
39931 TEST_REQUIRES_X86_AVX512F;
39932 for (uint32_t n = 17; n < 32; n++) {
39933 for (size_t k = 1; k <= 5; k += 2) {
39934 for (uint32_t m = 1; m <= 5; m++) {
39935 GemmMicrokernelTester()
39936 .mr(5)
39937 .nr(16)
39938 .kr(1)
39939 .sr(1)
39940 .m(m)
39941 .n(n)
39942 .k(k)
39943 .iterations(1)
39944 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39945 }
39946 }
39947 }
39948 }
39949
39950 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16) {
39951 TEST_REQUIRES_X86_AVX512F;
39952 for (uint32_t n = 32; n <= 48; n += 16) {
39953 for (size_t k = 1; k <= 5; k += 2) {
39954 GemmMicrokernelTester()
39955 .mr(5)
39956 .nr(16)
39957 .kr(1)
39958 .sr(1)
39959 .m(5)
39960 .n(16)
39961 .k(k)
39962 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39963 }
39964 }
39965 }
39966
39967 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
39968 TEST_REQUIRES_X86_AVX512F;
39969 for (uint32_t n = 32; n <= 48; n += 16) {
39970 for (size_t k = 1; k <= 5; k += 2) {
39971 GemmMicrokernelTester()
39972 .mr(5)
39973 .nr(16)
39974 .kr(1)
39975 .sr(1)
39976 .m(5)
39977 .n(n)
39978 .k(k)
39979 .cn_stride(19)
39980 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
39981 }
39982 }
39983 }
39984
39985 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16_subtile) {
39986 TEST_REQUIRES_X86_AVX512F;
39987 for (uint32_t n = 32; n <= 48; n += 16) {
39988 for (size_t k = 1; k <= 5; k += 2) {
39989 for (uint32_t m = 1; m <= 5; m++) {
39990 GemmMicrokernelTester()
39991 .mr(5)
39992 .nr(16)
39993 .kr(1)
39994 .sr(1)
39995 .m(m)
39996 .n(n)
39997 .k(k)
39998 .iterations(1)
39999 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40000 }
40001 }
40002 }
40003 }
40004
40005 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, small_kernel) {
40006 TEST_REQUIRES_X86_AVX512F;
40007 for (size_t k = 1; k <= 5; k += 2) {
40008 GemmMicrokernelTester()
40009 .mr(5)
40010 .nr(16)
40011 .kr(1)
40012 .sr(1)
40013 .m(5)
40014 .n(16)
40015 .k(k)
40016 .ks(3)
40017 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40018 }
40019 }
40020
40021 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, small_kernel_subtile) {
40022 TEST_REQUIRES_X86_AVX512F;
40023 for (size_t k = 1; k <= 5; k += 2) {
40024 for (uint32_t m = 1; m <= 5; m++) {
40025 for (uint32_t n = 1; n <= 16; n++) {
40026 GemmMicrokernelTester()
40027 .mr(5)
40028 .nr(16)
40029 .kr(1)
40030 .sr(1)
40031 .m(m)
40032 .n(n)
40033 .k(k)
40034 .ks(3)
40035 .iterations(1)
40036 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40037 }
40038 }
40039 }
40040 }
40041
40042 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
40043 TEST_REQUIRES_X86_AVX512F;
40044 for (uint32_t n = 17; n < 32; n++) {
40045 for (size_t k = 1; k <= 5; k += 2) {
40046 GemmMicrokernelTester()
40047 .mr(5)
40048 .nr(16)
40049 .kr(1)
40050 .sr(1)
40051 .m(5)
40052 .n(16)
40053 .k(k)
40054 .ks(3)
40055 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40056 }
40057 }
40058 }
40059
40060 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
40061 TEST_REQUIRES_X86_AVX512F;
40062 for (uint32_t n = 32; n <= 48; n += 16) {
40063 for (size_t k = 1; k <= 5; k += 2) {
40064 GemmMicrokernelTester()
40065 .mr(5)
40066 .nr(16)
40067 .kr(1)
40068 .sr(1)
40069 .m(5)
40070 .n(16)
40071 .k(k)
40072 .ks(3)
40073 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40074 }
40075 }
40076 }
40077
40078 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, strided_cm_subtile) {
40079 TEST_REQUIRES_X86_AVX512F;
40080 for (size_t k = 1; k <= 5; k += 2) {
40081 for (uint32_t m = 1; m <= 5; m++) {
40082 for (uint32_t n = 1; n <= 16; n++) {
40083 GemmMicrokernelTester()
40084 .mr(5)
40085 .nr(16)
40086 .kr(1)
40087 .sr(1)
40088 .m(m)
40089 .n(n)
40090 .k(k)
40091 .cm_stride(19)
40092 .iterations(1)
40093 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40094 }
40095 }
40096 }
40097 }
40098
40099 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, a_offset) {
40100 TEST_REQUIRES_X86_AVX512F;
40101 for (size_t k = 1; k <= 5; k += 2) {
40102 GemmMicrokernelTester()
40103 .mr(5)
40104 .nr(16)
40105 .kr(1)
40106 .sr(1)
40107 .m(5)
40108 .n(16)
40109 .k(k)
40110 .ks(3)
40111 .a_offset(29)
40112 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40113 }
40114 }
40115
40116 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, zero) {
40117 TEST_REQUIRES_X86_AVX512F;
40118 for (uint32_t mz = 0; mz < 5; mz++) {
40119 for (size_t k = 1; k <= 5; k += 2) {
40120 GemmMicrokernelTester()
40121 .mr(5)
40122 .nr(16)
40123 .kr(1)
40124 .sr(1)
40125 .m(5)
40126 .n(16)
40127 .k(k)
40128 .ks(3)
40129 .a_offset(29)
40130 .zero_index(mz)
40131 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40132 }
40133 }
40134 }
40135
40136 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, qmin) {
40137 TEST_REQUIRES_X86_AVX512F;
40138 GemmMicrokernelTester()
40139 .mr(5)
40140 .nr(16)
40141 .kr(1)
40142 .sr(1)
40143 .m(5)
40144 .n(16)
40145 .k(1)
40146 .qmin(128)
40147 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40148 }
40149
40150 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, qmax) {
40151 TEST_REQUIRES_X86_AVX512F;
40152 GemmMicrokernelTester()
40153 .mr(5)
40154 .nr(16)
40155 .kr(1)
40156 .sr(1)
40157 .m(5)
40158 .n(16)
40159 .k(1)
40160 .qmax(128)
40161 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40162 }
40163
40164 TEST(F32_IGEMM_5X16__AVX512F_BROADCAST, strided_cm) {
40165 TEST_REQUIRES_X86_AVX512F;
40166 GemmMicrokernelTester()
40167 .mr(5)
40168 .nr(16)
40169 .kr(1)
40170 .sr(1)
40171 .m(5)
40172 .n(16)
40173 .k(1)
40174 .cm_stride(19)
40175 .Test(xnn_f32_igemm_ukernel_5x16__avx512f_broadcast);
40176 }
40177#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
40178
40179
40180#if XNN_ARCH_X86 || XNN_ARCH_X86_64
40181 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1) {
40182 TEST_REQUIRES_X86_AVX512F;
40183 GemmMicrokernelTester()
40184 .mr(6)
40185 .nr(16)
40186 .kr(1)
40187 .sr(1)
40188 .m(6)
40189 .n(16)
40190 .k(1)
40191 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40192 }
40193
40194 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, strided_cn) {
40195 TEST_REQUIRES_X86_AVX512F;
40196 GemmMicrokernelTester()
40197 .mr(6)
40198 .nr(16)
40199 .kr(1)
40200 .sr(1)
40201 .m(6)
40202 .n(16)
40203 .k(1)
40204 .cn_stride(19)
40205 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40206 }
40207
40208 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
40209 TEST_REQUIRES_X86_AVX512F;
40210 for (uint32_t m = 1; m <= 6; m++) {
40211 for (uint32_t n = 1; n <= 16; n++) {
40212 GemmMicrokernelTester()
40213 .mr(6)
40214 .nr(16)
40215 .kr(1)
40216 .sr(1)
40217 .m(m)
40218 .n(n)
40219 .k(1)
40220 .iterations(1)
40221 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40222 }
40223 }
40224 }
40225
40226 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
40227 TEST_REQUIRES_X86_AVX512F;
40228 for (uint32_t m = 1; m <= 6; m++) {
40229 GemmMicrokernelTester()
40230 .mr(6)
40231 .nr(16)
40232 .kr(1)
40233 .sr(1)
40234 .m(m)
40235 .n(16)
40236 .k(1)
40237 .iterations(1)
40238 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40239 }
40240 }
40241
40242 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
40243 TEST_REQUIRES_X86_AVX512F;
40244 for (uint32_t n = 1; n <= 16; n++) {
40245 GemmMicrokernelTester()
40246 .mr(6)
40247 .nr(16)
40248 .kr(1)
40249 .sr(1)
40250 .m(6)
40251 .n(n)
40252 .k(1)
40253 .iterations(1)
40254 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40255 }
40256 }
40257
40258 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_gt_1) {
40259 TEST_REQUIRES_X86_AVX512F;
40260 for (size_t k = 2; k < 10; k++) {
40261 GemmMicrokernelTester()
40262 .mr(6)
40263 .nr(16)
40264 .kr(1)
40265 .sr(1)
40266 .m(6)
40267 .n(16)
40268 .k(k)
40269 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40270 }
40271 }
40272
40273 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
40274 TEST_REQUIRES_X86_AVX512F;
40275 for (size_t k = 2; k < 10; k++) {
40276 for (uint32_t m = 1; m <= 6; m++) {
40277 for (uint32_t n = 1; n <= 16; n++) {
40278 GemmMicrokernelTester()
40279 .mr(6)
40280 .nr(16)
40281 .kr(1)
40282 .sr(1)
40283 .m(m)
40284 .n(n)
40285 .k(k)
40286 .iterations(1)
40287 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40288 }
40289 }
40290 }
40291 }
40292
40293 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16) {
40294 TEST_REQUIRES_X86_AVX512F;
40295 for (uint32_t n = 17; n < 32; n++) {
40296 for (size_t k = 1; k <= 5; k += 2) {
40297 GemmMicrokernelTester()
40298 .mr(6)
40299 .nr(16)
40300 .kr(1)
40301 .sr(1)
40302 .m(6)
40303 .n(16)
40304 .k(k)
40305 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40306 }
40307 }
40308 }
40309
40310 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
40311 TEST_REQUIRES_X86_AVX512F;
40312 for (uint32_t n = 17; n < 32; n++) {
40313 for (size_t k = 1; k <= 5; k += 2) {
40314 GemmMicrokernelTester()
40315 .mr(6)
40316 .nr(16)
40317 .kr(1)
40318 .sr(1)
40319 .m(6)
40320 .n(16)
40321 .k(k)
40322 .cn_stride(19)
40323 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40324 }
40325 }
40326 }
40327
40328 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
40329 TEST_REQUIRES_X86_AVX512F;
40330 for (uint32_t n = 17; n < 32; n++) {
40331 for (size_t k = 1; k <= 5; k += 2) {
40332 for (uint32_t m = 1; m <= 6; m++) {
40333 GemmMicrokernelTester()
40334 .mr(6)
40335 .nr(16)
40336 .kr(1)
40337 .sr(1)
40338 .m(m)
40339 .n(n)
40340 .k(k)
40341 .iterations(1)
40342 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40343 }
40344 }
40345 }
40346 }
40347
40348 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16) {
40349 TEST_REQUIRES_X86_AVX512F;
40350 for (uint32_t n = 32; n <= 48; n += 16) {
40351 for (size_t k = 1; k <= 5; k += 2) {
40352 GemmMicrokernelTester()
40353 .mr(6)
40354 .nr(16)
40355 .kr(1)
40356 .sr(1)
40357 .m(6)
40358 .n(16)
40359 .k(k)
40360 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40361 }
40362 }
40363 }
40364
40365 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
40366 TEST_REQUIRES_X86_AVX512F;
40367 for (uint32_t n = 32; n <= 48; n += 16) {
40368 for (size_t k = 1; k <= 5; k += 2) {
40369 GemmMicrokernelTester()
40370 .mr(6)
40371 .nr(16)
40372 .kr(1)
40373 .sr(1)
40374 .m(6)
40375 .n(n)
40376 .k(k)
40377 .cn_stride(19)
40378 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40379 }
40380 }
40381 }
40382
40383 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
40384 TEST_REQUIRES_X86_AVX512F;
40385 for (uint32_t n = 32; n <= 48; n += 16) {
40386 for (size_t k = 1; k <= 5; k += 2) {
40387 for (uint32_t m = 1; m <= 6; m++) {
40388 GemmMicrokernelTester()
40389 .mr(6)
40390 .nr(16)
40391 .kr(1)
40392 .sr(1)
40393 .m(m)
40394 .n(n)
40395 .k(k)
40396 .iterations(1)
40397 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40398 }
40399 }
40400 }
40401 }
40402
40403 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, small_kernel) {
40404 TEST_REQUIRES_X86_AVX512F;
40405 for (size_t k = 1; k <= 5; k += 2) {
40406 GemmMicrokernelTester()
40407 .mr(6)
40408 .nr(16)
40409 .kr(1)
40410 .sr(1)
40411 .m(6)
40412 .n(16)
40413 .k(k)
40414 .ks(3)
40415 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40416 }
40417 }
40418
40419 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, small_kernel_subtile) {
40420 TEST_REQUIRES_X86_AVX512F;
40421 for (size_t k = 1; k <= 5; k += 2) {
40422 for (uint32_t m = 1; m <= 6; m++) {
40423 for (uint32_t n = 1; n <= 16; n++) {
40424 GemmMicrokernelTester()
40425 .mr(6)
40426 .nr(16)
40427 .kr(1)
40428 .sr(1)
40429 .m(m)
40430 .n(n)
40431 .k(k)
40432 .ks(3)
40433 .iterations(1)
40434 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40435 }
40436 }
40437 }
40438 }
40439
40440 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
40441 TEST_REQUIRES_X86_AVX512F;
40442 for (uint32_t n = 17; n < 32; n++) {
40443 for (size_t k = 1; k <= 5; k += 2) {
40444 GemmMicrokernelTester()
40445 .mr(6)
40446 .nr(16)
40447 .kr(1)
40448 .sr(1)
40449 .m(6)
40450 .n(16)
40451 .k(k)
40452 .ks(3)
40453 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40454 }
40455 }
40456 }
40457
40458 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
40459 TEST_REQUIRES_X86_AVX512F;
40460 for (uint32_t n = 32; n <= 48; n += 16) {
40461 for (size_t k = 1; k <= 5; k += 2) {
40462 GemmMicrokernelTester()
40463 .mr(6)
40464 .nr(16)
40465 .kr(1)
40466 .sr(1)
40467 .m(6)
40468 .n(16)
40469 .k(k)
40470 .ks(3)
40471 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40472 }
40473 }
40474 }
40475
40476 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
40477 TEST_REQUIRES_X86_AVX512F;
40478 for (size_t k = 1; k <= 5; k += 2) {
40479 for (uint32_t m = 1; m <= 6; m++) {
40480 for (uint32_t n = 1; n <= 16; n++) {
40481 GemmMicrokernelTester()
40482 .mr(6)
40483 .nr(16)
40484 .kr(1)
40485 .sr(1)
40486 .m(m)
40487 .n(n)
40488 .k(k)
40489 .cm_stride(19)
40490 .iterations(1)
40491 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40492 }
40493 }
40494 }
40495 }
40496
40497 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, a_offset) {
40498 TEST_REQUIRES_X86_AVX512F;
40499 for (size_t k = 1; k <= 5; k += 2) {
40500 GemmMicrokernelTester()
40501 .mr(6)
40502 .nr(16)
40503 .kr(1)
40504 .sr(1)
40505 .m(6)
40506 .n(16)
40507 .k(k)
40508 .ks(3)
40509 .a_offset(37)
40510 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40511 }
40512 }
40513
40514 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, zero) {
40515 TEST_REQUIRES_X86_AVX512F;
40516 for (uint32_t mz = 0; mz < 6; mz++) {
40517 for (size_t k = 1; k <= 5; k += 2) {
40518 GemmMicrokernelTester()
40519 .mr(6)
40520 .nr(16)
40521 .kr(1)
40522 .sr(1)
40523 .m(6)
40524 .n(16)
40525 .k(k)
40526 .ks(3)
40527 .a_offset(37)
40528 .zero_index(mz)
40529 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40530 }
40531 }
40532 }
40533
40534 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, qmin) {
40535 TEST_REQUIRES_X86_AVX512F;
40536 GemmMicrokernelTester()
40537 .mr(6)
40538 .nr(16)
40539 .kr(1)
40540 .sr(1)
40541 .m(6)
40542 .n(16)
40543 .k(1)
40544 .qmin(128)
40545 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40546 }
40547
40548 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, qmax) {
40549 TEST_REQUIRES_X86_AVX512F;
40550 GemmMicrokernelTester()
40551 .mr(6)
40552 .nr(16)
40553 .kr(1)
40554 .sr(1)
40555 .m(6)
40556 .n(16)
40557 .k(1)
40558 .qmax(128)
40559 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40560 }
40561
40562 TEST(F32_IGEMM_6X16__AVX512F_BROADCAST, strided_cm) {
40563 TEST_REQUIRES_X86_AVX512F;
40564 GemmMicrokernelTester()
40565 .mr(6)
40566 .nr(16)
40567 .kr(1)
40568 .sr(1)
40569 .m(6)
40570 .n(16)
40571 .k(1)
40572 .cm_stride(19)
40573 .Test(xnn_f32_igemm_ukernel_6x16__avx512f_broadcast);
40574 }
40575#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
40576
40577
40578#if XNN_ARCH_X86 || XNN_ARCH_X86_64
40579 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1) {
40580 TEST_REQUIRES_X86_AVX512F;
40581 GemmMicrokernelTester()
40582 .mr(7)
40583 .nr(16)
40584 .kr(1)
40585 .sr(1)
40586 .m(7)
40587 .n(16)
40588 .k(1)
40589 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40590 }
40591
40592 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, strided_cn) {
40593 TEST_REQUIRES_X86_AVX512F;
40594 GemmMicrokernelTester()
40595 .mr(7)
40596 .nr(16)
40597 .kr(1)
40598 .sr(1)
40599 .m(7)
40600 .n(16)
40601 .k(1)
40602 .cn_stride(19)
40603 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40604 }
40605
40606 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
40607 TEST_REQUIRES_X86_AVX512F;
40608 for (uint32_t m = 1; m <= 7; m++) {
40609 for (uint32_t n = 1; n <= 16; n++) {
40610 GemmMicrokernelTester()
40611 .mr(7)
40612 .nr(16)
40613 .kr(1)
40614 .sr(1)
40615 .m(m)
40616 .n(n)
40617 .k(1)
40618 .iterations(1)
40619 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40620 }
40621 }
40622 }
40623
40624 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
40625 TEST_REQUIRES_X86_AVX512F;
40626 for (uint32_t m = 1; m <= 7; m++) {
40627 GemmMicrokernelTester()
40628 .mr(7)
40629 .nr(16)
40630 .kr(1)
40631 .sr(1)
40632 .m(m)
40633 .n(16)
40634 .k(1)
40635 .iterations(1)
40636 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40637 }
40638 }
40639
40640 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
40641 TEST_REQUIRES_X86_AVX512F;
40642 for (uint32_t n = 1; n <= 16; n++) {
40643 GemmMicrokernelTester()
40644 .mr(7)
40645 .nr(16)
40646 .kr(1)
40647 .sr(1)
40648 .m(7)
40649 .n(n)
40650 .k(1)
40651 .iterations(1)
40652 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40653 }
40654 }
40655
40656 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_gt_1) {
40657 TEST_REQUIRES_X86_AVX512F;
40658 for (size_t k = 2; k < 10; k++) {
40659 GemmMicrokernelTester()
40660 .mr(7)
40661 .nr(16)
40662 .kr(1)
40663 .sr(1)
40664 .m(7)
40665 .n(16)
40666 .k(k)
40667 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40668 }
40669 }
40670
40671 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
40672 TEST_REQUIRES_X86_AVX512F;
40673 for (size_t k = 2; k < 10; k++) {
40674 for (uint32_t m = 1; m <= 7; m++) {
40675 for (uint32_t n = 1; n <= 16; n++) {
40676 GemmMicrokernelTester()
40677 .mr(7)
40678 .nr(16)
40679 .kr(1)
40680 .sr(1)
40681 .m(m)
40682 .n(n)
40683 .k(k)
40684 .iterations(1)
40685 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40686 }
40687 }
40688 }
40689 }
40690
40691 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16) {
40692 TEST_REQUIRES_X86_AVX512F;
40693 for (uint32_t n = 17; n < 32; n++) {
40694 for (size_t k = 1; k <= 5; k += 2) {
40695 GemmMicrokernelTester()
40696 .mr(7)
40697 .nr(16)
40698 .kr(1)
40699 .sr(1)
40700 .m(7)
40701 .n(16)
40702 .k(k)
40703 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40704 }
40705 }
40706 }
40707
40708 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
40709 TEST_REQUIRES_X86_AVX512F;
40710 for (uint32_t n = 17; n < 32; n++) {
40711 for (size_t k = 1; k <= 5; k += 2) {
40712 GemmMicrokernelTester()
40713 .mr(7)
40714 .nr(16)
40715 .kr(1)
40716 .sr(1)
40717 .m(7)
40718 .n(16)
40719 .k(k)
40720 .cn_stride(19)
40721 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40722 }
40723 }
40724 }
40725
40726 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
40727 TEST_REQUIRES_X86_AVX512F;
40728 for (uint32_t n = 17; n < 32; n++) {
40729 for (size_t k = 1; k <= 5; k += 2) {
40730 for (uint32_t m = 1; m <= 7; m++) {
40731 GemmMicrokernelTester()
40732 .mr(7)
40733 .nr(16)
40734 .kr(1)
40735 .sr(1)
40736 .m(m)
40737 .n(n)
40738 .k(k)
40739 .iterations(1)
40740 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40741 }
40742 }
40743 }
40744 }
40745
40746 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16) {
40747 TEST_REQUIRES_X86_AVX512F;
40748 for (uint32_t n = 32; n <= 48; n += 16) {
40749 for (size_t k = 1; k <= 5; k += 2) {
40750 GemmMicrokernelTester()
40751 .mr(7)
40752 .nr(16)
40753 .kr(1)
40754 .sr(1)
40755 .m(7)
40756 .n(16)
40757 .k(k)
40758 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40759 }
40760 }
40761 }
40762
40763 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
40764 TEST_REQUIRES_X86_AVX512F;
40765 for (uint32_t n = 32; n <= 48; n += 16) {
40766 for (size_t k = 1; k <= 5; k += 2) {
40767 GemmMicrokernelTester()
40768 .mr(7)
40769 .nr(16)
40770 .kr(1)
40771 .sr(1)
40772 .m(7)
40773 .n(n)
40774 .k(k)
40775 .cn_stride(19)
40776 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40777 }
40778 }
40779 }
40780
40781 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
40782 TEST_REQUIRES_X86_AVX512F;
40783 for (uint32_t n = 32; n <= 48; n += 16) {
40784 for (size_t k = 1; k <= 5; k += 2) {
40785 for (uint32_t m = 1; m <= 7; m++) {
40786 GemmMicrokernelTester()
40787 .mr(7)
40788 .nr(16)
40789 .kr(1)
40790 .sr(1)
40791 .m(m)
40792 .n(n)
40793 .k(k)
40794 .iterations(1)
40795 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40796 }
40797 }
40798 }
40799 }
40800
40801 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, small_kernel) {
40802 TEST_REQUIRES_X86_AVX512F;
40803 for (size_t k = 1; k <= 5; k += 2) {
40804 GemmMicrokernelTester()
40805 .mr(7)
40806 .nr(16)
40807 .kr(1)
40808 .sr(1)
40809 .m(7)
40810 .n(16)
40811 .k(k)
40812 .ks(3)
40813 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40814 }
40815 }
40816
40817 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, small_kernel_subtile) {
40818 TEST_REQUIRES_X86_AVX512F;
40819 for (size_t k = 1; k <= 5; k += 2) {
40820 for (uint32_t m = 1; m <= 7; m++) {
40821 for (uint32_t n = 1; n <= 16; n++) {
40822 GemmMicrokernelTester()
40823 .mr(7)
40824 .nr(16)
40825 .kr(1)
40826 .sr(1)
40827 .m(m)
40828 .n(n)
40829 .k(k)
40830 .ks(3)
40831 .iterations(1)
40832 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40833 }
40834 }
40835 }
40836 }
40837
40838 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
40839 TEST_REQUIRES_X86_AVX512F;
40840 for (uint32_t n = 17; n < 32; n++) {
40841 for (size_t k = 1; k <= 5; k += 2) {
40842 GemmMicrokernelTester()
40843 .mr(7)
40844 .nr(16)
40845 .kr(1)
40846 .sr(1)
40847 .m(7)
40848 .n(16)
40849 .k(k)
40850 .ks(3)
40851 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40852 }
40853 }
40854 }
40855
40856 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
40857 TEST_REQUIRES_X86_AVX512F;
40858 for (uint32_t n = 32; n <= 48; n += 16) {
40859 for (size_t k = 1; k <= 5; k += 2) {
40860 GemmMicrokernelTester()
40861 .mr(7)
40862 .nr(16)
40863 .kr(1)
40864 .sr(1)
40865 .m(7)
40866 .n(16)
40867 .k(k)
40868 .ks(3)
40869 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40870 }
40871 }
40872 }
40873
40874 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
40875 TEST_REQUIRES_X86_AVX512F;
40876 for (size_t k = 1; k <= 5; k += 2) {
40877 for (uint32_t m = 1; m <= 7; m++) {
40878 for (uint32_t n = 1; n <= 16; n++) {
40879 GemmMicrokernelTester()
40880 .mr(7)
40881 .nr(16)
40882 .kr(1)
40883 .sr(1)
40884 .m(m)
40885 .n(n)
40886 .k(k)
40887 .cm_stride(19)
40888 .iterations(1)
40889 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40890 }
40891 }
40892 }
40893 }
40894
40895 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, a_offset) {
40896 TEST_REQUIRES_X86_AVX512F;
40897 for (size_t k = 1; k <= 5; k += 2) {
40898 GemmMicrokernelTester()
40899 .mr(7)
40900 .nr(16)
40901 .kr(1)
40902 .sr(1)
40903 .m(7)
40904 .n(16)
40905 .k(k)
40906 .ks(3)
40907 .a_offset(37)
40908 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40909 }
40910 }
40911
40912 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, zero) {
40913 TEST_REQUIRES_X86_AVX512F;
40914 for (uint32_t mz = 0; mz < 7; mz++) {
40915 for (size_t k = 1; k <= 5; k += 2) {
40916 GemmMicrokernelTester()
40917 .mr(7)
40918 .nr(16)
40919 .kr(1)
40920 .sr(1)
40921 .m(7)
40922 .n(16)
40923 .k(k)
40924 .ks(3)
40925 .a_offset(37)
40926 .zero_index(mz)
40927 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40928 }
40929 }
40930 }
40931
40932 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, qmin) {
40933 TEST_REQUIRES_X86_AVX512F;
40934 GemmMicrokernelTester()
40935 .mr(7)
40936 .nr(16)
40937 .kr(1)
40938 .sr(1)
40939 .m(7)
40940 .n(16)
40941 .k(1)
40942 .qmin(128)
40943 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40944 }
40945
40946 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, qmax) {
40947 TEST_REQUIRES_X86_AVX512F;
40948 GemmMicrokernelTester()
40949 .mr(7)
40950 .nr(16)
40951 .kr(1)
40952 .sr(1)
40953 .m(7)
40954 .n(16)
40955 .k(1)
40956 .qmax(128)
40957 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40958 }
40959
40960 TEST(F32_IGEMM_7X16__AVX512F_BROADCAST, strided_cm) {
40961 TEST_REQUIRES_X86_AVX512F;
40962 GemmMicrokernelTester()
40963 .mr(7)
40964 .nr(16)
40965 .kr(1)
40966 .sr(1)
40967 .m(7)
40968 .n(16)
40969 .k(1)
40970 .cm_stride(19)
40971 .Test(xnn_f32_igemm_ukernel_7x16__avx512f_broadcast);
40972 }
40973#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
40974
40975
40976#if XNN_ARCH_X86 || XNN_ARCH_X86_64
40977 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1) {
40978 TEST_REQUIRES_X86_AVX512F;
40979 GemmMicrokernelTester()
40980 .mr(8)
40981 .nr(16)
40982 .kr(1)
40983 .sr(1)
40984 .m(8)
40985 .n(16)
40986 .k(1)
40987 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
40988 }
40989
40990 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, strided_cn) {
40991 TEST_REQUIRES_X86_AVX512F;
40992 GemmMicrokernelTester()
40993 .mr(8)
40994 .nr(16)
40995 .kr(1)
40996 .sr(1)
40997 .m(8)
40998 .n(16)
40999 .k(1)
41000 .cn_stride(19)
41001 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41002 }
41003
41004 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
41005 TEST_REQUIRES_X86_AVX512F;
41006 for (uint32_t m = 1; m <= 8; m++) {
41007 for (uint32_t n = 1; n <= 16; n++) {
41008 GemmMicrokernelTester()
41009 .mr(8)
41010 .nr(16)
41011 .kr(1)
41012 .sr(1)
41013 .m(m)
41014 .n(n)
41015 .k(1)
41016 .iterations(1)
41017 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41018 }
41019 }
41020 }
41021
41022 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
41023 TEST_REQUIRES_X86_AVX512F;
41024 for (uint32_t m = 1; m <= 8; m++) {
41025 GemmMicrokernelTester()
41026 .mr(8)
41027 .nr(16)
41028 .kr(1)
41029 .sr(1)
41030 .m(m)
41031 .n(16)
41032 .k(1)
41033 .iterations(1)
41034 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41035 }
41036 }
41037
41038 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
41039 TEST_REQUIRES_X86_AVX512F;
41040 for (uint32_t n = 1; n <= 16; n++) {
41041 GemmMicrokernelTester()
41042 .mr(8)
41043 .nr(16)
41044 .kr(1)
41045 .sr(1)
41046 .m(8)
41047 .n(n)
41048 .k(1)
41049 .iterations(1)
41050 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41051 }
41052 }
41053
41054 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_gt_1) {
41055 TEST_REQUIRES_X86_AVX512F;
41056 for (size_t k = 2; k < 10; k++) {
41057 GemmMicrokernelTester()
41058 .mr(8)
41059 .nr(16)
41060 .kr(1)
41061 .sr(1)
41062 .m(8)
41063 .n(16)
41064 .k(k)
41065 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41066 }
41067 }
41068
41069 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
41070 TEST_REQUIRES_X86_AVX512F;
41071 for (size_t k = 2; k < 10; k++) {
41072 for (uint32_t m = 1; m <= 8; m++) {
41073 for (uint32_t n = 1; n <= 16; n++) {
41074 GemmMicrokernelTester()
41075 .mr(8)
41076 .nr(16)
41077 .kr(1)
41078 .sr(1)
41079 .m(m)
41080 .n(n)
41081 .k(k)
41082 .iterations(1)
41083 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41084 }
41085 }
41086 }
41087 }
41088
41089 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16) {
41090 TEST_REQUIRES_X86_AVX512F;
41091 for (uint32_t n = 17; n < 32; n++) {
41092 for (size_t k = 1; k <= 5; k += 2) {
41093 GemmMicrokernelTester()
41094 .mr(8)
41095 .nr(16)
41096 .kr(1)
41097 .sr(1)
41098 .m(8)
41099 .n(16)
41100 .k(k)
41101 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41102 }
41103 }
41104 }
41105
41106 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
41107 TEST_REQUIRES_X86_AVX512F;
41108 for (uint32_t n = 17; n < 32; n++) {
41109 for (size_t k = 1; k <= 5; k += 2) {
41110 GemmMicrokernelTester()
41111 .mr(8)
41112 .nr(16)
41113 .kr(1)
41114 .sr(1)
41115 .m(8)
41116 .n(16)
41117 .k(k)
41118 .cn_stride(19)
41119 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41120 }
41121 }
41122 }
41123
41124 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
41125 TEST_REQUIRES_X86_AVX512F;
41126 for (uint32_t n = 17; n < 32; n++) {
41127 for (size_t k = 1; k <= 5; k += 2) {
41128 for (uint32_t m = 1; m <= 8; m++) {
41129 GemmMicrokernelTester()
41130 .mr(8)
41131 .nr(16)
41132 .kr(1)
41133 .sr(1)
41134 .m(m)
41135 .n(n)
41136 .k(k)
41137 .iterations(1)
41138 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41139 }
41140 }
41141 }
41142 }
41143
41144 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16) {
41145 TEST_REQUIRES_X86_AVX512F;
41146 for (uint32_t n = 32; n <= 48; n += 16) {
41147 for (size_t k = 1; k <= 5; k += 2) {
41148 GemmMicrokernelTester()
41149 .mr(8)
41150 .nr(16)
41151 .kr(1)
41152 .sr(1)
41153 .m(8)
41154 .n(16)
41155 .k(k)
41156 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41157 }
41158 }
41159 }
41160
41161 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
41162 TEST_REQUIRES_X86_AVX512F;
41163 for (uint32_t n = 32; n <= 48; n += 16) {
41164 for (size_t k = 1; k <= 5; k += 2) {
41165 GemmMicrokernelTester()
41166 .mr(8)
41167 .nr(16)
41168 .kr(1)
41169 .sr(1)
41170 .m(8)
41171 .n(n)
41172 .k(k)
41173 .cn_stride(19)
41174 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41175 }
41176 }
41177 }
41178
41179 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
41180 TEST_REQUIRES_X86_AVX512F;
41181 for (uint32_t n = 32; n <= 48; n += 16) {
41182 for (size_t k = 1; k <= 5; k += 2) {
41183 for (uint32_t m = 1; m <= 8; m++) {
41184 GemmMicrokernelTester()
41185 .mr(8)
41186 .nr(16)
41187 .kr(1)
41188 .sr(1)
41189 .m(m)
41190 .n(n)
41191 .k(k)
41192 .iterations(1)
41193 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41194 }
41195 }
41196 }
41197 }
41198
41199 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, small_kernel) {
41200 TEST_REQUIRES_X86_AVX512F;
41201 for (size_t k = 1; k <= 5; k += 2) {
41202 GemmMicrokernelTester()
41203 .mr(8)
41204 .nr(16)
41205 .kr(1)
41206 .sr(1)
41207 .m(8)
41208 .n(16)
41209 .k(k)
41210 .ks(3)
41211 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41212 }
41213 }
41214
41215 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, small_kernel_subtile) {
41216 TEST_REQUIRES_X86_AVX512F;
41217 for (size_t k = 1; k <= 5; k += 2) {
41218 for (uint32_t m = 1; m <= 8; m++) {
41219 for (uint32_t n = 1; n <= 16; n++) {
41220 GemmMicrokernelTester()
41221 .mr(8)
41222 .nr(16)
41223 .kr(1)
41224 .sr(1)
41225 .m(m)
41226 .n(n)
41227 .k(k)
41228 .ks(3)
41229 .iterations(1)
41230 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41231 }
41232 }
41233 }
41234 }
41235
41236 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_gt_16_small_kernel) {
41237 TEST_REQUIRES_X86_AVX512F;
41238 for (uint32_t n = 17; n < 32; n++) {
41239 for (size_t k = 1; k <= 5; k += 2) {
41240 GemmMicrokernelTester()
41241 .mr(8)
41242 .nr(16)
41243 .kr(1)
41244 .sr(1)
41245 .m(8)
41246 .n(16)
41247 .k(k)
41248 .ks(3)
41249 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41250 }
41251 }
41252 }
41253
41254 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, n_div_16_small_kernel) {
41255 TEST_REQUIRES_X86_AVX512F;
41256 for (uint32_t n = 32; n <= 48; n += 16) {
41257 for (size_t k = 1; k <= 5; k += 2) {
41258 GemmMicrokernelTester()
41259 .mr(8)
41260 .nr(16)
41261 .kr(1)
41262 .sr(1)
41263 .m(8)
41264 .n(16)
41265 .k(k)
41266 .ks(3)
41267 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41268 }
41269 }
41270 }
41271
41272 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
41273 TEST_REQUIRES_X86_AVX512F;
41274 for (size_t k = 1; k <= 5; k += 2) {
41275 for (uint32_t m = 1; m <= 8; m++) {
41276 for (uint32_t n = 1; n <= 16; n++) {
41277 GemmMicrokernelTester()
41278 .mr(8)
41279 .nr(16)
41280 .kr(1)
41281 .sr(1)
41282 .m(m)
41283 .n(n)
41284 .k(k)
41285 .cm_stride(19)
41286 .iterations(1)
41287 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41288 }
41289 }
41290 }
41291 }
41292
41293 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, a_offset) {
41294 TEST_REQUIRES_X86_AVX512F;
41295 for (size_t k = 1; k <= 5; k += 2) {
41296 GemmMicrokernelTester()
41297 .mr(8)
41298 .nr(16)
41299 .kr(1)
41300 .sr(1)
41301 .m(8)
41302 .n(16)
41303 .k(k)
41304 .ks(3)
41305 .a_offset(43)
41306 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41307 }
41308 }
41309
41310 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, zero) {
41311 TEST_REQUIRES_X86_AVX512F;
41312 for (uint32_t mz = 0; mz < 8; mz++) {
41313 for (size_t k = 1; k <= 5; k += 2) {
41314 GemmMicrokernelTester()
41315 .mr(8)
41316 .nr(16)
41317 .kr(1)
41318 .sr(1)
41319 .m(8)
41320 .n(16)
41321 .k(k)
41322 .ks(3)
41323 .a_offset(43)
41324 .zero_index(mz)
41325 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41326 }
41327 }
41328 }
41329
41330 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, qmin) {
41331 TEST_REQUIRES_X86_AVX512F;
41332 GemmMicrokernelTester()
41333 .mr(8)
41334 .nr(16)
41335 .kr(1)
41336 .sr(1)
41337 .m(8)
41338 .n(16)
41339 .k(1)
41340 .qmin(128)
41341 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41342 }
41343
41344 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, qmax) {
41345 TEST_REQUIRES_X86_AVX512F;
41346 GemmMicrokernelTester()
41347 .mr(8)
41348 .nr(16)
41349 .kr(1)
41350 .sr(1)
41351 .m(8)
41352 .n(16)
41353 .k(1)
41354 .qmax(128)
41355 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41356 }
41357
41358 TEST(F32_IGEMM_8X16__AVX512F_BROADCAST, strided_cm) {
41359 TEST_REQUIRES_X86_AVX512F;
41360 GemmMicrokernelTester()
41361 .mr(8)
41362 .nr(16)
41363 .kr(1)
41364 .sr(1)
41365 .m(8)
41366 .n(16)
41367 .k(1)
41368 .cm_stride(19)
41369 .Test(xnn_f32_igemm_ukernel_8x16__avx512f_broadcast);
41370 }
41371#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
41372
41373
41374#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41375 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1) {
41376 TEST_REQUIRES_PSIMD;
41377 GemmMicrokernelTester()
41378 .mr(1)
41379 .nr(8)
41380 .kr(1)
41381 .sr(1)
41382 .m(1)
41383 .n(8)
41384 .k(1)
41385 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41386 }
41387
41388 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, strided_cn) {
41389 TEST_REQUIRES_PSIMD;
41390 GemmMicrokernelTester()
41391 .mr(1)
41392 .nr(8)
41393 .kr(1)
41394 .sr(1)
41395 .m(1)
41396 .n(8)
41397 .k(1)
41398 .cn_stride(11)
41399 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41400 }
41401
41402 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
41403 TEST_REQUIRES_PSIMD;
41404 for (uint32_t m = 1; m <= 1; m++) {
41405 for (uint32_t n = 1; n <= 8; n++) {
41406 GemmMicrokernelTester()
41407 .mr(1)
41408 .nr(8)
41409 .kr(1)
41410 .sr(1)
41411 .m(m)
41412 .n(n)
41413 .k(1)
41414 .iterations(1)
41415 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41416 }
41417 }
41418 }
41419
41420 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
41421 TEST_REQUIRES_PSIMD;
41422 for (uint32_t m = 1; m <= 1; m++) {
41423 GemmMicrokernelTester()
41424 .mr(1)
41425 .nr(8)
41426 .kr(1)
41427 .sr(1)
41428 .m(m)
41429 .n(8)
41430 .k(1)
41431 .iterations(1)
41432 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41433 }
41434 }
41435
41436 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
41437 TEST_REQUIRES_PSIMD;
41438 for (uint32_t n = 1; n <= 8; n++) {
41439 GemmMicrokernelTester()
41440 .mr(1)
41441 .nr(8)
41442 .kr(1)
41443 .sr(1)
41444 .m(1)
41445 .n(n)
41446 .k(1)
41447 .iterations(1)
41448 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41449 }
41450 }
41451
41452 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_gt_1) {
41453 TEST_REQUIRES_PSIMD;
41454 for (size_t k = 2; k < 10; k++) {
41455 GemmMicrokernelTester()
41456 .mr(1)
41457 .nr(8)
41458 .kr(1)
41459 .sr(1)
41460 .m(1)
41461 .n(8)
41462 .k(k)
41463 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41464 }
41465 }
41466
41467 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
41468 TEST_REQUIRES_PSIMD;
41469 for (size_t k = 2; k < 10; k++) {
41470 for (uint32_t m = 1; m <= 1; m++) {
41471 for (uint32_t n = 1; n <= 8; n++) {
41472 GemmMicrokernelTester()
41473 .mr(1)
41474 .nr(8)
41475 .kr(1)
41476 .sr(1)
41477 .m(m)
41478 .n(n)
41479 .k(k)
41480 .iterations(1)
41481 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41482 }
41483 }
41484 }
41485 }
41486
41487 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8) {
41488 TEST_REQUIRES_PSIMD;
41489 for (uint32_t n = 9; n < 16; n++) {
41490 for (size_t k = 1; k <= 5; k += 2) {
41491 GemmMicrokernelTester()
41492 .mr(1)
41493 .nr(8)
41494 .kr(1)
41495 .sr(1)
41496 .m(1)
41497 .n(8)
41498 .k(k)
41499 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41500 }
41501 }
41502 }
41503
41504 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
41505 TEST_REQUIRES_PSIMD;
41506 for (uint32_t n = 9; n < 16; n++) {
41507 for (size_t k = 1; k <= 5; k += 2) {
41508 GemmMicrokernelTester()
41509 .mr(1)
41510 .nr(8)
41511 .kr(1)
41512 .sr(1)
41513 .m(1)
41514 .n(8)
41515 .k(k)
41516 .cn_stride(11)
41517 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41518 }
41519 }
41520 }
41521
41522 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
41523 TEST_REQUIRES_PSIMD;
41524 for (uint32_t n = 9; n < 16; n++) {
41525 for (size_t k = 1; k <= 5; k += 2) {
41526 for (uint32_t m = 1; m <= 1; m++) {
41527 GemmMicrokernelTester()
41528 .mr(1)
41529 .nr(8)
41530 .kr(1)
41531 .sr(1)
41532 .m(m)
41533 .n(n)
41534 .k(k)
41535 .iterations(1)
41536 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41537 }
41538 }
41539 }
41540 }
41541
41542 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8) {
41543 TEST_REQUIRES_PSIMD;
41544 for (uint32_t n = 16; n <= 24; n += 8) {
41545 for (size_t k = 1; k <= 5; k += 2) {
41546 GemmMicrokernelTester()
41547 .mr(1)
41548 .nr(8)
41549 .kr(1)
41550 .sr(1)
41551 .m(1)
41552 .n(8)
41553 .k(k)
41554 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41555 }
41556 }
41557 }
41558
41559 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
41560 TEST_REQUIRES_PSIMD;
41561 for (uint32_t n = 16; n <= 24; n += 8) {
41562 for (size_t k = 1; k <= 5; k += 2) {
41563 GemmMicrokernelTester()
41564 .mr(1)
41565 .nr(8)
41566 .kr(1)
41567 .sr(1)
41568 .m(1)
41569 .n(n)
41570 .k(k)
41571 .cn_stride(11)
41572 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41573 }
41574 }
41575 }
41576
41577 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
41578 TEST_REQUIRES_PSIMD;
41579 for (uint32_t n = 16; n <= 24; n += 8) {
41580 for (size_t k = 1; k <= 5; k += 2) {
41581 for (uint32_t m = 1; m <= 1; m++) {
41582 GemmMicrokernelTester()
41583 .mr(1)
41584 .nr(8)
41585 .kr(1)
41586 .sr(1)
41587 .m(m)
41588 .n(n)
41589 .k(k)
41590 .iterations(1)
41591 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41592 }
41593 }
41594 }
41595 }
41596
41597 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, small_kernel) {
41598 TEST_REQUIRES_PSIMD;
41599 for (size_t k = 1; k <= 5; k += 2) {
41600 GemmMicrokernelTester()
41601 .mr(1)
41602 .nr(8)
41603 .kr(1)
41604 .sr(1)
41605 .m(1)
41606 .n(8)
41607 .k(k)
41608 .ks(3)
41609 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41610 }
41611 }
41612
41613 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, small_kernel_subtile) {
41614 TEST_REQUIRES_PSIMD;
41615 for (size_t k = 1; k <= 5; k += 2) {
41616 for (uint32_t m = 1; m <= 1; m++) {
41617 for (uint32_t n = 1; n <= 8; n++) {
41618 GemmMicrokernelTester()
41619 .mr(1)
41620 .nr(8)
41621 .kr(1)
41622 .sr(1)
41623 .m(m)
41624 .n(n)
41625 .k(k)
41626 .ks(3)
41627 .iterations(1)
41628 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41629 }
41630 }
41631 }
41632 }
41633
41634 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_small_kernel) {
41635 TEST_REQUIRES_PSIMD;
41636 for (uint32_t n = 9; n < 16; n++) {
41637 for (size_t k = 1; k <= 5; k += 2) {
41638 GemmMicrokernelTester()
41639 .mr(1)
41640 .nr(8)
41641 .kr(1)
41642 .sr(1)
41643 .m(1)
41644 .n(8)
41645 .k(k)
41646 .ks(3)
41647 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41648 }
41649 }
41650 }
41651
41652 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, n_div_8_small_kernel) {
41653 TEST_REQUIRES_PSIMD;
41654 for (uint32_t n = 16; n <= 24; n += 8) {
41655 for (size_t k = 1; k <= 5; k += 2) {
41656 GemmMicrokernelTester()
41657 .mr(1)
41658 .nr(8)
41659 .kr(1)
41660 .sr(1)
41661 .m(1)
41662 .n(8)
41663 .k(k)
41664 .ks(3)
41665 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41666 }
41667 }
41668 }
41669
41670 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
41671 TEST_REQUIRES_PSIMD;
41672 for (size_t k = 1; k <= 5; k += 2) {
41673 for (uint32_t m = 1; m <= 1; m++) {
41674 for (uint32_t n = 1; n <= 8; n++) {
41675 GemmMicrokernelTester()
41676 .mr(1)
41677 .nr(8)
41678 .kr(1)
41679 .sr(1)
41680 .m(m)
41681 .n(n)
41682 .k(k)
41683 .cm_stride(11)
41684 .iterations(1)
41685 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41686 }
41687 }
41688 }
41689 }
41690
41691 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, a_offset) {
41692 TEST_REQUIRES_PSIMD;
41693 for (size_t k = 1; k <= 5; k += 2) {
41694 GemmMicrokernelTester()
41695 .mr(1)
41696 .nr(8)
41697 .kr(1)
41698 .sr(1)
41699 .m(1)
41700 .n(8)
41701 .k(k)
41702 .ks(3)
41703 .a_offset(7)
41704 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41705 }
41706 }
41707
41708 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, zero) {
41709 TEST_REQUIRES_PSIMD;
41710 for (uint32_t mz = 0; mz < 1; mz++) {
41711 for (size_t k = 1; k <= 5; k += 2) {
41712 GemmMicrokernelTester()
41713 .mr(1)
41714 .nr(8)
41715 .kr(1)
41716 .sr(1)
41717 .m(1)
41718 .n(8)
41719 .k(k)
41720 .ks(3)
41721 .a_offset(7)
41722 .zero_index(mz)
41723 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41724 }
41725 }
41726 }
41727
41728 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, qmin) {
41729 TEST_REQUIRES_PSIMD;
41730 GemmMicrokernelTester()
41731 .mr(1)
41732 .nr(8)
41733 .kr(1)
41734 .sr(1)
41735 .m(1)
41736 .n(8)
41737 .k(1)
41738 .qmin(128)
41739 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41740 }
41741
41742 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, qmax) {
41743 TEST_REQUIRES_PSIMD;
41744 GemmMicrokernelTester()
41745 .mr(1)
41746 .nr(8)
41747 .kr(1)
41748 .sr(1)
41749 .m(1)
41750 .n(8)
41751 .k(1)
41752 .qmax(128)
41753 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41754 }
41755
41756 TEST(F32_IGEMM_1X8__PSIMD_LOADSPLAT, strided_cm) {
41757 TEST_REQUIRES_PSIMD;
41758 GemmMicrokernelTester()
41759 .mr(1)
41760 .nr(8)
41761 .kr(1)
41762 .sr(1)
41763 .m(1)
41764 .n(8)
41765 .k(1)
41766 .cm_stride(11)
41767 .Test(xnn_f32_igemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41768 }
41769#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41770
41771
41772#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
41773 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1) {
41774 TEST_REQUIRES_PSIMD;
41775 GemmMicrokernelTester()
41776 .mr(4)
41777 .nr(8)
41778 .kr(1)
41779 .sr(1)
41780 .m(4)
41781 .n(8)
41782 .k(1)
41783 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41784 }
41785
41786 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, strided_cn) {
41787 TEST_REQUIRES_PSIMD;
41788 GemmMicrokernelTester()
41789 .mr(4)
41790 .nr(8)
41791 .kr(1)
41792 .sr(1)
41793 .m(4)
41794 .n(8)
41795 .k(1)
41796 .cn_stride(11)
41797 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41798 }
41799
41800 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
41801 TEST_REQUIRES_PSIMD;
41802 for (uint32_t m = 1; m <= 4; m++) {
41803 for (uint32_t n = 1; n <= 8; n++) {
41804 GemmMicrokernelTester()
41805 .mr(4)
41806 .nr(8)
41807 .kr(1)
41808 .sr(1)
41809 .m(m)
41810 .n(n)
41811 .k(1)
41812 .iterations(1)
41813 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41814 }
41815 }
41816 }
41817
41818 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
41819 TEST_REQUIRES_PSIMD;
41820 for (uint32_t m = 1; m <= 4; m++) {
41821 GemmMicrokernelTester()
41822 .mr(4)
41823 .nr(8)
41824 .kr(1)
41825 .sr(1)
41826 .m(m)
41827 .n(8)
41828 .k(1)
41829 .iterations(1)
41830 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41831 }
41832 }
41833
41834 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
41835 TEST_REQUIRES_PSIMD;
41836 for (uint32_t n = 1; n <= 8; n++) {
41837 GemmMicrokernelTester()
41838 .mr(4)
41839 .nr(8)
41840 .kr(1)
41841 .sr(1)
41842 .m(4)
41843 .n(n)
41844 .k(1)
41845 .iterations(1)
41846 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41847 }
41848 }
41849
41850 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_gt_1) {
41851 TEST_REQUIRES_PSIMD;
41852 for (size_t k = 2; k < 10; k++) {
41853 GemmMicrokernelTester()
41854 .mr(4)
41855 .nr(8)
41856 .kr(1)
41857 .sr(1)
41858 .m(4)
41859 .n(8)
41860 .k(k)
41861 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41862 }
41863 }
41864
41865 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
41866 TEST_REQUIRES_PSIMD;
41867 for (size_t k = 2; k < 10; k++) {
41868 for (uint32_t m = 1; m <= 4; m++) {
41869 for (uint32_t n = 1; n <= 8; n++) {
41870 GemmMicrokernelTester()
41871 .mr(4)
41872 .nr(8)
41873 .kr(1)
41874 .sr(1)
41875 .m(m)
41876 .n(n)
41877 .k(k)
41878 .iterations(1)
41879 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41880 }
41881 }
41882 }
41883 }
41884
41885 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8) {
41886 TEST_REQUIRES_PSIMD;
41887 for (uint32_t n = 9; n < 16; n++) {
41888 for (size_t k = 1; k <= 5; k += 2) {
41889 GemmMicrokernelTester()
41890 .mr(4)
41891 .nr(8)
41892 .kr(1)
41893 .sr(1)
41894 .m(4)
41895 .n(8)
41896 .k(k)
41897 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41898 }
41899 }
41900 }
41901
41902 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
41903 TEST_REQUIRES_PSIMD;
41904 for (uint32_t n = 9; n < 16; n++) {
41905 for (size_t k = 1; k <= 5; k += 2) {
41906 GemmMicrokernelTester()
41907 .mr(4)
41908 .nr(8)
41909 .kr(1)
41910 .sr(1)
41911 .m(4)
41912 .n(8)
41913 .k(k)
41914 .cn_stride(11)
41915 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41916 }
41917 }
41918 }
41919
41920 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
41921 TEST_REQUIRES_PSIMD;
41922 for (uint32_t n = 9; n < 16; n++) {
41923 for (size_t k = 1; k <= 5; k += 2) {
41924 for (uint32_t m = 1; m <= 4; m++) {
41925 GemmMicrokernelTester()
41926 .mr(4)
41927 .nr(8)
41928 .kr(1)
41929 .sr(1)
41930 .m(m)
41931 .n(n)
41932 .k(k)
41933 .iterations(1)
41934 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41935 }
41936 }
41937 }
41938 }
41939
41940 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8) {
41941 TEST_REQUIRES_PSIMD;
41942 for (uint32_t n = 16; n <= 24; n += 8) {
41943 for (size_t k = 1; k <= 5; k += 2) {
41944 GemmMicrokernelTester()
41945 .mr(4)
41946 .nr(8)
41947 .kr(1)
41948 .sr(1)
41949 .m(4)
41950 .n(8)
41951 .k(k)
41952 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41953 }
41954 }
41955 }
41956
41957 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
41958 TEST_REQUIRES_PSIMD;
41959 for (uint32_t n = 16; n <= 24; n += 8) {
41960 for (size_t k = 1; k <= 5; k += 2) {
41961 GemmMicrokernelTester()
41962 .mr(4)
41963 .nr(8)
41964 .kr(1)
41965 .sr(1)
41966 .m(4)
41967 .n(n)
41968 .k(k)
41969 .cn_stride(11)
41970 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41971 }
41972 }
41973 }
41974
41975 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
41976 TEST_REQUIRES_PSIMD;
41977 for (uint32_t n = 16; n <= 24; n += 8) {
41978 for (size_t k = 1; k <= 5; k += 2) {
41979 for (uint32_t m = 1; m <= 4; m++) {
41980 GemmMicrokernelTester()
41981 .mr(4)
41982 .nr(8)
41983 .kr(1)
41984 .sr(1)
41985 .m(m)
41986 .n(n)
41987 .k(k)
41988 .iterations(1)
41989 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
41990 }
41991 }
41992 }
41993 }
41994
41995 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, small_kernel) {
41996 TEST_REQUIRES_PSIMD;
41997 for (size_t k = 1; k <= 5; k += 2) {
41998 GemmMicrokernelTester()
41999 .mr(4)
42000 .nr(8)
42001 .kr(1)
42002 .sr(1)
42003 .m(4)
42004 .n(8)
42005 .k(k)
42006 .ks(3)
42007 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42008 }
42009 }
42010
42011 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, small_kernel_subtile) {
42012 TEST_REQUIRES_PSIMD;
42013 for (size_t k = 1; k <= 5; k += 2) {
42014 for (uint32_t m = 1; m <= 4; m++) {
42015 for (uint32_t n = 1; n <= 8; n++) {
42016 GemmMicrokernelTester()
42017 .mr(4)
42018 .nr(8)
42019 .kr(1)
42020 .sr(1)
42021 .m(m)
42022 .n(n)
42023 .k(k)
42024 .ks(3)
42025 .iterations(1)
42026 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42027 }
42028 }
42029 }
42030 }
42031
42032 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_small_kernel) {
42033 TEST_REQUIRES_PSIMD;
42034 for (uint32_t n = 9; n < 16; n++) {
42035 for (size_t k = 1; k <= 5; k += 2) {
42036 GemmMicrokernelTester()
42037 .mr(4)
42038 .nr(8)
42039 .kr(1)
42040 .sr(1)
42041 .m(4)
42042 .n(8)
42043 .k(k)
42044 .ks(3)
42045 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42046 }
42047 }
42048 }
42049
42050 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, n_div_8_small_kernel) {
42051 TEST_REQUIRES_PSIMD;
42052 for (uint32_t n = 16; n <= 24; n += 8) {
42053 for (size_t k = 1; k <= 5; k += 2) {
42054 GemmMicrokernelTester()
42055 .mr(4)
42056 .nr(8)
42057 .kr(1)
42058 .sr(1)
42059 .m(4)
42060 .n(8)
42061 .k(k)
42062 .ks(3)
42063 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42064 }
42065 }
42066 }
42067
42068 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
42069 TEST_REQUIRES_PSIMD;
42070 for (size_t k = 1; k <= 5; k += 2) {
42071 for (uint32_t m = 1; m <= 4; m++) {
42072 for (uint32_t n = 1; n <= 8; n++) {
42073 GemmMicrokernelTester()
42074 .mr(4)
42075 .nr(8)
42076 .kr(1)
42077 .sr(1)
42078 .m(m)
42079 .n(n)
42080 .k(k)
42081 .cm_stride(11)
42082 .iterations(1)
42083 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42084 }
42085 }
42086 }
42087 }
42088
42089 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, a_offset) {
42090 TEST_REQUIRES_PSIMD;
42091 for (size_t k = 1; k <= 5; k += 2) {
42092 GemmMicrokernelTester()
42093 .mr(4)
42094 .nr(8)
42095 .kr(1)
42096 .sr(1)
42097 .m(4)
42098 .n(8)
42099 .k(k)
42100 .ks(3)
42101 .a_offset(23)
42102 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42103 }
42104 }
42105
42106 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, zero) {
42107 TEST_REQUIRES_PSIMD;
42108 for (uint32_t mz = 0; mz < 4; mz++) {
42109 for (size_t k = 1; k <= 5; k += 2) {
42110 GemmMicrokernelTester()
42111 .mr(4)
42112 .nr(8)
42113 .kr(1)
42114 .sr(1)
42115 .m(4)
42116 .n(8)
42117 .k(k)
42118 .ks(3)
42119 .a_offset(23)
42120 .zero_index(mz)
42121 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42122 }
42123 }
42124 }
42125
42126 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, qmin) {
42127 TEST_REQUIRES_PSIMD;
42128 GemmMicrokernelTester()
42129 .mr(4)
42130 .nr(8)
42131 .kr(1)
42132 .sr(1)
42133 .m(4)
42134 .n(8)
42135 .k(1)
42136 .qmin(128)
42137 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42138 }
42139
42140 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, qmax) {
42141 TEST_REQUIRES_PSIMD;
42142 GemmMicrokernelTester()
42143 .mr(4)
42144 .nr(8)
42145 .kr(1)
42146 .sr(1)
42147 .m(4)
42148 .n(8)
42149 .k(1)
42150 .qmax(128)
42151 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42152 }
42153
42154 TEST(F32_IGEMM_4X8__PSIMD_LOADSPLAT, strided_cm) {
42155 TEST_REQUIRES_PSIMD;
42156 GemmMicrokernelTester()
42157 .mr(4)
42158 .nr(8)
42159 .kr(1)
42160 .sr(1)
42161 .m(4)
42162 .n(8)
42163 .k(1)
42164 .cm_stride(11)
42165 .Test(xnn_f32_igemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42166 }
42167#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42168
42169
42170#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42171 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1) {
42172 TEST_REQUIRES_PSIMD;
42173 GemmMicrokernelTester()
42174 .mr(6)
42175 .nr(8)
42176 .kr(1)
42177 .sr(1)
42178 .m(6)
42179 .n(8)
42180 .k(1)
42181 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42182 }
42183
42184 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, strided_cn) {
42185 TEST_REQUIRES_PSIMD;
42186 GemmMicrokernelTester()
42187 .mr(6)
42188 .nr(8)
42189 .kr(1)
42190 .sr(1)
42191 .m(6)
42192 .n(8)
42193 .k(1)
42194 .cn_stride(11)
42195 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42196 }
42197
42198 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
42199 TEST_REQUIRES_PSIMD;
42200 for (uint32_t m = 1; m <= 6; m++) {
42201 for (uint32_t n = 1; n <= 8; n++) {
42202 GemmMicrokernelTester()
42203 .mr(6)
42204 .nr(8)
42205 .kr(1)
42206 .sr(1)
42207 .m(m)
42208 .n(n)
42209 .k(1)
42210 .iterations(1)
42211 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42212 }
42213 }
42214 }
42215
42216 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
42217 TEST_REQUIRES_PSIMD;
42218 for (uint32_t m = 1; m <= 6; m++) {
42219 GemmMicrokernelTester()
42220 .mr(6)
42221 .nr(8)
42222 .kr(1)
42223 .sr(1)
42224 .m(m)
42225 .n(8)
42226 .k(1)
42227 .iterations(1)
42228 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42229 }
42230 }
42231
42232 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
42233 TEST_REQUIRES_PSIMD;
42234 for (uint32_t n = 1; n <= 8; n++) {
42235 GemmMicrokernelTester()
42236 .mr(6)
42237 .nr(8)
42238 .kr(1)
42239 .sr(1)
42240 .m(6)
42241 .n(n)
42242 .k(1)
42243 .iterations(1)
42244 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42245 }
42246 }
42247
42248 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_gt_1) {
42249 TEST_REQUIRES_PSIMD;
42250 for (size_t k = 2; k < 10; k++) {
42251 GemmMicrokernelTester()
42252 .mr(6)
42253 .nr(8)
42254 .kr(1)
42255 .sr(1)
42256 .m(6)
42257 .n(8)
42258 .k(k)
42259 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42260 }
42261 }
42262
42263 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
42264 TEST_REQUIRES_PSIMD;
42265 for (size_t k = 2; k < 10; k++) {
42266 for (uint32_t m = 1; m <= 6; m++) {
42267 for (uint32_t n = 1; n <= 8; n++) {
42268 GemmMicrokernelTester()
42269 .mr(6)
42270 .nr(8)
42271 .kr(1)
42272 .sr(1)
42273 .m(m)
42274 .n(n)
42275 .k(k)
42276 .iterations(1)
42277 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42278 }
42279 }
42280 }
42281 }
42282
42283 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8) {
42284 TEST_REQUIRES_PSIMD;
42285 for (uint32_t n = 9; n < 16; n++) {
42286 for (size_t k = 1; k <= 5; k += 2) {
42287 GemmMicrokernelTester()
42288 .mr(6)
42289 .nr(8)
42290 .kr(1)
42291 .sr(1)
42292 .m(6)
42293 .n(8)
42294 .k(k)
42295 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42296 }
42297 }
42298 }
42299
42300 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
42301 TEST_REQUIRES_PSIMD;
42302 for (uint32_t n = 9; n < 16; n++) {
42303 for (size_t k = 1; k <= 5; k += 2) {
42304 GemmMicrokernelTester()
42305 .mr(6)
42306 .nr(8)
42307 .kr(1)
42308 .sr(1)
42309 .m(6)
42310 .n(8)
42311 .k(k)
42312 .cn_stride(11)
42313 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42314 }
42315 }
42316 }
42317
42318 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
42319 TEST_REQUIRES_PSIMD;
42320 for (uint32_t n = 9; n < 16; n++) {
42321 for (size_t k = 1; k <= 5; k += 2) {
42322 for (uint32_t m = 1; m <= 6; m++) {
42323 GemmMicrokernelTester()
42324 .mr(6)
42325 .nr(8)
42326 .kr(1)
42327 .sr(1)
42328 .m(m)
42329 .n(n)
42330 .k(k)
42331 .iterations(1)
42332 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42333 }
42334 }
42335 }
42336 }
42337
42338 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8) {
42339 TEST_REQUIRES_PSIMD;
42340 for (uint32_t n = 16; n <= 24; n += 8) {
42341 for (size_t k = 1; k <= 5; k += 2) {
42342 GemmMicrokernelTester()
42343 .mr(6)
42344 .nr(8)
42345 .kr(1)
42346 .sr(1)
42347 .m(6)
42348 .n(8)
42349 .k(k)
42350 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42351 }
42352 }
42353 }
42354
42355 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
42356 TEST_REQUIRES_PSIMD;
42357 for (uint32_t n = 16; n <= 24; n += 8) {
42358 for (size_t k = 1; k <= 5; k += 2) {
42359 GemmMicrokernelTester()
42360 .mr(6)
42361 .nr(8)
42362 .kr(1)
42363 .sr(1)
42364 .m(6)
42365 .n(n)
42366 .k(k)
42367 .cn_stride(11)
42368 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42369 }
42370 }
42371 }
42372
42373 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
42374 TEST_REQUIRES_PSIMD;
42375 for (uint32_t n = 16; n <= 24; n += 8) {
42376 for (size_t k = 1; k <= 5; k += 2) {
42377 for (uint32_t m = 1; m <= 6; m++) {
42378 GemmMicrokernelTester()
42379 .mr(6)
42380 .nr(8)
42381 .kr(1)
42382 .sr(1)
42383 .m(m)
42384 .n(n)
42385 .k(k)
42386 .iterations(1)
42387 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42388 }
42389 }
42390 }
42391 }
42392
42393 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, small_kernel) {
42394 TEST_REQUIRES_PSIMD;
42395 for (size_t k = 1; k <= 5; k += 2) {
42396 GemmMicrokernelTester()
42397 .mr(6)
42398 .nr(8)
42399 .kr(1)
42400 .sr(1)
42401 .m(6)
42402 .n(8)
42403 .k(k)
42404 .ks(3)
42405 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42406 }
42407 }
42408
42409 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, small_kernel_subtile) {
42410 TEST_REQUIRES_PSIMD;
42411 for (size_t k = 1; k <= 5; k += 2) {
42412 for (uint32_t m = 1; m <= 6; m++) {
42413 for (uint32_t n = 1; n <= 8; n++) {
42414 GemmMicrokernelTester()
42415 .mr(6)
42416 .nr(8)
42417 .kr(1)
42418 .sr(1)
42419 .m(m)
42420 .n(n)
42421 .k(k)
42422 .ks(3)
42423 .iterations(1)
42424 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42425 }
42426 }
42427 }
42428 }
42429
42430 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_small_kernel) {
42431 TEST_REQUIRES_PSIMD;
42432 for (uint32_t n = 9; n < 16; n++) {
42433 for (size_t k = 1; k <= 5; k += 2) {
42434 GemmMicrokernelTester()
42435 .mr(6)
42436 .nr(8)
42437 .kr(1)
42438 .sr(1)
42439 .m(6)
42440 .n(8)
42441 .k(k)
42442 .ks(3)
42443 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42444 }
42445 }
42446 }
42447
42448 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, n_div_8_small_kernel) {
42449 TEST_REQUIRES_PSIMD;
42450 for (uint32_t n = 16; n <= 24; n += 8) {
42451 for (size_t k = 1; k <= 5; k += 2) {
42452 GemmMicrokernelTester()
42453 .mr(6)
42454 .nr(8)
42455 .kr(1)
42456 .sr(1)
42457 .m(6)
42458 .n(8)
42459 .k(k)
42460 .ks(3)
42461 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42462 }
42463 }
42464 }
42465
42466 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
42467 TEST_REQUIRES_PSIMD;
42468 for (size_t k = 1; k <= 5; k += 2) {
42469 for (uint32_t m = 1; m <= 6; m++) {
42470 for (uint32_t n = 1; n <= 8; n++) {
42471 GemmMicrokernelTester()
42472 .mr(6)
42473 .nr(8)
42474 .kr(1)
42475 .sr(1)
42476 .m(m)
42477 .n(n)
42478 .k(k)
42479 .cm_stride(11)
42480 .iterations(1)
42481 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42482 }
42483 }
42484 }
42485 }
42486
42487 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, a_offset) {
42488 TEST_REQUIRES_PSIMD;
42489 for (size_t k = 1; k <= 5; k += 2) {
42490 GemmMicrokernelTester()
42491 .mr(6)
42492 .nr(8)
42493 .kr(1)
42494 .sr(1)
42495 .m(6)
42496 .n(8)
42497 .k(k)
42498 .ks(3)
42499 .a_offset(37)
42500 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42501 }
42502 }
42503
42504 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, zero) {
42505 TEST_REQUIRES_PSIMD;
42506 for (uint32_t mz = 0; mz < 6; mz++) {
42507 for (size_t k = 1; k <= 5; k += 2) {
42508 GemmMicrokernelTester()
42509 .mr(6)
42510 .nr(8)
42511 .kr(1)
42512 .sr(1)
42513 .m(6)
42514 .n(8)
42515 .k(k)
42516 .ks(3)
42517 .a_offset(37)
42518 .zero_index(mz)
42519 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42520 }
42521 }
42522 }
42523
42524 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, qmin) {
42525 TEST_REQUIRES_PSIMD;
42526 GemmMicrokernelTester()
42527 .mr(6)
42528 .nr(8)
42529 .kr(1)
42530 .sr(1)
42531 .m(6)
42532 .n(8)
42533 .k(1)
42534 .qmin(128)
42535 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42536 }
42537
42538 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, qmax) {
42539 TEST_REQUIRES_PSIMD;
42540 GemmMicrokernelTester()
42541 .mr(6)
42542 .nr(8)
42543 .kr(1)
42544 .sr(1)
42545 .m(6)
42546 .n(8)
42547 .k(1)
42548 .qmax(128)
42549 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42550 }
42551
42552 TEST(F32_IGEMM_6X8__PSIMD_LOADSPLAT, strided_cm) {
42553 TEST_REQUIRES_PSIMD;
42554 GemmMicrokernelTester()
42555 .mr(6)
42556 .nr(8)
42557 .kr(1)
42558 .sr(1)
42559 .m(6)
42560 .n(8)
42561 .k(1)
42562 .cm_stride(11)
42563 .Test(xnn_f32_igemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
42564 }
42565#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42566
42567
42568#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
42569 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4) {
42570 TEST_REQUIRES_PSIMD;
42571 GemmMicrokernelTester()
42572 .mr(1)
42573 .nr(8)
42574 .kr(1)
42575 .sr(1)
42576 .m(1)
42577 .n(8)
42578 .k(4)
42579 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42580 }
42581
42582 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, strided_cn) {
42583 TEST_REQUIRES_PSIMD;
42584 GemmMicrokernelTester()
42585 .mr(1)
42586 .nr(8)
42587 .kr(1)
42588 .sr(1)
42589 .m(1)
42590 .n(8)
42591 .k(4)
42592 .cn_stride(11)
42593 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42594 }
42595
42596 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile) {
42597 TEST_REQUIRES_PSIMD;
42598 for (uint32_t m = 1; m <= 1; m++) {
42599 for (uint32_t n = 1; n <= 8; n++) {
42600 GemmMicrokernelTester()
42601 .mr(1)
42602 .nr(8)
42603 .kr(1)
42604 .sr(1)
42605 .m(m)
42606 .n(n)
42607 .k(4)
42608 .iterations(1)
42609 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42610 }
42611 }
42612 }
42613
42614 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
42615 TEST_REQUIRES_PSIMD;
42616 for (uint32_t m = 1; m <= 1; m++) {
42617 GemmMicrokernelTester()
42618 .mr(1)
42619 .nr(8)
42620 .kr(1)
42621 .sr(1)
42622 .m(m)
42623 .n(8)
42624 .k(4)
42625 .iterations(1)
42626 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42627 }
42628 }
42629
42630 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
42631 TEST_REQUIRES_PSIMD;
42632 for (uint32_t n = 1; n <= 8; n++) {
42633 GemmMicrokernelTester()
42634 .mr(1)
42635 .nr(8)
42636 .kr(1)
42637 .sr(1)
42638 .m(1)
42639 .n(n)
42640 .k(4)
42641 .iterations(1)
42642 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42643 }
42644 }
42645
42646 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_lt_4) {
42647 TEST_REQUIRES_PSIMD;
42648 for (size_t k = 1; k < 4; k++) {
42649 GemmMicrokernelTester()
42650 .mr(1)
42651 .nr(8)
42652 .kr(1)
42653 .sr(1)
42654 .m(1)
42655 .n(8)
42656 .k(k)
42657 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42658 }
42659 }
42660
42661 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_lt_4_subtile) {
42662 TEST_REQUIRES_PSIMD;
42663 for (size_t k = 1; k < 4; k++) {
42664 for (uint32_t m = 1; m <= 1; m++) {
42665 for (uint32_t n = 1; n <= 8; n++) {
42666 GemmMicrokernelTester()
42667 .mr(1)
42668 .nr(8)
42669 .kr(1)
42670 .sr(1)
42671 .m(m)
42672 .n(n)
42673 .k(k)
42674 .iterations(1)
42675 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42676 }
42677 }
42678 }
42679 }
42680
42681 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_gt_4) {
42682 TEST_REQUIRES_PSIMD;
42683 for (size_t k = 5; k < 8; k++) {
42684 GemmMicrokernelTester()
42685 .mr(1)
42686 .nr(8)
42687 .kr(1)
42688 .sr(1)
42689 .m(1)
42690 .n(8)
42691 .k(k)
42692 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42693 }
42694 }
42695
42696 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_gt_4_subtile) {
42697 TEST_REQUIRES_PSIMD;
42698 for (size_t k = 5; k < 8; k++) {
42699 for (uint32_t m = 1; m <= 1; m++) {
42700 for (uint32_t n = 1; n <= 8; n++) {
42701 GemmMicrokernelTester()
42702 .mr(1)
42703 .nr(8)
42704 .kr(1)
42705 .sr(1)
42706 .m(m)
42707 .n(n)
42708 .k(k)
42709 .iterations(1)
42710 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42711 }
42712 }
42713 }
42714 }
42715
42716 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_div_4) {
42717 TEST_REQUIRES_PSIMD;
42718 for (size_t k = 8; k <= 40; k += 4) {
42719 GemmMicrokernelTester()
42720 .mr(1)
42721 .nr(8)
42722 .kr(1)
42723 .sr(1)
42724 .m(1)
42725 .n(8)
42726 .k(k)
42727 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42728 }
42729 }
42730
42731 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, k_div_4_subtile) {
42732 TEST_REQUIRES_PSIMD;
42733 for (size_t k = 8; k <= 40; k += 4) {
42734 for (uint32_t m = 1; m <= 1; m++) {
42735 for (uint32_t n = 1; n <= 8; n++) {
42736 GemmMicrokernelTester()
42737 .mr(1)
42738 .nr(8)
42739 .kr(1)
42740 .sr(1)
42741 .m(m)
42742 .n(n)
42743 .k(k)
42744 .iterations(1)
42745 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42746 }
42747 }
42748 }
42749 }
42750
42751 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8) {
42752 TEST_REQUIRES_PSIMD;
42753 for (uint32_t n = 9; n < 16; n++) {
42754 for (size_t k = 1; k <= 20; k += 5) {
42755 GemmMicrokernelTester()
42756 .mr(1)
42757 .nr(8)
42758 .kr(1)
42759 .sr(1)
42760 .m(1)
42761 .n(8)
42762 .k(k)
42763 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42764 }
42765 }
42766 }
42767
42768 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
42769 TEST_REQUIRES_PSIMD;
42770 for (uint32_t n = 9; n < 16; n++) {
42771 for (size_t k = 1; k <= 20; k += 5) {
42772 GemmMicrokernelTester()
42773 .mr(1)
42774 .nr(8)
42775 .kr(1)
42776 .sr(1)
42777 .m(1)
42778 .n(8)
42779 .k(k)
42780 .cn_stride(11)
42781 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42782 }
42783 }
42784 }
42785
42786 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8_subtile) {
42787 TEST_REQUIRES_PSIMD;
42788 for (uint32_t n = 9; n < 16; n++) {
42789 for (size_t k = 1; k <= 20; k += 5) {
42790 for (uint32_t m = 1; m <= 1; m++) {
42791 GemmMicrokernelTester()
42792 .mr(1)
42793 .nr(8)
42794 .kr(1)
42795 .sr(1)
42796 .m(m)
42797 .n(n)
42798 .k(k)
42799 .iterations(1)
42800 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42801 }
42802 }
42803 }
42804 }
42805
42806 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8) {
42807 TEST_REQUIRES_PSIMD;
42808 for (uint32_t n = 16; n <= 24; n += 8) {
42809 for (size_t k = 1; k <= 20; k += 5) {
42810 GemmMicrokernelTester()
42811 .mr(1)
42812 .nr(8)
42813 .kr(1)
42814 .sr(1)
42815 .m(1)
42816 .n(8)
42817 .k(k)
42818 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42819 }
42820 }
42821 }
42822
42823 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8_strided_cn) {
42824 TEST_REQUIRES_PSIMD;
42825 for (uint32_t n = 16; n <= 24; n += 8) {
42826 for (size_t k = 1; k <= 20; k += 5) {
42827 GemmMicrokernelTester()
42828 .mr(1)
42829 .nr(8)
42830 .kr(1)
42831 .sr(1)
42832 .m(1)
42833 .n(n)
42834 .k(k)
42835 .cn_stride(11)
42836 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42837 }
42838 }
42839 }
42840
42841 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8_subtile) {
42842 TEST_REQUIRES_PSIMD;
42843 for (uint32_t n = 16; n <= 24; n += 8) {
42844 for (size_t k = 1; k <= 20; k += 5) {
42845 for (uint32_t m = 1; m <= 1; m++) {
42846 GemmMicrokernelTester()
42847 .mr(1)
42848 .nr(8)
42849 .kr(1)
42850 .sr(1)
42851 .m(m)
42852 .n(n)
42853 .k(k)
42854 .iterations(1)
42855 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42856 }
42857 }
42858 }
42859 }
42860
42861 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, small_kernel) {
42862 TEST_REQUIRES_PSIMD;
42863 for (size_t k = 1; k <= 20; k += 5) {
42864 GemmMicrokernelTester()
42865 .mr(1)
42866 .nr(8)
42867 .kr(1)
42868 .sr(1)
42869 .m(1)
42870 .n(8)
42871 .k(k)
42872 .ks(3)
42873 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42874 }
42875 }
42876
42877 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, small_kernel_subtile) {
42878 TEST_REQUIRES_PSIMD;
42879 for (size_t k = 1; k <= 20; k += 5) {
42880 for (uint32_t m = 1; m <= 1; m++) {
42881 for (uint32_t n = 1; n <= 8; n++) {
42882 GemmMicrokernelTester()
42883 .mr(1)
42884 .nr(8)
42885 .kr(1)
42886 .sr(1)
42887 .m(m)
42888 .n(n)
42889 .k(k)
42890 .ks(3)
42891 .iterations(1)
42892 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42893 }
42894 }
42895 }
42896 }
42897
42898 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_gt_8_small_kernel) {
42899 TEST_REQUIRES_PSIMD;
42900 for (uint32_t n = 9; n < 16; n++) {
42901 for (size_t k = 1; k <= 20; k += 5) {
42902 GemmMicrokernelTester()
42903 .mr(1)
42904 .nr(8)
42905 .kr(1)
42906 .sr(1)
42907 .m(1)
42908 .n(8)
42909 .k(k)
42910 .ks(3)
42911 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42912 }
42913 }
42914 }
42915
42916 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, n_div_8_small_kernel) {
42917 TEST_REQUIRES_PSIMD;
42918 for (uint32_t n = 16; n <= 24; n += 8) {
42919 for (size_t k = 1; k <= 20; k += 5) {
42920 GemmMicrokernelTester()
42921 .mr(1)
42922 .nr(8)
42923 .kr(1)
42924 .sr(1)
42925 .m(1)
42926 .n(8)
42927 .k(k)
42928 .ks(3)
42929 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42930 }
42931 }
42932 }
42933
42934 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, strided_cm_subtile) {
42935 TEST_REQUIRES_PSIMD;
42936 for (size_t k = 1; k <= 20; k += 5) {
42937 for (uint32_t m = 1; m <= 1; m++) {
42938 for (uint32_t n = 1; n <= 8; n++) {
42939 GemmMicrokernelTester()
42940 .mr(1)
42941 .nr(8)
42942 .kr(1)
42943 .sr(1)
42944 .m(m)
42945 .n(n)
42946 .k(k)
42947 .cm_stride(11)
42948 .iterations(1)
42949 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42950 }
42951 }
42952 }
42953 }
42954
42955 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, a_offset) {
42956 TEST_REQUIRES_PSIMD;
42957 for (size_t k = 1; k <= 20; k += 5) {
42958 GemmMicrokernelTester()
42959 .mr(1)
42960 .nr(8)
42961 .kr(1)
42962 .sr(1)
42963 .m(1)
42964 .n(8)
42965 .k(k)
42966 .ks(3)
42967 .a_offset(23)
42968 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42969 }
42970 }
42971
42972 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, zero) {
42973 TEST_REQUIRES_PSIMD;
42974 for (uint32_t mz = 0; mz < 1; mz++) {
42975 for (size_t k = 1; k <= 20; k += 5) {
42976 GemmMicrokernelTester()
42977 .mr(1)
42978 .nr(8)
42979 .kr(1)
42980 .sr(1)
42981 .m(1)
42982 .n(8)
42983 .k(k)
42984 .ks(3)
42985 .a_offset(23)
42986 .zero_index(mz)
42987 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
42988 }
42989 }
42990 }
42991
42992 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, qmin) {
42993 TEST_REQUIRES_PSIMD;
42994 GemmMicrokernelTester()
42995 .mr(1)
42996 .nr(8)
42997 .kr(1)
42998 .sr(1)
42999 .m(1)
43000 .n(8)
43001 .k(4)
43002 .qmin(128)
43003 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43004 }
43005
43006 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, qmax) {
43007 TEST_REQUIRES_PSIMD;
43008 GemmMicrokernelTester()
43009 .mr(1)
43010 .nr(8)
43011 .kr(1)
43012 .sr(1)
43013 .m(1)
43014 .n(8)
43015 .k(4)
43016 .qmax(128)
43017 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43018 }
43019
43020 TEST(F32_IGEMM_1X8__PSIMD_SPLAT, strided_cm) {
43021 TEST_REQUIRES_PSIMD;
43022 GemmMicrokernelTester()
43023 .mr(1)
43024 .nr(8)
43025 .kr(1)
43026 .sr(1)
43027 .m(1)
43028 .n(8)
43029 .k(4)
43030 .cm_stride(11)
43031 .Test(xnn_f32_igemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43032 }
43033#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43034
43035
43036#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43037 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4) {
43038 TEST_REQUIRES_PSIMD;
43039 GemmMicrokernelTester()
43040 .mr(4)
43041 .nr(8)
43042 .kr(1)
43043 .sr(1)
43044 .m(4)
43045 .n(8)
43046 .k(4)
43047 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43048 }
43049
43050 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, strided_cn) {
43051 TEST_REQUIRES_PSIMD;
43052 GemmMicrokernelTester()
43053 .mr(4)
43054 .nr(8)
43055 .kr(1)
43056 .sr(1)
43057 .m(4)
43058 .n(8)
43059 .k(4)
43060 .cn_stride(11)
43061 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43062 }
43063
43064 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile) {
43065 TEST_REQUIRES_PSIMD;
43066 for (uint32_t m = 1; m <= 4; m++) {
43067 for (uint32_t n = 1; n <= 8; n++) {
43068 GemmMicrokernelTester()
43069 .mr(4)
43070 .nr(8)
43071 .kr(1)
43072 .sr(1)
43073 .m(m)
43074 .n(n)
43075 .k(4)
43076 .iterations(1)
43077 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43078 }
43079 }
43080 }
43081
43082 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
43083 TEST_REQUIRES_PSIMD;
43084 for (uint32_t m = 1; m <= 4; m++) {
43085 GemmMicrokernelTester()
43086 .mr(4)
43087 .nr(8)
43088 .kr(1)
43089 .sr(1)
43090 .m(m)
43091 .n(8)
43092 .k(4)
43093 .iterations(1)
43094 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43095 }
43096 }
43097
43098 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
43099 TEST_REQUIRES_PSIMD;
43100 for (uint32_t n = 1; n <= 8; n++) {
43101 GemmMicrokernelTester()
43102 .mr(4)
43103 .nr(8)
43104 .kr(1)
43105 .sr(1)
43106 .m(4)
43107 .n(n)
43108 .k(4)
43109 .iterations(1)
43110 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43111 }
43112 }
43113
43114 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_lt_4) {
43115 TEST_REQUIRES_PSIMD;
43116 for (size_t k = 1; k < 4; k++) {
43117 GemmMicrokernelTester()
43118 .mr(4)
43119 .nr(8)
43120 .kr(1)
43121 .sr(1)
43122 .m(4)
43123 .n(8)
43124 .k(k)
43125 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43126 }
43127 }
43128
43129 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_lt_4_subtile) {
43130 TEST_REQUIRES_PSIMD;
43131 for (size_t k = 1; k < 4; k++) {
43132 for (uint32_t m = 1; m <= 4; m++) {
43133 for (uint32_t n = 1; n <= 8; n++) {
43134 GemmMicrokernelTester()
43135 .mr(4)
43136 .nr(8)
43137 .kr(1)
43138 .sr(1)
43139 .m(m)
43140 .n(n)
43141 .k(k)
43142 .iterations(1)
43143 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43144 }
43145 }
43146 }
43147 }
43148
43149 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_gt_4) {
43150 TEST_REQUIRES_PSIMD;
43151 for (size_t k = 5; k < 8; k++) {
43152 GemmMicrokernelTester()
43153 .mr(4)
43154 .nr(8)
43155 .kr(1)
43156 .sr(1)
43157 .m(4)
43158 .n(8)
43159 .k(k)
43160 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43161 }
43162 }
43163
43164 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_gt_4_subtile) {
43165 TEST_REQUIRES_PSIMD;
43166 for (size_t k = 5; k < 8; k++) {
43167 for (uint32_t m = 1; m <= 4; m++) {
43168 for (uint32_t n = 1; n <= 8; n++) {
43169 GemmMicrokernelTester()
43170 .mr(4)
43171 .nr(8)
43172 .kr(1)
43173 .sr(1)
43174 .m(m)
43175 .n(n)
43176 .k(k)
43177 .iterations(1)
43178 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43179 }
43180 }
43181 }
43182 }
43183
43184 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_div_4) {
43185 TEST_REQUIRES_PSIMD;
43186 for (size_t k = 8; k <= 40; k += 4) {
43187 GemmMicrokernelTester()
43188 .mr(4)
43189 .nr(8)
43190 .kr(1)
43191 .sr(1)
43192 .m(4)
43193 .n(8)
43194 .k(k)
43195 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43196 }
43197 }
43198
43199 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, k_div_4_subtile) {
43200 TEST_REQUIRES_PSIMD;
43201 for (size_t k = 8; k <= 40; k += 4) {
43202 for (uint32_t m = 1; m <= 4; m++) {
43203 for (uint32_t n = 1; n <= 8; n++) {
43204 GemmMicrokernelTester()
43205 .mr(4)
43206 .nr(8)
43207 .kr(1)
43208 .sr(1)
43209 .m(m)
43210 .n(n)
43211 .k(k)
43212 .iterations(1)
43213 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43214 }
43215 }
43216 }
43217 }
43218
43219 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8) {
43220 TEST_REQUIRES_PSIMD;
43221 for (uint32_t n = 9; n < 16; n++) {
43222 for (size_t k = 1; k <= 20; k += 5) {
43223 GemmMicrokernelTester()
43224 .mr(4)
43225 .nr(8)
43226 .kr(1)
43227 .sr(1)
43228 .m(4)
43229 .n(8)
43230 .k(k)
43231 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43232 }
43233 }
43234 }
43235
43236 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
43237 TEST_REQUIRES_PSIMD;
43238 for (uint32_t n = 9; n < 16; n++) {
43239 for (size_t k = 1; k <= 20; k += 5) {
43240 GemmMicrokernelTester()
43241 .mr(4)
43242 .nr(8)
43243 .kr(1)
43244 .sr(1)
43245 .m(4)
43246 .n(8)
43247 .k(k)
43248 .cn_stride(11)
43249 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43250 }
43251 }
43252 }
43253
43254 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8_subtile) {
43255 TEST_REQUIRES_PSIMD;
43256 for (uint32_t n = 9; n < 16; n++) {
43257 for (size_t k = 1; k <= 20; k += 5) {
43258 for (uint32_t m = 1; m <= 4; m++) {
43259 GemmMicrokernelTester()
43260 .mr(4)
43261 .nr(8)
43262 .kr(1)
43263 .sr(1)
43264 .m(m)
43265 .n(n)
43266 .k(k)
43267 .iterations(1)
43268 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43269 }
43270 }
43271 }
43272 }
43273
43274 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8) {
43275 TEST_REQUIRES_PSIMD;
43276 for (uint32_t n = 16; n <= 24; n += 8) {
43277 for (size_t k = 1; k <= 20; k += 5) {
43278 GemmMicrokernelTester()
43279 .mr(4)
43280 .nr(8)
43281 .kr(1)
43282 .sr(1)
43283 .m(4)
43284 .n(8)
43285 .k(k)
43286 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43287 }
43288 }
43289 }
43290
43291 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8_strided_cn) {
43292 TEST_REQUIRES_PSIMD;
43293 for (uint32_t n = 16; n <= 24; n += 8) {
43294 for (size_t k = 1; k <= 20; k += 5) {
43295 GemmMicrokernelTester()
43296 .mr(4)
43297 .nr(8)
43298 .kr(1)
43299 .sr(1)
43300 .m(4)
43301 .n(n)
43302 .k(k)
43303 .cn_stride(11)
43304 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43305 }
43306 }
43307 }
43308
43309 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8_subtile) {
43310 TEST_REQUIRES_PSIMD;
43311 for (uint32_t n = 16; n <= 24; n += 8) {
43312 for (size_t k = 1; k <= 20; k += 5) {
43313 for (uint32_t m = 1; m <= 4; m++) {
43314 GemmMicrokernelTester()
43315 .mr(4)
43316 .nr(8)
43317 .kr(1)
43318 .sr(1)
43319 .m(m)
43320 .n(n)
43321 .k(k)
43322 .iterations(1)
43323 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43324 }
43325 }
43326 }
43327 }
43328
43329 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, small_kernel) {
43330 TEST_REQUIRES_PSIMD;
43331 for (size_t k = 1; k <= 20; k += 5) {
43332 GemmMicrokernelTester()
43333 .mr(4)
43334 .nr(8)
43335 .kr(1)
43336 .sr(1)
43337 .m(4)
43338 .n(8)
43339 .k(k)
43340 .ks(3)
43341 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43342 }
43343 }
43344
43345 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, small_kernel_subtile) {
43346 TEST_REQUIRES_PSIMD;
43347 for (size_t k = 1; k <= 20; k += 5) {
43348 for (uint32_t m = 1; m <= 4; m++) {
43349 for (uint32_t n = 1; n <= 8; n++) {
43350 GemmMicrokernelTester()
43351 .mr(4)
43352 .nr(8)
43353 .kr(1)
43354 .sr(1)
43355 .m(m)
43356 .n(n)
43357 .k(k)
43358 .ks(3)
43359 .iterations(1)
43360 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43361 }
43362 }
43363 }
43364 }
43365
43366 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_gt_8_small_kernel) {
43367 TEST_REQUIRES_PSIMD;
43368 for (uint32_t n = 9; n < 16; n++) {
43369 for (size_t k = 1; k <= 20; k += 5) {
43370 GemmMicrokernelTester()
43371 .mr(4)
43372 .nr(8)
43373 .kr(1)
43374 .sr(1)
43375 .m(4)
43376 .n(8)
43377 .k(k)
43378 .ks(3)
43379 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43380 }
43381 }
43382 }
43383
43384 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, n_div_8_small_kernel) {
43385 TEST_REQUIRES_PSIMD;
43386 for (uint32_t n = 16; n <= 24; n += 8) {
43387 for (size_t k = 1; k <= 20; k += 5) {
43388 GemmMicrokernelTester()
43389 .mr(4)
43390 .nr(8)
43391 .kr(1)
43392 .sr(1)
43393 .m(4)
43394 .n(8)
43395 .k(k)
43396 .ks(3)
43397 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43398 }
43399 }
43400 }
43401
43402 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, strided_cm_subtile) {
43403 TEST_REQUIRES_PSIMD;
43404 for (size_t k = 1; k <= 20; k += 5) {
43405 for (uint32_t m = 1; m <= 4; m++) {
43406 for (uint32_t n = 1; n <= 8; n++) {
43407 GemmMicrokernelTester()
43408 .mr(4)
43409 .nr(8)
43410 .kr(1)
43411 .sr(1)
43412 .m(m)
43413 .n(n)
43414 .k(k)
43415 .cm_stride(11)
43416 .iterations(1)
43417 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43418 }
43419 }
43420 }
43421 }
43422
43423 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, a_offset) {
43424 TEST_REQUIRES_PSIMD;
43425 for (size_t k = 1; k <= 20; k += 5) {
43426 GemmMicrokernelTester()
43427 .mr(4)
43428 .nr(8)
43429 .kr(1)
43430 .sr(1)
43431 .m(4)
43432 .n(8)
43433 .k(k)
43434 .ks(3)
43435 .a_offset(83)
43436 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43437 }
43438 }
43439
43440 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, zero) {
43441 TEST_REQUIRES_PSIMD;
43442 for (uint32_t mz = 0; mz < 4; mz++) {
43443 for (size_t k = 1; k <= 20; k += 5) {
43444 GemmMicrokernelTester()
43445 .mr(4)
43446 .nr(8)
43447 .kr(1)
43448 .sr(1)
43449 .m(4)
43450 .n(8)
43451 .k(k)
43452 .ks(3)
43453 .a_offset(83)
43454 .zero_index(mz)
43455 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43456 }
43457 }
43458 }
43459
43460 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, qmin) {
43461 TEST_REQUIRES_PSIMD;
43462 GemmMicrokernelTester()
43463 .mr(4)
43464 .nr(8)
43465 .kr(1)
43466 .sr(1)
43467 .m(4)
43468 .n(8)
43469 .k(4)
43470 .qmin(128)
43471 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43472 }
43473
43474 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, qmax) {
43475 TEST_REQUIRES_PSIMD;
43476 GemmMicrokernelTester()
43477 .mr(4)
43478 .nr(8)
43479 .kr(1)
43480 .sr(1)
43481 .m(4)
43482 .n(8)
43483 .k(4)
43484 .qmax(128)
43485 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43486 }
43487
43488 TEST(F32_IGEMM_4X8__PSIMD_SPLAT, strided_cm) {
43489 TEST_REQUIRES_PSIMD;
43490 GemmMicrokernelTester()
43491 .mr(4)
43492 .nr(8)
43493 .kr(1)
43494 .sr(1)
43495 .m(4)
43496 .n(8)
43497 .k(4)
43498 .cm_stride(11)
43499 .Test(xnn_f32_igemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43500 }
43501#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43502
43503
43504#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43505 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4) {
43506 TEST_REQUIRES_PSIMD;
43507 GemmMicrokernelTester()
43508 .mr(6)
43509 .nr(8)
43510 .kr(1)
43511 .sr(1)
43512 .m(6)
43513 .n(8)
43514 .k(4)
43515 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43516 }
43517
43518 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, strided_cn) {
43519 TEST_REQUIRES_PSIMD;
43520 GemmMicrokernelTester()
43521 .mr(6)
43522 .nr(8)
43523 .kr(1)
43524 .sr(1)
43525 .m(6)
43526 .n(8)
43527 .k(4)
43528 .cn_stride(11)
43529 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43530 }
43531
43532 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile) {
43533 TEST_REQUIRES_PSIMD;
43534 for (uint32_t m = 1; m <= 6; m++) {
43535 for (uint32_t n = 1; n <= 8; n++) {
43536 GemmMicrokernelTester()
43537 .mr(6)
43538 .nr(8)
43539 .kr(1)
43540 .sr(1)
43541 .m(m)
43542 .n(n)
43543 .k(4)
43544 .iterations(1)
43545 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43546 }
43547 }
43548 }
43549
43550 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
43551 TEST_REQUIRES_PSIMD;
43552 for (uint32_t m = 1; m <= 6; m++) {
43553 GemmMicrokernelTester()
43554 .mr(6)
43555 .nr(8)
43556 .kr(1)
43557 .sr(1)
43558 .m(m)
43559 .n(8)
43560 .k(4)
43561 .iterations(1)
43562 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43563 }
43564 }
43565
43566 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
43567 TEST_REQUIRES_PSIMD;
43568 for (uint32_t n = 1; n <= 8; n++) {
43569 GemmMicrokernelTester()
43570 .mr(6)
43571 .nr(8)
43572 .kr(1)
43573 .sr(1)
43574 .m(6)
43575 .n(n)
43576 .k(4)
43577 .iterations(1)
43578 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43579 }
43580 }
43581
43582 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_lt_4) {
43583 TEST_REQUIRES_PSIMD;
43584 for (size_t k = 1; k < 4; k++) {
43585 GemmMicrokernelTester()
43586 .mr(6)
43587 .nr(8)
43588 .kr(1)
43589 .sr(1)
43590 .m(6)
43591 .n(8)
43592 .k(k)
43593 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43594 }
43595 }
43596
43597 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_lt_4_subtile) {
43598 TEST_REQUIRES_PSIMD;
43599 for (size_t k = 1; k < 4; k++) {
43600 for (uint32_t m = 1; m <= 6; m++) {
43601 for (uint32_t n = 1; n <= 8; n++) {
43602 GemmMicrokernelTester()
43603 .mr(6)
43604 .nr(8)
43605 .kr(1)
43606 .sr(1)
43607 .m(m)
43608 .n(n)
43609 .k(k)
43610 .iterations(1)
43611 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43612 }
43613 }
43614 }
43615 }
43616
43617 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_gt_4) {
43618 TEST_REQUIRES_PSIMD;
43619 for (size_t k = 5; k < 8; k++) {
43620 GemmMicrokernelTester()
43621 .mr(6)
43622 .nr(8)
43623 .kr(1)
43624 .sr(1)
43625 .m(6)
43626 .n(8)
43627 .k(k)
43628 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43629 }
43630 }
43631
43632 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_gt_4_subtile) {
43633 TEST_REQUIRES_PSIMD;
43634 for (size_t k = 5; k < 8; k++) {
43635 for (uint32_t m = 1; m <= 6; m++) {
43636 for (uint32_t n = 1; n <= 8; n++) {
43637 GemmMicrokernelTester()
43638 .mr(6)
43639 .nr(8)
43640 .kr(1)
43641 .sr(1)
43642 .m(m)
43643 .n(n)
43644 .k(k)
43645 .iterations(1)
43646 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43647 }
43648 }
43649 }
43650 }
43651
43652 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_div_4) {
43653 TEST_REQUIRES_PSIMD;
43654 for (size_t k = 8; k <= 40; k += 4) {
43655 GemmMicrokernelTester()
43656 .mr(6)
43657 .nr(8)
43658 .kr(1)
43659 .sr(1)
43660 .m(6)
43661 .n(8)
43662 .k(k)
43663 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43664 }
43665 }
43666
43667 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, k_div_4_subtile) {
43668 TEST_REQUIRES_PSIMD;
43669 for (size_t k = 8; k <= 40; k += 4) {
43670 for (uint32_t m = 1; m <= 6; m++) {
43671 for (uint32_t n = 1; n <= 8; n++) {
43672 GemmMicrokernelTester()
43673 .mr(6)
43674 .nr(8)
43675 .kr(1)
43676 .sr(1)
43677 .m(m)
43678 .n(n)
43679 .k(k)
43680 .iterations(1)
43681 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43682 }
43683 }
43684 }
43685 }
43686
43687 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8) {
43688 TEST_REQUIRES_PSIMD;
43689 for (uint32_t n = 9; n < 16; n++) {
43690 for (size_t k = 1; k <= 20; k += 5) {
43691 GemmMicrokernelTester()
43692 .mr(6)
43693 .nr(8)
43694 .kr(1)
43695 .sr(1)
43696 .m(6)
43697 .n(8)
43698 .k(k)
43699 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43700 }
43701 }
43702 }
43703
43704 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
43705 TEST_REQUIRES_PSIMD;
43706 for (uint32_t n = 9; n < 16; n++) {
43707 for (size_t k = 1; k <= 20; k += 5) {
43708 GemmMicrokernelTester()
43709 .mr(6)
43710 .nr(8)
43711 .kr(1)
43712 .sr(1)
43713 .m(6)
43714 .n(8)
43715 .k(k)
43716 .cn_stride(11)
43717 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43718 }
43719 }
43720 }
43721
43722 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8_subtile) {
43723 TEST_REQUIRES_PSIMD;
43724 for (uint32_t n = 9; n < 16; n++) {
43725 for (size_t k = 1; k <= 20; k += 5) {
43726 for (uint32_t m = 1; m <= 6; m++) {
43727 GemmMicrokernelTester()
43728 .mr(6)
43729 .nr(8)
43730 .kr(1)
43731 .sr(1)
43732 .m(m)
43733 .n(n)
43734 .k(k)
43735 .iterations(1)
43736 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43737 }
43738 }
43739 }
43740 }
43741
43742 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8) {
43743 TEST_REQUIRES_PSIMD;
43744 for (uint32_t n = 16; n <= 24; n += 8) {
43745 for (size_t k = 1; k <= 20; k += 5) {
43746 GemmMicrokernelTester()
43747 .mr(6)
43748 .nr(8)
43749 .kr(1)
43750 .sr(1)
43751 .m(6)
43752 .n(8)
43753 .k(k)
43754 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43755 }
43756 }
43757 }
43758
43759 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8_strided_cn) {
43760 TEST_REQUIRES_PSIMD;
43761 for (uint32_t n = 16; n <= 24; n += 8) {
43762 for (size_t k = 1; k <= 20; k += 5) {
43763 GemmMicrokernelTester()
43764 .mr(6)
43765 .nr(8)
43766 .kr(1)
43767 .sr(1)
43768 .m(6)
43769 .n(n)
43770 .k(k)
43771 .cn_stride(11)
43772 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43773 }
43774 }
43775 }
43776
43777 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8_subtile) {
43778 TEST_REQUIRES_PSIMD;
43779 for (uint32_t n = 16; n <= 24; n += 8) {
43780 for (size_t k = 1; k <= 20; k += 5) {
43781 for (uint32_t m = 1; m <= 6; m++) {
43782 GemmMicrokernelTester()
43783 .mr(6)
43784 .nr(8)
43785 .kr(1)
43786 .sr(1)
43787 .m(m)
43788 .n(n)
43789 .k(k)
43790 .iterations(1)
43791 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43792 }
43793 }
43794 }
43795 }
43796
43797 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, small_kernel) {
43798 TEST_REQUIRES_PSIMD;
43799 for (size_t k = 1; k <= 20; k += 5) {
43800 GemmMicrokernelTester()
43801 .mr(6)
43802 .nr(8)
43803 .kr(1)
43804 .sr(1)
43805 .m(6)
43806 .n(8)
43807 .k(k)
43808 .ks(3)
43809 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43810 }
43811 }
43812
43813 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, small_kernel_subtile) {
43814 TEST_REQUIRES_PSIMD;
43815 for (size_t k = 1; k <= 20; k += 5) {
43816 for (uint32_t m = 1; m <= 6; m++) {
43817 for (uint32_t n = 1; n <= 8; n++) {
43818 GemmMicrokernelTester()
43819 .mr(6)
43820 .nr(8)
43821 .kr(1)
43822 .sr(1)
43823 .m(m)
43824 .n(n)
43825 .k(k)
43826 .ks(3)
43827 .iterations(1)
43828 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43829 }
43830 }
43831 }
43832 }
43833
43834 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_gt_8_small_kernel) {
43835 TEST_REQUIRES_PSIMD;
43836 for (uint32_t n = 9; n < 16; n++) {
43837 for (size_t k = 1; k <= 20; k += 5) {
43838 GemmMicrokernelTester()
43839 .mr(6)
43840 .nr(8)
43841 .kr(1)
43842 .sr(1)
43843 .m(6)
43844 .n(8)
43845 .k(k)
43846 .ks(3)
43847 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43848 }
43849 }
43850 }
43851
43852 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, n_div_8_small_kernel) {
43853 TEST_REQUIRES_PSIMD;
43854 for (uint32_t n = 16; n <= 24; n += 8) {
43855 for (size_t k = 1; k <= 20; k += 5) {
43856 GemmMicrokernelTester()
43857 .mr(6)
43858 .nr(8)
43859 .kr(1)
43860 .sr(1)
43861 .m(6)
43862 .n(8)
43863 .k(k)
43864 .ks(3)
43865 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43866 }
43867 }
43868 }
43869
43870 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, strided_cm_subtile) {
43871 TEST_REQUIRES_PSIMD;
43872 for (size_t k = 1; k <= 20; k += 5) {
43873 for (uint32_t m = 1; m <= 6; m++) {
43874 for (uint32_t n = 1; n <= 8; n++) {
43875 GemmMicrokernelTester()
43876 .mr(6)
43877 .nr(8)
43878 .kr(1)
43879 .sr(1)
43880 .m(m)
43881 .n(n)
43882 .k(k)
43883 .cm_stride(11)
43884 .iterations(1)
43885 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43886 }
43887 }
43888 }
43889 }
43890
43891 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, a_offset) {
43892 TEST_REQUIRES_PSIMD;
43893 for (size_t k = 1; k <= 20; k += 5) {
43894 GemmMicrokernelTester()
43895 .mr(6)
43896 .nr(8)
43897 .kr(1)
43898 .sr(1)
43899 .m(6)
43900 .n(8)
43901 .k(k)
43902 .ks(3)
43903 .a_offset(127)
43904 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43905 }
43906 }
43907
43908 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, zero) {
43909 TEST_REQUIRES_PSIMD;
43910 for (uint32_t mz = 0; mz < 6; mz++) {
43911 for (size_t k = 1; k <= 20; k += 5) {
43912 GemmMicrokernelTester()
43913 .mr(6)
43914 .nr(8)
43915 .kr(1)
43916 .sr(1)
43917 .m(6)
43918 .n(8)
43919 .k(k)
43920 .ks(3)
43921 .a_offset(127)
43922 .zero_index(mz)
43923 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43924 }
43925 }
43926 }
43927
43928 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, qmin) {
43929 TEST_REQUIRES_PSIMD;
43930 GemmMicrokernelTester()
43931 .mr(6)
43932 .nr(8)
43933 .kr(1)
43934 .sr(1)
43935 .m(6)
43936 .n(8)
43937 .k(4)
43938 .qmin(128)
43939 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43940 }
43941
43942 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, qmax) {
43943 TEST_REQUIRES_PSIMD;
43944 GemmMicrokernelTester()
43945 .mr(6)
43946 .nr(8)
43947 .kr(1)
43948 .sr(1)
43949 .m(6)
43950 .n(8)
43951 .k(4)
43952 .qmax(128)
43953 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43954 }
43955
43956 TEST(F32_IGEMM_6X8__PSIMD_SPLAT, strided_cm) {
43957 TEST_REQUIRES_PSIMD;
43958 GemmMicrokernelTester()
43959 .mr(6)
43960 .nr(8)
43961 .kr(1)
43962 .sr(1)
43963 .m(6)
43964 .n(8)
43965 .k(4)
43966 .cm_stride(11)
43967 .Test(xnn_f32_igemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
43968 }
43969#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43970
43971
43972#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
43973 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4) {
43974 TEST_REQUIRES_PSIMD;
43975 GemmMicrokernelTester()
43976 .mr(1)
43977 .nr(8)
43978 .kr(1)
43979 .sr(4)
43980 .m(1)
43981 .n(8)
43982 .k(4)
43983 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43984 }
43985
43986 TEST(F32_IGEMM_1X8S4__PSIMD, strided_cn) {
43987 TEST_REQUIRES_PSIMD;
43988 GemmMicrokernelTester()
43989 .mr(1)
43990 .nr(8)
43991 .kr(1)
43992 .sr(4)
43993 .m(1)
43994 .n(8)
43995 .k(4)
43996 .cn_stride(11)
43997 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
43998 }
43999
44000 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4_subtile) {
44001 TEST_REQUIRES_PSIMD;
44002 for (uint32_t m = 1; m <= 1; m++) {
44003 for (uint32_t n = 1; n <= 8; n++) {
44004 GemmMicrokernelTester()
44005 .mr(1)
44006 .nr(8)
44007 .kr(1)
44008 .sr(4)
44009 .m(m)
44010 .n(n)
44011 .k(4)
44012 .iterations(1)
44013 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44014 }
44015 }
44016 }
44017
44018 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4_subtile_m) {
44019 TEST_REQUIRES_PSIMD;
44020 for (uint32_t m = 1; m <= 1; m++) {
44021 GemmMicrokernelTester()
44022 .mr(1)
44023 .nr(8)
44024 .kr(1)
44025 .sr(4)
44026 .m(m)
44027 .n(8)
44028 .k(4)
44029 .iterations(1)
44030 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44031 }
44032 }
44033
44034 TEST(F32_IGEMM_1X8S4__PSIMD, k_eq_4_subtile_n) {
44035 TEST_REQUIRES_PSIMD;
44036 for (uint32_t n = 1; n <= 8; n++) {
44037 GemmMicrokernelTester()
44038 .mr(1)
44039 .nr(8)
44040 .kr(1)
44041 .sr(4)
44042 .m(1)
44043 .n(n)
44044 .k(4)
44045 .iterations(1)
44046 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44047 }
44048 }
44049
44050 TEST(F32_IGEMM_1X8S4__PSIMD, k_lt_4) {
44051 TEST_REQUIRES_PSIMD;
44052 for (size_t k = 1; k < 4; k++) {
44053 GemmMicrokernelTester()
44054 .mr(1)
44055 .nr(8)
44056 .kr(1)
44057 .sr(4)
44058 .m(1)
44059 .n(8)
44060 .k(k)
44061 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44062 }
44063 }
44064
44065 TEST(F32_IGEMM_1X8S4__PSIMD, k_lt_4_subtile) {
44066 TEST_REQUIRES_PSIMD;
44067 for (size_t k = 1; k < 4; k++) {
44068 for (uint32_t m = 1; m <= 1; m++) {
44069 for (uint32_t n = 1; n <= 8; n++) {
44070 GemmMicrokernelTester()
44071 .mr(1)
44072 .nr(8)
44073 .kr(1)
44074 .sr(4)
44075 .m(m)
44076 .n(n)
44077 .k(k)
44078 .iterations(1)
44079 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44080 }
44081 }
44082 }
44083 }
44084
44085 TEST(F32_IGEMM_1X8S4__PSIMD, k_gt_4) {
44086 TEST_REQUIRES_PSIMD;
44087 for (size_t k = 5; k < 8; k++) {
44088 GemmMicrokernelTester()
44089 .mr(1)
44090 .nr(8)
44091 .kr(1)
44092 .sr(4)
44093 .m(1)
44094 .n(8)
44095 .k(k)
44096 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44097 }
44098 }
44099
44100 TEST(F32_IGEMM_1X8S4__PSIMD, k_gt_4_subtile) {
44101 TEST_REQUIRES_PSIMD;
44102 for (size_t k = 5; k < 8; k++) {
44103 for (uint32_t m = 1; m <= 1; m++) {
44104 for (uint32_t n = 1; n <= 8; n++) {
44105 GemmMicrokernelTester()
44106 .mr(1)
44107 .nr(8)
44108 .kr(1)
44109 .sr(4)
44110 .m(m)
44111 .n(n)
44112 .k(k)
44113 .iterations(1)
44114 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44115 }
44116 }
44117 }
44118 }
44119
44120 TEST(F32_IGEMM_1X8S4__PSIMD, k_div_4) {
44121 TEST_REQUIRES_PSIMD;
44122 for (size_t k = 8; k <= 40; k += 4) {
44123 GemmMicrokernelTester()
44124 .mr(1)
44125 .nr(8)
44126 .kr(1)
44127 .sr(4)
44128 .m(1)
44129 .n(8)
44130 .k(k)
44131 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44132 }
44133 }
44134
44135 TEST(F32_IGEMM_1X8S4__PSIMD, k_div_4_subtile) {
44136 TEST_REQUIRES_PSIMD;
44137 for (size_t k = 8; k <= 40; k += 4) {
44138 for (uint32_t m = 1; m <= 1; m++) {
44139 for (uint32_t n = 1; n <= 8; n++) {
44140 GemmMicrokernelTester()
44141 .mr(1)
44142 .nr(8)
44143 .kr(1)
44144 .sr(4)
44145 .m(m)
44146 .n(n)
44147 .k(k)
44148 .iterations(1)
44149 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44150 }
44151 }
44152 }
44153 }
44154
44155 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8) {
44156 TEST_REQUIRES_PSIMD;
44157 for (uint32_t n = 9; n < 16; n++) {
44158 for (size_t k = 1; k <= 20; k += 5) {
44159 GemmMicrokernelTester()
44160 .mr(1)
44161 .nr(8)
44162 .kr(1)
44163 .sr(4)
44164 .m(1)
44165 .n(8)
44166 .k(k)
44167 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44168 }
44169 }
44170 }
44171
44172 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8_strided_cn) {
44173 TEST_REQUIRES_PSIMD;
44174 for (uint32_t n = 9; n < 16; n++) {
44175 for (size_t k = 1; k <= 20; k += 5) {
44176 GemmMicrokernelTester()
44177 .mr(1)
44178 .nr(8)
44179 .kr(1)
44180 .sr(4)
44181 .m(1)
44182 .n(8)
44183 .k(k)
44184 .cn_stride(11)
44185 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44186 }
44187 }
44188 }
44189
44190 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8_subtile) {
44191 TEST_REQUIRES_PSIMD;
44192 for (uint32_t n = 9; n < 16; n++) {
44193 for (size_t k = 1; k <= 20; k += 5) {
44194 for (uint32_t m = 1; m <= 1; m++) {
44195 GemmMicrokernelTester()
44196 .mr(1)
44197 .nr(8)
44198 .kr(1)
44199 .sr(4)
44200 .m(m)
44201 .n(n)
44202 .k(k)
44203 .iterations(1)
44204 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44205 }
44206 }
44207 }
44208 }
44209
44210 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8) {
44211 TEST_REQUIRES_PSIMD;
44212 for (uint32_t n = 16; n <= 24; n += 8) {
44213 for (size_t k = 1; k <= 20; k += 5) {
44214 GemmMicrokernelTester()
44215 .mr(1)
44216 .nr(8)
44217 .kr(1)
44218 .sr(4)
44219 .m(1)
44220 .n(8)
44221 .k(k)
44222 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44223 }
44224 }
44225 }
44226
44227 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8_strided_cn) {
44228 TEST_REQUIRES_PSIMD;
44229 for (uint32_t n = 16; n <= 24; n += 8) {
44230 for (size_t k = 1; k <= 20; k += 5) {
44231 GemmMicrokernelTester()
44232 .mr(1)
44233 .nr(8)
44234 .kr(1)
44235 .sr(4)
44236 .m(1)
44237 .n(n)
44238 .k(k)
44239 .cn_stride(11)
44240 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44241 }
44242 }
44243 }
44244
44245 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8_subtile) {
44246 TEST_REQUIRES_PSIMD;
44247 for (uint32_t n = 16; n <= 24; n += 8) {
44248 for (size_t k = 1; k <= 20; k += 5) {
44249 for (uint32_t m = 1; m <= 1; m++) {
44250 GemmMicrokernelTester()
44251 .mr(1)
44252 .nr(8)
44253 .kr(1)
44254 .sr(4)
44255 .m(m)
44256 .n(n)
44257 .k(k)
44258 .iterations(1)
44259 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44260 }
44261 }
44262 }
44263 }
44264
44265 TEST(F32_IGEMM_1X8S4__PSIMD, small_kernel) {
44266 TEST_REQUIRES_PSIMD;
44267 for (size_t k = 1; k <= 20; k += 5) {
44268 GemmMicrokernelTester()
44269 .mr(1)
44270 .nr(8)
44271 .kr(1)
44272 .sr(4)
44273 .m(1)
44274 .n(8)
44275 .k(k)
44276 .ks(3)
44277 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44278 }
44279 }
44280
44281 TEST(F32_IGEMM_1X8S4__PSIMD, small_kernel_subtile) {
44282 TEST_REQUIRES_PSIMD;
44283 for (size_t k = 1; k <= 20; k += 5) {
44284 for (uint32_t m = 1; m <= 1; m++) {
44285 for (uint32_t n = 1; n <= 8; n++) {
44286 GemmMicrokernelTester()
44287 .mr(1)
44288 .nr(8)
44289 .kr(1)
44290 .sr(4)
44291 .m(m)
44292 .n(n)
44293 .k(k)
44294 .ks(3)
44295 .iterations(1)
44296 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44297 }
44298 }
44299 }
44300 }
44301
44302 TEST(F32_IGEMM_1X8S4__PSIMD, n_gt_8_small_kernel) {
44303 TEST_REQUIRES_PSIMD;
44304 for (uint32_t n = 9; n < 16; n++) {
44305 for (size_t k = 1; k <= 20; k += 5) {
44306 GemmMicrokernelTester()
44307 .mr(1)
44308 .nr(8)
44309 .kr(1)
44310 .sr(4)
44311 .m(1)
44312 .n(8)
44313 .k(k)
44314 .ks(3)
44315 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44316 }
44317 }
44318 }
44319
44320 TEST(F32_IGEMM_1X8S4__PSIMD, n_div_8_small_kernel) {
44321 TEST_REQUIRES_PSIMD;
44322 for (uint32_t n = 16; n <= 24; n += 8) {
44323 for (size_t k = 1; k <= 20; k += 5) {
44324 GemmMicrokernelTester()
44325 .mr(1)
44326 .nr(8)
44327 .kr(1)
44328 .sr(4)
44329 .m(1)
44330 .n(8)
44331 .k(k)
44332 .ks(3)
44333 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44334 }
44335 }
44336 }
44337
44338 TEST(F32_IGEMM_1X8S4__PSIMD, strided_cm_subtile) {
44339 TEST_REQUIRES_PSIMD;
44340 for (size_t k = 1; k <= 20; k += 5) {
44341 for (uint32_t m = 1; m <= 1; m++) {
44342 for (uint32_t n = 1; n <= 8; n++) {
44343 GemmMicrokernelTester()
44344 .mr(1)
44345 .nr(8)
44346 .kr(1)
44347 .sr(4)
44348 .m(m)
44349 .n(n)
44350 .k(k)
44351 .cm_stride(11)
44352 .iterations(1)
44353 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44354 }
44355 }
44356 }
44357 }
44358
44359 TEST(F32_IGEMM_1X8S4__PSIMD, a_offset) {
44360 TEST_REQUIRES_PSIMD;
44361 for (size_t k = 1; k <= 20; k += 5) {
44362 GemmMicrokernelTester()
44363 .mr(1)
44364 .nr(8)
44365 .kr(1)
44366 .sr(4)
44367 .m(1)
44368 .n(8)
44369 .k(k)
44370 .ks(3)
44371 .a_offset(23)
44372 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44373 }
44374 }
44375
44376 TEST(F32_IGEMM_1X8S4__PSIMD, zero) {
44377 TEST_REQUIRES_PSIMD;
44378 for (uint32_t mz = 0; mz < 1; mz++) {
44379 for (size_t k = 1; k <= 20; k += 5) {
44380 GemmMicrokernelTester()
44381 .mr(1)
44382 .nr(8)
44383 .kr(1)
44384 .sr(4)
44385 .m(1)
44386 .n(8)
44387 .k(k)
44388 .ks(3)
44389 .a_offset(23)
44390 .zero_index(mz)
44391 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44392 }
44393 }
44394 }
44395
44396 TEST(F32_IGEMM_1X8S4__PSIMD, qmin) {
44397 TEST_REQUIRES_PSIMD;
44398 GemmMicrokernelTester()
44399 .mr(1)
44400 .nr(8)
44401 .kr(1)
44402 .sr(4)
44403 .m(1)
44404 .n(8)
44405 .k(4)
44406 .qmin(128)
44407 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44408 }
44409
44410 TEST(F32_IGEMM_1X8S4__PSIMD, qmax) {
44411 TEST_REQUIRES_PSIMD;
44412 GemmMicrokernelTester()
44413 .mr(1)
44414 .nr(8)
44415 .kr(1)
44416 .sr(4)
44417 .m(1)
44418 .n(8)
44419 .k(4)
44420 .qmax(128)
44421 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44422 }
44423
44424 TEST(F32_IGEMM_1X8S4__PSIMD, strided_cm) {
44425 TEST_REQUIRES_PSIMD;
44426 GemmMicrokernelTester()
44427 .mr(1)
44428 .nr(8)
44429 .kr(1)
44430 .sr(4)
44431 .m(1)
44432 .n(8)
44433 .k(4)
44434 .cm_stride(11)
44435 .Test(xnn_f32_igemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44436 }
44437#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44438
44439
44440#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44441 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4) {
44442 TEST_REQUIRES_PSIMD;
44443 GemmMicrokernelTester()
44444 .mr(4)
44445 .nr(8)
44446 .kr(1)
44447 .sr(4)
44448 .m(4)
44449 .n(8)
44450 .k(4)
44451 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44452 }
44453
44454 TEST(F32_IGEMM_4X8S4__PSIMD, strided_cn) {
44455 TEST_REQUIRES_PSIMD;
44456 GemmMicrokernelTester()
44457 .mr(4)
44458 .nr(8)
44459 .kr(1)
44460 .sr(4)
44461 .m(4)
44462 .n(8)
44463 .k(4)
44464 .cn_stride(11)
44465 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44466 }
44467
44468 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4_subtile) {
44469 TEST_REQUIRES_PSIMD;
44470 for (uint32_t m = 1; m <= 4; m++) {
44471 for (uint32_t n = 1; n <= 8; n++) {
44472 GemmMicrokernelTester()
44473 .mr(4)
44474 .nr(8)
44475 .kr(1)
44476 .sr(4)
44477 .m(m)
44478 .n(n)
44479 .k(4)
44480 .iterations(1)
44481 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44482 }
44483 }
44484 }
44485
44486 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4_subtile_m) {
44487 TEST_REQUIRES_PSIMD;
44488 for (uint32_t m = 1; m <= 4; m++) {
44489 GemmMicrokernelTester()
44490 .mr(4)
44491 .nr(8)
44492 .kr(1)
44493 .sr(4)
44494 .m(m)
44495 .n(8)
44496 .k(4)
44497 .iterations(1)
44498 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44499 }
44500 }
44501
44502 TEST(F32_IGEMM_4X8S4__PSIMD, k_eq_4_subtile_n) {
44503 TEST_REQUIRES_PSIMD;
44504 for (uint32_t n = 1; n <= 8; n++) {
44505 GemmMicrokernelTester()
44506 .mr(4)
44507 .nr(8)
44508 .kr(1)
44509 .sr(4)
44510 .m(4)
44511 .n(n)
44512 .k(4)
44513 .iterations(1)
44514 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44515 }
44516 }
44517
44518 TEST(F32_IGEMM_4X8S4__PSIMD, k_lt_4) {
44519 TEST_REQUIRES_PSIMD;
44520 for (size_t k = 1; k < 4; k++) {
44521 GemmMicrokernelTester()
44522 .mr(4)
44523 .nr(8)
44524 .kr(1)
44525 .sr(4)
44526 .m(4)
44527 .n(8)
44528 .k(k)
44529 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44530 }
44531 }
44532
44533 TEST(F32_IGEMM_4X8S4__PSIMD, k_lt_4_subtile) {
44534 TEST_REQUIRES_PSIMD;
44535 for (size_t k = 1; k < 4; k++) {
44536 for (uint32_t m = 1; m <= 4; m++) {
44537 for (uint32_t n = 1; n <= 8; n++) {
44538 GemmMicrokernelTester()
44539 .mr(4)
44540 .nr(8)
44541 .kr(1)
44542 .sr(4)
44543 .m(m)
44544 .n(n)
44545 .k(k)
44546 .iterations(1)
44547 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44548 }
44549 }
44550 }
44551 }
44552
44553 TEST(F32_IGEMM_4X8S4__PSIMD, k_gt_4) {
44554 TEST_REQUIRES_PSIMD;
44555 for (size_t k = 5; k < 8; k++) {
44556 GemmMicrokernelTester()
44557 .mr(4)
44558 .nr(8)
44559 .kr(1)
44560 .sr(4)
44561 .m(4)
44562 .n(8)
44563 .k(k)
44564 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44565 }
44566 }
44567
44568 TEST(F32_IGEMM_4X8S4__PSIMD, k_gt_4_subtile) {
44569 TEST_REQUIRES_PSIMD;
44570 for (size_t k = 5; k < 8; k++) {
44571 for (uint32_t m = 1; m <= 4; m++) {
44572 for (uint32_t n = 1; n <= 8; n++) {
44573 GemmMicrokernelTester()
44574 .mr(4)
44575 .nr(8)
44576 .kr(1)
44577 .sr(4)
44578 .m(m)
44579 .n(n)
44580 .k(k)
44581 .iterations(1)
44582 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44583 }
44584 }
44585 }
44586 }
44587
44588 TEST(F32_IGEMM_4X8S4__PSIMD, k_div_4) {
44589 TEST_REQUIRES_PSIMD;
44590 for (size_t k = 8; k <= 40; k += 4) {
44591 GemmMicrokernelTester()
44592 .mr(4)
44593 .nr(8)
44594 .kr(1)
44595 .sr(4)
44596 .m(4)
44597 .n(8)
44598 .k(k)
44599 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44600 }
44601 }
44602
44603 TEST(F32_IGEMM_4X8S4__PSIMD, k_div_4_subtile) {
44604 TEST_REQUIRES_PSIMD;
44605 for (size_t k = 8; k <= 40; k += 4) {
44606 for (uint32_t m = 1; m <= 4; m++) {
44607 for (uint32_t n = 1; n <= 8; n++) {
44608 GemmMicrokernelTester()
44609 .mr(4)
44610 .nr(8)
44611 .kr(1)
44612 .sr(4)
44613 .m(m)
44614 .n(n)
44615 .k(k)
44616 .iterations(1)
44617 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44618 }
44619 }
44620 }
44621 }
44622
44623 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8) {
44624 TEST_REQUIRES_PSIMD;
44625 for (uint32_t n = 9; n < 16; n++) {
44626 for (size_t k = 1; k <= 20; k += 5) {
44627 GemmMicrokernelTester()
44628 .mr(4)
44629 .nr(8)
44630 .kr(1)
44631 .sr(4)
44632 .m(4)
44633 .n(8)
44634 .k(k)
44635 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44636 }
44637 }
44638 }
44639
44640 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8_strided_cn) {
44641 TEST_REQUIRES_PSIMD;
44642 for (uint32_t n = 9; n < 16; n++) {
44643 for (size_t k = 1; k <= 20; k += 5) {
44644 GemmMicrokernelTester()
44645 .mr(4)
44646 .nr(8)
44647 .kr(1)
44648 .sr(4)
44649 .m(4)
44650 .n(8)
44651 .k(k)
44652 .cn_stride(11)
44653 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44654 }
44655 }
44656 }
44657
44658 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8_subtile) {
44659 TEST_REQUIRES_PSIMD;
44660 for (uint32_t n = 9; n < 16; n++) {
44661 for (size_t k = 1; k <= 20; k += 5) {
44662 for (uint32_t m = 1; m <= 4; m++) {
44663 GemmMicrokernelTester()
44664 .mr(4)
44665 .nr(8)
44666 .kr(1)
44667 .sr(4)
44668 .m(m)
44669 .n(n)
44670 .k(k)
44671 .iterations(1)
44672 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44673 }
44674 }
44675 }
44676 }
44677
44678 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8) {
44679 TEST_REQUIRES_PSIMD;
44680 for (uint32_t n = 16; n <= 24; n += 8) {
44681 for (size_t k = 1; k <= 20; k += 5) {
44682 GemmMicrokernelTester()
44683 .mr(4)
44684 .nr(8)
44685 .kr(1)
44686 .sr(4)
44687 .m(4)
44688 .n(8)
44689 .k(k)
44690 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44691 }
44692 }
44693 }
44694
44695 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8_strided_cn) {
44696 TEST_REQUIRES_PSIMD;
44697 for (uint32_t n = 16; n <= 24; n += 8) {
44698 for (size_t k = 1; k <= 20; k += 5) {
44699 GemmMicrokernelTester()
44700 .mr(4)
44701 .nr(8)
44702 .kr(1)
44703 .sr(4)
44704 .m(4)
44705 .n(n)
44706 .k(k)
44707 .cn_stride(11)
44708 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44709 }
44710 }
44711 }
44712
44713 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8_subtile) {
44714 TEST_REQUIRES_PSIMD;
44715 for (uint32_t n = 16; n <= 24; n += 8) {
44716 for (size_t k = 1; k <= 20; k += 5) {
44717 for (uint32_t m = 1; m <= 4; m++) {
44718 GemmMicrokernelTester()
44719 .mr(4)
44720 .nr(8)
44721 .kr(1)
44722 .sr(4)
44723 .m(m)
44724 .n(n)
44725 .k(k)
44726 .iterations(1)
44727 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44728 }
44729 }
44730 }
44731 }
44732
44733 TEST(F32_IGEMM_4X8S4__PSIMD, small_kernel) {
44734 TEST_REQUIRES_PSIMD;
44735 for (size_t k = 1; k <= 20; k += 5) {
44736 GemmMicrokernelTester()
44737 .mr(4)
44738 .nr(8)
44739 .kr(1)
44740 .sr(4)
44741 .m(4)
44742 .n(8)
44743 .k(k)
44744 .ks(3)
44745 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44746 }
44747 }
44748
44749 TEST(F32_IGEMM_4X8S4__PSIMD, small_kernel_subtile) {
44750 TEST_REQUIRES_PSIMD;
44751 for (size_t k = 1; k <= 20; k += 5) {
44752 for (uint32_t m = 1; m <= 4; m++) {
44753 for (uint32_t n = 1; n <= 8; n++) {
44754 GemmMicrokernelTester()
44755 .mr(4)
44756 .nr(8)
44757 .kr(1)
44758 .sr(4)
44759 .m(m)
44760 .n(n)
44761 .k(k)
44762 .ks(3)
44763 .iterations(1)
44764 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44765 }
44766 }
44767 }
44768 }
44769
44770 TEST(F32_IGEMM_4X8S4__PSIMD, n_gt_8_small_kernel) {
44771 TEST_REQUIRES_PSIMD;
44772 for (uint32_t n = 9; n < 16; n++) {
44773 for (size_t k = 1; k <= 20; k += 5) {
44774 GemmMicrokernelTester()
44775 .mr(4)
44776 .nr(8)
44777 .kr(1)
44778 .sr(4)
44779 .m(4)
44780 .n(8)
44781 .k(k)
44782 .ks(3)
44783 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44784 }
44785 }
44786 }
44787
44788 TEST(F32_IGEMM_4X8S4__PSIMD, n_div_8_small_kernel) {
44789 TEST_REQUIRES_PSIMD;
44790 for (uint32_t n = 16; n <= 24; n += 8) {
44791 for (size_t k = 1; k <= 20; k += 5) {
44792 GemmMicrokernelTester()
44793 .mr(4)
44794 .nr(8)
44795 .kr(1)
44796 .sr(4)
44797 .m(4)
44798 .n(8)
44799 .k(k)
44800 .ks(3)
44801 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44802 }
44803 }
44804 }
44805
44806 TEST(F32_IGEMM_4X8S4__PSIMD, strided_cm_subtile) {
44807 TEST_REQUIRES_PSIMD;
44808 for (size_t k = 1; k <= 20; k += 5) {
44809 for (uint32_t m = 1; m <= 4; m++) {
44810 for (uint32_t n = 1; n <= 8; n++) {
44811 GemmMicrokernelTester()
44812 .mr(4)
44813 .nr(8)
44814 .kr(1)
44815 .sr(4)
44816 .m(m)
44817 .n(n)
44818 .k(k)
44819 .cm_stride(11)
44820 .iterations(1)
44821 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44822 }
44823 }
44824 }
44825 }
44826
44827 TEST(F32_IGEMM_4X8S4__PSIMD, a_offset) {
44828 TEST_REQUIRES_PSIMD;
44829 for (size_t k = 1; k <= 20; k += 5) {
44830 GemmMicrokernelTester()
44831 .mr(4)
44832 .nr(8)
44833 .kr(1)
44834 .sr(4)
44835 .m(4)
44836 .n(8)
44837 .k(k)
44838 .ks(3)
44839 .a_offset(83)
44840 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44841 }
44842 }
44843
44844 TEST(F32_IGEMM_4X8S4__PSIMD, zero) {
44845 TEST_REQUIRES_PSIMD;
44846 for (uint32_t mz = 0; mz < 4; mz++) {
44847 for (size_t k = 1; k <= 20; k += 5) {
44848 GemmMicrokernelTester()
44849 .mr(4)
44850 .nr(8)
44851 .kr(1)
44852 .sr(4)
44853 .m(4)
44854 .n(8)
44855 .k(k)
44856 .ks(3)
44857 .a_offset(83)
44858 .zero_index(mz)
44859 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44860 }
44861 }
44862 }
44863
44864 TEST(F32_IGEMM_4X8S4__PSIMD, qmin) {
44865 TEST_REQUIRES_PSIMD;
44866 GemmMicrokernelTester()
44867 .mr(4)
44868 .nr(8)
44869 .kr(1)
44870 .sr(4)
44871 .m(4)
44872 .n(8)
44873 .k(4)
44874 .qmin(128)
44875 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44876 }
44877
44878 TEST(F32_IGEMM_4X8S4__PSIMD, qmax) {
44879 TEST_REQUIRES_PSIMD;
44880 GemmMicrokernelTester()
44881 .mr(4)
44882 .nr(8)
44883 .kr(1)
44884 .sr(4)
44885 .m(4)
44886 .n(8)
44887 .k(4)
44888 .qmax(128)
44889 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44890 }
44891
44892 TEST(F32_IGEMM_4X8S4__PSIMD, strided_cm) {
44893 TEST_REQUIRES_PSIMD;
44894 GemmMicrokernelTester()
44895 .mr(4)
44896 .nr(8)
44897 .kr(1)
44898 .sr(4)
44899 .m(4)
44900 .n(8)
44901 .k(4)
44902 .cm_stride(11)
44903 .Test(xnn_f32_igemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44904 }
44905#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44906
44907
44908#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
44909 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4) {
44910 TEST_REQUIRES_PSIMD;
44911 GemmMicrokernelTester()
44912 .mr(6)
44913 .nr(8)
44914 .kr(1)
44915 .sr(4)
44916 .m(6)
44917 .n(8)
44918 .k(4)
44919 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44920 }
44921
44922 TEST(F32_IGEMM_6X8S4__PSIMD, strided_cn) {
44923 TEST_REQUIRES_PSIMD;
44924 GemmMicrokernelTester()
44925 .mr(6)
44926 .nr(8)
44927 .kr(1)
44928 .sr(4)
44929 .m(6)
44930 .n(8)
44931 .k(4)
44932 .cn_stride(11)
44933 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44934 }
44935
44936 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4_subtile) {
44937 TEST_REQUIRES_PSIMD;
44938 for (uint32_t m = 1; m <= 6; m++) {
44939 for (uint32_t n = 1; n <= 8; n++) {
44940 GemmMicrokernelTester()
44941 .mr(6)
44942 .nr(8)
44943 .kr(1)
44944 .sr(4)
44945 .m(m)
44946 .n(n)
44947 .k(4)
44948 .iterations(1)
44949 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44950 }
44951 }
44952 }
44953
44954 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4_subtile_m) {
44955 TEST_REQUIRES_PSIMD;
44956 for (uint32_t m = 1; m <= 6; m++) {
44957 GemmMicrokernelTester()
44958 .mr(6)
44959 .nr(8)
44960 .kr(1)
44961 .sr(4)
44962 .m(m)
44963 .n(8)
44964 .k(4)
44965 .iterations(1)
44966 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44967 }
44968 }
44969
44970 TEST(F32_IGEMM_6X8S4__PSIMD, k_eq_4_subtile_n) {
44971 TEST_REQUIRES_PSIMD;
44972 for (uint32_t n = 1; n <= 8; n++) {
44973 GemmMicrokernelTester()
44974 .mr(6)
44975 .nr(8)
44976 .kr(1)
44977 .sr(4)
44978 .m(6)
44979 .n(n)
44980 .k(4)
44981 .iterations(1)
44982 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44983 }
44984 }
44985
44986 TEST(F32_IGEMM_6X8S4__PSIMD, k_lt_4) {
44987 TEST_REQUIRES_PSIMD;
44988 for (size_t k = 1; k < 4; k++) {
44989 GemmMicrokernelTester()
44990 .mr(6)
44991 .nr(8)
44992 .kr(1)
44993 .sr(4)
44994 .m(6)
44995 .n(8)
44996 .k(k)
44997 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
44998 }
44999 }
45000
45001 TEST(F32_IGEMM_6X8S4__PSIMD, k_lt_4_subtile) {
45002 TEST_REQUIRES_PSIMD;
45003 for (size_t k = 1; k < 4; k++) {
45004 for (uint32_t m = 1; m <= 6; m++) {
45005 for (uint32_t n = 1; n <= 8; n++) {
45006 GemmMicrokernelTester()
45007 .mr(6)
45008 .nr(8)
45009 .kr(1)
45010 .sr(4)
45011 .m(m)
45012 .n(n)
45013 .k(k)
45014 .iterations(1)
45015 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45016 }
45017 }
45018 }
45019 }
45020
45021 TEST(F32_IGEMM_6X8S4__PSIMD, k_gt_4) {
45022 TEST_REQUIRES_PSIMD;
45023 for (size_t k = 5; k < 8; k++) {
45024 GemmMicrokernelTester()
45025 .mr(6)
45026 .nr(8)
45027 .kr(1)
45028 .sr(4)
45029 .m(6)
45030 .n(8)
45031 .k(k)
45032 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45033 }
45034 }
45035
45036 TEST(F32_IGEMM_6X8S4__PSIMD, k_gt_4_subtile) {
45037 TEST_REQUIRES_PSIMD;
45038 for (size_t k = 5; k < 8; k++) {
45039 for (uint32_t m = 1; m <= 6; m++) {
45040 for (uint32_t n = 1; n <= 8; n++) {
45041 GemmMicrokernelTester()
45042 .mr(6)
45043 .nr(8)
45044 .kr(1)
45045 .sr(4)
45046 .m(m)
45047 .n(n)
45048 .k(k)
45049 .iterations(1)
45050 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45051 }
45052 }
45053 }
45054 }
45055
45056 TEST(F32_IGEMM_6X8S4__PSIMD, k_div_4) {
45057 TEST_REQUIRES_PSIMD;
45058 for (size_t k = 8; k <= 40; k += 4) {
45059 GemmMicrokernelTester()
45060 .mr(6)
45061 .nr(8)
45062 .kr(1)
45063 .sr(4)
45064 .m(6)
45065 .n(8)
45066 .k(k)
45067 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45068 }
45069 }
45070
45071 TEST(F32_IGEMM_6X8S4__PSIMD, k_div_4_subtile) {
45072 TEST_REQUIRES_PSIMD;
45073 for (size_t k = 8; k <= 40; k += 4) {
45074 for (uint32_t m = 1; m <= 6; m++) {
45075 for (uint32_t n = 1; n <= 8; n++) {
45076 GemmMicrokernelTester()
45077 .mr(6)
45078 .nr(8)
45079 .kr(1)
45080 .sr(4)
45081 .m(m)
45082 .n(n)
45083 .k(k)
45084 .iterations(1)
45085 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45086 }
45087 }
45088 }
45089 }
45090
45091 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8) {
45092 TEST_REQUIRES_PSIMD;
45093 for (uint32_t n = 9; n < 16; n++) {
45094 for (size_t k = 1; k <= 20; k += 5) {
45095 GemmMicrokernelTester()
45096 .mr(6)
45097 .nr(8)
45098 .kr(1)
45099 .sr(4)
45100 .m(6)
45101 .n(8)
45102 .k(k)
45103 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45104 }
45105 }
45106 }
45107
45108 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8_strided_cn) {
45109 TEST_REQUIRES_PSIMD;
45110 for (uint32_t n = 9; n < 16; n++) {
45111 for (size_t k = 1; k <= 20; k += 5) {
45112 GemmMicrokernelTester()
45113 .mr(6)
45114 .nr(8)
45115 .kr(1)
45116 .sr(4)
45117 .m(6)
45118 .n(8)
45119 .k(k)
45120 .cn_stride(11)
45121 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45122 }
45123 }
45124 }
45125
45126 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8_subtile) {
45127 TEST_REQUIRES_PSIMD;
45128 for (uint32_t n = 9; n < 16; n++) {
45129 for (size_t k = 1; k <= 20; k += 5) {
45130 for (uint32_t m = 1; m <= 6; m++) {
45131 GemmMicrokernelTester()
45132 .mr(6)
45133 .nr(8)
45134 .kr(1)
45135 .sr(4)
45136 .m(m)
45137 .n(n)
45138 .k(k)
45139 .iterations(1)
45140 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45141 }
45142 }
45143 }
45144 }
45145
45146 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8) {
45147 TEST_REQUIRES_PSIMD;
45148 for (uint32_t n = 16; n <= 24; n += 8) {
45149 for (size_t k = 1; k <= 20; k += 5) {
45150 GemmMicrokernelTester()
45151 .mr(6)
45152 .nr(8)
45153 .kr(1)
45154 .sr(4)
45155 .m(6)
45156 .n(8)
45157 .k(k)
45158 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45159 }
45160 }
45161 }
45162
45163 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8_strided_cn) {
45164 TEST_REQUIRES_PSIMD;
45165 for (uint32_t n = 16; n <= 24; n += 8) {
45166 for (size_t k = 1; k <= 20; k += 5) {
45167 GemmMicrokernelTester()
45168 .mr(6)
45169 .nr(8)
45170 .kr(1)
45171 .sr(4)
45172 .m(6)
45173 .n(n)
45174 .k(k)
45175 .cn_stride(11)
45176 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45177 }
45178 }
45179 }
45180
45181 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8_subtile) {
45182 TEST_REQUIRES_PSIMD;
45183 for (uint32_t n = 16; n <= 24; n += 8) {
45184 for (size_t k = 1; k <= 20; k += 5) {
45185 for (uint32_t m = 1; m <= 6; m++) {
45186 GemmMicrokernelTester()
45187 .mr(6)
45188 .nr(8)
45189 .kr(1)
45190 .sr(4)
45191 .m(m)
45192 .n(n)
45193 .k(k)
45194 .iterations(1)
45195 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45196 }
45197 }
45198 }
45199 }
45200
45201 TEST(F32_IGEMM_6X8S4__PSIMD, small_kernel) {
45202 TEST_REQUIRES_PSIMD;
45203 for (size_t k = 1; k <= 20; k += 5) {
45204 GemmMicrokernelTester()
45205 .mr(6)
45206 .nr(8)
45207 .kr(1)
45208 .sr(4)
45209 .m(6)
45210 .n(8)
45211 .k(k)
45212 .ks(3)
45213 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45214 }
45215 }
45216
45217 TEST(F32_IGEMM_6X8S4__PSIMD, small_kernel_subtile) {
45218 TEST_REQUIRES_PSIMD;
45219 for (size_t k = 1; k <= 20; k += 5) {
45220 for (uint32_t m = 1; m <= 6; m++) {
45221 for (uint32_t n = 1; n <= 8; n++) {
45222 GemmMicrokernelTester()
45223 .mr(6)
45224 .nr(8)
45225 .kr(1)
45226 .sr(4)
45227 .m(m)
45228 .n(n)
45229 .k(k)
45230 .ks(3)
45231 .iterations(1)
45232 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45233 }
45234 }
45235 }
45236 }
45237
45238 TEST(F32_IGEMM_6X8S4__PSIMD, n_gt_8_small_kernel) {
45239 TEST_REQUIRES_PSIMD;
45240 for (uint32_t n = 9; n < 16; n++) {
45241 for (size_t k = 1; k <= 20; k += 5) {
45242 GemmMicrokernelTester()
45243 .mr(6)
45244 .nr(8)
45245 .kr(1)
45246 .sr(4)
45247 .m(6)
45248 .n(8)
45249 .k(k)
45250 .ks(3)
45251 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45252 }
45253 }
45254 }
45255
45256 TEST(F32_IGEMM_6X8S4__PSIMD, n_div_8_small_kernel) {
45257 TEST_REQUIRES_PSIMD;
45258 for (uint32_t n = 16; n <= 24; n += 8) {
45259 for (size_t k = 1; k <= 20; k += 5) {
45260 GemmMicrokernelTester()
45261 .mr(6)
45262 .nr(8)
45263 .kr(1)
45264 .sr(4)
45265 .m(6)
45266 .n(8)
45267 .k(k)
45268 .ks(3)
45269 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45270 }
45271 }
45272 }
45273
45274 TEST(F32_IGEMM_6X8S4__PSIMD, strided_cm_subtile) {
45275 TEST_REQUIRES_PSIMD;
45276 for (size_t k = 1; k <= 20; k += 5) {
45277 for (uint32_t m = 1; m <= 6; m++) {
45278 for (uint32_t n = 1; n <= 8; n++) {
45279 GemmMicrokernelTester()
45280 .mr(6)
45281 .nr(8)
45282 .kr(1)
45283 .sr(4)
45284 .m(m)
45285 .n(n)
45286 .k(k)
45287 .cm_stride(11)
45288 .iterations(1)
45289 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45290 }
45291 }
45292 }
45293 }
45294
45295 TEST(F32_IGEMM_6X8S4__PSIMD, a_offset) {
45296 TEST_REQUIRES_PSIMD;
45297 for (size_t k = 1; k <= 20; k += 5) {
45298 GemmMicrokernelTester()
45299 .mr(6)
45300 .nr(8)
45301 .kr(1)
45302 .sr(4)
45303 .m(6)
45304 .n(8)
45305 .k(k)
45306 .ks(3)
45307 .a_offset(127)
45308 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45309 }
45310 }
45311
45312 TEST(F32_IGEMM_6X8S4__PSIMD, zero) {
45313 TEST_REQUIRES_PSIMD;
45314 for (uint32_t mz = 0; mz < 6; mz++) {
45315 for (size_t k = 1; k <= 20; k += 5) {
45316 GemmMicrokernelTester()
45317 .mr(6)
45318 .nr(8)
45319 .kr(1)
45320 .sr(4)
45321 .m(6)
45322 .n(8)
45323 .k(k)
45324 .ks(3)
45325 .a_offset(127)
45326 .zero_index(mz)
45327 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45328 }
45329 }
45330 }
45331
45332 TEST(F32_IGEMM_6X8S4__PSIMD, qmin) {
45333 TEST_REQUIRES_PSIMD;
45334 GemmMicrokernelTester()
45335 .mr(6)
45336 .nr(8)
45337 .kr(1)
45338 .sr(4)
45339 .m(6)
45340 .n(8)
45341 .k(4)
45342 .qmin(128)
45343 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45344 }
45345
45346 TEST(F32_IGEMM_6X8S4__PSIMD, qmax) {
45347 TEST_REQUIRES_PSIMD;
45348 GemmMicrokernelTester()
45349 .mr(6)
45350 .nr(8)
45351 .kr(1)
45352 .sr(4)
45353 .m(6)
45354 .n(8)
45355 .k(4)
45356 .qmax(128)
45357 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45358 }
45359
45360 TEST(F32_IGEMM_6X8S4__PSIMD, strided_cm) {
45361 TEST_REQUIRES_PSIMD;
45362 GemmMicrokernelTester()
45363 .mr(6)
45364 .nr(8)
45365 .kr(1)
45366 .sr(4)
45367 .m(6)
45368 .n(8)
45369 .k(4)
45370 .cm_stride(11)
45371 .Test(xnn_f32_igemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
45372 }
45373#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
45374
45375
45376#if XNN_ARCH_WASM
45377 TEST(F32_IGEMM_1X4__WASM, k_eq_1) {
45378 GemmMicrokernelTester()
45379 .mr(1)
45380 .nr(4)
45381 .kr(1)
45382 .sr(1)
45383 .m(1)
45384 .n(4)
45385 .k(1)
45386 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45387 }
45388
45389 TEST(F32_IGEMM_1X4__WASM, strided_cn) {
45390 GemmMicrokernelTester()
45391 .mr(1)
45392 .nr(4)
45393 .kr(1)
45394 .sr(1)
45395 .m(1)
45396 .n(4)
45397 .k(1)
45398 .cn_stride(7)
45399 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45400 }
45401
45402 TEST(F32_IGEMM_1X4__WASM, k_eq_1_subtile) {
45403 for (uint32_t m = 1; m <= 1; m++) {
45404 for (uint32_t n = 1; n <= 4; n++) {
45405 GemmMicrokernelTester()
45406 .mr(1)
45407 .nr(4)
45408 .kr(1)
45409 .sr(1)
45410 .m(m)
45411 .n(n)
45412 .k(1)
45413 .iterations(1)
45414 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45415 }
45416 }
45417 }
45418
45419 TEST(F32_IGEMM_1X4__WASM, k_eq_1_subtile_m) {
45420 for (uint32_t m = 1; m <= 1; m++) {
45421 GemmMicrokernelTester()
45422 .mr(1)
45423 .nr(4)
45424 .kr(1)
45425 .sr(1)
45426 .m(m)
45427 .n(4)
45428 .k(1)
45429 .iterations(1)
45430 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45431 }
45432 }
45433
45434 TEST(F32_IGEMM_1X4__WASM, k_eq_1_subtile_n) {
45435 for (uint32_t n = 1; n <= 4; n++) {
45436 GemmMicrokernelTester()
45437 .mr(1)
45438 .nr(4)
45439 .kr(1)
45440 .sr(1)
45441 .m(1)
45442 .n(n)
45443 .k(1)
45444 .iterations(1)
45445 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45446 }
45447 }
45448
45449 TEST(F32_IGEMM_1X4__WASM, k_gt_1) {
45450 for (size_t k = 2; k < 10; k++) {
45451 GemmMicrokernelTester()
45452 .mr(1)
45453 .nr(4)
45454 .kr(1)
45455 .sr(1)
45456 .m(1)
45457 .n(4)
45458 .k(k)
45459 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45460 }
45461 }
45462
45463 TEST(F32_IGEMM_1X4__WASM, k_gt_1_subtile) {
45464 for (size_t k = 2; k < 10; k++) {
45465 for (uint32_t m = 1; m <= 1; m++) {
45466 for (uint32_t n = 1; n <= 4; n++) {
45467 GemmMicrokernelTester()
45468 .mr(1)
45469 .nr(4)
45470 .kr(1)
45471 .sr(1)
45472 .m(m)
45473 .n(n)
45474 .k(k)
45475 .iterations(1)
45476 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45477 }
45478 }
45479 }
45480 }
45481
45482 TEST(F32_IGEMM_1X4__WASM, n_gt_4) {
45483 for (uint32_t n = 5; n < 8; n++) {
45484 for (size_t k = 1; k <= 5; k += 2) {
45485 GemmMicrokernelTester()
45486 .mr(1)
45487 .nr(4)
45488 .kr(1)
45489 .sr(1)
45490 .m(1)
45491 .n(4)
45492 .k(k)
45493 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45494 }
45495 }
45496 }
45497
45498 TEST(F32_IGEMM_1X4__WASM, n_gt_4_strided_cn) {
45499 for (uint32_t n = 5; n < 8; n++) {
45500 for (size_t k = 1; k <= 5; k += 2) {
45501 GemmMicrokernelTester()
45502 .mr(1)
45503 .nr(4)
45504 .kr(1)
45505 .sr(1)
45506 .m(1)
45507 .n(4)
45508 .k(k)
45509 .cn_stride(7)
45510 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45511 }
45512 }
45513 }
45514
45515 TEST(F32_IGEMM_1X4__WASM, n_gt_4_subtile) {
45516 for (uint32_t n = 5; n < 8; n++) {
45517 for (size_t k = 1; k <= 5; k += 2) {
45518 for (uint32_t m = 1; m <= 1; m++) {
45519 GemmMicrokernelTester()
45520 .mr(1)
45521 .nr(4)
45522 .kr(1)
45523 .sr(1)
45524 .m(m)
45525 .n(n)
45526 .k(k)
45527 .iterations(1)
45528 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45529 }
45530 }
45531 }
45532 }
45533
45534 TEST(F32_IGEMM_1X4__WASM, n_div_4) {
45535 for (uint32_t n = 8; n <= 12; n += 4) {
45536 for (size_t k = 1; k <= 5; k += 2) {
45537 GemmMicrokernelTester()
45538 .mr(1)
45539 .nr(4)
45540 .kr(1)
45541 .sr(1)
45542 .m(1)
45543 .n(4)
45544 .k(k)
45545 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45546 }
45547 }
45548 }
45549
45550 TEST(F32_IGEMM_1X4__WASM, n_div_4_strided_cn) {
45551 for (uint32_t n = 8; n <= 12; n += 4) {
45552 for (size_t k = 1; k <= 5; k += 2) {
45553 GemmMicrokernelTester()
45554 .mr(1)
45555 .nr(4)
45556 .kr(1)
45557 .sr(1)
45558 .m(1)
45559 .n(n)
45560 .k(k)
45561 .cn_stride(7)
45562 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45563 }
45564 }
45565 }
45566
45567 TEST(F32_IGEMM_1X4__WASM, n_div_4_subtile) {
45568 for (uint32_t n = 8; n <= 12; n += 4) {
45569 for (size_t k = 1; k <= 5; k += 2) {
45570 for (uint32_t m = 1; m <= 1; m++) {
45571 GemmMicrokernelTester()
45572 .mr(1)
45573 .nr(4)
45574 .kr(1)
45575 .sr(1)
45576 .m(m)
45577 .n(n)
45578 .k(k)
45579 .iterations(1)
45580 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45581 }
45582 }
45583 }
45584 }
45585
45586 TEST(F32_IGEMM_1X4__WASM, small_kernel) {
45587 for (size_t k = 1; k <= 5; k += 2) {
45588 GemmMicrokernelTester()
45589 .mr(1)
45590 .nr(4)
45591 .kr(1)
45592 .sr(1)
45593 .m(1)
45594 .n(4)
45595 .k(k)
45596 .ks(3)
45597 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45598 }
45599 }
45600
45601 TEST(F32_IGEMM_1X4__WASM, small_kernel_subtile) {
45602 for (size_t k = 1; k <= 5; k += 2) {
45603 for (uint32_t m = 1; m <= 1; m++) {
45604 for (uint32_t n = 1; n <= 4; n++) {
45605 GemmMicrokernelTester()
45606 .mr(1)
45607 .nr(4)
45608 .kr(1)
45609 .sr(1)
45610 .m(m)
45611 .n(n)
45612 .k(k)
45613 .ks(3)
45614 .iterations(1)
45615 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45616 }
45617 }
45618 }
45619 }
45620
45621 TEST(F32_IGEMM_1X4__WASM, n_gt_4_small_kernel) {
45622 for (uint32_t n = 5; n < 8; n++) {
45623 for (size_t k = 1; k <= 5; k += 2) {
45624 GemmMicrokernelTester()
45625 .mr(1)
45626 .nr(4)
45627 .kr(1)
45628 .sr(1)
45629 .m(1)
45630 .n(4)
45631 .k(k)
45632 .ks(3)
45633 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45634 }
45635 }
45636 }
45637
45638 TEST(F32_IGEMM_1X4__WASM, n_div_4_small_kernel) {
45639 for (uint32_t n = 8; n <= 12; n += 4) {
45640 for (size_t k = 1; k <= 5; k += 2) {
45641 GemmMicrokernelTester()
45642 .mr(1)
45643 .nr(4)
45644 .kr(1)
45645 .sr(1)
45646 .m(1)
45647 .n(4)
45648 .k(k)
45649 .ks(3)
45650 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45651 }
45652 }
45653 }
45654
45655 TEST(F32_IGEMM_1X4__WASM, strided_cm_subtile) {
45656 for (size_t k = 1; k <= 5; k += 2) {
45657 for (uint32_t m = 1; m <= 1; m++) {
45658 for (uint32_t n = 1; n <= 4; n++) {
45659 GemmMicrokernelTester()
45660 .mr(1)
45661 .nr(4)
45662 .kr(1)
45663 .sr(1)
45664 .m(m)
45665 .n(n)
45666 .k(k)
45667 .cm_stride(7)
45668 .iterations(1)
45669 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45670 }
45671 }
45672 }
45673 }
45674
45675 TEST(F32_IGEMM_1X4__WASM, a_offset) {
45676 for (size_t k = 1; k <= 5; k += 2) {
45677 GemmMicrokernelTester()
45678 .mr(1)
45679 .nr(4)
45680 .kr(1)
45681 .sr(1)
45682 .m(1)
45683 .n(4)
45684 .k(k)
45685 .ks(3)
45686 .a_offset(7)
45687 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45688 }
45689 }
45690
45691 TEST(F32_IGEMM_1X4__WASM, zero) {
45692 for (uint32_t mz = 0; mz < 1; mz++) {
45693 for (size_t k = 1; k <= 5; k += 2) {
45694 GemmMicrokernelTester()
45695 .mr(1)
45696 .nr(4)
45697 .kr(1)
45698 .sr(1)
45699 .m(1)
45700 .n(4)
45701 .k(k)
45702 .ks(3)
45703 .a_offset(7)
45704 .zero_index(mz)
45705 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45706 }
45707 }
45708 }
45709
45710 TEST(F32_IGEMM_1X4__WASM, qmin) {
45711 GemmMicrokernelTester()
45712 .mr(1)
45713 .nr(4)
45714 .kr(1)
45715 .sr(1)
45716 .m(1)
45717 .n(4)
45718 .k(1)
45719 .qmin(128)
45720 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45721 }
45722
45723 TEST(F32_IGEMM_1X4__WASM, qmax) {
45724 GemmMicrokernelTester()
45725 .mr(1)
45726 .nr(4)
45727 .kr(1)
45728 .sr(1)
45729 .m(1)
45730 .n(4)
45731 .k(1)
45732 .qmax(128)
45733 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45734 }
45735
45736 TEST(F32_IGEMM_1X4__WASM, strided_cm) {
45737 GemmMicrokernelTester()
45738 .mr(1)
45739 .nr(4)
45740 .kr(1)
45741 .sr(1)
45742 .m(1)
45743 .n(4)
45744 .k(1)
45745 .cm_stride(7)
45746 .Test(xnn_f32_igemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45747 }
45748#endif // XNN_ARCH_WASM
45749
45750
45751#if XNN_ARCH_WASM
45752 TEST(F32_IGEMM_2X4__WASM, k_eq_1) {
45753 GemmMicrokernelTester()
45754 .mr(2)
45755 .nr(4)
45756 .kr(1)
45757 .sr(1)
45758 .m(2)
45759 .n(4)
45760 .k(1)
45761 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45762 }
45763
45764 TEST(F32_IGEMM_2X4__WASM, strided_cn) {
45765 GemmMicrokernelTester()
45766 .mr(2)
45767 .nr(4)
45768 .kr(1)
45769 .sr(1)
45770 .m(2)
45771 .n(4)
45772 .k(1)
45773 .cn_stride(7)
45774 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45775 }
45776
45777 TEST(F32_IGEMM_2X4__WASM, k_eq_1_subtile) {
45778 for (uint32_t m = 1; m <= 2; m++) {
45779 for (uint32_t n = 1; n <= 4; n++) {
45780 GemmMicrokernelTester()
45781 .mr(2)
45782 .nr(4)
45783 .kr(1)
45784 .sr(1)
45785 .m(m)
45786 .n(n)
45787 .k(1)
45788 .iterations(1)
45789 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45790 }
45791 }
45792 }
45793
45794 TEST(F32_IGEMM_2X4__WASM, k_eq_1_subtile_m) {
45795 for (uint32_t m = 1; m <= 2; m++) {
45796 GemmMicrokernelTester()
45797 .mr(2)
45798 .nr(4)
45799 .kr(1)
45800 .sr(1)
45801 .m(m)
45802 .n(4)
45803 .k(1)
45804 .iterations(1)
45805 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45806 }
45807 }
45808
45809 TEST(F32_IGEMM_2X4__WASM, k_eq_1_subtile_n) {
45810 for (uint32_t n = 1; n <= 4; n++) {
45811 GemmMicrokernelTester()
45812 .mr(2)
45813 .nr(4)
45814 .kr(1)
45815 .sr(1)
45816 .m(2)
45817 .n(n)
45818 .k(1)
45819 .iterations(1)
45820 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45821 }
45822 }
45823
45824 TEST(F32_IGEMM_2X4__WASM, k_gt_1) {
45825 for (size_t k = 2; k < 10; k++) {
45826 GemmMicrokernelTester()
45827 .mr(2)
45828 .nr(4)
45829 .kr(1)
45830 .sr(1)
45831 .m(2)
45832 .n(4)
45833 .k(k)
45834 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45835 }
45836 }
45837
45838 TEST(F32_IGEMM_2X4__WASM, k_gt_1_subtile) {
45839 for (size_t k = 2; k < 10; k++) {
45840 for (uint32_t m = 1; m <= 2; m++) {
45841 for (uint32_t n = 1; n <= 4; n++) {
45842 GemmMicrokernelTester()
45843 .mr(2)
45844 .nr(4)
45845 .kr(1)
45846 .sr(1)
45847 .m(m)
45848 .n(n)
45849 .k(k)
45850 .iterations(1)
45851 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45852 }
45853 }
45854 }
45855 }
45856
45857 TEST(F32_IGEMM_2X4__WASM, n_gt_4) {
45858 for (uint32_t n = 5; n < 8; n++) {
45859 for (size_t k = 1; k <= 5; k += 2) {
45860 GemmMicrokernelTester()
45861 .mr(2)
45862 .nr(4)
45863 .kr(1)
45864 .sr(1)
45865 .m(2)
45866 .n(4)
45867 .k(k)
45868 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45869 }
45870 }
45871 }
45872
45873 TEST(F32_IGEMM_2X4__WASM, n_gt_4_strided_cn) {
45874 for (uint32_t n = 5; n < 8; n++) {
45875 for (size_t k = 1; k <= 5; k += 2) {
45876 GemmMicrokernelTester()
45877 .mr(2)
45878 .nr(4)
45879 .kr(1)
45880 .sr(1)
45881 .m(2)
45882 .n(4)
45883 .k(k)
45884 .cn_stride(7)
45885 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45886 }
45887 }
45888 }
45889
45890 TEST(F32_IGEMM_2X4__WASM, n_gt_4_subtile) {
45891 for (uint32_t n = 5; n < 8; n++) {
45892 for (size_t k = 1; k <= 5; k += 2) {
45893 for (uint32_t m = 1; m <= 2; m++) {
45894 GemmMicrokernelTester()
45895 .mr(2)
45896 .nr(4)
45897 .kr(1)
45898 .sr(1)
45899 .m(m)
45900 .n(n)
45901 .k(k)
45902 .iterations(1)
45903 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45904 }
45905 }
45906 }
45907 }
45908
45909 TEST(F32_IGEMM_2X4__WASM, n_div_4) {
45910 for (uint32_t n = 8; n <= 12; n += 4) {
45911 for (size_t k = 1; k <= 5; k += 2) {
45912 GemmMicrokernelTester()
45913 .mr(2)
45914 .nr(4)
45915 .kr(1)
45916 .sr(1)
45917 .m(2)
45918 .n(4)
45919 .k(k)
45920 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45921 }
45922 }
45923 }
45924
45925 TEST(F32_IGEMM_2X4__WASM, n_div_4_strided_cn) {
45926 for (uint32_t n = 8; n <= 12; n += 4) {
45927 for (size_t k = 1; k <= 5; k += 2) {
45928 GemmMicrokernelTester()
45929 .mr(2)
45930 .nr(4)
45931 .kr(1)
45932 .sr(1)
45933 .m(2)
45934 .n(n)
45935 .k(k)
45936 .cn_stride(7)
45937 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45938 }
45939 }
45940 }
45941
45942 TEST(F32_IGEMM_2X4__WASM, n_div_4_subtile) {
45943 for (uint32_t n = 8; n <= 12; n += 4) {
45944 for (size_t k = 1; k <= 5; k += 2) {
45945 for (uint32_t m = 1; m <= 2; m++) {
45946 GemmMicrokernelTester()
45947 .mr(2)
45948 .nr(4)
45949 .kr(1)
45950 .sr(1)
45951 .m(m)
45952 .n(n)
45953 .k(k)
45954 .iterations(1)
45955 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45956 }
45957 }
45958 }
45959 }
45960
45961 TEST(F32_IGEMM_2X4__WASM, small_kernel) {
45962 for (size_t k = 1; k <= 5; k += 2) {
45963 GemmMicrokernelTester()
45964 .mr(2)
45965 .nr(4)
45966 .kr(1)
45967 .sr(1)
45968 .m(2)
45969 .n(4)
45970 .k(k)
45971 .ks(3)
45972 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45973 }
45974 }
45975
45976 TEST(F32_IGEMM_2X4__WASM, small_kernel_subtile) {
45977 for (size_t k = 1; k <= 5; k += 2) {
45978 for (uint32_t m = 1; m <= 2; m++) {
45979 for (uint32_t n = 1; n <= 4; n++) {
45980 GemmMicrokernelTester()
45981 .mr(2)
45982 .nr(4)
45983 .kr(1)
45984 .sr(1)
45985 .m(m)
45986 .n(n)
45987 .k(k)
45988 .ks(3)
45989 .iterations(1)
45990 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
45991 }
45992 }
45993 }
45994 }
45995
45996 TEST(F32_IGEMM_2X4__WASM, n_gt_4_small_kernel) {
45997 for (uint32_t n = 5; n < 8; n++) {
45998 for (size_t k = 1; k <= 5; k += 2) {
45999 GemmMicrokernelTester()
46000 .mr(2)
46001 .nr(4)
46002 .kr(1)
46003 .sr(1)
46004 .m(2)
46005 .n(4)
46006 .k(k)
46007 .ks(3)
46008 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46009 }
46010 }
46011 }
46012
46013 TEST(F32_IGEMM_2X4__WASM, n_div_4_small_kernel) {
46014 for (uint32_t n = 8; n <= 12; n += 4) {
46015 for (size_t k = 1; k <= 5; k += 2) {
46016 GemmMicrokernelTester()
46017 .mr(2)
46018 .nr(4)
46019 .kr(1)
46020 .sr(1)
46021 .m(2)
46022 .n(4)
46023 .k(k)
46024 .ks(3)
46025 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46026 }
46027 }
46028 }
46029
46030 TEST(F32_IGEMM_2X4__WASM, strided_cm_subtile) {
46031 for (size_t k = 1; k <= 5; k += 2) {
46032 for (uint32_t m = 1; m <= 2; m++) {
46033 for (uint32_t n = 1; n <= 4; n++) {
46034 GemmMicrokernelTester()
46035 .mr(2)
46036 .nr(4)
46037 .kr(1)
46038 .sr(1)
46039 .m(m)
46040 .n(n)
46041 .k(k)
46042 .cm_stride(7)
46043 .iterations(1)
46044 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46045 }
46046 }
46047 }
46048 }
46049
46050 TEST(F32_IGEMM_2X4__WASM, a_offset) {
46051 for (size_t k = 1; k <= 5; k += 2) {
46052 GemmMicrokernelTester()
46053 .mr(2)
46054 .nr(4)
46055 .kr(1)
46056 .sr(1)
46057 .m(2)
46058 .n(4)
46059 .k(k)
46060 .ks(3)
46061 .a_offset(13)
46062 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46063 }
46064 }
46065
46066 TEST(F32_IGEMM_2X4__WASM, zero) {
46067 for (uint32_t mz = 0; mz < 2; mz++) {
46068 for (size_t k = 1; k <= 5; k += 2) {
46069 GemmMicrokernelTester()
46070 .mr(2)
46071 .nr(4)
46072 .kr(1)
46073 .sr(1)
46074 .m(2)
46075 .n(4)
46076 .k(k)
46077 .ks(3)
46078 .a_offset(13)
46079 .zero_index(mz)
46080 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46081 }
46082 }
46083 }
46084
46085 TEST(F32_IGEMM_2X4__WASM, qmin) {
46086 GemmMicrokernelTester()
46087 .mr(2)
46088 .nr(4)
46089 .kr(1)
46090 .sr(1)
46091 .m(2)
46092 .n(4)
46093 .k(1)
46094 .qmin(128)
46095 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46096 }
46097
46098 TEST(F32_IGEMM_2X4__WASM, qmax) {
46099 GemmMicrokernelTester()
46100 .mr(2)
46101 .nr(4)
46102 .kr(1)
46103 .sr(1)
46104 .m(2)
46105 .n(4)
46106 .k(1)
46107 .qmax(128)
46108 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46109 }
46110
46111 TEST(F32_IGEMM_2X4__WASM, strided_cm) {
46112 GemmMicrokernelTester()
46113 .mr(2)
46114 .nr(4)
46115 .kr(1)
46116 .sr(1)
46117 .m(2)
46118 .n(4)
46119 .k(1)
46120 .cm_stride(7)
46121 .Test(xnn_f32_igemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46122 }
46123#endif // XNN_ARCH_WASM
46124
46125
46126#if XNN_ARCH_WASM
46127 TEST(F32_IGEMM_4X4__WASM, k_eq_1) {
46128 GemmMicrokernelTester()
46129 .mr(4)
46130 .nr(4)
46131 .kr(1)
46132 .sr(1)
46133 .m(4)
46134 .n(4)
46135 .k(1)
46136 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46137 }
46138
46139 TEST(F32_IGEMM_4X4__WASM, strided_cn) {
46140 GemmMicrokernelTester()
46141 .mr(4)
46142 .nr(4)
46143 .kr(1)
46144 .sr(1)
46145 .m(4)
46146 .n(4)
46147 .k(1)
46148 .cn_stride(7)
46149 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46150 }
46151
46152 TEST(F32_IGEMM_4X4__WASM, k_eq_1_subtile) {
46153 for (uint32_t m = 1; m <= 4; m++) {
46154 for (uint32_t n = 1; n <= 4; n++) {
46155 GemmMicrokernelTester()
46156 .mr(4)
46157 .nr(4)
46158 .kr(1)
46159 .sr(1)
46160 .m(m)
46161 .n(n)
46162 .k(1)
46163 .iterations(1)
46164 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46165 }
46166 }
46167 }
46168
46169 TEST(F32_IGEMM_4X4__WASM, k_eq_1_subtile_m) {
46170 for (uint32_t m = 1; m <= 4; m++) {
46171 GemmMicrokernelTester()
46172 .mr(4)
46173 .nr(4)
46174 .kr(1)
46175 .sr(1)
46176 .m(m)
46177 .n(4)
46178 .k(1)
46179 .iterations(1)
46180 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46181 }
46182 }
46183
46184 TEST(F32_IGEMM_4X4__WASM, k_eq_1_subtile_n) {
46185 for (uint32_t n = 1; n <= 4; n++) {
46186 GemmMicrokernelTester()
46187 .mr(4)
46188 .nr(4)
46189 .kr(1)
46190 .sr(1)
46191 .m(4)
46192 .n(n)
46193 .k(1)
46194 .iterations(1)
46195 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46196 }
46197 }
46198
46199 TEST(F32_IGEMM_4X4__WASM, k_gt_1) {
46200 for (size_t k = 2; k < 10; k++) {
46201 GemmMicrokernelTester()
46202 .mr(4)
46203 .nr(4)
46204 .kr(1)
46205 .sr(1)
46206 .m(4)
46207 .n(4)
46208 .k(k)
46209 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46210 }
46211 }
46212
46213 TEST(F32_IGEMM_4X4__WASM, k_gt_1_subtile) {
46214 for (size_t k = 2; k < 10; k++) {
46215 for (uint32_t m = 1; m <= 4; m++) {
46216 for (uint32_t n = 1; n <= 4; n++) {
46217 GemmMicrokernelTester()
46218 .mr(4)
46219 .nr(4)
46220 .kr(1)
46221 .sr(1)
46222 .m(m)
46223 .n(n)
46224 .k(k)
46225 .iterations(1)
46226 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46227 }
46228 }
46229 }
46230 }
46231
46232 TEST(F32_IGEMM_4X4__WASM, n_gt_4) {
46233 for (uint32_t n = 5; n < 8; n++) {
46234 for (size_t k = 1; k <= 5; k += 2) {
46235 GemmMicrokernelTester()
46236 .mr(4)
46237 .nr(4)
46238 .kr(1)
46239 .sr(1)
46240 .m(4)
46241 .n(4)
46242 .k(k)
46243 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46244 }
46245 }
46246 }
46247
46248 TEST(F32_IGEMM_4X4__WASM, n_gt_4_strided_cn) {
46249 for (uint32_t n = 5; n < 8; n++) {
46250 for (size_t k = 1; k <= 5; k += 2) {
46251 GemmMicrokernelTester()
46252 .mr(4)
46253 .nr(4)
46254 .kr(1)
46255 .sr(1)
46256 .m(4)
46257 .n(4)
46258 .k(k)
46259 .cn_stride(7)
46260 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46261 }
46262 }
46263 }
46264
46265 TEST(F32_IGEMM_4X4__WASM, n_gt_4_subtile) {
46266 for (uint32_t n = 5; n < 8; n++) {
46267 for (size_t k = 1; k <= 5; k += 2) {
46268 for (uint32_t m = 1; m <= 4; m++) {
46269 GemmMicrokernelTester()
46270 .mr(4)
46271 .nr(4)
46272 .kr(1)
46273 .sr(1)
46274 .m(m)
46275 .n(n)
46276 .k(k)
46277 .iterations(1)
46278 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46279 }
46280 }
46281 }
46282 }
46283
46284 TEST(F32_IGEMM_4X4__WASM, n_div_4) {
46285 for (uint32_t n = 8; n <= 12; n += 4) {
46286 for (size_t k = 1; k <= 5; k += 2) {
46287 GemmMicrokernelTester()
46288 .mr(4)
46289 .nr(4)
46290 .kr(1)
46291 .sr(1)
46292 .m(4)
46293 .n(4)
46294 .k(k)
46295 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46296 }
46297 }
46298 }
46299
46300 TEST(F32_IGEMM_4X4__WASM, n_div_4_strided_cn) {
46301 for (uint32_t n = 8; n <= 12; n += 4) {
46302 for (size_t k = 1; k <= 5; k += 2) {
46303 GemmMicrokernelTester()
46304 .mr(4)
46305 .nr(4)
46306 .kr(1)
46307 .sr(1)
46308 .m(4)
46309 .n(n)
46310 .k(k)
46311 .cn_stride(7)
46312 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46313 }
46314 }
46315 }
46316
46317 TEST(F32_IGEMM_4X4__WASM, n_div_4_subtile) {
46318 for (uint32_t n = 8; n <= 12; n += 4) {
46319 for (size_t k = 1; k <= 5; k += 2) {
46320 for (uint32_t m = 1; m <= 4; m++) {
46321 GemmMicrokernelTester()
46322 .mr(4)
46323 .nr(4)
46324 .kr(1)
46325 .sr(1)
46326 .m(m)
46327 .n(n)
46328 .k(k)
46329 .iterations(1)
46330 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46331 }
46332 }
46333 }
46334 }
46335
46336 TEST(F32_IGEMM_4X4__WASM, small_kernel) {
46337 for (size_t k = 1; k <= 5; k += 2) {
46338 GemmMicrokernelTester()
46339 .mr(4)
46340 .nr(4)
46341 .kr(1)
46342 .sr(1)
46343 .m(4)
46344 .n(4)
46345 .k(k)
46346 .ks(3)
46347 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46348 }
46349 }
46350
46351 TEST(F32_IGEMM_4X4__WASM, small_kernel_subtile) {
46352 for (size_t k = 1; k <= 5; k += 2) {
46353 for (uint32_t m = 1; m <= 4; m++) {
46354 for (uint32_t n = 1; n <= 4; n++) {
46355 GemmMicrokernelTester()
46356 .mr(4)
46357 .nr(4)
46358 .kr(1)
46359 .sr(1)
46360 .m(m)
46361 .n(n)
46362 .k(k)
46363 .ks(3)
46364 .iterations(1)
46365 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46366 }
46367 }
46368 }
46369 }
46370
46371 TEST(F32_IGEMM_4X4__WASM, n_gt_4_small_kernel) {
46372 for (uint32_t n = 5; n < 8; n++) {
46373 for (size_t k = 1; k <= 5; k += 2) {
46374 GemmMicrokernelTester()
46375 .mr(4)
46376 .nr(4)
46377 .kr(1)
46378 .sr(1)
46379 .m(4)
46380 .n(4)
46381 .k(k)
46382 .ks(3)
46383 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46384 }
46385 }
46386 }
46387
46388 TEST(F32_IGEMM_4X4__WASM, n_div_4_small_kernel) {
46389 for (uint32_t n = 8; n <= 12; n += 4) {
46390 for (size_t k = 1; k <= 5; k += 2) {
46391 GemmMicrokernelTester()
46392 .mr(4)
46393 .nr(4)
46394 .kr(1)
46395 .sr(1)
46396 .m(4)
46397 .n(4)
46398 .k(k)
46399 .ks(3)
46400 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46401 }
46402 }
46403 }
46404
46405 TEST(F32_IGEMM_4X4__WASM, strided_cm_subtile) {
46406 for (size_t k = 1; k <= 5; k += 2) {
46407 for (uint32_t m = 1; m <= 4; m++) {
46408 for (uint32_t n = 1; n <= 4; n++) {
46409 GemmMicrokernelTester()
46410 .mr(4)
46411 .nr(4)
46412 .kr(1)
46413 .sr(1)
46414 .m(m)
46415 .n(n)
46416 .k(k)
46417 .cm_stride(7)
46418 .iterations(1)
46419 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46420 }
46421 }
46422 }
46423 }
46424
46425 TEST(F32_IGEMM_4X4__WASM, a_offset) {
46426 for (size_t k = 1; k <= 5; k += 2) {
46427 GemmMicrokernelTester()
46428 .mr(4)
46429 .nr(4)
46430 .kr(1)
46431 .sr(1)
46432 .m(4)
46433 .n(4)
46434 .k(k)
46435 .ks(3)
46436 .a_offset(23)
46437 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46438 }
46439 }
46440
46441 TEST(F32_IGEMM_4X4__WASM, zero) {
46442 for (uint32_t mz = 0; mz < 4; mz++) {
46443 for (size_t k = 1; k <= 5; k += 2) {
46444 GemmMicrokernelTester()
46445 .mr(4)
46446 .nr(4)
46447 .kr(1)
46448 .sr(1)
46449 .m(4)
46450 .n(4)
46451 .k(k)
46452 .ks(3)
46453 .a_offset(23)
46454 .zero_index(mz)
46455 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46456 }
46457 }
46458 }
46459
46460 TEST(F32_IGEMM_4X4__WASM, qmin) {
46461 GemmMicrokernelTester()
46462 .mr(4)
46463 .nr(4)
46464 .kr(1)
46465 .sr(1)
46466 .m(4)
46467 .n(4)
46468 .k(1)
46469 .qmin(128)
46470 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46471 }
46472
46473 TEST(F32_IGEMM_4X4__WASM, qmax) {
46474 GemmMicrokernelTester()
46475 .mr(4)
46476 .nr(4)
46477 .kr(1)
46478 .sr(1)
46479 .m(4)
46480 .n(4)
46481 .k(1)
46482 .qmax(128)
46483 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46484 }
46485
46486 TEST(F32_IGEMM_4X4__WASM, strided_cm) {
46487 GemmMicrokernelTester()
46488 .mr(4)
46489 .nr(4)
46490 .kr(1)
46491 .sr(1)
46492 .m(4)
46493 .n(4)
46494 .k(1)
46495 .cm_stride(7)
46496 .Test(xnn_f32_igemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
46497 }
46498#endif // XNN_ARCH_WASM
46499
46500
46501#if XNN_ARCH_WASM
46502 TEST(F32_IGEMM_4X2__WASM, k_eq_1) {
46503 GemmMicrokernelTester()
46504 .mr(4)
46505 .nr(2)
46506 .kr(1)
46507 .sr(1)
46508 .m(4)
46509 .n(2)
46510 .k(1)
46511 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46512 }
46513
46514 TEST(F32_IGEMM_4X2__WASM, strided_cn) {
46515 GemmMicrokernelTester()
46516 .mr(4)
46517 .nr(2)
46518 .kr(1)
46519 .sr(1)
46520 .m(4)
46521 .n(2)
46522 .k(1)
46523 .cn_stride(5)
46524 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46525 }
46526
46527 TEST(F32_IGEMM_4X2__WASM, k_eq_1_subtile) {
46528 for (uint32_t m = 1; m <= 4; m++) {
46529 for (uint32_t n = 1; n <= 2; n++) {
46530 GemmMicrokernelTester()
46531 .mr(4)
46532 .nr(2)
46533 .kr(1)
46534 .sr(1)
46535 .m(m)
46536 .n(n)
46537 .k(1)
46538 .iterations(1)
46539 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46540 }
46541 }
46542 }
46543
46544 TEST(F32_IGEMM_4X2__WASM, k_eq_1_subtile_m) {
46545 for (uint32_t m = 1; m <= 4; m++) {
46546 GemmMicrokernelTester()
46547 .mr(4)
46548 .nr(2)
46549 .kr(1)
46550 .sr(1)
46551 .m(m)
46552 .n(2)
46553 .k(1)
46554 .iterations(1)
46555 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46556 }
46557 }
46558
46559 TEST(F32_IGEMM_4X2__WASM, k_eq_1_subtile_n) {
46560 for (uint32_t n = 1; n <= 2; n++) {
46561 GemmMicrokernelTester()
46562 .mr(4)
46563 .nr(2)
46564 .kr(1)
46565 .sr(1)
46566 .m(4)
46567 .n(n)
46568 .k(1)
46569 .iterations(1)
46570 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46571 }
46572 }
46573
46574 TEST(F32_IGEMM_4X2__WASM, k_gt_1) {
46575 for (size_t k = 2; k < 10; k++) {
46576 GemmMicrokernelTester()
46577 .mr(4)
46578 .nr(2)
46579 .kr(1)
46580 .sr(1)
46581 .m(4)
46582 .n(2)
46583 .k(k)
46584 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46585 }
46586 }
46587
46588 TEST(F32_IGEMM_4X2__WASM, k_gt_1_subtile) {
46589 for (size_t k = 2; k < 10; k++) {
46590 for (uint32_t m = 1; m <= 4; m++) {
46591 for (uint32_t n = 1; n <= 2; n++) {
46592 GemmMicrokernelTester()
46593 .mr(4)
46594 .nr(2)
46595 .kr(1)
46596 .sr(1)
46597 .m(m)
46598 .n(n)
46599 .k(k)
46600 .iterations(1)
46601 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46602 }
46603 }
46604 }
46605 }
46606
46607 TEST(F32_IGEMM_4X2__WASM, n_gt_2) {
46608 for (uint32_t n = 3; n < 4; n++) {
46609 for (size_t k = 1; k <= 5; k += 2) {
46610 GemmMicrokernelTester()
46611 .mr(4)
46612 .nr(2)
46613 .kr(1)
46614 .sr(1)
46615 .m(4)
46616 .n(2)
46617 .k(k)
46618 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46619 }
46620 }
46621 }
46622
46623 TEST(F32_IGEMM_4X2__WASM, n_gt_2_strided_cn) {
46624 for (uint32_t n = 3; n < 4; n++) {
46625 for (size_t k = 1; k <= 5; k += 2) {
46626 GemmMicrokernelTester()
46627 .mr(4)
46628 .nr(2)
46629 .kr(1)
46630 .sr(1)
46631 .m(4)
46632 .n(2)
46633 .k(k)
46634 .cn_stride(5)
46635 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46636 }
46637 }
46638 }
46639
46640 TEST(F32_IGEMM_4X2__WASM, n_gt_2_subtile) {
46641 for (uint32_t n = 3; n < 4; n++) {
46642 for (size_t k = 1; k <= 5; k += 2) {
46643 for (uint32_t m = 1; m <= 4; m++) {
46644 GemmMicrokernelTester()
46645 .mr(4)
46646 .nr(2)
46647 .kr(1)
46648 .sr(1)
46649 .m(m)
46650 .n(n)
46651 .k(k)
46652 .iterations(1)
46653 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46654 }
46655 }
46656 }
46657 }
46658
46659 TEST(F32_IGEMM_4X2__WASM, n_div_2) {
46660 for (uint32_t n = 4; n <= 6; n += 2) {
46661 for (size_t k = 1; k <= 5; k += 2) {
46662 GemmMicrokernelTester()
46663 .mr(4)
46664 .nr(2)
46665 .kr(1)
46666 .sr(1)
46667 .m(4)
46668 .n(2)
46669 .k(k)
46670 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46671 }
46672 }
46673 }
46674
46675 TEST(F32_IGEMM_4X2__WASM, n_div_2_strided_cn) {
46676 for (uint32_t n = 4; n <= 6; n += 2) {
46677 for (size_t k = 1; k <= 5; k += 2) {
46678 GemmMicrokernelTester()
46679 .mr(4)
46680 .nr(2)
46681 .kr(1)
46682 .sr(1)
46683 .m(4)
46684 .n(n)
46685 .k(k)
46686 .cn_stride(5)
46687 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46688 }
46689 }
46690 }
46691
46692 TEST(F32_IGEMM_4X2__WASM, n_div_2_subtile) {
46693 for (uint32_t n = 4; n <= 6; n += 2) {
46694 for (size_t k = 1; k <= 5; k += 2) {
46695 for (uint32_t m = 1; m <= 4; m++) {
46696 GemmMicrokernelTester()
46697 .mr(4)
46698 .nr(2)
46699 .kr(1)
46700 .sr(1)
46701 .m(m)
46702 .n(n)
46703 .k(k)
46704 .iterations(1)
46705 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46706 }
46707 }
46708 }
46709 }
46710
46711 TEST(F32_IGEMM_4X2__WASM, small_kernel) {
46712 for (size_t k = 1; k <= 5; k += 2) {
46713 GemmMicrokernelTester()
46714 .mr(4)
46715 .nr(2)
46716 .kr(1)
46717 .sr(1)
46718 .m(4)
46719 .n(2)
46720 .k(k)
46721 .ks(3)
46722 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46723 }
46724 }
46725
46726 TEST(F32_IGEMM_4X2__WASM, small_kernel_subtile) {
46727 for (size_t k = 1; k <= 5; k += 2) {
46728 for (uint32_t m = 1; m <= 4; m++) {
46729 for (uint32_t n = 1; n <= 2; n++) {
46730 GemmMicrokernelTester()
46731 .mr(4)
46732 .nr(2)
46733 .kr(1)
46734 .sr(1)
46735 .m(m)
46736 .n(n)
46737 .k(k)
46738 .ks(3)
46739 .iterations(1)
46740 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46741 }
46742 }
46743 }
46744 }
46745
46746 TEST(F32_IGEMM_4X2__WASM, n_gt_2_small_kernel) {
46747 for (uint32_t n = 3; n < 4; n++) {
46748 for (size_t k = 1; k <= 5; k += 2) {
46749 GemmMicrokernelTester()
46750 .mr(4)
46751 .nr(2)
46752 .kr(1)
46753 .sr(1)
46754 .m(4)
46755 .n(2)
46756 .k(k)
46757 .ks(3)
46758 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46759 }
46760 }
46761 }
46762
46763 TEST(F32_IGEMM_4X2__WASM, n_div_2_small_kernel) {
46764 for (uint32_t n = 4; n <= 6; n += 2) {
46765 for (size_t k = 1; k <= 5; k += 2) {
46766 GemmMicrokernelTester()
46767 .mr(4)
46768 .nr(2)
46769 .kr(1)
46770 .sr(1)
46771 .m(4)
46772 .n(2)
46773 .k(k)
46774 .ks(3)
46775 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46776 }
46777 }
46778 }
46779
46780 TEST(F32_IGEMM_4X2__WASM, strided_cm_subtile) {
46781 for (size_t k = 1; k <= 5; k += 2) {
46782 for (uint32_t m = 1; m <= 4; m++) {
46783 for (uint32_t n = 1; n <= 2; n++) {
46784 GemmMicrokernelTester()
46785 .mr(4)
46786 .nr(2)
46787 .kr(1)
46788 .sr(1)
46789 .m(m)
46790 .n(n)
46791 .k(k)
46792 .cm_stride(5)
46793 .iterations(1)
46794 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46795 }
46796 }
46797 }
46798 }
46799
46800 TEST(F32_IGEMM_4X2__WASM, a_offset) {
46801 for (size_t k = 1; k <= 5; k += 2) {
46802 GemmMicrokernelTester()
46803 .mr(4)
46804 .nr(2)
46805 .kr(1)
46806 .sr(1)
46807 .m(4)
46808 .n(2)
46809 .k(k)
46810 .ks(3)
46811 .a_offset(23)
46812 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46813 }
46814 }
46815
46816 TEST(F32_IGEMM_4X2__WASM, zero) {
46817 for (uint32_t mz = 0; mz < 4; mz++) {
46818 for (size_t k = 1; k <= 5; k += 2) {
46819 GemmMicrokernelTester()
46820 .mr(4)
46821 .nr(2)
46822 .kr(1)
46823 .sr(1)
46824 .m(4)
46825 .n(2)
46826 .k(k)
46827 .ks(3)
46828 .a_offset(23)
46829 .zero_index(mz)
46830 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46831 }
46832 }
46833 }
46834
46835 TEST(F32_IGEMM_4X2__WASM, qmin) {
46836 GemmMicrokernelTester()
46837 .mr(4)
46838 .nr(2)
46839 .kr(1)
46840 .sr(1)
46841 .m(4)
46842 .n(2)
46843 .k(1)
46844 .qmin(128)
46845 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46846 }
46847
46848 TEST(F32_IGEMM_4X2__WASM, qmax) {
46849 GemmMicrokernelTester()
46850 .mr(4)
46851 .nr(2)
46852 .kr(1)
46853 .sr(1)
46854 .m(4)
46855 .n(2)
46856 .k(1)
46857 .qmax(128)
46858 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46859 }
46860
46861 TEST(F32_IGEMM_4X2__WASM, strided_cm) {
46862 GemmMicrokernelTester()
46863 .mr(4)
46864 .nr(2)
46865 .kr(1)
46866 .sr(1)
46867 .m(4)
46868 .n(2)
46869 .k(1)
46870 .cm_stride(5)
46871 .Test(xnn_f32_igemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
46872 }
46873#endif // XNN_ARCH_WASM
46874
46875
46876TEST(F32_IGEMM_1X4__SCALAR, k_eq_1) {
46877 GemmMicrokernelTester()
46878 .mr(1)
46879 .nr(4)
46880 .kr(1)
46881 .sr(1)
46882 .m(1)
46883 .n(4)
46884 .k(1)
46885 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46886}
46887
46888TEST(F32_IGEMM_1X4__SCALAR, strided_cn) {
46889 GemmMicrokernelTester()
46890 .mr(1)
46891 .nr(4)
46892 .kr(1)
46893 .sr(1)
46894 .m(1)
46895 .n(4)
46896 .k(1)
46897 .cn_stride(7)
46898 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46899}
46900
46901TEST(F32_IGEMM_1X4__SCALAR, k_eq_1_subtile) {
46902 for (uint32_t m = 1; m <= 1; m++) {
46903 for (uint32_t n = 1; n <= 4; n++) {
46904 GemmMicrokernelTester()
46905 .mr(1)
46906 .nr(4)
46907 .kr(1)
46908 .sr(1)
46909 .m(m)
46910 .n(n)
46911 .k(1)
46912 .iterations(1)
46913 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46914 }
46915 }
46916}
46917
46918TEST(F32_IGEMM_1X4__SCALAR, k_eq_1_subtile_m) {
46919 for (uint32_t m = 1; m <= 1; m++) {
46920 GemmMicrokernelTester()
46921 .mr(1)
46922 .nr(4)
46923 .kr(1)
46924 .sr(1)
46925 .m(m)
46926 .n(4)
46927 .k(1)
46928 .iterations(1)
46929 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46930 }
46931}
46932
46933TEST(F32_IGEMM_1X4__SCALAR, k_eq_1_subtile_n) {
46934 for (uint32_t n = 1; n <= 4; n++) {
46935 GemmMicrokernelTester()
46936 .mr(1)
46937 .nr(4)
46938 .kr(1)
46939 .sr(1)
46940 .m(1)
46941 .n(n)
46942 .k(1)
46943 .iterations(1)
46944 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46945 }
46946}
46947
46948TEST(F32_IGEMM_1X4__SCALAR, k_gt_1) {
46949 for (size_t k = 2; k < 10; k++) {
46950 GemmMicrokernelTester()
46951 .mr(1)
46952 .nr(4)
46953 .kr(1)
46954 .sr(1)
46955 .m(1)
46956 .n(4)
46957 .k(k)
46958 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46959 }
46960}
46961
46962TEST(F32_IGEMM_1X4__SCALAR, k_gt_1_subtile) {
46963 for (size_t k = 2; k < 10; k++) {
46964 for (uint32_t m = 1; m <= 1; m++) {
46965 for (uint32_t n = 1; n <= 4; n++) {
46966 GemmMicrokernelTester()
46967 .mr(1)
46968 .nr(4)
46969 .kr(1)
46970 .sr(1)
46971 .m(m)
46972 .n(n)
46973 .k(k)
46974 .iterations(1)
46975 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46976 }
46977 }
46978 }
46979}
46980
46981TEST(F32_IGEMM_1X4__SCALAR, n_gt_4) {
46982 for (uint32_t n = 5; n < 8; n++) {
46983 for (size_t k = 1; k <= 5; k += 2) {
46984 GemmMicrokernelTester()
46985 .mr(1)
46986 .nr(4)
46987 .kr(1)
46988 .sr(1)
46989 .m(1)
46990 .n(4)
46991 .k(k)
46992 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
46993 }
46994 }
46995}
46996
46997TEST(F32_IGEMM_1X4__SCALAR, n_gt_4_strided_cn) {
46998 for (uint32_t n = 5; n < 8; n++) {
46999 for (size_t k = 1; k <= 5; k += 2) {
47000 GemmMicrokernelTester()
47001 .mr(1)
47002 .nr(4)
47003 .kr(1)
47004 .sr(1)
47005 .m(1)
47006 .n(4)
47007 .k(k)
47008 .cn_stride(7)
47009 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47010 }
47011 }
47012}
47013
47014TEST(F32_IGEMM_1X4__SCALAR, n_gt_4_subtile) {
47015 for (uint32_t n = 5; n < 8; n++) {
47016 for (size_t k = 1; k <= 5; k += 2) {
47017 for (uint32_t m = 1; m <= 1; m++) {
47018 GemmMicrokernelTester()
47019 .mr(1)
47020 .nr(4)
47021 .kr(1)
47022 .sr(1)
47023 .m(m)
47024 .n(n)
47025 .k(k)
47026 .iterations(1)
47027 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47028 }
47029 }
47030 }
47031}
47032
47033TEST(F32_IGEMM_1X4__SCALAR, n_div_4) {
47034 for (uint32_t n = 8; n <= 12; n += 4) {
47035 for (size_t k = 1; k <= 5; k += 2) {
47036 GemmMicrokernelTester()
47037 .mr(1)
47038 .nr(4)
47039 .kr(1)
47040 .sr(1)
47041 .m(1)
47042 .n(4)
47043 .k(k)
47044 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47045 }
47046 }
47047}
47048
47049TEST(F32_IGEMM_1X4__SCALAR, n_div_4_strided_cn) {
47050 for (uint32_t n = 8; n <= 12; n += 4) {
47051 for (size_t k = 1; k <= 5; k += 2) {
47052 GemmMicrokernelTester()
47053 .mr(1)
47054 .nr(4)
47055 .kr(1)
47056 .sr(1)
47057 .m(1)
47058 .n(n)
47059 .k(k)
47060 .cn_stride(7)
47061 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47062 }
47063 }
47064}
47065
47066TEST(F32_IGEMM_1X4__SCALAR, n_div_4_subtile) {
47067 for (uint32_t n = 8; n <= 12; n += 4) {
47068 for (size_t k = 1; k <= 5; k += 2) {
47069 for (uint32_t m = 1; m <= 1; m++) {
47070 GemmMicrokernelTester()
47071 .mr(1)
47072 .nr(4)
47073 .kr(1)
47074 .sr(1)
47075 .m(m)
47076 .n(n)
47077 .k(k)
47078 .iterations(1)
47079 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47080 }
47081 }
47082 }
47083}
47084
47085TEST(F32_IGEMM_1X4__SCALAR, small_kernel) {
47086 for (size_t k = 1; k <= 5; k += 2) {
47087 GemmMicrokernelTester()
47088 .mr(1)
47089 .nr(4)
47090 .kr(1)
47091 .sr(1)
47092 .m(1)
47093 .n(4)
47094 .k(k)
47095 .ks(3)
47096 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47097 }
47098}
47099
47100TEST(F32_IGEMM_1X4__SCALAR, small_kernel_subtile) {
47101 for (size_t k = 1; k <= 5; k += 2) {
47102 for (uint32_t m = 1; m <= 1; m++) {
47103 for (uint32_t n = 1; n <= 4; n++) {
47104 GemmMicrokernelTester()
47105 .mr(1)
47106 .nr(4)
47107 .kr(1)
47108 .sr(1)
47109 .m(m)
47110 .n(n)
47111 .k(k)
47112 .ks(3)
47113 .iterations(1)
47114 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47115 }
47116 }
47117 }
47118}
47119
47120TEST(F32_IGEMM_1X4__SCALAR, n_gt_4_small_kernel) {
47121 for (uint32_t n = 5; n < 8; n++) {
47122 for (size_t k = 1; k <= 5; k += 2) {
47123 GemmMicrokernelTester()
47124 .mr(1)
47125 .nr(4)
47126 .kr(1)
47127 .sr(1)
47128 .m(1)
47129 .n(4)
47130 .k(k)
47131 .ks(3)
47132 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47133 }
47134 }
47135}
47136
47137TEST(F32_IGEMM_1X4__SCALAR, n_div_4_small_kernel) {
47138 for (uint32_t n = 8; n <= 12; n += 4) {
47139 for (size_t k = 1; k <= 5; k += 2) {
47140 GemmMicrokernelTester()
47141 .mr(1)
47142 .nr(4)
47143 .kr(1)
47144 .sr(1)
47145 .m(1)
47146 .n(4)
47147 .k(k)
47148 .ks(3)
47149 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47150 }
47151 }
47152}
47153
47154TEST(F32_IGEMM_1X4__SCALAR, strided_cm_subtile) {
47155 for (size_t k = 1; k <= 5; k += 2) {
47156 for (uint32_t m = 1; m <= 1; m++) {
47157 for (uint32_t n = 1; n <= 4; n++) {
47158 GemmMicrokernelTester()
47159 .mr(1)
47160 .nr(4)
47161 .kr(1)
47162 .sr(1)
47163 .m(m)
47164 .n(n)
47165 .k(k)
47166 .cm_stride(7)
47167 .iterations(1)
47168 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47169 }
47170 }
47171 }
47172}
47173
47174TEST(F32_IGEMM_1X4__SCALAR, a_offset) {
47175 for (size_t k = 1; k <= 5; k += 2) {
47176 GemmMicrokernelTester()
47177 .mr(1)
47178 .nr(4)
47179 .kr(1)
47180 .sr(1)
47181 .m(1)
47182 .n(4)
47183 .k(k)
47184 .ks(3)
47185 .a_offset(7)
47186 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47187 }
47188}
47189
47190TEST(F32_IGEMM_1X4__SCALAR, zero) {
47191 for (uint32_t mz = 0; mz < 1; mz++) {
47192 for (size_t k = 1; k <= 5; k += 2) {
47193 GemmMicrokernelTester()
47194 .mr(1)
47195 .nr(4)
47196 .kr(1)
47197 .sr(1)
47198 .m(1)
47199 .n(4)
47200 .k(k)
47201 .ks(3)
47202 .a_offset(7)
47203 .zero_index(mz)
47204 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47205 }
47206 }
47207}
47208
47209TEST(F32_IGEMM_1X4__SCALAR, qmin) {
47210 GemmMicrokernelTester()
47211 .mr(1)
47212 .nr(4)
47213 .kr(1)
47214 .sr(1)
47215 .m(1)
47216 .n(4)
47217 .k(1)
47218 .qmin(128)
47219 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47220}
47221
47222TEST(F32_IGEMM_1X4__SCALAR, qmax) {
47223 GemmMicrokernelTester()
47224 .mr(1)
47225 .nr(4)
47226 .kr(1)
47227 .sr(1)
47228 .m(1)
47229 .n(4)
47230 .k(1)
47231 .qmax(128)
47232 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47233}
47234
47235TEST(F32_IGEMM_1X4__SCALAR, strided_cm) {
47236 GemmMicrokernelTester()
47237 .mr(1)
47238 .nr(4)
47239 .kr(1)
47240 .sr(1)
47241 .m(1)
47242 .n(4)
47243 .k(1)
47244 .cm_stride(7)
47245 .Test(xnn_f32_igemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47246}
47247
47248
47249TEST(F32_IGEMM_2X4__SCALAR, k_eq_1) {
47250 GemmMicrokernelTester()
47251 .mr(2)
47252 .nr(4)
47253 .kr(1)
47254 .sr(1)
47255 .m(2)
47256 .n(4)
47257 .k(1)
47258 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47259}
47260
47261TEST(F32_IGEMM_2X4__SCALAR, strided_cn) {
47262 GemmMicrokernelTester()
47263 .mr(2)
47264 .nr(4)
47265 .kr(1)
47266 .sr(1)
47267 .m(2)
47268 .n(4)
47269 .k(1)
47270 .cn_stride(7)
47271 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47272}
47273
47274TEST(F32_IGEMM_2X4__SCALAR, k_eq_1_subtile) {
47275 for (uint32_t m = 1; m <= 2; m++) {
47276 for (uint32_t n = 1; n <= 4; n++) {
47277 GemmMicrokernelTester()
47278 .mr(2)
47279 .nr(4)
47280 .kr(1)
47281 .sr(1)
47282 .m(m)
47283 .n(n)
47284 .k(1)
47285 .iterations(1)
47286 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47287 }
47288 }
47289}
47290
47291TEST(F32_IGEMM_2X4__SCALAR, k_eq_1_subtile_m) {
47292 for (uint32_t m = 1; m <= 2; m++) {
47293 GemmMicrokernelTester()
47294 .mr(2)
47295 .nr(4)
47296 .kr(1)
47297 .sr(1)
47298 .m(m)
47299 .n(4)
47300 .k(1)
47301 .iterations(1)
47302 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47303 }
47304}
47305
47306TEST(F32_IGEMM_2X4__SCALAR, k_eq_1_subtile_n) {
47307 for (uint32_t n = 1; n <= 4; n++) {
47308 GemmMicrokernelTester()
47309 .mr(2)
47310 .nr(4)
47311 .kr(1)
47312 .sr(1)
47313 .m(2)
47314 .n(n)
47315 .k(1)
47316 .iterations(1)
47317 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47318 }
47319}
47320
47321TEST(F32_IGEMM_2X4__SCALAR, k_gt_1) {
47322 for (size_t k = 2; k < 10; k++) {
47323 GemmMicrokernelTester()
47324 .mr(2)
47325 .nr(4)
47326 .kr(1)
47327 .sr(1)
47328 .m(2)
47329 .n(4)
47330 .k(k)
47331 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47332 }
47333}
47334
47335TEST(F32_IGEMM_2X4__SCALAR, k_gt_1_subtile) {
47336 for (size_t k = 2; k < 10; k++) {
47337 for (uint32_t m = 1; m <= 2; m++) {
47338 for (uint32_t n = 1; n <= 4; n++) {
47339 GemmMicrokernelTester()
47340 .mr(2)
47341 .nr(4)
47342 .kr(1)
47343 .sr(1)
47344 .m(m)
47345 .n(n)
47346 .k(k)
47347 .iterations(1)
47348 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47349 }
47350 }
47351 }
47352}
47353
47354TEST(F32_IGEMM_2X4__SCALAR, n_gt_4) {
47355 for (uint32_t n = 5; n < 8; n++) {
47356 for (size_t k = 1; k <= 5; k += 2) {
47357 GemmMicrokernelTester()
47358 .mr(2)
47359 .nr(4)
47360 .kr(1)
47361 .sr(1)
47362 .m(2)
47363 .n(4)
47364 .k(k)
47365 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47366 }
47367 }
47368}
47369
47370TEST(F32_IGEMM_2X4__SCALAR, n_gt_4_strided_cn) {
47371 for (uint32_t n = 5; n < 8; n++) {
47372 for (size_t k = 1; k <= 5; k += 2) {
47373 GemmMicrokernelTester()
47374 .mr(2)
47375 .nr(4)
47376 .kr(1)
47377 .sr(1)
47378 .m(2)
47379 .n(4)
47380 .k(k)
47381 .cn_stride(7)
47382 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47383 }
47384 }
47385}
47386
47387TEST(F32_IGEMM_2X4__SCALAR, n_gt_4_subtile) {
47388 for (uint32_t n = 5; n < 8; n++) {
47389 for (size_t k = 1; k <= 5; k += 2) {
47390 for (uint32_t m = 1; m <= 2; m++) {
47391 GemmMicrokernelTester()
47392 .mr(2)
47393 .nr(4)
47394 .kr(1)
47395 .sr(1)
47396 .m(m)
47397 .n(n)
47398 .k(k)
47399 .iterations(1)
47400 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47401 }
47402 }
47403 }
47404}
47405
47406TEST(F32_IGEMM_2X4__SCALAR, n_div_4) {
47407 for (uint32_t n = 8; n <= 12; n += 4) {
47408 for (size_t k = 1; k <= 5; k += 2) {
47409 GemmMicrokernelTester()
47410 .mr(2)
47411 .nr(4)
47412 .kr(1)
47413 .sr(1)
47414 .m(2)
47415 .n(4)
47416 .k(k)
47417 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47418 }
47419 }
47420}
47421
47422TEST(F32_IGEMM_2X4__SCALAR, n_div_4_strided_cn) {
47423 for (uint32_t n = 8; n <= 12; n += 4) {
47424 for (size_t k = 1; k <= 5; k += 2) {
47425 GemmMicrokernelTester()
47426 .mr(2)
47427 .nr(4)
47428 .kr(1)
47429 .sr(1)
47430 .m(2)
47431 .n(n)
47432 .k(k)
47433 .cn_stride(7)
47434 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47435 }
47436 }
47437}
47438
47439TEST(F32_IGEMM_2X4__SCALAR, n_div_4_subtile) {
47440 for (uint32_t n = 8; n <= 12; n += 4) {
47441 for (size_t k = 1; k <= 5; k += 2) {
47442 for (uint32_t m = 1; m <= 2; m++) {
47443 GemmMicrokernelTester()
47444 .mr(2)
47445 .nr(4)
47446 .kr(1)
47447 .sr(1)
47448 .m(m)
47449 .n(n)
47450 .k(k)
47451 .iterations(1)
47452 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47453 }
47454 }
47455 }
47456}
47457
47458TEST(F32_IGEMM_2X4__SCALAR, small_kernel) {
47459 for (size_t k = 1; k <= 5; k += 2) {
47460 GemmMicrokernelTester()
47461 .mr(2)
47462 .nr(4)
47463 .kr(1)
47464 .sr(1)
47465 .m(2)
47466 .n(4)
47467 .k(k)
47468 .ks(3)
47469 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47470 }
47471}
47472
47473TEST(F32_IGEMM_2X4__SCALAR, small_kernel_subtile) {
47474 for (size_t k = 1; k <= 5; k += 2) {
47475 for (uint32_t m = 1; m <= 2; m++) {
47476 for (uint32_t n = 1; n <= 4; n++) {
47477 GemmMicrokernelTester()
47478 .mr(2)
47479 .nr(4)
47480 .kr(1)
47481 .sr(1)
47482 .m(m)
47483 .n(n)
47484 .k(k)
47485 .ks(3)
47486 .iterations(1)
47487 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47488 }
47489 }
47490 }
47491}
47492
47493TEST(F32_IGEMM_2X4__SCALAR, n_gt_4_small_kernel) {
47494 for (uint32_t n = 5; n < 8; n++) {
47495 for (size_t k = 1; k <= 5; k += 2) {
47496 GemmMicrokernelTester()
47497 .mr(2)
47498 .nr(4)
47499 .kr(1)
47500 .sr(1)
47501 .m(2)
47502 .n(4)
47503 .k(k)
47504 .ks(3)
47505 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47506 }
47507 }
47508}
47509
47510TEST(F32_IGEMM_2X4__SCALAR, n_div_4_small_kernel) {
47511 for (uint32_t n = 8; n <= 12; n += 4) {
47512 for (size_t k = 1; k <= 5; k += 2) {
47513 GemmMicrokernelTester()
47514 .mr(2)
47515 .nr(4)
47516 .kr(1)
47517 .sr(1)
47518 .m(2)
47519 .n(4)
47520 .k(k)
47521 .ks(3)
47522 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47523 }
47524 }
47525}
47526
47527TEST(F32_IGEMM_2X4__SCALAR, strided_cm_subtile) {
47528 for (size_t k = 1; k <= 5; k += 2) {
47529 for (uint32_t m = 1; m <= 2; m++) {
47530 for (uint32_t n = 1; n <= 4; n++) {
47531 GemmMicrokernelTester()
47532 .mr(2)
47533 .nr(4)
47534 .kr(1)
47535 .sr(1)
47536 .m(m)
47537 .n(n)
47538 .k(k)
47539 .cm_stride(7)
47540 .iterations(1)
47541 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47542 }
47543 }
47544 }
47545}
47546
47547TEST(F32_IGEMM_2X4__SCALAR, a_offset) {
47548 for (size_t k = 1; k <= 5; k += 2) {
47549 GemmMicrokernelTester()
47550 .mr(2)
47551 .nr(4)
47552 .kr(1)
47553 .sr(1)
47554 .m(2)
47555 .n(4)
47556 .k(k)
47557 .ks(3)
47558 .a_offset(13)
47559 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47560 }
47561}
47562
47563TEST(F32_IGEMM_2X4__SCALAR, zero) {
47564 for (uint32_t mz = 0; mz < 2; mz++) {
47565 for (size_t k = 1; k <= 5; k += 2) {
47566 GemmMicrokernelTester()
47567 .mr(2)
47568 .nr(4)
47569 .kr(1)
47570 .sr(1)
47571 .m(2)
47572 .n(4)
47573 .k(k)
47574 .ks(3)
47575 .a_offset(13)
47576 .zero_index(mz)
47577 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47578 }
47579 }
47580}
47581
47582TEST(F32_IGEMM_2X4__SCALAR, qmin) {
47583 GemmMicrokernelTester()
47584 .mr(2)
47585 .nr(4)
47586 .kr(1)
47587 .sr(1)
47588 .m(2)
47589 .n(4)
47590 .k(1)
47591 .qmin(128)
47592 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47593}
47594
47595TEST(F32_IGEMM_2X4__SCALAR, qmax) {
47596 GemmMicrokernelTester()
47597 .mr(2)
47598 .nr(4)
47599 .kr(1)
47600 .sr(1)
47601 .m(2)
47602 .n(4)
47603 .k(1)
47604 .qmax(128)
47605 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47606}
47607
47608TEST(F32_IGEMM_2X4__SCALAR, strided_cm) {
47609 GemmMicrokernelTester()
47610 .mr(2)
47611 .nr(4)
47612 .kr(1)
47613 .sr(1)
47614 .m(2)
47615 .n(4)
47616 .k(1)
47617 .cm_stride(7)
47618 .Test(xnn_f32_igemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47619}
47620
47621
47622TEST(F32_IGEMM_4X4__SCALAR, k_eq_1) {
47623 GemmMicrokernelTester()
47624 .mr(4)
47625 .nr(4)
47626 .kr(1)
47627 .sr(1)
47628 .m(4)
47629 .n(4)
47630 .k(1)
47631 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47632}
47633
47634TEST(F32_IGEMM_4X4__SCALAR, strided_cn) {
47635 GemmMicrokernelTester()
47636 .mr(4)
47637 .nr(4)
47638 .kr(1)
47639 .sr(1)
47640 .m(4)
47641 .n(4)
47642 .k(1)
47643 .cn_stride(7)
47644 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47645}
47646
47647TEST(F32_IGEMM_4X4__SCALAR, k_eq_1_subtile) {
47648 for (uint32_t m = 1; m <= 4; m++) {
47649 for (uint32_t n = 1; n <= 4; n++) {
47650 GemmMicrokernelTester()
47651 .mr(4)
47652 .nr(4)
47653 .kr(1)
47654 .sr(1)
47655 .m(m)
47656 .n(n)
47657 .k(1)
47658 .iterations(1)
47659 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47660 }
47661 }
47662}
47663
47664TEST(F32_IGEMM_4X4__SCALAR, k_eq_1_subtile_m) {
47665 for (uint32_t m = 1; m <= 4; m++) {
47666 GemmMicrokernelTester()
47667 .mr(4)
47668 .nr(4)
47669 .kr(1)
47670 .sr(1)
47671 .m(m)
47672 .n(4)
47673 .k(1)
47674 .iterations(1)
47675 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47676 }
47677}
47678
47679TEST(F32_IGEMM_4X4__SCALAR, k_eq_1_subtile_n) {
47680 for (uint32_t n = 1; n <= 4; n++) {
47681 GemmMicrokernelTester()
47682 .mr(4)
47683 .nr(4)
47684 .kr(1)
47685 .sr(1)
47686 .m(4)
47687 .n(n)
47688 .k(1)
47689 .iterations(1)
47690 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47691 }
47692}
47693
47694TEST(F32_IGEMM_4X4__SCALAR, k_gt_1) {
47695 for (size_t k = 2; k < 10; k++) {
47696 GemmMicrokernelTester()
47697 .mr(4)
47698 .nr(4)
47699 .kr(1)
47700 .sr(1)
47701 .m(4)
47702 .n(4)
47703 .k(k)
47704 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47705 }
47706}
47707
47708TEST(F32_IGEMM_4X4__SCALAR, k_gt_1_subtile) {
47709 for (size_t k = 2; k < 10; k++) {
47710 for (uint32_t m = 1; m <= 4; m++) {
47711 for (uint32_t n = 1; n <= 4; n++) {
47712 GemmMicrokernelTester()
47713 .mr(4)
47714 .nr(4)
47715 .kr(1)
47716 .sr(1)
47717 .m(m)
47718 .n(n)
47719 .k(k)
47720 .iterations(1)
47721 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47722 }
47723 }
47724 }
47725}
47726
47727TEST(F32_IGEMM_4X4__SCALAR, n_gt_4) {
47728 for (uint32_t n = 5; n < 8; n++) {
47729 for (size_t k = 1; k <= 5; k += 2) {
47730 GemmMicrokernelTester()
47731 .mr(4)
47732 .nr(4)
47733 .kr(1)
47734 .sr(1)
47735 .m(4)
47736 .n(4)
47737 .k(k)
47738 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47739 }
47740 }
47741}
47742
47743TEST(F32_IGEMM_4X4__SCALAR, n_gt_4_strided_cn) {
47744 for (uint32_t n = 5; n < 8; n++) {
47745 for (size_t k = 1; k <= 5; k += 2) {
47746 GemmMicrokernelTester()
47747 .mr(4)
47748 .nr(4)
47749 .kr(1)
47750 .sr(1)
47751 .m(4)
47752 .n(4)
47753 .k(k)
47754 .cn_stride(7)
47755 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47756 }
47757 }
47758}
47759
47760TEST(F32_IGEMM_4X4__SCALAR, n_gt_4_subtile) {
47761 for (uint32_t n = 5; n < 8; n++) {
47762 for (size_t k = 1; k <= 5; k += 2) {
47763 for (uint32_t m = 1; m <= 4; m++) {
47764 GemmMicrokernelTester()
47765 .mr(4)
47766 .nr(4)
47767 .kr(1)
47768 .sr(1)
47769 .m(m)
47770 .n(n)
47771 .k(k)
47772 .iterations(1)
47773 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47774 }
47775 }
47776 }
47777}
47778
47779TEST(F32_IGEMM_4X4__SCALAR, n_div_4) {
47780 for (uint32_t n = 8; n <= 12; n += 4) {
47781 for (size_t k = 1; k <= 5; k += 2) {
47782 GemmMicrokernelTester()
47783 .mr(4)
47784 .nr(4)
47785 .kr(1)
47786 .sr(1)
47787 .m(4)
47788 .n(4)
47789 .k(k)
47790 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47791 }
47792 }
47793}
47794
47795TEST(F32_IGEMM_4X4__SCALAR, n_div_4_strided_cn) {
47796 for (uint32_t n = 8; n <= 12; n += 4) {
47797 for (size_t k = 1; k <= 5; k += 2) {
47798 GemmMicrokernelTester()
47799 .mr(4)
47800 .nr(4)
47801 .kr(1)
47802 .sr(1)
47803 .m(4)
47804 .n(n)
47805 .k(k)
47806 .cn_stride(7)
47807 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47808 }
47809 }
47810}
47811
47812TEST(F32_IGEMM_4X4__SCALAR, n_div_4_subtile) {
47813 for (uint32_t n = 8; n <= 12; n += 4) {
47814 for (size_t k = 1; k <= 5; k += 2) {
47815 for (uint32_t m = 1; m <= 4; m++) {
47816 GemmMicrokernelTester()
47817 .mr(4)
47818 .nr(4)
47819 .kr(1)
47820 .sr(1)
47821 .m(m)
47822 .n(n)
47823 .k(k)
47824 .iterations(1)
47825 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47826 }
47827 }
47828 }
47829}
47830
47831TEST(F32_IGEMM_4X4__SCALAR, small_kernel) {
47832 for (size_t k = 1; k <= 5; k += 2) {
47833 GemmMicrokernelTester()
47834 .mr(4)
47835 .nr(4)
47836 .kr(1)
47837 .sr(1)
47838 .m(4)
47839 .n(4)
47840 .k(k)
47841 .ks(3)
47842 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47843 }
47844}
47845
47846TEST(F32_IGEMM_4X4__SCALAR, small_kernel_subtile) {
47847 for (size_t k = 1; k <= 5; k += 2) {
47848 for (uint32_t m = 1; m <= 4; m++) {
47849 for (uint32_t n = 1; n <= 4; n++) {
47850 GemmMicrokernelTester()
47851 .mr(4)
47852 .nr(4)
47853 .kr(1)
47854 .sr(1)
47855 .m(m)
47856 .n(n)
47857 .k(k)
47858 .ks(3)
47859 .iterations(1)
47860 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47861 }
47862 }
47863 }
47864}
47865
47866TEST(F32_IGEMM_4X4__SCALAR, n_gt_4_small_kernel) {
47867 for (uint32_t n = 5; n < 8; n++) {
47868 for (size_t k = 1; k <= 5; k += 2) {
47869 GemmMicrokernelTester()
47870 .mr(4)
47871 .nr(4)
47872 .kr(1)
47873 .sr(1)
47874 .m(4)
47875 .n(4)
47876 .k(k)
47877 .ks(3)
47878 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47879 }
47880 }
47881}
47882
47883TEST(F32_IGEMM_4X4__SCALAR, n_div_4_small_kernel) {
47884 for (uint32_t n = 8; n <= 12; n += 4) {
47885 for (size_t k = 1; k <= 5; k += 2) {
47886 GemmMicrokernelTester()
47887 .mr(4)
47888 .nr(4)
47889 .kr(1)
47890 .sr(1)
47891 .m(4)
47892 .n(4)
47893 .k(k)
47894 .ks(3)
47895 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47896 }
47897 }
47898}
47899
47900TEST(F32_IGEMM_4X4__SCALAR, strided_cm_subtile) {
47901 for (size_t k = 1; k <= 5; k += 2) {
47902 for (uint32_t m = 1; m <= 4; m++) {
47903 for (uint32_t n = 1; n <= 4; n++) {
47904 GemmMicrokernelTester()
47905 .mr(4)
47906 .nr(4)
47907 .kr(1)
47908 .sr(1)
47909 .m(m)
47910 .n(n)
47911 .k(k)
47912 .cm_stride(7)
47913 .iterations(1)
47914 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47915 }
47916 }
47917 }
47918}
47919
47920TEST(F32_IGEMM_4X4__SCALAR, a_offset) {
47921 for (size_t k = 1; k <= 5; k += 2) {
47922 GemmMicrokernelTester()
47923 .mr(4)
47924 .nr(4)
47925 .kr(1)
47926 .sr(1)
47927 .m(4)
47928 .n(4)
47929 .k(k)
47930 .ks(3)
47931 .a_offset(23)
47932 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47933 }
47934}
47935
47936TEST(F32_IGEMM_4X4__SCALAR, zero) {
47937 for (uint32_t mz = 0; mz < 4; mz++) {
47938 for (size_t k = 1; k <= 5; k += 2) {
47939 GemmMicrokernelTester()
47940 .mr(4)
47941 .nr(4)
47942 .kr(1)
47943 .sr(1)
47944 .m(4)
47945 .n(4)
47946 .k(k)
47947 .ks(3)
47948 .a_offset(23)
47949 .zero_index(mz)
47950 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47951 }
47952 }
47953}
47954
47955TEST(F32_IGEMM_4X4__SCALAR, qmin) {
47956 GemmMicrokernelTester()
47957 .mr(4)
47958 .nr(4)
47959 .kr(1)
47960 .sr(1)
47961 .m(4)
47962 .n(4)
47963 .k(1)
47964 .qmin(128)
47965 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47966}
47967
47968TEST(F32_IGEMM_4X4__SCALAR, qmax) {
47969 GemmMicrokernelTester()
47970 .mr(4)
47971 .nr(4)
47972 .kr(1)
47973 .sr(1)
47974 .m(4)
47975 .n(4)
47976 .k(1)
47977 .qmax(128)
47978 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47979}
47980
47981TEST(F32_IGEMM_4X4__SCALAR, strided_cm) {
47982 GemmMicrokernelTester()
47983 .mr(4)
47984 .nr(4)
47985 .kr(1)
47986 .sr(1)
47987 .m(4)
47988 .n(4)
47989 .k(1)
47990 .cm_stride(7)
47991 .Test(xnn_f32_igemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
47992}
47993
47994
47995TEST(F32_IGEMM_4X2__SCALAR, k_eq_1) {
47996 GemmMicrokernelTester()
47997 .mr(4)
47998 .nr(2)
47999 .kr(1)
48000 .sr(1)
48001 .m(4)
48002 .n(2)
48003 .k(1)
48004 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48005}
48006
48007TEST(F32_IGEMM_4X2__SCALAR, strided_cn) {
48008 GemmMicrokernelTester()
48009 .mr(4)
48010 .nr(2)
48011 .kr(1)
48012 .sr(1)
48013 .m(4)
48014 .n(2)
48015 .k(1)
48016 .cn_stride(5)
48017 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48018}
48019
48020TEST(F32_IGEMM_4X2__SCALAR, k_eq_1_subtile) {
48021 for (uint32_t m = 1; m <= 4; m++) {
48022 for (uint32_t n = 1; n <= 2; n++) {
48023 GemmMicrokernelTester()
48024 .mr(4)
48025 .nr(2)
48026 .kr(1)
48027 .sr(1)
48028 .m(m)
48029 .n(n)
48030 .k(1)
48031 .iterations(1)
48032 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48033 }
48034 }
48035}
48036
48037TEST(F32_IGEMM_4X2__SCALAR, k_eq_1_subtile_m) {
48038 for (uint32_t m = 1; m <= 4; m++) {
48039 GemmMicrokernelTester()
48040 .mr(4)
48041 .nr(2)
48042 .kr(1)
48043 .sr(1)
48044 .m(m)
48045 .n(2)
48046 .k(1)
48047 .iterations(1)
48048 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48049 }
48050}
48051
48052TEST(F32_IGEMM_4X2__SCALAR, k_eq_1_subtile_n) {
48053 for (uint32_t n = 1; n <= 2; n++) {
48054 GemmMicrokernelTester()
48055 .mr(4)
48056 .nr(2)
48057 .kr(1)
48058 .sr(1)
48059 .m(4)
48060 .n(n)
48061 .k(1)
48062 .iterations(1)
48063 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48064 }
48065}
48066
48067TEST(F32_IGEMM_4X2__SCALAR, k_gt_1) {
48068 for (size_t k = 2; k < 10; k++) {
48069 GemmMicrokernelTester()
48070 .mr(4)
48071 .nr(2)
48072 .kr(1)
48073 .sr(1)
48074 .m(4)
48075 .n(2)
48076 .k(k)
48077 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48078 }
48079}
48080
48081TEST(F32_IGEMM_4X2__SCALAR, k_gt_1_subtile) {
48082 for (size_t k = 2; k < 10; k++) {
48083 for (uint32_t m = 1; m <= 4; m++) {
48084 for (uint32_t n = 1; n <= 2; n++) {
48085 GemmMicrokernelTester()
48086 .mr(4)
48087 .nr(2)
48088 .kr(1)
48089 .sr(1)
48090 .m(m)
48091 .n(n)
48092 .k(k)
48093 .iterations(1)
48094 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48095 }
48096 }
48097 }
48098}
48099
48100TEST(F32_IGEMM_4X2__SCALAR, n_gt_2) {
48101 for (uint32_t n = 3; n < 4; n++) {
48102 for (size_t k = 1; k <= 5; k += 2) {
48103 GemmMicrokernelTester()
48104 .mr(4)
48105 .nr(2)
48106 .kr(1)
48107 .sr(1)
48108 .m(4)
48109 .n(2)
48110 .k(k)
48111 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48112 }
48113 }
48114}
48115
48116TEST(F32_IGEMM_4X2__SCALAR, n_gt_2_strided_cn) {
48117 for (uint32_t n = 3; n < 4; n++) {
48118 for (size_t k = 1; k <= 5; k += 2) {
48119 GemmMicrokernelTester()
48120 .mr(4)
48121 .nr(2)
48122 .kr(1)
48123 .sr(1)
48124 .m(4)
48125 .n(2)
48126 .k(k)
48127 .cn_stride(5)
48128 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48129 }
48130 }
48131}
48132
48133TEST(F32_IGEMM_4X2__SCALAR, n_gt_2_subtile) {
48134 for (uint32_t n = 3; n < 4; n++) {
48135 for (size_t k = 1; k <= 5; k += 2) {
48136 for (uint32_t m = 1; m <= 4; m++) {
48137 GemmMicrokernelTester()
48138 .mr(4)
48139 .nr(2)
48140 .kr(1)
48141 .sr(1)
48142 .m(m)
48143 .n(n)
48144 .k(k)
48145 .iterations(1)
48146 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48147 }
48148 }
48149 }
48150}
48151
48152TEST(F32_IGEMM_4X2__SCALAR, n_div_2) {
48153 for (uint32_t n = 4; n <= 6; n += 2) {
48154 for (size_t k = 1; k <= 5; k += 2) {
48155 GemmMicrokernelTester()
48156 .mr(4)
48157 .nr(2)
48158 .kr(1)
48159 .sr(1)
48160 .m(4)
48161 .n(2)
48162 .k(k)
48163 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48164 }
48165 }
48166}
48167
48168TEST(F32_IGEMM_4X2__SCALAR, n_div_2_strided_cn) {
48169 for (uint32_t n = 4; n <= 6; n += 2) {
48170 for (size_t k = 1; k <= 5; k += 2) {
48171 GemmMicrokernelTester()
48172 .mr(4)
48173 .nr(2)
48174 .kr(1)
48175 .sr(1)
48176 .m(4)
48177 .n(n)
48178 .k(k)
48179 .cn_stride(5)
48180 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48181 }
48182 }
48183}
48184
48185TEST(F32_IGEMM_4X2__SCALAR, n_div_2_subtile) {
48186 for (uint32_t n = 4; n <= 6; n += 2) {
48187 for (size_t k = 1; k <= 5; k += 2) {
48188 for (uint32_t m = 1; m <= 4; m++) {
48189 GemmMicrokernelTester()
48190 .mr(4)
48191 .nr(2)
48192 .kr(1)
48193 .sr(1)
48194 .m(m)
48195 .n(n)
48196 .k(k)
48197 .iterations(1)
48198 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48199 }
48200 }
48201 }
48202}
48203
48204TEST(F32_IGEMM_4X2__SCALAR, small_kernel) {
48205 for (size_t k = 1; k <= 5; k += 2) {
48206 GemmMicrokernelTester()
48207 .mr(4)
48208 .nr(2)
48209 .kr(1)
48210 .sr(1)
48211 .m(4)
48212 .n(2)
48213 .k(k)
48214 .ks(3)
48215 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48216 }
48217}
48218
48219TEST(F32_IGEMM_4X2__SCALAR, small_kernel_subtile) {
48220 for (size_t k = 1; k <= 5; k += 2) {
48221 for (uint32_t m = 1; m <= 4; m++) {
48222 for (uint32_t n = 1; n <= 2; n++) {
48223 GemmMicrokernelTester()
48224 .mr(4)
48225 .nr(2)
48226 .kr(1)
48227 .sr(1)
48228 .m(m)
48229 .n(n)
48230 .k(k)
48231 .ks(3)
48232 .iterations(1)
48233 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48234 }
48235 }
48236 }
48237}
48238
48239TEST(F32_IGEMM_4X2__SCALAR, n_gt_2_small_kernel) {
48240 for (uint32_t n = 3; n < 4; n++) {
48241 for (size_t k = 1; k <= 5; k += 2) {
48242 GemmMicrokernelTester()
48243 .mr(4)
48244 .nr(2)
48245 .kr(1)
48246 .sr(1)
48247 .m(4)
48248 .n(2)
48249 .k(k)
48250 .ks(3)
48251 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48252 }
48253 }
48254}
48255
48256TEST(F32_IGEMM_4X2__SCALAR, n_div_2_small_kernel) {
48257 for (uint32_t n = 4; n <= 6; n += 2) {
48258 for (size_t k = 1; k <= 5; k += 2) {
48259 GemmMicrokernelTester()
48260 .mr(4)
48261 .nr(2)
48262 .kr(1)
48263 .sr(1)
48264 .m(4)
48265 .n(2)
48266 .k(k)
48267 .ks(3)
48268 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48269 }
48270 }
48271}
48272
48273TEST(F32_IGEMM_4X2__SCALAR, strided_cm_subtile) {
48274 for (size_t k = 1; k <= 5; k += 2) {
48275 for (uint32_t m = 1; m <= 4; m++) {
48276 for (uint32_t n = 1; n <= 2; n++) {
48277 GemmMicrokernelTester()
48278 .mr(4)
48279 .nr(2)
48280 .kr(1)
48281 .sr(1)
48282 .m(m)
48283 .n(n)
48284 .k(k)
48285 .cm_stride(5)
48286 .iterations(1)
48287 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48288 }
48289 }
48290 }
48291}
48292
48293TEST(F32_IGEMM_4X2__SCALAR, a_offset) {
48294 for (size_t k = 1; k <= 5; k += 2) {
48295 GemmMicrokernelTester()
48296 .mr(4)
48297 .nr(2)
48298 .kr(1)
48299 .sr(1)
48300 .m(4)
48301 .n(2)
48302 .k(k)
48303 .ks(3)
48304 .a_offset(23)
48305 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48306 }
48307}
48308
48309TEST(F32_IGEMM_4X2__SCALAR, zero) {
48310 for (uint32_t mz = 0; mz < 4; mz++) {
48311 for (size_t k = 1; k <= 5; k += 2) {
48312 GemmMicrokernelTester()
48313 .mr(4)
48314 .nr(2)
48315 .kr(1)
48316 .sr(1)
48317 .m(4)
48318 .n(2)
48319 .k(k)
48320 .ks(3)
48321 .a_offset(23)
48322 .zero_index(mz)
48323 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48324 }
48325 }
48326}
48327
48328TEST(F32_IGEMM_4X2__SCALAR, qmin) {
48329 GemmMicrokernelTester()
48330 .mr(4)
48331 .nr(2)
48332 .kr(1)
48333 .sr(1)
48334 .m(4)
48335 .n(2)
48336 .k(1)
48337 .qmin(128)
48338 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48339}
48340
48341TEST(F32_IGEMM_4X2__SCALAR, qmax) {
48342 GemmMicrokernelTester()
48343 .mr(4)
48344 .nr(2)
48345 .kr(1)
48346 .sr(1)
48347 .m(4)
48348 .n(2)
48349 .k(1)
48350 .qmax(128)
48351 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48352}
48353
48354TEST(F32_IGEMM_4X2__SCALAR, strided_cm) {
48355 GemmMicrokernelTester()
48356 .mr(4)
48357 .nr(2)
48358 .kr(1)
48359 .sr(1)
48360 .m(4)
48361 .n(2)
48362 .k(1)
48363 .cm_stride(5)
48364 .Test(xnn_f32_igemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
48365}