blob: a0f4edd5c256e007dff68faeebdeda6fab89cf0d [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/f32-gemminc.yaml
// Generator: tools/generate-gemm-test.py
12
13
XNNPACK Teamb455b122019-09-27 18:10:33 -070014#include <gtest/gtest.h>
15
Marat Dukhan1dadbf72019-10-01 10:46:20 -070016#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
XNNPACK Teamb455b122019-09-27 18:10:33 -070019#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070022#include "gemm-microkernel-tester.h"
23
24
Frank Barchard7e955972019-10-11 10:34:25 -070025#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard21be34f2019-10-09 19:32:19 -070026 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
27 TEST_REQUIRES_ARM_NEON_FMA;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(8)
36 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
37 }
38
39 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FMA;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(8)
49 .cn_stride(11)
50 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
51 }
52
53 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
54 TEST_REQUIRES_ARM_NEON_FMA;
55 GemmMicrokernelTester()
56 .mr(1)
57 .nr(8)
58 .kr(1)
59 .sr(1)
60 .m(1)
61 .n(8)
62 .k(8)
63 .a_stride(11)
64 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
65 }
66
67 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
68 TEST_REQUIRES_ARM_NEON_FMA;
69 for (uint32_t m = 1; m <= 1; m++) {
70 for (uint32_t n = 1; n <= 8; n++) {
71 GemmMicrokernelTester()
72 .mr(1)
73 .nr(8)
74 .kr(1)
75 .sr(1)
76 .m(m)
77 .n(n)
78 .k(8)
79 .iterations(1)
80 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
81 }
82 }
83 }
84
85 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
86 TEST_REQUIRES_ARM_NEON_FMA;
87 for (uint32_t m = 1; m <= 1; m++) {
88 GemmMicrokernelTester()
89 .mr(1)
90 .nr(8)
91 .kr(1)
92 .sr(1)
93 .m(m)
94 .n(8)
95 .k(8)
96 .iterations(1)
97 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
98 }
99 }
100
101 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
102 TEST_REQUIRES_ARM_NEON_FMA;
103 for (uint32_t n = 1; n <= 8; n++) {
104 GemmMicrokernelTester()
105 .mr(1)
106 .nr(8)
107 .kr(1)
108 .sr(1)
109 .m(1)
110 .n(n)
111 .k(8)
112 .iterations(1)
113 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
114 }
115 }
116
117 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
118 TEST_REQUIRES_ARM_NEON_FMA;
119 GemmMicrokernelTester()
120 .mr(1)
121 .nr(8)
122 .kr(1)
123 .sr(1)
124 .m(1)
125 .n(8)
126 .k(16)
127 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
128 }
129
130 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_strided_a) {
131 TEST_REQUIRES_ARM_NEON_FMA;
132 GemmMicrokernelTester()
133 .mr(1)
134 .nr(8)
135 .kr(1)
136 .sr(1)
137 .m(1)
138 .n(8)
139 .k(16)
140 .a_stride(19)
141 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
142 }
143
144 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
145 TEST_REQUIRES_ARM_NEON_FMA;
146 for (uint32_t m = 1; m <= 1; m++) {
147 for (uint32_t n = 1; n <= 8; n++) {
148 GemmMicrokernelTester()
149 .mr(1)
150 .nr(8)
151 .kr(1)
152 .sr(1)
153 .m(m)
154 .n(n)
155 .k(16)
156 .iterations(1)
157 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
158 }
159 }
160 }
161
162 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
163 TEST_REQUIRES_ARM_NEON_FMA;
164 for (size_t k = 1; k < 16; k++) {
165 GemmMicrokernelTester()
166 .mr(1)
167 .nr(8)
168 .kr(1)
169 .sr(1)
170 .m(1)
171 .n(8)
172 .k(k)
173 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
174 }
175 }
176
177 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_strided_a) {
178 TEST_REQUIRES_ARM_NEON_FMA;
179 for (size_t k = 1; k < 16; k++) {
180 GemmMicrokernelTester()
181 .mr(1)
182 .nr(8)
183 .kr(1)
184 .sr(1)
185 .m(1)
186 .n(8)
187 .k(k)
188 .a_stride(19)
189 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
190 }
191 }
192
193 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
194 TEST_REQUIRES_ARM_NEON_FMA;
195 for (size_t k = 1; k < 16; k++) {
196 for (uint32_t m = 1; m <= 1; m++) {
197 for (uint32_t n = 1; n <= 8; n++) {
198 GemmMicrokernelTester()
199 .mr(1)
200 .nr(8)
201 .kr(1)
202 .sr(1)
203 .m(m)
204 .n(n)
205 .k(k)
206 .iterations(1)
207 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
208 }
209 }
210 }
211 }
212
213 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
214 TEST_REQUIRES_ARM_NEON_FMA;
215 for (size_t k = 17; k < 16; k++) {
216 GemmMicrokernelTester()
217 .mr(1)
218 .nr(8)
219 .kr(1)
220 .sr(1)
221 .m(1)
222 .n(8)
223 .k(k)
224 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
225 }
226 }
227
228 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
229 TEST_REQUIRES_ARM_NEON_FMA;
230 for (size_t k = 17; k < 16; k++) {
231 GemmMicrokernelTester()
232 .mr(1)
233 .nr(8)
234 .kr(1)
235 .sr(1)
236 .m(1)
237 .n(8)
238 .k(k)
239 .a_stride(19)
240 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
241 }
242 }
243
244 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
245 TEST_REQUIRES_ARM_NEON_FMA;
246 for (size_t k = 17; k < 16; k++) {
247 for (uint32_t m = 1; m <= 1; m++) {
248 for (uint32_t n = 1; n <= 8; n++) {
249 GemmMicrokernelTester()
250 .mr(1)
251 .nr(8)
252 .kr(1)
253 .sr(1)
254 .m(m)
255 .n(n)
256 .k(k)
257 .iterations(1)
258 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
259 }
260 }
261 }
262 }
263
264 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
265 TEST_REQUIRES_ARM_NEON_FMA;
266 for (size_t k = 24; k <= 80; k += 8) {
267 GemmMicrokernelTester()
268 .mr(1)
269 .nr(8)
270 .kr(1)
271 .sr(1)
272 .m(1)
273 .n(8)
274 .k(k)
275 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
276 }
277 }
278
279 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_strided_a) {
280 TEST_REQUIRES_ARM_NEON_FMA;
281 for (size_t k = 24; k <= 80; k += 8) {
282 GemmMicrokernelTester()
283 .mr(1)
284 .nr(8)
285 .kr(1)
286 .sr(1)
287 .m(1)
288 .n(8)
289 .k(k)
290 .a_stride(83)
291 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
292 }
293 }
294
295 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
296 TEST_REQUIRES_ARM_NEON_FMA;
297 for (size_t k = 24; k <= 80; k += 8) {
298 for (uint32_t m = 1; m <= 1; m++) {
299 for (uint32_t n = 1; n <= 8; n++) {
300 GemmMicrokernelTester()
301 .mr(1)
302 .nr(8)
303 .kr(1)
304 .sr(1)
305 .m(m)
306 .n(n)
307 .k(k)
308 .iterations(1)
309 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
310 }
311 }
312 }
313 }
314
315 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
316 TEST_REQUIRES_ARM_NEON_FMA;
317 for (uint32_t n = 9; n < 16; n++) {
318 for (size_t k = 1; k <= 40; k += 9) {
319 GemmMicrokernelTester()
320 .mr(1)
321 .nr(8)
322 .kr(1)
323 .sr(1)
324 .m(1)
325 .n(8)
326 .k(k)
327 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
328 }
329 }
330 }
331
332 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
333 TEST_REQUIRES_ARM_NEON_FMA;
334 for (uint32_t n = 9; n < 16; n++) {
335 for (size_t k = 1; k <= 40; k += 9) {
336 GemmMicrokernelTester()
337 .mr(1)
338 .nr(8)
339 .kr(1)
340 .sr(1)
341 .m(1)
342 .n(8)
343 .k(k)
344 .cn_stride(11)
345 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
346 }
347 }
348 }
349
350 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
351 TEST_REQUIRES_ARM_NEON_FMA;
352 for (uint32_t n = 9; n < 16; n++) {
353 for (size_t k = 1; k <= 40; k += 9) {
354 GemmMicrokernelTester()
355 .mr(1)
356 .nr(8)
357 .kr(1)
358 .sr(1)
359 .m(1)
360 .n(n)
361 .k(k)
362 .a_stride(43)
363 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
364 }
365 }
366 }
367
368 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
369 TEST_REQUIRES_ARM_NEON_FMA;
370 for (uint32_t n = 9; n < 16; n++) {
371 for (size_t k = 1; k <= 40; k += 9) {
372 for (uint32_t m = 1; m <= 1; m++) {
373 GemmMicrokernelTester()
374 .mr(1)
375 .nr(8)
376 .kr(1)
377 .sr(1)
378 .m(m)
379 .n(n)
380 .k(k)
381 .iterations(1)
382 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
383 }
384 }
385 }
386 }
387
388 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
389 TEST_REQUIRES_ARM_NEON_FMA;
390 for (uint32_t n = 16; n <= 24; n += 8) {
391 for (size_t k = 1; k <= 40; k += 9) {
392 GemmMicrokernelTester()
393 .mr(1)
394 .nr(8)
395 .kr(1)
396 .sr(1)
397 .m(1)
398 .n(8)
399 .k(k)
400 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
401 }
402 }
403 }
404
405 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
406 TEST_REQUIRES_ARM_NEON_FMA;
407 for (uint32_t n = 16; n <= 24; n += 8) {
408 for (size_t k = 1; k <= 40; k += 9) {
409 GemmMicrokernelTester()
410 .mr(1)
411 .nr(8)
412 .kr(1)
413 .sr(1)
414 .m(1)
415 .n(n)
416 .k(k)
417 .cn_stride(11)
418 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
419 }
420 }
421 }
422
423 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
424 TEST_REQUIRES_ARM_NEON_FMA;
425 for (uint32_t n = 16; n <= 24; n += 8) {
426 for (size_t k = 1; k <= 40; k += 9) {
427 GemmMicrokernelTester()
428 .mr(1)
429 .nr(8)
430 .kr(1)
431 .sr(1)
432 .m(1)
433 .n(n)
434 .k(k)
435 .a_stride(43)
436 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
437 }
438 }
439 }
440
441 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
442 TEST_REQUIRES_ARM_NEON_FMA;
443 for (uint32_t n = 16; n <= 24; n += 8) {
444 for (size_t k = 1; k <= 40; k += 9) {
445 for (uint32_t m = 1; m <= 1; m++) {
446 GemmMicrokernelTester()
447 .mr(1)
448 .nr(8)
449 .kr(1)
450 .sr(1)
451 .m(m)
452 .n(n)
453 .k(k)
454 .iterations(1)
455 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
456 }
457 }
458 }
459 }
460
461 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
462 TEST_REQUIRES_ARM_NEON_FMA;
463 for (size_t k = 1; k <= 40; k += 9) {
464 for (uint32_t m = 1; m <= 1; m++) {
465 for (uint32_t n = 1; n <= 8; n++) {
466 GemmMicrokernelTester()
467 .mr(1)
468 .nr(8)
469 .kr(1)
470 .sr(1)
471 .m(m)
472 .n(n)
473 .k(k)
474 .cm_stride(11)
475 .iterations(1)
476 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
477 }
478 }
479 }
480 }
481
482 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
483 TEST_REQUIRES_ARM_NEON_FMA;
484 GemmMicrokernelTester()
485 .mr(1)
486 .nr(8)
487 .kr(1)
488 .sr(1)
489 .m(1)
490 .n(8)
491 .k(8)
492 .qmin(128)
493 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
494 }
495
496 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
497 TEST_REQUIRES_ARM_NEON_FMA;
498 GemmMicrokernelTester()
499 .mr(1)
500 .nr(8)
501 .kr(1)
502 .sr(1)
503 .m(1)
504 .n(8)
505 .k(8)
506 .qmax(128)
507 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
508 }
509
510 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
511 TEST_REQUIRES_ARM_NEON_FMA;
512 GemmMicrokernelTester()
513 .mr(1)
514 .nr(8)
515 .kr(1)
516 .sr(1)
517 .m(1)
518 .n(8)
519 .k(8)
520 .cm_stride(11)
521 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a53);
522 }
Frank Barchard7e955972019-10-11 10:34:25 -0700523#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard21be34f2019-10-09 19:32:19 -0700524
525
Frank Barchard7e955972019-10-11 10:34:25 -0700526#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -0700527 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
528 TEST_REQUIRES_ARM_NEON_FMA;
529 GemmMicrokernelTester()
530 .mr(1)
531 .nr(8)
532 .kr(1)
533 .sr(1)
534 .m(1)
535 .n(8)
536 .k(8)
537 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
538 }
539
540 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
541 TEST_REQUIRES_ARM_NEON_FMA;
542 GemmMicrokernelTester()
543 .mr(1)
544 .nr(8)
545 .kr(1)
546 .sr(1)
547 .m(1)
548 .n(8)
549 .k(8)
550 .cn_stride(11)
551 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
552 }
553
554 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
555 TEST_REQUIRES_ARM_NEON_FMA;
556 GemmMicrokernelTester()
557 .mr(1)
558 .nr(8)
559 .kr(1)
560 .sr(1)
561 .m(1)
562 .n(8)
563 .k(8)
564 .a_stride(11)
565 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
566 }
567
568 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
569 TEST_REQUIRES_ARM_NEON_FMA;
570 for (uint32_t m = 1; m <= 1; m++) {
571 for (uint32_t n = 1; n <= 8; n++) {
572 GemmMicrokernelTester()
573 .mr(1)
574 .nr(8)
575 .kr(1)
576 .sr(1)
577 .m(m)
578 .n(n)
579 .k(8)
580 .iterations(1)
581 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
582 }
583 }
584 }
585
586 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
587 TEST_REQUIRES_ARM_NEON_FMA;
588 for (uint32_t m = 1; m <= 1; m++) {
589 GemmMicrokernelTester()
590 .mr(1)
591 .nr(8)
592 .kr(1)
593 .sr(1)
594 .m(m)
595 .n(8)
596 .k(8)
597 .iterations(1)
598 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
599 }
600 }
601
602 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
603 TEST_REQUIRES_ARM_NEON_FMA;
604 for (uint32_t n = 1; n <= 8; n++) {
605 GemmMicrokernelTester()
606 .mr(1)
607 .nr(8)
608 .kr(1)
609 .sr(1)
610 .m(1)
611 .n(n)
612 .k(8)
613 .iterations(1)
614 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
615 }
616 }
617
618 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
619 TEST_REQUIRES_ARM_NEON_FMA;
620 GemmMicrokernelTester()
621 .mr(1)
622 .nr(8)
623 .kr(1)
624 .sr(1)
625 .m(1)
626 .n(8)
627 .k(16)
628 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
629 }
630
631 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
632 TEST_REQUIRES_ARM_NEON_FMA;
633 GemmMicrokernelTester()
634 .mr(1)
635 .nr(8)
636 .kr(1)
637 .sr(1)
638 .m(1)
639 .n(8)
640 .k(16)
641 .a_stride(19)
642 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
643 }
644
645 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
646 TEST_REQUIRES_ARM_NEON_FMA;
647 for (uint32_t m = 1; m <= 1; m++) {
648 for (uint32_t n = 1; n <= 8; n++) {
649 GemmMicrokernelTester()
650 .mr(1)
651 .nr(8)
652 .kr(1)
653 .sr(1)
654 .m(m)
655 .n(n)
656 .k(16)
657 .iterations(1)
658 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
659 }
660 }
661 }
662
663 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
664 TEST_REQUIRES_ARM_NEON_FMA;
665 for (size_t k = 1; k < 16; k++) {
666 GemmMicrokernelTester()
667 .mr(1)
668 .nr(8)
669 .kr(1)
670 .sr(1)
671 .m(1)
672 .n(8)
673 .k(k)
674 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
675 }
676 }
677
678 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
679 TEST_REQUIRES_ARM_NEON_FMA;
680 for (size_t k = 1; k < 16; k++) {
681 GemmMicrokernelTester()
682 .mr(1)
683 .nr(8)
684 .kr(1)
685 .sr(1)
686 .m(1)
687 .n(8)
688 .k(k)
689 .a_stride(19)
690 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
691 }
692 }
693
694 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
695 TEST_REQUIRES_ARM_NEON_FMA;
696 for (size_t k = 1; k < 16; k++) {
697 for (uint32_t m = 1; m <= 1; m++) {
698 for (uint32_t n = 1; n <= 8; n++) {
699 GemmMicrokernelTester()
700 .mr(1)
701 .nr(8)
702 .kr(1)
703 .sr(1)
704 .m(m)
705 .n(n)
706 .k(k)
707 .iterations(1)
708 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
709 }
710 }
711 }
712 }
713
714 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
715 TEST_REQUIRES_ARM_NEON_FMA;
716 for (size_t k = 17; k < 16; k++) {
717 GemmMicrokernelTester()
718 .mr(1)
719 .nr(8)
720 .kr(1)
721 .sr(1)
722 .m(1)
723 .n(8)
724 .k(k)
725 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
726 }
727 }
728
729 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
730 TEST_REQUIRES_ARM_NEON_FMA;
731 for (size_t k = 17; k < 16; k++) {
732 GemmMicrokernelTester()
733 .mr(1)
734 .nr(8)
735 .kr(1)
736 .sr(1)
737 .m(1)
738 .n(8)
739 .k(k)
740 .a_stride(19)
741 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
742 }
743 }
744
745 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
746 TEST_REQUIRES_ARM_NEON_FMA;
747 for (size_t k = 17; k < 16; k++) {
748 for (uint32_t m = 1; m <= 1; m++) {
749 for (uint32_t n = 1; n <= 8; n++) {
750 GemmMicrokernelTester()
751 .mr(1)
752 .nr(8)
753 .kr(1)
754 .sr(1)
755 .m(m)
756 .n(n)
757 .k(k)
758 .iterations(1)
759 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
760 }
761 }
762 }
763 }
764
765 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
766 TEST_REQUIRES_ARM_NEON_FMA;
767 for (size_t k = 24; k <= 80; k += 8) {
768 GemmMicrokernelTester()
769 .mr(1)
770 .nr(8)
771 .kr(1)
772 .sr(1)
773 .m(1)
774 .n(8)
775 .k(k)
776 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
777 }
778 }
779
780 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
781 TEST_REQUIRES_ARM_NEON_FMA;
782 for (size_t k = 24; k <= 80; k += 8) {
783 GemmMicrokernelTester()
784 .mr(1)
785 .nr(8)
786 .kr(1)
787 .sr(1)
788 .m(1)
789 .n(8)
790 .k(k)
791 .a_stride(83)
792 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
793 }
794 }
795
796 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
797 TEST_REQUIRES_ARM_NEON_FMA;
798 for (size_t k = 24; k <= 80; k += 8) {
799 for (uint32_t m = 1; m <= 1; m++) {
800 for (uint32_t n = 1; n <= 8; n++) {
801 GemmMicrokernelTester()
802 .mr(1)
803 .nr(8)
804 .kr(1)
805 .sr(1)
806 .m(m)
807 .n(n)
808 .k(k)
809 .iterations(1)
810 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
811 }
812 }
813 }
814 }
815
816 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
817 TEST_REQUIRES_ARM_NEON_FMA;
818 for (uint32_t n = 9; n < 16; n++) {
819 for (size_t k = 1; k <= 40; k += 9) {
820 GemmMicrokernelTester()
821 .mr(1)
822 .nr(8)
823 .kr(1)
824 .sr(1)
825 .m(1)
826 .n(8)
827 .k(k)
828 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
829 }
830 }
831 }
832
833 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
834 TEST_REQUIRES_ARM_NEON_FMA;
835 for (uint32_t n = 9; n < 16; n++) {
836 for (size_t k = 1; k <= 40; k += 9) {
837 GemmMicrokernelTester()
838 .mr(1)
839 .nr(8)
840 .kr(1)
841 .sr(1)
842 .m(1)
843 .n(8)
844 .k(k)
845 .cn_stride(11)
846 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
847 }
848 }
849 }
850
851 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
852 TEST_REQUIRES_ARM_NEON_FMA;
853 for (uint32_t n = 9; n < 16; n++) {
854 for (size_t k = 1; k <= 40; k += 9) {
855 GemmMicrokernelTester()
856 .mr(1)
857 .nr(8)
858 .kr(1)
859 .sr(1)
860 .m(1)
861 .n(n)
862 .k(k)
863 .a_stride(43)
864 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
865 }
866 }
867 }
868
869 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
870 TEST_REQUIRES_ARM_NEON_FMA;
871 for (uint32_t n = 9; n < 16; n++) {
872 for (size_t k = 1; k <= 40; k += 9) {
873 for (uint32_t m = 1; m <= 1; m++) {
874 GemmMicrokernelTester()
875 .mr(1)
876 .nr(8)
877 .kr(1)
878 .sr(1)
879 .m(m)
880 .n(n)
881 .k(k)
882 .iterations(1)
883 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
884 }
885 }
886 }
887 }
888
889 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
890 TEST_REQUIRES_ARM_NEON_FMA;
891 for (uint32_t n = 16; n <= 24; n += 8) {
892 for (size_t k = 1; k <= 40; k += 9) {
893 GemmMicrokernelTester()
894 .mr(1)
895 .nr(8)
896 .kr(1)
897 .sr(1)
898 .m(1)
899 .n(8)
900 .k(k)
901 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
902 }
903 }
904 }
905
906 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
907 TEST_REQUIRES_ARM_NEON_FMA;
908 for (uint32_t n = 16; n <= 24; n += 8) {
909 for (size_t k = 1; k <= 40; k += 9) {
910 GemmMicrokernelTester()
911 .mr(1)
912 .nr(8)
913 .kr(1)
914 .sr(1)
915 .m(1)
916 .n(n)
917 .k(k)
918 .cn_stride(11)
919 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
920 }
921 }
922 }
923
924 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
925 TEST_REQUIRES_ARM_NEON_FMA;
926 for (uint32_t n = 16; n <= 24; n += 8) {
927 for (size_t k = 1; k <= 40; k += 9) {
928 GemmMicrokernelTester()
929 .mr(1)
930 .nr(8)
931 .kr(1)
932 .sr(1)
933 .m(1)
934 .n(n)
935 .k(k)
936 .a_stride(43)
937 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
938 }
939 }
940 }
941
942 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
943 TEST_REQUIRES_ARM_NEON_FMA;
944 for (uint32_t n = 16; n <= 24; n += 8) {
945 for (size_t k = 1; k <= 40; k += 9) {
946 for (uint32_t m = 1; m <= 1; m++) {
947 GemmMicrokernelTester()
948 .mr(1)
949 .nr(8)
950 .kr(1)
951 .sr(1)
952 .m(m)
953 .n(n)
954 .k(k)
955 .iterations(1)
956 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
957 }
958 }
959 }
960 }
961
962 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
963 TEST_REQUIRES_ARM_NEON_FMA;
964 for (size_t k = 1; k <= 40; k += 9) {
965 for (uint32_t m = 1; m <= 1; m++) {
966 for (uint32_t n = 1; n <= 8; n++) {
967 GemmMicrokernelTester()
968 .mr(1)
969 .nr(8)
970 .kr(1)
971 .sr(1)
972 .m(m)
973 .n(n)
974 .k(k)
975 .cm_stride(11)
976 .iterations(1)
977 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
978 }
979 }
980 }
981 }
982
983 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
984 TEST_REQUIRES_ARM_NEON_FMA;
985 GemmMicrokernelTester()
986 .mr(1)
987 .nr(8)
988 .kr(1)
989 .sr(1)
990 .m(1)
991 .n(8)
992 .k(8)
993 .qmin(128)
994 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
995 }
996
997 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
998 TEST_REQUIRES_ARM_NEON_FMA;
999 GemmMicrokernelTester()
1000 .mr(1)
1001 .nr(8)
1002 .kr(1)
1003 .sr(1)
1004 .m(1)
1005 .n(8)
1006 .k(8)
1007 .qmax(128)
1008 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
1009 }
1010
1011 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
1012 TEST_REQUIRES_ARM_NEON_FMA;
1013 GemmMicrokernelTester()
1014 .mr(1)
1015 .nr(8)
1016 .kr(1)
1017 .sr(1)
1018 .m(1)
1019 .n(8)
1020 .k(8)
1021 .cm_stride(11)
1022 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a57);
1023 }
Frank Barchard7e955972019-10-11 10:34:25 -07001024#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001025
1026
Frank Barchard7e955972019-10-11 10:34:25 -07001027#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001028 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
1029 TEST_REQUIRES_ARM_NEON_FMA;
1030 GemmMicrokernelTester()
1031 .mr(1)
1032 .nr(8)
1033 .kr(1)
1034 .sr(1)
1035 .m(1)
1036 .n(8)
1037 .k(8)
1038 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1039 }
1040
1041 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
1042 TEST_REQUIRES_ARM_NEON_FMA;
1043 GemmMicrokernelTester()
1044 .mr(1)
1045 .nr(8)
1046 .kr(1)
1047 .sr(1)
1048 .m(1)
1049 .n(8)
1050 .k(8)
1051 .cn_stride(11)
1052 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1053 }
1054
1055 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
1056 TEST_REQUIRES_ARM_NEON_FMA;
1057 GemmMicrokernelTester()
1058 .mr(1)
1059 .nr(8)
1060 .kr(1)
1061 .sr(1)
1062 .m(1)
1063 .n(8)
1064 .k(8)
1065 .a_stride(11)
1066 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1067 }
1068
1069 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
1070 TEST_REQUIRES_ARM_NEON_FMA;
1071 for (uint32_t m = 1; m <= 1; m++) {
1072 for (uint32_t n = 1; n <= 8; n++) {
1073 GemmMicrokernelTester()
1074 .mr(1)
1075 .nr(8)
1076 .kr(1)
1077 .sr(1)
1078 .m(m)
1079 .n(n)
1080 .k(8)
1081 .iterations(1)
1082 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1083 }
1084 }
1085 }
1086
1087 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
1088 TEST_REQUIRES_ARM_NEON_FMA;
1089 for (uint32_t m = 1; m <= 1; m++) {
1090 GemmMicrokernelTester()
1091 .mr(1)
1092 .nr(8)
1093 .kr(1)
1094 .sr(1)
1095 .m(m)
1096 .n(8)
1097 .k(8)
1098 .iterations(1)
1099 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1100 }
1101 }
1102
1103 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
1104 TEST_REQUIRES_ARM_NEON_FMA;
1105 for (uint32_t n = 1; n <= 8; n++) {
1106 GemmMicrokernelTester()
1107 .mr(1)
1108 .nr(8)
1109 .kr(1)
1110 .sr(1)
1111 .m(1)
1112 .n(n)
1113 .k(8)
1114 .iterations(1)
1115 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1116 }
1117 }
1118
1119 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
1120 TEST_REQUIRES_ARM_NEON_FMA;
1121 GemmMicrokernelTester()
1122 .mr(1)
1123 .nr(8)
1124 .kr(1)
1125 .sr(1)
1126 .m(1)
1127 .n(8)
1128 .k(16)
1129 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1130 }
1131
1132 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
1133 TEST_REQUIRES_ARM_NEON_FMA;
1134 GemmMicrokernelTester()
1135 .mr(1)
1136 .nr(8)
1137 .kr(1)
1138 .sr(1)
1139 .m(1)
1140 .n(8)
1141 .k(16)
1142 .a_stride(19)
1143 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1144 }
1145
1146 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
1147 TEST_REQUIRES_ARM_NEON_FMA;
1148 for (uint32_t m = 1; m <= 1; m++) {
1149 for (uint32_t n = 1; n <= 8; n++) {
1150 GemmMicrokernelTester()
1151 .mr(1)
1152 .nr(8)
1153 .kr(1)
1154 .sr(1)
1155 .m(m)
1156 .n(n)
1157 .k(16)
1158 .iterations(1)
1159 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1160 }
1161 }
1162 }
1163
1164 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
1165 TEST_REQUIRES_ARM_NEON_FMA;
1166 for (size_t k = 1; k < 16; k++) {
1167 GemmMicrokernelTester()
1168 .mr(1)
1169 .nr(8)
1170 .kr(1)
1171 .sr(1)
1172 .m(1)
1173 .n(8)
1174 .k(k)
1175 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1176 }
1177 }
1178
1179 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
1180 TEST_REQUIRES_ARM_NEON_FMA;
1181 for (size_t k = 1; k < 16; k++) {
1182 GemmMicrokernelTester()
1183 .mr(1)
1184 .nr(8)
1185 .kr(1)
1186 .sr(1)
1187 .m(1)
1188 .n(8)
1189 .k(k)
1190 .a_stride(19)
1191 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1192 }
1193 }
1194
1195 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
1196 TEST_REQUIRES_ARM_NEON_FMA;
1197 for (size_t k = 1; k < 16; k++) {
1198 for (uint32_t m = 1; m <= 1; m++) {
1199 for (uint32_t n = 1; n <= 8; n++) {
1200 GemmMicrokernelTester()
1201 .mr(1)
1202 .nr(8)
1203 .kr(1)
1204 .sr(1)
1205 .m(m)
1206 .n(n)
1207 .k(k)
1208 .iterations(1)
1209 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1210 }
1211 }
1212 }
1213 }
1214
1215 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
1216 TEST_REQUIRES_ARM_NEON_FMA;
1217 for (size_t k = 17; k < 16; k++) {
1218 GemmMicrokernelTester()
1219 .mr(1)
1220 .nr(8)
1221 .kr(1)
1222 .sr(1)
1223 .m(1)
1224 .n(8)
1225 .k(k)
1226 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1227 }
1228 }
1229
1230 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
1231 TEST_REQUIRES_ARM_NEON_FMA;
1232 for (size_t k = 17; k < 16; k++) {
1233 GemmMicrokernelTester()
1234 .mr(1)
1235 .nr(8)
1236 .kr(1)
1237 .sr(1)
1238 .m(1)
1239 .n(8)
1240 .k(k)
1241 .a_stride(19)
1242 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1243 }
1244 }
1245
1246 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
1247 TEST_REQUIRES_ARM_NEON_FMA;
1248 for (size_t k = 17; k < 16; k++) {
1249 for (uint32_t m = 1; m <= 1; m++) {
1250 for (uint32_t n = 1; n <= 8; n++) {
1251 GemmMicrokernelTester()
1252 .mr(1)
1253 .nr(8)
1254 .kr(1)
1255 .sr(1)
1256 .m(m)
1257 .n(n)
1258 .k(k)
1259 .iterations(1)
1260 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1261 }
1262 }
1263 }
1264 }
1265
1266 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
1267 TEST_REQUIRES_ARM_NEON_FMA;
1268 for (size_t k = 24; k <= 80; k += 8) {
1269 GemmMicrokernelTester()
1270 .mr(1)
1271 .nr(8)
1272 .kr(1)
1273 .sr(1)
1274 .m(1)
1275 .n(8)
1276 .k(k)
1277 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1278 }
1279 }
1280
1281 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
1282 TEST_REQUIRES_ARM_NEON_FMA;
1283 for (size_t k = 24; k <= 80; k += 8) {
1284 GemmMicrokernelTester()
1285 .mr(1)
1286 .nr(8)
1287 .kr(1)
1288 .sr(1)
1289 .m(1)
1290 .n(8)
1291 .k(k)
1292 .a_stride(83)
1293 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1294 }
1295 }
1296
1297 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
1298 TEST_REQUIRES_ARM_NEON_FMA;
1299 for (size_t k = 24; k <= 80; k += 8) {
1300 for (uint32_t m = 1; m <= 1; m++) {
1301 for (uint32_t n = 1; n <= 8; n++) {
1302 GemmMicrokernelTester()
1303 .mr(1)
1304 .nr(8)
1305 .kr(1)
1306 .sr(1)
1307 .m(m)
1308 .n(n)
1309 .k(k)
1310 .iterations(1)
1311 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1312 }
1313 }
1314 }
1315 }
1316
1317 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
1318 TEST_REQUIRES_ARM_NEON_FMA;
1319 for (uint32_t n = 9; n < 16; n++) {
1320 for (size_t k = 1; k <= 40; k += 9) {
1321 GemmMicrokernelTester()
1322 .mr(1)
1323 .nr(8)
1324 .kr(1)
1325 .sr(1)
1326 .m(1)
1327 .n(8)
1328 .k(k)
1329 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1330 }
1331 }
1332 }
1333
1334 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
1335 TEST_REQUIRES_ARM_NEON_FMA;
1336 for (uint32_t n = 9; n < 16; n++) {
1337 for (size_t k = 1; k <= 40; k += 9) {
1338 GemmMicrokernelTester()
1339 .mr(1)
1340 .nr(8)
1341 .kr(1)
1342 .sr(1)
1343 .m(1)
1344 .n(8)
1345 .k(k)
1346 .cn_stride(11)
1347 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1348 }
1349 }
1350 }
1351
1352 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
1353 TEST_REQUIRES_ARM_NEON_FMA;
1354 for (uint32_t n = 9; n < 16; n++) {
1355 for (size_t k = 1; k <= 40; k += 9) {
1356 GemmMicrokernelTester()
1357 .mr(1)
1358 .nr(8)
1359 .kr(1)
1360 .sr(1)
1361 .m(1)
1362 .n(n)
1363 .k(k)
1364 .a_stride(43)
1365 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1366 }
1367 }
1368 }
1369
1370 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
1371 TEST_REQUIRES_ARM_NEON_FMA;
1372 for (uint32_t n = 9; n < 16; n++) {
1373 for (size_t k = 1; k <= 40; k += 9) {
1374 for (uint32_t m = 1; m <= 1; m++) {
1375 GemmMicrokernelTester()
1376 .mr(1)
1377 .nr(8)
1378 .kr(1)
1379 .sr(1)
1380 .m(m)
1381 .n(n)
1382 .k(k)
1383 .iterations(1)
1384 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1385 }
1386 }
1387 }
1388 }
1389
1390 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
1391 TEST_REQUIRES_ARM_NEON_FMA;
1392 for (uint32_t n = 16; n <= 24; n += 8) {
1393 for (size_t k = 1; k <= 40; k += 9) {
1394 GemmMicrokernelTester()
1395 .mr(1)
1396 .nr(8)
1397 .kr(1)
1398 .sr(1)
1399 .m(1)
1400 .n(8)
1401 .k(k)
1402 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1403 }
1404 }
1405 }
1406
1407 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
1408 TEST_REQUIRES_ARM_NEON_FMA;
1409 for (uint32_t n = 16; n <= 24; n += 8) {
1410 for (size_t k = 1; k <= 40; k += 9) {
1411 GemmMicrokernelTester()
1412 .mr(1)
1413 .nr(8)
1414 .kr(1)
1415 .sr(1)
1416 .m(1)
1417 .n(n)
1418 .k(k)
1419 .cn_stride(11)
1420 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1421 }
1422 }
1423 }
1424
1425 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
1426 TEST_REQUIRES_ARM_NEON_FMA;
1427 for (uint32_t n = 16; n <= 24; n += 8) {
1428 for (size_t k = 1; k <= 40; k += 9) {
1429 GemmMicrokernelTester()
1430 .mr(1)
1431 .nr(8)
1432 .kr(1)
1433 .sr(1)
1434 .m(1)
1435 .n(n)
1436 .k(k)
1437 .a_stride(43)
1438 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1439 }
1440 }
1441 }
1442
1443 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
1444 TEST_REQUIRES_ARM_NEON_FMA;
1445 for (uint32_t n = 16; n <= 24; n += 8) {
1446 for (size_t k = 1; k <= 40; k += 9) {
1447 for (uint32_t m = 1; m <= 1; m++) {
1448 GemmMicrokernelTester()
1449 .mr(1)
1450 .nr(8)
1451 .kr(1)
1452 .sr(1)
1453 .m(m)
1454 .n(n)
1455 .k(k)
1456 .iterations(1)
1457 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1458 }
1459 }
1460 }
1461 }
1462
1463 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
1464 TEST_REQUIRES_ARM_NEON_FMA;
1465 for (size_t k = 1; k <= 40; k += 9) {
1466 for (uint32_t m = 1; m <= 1; m++) {
1467 for (uint32_t n = 1; n <= 8; n++) {
1468 GemmMicrokernelTester()
1469 .mr(1)
1470 .nr(8)
1471 .kr(1)
1472 .sr(1)
1473 .m(m)
1474 .n(n)
1475 .k(k)
1476 .cm_stride(11)
1477 .iterations(1)
1478 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1479 }
1480 }
1481 }
1482 }
1483
1484 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
1485 TEST_REQUIRES_ARM_NEON_FMA;
1486 GemmMicrokernelTester()
1487 .mr(1)
1488 .nr(8)
1489 .kr(1)
1490 .sr(1)
1491 .m(1)
1492 .n(8)
1493 .k(8)
1494 .qmin(128)
1495 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1496 }
1497
1498 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
1499 TEST_REQUIRES_ARM_NEON_FMA;
1500 GemmMicrokernelTester()
1501 .mr(1)
1502 .nr(8)
1503 .kr(1)
1504 .sr(1)
1505 .m(1)
1506 .n(8)
1507 .k(8)
1508 .qmax(128)
1509 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1510 }
1511
1512 TEST(F32_GEMMINC_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
1513 TEST_REQUIRES_ARM_NEON_FMA;
1514 GemmMicrokernelTester()
1515 .mr(1)
1516 .nr(8)
1517 .kr(1)
1518 .sr(1)
1519 .m(1)
1520 .n(8)
1521 .k(8)
1522 .cm_stride(11)
1523 .Test(xnn_f32_gemminc_ukernel_1x8__aarch64_neonfma_cortex_a75);
1524 }
Frank Barchard7e955972019-10-11 10:34:25 -07001525#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001526
1527
Frank Barchard7e955972019-10-11 10:34:25 -07001528#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001529 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001530 TEST_REQUIRES_ARM_NEON_FMA;
1531 GemmMicrokernelTester()
1532 .mr(4)
1533 .nr(8)
1534 .kr(1)
1535 .sr(1)
1536 .m(4)
1537 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001538 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001539 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1540 }
1541
1542 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
1543 TEST_REQUIRES_ARM_NEON_FMA;
1544 GemmMicrokernelTester()
1545 .mr(4)
1546 .nr(8)
1547 .kr(1)
1548 .sr(1)
1549 .m(4)
1550 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001551 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001552 .cn_stride(11)
1553 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1554 }
1555
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001556 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001557 TEST_REQUIRES_ARM_NEON_FMA;
1558 GemmMicrokernelTester()
1559 .mr(4)
1560 .nr(8)
1561 .kr(1)
1562 .sr(1)
1563 .m(4)
1564 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001565 .k(4)
1566 .a_stride(7)
Frank Barchard46fb8072019-10-25 12:54:22 -07001567 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1568 }
1569
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001570 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001571 TEST_REQUIRES_ARM_NEON_FMA;
1572 for (uint32_t m = 1; m <= 4; m++) {
1573 for (uint32_t n = 1; n <= 8; n++) {
1574 GemmMicrokernelTester()
1575 .mr(4)
1576 .nr(8)
1577 .kr(1)
1578 .sr(1)
1579 .m(m)
1580 .n(n)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001581 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001582 .iterations(1)
1583 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1584 }
1585 }
1586 }
1587
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001588 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001589 TEST_REQUIRES_ARM_NEON_FMA;
1590 for (uint32_t m = 1; m <= 4; m++) {
1591 GemmMicrokernelTester()
1592 .mr(4)
1593 .nr(8)
1594 .kr(1)
1595 .sr(1)
1596 .m(m)
1597 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001598 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001599 .iterations(1)
1600 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1601 }
1602 }
1603
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001604 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001605 TEST_REQUIRES_ARM_NEON_FMA;
1606 for (uint32_t n = 1; n <= 8; n++) {
1607 GemmMicrokernelTester()
1608 .mr(4)
1609 .nr(8)
1610 .kr(1)
1611 .sr(1)
1612 .m(4)
1613 .n(n)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001614 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001615 .iterations(1)
1616 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1617 }
1618 }
1619
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001620 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001621 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001622 GemmMicrokernelTester()
1623 .mr(4)
1624 .nr(8)
1625 .kr(1)
1626 .sr(1)
1627 .m(4)
1628 .n(8)
1629 .k(8)
1630 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1631 }
1632
1633 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
1634 TEST_REQUIRES_ARM_NEON_FMA;
1635 GemmMicrokernelTester()
1636 .mr(4)
1637 .nr(8)
1638 .kr(1)
1639 .sr(1)
1640 .m(4)
1641 .n(8)
1642 .k(8)
1643 .a_stride(11)
1644 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1645 }
1646
1647 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
1648 TEST_REQUIRES_ARM_NEON_FMA;
1649 for (uint32_t m = 1; m <= 4; m++) {
1650 for (uint32_t n = 1; n <= 8; n++) {
1651 GemmMicrokernelTester()
1652 .mr(4)
1653 .nr(8)
1654 .kr(1)
1655 .sr(1)
1656 .m(m)
1657 .n(n)
1658 .k(8)
1659 .iterations(1)
1660 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1661 }
1662 }
1663 }
1664
1665 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
1666 TEST_REQUIRES_ARM_NEON_FMA;
1667 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001668 GemmMicrokernelTester()
1669 .mr(4)
1670 .nr(8)
1671 .kr(1)
1672 .sr(1)
1673 .m(4)
1674 .n(8)
1675 .k(k)
1676 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1677 }
1678 }
1679
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001680 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001681 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001682 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001683 GemmMicrokernelTester()
1684 .mr(4)
1685 .nr(8)
1686 .kr(1)
1687 .sr(1)
1688 .m(4)
1689 .n(8)
1690 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001691 .a_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07001692 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1693 }
1694 }
1695
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001696 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001697 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001698 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001699 for (uint32_t m = 1; m <= 4; m++) {
1700 for (uint32_t n = 1; n <= 8; n++) {
1701 GemmMicrokernelTester()
1702 .mr(4)
1703 .nr(8)
1704 .kr(1)
1705 .sr(1)
1706 .m(m)
1707 .n(n)
1708 .k(k)
1709 .iterations(1)
1710 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1711 }
1712 }
1713 }
1714 }
1715
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001716 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001717 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001718 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001719 GemmMicrokernelTester()
1720 .mr(4)
1721 .nr(8)
1722 .kr(1)
1723 .sr(1)
1724 .m(4)
1725 .n(8)
1726 .k(k)
1727 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1728 }
1729 }
1730
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001731 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001732 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001733 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001734 GemmMicrokernelTester()
1735 .mr(4)
1736 .nr(8)
1737 .kr(1)
1738 .sr(1)
1739 .m(4)
1740 .n(8)
1741 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001742 .a_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07001743 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1744 }
1745 }
1746
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001747 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001748 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001749 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001750 for (uint32_t m = 1; m <= 4; m++) {
1751 for (uint32_t n = 1; n <= 8; n++) {
1752 GemmMicrokernelTester()
1753 .mr(4)
1754 .nr(8)
1755 .kr(1)
1756 .sr(1)
1757 .m(m)
1758 .n(n)
1759 .k(k)
1760 .iterations(1)
1761 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1762 }
1763 }
1764 }
1765 }
1766
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001767 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001768 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001769 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001770 GemmMicrokernelTester()
1771 .mr(4)
1772 .nr(8)
1773 .kr(1)
1774 .sr(1)
1775 .m(4)
1776 .n(8)
1777 .k(k)
1778 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1779 }
1780 }
1781
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001782 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001783 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001784 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001785 GemmMicrokernelTester()
1786 .mr(4)
1787 .nr(8)
1788 .kr(1)
1789 .sr(1)
1790 .m(4)
1791 .n(8)
1792 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001793 .a_stride(43)
Frank Barchard46fb8072019-10-25 12:54:22 -07001794 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1795 }
1796 }
1797
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001798 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001799 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001800 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001801 for (uint32_t m = 1; m <= 4; m++) {
1802 for (uint32_t n = 1; n <= 8; n++) {
1803 GemmMicrokernelTester()
1804 .mr(4)
1805 .nr(8)
1806 .kr(1)
1807 .sr(1)
1808 .m(m)
1809 .n(n)
1810 .k(k)
1811 .iterations(1)
1812 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1813 }
1814 }
1815 }
1816 }
1817
1818 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
1819 TEST_REQUIRES_ARM_NEON_FMA;
1820 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001821 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001822 GemmMicrokernelTester()
1823 .mr(4)
1824 .nr(8)
1825 .kr(1)
1826 .sr(1)
1827 .m(4)
1828 .n(8)
1829 .k(k)
1830 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1831 }
1832 }
1833 }
1834
1835 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
1836 TEST_REQUIRES_ARM_NEON_FMA;
1837 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001838 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001839 GemmMicrokernelTester()
1840 .mr(4)
1841 .nr(8)
1842 .kr(1)
1843 .sr(1)
1844 .m(4)
1845 .n(8)
1846 .k(k)
1847 .cn_stride(11)
1848 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1849 }
1850 }
1851 }
1852
1853 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
1854 TEST_REQUIRES_ARM_NEON_FMA;
1855 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001856 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001857 GemmMicrokernelTester()
1858 .mr(4)
1859 .nr(8)
1860 .kr(1)
1861 .sr(1)
1862 .m(4)
1863 .n(n)
1864 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001865 .a_stride(23)
Frank Barchard46fb8072019-10-25 12:54:22 -07001866 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1867 }
1868 }
1869 }
1870
1871 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
1872 TEST_REQUIRES_ARM_NEON_FMA;
1873 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001874 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001875 for (uint32_t m = 1; m <= 4; m++) {
1876 GemmMicrokernelTester()
1877 .mr(4)
1878 .nr(8)
1879 .kr(1)
1880 .sr(1)
1881 .m(m)
1882 .n(n)
1883 .k(k)
1884 .iterations(1)
1885 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1886 }
1887 }
1888 }
1889 }
1890
1891 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
1892 TEST_REQUIRES_ARM_NEON_FMA;
1893 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001894 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001895 GemmMicrokernelTester()
1896 .mr(4)
1897 .nr(8)
1898 .kr(1)
1899 .sr(1)
1900 .m(4)
1901 .n(8)
1902 .k(k)
1903 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1904 }
1905 }
1906 }
1907
1908 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
1909 TEST_REQUIRES_ARM_NEON_FMA;
1910 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001911 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001912 GemmMicrokernelTester()
1913 .mr(4)
1914 .nr(8)
1915 .kr(1)
1916 .sr(1)
1917 .m(4)
1918 .n(n)
1919 .k(k)
1920 .cn_stride(11)
1921 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1922 }
1923 }
1924 }
1925
1926 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
1927 TEST_REQUIRES_ARM_NEON_FMA;
1928 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001929 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001930 GemmMicrokernelTester()
1931 .mr(4)
1932 .nr(8)
1933 .kr(1)
1934 .sr(1)
1935 .m(4)
1936 .n(n)
1937 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001938 .a_stride(23)
Frank Barchard46fb8072019-10-25 12:54:22 -07001939 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1940 }
1941 }
1942 }
1943
1944 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
1945 TEST_REQUIRES_ARM_NEON_FMA;
1946 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001947 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001948 for (uint32_t m = 1; m <= 4; m++) {
1949 GemmMicrokernelTester()
1950 .mr(4)
1951 .nr(8)
1952 .kr(1)
1953 .sr(1)
1954 .m(m)
1955 .n(n)
1956 .k(k)
1957 .iterations(1)
1958 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1959 }
1960 }
1961 }
1962 }
1963
1964 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
1965 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001966 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001967 for (uint32_t m = 1; m <= 4; m++) {
1968 for (uint32_t n = 1; n <= 8; n++) {
1969 GemmMicrokernelTester()
1970 .mr(4)
1971 .nr(8)
1972 .kr(1)
1973 .sr(1)
1974 .m(m)
1975 .n(n)
1976 .k(k)
1977 .cm_stride(11)
1978 .iterations(1)
1979 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1980 }
1981 }
1982 }
1983 }
1984
1985 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
1986 TEST_REQUIRES_ARM_NEON_FMA;
1987 GemmMicrokernelTester()
1988 .mr(4)
1989 .nr(8)
1990 .kr(1)
1991 .sr(1)
1992 .m(4)
1993 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001994 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001995 .qmin(128)
1996 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
1997 }
1998
1999 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
2000 TEST_REQUIRES_ARM_NEON_FMA;
2001 GemmMicrokernelTester()
2002 .mr(4)
2003 .nr(8)
2004 .kr(1)
2005 .sr(1)
2006 .m(4)
2007 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08002008 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07002009 .qmax(128)
2010 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
2011 }
2012
2013 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
2014 TEST_REQUIRES_ARM_NEON_FMA;
2015 GemmMicrokernelTester()
2016 .mr(4)
2017 .nr(8)
2018 .kr(1)
2019 .sr(1)
2020 .m(4)
2021 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08002022 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07002023 .cm_stride(11)
2024 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a53);
2025 }
2026#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2027
2028
2029#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002030 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
2031 TEST_REQUIRES_ARM_NEON_FMA;
2032 GemmMicrokernelTester()
2033 .mr(4)
2034 .nr(8)
2035 .kr(1)
2036 .sr(1)
2037 .m(4)
2038 .n(8)
2039 .k(8)
2040 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2041 }
2042
2043 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
2044 TEST_REQUIRES_ARM_NEON_FMA;
2045 GemmMicrokernelTester()
2046 .mr(4)
2047 .nr(8)
2048 .kr(1)
2049 .sr(1)
2050 .m(4)
2051 .n(8)
2052 .k(8)
2053 .cn_stride(11)
2054 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2055 }
2056
2057 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
2058 TEST_REQUIRES_ARM_NEON_FMA;
2059 GemmMicrokernelTester()
2060 .mr(4)
2061 .nr(8)
2062 .kr(1)
2063 .sr(1)
2064 .m(4)
2065 .n(8)
2066 .k(8)
2067 .a_stride(11)
2068 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2069 }
2070
2071 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
2072 TEST_REQUIRES_ARM_NEON_FMA;
2073 for (uint32_t m = 1; m <= 4; m++) {
2074 for (uint32_t n = 1; n <= 8; n++) {
2075 GemmMicrokernelTester()
2076 .mr(4)
2077 .nr(8)
2078 .kr(1)
2079 .sr(1)
2080 .m(m)
2081 .n(n)
2082 .k(8)
2083 .iterations(1)
2084 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2085 }
2086 }
2087 }
2088
2089 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
2090 TEST_REQUIRES_ARM_NEON_FMA;
2091 for (uint32_t m = 1; m <= 4; m++) {
2092 GemmMicrokernelTester()
2093 .mr(4)
2094 .nr(8)
2095 .kr(1)
2096 .sr(1)
2097 .m(m)
2098 .n(8)
2099 .k(8)
2100 .iterations(1)
2101 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2102 }
2103 }
2104
2105 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
2106 TEST_REQUIRES_ARM_NEON_FMA;
2107 for (uint32_t n = 1; n <= 8; n++) {
2108 GemmMicrokernelTester()
2109 .mr(4)
2110 .nr(8)
2111 .kr(1)
2112 .sr(1)
2113 .m(4)
2114 .n(n)
2115 .k(8)
2116 .iterations(1)
2117 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2118 }
2119 }
2120
2121 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
2122 TEST_REQUIRES_ARM_NEON_FMA;
2123 GemmMicrokernelTester()
2124 .mr(4)
2125 .nr(8)
2126 .kr(1)
2127 .sr(1)
2128 .m(4)
2129 .n(8)
2130 .k(16)
2131 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2132 }
2133
2134 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
2135 TEST_REQUIRES_ARM_NEON_FMA;
2136 GemmMicrokernelTester()
2137 .mr(4)
2138 .nr(8)
2139 .kr(1)
2140 .sr(1)
2141 .m(4)
2142 .n(8)
2143 .k(16)
2144 .a_stride(19)
2145 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2146 }
2147
2148 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
2149 TEST_REQUIRES_ARM_NEON_FMA;
2150 for (uint32_t m = 1; m <= 4; m++) {
2151 for (uint32_t n = 1; n <= 8; n++) {
2152 GemmMicrokernelTester()
2153 .mr(4)
2154 .nr(8)
2155 .kr(1)
2156 .sr(1)
2157 .m(m)
2158 .n(n)
2159 .k(16)
2160 .iterations(1)
2161 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2162 }
2163 }
2164 }
2165
2166 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
2167 TEST_REQUIRES_ARM_NEON_FMA;
2168 for (size_t k = 1; k < 16; k++) {
2169 GemmMicrokernelTester()
2170 .mr(4)
2171 .nr(8)
2172 .kr(1)
2173 .sr(1)
2174 .m(4)
2175 .n(8)
2176 .k(k)
2177 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2178 }
2179 }
2180
2181 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
2182 TEST_REQUIRES_ARM_NEON_FMA;
2183 for (size_t k = 1; k < 16; k++) {
2184 GemmMicrokernelTester()
2185 .mr(4)
2186 .nr(8)
2187 .kr(1)
2188 .sr(1)
2189 .m(4)
2190 .n(8)
2191 .k(k)
2192 .a_stride(19)
2193 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2194 }
2195 }
2196
2197 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
2198 TEST_REQUIRES_ARM_NEON_FMA;
2199 for (size_t k = 1; k < 16; k++) {
2200 for (uint32_t m = 1; m <= 4; m++) {
2201 for (uint32_t n = 1; n <= 8; n++) {
2202 GemmMicrokernelTester()
2203 .mr(4)
2204 .nr(8)
2205 .kr(1)
2206 .sr(1)
2207 .m(m)
2208 .n(n)
2209 .k(k)
2210 .iterations(1)
2211 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2212 }
2213 }
2214 }
2215 }
2216
2217 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
2218 TEST_REQUIRES_ARM_NEON_FMA;
2219 for (size_t k = 17; k < 16; k++) {
2220 GemmMicrokernelTester()
2221 .mr(4)
2222 .nr(8)
2223 .kr(1)
2224 .sr(1)
2225 .m(4)
2226 .n(8)
2227 .k(k)
2228 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2229 }
2230 }
2231
2232 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
2233 TEST_REQUIRES_ARM_NEON_FMA;
2234 for (size_t k = 17; k < 16; k++) {
2235 GemmMicrokernelTester()
2236 .mr(4)
2237 .nr(8)
2238 .kr(1)
2239 .sr(1)
2240 .m(4)
2241 .n(8)
2242 .k(k)
2243 .a_stride(19)
2244 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2245 }
2246 }
2247
2248 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
2249 TEST_REQUIRES_ARM_NEON_FMA;
2250 for (size_t k = 17; k < 16; k++) {
2251 for (uint32_t m = 1; m <= 4; m++) {
2252 for (uint32_t n = 1; n <= 8; n++) {
2253 GemmMicrokernelTester()
2254 .mr(4)
2255 .nr(8)
2256 .kr(1)
2257 .sr(1)
2258 .m(m)
2259 .n(n)
2260 .k(k)
2261 .iterations(1)
2262 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2263 }
2264 }
2265 }
2266 }
2267
2268 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
2269 TEST_REQUIRES_ARM_NEON_FMA;
2270 for (size_t k = 24; k <= 80; k += 8) {
2271 GemmMicrokernelTester()
2272 .mr(4)
2273 .nr(8)
2274 .kr(1)
2275 .sr(1)
2276 .m(4)
2277 .n(8)
2278 .k(k)
2279 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2280 }
2281 }
2282
2283 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
2284 TEST_REQUIRES_ARM_NEON_FMA;
2285 for (size_t k = 24; k <= 80; k += 8) {
2286 GemmMicrokernelTester()
2287 .mr(4)
2288 .nr(8)
2289 .kr(1)
2290 .sr(1)
2291 .m(4)
2292 .n(8)
2293 .k(k)
2294 .a_stride(83)
2295 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2296 }
2297 }
2298
2299 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
2300 TEST_REQUIRES_ARM_NEON_FMA;
2301 for (size_t k = 24; k <= 80; k += 8) {
2302 for (uint32_t m = 1; m <= 4; m++) {
2303 for (uint32_t n = 1; n <= 8; n++) {
2304 GemmMicrokernelTester()
2305 .mr(4)
2306 .nr(8)
2307 .kr(1)
2308 .sr(1)
2309 .m(m)
2310 .n(n)
2311 .k(k)
2312 .iterations(1)
2313 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2314 }
2315 }
2316 }
2317 }
2318
2319 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
2320 TEST_REQUIRES_ARM_NEON_FMA;
2321 for (uint32_t n = 9; n < 16; n++) {
2322 for (size_t k = 1; k <= 40; k += 9) {
2323 GemmMicrokernelTester()
2324 .mr(4)
2325 .nr(8)
2326 .kr(1)
2327 .sr(1)
2328 .m(4)
2329 .n(8)
2330 .k(k)
2331 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2332 }
2333 }
2334 }
2335
2336 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
2337 TEST_REQUIRES_ARM_NEON_FMA;
2338 for (uint32_t n = 9; n < 16; n++) {
2339 for (size_t k = 1; k <= 40; k += 9) {
2340 GemmMicrokernelTester()
2341 .mr(4)
2342 .nr(8)
2343 .kr(1)
2344 .sr(1)
2345 .m(4)
2346 .n(8)
2347 .k(k)
2348 .cn_stride(11)
2349 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2350 }
2351 }
2352 }
2353
2354 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
2355 TEST_REQUIRES_ARM_NEON_FMA;
2356 for (uint32_t n = 9; n < 16; n++) {
2357 for (size_t k = 1; k <= 40; k += 9) {
2358 GemmMicrokernelTester()
2359 .mr(4)
2360 .nr(8)
2361 .kr(1)
2362 .sr(1)
2363 .m(4)
2364 .n(n)
2365 .k(k)
2366 .a_stride(43)
2367 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2368 }
2369 }
2370 }
2371
2372 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
2373 TEST_REQUIRES_ARM_NEON_FMA;
2374 for (uint32_t n = 9; n < 16; n++) {
2375 for (size_t k = 1; k <= 40; k += 9) {
2376 for (uint32_t m = 1; m <= 4; m++) {
2377 GemmMicrokernelTester()
2378 .mr(4)
2379 .nr(8)
2380 .kr(1)
2381 .sr(1)
2382 .m(m)
2383 .n(n)
2384 .k(k)
2385 .iterations(1)
2386 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2387 }
2388 }
2389 }
2390 }
2391
2392 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
2393 TEST_REQUIRES_ARM_NEON_FMA;
2394 for (uint32_t n = 16; n <= 24; n += 8) {
2395 for (size_t k = 1; k <= 40; k += 9) {
2396 GemmMicrokernelTester()
2397 .mr(4)
2398 .nr(8)
2399 .kr(1)
2400 .sr(1)
2401 .m(4)
2402 .n(8)
2403 .k(k)
2404 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2405 }
2406 }
2407 }
2408
2409 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
2410 TEST_REQUIRES_ARM_NEON_FMA;
2411 for (uint32_t n = 16; n <= 24; n += 8) {
2412 for (size_t k = 1; k <= 40; k += 9) {
2413 GemmMicrokernelTester()
2414 .mr(4)
2415 .nr(8)
2416 .kr(1)
2417 .sr(1)
2418 .m(4)
2419 .n(n)
2420 .k(k)
2421 .cn_stride(11)
2422 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2423 }
2424 }
2425 }
2426
2427 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
2428 TEST_REQUIRES_ARM_NEON_FMA;
2429 for (uint32_t n = 16; n <= 24; n += 8) {
2430 for (size_t k = 1; k <= 40; k += 9) {
2431 GemmMicrokernelTester()
2432 .mr(4)
2433 .nr(8)
2434 .kr(1)
2435 .sr(1)
2436 .m(4)
2437 .n(n)
2438 .k(k)
2439 .a_stride(43)
2440 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2441 }
2442 }
2443 }
2444
2445 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
2446 TEST_REQUIRES_ARM_NEON_FMA;
2447 for (uint32_t n = 16; n <= 24; n += 8) {
2448 for (size_t k = 1; k <= 40; k += 9) {
2449 for (uint32_t m = 1; m <= 4; m++) {
2450 GemmMicrokernelTester()
2451 .mr(4)
2452 .nr(8)
2453 .kr(1)
2454 .sr(1)
2455 .m(m)
2456 .n(n)
2457 .k(k)
2458 .iterations(1)
2459 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2460 }
2461 }
2462 }
2463 }
2464
2465 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
2466 TEST_REQUIRES_ARM_NEON_FMA;
2467 for (size_t k = 1; k <= 40; k += 9) {
2468 for (uint32_t m = 1; m <= 4; m++) {
2469 for (uint32_t n = 1; n <= 8; n++) {
2470 GemmMicrokernelTester()
2471 .mr(4)
2472 .nr(8)
2473 .kr(1)
2474 .sr(1)
2475 .m(m)
2476 .n(n)
2477 .k(k)
2478 .cm_stride(11)
2479 .iterations(1)
2480 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2481 }
2482 }
2483 }
2484 }
2485
2486 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
2487 TEST_REQUIRES_ARM_NEON_FMA;
2488 GemmMicrokernelTester()
2489 .mr(4)
2490 .nr(8)
2491 .kr(1)
2492 .sr(1)
2493 .m(4)
2494 .n(8)
2495 .k(8)
2496 .qmin(128)
2497 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2498 }
2499
2500 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
2501 TEST_REQUIRES_ARM_NEON_FMA;
2502 GemmMicrokernelTester()
2503 .mr(4)
2504 .nr(8)
2505 .kr(1)
2506 .sr(1)
2507 .m(4)
2508 .n(8)
2509 .k(8)
2510 .qmax(128)
2511 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2512 }
2513
2514 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
2515 TEST_REQUIRES_ARM_NEON_FMA;
2516 GemmMicrokernelTester()
2517 .mr(4)
2518 .nr(8)
2519 .kr(1)
2520 .sr(1)
2521 .m(4)
2522 .n(8)
2523 .k(8)
2524 .cm_stride(11)
2525 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a57);
2526 }
Frank Barchard7e955972019-10-11 10:34:25 -07002527#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002528
2529
Frank Barchard7e955972019-10-11 10:34:25 -07002530#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002531 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
2532 TEST_REQUIRES_ARM_NEON_FMA;
2533 GemmMicrokernelTester()
2534 .mr(4)
2535 .nr(8)
2536 .kr(1)
2537 .sr(1)
2538 .m(4)
2539 .n(8)
2540 .k(8)
2541 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2542 }
2543
2544 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
2545 TEST_REQUIRES_ARM_NEON_FMA;
2546 GemmMicrokernelTester()
2547 .mr(4)
2548 .nr(8)
2549 .kr(1)
2550 .sr(1)
2551 .m(4)
2552 .n(8)
2553 .k(8)
2554 .cn_stride(11)
2555 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2556 }
2557
2558 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
2559 TEST_REQUIRES_ARM_NEON_FMA;
2560 GemmMicrokernelTester()
2561 .mr(4)
2562 .nr(8)
2563 .kr(1)
2564 .sr(1)
2565 .m(4)
2566 .n(8)
2567 .k(8)
2568 .a_stride(11)
2569 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2570 }
2571
2572 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
2573 TEST_REQUIRES_ARM_NEON_FMA;
2574 for (uint32_t m = 1; m <= 4; m++) {
2575 for (uint32_t n = 1; n <= 8; n++) {
2576 GemmMicrokernelTester()
2577 .mr(4)
2578 .nr(8)
2579 .kr(1)
2580 .sr(1)
2581 .m(m)
2582 .n(n)
2583 .k(8)
2584 .iterations(1)
2585 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2586 }
2587 }
2588 }
2589
2590 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
2591 TEST_REQUIRES_ARM_NEON_FMA;
2592 for (uint32_t m = 1; m <= 4; m++) {
2593 GemmMicrokernelTester()
2594 .mr(4)
2595 .nr(8)
2596 .kr(1)
2597 .sr(1)
2598 .m(m)
2599 .n(8)
2600 .k(8)
2601 .iterations(1)
2602 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2603 }
2604 }
2605
2606 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
2607 TEST_REQUIRES_ARM_NEON_FMA;
2608 for (uint32_t n = 1; n <= 8; n++) {
2609 GemmMicrokernelTester()
2610 .mr(4)
2611 .nr(8)
2612 .kr(1)
2613 .sr(1)
2614 .m(4)
2615 .n(n)
2616 .k(8)
2617 .iterations(1)
2618 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2619 }
2620 }
2621
2622 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
2623 TEST_REQUIRES_ARM_NEON_FMA;
2624 GemmMicrokernelTester()
2625 .mr(4)
2626 .nr(8)
2627 .kr(1)
2628 .sr(1)
2629 .m(4)
2630 .n(8)
2631 .k(16)
2632 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2633 }
2634
2635 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
2636 TEST_REQUIRES_ARM_NEON_FMA;
2637 GemmMicrokernelTester()
2638 .mr(4)
2639 .nr(8)
2640 .kr(1)
2641 .sr(1)
2642 .m(4)
2643 .n(8)
2644 .k(16)
2645 .a_stride(19)
2646 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2647 }
2648
2649 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
2650 TEST_REQUIRES_ARM_NEON_FMA;
2651 for (uint32_t m = 1; m <= 4; m++) {
2652 for (uint32_t n = 1; n <= 8; n++) {
2653 GemmMicrokernelTester()
2654 .mr(4)
2655 .nr(8)
2656 .kr(1)
2657 .sr(1)
2658 .m(m)
2659 .n(n)
2660 .k(16)
2661 .iterations(1)
2662 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2663 }
2664 }
2665 }
2666
2667 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
2668 TEST_REQUIRES_ARM_NEON_FMA;
2669 for (size_t k = 1; k < 16; k++) {
2670 GemmMicrokernelTester()
2671 .mr(4)
2672 .nr(8)
2673 .kr(1)
2674 .sr(1)
2675 .m(4)
2676 .n(8)
2677 .k(k)
2678 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2679 }
2680 }
2681
2682 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
2683 TEST_REQUIRES_ARM_NEON_FMA;
2684 for (size_t k = 1; k < 16; k++) {
2685 GemmMicrokernelTester()
2686 .mr(4)
2687 .nr(8)
2688 .kr(1)
2689 .sr(1)
2690 .m(4)
2691 .n(8)
2692 .k(k)
2693 .a_stride(19)
2694 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2695 }
2696 }
2697
2698 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
2699 TEST_REQUIRES_ARM_NEON_FMA;
2700 for (size_t k = 1; k < 16; k++) {
2701 for (uint32_t m = 1; m <= 4; m++) {
2702 for (uint32_t n = 1; n <= 8; n++) {
2703 GemmMicrokernelTester()
2704 .mr(4)
2705 .nr(8)
2706 .kr(1)
2707 .sr(1)
2708 .m(m)
2709 .n(n)
2710 .k(k)
2711 .iterations(1)
2712 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2713 }
2714 }
2715 }
2716 }
2717
2718 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
2719 TEST_REQUIRES_ARM_NEON_FMA;
2720 for (size_t k = 17; k < 16; k++) {
2721 GemmMicrokernelTester()
2722 .mr(4)
2723 .nr(8)
2724 .kr(1)
2725 .sr(1)
2726 .m(4)
2727 .n(8)
2728 .k(k)
2729 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2730 }
2731 }
2732
2733 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
2734 TEST_REQUIRES_ARM_NEON_FMA;
2735 for (size_t k = 17; k < 16; k++) {
2736 GemmMicrokernelTester()
2737 .mr(4)
2738 .nr(8)
2739 .kr(1)
2740 .sr(1)
2741 .m(4)
2742 .n(8)
2743 .k(k)
2744 .a_stride(19)
2745 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2746 }
2747 }
2748
2749 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
2750 TEST_REQUIRES_ARM_NEON_FMA;
2751 for (size_t k = 17; k < 16; k++) {
2752 for (uint32_t m = 1; m <= 4; m++) {
2753 for (uint32_t n = 1; n <= 8; n++) {
2754 GemmMicrokernelTester()
2755 .mr(4)
2756 .nr(8)
2757 .kr(1)
2758 .sr(1)
2759 .m(m)
2760 .n(n)
2761 .k(k)
2762 .iterations(1)
2763 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2764 }
2765 }
2766 }
2767 }
2768
2769 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
2770 TEST_REQUIRES_ARM_NEON_FMA;
2771 for (size_t k = 24; k <= 80; k += 8) {
2772 GemmMicrokernelTester()
2773 .mr(4)
2774 .nr(8)
2775 .kr(1)
2776 .sr(1)
2777 .m(4)
2778 .n(8)
2779 .k(k)
2780 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2781 }
2782 }
2783
2784 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
2785 TEST_REQUIRES_ARM_NEON_FMA;
2786 for (size_t k = 24; k <= 80; k += 8) {
2787 GemmMicrokernelTester()
2788 .mr(4)
2789 .nr(8)
2790 .kr(1)
2791 .sr(1)
2792 .m(4)
2793 .n(8)
2794 .k(k)
2795 .a_stride(83)
2796 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2797 }
2798 }
2799
2800 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
2801 TEST_REQUIRES_ARM_NEON_FMA;
2802 for (size_t k = 24; k <= 80; k += 8) {
2803 for (uint32_t m = 1; m <= 4; m++) {
2804 for (uint32_t n = 1; n <= 8; n++) {
2805 GemmMicrokernelTester()
2806 .mr(4)
2807 .nr(8)
2808 .kr(1)
2809 .sr(1)
2810 .m(m)
2811 .n(n)
2812 .k(k)
2813 .iterations(1)
2814 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2815 }
2816 }
2817 }
2818 }
2819
2820 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
2821 TEST_REQUIRES_ARM_NEON_FMA;
2822 for (uint32_t n = 9; n < 16; n++) {
2823 for (size_t k = 1; k <= 40; k += 9) {
2824 GemmMicrokernelTester()
2825 .mr(4)
2826 .nr(8)
2827 .kr(1)
2828 .sr(1)
2829 .m(4)
2830 .n(8)
2831 .k(k)
2832 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2833 }
2834 }
2835 }
2836
2837 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
2838 TEST_REQUIRES_ARM_NEON_FMA;
2839 for (uint32_t n = 9; n < 16; n++) {
2840 for (size_t k = 1; k <= 40; k += 9) {
2841 GemmMicrokernelTester()
2842 .mr(4)
2843 .nr(8)
2844 .kr(1)
2845 .sr(1)
2846 .m(4)
2847 .n(8)
2848 .k(k)
2849 .cn_stride(11)
2850 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2851 }
2852 }
2853 }
2854
2855 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
2856 TEST_REQUIRES_ARM_NEON_FMA;
2857 for (uint32_t n = 9; n < 16; n++) {
2858 for (size_t k = 1; k <= 40; k += 9) {
2859 GemmMicrokernelTester()
2860 .mr(4)
2861 .nr(8)
2862 .kr(1)
2863 .sr(1)
2864 .m(4)
2865 .n(n)
2866 .k(k)
2867 .a_stride(43)
2868 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2869 }
2870 }
2871 }
2872
2873 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
2874 TEST_REQUIRES_ARM_NEON_FMA;
2875 for (uint32_t n = 9; n < 16; n++) {
2876 for (size_t k = 1; k <= 40; k += 9) {
2877 for (uint32_t m = 1; m <= 4; m++) {
2878 GemmMicrokernelTester()
2879 .mr(4)
2880 .nr(8)
2881 .kr(1)
2882 .sr(1)
2883 .m(m)
2884 .n(n)
2885 .k(k)
2886 .iterations(1)
2887 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2888 }
2889 }
2890 }
2891 }
2892
2893 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
2894 TEST_REQUIRES_ARM_NEON_FMA;
2895 for (uint32_t n = 16; n <= 24; n += 8) {
2896 for (size_t k = 1; k <= 40; k += 9) {
2897 GemmMicrokernelTester()
2898 .mr(4)
2899 .nr(8)
2900 .kr(1)
2901 .sr(1)
2902 .m(4)
2903 .n(8)
2904 .k(k)
2905 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2906 }
2907 }
2908 }
2909
2910 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
2911 TEST_REQUIRES_ARM_NEON_FMA;
2912 for (uint32_t n = 16; n <= 24; n += 8) {
2913 for (size_t k = 1; k <= 40; k += 9) {
2914 GemmMicrokernelTester()
2915 .mr(4)
2916 .nr(8)
2917 .kr(1)
2918 .sr(1)
2919 .m(4)
2920 .n(n)
2921 .k(k)
2922 .cn_stride(11)
2923 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2924 }
2925 }
2926 }
2927
2928 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
2929 TEST_REQUIRES_ARM_NEON_FMA;
2930 for (uint32_t n = 16; n <= 24; n += 8) {
2931 for (size_t k = 1; k <= 40; k += 9) {
2932 GemmMicrokernelTester()
2933 .mr(4)
2934 .nr(8)
2935 .kr(1)
2936 .sr(1)
2937 .m(4)
2938 .n(n)
2939 .k(k)
2940 .a_stride(43)
2941 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2942 }
2943 }
2944 }
2945
2946 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
2947 TEST_REQUIRES_ARM_NEON_FMA;
2948 for (uint32_t n = 16; n <= 24; n += 8) {
2949 for (size_t k = 1; k <= 40; k += 9) {
2950 for (uint32_t m = 1; m <= 4; m++) {
2951 GemmMicrokernelTester()
2952 .mr(4)
2953 .nr(8)
2954 .kr(1)
2955 .sr(1)
2956 .m(m)
2957 .n(n)
2958 .k(k)
2959 .iterations(1)
2960 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2961 }
2962 }
2963 }
2964 }
2965
2966 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
2967 TEST_REQUIRES_ARM_NEON_FMA;
2968 for (size_t k = 1; k <= 40; k += 9) {
2969 for (uint32_t m = 1; m <= 4; m++) {
2970 for (uint32_t n = 1; n <= 8; n++) {
2971 GemmMicrokernelTester()
2972 .mr(4)
2973 .nr(8)
2974 .kr(1)
2975 .sr(1)
2976 .m(m)
2977 .n(n)
2978 .k(k)
2979 .cm_stride(11)
2980 .iterations(1)
2981 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2982 }
2983 }
2984 }
2985 }
2986
2987 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
2988 TEST_REQUIRES_ARM_NEON_FMA;
2989 GemmMicrokernelTester()
2990 .mr(4)
2991 .nr(8)
2992 .kr(1)
2993 .sr(1)
2994 .m(4)
2995 .n(8)
2996 .k(8)
2997 .qmin(128)
2998 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
2999 }
3000
3001 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
3002 TEST_REQUIRES_ARM_NEON_FMA;
3003 GemmMicrokernelTester()
3004 .mr(4)
3005 .nr(8)
3006 .kr(1)
3007 .sr(1)
3008 .m(4)
3009 .n(8)
3010 .k(8)
3011 .qmax(128)
3012 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3013 }
3014
3015 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3016 TEST_REQUIRES_ARM_NEON_FMA;
3017 GemmMicrokernelTester()
3018 .mr(4)
3019 .nr(8)
3020 .kr(1)
3021 .sr(1)
3022 .m(4)
3023 .n(8)
3024 .k(8)
3025 .cm_stride(11)
3026 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_cortex_a75);
3027 }
Frank Barchard7e955972019-10-11 10:34:25 -07003028#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07003029
3030
Frank Barchard7e955972019-10-11 10:34:25 -07003031#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07003032 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
3033 TEST_REQUIRES_ARM_NEON_FMA;
3034 GemmMicrokernelTester()
3035 .mr(5)
3036 .nr(8)
3037 .kr(1)
3038 .sr(1)
3039 .m(5)
3040 .n(8)
3041 .k(8)
3042 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3043 }
3044
3045 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
3046 TEST_REQUIRES_ARM_NEON_FMA;
3047 GemmMicrokernelTester()
3048 .mr(5)
3049 .nr(8)
3050 .kr(1)
3051 .sr(1)
3052 .m(5)
3053 .n(8)
3054 .k(8)
3055 .cn_stride(11)
3056 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3057 }
3058
3059 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
3060 TEST_REQUIRES_ARM_NEON_FMA;
3061 GemmMicrokernelTester()
3062 .mr(5)
3063 .nr(8)
3064 .kr(1)
3065 .sr(1)
3066 .m(5)
3067 .n(8)
3068 .k(8)
3069 .a_stride(11)
3070 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3071 }
3072
3073 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
3074 TEST_REQUIRES_ARM_NEON_FMA;
3075 for (uint32_t m = 1; m <= 5; m++) {
3076 for (uint32_t n = 1; n <= 8; n++) {
3077 GemmMicrokernelTester()
3078 .mr(5)
3079 .nr(8)
3080 .kr(1)
3081 .sr(1)
3082 .m(m)
3083 .n(n)
3084 .k(8)
3085 .iterations(1)
3086 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3087 }
3088 }
3089 }
3090
3091 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
3092 TEST_REQUIRES_ARM_NEON_FMA;
3093 for (uint32_t m = 1; m <= 5; m++) {
3094 GemmMicrokernelTester()
3095 .mr(5)
3096 .nr(8)
3097 .kr(1)
3098 .sr(1)
3099 .m(m)
3100 .n(8)
3101 .k(8)
3102 .iterations(1)
3103 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3104 }
3105 }
3106
3107 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
3108 TEST_REQUIRES_ARM_NEON_FMA;
3109 for (uint32_t n = 1; n <= 8; n++) {
3110 GemmMicrokernelTester()
3111 .mr(5)
3112 .nr(8)
3113 .kr(1)
3114 .sr(1)
3115 .m(5)
3116 .n(n)
3117 .k(8)
3118 .iterations(1)
3119 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3120 }
3121 }
3122
3123 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
3124 TEST_REQUIRES_ARM_NEON_FMA;
3125 GemmMicrokernelTester()
3126 .mr(5)
3127 .nr(8)
3128 .kr(1)
3129 .sr(1)
3130 .m(5)
3131 .n(8)
3132 .k(16)
3133 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3134 }
3135
3136 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
3137 TEST_REQUIRES_ARM_NEON_FMA;
3138 GemmMicrokernelTester()
3139 .mr(5)
3140 .nr(8)
3141 .kr(1)
3142 .sr(1)
3143 .m(5)
3144 .n(8)
3145 .k(16)
3146 .a_stride(19)
3147 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3148 }
3149
3150 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
3151 TEST_REQUIRES_ARM_NEON_FMA;
3152 for (uint32_t m = 1; m <= 5; m++) {
3153 for (uint32_t n = 1; n <= 8; n++) {
3154 GemmMicrokernelTester()
3155 .mr(5)
3156 .nr(8)
3157 .kr(1)
3158 .sr(1)
3159 .m(m)
3160 .n(n)
3161 .k(16)
3162 .iterations(1)
3163 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3164 }
3165 }
3166 }
3167
3168 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
3169 TEST_REQUIRES_ARM_NEON_FMA;
3170 for (size_t k = 1; k < 16; k++) {
3171 GemmMicrokernelTester()
3172 .mr(5)
3173 .nr(8)
3174 .kr(1)
3175 .sr(1)
3176 .m(5)
3177 .n(8)
3178 .k(k)
3179 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3180 }
3181 }
3182
3183 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
3184 TEST_REQUIRES_ARM_NEON_FMA;
3185 for (size_t k = 1; k < 16; k++) {
3186 GemmMicrokernelTester()
3187 .mr(5)
3188 .nr(8)
3189 .kr(1)
3190 .sr(1)
3191 .m(5)
3192 .n(8)
3193 .k(k)
3194 .a_stride(19)
3195 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3196 }
3197 }
3198
3199 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
3200 TEST_REQUIRES_ARM_NEON_FMA;
3201 for (size_t k = 1; k < 16; k++) {
3202 for (uint32_t m = 1; m <= 5; m++) {
3203 for (uint32_t n = 1; n <= 8; n++) {
3204 GemmMicrokernelTester()
3205 .mr(5)
3206 .nr(8)
3207 .kr(1)
3208 .sr(1)
3209 .m(m)
3210 .n(n)
3211 .k(k)
3212 .iterations(1)
3213 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3214 }
3215 }
3216 }
3217 }
3218
3219 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
3220 TEST_REQUIRES_ARM_NEON_FMA;
3221 for (size_t k = 17; k < 16; k++) {
3222 GemmMicrokernelTester()
3223 .mr(5)
3224 .nr(8)
3225 .kr(1)
3226 .sr(1)
3227 .m(5)
3228 .n(8)
3229 .k(k)
3230 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3231 }
3232 }
3233
3234 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
3235 TEST_REQUIRES_ARM_NEON_FMA;
3236 for (size_t k = 17; k < 16; k++) {
3237 GemmMicrokernelTester()
3238 .mr(5)
3239 .nr(8)
3240 .kr(1)
3241 .sr(1)
3242 .m(5)
3243 .n(8)
3244 .k(k)
3245 .a_stride(19)
3246 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3247 }
3248 }
3249
3250 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
3251 TEST_REQUIRES_ARM_NEON_FMA;
3252 for (size_t k = 17; k < 16; k++) {
3253 for (uint32_t m = 1; m <= 5; m++) {
3254 for (uint32_t n = 1; n <= 8; n++) {
3255 GemmMicrokernelTester()
3256 .mr(5)
3257 .nr(8)
3258 .kr(1)
3259 .sr(1)
3260 .m(m)
3261 .n(n)
3262 .k(k)
3263 .iterations(1)
3264 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3265 }
3266 }
3267 }
3268 }
3269
3270 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
3271 TEST_REQUIRES_ARM_NEON_FMA;
3272 for (size_t k = 24; k <= 80; k += 8) {
3273 GemmMicrokernelTester()
3274 .mr(5)
3275 .nr(8)
3276 .kr(1)
3277 .sr(1)
3278 .m(5)
3279 .n(8)
3280 .k(k)
3281 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3282 }
3283 }
3284
3285 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
3286 TEST_REQUIRES_ARM_NEON_FMA;
3287 for (size_t k = 24; k <= 80; k += 8) {
3288 GemmMicrokernelTester()
3289 .mr(5)
3290 .nr(8)
3291 .kr(1)
3292 .sr(1)
3293 .m(5)
3294 .n(8)
3295 .k(k)
3296 .a_stride(83)
3297 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3298 }
3299 }
3300
3301 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
3302 TEST_REQUIRES_ARM_NEON_FMA;
3303 for (size_t k = 24; k <= 80; k += 8) {
3304 for (uint32_t m = 1; m <= 5; m++) {
3305 for (uint32_t n = 1; n <= 8; n++) {
3306 GemmMicrokernelTester()
3307 .mr(5)
3308 .nr(8)
3309 .kr(1)
3310 .sr(1)
3311 .m(m)
3312 .n(n)
3313 .k(k)
3314 .iterations(1)
3315 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3316 }
3317 }
3318 }
3319 }
3320
3321 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
3322 TEST_REQUIRES_ARM_NEON_FMA;
3323 for (uint32_t n = 9; n < 16; n++) {
3324 for (size_t k = 1; k <= 40; k += 9) {
3325 GemmMicrokernelTester()
3326 .mr(5)
3327 .nr(8)
3328 .kr(1)
3329 .sr(1)
3330 .m(5)
3331 .n(8)
3332 .k(k)
3333 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3334 }
3335 }
3336 }
3337
3338 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
3339 TEST_REQUIRES_ARM_NEON_FMA;
3340 for (uint32_t n = 9; n < 16; n++) {
3341 for (size_t k = 1; k <= 40; k += 9) {
3342 GemmMicrokernelTester()
3343 .mr(5)
3344 .nr(8)
3345 .kr(1)
3346 .sr(1)
3347 .m(5)
3348 .n(8)
3349 .k(k)
3350 .cn_stride(11)
3351 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3352 }
3353 }
3354 }
3355
3356 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
3357 TEST_REQUIRES_ARM_NEON_FMA;
3358 for (uint32_t n = 9; n < 16; n++) {
3359 for (size_t k = 1; k <= 40; k += 9) {
3360 GemmMicrokernelTester()
3361 .mr(5)
3362 .nr(8)
3363 .kr(1)
3364 .sr(1)
3365 .m(5)
3366 .n(n)
3367 .k(k)
3368 .a_stride(43)
3369 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3370 }
3371 }
3372 }
3373
3374 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
3375 TEST_REQUIRES_ARM_NEON_FMA;
3376 for (uint32_t n = 9; n < 16; n++) {
3377 for (size_t k = 1; k <= 40; k += 9) {
3378 for (uint32_t m = 1; m <= 5; m++) {
3379 GemmMicrokernelTester()
3380 .mr(5)
3381 .nr(8)
3382 .kr(1)
3383 .sr(1)
3384 .m(m)
3385 .n(n)
3386 .k(k)
3387 .iterations(1)
3388 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3389 }
3390 }
3391 }
3392 }
3393
3394 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
3395 TEST_REQUIRES_ARM_NEON_FMA;
3396 for (uint32_t n = 16; n <= 24; n += 8) {
3397 for (size_t k = 1; k <= 40; k += 9) {
3398 GemmMicrokernelTester()
3399 .mr(5)
3400 .nr(8)
3401 .kr(1)
3402 .sr(1)
3403 .m(5)
3404 .n(8)
3405 .k(k)
3406 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3407 }
3408 }
3409 }
3410
3411 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
3412 TEST_REQUIRES_ARM_NEON_FMA;
3413 for (uint32_t n = 16; n <= 24; n += 8) {
3414 for (size_t k = 1; k <= 40; k += 9) {
3415 GemmMicrokernelTester()
3416 .mr(5)
3417 .nr(8)
3418 .kr(1)
3419 .sr(1)
3420 .m(5)
3421 .n(n)
3422 .k(k)
3423 .cn_stride(11)
3424 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3425 }
3426 }
3427 }
3428
3429 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
3430 TEST_REQUIRES_ARM_NEON_FMA;
3431 for (uint32_t n = 16; n <= 24; n += 8) {
3432 for (size_t k = 1; k <= 40; k += 9) {
3433 GemmMicrokernelTester()
3434 .mr(5)
3435 .nr(8)
3436 .kr(1)
3437 .sr(1)
3438 .m(5)
3439 .n(n)
3440 .k(k)
3441 .a_stride(43)
3442 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3443 }
3444 }
3445 }
3446
3447 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
3448 TEST_REQUIRES_ARM_NEON_FMA;
3449 for (uint32_t n = 16; n <= 24; n += 8) {
3450 for (size_t k = 1; k <= 40; k += 9) {
3451 for (uint32_t m = 1; m <= 5; m++) {
3452 GemmMicrokernelTester()
3453 .mr(5)
3454 .nr(8)
3455 .kr(1)
3456 .sr(1)
3457 .m(m)
3458 .n(n)
3459 .k(k)
3460 .iterations(1)
3461 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3462 }
3463 }
3464 }
3465 }
3466
3467 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
3468 TEST_REQUIRES_ARM_NEON_FMA;
3469 for (size_t k = 1; k <= 40; k += 9) {
3470 for (uint32_t m = 1; m <= 5; m++) {
3471 for (uint32_t n = 1; n <= 8; n++) {
3472 GemmMicrokernelTester()
3473 .mr(5)
3474 .nr(8)
3475 .kr(1)
3476 .sr(1)
3477 .m(m)
3478 .n(n)
3479 .k(k)
3480 .cm_stride(11)
3481 .iterations(1)
3482 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3483 }
3484 }
3485 }
3486 }
3487
3488 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
3489 TEST_REQUIRES_ARM_NEON_FMA;
3490 GemmMicrokernelTester()
3491 .mr(5)
3492 .nr(8)
3493 .kr(1)
3494 .sr(1)
3495 .m(5)
3496 .n(8)
3497 .k(8)
3498 .qmin(128)
3499 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3500 }
3501
3502 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
3503 TEST_REQUIRES_ARM_NEON_FMA;
3504 GemmMicrokernelTester()
3505 .mr(5)
3506 .nr(8)
3507 .kr(1)
3508 .sr(1)
3509 .m(5)
3510 .n(8)
3511 .k(8)
3512 .qmax(128)
3513 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3514 }
3515
3516 TEST(F32_GEMMINC_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3517 TEST_REQUIRES_ARM_NEON_FMA;
3518 GemmMicrokernelTester()
3519 .mr(5)
3520 .nr(8)
3521 .kr(1)
3522 .sr(1)
3523 .m(5)
3524 .n(8)
3525 .k(8)
3526 .cm_stride(11)
3527 .Test(xnn_f32_gemminc_ukernel_5x8__aarch64_neonfma_cortex_a75);
3528 }
Frank Barchard7e955972019-10-11 10:34:25 -07003529#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07003530
3531
Frank Barchard7e955972019-10-11 10:34:25 -07003532#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard00bf68e2019-10-27 03:00:09 -07003533 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003534 TEST_REQUIRES_ARM_NEON_FMA;
3535 GemmMicrokernelTester()
3536 .mr(6)
3537 .nr(8)
3538 .kr(1)
3539 .sr(1)
3540 .m(6)
3541 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003542 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003543 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3544 }
3545
3546 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
3547 TEST_REQUIRES_ARM_NEON_FMA;
3548 GemmMicrokernelTester()
3549 .mr(6)
3550 .nr(8)
3551 .kr(1)
3552 .sr(1)
3553 .m(6)
3554 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003555 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003556 .cn_stride(11)
3557 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3558 }
3559
Frank Barchard00bf68e2019-10-27 03:00:09 -07003560 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003561 TEST_REQUIRES_ARM_NEON_FMA;
3562 GemmMicrokernelTester()
3563 .mr(6)
3564 .nr(8)
3565 .kr(1)
3566 .sr(1)
3567 .m(6)
3568 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003569 .k(4)
3570 .a_stride(7)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003571 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3572 }
3573
Frank Barchard00bf68e2019-10-27 03:00:09 -07003574 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003575 TEST_REQUIRES_ARM_NEON_FMA;
3576 for (uint32_t m = 1; m <= 6; m++) {
3577 for (uint32_t n = 1; n <= 8; n++) {
3578 GemmMicrokernelTester()
3579 .mr(6)
3580 .nr(8)
3581 .kr(1)
3582 .sr(1)
3583 .m(m)
3584 .n(n)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003585 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003586 .iterations(1)
3587 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3588 }
3589 }
3590 }
3591
Frank Barchard00bf68e2019-10-27 03:00:09 -07003592 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003593 TEST_REQUIRES_ARM_NEON_FMA;
3594 for (uint32_t m = 1; m <= 6; m++) {
3595 GemmMicrokernelTester()
3596 .mr(6)
3597 .nr(8)
3598 .kr(1)
3599 .sr(1)
3600 .m(m)
3601 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003602 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003603 .iterations(1)
3604 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3605 }
3606 }
3607
Frank Barchard00bf68e2019-10-27 03:00:09 -07003608 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003609 TEST_REQUIRES_ARM_NEON_FMA;
3610 for (uint32_t n = 1; n <= 8; n++) {
3611 GemmMicrokernelTester()
3612 .mr(6)
3613 .nr(8)
3614 .kr(1)
3615 .sr(1)
3616 .m(6)
3617 .n(n)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003618 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003619 .iterations(1)
3620 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3621 }
3622 }
3623
Frank Barcharde64f91a2019-11-11 13:18:00 -08003624 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003625 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003626 GemmMicrokernelTester()
3627 .mr(6)
3628 .nr(8)
3629 .kr(1)
3630 .sr(1)
3631 .m(6)
3632 .n(8)
3633 .k(8)
3634 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3635 }
3636
3637 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
3638 TEST_REQUIRES_ARM_NEON_FMA;
3639 GemmMicrokernelTester()
3640 .mr(6)
3641 .nr(8)
3642 .kr(1)
3643 .sr(1)
3644 .m(6)
3645 .n(8)
3646 .k(8)
3647 .a_stride(11)
3648 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3649 }
3650
3651 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
3652 TEST_REQUIRES_ARM_NEON_FMA;
3653 for (uint32_t m = 1; m <= 6; m++) {
3654 for (uint32_t n = 1; n <= 8; n++) {
3655 GemmMicrokernelTester()
3656 .mr(6)
3657 .nr(8)
3658 .kr(1)
3659 .sr(1)
3660 .m(m)
3661 .n(n)
3662 .k(8)
3663 .iterations(1)
3664 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3665 }
3666 }
3667 }
3668
3669 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
3670 TEST_REQUIRES_ARM_NEON_FMA;
3671 for (size_t k = 1; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003672 GemmMicrokernelTester()
3673 .mr(6)
3674 .nr(8)
3675 .kr(1)
3676 .sr(1)
3677 .m(6)
3678 .n(8)
3679 .k(k)
3680 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3681 }
3682 }
3683
Frank Barcharde64f91a2019-11-11 13:18:00 -08003684 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003685 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003686 for (size_t k = 1; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003687 GemmMicrokernelTester()
3688 .mr(6)
3689 .nr(8)
3690 .kr(1)
3691 .sr(1)
3692 .m(6)
3693 .n(8)
3694 .k(k)
Frank Barcharde64f91a2019-11-11 13:18:00 -08003695 .a_stride(11)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003696 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3697 }
3698 }
3699
Frank Barcharde64f91a2019-11-11 13:18:00 -08003700 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003701 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003702 for (size_t k = 1; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003703 for (uint32_t m = 1; m <= 6; m++) {
3704 for (uint32_t n = 1; n <= 8; n++) {
3705 GemmMicrokernelTester()
3706 .mr(6)
3707 .nr(8)
3708 .kr(1)
3709 .sr(1)
3710 .m(m)
3711 .n(n)
3712 .k(k)
3713 .iterations(1)
3714 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3715 }
3716 }
3717 }
3718 }
3719
Frank Barcharde64f91a2019-11-11 13:18:00 -08003720 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003721 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003722 for (size_t k = 9; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003723 GemmMicrokernelTester()
3724 .mr(6)
3725 .nr(8)
3726 .kr(1)
3727 .sr(1)
3728 .m(6)
3729 .n(8)
3730 .k(k)
3731 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3732 }
3733 }
3734
Frank Barchard00bf68e2019-10-27 03:00:09 -07003735 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003736 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003737 for (size_t k = 9; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003738 GemmMicrokernelTester()
3739 .mr(6)
3740 .nr(8)
3741 .kr(1)
3742 .sr(1)
3743 .m(6)
3744 .n(8)
3745 .k(k)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003746 .a_stride(11)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003747 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3748 }
3749 }
3750
Frank Barchard00bf68e2019-10-27 03:00:09 -07003751 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003752 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003753 for (size_t k = 9; k < 8; k++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003754 for (uint32_t m = 1; m <= 6; m++) {
3755 for (uint32_t n = 1; n <= 8; n++) {
3756 GemmMicrokernelTester()
3757 .mr(6)
3758 .nr(8)
3759 .kr(1)
3760 .sr(1)
3761 .m(m)
3762 .n(n)
3763 .k(k)
3764 .iterations(1)
3765 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3766 }
3767 }
3768 }
3769 }
3770
3771 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
3772 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003773 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003774 GemmMicrokernelTester()
3775 .mr(6)
3776 .nr(8)
3777 .kr(1)
3778 .sr(1)
3779 .m(6)
3780 .n(8)
3781 .k(k)
3782 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3783 }
3784 }
3785
3786 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
3787 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003788 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003789 GemmMicrokernelTester()
3790 .mr(6)
3791 .nr(8)
3792 .kr(1)
3793 .sr(1)
3794 .m(6)
3795 .n(8)
3796 .k(k)
3797 .a_stride(43)
3798 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3799 }
3800 }
3801
3802 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
3803 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08003804 for (size_t k = 12; k <= 40; k += 4) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003805 for (uint32_t m = 1; m <= 6; m++) {
3806 for (uint32_t n = 1; n <= 8; n++) {
3807 GemmMicrokernelTester()
3808 .mr(6)
3809 .nr(8)
3810 .kr(1)
3811 .sr(1)
3812 .m(m)
3813 .n(n)
3814 .k(k)
3815 .iterations(1)
3816 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3817 }
3818 }
3819 }
3820 }
3821
3822 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
3823 TEST_REQUIRES_ARM_NEON_FMA;
3824 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003825 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003826 GemmMicrokernelTester()
3827 .mr(6)
3828 .nr(8)
3829 .kr(1)
3830 .sr(1)
3831 .m(6)
3832 .n(8)
3833 .k(k)
3834 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3835 }
3836 }
3837 }
3838
3839 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
3840 TEST_REQUIRES_ARM_NEON_FMA;
3841 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003842 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003843 GemmMicrokernelTester()
3844 .mr(6)
3845 .nr(8)
3846 .kr(1)
3847 .sr(1)
3848 .m(6)
3849 .n(8)
3850 .k(k)
3851 .cn_stride(11)
3852 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3853 }
3854 }
3855 }
3856
3857 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
3858 TEST_REQUIRES_ARM_NEON_FMA;
3859 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003860 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003861 GemmMicrokernelTester()
3862 .mr(6)
3863 .nr(8)
3864 .kr(1)
3865 .sr(1)
3866 .m(6)
3867 .n(n)
3868 .k(k)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003869 .a_stride(23)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003870 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3871 }
3872 }
3873 }
3874
3875 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
3876 TEST_REQUIRES_ARM_NEON_FMA;
3877 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003878 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003879 for (uint32_t m = 1; m <= 6; m++) {
3880 GemmMicrokernelTester()
3881 .mr(6)
3882 .nr(8)
3883 .kr(1)
3884 .sr(1)
3885 .m(m)
3886 .n(n)
3887 .k(k)
3888 .iterations(1)
3889 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3890 }
3891 }
3892 }
3893 }
3894
3895 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
3896 TEST_REQUIRES_ARM_NEON_FMA;
3897 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003898 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003899 GemmMicrokernelTester()
3900 .mr(6)
3901 .nr(8)
3902 .kr(1)
3903 .sr(1)
3904 .m(6)
3905 .n(8)
3906 .k(k)
3907 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3908 }
3909 }
3910 }
3911
3912 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
3913 TEST_REQUIRES_ARM_NEON_FMA;
3914 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003915 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003916 GemmMicrokernelTester()
3917 .mr(6)
3918 .nr(8)
3919 .kr(1)
3920 .sr(1)
3921 .m(6)
3922 .n(n)
3923 .k(k)
3924 .cn_stride(11)
3925 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3926 }
3927 }
3928 }
3929
3930 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
3931 TEST_REQUIRES_ARM_NEON_FMA;
3932 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003933 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003934 GemmMicrokernelTester()
3935 .mr(6)
3936 .nr(8)
3937 .kr(1)
3938 .sr(1)
3939 .m(6)
3940 .n(n)
3941 .k(k)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003942 .a_stride(23)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003943 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3944 }
3945 }
3946 }
3947
3948 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
3949 TEST_REQUIRES_ARM_NEON_FMA;
3950 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07003951 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003952 for (uint32_t m = 1; m <= 6; m++) {
3953 GemmMicrokernelTester()
3954 .mr(6)
3955 .nr(8)
3956 .kr(1)
3957 .sr(1)
3958 .m(m)
3959 .n(n)
3960 .k(k)
3961 .iterations(1)
3962 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3963 }
3964 }
3965 }
3966 }
3967
3968 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
3969 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard00bf68e2019-10-27 03:00:09 -07003970 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07003971 for (uint32_t m = 1; m <= 6; m++) {
3972 for (uint32_t n = 1; n <= 8; n++) {
3973 GemmMicrokernelTester()
3974 .mr(6)
3975 .nr(8)
3976 .kr(1)
3977 .sr(1)
3978 .m(m)
3979 .n(n)
3980 .k(k)
3981 .cm_stride(11)
3982 .iterations(1)
3983 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
3984 }
3985 }
3986 }
3987 }
3988
3989 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
3990 TEST_REQUIRES_ARM_NEON_FMA;
3991 GemmMicrokernelTester()
3992 .mr(6)
3993 .nr(8)
3994 .kr(1)
3995 .sr(1)
3996 .m(6)
3997 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07003998 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07003999 .qmin(128)
4000 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4001 }
4002
4003 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
4004 TEST_REQUIRES_ARM_NEON_FMA;
4005 GemmMicrokernelTester()
4006 .mr(6)
4007 .nr(8)
4008 .kr(1)
4009 .sr(1)
4010 .m(6)
4011 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004012 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004013 .qmax(128)
4014 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4015 }
4016
4017 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
4018 TEST_REQUIRES_ARM_NEON_FMA;
4019 GemmMicrokernelTester()
4020 .mr(6)
4021 .nr(8)
4022 .kr(1)
4023 .sr(1)
4024 .m(6)
4025 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004026 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004027 .cm_stride(11)
4028 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a53);
4029 }
4030#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4031
4032
4033#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchardbd419712019-10-31 14:15:36 -07004034 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004035 TEST_REQUIRES_ARM_NEON_FMA;
4036 GemmMicrokernelTester()
4037 .mr(6)
4038 .nr(8)
4039 .kr(1)
4040 .sr(1)
4041 .m(6)
4042 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004043 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004044 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4045 }
4046
4047 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
4048 TEST_REQUIRES_ARM_NEON_FMA;
4049 GemmMicrokernelTester()
4050 .mr(6)
4051 .nr(8)
4052 .kr(1)
4053 .sr(1)
4054 .m(6)
4055 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004056 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004057 .cn_stride(11)
4058 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4059 }
4060
Frank Barchardbd419712019-10-31 14:15:36 -07004061 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004062 TEST_REQUIRES_ARM_NEON_FMA;
4063 GemmMicrokernelTester()
4064 .mr(6)
4065 .nr(8)
4066 .kr(1)
4067 .sr(1)
4068 .m(6)
4069 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004070 .k(4)
4071 .a_stride(7)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004072 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4073 }
4074
Frank Barchardbd419712019-10-31 14:15:36 -07004075 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004076 TEST_REQUIRES_ARM_NEON_FMA;
4077 for (uint32_t m = 1; m <= 6; m++) {
4078 for (uint32_t n = 1; n <= 8; n++) {
4079 GemmMicrokernelTester()
4080 .mr(6)
4081 .nr(8)
4082 .kr(1)
4083 .sr(1)
4084 .m(m)
4085 .n(n)
Frank Barchardbd419712019-10-31 14:15:36 -07004086 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004087 .iterations(1)
4088 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4089 }
4090 }
4091 }
4092
Frank Barchardbd419712019-10-31 14:15:36 -07004093 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004094 TEST_REQUIRES_ARM_NEON_FMA;
4095 for (uint32_t m = 1; m <= 6; m++) {
4096 GemmMicrokernelTester()
4097 .mr(6)
4098 .nr(8)
4099 .kr(1)
4100 .sr(1)
4101 .m(m)
4102 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004103 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004104 .iterations(1)
4105 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4106 }
4107 }
4108
Frank Barchardbd419712019-10-31 14:15:36 -07004109 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004110 TEST_REQUIRES_ARM_NEON_FMA;
4111 for (uint32_t n = 1; n <= 8; n++) {
4112 GemmMicrokernelTester()
4113 .mr(6)
4114 .nr(8)
4115 .kr(1)
4116 .sr(1)
4117 .m(6)
4118 .n(n)
Frank Barchardbd419712019-10-31 14:15:36 -07004119 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004120 .iterations(1)
4121 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4122 }
4123 }
4124
Frank Barchardbd419712019-10-31 14:15:36 -07004125 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004126 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004127 for (size_t k = 1; k < 4; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004128 GemmMicrokernelTester()
4129 .mr(6)
4130 .nr(8)
4131 .kr(1)
4132 .sr(1)
4133 .m(6)
4134 .n(8)
4135 .k(k)
4136 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4137 }
4138 }
4139
Frank Barchardbd419712019-10-31 14:15:36 -07004140 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004141 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004142 for (size_t k = 1; k < 4; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004143 GemmMicrokernelTester()
4144 .mr(6)
4145 .nr(8)
4146 .kr(1)
4147 .sr(1)
4148 .m(6)
4149 .n(8)
4150 .k(k)
Frank Barchardbd419712019-10-31 14:15:36 -07004151 .a_stride(7)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004152 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4153 }
4154 }
4155
Frank Barchardbd419712019-10-31 14:15:36 -07004156 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004157 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004158 for (size_t k = 1; k < 4; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004159 for (uint32_t m = 1; m <= 6; m++) {
4160 for (uint32_t n = 1; n <= 8; n++) {
4161 GemmMicrokernelTester()
4162 .mr(6)
4163 .nr(8)
4164 .kr(1)
4165 .sr(1)
4166 .m(m)
4167 .n(n)
4168 .k(k)
4169 .iterations(1)
4170 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4171 }
4172 }
4173 }
4174 }
4175
Frank Barchardbd419712019-10-31 14:15:36 -07004176 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004177 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004178 for (size_t k = 5; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004179 GemmMicrokernelTester()
4180 .mr(6)
4181 .nr(8)
4182 .kr(1)
4183 .sr(1)
4184 .m(6)
4185 .n(8)
4186 .k(k)
4187 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4188 }
4189 }
4190
Frank Barchardbd419712019-10-31 14:15:36 -07004191 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004192 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004193 for (size_t k = 5; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004194 GemmMicrokernelTester()
4195 .mr(6)
4196 .nr(8)
4197 .kr(1)
4198 .sr(1)
4199 .m(6)
4200 .n(8)
4201 .k(k)
Frank Barchardbd419712019-10-31 14:15:36 -07004202 .a_stride(11)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004203 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4204 }
4205 }
4206
Frank Barchardbd419712019-10-31 14:15:36 -07004207 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004208 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004209 for (size_t k = 5; k < 8; k++) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004210 for (uint32_t m = 1; m <= 6; m++) {
4211 for (uint32_t n = 1; n <= 8; n++) {
4212 GemmMicrokernelTester()
4213 .mr(6)
4214 .nr(8)
4215 .kr(1)
4216 .sr(1)
4217 .m(m)
4218 .n(n)
4219 .k(k)
4220 .iterations(1)
4221 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4222 }
4223 }
4224 }
4225 }
4226
Frank Barchardbd419712019-10-31 14:15:36 -07004227 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004228 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004229 for (size_t k = 8; k <= 40; k += 4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004230 GemmMicrokernelTester()
4231 .mr(6)
4232 .nr(8)
4233 .kr(1)
4234 .sr(1)
4235 .m(6)
4236 .n(8)
4237 .k(k)
4238 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4239 }
4240 }
4241
Frank Barchardbd419712019-10-31 14:15:36 -07004242 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004243 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004244 for (size_t k = 8; k <= 40; k += 4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004245 GemmMicrokernelTester()
4246 .mr(6)
4247 .nr(8)
4248 .kr(1)
4249 .sr(1)
4250 .m(6)
4251 .n(8)
4252 .k(k)
Frank Barchardbd419712019-10-31 14:15:36 -07004253 .a_stride(43)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004254 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4255 }
4256 }
4257
Frank Barchardbd419712019-10-31 14:15:36 -07004258 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004259 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004260 for (size_t k = 8; k <= 40; k += 4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004261 for (uint32_t m = 1; m <= 6; m++) {
4262 for (uint32_t n = 1; n <= 8; n++) {
4263 GemmMicrokernelTester()
4264 .mr(6)
4265 .nr(8)
4266 .kr(1)
4267 .sr(1)
4268 .m(m)
4269 .n(n)
4270 .k(k)
4271 .iterations(1)
4272 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4273 }
4274 }
4275 }
4276 }
4277
4278 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
4279 TEST_REQUIRES_ARM_NEON_FMA;
4280 for (uint32_t n = 9; n < 16; n++) {
Frank Barchardbd419712019-10-31 14:15:36 -07004281 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004282 GemmMicrokernelTester()
4283 .mr(6)
4284 .nr(8)
4285 .kr(1)
4286 .sr(1)
4287 .m(6)
4288 .n(8)
4289 .k(k)
4290 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4291 }
4292 }
4293 }
4294
4295 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
4296 TEST_REQUIRES_ARM_NEON_FMA;
4297 for (uint32_t n = 9; n < 16; n++) {
Frank Barchardbd419712019-10-31 14:15:36 -07004298 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004299 GemmMicrokernelTester()
4300 .mr(6)
4301 .nr(8)
4302 .kr(1)
4303 .sr(1)
4304 .m(6)
4305 .n(8)
4306 .k(k)
4307 .cn_stride(11)
4308 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4309 }
4310 }
4311 }
4312
4313 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
4314 TEST_REQUIRES_ARM_NEON_FMA;
4315 for (uint32_t n = 9; n < 16; n++) {
Frank Barchardbd419712019-10-31 14:15:36 -07004316 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004317 GemmMicrokernelTester()
4318 .mr(6)
4319 .nr(8)
4320 .kr(1)
4321 .sr(1)
4322 .m(6)
4323 .n(n)
4324 .k(k)
Frank Barchardbd419712019-10-31 14:15:36 -07004325 .a_stride(23)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004326 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4327 }
4328 }
4329 }
4330
4331 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
4332 TEST_REQUIRES_ARM_NEON_FMA;
4333 for (uint32_t n = 9; n < 16; n++) {
Frank Barchardbd419712019-10-31 14:15:36 -07004334 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004335 for (uint32_t m = 1; m <= 6; m++) {
4336 GemmMicrokernelTester()
4337 .mr(6)
4338 .nr(8)
4339 .kr(1)
4340 .sr(1)
4341 .m(m)
4342 .n(n)
4343 .k(k)
4344 .iterations(1)
4345 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4346 }
4347 }
4348 }
4349 }
4350
4351 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
4352 TEST_REQUIRES_ARM_NEON_FMA;
4353 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchardbd419712019-10-31 14:15:36 -07004354 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004355 GemmMicrokernelTester()
4356 .mr(6)
4357 .nr(8)
4358 .kr(1)
4359 .sr(1)
4360 .m(6)
4361 .n(8)
4362 .k(k)
4363 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4364 }
4365 }
4366 }
4367
4368 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
4369 TEST_REQUIRES_ARM_NEON_FMA;
4370 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchardbd419712019-10-31 14:15:36 -07004371 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004372 GemmMicrokernelTester()
4373 .mr(6)
4374 .nr(8)
4375 .kr(1)
4376 .sr(1)
4377 .m(6)
4378 .n(n)
4379 .k(k)
4380 .cn_stride(11)
4381 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4382 }
4383 }
4384 }
4385
4386 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
4387 TEST_REQUIRES_ARM_NEON_FMA;
4388 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchardbd419712019-10-31 14:15:36 -07004389 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004390 GemmMicrokernelTester()
4391 .mr(6)
4392 .nr(8)
4393 .kr(1)
4394 .sr(1)
4395 .m(6)
4396 .n(n)
4397 .k(k)
Frank Barchardbd419712019-10-31 14:15:36 -07004398 .a_stride(23)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004399 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4400 }
4401 }
4402 }
4403
4404 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
4405 TEST_REQUIRES_ARM_NEON_FMA;
4406 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchardbd419712019-10-31 14:15:36 -07004407 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004408 for (uint32_t m = 1; m <= 6; m++) {
4409 GemmMicrokernelTester()
4410 .mr(6)
4411 .nr(8)
4412 .kr(1)
4413 .sr(1)
4414 .m(m)
4415 .n(n)
4416 .k(k)
4417 .iterations(1)
4418 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4419 }
4420 }
4421 }
4422 }
4423
4424 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
4425 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchardbd419712019-10-31 14:15:36 -07004426 for (size_t k = 1; k <= 20; k += 5) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07004427 for (uint32_t m = 1; m <= 6; m++) {
4428 for (uint32_t n = 1; n <= 8; n++) {
4429 GemmMicrokernelTester()
4430 .mr(6)
4431 .nr(8)
4432 .kr(1)
4433 .sr(1)
4434 .m(m)
4435 .n(n)
4436 .k(k)
4437 .cm_stride(11)
4438 .iterations(1)
4439 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4440 }
4441 }
4442 }
4443 }
4444
4445 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
4446 TEST_REQUIRES_ARM_NEON_FMA;
4447 GemmMicrokernelTester()
4448 .mr(6)
4449 .nr(8)
4450 .kr(1)
4451 .sr(1)
4452 .m(6)
4453 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004454 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004455 .qmin(128)
4456 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4457 }
4458
4459 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
4460 TEST_REQUIRES_ARM_NEON_FMA;
4461 GemmMicrokernelTester()
4462 .mr(6)
4463 .nr(8)
4464 .kr(1)
4465 .sr(1)
4466 .m(6)
4467 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004468 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004469 .qmax(128)
4470 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4471 }
4472
4473 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
4474 TEST_REQUIRES_ARM_NEON_FMA;
4475 GemmMicrokernelTester()
4476 .mr(6)
4477 .nr(8)
4478 .kr(1)
4479 .sr(1)
4480 .m(6)
4481 .n(8)
Frank Barchardbd419712019-10-31 14:15:36 -07004482 .k(4)
XNNPACK Teamb455b122019-09-27 18:10:33 -07004483 .cm_stride(11)
4484 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a57);
4485 }
Frank Barchard7e955972019-10-11 10:34:25 -07004486#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004487
4488
Frank Barchard7e955972019-10-11 10:34:25 -07004489#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004490 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
4491 TEST_REQUIRES_ARM_NEON_FMA;
4492 GemmMicrokernelTester()
4493 .mr(6)
4494 .nr(8)
4495 .kr(1)
4496 .sr(1)
4497 .m(6)
4498 .n(8)
4499 .k(8)
4500 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4501 }
4502
4503 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cn) {
4504 TEST_REQUIRES_ARM_NEON_FMA;
4505 GemmMicrokernelTester()
4506 .mr(6)
4507 .nr(8)
4508 .kr(1)
4509 .sr(1)
4510 .m(6)
4511 .n(8)
4512 .k(8)
4513 .cn_stride(11)
4514 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4515 }
4516
4517 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_strided_a) {
4518 TEST_REQUIRES_ARM_NEON_FMA;
4519 GemmMicrokernelTester()
4520 .mr(6)
4521 .nr(8)
4522 .kr(1)
4523 .sr(1)
4524 .m(6)
4525 .n(8)
4526 .k(8)
4527 .a_stride(11)
4528 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4529 }
4530
4531 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile) {
4532 TEST_REQUIRES_ARM_NEON_FMA;
4533 for (uint32_t m = 1; m <= 6; m++) {
4534 for (uint32_t n = 1; n <= 8; n++) {
4535 GemmMicrokernelTester()
4536 .mr(6)
4537 .nr(8)
4538 .kr(1)
4539 .sr(1)
4540 .m(m)
4541 .n(n)
4542 .k(8)
4543 .iterations(1)
4544 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4545 }
4546 }
4547 }
4548
4549 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_m) {
4550 TEST_REQUIRES_ARM_NEON_FMA;
4551 for (uint32_t m = 1; m <= 6; m++) {
4552 GemmMicrokernelTester()
4553 .mr(6)
4554 .nr(8)
4555 .kr(1)
4556 .sr(1)
4557 .m(m)
4558 .n(8)
4559 .k(8)
4560 .iterations(1)
4561 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4562 }
4563 }
4564
4565 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_n) {
4566 TEST_REQUIRES_ARM_NEON_FMA;
4567 for (uint32_t n = 1; n <= 8; n++) {
4568 GemmMicrokernelTester()
4569 .mr(6)
4570 .nr(8)
4571 .kr(1)
4572 .sr(1)
4573 .m(6)
4574 .n(n)
4575 .k(8)
4576 .iterations(1)
4577 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4578 }
4579 }
4580
4581 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16) {
4582 TEST_REQUIRES_ARM_NEON_FMA;
4583 GemmMicrokernelTester()
4584 .mr(6)
4585 .nr(8)
4586 .kr(1)
4587 .sr(1)
4588 .m(6)
4589 .n(8)
4590 .k(16)
4591 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4592 }
4593
4594 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_strided_a) {
4595 TEST_REQUIRES_ARM_NEON_FMA;
4596 GemmMicrokernelTester()
4597 .mr(6)
4598 .nr(8)
4599 .kr(1)
4600 .sr(1)
4601 .m(6)
4602 .n(8)
4603 .k(16)
4604 .a_stride(19)
4605 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4606 }
4607
4608 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_subtile) {
4609 TEST_REQUIRES_ARM_NEON_FMA;
4610 for (uint32_t m = 1; m <= 6; m++) {
4611 for (uint32_t n = 1; n <= 8; n++) {
4612 GemmMicrokernelTester()
4613 .mr(6)
4614 .nr(8)
4615 .kr(1)
4616 .sr(1)
4617 .m(m)
4618 .n(n)
4619 .k(16)
4620 .iterations(1)
4621 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4622 }
4623 }
4624 }
4625
4626 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16) {
4627 TEST_REQUIRES_ARM_NEON_FMA;
4628 for (size_t k = 1; k < 16; k++) {
4629 GemmMicrokernelTester()
4630 .mr(6)
4631 .nr(8)
4632 .kr(1)
4633 .sr(1)
4634 .m(6)
4635 .n(8)
4636 .k(k)
4637 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4638 }
4639 }
4640
4641 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_strided_a) {
4642 TEST_REQUIRES_ARM_NEON_FMA;
4643 for (size_t k = 1; k < 16; k++) {
4644 GemmMicrokernelTester()
4645 .mr(6)
4646 .nr(8)
4647 .kr(1)
4648 .sr(1)
4649 .m(6)
4650 .n(8)
4651 .k(k)
4652 .a_stride(19)
4653 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4654 }
4655 }
4656
4657 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_subtile) {
4658 TEST_REQUIRES_ARM_NEON_FMA;
4659 for (size_t k = 1; k < 16; k++) {
4660 for (uint32_t m = 1; m <= 6; m++) {
4661 for (uint32_t n = 1; n <= 8; n++) {
4662 GemmMicrokernelTester()
4663 .mr(6)
4664 .nr(8)
4665 .kr(1)
4666 .sr(1)
4667 .m(m)
4668 .n(n)
4669 .k(k)
4670 .iterations(1)
4671 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4672 }
4673 }
4674 }
4675 }
4676
4677 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_16) {
4678 TEST_REQUIRES_ARM_NEON_FMA;
4679 for (size_t k = 17; k < 16; k++) {
4680 GemmMicrokernelTester()
4681 .mr(6)
4682 .nr(8)
4683 .kr(1)
4684 .sr(1)
4685 .m(6)
4686 .n(8)
4687 .k(k)
4688 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4689 }
4690 }
4691
4692 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_strided_a) {
4693 TEST_REQUIRES_ARM_NEON_FMA;
4694 for (size_t k = 17; k < 16; k++) {
4695 GemmMicrokernelTester()
4696 .mr(6)
4697 .nr(8)
4698 .kr(1)
4699 .sr(1)
4700 .m(6)
4701 .n(8)
4702 .k(k)
4703 .a_stride(19)
4704 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4705 }
4706 }
4707
4708 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_subtile) {
4709 TEST_REQUIRES_ARM_NEON_FMA;
4710 for (size_t k = 17; k < 16; k++) {
4711 for (uint32_t m = 1; m <= 6; m++) {
4712 for (uint32_t n = 1; n <= 8; n++) {
4713 GemmMicrokernelTester()
4714 .mr(6)
4715 .nr(8)
4716 .kr(1)
4717 .sr(1)
4718 .m(m)
4719 .n(n)
4720 .k(k)
4721 .iterations(1)
4722 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4723 }
4724 }
4725 }
4726 }
4727
4728 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8) {
4729 TEST_REQUIRES_ARM_NEON_FMA;
4730 for (size_t k = 24; k <= 80; k += 8) {
4731 GemmMicrokernelTester()
4732 .mr(6)
4733 .nr(8)
4734 .kr(1)
4735 .sr(1)
4736 .m(6)
4737 .n(8)
4738 .k(k)
4739 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4740 }
4741 }
4742
4743 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_strided_a) {
4744 TEST_REQUIRES_ARM_NEON_FMA;
4745 for (size_t k = 24; k <= 80; k += 8) {
4746 GemmMicrokernelTester()
4747 .mr(6)
4748 .nr(8)
4749 .kr(1)
4750 .sr(1)
4751 .m(6)
4752 .n(8)
4753 .k(k)
4754 .a_stride(83)
4755 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4756 }
4757 }
4758
4759 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_subtile) {
4760 TEST_REQUIRES_ARM_NEON_FMA;
4761 for (size_t k = 24; k <= 80; k += 8) {
4762 for (uint32_t m = 1; m <= 6; m++) {
4763 for (uint32_t n = 1; n <= 8; n++) {
4764 GemmMicrokernelTester()
4765 .mr(6)
4766 .nr(8)
4767 .kr(1)
4768 .sr(1)
4769 .m(m)
4770 .n(n)
4771 .k(k)
4772 .iterations(1)
4773 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4774 }
4775 }
4776 }
4777 }
4778
4779 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8) {
4780 TEST_REQUIRES_ARM_NEON_FMA;
4781 for (uint32_t n = 9; n < 16; n++) {
4782 for (size_t k = 1; k <= 40; k += 9) {
4783 GemmMicrokernelTester()
4784 .mr(6)
4785 .nr(8)
4786 .kr(1)
4787 .sr(1)
4788 .m(6)
4789 .n(8)
4790 .k(k)
4791 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4792 }
4793 }
4794 }
4795
4796 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_cn) {
4797 TEST_REQUIRES_ARM_NEON_FMA;
4798 for (uint32_t n = 9; n < 16; n++) {
4799 for (size_t k = 1; k <= 40; k += 9) {
4800 GemmMicrokernelTester()
4801 .mr(6)
4802 .nr(8)
4803 .kr(1)
4804 .sr(1)
4805 .m(6)
4806 .n(8)
4807 .k(k)
4808 .cn_stride(11)
4809 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4810 }
4811 }
4812 }
4813
4814 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_a) {
4815 TEST_REQUIRES_ARM_NEON_FMA;
4816 for (uint32_t n = 9; n < 16; n++) {
4817 for (size_t k = 1; k <= 40; k += 9) {
4818 GemmMicrokernelTester()
4819 .mr(6)
4820 .nr(8)
4821 .kr(1)
4822 .sr(1)
4823 .m(6)
4824 .n(n)
4825 .k(k)
4826 .a_stride(43)
4827 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4828 }
4829 }
4830 }
4831
4832 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_subtile) {
4833 TEST_REQUIRES_ARM_NEON_FMA;
4834 for (uint32_t n = 9; n < 16; n++) {
4835 for (size_t k = 1; k <= 40; k += 9) {
4836 for (uint32_t m = 1; m <= 6; m++) {
4837 GemmMicrokernelTester()
4838 .mr(6)
4839 .nr(8)
4840 .kr(1)
4841 .sr(1)
4842 .m(m)
4843 .n(n)
4844 .k(k)
4845 .iterations(1)
4846 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4847 }
4848 }
4849 }
4850 }
4851
4852 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8) {
4853 TEST_REQUIRES_ARM_NEON_FMA;
4854 for (uint32_t n = 16; n <= 24; n += 8) {
4855 for (size_t k = 1; k <= 40; k += 9) {
4856 GemmMicrokernelTester()
4857 .mr(6)
4858 .nr(8)
4859 .kr(1)
4860 .sr(1)
4861 .m(6)
4862 .n(8)
4863 .k(k)
4864 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4865 }
4866 }
4867 }
4868
4869 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_cn) {
4870 TEST_REQUIRES_ARM_NEON_FMA;
4871 for (uint32_t n = 16; n <= 24; n += 8) {
4872 for (size_t k = 1; k <= 40; k += 9) {
4873 GemmMicrokernelTester()
4874 .mr(6)
4875 .nr(8)
4876 .kr(1)
4877 .sr(1)
4878 .m(6)
4879 .n(n)
4880 .k(k)
4881 .cn_stride(11)
4882 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4883 }
4884 }
4885 }
4886
4887 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_a) {
4888 TEST_REQUIRES_ARM_NEON_FMA;
4889 for (uint32_t n = 16; n <= 24; n += 8) {
4890 for (size_t k = 1; k <= 40; k += 9) {
4891 GemmMicrokernelTester()
4892 .mr(6)
4893 .nr(8)
4894 .kr(1)
4895 .sr(1)
4896 .m(6)
4897 .n(n)
4898 .k(k)
4899 .a_stride(43)
4900 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4901 }
4902 }
4903 }
4904
4905 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_subtile) {
4906 TEST_REQUIRES_ARM_NEON_FMA;
4907 for (uint32_t n = 16; n <= 24; n += 8) {
4908 for (size_t k = 1; k <= 40; k += 9) {
4909 for (uint32_t m = 1; m <= 6; m++) {
4910 GemmMicrokernelTester()
4911 .mr(6)
4912 .nr(8)
4913 .kr(1)
4914 .sr(1)
4915 .m(m)
4916 .n(n)
4917 .k(k)
4918 .iterations(1)
4919 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4920 }
4921 }
4922 }
4923 }
4924
4925 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm_subtile) {
4926 TEST_REQUIRES_ARM_NEON_FMA;
4927 for (size_t k = 1; k <= 40; k += 9) {
4928 for (uint32_t m = 1; m <= 6; m++) {
4929 for (uint32_t n = 1; n <= 8; n++) {
4930 GemmMicrokernelTester()
4931 .mr(6)
4932 .nr(8)
4933 .kr(1)
4934 .sr(1)
4935 .m(m)
4936 .n(n)
4937 .k(k)
4938 .cm_stride(11)
4939 .iterations(1)
4940 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4941 }
4942 }
4943 }
4944 }
4945
4946 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, qmin) {
4947 TEST_REQUIRES_ARM_NEON_FMA;
4948 GemmMicrokernelTester()
4949 .mr(6)
4950 .nr(8)
4951 .kr(1)
4952 .sr(1)
4953 .m(6)
4954 .n(8)
4955 .k(8)
4956 .qmin(128)
4957 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4958 }
4959
4960 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, qmax) {
4961 TEST_REQUIRES_ARM_NEON_FMA;
4962 GemmMicrokernelTester()
4963 .mr(6)
4964 .nr(8)
4965 .kr(1)
4966 .sr(1)
4967 .m(6)
4968 .n(8)
4969 .k(8)
4970 .qmax(128)
4971 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4972 }
4973
4974 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm) {
4975 TEST_REQUIRES_ARM_NEON_FMA;
4976 GemmMicrokernelTester()
4977 .mr(6)
4978 .nr(8)
4979 .kr(1)
4980 .sr(1)
4981 .m(6)
4982 .n(8)
4983 .k(8)
4984 .cm_stride(11)
4985 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a73);
4986 }
Frank Barchard7e955972019-10-11 10:34:25 -07004987#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004988
4989
Frank Barchard7e955972019-10-11 10:34:25 -07004990#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004991 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
4992 TEST_REQUIRES_ARM_NEON_FMA;
4993 GemmMicrokernelTester()
4994 .mr(6)
4995 .nr(8)
4996 .kr(1)
4997 .sr(1)
4998 .m(6)
4999 .n(8)
5000 .k(8)
5001 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5002 }
5003
5004 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
5005 TEST_REQUIRES_ARM_NEON_FMA;
5006 GemmMicrokernelTester()
5007 .mr(6)
5008 .nr(8)
5009 .kr(1)
5010 .sr(1)
5011 .m(6)
5012 .n(8)
5013 .k(8)
5014 .cn_stride(11)
5015 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5016 }
5017
5018 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
5019 TEST_REQUIRES_ARM_NEON_FMA;
5020 GemmMicrokernelTester()
5021 .mr(6)
5022 .nr(8)
5023 .kr(1)
5024 .sr(1)
5025 .m(6)
5026 .n(8)
5027 .k(8)
5028 .a_stride(11)
5029 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5030 }
5031
5032 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
5033 TEST_REQUIRES_ARM_NEON_FMA;
5034 for (uint32_t m = 1; m <= 6; m++) {
5035 for (uint32_t n = 1; n <= 8; n++) {
5036 GemmMicrokernelTester()
5037 .mr(6)
5038 .nr(8)
5039 .kr(1)
5040 .sr(1)
5041 .m(m)
5042 .n(n)
5043 .k(8)
5044 .iterations(1)
5045 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5046 }
5047 }
5048 }
5049
5050 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
5051 TEST_REQUIRES_ARM_NEON_FMA;
5052 for (uint32_t m = 1; m <= 6; m++) {
5053 GemmMicrokernelTester()
5054 .mr(6)
5055 .nr(8)
5056 .kr(1)
5057 .sr(1)
5058 .m(m)
5059 .n(8)
5060 .k(8)
5061 .iterations(1)
5062 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5063 }
5064 }
5065
5066 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
5067 TEST_REQUIRES_ARM_NEON_FMA;
5068 for (uint32_t n = 1; n <= 8; n++) {
5069 GemmMicrokernelTester()
5070 .mr(6)
5071 .nr(8)
5072 .kr(1)
5073 .sr(1)
5074 .m(6)
5075 .n(n)
5076 .k(8)
5077 .iterations(1)
5078 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5079 }
5080 }
5081
5082 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
5083 TEST_REQUIRES_ARM_NEON_FMA;
5084 GemmMicrokernelTester()
5085 .mr(6)
5086 .nr(8)
5087 .kr(1)
5088 .sr(1)
5089 .m(6)
5090 .n(8)
5091 .k(16)
5092 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5093 }
5094
5095 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
5096 TEST_REQUIRES_ARM_NEON_FMA;
5097 GemmMicrokernelTester()
5098 .mr(6)
5099 .nr(8)
5100 .kr(1)
5101 .sr(1)
5102 .m(6)
5103 .n(8)
5104 .k(16)
5105 .a_stride(19)
5106 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5107 }
5108
5109 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
5110 TEST_REQUIRES_ARM_NEON_FMA;
5111 for (uint32_t m = 1; m <= 6; m++) {
5112 for (uint32_t n = 1; n <= 8; n++) {
5113 GemmMicrokernelTester()
5114 .mr(6)
5115 .nr(8)
5116 .kr(1)
5117 .sr(1)
5118 .m(m)
5119 .n(n)
5120 .k(16)
5121 .iterations(1)
5122 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5123 }
5124 }
5125 }
5126
5127 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
5128 TEST_REQUIRES_ARM_NEON_FMA;
5129 for (size_t k = 1; k < 16; k++) {
5130 GemmMicrokernelTester()
5131 .mr(6)
5132 .nr(8)
5133 .kr(1)
5134 .sr(1)
5135 .m(6)
5136 .n(8)
5137 .k(k)
5138 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5139 }
5140 }
5141
5142 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
5143 TEST_REQUIRES_ARM_NEON_FMA;
5144 for (size_t k = 1; k < 16; k++) {
5145 GemmMicrokernelTester()
5146 .mr(6)
5147 .nr(8)
5148 .kr(1)
5149 .sr(1)
5150 .m(6)
5151 .n(8)
5152 .k(k)
5153 .a_stride(19)
5154 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5155 }
5156 }
5157
5158 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
5159 TEST_REQUIRES_ARM_NEON_FMA;
5160 for (size_t k = 1; k < 16; k++) {
5161 for (uint32_t m = 1; m <= 6; m++) {
5162 for (uint32_t n = 1; n <= 8; n++) {
5163 GemmMicrokernelTester()
5164 .mr(6)
5165 .nr(8)
5166 .kr(1)
5167 .sr(1)
5168 .m(m)
5169 .n(n)
5170 .k(k)
5171 .iterations(1)
5172 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5173 }
5174 }
5175 }
5176 }
5177
5178 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
5179 TEST_REQUIRES_ARM_NEON_FMA;
5180 for (size_t k = 17; k < 16; k++) {
5181 GemmMicrokernelTester()
5182 .mr(6)
5183 .nr(8)
5184 .kr(1)
5185 .sr(1)
5186 .m(6)
5187 .n(8)
5188 .k(k)
5189 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5190 }
5191 }
5192
5193 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
5194 TEST_REQUIRES_ARM_NEON_FMA;
5195 for (size_t k = 17; k < 16; k++) {
5196 GemmMicrokernelTester()
5197 .mr(6)
5198 .nr(8)
5199 .kr(1)
5200 .sr(1)
5201 .m(6)
5202 .n(8)
5203 .k(k)
5204 .a_stride(19)
5205 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5206 }
5207 }
5208
5209 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
5210 TEST_REQUIRES_ARM_NEON_FMA;
5211 for (size_t k = 17; k < 16; k++) {
5212 for (uint32_t m = 1; m <= 6; m++) {
5213 for (uint32_t n = 1; n <= 8; n++) {
5214 GemmMicrokernelTester()
5215 .mr(6)
5216 .nr(8)
5217 .kr(1)
5218 .sr(1)
5219 .m(m)
5220 .n(n)
5221 .k(k)
5222 .iterations(1)
5223 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5224 }
5225 }
5226 }
5227 }
5228
5229 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
5230 TEST_REQUIRES_ARM_NEON_FMA;
5231 for (size_t k = 24; k <= 80; k += 8) {
5232 GemmMicrokernelTester()
5233 .mr(6)
5234 .nr(8)
5235 .kr(1)
5236 .sr(1)
5237 .m(6)
5238 .n(8)
5239 .k(k)
5240 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5241 }
5242 }
5243
5244 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
5245 TEST_REQUIRES_ARM_NEON_FMA;
5246 for (size_t k = 24; k <= 80; k += 8) {
5247 GemmMicrokernelTester()
5248 .mr(6)
5249 .nr(8)
5250 .kr(1)
5251 .sr(1)
5252 .m(6)
5253 .n(8)
5254 .k(k)
5255 .a_stride(83)
5256 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5257 }
5258 }
5259
5260 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
5261 TEST_REQUIRES_ARM_NEON_FMA;
5262 for (size_t k = 24; k <= 80; k += 8) {
5263 for (uint32_t m = 1; m <= 6; m++) {
5264 for (uint32_t n = 1; n <= 8; n++) {
5265 GemmMicrokernelTester()
5266 .mr(6)
5267 .nr(8)
5268 .kr(1)
5269 .sr(1)
5270 .m(m)
5271 .n(n)
5272 .k(k)
5273 .iterations(1)
5274 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5275 }
5276 }
5277 }
5278 }
5279
5280 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
5281 TEST_REQUIRES_ARM_NEON_FMA;
5282 for (uint32_t n = 9; n < 16; n++) {
5283 for (size_t k = 1; k <= 40; k += 9) {
5284 GemmMicrokernelTester()
5285 .mr(6)
5286 .nr(8)
5287 .kr(1)
5288 .sr(1)
5289 .m(6)
5290 .n(8)
5291 .k(k)
5292 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5293 }
5294 }
5295 }
5296
5297 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
5298 TEST_REQUIRES_ARM_NEON_FMA;
5299 for (uint32_t n = 9; n < 16; n++) {
5300 for (size_t k = 1; k <= 40; k += 9) {
5301 GemmMicrokernelTester()
5302 .mr(6)
5303 .nr(8)
5304 .kr(1)
5305 .sr(1)
5306 .m(6)
5307 .n(8)
5308 .k(k)
5309 .cn_stride(11)
5310 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5311 }
5312 }
5313 }
5314
5315 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
5316 TEST_REQUIRES_ARM_NEON_FMA;
5317 for (uint32_t n = 9; n < 16; n++) {
5318 for (size_t k = 1; k <= 40; k += 9) {
5319 GemmMicrokernelTester()
5320 .mr(6)
5321 .nr(8)
5322 .kr(1)
5323 .sr(1)
5324 .m(6)
5325 .n(n)
5326 .k(k)
5327 .a_stride(43)
5328 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5329 }
5330 }
5331 }
5332
5333 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
5334 TEST_REQUIRES_ARM_NEON_FMA;
5335 for (uint32_t n = 9; n < 16; n++) {
5336 for (size_t k = 1; k <= 40; k += 9) {
5337 for (uint32_t m = 1; m <= 6; m++) {
5338 GemmMicrokernelTester()
5339 .mr(6)
5340 .nr(8)
5341 .kr(1)
5342 .sr(1)
5343 .m(m)
5344 .n(n)
5345 .k(k)
5346 .iterations(1)
5347 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5348 }
5349 }
5350 }
5351 }
5352
5353 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
5354 TEST_REQUIRES_ARM_NEON_FMA;
5355 for (uint32_t n = 16; n <= 24; n += 8) {
5356 for (size_t k = 1; k <= 40; k += 9) {
5357 GemmMicrokernelTester()
5358 .mr(6)
5359 .nr(8)
5360 .kr(1)
5361 .sr(1)
5362 .m(6)
5363 .n(8)
5364 .k(k)
5365 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5366 }
5367 }
5368 }
5369
5370 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
5371 TEST_REQUIRES_ARM_NEON_FMA;
5372 for (uint32_t n = 16; n <= 24; n += 8) {
5373 for (size_t k = 1; k <= 40; k += 9) {
5374 GemmMicrokernelTester()
5375 .mr(6)
5376 .nr(8)
5377 .kr(1)
5378 .sr(1)
5379 .m(6)
5380 .n(n)
5381 .k(k)
5382 .cn_stride(11)
5383 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5384 }
5385 }
5386 }
5387
5388 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
5389 TEST_REQUIRES_ARM_NEON_FMA;
5390 for (uint32_t n = 16; n <= 24; n += 8) {
5391 for (size_t k = 1; k <= 40; k += 9) {
5392 GemmMicrokernelTester()
5393 .mr(6)
5394 .nr(8)
5395 .kr(1)
5396 .sr(1)
5397 .m(6)
5398 .n(n)
5399 .k(k)
5400 .a_stride(43)
5401 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5402 }
5403 }
5404 }
5405
5406 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
5407 TEST_REQUIRES_ARM_NEON_FMA;
5408 for (uint32_t n = 16; n <= 24; n += 8) {
5409 for (size_t k = 1; k <= 40; k += 9) {
5410 for (uint32_t m = 1; m <= 6; m++) {
5411 GemmMicrokernelTester()
5412 .mr(6)
5413 .nr(8)
5414 .kr(1)
5415 .sr(1)
5416 .m(m)
5417 .n(n)
5418 .k(k)
5419 .iterations(1)
5420 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5421 }
5422 }
5423 }
5424 }
5425
5426 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
5427 TEST_REQUIRES_ARM_NEON_FMA;
5428 for (size_t k = 1; k <= 40; k += 9) {
5429 for (uint32_t m = 1; m <= 6; m++) {
5430 for (uint32_t n = 1; n <= 8; n++) {
5431 GemmMicrokernelTester()
5432 .mr(6)
5433 .nr(8)
5434 .kr(1)
5435 .sr(1)
5436 .m(m)
5437 .n(n)
5438 .k(k)
5439 .cm_stride(11)
5440 .iterations(1)
5441 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5442 }
5443 }
5444 }
5445 }
5446
5447 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
5448 TEST_REQUIRES_ARM_NEON_FMA;
5449 GemmMicrokernelTester()
5450 .mr(6)
5451 .nr(8)
5452 .kr(1)
5453 .sr(1)
5454 .m(6)
5455 .n(8)
5456 .k(8)
5457 .qmin(128)
5458 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5459 }
5460
5461 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
5462 TEST_REQUIRES_ARM_NEON_FMA;
5463 GemmMicrokernelTester()
5464 .mr(6)
5465 .nr(8)
5466 .kr(1)
5467 .sr(1)
5468 .m(6)
5469 .n(8)
5470 .k(8)
5471 .qmax(128)
5472 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5473 }
5474
5475 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
5476 TEST_REQUIRES_ARM_NEON_FMA;
5477 GemmMicrokernelTester()
5478 .mr(6)
5479 .nr(8)
5480 .kr(1)
5481 .sr(1)
5482 .m(6)
5483 .n(8)
5484 .k(8)
5485 .cm_stride(11)
5486 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_cortex_a75);
5487 }
Frank Barchard7e955972019-10-11 10:34:25 -07005488#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005489
5490
Frank Barchard7e955972019-10-11 10:34:25 -07005491#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005492 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
5493 TEST_REQUIRES_ARM_NEON_FMA;
5494 GemmMicrokernelTester()
5495 .mr(1)
5496 .nr(12)
5497 .kr(1)
5498 .sr(1)
5499 .m(1)
5500 .n(12)
5501 .k(4)
5502 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5503 }
5504
5505 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
5506 TEST_REQUIRES_ARM_NEON_FMA;
5507 GemmMicrokernelTester()
5508 .mr(1)
5509 .nr(12)
5510 .kr(1)
5511 .sr(1)
5512 .m(1)
5513 .n(12)
5514 .k(4)
5515 .cn_stride(17)
5516 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5517 }
5518
5519 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
5520 TEST_REQUIRES_ARM_NEON_FMA;
5521 GemmMicrokernelTester()
5522 .mr(1)
5523 .nr(12)
5524 .kr(1)
5525 .sr(1)
5526 .m(1)
5527 .n(12)
5528 .k(4)
5529 .a_stride(7)
5530 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5531 }
5532
5533 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
5534 TEST_REQUIRES_ARM_NEON_FMA;
5535 for (uint32_t m = 1; m <= 1; m++) {
5536 for (uint32_t n = 1; n <= 12; n++) {
5537 GemmMicrokernelTester()
5538 .mr(1)
5539 .nr(12)
5540 .kr(1)
5541 .sr(1)
5542 .m(m)
5543 .n(n)
5544 .k(4)
5545 .iterations(1)
5546 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5547 }
5548 }
5549 }
5550
5551 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
5552 TEST_REQUIRES_ARM_NEON_FMA;
5553 for (uint32_t m = 1; m <= 1; m++) {
5554 GemmMicrokernelTester()
5555 .mr(1)
5556 .nr(12)
5557 .kr(1)
5558 .sr(1)
5559 .m(m)
5560 .n(12)
5561 .k(4)
5562 .iterations(1)
5563 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5564 }
5565 }
5566
5567 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
5568 TEST_REQUIRES_ARM_NEON_FMA;
5569 for (uint32_t n = 1; n <= 12; n++) {
5570 GemmMicrokernelTester()
5571 .mr(1)
5572 .nr(12)
5573 .kr(1)
5574 .sr(1)
5575 .m(1)
5576 .n(n)
5577 .k(4)
5578 .iterations(1)
5579 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5580 }
5581 }
5582
5583 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
5584 TEST_REQUIRES_ARM_NEON_FMA;
5585 GemmMicrokernelTester()
5586 .mr(1)
5587 .nr(12)
5588 .kr(1)
5589 .sr(1)
5590 .m(1)
5591 .n(12)
5592 .k(8)
5593 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5594 }
5595
5596 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
5597 TEST_REQUIRES_ARM_NEON_FMA;
5598 GemmMicrokernelTester()
5599 .mr(1)
5600 .nr(12)
5601 .kr(1)
5602 .sr(1)
5603 .m(1)
5604 .n(12)
5605 .k(8)
5606 .a_stride(11)
5607 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5608 }
5609
5610 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
5611 TEST_REQUIRES_ARM_NEON_FMA;
5612 for (uint32_t m = 1; m <= 1; m++) {
5613 for (uint32_t n = 1; n <= 12; n++) {
5614 GemmMicrokernelTester()
5615 .mr(1)
5616 .nr(12)
5617 .kr(1)
5618 .sr(1)
5619 .m(m)
5620 .n(n)
5621 .k(8)
5622 .iterations(1)
5623 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5624 }
5625 }
5626 }
5627
5628 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
5629 TEST_REQUIRES_ARM_NEON_FMA;
5630 for (size_t k = 1; k < 8; k++) {
5631 GemmMicrokernelTester()
5632 .mr(1)
5633 .nr(12)
5634 .kr(1)
5635 .sr(1)
5636 .m(1)
5637 .n(12)
5638 .k(k)
5639 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5640 }
5641 }
5642
5643 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
5644 TEST_REQUIRES_ARM_NEON_FMA;
5645 for (size_t k = 1; k < 8; k++) {
5646 GemmMicrokernelTester()
5647 .mr(1)
5648 .nr(12)
5649 .kr(1)
5650 .sr(1)
5651 .m(1)
5652 .n(12)
5653 .k(k)
5654 .a_stride(11)
5655 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5656 }
5657 }
5658
5659 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
5660 TEST_REQUIRES_ARM_NEON_FMA;
5661 for (size_t k = 1; k < 8; k++) {
5662 for (uint32_t m = 1; m <= 1; m++) {
5663 for (uint32_t n = 1; n <= 12; n++) {
5664 GemmMicrokernelTester()
5665 .mr(1)
5666 .nr(12)
5667 .kr(1)
5668 .sr(1)
5669 .m(m)
5670 .n(n)
5671 .k(k)
5672 .iterations(1)
5673 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5674 }
5675 }
5676 }
5677 }
5678
5679 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
5680 TEST_REQUIRES_ARM_NEON_FMA;
5681 for (size_t k = 9; k < 8; k++) {
5682 GemmMicrokernelTester()
5683 .mr(1)
5684 .nr(12)
5685 .kr(1)
5686 .sr(1)
5687 .m(1)
5688 .n(12)
5689 .k(k)
5690 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5691 }
5692 }
5693
5694 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
5695 TEST_REQUIRES_ARM_NEON_FMA;
5696 for (size_t k = 9; k < 8; k++) {
5697 GemmMicrokernelTester()
5698 .mr(1)
5699 .nr(12)
5700 .kr(1)
5701 .sr(1)
5702 .m(1)
5703 .n(12)
5704 .k(k)
5705 .a_stride(11)
5706 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5707 }
5708 }
5709
5710 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
5711 TEST_REQUIRES_ARM_NEON_FMA;
5712 for (size_t k = 9; k < 8; k++) {
5713 for (uint32_t m = 1; m <= 1; m++) {
5714 for (uint32_t n = 1; n <= 12; n++) {
5715 GemmMicrokernelTester()
5716 .mr(1)
5717 .nr(12)
5718 .kr(1)
5719 .sr(1)
5720 .m(m)
5721 .n(n)
5722 .k(k)
5723 .iterations(1)
5724 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5725 }
5726 }
5727 }
5728 }
5729
5730 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
5731 TEST_REQUIRES_ARM_NEON_FMA;
5732 for (size_t k = 12; k <= 40; k += 4) {
5733 GemmMicrokernelTester()
5734 .mr(1)
5735 .nr(12)
5736 .kr(1)
5737 .sr(1)
5738 .m(1)
5739 .n(12)
5740 .k(k)
5741 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5742 }
5743 }
5744
5745 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
5746 TEST_REQUIRES_ARM_NEON_FMA;
5747 for (size_t k = 12; k <= 40; k += 4) {
5748 GemmMicrokernelTester()
5749 .mr(1)
5750 .nr(12)
5751 .kr(1)
5752 .sr(1)
5753 .m(1)
5754 .n(12)
5755 .k(k)
5756 .a_stride(43)
5757 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5758 }
5759 }
5760
5761 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
5762 TEST_REQUIRES_ARM_NEON_FMA;
5763 for (size_t k = 12; k <= 40; k += 4) {
5764 for (uint32_t m = 1; m <= 1; m++) {
5765 for (uint32_t n = 1; n <= 12; n++) {
5766 GemmMicrokernelTester()
5767 .mr(1)
5768 .nr(12)
5769 .kr(1)
5770 .sr(1)
5771 .m(m)
5772 .n(n)
5773 .k(k)
5774 .iterations(1)
5775 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5776 }
5777 }
5778 }
5779 }
5780
5781 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
5782 TEST_REQUIRES_ARM_NEON_FMA;
5783 for (uint32_t n = 13; n < 24; n++) {
5784 for (size_t k = 1; k <= 20; k += 5) {
5785 GemmMicrokernelTester()
5786 .mr(1)
5787 .nr(12)
5788 .kr(1)
5789 .sr(1)
5790 .m(1)
5791 .n(12)
5792 .k(k)
5793 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5794 }
5795 }
5796 }
5797
5798 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
5799 TEST_REQUIRES_ARM_NEON_FMA;
5800 for (uint32_t n = 13; n < 24; n++) {
5801 for (size_t k = 1; k <= 20; k += 5) {
5802 GemmMicrokernelTester()
5803 .mr(1)
5804 .nr(12)
5805 .kr(1)
5806 .sr(1)
5807 .m(1)
5808 .n(12)
5809 .k(k)
5810 .cn_stride(17)
5811 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5812 }
5813 }
5814 }
5815
5816 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
5817 TEST_REQUIRES_ARM_NEON_FMA;
5818 for (uint32_t n = 13; n < 24; n++) {
5819 for (size_t k = 1; k <= 20; k += 5) {
5820 GemmMicrokernelTester()
5821 .mr(1)
5822 .nr(12)
5823 .kr(1)
5824 .sr(1)
5825 .m(1)
5826 .n(n)
5827 .k(k)
5828 .a_stride(23)
5829 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5830 }
5831 }
5832 }
5833
5834 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
5835 TEST_REQUIRES_ARM_NEON_FMA;
5836 for (uint32_t n = 13; n < 24; n++) {
5837 for (size_t k = 1; k <= 20; k += 5) {
5838 for (uint32_t m = 1; m <= 1; m++) {
5839 GemmMicrokernelTester()
5840 .mr(1)
5841 .nr(12)
5842 .kr(1)
5843 .sr(1)
5844 .m(m)
5845 .n(n)
5846 .k(k)
5847 .iterations(1)
5848 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5849 }
5850 }
5851 }
5852 }
5853
5854 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
5855 TEST_REQUIRES_ARM_NEON_FMA;
5856 for (uint32_t n = 24; n <= 36; n += 12) {
5857 for (size_t k = 1; k <= 20; k += 5) {
5858 GemmMicrokernelTester()
5859 .mr(1)
5860 .nr(12)
5861 .kr(1)
5862 .sr(1)
5863 .m(1)
5864 .n(12)
5865 .k(k)
5866 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5867 }
5868 }
5869 }
5870
5871 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
5872 TEST_REQUIRES_ARM_NEON_FMA;
5873 for (uint32_t n = 24; n <= 36; n += 12) {
5874 for (size_t k = 1; k <= 20; k += 5) {
5875 GemmMicrokernelTester()
5876 .mr(1)
5877 .nr(12)
5878 .kr(1)
5879 .sr(1)
5880 .m(1)
5881 .n(n)
5882 .k(k)
5883 .cn_stride(17)
5884 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5885 }
5886 }
5887 }
5888
5889 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
5890 TEST_REQUIRES_ARM_NEON_FMA;
5891 for (uint32_t n = 24; n <= 36; n += 12) {
5892 for (size_t k = 1; k <= 20; k += 5) {
5893 GemmMicrokernelTester()
5894 .mr(1)
5895 .nr(12)
5896 .kr(1)
5897 .sr(1)
5898 .m(1)
5899 .n(n)
5900 .k(k)
5901 .a_stride(23)
5902 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5903 }
5904 }
5905 }
5906
5907 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
5908 TEST_REQUIRES_ARM_NEON_FMA;
5909 for (uint32_t n = 24; n <= 36; n += 12) {
5910 for (size_t k = 1; k <= 20; k += 5) {
5911 for (uint32_t m = 1; m <= 1; m++) {
5912 GemmMicrokernelTester()
5913 .mr(1)
5914 .nr(12)
5915 .kr(1)
5916 .sr(1)
5917 .m(m)
5918 .n(n)
5919 .k(k)
5920 .iterations(1)
5921 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5922 }
5923 }
5924 }
5925 }
5926
5927 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
5928 TEST_REQUIRES_ARM_NEON_FMA;
5929 for (size_t k = 1; k <= 20; k += 5) {
5930 for (uint32_t m = 1; m <= 1; m++) {
5931 for (uint32_t n = 1; n <= 12; n++) {
5932 GemmMicrokernelTester()
5933 .mr(1)
5934 .nr(12)
5935 .kr(1)
5936 .sr(1)
5937 .m(m)
5938 .n(n)
5939 .k(k)
5940 .cm_stride(17)
5941 .iterations(1)
5942 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5943 }
5944 }
5945 }
5946 }
5947
5948 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
5949 TEST_REQUIRES_ARM_NEON_FMA;
5950 GemmMicrokernelTester()
5951 .mr(1)
5952 .nr(12)
5953 .kr(1)
5954 .sr(1)
5955 .m(1)
5956 .n(12)
5957 .k(4)
5958 .qmin(128)
5959 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5960 }
5961
5962 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
5963 TEST_REQUIRES_ARM_NEON_FMA;
5964 GemmMicrokernelTester()
5965 .mr(1)
5966 .nr(12)
5967 .kr(1)
5968 .sr(1)
5969 .m(1)
5970 .n(12)
5971 .k(4)
5972 .qmax(128)
5973 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5974 }
5975
5976 TEST(F32_GEMMINC_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
5977 TEST_REQUIRES_ARM_NEON_FMA;
5978 GemmMicrokernelTester()
5979 .mr(1)
5980 .nr(12)
5981 .kr(1)
5982 .sr(1)
5983 .m(1)
5984 .n(12)
5985 .k(4)
5986 .cm_stride(17)
5987 .Test(xnn_f32_gemminc_ukernel_1x12__aarch64_neonfma_cortex_a53);
5988 }
Frank Barchard7e955972019-10-11 10:34:25 -07005989#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005990
5991
Frank Barchard7e955972019-10-11 10:34:25 -07005992#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005993 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
5994 TEST_REQUIRES_ARM_NEON_FMA;
5995 GemmMicrokernelTester()
5996 .mr(4)
5997 .nr(12)
5998 .kr(1)
5999 .sr(1)
6000 .m(4)
6001 .n(12)
6002 .k(4)
6003 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6004 }
6005
6006 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
6007 TEST_REQUIRES_ARM_NEON_FMA;
6008 GemmMicrokernelTester()
6009 .mr(4)
6010 .nr(12)
6011 .kr(1)
6012 .sr(1)
6013 .m(4)
6014 .n(12)
6015 .k(4)
6016 .cn_stride(17)
6017 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6018 }
6019
6020 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
6021 TEST_REQUIRES_ARM_NEON_FMA;
6022 GemmMicrokernelTester()
6023 .mr(4)
6024 .nr(12)
6025 .kr(1)
6026 .sr(1)
6027 .m(4)
6028 .n(12)
6029 .k(4)
6030 .a_stride(7)
6031 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6032 }
6033
6034 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
6035 TEST_REQUIRES_ARM_NEON_FMA;
6036 for (uint32_t m = 1; m <= 4; m++) {
6037 for (uint32_t n = 1; n <= 12; n++) {
6038 GemmMicrokernelTester()
6039 .mr(4)
6040 .nr(12)
6041 .kr(1)
6042 .sr(1)
6043 .m(m)
6044 .n(n)
6045 .k(4)
6046 .iterations(1)
6047 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6048 }
6049 }
6050 }
6051
6052 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
6053 TEST_REQUIRES_ARM_NEON_FMA;
6054 for (uint32_t m = 1; m <= 4; m++) {
6055 GemmMicrokernelTester()
6056 .mr(4)
6057 .nr(12)
6058 .kr(1)
6059 .sr(1)
6060 .m(m)
6061 .n(12)
6062 .k(4)
6063 .iterations(1)
6064 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6065 }
6066 }
6067
6068 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
6069 TEST_REQUIRES_ARM_NEON_FMA;
6070 for (uint32_t n = 1; n <= 12; n++) {
6071 GemmMicrokernelTester()
6072 .mr(4)
6073 .nr(12)
6074 .kr(1)
6075 .sr(1)
6076 .m(4)
6077 .n(n)
6078 .k(4)
6079 .iterations(1)
6080 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6081 }
6082 }
6083
6084 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
6085 TEST_REQUIRES_ARM_NEON_FMA;
6086 GemmMicrokernelTester()
6087 .mr(4)
6088 .nr(12)
6089 .kr(1)
6090 .sr(1)
6091 .m(4)
6092 .n(12)
6093 .k(8)
6094 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6095 }
6096
6097 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
6098 TEST_REQUIRES_ARM_NEON_FMA;
6099 GemmMicrokernelTester()
6100 .mr(4)
6101 .nr(12)
6102 .kr(1)
6103 .sr(1)
6104 .m(4)
6105 .n(12)
6106 .k(8)
6107 .a_stride(11)
6108 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6109 }
6110
6111 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
6112 TEST_REQUIRES_ARM_NEON_FMA;
6113 for (uint32_t m = 1; m <= 4; m++) {
6114 for (uint32_t n = 1; n <= 12; n++) {
6115 GemmMicrokernelTester()
6116 .mr(4)
6117 .nr(12)
6118 .kr(1)
6119 .sr(1)
6120 .m(m)
6121 .n(n)
6122 .k(8)
6123 .iterations(1)
6124 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6125 }
6126 }
6127 }
6128
6129 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
6130 TEST_REQUIRES_ARM_NEON_FMA;
6131 for (size_t k = 1; k < 8; k++) {
6132 GemmMicrokernelTester()
6133 .mr(4)
6134 .nr(12)
6135 .kr(1)
6136 .sr(1)
6137 .m(4)
6138 .n(12)
6139 .k(k)
6140 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6141 }
6142 }
6143
6144 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
6145 TEST_REQUIRES_ARM_NEON_FMA;
6146 for (size_t k = 1; k < 8; k++) {
6147 GemmMicrokernelTester()
6148 .mr(4)
6149 .nr(12)
6150 .kr(1)
6151 .sr(1)
6152 .m(4)
6153 .n(12)
6154 .k(k)
6155 .a_stride(11)
6156 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6157 }
6158 }
6159
6160 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
6161 TEST_REQUIRES_ARM_NEON_FMA;
6162 for (size_t k = 1; k < 8; k++) {
6163 for (uint32_t m = 1; m <= 4; m++) {
6164 for (uint32_t n = 1; n <= 12; n++) {
6165 GemmMicrokernelTester()
6166 .mr(4)
6167 .nr(12)
6168 .kr(1)
6169 .sr(1)
6170 .m(m)
6171 .n(n)
6172 .k(k)
6173 .iterations(1)
6174 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6175 }
6176 }
6177 }
6178 }
6179
6180 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
6181 TEST_REQUIRES_ARM_NEON_FMA;
6182 for (size_t k = 9; k < 8; k++) {
6183 GemmMicrokernelTester()
6184 .mr(4)
6185 .nr(12)
6186 .kr(1)
6187 .sr(1)
6188 .m(4)
6189 .n(12)
6190 .k(k)
6191 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6192 }
6193 }
6194
6195 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
6196 TEST_REQUIRES_ARM_NEON_FMA;
6197 for (size_t k = 9; k < 8; k++) {
6198 GemmMicrokernelTester()
6199 .mr(4)
6200 .nr(12)
6201 .kr(1)
6202 .sr(1)
6203 .m(4)
6204 .n(12)
6205 .k(k)
6206 .a_stride(11)
6207 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6208 }
6209 }
6210
6211 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
6212 TEST_REQUIRES_ARM_NEON_FMA;
6213 for (size_t k = 9; k < 8; k++) {
6214 for (uint32_t m = 1; m <= 4; m++) {
6215 for (uint32_t n = 1; n <= 12; n++) {
6216 GemmMicrokernelTester()
6217 .mr(4)
6218 .nr(12)
6219 .kr(1)
6220 .sr(1)
6221 .m(m)
6222 .n(n)
6223 .k(k)
6224 .iterations(1)
6225 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6226 }
6227 }
6228 }
6229 }
6230
6231 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
6232 TEST_REQUIRES_ARM_NEON_FMA;
6233 for (size_t k = 12; k <= 40; k += 4) {
6234 GemmMicrokernelTester()
6235 .mr(4)
6236 .nr(12)
6237 .kr(1)
6238 .sr(1)
6239 .m(4)
6240 .n(12)
6241 .k(k)
6242 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6243 }
6244 }
6245
6246 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
6247 TEST_REQUIRES_ARM_NEON_FMA;
6248 for (size_t k = 12; k <= 40; k += 4) {
6249 GemmMicrokernelTester()
6250 .mr(4)
6251 .nr(12)
6252 .kr(1)
6253 .sr(1)
6254 .m(4)
6255 .n(12)
6256 .k(k)
6257 .a_stride(43)
6258 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6259 }
6260 }
6261
6262 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
6263 TEST_REQUIRES_ARM_NEON_FMA;
6264 for (size_t k = 12; k <= 40; k += 4) {
6265 for (uint32_t m = 1; m <= 4; m++) {
6266 for (uint32_t n = 1; n <= 12; n++) {
6267 GemmMicrokernelTester()
6268 .mr(4)
6269 .nr(12)
6270 .kr(1)
6271 .sr(1)
6272 .m(m)
6273 .n(n)
6274 .k(k)
6275 .iterations(1)
6276 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6277 }
6278 }
6279 }
6280 }
6281
6282 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
6283 TEST_REQUIRES_ARM_NEON_FMA;
6284 for (uint32_t n = 13; n < 24; n++) {
6285 for (size_t k = 1; k <= 20; k += 5) {
6286 GemmMicrokernelTester()
6287 .mr(4)
6288 .nr(12)
6289 .kr(1)
6290 .sr(1)
6291 .m(4)
6292 .n(12)
6293 .k(k)
6294 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6295 }
6296 }
6297 }
6298
6299 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
6300 TEST_REQUIRES_ARM_NEON_FMA;
6301 for (uint32_t n = 13; n < 24; n++) {
6302 for (size_t k = 1; k <= 20; k += 5) {
6303 GemmMicrokernelTester()
6304 .mr(4)
6305 .nr(12)
6306 .kr(1)
6307 .sr(1)
6308 .m(4)
6309 .n(12)
6310 .k(k)
6311 .cn_stride(17)
6312 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6313 }
6314 }
6315 }
6316
6317 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
6318 TEST_REQUIRES_ARM_NEON_FMA;
6319 for (uint32_t n = 13; n < 24; n++) {
6320 for (size_t k = 1; k <= 20; k += 5) {
6321 GemmMicrokernelTester()
6322 .mr(4)
6323 .nr(12)
6324 .kr(1)
6325 .sr(1)
6326 .m(4)
6327 .n(n)
6328 .k(k)
6329 .a_stride(23)
6330 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6331 }
6332 }
6333 }
6334
6335 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
6336 TEST_REQUIRES_ARM_NEON_FMA;
6337 for (uint32_t n = 13; n < 24; n++) {
6338 for (size_t k = 1; k <= 20; k += 5) {
6339 for (uint32_t m = 1; m <= 4; m++) {
6340 GemmMicrokernelTester()
6341 .mr(4)
6342 .nr(12)
6343 .kr(1)
6344 .sr(1)
6345 .m(m)
6346 .n(n)
6347 .k(k)
6348 .iterations(1)
6349 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6350 }
6351 }
6352 }
6353 }
6354
6355 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
6356 TEST_REQUIRES_ARM_NEON_FMA;
6357 for (uint32_t n = 24; n <= 36; n += 12) {
6358 for (size_t k = 1; k <= 20; k += 5) {
6359 GemmMicrokernelTester()
6360 .mr(4)
6361 .nr(12)
6362 .kr(1)
6363 .sr(1)
6364 .m(4)
6365 .n(12)
6366 .k(k)
6367 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6368 }
6369 }
6370 }
6371
6372 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
6373 TEST_REQUIRES_ARM_NEON_FMA;
6374 for (uint32_t n = 24; n <= 36; n += 12) {
6375 for (size_t k = 1; k <= 20; k += 5) {
6376 GemmMicrokernelTester()
6377 .mr(4)
6378 .nr(12)
6379 .kr(1)
6380 .sr(1)
6381 .m(4)
6382 .n(n)
6383 .k(k)
6384 .cn_stride(17)
6385 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6386 }
6387 }
6388 }
6389
6390 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
6391 TEST_REQUIRES_ARM_NEON_FMA;
6392 for (uint32_t n = 24; n <= 36; n += 12) {
6393 for (size_t k = 1; k <= 20; k += 5) {
6394 GemmMicrokernelTester()
6395 .mr(4)
6396 .nr(12)
6397 .kr(1)
6398 .sr(1)
6399 .m(4)
6400 .n(n)
6401 .k(k)
6402 .a_stride(23)
6403 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6404 }
6405 }
6406 }
6407
6408 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
6409 TEST_REQUIRES_ARM_NEON_FMA;
6410 for (uint32_t n = 24; n <= 36; n += 12) {
6411 for (size_t k = 1; k <= 20; k += 5) {
6412 for (uint32_t m = 1; m <= 4; m++) {
6413 GemmMicrokernelTester()
6414 .mr(4)
6415 .nr(12)
6416 .kr(1)
6417 .sr(1)
6418 .m(m)
6419 .n(n)
6420 .k(k)
6421 .iterations(1)
6422 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6423 }
6424 }
6425 }
6426 }
6427
6428 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
6429 TEST_REQUIRES_ARM_NEON_FMA;
6430 for (size_t k = 1; k <= 20; k += 5) {
6431 for (uint32_t m = 1; m <= 4; m++) {
6432 for (uint32_t n = 1; n <= 12; n++) {
6433 GemmMicrokernelTester()
6434 .mr(4)
6435 .nr(12)
6436 .kr(1)
6437 .sr(1)
6438 .m(m)
6439 .n(n)
6440 .k(k)
6441 .cm_stride(17)
6442 .iterations(1)
6443 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6444 }
6445 }
6446 }
6447 }
6448
6449 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
6450 TEST_REQUIRES_ARM_NEON_FMA;
6451 GemmMicrokernelTester()
6452 .mr(4)
6453 .nr(12)
6454 .kr(1)
6455 .sr(1)
6456 .m(4)
6457 .n(12)
6458 .k(4)
6459 .qmin(128)
6460 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6461 }
6462
6463 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
6464 TEST_REQUIRES_ARM_NEON_FMA;
6465 GemmMicrokernelTester()
6466 .mr(4)
6467 .nr(12)
6468 .kr(1)
6469 .sr(1)
6470 .m(4)
6471 .n(12)
6472 .k(4)
6473 .qmax(128)
6474 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6475 }
6476
6477 TEST(F32_GEMMINC_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
6478 TEST_REQUIRES_ARM_NEON_FMA;
6479 GemmMicrokernelTester()
6480 .mr(4)
6481 .nr(12)
6482 .kr(1)
6483 .sr(1)
6484 .m(4)
6485 .n(12)
6486 .k(4)
6487 .cm_stride(17)
6488 .Test(xnn_f32_gemminc_ukernel_4x12__aarch64_neonfma_cortex_a53);
6489 }
Frank Barchard7e955972019-10-11 10:34:25 -07006490#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006491
6492
Frank Barchard7e955972019-10-11 10:34:25 -07006493#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006494 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2) {
6495 TEST_REQUIRES_ARM_NEON_FMA;
6496 GemmMicrokernelTester()
6497 .mr(4)
6498 .nr(8)
6499 .kr(1)
6500 .sr(1)
6501 .m(4)
6502 .n(8)
6503 .k(2)
6504 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6505 }
6506
6507 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, strided_cn) {
6508 TEST_REQUIRES_ARM_NEON_FMA;
6509 GemmMicrokernelTester()
6510 .mr(4)
6511 .nr(8)
6512 .kr(1)
6513 .sr(1)
6514 .m(4)
6515 .n(8)
6516 .k(2)
6517 .cn_stride(11)
6518 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6519 }
6520
6521 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
6522 TEST_REQUIRES_ARM_NEON_FMA;
6523 GemmMicrokernelTester()
6524 .mr(4)
6525 .nr(8)
6526 .kr(1)
6527 .sr(1)
6528 .m(4)
6529 .n(8)
6530 .k(2)
6531 .a_stride(5)
6532 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6533 }
6534
6535 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
6536 TEST_REQUIRES_ARM_NEON_FMA;
6537 for (uint32_t m = 1; m <= 4; m++) {
6538 for (uint32_t n = 1; n <= 8; n++) {
6539 GemmMicrokernelTester()
6540 .mr(4)
6541 .nr(8)
6542 .kr(1)
6543 .sr(1)
6544 .m(m)
6545 .n(n)
6546 .k(2)
6547 .iterations(1)
6548 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6549 }
6550 }
6551 }
6552
6553 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
6554 TEST_REQUIRES_ARM_NEON_FMA;
6555 for (uint32_t m = 1; m <= 4; m++) {
6556 GemmMicrokernelTester()
6557 .mr(4)
6558 .nr(8)
6559 .kr(1)
6560 .sr(1)
6561 .m(m)
6562 .n(8)
6563 .k(2)
6564 .iterations(1)
6565 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6566 }
6567 }
6568
6569 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
6570 TEST_REQUIRES_ARM_NEON_FMA;
6571 for (uint32_t n = 1; n <= 8; n++) {
6572 GemmMicrokernelTester()
6573 .mr(4)
6574 .nr(8)
6575 .kr(1)
6576 .sr(1)
6577 .m(4)
6578 .n(n)
6579 .k(2)
6580 .iterations(1)
6581 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6582 }
6583 }
6584
6585 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_lt_2) {
6586 TEST_REQUIRES_ARM_NEON_FMA;
6587 for (size_t k = 1; k < 2; k++) {
6588 GemmMicrokernelTester()
6589 .mr(4)
6590 .nr(8)
6591 .kr(1)
6592 .sr(1)
6593 .m(4)
6594 .n(8)
6595 .k(k)
6596 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6597 }
6598 }
6599
6600 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
6601 TEST_REQUIRES_ARM_NEON_FMA;
6602 for (size_t k = 1; k < 2; k++) {
6603 GemmMicrokernelTester()
6604 .mr(4)
6605 .nr(8)
6606 .kr(1)
6607 .sr(1)
6608 .m(4)
6609 .n(8)
6610 .k(k)
6611 .a_stride(5)
6612 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6613 }
6614 }
6615
6616 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
6617 TEST_REQUIRES_ARM_NEON_FMA;
6618 for (size_t k = 1; k < 2; k++) {
6619 for (uint32_t m = 1; m <= 4; m++) {
6620 for (uint32_t n = 1; n <= 8; n++) {
6621 GemmMicrokernelTester()
6622 .mr(4)
6623 .nr(8)
6624 .kr(1)
6625 .sr(1)
6626 .m(m)
6627 .n(n)
6628 .k(k)
6629 .iterations(1)
6630 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6631 }
6632 }
6633 }
6634 }
6635
6636 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_gt_2) {
6637 TEST_REQUIRES_ARM_NEON_FMA;
6638 for (size_t k = 3; k < 4; k++) {
6639 GemmMicrokernelTester()
6640 .mr(4)
6641 .nr(8)
6642 .kr(1)
6643 .sr(1)
6644 .m(4)
6645 .n(8)
6646 .k(k)
6647 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6648 }
6649 }
6650
6651 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
6652 TEST_REQUIRES_ARM_NEON_FMA;
6653 for (size_t k = 3; k < 4; k++) {
6654 GemmMicrokernelTester()
6655 .mr(4)
6656 .nr(8)
6657 .kr(1)
6658 .sr(1)
6659 .m(4)
6660 .n(8)
6661 .k(k)
6662 .a_stride(7)
6663 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6664 }
6665 }
6666
6667 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
6668 TEST_REQUIRES_ARM_NEON_FMA;
6669 for (size_t k = 3; k < 4; k++) {
6670 for (uint32_t m = 1; m <= 4; m++) {
6671 for (uint32_t n = 1; n <= 8; n++) {
6672 GemmMicrokernelTester()
6673 .mr(4)
6674 .nr(8)
6675 .kr(1)
6676 .sr(1)
6677 .m(m)
6678 .n(n)
6679 .k(k)
6680 .iterations(1)
6681 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6682 }
6683 }
6684 }
6685 }
6686
6687 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_div_2) {
6688 TEST_REQUIRES_ARM_NEON_FMA;
6689 for (size_t k = 4; k <= 20; k += 2) {
6690 GemmMicrokernelTester()
6691 .mr(4)
6692 .nr(8)
6693 .kr(1)
6694 .sr(1)
6695 .m(4)
6696 .n(8)
6697 .k(k)
6698 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6699 }
6700 }
6701
6702 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
6703 TEST_REQUIRES_ARM_NEON_FMA;
6704 for (size_t k = 4; k <= 20; k += 2) {
6705 GemmMicrokernelTester()
6706 .mr(4)
6707 .nr(8)
6708 .kr(1)
6709 .sr(1)
6710 .m(4)
6711 .n(8)
6712 .k(k)
6713 .a_stride(23)
6714 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6715 }
6716 }
6717
6718 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
6719 TEST_REQUIRES_ARM_NEON_FMA;
6720 for (size_t k = 4; k <= 20; k += 2) {
6721 for (uint32_t m = 1; m <= 4; m++) {
6722 for (uint32_t n = 1; n <= 8; n++) {
6723 GemmMicrokernelTester()
6724 .mr(4)
6725 .nr(8)
6726 .kr(1)
6727 .sr(1)
6728 .m(m)
6729 .n(n)
6730 .k(k)
6731 .iterations(1)
6732 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6733 }
6734 }
6735 }
6736 }
6737
6738 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8) {
6739 TEST_REQUIRES_ARM_NEON_FMA;
6740 for (uint32_t n = 9; n < 16; n++) {
6741 for (size_t k = 1; k <= 10; k += 3) {
6742 GemmMicrokernelTester()
6743 .mr(4)
6744 .nr(8)
6745 .kr(1)
6746 .sr(1)
6747 .m(4)
6748 .n(8)
6749 .k(k)
6750 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6751 }
6752 }
6753 }
6754
6755 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
6756 TEST_REQUIRES_ARM_NEON_FMA;
6757 for (uint32_t n = 9; n < 16; n++) {
6758 for (size_t k = 1; k <= 10; k += 3) {
6759 GemmMicrokernelTester()
6760 .mr(4)
6761 .nr(8)
6762 .kr(1)
6763 .sr(1)
6764 .m(4)
6765 .n(8)
6766 .k(k)
6767 .cn_stride(11)
6768 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6769 }
6770 }
6771 }
6772
6773 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
6774 TEST_REQUIRES_ARM_NEON_FMA;
6775 for (uint32_t n = 9; n < 16; n++) {
6776 for (size_t k = 1; k <= 10; k += 3) {
6777 GemmMicrokernelTester()
6778 .mr(4)
6779 .nr(8)
6780 .kr(1)
6781 .sr(1)
6782 .m(4)
6783 .n(n)
6784 .k(k)
6785 .a_stride(13)
6786 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6787 }
6788 }
6789 }
6790
6791 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
6792 TEST_REQUIRES_ARM_NEON_FMA;
6793 for (uint32_t n = 9; n < 16; n++) {
6794 for (size_t k = 1; k <= 10; k += 3) {
6795 for (uint32_t m = 1; m <= 4; m++) {
6796 GemmMicrokernelTester()
6797 .mr(4)
6798 .nr(8)
6799 .kr(1)
6800 .sr(1)
6801 .m(m)
6802 .n(n)
6803 .k(k)
6804 .iterations(1)
6805 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6806 }
6807 }
6808 }
6809 }
6810
6811 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8) {
6812 TEST_REQUIRES_ARM_NEON_FMA;
6813 for (uint32_t n = 16; n <= 24; n += 8) {
6814 for (size_t k = 1; k <= 10; k += 3) {
6815 GemmMicrokernelTester()
6816 .mr(4)
6817 .nr(8)
6818 .kr(1)
6819 .sr(1)
6820 .m(4)
6821 .n(8)
6822 .k(k)
6823 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6824 }
6825 }
6826 }
6827
6828 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
6829 TEST_REQUIRES_ARM_NEON_FMA;
6830 for (uint32_t n = 16; n <= 24; n += 8) {
6831 for (size_t k = 1; k <= 10; k += 3) {
6832 GemmMicrokernelTester()
6833 .mr(4)
6834 .nr(8)
6835 .kr(1)
6836 .sr(1)
6837 .m(4)
6838 .n(n)
6839 .k(k)
6840 .cn_stride(11)
6841 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6842 }
6843 }
6844 }
6845
6846 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
6847 TEST_REQUIRES_ARM_NEON_FMA;
6848 for (uint32_t n = 16; n <= 24; n += 8) {
6849 for (size_t k = 1; k <= 10; k += 3) {
6850 GemmMicrokernelTester()
6851 .mr(4)
6852 .nr(8)
6853 .kr(1)
6854 .sr(1)
6855 .m(4)
6856 .n(n)
6857 .k(k)
6858 .a_stride(13)
6859 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6860 }
6861 }
6862 }
6863
6864 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
6865 TEST_REQUIRES_ARM_NEON_FMA;
6866 for (uint32_t n = 16; n <= 24; n += 8) {
6867 for (size_t k = 1; k <= 10; k += 3) {
6868 for (uint32_t m = 1; m <= 4; m++) {
6869 GemmMicrokernelTester()
6870 .mr(4)
6871 .nr(8)
6872 .kr(1)
6873 .sr(1)
6874 .m(m)
6875 .n(n)
6876 .k(k)
6877 .iterations(1)
6878 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6879 }
6880 }
6881 }
6882 }
6883
6884 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
6885 TEST_REQUIRES_ARM_NEON_FMA;
6886 for (size_t k = 1; k <= 10; k += 3) {
6887 for (uint32_t m = 1; m <= 4; m++) {
6888 for (uint32_t n = 1; n <= 8; n++) {
6889 GemmMicrokernelTester()
6890 .mr(4)
6891 .nr(8)
6892 .kr(1)
6893 .sr(1)
6894 .m(m)
6895 .n(n)
6896 .k(k)
6897 .cm_stride(11)
6898 .iterations(1)
6899 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6900 }
6901 }
6902 }
6903 }
6904
6905 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, qmin) {
6906 TEST_REQUIRES_ARM_NEON_FMA;
6907 GemmMicrokernelTester()
6908 .mr(4)
6909 .nr(8)
6910 .kr(1)
6911 .sr(1)
6912 .m(4)
6913 .n(8)
6914 .k(2)
6915 .qmin(128)
6916 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6917 }
6918
6919 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, qmax) {
6920 TEST_REQUIRES_ARM_NEON_FMA;
6921 GemmMicrokernelTester()
6922 .mr(4)
6923 .nr(8)
6924 .kr(1)
6925 .sr(1)
6926 .m(4)
6927 .n(8)
6928 .k(2)
6929 .qmax(128)
6930 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6931 }
6932
6933 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD64, strided_cm) {
6934 TEST_REQUIRES_ARM_NEON_FMA;
6935 GemmMicrokernelTester()
6936 .mr(4)
6937 .nr(8)
6938 .kr(1)
6939 .sr(1)
6940 .m(4)
6941 .n(8)
6942 .k(2)
6943 .cm_stride(11)
6944 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld64);
6945 }
Frank Barchard7e955972019-10-11 10:34:25 -07006946#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006947
6948
Frank Barchard7e955972019-10-11 10:34:25 -07006949#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006950 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
6951 TEST_REQUIRES_ARM_NEON_FMA;
6952 GemmMicrokernelTester()
6953 .mr(4)
6954 .nr(8)
6955 .kr(1)
6956 .sr(1)
6957 .m(4)
6958 .n(8)
6959 .k(4)
6960 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
6961 }
6962
6963 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, strided_cn) {
6964 TEST_REQUIRES_ARM_NEON_FMA;
6965 GemmMicrokernelTester()
6966 .mr(4)
6967 .nr(8)
6968 .kr(1)
6969 .sr(1)
6970 .m(4)
6971 .n(8)
6972 .k(4)
6973 .cn_stride(11)
6974 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
6975 }
6976
6977 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
6978 TEST_REQUIRES_ARM_NEON_FMA;
6979 GemmMicrokernelTester()
6980 .mr(4)
6981 .nr(8)
6982 .kr(1)
6983 .sr(1)
6984 .m(4)
6985 .n(8)
6986 .k(4)
6987 .a_stride(7)
6988 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
6989 }
6990
6991 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
6992 TEST_REQUIRES_ARM_NEON_FMA;
6993 for (uint32_t m = 1; m <= 4; m++) {
6994 for (uint32_t n = 1; n <= 8; n++) {
6995 GemmMicrokernelTester()
6996 .mr(4)
6997 .nr(8)
6998 .kr(1)
6999 .sr(1)
7000 .m(m)
7001 .n(n)
7002 .k(4)
7003 .iterations(1)
7004 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7005 }
7006 }
7007 }
7008
7009 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
7010 TEST_REQUIRES_ARM_NEON_FMA;
7011 for (uint32_t m = 1; m <= 4; m++) {
7012 GemmMicrokernelTester()
7013 .mr(4)
7014 .nr(8)
7015 .kr(1)
7016 .sr(1)
7017 .m(m)
7018 .n(8)
7019 .k(4)
7020 .iterations(1)
7021 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7022 }
7023 }
7024
7025 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
7026 TEST_REQUIRES_ARM_NEON_FMA;
7027 for (uint32_t n = 1; n <= 8; n++) {
7028 GemmMicrokernelTester()
7029 .mr(4)
7030 .nr(8)
7031 .kr(1)
7032 .sr(1)
7033 .m(4)
7034 .n(n)
7035 .k(4)
7036 .iterations(1)
7037 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7038 }
7039 }
7040
7041 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_lt_4) {
7042 TEST_REQUIRES_ARM_NEON_FMA;
7043 for (size_t k = 1; k < 4; k++) {
7044 GemmMicrokernelTester()
7045 .mr(4)
7046 .nr(8)
7047 .kr(1)
7048 .sr(1)
7049 .m(4)
7050 .n(8)
7051 .k(k)
7052 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7053 }
7054 }
7055
7056 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
7057 TEST_REQUIRES_ARM_NEON_FMA;
7058 for (size_t k = 1; k < 4; k++) {
7059 GemmMicrokernelTester()
7060 .mr(4)
7061 .nr(8)
7062 .kr(1)
7063 .sr(1)
7064 .m(4)
7065 .n(8)
7066 .k(k)
7067 .a_stride(7)
7068 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7069 }
7070 }
7071
7072 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
7073 TEST_REQUIRES_ARM_NEON_FMA;
7074 for (size_t k = 1; k < 4; k++) {
7075 for (uint32_t m = 1; m <= 4; m++) {
7076 for (uint32_t n = 1; n <= 8; n++) {
7077 GemmMicrokernelTester()
7078 .mr(4)
7079 .nr(8)
7080 .kr(1)
7081 .sr(1)
7082 .m(m)
7083 .n(n)
7084 .k(k)
7085 .iterations(1)
7086 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7087 }
7088 }
7089 }
7090 }
7091
7092 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_gt_4) {
7093 TEST_REQUIRES_ARM_NEON_FMA;
7094 for (size_t k = 5; k < 8; k++) {
7095 GemmMicrokernelTester()
7096 .mr(4)
7097 .nr(8)
7098 .kr(1)
7099 .sr(1)
7100 .m(4)
7101 .n(8)
7102 .k(k)
7103 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7104 }
7105 }
7106
7107 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
7108 TEST_REQUIRES_ARM_NEON_FMA;
7109 for (size_t k = 5; k < 8; k++) {
7110 GemmMicrokernelTester()
7111 .mr(4)
7112 .nr(8)
7113 .kr(1)
7114 .sr(1)
7115 .m(4)
7116 .n(8)
7117 .k(k)
7118 .a_stride(11)
7119 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7120 }
7121 }
7122
7123 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
7124 TEST_REQUIRES_ARM_NEON_FMA;
7125 for (size_t k = 5; k < 8; k++) {
7126 for (uint32_t m = 1; m <= 4; m++) {
7127 for (uint32_t n = 1; n <= 8; n++) {
7128 GemmMicrokernelTester()
7129 .mr(4)
7130 .nr(8)
7131 .kr(1)
7132 .sr(1)
7133 .m(m)
7134 .n(n)
7135 .k(k)
7136 .iterations(1)
7137 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7138 }
7139 }
7140 }
7141 }
7142
7143 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_div_4) {
7144 TEST_REQUIRES_ARM_NEON_FMA;
7145 for (size_t k = 8; k <= 40; k += 4) {
7146 GemmMicrokernelTester()
7147 .mr(4)
7148 .nr(8)
7149 .kr(1)
7150 .sr(1)
7151 .m(4)
7152 .n(8)
7153 .k(k)
7154 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7155 }
7156 }
7157
7158 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
7159 TEST_REQUIRES_ARM_NEON_FMA;
7160 for (size_t k = 8; k <= 40; k += 4) {
7161 GemmMicrokernelTester()
7162 .mr(4)
7163 .nr(8)
7164 .kr(1)
7165 .sr(1)
7166 .m(4)
7167 .n(8)
7168 .k(k)
7169 .a_stride(43)
7170 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7171 }
7172 }
7173
7174 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
7175 TEST_REQUIRES_ARM_NEON_FMA;
7176 for (size_t k = 8; k <= 40; k += 4) {
7177 for (uint32_t m = 1; m <= 4; m++) {
7178 for (uint32_t n = 1; n <= 8; n++) {
7179 GemmMicrokernelTester()
7180 .mr(4)
7181 .nr(8)
7182 .kr(1)
7183 .sr(1)
7184 .m(m)
7185 .n(n)
7186 .k(k)
7187 .iterations(1)
7188 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7189 }
7190 }
7191 }
7192 }
7193
7194 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8) {
7195 TEST_REQUIRES_ARM_NEON_FMA;
7196 for (uint32_t n = 9; n < 16; n++) {
7197 for (size_t k = 1; k <= 20; k += 5) {
7198 GemmMicrokernelTester()
7199 .mr(4)
7200 .nr(8)
7201 .kr(1)
7202 .sr(1)
7203 .m(4)
7204 .n(8)
7205 .k(k)
7206 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7207 }
7208 }
7209 }
7210
7211 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
7212 TEST_REQUIRES_ARM_NEON_FMA;
7213 for (uint32_t n = 9; n < 16; n++) {
7214 for (size_t k = 1; k <= 20; k += 5) {
7215 GemmMicrokernelTester()
7216 .mr(4)
7217 .nr(8)
7218 .kr(1)
7219 .sr(1)
7220 .m(4)
7221 .n(8)
7222 .k(k)
7223 .cn_stride(11)
7224 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7225 }
7226 }
7227 }
7228
7229 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
7230 TEST_REQUIRES_ARM_NEON_FMA;
7231 for (uint32_t n = 9; n < 16; n++) {
7232 for (size_t k = 1; k <= 20; k += 5) {
7233 GemmMicrokernelTester()
7234 .mr(4)
7235 .nr(8)
7236 .kr(1)
7237 .sr(1)
7238 .m(4)
7239 .n(n)
7240 .k(k)
7241 .a_stride(23)
7242 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7243 }
7244 }
7245 }
7246
7247 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
7248 TEST_REQUIRES_ARM_NEON_FMA;
7249 for (uint32_t n = 9; n < 16; n++) {
7250 for (size_t k = 1; k <= 20; k += 5) {
7251 for (uint32_t m = 1; m <= 4; m++) {
7252 GemmMicrokernelTester()
7253 .mr(4)
7254 .nr(8)
7255 .kr(1)
7256 .sr(1)
7257 .m(m)
7258 .n(n)
7259 .k(k)
7260 .iterations(1)
7261 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7262 }
7263 }
7264 }
7265 }
7266
7267 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8) {
7268 TEST_REQUIRES_ARM_NEON_FMA;
7269 for (uint32_t n = 16; n <= 24; n += 8) {
7270 for (size_t k = 1; k <= 20; k += 5) {
7271 GemmMicrokernelTester()
7272 .mr(4)
7273 .nr(8)
7274 .kr(1)
7275 .sr(1)
7276 .m(4)
7277 .n(8)
7278 .k(k)
7279 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7280 }
7281 }
7282 }
7283
7284 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
7285 TEST_REQUIRES_ARM_NEON_FMA;
7286 for (uint32_t n = 16; n <= 24; n += 8) {
7287 for (size_t k = 1; k <= 20; k += 5) {
7288 GemmMicrokernelTester()
7289 .mr(4)
7290 .nr(8)
7291 .kr(1)
7292 .sr(1)
7293 .m(4)
7294 .n(n)
7295 .k(k)
7296 .cn_stride(11)
7297 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7298 }
7299 }
7300 }
7301
7302 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
7303 TEST_REQUIRES_ARM_NEON_FMA;
7304 for (uint32_t n = 16; n <= 24; n += 8) {
7305 for (size_t k = 1; k <= 20; k += 5) {
7306 GemmMicrokernelTester()
7307 .mr(4)
7308 .nr(8)
7309 .kr(1)
7310 .sr(1)
7311 .m(4)
7312 .n(n)
7313 .k(k)
7314 .a_stride(23)
7315 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7316 }
7317 }
7318 }
7319
7320 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
7321 TEST_REQUIRES_ARM_NEON_FMA;
7322 for (uint32_t n = 16; n <= 24; n += 8) {
7323 for (size_t k = 1; k <= 20; k += 5) {
7324 for (uint32_t m = 1; m <= 4; m++) {
7325 GemmMicrokernelTester()
7326 .mr(4)
7327 .nr(8)
7328 .kr(1)
7329 .sr(1)
7330 .m(m)
7331 .n(n)
7332 .k(k)
7333 .iterations(1)
7334 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7335 }
7336 }
7337 }
7338 }
7339
7340 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
7341 TEST_REQUIRES_ARM_NEON_FMA;
7342 for (size_t k = 1; k <= 20; k += 5) {
7343 for (uint32_t m = 1; m <= 4; m++) {
7344 for (uint32_t n = 1; n <= 8; n++) {
7345 GemmMicrokernelTester()
7346 .mr(4)
7347 .nr(8)
7348 .kr(1)
7349 .sr(1)
7350 .m(m)
7351 .n(n)
7352 .k(k)
7353 .cm_stride(11)
7354 .iterations(1)
7355 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7356 }
7357 }
7358 }
7359 }
7360
7361 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, qmin) {
7362 TEST_REQUIRES_ARM_NEON_FMA;
7363 GemmMicrokernelTester()
7364 .mr(4)
7365 .nr(8)
7366 .kr(1)
7367 .sr(1)
7368 .m(4)
7369 .n(8)
7370 .k(4)
7371 .qmin(128)
7372 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7373 }
7374
7375 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, qmax) {
7376 TEST_REQUIRES_ARM_NEON_FMA;
7377 GemmMicrokernelTester()
7378 .mr(4)
7379 .nr(8)
7380 .kr(1)
7381 .sr(1)
7382 .m(4)
7383 .n(8)
7384 .k(4)
7385 .qmax(128)
7386 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7387 }
7388
7389 TEST(F32_GEMMINC_4X8__AARCH64_NEONFMA_LD128, strided_cm) {
7390 TEST_REQUIRES_ARM_NEON_FMA;
7391 GemmMicrokernelTester()
7392 .mr(4)
7393 .nr(8)
7394 .kr(1)
7395 .sr(1)
7396 .m(4)
7397 .n(8)
7398 .k(4)
7399 .cm_stride(11)
7400 .Test(xnn_f32_gemminc_ukernel_4x8__aarch64_neonfma_ld128);
7401 }
Frank Barchard7e955972019-10-11 10:34:25 -07007402#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007403
7404
Frank Barchard7e955972019-10-11 10:34:25 -07007405#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007406 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
7407 TEST_REQUIRES_ARM_NEON_FMA;
7408 GemmMicrokernelTester()
7409 .mr(6)
7410 .nr(8)
7411 .kr(1)
7412 .sr(1)
7413 .m(6)
7414 .n(8)
7415 .k(2)
7416 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7417 }
7418
7419 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, strided_cn) {
7420 TEST_REQUIRES_ARM_NEON_FMA;
7421 GemmMicrokernelTester()
7422 .mr(6)
7423 .nr(8)
7424 .kr(1)
7425 .sr(1)
7426 .m(6)
7427 .n(8)
7428 .k(2)
7429 .cn_stride(11)
7430 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7431 }
7432
7433 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
7434 TEST_REQUIRES_ARM_NEON_FMA;
7435 GemmMicrokernelTester()
7436 .mr(6)
7437 .nr(8)
7438 .kr(1)
7439 .sr(1)
7440 .m(6)
7441 .n(8)
7442 .k(2)
7443 .a_stride(5)
7444 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7445 }
7446
7447 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
7448 TEST_REQUIRES_ARM_NEON_FMA;
7449 for (uint32_t m = 1; m <= 6; m++) {
7450 for (uint32_t n = 1; n <= 8; n++) {
7451 GemmMicrokernelTester()
7452 .mr(6)
7453 .nr(8)
7454 .kr(1)
7455 .sr(1)
7456 .m(m)
7457 .n(n)
7458 .k(2)
7459 .iterations(1)
7460 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7461 }
7462 }
7463 }
7464
7465 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
7466 TEST_REQUIRES_ARM_NEON_FMA;
7467 for (uint32_t m = 1; m <= 6; m++) {
7468 GemmMicrokernelTester()
7469 .mr(6)
7470 .nr(8)
7471 .kr(1)
7472 .sr(1)
7473 .m(m)
7474 .n(8)
7475 .k(2)
7476 .iterations(1)
7477 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7478 }
7479 }
7480
7481 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
7482 TEST_REQUIRES_ARM_NEON_FMA;
7483 for (uint32_t n = 1; n <= 8; n++) {
7484 GemmMicrokernelTester()
7485 .mr(6)
7486 .nr(8)
7487 .kr(1)
7488 .sr(1)
7489 .m(6)
7490 .n(n)
7491 .k(2)
7492 .iterations(1)
7493 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7494 }
7495 }
7496
7497 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_lt_2) {
7498 TEST_REQUIRES_ARM_NEON_FMA;
7499 for (size_t k = 1; k < 2; k++) {
7500 GemmMicrokernelTester()
7501 .mr(6)
7502 .nr(8)
7503 .kr(1)
7504 .sr(1)
7505 .m(6)
7506 .n(8)
7507 .k(k)
7508 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7509 }
7510 }
7511
7512 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
7513 TEST_REQUIRES_ARM_NEON_FMA;
7514 for (size_t k = 1; k < 2; k++) {
7515 GemmMicrokernelTester()
7516 .mr(6)
7517 .nr(8)
7518 .kr(1)
7519 .sr(1)
7520 .m(6)
7521 .n(8)
7522 .k(k)
7523 .a_stride(5)
7524 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7525 }
7526 }
7527
7528 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
7529 TEST_REQUIRES_ARM_NEON_FMA;
7530 for (size_t k = 1; k < 2; k++) {
7531 for (uint32_t m = 1; m <= 6; m++) {
7532 for (uint32_t n = 1; n <= 8; n++) {
7533 GemmMicrokernelTester()
7534 .mr(6)
7535 .nr(8)
7536 .kr(1)
7537 .sr(1)
7538 .m(m)
7539 .n(n)
7540 .k(k)
7541 .iterations(1)
7542 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7543 }
7544 }
7545 }
7546 }
7547
7548 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_gt_2) {
7549 TEST_REQUIRES_ARM_NEON_FMA;
7550 for (size_t k = 3; k < 4; k++) {
7551 GemmMicrokernelTester()
7552 .mr(6)
7553 .nr(8)
7554 .kr(1)
7555 .sr(1)
7556 .m(6)
7557 .n(8)
7558 .k(k)
7559 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7560 }
7561 }
7562
7563 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
7564 TEST_REQUIRES_ARM_NEON_FMA;
7565 for (size_t k = 3; k < 4; k++) {
7566 GemmMicrokernelTester()
7567 .mr(6)
7568 .nr(8)
7569 .kr(1)
7570 .sr(1)
7571 .m(6)
7572 .n(8)
7573 .k(k)
7574 .a_stride(7)
7575 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7576 }
7577 }
7578
7579 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
7580 TEST_REQUIRES_ARM_NEON_FMA;
7581 for (size_t k = 3; k < 4; k++) {
7582 for (uint32_t m = 1; m <= 6; m++) {
7583 for (uint32_t n = 1; n <= 8; n++) {
7584 GemmMicrokernelTester()
7585 .mr(6)
7586 .nr(8)
7587 .kr(1)
7588 .sr(1)
7589 .m(m)
7590 .n(n)
7591 .k(k)
7592 .iterations(1)
7593 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7594 }
7595 }
7596 }
7597 }
7598
7599 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_div_2) {
7600 TEST_REQUIRES_ARM_NEON_FMA;
7601 for (size_t k = 4; k <= 20; k += 2) {
7602 GemmMicrokernelTester()
7603 .mr(6)
7604 .nr(8)
7605 .kr(1)
7606 .sr(1)
7607 .m(6)
7608 .n(8)
7609 .k(k)
7610 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7611 }
7612 }
7613
7614 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
7615 TEST_REQUIRES_ARM_NEON_FMA;
7616 for (size_t k = 4; k <= 20; k += 2) {
7617 GemmMicrokernelTester()
7618 .mr(6)
7619 .nr(8)
7620 .kr(1)
7621 .sr(1)
7622 .m(6)
7623 .n(8)
7624 .k(k)
7625 .a_stride(23)
7626 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7627 }
7628 }
7629
7630 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
7631 TEST_REQUIRES_ARM_NEON_FMA;
7632 for (size_t k = 4; k <= 20; k += 2) {
7633 for (uint32_t m = 1; m <= 6; m++) {
7634 for (uint32_t n = 1; n <= 8; n++) {
7635 GemmMicrokernelTester()
7636 .mr(6)
7637 .nr(8)
7638 .kr(1)
7639 .sr(1)
7640 .m(m)
7641 .n(n)
7642 .k(k)
7643 .iterations(1)
7644 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7645 }
7646 }
7647 }
7648 }
7649
7650 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8) {
7651 TEST_REQUIRES_ARM_NEON_FMA;
7652 for (uint32_t n = 9; n < 16; n++) {
7653 for (size_t k = 1; k <= 10; k += 3) {
7654 GemmMicrokernelTester()
7655 .mr(6)
7656 .nr(8)
7657 .kr(1)
7658 .sr(1)
7659 .m(6)
7660 .n(8)
7661 .k(k)
7662 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7663 }
7664 }
7665 }
7666
7667 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
7668 TEST_REQUIRES_ARM_NEON_FMA;
7669 for (uint32_t n = 9; n < 16; n++) {
7670 for (size_t k = 1; k <= 10; k += 3) {
7671 GemmMicrokernelTester()
7672 .mr(6)
7673 .nr(8)
7674 .kr(1)
7675 .sr(1)
7676 .m(6)
7677 .n(8)
7678 .k(k)
7679 .cn_stride(11)
7680 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7681 }
7682 }
7683 }
7684
7685 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
7686 TEST_REQUIRES_ARM_NEON_FMA;
7687 for (uint32_t n = 9; n < 16; n++) {
7688 for (size_t k = 1; k <= 10; k += 3) {
7689 GemmMicrokernelTester()
7690 .mr(6)
7691 .nr(8)
7692 .kr(1)
7693 .sr(1)
7694 .m(6)
7695 .n(n)
7696 .k(k)
7697 .a_stride(13)
7698 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7699 }
7700 }
7701 }
7702
7703 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
7704 TEST_REQUIRES_ARM_NEON_FMA;
7705 for (uint32_t n = 9; n < 16; n++) {
7706 for (size_t k = 1; k <= 10; k += 3) {
7707 for (uint32_t m = 1; m <= 6; m++) {
7708 GemmMicrokernelTester()
7709 .mr(6)
7710 .nr(8)
7711 .kr(1)
7712 .sr(1)
7713 .m(m)
7714 .n(n)
7715 .k(k)
7716 .iterations(1)
7717 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7718 }
7719 }
7720 }
7721 }
7722
7723 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8) {
7724 TEST_REQUIRES_ARM_NEON_FMA;
7725 for (uint32_t n = 16; n <= 24; n += 8) {
7726 for (size_t k = 1; k <= 10; k += 3) {
7727 GemmMicrokernelTester()
7728 .mr(6)
7729 .nr(8)
7730 .kr(1)
7731 .sr(1)
7732 .m(6)
7733 .n(8)
7734 .k(k)
7735 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7736 }
7737 }
7738 }
7739
7740 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
7741 TEST_REQUIRES_ARM_NEON_FMA;
7742 for (uint32_t n = 16; n <= 24; n += 8) {
7743 for (size_t k = 1; k <= 10; k += 3) {
7744 GemmMicrokernelTester()
7745 .mr(6)
7746 .nr(8)
7747 .kr(1)
7748 .sr(1)
7749 .m(6)
7750 .n(n)
7751 .k(k)
7752 .cn_stride(11)
7753 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7754 }
7755 }
7756 }
7757
7758 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
7759 TEST_REQUIRES_ARM_NEON_FMA;
7760 for (uint32_t n = 16; n <= 24; n += 8) {
7761 for (size_t k = 1; k <= 10; k += 3) {
7762 GemmMicrokernelTester()
7763 .mr(6)
7764 .nr(8)
7765 .kr(1)
7766 .sr(1)
7767 .m(6)
7768 .n(n)
7769 .k(k)
7770 .a_stride(13)
7771 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7772 }
7773 }
7774 }
7775
7776 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
7777 TEST_REQUIRES_ARM_NEON_FMA;
7778 for (uint32_t n = 16; n <= 24; n += 8) {
7779 for (size_t k = 1; k <= 10; k += 3) {
7780 for (uint32_t m = 1; m <= 6; m++) {
7781 GemmMicrokernelTester()
7782 .mr(6)
7783 .nr(8)
7784 .kr(1)
7785 .sr(1)
7786 .m(m)
7787 .n(n)
7788 .k(k)
7789 .iterations(1)
7790 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7791 }
7792 }
7793 }
7794 }
7795
7796 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
7797 TEST_REQUIRES_ARM_NEON_FMA;
7798 for (size_t k = 1; k <= 10; k += 3) {
7799 for (uint32_t m = 1; m <= 6; m++) {
7800 for (uint32_t n = 1; n <= 8; n++) {
7801 GemmMicrokernelTester()
7802 .mr(6)
7803 .nr(8)
7804 .kr(1)
7805 .sr(1)
7806 .m(m)
7807 .n(n)
7808 .k(k)
7809 .cm_stride(11)
7810 .iterations(1)
7811 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7812 }
7813 }
7814 }
7815 }
7816
7817 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, qmin) {
7818 TEST_REQUIRES_ARM_NEON_FMA;
7819 GemmMicrokernelTester()
7820 .mr(6)
7821 .nr(8)
7822 .kr(1)
7823 .sr(1)
7824 .m(6)
7825 .n(8)
7826 .k(2)
7827 .qmin(128)
7828 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7829 }
7830
7831 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, qmax) {
7832 TEST_REQUIRES_ARM_NEON_FMA;
7833 GemmMicrokernelTester()
7834 .mr(6)
7835 .nr(8)
7836 .kr(1)
7837 .sr(1)
7838 .m(6)
7839 .n(8)
7840 .k(2)
7841 .qmax(128)
7842 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7843 }
7844
7845 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD64, strided_cm) {
7846 TEST_REQUIRES_ARM_NEON_FMA;
7847 GemmMicrokernelTester()
7848 .mr(6)
7849 .nr(8)
7850 .kr(1)
7851 .sr(1)
7852 .m(6)
7853 .n(8)
7854 .k(2)
7855 .cm_stride(11)
7856 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld64);
7857 }
Frank Barchard7e955972019-10-11 10:34:25 -07007858#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007859
7860
Frank Barchard7e955972019-10-11 10:34:25 -07007861#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007862 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
7863 TEST_REQUIRES_ARM_NEON_FMA;
7864 GemmMicrokernelTester()
7865 .mr(6)
7866 .nr(8)
7867 .kr(1)
7868 .sr(1)
7869 .m(6)
7870 .n(8)
7871 .k(4)
7872 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7873 }
7874
7875 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, strided_cn) {
7876 TEST_REQUIRES_ARM_NEON_FMA;
7877 GemmMicrokernelTester()
7878 .mr(6)
7879 .nr(8)
7880 .kr(1)
7881 .sr(1)
7882 .m(6)
7883 .n(8)
7884 .k(4)
7885 .cn_stride(11)
7886 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7887 }
7888
7889 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
7890 TEST_REQUIRES_ARM_NEON_FMA;
7891 GemmMicrokernelTester()
7892 .mr(6)
7893 .nr(8)
7894 .kr(1)
7895 .sr(1)
7896 .m(6)
7897 .n(8)
7898 .k(4)
7899 .a_stride(7)
7900 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7901 }
7902
7903 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
7904 TEST_REQUIRES_ARM_NEON_FMA;
7905 for (uint32_t m = 1; m <= 6; m++) {
7906 for (uint32_t n = 1; n <= 8; n++) {
7907 GemmMicrokernelTester()
7908 .mr(6)
7909 .nr(8)
7910 .kr(1)
7911 .sr(1)
7912 .m(m)
7913 .n(n)
7914 .k(4)
7915 .iterations(1)
7916 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7917 }
7918 }
7919 }
7920
7921 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
7922 TEST_REQUIRES_ARM_NEON_FMA;
7923 for (uint32_t m = 1; m <= 6; m++) {
7924 GemmMicrokernelTester()
7925 .mr(6)
7926 .nr(8)
7927 .kr(1)
7928 .sr(1)
7929 .m(m)
7930 .n(8)
7931 .k(4)
7932 .iterations(1)
7933 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7934 }
7935 }
7936
7937 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
7938 TEST_REQUIRES_ARM_NEON_FMA;
7939 for (uint32_t n = 1; n <= 8; n++) {
7940 GemmMicrokernelTester()
7941 .mr(6)
7942 .nr(8)
7943 .kr(1)
7944 .sr(1)
7945 .m(6)
7946 .n(n)
7947 .k(4)
7948 .iterations(1)
7949 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7950 }
7951 }
7952
7953 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_lt_4) {
7954 TEST_REQUIRES_ARM_NEON_FMA;
7955 for (size_t k = 1; k < 4; k++) {
7956 GemmMicrokernelTester()
7957 .mr(6)
7958 .nr(8)
7959 .kr(1)
7960 .sr(1)
7961 .m(6)
7962 .n(8)
7963 .k(k)
7964 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7965 }
7966 }
7967
7968 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
7969 TEST_REQUIRES_ARM_NEON_FMA;
7970 for (size_t k = 1; k < 4; k++) {
7971 GemmMicrokernelTester()
7972 .mr(6)
7973 .nr(8)
7974 .kr(1)
7975 .sr(1)
7976 .m(6)
7977 .n(8)
7978 .k(k)
7979 .a_stride(7)
7980 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7981 }
7982 }
7983
7984 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
7985 TEST_REQUIRES_ARM_NEON_FMA;
7986 for (size_t k = 1; k < 4; k++) {
7987 for (uint32_t m = 1; m <= 6; m++) {
7988 for (uint32_t n = 1; n <= 8; n++) {
7989 GemmMicrokernelTester()
7990 .mr(6)
7991 .nr(8)
7992 .kr(1)
7993 .sr(1)
7994 .m(m)
7995 .n(n)
7996 .k(k)
7997 .iterations(1)
7998 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
7999 }
8000 }
8001 }
8002 }
8003
8004 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_gt_4) {
8005 TEST_REQUIRES_ARM_NEON_FMA;
8006 for (size_t k = 5; k < 8; k++) {
8007 GemmMicrokernelTester()
8008 .mr(6)
8009 .nr(8)
8010 .kr(1)
8011 .sr(1)
8012 .m(6)
8013 .n(8)
8014 .k(k)
8015 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8016 }
8017 }
8018
8019 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
8020 TEST_REQUIRES_ARM_NEON_FMA;
8021 for (size_t k = 5; k < 8; k++) {
8022 GemmMicrokernelTester()
8023 .mr(6)
8024 .nr(8)
8025 .kr(1)
8026 .sr(1)
8027 .m(6)
8028 .n(8)
8029 .k(k)
8030 .a_stride(11)
8031 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8032 }
8033 }
8034
8035 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
8036 TEST_REQUIRES_ARM_NEON_FMA;
8037 for (size_t k = 5; k < 8; k++) {
8038 for (uint32_t m = 1; m <= 6; m++) {
8039 for (uint32_t n = 1; n <= 8; n++) {
8040 GemmMicrokernelTester()
8041 .mr(6)
8042 .nr(8)
8043 .kr(1)
8044 .sr(1)
8045 .m(m)
8046 .n(n)
8047 .k(k)
8048 .iterations(1)
8049 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8050 }
8051 }
8052 }
8053 }
8054
8055 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_div_4) {
8056 TEST_REQUIRES_ARM_NEON_FMA;
8057 for (size_t k = 8; k <= 40; k += 4) {
8058 GemmMicrokernelTester()
8059 .mr(6)
8060 .nr(8)
8061 .kr(1)
8062 .sr(1)
8063 .m(6)
8064 .n(8)
8065 .k(k)
8066 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8067 }
8068 }
8069
8070 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
8071 TEST_REQUIRES_ARM_NEON_FMA;
8072 for (size_t k = 8; k <= 40; k += 4) {
8073 GemmMicrokernelTester()
8074 .mr(6)
8075 .nr(8)
8076 .kr(1)
8077 .sr(1)
8078 .m(6)
8079 .n(8)
8080 .k(k)
8081 .a_stride(43)
8082 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8083 }
8084 }
8085
8086 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
8087 TEST_REQUIRES_ARM_NEON_FMA;
8088 for (size_t k = 8; k <= 40; k += 4) {
8089 for (uint32_t m = 1; m <= 6; m++) {
8090 for (uint32_t n = 1; n <= 8; n++) {
8091 GemmMicrokernelTester()
8092 .mr(6)
8093 .nr(8)
8094 .kr(1)
8095 .sr(1)
8096 .m(m)
8097 .n(n)
8098 .k(k)
8099 .iterations(1)
8100 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8101 }
8102 }
8103 }
8104 }
8105
8106 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8) {
8107 TEST_REQUIRES_ARM_NEON_FMA;
8108 for (uint32_t n = 9; n < 16; n++) {
8109 for (size_t k = 1; k <= 20; k += 5) {
8110 GemmMicrokernelTester()
8111 .mr(6)
8112 .nr(8)
8113 .kr(1)
8114 .sr(1)
8115 .m(6)
8116 .n(8)
8117 .k(k)
8118 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8119 }
8120 }
8121 }
8122
8123 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
8124 TEST_REQUIRES_ARM_NEON_FMA;
8125 for (uint32_t n = 9; n < 16; n++) {
8126 for (size_t k = 1; k <= 20; k += 5) {
8127 GemmMicrokernelTester()
8128 .mr(6)
8129 .nr(8)
8130 .kr(1)
8131 .sr(1)
8132 .m(6)
8133 .n(8)
8134 .k(k)
8135 .cn_stride(11)
8136 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8137 }
8138 }
8139 }
8140
8141 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
8142 TEST_REQUIRES_ARM_NEON_FMA;
8143 for (uint32_t n = 9; n < 16; n++) {
8144 for (size_t k = 1; k <= 20; k += 5) {
8145 GemmMicrokernelTester()
8146 .mr(6)
8147 .nr(8)
8148 .kr(1)
8149 .sr(1)
8150 .m(6)
8151 .n(n)
8152 .k(k)
8153 .a_stride(23)
8154 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8155 }
8156 }
8157 }
8158
8159 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
8160 TEST_REQUIRES_ARM_NEON_FMA;
8161 for (uint32_t n = 9; n < 16; n++) {
8162 for (size_t k = 1; k <= 20; k += 5) {
8163 for (uint32_t m = 1; m <= 6; m++) {
8164 GemmMicrokernelTester()
8165 .mr(6)
8166 .nr(8)
8167 .kr(1)
8168 .sr(1)
8169 .m(m)
8170 .n(n)
8171 .k(k)
8172 .iterations(1)
8173 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8174 }
8175 }
8176 }
8177 }
8178
8179 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8) {
8180 TEST_REQUIRES_ARM_NEON_FMA;
8181 for (uint32_t n = 16; n <= 24; n += 8) {
8182 for (size_t k = 1; k <= 20; k += 5) {
8183 GemmMicrokernelTester()
8184 .mr(6)
8185 .nr(8)
8186 .kr(1)
8187 .sr(1)
8188 .m(6)
8189 .n(8)
8190 .k(k)
8191 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8192 }
8193 }
8194 }
8195
8196 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
8197 TEST_REQUIRES_ARM_NEON_FMA;
8198 for (uint32_t n = 16; n <= 24; n += 8) {
8199 for (size_t k = 1; k <= 20; k += 5) {
8200 GemmMicrokernelTester()
8201 .mr(6)
8202 .nr(8)
8203 .kr(1)
8204 .sr(1)
8205 .m(6)
8206 .n(n)
8207 .k(k)
8208 .cn_stride(11)
8209 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8210 }
8211 }
8212 }
8213
8214 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
8215 TEST_REQUIRES_ARM_NEON_FMA;
8216 for (uint32_t n = 16; n <= 24; n += 8) {
8217 for (size_t k = 1; k <= 20; k += 5) {
8218 GemmMicrokernelTester()
8219 .mr(6)
8220 .nr(8)
8221 .kr(1)
8222 .sr(1)
8223 .m(6)
8224 .n(n)
8225 .k(k)
8226 .a_stride(23)
8227 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8228 }
8229 }
8230 }
8231
8232 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
8233 TEST_REQUIRES_ARM_NEON_FMA;
8234 for (uint32_t n = 16; n <= 24; n += 8) {
8235 for (size_t k = 1; k <= 20; k += 5) {
8236 for (uint32_t m = 1; m <= 6; m++) {
8237 GemmMicrokernelTester()
8238 .mr(6)
8239 .nr(8)
8240 .kr(1)
8241 .sr(1)
8242 .m(m)
8243 .n(n)
8244 .k(k)
8245 .iterations(1)
8246 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8247 }
8248 }
8249 }
8250 }
8251
8252 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
8253 TEST_REQUIRES_ARM_NEON_FMA;
8254 for (size_t k = 1; k <= 20; k += 5) {
8255 for (uint32_t m = 1; m <= 6; m++) {
8256 for (uint32_t n = 1; n <= 8; n++) {
8257 GemmMicrokernelTester()
8258 .mr(6)
8259 .nr(8)
8260 .kr(1)
8261 .sr(1)
8262 .m(m)
8263 .n(n)
8264 .k(k)
8265 .cm_stride(11)
8266 .iterations(1)
8267 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8268 }
8269 }
8270 }
8271 }
8272
8273 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, qmin) {
8274 TEST_REQUIRES_ARM_NEON_FMA;
8275 GemmMicrokernelTester()
8276 .mr(6)
8277 .nr(8)
8278 .kr(1)
8279 .sr(1)
8280 .m(6)
8281 .n(8)
8282 .k(4)
8283 .qmin(128)
8284 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8285 }
8286
8287 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, qmax) {
8288 TEST_REQUIRES_ARM_NEON_FMA;
8289 GemmMicrokernelTester()
8290 .mr(6)
8291 .nr(8)
8292 .kr(1)
8293 .sr(1)
8294 .m(6)
8295 .n(8)
8296 .k(4)
8297 .qmax(128)
8298 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8299 }
8300
8301 TEST(F32_GEMMINC_6X8__AARCH64_NEONFMA_LD128, strided_cm) {
8302 TEST_REQUIRES_ARM_NEON_FMA;
8303 GemmMicrokernelTester()
8304 .mr(6)
8305 .nr(8)
8306 .kr(1)
8307 .sr(1)
8308 .m(6)
8309 .n(8)
8310 .k(4)
8311 .cm_stride(11)
8312 .Test(xnn_f32_gemminc_ukernel_6x8__aarch64_neonfma_ld128);
8313 }
Frank Barchard7e955972019-10-11 10:34:25 -07008314#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07008315
8316
Marat Dukhan1dadbf72019-10-01 10:46:20 -07008317#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08008318 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008319 TEST_REQUIRES_ARM_NEON;
8320 GemmMicrokernelTester()
8321 .mr(1)
8322 .nr(8)
8323 .kr(1)
8324 .sr(1)
8325 .m(1)
8326 .n(8)
8327 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -08008328 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008329 }
8330
Frank Barchard91317c52019-11-22 10:54:35 -08008331 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008332 TEST_REQUIRES_ARM_NEON;
8333 GemmMicrokernelTester()
8334 .mr(1)
8335 .nr(8)
8336 .kr(1)
8337 .sr(1)
8338 .m(1)
8339 .n(8)
8340 .k(2)
8341 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008342 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008343 }
8344
Frank Barchard91317c52019-11-22 10:54:35 -08008345 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008346 TEST_REQUIRES_ARM_NEON;
8347 GemmMicrokernelTester()
8348 .mr(1)
8349 .nr(8)
8350 .kr(1)
8351 .sr(1)
8352 .m(1)
8353 .n(8)
8354 .k(2)
8355 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08008356 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008357 }
8358
Frank Barchard91317c52019-11-22 10:54:35 -08008359 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008360 TEST_REQUIRES_ARM_NEON;
8361 for (uint32_t m = 1; m <= 1; m++) {
8362 for (uint32_t n = 1; n <= 8; n++) {
8363 GemmMicrokernelTester()
8364 .mr(1)
8365 .nr(8)
8366 .kr(1)
8367 .sr(1)
8368 .m(m)
8369 .n(n)
8370 .k(2)
8371 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008372 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008373 }
8374 }
8375 }
8376
Frank Barchard91317c52019-11-22 10:54:35 -08008377 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008378 TEST_REQUIRES_ARM_NEON;
8379 for (uint32_t m = 1; m <= 1; m++) {
8380 GemmMicrokernelTester()
8381 .mr(1)
8382 .nr(8)
8383 .kr(1)
8384 .sr(1)
8385 .m(m)
8386 .n(8)
8387 .k(2)
8388 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008389 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008390 }
8391 }
8392
Frank Barchard91317c52019-11-22 10:54:35 -08008393 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008394 TEST_REQUIRES_ARM_NEON;
8395 for (uint32_t n = 1; n <= 8; n++) {
8396 GemmMicrokernelTester()
8397 .mr(1)
8398 .nr(8)
8399 .kr(1)
8400 .sr(1)
8401 .m(1)
8402 .n(n)
8403 .k(2)
8404 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008405 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008406 }
8407 }
8408
Frank Barchard91317c52019-11-22 10:54:35 -08008409 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008410 TEST_REQUIRES_ARM_NEON;
8411 for (size_t k = 1; k < 2; k++) {
8412 GemmMicrokernelTester()
8413 .mr(1)
8414 .nr(8)
8415 .kr(1)
8416 .sr(1)
8417 .m(1)
8418 .n(8)
8419 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008420 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008421 }
8422 }
8423
Frank Barchard91317c52019-11-22 10:54:35 -08008424 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008425 TEST_REQUIRES_ARM_NEON;
8426 for (size_t k = 1; k < 2; k++) {
8427 GemmMicrokernelTester()
8428 .mr(1)
8429 .nr(8)
8430 .kr(1)
8431 .sr(1)
8432 .m(1)
8433 .n(8)
8434 .k(k)
8435 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08008436 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008437 }
8438 }
8439
Frank Barchard91317c52019-11-22 10:54:35 -08008440 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008441 TEST_REQUIRES_ARM_NEON;
8442 for (size_t k = 1; k < 2; k++) {
8443 for (uint32_t m = 1; m <= 1; m++) {
8444 for (uint32_t n = 1; n <= 8; n++) {
8445 GemmMicrokernelTester()
8446 .mr(1)
8447 .nr(8)
8448 .kr(1)
8449 .sr(1)
8450 .m(m)
8451 .n(n)
8452 .k(k)
8453 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008454 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008455 }
8456 }
8457 }
8458 }
8459
Frank Barchard91317c52019-11-22 10:54:35 -08008460 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008461 TEST_REQUIRES_ARM_NEON;
8462 for (size_t k = 3; k < 4; k++) {
8463 GemmMicrokernelTester()
8464 .mr(1)
8465 .nr(8)
8466 .kr(1)
8467 .sr(1)
8468 .m(1)
8469 .n(8)
8470 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008471 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008472 }
8473 }
8474
Frank Barchard91317c52019-11-22 10:54:35 -08008475 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008476 TEST_REQUIRES_ARM_NEON;
8477 for (size_t k = 3; k < 4; k++) {
8478 GemmMicrokernelTester()
8479 .mr(1)
8480 .nr(8)
8481 .kr(1)
8482 .sr(1)
8483 .m(1)
8484 .n(8)
8485 .k(k)
8486 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08008487 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008488 }
8489 }
8490
Frank Barchard91317c52019-11-22 10:54:35 -08008491 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008492 TEST_REQUIRES_ARM_NEON;
8493 for (size_t k = 3; k < 4; k++) {
8494 for (uint32_t m = 1; m <= 1; m++) {
8495 for (uint32_t n = 1; n <= 8; n++) {
8496 GemmMicrokernelTester()
8497 .mr(1)
8498 .nr(8)
8499 .kr(1)
8500 .sr(1)
8501 .m(m)
8502 .n(n)
8503 .k(k)
8504 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008505 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008506 }
8507 }
8508 }
8509 }
8510
Frank Barchard91317c52019-11-22 10:54:35 -08008511 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008512 TEST_REQUIRES_ARM_NEON;
8513 for (size_t k = 4; k <= 20; k += 2) {
8514 GemmMicrokernelTester()
8515 .mr(1)
8516 .nr(8)
8517 .kr(1)
8518 .sr(1)
8519 .m(1)
8520 .n(8)
8521 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008522 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008523 }
8524 }
8525
Frank Barchard91317c52019-11-22 10:54:35 -08008526 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008527 TEST_REQUIRES_ARM_NEON;
8528 for (size_t k = 4; k <= 20; k += 2) {
8529 GemmMicrokernelTester()
8530 .mr(1)
8531 .nr(8)
8532 .kr(1)
8533 .sr(1)
8534 .m(1)
8535 .n(8)
8536 .k(k)
8537 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -08008538 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008539 }
8540 }
8541
Frank Barchard91317c52019-11-22 10:54:35 -08008542 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008543 TEST_REQUIRES_ARM_NEON;
8544 for (size_t k = 4; k <= 20; k += 2) {
8545 for (uint32_t m = 1; m <= 1; m++) {
8546 for (uint32_t n = 1; n <= 8; n++) {
8547 GemmMicrokernelTester()
8548 .mr(1)
8549 .nr(8)
8550 .kr(1)
8551 .sr(1)
8552 .m(m)
8553 .n(n)
8554 .k(k)
8555 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008556 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008557 }
8558 }
8559 }
8560 }
8561
Frank Barchard91317c52019-11-22 10:54:35 -08008562 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008563 TEST_REQUIRES_ARM_NEON;
8564 for (uint32_t n = 9; n < 16; n++) {
8565 for (size_t k = 1; k <= 10; k += 3) {
8566 GemmMicrokernelTester()
8567 .mr(1)
8568 .nr(8)
8569 .kr(1)
8570 .sr(1)
8571 .m(1)
8572 .n(8)
8573 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008574 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008575 }
8576 }
8577 }
8578
Frank Barchard91317c52019-11-22 10:54:35 -08008579 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008580 TEST_REQUIRES_ARM_NEON;
8581 for (uint32_t n = 9; n < 16; n++) {
8582 for (size_t k = 1; k <= 10; k += 3) {
8583 GemmMicrokernelTester()
8584 .mr(1)
8585 .nr(8)
8586 .kr(1)
8587 .sr(1)
8588 .m(1)
8589 .n(8)
8590 .k(k)
8591 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008592 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008593 }
8594 }
8595 }
8596
Frank Barchard91317c52019-11-22 10:54:35 -08008597 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008598 TEST_REQUIRES_ARM_NEON;
8599 for (uint32_t n = 9; n < 16; n++) {
8600 for (size_t k = 1; k <= 10; k += 3) {
8601 GemmMicrokernelTester()
8602 .mr(1)
8603 .nr(8)
8604 .kr(1)
8605 .sr(1)
8606 .m(1)
8607 .n(n)
8608 .k(k)
8609 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -08008610 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008611 }
8612 }
8613 }
8614
Frank Barchard91317c52019-11-22 10:54:35 -08008615 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008616 TEST_REQUIRES_ARM_NEON;
8617 for (uint32_t n = 9; n < 16; n++) {
8618 for (size_t k = 1; k <= 10; k += 3) {
8619 for (uint32_t m = 1; m <= 1; m++) {
8620 GemmMicrokernelTester()
8621 .mr(1)
8622 .nr(8)
8623 .kr(1)
8624 .sr(1)
8625 .m(m)
8626 .n(n)
8627 .k(k)
8628 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008629 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008630 }
8631 }
8632 }
8633 }
8634
Frank Barchard91317c52019-11-22 10:54:35 -08008635 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008636 TEST_REQUIRES_ARM_NEON;
8637 for (uint32_t n = 16; n <= 24; n += 8) {
8638 for (size_t k = 1; k <= 10; k += 3) {
8639 GemmMicrokernelTester()
8640 .mr(1)
8641 .nr(8)
8642 .kr(1)
8643 .sr(1)
8644 .m(1)
8645 .n(8)
8646 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008647 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008648 }
8649 }
8650 }
8651
Frank Barchard91317c52019-11-22 10:54:35 -08008652 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008653 TEST_REQUIRES_ARM_NEON;
8654 for (uint32_t n = 16; n <= 24; n += 8) {
8655 for (size_t k = 1; k <= 10; k += 3) {
8656 GemmMicrokernelTester()
8657 .mr(1)
8658 .nr(8)
8659 .kr(1)
8660 .sr(1)
8661 .m(1)
8662 .n(n)
8663 .k(k)
8664 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008665 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008666 }
8667 }
8668 }
8669
Frank Barchard91317c52019-11-22 10:54:35 -08008670 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008671 TEST_REQUIRES_ARM_NEON;
8672 for (uint32_t n = 16; n <= 24; n += 8) {
8673 for (size_t k = 1; k <= 10; k += 3) {
8674 GemmMicrokernelTester()
8675 .mr(1)
8676 .nr(8)
8677 .kr(1)
8678 .sr(1)
8679 .m(1)
8680 .n(n)
8681 .k(k)
8682 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -08008683 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008684 }
8685 }
8686 }
8687
Frank Barchard91317c52019-11-22 10:54:35 -08008688 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008689 TEST_REQUIRES_ARM_NEON;
8690 for (uint32_t n = 16; n <= 24; n += 8) {
8691 for (size_t k = 1; k <= 10; k += 3) {
8692 for (uint32_t m = 1; m <= 1; m++) {
8693 GemmMicrokernelTester()
8694 .mr(1)
8695 .nr(8)
8696 .kr(1)
8697 .sr(1)
8698 .m(m)
8699 .n(n)
8700 .k(k)
8701 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008702 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008703 }
8704 }
8705 }
8706 }
8707
Frank Barchard91317c52019-11-22 10:54:35 -08008708 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008709 TEST_REQUIRES_ARM_NEON;
8710 for (size_t k = 1; k <= 10; k += 3) {
8711 for (uint32_t m = 1; m <= 1; m++) {
8712 for (uint32_t n = 1; n <= 8; n++) {
8713 GemmMicrokernelTester()
8714 .mr(1)
8715 .nr(8)
8716 .kr(1)
8717 .sr(1)
8718 .m(m)
8719 .n(n)
8720 .k(k)
8721 .cm_stride(11)
8722 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008723 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008724 }
8725 }
8726 }
8727 }
8728
Frank Barchard91317c52019-11-22 10:54:35 -08008729 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008730 TEST_REQUIRES_ARM_NEON;
8731 GemmMicrokernelTester()
8732 .mr(1)
8733 .nr(8)
8734 .kr(1)
8735 .sr(1)
8736 .m(1)
8737 .n(8)
8738 .k(2)
8739 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -08008740 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008741 }
8742
Frank Barchard91317c52019-11-22 10:54:35 -08008743 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008744 TEST_REQUIRES_ARM_NEON;
8745 GemmMicrokernelTester()
8746 .mr(1)
8747 .nr(8)
8748 .kr(1)
8749 .sr(1)
8750 .m(1)
8751 .n(8)
8752 .k(2)
8753 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -08008754 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008755 }
8756
Frank Barchard91317c52019-11-22 10:54:35 -08008757 TEST(F32_GEMMINC_1X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008758 TEST_REQUIRES_ARM_NEON;
8759 GemmMicrokernelTester()
8760 .mr(1)
8761 .nr(8)
8762 .kr(1)
8763 .sr(1)
8764 .m(1)
8765 .n(8)
8766 .k(2)
8767 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008768 .Test(xnn_f32_gemminc_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008769 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07008770#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07008771
8772
Marat Dukhan1dadbf72019-10-01 10:46:20 -07008773#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08008774 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008775 TEST_REQUIRES_ARM_NEON;
8776 GemmMicrokernelTester()
8777 .mr(4)
8778 .nr(8)
8779 .kr(1)
8780 .sr(1)
8781 .m(4)
8782 .n(8)
8783 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -08008784 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008785 }
8786
Frank Barchard91317c52019-11-22 10:54:35 -08008787 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008788 TEST_REQUIRES_ARM_NEON;
8789 GemmMicrokernelTester()
8790 .mr(4)
8791 .nr(8)
8792 .kr(1)
8793 .sr(1)
8794 .m(4)
8795 .n(8)
8796 .k(2)
8797 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08008798 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008799 }
8800
Frank Barchard91317c52019-11-22 10:54:35 -08008801 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008802 TEST_REQUIRES_ARM_NEON;
8803 GemmMicrokernelTester()
8804 .mr(4)
8805 .nr(8)
8806 .kr(1)
8807 .sr(1)
8808 .m(4)
8809 .n(8)
8810 .k(2)
8811 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08008812 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008813 }
8814
Frank Barchard91317c52019-11-22 10:54:35 -08008815 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008816 TEST_REQUIRES_ARM_NEON;
8817 for (uint32_t m = 1; m <= 4; m++) {
8818 for (uint32_t n = 1; n <= 8; n++) {
8819 GemmMicrokernelTester()
8820 .mr(4)
8821 .nr(8)
8822 .kr(1)
8823 .sr(1)
8824 .m(m)
8825 .n(n)
8826 .k(2)
8827 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008828 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008829 }
8830 }
8831 }
8832
Frank Barchard91317c52019-11-22 10:54:35 -08008833 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008834 TEST_REQUIRES_ARM_NEON;
8835 for (uint32_t m = 1; m <= 4; m++) {
8836 GemmMicrokernelTester()
8837 .mr(4)
8838 .nr(8)
8839 .kr(1)
8840 .sr(1)
8841 .m(m)
8842 .n(8)
8843 .k(2)
8844 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008845 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008846 }
8847 }
8848
Frank Barchard91317c52019-11-22 10:54:35 -08008849 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008850 TEST_REQUIRES_ARM_NEON;
8851 for (uint32_t n = 1; n <= 8; n++) {
8852 GemmMicrokernelTester()
8853 .mr(4)
8854 .nr(8)
8855 .kr(1)
8856 .sr(1)
8857 .m(4)
8858 .n(n)
8859 .k(2)
8860 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008861 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008862 }
8863 }
8864
Frank Barchard91317c52019-11-22 10:54:35 -08008865 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008866 TEST_REQUIRES_ARM_NEON;
8867 for (size_t k = 1; k < 2; k++) {
8868 GemmMicrokernelTester()
8869 .mr(4)
8870 .nr(8)
8871 .kr(1)
8872 .sr(1)
8873 .m(4)
8874 .n(8)
8875 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008876 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008877 }
8878 }
8879
Frank Barchard91317c52019-11-22 10:54:35 -08008880 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008881 TEST_REQUIRES_ARM_NEON;
8882 for (size_t k = 1; k < 2; k++) {
8883 GemmMicrokernelTester()
8884 .mr(4)
8885 .nr(8)
8886 .kr(1)
8887 .sr(1)
8888 .m(4)
8889 .n(8)
8890 .k(k)
8891 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08008892 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008893 }
8894 }
8895
Frank Barchard91317c52019-11-22 10:54:35 -08008896 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008897 TEST_REQUIRES_ARM_NEON;
8898 for (size_t k = 1; k < 2; k++) {
8899 for (uint32_t m = 1; m <= 4; m++) {
8900 for (uint32_t n = 1; n <= 8; n++) {
8901 GemmMicrokernelTester()
8902 .mr(4)
8903 .nr(8)
8904 .kr(1)
8905 .sr(1)
8906 .m(m)
8907 .n(n)
8908 .k(k)
8909 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008910 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008911 }
8912 }
8913 }
8914 }
8915
Frank Barchard91317c52019-11-22 10:54:35 -08008916 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008917 TEST_REQUIRES_ARM_NEON;
8918 for (size_t k = 3; k < 4; k++) {
8919 GemmMicrokernelTester()
8920 .mr(4)
8921 .nr(8)
8922 .kr(1)
8923 .sr(1)
8924 .m(4)
8925 .n(8)
8926 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008927 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008928 }
8929 }
8930
Frank Barchard91317c52019-11-22 10:54:35 -08008931 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008932 TEST_REQUIRES_ARM_NEON;
8933 for (size_t k = 3; k < 4; k++) {
8934 GemmMicrokernelTester()
8935 .mr(4)
8936 .nr(8)
8937 .kr(1)
8938 .sr(1)
8939 .m(4)
8940 .n(8)
8941 .k(k)
8942 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08008943 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008944 }
8945 }
8946
Frank Barchard91317c52019-11-22 10:54:35 -08008947 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008948 TEST_REQUIRES_ARM_NEON;
8949 for (size_t k = 3; k < 4; k++) {
8950 for (uint32_t m = 1; m <= 4; m++) {
8951 for (uint32_t n = 1; n <= 8; n++) {
8952 GemmMicrokernelTester()
8953 .mr(4)
8954 .nr(8)
8955 .kr(1)
8956 .sr(1)
8957 .m(m)
8958 .n(n)
8959 .k(k)
8960 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08008961 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008962 }
8963 }
8964 }
8965 }
8966
Frank Barchard91317c52019-11-22 10:54:35 -08008967 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008968 TEST_REQUIRES_ARM_NEON;
8969 for (size_t k = 4; k <= 20; k += 2) {
8970 GemmMicrokernelTester()
8971 .mr(4)
8972 .nr(8)
8973 .kr(1)
8974 .sr(1)
8975 .m(4)
8976 .n(8)
8977 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08008978 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008979 }
8980 }
8981
Frank Barchard91317c52019-11-22 10:54:35 -08008982 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008983 TEST_REQUIRES_ARM_NEON;
8984 for (size_t k = 4; k <= 20; k += 2) {
8985 GemmMicrokernelTester()
8986 .mr(4)
8987 .nr(8)
8988 .kr(1)
8989 .sr(1)
8990 .m(4)
8991 .n(8)
8992 .k(k)
8993 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -08008994 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07008995 }
8996 }
8997
Frank Barchard91317c52019-11-22 10:54:35 -08008998 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07008999 TEST_REQUIRES_ARM_NEON;
9000 for (size_t k = 4; k <= 20; k += 2) {
9001 for (uint32_t m = 1; m <= 4; m++) {
9002 for (uint32_t n = 1; n <= 8; n++) {
9003 GemmMicrokernelTester()
9004 .mr(4)
9005 .nr(8)
9006 .kr(1)
9007 .sr(1)
9008 .m(m)
9009 .n(n)
9010 .k(k)
9011 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009012 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009013 }
9014 }
9015 }
9016 }
9017
Frank Barchard91317c52019-11-22 10:54:35 -08009018 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009019 TEST_REQUIRES_ARM_NEON;
9020 for (uint32_t n = 9; n < 16; n++) {
9021 for (size_t k = 1; k <= 10; k += 3) {
9022 GemmMicrokernelTester()
9023 .mr(4)
9024 .nr(8)
9025 .kr(1)
9026 .sr(1)
9027 .m(4)
9028 .n(8)
9029 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009030 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009031 }
9032 }
9033 }
9034
Frank Barchard91317c52019-11-22 10:54:35 -08009035 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009036 TEST_REQUIRES_ARM_NEON;
9037 for (uint32_t n = 9; n < 16; n++) {
9038 for (size_t k = 1; k <= 10; k += 3) {
9039 GemmMicrokernelTester()
9040 .mr(4)
9041 .nr(8)
9042 .kr(1)
9043 .sr(1)
9044 .m(4)
9045 .n(8)
9046 .k(k)
9047 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009048 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009049 }
9050 }
9051 }
9052
Frank Barchard91317c52019-11-22 10:54:35 -08009053 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009054 TEST_REQUIRES_ARM_NEON;
9055 for (uint32_t n = 9; n < 16; n++) {
9056 for (size_t k = 1; k <= 10; k += 3) {
9057 GemmMicrokernelTester()
9058 .mr(4)
9059 .nr(8)
9060 .kr(1)
9061 .sr(1)
9062 .m(4)
9063 .n(n)
9064 .k(k)
9065 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -08009066 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009067 }
9068 }
9069 }
9070
Frank Barchard91317c52019-11-22 10:54:35 -08009071 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009072 TEST_REQUIRES_ARM_NEON;
9073 for (uint32_t n = 9; n < 16; n++) {
9074 for (size_t k = 1; k <= 10; k += 3) {
9075 for (uint32_t m = 1; m <= 4; m++) {
9076 GemmMicrokernelTester()
9077 .mr(4)
9078 .nr(8)
9079 .kr(1)
9080 .sr(1)
9081 .m(m)
9082 .n(n)
9083 .k(k)
9084 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009085 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009086 }
9087 }
9088 }
9089 }
9090
Frank Barchard91317c52019-11-22 10:54:35 -08009091 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009092 TEST_REQUIRES_ARM_NEON;
9093 for (uint32_t n = 16; n <= 24; n += 8) {
9094 for (size_t k = 1; k <= 10; k += 3) {
9095 GemmMicrokernelTester()
9096 .mr(4)
9097 .nr(8)
9098 .kr(1)
9099 .sr(1)
9100 .m(4)
9101 .n(8)
9102 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009103 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009104 }
9105 }
9106 }
9107
Frank Barchard91317c52019-11-22 10:54:35 -08009108 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009109 TEST_REQUIRES_ARM_NEON;
9110 for (uint32_t n = 16; n <= 24; n += 8) {
9111 for (size_t k = 1; k <= 10; k += 3) {
9112 GemmMicrokernelTester()
9113 .mr(4)
9114 .nr(8)
9115 .kr(1)
9116 .sr(1)
9117 .m(4)
9118 .n(n)
9119 .k(k)
9120 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009121 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009122 }
9123 }
9124 }
9125
Frank Barchard91317c52019-11-22 10:54:35 -08009126 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009127 TEST_REQUIRES_ARM_NEON;
9128 for (uint32_t n = 16; n <= 24; n += 8) {
9129 for (size_t k = 1; k <= 10; k += 3) {
9130 GemmMicrokernelTester()
9131 .mr(4)
9132 .nr(8)
9133 .kr(1)
9134 .sr(1)
9135 .m(4)
9136 .n(n)
9137 .k(k)
9138 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -08009139 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009140 }
9141 }
9142 }
9143
Frank Barchard91317c52019-11-22 10:54:35 -08009144 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009145 TEST_REQUIRES_ARM_NEON;
9146 for (uint32_t n = 16; n <= 24; n += 8) {
9147 for (size_t k = 1; k <= 10; k += 3) {
9148 for (uint32_t m = 1; m <= 4; m++) {
9149 GemmMicrokernelTester()
9150 .mr(4)
9151 .nr(8)
9152 .kr(1)
9153 .sr(1)
9154 .m(m)
9155 .n(n)
9156 .k(k)
9157 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009158 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009159 }
9160 }
9161 }
9162 }
9163
Frank Barchard91317c52019-11-22 10:54:35 -08009164 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009165 TEST_REQUIRES_ARM_NEON;
9166 for (size_t k = 1; k <= 10; k += 3) {
9167 for (uint32_t m = 1; m <= 4; m++) {
9168 for (uint32_t n = 1; n <= 8; n++) {
9169 GemmMicrokernelTester()
9170 .mr(4)
9171 .nr(8)
9172 .kr(1)
9173 .sr(1)
9174 .m(m)
9175 .n(n)
9176 .k(k)
9177 .cm_stride(11)
9178 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009179 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009180 }
9181 }
9182 }
9183 }
9184
Frank Barchard91317c52019-11-22 10:54:35 -08009185 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009186 TEST_REQUIRES_ARM_NEON;
9187 GemmMicrokernelTester()
9188 .mr(4)
9189 .nr(8)
9190 .kr(1)
9191 .sr(1)
9192 .m(4)
9193 .n(8)
9194 .k(2)
9195 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009196 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009197 }
9198
Frank Barchard91317c52019-11-22 10:54:35 -08009199 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009200 TEST_REQUIRES_ARM_NEON;
9201 GemmMicrokernelTester()
9202 .mr(4)
9203 .nr(8)
9204 .kr(1)
9205 .sr(1)
9206 .m(4)
9207 .n(8)
9208 .k(2)
9209 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009210 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009211 }
9212
Frank Barchard91317c52019-11-22 10:54:35 -08009213 TEST(F32_GEMMINC_4X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009214 TEST_REQUIRES_ARM_NEON;
9215 GemmMicrokernelTester()
9216 .mr(4)
9217 .nr(8)
9218 .kr(1)
9219 .sr(1)
9220 .m(4)
9221 .n(8)
9222 .k(2)
9223 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009224 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009225 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009226#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07009227
9228
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009229#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08009230 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009231 TEST_REQUIRES_ARM_NEON;
9232 GemmMicrokernelTester()
9233 .mr(4)
9234 .nr(8)
9235 .kr(1)
9236 .sr(1)
9237 .m(4)
9238 .n(8)
9239 .k(4)
Frank Barchard91317c52019-11-22 10:54:35 -08009240 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009241 }
9242
Frank Barchard91317c52019-11-22 10:54:35 -08009243 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009244 TEST_REQUIRES_ARM_NEON;
9245 GemmMicrokernelTester()
9246 .mr(4)
9247 .nr(8)
9248 .kr(1)
9249 .sr(1)
9250 .m(4)
9251 .n(8)
9252 .k(4)
9253 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009254 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009255 }
9256
Frank Barchard91317c52019-11-22 10:54:35 -08009257 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009258 TEST_REQUIRES_ARM_NEON;
9259 GemmMicrokernelTester()
9260 .mr(4)
9261 .nr(8)
9262 .kr(1)
9263 .sr(1)
9264 .m(4)
9265 .n(8)
9266 .k(4)
9267 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009268 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009269 }
9270
Frank Barchard91317c52019-11-22 10:54:35 -08009271 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009272 TEST_REQUIRES_ARM_NEON;
9273 for (uint32_t m = 1; m <= 4; m++) {
9274 for (uint32_t n = 1; n <= 8; n++) {
9275 GemmMicrokernelTester()
9276 .mr(4)
9277 .nr(8)
9278 .kr(1)
9279 .sr(1)
9280 .m(m)
9281 .n(n)
9282 .k(4)
9283 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009284 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009285 }
9286 }
9287 }
9288
Frank Barchard91317c52019-11-22 10:54:35 -08009289 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009290 TEST_REQUIRES_ARM_NEON;
9291 for (uint32_t m = 1; m <= 4; m++) {
9292 GemmMicrokernelTester()
9293 .mr(4)
9294 .nr(8)
9295 .kr(1)
9296 .sr(1)
9297 .m(m)
9298 .n(8)
9299 .k(4)
9300 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009301 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009302 }
9303 }
9304
Frank Barchard91317c52019-11-22 10:54:35 -08009305 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009306 TEST_REQUIRES_ARM_NEON;
9307 for (uint32_t n = 1; n <= 8; n++) {
9308 GemmMicrokernelTester()
9309 .mr(4)
9310 .nr(8)
9311 .kr(1)
9312 .sr(1)
9313 .m(4)
9314 .n(n)
9315 .k(4)
9316 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009317 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009318 }
9319 }
9320
Frank Barchard91317c52019-11-22 10:54:35 -08009321 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009322 TEST_REQUIRES_ARM_NEON;
9323 for (size_t k = 1; k < 4; k++) {
9324 GemmMicrokernelTester()
9325 .mr(4)
9326 .nr(8)
9327 .kr(1)
9328 .sr(1)
9329 .m(4)
9330 .n(8)
9331 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009332 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009333 }
9334 }
9335
Frank Barchard91317c52019-11-22 10:54:35 -08009336 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009337 TEST_REQUIRES_ARM_NEON;
9338 for (size_t k = 1; k < 4; k++) {
9339 GemmMicrokernelTester()
9340 .mr(4)
9341 .nr(8)
9342 .kr(1)
9343 .sr(1)
9344 .m(4)
9345 .n(8)
9346 .k(k)
9347 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009348 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009349 }
9350 }
9351
Frank Barchard91317c52019-11-22 10:54:35 -08009352 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009353 TEST_REQUIRES_ARM_NEON;
9354 for (size_t k = 1; k < 4; k++) {
9355 for (uint32_t m = 1; m <= 4; m++) {
9356 for (uint32_t n = 1; n <= 8; n++) {
9357 GemmMicrokernelTester()
9358 .mr(4)
9359 .nr(8)
9360 .kr(1)
9361 .sr(1)
9362 .m(m)
9363 .n(n)
9364 .k(k)
9365 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009366 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009367 }
9368 }
9369 }
9370 }
9371
Frank Barchard91317c52019-11-22 10:54:35 -08009372 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009373 TEST_REQUIRES_ARM_NEON;
9374 for (size_t k = 5; k < 8; k++) {
9375 GemmMicrokernelTester()
9376 .mr(4)
9377 .nr(8)
9378 .kr(1)
9379 .sr(1)
9380 .m(4)
9381 .n(8)
9382 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009383 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009384 }
9385 }
9386
Frank Barchard91317c52019-11-22 10:54:35 -08009387 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009388 TEST_REQUIRES_ARM_NEON;
9389 for (size_t k = 5; k < 8; k++) {
9390 GemmMicrokernelTester()
9391 .mr(4)
9392 .nr(8)
9393 .kr(1)
9394 .sr(1)
9395 .m(4)
9396 .n(8)
9397 .k(k)
9398 .a_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009399 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009400 }
9401 }
9402
Frank Barchard91317c52019-11-22 10:54:35 -08009403 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009404 TEST_REQUIRES_ARM_NEON;
9405 for (size_t k = 5; k < 8; k++) {
9406 for (uint32_t m = 1; m <= 4; m++) {
9407 for (uint32_t n = 1; n <= 8; n++) {
9408 GemmMicrokernelTester()
9409 .mr(4)
9410 .nr(8)
9411 .kr(1)
9412 .sr(1)
9413 .m(m)
9414 .n(n)
9415 .k(k)
9416 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009417 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009418 }
9419 }
9420 }
9421 }
9422
Frank Barchard91317c52019-11-22 10:54:35 -08009423 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009424 TEST_REQUIRES_ARM_NEON;
9425 for (size_t k = 8; k <= 40; k += 4) {
9426 GemmMicrokernelTester()
9427 .mr(4)
9428 .nr(8)
9429 .kr(1)
9430 .sr(1)
9431 .m(4)
9432 .n(8)
9433 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009434 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009435 }
9436 }
9437
Frank Barchard91317c52019-11-22 10:54:35 -08009438 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009439 TEST_REQUIRES_ARM_NEON;
9440 for (size_t k = 8; k <= 40; k += 4) {
9441 GemmMicrokernelTester()
9442 .mr(4)
9443 .nr(8)
9444 .kr(1)
9445 .sr(1)
9446 .m(4)
9447 .n(8)
9448 .k(k)
9449 .a_stride(43)
Frank Barchard91317c52019-11-22 10:54:35 -08009450 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009451 }
9452 }
9453
Frank Barchard91317c52019-11-22 10:54:35 -08009454 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009455 TEST_REQUIRES_ARM_NEON;
9456 for (size_t k = 8; k <= 40; k += 4) {
9457 for (uint32_t m = 1; m <= 4; m++) {
9458 for (uint32_t n = 1; n <= 8; n++) {
9459 GemmMicrokernelTester()
9460 .mr(4)
9461 .nr(8)
9462 .kr(1)
9463 .sr(1)
9464 .m(m)
9465 .n(n)
9466 .k(k)
9467 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009468 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009469 }
9470 }
9471 }
9472 }
9473
Frank Barchard91317c52019-11-22 10:54:35 -08009474 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009475 TEST_REQUIRES_ARM_NEON;
9476 for (uint32_t n = 9; n < 16; n++) {
9477 for (size_t k = 1; k <= 20; k += 5) {
9478 GemmMicrokernelTester()
9479 .mr(4)
9480 .nr(8)
9481 .kr(1)
9482 .sr(1)
9483 .m(4)
9484 .n(8)
9485 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009486 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009487 }
9488 }
9489 }
9490
Frank Barchard91317c52019-11-22 10:54:35 -08009491 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009492 TEST_REQUIRES_ARM_NEON;
9493 for (uint32_t n = 9; n < 16; n++) {
9494 for (size_t k = 1; k <= 20; k += 5) {
9495 GemmMicrokernelTester()
9496 .mr(4)
9497 .nr(8)
9498 .kr(1)
9499 .sr(1)
9500 .m(4)
9501 .n(8)
9502 .k(k)
9503 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009504 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009505 }
9506 }
9507 }
9508
Frank Barchard91317c52019-11-22 10:54:35 -08009509 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009510 TEST_REQUIRES_ARM_NEON;
9511 for (uint32_t n = 9; n < 16; n++) {
9512 for (size_t k = 1; k <= 20; k += 5) {
9513 GemmMicrokernelTester()
9514 .mr(4)
9515 .nr(8)
9516 .kr(1)
9517 .sr(1)
9518 .m(4)
9519 .n(n)
9520 .k(k)
9521 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -08009522 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009523 }
9524 }
9525 }
9526
Frank Barchard91317c52019-11-22 10:54:35 -08009527 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009528 TEST_REQUIRES_ARM_NEON;
9529 for (uint32_t n = 9; n < 16; n++) {
9530 for (size_t k = 1; k <= 20; k += 5) {
9531 for (uint32_t m = 1; m <= 4; m++) {
9532 GemmMicrokernelTester()
9533 .mr(4)
9534 .nr(8)
9535 .kr(1)
9536 .sr(1)
9537 .m(m)
9538 .n(n)
9539 .k(k)
9540 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009541 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009542 }
9543 }
9544 }
9545 }
9546
Frank Barchard91317c52019-11-22 10:54:35 -08009547 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009548 TEST_REQUIRES_ARM_NEON;
9549 for (uint32_t n = 16; n <= 24; n += 8) {
9550 for (size_t k = 1; k <= 20; k += 5) {
9551 GemmMicrokernelTester()
9552 .mr(4)
9553 .nr(8)
9554 .kr(1)
9555 .sr(1)
9556 .m(4)
9557 .n(8)
9558 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009559 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009560 }
9561 }
9562 }
9563
Frank Barchard91317c52019-11-22 10:54:35 -08009564 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009565 TEST_REQUIRES_ARM_NEON;
9566 for (uint32_t n = 16; n <= 24; n += 8) {
9567 for (size_t k = 1; k <= 20; k += 5) {
9568 GemmMicrokernelTester()
9569 .mr(4)
9570 .nr(8)
9571 .kr(1)
9572 .sr(1)
9573 .m(4)
9574 .n(n)
9575 .k(k)
9576 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009577 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009578 }
9579 }
9580 }
9581
Frank Barchard91317c52019-11-22 10:54:35 -08009582 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009583 TEST_REQUIRES_ARM_NEON;
9584 for (uint32_t n = 16; n <= 24; n += 8) {
9585 for (size_t k = 1; k <= 20; k += 5) {
9586 GemmMicrokernelTester()
9587 .mr(4)
9588 .nr(8)
9589 .kr(1)
9590 .sr(1)
9591 .m(4)
9592 .n(n)
9593 .k(k)
9594 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -08009595 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009596 }
9597 }
9598 }
9599
Frank Barchard91317c52019-11-22 10:54:35 -08009600 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009601 TEST_REQUIRES_ARM_NEON;
9602 for (uint32_t n = 16; n <= 24; n += 8) {
9603 for (size_t k = 1; k <= 20; k += 5) {
9604 for (uint32_t m = 1; m <= 4; m++) {
9605 GemmMicrokernelTester()
9606 .mr(4)
9607 .nr(8)
9608 .kr(1)
9609 .sr(1)
9610 .m(m)
9611 .n(n)
9612 .k(k)
9613 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009614 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009615 }
9616 }
9617 }
9618 }
9619
Frank Barchard91317c52019-11-22 10:54:35 -08009620 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009621 TEST_REQUIRES_ARM_NEON;
9622 for (size_t k = 1; k <= 20; k += 5) {
9623 for (uint32_t m = 1; m <= 4; m++) {
9624 for (uint32_t n = 1; n <= 8; n++) {
9625 GemmMicrokernelTester()
9626 .mr(4)
9627 .nr(8)
9628 .kr(1)
9629 .sr(1)
9630 .m(m)
9631 .n(n)
9632 .k(k)
9633 .cm_stride(11)
9634 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009635 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009636 }
9637 }
9638 }
9639 }
9640
Frank Barchard91317c52019-11-22 10:54:35 -08009641 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009642 TEST_REQUIRES_ARM_NEON;
9643 GemmMicrokernelTester()
9644 .mr(4)
9645 .nr(8)
9646 .kr(1)
9647 .sr(1)
9648 .m(4)
9649 .n(8)
9650 .k(4)
9651 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009652 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009653 }
9654
Frank Barchard91317c52019-11-22 10:54:35 -08009655 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009656 TEST_REQUIRES_ARM_NEON;
9657 GemmMicrokernelTester()
9658 .mr(4)
9659 .nr(8)
9660 .kr(1)
9661 .sr(1)
9662 .m(4)
9663 .n(8)
9664 .k(4)
9665 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -08009666 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009667 }
9668
Frank Barchard91317c52019-11-22 10:54:35 -08009669 TEST(F32_GEMMINC_4X8__NEON_LANE_LD128, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009670 TEST_REQUIRES_ARM_NEON;
9671 GemmMicrokernelTester()
9672 .mr(4)
9673 .nr(8)
9674 .kr(1)
9675 .sr(1)
9676 .m(4)
9677 .n(8)
9678 .k(4)
9679 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009680 .Test(xnn_f32_gemminc_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009681 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009682#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -07009683
9684
Marat Dukhan1dadbf72019-10-01 10:46:20 -07009685#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -08009686 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009687 TEST_REQUIRES_ARM_NEON;
9688 GemmMicrokernelTester()
9689 .mr(5)
9690 .nr(8)
9691 .kr(1)
9692 .sr(1)
9693 .m(5)
9694 .n(8)
9695 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -08009696 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009697 }
9698
Frank Barchard91317c52019-11-22 10:54:35 -08009699 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009700 TEST_REQUIRES_ARM_NEON;
9701 GemmMicrokernelTester()
9702 .mr(5)
9703 .nr(8)
9704 .kr(1)
9705 .sr(1)
9706 .m(5)
9707 .n(8)
9708 .k(2)
9709 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009710 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009711 }
9712
Frank Barchard91317c52019-11-22 10:54:35 -08009713 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009714 TEST_REQUIRES_ARM_NEON;
9715 GemmMicrokernelTester()
9716 .mr(5)
9717 .nr(8)
9718 .kr(1)
9719 .sr(1)
9720 .m(5)
9721 .n(8)
9722 .k(2)
9723 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08009724 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009725 }
9726
Frank Barchard91317c52019-11-22 10:54:35 -08009727 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009728 TEST_REQUIRES_ARM_NEON;
9729 for (uint32_t m = 1; m <= 5; m++) {
9730 for (uint32_t n = 1; n <= 8; n++) {
9731 GemmMicrokernelTester()
9732 .mr(5)
9733 .nr(8)
9734 .kr(1)
9735 .sr(1)
9736 .m(m)
9737 .n(n)
9738 .k(2)
9739 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009740 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009741 }
9742 }
9743 }
9744
Frank Barchard91317c52019-11-22 10:54:35 -08009745 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009746 TEST_REQUIRES_ARM_NEON;
9747 for (uint32_t m = 1; m <= 5; m++) {
9748 GemmMicrokernelTester()
9749 .mr(5)
9750 .nr(8)
9751 .kr(1)
9752 .sr(1)
9753 .m(m)
9754 .n(8)
9755 .k(2)
9756 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009757 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009758 }
9759 }
9760
Frank Barchard91317c52019-11-22 10:54:35 -08009761 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009762 TEST_REQUIRES_ARM_NEON;
9763 for (uint32_t n = 1; n <= 8; n++) {
9764 GemmMicrokernelTester()
9765 .mr(5)
9766 .nr(8)
9767 .kr(1)
9768 .sr(1)
9769 .m(5)
9770 .n(n)
9771 .k(2)
9772 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009773 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009774 }
9775 }
9776
Frank Barchard91317c52019-11-22 10:54:35 -08009777 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009778 TEST_REQUIRES_ARM_NEON;
9779 for (size_t k = 1; k < 2; k++) {
9780 GemmMicrokernelTester()
9781 .mr(5)
9782 .nr(8)
9783 .kr(1)
9784 .sr(1)
9785 .m(5)
9786 .n(8)
9787 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009788 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009789 }
9790 }
9791
Frank Barchard91317c52019-11-22 10:54:35 -08009792 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009793 TEST_REQUIRES_ARM_NEON;
9794 for (size_t k = 1; k < 2; k++) {
9795 GemmMicrokernelTester()
9796 .mr(5)
9797 .nr(8)
9798 .kr(1)
9799 .sr(1)
9800 .m(5)
9801 .n(8)
9802 .k(k)
9803 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -08009804 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009805 }
9806 }
9807
Frank Barchard91317c52019-11-22 10:54:35 -08009808 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009809 TEST_REQUIRES_ARM_NEON;
9810 for (size_t k = 1; k < 2; k++) {
9811 for (uint32_t m = 1; m <= 5; m++) {
9812 for (uint32_t n = 1; n <= 8; n++) {
9813 GemmMicrokernelTester()
9814 .mr(5)
9815 .nr(8)
9816 .kr(1)
9817 .sr(1)
9818 .m(m)
9819 .n(n)
9820 .k(k)
9821 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009822 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009823 }
9824 }
9825 }
9826 }
9827
Frank Barchard91317c52019-11-22 10:54:35 -08009828 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009829 TEST_REQUIRES_ARM_NEON;
9830 for (size_t k = 3; k < 4; k++) {
9831 GemmMicrokernelTester()
9832 .mr(5)
9833 .nr(8)
9834 .kr(1)
9835 .sr(1)
9836 .m(5)
9837 .n(8)
9838 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009839 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009840 }
9841 }
9842
Frank Barchard91317c52019-11-22 10:54:35 -08009843 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009844 TEST_REQUIRES_ARM_NEON;
9845 for (size_t k = 3; k < 4; k++) {
9846 GemmMicrokernelTester()
9847 .mr(5)
9848 .nr(8)
9849 .kr(1)
9850 .sr(1)
9851 .m(5)
9852 .n(8)
9853 .k(k)
9854 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -08009855 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009856 }
9857 }
9858
Frank Barchard91317c52019-11-22 10:54:35 -08009859 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009860 TEST_REQUIRES_ARM_NEON;
9861 for (size_t k = 3; k < 4; k++) {
9862 for (uint32_t m = 1; m <= 5; m++) {
9863 for (uint32_t n = 1; n <= 8; n++) {
9864 GemmMicrokernelTester()
9865 .mr(5)
9866 .nr(8)
9867 .kr(1)
9868 .sr(1)
9869 .m(m)
9870 .n(n)
9871 .k(k)
9872 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009873 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009874 }
9875 }
9876 }
9877 }
9878
Frank Barchard91317c52019-11-22 10:54:35 -08009879 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009880 TEST_REQUIRES_ARM_NEON;
9881 for (size_t k = 4; k <= 20; k += 2) {
9882 GemmMicrokernelTester()
9883 .mr(5)
9884 .nr(8)
9885 .kr(1)
9886 .sr(1)
9887 .m(5)
9888 .n(8)
9889 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009890 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009891 }
9892 }
9893
Frank Barchard91317c52019-11-22 10:54:35 -08009894 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009895 TEST_REQUIRES_ARM_NEON;
9896 for (size_t k = 4; k <= 20; k += 2) {
9897 GemmMicrokernelTester()
9898 .mr(5)
9899 .nr(8)
9900 .kr(1)
9901 .sr(1)
9902 .m(5)
9903 .n(8)
9904 .k(k)
9905 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -08009906 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009907 }
9908 }
9909
Frank Barchard91317c52019-11-22 10:54:35 -08009910 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009911 TEST_REQUIRES_ARM_NEON;
9912 for (size_t k = 4; k <= 20; k += 2) {
9913 for (uint32_t m = 1; m <= 5; m++) {
9914 for (uint32_t n = 1; n <= 8; n++) {
9915 GemmMicrokernelTester()
9916 .mr(5)
9917 .nr(8)
9918 .kr(1)
9919 .sr(1)
9920 .m(m)
9921 .n(n)
9922 .k(k)
9923 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009924 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009925 }
9926 }
9927 }
9928 }
9929
Frank Barchard91317c52019-11-22 10:54:35 -08009930 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009931 TEST_REQUIRES_ARM_NEON;
9932 for (uint32_t n = 9; n < 16; n++) {
9933 for (size_t k = 1; k <= 10; k += 3) {
9934 GemmMicrokernelTester()
9935 .mr(5)
9936 .nr(8)
9937 .kr(1)
9938 .sr(1)
9939 .m(5)
9940 .n(8)
9941 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -08009942 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009943 }
9944 }
9945 }
9946
Frank Barchard91317c52019-11-22 10:54:35 -08009947 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009948 TEST_REQUIRES_ARM_NEON;
9949 for (uint32_t n = 9; n < 16; n++) {
9950 for (size_t k = 1; k <= 10; k += 3) {
9951 GemmMicrokernelTester()
9952 .mr(5)
9953 .nr(8)
9954 .kr(1)
9955 .sr(1)
9956 .m(5)
9957 .n(8)
9958 .k(k)
9959 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -08009960 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009961 }
9962 }
9963 }
9964
Frank Barchard91317c52019-11-22 10:54:35 -08009965 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009966 TEST_REQUIRES_ARM_NEON;
9967 for (uint32_t n = 9; n < 16; n++) {
9968 for (size_t k = 1; k <= 10; k += 3) {
9969 GemmMicrokernelTester()
9970 .mr(5)
9971 .nr(8)
9972 .kr(1)
9973 .sr(1)
9974 .m(5)
9975 .n(n)
9976 .k(k)
9977 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -08009978 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009979 }
9980 }
9981 }
9982
Frank Barchard91317c52019-11-22 10:54:35 -08009983 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -07009984 TEST_REQUIRES_ARM_NEON;
9985 for (uint32_t n = 9; n < 16; n++) {
9986 for (size_t k = 1; k <= 10; k += 3) {
9987 for (uint32_t m = 1; m <= 5; m++) {
9988 GemmMicrokernelTester()
9989 .mr(5)
9990 .nr(8)
9991 .kr(1)
9992 .sr(1)
9993 .m(m)
9994 .n(n)
9995 .k(k)
9996 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -08009997 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -07009998 }
9999 }
10000 }
10001 }
10002
Frank Barchard91317c52019-11-22 10:54:35 -080010003 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010004 TEST_REQUIRES_ARM_NEON;
10005 for (uint32_t n = 16; n <= 24; n += 8) {
10006 for (size_t k = 1; k <= 10; k += 3) {
10007 GemmMicrokernelTester()
10008 .mr(5)
10009 .nr(8)
10010 .kr(1)
10011 .sr(1)
10012 .m(5)
10013 .n(8)
10014 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010015 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010016 }
10017 }
10018 }
10019
Frank Barchard91317c52019-11-22 10:54:35 -080010020 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010021 TEST_REQUIRES_ARM_NEON;
10022 for (uint32_t n = 16; n <= 24; n += 8) {
10023 for (size_t k = 1; k <= 10; k += 3) {
10024 GemmMicrokernelTester()
10025 .mr(5)
10026 .nr(8)
10027 .kr(1)
10028 .sr(1)
10029 .m(5)
10030 .n(n)
10031 .k(k)
10032 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010033 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010034 }
10035 }
10036 }
10037
Frank Barchard91317c52019-11-22 10:54:35 -080010038 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010039 TEST_REQUIRES_ARM_NEON;
10040 for (uint32_t n = 16; n <= 24; n += 8) {
10041 for (size_t k = 1; k <= 10; k += 3) {
10042 GemmMicrokernelTester()
10043 .mr(5)
10044 .nr(8)
10045 .kr(1)
10046 .sr(1)
10047 .m(5)
10048 .n(n)
10049 .k(k)
10050 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080010051 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010052 }
10053 }
10054 }
10055
Frank Barchard91317c52019-11-22 10:54:35 -080010056 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010057 TEST_REQUIRES_ARM_NEON;
10058 for (uint32_t n = 16; n <= 24; n += 8) {
10059 for (size_t k = 1; k <= 10; k += 3) {
10060 for (uint32_t m = 1; m <= 5; m++) {
10061 GemmMicrokernelTester()
10062 .mr(5)
10063 .nr(8)
10064 .kr(1)
10065 .sr(1)
10066 .m(m)
10067 .n(n)
10068 .k(k)
10069 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010070 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010071 }
10072 }
10073 }
10074 }
10075
Frank Barchard91317c52019-11-22 10:54:35 -080010076 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010077 TEST_REQUIRES_ARM_NEON;
10078 for (size_t k = 1; k <= 10; k += 3) {
10079 for (uint32_t m = 1; m <= 5; m++) {
10080 for (uint32_t n = 1; n <= 8; n++) {
10081 GemmMicrokernelTester()
10082 .mr(5)
10083 .nr(8)
10084 .kr(1)
10085 .sr(1)
10086 .m(m)
10087 .n(n)
10088 .k(k)
10089 .cm_stride(11)
10090 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010091 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010092 }
10093 }
10094 }
10095 }
10096
Frank Barchard91317c52019-11-22 10:54:35 -080010097 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010098 TEST_REQUIRES_ARM_NEON;
10099 GemmMicrokernelTester()
10100 .mr(5)
10101 .nr(8)
10102 .kr(1)
10103 .sr(1)
10104 .m(5)
10105 .n(8)
10106 .k(2)
10107 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010108 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010109 }
10110
Frank Barchard91317c52019-11-22 10:54:35 -080010111 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010112 TEST_REQUIRES_ARM_NEON;
10113 GemmMicrokernelTester()
10114 .mr(5)
10115 .nr(8)
10116 .kr(1)
10117 .sr(1)
10118 .m(5)
10119 .n(8)
10120 .k(2)
10121 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010122 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010123 }
10124
Frank Barchard91317c52019-11-22 10:54:35 -080010125 TEST(F32_GEMMINC_5X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010126 TEST_REQUIRES_ARM_NEON;
10127 GemmMicrokernelTester()
10128 .mr(5)
10129 .nr(8)
10130 .kr(1)
10131 .sr(1)
10132 .m(5)
10133 .n(8)
10134 .k(2)
10135 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010136 .Test(xnn_f32_gemminc_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010137 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010138#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070010139
10140
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010141#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080010142 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010143 TEST_REQUIRES_ARM_NEON;
10144 GemmMicrokernelTester()
10145 .mr(6)
10146 .nr(8)
10147 .kr(1)
10148 .sr(1)
10149 .m(6)
10150 .n(8)
10151 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080010152 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010153 }
10154
Frank Barchard91317c52019-11-22 10:54:35 -080010155 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010156 TEST_REQUIRES_ARM_NEON;
10157 GemmMicrokernelTester()
10158 .mr(6)
10159 .nr(8)
10160 .kr(1)
10161 .sr(1)
10162 .m(6)
10163 .n(8)
10164 .k(2)
10165 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010166 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010167 }
10168
Frank Barchard91317c52019-11-22 10:54:35 -080010169 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010170 TEST_REQUIRES_ARM_NEON;
10171 GemmMicrokernelTester()
10172 .mr(6)
10173 .nr(8)
10174 .kr(1)
10175 .sr(1)
10176 .m(6)
10177 .n(8)
10178 .k(2)
10179 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080010180 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010181 }
10182
Frank Barchard91317c52019-11-22 10:54:35 -080010183 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010184 TEST_REQUIRES_ARM_NEON;
10185 for (uint32_t m = 1; m <= 6; m++) {
10186 for (uint32_t n = 1; n <= 8; n++) {
10187 GemmMicrokernelTester()
10188 .mr(6)
10189 .nr(8)
10190 .kr(1)
10191 .sr(1)
10192 .m(m)
10193 .n(n)
10194 .k(2)
10195 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010196 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010197 }
10198 }
10199 }
10200
Frank Barchard91317c52019-11-22 10:54:35 -080010201 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010202 TEST_REQUIRES_ARM_NEON;
10203 for (uint32_t m = 1; m <= 6; m++) {
10204 GemmMicrokernelTester()
10205 .mr(6)
10206 .nr(8)
10207 .kr(1)
10208 .sr(1)
10209 .m(m)
10210 .n(8)
10211 .k(2)
10212 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010213 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010214 }
10215 }
10216
Frank Barchard91317c52019-11-22 10:54:35 -080010217 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010218 TEST_REQUIRES_ARM_NEON;
10219 for (uint32_t n = 1; n <= 8; n++) {
10220 GemmMicrokernelTester()
10221 .mr(6)
10222 .nr(8)
10223 .kr(1)
10224 .sr(1)
10225 .m(6)
10226 .n(n)
10227 .k(2)
10228 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010229 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010230 }
10231 }
10232
Frank Barchard91317c52019-11-22 10:54:35 -080010233 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010234 TEST_REQUIRES_ARM_NEON;
10235 for (size_t k = 1; k < 2; k++) {
10236 GemmMicrokernelTester()
10237 .mr(6)
10238 .nr(8)
10239 .kr(1)
10240 .sr(1)
10241 .m(6)
10242 .n(8)
10243 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010244 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010245 }
10246 }
10247
Frank Barchard91317c52019-11-22 10:54:35 -080010248 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010249 TEST_REQUIRES_ARM_NEON;
10250 for (size_t k = 1; k < 2; k++) {
10251 GemmMicrokernelTester()
10252 .mr(6)
10253 .nr(8)
10254 .kr(1)
10255 .sr(1)
10256 .m(6)
10257 .n(8)
10258 .k(k)
10259 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080010260 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010261 }
10262 }
10263
Frank Barchard91317c52019-11-22 10:54:35 -080010264 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010265 TEST_REQUIRES_ARM_NEON;
10266 for (size_t k = 1; k < 2; k++) {
10267 for (uint32_t m = 1; m <= 6; m++) {
10268 for (uint32_t n = 1; n <= 8; n++) {
10269 GemmMicrokernelTester()
10270 .mr(6)
10271 .nr(8)
10272 .kr(1)
10273 .sr(1)
10274 .m(m)
10275 .n(n)
10276 .k(k)
10277 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010278 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010279 }
10280 }
10281 }
10282 }
10283
Frank Barchard91317c52019-11-22 10:54:35 -080010284 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010285 TEST_REQUIRES_ARM_NEON;
10286 for (size_t k = 3; k < 4; k++) {
10287 GemmMicrokernelTester()
10288 .mr(6)
10289 .nr(8)
10290 .kr(1)
10291 .sr(1)
10292 .m(6)
10293 .n(8)
10294 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010295 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010296 }
10297 }
10298
Frank Barchard91317c52019-11-22 10:54:35 -080010299 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010300 TEST_REQUIRES_ARM_NEON;
10301 for (size_t k = 3; k < 4; k++) {
10302 GemmMicrokernelTester()
10303 .mr(6)
10304 .nr(8)
10305 .kr(1)
10306 .sr(1)
10307 .m(6)
10308 .n(8)
10309 .k(k)
10310 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080010311 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010312 }
10313 }
10314
Frank Barchard91317c52019-11-22 10:54:35 -080010315 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010316 TEST_REQUIRES_ARM_NEON;
10317 for (size_t k = 3; k < 4; k++) {
10318 for (uint32_t m = 1; m <= 6; m++) {
10319 for (uint32_t n = 1; n <= 8; n++) {
10320 GemmMicrokernelTester()
10321 .mr(6)
10322 .nr(8)
10323 .kr(1)
10324 .sr(1)
10325 .m(m)
10326 .n(n)
10327 .k(k)
10328 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010329 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010330 }
10331 }
10332 }
10333 }
10334
Frank Barchard91317c52019-11-22 10:54:35 -080010335 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010336 TEST_REQUIRES_ARM_NEON;
10337 for (size_t k = 4; k <= 20; k += 2) {
10338 GemmMicrokernelTester()
10339 .mr(6)
10340 .nr(8)
10341 .kr(1)
10342 .sr(1)
10343 .m(6)
10344 .n(8)
10345 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010346 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010347 }
10348 }
10349
Frank Barchard91317c52019-11-22 10:54:35 -080010350 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010351 TEST_REQUIRES_ARM_NEON;
10352 for (size_t k = 4; k <= 20; k += 2) {
10353 GemmMicrokernelTester()
10354 .mr(6)
10355 .nr(8)
10356 .kr(1)
10357 .sr(1)
10358 .m(6)
10359 .n(8)
10360 .k(k)
10361 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080010362 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010363 }
10364 }
10365
Frank Barchard91317c52019-11-22 10:54:35 -080010366 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010367 TEST_REQUIRES_ARM_NEON;
10368 for (size_t k = 4; k <= 20; k += 2) {
10369 for (uint32_t m = 1; m <= 6; m++) {
10370 for (uint32_t n = 1; n <= 8; n++) {
10371 GemmMicrokernelTester()
10372 .mr(6)
10373 .nr(8)
10374 .kr(1)
10375 .sr(1)
10376 .m(m)
10377 .n(n)
10378 .k(k)
10379 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010380 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010381 }
10382 }
10383 }
10384 }
10385
Frank Barchard91317c52019-11-22 10:54:35 -080010386 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010387 TEST_REQUIRES_ARM_NEON;
10388 for (uint32_t n = 9; n < 16; n++) {
10389 for (size_t k = 1; k <= 10; k += 3) {
10390 GemmMicrokernelTester()
10391 .mr(6)
10392 .nr(8)
10393 .kr(1)
10394 .sr(1)
10395 .m(6)
10396 .n(8)
10397 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010398 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010399 }
10400 }
10401 }
10402
Frank Barchard91317c52019-11-22 10:54:35 -080010403 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010404 TEST_REQUIRES_ARM_NEON;
10405 for (uint32_t n = 9; n < 16; n++) {
10406 for (size_t k = 1; k <= 10; k += 3) {
10407 GemmMicrokernelTester()
10408 .mr(6)
10409 .nr(8)
10410 .kr(1)
10411 .sr(1)
10412 .m(6)
10413 .n(8)
10414 .k(k)
10415 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010416 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010417 }
10418 }
10419 }
10420
Frank Barchard91317c52019-11-22 10:54:35 -080010421 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010422 TEST_REQUIRES_ARM_NEON;
10423 for (uint32_t n = 9; n < 16; n++) {
10424 for (size_t k = 1; k <= 10; k += 3) {
10425 GemmMicrokernelTester()
10426 .mr(6)
10427 .nr(8)
10428 .kr(1)
10429 .sr(1)
10430 .m(6)
10431 .n(n)
10432 .k(k)
10433 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080010434 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010435 }
10436 }
10437 }
10438
Frank Barchard91317c52019-11-22 10:54:35 -080010439 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010440 TEST_REQUIRES_ARM_NEON;
10441 for (uint32_t n = 9; n < 16; n++) {
10442 for (size_t k = 1; k <= 10; k += 3) {
10443 for (uint32_t m = 1; m <= 6; m++) {
10444 GemmMicrokernelTester()
10445 .mr(6)
10446 .nr(8)
10447 .kr(1)
10448 .sr(1)
10449 .m(m)
10450 .n(n)
10451 .k(k)
10452 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010453 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010454 }
10455 }
10456 }
10457 }
10458
Frank Barchard91317c52019-11-22 10:54:35 -080010459 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010460 TEST_REQUIRES_ARM_NEON;
10461 for (uint32_t n = 16; n <= 24; n += 8) {
10462 for (size_t k = 1; k <= 10; k += 3) {
10463 GemmMicrokernelTester()
10464 .mr(6)
10465 .nr(8)
10466 .kr(1)
10467 .sr(1)
10468 .m(6)
10469 .n(8)
10470 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010471 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010472 }
10473 }
10474 }
10475
Frank Barchard91317c52019-11-22 10:54:35 -080010476 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010477 TEST_REQUIRES_ARM_NEON;
10478 for (uint32_t n = 16; n <= 24; n += 8) {
10479 for (size_t k = 1; k <= 10; k += 3) {
10480 GemmMicrokernelTester()
10481 .mr(6)
10482 .nr(8)
10483 .kr(1)
10484 .sr(1)
10485 .m(6)
10486 .n(n)
10487 .k(k)
10488 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010489 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010490 }
10491 }
10492 }
10493
Frank Barchard91317c52019-11-22 10:54:35 -080010494 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010495 TEST_REQUIRES_ARM_NEON;
10496 for (uint32_t n = 16; n <= 24; n += 8) {
10497 for (size_t k = 1; k <= 10; k += 3) {
10498 GemmMicrokernelTester()
10499 .mr(6)
10500 .nr(8)
10501 .kr(1)
10502 .sr(1)
10503 .m(6)
10504 .n(n)
10505 .k(k)
10506 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080010507 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010508 }
10509 }
10510 }
10511
Frank Barchard91317c52019-11-22 10:54:35 -080010512 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010513 TEST_REQUIRES_ARM_NEON;
10514 for (uint32_t n = 16; n <= 24; n += 8) {
10515 for (size_t k = 1; k <= 10; k += 3) {
10516 for (uint32_t m = 1; m <= 6; m++) {
10517 GemmMicrokernelTester()
10518 .mr(6)
10519 .nr(8)
10520 .kr(1)
10521 .sr(1)
10522 .m(m)
10523 .n(n)
10524 .k(k)
10525 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010526 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010527 }
10528 }
10529 }
10530 }
10531
Frank Barchard91317c52019-11-22 10:54:35 -080010532 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010533 TEST_REQUIRES_ARM_NEON;
10534 for (size_t k = 1; k <= 10; k += 3) {
10535 for (uint32_t m = 1; m <= 6; m++) {
10536 for (uint32_t n = 1; n <= 8; n++) {
10537 GemmMicrokernelTester()
10538 .mr(6)
10539 .nr(8)
10540 .kr(1)
10541 .sr(1)
10542 .m(m)
10543 .n(n)
10544 .k(k)
10545 .cm_stride(11)
10546 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010547 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010548 }
10549 }
10550 }
10551 }
10552
Frank Barchard91317c52019-11-22 10:54:35 -080010553 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010554 TEST_REQUIRES_ARM_NEON;
10555 GemmMicrokernelTester()
10556 .mr(6)
10557 .nr(8)
10558 .kr(1)
10559 .sr(1)
10560 .m(6)
10561 .n(8)
10562 .k(2)
10563 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010564 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010565 }
10566
Frank Barchard91317c52019-11-22 10:54:35 -080010567 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010568 TEST_REQUIRES_ARM_NEON;
10569 GemmMicrokernelTester()
10570 .mr(6)
10571 .nr(8)
10572 .kr(1)
10573 .sr(1)
10574 .m(6)
10575 .n(8)
10576 .k(2)
10577 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080010578 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010579 }
10580
Frank Barchard91317c52019-11-22 10:54:35 -080010581 TEST(F32_GEMMINC_6X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010582 TEST_REQUIRES_ARM_NEON;
10583 GemmMicrokernelTester()
10584 .mr(6)
10585 .nr(8)
10586 .kr(1)
10587 .sr(1)
10588 .m(6)
10589 .n(8)
10590 .k(2)
10591 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010592 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010593 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010594#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070010595
10596
Frank Barchard69172d92019-11-26 16:22:39 -080010597#if XNN_ARCH_ARM || XNN_ARCH_ARM64
10598 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4) {
10599 TEST_REQUIRES_ARM_NEON;
10600 GemmMicrokernelTester()
10601 .mr(6)
10602 .nr(8)
10603 .kr(1)
10604 .sr(1)
10605 .m(6)
10606 .n(8)
10607 .k(4)
10608 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10609 }
10610
10611 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, strided_cn) {
10612 TEST_REQUIRES_ARM_NEON;
10613 GemmMicrokernelTester()
10614 .mr(6)
10615 .nr(8)
10616 .kr(1)
10617 .sr(1)
10618 .m(6)
10619 .n(8)
10620 .k(4)
10621 .cn_stride(11)
10622 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10623 }
10624
10625 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_strided_a) {
10626 TEST_REQUIRES_ARM_NEON;
10627 GemmMicrokernelTester()
10628 .mr(6)
10629 .nr(8)
10630 .kr(1)
10631 .sr(1)
10632 .m(6)
10633 .n(8)
10634 .k(4)
10635 .a_stride(7)
10636 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10637 }
10638
10639 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
10640 TEST_REQUIRES_ARM_NEON;
10641 for (uint32_t m = 1; m <= 6; m++) {
10642 for (uint32_t n = 1; n <= 8; n++) {
10643 GemmMicrokernelTester()
10644 .mr(6)
10645 .nr(8)
10646 .kr(1)
10647 .sr(1)
10648 .m(m)
10649 .n(n)
10650 .k(4)
10651 .iterations(1)
10652 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10653 }
10654 }
10655 }
10656
10657 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
10658 TEST_REQUIRES_ARM_NEON;
10659 for (uint32_t m = 1; m <= 6; m++) {
10660 GemmMicrokernelTester()
10661 .mr(6)
10662 .nr(8)
10663 .kr(1)
10664 .sr(1)
10665 .m(m)
10666 .n(8)
10667 .k(4)
10668 .iterations(1)
10669 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10670 }
10671 }
10672
10673 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
10674 TEST_REQUIRES_ARM_NEON;
10675 for (uint32_t n = 1; n <= 8; n++) {
10676 GemmMicrokernelTester()
10677 .mr(6)
10678 .nr(8)
10679 .kr(1)
10680 .sr(1)
10681 .m(6)
10682 .n(n)
10683 .k(4)
10684 .iterations(1)
10685 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10686 }
10687 }
10688
10689 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_lt_4) {
10690 TEST_REQUIRES_ARM_NEON;
10691 for (size_t k = 1; k < 4; k++) {
10692 GemmMicrokernelTester()
10693 .mr(6)
10694 .nr(8)
10695 .kr(1)
10696 .sr(1)
10697 .m(6)
10698 .n(8)
10699 .k(k)
10700 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10701 }
10702 }
10703
10704 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_lt_4_strided_a) {
10705 TEST_REQUIRES_ARM_NEON;
10706 for (size_t k = 1; k < 4; k++) {
10707 GemmMicrokernelTester()
10708 .mr(6)
10709 .nr(8)
10710 .kr(1)
10711 .sr(1)
10712 .m(6)
10713 .n(8)
10714 .k(k)
10715 .a_stride(7)
10716 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10717 }
10718 }
10719
10720 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
10721 TEST_REQUIRES_ARM_NEON;
10722 for (size_t k = 1; k < 4; k++) {
10723 for (uint32_t m = 1; m <= 6; m++) {
10724 for (uint32_t n = 1; n <= 8; n++) {
10725 GemmMicrokernelTester()
10726 .mr(6)
10727 .nr(8)
10728 .kr(1)
10729 .sr(1)
10730 .m(m)
10731 .n(n)
10732 .k(k)
10733 .iterations(1)
10734 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10735 }
10736 }
10737 }
10738 }
10739
10740 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_gt_4) {
10741 TEST_REQUIRES_ARM_NEON;
10742 for (size_t k = 5; k < 8; k++) {
10743 GemmMicrokernelTester()
10744 .mr(6)
10745 .nr(8)
10746 .kr(1)
10747 .sr(1)
10748 .m(6)
10749 .n(8)
10750 .k(k)
10751 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10752 }
10753 }
10754
10755 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_gt_4_strided_a) {
10756 TEST_REQUIRES_ARM_NEON;
10757 for (size_t k = 5; k < 8; k++) {
10758 GemmMicrokernelTester()
10759 .mr(6)
10760 .nr(8)
10761 .kr(1)
10762 .sr(1)
10763 .m(6)
10764 .n(8)
10765 .k(k)
10766 .a_stride(11)
10767 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10768 }
10769 }
10770
10771 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
10772 TEST_REQUIRES_ARM_NEON;
10773 for (size_t k = 5; k < 8; k++) {
10774 for (uint32_t m = 1; m <= 6; m++) {
10775 for (uint32_t n = 1; n <= 8; n++) {
10776 GemmMicrokernelTester()
10777 .mr(6)
10778 .nr(8)
10779 .kr(1)
10780 .sr(1)
10781 .m(m)
10782 .n(n)
10783 .k(k)
10784 .iterations(1)
10785 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10786 }
10787 }
10788 }
10789 }
10790
10791 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_div_4) {
10792 TEST_REQUIRES_ARM_NEON;
10793 for (size_t k = 8; k <= 40; k += 4) {
10794 GemmMicrokernelTester()
10795 .mr(6)
10796 .nr(8)
10797 .kr(1)
10798 .sr(1)
10799 .m(6)
10800 .n(8)
10801 .k(k)
10802 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10803 }
10804 }
10805
10806 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_div_4_strided_a) {
10807 TEST_REQUIRES_ARM_NEON;
10808 for (size_t k = 8; k <= 40; k += 4) {
10809 GemmMicrokernelTester()
10810 .mr(6)
10811 .nr(8)
10812 .kr(1)
10813 .sr(1)
10814 .m(6)
10815 .n(8)
10816 .k(k)
10817 .a_stride(43)
10818 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10819 }
10820 }
10821
10822 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, k_div_4_subtile) {
10823 TEST_REQUIRES_ARM_NEON;
10824 for (size_t k = 8; k <= 40; k += 4) {
10825 for (uint32_t m = 1; m <= 6; m++) {
10826 for (uint32_t n = 1; n <= 8; n++) {
10827 GemmMicrokernelTester()
10828 .mr(6)
10829 .nr(8)
10830 .kr(1)
10831 .sr(1)
10832 .m(m)
10833 .n(n)
10834 .k(k)
10835 .iterations(1)
10836 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10837 }
10838 }
10839 }
10840 }
10841
10842 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8) {
10843 TEST_REQUIRES_ARM_NEON;
10844 for (uint32_t n = 9; n < 16; n++) {
10845 for (size_t k = 1; k <= 20; k += 5) {
10846 GemmMicrokernelTester()
10847 .mr(6)
10848 .nr(8)
10849 .kr(1)
10850 .sr(1)
10851 .m(6)
10852 .n(8)
10853 .k(k)
10854 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10855 }
10856 }
10857 }
10858
10859 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
10860 TEST_REQUIRES_ARM_NEON;
10861 for (uint32_t n = 9; n < 16; n++) {
10862 for (size_t k = 1; k <= 20; k += 5) {
10863 GemmMicrokernelTester()
10864 .mr(6)
10865 .nr(8)
10866 .kr(1)
10867 .sr(1)
10868 .m(6)
10869 .n(8)
10870 .k(k)
10871 .cn_stride(11)
10872 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10873 }
10874 }
10875 }
10876
10877 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8_strided_a) {
10878 TEST_REQUIRES_ARM_NEON;
10879 for (uint32_t n = 9; n < 16; n++) {
10880 for (size_t k = 1; k <= 20; k += 5) {
10881 GemmMicrokernelTester()
10882 .mr(6)
10883 .nr(8)
10884 .kr(1)
10885 .sr(1)
10886 .m(6)
10887 .n(n)
10888 .k(k)
10889 .a_stride(23)
10890 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10891 }
10892 }
10893 }
10894
10895 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
10896 TEST_REQUIRES_ARM_NEON;
10897 for (uint32_t n = 9; n < 16; n++) {
10898 for (size_t k = 1; k <= 20; k += 5) {
10899 for (uint32_t m = 1; m <= 6; m++) {
10900 GemmMicrokernelTester()
10901 .mr(6)
10902 .nr(8)
10903 .kr(1)
10904 .sr(1)
10905 .m(m)
10906 .n(n)
10907 .k(k)
10908 .iterations(1)
10909 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10910 }
10911 }
10912 }
10913 }
10914
10915 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8) {
10916 TEST_REQUIRES_ARM_NEON;
10917 for (uint32_t n = 16; n <= 24; n += 8) {
10918 for (size_t k = 1; k <= 20; k += 5) {
10919 GemmMicrokernelTester()
10920 .mr(6)
10921 .nr(8)
10922 .kr(1)
10923 .sr(1)
10924 .m(6)
10925 .n(8)
10926 .k(k)
10927 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10928 }
10929 }
10930 }
10931
10932 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
10933 TEST_REQUIRES_ARM_NEON;
10934 for (uint32_t n = 16; n <= 24; n += 8) {
10935 for (size_t k = 1; k <= 20; k += 5) {
10936 GemmMicrokernelTester()
10937 .mr(6)
10938 .nr(8)
10939 .kr(1)
10940 .sr(1)
10941 .m(6)
10942 .n(n)
10943 .k(k)
10944 .cn_stride(11)
10945 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10946 }
10947 }
10948 }
10949
10950 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8_strided_a) {
10951 TEST_REQUIRES_ARM_NEON;
10952 for (uint32_t n = 16; n <= 24; n += 8) {
10953 for (size_t k = 1; k <= 20; k += 5) {
10954 GemmMicrokernelTester()
10955 .mr(6)
10956 .nr(8)
10957 .kr(1)
10958 .sr(1)
10959 .m(6)
10960 .n(n)
10961 .k(k)
10962 .a_stride(23)
10963 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10964 }
10965 }
10966 }
10967
10968 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, n_div_8_subtile) {
10969 TEST_REQUIRES_ARM_NEON;
10970 for (uint32_t n = 16; n <= 24; n += 8) {
10971 for (size_t k = 1; k <= 20; k += 5) {
10972 for (uint32_t m = 1; m <= 6; m++) {
10973 GemmMicrokernelTester()
10974 .mr(6)
10975 .nr(8)
10976 .kr(1)
10977 .sr(1)
10978 .m(m)
10979 .n(n)
10980 .k(k)
10981 .iterations(1)
10982 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
10983 }
10984 }
10985 }
10986 }
10987
10988 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, strided_cm_subtile) {
10989 TEST_REQUIRES_ARM_NEON;
10990 for (size_t k = 1; k <= 20; k += 5) {
10991 for (uint32_t m = 1; m <= 6; m++) {
10992 for (uint32_t n = 1; n <= 8; n++) {
10993 GemmMicrokernelTester()
10994 .mr(6)
10995 .nr(8)
10996 .kr(1)
10997 .sr(1)
10998 .m(m)
10999 .n(n)
11000 .k(k)
11001 .cm_stride(11)
11002 .iterations(1)
11003 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
11004 }
11005 }
11006 }
11007 }
11008
11009 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, qmin) {
11010 TEST_REQUIRES_ARM_NEON;
11011 GemmMicrokernelTester()
11012 .mr(6)
11013 .nr(8)
11014 .kr(1)
11015 .sr(1)
11016 .m(6)
11017 .n(8)
11018 .k(4)
11019 .qmin(128)
11020 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
11021 }
11022
11023 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, qmax) {
11024 TEST_REQUIRES_ARM_NEON;
11025 GemmMicrokernelTester()
11026 .mr(6)
11027 .nr(8)
11028 .kr(1)
11029 .sr(1)
11030 .m(6)
11031 .n(8)
11032 .k(4)
11033 .qmax(128)
11034 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
11035 }
11036
11037 TEST(F32_GEMMINC_6X8__NEON_LANE_LD128, strided_cm) {
11038 TEST_REQUIRES_ARM_NEON;
11039 GemmMicrokernelTester()
11040 .mr(6)
11041 .nr(8)
11042 .kr(1)
11043 .sr(1)
11044 .m(6)
11045 .n(8)
11046 .k(4)
11047 .cm_stride(11)
11048 .Test(xnn_f32_gemminc_ukernel_6x8__neon_lane_ld128);
11049 }
11050#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
11051
11052
Frank Barchard91317c52019-11-22 10:54:35 -080011053#if XNN_ARCH_ARM64
11054 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011055 TEST_REQUIRES_ARM_NEON_FMA;
11056 GemmMicrokernelTester()
11057 .mr(1)
11058 .nr(8)
11059 .kr(1)
11060 .sr(1)
11061 .m(1)
11062 .n(8)
11063 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080011064 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011065 }
11066
Frank Barchard91317c52019-11-22 10:54:35 -080011067 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011068 TEST_REQUIRES_ARM_NEON_FMA;
11069 GemmMicrokernelTester()
11070 .mr(1)
11071 .nr(8)
11072 .kr(1)
11073 .sr(1)
11074 .m(1)
11075 .n(8)
11076 .k(2)
11077 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011078 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011079 }
11080
Frank Barchard91317c52019-11-22 10:54:35 -080011081 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011082 TEST_REQUIRES_ARM_NEON_FMA;
11083 GemmMicrokernelTester()
11084 .mr(1)
11085 .nr(8)
11086 .kr(1)
11087 .sr(1)
11088 .m(1)
11089 .n(8)
11090 .k(2)
11091 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011092 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011093 }
11094
Frank Barchard91317c52019-11-22 10:54:35 -080011095 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011096 TEST_REQUIRES_ARM_NEON_FMA;
11097 for (uint32_t m = 1; m <= 1; m++) {
11098 for (uint32_t n = 1; n <= 8; n++) {
11099 GemmMicrokernelTester()
11100 .mr(1)
11101 .nr(8)
11102 .kr(1)
11103 .sr(1)
11104 .m(m)
11105 .n(n)
11106 .k(2)
11107 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011108 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011109 }
11110 }
11111 }
11112
Frank Barchard91317c52019-11-22 10:54:35 -080011113 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011114 TEST_REQUIRES_ARM_NEON_FMA;
11115 for (uint32_t m = 1; m <= 1; m++) {
11116 GemmMicrokernelTester()
11117 .mr(1)
11118 .nr(8)
11119 .kr(1)
11120 .sr(1)
11121 .m(m)
11122 .n(8)
11123 .k(2)
11124 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011125 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011126 }
11127 }
11128
Frank Barchard91317c52019-11-22 10:54:35 -080011129 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011130 TEST_REQUIRES_ARM_NEON_FMA;
11131 for (uint32_t n = 1; n <= 8; n++) {
11132 GemmMicrokernelTester()
11133 .mr(1)
11134 .nr(8)
11135 .kr(1)
11136 .sr(1)
11137 .m(1)
11138 .n(n)
11139 .k(2)
11140 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011141 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011142 }
11143 }
11144
Frank Barchard91317c52019-11-22 10:54:35 -080011145 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011146 TEST_REQUIRES_ARM_NEON_FMA;
11147 for (size_t k = 1; k < 2; k++) {
11148 GemmMicrokernelTester()
11149 .mr(1)
11150 .nr(8)
11151 .kr(1)
11152 .sr(1)
11153 .m(1)
11154 .n(8)
11155 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011156 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011157 }
11158 }
11159
Frank Barchard91317c52019-11-22 10:54:35 -080011160 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011161 TEST_REQUIRES_ARM_NEON_FMA;
11162 for (size_t k = 1; k < 2; k++) {
11163 GemmMicrokernelTester()
11164 .mr(1)
11165 .nr(8)
11166 .kr(1)
11167 .sr(1)
11168 .m(1)
11169 .n(8)
11170 .k(k)
11171 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011172 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011173 }
11174 }
11175
Frank Barchard91317c52019-11-22 10:54:35 -080011176 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011177 TEST_REQUIRES_ARM_NEON_FMA;
11178 for (size_t k = 1; k < 2; k++) {
11179 for (uint32_t m = 1; m <= 1; m++) {
11180 for (uint32_t n = 1; n <= 8; n++) {
11181 GemmMicrokernelTester()
11182 .mr(1)
11183 .nr(8)
11184 .kr(1)
11185 .sr(1)
11186 .m(m)
11187 .n(n)
11188 .k(k)
11189 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011190 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011191 }
11192 }
11193 }
11194 }
11195
Frank Barchard91317c52019-11-22 10:54:35 -080011196 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011197 TEST_REQUIRES_ARM_NEON_FMA;
11198 for (size_t k = 3; k < 4; k++) {
11199 GemmMicrokernelTester()
11200 .mr(1)
11201 .nr(8)
11202 .kr(1)
11203 .sr(1)
11204 .m(1)
11205 .n(8)
11206 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011207 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011208 }
11209 }
11210
Frank Barchard91317c52019-11-22 10:54:35 -080011211 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011212 TEST_REQUIRES_ARM_NEON_FMA;
11213 for (size_t k = 3; k < 4; k++) {
11214 GemmMicrokernelTester()
11215 .mr(1)
11216 .nr(8)
11217 .kr(1)
11218 .sr(1)
11219 .m(1)
11220 .n(8)
11221 .k(k)
11222 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080011223 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011224 }
11225 }
11226
Frank Barchard91317c52019-11-22 10:54:35 -080011227 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011228 TEST_REQUIRES_ARM_NEON_FMA;
11229 for (size_t k = 3; k < 4; k++) {
11230 for (uint32_t m = 1; m <= 1; m++) {
11231 for (uint32_t n = 1; n <= 8; n++) {
11232 GemmMicrokernelTester()
11233 .mr(1)
11234 .nr(8)
11235 .kr(1)
11236 .sr(1)
11237 .m(m)
11238 .n(n)
11239 .k(k)
11240 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011241 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011242 }
11243 }
11244 }
11245 }
11246
Frank Barchard91317c52019-11-22 10:54:35 -080011247 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011248 TEST_REQUIRES_ARM_NEON_FMA;
11249 for (size_t k = 4; k <= 20; k += 2) {
11250 GemmMicrokernelTester()
11251 .mr(1)
11252 .nr(8)
11253 .kr(1)
11254 .sr(1)
11255 .m(1)
11256 .n(8)
11257 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011258 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011259 }
11260 }
11261
Frank Barchard91317c52019-11-22 10:54:35 -080011262 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011263 TEST_REQUIRES_ARM_NEON_FMA;
11264 for (size_t k = 4; k <= 20; k += 2) {
11265 GemmMicrokernelTester()
11266 .mr(1)
11267 .nr(8)
11268 .kr(1)
11269 .sr(1)
11270 .m(1)
11271 .n(8)
11272 .k(k)
11273 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080011274 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011275 }
11276 }
11277
Frank Barchard91317c52019-11-22 10:54:35 -080011278 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011279 TEST_REQUIRES_ARM_NEON_FMA;
11280 for (size_t k = 4; k <= 20; k += 2) {
11281 for (uint32_t m = 1; m <= 1; m++) {
11282 for (uint32_t n = 1; n <= 8; n++) {
11283 GemmMicrokernelTester()
11284 .mr(1)
11285 .nr(8)
11286 .kr(1)
11287 .sr(1)
11288 .m(m)
11289 .n(n)
11290 .k(k)
11291 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011292 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011293 }
11294 }
11295 }
11296 }
11297
Frank Barchard91317c52019-11-22 10:54:35 -080011298 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011299 TEST_REQUIRES_ARM_NEON_FMA;
11300 for (uint32_t n = 9; n < 16; n++) {
11301 for (size_t k = 1; k <= 10; k += 3) {
11302 GemmMicrokernelTester()
11303 .mr(1)
11304 .nr(8)
11305 .kr(1)
11306 .sr(1)
11307 .m(1)
11308 .n(8)
11309 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011310 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011311 }
11312 }
11313 }
11314
Frank Barchard91317c52019-11-22 10:54:35 -080011315 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011316 TEST_REQUIRES_ARM_NEON_FMA;
11317 for (uint32_t n = 9; n < 16; n++) {
11318 for (size_t k = 1; k <= 10; k += 3) {
11319 GemmMicrokernelTester()
11320 .mr(1)
11321 .nr(8)
11322 .kr(1)
11323 .sr(1)
11324 .m(1)
11325 .n(8)
11326 .k(k)
11327 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011328 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011329 }
11330 }
11331 }
11332
Frank Barchard91317c52019-11-22 10:54:35 -080011333 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011334 TEST_REQUIRES_ARM_NEON_FMA;
11335 for (uint32_t n = 9; n < 16; n++) {
11336 for (size_t k = 1; k <= 10; k += 3) {
11337 GemmMicrokernelTester()
11338 .mr(1)
11339 .nr(8)
11340 .kr(1)
11341 .sr(1)
11342 .m(1)
11343 .n(n)
11344 .k(k)
11345 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011346 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011347 }
11348 }
11349 }
11350
Frank Barchard91317c52019-11-22 10:54:35 -080011351 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011352 TEST_REQUIRES_ARM_NEON_FMA;
11353 for (uint32_t n = 9; n < 16; n++) {
11354 for (size_t k = 1; k <= 10; k += 3) {
11355 for (uint32_t m = 1; m <= 1; m++) {
11356 GemmMicrokernelTester()
11357 .mr(1)
11358 .nr(8)
11359 .kr(1)
11360 .sr(1)
11361 .m(m)
11362 .n(n)
11363 .k(k)
11364 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011365 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011366 }
11367 }
11368 }
11369 }
11370
Frank Barchard91317c52019-11-22 10:54:35 -080011371 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011372 TEST_REQUIRES_ARM_NEON_FMA;
11373 for (uint32_t n = 16; n <= 24; n += 8) {
11374 for (size_t k = 1; k <= 10; k += 3) {
11375 GemmMicrokernelTester()
11376 .mr(1)
11377 .nr(8)
11378 .kr(1)
11379 .sr(1)
11380 .m(1)
11381 .n(8)
11382 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011383 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011384 }
11385 }
11386 }
11387
Frank Barchard91317c52019-11-22 10:54:35 -080011388 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011389 TEST_REQUIRES_ARM_NEON_FMA;
11390 for (uint32_t n = 16; n <= 24; n += 8) {
11391 for (size_t k = 1; k <= 10; k += 3) {
11392 GemmMicrokernelTester()
11393 .mr(1)
11394 .nr(8)
11395 .kr(1)
11396 .sr(1)
11397 .m(1)
11398 .n(n)
11399 .k(k)
11400 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011401 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011402 }
11403 }
11404 }
11405
Frank Barchard91317c52019-11-22 10:54:35 -080011406 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011407 TEST_REQUIRES_ARM_NEON_FMA;
11408 for (uint32_t n = 16; n <= 24; n += 8) {
11409 for (size_t k = 1; k <= 10; k += 3) {
11410 GemmMicrokernelTester()
11411 .mr(1)
11412 .nr(8)
11413 .kr(1)
11414 .sr(1)
11415 .m(1)
11416 .n(n)
11417 .k(k)
11418 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011419 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011420 }
11421 }
11422 }
11423
Frank Barchard91317c52019-11-22 10:54:35 -080011424 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011425 TEST_REQUIRES_ARM_NEON_FMA;
11426 for (uint32_t n = 16; n <= 24; n += 8) {
11427 for (size_t k = 1; k <= 10; k += 3) {
11428 for (uint32_t m = 1; m <= 1; m++) {
11429 GemmMicrokernelTester()
11430 .mr(1)
11431 .nr(8)
11432 .kr(1)
11433 .sr(1)
11434 .m(m)
11435 .n(n)
11436 .k(k)
11437 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011438 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011439 }
11440 }
11441 }
11442 }
11443
Frank Barchard91317c52019-11-22 10:54:35 -080011444 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011445 TEST_REQUIRES_ARM_NEON_FMA;
11446 for (size_t k = 1; k <= 10; k += 3) {
11447 for (uint32_t m = 1; m <= 1; m++) {
11448 for (uint32_t n = 1; n <= 8; n++) {
11449 GemmMicrokernelTester()
11450 .mr(1)
11451 .nr(8)
11452 .kr(1)
11453 .sr(1)
11454 .m(m)
11455 .n(n)
11456 .k(k)
11457 .cm_stride(11)
11458 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011459 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011460 }
11461 }
11462 }
11463 }
11464
Frank Barchard91317c52019-11-22 10:54:35 -080011465 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011466 TEST_REQUIRES_ARM_NEON_FMA;
11467 GemmMicrokernelTester()
11468 .mr(1)
11469 .nr(8)
11470 .kr(1)
11471 .sr(1)
11472 .m(1)
11473 .n(8)
11474 .k(2)
11475 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011476 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011477 }
11478
Frank Barchard91317c52019-11-22 10:54:35 -080011479 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011480 TEST_REQUIRES_ARM_NEON_FMA;
11481 GemmMicrokernelTester()
11482 .mr(1)
11483 .nr(8)
11484 .kr(1)
11485 .sr(1)
11486 .m(1)
11487 .n(8)
11488 .k(2)
11489 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011490 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011491 }
11492
Frank Barchard91317c52019-11-22 10:54:35 -080011493 TEST(F32_GEMMINC_1X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011494 TEST_REQUIRES_ARM_NEON_FMA;
11495 GemmMicrokernelTester()
11496 .mr(1)
11497 .nr(8)
11498 .kr(1)
11499 .sr(1)
11500 .m(1)
11501 .n(8)
11502 .k(2)
11503 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011504 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011505 }
Frank Barchard91317c52019-11-22 10:54:35 -080011506#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070011507
11508
Frank Barchard91317c52019-11-22 10:54:35 -080011509#if XNN_ARCH_ARM64
11510 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011511 TEST_REQUIRES_ARM_NEON_FMA;
11512 GemmMicrokernelTester()
11513 .mr(4)
11514 .nr(8)
11515 .kr(1)
11516 .sr(1)
11517 .m(4)
11518 .n(8)
11519 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080011520 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011521 }
11522
Frank Barchard91317c52019-11-22 10:54:35 -080011523 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011524 TEST_REQUIRES_ARM_NEON_FMA;
11525 GemmMicrokernelTester()
11526 .mr(4)
11527 .nr(8)
11528 .kr(1)
11529 .sr(1)
11530 .m(4)
11531 .n(8)
11532 .k(2)
11533 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011534 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011535 }
11536
Frank Barchard91317c52019-11-22 10:54:35 -080011537 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011538 TEST_REQUIRES_ARM_NEON_FMA;
11539 GemmMicrokernelTester()
11540 .mr(4)
11541 .nr(8)
11542 .kr(1)
11543 .sr(1)
11544 .m(4)
11545 .n(8)
11546 .k(2)
11547 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011548 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011549 }
11550
Frank Barchard91317c52019-11-22 10:54:35 -080011551 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011552 TEST_REQUIRES_ARM_NEON_FMA;
11553 for (uint32_t m = 1; m <= 4; m++) {
11554 for (uint32_t n = 1; n <= 8; n++) {
11555 GemmMicrokernelTester()
11556 .mr(4)
11557 .nr(8)
11558 .kr(1)
11559 .sr(1)
11560 .m(m)
11561 .n(n)
11562 .k(2)
11563 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011564 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011565 }
11566 }
11567 }
11568
Frank Barchard91317c52019-11-22 10:54:35 -080011569 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011570 TEST_REQUIRES_ARM_NEON_FMA;
11571 for (uint32_t m = 1; m <= 4; m++) {
11572 GemmMicrokernelTester()
11573 .mr(4)
11574 .nr(8)
11575 .kr(1)
11576 .sr(1)
11577 .m(m)
11578 .n(8)
11579 .k(2)
11580 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011581 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011582 }
11583 }
11584
Frank Barchard91317c52019-11-22 10:54:35 -080011585 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011586 TEST_REQUIRES_ARM_NEON_FMA;
11587 for (uint32_t n = 1; n <= 8; n++) {
11588 GemmMicrokernelTester()
11589 .mr(4)
11590 .nr(8)
11591 .kr(1)
11592 .sr(1)
11593 .m(4)
11594 .n(n)
11595 .k(2)
11596 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011597 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011598 }
11599 }
11600
Frank Barchard91317c52019-11-22 10:54:35 -080011601 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011602 TEST_REQUIRES_ARM_NEON_FMA;
11603 for (size_t k = 1; k < 2; k++) {
11604 GemmMicrokernelTester()
11605 .mr(4)
11606 .nr(8)
11607 .kr(1)
11608 .sr(1)
11609 .m(4)
11610 .n(8)
11611 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011612 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011613 }
11614 }
11615
Frank Barchard91317c52019-11-22 10:54:35 -080011616 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011617 TEST_REQUIRES_ARM_NEON_FMA;
11618 for (size_t k = 1; k < 2; k++) {
11619 GemmMicrokernelTester()
11620 .mr(4)
11621 .nr(8)
11622 .kr(1)
11623 .sr(1)
11624 .m(4)
11625 .n(8)
11626 .k(k)
11627 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011628 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011629 }
11630 }
11631
Frank Barchard91317c52019-11-22 10:54:35 -080011632 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011633 TEST_REQUIRES_ARM_NEON_FMA;
11634 for (size_t k = 1; k < 2; k++) {
11635 for (uint32_t m = 1; m <= 4; m++) {
11636 for (uint32_t n = 1; n <= 8; n++) {
11637 GemmMicrokernelTester()
11638 .mr(4)
11639 .nr(8)
11640 .kr(1)
11641 .sr(1)
11642 .m(m)
11643 .n(n)
11644 .k(k)
11645 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011646 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011647 }
11648 }
11649 }
11650 }
11651
Frank Barchard91317c52019-11-22 10:54:35 -080011652 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011653 TEST_REQUIRES_ARM_NEON_FMA;
11654 for (size_t k = 3; k < 4; k++) {
11655 GemmMicrokernelTester()
11656 .mr(4)
11657 .nr(8)
11658 .kr(1)
11659 .sr(1)
11660 .m(4)
11661 .n(8)
11662 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011663 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011664 }
11665 }
11666
Frank Barchard91317c52019-11-22 10:54:35 -080011667 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011668 TEST_REQUIRES_ARM_NEON_FMA;
11669 for (size_t k = 3; k < 4; k++) {
11670 GemmMicrokernelTester()
11671 .mr(4)
11672 .nr(8)
11673 .kr(1)
11674 .sr(1)
11675 .m(4)
11676 .n(8)
11677 .k(k)
11678 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080011679 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011680 }
11681 }
11682
Frank Barchard91317c52019-11-22 10:54:35 -080011683 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011684 TEST_REQUIRES_ARM_NEON_FMA;
11685 for (size_t k = 3; k < 4; k++) {
11686 for (uint32_t m = 1; m <= 4; m++) {
11687 for (uint32_t n = 1; n <= 8; n++) {
11688 GemmMicrokernelTester()
11689 .mr(4)
11690 .nr(8)
11691 .kr(1)
11692 .sr(1)
11693 .m(m)
11694 .n(n)
11695 .k(k)
11696 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011697 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011698 }
11699 }
11700 }
11701 }
11702
Frank Barchard91317c52019-11-22 10:54:35 -080011703 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011704 TEST_REQUIRES_ARM_NEON_FMA;
11705 for (size_t k = 4; k <= 20; k += 2) {
11706 GemmMicrokernelTester()
11707 .mr(4)
11708 .nr(8)
11709 .kr(1)
11710 .sr(1)
11711 .m(4)
11712 .n(8)
11713 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011714 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011715 }
11716 }
11717
Frank Barchard91317c52019-11-22 10:54:35 -080011718 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011719 TEST_REQUIRES_ARM_NEON_FMA;
11720 for (size_t k = 4; k <= 20; k += 2) {
11721 GemmMicrokernelTester()
11722 .mr(4)
11723 .nr(8)
11724 .kr(1)
11725 .sr(1)
11726 .m(4)
11727 .n(8)
11728 .k(k)
11729 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080011730 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011731 }
11732 }
11733
Frank Barchard91317c52019-11-22 10:54:35 -080011734 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011735 TEST_REQUIRES_ARM_NEON_FMA;
11736 for (size_t k = 4; k <= 20; k += 2) {
11737 for (uint32_t m = 1; m <= 4; m++) {
11738 for (uint32_t n = 1; n <= 8; n++) {
11739 GemmMicrokernelTester()
11740 .mr(4)
11741 .nr(8)
11742 .kr(1)
11743 .sr(1)
11744 .m(m)
11745 .n(n)
11746 .k(k)
11747 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011748 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011749 }
11750 }
11751 }
11752 }
11753
Frank Barchard91317c52019-11-22 10:54:35 -080011754 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011755 TEST_REQUIRES_ARM_NEON_FMA;
11756 for (uint32_t n = 9; n < 16; n++) {
11757 for (size_t k = 1; k <= 10; k += 3) {
11758 GemmMicrokernelTester()
11759 .mr(4)
11760 .nr(8)
11761 .kr(1)
11762 .sr(1)
11763 .m(4)
11764 .n(8)
11765 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011766 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011767 }
11768 }
11769 }
11770
Frank Barchard91317c52019-11-22 10:54:35 -080011771 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011772 TEST_REQUIRES_ARM_NEON_FMA;
11773 for (uint32_t n = 9; n < 16; n++) {
11774 for (size_t k = 1; k <= 10; k += 3) {
11775 GemmMicrokernelTester()
11776 .mr(4)
11777 .nr(8)
11778 .kr(1)
11779 .sr(1)
11780 .m(4)
11781 .n(8)
11782 .k(k)
11783 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011784 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011785 }
11786 }
11787 }
11788
Frank Barchard91317c52019-11-22 10:54:35 -080011789 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011790 TEST_REQUIRES_ARM_NEON_FMA;
11791 for (uint32_t n = 9; n < 16; n++) {
11792 for (size_t k = 1; k <= 10; k += 3) {
11793 GemmMicrokernelTester()
11794 .mr(4)
11795 .nr(8)
11796 .kr(1)
11797 .sr(1)
11798 .m(4)
11799 .n(n)
11800 .k(k)
11801 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011802 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011803 }
11804 }
11805 }
11806
Frank Barchard91317c52019-11-22 10:54:35 -080011807 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011808 TEST_REQUIRES_ARM_NEON_FMA;
11809 for (uint32_t n = 9; n < 16; n++) {
11810 for (size_t k = 1; k <= 10; k += 3) {
11811 for (uint32_t m = 1; m <= 4; m++) {
11812 GemmMicrokernelTester()
11813 .mr(4)
11814 .nr(8)
11815 .kr(1)
11816 .sr(1)
11817 .m(m)
11818 .n(n)
11819 .k(k)
11820 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011821 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011822 }
11823 }
11824 }
11825 }
11826
Frank Barchard91317c52019-11-22 10:54:35 -080011827 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011828 TEST_REQUIRES_ARM_NEON_FMA;
11829 for (uint32_t n = 16; n <= 24; n += 8) {
11830 for (size_t k = 1; k <= 10; k += 3) {
11831 GemmMicrokernelTester()
11832 .mr(4)
11833 .nr(8)
11834 .kr(1)
11835 .sr(1)
11836 .m(4)
11837 .n(8)
11838 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011839 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011840 }
11841 }
11842 }
11843
Frank Barchard91317c52019-11-22 10:54:35 -080011844 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011845 TEST_REQUIRES_ARM_NEON_FMA;
11846 for (uint32_t n = 16; n <= 24; n += 8) {
11847 for (size_t k = 1; k <= 10; k += 3) {
11848 GemmMicrokernelTester()
11849 .mr(4)
11850 .nr(8)
11851 .kr(1)
11852 .sr(1)
11853 .m(4)
11854 .n(n)
11855 .k(k)
11856 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011857 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011858 }
11859 }
11860 }
11861
Frank Barchard91317c52019-11-22 10:54:35 -080011862 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011863 TEST_REQUIRES_ARM_NEON_FMA;
11864 for (uint32_t n = 16; n <= 24; n += 8) {
11865 for (size_t k = 1; k <= 10; k += 3) {
11866 GemmMicrokernelTester()
11867 .mr(4)
11868 .nr(8)
11869 .kr(1)
11870 .sr(1)
11871 .m(4)
11872 .n(n)
11873 .k(k)
11874 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011875 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011876 }
11877 }
11878 }
11879
Frank Barchard91317c52019-11-22 10:54:35 -080011880 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011881 TEST_REQUIRES_ARM_NEON_FMA;
11882 for (uint32_t n = 16; n <= 24; n += 8) {
11883 for (size_t k = 1; k <= 10; k += 3) {
11884 for (uint32_t m = 1; m <= 4; m++) {
11885 GemmMicrokernelTester()
11886 .mr(4)
11887 .nr(8)
11888 .kr(1)
11889 .sr(1)
11890 .m(m)
11891 .n(n)
11892 .k(k)
11893 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011894 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011895 }
11896 }
11897 }
11898 }
11899
Frank Barchard91317c52019-11-22 10:54:35 -080011900 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011901 TEST_REQUIRES_ARM_NEON_FMA;
11902 for (size_t k = 1; k <= 10; k += 3) {
11903 for (uint32_t m = 1; m <= 4; m++) {
11904 for (uint32_t n = 1; n <= 8; n++) {
11905 GemmMicrokernelTester()
11906 .mr(4)
11907 .nr(8)
11908 .kr(1)
11909 .sr(1)
11910 .m(m)
11911 .n(n)
11912 .k(k)
11913 .cm_stride(11)
11914 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011915 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011916 }
11917 }
11918 }
11919 }
11920
Frank Barchard91317c52019-11-22 10:54:35 -080011921 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011922 TEST_REQUIRES_ARM_NEON_FMA;
11923 GemmMicrokernelTester()
11924 .mr(4)
11925 .nr(8)
11926 .kr(1)
11927 .sr(1)
11928 .m(4)
11929 .n(8)
11930 .k(2)
11931 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011932 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011933 }
11934
Frank Barchard91317c52019-11-22 10:54:35 -080011935 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011936 TEST_REQUIRES_ARM_NEON_FMA;
11937 GemmMicrokernelTester()
11938 .mr(4)
11939 .nr(8)
11940 .kr(1)
11941 .sr(1)
11942 .m(4)
11943 .n(8)
11944 .k(2)
11945 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011946 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011947 }
11948
Frank Barchard91317c52019-11-22 10:54:35 -080011949 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011950 TEST_REQUIRES_ARM_NEON_FMA;
11951 GemmMicrokernelTester()
11952 .mr(4)
11953 .nr(8)
11954 .kr(1)
11955 .sr(1)
11956 .m(4)
11957 .n(8)
11958 .k(2)
11959 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011960 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011961 }
Frank Barchard91317c52019-11-22 10:54:35 -080011962#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070011963
11964
Frank Barchard91317c52019-11-22 10:54:35 -080011965#if XNN_ARCH_ARM64
11966 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011967 TEST_REQUIRES_ARM_NEON_FMA;
11968 GemmMicrokernelTester()
11969 .mr(4)
11970 .nr(8)
11971 .kr(1)
11972 .sr(1)
11973 .m(4)
11974 .n(8)
11975 .k(4)
Frank Barchard91317c52019-11-22 10:54:35 -080011976 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011977 }
11978
Frank Barchard91317c52019-11-22 10:54:35 -080011979 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011980 TEST_REQUIRES_ARM_NEON_FMA;
11981 GemmMicrokernelTester()
11982 .mr(4)
11983 .nr(8)
11984 .kr(1)
11985 .sr(1)
11986 .m(4)
11987 .n(8)
11988 .k(4)
11989 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011990 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011991 }
11992
Frank Barchard91317c52019-11-22 10:54:35 -080011993 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011994 TEST_REQUIRES_ARM_NEON_FMA;
11995 GemmMicrokernelTester()
11996 .mr(4)
11997 .nr(8)
11998 .kr(1)
11999 .sr(1)
12000 .m(4)
12001 .n(8)
12002 .k(4)
12003 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012004 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012005 }
12006
Frank Barchard91317c52019-11-22 10:54:35 -080012007 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012008 TEST_REQUIRES_ARM_NEON_FMA;
12009 for (uint32_t m = 1; m <= 4; m++) {
12010 for (uint32_t n = 1; n <= 8; n++) {
12011 GemmMicrokernelTester()
12012 .mr(4)
12013 .nr(8)
12014 .kr(1)
12015 .sr(1)
12016 .m(m)
12017 .n(n)
12018 .k(4)
12019 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012020 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012021 }
12022 }
12023 }
12024
Frank Barchard91317c52019-11-22 10:54:35 -080012025 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012026 TEST_REQUIRES_ARM_NEON_FMA;
12027 for (uint32_t m = 1; m <= 4; m++) {
12028 GemmMicrokernelTester()
12029 .mr(4)
12030 .nr(8)
12031 .kr(1)
12032 .sr(1)
12033 .m(m)
12034 .n(8)
12035 .k(4)
12036 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012037 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012038 }
12039 }
12040
Frank Barchard91317c52019-11-22 10:54:35 -080012041 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012042 TEST_REQUIRES_ARM_NEON_FMA;
12043 for (uint32_t n = 1; n <= 8; n++) {
12044 GemmMicrokernelTester()
12045 .mr(4)
12046 .nr(8)
12047 .kr(1)
12048 .sr(1)
12049 .m(4)
12050 .n(n)
12051 .k(4)
12052 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012053 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012054 }
12055 }
12056
Frank Barchard91317c52019-11-22 10:54:35 -080012057 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012058 TEST_REQUIRES_ARM_NEON_FMA;
12059 for (size_t k = 1; k < 4; k++) {
12060 GemmMicrokernelTester()
12061 .mr(4)
12062 .nr(8)
12063 .kr(1)
12064 .sr(1)
12065 .m(4)
12066 .n(8)
12067 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012068 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012069 }
12070 }
12071
Frank Barchard91317c52019-11-22 10:54:35 -080012072 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012073 TEST_REQUIRES_ARM_NEON_FMA;
12074 for (size_t k = 1; k < 4; k++) {
12075 GemmMicrokernelTester()
12076 .mr(4)
12077 .nr(8)
12078 .kr(1)
12079 .sr(1)
12080 .m(4)
12081 .n(8)
12082 .k(k)
12083 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012084 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012085 }
12086 }
12087
Frank Barchard91317c52019-11-22 10:54:35 -080012088 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012089 TEST_REQUIRES_ARM_NEON_FMA;
12090 for (size_t k = 1; k < 4; k++) {
12091 for (uint32_t m = 1; m <= 4; m++) {
12092 for (uint32_t n = 1; n <= 8; n++) {
12093 GemmMicrokernelTester()
12094 .mr(4)
12095 .nr(8)
12096 .kr(1)
12097 .sr(1)
12098 .m(m)
12099 .n(n)
12100 .k(k)
12101 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012102 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012103 }
12104 }
12105 }
12106 }
12107
Frank Barchard91317c52019-11-22 10:54:35 -080012108 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012109 TEST_REQUIRES_ARM_NEON_FMA;
12110 for (size_t k = 5; k < 8; k++) {
12111 GemmMicrokernelTester()
12112 .mr(4)
12113 .nr(8)
12114 .kr(1)
12115 .sr(1)
12116 .m(4)
12117 .n(8)
12118 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012119 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012120 }
12121 }
12122
Frank Barchard91317c52019-11-22 10:54:35 -080012123 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012124 TEST_REQUIRES_ARM_NEON_FMA;
12125 for (size_t k = 5; k < 8; k++) {
12126 GemmMicrokernelTester()
12127 .mr(4)
12128 .nr(8)
12129 .kr(1)
12130 .sr(1)
12131 .m(4)
12132 .n(8)
12133 .k(k)
12134 .a_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012135 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012136 }
12137 }
12138
Frank Barchard91317c52019-11-22 10:54:35 -080012139 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012140 TEST_REQUIRES_ARM_NEON_FMA;
12141 for (size_t k = 5; k < 8; k++) {
12142 for (uint32_t m = 1; m <= 4; m++) {
12143 for (uint32_t n = 1; n <= 8; n++) {
12144 GemmMicrokernelTester()
12145 .mr(4)
12146 .nr(8)
12147 .kr(1)
12148 .sr(1)
12149 .m(m)
12150 .n(n)
12151 .k(k)
12152 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012153 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012154 }
12155 }
12156 }
12157 }
12158
Frank Barchard91317c52019-11-22 10:54:35 -080012159 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012160 TEST_REQUIRES_ARM_NEON_FMA;
12161 for (size_t k = 8; k <= 40; k += 4) {
12162 GemmMicrokernelTester()
12163 .mr(4)
12164 .nr(8)
12165 .kr(1)
12166 .sr(1)
12167 .m(4)
12168 .n(8)
12169 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012170 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012171 }
12172 }
12173
Frank Barchard91317c52019-11-22 10:54:35 -080012174 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012175 TEST_REQUIRES_ARM_NEON_FMA;
12176 for (size_t k = 8; k <= 40; k += 4) {
12177 GemmMicrokernelTester()
12178 .mr(4)
12179 .nr(8)
12180 .kr(1)
12181 .sr(1)
12182 .m(4)
12183 .n(8)
12184 .k(k)
12185 .a_stride(43)
Frank Barchard91317c52019-11-22 10:54:35 -080012186 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012187 }
12188 }
12189
Frank Barchard91317c52019-11-22 10:54:35 -080012190 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012191 TEST_REQUIRES_ARM_NEON_FMA;
12192 for (size_t k = 8; k <= 40; k += 4) {
12193 for (uint32_t m = 1; m <= 4; m++) {
12194 for (uint32_t n = 1; n <= 8; n++) {
12195 GemmMicrokernelTester()
12196 .mr(4)
12197 .nr(8)
12198 .kr(1)
12199 .sr(1)
12200 .m(m)
12201 .n(n)
12202 .k(k)
12203 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012204 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012205 }
12206 }
12207 }
12208 }
12209
Frank Barchard91317c52019-11-22 10:54:35 -080012210 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012211 TEST_REQUIRES_ARM_NEON_FMA;
12212 for (uint32_t n = 9; n < 16; n++) {
12213 for (size_t k = 1; k <= 20; k += 5) {
12214 GemmMicrokernelTester()
12215 .mr(4)
12216 .nr(8)
12217 .kr(1)
12218 .sr(1)
12219 .m(4)
12220 .n(8)
12221 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012222 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012223 }
12224 }
12225 }
12226
Frank Barchard91317c52019-11-22 10:54:35 -080012227 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012228 TEST_REQUIRES_ARM_NEON_FMA;
12229 for (uint32_t n = 9; n < 16; n++) {
12230 for (size_t k = 1; k <= 20; k += 5) {
12231 GemmMicrokernelTester()
12232 .mr(4)
12233 .nr(8)
12234 .kr(1)
12235 .sr(1)
12236 .m(4)
12237 .n(8)
12238 .k(k)
12239 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012240 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012241 }
12242 }
12243 }
12244
Frank Barchard91317c52019-11-22 10:54:35 -080012245 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012246 TEST_REQUIRES_ARM_NEON_FMA;
12247 for (uint32_t n = 9; n < 16; n++) {
12248 for (size_t k = 1; k <= 20; k += 5) {
12249 GemmMicrokernelTester()
12250 .mr(4)
12251 .nr(8)
12252 .kr(1)
12253 .sr(1)
12254 .m(4)
12255 .n(n)
12256 .k(k)
12257 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080012258 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012259 }
12260 }
12261 }
12262
Frank Barchard91317c52019-11-22 10:54:35 -080012263 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012264 TEST_REQUIRES_ARM_NEON_FMA;
12265 for (uint32_t n = 9; n < 16; n++) {
12266 for (size_t k = 1; k <= 20; k += 5) {
12267 for (uint32_t m = 1; m <= 4; m++) {
12268 GemmMicrokernelTester()
12269 .mr(4)
12270 .nr(8)
12271 .kr(1)
12272 .sr(1)
12273 .m(m)
12274 .n(n)
12275 .k(k)
12276 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012277 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012278 }
12279 }
12280 }
12281 }
12282
Frank Barchard91317c52019-11-22 10:54:35 -080012283 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012284 TEST_REQUIRES_ARM_NEON_FMA;
12285 for (uint32_t n = 16; n <= 24; n += 8) {
12286 for (size_t k = 1; k <= 20; k += 5) {
12287 GemmMicrokernelTester()
12288 .mr(4)
12289 .nr(8)
12290 .kr(1)
12291 .sr(1)
12292 .m(4)
12293 .n(8)
12294 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012295 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012296 }
12297 }
12298 }
12299
Frank Barchard91317c52019-11-22 10:54:35 -080012300 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012301 TEST_REQUIRES_ARM_NEON_FMA;
12302 for (uint32_t n = 16; n <= 24; n += 8) {
12303 for (size_t k = 1; k <= 20; k += 5) {
12304 GemmMicrokernelTester()
12305 .mr(4)
12306 .nr(8)
12307 .kr(1)
12308 .sr(1)
12309 .m(4)
12310 .n(n)
12311 .k(k)
12312 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012313 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012314 }
12315 }
12316 }
12317
Frank Barchard91317c52019-11-22 10:54:35 -080012318 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012319 TEST_REQUIRES_ARM_NEON_FMA;
12320 for (uint32_t n = 16; n <= 24; n += 8) {
12321 for (size_t k = 1; k <= 20; k += 5) {
12322 GemmMicrokernelTester()
12323 .mr(4)
12324 .nr(8)
12325 .kr(1)
12326 .sr(1)
12327 .m(4)
12328 .n(n)
12329 .k(k)
12330 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080012331 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012332 }
12333 }
12334 }
12335
Frank Barchard91317c52019-11-22 10:54:35 -080012336 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012337 TEST_REQUIRES_ARM_NEON_FMA;
12338 for (uint32_t n = 16; n <= 24; n += 8) {
12339 for (size_t k = 1; k <= 20; k += 5) {
12340 for (uint32_t m = 1; m <= 4; m++) {
12341 GemmMicrokernelTester()
12342 .mr(4)
12343 .nr(8)
12344 .kr(1)
12345 .sr(1)
12346 .m(m)
12347 .n(n)
12348 .k(k)
12349 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012350 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012351 }
12352 }
12353 }
12354 }
12355
Frank Barchard91317c52019-11-22 10:54:35 -080012356 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012357 TEST_REQUIRES_ARM_NEON_FMA;
12358 for (size_t k = 1; k <= 20; k += 5) {
12359 for (uint32_t m = 1; m <= 4; m++) {
12360 for (uint32_t n = 1; n <= 8; n++) {
12361 GemmMicrokernelTester()
12362 .mr(4)
12363 .nr(8)
12364 .kr(1)
12365 .sr(1)
12366 .m(m)
12367 .n(n)
12368 .k(k)
12369 .cm_stride(11)
12370 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012371 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012372 }
12373 }
12374 }
12375 }
12376
Frank Barchard91317c52019-11-22 10:54:35 -080012377 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012378 TEST_REQUIRES_ARM_NEON_FMA;
12379 GemmMicrokernelTester()
12380 .mr(4)
12381 .nr(8)
12382 .kr(1)
12383 .sr(1)
12384 .m(4)
12385 .n(8)
12386 .k(4)
12387 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012388 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012389 }
12390
Frank Barchard91317c52019-11-22 10:54:35 -080012391 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012392 TEST_REQUIRES_ARM_NEON_FMA;
12393 GemmMicrokernelTester()
12394 .mr(4)
12395 .nr(8)
12396 .kr(1)
12397 .sr(1)
12398 .m(4)
12399 .n(8)
12400 .k(4)
12401 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012402 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012403 }
12404
Frank Barchard91317c52019-11-22 10:54:35 -080012405 TEST(F32_GEMMINC_4X8__NEONFMA_LANE_LD128, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012406 TEST_REQUIRES_ARM_NEON_FMA;
12407 GemmMicrokernelTester()
12408 .mr(4)
12409 .nr(8)
12410 .kr(1)
12411 .sr(1)
12412 .m(4)
12413 .n(8)
12414 .k(4)
12415 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012416 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012417 }
Frank Barchard91317c52019-11-22 10:54:35 -080012418#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070012419
12420
Frank Barchard91317c52019-11-22 10:54:35 -080012421#if XNN_ARCH_ARM64
12422 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012423 TEST_REQUIRES_ARM_NEON_FMA;
12424 GemmMicrokernelTester()
12425 .mr(5)
12426 .nr(8)
12427 .kr(1)
12428 .sr(1)
12429 .m(5)
12430 .n(8)
12431 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080012432 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012433 }
12434
Frank Barchard91317c52019-11-22 10:54:35 -080012435 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012436 TEST_REQUIRES_ARM_NEON_FMA;
12437 GemmMicrokernelTester()
12438 .mr(5)
12439 .nr(8)
12440 .kr(1)
12441 .sr(1)
12442 .m(5)
12443 .n(8)
12444 .k(2)
12445 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012446 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012447 }
12448
Frank Barchard91317c52019-11-22 10:54:35 -080012449 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012450 TEST_REQUIRES_ARM_NEON_FMA;
12451 GemmMicrokernelTester()
12452 .mr(5)
12453 .nr(8)
12454 .kr(1)
12455 .sr(1)
12456 .m(5)
12457 .n(8)
12458 .k(2)
12459 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012460 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012461 }
12462
Frank Barchard91317c52019-11-22 10:54:35 -080012463 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012464 TEST_REQUIRES_ARM_NEON_FMA;
12465 for (uint32_t m = 1; m <= 5; m++) {
12466 for (uint32_t n = 1; n <= 8; n++) {
12467 GemmMicrokernelTester()
12468 .mr(5)
12469 .nr(8)
12470 .kr(1)
12471 .sr(1)
12472 .m(m)
12473 .n(n)
12474 .k(2)
12475 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012476 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012477 }
12478 }
12479 }
12480
Frank Barchard91317c52019-11-22 10:54:35 -080012481 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012482 TEST_REQUIRES_ARM_NEON_FMA;
12483 for (uint32_t m = 1; m <= 5; m++) {
12484 GemmMicrokernelTester()
12485 .mr(5)
12486 .nr(8)
12487 .kr(1)
12488 .sr(1)
12489 .m(m)
12490 .n(8)
12491 .k(2)
12492 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012493 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012494 }
12495 }
12496
Frank Barchard91317c52019-11-22 10:54:35 -080012497 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012498 TEST_REQUIRES_ARM_NEON_FMA;
12499 for (uint32_t n = 1; n <= 8; n++) {
12500 GemmMicrokernelTester()
12501 .mr(5)
12502 .nr(8)
12503 .kr(1)
12504 .sr(1)
12505 .m(5)
12506 .n(n)
12507 .k(2)
12508 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012509 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012510 }
12511 }
12512
Frank Barchard91317c52019-11-22 10:54:35 -080012513 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012514 TEST_REQUIRES_ARM_NEON_FMA;
12515 for (size_t k = 1; k < 2; k++) {
12516 GemmMicrokernelTester()
12517 .mr(5)
12518 .nr(8)
12519 .kr(1)
12520 .sr(1)
12521 .m(5)
12522 .n(8)
12523 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012524 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012525 }
12526 }
12527
Frank Barchard91317c52019-11-22 10:54:35 -080012528 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012529 TEST_REQUIRES_ARM_NEON_FMA;
12530 for (size_t k = 1; k < 2; k++) {
12531 GemmMicrokernelTester()
12532 .mr(5)
12533 .nr(8)
12534 .kr(1)
12535 .sr(1)
12536 .m(5)
12537 .n(8)
12538 .k(k)
12539 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012540 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012541 }
12542 }
12543
Frank Barchard91317c52019-11-22 10:54:35 -080012544 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012545 TEST_REQUIRES_ARM_NEON_FMA;
12546 for (size_t k = 1; k < 2; k++) {
12547 for (uint32_t m = 1; m <= 5; m++) {
12548 for (uint32_t n = 1; n <= 8; n++) {
12549 GemmMicrokernelTester()
12550 .mr(5)
12551 .nr(8)
12552 .kr(1)
12553 .sr(1)
12554 .m(m)
12555 .n(n)
12556 .k(k)
12557 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012558 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012559 }
12560 }
12561 }
12562 }
12563
Frank Barchard91317c52019-11-22 10:54:35 -080012564 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012565 TEST_REQUIRES_ARM_NEON_FMA;
12566 for (size_t k = 3; k < 4; k++) {
12567 GemmMicrokernelTester()
12568 .mr(5)
12569 .nr(8)
12570 .kr(1)
12571 .sr(1)
12572 .m(5)
12573 .n(8)
12574 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012575 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012576 }
12577 }
12578
Frank Barchard91317c52019-11-22 10:54:35 -080012579 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012580 TEST_REQUIRES_ARM_NEON_FMA;
12581 for (size_t k = 3; k < 4; k++) {
12582 GemmMicrokernelTester()
12583 .mr(5)
12584 .nr(8)
12585 .kr(1)
12586 .sr(1)
12587 .m(5)
12588 .n(8)
12589 .k(k)
12590 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012591 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012592 }
12593 }
12594
Frank Barchard91317c52019-11-22 10:54:35 -080012595 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012596 TEST_REQUIRES_ARM_NEON_FMA;
12597 for (size_t k = 3; k < 4; k++) {
12598 for (uint32_t m = 1; m <= 5; m++) {
12599 for (uint32_t n = 1; n <= 8; n++) {
12600 GemmMicrokernelTester()
12601 .mr(5)
12602 .nr(8)
12603 .kr(1)
12604 .sr(1)
12605 .m(m)
12606 .n(n)
12607 .k(k)
12608 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012609 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012610 }
12611 }
12612 }
12613 }
12614
Frank Barchard91317c52019-11-22 10:54:35 -080012615 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012616 TEST_REQUIRES_ARM_NEON_FMA;
12617 for (size_t k = 4; k <= 20; k += 2) {
12618 GemmMicrokernelTester()
12619 .mr(5)
12620 .nr(8)
12621 .kr(1)
12622 .sr(1)
12623 .m(5)
12624 .n(8)
12625 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012626 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012627 }
12628 }
12629
Frank Barchard91317c52019-11-22 10:54:35 -080012630 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012631 TEST_REQUIRES_ARM_NEON_FMA;
12632 for (size_t k = 4; k <= 20; k += 2) {
12633 GemmMicrokernelTester()
12634 .mr(5)
12635 .nr(8)
12636 .kr(1)
12637 .sr(1)
12638 .m(5)
12639 .n(8)
12640 .k(k)
12641 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080012642 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012643 }
12644 }
12645
Frank Barchard91317c52019-11-22 10:54:35 -080012646 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012647 TEST_REQUIRES_ARM_NEON_FMA;
12648 for (size_t k = 4; k <= 20; k += 2) {
12649 for (uint32_t m = 1; m <= 5; m++) {
12650 for (uint32_t n = 1; n <= 8; n++) {
12651 GemmMicrokernelTester()
12652 .mr(5)
12653 .nr(8)
12654 .kr(1)
12655 .sr(1)
12656 .m(m)
12657 .n(n)
12658 .k(k)
12659 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012660 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012661 }
12662 }
12663 }
12664 }
12665
Frank Barchard91317c52019-11-22 10:54:35 -080012666 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012667 TEST_REQUIRES_ARM_NEON_FMA;
12668 for (uint32_t n = 9; n < 16; n++) {
12669 for (size_t k = 1; k <= 10; k += 3) {
12670 GemmMicrokernelTester()
12671 .mr(5)
12672 .nr(8)
12673 .kr(1)
12674 .sr(1)
12675 .m(5)
12676 .n(8)
12677 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012678 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012679 }
12680 }
12681 }
12682
Frank Barchard91317c52019-11-22 10:54:35 -080012683 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012684 TEST_REQUIRES_ARM_NEON_FMA;
12685 for (uint32_t n = 9; n < 16; n++) {
12686 for (size_t k = 1; k <= 10; k += 3) {
12687 GemmMicrokernelTester()
12688 .mr(5)
12689 .nr(8)
12690 .kr(1)
12691 .sr(1)
12692 .m(5)
12693 .n(8)
12694 .k(k)
12695 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012696 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012697 }
12698 }
12699 }
12700
Frank Barchard91317c52019-11-22 10:54:35 -080012701 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012702 TEST_REQUIRES_ARM_NEON_FMA;
12703 for (uint32_t n = 9; n < 16; n++) {
12704 for (size_t k = 1; k <= 10; k += 3) {
12705 GemmMicrokernelTester()
12706 .mr(5)
12707 .nr(8)
12708 .kr(1)
12709 .sr(1)
12710 .m(5)
12711 .n(n)
12712 .k(k)
12713 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080012714 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012715 }
12716 }
12717 }
12718
Frank Barchard91317c52019-11-22 10:54:35 -080012719 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012720 TEST_REQUIRES_ARM_NEON_FMA;
12721 for (uint32_t n = 9; n < 16; n++) {
12722 for (size_t k = 1; k <= 10; k += 3) {
12723 for (uint32_t m = 1; m <= 5; m++) {
12724 GemmMicrokernelTester()
12725 .mr(5)
12726 .nr(8)
12727 .kr(1)
12728 .sr(1)
12729 .m(m)
12730 .n(n)
12731 .k(k)
12732 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012733 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012734 }
12735 }
12736 }
12737 }
12738
Frank Barchard91317c52019-11-22 10:54:35 -080012739 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012740 TEST_REQUIRES_ARM_NEON_FMA;
12741 for (uint32_t n = 16; n <= 24; n += 8) {
12742 for (size_t k = 1; k <= 10; k += 3) {
12743 GemmMicrokernelTester()
12744 .mr(5)
12745 .nr(8)
12746 .kr(1)
12747 .sr(1)
12748 .m(5)
12749 .n(8)
12750 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012751 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012752 }
12753 }
12754 }
12755
Frank Barchard91317c52019-11-22 10:54:35 -080012756 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012757 TEST_REQUIRES_ARM_NEON_FMA;
12758 for (uint32_t n = 16; n <= 24; n += 8) {
12759 for (size_t k = 1; k <= 10; k += 3) {
12760 GemmMicrokernelTester()
12761 .mr(5)
12762 .nr(8)
12763 .kr(1)
12764 .sr(1)
12765 .m(5)
12766 .n(n)
12767 .k(k)
12768 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012769 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012770 }
12771 }
12772 }
12773
Frank Barchard91317c52019-11-22 10:54:35 -080012774 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012775 TEST_REQUIRES_ARM_NEON_FMA;
12776 for (uint32_t n = 16; n <= 24; n += 8) {
12777 for (size_t k = 1; k <= 10; k += 3) {
12778 GemmMicrokernelTester()
12779 .mr(5)
12780 .nr(8)
12781 .kr(1)
12782 .sr(1)
12783 .m(5)
12784 .n(n)
12785 .k(k)
12786 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080012787 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012788 }
12789 }
12790 }
12791
Frank Barchard91317c52019-11-22 10:54:35 -080012792 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012793 TEST_REQUIRES_ARM_NEON_FMA;
12794 for (uint32_t n = 16; n <= 24; n += 8) {
12795 for (size_t k = 1; k <= 10; k += 3) {
12796 for (uint32_t m = 1; m <= 5; m++) {
12797 GemmMicrokernelTester()
12798 .mr(5)
12799 .nr(8)
12800 .kr(1)
12801 .sr(1)
12802 .m(m)
12803 .n(n)
12804 .k(k)
12805 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012806 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012807 }
12808 }
12809 }
12810 }
12811
Frank Barchard91317c52019-11-22 10:54:35 -080012812 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012813 TEST_REQUIRES_ARM_NEON_FMA;
12814 for (size_t k = 1; k <= 10; k += 3) {
12815 for (uint32_t m = 1; m <= 5; m++) {
12816 for (uint32_t n = 1; n <= 8; n++) {
12817 GemmMicrokernelTester()
12818 .mr(5)
12819 .nr(8)
12820 .kr(1)
12821 .sr(1)
12822 .m(m)
12823 .n(n)
12824 .k(k)
12825 .cm_stride(11)
12826 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012827 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012828 }
12829 }
12830 }
12831 }
12832
Frank Barchard91317c52019-11-22 10:54:35 -080012833 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012834 TEST_REQUIRES_ARM_NEON_FMA;
12835 GemmMicrokernelTester()
12836 .mr(5)
12837 .nr(8)
12838 .kr(1)
12839 .sr(1)
12840 .m(5)
12841 .n(8)
12842 .k(2)
12843 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012844 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012845 }
12846
Frank Barchard91317c52019-11-22 10:54:35 -080012847 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012848 TEST_REQUIRES_ARM_NEON_FMA;
12849 GemmMicrokernelTester()
12850 .mr(5)
12851 .nr(8)
12852 .kr(1)
12853 .sr(1)
12854 .m(5)
12855 .n(8)
12856 .k(2)
12857 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012858 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012859 }
12860
Frank Barchard91317c52019-11-22 10:54:35 -080012861 TEST(F32_GEMMINC_5X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012862 TEST_REQUIRES_ARM_NEON_FMA;
12863 GemmMicrokernelTester()
12864 .mr(5)
12865 .nr(8)
12866 .kr(1)
12867 .sr(1)
12868 .m(5)
12869 .n(8)
12870 .k(2)
12871 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012872 .Test(xnn_f32_gemminc_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012873 }
Frank Barchard91317c52019-11-22 10:54:35 -080012874#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070012875
12876
Frank Barchard91317c52019-11-22 10:54:35 -080012877#if XNN_ARCH_ARM64
12878 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012879 TEST_REQUIRES_ARM_NEON_FMA;
12880 GemmMicrokernelTester()
12881 .mr(6)
12882 .nr(8)
12883 .kr(1)
12884 .sr(1)
12885 .m(6)
12886 .n(8)
12887 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080012888 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012889 }
12890
Frank Barchard91317c52019-11-22 10:54:35 -080012891 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012892 TEST_REQUIRES_ARM_NEON_FMA;
12893 GemmMicrokernelTester()
12894 .mr(6)
12895 .nr(8)
12896 .kr(1)
12897 .sr(1)
12898 .m(6)
12899 .n(8)
12900 .k(2)
12901 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012902 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012903 }
12904
Frank Barchard91317c52019-11-22 10:54:35 -080012905 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012906 TEST_REQUIRES_ARM_NEON_FMA;
12907 GemmMicrokernelTester()
12908 .mr(6)
12909 .nr(8)
12910 .kr(1)
12911 .sr(1)
12912 .m(6)
12913 .n(8)
12914 .k(2)
12915 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012916 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012917 }
12918
Frank Barchard91317c52019-11-22 10:54:35 -080012919 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012920 TEST_REQUIRES_ARM_NEON_FMA;
12921 for (uint32_t m = 1; m <= 6; m++) {
12922 for (uint32_t n = 1; n <= 8; n++) {
12923 GemmMicrokernelTester()
12924 .mr(6)
12925 .nr(8)
12926 .kr(1)
12927 .sr(1)
12928 .m(m)
12929 .n(n)
12930 .k(2)
12931 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012932 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012933 }
12934 }
12935 }
12936
Frank Barchard91317c52019-11-22 10:54:35 -080012937 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012938 TEST_REQUIRES_ARM_NEON_FMA;
12939 for (uint32_t m = 1; m <= 6; m++) {
12940 GemmMicrokernelTester()
12941 .mr(6)
12942 .nr(8)
12943 .kr(1)
12944 .sr(1)
12945 .m(m)
12946 .n(8)
12947 .k(2)
12948 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012949 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012950 }
12951 }
12952
Frank Barchard91317c52019-11-22 10:54:35 -080012953 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012954 TEST_REQUIRES_ARM_NEON_FMA;
12955 for (uint32_t n = 1; n <= 8; n++) {
12956 GemmMicrokernelTester()
12957 .mr(6)
12958 .nr(8)
12959 .kr(1)
12960 .sr(1)
12961 .m(6)
12962 .n(n)
12963 .k(2)
12964 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012965 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012966 }
12967 }
12968
Frank Barchard91317c52019-11-22 10:54:35 -080012969 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012970 TEST_REQUIRES_ARM_NEON_FMA;
12971 for (size_t k = 1; k < 2; k++) {
12972 GemmMicrokernelTester()
12973 .mr(6)
12974 .nr(8)
12975 .kr(1)
12976 .sr(1)
12977 .m(6)
12978 .n(8)
12979 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012980 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012981 }
12982 }
12983
Frank Barchard91317c52019-11-22 10:54:35 -080012984 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012985 TEST_REQUIRES_ARM_NEON_FMA;
12986 for (size_t k = 1; k < 2; k++) {
12987 GemmMicrokernelTester()
12988 .mr(6)
12989 .nr(8)
12990 .kr(1)
12991 .sr(1)
12992 .m(6)
12993 .n(8)
12994 .k(k)
12995 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012996 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012997 }
12998 }
12999
Frank Barchard91317c52019-11-22 10:54:35 -080013000 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013001 TEST_REQUIRES_ARM_NEON_FMA;
13002 for (size_t k = 1; k < 2; k++) {
13003 for (uint32_t m = 1; m <= 6; m++) {
13004 for (uint32_t n = 1; n <= 8; n++) {
13005 GemmMicrokernelTester()
13006 .mr(6)
13007 .nr(8)
13008 .kr(1)
13009 .sr(1)
13010 .m(m)
13011 .n(n)
13012 .k(k)
13013 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013014 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013015 }
13016 }
13017 }
13018 }
13019
Frank Barchard91317c52019-11-22 10:54:35 -080013020 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013021 TEST_REQUIRES_ARM_NEON_FMA;
13022 for (size_t k = 3; k < 4; k++) {
13023 GemmMicrokernelTester()
13024 .mr(6)
13025 .nr(8)
13026 .kr(1)
13027 .sr(1)
13028 .m(6)
13029 .n(8)
13030 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013031 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013032 }
13033 }
13034
Frank Barchard91317c52019-11-22 10:54:35 -080013035 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013036 TEST_REQUIRES_ARM_NEON_FMA;
13037 for (size_t k = 3; k < 4; k++) {
13038 GemmMicrokernelTester()
13039 .mr(6)
13040 .nr(8)
13041 .kr(1)
13042 .sr(1)
13043 .m(6)
13044 .n(8)
13045 .k(k)
13046 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080013047 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013048 }
13049 }
13050
Frank Barchard91317c52019-11-22 10:54:35 -080013051 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013052 TEST_REQUIRES_ARM_NEON_FMA;
13053 for (size_t k = 3; k < 4; k++) {
13054 for (uint32_t m = 1; m <= 6; m++) {
13055 for (uint32_t n = 1; n <= 8; n++) {
13056 GemmMicrokernelTester()
13057 .mr(6)
13058 .nr(8)
13059 .kr(1)
13060 .sr(1)
13061 .m(m)
13062 .n(n)
13063 .k(k)
13064 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013065 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013066 }
13067 }
13068 }
13069 }
13070
Frank Barchard91317c52019-11-22 10:54:35 -080013071 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013072 TEST_REQUIRES_ARM_NEON_FMA;
13073 for (size_t k = 4; k <= 20; k += 2) {
13074 GemmMicrokernelTester()
13075 .mr(6)
13076 .nr(8)
13077 .kr(1)
13078 .sr(1)
13079 .m(6)
13080 .n(8)
13081 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013082 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013083 }
13084 }
13085
Frank Barchard91317c52019-11-22 10:54:35 -080013086 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013087 TEST_REQUIRES_ARM_NEON_FMA;
13088 for (size_t k = 4; k <= 20; k += 2) {
13089 GemmMicrokernelTester()
13090 .mr(6)
13091 .nr(8)
13092 .kr(1)
13093 .sr(1)
13094 .m(6)
13095 .n(8)
13096 .k(k)
13097 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080013098 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013099 }
13100 }
13101
Frank Barchard91317c52019-11-22 10:54:35 -080013102 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013103 TEST_REQUIRES_ARM_NEON_FMA;
13104 for (size_t k = 4; k <= 20; k += 2) {
13105 for (uint32_t m = 1; m <= 6; m++) {
13106 for (uint32_t n = 1; n <= 8; n++) {
13107 GemmMicrokernelTester()
13108 .mr(6)
13109 .nr(8)
13110 .kr(1)
13111 .sr(1)
13112 .m(m)
13113 .n(n)
13114 .k(k)
13115 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013116 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013117 }
13118 }
13119 }
13120 }
13121
Frank Barchard91317c52019-11-22 10:54:35 -080013122 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013123 TEST_REQUIRES_ARM_NEON_FMA;
13124 for (uint32_t n = 9; n < 16; n++) {
13125 for (size_t k = 1; k <= 10; k += 3) {
13126 GemmMicrokernelTester()
13127 .mr(6)
13128 .nr(8)
13129 .kr(1)
13130 .sr(1)
13131 .m(6)
13132 .n(8)
13133 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013134 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013135 }
13136 }
13137 }
13138
Frank Barchard91317c52019-11-22 10:54:35 -080013139 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013140 TEST_REQUIRES_ARM_NEON_FMA;
13141 for (uint32_t n = 9; n < 16; n++) {
13142 for (size_t k = 1; k <= 10; k += 3) {
13143 GemmMicrokernelTester()
13144 .mr(6)
13145 .nr(8)
13146 .kr(1)
13147 .sr(1)
13148 .m(6)
13149 .n(8)
13150 .k(k)
13151 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013152 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013153 }
13154 }
13155 }
13156
Frank Barchard91317c52019-11-22 10:54:35 -080013157 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013158 TEST_REQUIRES_ARM_NEON_FMA;
13159 for (uint32_t n = 9; n < 16; n++) {
13160 for (size_t k = 1; k <= 10; k += 3) {
13161 GemmMicrokernelTester()
13162 .mr(6)
13163 .nr(8)
13164 .kr(1)
13165 .sr(1)
13166 .m(6)
13167 .n(n)
13168 .k(k)
13169 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080013170 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013171 }
13172 }
13173 }
13174
Frank Barchard91317c52019-11-22 10:54:35 -080013175 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013176 TEST_REQUIRES_ARM_NEON_FMA;
13177 for (uint32_t n = 9; n < 16; n++) {
13178 for (size_t k = 1; k <= 10; k += 3) {
13179 for (uint32_t m = 1; m <= 6; m++) {
13180 GemmMicrokernelTester()
13181 .mr(6)
13182 .nr(8)
13183 .kr(1)
13184 .sr(1)
13185 .m(m)
13186 .n(n)
13187 .k(k)
13188 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013189 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013190 }
13191 }
13192 }
13193 }
13194
Frank Barchard91317c52019-11-22 10:54:35 -080013195 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013196 TEST_REQUIRES_ARM_NEON_FMA;
13197 for (uint32_t n = 16; n <= 24; n += 8) {
13198 for (size_t k = 1; k <= 10; k += 3) {
13199 GemmMicrokernelTester()
13200 .mr(6)
13201 .nr(8)
13202 .kr(1)
13203 .sr(1)
13204 .m(6)
13205 .n(8)
13206 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013207 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013208 }
13209 }
13210 }
13211
Frank Barchard91317c52019-11-22 10:54:35 -080013212 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013213 TEST_REQUIRES_ARM_NEON_FMA;
13214 for (uint32_t n = 16; n <= 24; n += 8) {
13215 for (size_t k = 1; k <= 10; k += 3) {
13216 GemmMicrokernelTester()
13217 .mr(6)
13218 .nr(8)
13219 .kr(1)
13220 .sr(1)
13221 .m(6)
13222 .n(n)
13223 .k(k)
13224 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013225 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013226 }
13227 }
13228 }
13229
Frank Barchard91317c52019-11-22 10:54:35 -080013230 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013231 TEST_REQUIRES_ARM_NEON_FMA;
13232 for (uint32_t n = 16; n <= 24; n += 8) {
13233 for (size_t k = 1; k <= 10; k += 3) {
13234 GemmMicrokernelTester()
13235 .mr(6)
13236 .nr(8)
13237 .kr(1)
13238 .sr(1)
13239 .m(6)
13240 .n(n)
13241 .k(k)
13242 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080013243 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013244 }
13245 }
13246 }
13247
Frank Barchard91317c52019-11-22 10:54:35 -080013248 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013249 TEST_REQUIRES_ARM_NEON_FMA;
13250 for (uint32_t n = 16; n <= 24; n += 8) {
13251 for (size_t k = 1; k <= 10; k += 3) {
13252 for (uint32_t m = 1; m <= 6; m++) {
13253 GemmMicrokernelTester()
13254 .mr(6)
13255 .nr(8)
13256 .kr(1)
13257 .sr(1)
13258 .m(m)
13259 .n(n)
13260 .k(k)
13261 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013262 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013263 }
13264 }
13265 }
13266 }
13267
Frank Barchard91317c52019-11-22 10:54:35 -080013268 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013269 TEST_REQUIRES_ARM_NEON_FMA;
13270 for (size_t k = 1; k <= 10; k += 3) {
13271 for (uint32_t m = 1; m <= 6; m++) {
13272 for (uint32_t n = 1; n <= 8; n++) {
13273 GemmMicrokernelTester()
13274 .mr(6)
13275 .nr(8)
13276 .kr(1)
13277 .sr(1)
13278 .m(m)
13279 .n(n)
13280 .k(k)
13281 .cm_stride(11)
13282 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013283 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013284 }
13285 }
13286 }
13287 }
13288
Frank Barchard91317c52019-11-22 10:54:35 -080013289 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013290 TEST_REQUIRES_ARM_NEON_FMA;
13291 GemmMicrokernelTester()
13292 .mr(6)
13293 .nr(8)
13294 .kr(1)
13295 .sr(1)
13296 .m(6)
13297 .n(8)
13298 .k(2)
13299 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013300 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013301 }
13302
Frank Barchard91317c52019-11-22 10:54:35 -080013303 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013304 TEST_REQUIRES_ARM_NEON_FMA;
13305 GemmMicrokernelTester()
13306 .mr(6)
13307 .nr(8)
13308 .kr(1)
13309 .sr(1)
13310 .m(6)
13311 .n(8)
13312 .k(2)
13313 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013314 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013315 }
13316
Frank Barchard91317c52019-11-22 10:54:35 -080013317 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013318 TEST_REQUIRES_ARM_NEON_FMA;
13319 GemmMicrokernelTester()
13320 .mr(6)
13321 .nr(8)
13322 .kr(1)
13323 .sr(1)
13324 .m(6)
13325 .n(8)
13326 .k(2)
13327 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013328 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013329 }
Frank Barchard91317c52019-11-22 10:54:35 -080013330#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070013331
13332
Frank Barchard69172d92019-11-26 16:22:39 -080013333#if XNN_ARCH_ARM64
13334 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4) {
13335 TEST_REQUIRES_ARM_NEON_FMA;
13336 GemmMicrokernelTester()
13337 .mr(6)
13338 .nr(8)
13339 .kr(1)
13340 .sr(1)
13341 .m(6)
13342 .n(8)
13343 .k(4)
13344 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13345 }
13346
13347 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, strided_cn) {
13348 TEST_REQUIRES_ARM_NEON_FMA;
13349 GemmMicrokernelTester()
13350 .mr(6)
13351 .nr(8)
13352 .kr(1)
13353 .sr(1)
13354 .m(6)
13355 .n(8)
13356 .k(4)
13357 .cn_stride(11)
13358 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13359 }
13360
13361 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
13362 TEST_REQUIRES_ARM_NEON_FMA;
13363 GemmMicrokernelTester()
13364 .mr(6)
13365 .nr(8)
13366 .kr(1)
13367 .sr(1)
13368 .m(6)
13369 .n(8)
13370 .k(4)
13371 .a_stride(7)
13372 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13373 }
13374
13375 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
13376 TEST_REQUIRES_ARM_NEON_FMA;
13377 for (uint32_t m = 1; m <= 6; m++) {
13378 for (uint32_t n = 1; n <= 8; n++) {
13379 GemmMicrokernelTester()
13380 .mr(6)
13381 .nr(8)
13382 .kr(1)
13383 .sr(1)
13384 .m(m)
13385 .n(n)
13386 .k(4)
13387 .iterations(1)
13388 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13389 }
13390 }
13391 }
13392
13393 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
13394 TEST_REQUIRES_ARM_NEON_FMA;
13395 for (uint32_t m = 1; m <= 6; m++) {
13396 GemmMicrokernelTester()
13397 .mr(6)
13398 .nr(8)
13399 .kr(1)
13400 .sr(1)
13401 .m(m)
13402 .n(8)
13403 .k(4)
13404 .iterations(1)
13405 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13406 }
13407 }
13408
13409 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
13410 TEST_REQUIRES_ARM_NEON_FMA;
13411 for (uint32_t n = 1; n <= 8; n++) {
13412 GemmMicrokernelTester()
13413 .mr(6)
13414 .nr(8)
13415 .kr(1)
13416 .sr(1)
13417 .m(6)
13418 .n(n)
13419 .k(4)
13420 .iterations(1)
13421 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13422 }
13423 }
13424
13425 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_lt_4) {
13426 TEST_REQUIRES_ARM_NEON_FMA;
13427 for (size_t k = 1; k < 4; k++) {
13428 GemmMicrokernelTester()
13429 .mr(6)
13430 .nr(8)
13431 .kr(1)
13432 .sr(1)
13433 .m(6)
13434 .n(8)
13435 .k(k)
13436 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13437 }
13438 }
13439
13440 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
13441 TEST_REQUIRES_ARM_NEON_FMA;
13442 for (size_t k = 1; k < 4; k++) {
13443 GemmMicrokernelTester()
13444 .mr(6)
13445 .nr(8)
13446 .kr(1)
13447 .sr(1)
13448 .m(6)
13449 .n(8)
13450 .k(k)
13451 .a_stride(7)
13452 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13453 }
13454 }
13455
13456 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
13457 TEST_REQUIRES_ARM_NEON_FMA;
13458 for (size_t k = 1; k < 4; k++) {
13459 for (uint32_t m = 1; m <= 6; m++) {
13460 for (uint32_t n = 1; n <= 8; n++) {
13461 GemmMicrokernelTester()
13462 .mr(6)
13463 .nr(8)
13464 .kr(1)
13465 .sr(1)
13466 .m(m)
13467 .n(n)
13468 .k(k)
13469 .iterations(1)
13470 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13471 }
13472 }
13473 }
13474 }
13475
13476 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_gt_4) {
13477 TEST_REQUIRES_ARM_NEON_FMA;
13478 for (size_t k = 5; k < 8; k++) {
13479 GemmMicrokernelTester()
13480 .mr(6)
13481 .nr(8)
13482 .kr(1)
13483 .sr(1)
13484 .m(6)
13485 .n(8)
13486 .k(k)
13487 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13488 }
13489 }
13490
13491 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
13492 TEST_REQUIRES_ARM_NEON_FMA;
13493 for (size_t k = 5; k < 8; k++) {
13494 GemmMicrokernelTester()
13495 .mr(6)
13496 .nr(8)
13497 .kr(1)
13498 .sr(1)
13499 .m(6)
13500 .n(8)
13501 .k(k)
13502 .a_stride(11)
13503 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13504 }
13505 }
13506
13507 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
13508 TEST_REQUIRES_ARM_NEON_FMA;
13509 for (size_t k = 5; k < 8; k++) {
13510 for (uint32_t m = 1; m <= 6; m++) {
13511 for (uint32_t n = 1; n <= 8; n++) {
13512 GemmMicrokernelTester()
13513 .mr(6)
13514 .nr(8)
13515 .kr(1)
13516 .sr(1)
13517 .m(m)
13518 .n(n)
13519 .k(k)
13520 .iterations(1)
13521 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13522 }
13523 }
13524 }
13525 }
13526
13527 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_div_4) {
13528 TEST_REQUIRES_ARM_NEON_FMA;
13529 for (size_t k = 8; k <= 40; k += 4) {
13530 GemmMicrokernelTester()
13531 .mr(6)
13532 .nr(8)
13533 .kr(1)
13534 .sr(1)
13535 .m(6)
13536 .n(8)
13537 .k(k)
13538 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13539 }
13540 }
13541
13542 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
13543 TEST_REQUIRES_ARM_NEON_FMA;
13544 for (size_t k = 8; k <= 40; k += 4) {
13545 GemmMicrokernelTester()
13546 .mr(6)
13547 .nr(8)
13548 .kr(1)
13549 .sr(1)
13550 .m(6)
13551 .n(8)
13552 .k(k)
13553 .a_stride(43)
13554 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13555 }
13556 }
13557
13558 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
13559 TEST_REQUIRES_ARM_NEON_FMA;
13560 for (size_t k = 8; k <= 40; k += 4) {
13561 for (uint32_t m = 1; m <= 6; m++) {
13562 for (uint32_t n = 1; n <= 8; n++) {
13563 GemmMicrokernelTester()
13564 .mr(6)
13565 .nr(8)
13566 .kr(1)
13567 .sr(1)
13568 .m(m)
13569 .n(n)
13570 .k(k)
13571 .iterations(1)
13572 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13573 }
13574 }
13575 }
13576 }
13577
13578 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8) {
13579 TEST_REQUIRES_ARM_NEON_FMA;
13580 for (uint32_t n = 9; n < 16; n++) {
13581 for (size_t k = 1; k <= 20; k += 5) {
13582 GemmMicrokernelTester()
13583 .mr(6)
13584 .nr(8)
13585 .kr(1)
13586 .sr(1)
13587 .m(6)
13588 .n(8)
13589 .k(k)
13590 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13591 }
13592 }
13593 }
13594
13595 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
13596 TEST_REQUIRES_ARM_NEON_FMA;
13597 for (uint32_t n = 9; n < 16; n++) {
13598 for (size_t k = 1; k <= 20; k += 5) {
13599 GemmMicrokernelTester()
13600 .mr(6)
13601 .nr(8)
13602 .kr(1)
13603 .sr(1)
13604 .m(6)
13605 .n(8)
13606 .k(k)
13607 .cn_stride(11)
13608 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13609 }
13610 }
13611 }
13612
13613 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
13614 TEST_REQUIRES_ARM_NEON_FMA;
13615 for (uint32_t n = 9; n < 16; n++) {
13616 for (size_t k = 1; k <= 20; k += 5) {
13617 GemmMicrokernelTester()
13618 .mr(6)
13619 .nr(8)
13620 .kr(1)
13621 .sr(1)
13622 .m(6)
13623 .n(n)
13624 .k(k)
13625 .a_stride(23)
13626 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13627 }
13628 }
13629 }
13630
13631 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
13632 TEST_REQUIRES_ARM_NEON_FMA;
13633 for (uint32_t n = 9; n < 16; n++) {
13634 for (size_t k = 1; k <= 20; k += 5) {
13635 for (uint32_t m = 1; m <= 6; m++) {
13636 GemmMicrokernelTester()
13637 .mr(6)
13638 .nr(8)
13639 .kr(1)
13640 .sr(1)
13641 .m(m)
13642 .n(n)
13643 .k(k)
13644 .iterations(1)
13645 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13646 }
13647 }
13648 }
13649 }
13650
13651 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8) {
13652 TEST_REQUIRES_ARM_NEON_FMA;
13653 for (uint32_t n = 16; n <= 24; n += 8) {
13654 for (size_t k = 1; k <= 20; k += 5) {
13655 GemmMicrokernelTester()
13656 .mr(6)
13657 .nr(8)
13658 .kr(1)
13659 .sr(1)
13660 .m(6)
13661 .n(8)
13662 .k(k)
13663 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13664 }
13665 }
13666 }
13667
13668 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
13669 TEST_REQUIRES_ARM_NEON_FMA;
13670 for (uint32_t n = 16; n <= 24; n += 8) {
13671 for (size_t k = 1; k <= 20; k += 5) {
13672 GemmMicrokernelTester()
13673 .mr(6)
13674 .nr(8)
13675 .kr(1)
13676 .sr(1)
13677 .m(6)
13678 .n(n)
13679 .k(k)
13680 .cn_stride(11)
13681 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13682 }
13683 }
13684 }
13685
13686 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
13687 TEST_REQUIRES_ARM_NEON_FMA;
13688 for (uint32_t n = 16; n <= 24; n += 8) {
13689 for (size_t k = 1; k <= 20; k += 5) {
13690 GemmMicrokernelTester()
13691 .mr(6)
13692 .nr(8)
13693 .kr(1)
13694 .sr(1)
13695 .m(6)
13696 .n(n)
13697 .k(k)
13698 .a_stride(23)
13699 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13700 }
13701 }
13702 }
13703
13704 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
13705 TEST_REQUIRES_ARM_NEON_FMA;
13706 for (uint32_t n = 16; n <= 24; n += 8) {
13707 for (size_t k = 1; k <= 20; k += 5) {
13708 for (uint32_t m = 1; m <= 6; m++) {
13709 GemmMicrokernelTester()
13710 .mr(6)
13711 .nr(8)
13712 .kr(1)
13713 .sr(1)
13714 .m(m)
13715 .n(n)
13716 .k(k)
13717 .iterations(1)
13718 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13719 }
13720 }
13721 }
13722 }
13723
13724 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
13725 TEST_REQUIRES_ARM_NEON_FMA;
13726 for (size_t k = 1; k <= 20; k += 5) {
13727 for (uint32_t m = 1; m <= 6; m++) {
13728 for (uint32_t n = 1; n <= 8; n++) {
13729 GemmMicrokernelTester()
13730 .mr(6)
13731 .nr(8)
13732 .kr(1)
13733 .sr(1)
13734 .m(m)
13735 .n(n)
13736 .k(k)
13737 .cm_stride(11)
13738 .iterations(1)
13739 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13740 }
13741 }
13742 }
13743 }
13744
13745 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, qmin) {
13746 TEST_REQUIRES_ARM_NEON_FMA;
13747 GemmMicrokernelTester()
13748 .mr(6)
13749 .nr(8)
13750 .kr(1)
13751 .sr(1)
13752 .m(6)
13753 .n(8)
13754 .k(4)
13755 .qmin(128)
13756 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13757 }
13758
13759 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, qmax) {
13760 TEST_REQUIRES_ARM_NEON_FMA;
13761 GemmMicrokernelTester()
13762 .mr(6)
13763 .nr(8)
13764 .kr(1)
13765 .sr(1)
13766 .m(6)
13767 .n(8)
13768 .k(4)
13769 .qmax(128)
13770 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13771 }
13772
13773 TEST(F32_GEMMINC_6X8__NEONFMA_LANE_LD128, strided_cm) {
13774 TEST_REQUIRES_ARM_NEON_FMA;
13775 GemmMicrokernelTester()
13776 .mr(6)
13777 .nr(8)
13778 .kr(1)
13779 .sr(1)
13780 .m(6)
13781 .n(8)
13782 .k(4)
13783 .cm_stride(11)
13784 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_lane_ld128);
13785 }
13786#endif // XNN_ARCH_ARM64
13787
13788
Frank Barcharddf06d802019-11-20 15:53:46 -080013789#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080013790 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2) {
13791 TEST_REQUIRES_ARM_NEON;
13792 GemmMicrokernelTester()
13793 .mr(1)
13794 .nr(8)
13795 .kr(1)
13796 .sr(1)
13797 .m(1)
13798 .n(8)
13799 .k(2)
13800 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13801 }
13802
13803 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, strided_cn) {
13804 TEST_REQUIRES_ARM_NEON;
13805 GemmMicrokernelTester()
13806 .mr(1)
13807 .nr(8)
13808 .kr(1)
13809 .sr(1)
13810 .m(1)
13811 .n(8)
13812 .k(2)
13813 .cn_stride(11)
13814 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13815 }
13816
13817 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_strided_a) {
13818 TEST_REQUIRES_ARM_NEON;
13819 GemmMicrokernelTester()
13820 .mr(1)
13821 .nr(8)
13822 .kr(1)
13823 .sr(1)
13824 .m(1)
13825 .n(8)
13826 .k(2)
13827 .a_stride(5)
13828 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13829 }
13830
13831 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
13832 TEST_REQUIRES_ARM_NEON;
13833 for (uint32_t m = 1; m <= 1; m++) {
13834 for (uint32_t n = 1; n <= 8; n++) {
13835 GemmMicrokernelTester()
13836 .mr(1)
13837 .nr(8)
13838 .kr(1)
13839 .sr(1)
13840 .m(m)
13841 .n(n)
13842 .k(2)
13843 .iterations(1)
13844 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13845 }
13846 }
13847 }
13848
13849 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
13850 TEST_REQUIRES_ARM_NEON;
13851 for (uint32_t m = 1; m <= 1; m++) {
13852 GemmMicrokernelTester()
13853 .mr(1)
13854 .nr(8)
13855 .kr(1)
13856 .sr(1)
13857 .m(m)
13858 .n(8)
13859 .k(2)
13860 .iterations(1)
13861 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13862 }
13863 }
13864
13865 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
13866 TEST_REQUIRES_ARM_NEON;
13867 for (uint32_t n = 1; n <= 8; n++) {
13868 GemmMicrokernelTester()
13869 .mr(1)
13870 .nr(8)
13871 .kr(1)
13872 .sr(1)
13873 .m(1)
13874 .n(n)
13875 .k(2)
13876 .iterations(1)
13877 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13878 }
13879 }
13880
13881 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_lt_2) {
13882 TEST_REQUIRES_ARM_NEON;
13883 for (size_t k = 1; k < 2; k++) {
13884 GemmMicrokernelTester()
13885 .mr(1)
13886 .nr(8)
13887 .kr(1)
13888 .sr(1)
13889 .m(1)
13890 .n(8)
13891 .k(k)
13892 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13893 }
13894 }
13895
13896 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_lt_2_strided_a) {
13897 TEST_REQUIRES_ARM_NEON;
13898 for (size_t k = 1; k < 2; k++) {
13899 GemmMicrokernelTester()
13900 .mr(1)
13901 .nr(8)
13902 .kr(1)
13903 .sr(1)
13904 .m(1)
13905 .n(8)
13906 .k(k)
13907 .a_stride(5)
13908 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13909 }
13910 }
13911
13912 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
13913 TEST_REQUIRES_ARM_NEON;
13914 for (size_t k = 1; k < 2; k++) {
13915 for (uint32_t m = 1; m <= 1; m++) {
13916 for (uint32_t n = 1; n <= 8; n++) {
13917 GemmMicrokernelTester()
13918 .mr(1)
13919 .nr(8)
13920 .kr(1)
13921 .sr(1)
13922 .m(m)
13923 .n(n)
13924 .k(k)
13925 .iterations(1)
13926 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13927 }
13928 }
13929 }
13930 }
13931
13932 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_gt_2) {
13933 TEST_REQUIRES_ARM_NEON;
13934 for (size_t k = 3; k < 4; k++) {
13935 GemmMicrokernelTester()
13936 .mr(1)
13937 .nr(8)
13938 .kr(1)
13939 .sr(1)
13940 .m(1)
13941 .n(8)
13942 .k(k)
13943 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13944 }
13945 }
13946
13947 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_gt_2_strided_a) {
13948 TEST_REQUIRES_ARM_NEON;
13949 for (size_t k = 3; k < 4; k++) {
13950 GemmMicrokernelTester()
13951 .mr(1)
13952 .nr(8)
13953 .kr(1)
13954 .sr(1)
13955 .m(1)
13956 .n(8)
13957 .k(k)
13958 .a_stride(7)
13959 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13960 }
13961 }
13962
13963 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
13964 TEST_REQUIRES_ARM_NEON;
13965 for (size_t k = 3; k < 4; k++) {
13966 for (uint32_t m = 1; m <= 1; m++) {
13967 for (uint32_t n = 1; n <= 8; n++) {
13968 GemmMicrokernelTester()
13969 .mr(1)
13970 .nr(8)
13971 .kr(1)
13972 .sr(1)
13973 .m(m)
13974 .n(n)
13975 .k(k)
13976 .iterations(1)
13977 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13978 }
13979 }
13980 }
13981 }
13982
13983 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_div_2) {
13984 TEST_REQUIRES_ARM_NEON;
13985 for (size_t k = 4; k <= 20; k += 2) {
13986 GemmMicrokernelTester()
13987 .mr(1)
13988 .nr(8)
13989 .kr(1)
13990 .sr(1)
13991 .m(1)
13992 .n(8)
13993 .k(k)
13994 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
13995 }
13996 }
13997
13998 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_div_2_strided_a) {
13999 TEST_REQUIRES_ARM_NEON;
14000 for (size_t k = 4; k <= 20; k += 2) {
14001 GemmMicrokernelTester()
14002 .mr(1)
14003 .nr(8)
14004 .kr(1)
14005 .sr(1)
14006 .m(1)
14007 .n(8)
14008 .k(k)
14009 .a_stride(23)
14010 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14011 }
14012 }
14013
14014 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, k_div_2_subtile) {
14015 TEST_REQUIRES_ARM_NEON;
14016 for (size_t k = 4; k <= 20; k += 2) {
14017 for (uint32_t m = 1; m <= 1; m++) {
14018 for (uint32_t n = 1; n <= 8; n++) {
14019 GemmMicrokernelTester()
14020 .mr(1)
14021 .nr(8)
14022 .kr(1)
14023 .sr(1)
14024 .m(m)
14025 .n(n)
14026 .k(k)
14027 .iterations(1)
14028 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14029 }
14030 }
14031 }
14032 }
14033
14034 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8) {
14035 TEST_REQUIRES_ARM_NEON;
14036 for (uint32_t n = 9; n < 16; n++) {
14037 for (size_t k = 1; k <= 10; k += 3) {
14038 GemmMicrokernelTester()
14039 .mr(1)
14040 .nr(8)
14041 .kr(1)
14042 .sr(1)
14043 .m(1)
14044 .n(8)
14045 .k(k)
14046 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14047 }
14048 }
14049 }
14050
14051 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
14052 TEST_REQUIRES_ARM_NEON;
14053 for (uint32_t n = 9; n < 16; n++) {
14054 for (size_t k = 1; k <= 10; k += 3) {
14055 GemmMicrokernelTester()
14056 .mr(1)
14057 .nr(8)
14058 .kr(1)
14059 .sr(1)
14060 .m(1)
14061 .n(8)
14062 .k(k)
14063 .cn_stride(11)
14064 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14065 }
14066 }
14067 }
14068
14069 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8_strided_a) {
14070 TEST_REQUIRES_ARM_NEON;
14071 for (uint32_t n = 9; n < 16; n++) {
14072 for (size_t k = 1; k <= 10; k += 3) {
14073 GemmMicrokernelTester()
14074 .mr(1)
14075 .nr(8)
14076 .kr(1)
14077 .sr(1)
14078 .m(1)
14079 .n(n)
14080 .k(k)
14081 .a_stride(13)
14082 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14083 }
14084 }
14085 }
14086
14087 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
14088 TEST_REQUIRES_ARM_NEON;
14089 for (uint32_t n = 9; n < 16; n++) {
14090 for (size_t k = 1; k <= 10; k += 3) {
14091 for (uint32_t m = 1; m <= 1; m++) {
14092 GemmMicrokernelTester()
14093 .mr(1)
14094 .nr(8)
14095 .kr(1)
14096 .sr(1)
14097 .m(m)
14098 .n(n)
14099 .k(k)
14100 .iterations(1)
14101 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14102 }
14103 }
14104 }
14105 }
14106
14107 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8) {
14108 TEST_REQUIRES_ARM_NEON;
14109 for (uint32_t n = 16; n <= 24; n += 8) {
14110 for (size_t k = 1; k <= 10; k += 3) {
14111 GemmMicrokernelTester()
14112 .mr(1)
14113 .nr(8)
14114 .kr(1)
14115 .sr(1)
14116 .m(1)
14117 .n(8)
14118 .k(k)
14119 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14120 }
14121 }
14122 }
14123
14124 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
14125 TEST_REQUIRES_ARM_NEON;
14126 for (uint32_t n = 16; n <= 24; n += 8) {
14127 for (size_t k = 1; k <= 10; k += 3) {
14128 GemmMicrokernelTester()
14129 .mr(1)
14130 .nr(8)
14131 .kr(1)
14132 .sr(1)
14133 .m(1)
14134 .n(n)
14135 .k(k)
14136 .cn_stride(11)
14137 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14138 }
14139 }
14140 }
14141
14142 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8_strided_a) {
14143 TEST_REQUIRES_ARM_NEON;
14144 for (uint32_t n = 16; n <= 24; n += 8) {
14145 for (size_t k = 1; k <= 10; k += 3) {
14146 GemmMicrokernelTester()
14147 .mr(1)
14148 .nr(8)
14149 .kr(1)
14150 .sr(1)
14151 .m(1)
14152 .n(n)
14153 .k(k)
14154 .a_stride(13)
14155 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14156 }
14157 }
14158 }
14159
14160 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, n_div_8_subtile) {
14161 TEST_REQUIRES_ARM_NEON;
14162 for (uint32_t n = 16; n <= 24; n += 8) {
14163 for (size_t k = 1; k <= 10; k += 3) {
14164 for (uint32_t m = 1; m <= 1; m++) {
14165 GemmMicrokernelTester()
14166 .mr(1)
14167 .nr(8)
14168 .kr(1)
14169 .sr(1)
14170 .m(m)
14171 .n(n)
14172 .k(k)
14173 .iterations(1)
14174 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14175 }
14176 }
14177 }
14178 }
14179
14180 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, strided_cm_subtile) {
14181 TEST_REQUIRES_ARM_NEON;
14182 for (size_t k = 1; k <= 10; k += 3) {
14183 for (uint32_t m = 1; m <= 1; m++) {
14184 for (uint32_t n = 1; n <= 8; n++) {
14185 GemmMicrokernelTester()
14186 .mr(1)
14187 .nr(8)
14188 .kr(1)
14189 .sr(1)
14190 .m(m)
14191 .n(n)
14192 .k(k)
14193 .cm_stride(11)
14194 .iterations(1)
14195 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14196 }
14197 }
14198 }
14199 }
14200
14201 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, qmin) {
14202 TEST_REQUIRES_ARM_NEON;
14203 GemmMicrokernelTester()
14204 .mr(1)
14205 .nr(8)
14206 .kr(1)
14207 .sr(1)
14208 .m(1)
14209 .n(8)
14210 .k(2)
14211 .qmin(128)
14212 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14213 }
14214
14215 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, qmax) {
14216 TEST_REQUIRES_ARM_NEON;
14217 GemmMicrokernelTester()
14218 .mr(1)
14219 .nr(8)
14220 .kr(1)
14221 .sr(1)
14222 .m(1)
14223 .n(8)
14224 .k(2)
14225 .qmax(128)
14226 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14227 }
14228
14229 TEST(F32_GEMMINC_1X8__NEON_DUP_LD64, strided_cm) {
14230 TEST_REQUIRES_ARM_NEON;
14231 GemmMicrokernelTester()
14232 .mr(1)
14233 .nr(8)
14234 .kr(1)
14235 .sr(1)
14236 .m(1)
14237 .n(8)
14238 .k(2)
14239 .cm_stride(11)
14240 .Test(xnn_f32_gemminc_ukernel_1x8__neon_dup_ld64);
14241 }
14242#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14243
14244
14245#if XNN_ARCH_ARM || XNN_ARCH_ARM64
14246 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2) {
14247 TEST_REQUIRES_ARM_NEON;
14248 GemmMicrokernelTester()
14249 .mr(4)
14250 .nr(8)
14251 .kr(1)
14252 .sr(1)
14253 .m(4)
14254 .n(8)
14255 .k(2)
14256 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14257 }
14258
14259 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, strided_cn) {
14260 TEST_REQUIRES_ARM_NEON;
14261 GemmMicrokernelTester()
14262 .mr(4)
14263 .nr(8)
14264 .kr(1)
14265 .sr(1)
14266 .m(4)
14267 .n(8)
14268 .k(2)
14269 .cn_stride(11)
14270 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14271 }
14272
14273 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_strided_a) {
14274 TEST_REQUIRES_ARM_NEON;
14275 GemmMicrokernelTester()
14276 .mr(4)
14277 .nr(8)
14278 .kr(1)
14279 .sr(1)
14280 .m(4)
14281 .n(8)
14282 .k(2)
14283 .a_stride(5)
14284 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14285 }
14286
14287 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_subtile) {
14288 TEST_REQUIRES_ARM_NEON;
14289 for (uint32_t m = 1; m <= 4; m++) {
14290 for (uint32_t n = 1; n <= 8; n++) {
14291 GemmMicrokernelTester()
14292 .mr(4)
14293 .nr(8)
14294 .kr(1)
14295 .sr(1)
14296 .m(m)
14297 .n(n)
14298 .k(2)
14299 .iterations(1)
14300 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14301 }
14302 }
14303 }
14304
14305 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
14306 TEST_REQUIRES_ARM_NEON;
14307 for (uint32_t m = 1; m <= 4; m++) {
14308 GemmMicrokernelTester()
14309 .mr(4)
14310 .nr(8)
14311 .kr(1)
14312 .sr(1)
14313 .m(m)
14314 .n(8)
14315 .k(2)
14316 .iterations(1)
14317 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14318 }
14319 }
14320
14321 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
14322 TEST_REQUIRES_ARM_NEON;
14323 for (uint32_t n = 1; n <= 8; n++) {
14324 GemmMicrokernelTester()
14325 .mr(4)
14326 .nr(8)
14327 .kr(1)
14328 .sr(1)
14329 .m(4)
14330 .n(n)
14331 .k(2)
14332 .iterations(1)
14333 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14334 }
14335 }
14336
14337 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_lt_2) {
14338 TEST_REQUIRES_ARM_NEON;
14339 for (size_t k = 1; k < 2; k++) {
14340 GemmMicrokernelTester()
14341 .mr(4)
14342 .nr(8)
14343 .kr(1)
14344 .sr(1)
14345 .m(4)
14346 .n(8)
14347 .k(k)
14348 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14349 }
14350 }
14351
14352 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_lt_2_strided_a) {
14353 TEST_REQUIRES_ARM_NEON;
14354 for (size_t k = 1; k < 2; k++) {
14355 GemmMicrokernelTester()
14356 .mr(4)
14357 .nr(8)
14358 .kr(1)
14359 .sr(1)
14360 .m(4)
14361 .n(8)
14362 .k(k)
14363 .a_stride(5)
14364 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14365 }
14366 }
14367
14368 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_lt_2_subtile) {
14369 TEST_REQUIRES_ARM_NEON;
14370 for (size_t k = 1; k < 2; k++) {
14371 for (uint32_t m = 1; m <= 4; m++) {
14372 for (uint32_t n = 1; n <= 8; n++) {
14373 GemmMicrokernelTester()
14374 .mr(4)
14375 .nr(8)
14376 .kr(1)
14377 .sr(1)
14378 .m(m)
14379 .n(n)
14380 .k(k)
14381 .iterations(1)
14382 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14383 }
14384 }
14385 }
14386 }
14387
14388 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_gt_2) {
14389 TEST_REQUIRES_ARM_NEON;
14390 for (size_t k = 3; k < 4; k++) {
14391 GemmMicrokernelTester()
14392 .mr(4)
14393 .nr(8)
14394 .kr(1)
14395 .sr(1)
14396 .m(4)
14397 .n(8)
14398 .k(k)
14399 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14400 }
14401 }
14402
14403 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_gt_2_strided_a) {
14404 TEST_REQUIRES_ARM_NEON;
14405 for (size_t k = 3; k < 4; k++) {
14406 GemmMicrokernelTester()
14407 .mr(4)
14408 .nr(8)
14409 .kr(1)
14410 .sr(1)
14411 .m(4)
14412 .n(8)
14413 .k(k)
14414 .a_stride(7)
14415 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14416 }
14417 }
14418
14419 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_gt_2_subtile) {
14420 TEST_REQUIRES_ARM_NEON;
14421 for (size_t k = 3; k < 4; k++) {
14422 for (uint32_t m = 1; m <= 4; m++) {
14423 for (uint32_t n = 1; n <= 8; n++) {
14424 GemmMicrokernelTester()
14425 .mr(4)
14426 .nr(8)
14427 .kr(1)
14428 .sr(1)
14429 .m(m)
14430 .n(n)
14431 .k(k)
14432 .iterations(1)
14433 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14434 }
14435 }
14436 }
14437 }
14438
14439 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_div_2) {
14440 TEST_REQUIRES_ARM_NEON;
14441 for (size_t k = 4; k <= 20; k += 2) {
14442 GemmMicrokernelTester()
14443 .mr(4)
14444 .nr(8)
14445 .kr(1)
14446 .sr(1)
14447 .m(4)
14448 .n(8)
14449 .k(k)
14450 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14451 }
14452 }
14453
14454 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_div_2_strided_a) {
14455 TEST_REQUIRES_ARM_NEON;
14456 for (size_t k = 4; k <= 20; k += 2) {
14457 GemmMicrokernelTester()
14458 .mr(4)
14459 .nr(8)
14460 .kr(1)
14461 .sr(1)
14462 .m(4)
14463 .n(8)
14464 .k(k)
14465 .a_stride(23)
14466 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14467 }
14468 }
14469
14470 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, k_div_2_subtile) {
14471 TEST_REQUIRES_ARM_NEON;
14472 for (size_t k = 4; k <= 20; k += 2) {
14473 for (uint32_t m = 1; m <= 4; m++) {
14474 for (uint32_t n = 1; n <= 8; n++) {
14475 GemmMicrokernelTester()
14476 .mr(4)
14477 .nr(8)
14478 .kr(1)
14479 .sr(1)
14480 .m(m)
14481 .n(n)
14482 .k(k)
14483 .iterations(1)
14484 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14485 }
14486 }
14487 }
14488 }
14489
14490 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8) {
14491 TEST_REQUIRES_ARM_NEON;
14492 for (uint32_t n = 9; n < 16; n++) {
14493 for (size_t k = 1; k <= 10; k += 3) {
14494 GemmMicrokernelTester()
14495 .mr(4)
14496 .nr(8)
14497 .kr(1)
14498 .sr(1)
14499 .m(4)
14500 .n(8)
14501 .k(k)
14502 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14503 }
14504 }
14505 }
14506
14507 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
14508 TEST_REQUIRES_ARM_NEON;
14509 for (uint32_t n = 9; n < 16; n++) {
14510 for (size_t k = 1; k <= 10; k += 3) {
14511 GemmMicrokernelTester()
14512 .mr(4)
14513 .nr(8)
14514 .kr(1)
14515 .sr(1)
14516 .m(4)
14517 .n(8)
14518 .k(k)
14519 .cn_stride(11)
14520 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14521 }
14522 }
14523 }
14524
14525 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8_strided_a) {
14526 TEST_REQUIRES_ARM_NEON;
14527 for (uint32_t n = 9; n < 16; n++) {
14528 for (size_t k = 1; k <= 10; k += 3) {
14529 GemmMicrokernelTester()
14530 .mr(4)
14531 .nr(8)
14532 .kr(1)
14533 .sr(1)
14534 .m(4)
14535 .n(n)
14536 .k(k)
14537 .a_stride(13)
14538 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14539 }
14540 }
14541 }
14542
14543 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_gt_8_subtile) {
14544 TEST_REQUIRES_ARM_NEON;
14545 for (uint32_t n = 9; n < 16; n++) {
14546 for (size_t k = 1; k <= 10; k += 3) {
14547 for (uint32_t m = 1; m <= 4; m++) {
14548 GemmMicrokernelTester()
14549 .mr(4)
14550 .nr(8)
14551 .kr(1)
14552 .sr(1)
14553 .m(m)
14554 .n(n)
14555 .k(k)
14556 .iterations(1)
14557 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14558 }
14559 }
14560 }
14561 }
14562
14563 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8) {
14564 TEST_REQUIRES_ARM_NEON;
14565 for (uint32_t n = 16; n <= 24; n += 8) {
14566 for (size_t k = 1; k <= 10; k += 3) {
14567 GemmMicrokernelTester()
14568 .mr(4)
14569 .nr(8)
14570 .kr(1)
14571 .sr(1)
14572 .m(4)
14573 .n(8)
14574 .k(k)
14575 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14576 }
14577 }
14578 }
14579
14580 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8_strided_cn) {
14581 TEST_REQUIRES_ARM_NEON;
14582 for (uint32_t n = 16; n <= 24; n += 8) {
14583 for (size_t k = 1; k <= 10; k += 3) {
14584 GemmMicrokernelTester()
14585 .mr(4)
14586 .nr(8)
14587 .kr(1)
14588 .sr(1)
14589 .m(4)
14590 .n(n)
14591 .k(k)
14592 .cn_stride(11)
14593 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14594 }
14595 }
14596 }
14597
14598 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8_strided_a) {
14599 TEST_REQUIRES_ARM_NEON;
14600 for (uint32_t n = 16; n <= 24; n += 8) {
14601 for (size_t k = 1; k <= 10; k += 3) {
14602 GemmMicrokernelTester()
14603 .mr(4)
14604 .nr(8)
14605 .kr(1)
14606 .sr(1)
14607 .m(4)
14608 .n(n)
14609 .k(k)
14610 .a_stride(13)
14611 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14612 }
14613 }
14614 }
14615
14616 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, n_div_8_subtile) {
14617 TEST_REQUIRES_ARM_NEON;
14618 for (uint32_t n = 16; n <= 24; n += 8) {
14619 for (size_t k = 1; k <= 10; k += 3) {
14620 for (uint32_t m = 1; m <= 4; m++) {
14621 GemmMicrokernelTester()
14622 .mr(4)
14623 .nr(8)
14624 .kr(1)
14625 .sr(1)
14626 .m(m)
14627 .n(n)
14628 .k(k)
14629 .iterations(1)
14630 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14631 }
14632 }
14633 }
14634 }
14635
14636 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, strided_cm_subtile) {
14637 TEST_REQUIRES_ARM_NEON;
14638 for (size_t k = 1; k <= 10; k += 3) {
14639 for (uint32_t m = 1; m <= 4; m++) {
14640 for (uint32_t n = 1; n <= 8; n++) {
14641 GemmMicrokernelTester()
14642 .mr(4)
14643 .nr(8)
14644 .kr(1)
14645 .sr(1)
14646 .m(m)
14647 .n(n)
14648 .k(k)
14649 .cm_stride(11)
14650 .iterations(1)
14651 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14652 }
14653 }
14654 }
14655 }
14656
14657 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, qmin) {
14658 TEST_REQUIRES_ARM_NEON;
14659 GemmMicrokernelTester()
14660 .mr(4)
14661 .nr(8)
14662 .kr(1)
14663 .sr(1)
14664 .m(4)
14665 .n(8)
14666 .k(2)
14667 .qmin(128)
14668 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14669 }
14670
14671 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, qmax) {
14672 TEST_REQUIRES_ARM_NEON;
14673 GemmMicrokernelTester()
14674 .mr(4)
14675 .nr(8)
14676 .kr(1)
14677 .sr(1)
14678 .m(4)
14679 .n(8)
14680 .k(2)
14681 .qmax(128)
14682 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14683 }
14684
14685 TEST(F32_GEMMINC_4X8__NEON_DUP_LD64, strided_cm) {
14686 TEST_REQUIRES_ARM_NEON;
14687 GemmMicrokernelTester()
14688 .mr(4)
14689 .nr(8)
14690 .kr(1)
14691 .sr(1)
14692 .m(4)
14693 .n(8)
14694 .k(2)
14695 .cm_stride(11)
14696 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld64);
14697 }
14698#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14699
14700
14701#if XNN_ARCH_ARM || XNN_ARCH_ARM64
14702 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4) {
14703 TEST_REQUIRES_ARM_NEON;
14704 GemmMicrokernelTester()
14705 .mr(4)
14706 .nr(8)
14707 .kr(1)
14708 .sr(1)
14709 .m(4)
14710 .n(8)
14711 .k(4)
14712 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14713 }
14714
14715 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, strided_cn) {
14716 TEST_REQUIRES_ARM_NEON;
14717 GemmMicrokernelTester()
14718 .mr(4)
14719 .nr(8)
14720 .kr(1)
14721 .sr(1)
14722 .m(4)
14723 .n(8)
14724 .k(4)
14725 .cn_stride(11)
14726 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14727 }
14728
14729 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_strided_a) {
14730 TEST_REQUIRES_ARM_NEON;
14731 GemmMicrokernelTester()
14732 .mr(4)
14733 .nr(8)
14734 .kr(1)
14735 .sr(1)
14736 .m(4)
14737 .n(8)
14738 .k(4)
14739 .a_stride(7)
14740 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14741 }
14742
14743 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
14744 TEST_REQUIRES_ARM_NEON;
14745 for (uint32_t m = 1; m <= 4; m++) {
14746 for (uint32_t n = 1; n <= 8; n++) {
14747 GemmMicrokernelTester()
14748 .mr(4)
14749 .nr(8)
14750 .kr(1)
14751 .sr(1)
14752 .m(m)
14753 .n(n)
14754 .k(4)
14755 .iterations(1)
14756 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14757 }
14758 }
14759 }
14760
14761 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
14762 TEST_REQUIRES_ARM_NEON;
14763 for (uint32_t m = 1; m <= 4; m++) {
14764 GemmMicrokernelTester()
14765 .mr(4)
14766 .nr(8)
14767 .kr(1)
14768 .sr(1)
14769 .m(m)
14770 .n(8)
14771 .k(4)
14772 .iterations(1)
14773 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14774 }
14775 }
14776
14777 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
14778 TEST_REQUIRES_ARM_NEON;
14779 for (uint32_t n = 1; n <= 8; n++) {
14780 GemmMicrokernelTester()
14781 .mr(4)
14782 .nr(8)
14783 .kr(1)
14784 .sr(1)
14785 .m(4)
14786 .n(n)
14787 .k(4)
14788 .iterations(1)
14789 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14790 }
14791 }
14792
14793 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_lt_4) {
14794 TEST_REQUIRES_ARM_NEON;
14795 for (size_t k = 1; k < 4; k++) {
14796 GemmMicrokernelTester()
14797 .mr(4)
14798 .nr(8)
14799 .kr(1)
14800 .sr(1)
14801 .m(4)
14802 .n(8)
14803 .k(k)
14804 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14805 }
14806 }
14807
14808 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_lt_4_strided_a) {
14809 TEST_REQUIRES_ARM_NEON;
14810 for (size_t k = 1; k < 4; k++) {
14811 GemmMicrokernelTester()
14812 .mr(4)
14813 .nr(8)
14814 .kr(1)
14815 .sr(1)
14816 .m(4)
14817 .n(8)
14818 .k(k)
14819 .a_stride(7)
14820 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14821 }
14822 }
14823
14824 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
14825 TEST_REQUIRES_ARM_NEON;
14826 for (size_t k = 1; k < 4; k++) {
14827 for (uint32_t m = 1; m <= 4; m++) {
14828 for (uint32_t n = 1; n <= 8; n++) {
14829 GemmMicrokernelTester()
14830 .mr(4)
14831 .nr(8)
14832 .kr(1)
14833 .sr(1)
14834 .m(m)
14835 .n(n)
14836 .k(k)
14837 .iterations(1)
14838 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14839 }
14840 }
14841 }
14842 }
14843
14844 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_gt_4) {
14845 TEST_REQUIRES_ARM_NEON;
14846 for (size_t k = 5; k < 8; k++) {
14847 GemmMicrokernelTester()
14848 .mr(4)
14849 .nr(8)
14850 .kr(1)
14851 .sr(1)
14852 .m(4)
14853 .n(8)
14854 .k(k)
14855 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14856 }
14857 }
14858
14859 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_gt_4_strided_a) {
14860 TEST_REQUIRES_ARM_NEON;
14861 for (size_t k = 5; k < 8; k++) {
14862 GemmMicrokernelTester()
14863 .mr(4)
14864 .nr(8)
14865 .kr(1)
14866 .sr(1)
14867 .m(4)
14868 .n(8)
14869 .k(k)
14870 .a_stride(11)
14871 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14872 }
14873 }
14874
14875 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
14876 TEST_REQUIRES_ARM_NEON;
14877 for (size_t k = 5; k < 8; k++) {
14878 for (uint32_t m = 1; m <= 4; m++) {
14879 for (uint32_t n = 1; n <= 8; n++) {
14880 GemmMicrokernelTester()
14881 .mr(4)
14882 .nr(8)
14883 .kr(1)
14884 .sr(1)
14885 .m(m)
14886 .n(n)
14887 .k(k)
14888 .iterations(1)
14889 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14890 }
14891 }
14892 }
14893 }
14894
14895 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_div_4) {
14896 TEST_REQUIRES_ARM_NEON;
14897 for (size_t k = 8; k <= 40; k += 4) {
14898 GemmMicrokernelTester()
14899 .mr(4)
14900 .nr(8)
14901 .kr(1)
14902 .sr(1)
14903 .m(4)
14904 .n(8)
14905 .k(k)
14906 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14907 }
14908 }
14909
14910 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_div_4_strided_a) {
14911 TEST_REQUIRES_ARM_NEON;
14912 for (size_t k = 8; k <= 40; k += 4) {
14913 GemmMicrokernelTester()
14914 .mr(4)
14915 .nr(8)
14916 .kr(1)
14917 .sr(1)
14918 .m(4)
14919 .n(8)
14920 .k(k)
14921 .a_stride(43)
14922 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14923 }
14924 }
14925
14926 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, k_div_4_subtile) {
14927 TEST_REQUIRES_ARM_NEON;
14928 for (size_t k = 8; k <= 40; k += 4) {
14929 for (uint32_t m = 1; m <= 4; m++) {
14930 for (uint32_t n = 1; n <= 8; n++) {
14931 GemmMicrokernelTester()
14932 .mr(4)
14933 .nr(8)
14934 .kr(1)
14935 .sr(1)
14936 .m(m)
14937 .n(n)
14938 .k(k)
14939 .iterations(1)
14940 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14941 }
14942 }
14943 }
14944 }
14945
14946 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8) {
14947 TEST_REQUIRES_ARM_NEON;
14948 for (uint32_t n = 9; n < 16; n++) {
14949 for (size_t k = 1; k <= 20; k += 5) {
14950 GemmMicrokernelTester()
14951 .mr(4)
14952 .nr(8)
14953 .kr(1)
14954 .sr(1)
14955 .m(4)
14956 .n(8)
14957 .k(k)
14958 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14959 }
14960 }
14961 }
14962
14963 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
14964 TEST_REQUIRES_ARM_NEON;
14965 for (uint32_t n = 9; n < 16; n++) {
14966 for (size_t k = 1; k <= 20; k += 5) {
14967 GemmMicrokernelTester()
14968 .mr(4)
14969 .nr(8)
14970 .kr(1)
14971 .sr(1)
14972 .m(4)
14973 .n(8)
14974 .k(k)
14975 .cn_stride(11)
14976 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14977 }
14978 }
14979 }
14980
14981 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8_strided_a) {
14982 TEST_REQUIRES_ARM_NEON;
14983 for (uint32_t n = 9; n < 16; n++) {
14984 for (size_t k = 1; k <= 20; k += 5) {
14985 GemmMicrokernelTester()
14986 .mr(4)
14987 .nr(8)
14988 .kr(1)
14989 .sr(1)
14990 .m(4)
14991 .n(n)
14992 .k(k)
14993 .a_stride(23)
14994 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
14995 }
14996 }
14997 }
14998
14999 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
15000 TEST_REQUIRES_ARM_NEON;
15001 for (uint32_t n = 9; n < 16; n++) {
15002 for (size_t k = 1; k <= 20; k += 5) {
15003 for (uint32_t m = 1; m <= 4; m++) {
15004 GemmMicrokernelTester()
15005 .mr(4)
15006 .nr(8)
15007 .kr(1)
15008 .sr(1)
15009 .m(m)
15010 .n(n)
15011 .k(k)
15012 .iterations(1)
15013 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15014 }
15015 }
15016 }
15017 }
15018
15019 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8) {
15020 TEST_REQUIRES_ARM_NEON;
15021 for (uint32_t n = 16; n <= 24; n += 8) {
15022 for (size_t k = 1; k <= 20; k += 5) {
15023 GemmMicrokernelTester()
15024 .mr(4)
15025 .nr(8)
15026 .kr(1)
15027 .sr(1)
15028 .m(4)
15029 .n(8)
15030 .k(k)
15031 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15032 }
15033 }
15034 }
15035
15036 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
15037 TEST_REQUIRES_ARM_NEON;
15038 for (uint32_t n = 16; n <= 24; n += 8) {
15039 for (size_t k = 1; k <= 20; k += 5) {
15040 GemmMicrokernelTester()
15041 .mr(4)
15042 .nr(8)
15043 .kr(1)
15044 .sr(1)
15045 .m(4)
15046 .n(n)
15047 .k(k)
15048 .cn_stride(11)
15049 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15050 }
15051 }
15052 }
15053
15054 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8_strided_a) {
15055 TEST_REQUIRES_ARM_NEON;
15056 for (uint32_t n = 16; n <= 24; n += 8) {
15057 for (size_t k = 1; k <= 20; k += 5) {
15058 GemmMicrokernelTester()
15059 .mr(4)
15060 .nr(8)
15061 .kr(1)
15062 .sr(1)
15063 .m(4)
15064 .n(n)
15065 .k(k)
15066 .a_stride(23)
15067 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15068 }
15069 }
15070 }
15071
15072 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, n_div_8_subtile) {
15073 TEST_REQUIRES_ARM_NEON;
15074 for (uint32_t n = 16; n <= 24; n += 8) {
15075 for (size_t k = 1; k <= 20; k += 5) {
15076 for (uint32_t m = 1; m <= 4; m++) {
15077 GemmMicrokernelTester()
15078 .mr(4)
15079 .nr(8)
15080 .kr(1)
15081 .sr(1)
15082 .m(m)
15083 .n(n)
15084 .k(k)
15085 .iterations(1)
15086 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15087 }
15088 }
15089 }
15090 }
15091
15092 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, strided_cm_subtile) {
15093 TEST_REQUIRES_ARM_NEON;
15094 for (size_t k = 1; k <= 20; k += 5) {
15095 for (uint32_t m = 1; m <= 4; m++) {
15096 for (uint32_t n = 1; n <= 8; n++) {
15097 GemmMicrokernelTester()
15098 .mr(4)
15099 .nr(8)
15100 .kr(1)
15101 .sr(1)
15102 .m(m)
15103 .n(n)
15104 .k(k)
15105 .cm_stride(11)
15106 .iterations(1)
15107 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15108 }
15109 }
15110 }
15111 }
15112
15113 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, qmin) {
15114 TEST_REQUIRES_ARM_NEON;
15115 GemmMicrokernelTester()
15116 .mr(4)
15117 .nr(8)
15118 .kr(1)
15119 .sr(1)
15120 .m(4)
15121 .n(8)
15122 .k(4)
15123 .qmin(128)
15124 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15125 }
15126
15127 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, qmax) {
15128 TEST_REQUIRES_ARM_NEON;
15129 GemmMicrokernelTester()
15130 .mr(4)
15131 .nr(8)
15132 .kr(1)
15133 .sr(1)
15134 .m(4)
15135 .n(8)
15136 .k(4)
15137 .qmax(128)
15138 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15139 }
15140
15141 TEST(F32_GEMMINC_4X8__NEON_DUP_LD128, strided_cm) {
15142 TEST_REQUIRES_ARM_NEON;
15143 GemmMicrokernelTester()
15144 .mr(4)
15145 .nr(8)
15146 .kr(1)
15147 .sr(1)
15148 .m(4)
15149 .n(8)
15150 .k(4)
15151 .cm_stride(11)
15152 .Test(xnn_f32_gemminc_ukernel_4x8__neon_dup_ld128);
15153 }
15154#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15155
15156
15157#if XNN_ARCH_ARM || XNN_ARCH_ARM64
15158 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2) {
15159 TEST_REQUIRES_ARM_NEON;
15160 GemmMicrokernelTester()
15161 .mr(6)
15162 .nr(8)
15163 .kr(1)
15164 .sr(1)
15165 .m(6)
15166 .n(8)
15167 .k(2)
15168 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15169 }
15170
15171 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, strided_cn) {
15172 TEST_REQUIRES_ARM_NEON;
15173 GemmMicrokernelTester()
15174 .mr(6)
15175 .nr(8)
15176 .kr(1)
15177 .sr(1)
15178 .m(6)
15179 .n(8)
15180 .k(2)
15181 .cn_stride(11)
15182 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15183 }
15184
15185 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_strided_a) {
15186 TEST_REQUIRES_ARM_NEON;
15187 GemmMicrokernelTester()
15188 .mr(6)
15189 .nr(8)
15190 .kr(1)
15191 .sr(1)
15192 .m(6)
15193 .n(8)
15194 .k(2)
15195 .a_stride(5)
15196 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15197 }
15198
15199 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
15200 TEST_REQUIRES_ARM_NEON;
15201 for (uint32_t m = 1; m <= 6; m++) {
15202 for (uint32_t n = 1; n <= 8; n++) {
15203 GemmMicrokernelTester()
15204 .mr(6)
15205 .nr(8)
15206 .kr(1)
15207 .sr(1)
15208 .m(m)
15209 .n(n)
15210 .k(2)
15211 .iterations(1)
15212 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15213 }
15214 }
15215 }
15216
15217 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
15218 TEST_REQUIRES_ARM_NEON;
15219 for (uint32_t m = 1; m <= 6; m++) {
15220 GemmMicrokernelTester()
15221 .mr(6)
15222 .nr(8)
15223 .kr(1)
15224 .sr(1)
15225 .m(m)
15226 .n(8)
15227 .k(2)
15228 .iterations(1)
15229 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15230 }
15231 }
15232
15233 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
15234 TEST_REQUIRES_ARM_NEON;
15235 for (uint32_t n = 1; n <= 8; n++) {
15236 GemmMicrokernelTester()
15237 .mr(6)
15238 .nr(8)
15239 .kr(1)
15240 .sr(1)
15241 .m(6)
15242 .n(n)
15243 .k(2)
15244 .iterations(1)
15245 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15246 }
15247 }
15248
15249 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_lt_2) {
15250 TEST_REQUIRES_ARM_NEON;
15251 for (size_t k = 1; k < 2; k++) {
15252 GemmMicrokernelTester()
15253 .mr(6)
15254 .nr(8)
15255 .kr(1)
15256 .sr(1)
15257 .m(6)
15258 .n(8)
15259 .k(k)
15260 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15261 }
15262 }
15263
15264 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_lt_2_strided_a) {
15265 TEST_REQUIRES_ARM_NEON;
15266 for (size_t k = 1; k < 2; k++) {
15267 GemmMicrokernelTester()
15268 .mr(6)
15269 .nr(8)
15270 .kr(1)
15271 .sr(1)
15272 .m(6)
15273 .n(8)
15274 .k(k)
15275 .a_stride(5)
15276 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15277 }
15278 }
15279
15280 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
15281 TEST_REQUIRES_ARM_NEON;
15282 for (size_t k = 1; k < 2; k++) {
15283 for (uint32_t m = 1; m <= 6; m++) {
15284 for (uint32_t n = 1; n <= 8; n++) {
15285 GemmMicrokernelTester()
15286 .mr(6)
15287 .nr(8)
15288 .kr(1)
15289 .sr(1)
15290 .m(m)
15291 .n(n)
15292 .k(k)
15293 .iterations(1)
15294 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15295 }
15296 }
15297 }
15298 }
15299
15300 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_gt_2) {
15301 TEST_REQUIRES_ARM_NEON;
15302 for (size_t k = 3; k < 4; k++) {
15303 GemmMicrokernelTester()
15304 .mr(6)
15305 .nr(8)
15306 .kr(1)
15307 .sr(1)
15308 .m(6)
15309 .n(8)
15310 .k(k)
15311 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15312 }
15313 }
15314
15315 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_gt_2_strided_a) {
15316 TEST_REQUIRES_ARM_NEON;
15317 for (size_t k = 3; k < 4; k++) {
15318 GemmMicrokernelTester()
15319 .mr(6)
15320 .nr(8)
15321 .kr(1)
15322 .sr(1)
15323 .m(6)
15324 .n(8)
15325 .k(k)
15326 .a_stride(7)
15327 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15328 }
15329 }
15330
15331 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
15332 TEST_REQUIRES_ARM_NEON;
15333 for (size_t k = 3; k < 4; k++) {
15334 for (uint32_t m = 1; m <= 6; m++) {
15335 for (uint32_t n = 1; n <= 8; n++) {
15336 GemmMicrokernelTester()
15337 .mr(6)
15338 .nr(8)
15339 .kr(1)
15340 .sr(1)
15341 .m(m)
15342 .n(n)
15343 .k(k)
15344 .iterations(1)
15345 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15346 }
15347 }
15348 }
15349 }
15350
15351 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_div_2) {
15352 TEST_REQUIRES_ARM_NEON;
15353 for (size_t k = 4; k <= 20; k += 2) {
15354 GemmMicrokernelTester()
15355 .mr(6)
15356 .nr(8)
15357 .kr(1)
15358 .sr(1)
15359 .m(6)
15360 .n(8)
15361 .k(k)
15362 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15363 }
15364 }
15365
15366 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_div_2_strided_a) {
15367 TEST_REQUIRES_ARM_NEON;
15368 for (size_t k = 4; k <= 20; k += 2) {
15369 GemmMicrokernelTester()
15370 .mr(6)
15371 .nr(8)
15372 .kr(1)
15373 .sr(1)
15374 .m(6)
15375 .n(8)
15376 .k(k)
15377 .a_stride(23)
15378 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15379 }
15380 }
15381
15382 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, k_div_2_subtile) {
15383 TEST_REQUIRES_ARM_NEON;
15384 for (size_t k = 4; k <= 20; k += 2) {
15385 for (uint32_t m = 1; m <= 6; m++) {
15386 for (uint32_t n = 1; n <= 8; n++) {
15387 GemmMicrokernelTester()
15388 .mr(6)
15389 .nr(8)
15390 .kr(1)
15391 .sr(1)
15392 .m(m)
15393 .n(n)
15394 .k(k)
15395 .iterations(1)
15396 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15397 }
15398 }
15399 }
15400 }
15401
15402 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8) {
15403 TEST_REQUIRES_ARM_NEON;
15404 for (uint32_t n = 9; n < 16; n++) {
15405 for (size_t k = 1; k <= 10; k += 3) {
15406 GemmMicrokernelTester()
15407 .mr(6)
15408 .nr(8)
15409 .kr(1)
15410 .sr(1)
15411 .m(6)
15412 .n(8)
15413 .k(k)
15414 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15415 }
15416 }
15417 }
15418
15419 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
15420 TEST_REQUIRES_ARM_NEON;
15421 for (uint32_t n = 9; n < 16; n++) {
15422 for (size_t k = 1; k <= 10; k += 3) {
15423 GemmMicrokernelTester()
15424 .mr(6)
15425 .nr(8)
15426 .kr(1)
15427 .sr(1)
15428 .m(6)
15429 .n(8)
15430 .k(k)
15431 .cn_stride(11)
15432 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15433 }
15434 }
15435 }
15436
15437 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8_strided_a) {
15438 TEST_REQUIRES_ARM_NEON;
15439 for (uint32_t n = 9; n < 16; n++) {
15440 for (size_t k = 1; k <= 10; k += 3) {
15441 GemmMicrokernelTester()
15442 .mr(6)
15443 .nr(8)
15444 .kr(1)
15445 .sr(1)
15446 .m(6)
15447 .n(n)
15448 .k(k)
15449 .a_stride(13)
15450 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15451 }
15452 }
15453 }
15454
15455 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
15456 TEST_REQUIRES_ARM_NEON;
15457 for (uint32_t n = 9; n < 16; n++) {
15458 for (size_t k = 1; k <= 10; k += 3) {
15459 for (uint32_t m = 1; m <= 6; m++) {
15460 GemmMicrokernelTester()
15461 .mr(6)
15462 .nr(8)
15463 .kr(1)
15464 .sr(1)
15465 .m(m)
15466 .n(n)
15467 .k(k)
15468 .iterations(1)
15469 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15470 }
15471 }
15472 }
15473 }
15474
15475 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8) {
15476 TEST_REQUIRES_ARM_NEON;
15477 for (uint32_t n = 16; n <= 24; n += 8) {
15478 for (size_t k = 1; k <= 10; k += 3) {
15479 GemmMicrokernelTester()
15480 .mr(6)
15481 .nr(8)
15482 .kr(1)
15483 .sr(1)
15484 .m(6)
15485 .n(8)
15486 .k(k)
15487 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15488 }
15489 }
15490 }
15491
15492 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
15493 TEST_REQUIRES_ARM_NEON;
15494 for (uint32_t n = 16; n <= 24; n += 8) {
15495 for (size_t k = 1; k <= 10; k += 3) {
15496 GemmMicrokernelTester()
15497 .mr(6)
15498 .nr(8)
15499 .kr(1)
15500 .sr(1)
15501 .m(6)
15502 .n(n)
15503 .k(k)
15504 .cn_stride(11)
15505 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15506 }
15507 }
15508 }
15509
15510 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8_strided_a) {
15511 TEST_REQUIRES_ARM_NEON;
15512 for (uint32_t n = 16; n <= 24; n += 8) {
15513 for (size_t k = 1; k <= 10; k += 3) {
15514 GemmMicrokernelTester()
15515 .mr(6)
15516 .nr(8)
15517 .kr(1)
15518 .sr(1)
15519 .m(6)
15520 .n(n)
15521 .k(k)
15522 .a_stride(13)
15523 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15524 }
15525 }
15526 }
15527
15528 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, n_div_8_subtile) {
15529 TEST_REQUIRES_ARM_NEON;
15530 for (uint32_t n = 16; n <= 24; n += 8) {
15531 for (size_t k = 1; k <= 10; k += 3) {
15532 for (uint32_t m = 1; m <= 6; m++) {
15533 GemmMicrokernelTester()
15534 .mr(6)
15535 .nr(8)
15536 .kr(1)
15537 .sr(1)
15538 .m(m)
15539 .n(n)
15540 .k(k)
15541 .iterations(1)
15542 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15543 }
15544 }
15545 }
15546 }
15547
15548 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, strided_cm_subtile) {
15549 TEST_REQUIRES_ARM_NEON;
15550 for (size_t k = 1; k <= 10; k += 3) {
15551 for (uint32_t m = 1; m <= 6; m++) {
15552 for (uint32_t n = 1; n <= 8; n++) {
15553 GemmMicrokernelTester()
15554 .mr(6)
15555 .nr(8)
15556 .kr(1)
15557 .sr(1)
15558 .m(m)
15559 .n(n)
15560 .k(k)
15561 .cm_stride(11)
15562 .iterations(1)
15563 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15564 }
15565 }
15566 }
15567 }
15568
15569 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, qmin) {
15570 TEST_REQUIRES_ARM_NEON;
15571 GemmMicrokernelTester()
15572 .mr(6)
15573 .nr(8)
15574 .kr(1)
15575 .sr(1)
15576 .m(6)
15577 .n(8)
15578 .k(2)
15579 .qmin(128)
15580 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15581 }
15582
15583 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, qmax) {
15584 TEST_REQUIRES_ARM_NEON;
15585 GemmMicrokernelTester()
15586 .mr(6)
15587 .nr(8)
15588 .kr(1)
15589 .sr(1)
15590 .m(6)
15591 .n(8)
15592 .k(2)
15593 .qmax(128)
15594 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15595 }
15596
15597 TEST(F32_GEMMINC_6X8__NEON_DUP_LD64, strided_cm) {
15598 TEST_REQUIRES_ARM_NEON;
15599 GemmMicrokernelTester()
15600 .mr(6)
15601 .nr(8)
15602 .kr(1)
15603 .sr(1)
15604 .m(6)
15605 .n(8)
15606 .k(2)
15607 .cm_stride(11)
15608 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld64);
15609 }
15610#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
15611
15612
15613#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard69172d92019-11-26 16:22:39 -080015614 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4) {
15615 TEST_REQUIRES_ARM_NEON;
15616 GemmMicrokernelTester()
15617 .mr(6)
15618 .nr(8)
15619 .kr(1)
15620 .sr(1)
15621 .m(6)
15622 .n(8)
15623 .k(4)
15624 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15625 }
15626
15627 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, strided_cn) {
15628 TEST_REQUIRES_ARM_NEON;
15629 GemmMicrokernelTester()
15630 .mr(6)
15631 .nr(8)
15632 .kr(1)
15633 .sr(1)
15634 .m(6)
15635 .n(8)
15636 .k(4)
15637 .cn_stride(11)
15638 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15639 }
15640
15641 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_strided_a) {
15642 TEST_REQUIRES_ARM_NEON;
15643 GemmMicrokernelTester()
15644 .mr(6)
15645 .nr(8)
15646 .kr(1)
15647 .sr(1)
15648 .m(6)
15649 .n(8)
15650 .k(4)
15651 .a_stride(7)
15652 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15653 }
15654
15655 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_subtile) {
15656 TEST_REQUIRES_ARM_NEON;
15657 for (uint32_t m = 1; m <= 6; m++) {
15658 for (uint32_t n = 1; n <= 8; n++) {
15659 GemmMicrokernelTester()
15660 .mr(6)
15661 .nr(8)
15662 .kr(1)
15663 .sr(1)
15664 .m(m)
15665 .n(n)
15666 .k(4)
15667 .iterations(1)
15668 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15669 }
15670 }
15671 }
15672
15673 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
15674 TEST_REQUIRES_ARM_NEON;
15675 for (uint32_t m = 1; m <= 6; m++) {
15676 GemmMicrokernelTester()
15677 .mr(6)
15678 .nr(8)
15679 .kr(1)
15680 .sr(1)
15681 .m(m)
15682 .n(8)
15683 .k(4)
15684 .iterations(1)
15685 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15686 }
15687 }
15688
15689 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
15690 TEST_REQUIRES_ARM_NEON;
15691 for (uint32_t n = 1; n <= 8; n++) {
15692 GemmMicrokernelTester()
15693 .mr(6)
15694 .nr(8)
15695 .kr(1)
15696 .sr(1)
15697 .m(6)
15698 .n(n)
15699 .k(4)
15700 .iterations(1)
15701 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15702 }
15703 }
15704
15705 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_lt_4) {
15706 TEST_REQUIRES_ARM_NEON;
15707 for (size_t k = 1; k < 4; k++) {
15708 GemmMicrokernelTester()
15709 .mr(6)
15710 .nr(8)
15711 .kr(1)
15712 .sr(1)
15713 .m(6)
15714 .n(8)
15715 .k(k)
15716 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15717 }
15718 }
15719
15720 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_lt_4_strided_a) {
15721 TEST_REQUIRES_ARM_NEON;
15722 for (size_t k = 1; k < 4; k++) {
15723 GemmMicrokernelTester()
15724 .mr(6)
15725 .nr(8)
15726 .kr(1)
15727 .sr(1)
15728 .m(6)
15729 .n(8)
15730 .k(k)
15731 .a_stride(7)
15732 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15733 }
15734 }
15735
15736 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_lt_4_subtile) {
15737 TEST_REQUIRES_ARM_NEON;
15738 for (size_t k = 1; k < 4; k++) {
15739 for (uint32_t m = 1; m <= 6; m++) {
15740 for (uint32_t n = 1; n <= 8; n++) {
15741 GemmMicrokernelTester()
15742 .mr(6)
15743 .nr(8)
15744 .kr(1)
15745 .sr(1)
15746 .m(m)
15747 .n(n)
15748 .k(k)
15749 .iterations(1)
15750 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15751 }
15752 }
15753 }
15754 }
15755
15756 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_gt_4) {
15757 TEST_REQUIRES_ARM_NEON;
15758 for (size_t k = 5; k < 8; k++) {
15759 GemmMicrokernelTester()
15760 .mr(6)
15761 .nr(8)
15762 .kr(1)
15763 .sr(1)
15764 .m(6)
15765 .n(8)
15766 .k(k)
15767 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15768 }
15769 }
15770
15771 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_gt_4_strided_a) {
15772 TEST_REQUIRES_ARM_NEON;
15773 for (size_t k = 5; k < 8; k++) {
15774 GemmMicrokernelTester()
15775 .mr(6)
15776 .nr(8)
15777 .kr(1)
15778 .sr(1)
15779 .m(6)
15780 .n(8)
15781 .k(k)
15782 .a_stride(11)
15783 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15784 }
15785 }
15786
15787 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_gt_4_subtile) {
15788 TEST_REQUIRES_ARM_NEON;
15789 for (size_t k = 5; k < 8; k++) {
15790 for (uint32_t m = 1; m <= 6; m++) {
15791 for (uint32_t n = 1; n <= 8; n++) {
15792 GemmMicrokernelTester()
15793 .mr(6)
15794 .nr(8)
15795 .kr(1)
15796 .sr(1)
15797 .m(m)
15798 .n(n)
15799 .k(k)
15800 .iterations(1)
15801 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15802 }
15803 }
15804 }
15805 }
15806
15807 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_div_4) {
15808 TEST_REQUIRES_ARM_NEON;
15809 for (size_t k = 8; k <= 40; k += 4) {
15810 GemmMicrokernelTester()
15811 .mr(6)
15812 .nr(8)
15813 .kr(1)
15814 .sr(1)
15815 .m(6)
15816 .n(8)
15817 .k(k)
15818 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15819 }
15820 }
15821
15822 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_div_4_strided_a) {
15823 TEST_REQUIRES_ARM_NEON;
15824 for (size_t k = 8; k <= 40; k += 4) {
15825 GemmMicrokernelTester()
15826 .mr(6)
15827 .nr(8)
15828 .kr(1)
15829 .sr(1)
15830 .m(6)
15831 .n(8)
15832 .k(k)
15833 .a_stride(43)
15834 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15835 }
15836 }
15837
15838 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, k_div_4_subtile) {
15839 TEST_REQUIRES_ARM_NEON;
15840 for (size_t k = 8; k <= 40; k += 4) {
15841 for (uint32_t m = 1; m <= 6; m++) {
15842 for (uint32_t n = 1; n <= 8; n++) {
15843 GemmMicrokernelTester()
15844 .mr(6)
15845 .nr(8)
15846 .kr(1)
15847 .sr(1)
15848 .m(m)
15849 .n(n)
15850 .k(k)
15851 .iterations(1)
15852 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15853 }
15854 }
15855 }
15856 }
15857
15858 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8) {
15859 TEST_REQUIRES_ARM_NEON;
15860 for (uint32_t n = 9; n < 16; n++) {
15861 for (size_t k = 1; k <= 20; k += 5) {
15862 GemmMicrokernelTester()
15863 .mr(6)
15864 .nr(8)
15865 .kr(1)
15866 .sr(1)
15867 .m(6)
15868 .n(8)
15869 .k(k)
15870 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15871 }
15872 }
15873 }
15874
15875 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
15876 TEST_REQUIRES_ARM_NEON;
15877 for (uint32_t n = 9; n < 16; n++) {
15878 for (size_t k = 1; k <= 20; k += 5) {
15879 GemmMicrokernelTester()
15880 .mr(6)
15881 .nr(8)
15882 .kr(1)
15883 .sr(1)
15884 .m(6)
15885 .n(8)
15886 .k(k)
15887 .cn_stride(11)
15888 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15889 }
15890 }
15891 }
15892
15893 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8_strided_a) {
15894 TEST_REQUIRES_ARM_NEON;
15895 for (uint32_t n = 9; n < 16; n++) {
15896 for (size_t k = 1; k <= 20; k += 5) {
15897 GemmMicrokernelTester()
15898 .mr(6)
15899 .nr(8)
15900 .kr(1)
15901 .sr(1)
15902 .m(6)
15903 .n(n)
15904 .k(k)
15905 .a_stride(23)
15906 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15907 }
15908 }
15909 }
15910
15911 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_gt_8_subtile) {
15912 TEST_REQUIRES_ARM_NEON;
15913 for (uint32_t n = 9; n < 16; n++) {
15914 for (size_t k = 1; k <= 20; k += 5) {
15915 for (uint32_t m = 1; m <= 6; m++) {
15916 GemmMicrokernelTester()
15917 .mr(6)
15918 .nr(8)
15919 .kr(1)
15920 .sr(1)
15921 .m(m)
15922 .n(n)
15923 .k(k)
15924 .iterations(1)
15925 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15926 }
15927 }
15928 }
15929 }
15930
15931 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8) {
15932 TEST_REQUIRES_ARM_NEON;
15933 for (uint32_t n = 16; n <= 24; n += 8) {
15934 for (size_t k = 1; k <= 20; k += 5) {
15935 GemmMicrokernelTester()
15936 .mr(6)
15937 .nr(8)
15938 .kr(1)
15939 .sr(1)
15940 .m(6)
15941 .n(8)
15942 .k(k)
15943 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15944 }
15945 }
15946 }
15947
15948 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8_strided_cn) {
15949 TEST_REQUIRES_ARM_NEON;
15950 for (uint32_t n = 16; n <= 24; n += 8) {
15951 for (size_t k = 1; k <= 20; k += 5) {
15952 GemmMicrokernelTester()
15953 .mr(6)
15954 .nr(8)
15955 .kr(1)
15956 .sr(1)
15957 .m(6)
15958 .n(n)
15959 .k(k)
15960 .cn_stride(11)
15961 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15962 }
15963 }
15964 }
15965
15966 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8_strided_a) {
15967 TEST_REQUIRES_ARM_NEON;
15968 for (uint32_t n = 16; n <= 24; n += 8) {
15969 for (size_t k = 1; k <= 20; k += 5) {
15970 GemmMicrokernelTester()
15971 .mr(6)
15972 .nr(8)
15973 .kr(1)
15974 .sr(1)
15975 .m(6)
15976 .n(n)
15977 .k(k)
15978 .a_stride(23)
15979 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15980 }
15981 }
15982 }
15983
15984 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, n_div_8_subtile) {
15985 TEST_REQUIRES_ARM_NEON;
15986 for (uint32_t n = 16; n <= 24; n += 8) {
15987 for (size_t k = 1; k <= 20; k += 5) {
15988 for (uint32_t m = 1; m <= 6; m++) {
15989 GemmMicrokernelTester()
15990 .mr(6)
15991 .nr(8)
15992 .kr(1)
15993 .sr(1)
15994 .m(m)
15995 .n(n)
15996 .k(k)
15997 .iterations(1)
15998 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
15999 }
16000 }
16001 }
16002 }
16003
16004 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, strided_cm_subtile) {
16005 TEST_REQUIRES_ARM_NEON;
16006 for (size_t k = 1; k <= 20; k += 5) {
16007 for (uint32_t m = 1; m <= 6; m++) {
16008 for (uint32_t n = 1; n <= 8; n++) {
16009 GemmMicrokernelTester()
16010 .mr(6)
16011 .nr(8)
16012 .kr(1)
16013 .sr(1)
16014 .m(m)
16015 .n(n)
16016 .k(k)
16017 .cm_stride(11)
16018 .iterations(1)
16019 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
16020 }
16021 }
16022 }
16023 }
16024
16025 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, qmin) {
16026 TEST_REQUIRES_ARM_NEON;
16027 GemmMicrokernelTester()
16028 .mr(6)
16029 .nr(8)
16030 .kr(1)
16031 .sr(1)
16032 .m(6)
16033 .n(8)
16034 .k(4)
16035 .qmin(128)
16036 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
16037 }
16038
16039 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, qmax) {
16040 TEST_REQUIRES_ARM_NEON;
16041 GemmMicrokernelTester()
16042 .mr(6)
16043 .nr(8)
16044 .kr(1)
16045 .sr(1)
16046 .m(6)
16047 .n(8)
16048 .k(4)
16049 .qmax(128)
16050 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
16051 }
16052
16053 TEST(F32_GEMMINC_6X8__NEON_DUP_LD128, strided_cm) {
16054 TEST_REQUIRES_ARM_NEON;
16055 GemmMicrokernelTester()
16056 .mr(6)
16057 .nr(8)
16058 .kr(1)
16059 .sr(1)
16060 .m(6)
16061 .n(8)
16062 .k(4)
16063 .cm_stride(11)
16064 .Test(xnn_f32_gemminc_ukernel_6x8__neon_dup_ld128);
16065 }
16066#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16067
16068
16069#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080016070 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2) {
16071 TEST_REQUIRES_ARM_NEON_FMA;
16072 GemmMicrokernelTester()
16073 .mr(1)
16074 .nr(8)
16075 .kr(1)
16076 .sr(1)
16077 .m(1)
16078 .n(8)
16079 .k(2)
16080 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16081 }
16082
16083 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, strided_cn) {
16084 TEST_REQUIRES_ARM_NEON_FMA;
16085 GemmMicrokernelTester()
16086 .mr(1)
16087 .nr(8)
16088 .kr(1)
16089 .sr(1)
16090 .m(1)
16091 .n(8)
16092 .k(2)
16093 .cn_stride(11)
16094 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16095 }
16096
16097 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
16098 TEST_REQUIRES_ARM_NEON_FMA;
16099 GemmMicrokernelTester()
16100 .mr(1)
16101 .nr(8)
16102 .kr(1)
16103 .sr(1)
16104 .m(1)
16105 .n(8)
16106 .k(2)
16107 .a_stride(5)
16108 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16109 }
16110
16111 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
16112 TEST_REQUIRES_ARM_NEON_FMA;
16113 for (uint32_t m = 1; m <= 1; m++) {
16114 for (uint32_t n = 1; n <= 8; n++) {
16115 GemmMicrokernelTester()
16116 .mr(1)
16117 .nr(8)
16118 .kr(1)
16119 .sr(1)
16120 .m(m)
16121 .n(n)
16122 .k(2)
16123 .iterations(1)
16124 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16125 }
16126 }
16127 }
16128
16129 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
16130 TEST_REQUIRES_ARM_NEON_FMA;
16131 for (uint32_t m = 1; m <= 1; m++) {
16132 GemmMicrokernelTester()
16133 .mr(1)
16134 .nr(8)
16135 .kr(1)
16136 .sr(1)
16137 .m(m)
16138 .n(8)
16139 .k(2)
16140 .iterations(1)
16141 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16142 }
16143 }
16144
16145 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
16146 TEST_REQUIRES_ARM_NEON_FMA;
16147 for (uint32_t n = 1; n <= 8; n++) {
16148 GemmMicrokernelTester()
16149 .mr(1)
16150 .nr(8)
16151 .kr(1)
16152 .sr(1)
16153 .m(1)
16154 .n(n)
16155 .k(2)
16156 .iterations(1)
16157 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16158 }
16159 }
16160
16161 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_lt_2) {
16162 TEST_REQUIRES_ARM_NEON_FMA;
16163 for (size_t k = 1; k < 2; k++) {
16164 GemmMicrokernelTester()
16165 .mr(1)
16166 .nr(8)
16167 .kr(1)
16168 .sr(1)
16169 .m(1)
16170 .n(8)
16171 .k(k)
16172 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16173 }
16174 }
16175
16176 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
16177 TEST_REQUIRES_ARM_NEON_FMA;
16178 for (size_t k = 1; k < 2; k++) {
16179 GemmMicrokernelTester()
16180 .mr(1)
16181 .nr(8)
16182 .kr(1)
16183 .sr(1)
16184 .m(1)
16185 .n(8)
16186 .k(k)
16187 .a_stride(5)
16188 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16189 }
16190 }
16191
16192 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
16193 TEST_REQUIRES_ARM_NEON_FMA;
16194 for (size_t k = 1; k < 2; k++) {
16195 for (uint32_t m = 1; m <= 1; m++) {
16196 for (uint32_t n = 1; n <= 8; n++) {
16197 GemmMicrokernelTester()
16198 .mr(1)
16199 .nr(8)
16200 .kr(1)
16201 .sr(1)
16202 .m(m)
16203 .n(n)
16204 .k(k)
16205 .iterations(1)
16206 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16207 }
16208 }
16209 }
16210 }
16211
16212 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_gt_2) {
16213 TEST_REQUIRES_ARM_NEON_FMA;
16214 for (size_t k = 3; k < 4; k++) {
16215 GemmMicrokernelTester()
16216 .mr(1)
16217 .nr(8)
16218 .kr(1)
16219 .sr(1)
16220 .m(1)
16221 .n(8)
16222 .k(k)
16223 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16224 }
16225 }
16226
16227 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
16228 TEST_REQUIRES_ARM_NEON_FMA;
16229 for (size_t k = 3; k < 4; k++) {
16230 GemmMicrokernelTester()
16231 .mr(1)
16232 .nr(8)
16233 .kr(1)
16234 .sr(1)
16235 .m(1)
16236 .n(8)
16237 .k(k)
16238 .a_stride(7)
16239 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16240 }
16241 }
16242
16243 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
16244 TEST_REQUIRES_ARM_NEON_FMA;
16245 for (size_t k = 3; k < 4; k++) {
16246 for (uint32_t m = 1; m <= 1; m++) {
16247 for (uint32_t n = 1; n <= 8; n++) {
16248 GemmMicrokernelTester()
16249 .mr(1)
16250 .nr(8)
16251 .kr(1)
16252 .sr(1)
16253 .m(m)
16254 .n(n)
16255 .k(k)
16256 .iterations(1)
16257 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16258 }
16259 }
16260 }
16261 }
16262
16263 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_div_2) {
16264 TEST_REQUIRES_ARM_NEON_FMA;
16265 for (size_t k = 4; k <= 20; k += 2) {
16266 GemmMicrokernelTester()
16267 .mr(1)
16268 .nr(8)
16269 .kr(1)
16270 .sr(1)
16271 .m(1)
16272 .n(8)
16273 .k(k)
16274 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16275 }
16276 }
16277
16278 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
16279 TEST_REQUIRES_ARM_NEON_FMA;
16280 for (size_t k = 4; k <= 20; k += 2) {
16281 GemmMicrokernelTester()
16282 .mr(1)
16283 .nr(8)
16284 .kr(1)
16285 .sr(1)
16286 .m(1)
16287 .n(8)
16288 .k(k)
16289 .a_stride(23)
16290 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16291 }
16292 }
16293
16294 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
16295 TEST_REQUIRES_ARM_NEON_FMA;
16296 for (size_t k = 4; k <= 20; k += 2) {
16297 for (uint32_t m = 1; m <= 1; m++) {
16298 for (uint32_t n = 1; n <= 8; n++) {
16299 GemmMicrokernelTester()
16300 .mr(1)
16301 .nr(8)
16302 .kr(1)
16303 .sr(1)
16304 .m(m)
16305 .n(n)
16306 .k(k)
16307 .iterations(1)
16308 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16309 }
16310 }
16311 }
16312 }
16313
16314 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8) {
16315 TEST_REQUIRES_ARM_NEON_FMA;
16316 for (uint32_t n = 9; n < 16; n++) {
16317 for (size_t k = 1; k <= 10; k += 3) {
16318 GemmMicrokernelTester()
16319 .mr(1)
16320 .nr(8)
16321 .kr(1)
16322 .sr(1)
16323 .m(1)
16324 .n(8)
16325 .k(k)
16326 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16327 }
16328 }
16329 }
16330
16331 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
16332 TEST_REQUIRES_ARM_NEON_FMA;
16333 for (uint32_t n = 9; n < 16; n++) {
16334 for (size_t k = 1; k <= 10; k += 3) {
16335 GemmMicrokernelTester()
16336 .mr(1)
16337 .nr(8)
16338 .kr(1)
16339 .sr(1)
16340 .m(1)
16341 .n(8)
16342 .k(k)
16343 .cn_stride(11)
16344 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16345 }
16346 }
16347 }
16348
16349 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
16350 TEST_REQUIRES_ARM_NEON_FMA;
16351 for (uint32_t n = 9; n < 16; n++) {
16352 for (size_t k = 1; k <= 10; k += 3) {
16353 GemmMicrokernelTester()
16354 .mr(1)
16355 .nr(8)
16356 .kr(1)
16357 .sr(1)
16358 .m(1)
16359 .n(n)
16360 .k(k)
16361 .a_stride(13)
16362 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16363 }
16364 }
16365 }
16366
16367 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
16368 TEST_REQUIRES_ARM_NEON_FMA;
16369 for (uint32_t n = 9; n < 16; n++) {
16370 for (size_t k = 1; k <= 10; k += 3) {
16371 for (uint32_t m = 1; m <= 1; m++) {
16372 GemmMicrokernelTester()
16373 .mr(1)
16374 .nr(8)
16375 .kr(1)
16376 .sr(1)
16377 .m(m)
16378 .n(n)
16379 .k(k)
16380 .iterations(1)
16381 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16382 }
16383 }
16384 }
16385 }
16386
16387 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8) {
16388 TEST_REQUIRES_ARM_NEON_FMA;
16389 for (uint32_t n = 16; n <= 24; n += 8) {
16390 for (size_t k = 1; k <= 10; k += 3) {
16391 GemmMicrokernelTester()
16392 .mr(1)
16393 .nr(8)
16394 .kr(1)
16395 .sr(1)
16396 .m(1)
16397 .n(8)
16398 .k(k)
16399 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16400 }
16401 }
16402 }
16403
16404 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
16405 TEST_REQUIRES_ARM_NEON_FMA;
16406 for (uint32_t n = 16; n <= 24; n += 8) {
16407 for (size_t k = 1; k <= 10; k += 3) {
16408 GemmMicrokernelTester()
16409 .mr(1)
16410 .nr(8)
16411 .kr(1)
16412 .sr(1)
16413 .m(1)
16414 .n(n)
16415 .k(k)
16416 .cn_stride(11)
16417 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16418 }
16419 }
16420 }
16421
16422 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
16423 TEST_REQUIRES_ARM_NEON_FMA;
16424 for (uint32_t n = 16; n <= 24; n += 8) {
16425 for (size_t k = 1; k <= 10; k += 3) {
16426 GemmMicrokernelTester()
16427 .mr(1)
16428 .nr(8)
16429 .kr(1)
16430 .sr(1)
16431 .m(1)
16432 .n(n)
16433 .k(k)
16434 .a_stride(13)
16435 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16436 }
16437 }
16438 }
16439
16440 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
16441 TEST_REQUIRES_ARM_NEON_FMA;
16442 for (uint32_t n = 16; n <= 24; n += 8) {
16443 for (size_t k = 1; k <= 10; k += 3) {
16444 for (uint32_t m = 1; m <= 1; m++) {
16445 GemmMicrokernelTester()
16446 .mr(1)
16447 .nr(8)
16448 .kr(1)
16449 .sr(1)
16450 .m(m)
16451 .n(n)
16452 .k(k)
16453 .iterations(1)
16454 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16455 }
16456 }
16457 }
16458 }
16459
16460 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
16461 TEST_REQUIRES_ARM_NEON_FMA;
16462 for (size_t k = 1; k <= 10; k += 3) {
16463 for (uint32_t m = 1; m <= 1; m++) {
16464 for (uint32_t n = 1; n <= 8; n++) {
16465 GemmMicrokernelTester()
16466 .mr(1)
16467 .nr(8)
16468 .kr(1)
16469 .sr(1)
16470 .m(m)
16471 .n(n)
16472 .k(k)
16473 .cm_stride(11)
16474 .iterations(1)
16475 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16476 }
16477 }
16478 }
16479 }
16480
16481 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, qmin) {
16482 TEST_REQUIRES_ARM_NEON_FMA;
16483 GemmMicrokernelTester()
16484 .mr(1)
16485 .nr(8)
16486 .kr(1)
16487 .sr(1)
16488 .m(1)
16489 .n(8)
16490 .k(2)
16491 .qmin(128)
16492 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16493 }
16494
16495 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, qmax) {
16496 TEST_REQUIRES_ARM_NEON_FMA;
16497 GemmMicrokernelTester()
16498 .mr(1)
16499 .nr(8)
16500 .kr(1)
16501 .sr(1)
16502 .m(1)
16503 .n(8)
16504 .k(2)
16505 .qmax(128)
16506 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16507 }
16508
16509 TEST(F32_GEMMINC_1X8__NEONFMA_DUP_LD64, strided_cm) {
16510 TEST_REQUIRES_ARM_NEON_FMA;
16511 GemmMicrokernelTester()
16512 .mr(1)
16513 .nr(8)
16514 .kr(1)
16515 .sr(1)
16516 .m(1)
16517 .n(8)
16518 .k(2)
16519 .cm_stride(11)
16520 .Test(xnn_f32_gemminc_ukernel_1x8__neonfma_dup_ld64);
16521 }
16522#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16523
16524
16525#if XNN_ARCH_ARM || XNN_ARCH_ARM64
16526 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2) {
16527 TEST_REQUIRES_ARM_NEON_FMA;
16528 GemmMicrokernelTester()
16529 .mr(4)
16530 .nr(8)
16531 .kr(1)
16532 .sr(1)
16533 .m(4)
16534 .n(8)
16535 .k(2)
16536 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16537 }
16538
16539 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, strided_cn) {
16540 TEST_REQUIRES_ARM_NEON_FMA;
16541 GemmMicrokernelTester()
16542 .mr(4)
16543 .nr(8)
16544 .kr(1)
16545 .sr(1)
16546 .m(4)
16547 .n(8)
16548 .k(2)
16549 .cn_stride(11)
16550 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16551 }
16552
16553 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
16554 TEST_REQUIRES_ARM_NEON_FMA;
16555 GemmMicrokernelTester()
16556 .mr(4)
16557 .nr(8)
16558 .kr(1)
16559 .sr(1)
16560 .m(4)
16561 .n(8)
16562 .k(2)
16563 .a_stride(5)
16564 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16565 }
16566
16567 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
16568 TEST_REQUIRES_ARM_NEON_FMA;
16569 for (uint32_t m = 1; m <= 4; m++) {
16570 for (uint32_t n = 1; n <= 8; n++) {
16571 GemmMicrokernelTester()
16572 .mr(4)
16573 .nr(8)
16574 .kr(1)
16575 .sr(1)
16576 .m(m)
16577 .n(n)
16578 .k(2)
16579 .iterations(1)
16580 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16581 }
16582 }
16583 }
16584
16585 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
16586 TEST_REQUIRES_ARM_NEON_FMA;
16587 for (uint32_t m = 1; m <= 4; m++) {
16588 GemmMicrokernelTester()
16589 .mr(4)
16590 .nr(8)
16591 .kr(1)
16592 .sr(1)
16593 .m(m)
16594 .n(8)
16595 .k(2)
16596 .iterations(1)
16597 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16598 }
16599 }
16600
16601 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
16602 TEST_REQUIRES_ARM_NEON_FMA;
16603 for (uint32_t n = 1; n <= 8; n++) {
16604 GemmMicrokernelTester()
16605 .mr(4)
16606 .nr(8)
16607 .kr(1)
16608 .sr(1)
16609 .m(4)
16610 .n(n)
16611 .k(2)
16612 .iterations(1)
16613 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16614 }
16615 }
16616
16617 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_lt_2) {
16618 TEST_REQUIRES_ARM_NEON_FMA;
16619 for (size_t k = 1; k < 2; k++) {
16620 GemmMicrokernelTester()
16621 .mr(4)
16622 .nr(8)
16623 .kr(1)
16624 .sr(1)
16625 .m(4)
16626 .n(8)
16627 .k(k)
16628 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16629 }
16630 }
16631
16632 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
16633 TEST_REQUIRES_ARM_NEON_FMA;
16634 for (size_t k = 1; k < 2; k++) {
16635 GemmMicrokernelTester()
16636 .mr(4)
16637 .nr(8)
16638 .kr(1)
16639 .sr(1)
16640 .m(4)
16641 .n(8)
16642 .k(k)
16643 .a_stride(5)
16644 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16645 }
16646 }
16647
16648 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
16649 TEST_REQUIRES_ARM_NEON_FMA;
16650 for (size_t k = 1; k < 2; k++) {
16651 for (uint32_t m = 1; m <= 4; m++) {
16652 for (uint32_t n = 1; n <= 8; n++) {
16653 GemmMicrokernelTester()
16654 .mr(4)
16655 .nr(8)
16656 .kr(1)
16657 .sr(1)
16658 .m(m)
16659 .n(n)
16660 .k(k)
16661 .iterations(1)
16662 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16663 }
16664 }
16665 }
16666 }
16667
16668 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_gt_2) {
16669 TEST_REQUIRES_ARM_NEON_FMA;
16670 for (size_t k = 3; k < 4; k++) {
16671 GemmMicrokernelTester()
16672 .mr(4)
16673 .nr(8)
16674 .kr(1)
16675 .sr(1)
16676 .m(4)
16677 .n(8)
16678 .k(k)
16679 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16680 }
16681 }
16682
16683 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
16684 TEST_REQUIRES_ARM_NEON_FMA;
16685 for (size_t k = 3; k < 4; k++) {
16686 GemmMicrokernelTester()
16687 .mr(4)
16688 .nr(8)
16689 .kr(1)
16690 .sr(1)
16691 .m(4)
16692 .n(8)
16693 .k(k)
16694 .a_stride(7)
16695 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16696 }
16697 }
16698
16699 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
16700 TEST_REQUIRES_ARM_NEON_FMA;
16701 for (size_t k = 3; k < 4; k++) {
16702 for (uint32_t m = 1; m <= 4; m++) {
16703 for (uint32_t n = 1; n <= 8; n++) {
16704 GemmMicrokernelTester()
16705 .mr(4)
16706 .nr(8)
16707 .kr(1)
16708 .sr(1)
16709 .m(m)
16710 .n(n)
16711 .k(k)
16712 .iterations(1)
16713 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16714 }
16715 }
16716 }
16717 }
16718
16719 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_div_2) {
16720 TEST_REQUIRES_ARM_NEON_FMA;
16721 for (size_t k = 4; k <= 20; k += 2) {
16722 GemmMicrokernelTester()
16723 .mr(4)
16724 .nr(8)
16725 .kr(1)
16726 .sr(1)
16727 .m(4)
16728 .n(8)
16729 .k(k)
16730 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16731 }
16732 }
16733
16734 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
16735 TEST_REQUIRES_ARM_NEON_FMA;
16736 for (size_t k = 4; k <= 20; k += 2) {
16737 GemmMicrokernelTester()
16738 .mr(4)
16739 .nr(8)
16740 .kr(1)
16741 .sr(1)
16742 .m(4)
16743 .n(8)
16744 .k(k)
16745 .a_stride(23)
16746 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16747 }
16748 }
16749
16750 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
16751 TEST_REQUIRES_ARM_NEON_FMA;
16752 for (size_t k = 4; k <= 20; k += 2) {
16753 for (uint32_t m = 1; m <= 4; m++) {
16754 for (uint32_t n = 1; n <= 8; n++) {
16755 GemmMicrokernelTester()
16756 .mr(4)
16757 .nr(8)
16758 .kr(1)
16759 .sr(1)
16760 .m(m)
16761 .n(n)
16762 .k(k)
16763 .iterations(1)
16764 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16765 }
16766 }
16767 }
16768 }
16769
16770 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8) {
16771 TEST_REQUIRES_ARM_NEON_FMA;
16772 for (uint32_t n = 9; n < 16; n++) {
16773 for (size_t k = 1; k <= 10; k += 3) {
16774 GemmMicrokernelTester()
16775 .mr(4)
16776 .nr(8)
16777 .kr(1)
16778 .sr(1)
16779 .m(4)
16780 .n(8)
16781 .k(k)
16782 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16783 }
16784 }
16785 }
16786
16787 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
16788 TEST_REQUIRES_ARM_NEON_FMA;
16789 for (uint32_t n = 9; n < 16; n++) {
16790 for (size_t k = 1; k <= 10; k += 3) {
16791 GemmMicrokernelTester()
16792 .mr(4)
16793 .nr(8)
16794 .kr(1)
16795 .sr(1)
16796 .m(4)
16797 .n(8)
16798 .k(k)
16799 .cn_stride(11)
16800 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16801 }
16802 }
16803 }
16804
16805 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
16806 TEST_REQUIRES_ARM_NEON_FMA;
16807 for (uint32_t n = 9; n < 16; n++) {
16808 for (size_t k = 1; k <= 10; k += 3) {
16809 GemmMicrokernelTester()
16810 .mr(4)
16811 .nr(8)
16812 .kr(1)
16813 .sr(1)
16814 .m(4)
16815 .n(n)
16816 .k(k)
16817 .a_stride(13)
16818 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16819 }
16820 }
16821 }
16822
16823 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
16824 TEST_REQUIRES_ARM_NEON_FMA;
16825 for (uint32_t n = 9; n < 16; n++) {
16826 for (size_t k = 1; k <= 10; k += 3) {
16827 for (uint32_t m = 1; m <= 4; m++) {
16828 GemmMicrokernelTester()
16829 .mr(4)
16830 .nr(8)
16831 .kr(1)
16832 .sr(1)
16833 .m(m)
16834 .n(n)
16835 .k(k)
16836 .iterations(1)
16837 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16838 }
16839 }
16840 }
16841 }
16842
16843 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8) {
16844 TEST_REQUIRES_ARM_NEON_FMA;
16845 for (uint32_t n = 16; n <= 24; n += 8) {
16846 for (size_t k = 1; k <= 10; k += 3) {
16847 GemmMicrokernelTester()
16848 .mr(4)
16849 .nr(8)
16850 .kr(1)
16851 .sr(1)
16852 .m(4)
16853 .n(8)
16854 .k(k)
16855 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16856 }
16857 }
16858 }
16859
16860 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
16861 TEST_REQUIRES_ARM_NEON_FMA;
16862 for (uint32_t n = 16; n <= 24; n += 8) {
16863 for (size_t k = 1; k <= 10; k += 3) {
16864 GemmMicrokernelTester()
16865 .mr(4)
16866 .nr(8)
16867 .kr(1)
16868 .sr(1)
16869 .m(4)
16870 .n(n)
16871 .k(k)
16872 .cn_stride(11)
16873 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16874 }
16875 }
16876 }
16877
16878 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
16879 TEST_REQUIRES_ARM_NEON_FMA;
16880 for (uint32_t n = 16; n <= 24; n += 8) {
16881 for (size_t k = 1; k <= 10; k += 3) {
16882 GemmMicrokernelTester()
16883 .mr(4)
16884 .nr(8)
16885 .kr(1)
16886 .sr(1)
16887 .m(4)
16888 .n(n)
16889 .k(k)
16890 .a_stride(13)
16891 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16892 }
16893 }
16894 }
16895
16896 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
16897 TEST_REQUIRES_ARM_NEON_FMA;
16898 for (uint32_t n = 16; n <= 24; n += 8) {
16899 for (size_t k = 1; k <= 10; k += 3) {
16900 for (uint32_t m = 1; m <= 4; m++) {
16901 GemmMicrokernelTester()
16902 .mr(4)
16903 .nr(8)
16904 .kr(1)
16905 .sr(1)
16906 .m(m)
16907 .n(n)
16908 .k(k)
16909 .iterations(1)
16910 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16911 }
16912 }
16913 }
16914 }
16915
16916 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
16917 TEST_REQUIRES_ARM_NEON_FMA;
16918 for (size_t k = 1; k <= 10; k += 3) {
16919 for (uint32_t m = 1; m <= 4; m++) {
16920 for (uint32_t n = 1; n <= 8; n++) {
16921 GemmMicrokernelTester()
16922 .mr(4)
16923 .nr(8)
16924 .kr(1)
16925 .sr(1)
16926 .m(m)
16927 .n(n)
16928 .k(k)
16929 .cm_stride(11)
16930 .iterations(1)
16931 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16932 }
16933 }
16934 }
16935 }
16936
16937 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, qmin) {
16938 TEST_REQUIRES_ARM_NEON_FMA;
16939 GemmMicrokernelTester()
16940 .mr(4)
16941 .nr(8)
16942 .kr(1)
16943 .sr(1)
16944 .m(4)
16945 .n(8)
16946 .k(2)
16947 .qmin(128)
16948 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16949 }
16950
16951 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, qmax) {
16952 TEST_REQUIRES_ARM_NEON_FMA;
16953 GemmMicrokernelTester()
16954 .mr(4)
16955 .nr(8)
16956 .kr(1)
16957 .sr(1)
16958 .m(4)
16959 .n(8)
16960 .k(2)
16961 .qmax(128)
16962 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16963 }
16964
16965 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD64, strided_cm) {
16966 TEST_REQUIRES_ARM_NEON_FMA;
16967 GemmMicrokernelTester()
16968 .mr(4)
16969 .nr(8)
16970 .kr(1)
16971 .sr(1)
16972 .m(4)
16973 .n(8)
16974 .k(2)
16975 .cm_stride(11)
16976 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld64);
16977 }
16978#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
16979
16980
16981#if XNN_ARCH_ARM || XNN_ARCH_ARM64
16982 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4) {
16983 TEST_REQUIRES_ARM_NEON_FMA;
16984 GemmMicrokernelTester()
16985 .mr(4)
16986 .nr(8)
16987 .kr(1)
16988 .sr(1)
16989 .m(4)
16990 .n(8)
16991 .k(4)
16992 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
16993 }
16994
16995 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, strided_cn) {
16996 TEST_REQUIRES_ARM_NEON_FMA;
16997 GemmMicrokernelTester()
16998 .mr(4)
16999 .nr(8)
17000 .kr(1)
17001 .sr(1)
17002 .m(4)
17003 .n(8)
17004 .k(4)
17005 .cn_stride(11)
17006 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17007 }
17008
17009 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
17010 TEST_REQUIRES_ARM_NEON_FMA;
17011 GemmMicrokernelTester()
17012 .mr(4)
17013 .nr(8)
17014 .kr(1)
17015 .sr(1)
17016 .m(4)
17017 .n(8)
17018 .k(4)
17019 .a_stride(7)
17020 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17021 }
17022
17023 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
17024 TEST_REQUIRES_ARM_NEON_FMA;
17025 for (uint32_t m = 1; m <= 4; m++) {
17026 for (uint32_t n = 1; n <= 8; n++) {
17027 GemmMicrokernelTester()
17028 .mr(4)
17029 .nr(8)
17030 .kr(1)
17031 .sr(1)
17032 .m(m)
17033 .n(n)
17034 .k(4)
17035 .iterations(1)
17036 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17037 }
17038 }
17039 }
17040
17041 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
17042 TEST_REQUIRES_ARM_NEON_FMA;
17043 for (uint32_t m = 1; m <= 4; m++) {
17044 GemmMicrokernelTester()
17045 .mr(4)
17046 .nr(8)
17047 .kr(1)
17048 .sr(1)
17049 .m(m)
17050 .n(8)
17051 .k(4)
17052 .iterations(1)
17053 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17054 }
17055 }
17056
17057 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
17058 TEST_REQUIRES_ARM_NEON_FMA;
17059 for (uint32_t n = 1; n <= 8; n++) {
17060 GemmMicrokernelTester()
17061 .mr(4)
17062 .nr(8)
17063 .kr(1)
17064 .sr(1)
17065 .m(4)
17066 .n(n)
17067 .k(4)
17068 .iterations(1)
17069 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17070 }
17071 }
17072
17073 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_lt_4) {
17074 TEST_REQUIRES_ARM_NEON_FMA;
17075 for (size_t k = 1; k < 4; k++) {
17076 GemmMicrokernelTester()
17077 .mr(4)
17078 .nr(8)
17079 .kr(1)
17080 .sr(1)
17081 .m(4)
17082 .n(8)
17083 .k(k)
17084 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17085 }
17086 }
17087
17088 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
17089 TEST_REQUIRES_ARM_NEON_FMA;
17090 for (size_t k = 1; k < 4; k++) {
17091 GemmMicrokernelTester()
17092 .mr(4)
17093 .nr(8)
17094 .kr(1)
17095 .sr(1)
17096 .m(4)
17097 .n(8)
17098 .k(k)
17099 .a_stride(7)
17100 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17101 }
17102 }
17103
17104 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
17105 TEST_REQUIRES_ARM_NEON_FMA;
17106 for (size_t k = 1; k < 4; k++) {
17107 for (uint32_t m = 1; m <= 4; m++) {
17108 for (uint32_t n = 1; n <= 8; n++) {
17109 GemmMicrokernelTester()
17110 .mr(4)
17111 .nr(8)
17112 .kr(1)
17113 .sr(1)
17114 .m(m)
17115 .n(n)
17116 .k(k)
17117 .iterations(1)
17118 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17119 }
17120 }
17121 }
17122 }
17123
17124 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_gt_4) {
17125 TEST_REQUIRES_ARM_NEON_FMA;
17126 for (size_t k = 5; k < 8; k++) {
17127 GemmMicrokernelTester()
17128 .mr(4)
17129 .nr(8)
17130 .kr(1)
17131 .sr(1)
17132 .m(4)
17133 .n(8)
17134 .k(k)
17135 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17136 }
17137 }
17138
17139 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
17140 TEST_REQUIRES_ARM_NEON_FMA;
17141 for (size_t k = 5; k < 8; k++) {
17142 GemmMicrokernelTester()
17143 .mr(4)
17144 .nr(8)
17145 .kr(1)
17146 .sr(1)
17147 .m(4)
17148 .n(8)
17149 .k(k)
17150 .a_stride(11)
17151 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17152 }
17153 }
17154
17155 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
17156 TEST_REQUIRES_ARM_NEON_FMA;
17157 for (size_t k = 5; k < 8; k++) {
17158 for (uint32_t m = 1; m <= 4; m++) {
17159 for (uint32_t n = 1; n <= 8; n++) {
17160 GemmMicrokernelTester()
17161 .mr(4)
17162 .nr(8)
17163 .kr(1)
17164 .sr(1)
17165 .m(m)
17166 .n(n)
17167 .k(k)
17168 .iterations(1)
17169 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17170 }
17171 }
17172 }
17173 }
17174
17175 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_div_4) {
17176 TEST_REQUIRES_ARM_NEON_FMA;
17177 for (size_t k = 8; k <= 40; k += 4) {
17178 GemmMicrokernelTester()
17179 .mr(4)
17180 .nr(8)
17181 .kr(1)
17182 .sr(1)
17183 .m(4)
17184 .n(8)
17185 .k(k)
17186 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17187 }
17188 }
17189
17190 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
17191 TEST_REQUIRES_ARM_NEON_FMA;
17192 for (size_t k = 8; k <= 40; k += 4) {
17193 GemmMicrokernelTester()
17194 .mr(4)
17195 .nr(8)
17196 .kr(1)
17197 .sr(1)
17198 .m(4)
17199 .n(8)
17200 .k(k)
17201 .a_stride(43)
17202 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17203 }
17204 }
17205
17206 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
17207 TEST_REQUIRES_ARM_NEON_FMA;
17208 for (size_t k = 8; k <= 40; k += 4) {
17209 for (uint32_t m = 1; m <= 4; m++) {
17210 for (uint32_t n = 1; n <= 8; n++) {
17211 GemmMicrokernelTester()
17212 .mr(4)
17213 .nr(8)
17214 .kr(1)
17215 .sr(1)
17216 .m(m)
17217 .n(n)
17218 .k(k)
17219 .iterations(1)
17220 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17221 }
17222 }
17223 }
17224 }
17225
17226 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8) {
17227 TEST_REQUIRES_ARM_NEON_FMA;
17228 for (uint32_t n = 9; n < 16; n++) {
17229 for (size_t k = 1; k <= 20; k += 5) {
17230 GemmMicrokernelTester()
17231 .mr(4)
17232 .nr(8)
17233 .kr(1)
17234 .sr(1)
17235 .m(4)
17236 .n(8)
17237 .k(k)
17238 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17239 }
17240 }
17241 }
17242
17243 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
17244 TEST_REQUIRES_ARM_NEON_FMA;
17245 for (uint32_t n = 9; n < 16; n++) {
17246 for (size_t k = 1; k <= 20; k += 5) {
17247 GemmMicrokernelTester()
17248 .mr(4)
17249 .nr(8)
17250 .kr(1)
17251 .sr(1)
17252 .m(4)
17253 .n(8)
17254 .k(k)
17255 .cn_stride(11)
17256 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17257 }
17258 }
17259 }
17260
17261 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
17262 TEST_REQUIRES_ARM_NEON_FMA;
17263 for (uint32_t n = 9; n < 16; n++) {
17264 for (size_t k = 1; k <= 20; k += 5) {
17265 GemmMicrokernelTester()
17266 .mr(4)
17267 .nr(8)
17268 .kr(1)
17269 .sr(1)
17270 .m(4)
17271 .n(n)
17272 .k(k)
17273 .a_stride(23)
17274 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17275 }
17276 }
17277 }
17278
17279 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
17280 TEST_REQUIRES_ARM_NEON_FMA;
17281 for (uint32_t n = 9; n < 16; n++) {
17282 for (size_t k = 1; k <= 20; k += 5) {
17283 for (uint32_t m = 1; m <= 4; m++) {
17284 GemmMicrokernelTester()
17285 .mr(4)
17286 .nr(8)
17287 .kr(1)
17288 .sr(1)
17289 .m(m)
17290 .n(n)
17291 .k(k)
17292 .iterations(1)
17293 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17294 }
17295 }
17296 }
17297 }
17298
17299 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8) {
17300 TEST_REQUIRES_ARM_NEON_FMA;
17301 for (uint32_t n = 16; n <= 24; n += 8) {
17302 for (size_t k = 1; k <= 20; k += 5) {
17303 GemmMicrokernelTester()
17304 .mr(4)
17305 .nr(8)
17306 .kr(1)
17307 .sr(1)
17308 .m(4)
17309 .n(8)
17310 .k(k)
17311 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17312 }
17313 }
17314 }
17315
17316 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
17317 TEST_REQUIRES_ARM_NEON_FMA;
17318 for (uint32_t n = 16; n <= 24; n += 8) {
17319 for (size_t k = 1; k <= 20; k += 5) {
17320 GemmMicrokernelTester()
17321 .mr(4)
17322 .nr(8)
17323 .kr(1)
17324 .sr(1)
17325 .m(4)
17326 .n(n)
17327 .k(k)
17328 .cn_stride(11)
17329 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17330 }
17331 }
17332 }
17333
17334 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
17335 TEST_REQUIRES_ARM_NEON_FMA;
17336 for (uint32_t n = 16; n <= 24; n += 8) {
17337 for (size_t k = 1; k <= 20; k += 5) {
17338 GemmMicrokernelTester()
17339 .mr(4)
17340 .nr(8)
17341 .kr(1)
17342 .sr(1)
17343 .m(4)
17344 .n(n)
17345 .k(k)
17346 .a_stride(23)
17347 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17348 }
17349 }
17350 }
17351
17352 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
17353 TEST_REQUIRES_ARM_NEON_FMA;
17354 for (uint32_t n = 16; n <= 24; n += 8) {
17355 for (size_t k = 1; k <= 20; k += 5) {
17356 for (uint32_t m = 1; m <= 4; m++) {
17357 GemmMicrokernelTester()
17358 .mr(4)
17359 .nr(8)
17360 .kr(1)
17361 .sr(1)
17362 .m(m)
17363 .n(n)
17364 .k(k)
17365 .iterations(1)
17366 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17367 }
17368 }
17369 }
17370 }
17371
17372 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
17373 TEST_REQUIRES_ARM_NEON_FMA;
17374 for (size_t k = 1; k <= 20; k += 5) {
17375 for (uint32_t m = 1; m <= 4; m++) {
17376 for (uint32_t n = 1; n <= 8; n++) {
17377 GemmMicrokernelTester()
17378 .mr(4)
17379 .nr(8)
17380 .kr(1)
17381 .sr(1)
17382 .m(m)
17383 .n(n)
17384 .k(k)
17385 .cm_stride(11)
17386 .iterations(1)
17387 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17388 }
17389 }
17390 }
17391 }
17392
17393 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, qmin) {
17394 TEST_REQUIRES_ARM_NEON_FMA;
17395 GemmMicrokernelTester()
17396 .mr(4)
17397 .nr(8)
17398 .kr(1)
17399 .sr(1)
17400 .m(4)
17401 .n(8)
17402 .k(4)
17403 .qmin(128)
17404 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17405 }
17406
17407 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, qmax) {
17408 TEST_REQUIRES_ARM_NEON_FMA;
17409 GemmMicrokernelTester()
17410 .mr(4)
17411 .nr(8)
17412 .kr(1)
17413 .sr(1)
17414 .m(4)
17415 .n(8)
17416 .k(4)
17417 .qmax(128)
17418 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17419 }
17420
17421 TEST(F32_GEMMINC_4X8__NEONFMA_DUP_LD128, strided_cm) {
17422 TEST_REQUIRES_ARM_NEON_FMA;
17423 GemmMicrokernelTester()
17424 .mr(4)
17425 .nr(8)
17426 .kr(1)
17427 .sr(1)
17428 .m(4)
17429 .n(8)
17430 .k(4)
17431 .cm_stride(11)
17432 .Test(xnn_f32_gemminc_ukernel_4x8__neonfma_dup_ld128);
17433 }
17434#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17435
17436
17437#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17438 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2) {
17439 TEST_REQUIRES_ARM_NEON_FMA;
17440 GemmMicrokernelTester()
17441 .mr(6)
17442 .nr(8)
17443 .kr(1)
17444 .sr(1)
17445 .m(6)
17446 .n(8)
17447 .k(2)
17448 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17449 }
17450
17451 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, strided_cn) {
17452 TEST_REQUIRES_ARM_NEON_FMA;
17453 GemmMicrokernelTester()
17454 .mr(6)
17455 .nr(8)
17456 .kr(1)
17457 .sr(1)
17458 .m(6)
17459 .n(8)
17460 .k(2)
17461 .cn_stride(11)
17462 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17463 }
17464
17465 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
17466 TEST_REQUIRES_ARM_NEON_FMA;
17467 GemmMicrokernelTester()
17468 .mr(6)
17469 .nr(8)
17470 .kr(1)
17471 .sr(1)
17472 .m(6)
17473 .n(8)
17474 .k(2)
17475 .a_stride(5)
17476 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17477 }
17478
17479 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
17480 TEST_REQUIRES_ARM_NEON_FMA;
17481 for (uint32_t m = 1; m <= 6; m++) {
17482 for (uint32_t n = 1; n <= 8; n++) {
17483 GemmMicrokernelTester()
17484 .mr(6)
17485 .nr(8)
17486 .kr(1)
17487 .sr(1)
17488 .m(m)
17489 .n(n)
17490 .k(2)
17491 .iterations(1)
17492 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17493 }
17494 }
17495 }
17496
17497 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
17498 TEST_REQUIRES_ARM_NEON_FMA;
17499 for (uint32_t m = 1; m <= 6; m++) {
17500 GemmMicrokernelTester()
17501 .mr(6)
17502 .nr(8)
17503 .kr(1)
17504 .sr(1)
17505 .m(m)
17506 .n(8)
17507 .k(2)
17508 .iterations(1)
17509 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17510 }
17511 }
17512
17513 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
17514 TEST_REQUIRES_ARM_NEON_FMA;
17515 for (uint32_t n = 1; n <= 8; n++) {
17516 GemmMicrokernelTester()
17517 .mr(6)
17518 .nr(8)
17519 .kr(1)
17520 .sr(1)
17521 .m(6)
17522 .n(n)
17523 .k(2)
17524 .iterations(1)
17525 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17526 }
17527 }
17528
17529 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_lt_2) {
17530 TEST_REQUIRES_ARM_NEON_FMA;
17531 for (size_t k = 1; k < 2; k++) {
17532 GemmMicrokernelTester()
17533 .mr(6)
17534 .nr(8)
17535 .kr(1)
17536 .sr(1)
17537 .m(6)
17538 .n(8)
17539 .k(k)
17540 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17541 }
17542 }
17543
17544 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
17545 TEST_REQUIRES_ARM_NEON_FMA;
17546 for (size_t k = 1; k < 2; k++) {
17547 GemmMicrokernelTester()
17548 .mr(6)
17549 .nr(8)
17550 .kr(1)
17551 .sr(1)
17552 .m(6)
17553 .n(8)
17554 .k(k)
17555 .a_stride(5)
17556 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17557 }
17558 }
17559
17560 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
17561 TEST_REQUIRES_ARM_NEON_FMA;
17562 for (size_t k = 1; k < 2; k++) {
17563 for (uint32_t m = 1; m <= 6; m++) {
17564 for (uint32_t n = 1; n <= 8; n++) {
17565 GemmMicrokernelTester()
17566 .mr(6)
17567 .nr(8)
17568 .kr(1)
17569 .sr(1)
17570 .m(m)
17571 .n(n)
17572 .k(k)
17573 .iterations(1)
17574 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17575 }
17576 }
17577 }
17578 }
17579
17580 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_gt_2) {
17581 TEST_REQUIRES_ARM_NEON_FMA;
17582 for (size_t k = 3; k < 4; k++) {
17583 GemmMicrokernelTester()
17584 .mr(6)
17585 .nr(8)
17586 .kr(1)
17587 .sr(1)
17588 .m(6)
17589 .n(8)
17590 .k(k)
17591 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17592 }
17593 }
17594
17595 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
17596 TEST_REQUIRES_ARM_NEON_FMA;
17597 for (size_t k = 3; k < 4; k++) {
17598 GemmMicrokernelTester()
17599 .mr(6)
17600 .nr(8)
17601 .kr(1)
17602 .sr(1)
17603 .m(6)
17604 .n(8)
17605 .k(k)
17606 .a_stride(7)
17607 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17608 }
17609 }
17610
17611 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
17612 TEST_REQUIRES_ARM_NEON_FMA;
17613 for (size_t k = 3; k < 4; k++) {
17614 for (uint32_t m = 1; m <= 6; m++) {
17615 for (uint32_t n = 1; n <= 8; n++) {
17616 GemmMicrokernelTester()
17617 .mr(6)
17618 .nr(8)
17619 .kr(1)
17620 .sr(1)
17621 .m(m)
17622 .n(n)
17623 .k(k)
17624 .iterations(1)
17625 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17626 }
17627 }
17628 }
17629 }
17630
17631 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_div_2) {
17632 TEST_REQUIRES_ARM_NEON_FMA;
17633 for (size_t k = 4; k <= 20; k += 2) {
17634 GemmMicrokernelTester()
17635 .mr(6)
17636 .nr(8)
17637 .kr(1)
17638 .sr(1)
17639 .m(6)
17640 .n(8)
17641 .k(k)
17642 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17643 }
17644 }
17645
17646 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
17647 TEST_REQUIRES_ARM_NEON_FMA;
17648 for (size_t k = 4; k <= 20; k += 2) {
17649 GemmMicrokernelTester()
17650 .mr(6)
17651 .nr(8)
17652 .kr(1)
17653 .sr(1)
17654 .m(6)
17655 .n(8)
17656 .k(k)
17657 .a_stride(23)
17658 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17659 }
17660 }
17661
17662 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
17663 TEST_REQUIRES_ARM_NEON_FMA;
17664 for (size_t k = 4; k <= 20; k += 2) {
17665 for (uint32_t m = 1; m <= 6; m++) {
17666 for (uint32_t n = 1; n <= 8; n++) {
17667 GemmMicrokernelTester()
17668 .mr(6)
17669 .nr(8)
17670 .kr(1)
17671 .sr(1)
17672 .m(m)
17673 .n(n)
17674 .k(k)
17675 .iterations(1)
17676 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17677 }
17678 }
17679 }
17680 }
17681
17682 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8) {
17683 TEST_REQUIRES_ARM_NEON_FMA;
17684 for (uint32_t n = 9; n < 16; n++) {
17685 for (size_t k = 1; k <= 10; k += 3) {
17686 GemmMicrokernelTester()
17687 .mr(6)
17688 .nr(8)
17689 .kr(1)
17690 .sr(1)
17691 .m(6)
17692 .n(8)
17693 .k(k)
17694 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17695 }
17696 }
17697 }
17698
17699 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
17700 TEST_REQUIRES_ARM_NEON_FMA;
17701 for (uint32_t n = 9; n < 16; n++) {
17702 for (size_t k = 1; k <= 10; k += 3) {
17703 GemmMicrokernelTester()
17704 .mr(6)
17705 .nr(8)
17706 .kr(1)
17707 .sr(1)
17708 .m(6)
17709 .n(8)
17710 .k(k)
17711 .cn_stride(11)
17712 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17713 }
17714 }
17715 }
17716
17717 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
17718 TEST_REQUIRES_ARM_NEON_FMA;
17719 for (uint32_t n = 9; n < 16; n++) {
17720 for (size_t k = 1; k <= 10; k += 3) {
17721 GemmMicrokernelTester()
17722 .mr(6)
17723 .nr(8)
17724 .kr(1)
17725 .sr(1)
17726 .m(6)
17727 .n(n)
17728 .k(k)
17729 .a_stride(13)
17730 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17731 }
17732 }
17733 }
17734
17735 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
17736 TEST_REQUIRES_ARM_NEON_FMA;
17737 for (uint32_t n = 9; n < 16; n++) {
17738 for (size_t k = 1; k <= 10; k += 3) {
17739 for (uint32_t m = 1; m <= 6; m++) {
17740 GemmMicrokernelTester()
17741 .mr(6)
17742 .nr(8)
17743 .kr(1)
17744 .sr(1)
17745 .m(m)
17746 .n(n)
17747 .k(k)
17748 .iterations(1)
17749 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17750 }
17751 }
17752 }
17753 }
17754
17755 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8) {
17756 TEST_REQUIRES_ARM_NEON_FMA;
17757 for (uint32_t n = 16; n <= 24; n += 8) {
17758 for (size_t k = 1; k <= 10; k += 3) {
17759 GemmMicrokernelTester()
17760 .mr(6)
17761 .nr(8)
17762 .kr(1)
17763 .sr(1)
17764 .m(6)
17765 .n(8)
17766 .k(k)
17767 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17768 }
17769 }
17770 }
17771
17772 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
17773 TEST_REQUIRES_ARM_NEON_FMA;
17774 for (uint32_t n = 16; n <= 24; n += 8) {
17775 for (size_t k = 1; k <= 10; k += 3) {
17776 GemmMicrokernelTester()
17777 .mr(6)
17778 .nr(8)
17779 .kr(1)
17780 .sr(1)
17781 .m(6)
17782 .n(n)
17783 .k(k)
17784 .cn_stride(11)
17785 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17786 }
17787 }
17788 }
17789
17790 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
17791 TEST_REQUIRES_ARM_NEON_FMA;
17792 for (uint32_t n = 16; n <= 24; n += 8) {
17793 for (size_t k = 1; k <= 10; k += 3) {
17794 GemmMicrokernelTester()
17795 .mr(6)
17796 .nr(8)
17797 .kr(1)
17798 .sr(1)
17799 .m(6)
17800 .n(n)
17801 .k(k)
17802 .a_stride(13)
17803 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17804 }
17805 }
17806 }
17807
17808 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
17809 TEST_REQUIRES_ARM_NEON_FMA;
17810 for (uint32_t n = 16; n <= 24; n += 8) {
17811 for (size_t k = 1; k <= 10; k += 3) {
17812 for (uint32_t m = 1; m <= 6; m++) {
17813 GemmMicrokernelTester()
17814 .mr(6)
17815 .nr(8)
17816 .kr(1)
17817 .sr(1)
17818 .m(m)
17819 .n(n)
17820 .k(k)
17821 .iterations(1)
17822 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17823 }
17824 }
17825 }
17826 }
17827
17828 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
17829 TEST_REQUIRES_ARM_NEON_FMA;
17830 for (size_t k = 1; k <= 10; k += 3) {
17831 for (uint32_t m = 1; m <= 6; m++) {
17832 for (uint32_t n = 1; n <= 8; n++) {
17833 GemmMicrokernelTester()
17834 .mr(6)
17835 .nr(8)
17836 .kr(1)
17837 .sr(1)
17838 .m(m)
17839 .n(n)
17840 .k(k)
17841 .cm_stride(11)
17842 .iterations(1)
17843 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17844 }
17845 }
17846 }
17847 }
17848
17849 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, qmin) {
17850 TEST_REQUIRES_ARM_NEON_FMA;
17851 GemmMicrokernelTester()
17852 .mr(6)
17853 .nr(8)
17854 .kr(1)
17855 .sr(1)
17856 .m(6)
17857 .n(8)
17858 .k(2)
17859 .qmin(128)
17860 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17861 }
17862
17863 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, qmax) {
17864 TEST_REQUIRES_ARM_NEON_FMA;
17865 GemmMicrokernelTester()
17866 .mr(6)
17867 .nr(8)
17868 .kr(1)
17869 .sr(1)
17870 .m(6)
17871 .n(8)
17872 .k(2)
17873 .qmax(128)
17874 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17875 }
17876
17877 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD64, strided_cm) {
17878 TEST_REQUIRES_ARM_NEON_FMA;
17879 GemmMicrokernelTester()
17880 .mr(6)
17881 .nr(8)
17882 .kr(1)
17883 .sr(1)
17884 .m(6)
17885 .n(8)
17886 .k(2)
17887 .cm_stride(11)
17888 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld64);
17889 }
17890#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17891
17892
17893#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard69172d92019-11-26 16:22:39 -080017894 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4) {
17895 TEST_REQUIRES_ARM_NEON_FMA;
17896 GemmMicrokernelTester()
17897 .mr(6)
17898 .nr(8)
17899 .kr(1)
17900 .sr(1)
17901 .m(6)
17902 .n(8)
17903 .k(4)
17904 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17905 }
17906
17907 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, strided_cn) {
17908 TEST_REQUIRES_ARM_NEON_FMA;
17909 GemmMicrokernelTester()
17910 .mr(6)
17911 .nr(8)
17912 .kr(1)
17913 .sr(1)
17914 .m(6)
17915 .n(8)
17916 .k(4)
17917 .cn_stride(11)
17918 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17919 }
17920
17921 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
17922 TEST_REQUIRES_ARM_NEON_FMA;
17923 GemmMicrokernelTester()
17924 .mr(6)
17925 .nr(8)
17926 .kr(1)
17927 .sr(1)
17928 .m(6)
17929 .n(8)
17930 .k(4)
17931 .a_stride(7)
17932 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17933 }
17934
17935 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
17936 TEST_REQUIRES_ARM_NEON_FMA;
17937 for (uint32_t m = 1; m <= 6; m++) {
17938 for (uint32_t n = 1; n <= 8; n++) {
17939 GemmMicrokernelTester()
17940 .mr(6)
17941 .nr(8)
17942 .kr(1)
17943 .sr(1)
17944 .m(m)
17945 .n(n)
17946 .k(4)
17947 .iterations(1)
17948 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17949 }
17950 }
17951 }
17952
17953 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
17954 TEST_REQUIRES_ARM_NEON_FMA;
17955 for (uint32_t m = 1; m <= 6; m++) {
17956 GemmMicrokernelTester()
17957 .mr(6)
17958 .nr(8)
17959 .kr(1)
17960 .sr(1)
17961 .m(m)
17962 .n(8)
17963 .k(4)
17964 .iterations(1)
17965 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17966 }
17967 }
17968
17969 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
17970 TEST_REQUIRES_ARM_NEON_FMA;
17971 for (uint32_t n = 1; n <= 8; n++) {
17972 GemmMicrokernelTester()
17973 .mr(6)
17974 .nr(8)
17975 .kr(1)
17976 .sr(1)
17977 .m(6)
17978 .n(n)
17979 .k(4)
17980 .iterations(1)
17981 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17982 }
17983 }
17984
17985 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_lt_4) {
17986 TEST_REQUIRES_ARM_NEON_FMA;
17987 for (size_t k = 1; k < 4; k++) {
17988 GemmMicrokernelTester()
17989 .mr(6)
17990 .nr(8)
17991 .kr(1)
17992 .sr(1)
17993 .m(6)
17994 .n(8)
17995 .k(k)
17996 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
17997 }
17998 }
17999
18000 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
18001 TEST_REQUIRES_ARM_NEON_FMA;
18002 for (size_t k = 1; k < 4; k++) {
18003 GemmMicrokernelTester()
18004 .mr(6)
18005 .nr(8)
18006 .kr(1)
18007 .sr(1)
18008 .m(6)
18009 .n(8)
18010 .k(k)
18011 .a_stride(7)
18012 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18013 }
18014 }
18015
18016 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
18017 TEST_REQUIRES_ARM_NEON_FMA;
18018 for (size_t k = 1; k < 4; k++) {
18019 for (uint32_t m = 1; m <= 6; m++) {
18020 for (uint32_t n = 1; n <= 8; n++) {
18021 GemmMicrokernelTester()
18022 .mr(6)
18023 .nr(8)
18024 .kr(1)
18025 .sr(1)
18026 .m(m)
18027 .n(n)
18028 .k(k)
18029 .iterations(1)
18030 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18031 }
18032 }
18033 }
18034 }
18035
18036 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_gt_4) {
18037 TEST_REQUIRES_ARM_NEON_FMA;
18038 for (size_t k = 5; k < 8; k++) {
18039 GemmMicrokernelTester()
18040 .mr(6)
18041 .nr(8)
18042 .kr(1)
18043 .sr(1)
18044 .m(6)
18045 .n(8)
18046 .k(k)
18047 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18048 }
18049 }
18050
18051 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
18052 TEST_REQUIRES_ARM_NEON_FMA;
18053 for (size_t k = 5; k < 8; k++) {
18054 GemmMicrokernelTester()
18055 .mr(6)
18056 .nr(8)
18057 .kr(1)
18058 .sr(1)
18059 .m(6)
18060 .n(8)
18061 .k(k)
18062 .a_stride(11)
18063 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18064 }
18065 }
18066
18067 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
18068 TEST_REQUIRES_ARM_NEON_FMA;
18069 for (size_t k = 5; k < 8; k++) {
18070 for (uint32_t m = 1; m <= 6; m++) {
18071 for (uint32_t n = 1; n <= 8; n++) {
18072 GemmMicrokernelTester()
18073 .mr(6)
18074 .nr(8)
18075 .kr(1)
18076 .sr(1)
18077 .m(m)
18078 .n(n)
18079 .k(k)
18080 .iterations(1)
18081 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18082 }
18083 }
18084 }
18085 }
18086
18087 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_div_4) {
18088 TEST_REQUIRES_ARM_NEON_FMA;
18089 for (size_t k = 8; k <= 40; k += 4) {
18090 GemmMicrokernelTester()
18091 .mr(6)
18092 .nr(8)
18093 .kr(1)
18094 .sr(1)
18095 .m(6)
18096 .n(8)
18097 .k(k)
18098 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18099 }
18100 }
18101
18102 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
18103 TEST_REQUIRES_ARM_NEON_FMA;
18104 for (size_t k = 8; k <= 40; k += 4) {
18105 GemmMicrokernelTester()
18106 .mr(6)
18107 .nr(8)
18108 .kr(1)
18109 .sr(1)
18110 .m(6)
18111 .n(8)
18112 .k(k)
18113 .a_stride(43)
18114 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18115 }
18116 }
18117
18118 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
18119 TEST_REQUIRES_ARM_NEON_FMA;
18120 for (size_t k = 8; k <= 40; k += 4) {
18121 for (uint32_t m = 1; m <= 6; m++) {
18122 for (uint32_t n = 1; n <= 8; n++) {
18123 GemmMicrokernelTester()
18124 .mr(6)
18125 .nr(8)
18126 .kr(1)
18127 .sr(1)
18128 .m(m)
18129 .n(n)
18130 .k(k)
18131 .iterations(1)
18132 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18133 }
18134 }
18135 }
18136 }
18137
18138 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8) {
18139 TEST_REQUIRES_ARM_NEON_FMA;
18140 for (uint32_t n = 9; n < 16; n++) {
18141 for (size_t k = 1; k <= 20; k += 5) {
18142 GemmMicrokernelTester()
18143 .mr(6)
18144 .nr(8)
18145 .kr(1)
18146 .sr(1)
18147 .m(6)
18148 .n(8)
18149 .k(k)
18150 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18151 }
18152 }
18153 }
18154
18155 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
18156 TEST_REQUIRES_ARM_NEON_FMA;
18157 for (uint32_t n = 9; n < 16; n++) {
18158 for (size_t k = 1; k <= 20; k += 5) {
18159 GemmMicrokernelTester()
18160 .mr(6)
18161 .nr(8)
18162 .kr(1)
18163 .sr(1)
18164 .m(6)
18165 .n(8)
18166 .k(k)
18167 .cn_stride(11)
18168 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18169 }
18170 }
18171 }
18172
18173 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
18174 TEST_REQUIRES_ARM_NEON_FMA;
18175 for (uint32_t n = 9; n < 16; n++) {
18176 for (size_t k = 1; k <= 20; k += 5) {
18177 GemmMicrokernelTester()
18178 .mr(6)
18179 .nr(8)
18180 .kr(1)
18181 .sr(1)
18182 .m(6)
18183 .n(n)
18184 .k(k)
18185 .a_stride(23)
18186 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18187 }
18188 }
18189 }
18190
18191 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
18192 TEST_REQUIRES_ARM_NEON_FMA;
18193 for (uint32_t n = 9; n < 16; n++) {
18194 for (size_t k = 1; k <= 20; k += 5) {
18195 for (uint32_t m = 1; m <= 6; m++) {
18196 GemmMicrokernelTester()
18197 .mr(6)
18198 .nr(8)
18199 .kr(1)
18200 .sr(1)
18201 .m(m)
18202 .n(n)
18203 .k(k)
18204 .iterations(1)
18205 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18206 }
18207 }
18208 }
18209 }
18210
18211 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8) {
18212 TEST_REQUIRES_ARM_NEON_FMA;
18213 for (uint32_t n = 16; n <= 24; n += 8) {
18214 for (size_t k = 1; k <= 20; k += 5) {
18215 GemmMicrokernelTester()
18216 .mr(6)
18217 .nr(8)
18218 .kr(1)
18219 .sr(1)
18220 .m(6)
18221 .n(8)
18222 .k(k)
18223 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18224 }
18225 }
18226 }
18227
18228 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
18229 TEST_REQUIRES_ARM_NEON_FMA;
18230 for (uint32_t n = 16; n <= 24; n += 8) {
18231 for (size_t k = 1; k <= 20; k += 5) {
18232 GemmMicrokernelTester()
18233 .mr(6)
18234 .nr(8)
18235 .kr(1)
18236 .sr(1)
18237 .m(6)
18238 .n(n)
18239 .k(k)
18240 .cn_stride(11)
18241 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18242 }
18243 }
18244 }
18245
18246 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
18247 TEST_REQUIRES_ARM_NEON_FMA;
18248 for (uint32_t n = 16; n <= 24; n += 8) {
18249 for (size_t k = 1; k <= 20; k += 5) {
18250 GemmMicrokernelTester()
18251 .mr(6)
18252 .nr(8)
18253 .kr(1)
18254 .sr(1)
18255 .m(6)
18256 .n(n)
18257 .k(k)
18258 .a_stride(23)
18259 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18260 }
18261 }
18262 }
18263
18264 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
18265 TEST_REQUIRES_ARM_NEON_FMA;
18266 for (uint32_t n = 16; n <= 24; n += 8) {
18267 for (size_t k = 1; k <= 20; k += 5) {
18268 for (uint32_t m = 1; m <= 6; m++) {
18269 GemmMicrokernelTester()
18270 .mr(6)
18271 .nr(8)
18272 .kr(1)
18273 .sr(1)
18274 .m(m)
18275 .n(n)
18276 .k(k)
18277 .iterations(1)
18278 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18279 }
18280 }
18281 }
18282 }
18283
18284 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
18285 TEST_REQUIRES_ARM_NEON_FMA;
18286 for (size_t k = 1; k <= 20; k += 5) {
18287 for (uint32_t m = 1; m <= 6; m++) {
18288 for (uint32_t n = 1; n <= 8; n++) {
18289 GemmMicrokernelTester()
18290 .mr(6)
18291 .nr(8)
18292 .kr(1)
18293 .sr(1)
18294 .m(m)
18295 .n(n)
18296 .k(k)
18297 .cm_stride(11)
18298 .iterations(1)
18299 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18300 }
18301 }
18302 }
18303 }
18304
18305 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, qmin) {
18306 TEST_REQUIRES_ARM_NEON_FMA;
18307 GemmMicrokernelTester()
18308 .mr(6)
18309 .nr(8)
18310 .kr(1)
18311 .sr(1)
18312 .m(6)
18313 .n(8)
18314 .k(4)
18315 .qmin(128)
18316 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18317 }
18318
18319 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, qmax) {
18320 TEST_REQUIRES_ARM_NEON_FMA;
18321 GemmMicrokernelTester()
18322 .mr(6)
18323 .nr(8)
18324 .kr(1)
18325 .sr(1)
18326 .m(6)
18327 .n(8)
18328 .k(4)
18329 .qmax(128)
18330 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18331 }
18332
18333 TEST(F32_GEMMINC_6X8__NEONFMA_DUP_LD128, strided_cm) {
18334 TEST_REQUIRES_ARM_NEON_FMA;
18335 GemmMicrokernelTester()
18336 .mr(6)
18337 .nr(8)
18338 .kr(1)
18339 .sr(1)
18340 .m(6)
18341 .n(8)
18342 .k(4)
18343 .cm_stride(11)
18344 .Test(xnn_f32_gemminc_ukernel_6x8__neonfma_dup_ld128);
18345 }
18346#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18347
18348
18349#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080018350 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4) {
18351 TEST_REQUIRES_ARM_NEON;
18352 GemmMicrokernelTester()
18353 .mr(1)
18354 .nr(8)
18355 .kr(1)
18356 .sr(4)
18357 .m(1)
18358 .n(8)
18359 .k(4)
18360 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18361 }
18362
18363 TEST(F32_GEMMINC_1X8S4__NEON, strided_cn) {
18364 TEST_REQUIRES_ARM_NEON;
18365 GemmMicrokernelTester()
18366 .mr(1)
18367 .nr(8)
18368 .kr(1)
18369 .sr(4)
18370 .m(1)
18371 .n(8)
18372 .k(4)
18373 .cn_stride(11)
18374 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18375 }
18376
18377 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_strided_a) {
18378 TEST_REQUIRES_ARM_NEON;
18379 GemmMicrokernelTester()
18380 .mr(1)
18381 .nr(8)
18382 .kr(1)
18383 .sr(4)
18384 .m(1)
18385 .n(8)
18386 .k(4)
18387 .a_stride(7)
18388 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18389 }
18390
18391 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_subtile) {
18392 TEST_REQUIRES_ARM_NEON;
18393 for (uint32_t m = 1; m <= 1; m++) {
18394 for (uint32_t n = 1; n <= 8; n++) {
18395 GemmMicrokernelTester()
18396 .mr(1)
18397 .nr(8)
18398 .kr(1)
18399 .sr(4)
18400 .m(m)
18401 .n(n)
18402 .k(4)
18403 .iterations(1)
18404 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18405 }
18406 }
18407 }
18408
18409 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_subtile_m) {
18410 TEST_REQUIRES_ARM_NEON;
18411 for (uint32_t m = 1; m <= 1; m++) {
18412 GemmMicrokernelTester()
18413 .mr(1)
18414 .nr(8)
18415 .kr(1)
18416 .sr(4)
18417 .m(m)
18418 .n(8)
18419 .k(4)
18420 .iterations(1)
18421 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18422 }
18423 }
18424
18425 TEST(F32_GEMMINC_1X8S4__NEON, k_eq_4_subtile_n) {
18426 TEST_REQUIRES_ARM_NEON;
18427 for (uint32_t n = 1; n <= 8; n++) {
18428 GemmMicrokernelTester()
18429 .mr(1)
18430 .nr(8)
18431 .kr(1)
18432 .sr(4)
18433 .m(1)
18434 .n(n)
18435 .k(4)
18436 .iterations(1)
18437 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18438 }
18439 }
18440
18441 TEST(F32_GEMMINC_1X8S4__NEON, k_lt_4) {
18442 TEST_REQUIRES_ARM_NEON;
18443 for (size_t k = 1; k < 4; k++) {
18444 GemmMicrokernelTester()
18445 .mr(1)
18446 .nr(8)
18447 .kr(1)
18448 .sr(4)
18449 .m(1)
18450 .n(8)
18451 .k(k)
18452 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18453 }
18454 }
18455
18456 TEST(F32_GEMMINC_1X8S4__NEON, k_lt_4_strided_a) {
18457 TEST_REQUIRES_ARM_NEON;
18458 for (size_t k = 1; k < 4; k++) {
18459 GemmMicrokernelTester()
18460 .mr(1)
18461 .nr(8)
18462 .kr(1)
18463 .sr(4)
18464 .m(1)
18465 .n(8)
18466 .k(k)
18467 .a_stride(7)
18468 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18469 }
18470 }
18471
18472 TEST(F32_GEMMINC_1X8S4__NEON, k_lt_4_subtile) {
18473 TEST_REQUIRES_ARM_NEON;
18474 for (size_t k = 1; k < 4; k++) {
18475 for (uint32_t m = 1; m <= 1; m++) {
18476 for (uint32_t n = 1; n <= 8; n++) {
18477 GemmMicrokernelTester()
18478 .mr(1)
18479 .nr(8)
18480 .kr(1)
18481 .sr(4)
18482 .m(m)
18483 .n(n)
18484 .k(k)
18485 .iterations(1)
18486 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18487 }
18488 }
18489 }
18490 }
18491
18492 TEST(F32_GEMMINC_1X8S4__NEON, k_gt_4) {
18493 TEST_REQUIRES_ARM_NEON;
18494 for (size_t k = 5; k < 8; k++) {
18495 GemmMicrokernelTester()
18496 .mr(1)
18497 .nr(8)
18498 .kr(1)
18499 .sr(4)
18500 .m(1)
18501 .n(8)
18502 .k(k)
18503 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18504 }
18505 }
18506
18507 TEST(F32_GEMMINC_1X8S4__NEON, k_gt_4_strided_a) {
18508 TEST_REQUIRES_ARM_NEON;
18509 for (size_t k = 5; k < 8; k++) {
18510 GemmMicrokernelTester()
18511 .mr(1)
18512 .nr(8)
18513 .kr(1)
18514 .sr(4)
18515 .m(1)
18516 .n(8)
18517 .k(k)
18518 .a_stride(11)
18519 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18520 }
18521 }
18522
18523 TEST(F32_GEMMINC_1X8S4__NEON, k_gt_4_subtile) {
18524 TEST_REQUIRES_ARM_NEON;
18525 for (size_t k = 5; k < 8; k++) {
18526 for (uint32_t m = 1; m <= 1; m++) {
18527 for (uint32_t n = 1; n <= 8; n++) {
18528 GemmMicrokernelTester()
18529 .mr(1)
18530 .nr(8)
18531 .kr(1)
18532 .sr(4)
18533 .m(m)
18534 .n(n)
18535 .k(k)
18536 .iterations(1)
18537 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18538 }
18539 }
18540 }
18541 }
18542
18543 TEST(F32_GEMMINC_1X8S4__NEON, k_div_4) {
18544 TEST_REQUIRES_ARM_NEON;
18545 for (size_t k = 8; k <= 40; k += 4) {
18546 GemmMicrokernelTester()
18547 .mr(1)
18548 .nr(8)
18549 .kr(1)
18550 .sr(4)
18551 .m(1)
18552 .n(8)
18553 .k(k)
18554 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18555 }
18556 }
18557
18558 TEST(F32_GEMMINC_1X8S4__NEON, k_div_4_strided_a) {
18559 TEST_REQUIRES_ARM_NEON;
18560 for (size_t k = 8; k <= 40; k += 4) {
18561 GemmMicrokernelTester()
18562 .mr(1)
18563 .nr(8)
18564 .kr(1)
18565 .sr(4)
18566 .m(1)
18567 .n(8)
18568 .k(k)
18569 .a_stride(43)
18570 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18571 }
18572 }
18573
18574 TEST(F32_GEMMINC_1X8S4__NEON, k_div_4_subtile) {
18575 TEST_REQUIRES_ARM_NEON;
18576 for (size_t k = 8; k <= 40; k += 4) {
18577 for (uint32_t m = 1; m <= 1; m++) {
18578 for (uint32_t n = 1; n <= 8; n++) {
18579 GemmMicrokernelTester()
18580 .mr(1)
18581 .nr(8)
18582 .kr(1)
18583 .sr(4)
18584 .m(m)
18585 .n(n)
18586 .k(k)
18587 .iterations(1)
18588 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18589 }
18590 }
18591 }
18592 }
18593
18594 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8) {
18595 TEST_REQUIRES_ARM_NEON;
18596 for (uint32_t n = 9; n < 16; n++) {
18597 for (size_t k = 1; k <= 20; k += 5) {
18598 GemmMicrokernelTester()
18599 .mr(1)
18600 .nr(8)
18601 .kr(1)
18602 .sr(4)
18603 .m(1)
18604 .n(8)
18605 .k(k)
18606 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18607 }
18608 }
18609 }
18610
18611 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8_strided_cn) {
18612 TEST_REQUIRES_ARM_NEON;
18613 for (uint32_t n = 9; n < 16; n++) {
18614 for (size_t k = 1; k <= 20; k += 5) {
18615 GemmMicrokernelTester()
18616 .mr(1)
18617 .nr(8)
18618 .kr(1)
18619 .sr(4)
18620 .m(1)
18621 .n(8)
18622 .k(k)
18623 .cn_stride(11)
18624 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18625 }
18626 }
18627 }
18628
18629 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8_strided_a) {
18630 TEST_REQUIRES_ARM_NEON;
18631 for (uint32_t n = 9; n < 16; n++) {
18632 for (size_t k = 1; k <= 20; k += 5) {
18633 GemmMicrokernelTester()
18634 .mr(1)
18635 .nr(8)
18636 .kr(1)
18637 .sr(4)
18638 .m(1)
18639 .n(n)
18640 .k(k)
18641 .a_stride(23)
18642 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18643 }
18644 }
18645 }
18646
18647 TEST(F32_GEMMINC_1X8S4__NEON, n_gt_8_subtile) {
18648 TEST_REQUIRES_ARM_NEON;
18649 for (uint32_t n = 9; n < 16; n++) {
18650 for (size_t k = 1; k <= 20; k += 5) {
18651 for (uint32_t m = 1; m <= 1; m++) {
18652 GemmMicrokernelTester()
18653 .mr(1)
18654 .nr(8)
18655 .kr(1)
18656 .sr(4)
18657 .m(m)
18658 .n(n)
18659 .k(k)
18660 .iterations(1)
18661 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18662 }
18663 }
18664 }
18665 }
18666
18667 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8) {
18668 TEST_REQUIRES_ARM_NEON;
18669 for (uint32_t n = 16; n <= 24; n += 8) {
18670 for (size_t k = 1; k <= 20; k += 5) {
18671 GemmMicrokernelTester()
18672 .mr(1)
18673 .nr(8)
18674 .kr(1)
18675 .sr(4)
18676 .m(1)
18677 .n(8)
18678 .k(k)
18679 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18680 }
18681 }
18682 }
18683
18684 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8_strided_cn) {
18685 TEST_REQUIRES_ARM_NEON;
18686 for (uint32_t n = 16; n <= 24; n += 8) {
18687 for (size_t k = 1; k <= 20; k += 5) {
18688 GemmMicrokernelTester()
18689 .mr(1)
18690 .nr(8)
18691 .kr(1)
18692 .sr(4)
18693 .m(1)
18694 .n(n)
18695 .k(k)
18696 .cn_stride(11)
18697 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18698 }
18699 }
18700 }
18701
18702 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8_strided_a) {
18703 TEST_REQUIRES_ARM_NEON;
18704 for (uint32_t n = 16; n <= 24; n += 8) {
18705 for (size_t k = 1; k <= 20; k += 5) {
18706 GemmMicrokernelTester()
18707 .mr(1)
18708 .nr(8)
18709 .kr(1)
18710 .sr(4)
18711 .m(1)
18712 .n(n)
18713 .k(k)
18714 .a_stride(23)
18715 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18716 }
18717 }
18718 }
18719
18720 TEST(F32_GEMMINC_1X8S4__NEON, n_div_8_subtile) {
18721 TEST_REQUIRES_ARM_NEON;
18722 for (uint32_t n = 16; n <= 24; n += 8) {
18723 for (size_t k = 1; k <= 20; k += 5) {
18724 for (uint32_t m = 1; m <= 1; m++) {
18725 GemmMicrokernelTester()
18726 .mr(1)
18727 .nr(8)
18728 .kr(1)
18729 .sr(4)
18730 .m(m)
18731 .n(n)
18732 .k(k)
18733 .iterations(1)
18734 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18735 }
18736 }
18737 }
18738 }
18739
18740 TEST(F32_GEMMINC_1X8S4__NEON, strided_cm_subtile) {
18741 TEST_REQUIRES_ARM_NEON;
18742 for (size_t k = 1; k <= 20; k += 5) {
18743 for (uint32_t m = 1; m <= 1; m++) {
18744 for (uint32_t n = 1; n <= 8; n++) {
18745 GemmMicrokernelTester()
18746 .mr(1)
18747 .nr(8)
18748 .kr(1)
18749 .sr(4)
18750 .m(m)
18751 .n(n)
18752 .k(k)
18753 .cm_stride(11)
18754 .iterations(1)
18755 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18756 }
18757 }
18758 }
18759 }
18760
18761 TEST(F32_GEMMINC_1X8S4__NEON, qmin) {
18762 TEST_REQUIRES_ARM_NEON;
18763 GemmMicrokernelTester()
18764 .mr(1)
18765 .nr(8)
18766 .kr(1)
18767 .sr(4)
18768 .m(1)
18769 .n(8)
18770 .k(4)
18771 .qmin(128)
18772 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18773 }
18774
18775 TEST(F32_GEMMINC_1X8S4__NEON, qmax) {
18776 TEST_REQUIRES_ARM_NEON;
18777 GemmMicrokernelTester()
18778 .mr(1)
18779 .nr(8)
18780 .kr(1)
18781 .sr(4)
18782 .m(1)
18783 .n(8)
18784 .k(4)
18785 .qmax(128)
18786 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18787 }
18788
18789 TEST(F32_GEMMINC_1X8S4__NEON, strided_cm) {
18790 TEST_REQUIRES_ARM_NEON;
18791 GemmMicrokernelTester()
18792 .mr(1)
18793 .nr(8)
18794 .kr(1)
18795 .sr(4)
18796 .m(1)
18797 .n(8)
18798 .k(4)
18799 .cm_stride(11)
18800 .Test(xnn_f32_gemminc_ukernel_1x8s4__neon);
18801 }
18802#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18803
18804
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Tests for xnn_f32_gemminc_ukernel_4x8s4__neon. Every test configures the
  // tester with mr=4, nr=8, kr=1, sr=4 and varies only the problem shape,
  // strides and clamping parameters.
  // NOTE: this file is generated by tools/generate-gemm-test.py. n_gt_8,
  // n_gt_8_strided_cn and n_div_8 pass the loop value to n() instead of a
  // hard-coded 8; mirror that in the generator so it survives regeneration.

  // Single run: m=4, n=8, k=4.
  TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(4);
    tester.m(4).n(8).k(4);
    tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
  }

  // Single run: m=4, n=8, k=4 with cn_stride=11.
  TEST(F32_GEMMINC_4X8S4__NEON, strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(4);
    tester.m(4).n(8).k(4).cn_stride(11);
    tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
  }

  // Single run: m=4, n=8, k=4 with a_stride=7.
  TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(4);
    tester.m(4).n(8).k(4).a_stride(7);
    tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
  }

  // Sweeps m over 1..4 and n over 1..8 at k=4, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t m = 1; m <= 4; ++m) {
      for (uint32_t n = 1; n <= 8; ++n) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(m).n(n).k(4).iterations(1);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps m over 1..4 with n=8, k=4, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_subtile_m) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t m = 1; m <= 4; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(m).n(8).k(4).iterations(1);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps n over 1..8 with m=4, k=4, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, k_eq_4_subtile_n) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 1; n <= 8; ++n) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(n).k(4).iterations(1);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 1..3 with m=4, n=8.
  TEST(F32_GEMMINC_4X8S4__NEON, k_lt_4) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 4; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(8).k(k);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 1..3 with m=4, n=8 and a_stride=7.
  TEST(F32_GEMMINC_4X8S4__NEON, k_lt_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 4; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(8).k(k).a_stride(7);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 1..3, m over 1..4 and n over 1..8, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, k_lt_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 4; ++k) {
      for (uint32_t m = 1; m <= 4; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(4).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
        }
      }
    }
  }

  // Sweeps k over 5..7 with m=4, n=8.
  TEST(F32_GEMMINC_4X8S4__NEON, k_gt_4) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 5; k < 8; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(8).k(k);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 5..7 with m=4, n=8 and a_stride=11.
  TEST(F32_GEMMINC_4X8S4__NEON, k_gt_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 5; k < 8; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(8).k(k).a_stride(11);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 5..7, m over 1..4 and n over 1..8, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, k_gt_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 5; k < 8; ++k) {
      for (uint32_t m = 1; m <= 4; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(4).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
        }
      }
    }
  }

  // Sweeps k over 8, 12, ..., 40 with m=4, n=8.
  TEST(F32_GEMMINC_4X8S4__NEON, k_div_4) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 8; k <= 40; k += 4) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(8).k(k);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 8, 12, ..., 40 with m=4, n=8 and a_stride=43.
  TEST(F32_GEMMINC_4X8S4__NEON, k_div_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 8; k <= 40; k += 4) {
      GemmMicrokernelTester tester{};
      tester.mr(4).nr(8).kr(1).sr(4);
      tester.m(4).n(8).k(k).a_stride(43);
      tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
    }
  }

  // Sweeps k over 8, 12, ..., 40, m over 1..4 and n over 1..8, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, k_div_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 8; k <= 40; k += 4) {
      for (uint32_t m = 1; m <= 4; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(4).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
        }
      }
    }
  }

  // Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4.
  // n() receives the loop value so that n > 8 is really exercised.
  TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(4).n(n).k(k);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
  // n() receives the loop value so that n > 8 is really exercised.
  TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(4).n(n).k(k).cn_stride(11);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=4 and a_stride=23.
  TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(4).n(n).k(k).a_stride(23);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..4, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, n_gt_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        for (uint32_t m = 1; m <= 4; ++m) {
          GemmMicrokernelTester tester{};
          tester.mr(4).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
        }
      }
    }
  }

  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4.
  // n() receives the loop value so that n = 16 and n = 24 are really exercised.
  TEST(F32_GEMMINC_4X8S4__NEON, n_div_8) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(4).n(n).k(k);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and cn_stride=11.
  TEST(F32_GEMMINC_4X8S4__NEON, n_div_8_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(4).n(n).k(k).cn_stride(11);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=4 and a_stride=23.
  TEST(F32_GEMMINC_4X8S4__NEON, n_div_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(4).nr(8).kr(1).sr(4);
        tester.m(4).n(n).k(k).a_stride(23);
        tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
      }
    }
  }

  // Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..4, one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, n_div_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        for (uint32_t m = 1; m <= 4; ++m) {
          GemmMicrokernelTester tester{};
          tester.mr(4).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
        }
      }
    }
  }

  // Sweeps k over {1, 6, 11, 16}, m over 1..4 and n over 1..8 with cm_stride=11,
  // one iteration per shape.
  TEST(F32_GEMMINC_4X8S4__NEON, strided_cm_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 4; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(4).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).cm_stride(11).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
        }
      }
    }
  }

  // Single run: m=4, n=8, k=4 with qmin set to 128.
  TEST(F32_GEMMINC_4X8S4__NEON, qmin) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(4);
    tester.m(4).n(8).k(4).qmin(128);
    tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
  }

  // Single run: m=4, n=8, k=4 with qmax set to 128.
  TEST(F32_GEMMINC_4X8S4__NEON, qmax) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(4);
    tester.m(4).n(8).k(4).qmax(128);
    tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
  }

  // Single run: m=4, n=8, k=4 with cm_stride=11.
  TEST(F32_GEMMINC_4X8S4__NEON, strided_cm) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(4).nr(8).kr(1).sr(4);
    tester.m(4).n(8).k(4).cm_stride(11);
    tester.Test(xnn_f32_gemminc_ukernel_4x8s4__neon);
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
19259
19260
#if XNN_ARCH_ARM || XNN_ARCH_ARM64
  // Tests for xnn_f32_gemminc_ukernel_6x8s4__neon. Every test configures the
  // tester with mr=6, nr=8, kr=1, sr=4 and varies only the problem shape,
  // strides and clamping parameters.
  // NOTE: this file is generated by tools/generate-gemm-test.py. n_gt_8,
  // n_gt_8_strided_cn and n_div_8 pass the loop value to n() instead of a
  // hard-coded 8; mirror that in the generator so it survives regeneration.

  // Single run: m=6, n=8, k=4.
  TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(4);
    tester.m(6).n(8).k(4);
    tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
  }

  // Single run: m=6, n=8, k=4 with cn_stride=11.
  TEST(F32_GEMMINC_6X8S4__NEON, strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(4);
    tester.m(6).n(8).k(4).cn_stride(11);
    tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
  }

  // Single run: m=6, n=8, k=4 with a_stride=7.
  TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(4);
    tester.m(6).n(8).k(4).a_stride(7);
    tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
  }

  // Sweeps m over 1..6 and n over 1..8 at k=4, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t m = 1; m <= 6; ++m) {
      for (uint32_t n = 1; n <= 8; ++n) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(m).n(n).k(4).iterations(1);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps m over 1..6 with n=8, k=4, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_subtile_m) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t m = 1; m <= 6; ++m) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(m).n(8).k(4).iterations(1);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps n over 1..8 with m=6, k=4, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, k_eq_4_subtile_n) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 1; n <= 8; ++n) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(n).k(4).iterations(1);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 1..3 with m=6, n=8.
  TEST(F32_GEMMINC_6X8S4__NEON, k_lt_4) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 4; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(8).k(k);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 1..3 with m=6, n=8 and a_stride=7.
  TEST(F32_GEMMINC_6X8S4__NEON, k_lt_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 4; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(8).k(k).a_stride(7);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 1..3, m over 1..6 and n over 1..8, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, k_lt_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k < 4; ++k) {
      for (uint32_t m = 1; m <= 6; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(6).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
        }
      }
    }
  }

  // Sweeps k over 5..7 with m=6, n=8.
  TEST(F32_GEMMINC_6X8S4__NEON, k_gt_4) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 5; k < 8; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(8).k(k);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 5..7 with m=6, n=8 and a_stride=11.
  TEST(F32_GEMMINC_6X8S4__NEON, k_gt_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 5; k < 8; ++k) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(8).k(k).a_stride(11);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 5..7, m over 1..6 and n over 1..8, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, k_gt_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 5; k < 8; ++k) {
      for (uint32_t m = 1; m <= 6; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(6).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
        }
      }
    }
  }

  // Sweeps k over 8, 12, ..., 40 with m=6, n=8.
  TEST(F32_GEMMINC_6X8S4__NEON, k_div_4) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 8; k <= 40; k += 4) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(8).k(k);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 8, 12, ..., 40 with m=6, n=8 and a_stride=43.
  TEST(F32_GEMMINC_6X8S4__NEON, k_div_4_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 8; k <= 40; k += 4) {
      GemmMicrokernelTester tester{};
      tester.mr(6).nr(8).kr(1).sr(4);
      tester.m(6).n(8).k(k).a_stride(43);
      tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
    }
  }

  // Sweeps k over 8, 12, ..., 40, m over 1..6 and n over 1..8, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, k_div_4_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 8; k <= 40; k += 4) {
      for (uint32_t m = 1; m <= 6; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(6).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
        }
      }
    }
  }

  // Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=6.
  // n() receives the loop value so that n > 8 is really exercised.
  TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(6).n(n).k(k);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=6 and cn_stride=11.
  // n() receives the loop value so that n > 8 is really exercised.
  TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(6).n(n).k(k).cn_stride(11);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps n over 9..15 and k over {1, 6, 11, 16} with m=6 and a_stride=23.
  TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(6).n(n).k(k).a_stride(23);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps n over 9..15, k over {1, 6, 11, 16} and m over 1..6, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, n_gt_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 9; n < 16; ++n) {
      for (size_t k = 1; k <= 20; k += 5) {
        for (uint32_t m = 1; m <= 6; ++m) {
          GemmMicrokernelTester tester{};
          tester.mr(6).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
        }
      }
    }
  }

  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=6.
  // n() receives the loop value so that n = 16 and n = 24 are really exercised.
  TEST(F32_GEMMINC_6X8S4__NEON, n_div_8) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(6).n(n).k(k);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=6 and cn_stride=11.
  TEST(F32_GEMMINC_6X8S4__NEON, n_div_8_strided_cn) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(6).n(n).k(k).cn_stride(11);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps n over {16, 24} and k over {1, 6, 11, 16} with m=6 and a_stride=23.
  TEST(F32_GEMMINC_6X8S4__NEON, n_div_8_strided_a) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        GemmMicrokernelTester tester{};
        tester.mr(6).nr(8).kr(1).sr(4);
        tester.m(6).n(n).k(k).a_stride(23);
        tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
      }
    }
  }

  // Sweeps n over {16, 24}, k over {1, 6, 11, 16} and m over 1..6, one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, n_div_8_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (uint32_t n = 16; n <= 24; n += 8) {
      for (size_t k = 1; k <= 20; k += 5) {
        for (uint32_t m = 1; m <= 6; ++m) {
          GemmMicrokernelTester tester{};
          tester.mr(6).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
        }
      }
    }
  }

  // Sweeps k over {1, 6, 11, 16}, m over 1..6 and n over 1..8 with cm_stride=11,
  // one iteration per shape.
  TEST(F32_GEMMINC_6X8S4__NEON, strided_cm_subtile) {
    TEST_REQUIRES_ARM_NEON;
    for (size_t k = 1; k <= 20; k += 5) {
      for (uint32_t m = 1; m <= 6; ++m) {
        for (uint32_t n = 1; n <= 8; ++n) {
          GemmMicrokernelTester tester{};
          tester.mr(6).nr(8).kr(1).sr(4);
          tester.m(m).n(n).k(k).cm_stride(11).iterations(1);
          tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
        }
      }
    }
  }

  // Single run: m=6, n=8, k=4 with qmin set to 128.
  TEST(F32_GEMMINC_6X8S4__NEON, qmin) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(4);
    tester.m(6).n(8).k(4).qmin(128);
    tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
  }

  // Single run: m=6, n=8, k=4 with qmax set to 128.
  TEST(F32_GEMMINC_6X8S4__NEON, qmax) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(4);
    tester.m(6).n(8).k(4).qmax(128);
    tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
  }

  // Single run: m=6, n=8, k=4 with cm_stride=11.
  TEST(F32_GEMMINC_6X8S4__NEON, strided_cm) {
    TEST_REQUIRES_ARM_NEON;
    GemmMicrokernelTester tester{};
    tester.mr(6).nr(8).kr(1).sr(4);
    tester.m(6).n(8).k(4).cm_stride(11);
    tester.Test(xnn_f32_gemminc_ukernel_6x8s4__neon);
  }
#endif  // XNN_ARCH_ARM || XNN_ARCH_ARM64
19715
19716
19717#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19718 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4) {
19719 TEST_REQUIRES_ARM_NEON;
19720 GemmMicrokernelTester()
19721 .mr(8)
19722 .nr(8)
19723 .kr(1)
19724 .sr(4)
19725 .m(8)
19726 .n(8)
19727 .k(4)
19728 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19729 }
19730
19731 TEST(F32_GEMMINC_8X8S4__NEON, strided_cn) {
19732 TEST_REQUIRES_ARM_NEON;
19733 GemmMicrokernelTester()
19734 .mr(8)
19735 .nr(8)
19736 .kr(1)
19737 .sr(4)
19738 .m(8)
19739 .n(8)
19740 .k(4)
19741 .cn_stride(11)
19742 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19743 }
19744
19745 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_strided_a) {
19746 TEST_REQUIRES_ARM_NEON;
19747 GemmMicrokernelTester()
19748 .mr(8)
19749 .nr(8)
19750 .kr(1)
19751 .sr(4)
19752 .m(8)
19753 .n(8)
19754 .k(4)
19755 .a_stride(7)
19756 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19757 }
19758
19759 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_subtile) {
19760 TEST_REQUIRES_ARM_NEON;
19761 for (uint32_t m = 1; m <= 8; m++) {
19762 for (uint32_t n = 1; n <= 8; n++) {
19763 GemmMicrokernelTester()
19764 .mr(8)
19765 .nr(8)
19766 .kr(1)
19767 .sr(4)
19768 .m(m)
19769 .n(n)
19770 .k(4)
19771 .iterations(1)
19772 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19773 }
19774 }
19775 }
19776
19777 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_subtile_m) {
19778 TEST_REQUIRES_ARM_NEON;
19779 for (uint32_t m = 1; m <= 8; m++) {
19780 GemmMicrokernelTester()
19781 .mr(8)
19782 .nr(8)
19783 .kr(1)
19784 .sr(4)
19785 .m(m)
19786 .n(8)
19787 .k(4)
19788 .iterations(1)
19789 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19790 }
19791 }
19792
19793 TEST(F32_GEMMINC_8X8S4__NEON, k_eq_4_subtile_n) {
19794 TEST_REQUIRES_ARM_NEON;
19795 for (uint32_t n = 1; n <= 8; n++) {
19796 GemmMicrokernelTester()
19797 .mr(8)
19798 .nr(8)
19799 .kr(1)
19800 .sr(4)
19801 .m(8)
19802 .n(n)
19803 .k(4)
19804 .iterations(1)
19805 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19806 }
19807 }
19808
19809 TEST(F32_GEMMINC_8X8S4__NEON, k_lt_4) {
19810 TEST_REQUIRES_ARM_NEON;
19811 for (size_t k = 1; k < 4; k++) {
19812 GemmMicrokernelTester()
19813 .mr(8)
19814 .nr(8)
19815 .kr(1)
19816 .sr(4)
19817 .m(8)
19818 .n(8)
19819 .k(k)
19820 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19821 }
19822 }
19823
19824 TEST(F32_GEMMINC_8X8S4__NEON, k_lt_4_strided_a) {
19825 TEST_REQUIRES_ARM_NEON;
19826 for (size_t k = 1; k < 4; k++) {
19827 GemmMicrokernelTester()
19828 .mr(8)
19829 .nr(8)
19830 .kr(1)
19831 .sr(4)
19832 .m(8)
19833 .n(8)
19834 .k(k)
19835 .a_stride(7)
19836 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19837 }
19838 }
19839
19840 TEST(F32_GEMMINC_8X8S4__NEON, k_lt_4_subtile) {
19841 TEST_REQUIRES_ARM_NEON;
19842 for (size_t k = 1; k < 4; k++) {
19843 for (uint32_t m = 1; m <= 8; m++) {
19844 for (uint32_t n = 1; n <= 8; n++) {
19845 GemmMicrokernelTester()
19846 .mr(8)
19847 .nr(8)
19848 .kr(1)
19849 .sr(4)
19850 .m(m)
19851 .n(n)
19852 .k(k)
19853 .iterations(1)
19854 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19855 }
19856 }
19857 }
19858 }
19859
19860 TEST(F32_GEMMINC_8X8S4__NEON, k_gt_4) {
19861 TEST_REQUIRES_ARM_NEON;
19862 for (size_t k = 5; k < 8; k++) {
19863 GemmMicrokernelTester()
19864 .mr(8)
19865 .nr(8)
19866 .kr(1)
19867 .sr(4)
19868 .m(8)
19869 .n(8)
19870 .k(k)
19871 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19872 }
19873 }
19874
19875 TEST(F32_GEMMINC_8X8S4__NEON, k_gt_4_strided_a) {
19876 TEST_REQUIRES_ARM_NEON;
19877 for (size_t k = 5; k < 8; k++) {
19878 GemmMicrokernelTester()
19879 .mr(8)
19880 .nr(8)
19881 .kr(1)
19882 .sr(4)
19883 .m(8)
19884 .n(8)
19885 .k(k)
19886 .a_stride(11)
19887 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19888 }
19889 }
19890
19891 TEST(F32_GEMMINC_8X8S4__NEON, k_gt_4_subtile) {
19892 TEST_REQUIRES_ARM_NEON;
19893 for (size_t k = 5; k < 8; k++) {
19894 for (uint32_t m = 1; m <= 8; m++) {
19895 for (uint32_t n = 1; n <= 8; n++) {
19896 GemmMicrokernelTester()
19897 .mr(8)
19898 .nr(8)
19899 .kr(1)
19900 .sr(4)
19901 .m(m)
19902 .n(n)
19903 .k(k)
19904 .iterations(1)
19905 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19906 }
19907 }
19908 }
19909 }
19910
19911 TEST(F32_GEMMINC_8X8S4__NEON, k_div_4) {
19912 TEST_REQUIRES_ARM_NEON;
19913 for (size_t k = 8; k <= 40; k += 4) {
19914 GemmMicrokernelTester()
19915 .mr(8)
19916 .nr(8)
19917 .kr(1)
19918 .sr(4)
19919 .m(8)
19920 .n(8)
19921 .k(k)
19922 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19923 }
19924 }
19925
19926 TEST(F32_GEMMINC_8X8S4__NEON, k_div_4_strided_a) {
19927 TEST_REQUIRES_ARM_NEON;
19928 for (size_t k = 8; k <= 40; k += 4) {
19929 GemmMicrokernelTester()
19930 .mr(8)
19931 .nr(8)
19932 .kr(1)
19933 .sr(4)
19934 .m(8)
19935 .n(8)
19936 .k(k)
19937 .a_stride(43)
19938 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19939 }
19940 }
19941
19942 TEST(F32_GEMMINC_8X8S4__NEON, k_div_4_subtile) {
19943 TEST_REQUIRES_ARM_NEON;
19944 for (size_t k = 8; k <= 40; k += 4) {
19945 for (uint32_t m = 1; m <= 8; m++) {
19946 for (uint32_t n = 1; n <= 8; n++) {
19947 GemmMicrokernelTester()
19948 .mr(8)
19949 .nr(8)
19950 .kr(1)
19951 .sr(4)
19952 .m(m)
19953 .n(n)
19954 .k(k)
19955 .iterations(1)
19956 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19957 }
19958 }
19959 }
19960 }
19961
19962 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8) {
19963 TEST_REQUIRES_ARM_NEON;
19964 for (uint32_t n = 9; n < 16; n++) {
19965 for (size_t k = 1; k <= 20; k += 5) {
19966 GemmMicrokernelTester()
19967 .mr(8)
19968 .nr(8)
19969 .kr(1)
19970 .sr(4)
19971 .m(8)
19972 .n(8)
19973 .k(k)
19974 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19975 }
19976 }
19977 }
19978
19979 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8_strided_cn) {
19980 TEST_REQUIRES_ARM_NEON;
19981 for (uint32_t n = 9; n < 16; n++) {
19982 for (size_t k = 1; k <= 20; k += 5) {
19983 GemmMicrokernelTester()
19984 .mr(8)
19985 .nr(8)
19986 .kr(1)
19987 .sr(4)
19988 .m(8)
19989 .n(8)
19990 .k(k)
19991 .cn_stride(11)
19992 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
19993 }
19994 }
19995 }
19996
19997 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8_strided_a) {
19998 TEST_REQUIRES_ARM_NEON;
19999 for (uint32_t n = 9; n < 16; n++) {
20000 for (size_t k = 1; k <= 20; k += 5) {
20001 GemmMicrokernelTester()
20002 .mr(8)
20003 .nr(8)
20004 .kr(1)
20005 .sr(4)
20006 .m(8)
20007 .n(n)
20008 .k(k)
20009 .a_stride(23)
20010 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20011 }
20012 }
20013 }
20014
20015 TEST(F32_GEMMINC_8X8S4__NEON, n_gt_8_subtile) {
20016 TEST_REQUIRES_ARM_NEON;
20017 for (uint32_t n = 9; n < 16; n++) {
20018 for (size_t k = 1; k <= 20; k += 5) {
20019 for (uint32_t m = 1; m <= 8; m++) {
20020 GemmMicrokernelTester()
20021 .mr(8)
20022 .nr(8)
20023 .kr(1)
20024 .sr(4)
20025 .m(m)
20026 .n(n)
20027 .k(k)
20028 .iterations(1)
20029 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20030 }
20031 }
20032 }
20033 }
20034
20035 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8) {
20036 TEST_REQUIRES_ARM_NEON;
20037 for (uint32_t n = 16; n <= 24; n += 8) {
20038 for (size_t k = 1; k <= 20; k += 5) {
20039 GemmMicrokernelTester()
20040 .mr(8)
20041 .nr(8)
20042 .kr(1)
20043 .sr(4)
20044 .m(8)
20045 .n(8)
20046 .k(k)
20047 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20048 }
20049 }
20050 }
20051
20052 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8_strided_cn) {
20053 TEST_REQUIRES_ARM_NEON;
20054 for (uint32_t n = 16; n <= 24; n += 8) {
20055 for (size_t k = 1; k <= 20; k += 5) {
20056 GemmMicrokernelTester()
20057 .mr(8)
20058 .nr(8)
20059 .kr(1)
20060 .sr(4)
20061 .m(8)
20062 .n(n)
20063 .k(k)
20064 .cn_stride(11)
20065 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20066 }
20067 }
20068 }
20069
20070 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8_strided_a) {
20071 TEST_REQUIRES_ARM_NEON;
20072 for (uint32_t n = 16; n <= 24; n += 8) {
20073 for (size_t k = 1; k <= 20; k += 5) {
20074 GemmMicrokernelTester()
20075 .mr(8)
20076 .nr(8)
20077 .kr(1)
20078 .sr(4)
20079 .m(8)
20080 .n(n)
20081 .k(k)
20082 .a_stride(23)
20083 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20084 }
20085 }
20086 }
20087
20088 TEST(F32_GEMMINC_8X8S4__NEON, n_div_8_subtile) {
20089 TEST_REQUIRES_ARM_NEON;
20090 for (uint32_t n = 16; n <= 24; n += 8) {
20091 for (size_t k = 1; k <= 20; k += 5) {
20092 for (uint32_t m = 1; m <= 8; m++) {
20093 GemmMicrokernelTester()
20094 .mr(8)
20095 .nr(8)
20096 .kr(1)
20097 .sr(4)
20098 .m(m)
20099 .n(n)
20100 .k(k)
20101 .iterations(1)
20102 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20103 }
20104 }
20105 }
20106 }
20107
20108 TEST(F32_GEMMINC_8X8S4__NEON, strided_cm_subtile) {
20109 TEST_REQUIRES_ARM_NEON;
20110 for (size_t k = 1; k <= 20; k += 5) {
20111 for (uint32_t m = 1; m <= 8; m++) {
20112 for (uint32_t n = 1; n <= 8; n++) {
20113 GemmMicrokernelTester()
20114 .mr(8)
20115 .nr(8)
20116 .kr(1)
20117 .sr(4)
20118 .m(m)
20119 .n(n)
20120 .k(k)
20121 .cm_stride(11)
20122 .iterations(1)
20123 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20124 }
20125 }
20126 }
20127 }
20128
20129 TEST(F32_GEMMINC_8X8S4__NEON, qmin) {
20130 TEST_REQUIRES_ARM_NEON;
20131 GemmMicrokernelTester()
20132 .mr(8)
20133 .nr(8)
20134 .kr(1)
20135 .sr(4)
20136 .m(8)
20137 .n(8)
20138 .k(4)
20139 .qmin(128)
20140 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20141 }
20142
20143 TEST(F32_GEMMINC_8X8S4__NEON, qmax) {
20144 TEST_REQUIRES_ARM_NEON;
20145 GemmMicrokernelTester()
20146 .mr(8)
20147 .nr(8)
20148 .kr(1)
20149 .sr(4)
20150 .m(8)
20151 .n(8)
20152 .k(4)
20153 .qmax(128)
20154 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20155 }
20156
20157 TEST(F32_GEMMINC_8X8S4__NEON, strided_cm) {
20158 TEST_REQUIRES_ARM_NEON;
20159 GemmMicrokernelTester()
20160 .mr(8)
20161 .nr(8)
20162 .kr(1)
20163 .sr(4)
20164 .m(8)
20165 .n(8)
20166 .k(4)
20167 .cm_stride(11)
20168 .Test(xnn_f32_gemminc_ukernel_8x8s4__neon);
20169 }
20170#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20171
20172
20173#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barcharddf06d802019-11-20 15:53:46 -080020174 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4) {
20175 TEST_REQUIRES_ARM_NEON_FMA;
20176 GemmMicrokernelTester()
20177 .mr(1)
20178 .nr(8)
20179 .kr(1)
20180 .sr(4)
20181 .m(1)
20182 .n(8)
20183 .k(4)
20184 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20185 }
20186
20187 TEST(F32_GEMMINC_1X8S4__NEONFMA, strided_cn) {
20188 TEST_REQUIRES_ARM_NEON_FMA;
20189 GemmMicrokernelTester()
20190 .mr(1)
20191 .nr(8)
20192 .kr(1)
20193 .sr(4)
20194 .m(1)
20195 .n(8)
20196 .k(4)
20197 .cn_stride(11)
20198 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20199 }
20200
20201 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_strided_a) {
20202 TEST_REQUIRES_ARM_NEON_FMA;
20203 GemmMicrokernelTester()
20204 .mr(1)
20205 .nr(8)
20206 .kr(1)
20207 .sr(4)
20208 .m(1)
20209 .n(8)
20210 .k(4)
20211 .a_stride(7)
20212 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20213 }
20214
20215 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_subtile) {
20216 TEST_REQUIRES_ARM_NEON_FMA;
20217 for (uint32_t m = 1; m <= 1; m++) {
20218 for (uint32_t n = 1; n <= 8; n++) {
20219 GemmMicrokernelTester()
20220 .mr(1)
20221 .nr(8)
20222 .kr(1)
20223 .sr(4)
20224 .m(m)
20225 .n(n)
20226 .k(4)
20227 .iterations(1)
20228 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20229 }
20230 }
20231 }
20232
20233 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_subtile_m) {
20234 TEST_REQUIRES_ARM_NEON_FMA;
20235 for (uint32_t m = 1; m <= 1; m++) {
20236 GemmMicrokernelTester()
20237 .mr(1)
20238 .nr(8)
20239 .kr(1)
20240 .sr(4)
20241 .m(m)
20242 .n(8)
20243 .k(4)
20244 .iterations(1)
20245 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20246 }
20247 }
20248
20249 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_eq_4_subtile_n) {
20250 TEST_REQUIRES_ARM_NEON_FMA;
20251 for (uint32_t n = 1; n <= 8; n++) {
20252 GemmMicrokernelTester()
20253 .mr(1)
20254 .nr(8)
20255 .kr(1)
20256 .sr(4)
20257 .m(1)
20258 .n(n)
20259 .k(4)
20260 .iterations(1)
20261 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20262 }
20263 }
20264
20265 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_lt_4) {
20266 TEST_REQUIRES_ARM_NEON_FMA;
20267 for (size_t k = 1; k < 4; k++) {
20268 GemmMicrokernelTester()
20269 .mr(1)
20270 .nr(8)
20271 .kr(1)
20272 .sr(4)
20273 .m(1)
20274 .n(8)
20275 .k(k)
20276 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20277 }
20278 }
20279
20280 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_lt_4_strided_a) {
20281 TEST_REQUIRES_ARM_NEON_FMA;
20282 for (size_t k = 1; k < 4; k++) {
20283 GemmMicrokernelTester()
20284 .mr(1)
20285 .nr(8)
20286 .kr(1)
20287 .sr(4)
20288 .m(1)
20289 .n(8)
20290 .k(k)
20291 .a_stride(7)
20292 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20293 }
20294 }
20295
20296 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_lt_4_subtile) {
20297 TEST_REQUIRES_ARM_NEON_FMA;
20298 for (size_t k = 1; k < 4; k++) {
20299 for (uint32_t m = 1; m <= 1; m++) {
20300 for (uint32_t n = 1; n <= 8; n++) {
20301 GemmMicrokernelTester()
20302 .mr(1)
20303 .nr(8)
20304 .kr(1)
20305 .sr(4)
20306 .m(m)
20307 .n(n)
20308 .k(k)
20309 .iterations(1)
20310 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20311 }
20312 }
20313 }
20314 }
20315
20316 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_gt_4) {
20317 TEST_REQUIRES_ARM_NEON_FMA;
20318 for (size_t k = 5; k < 8; k++) {
20319 GemmMicrokernelTester()
20320 .mr(1)
20321 .nr(8)
20322 .kr(1)
20323 .sr(4)
20324 .m(1)
20325 .n(8)
20326 .k(k)
20327 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20328 }
20329 }
20330
20331 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_gt_4_strided_a) {
20332 TEST_REQUIRES_ARM_NEON_FMA;
20333 for (size_t k = 5; k < 8; k++) {
20334 GemmMicrokernelTester()
20335 .mr(1)
20336 .nr(8)
20337 .kr(1)
20338 .sr(4)
20339 .m(1)
20340 .n(8)
20341 .k(k)
20342 .a_stride(11)
20343 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20344 }
20345 }
20346
20347 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_gt_4_subtile) {
20348 TEST_REQUIRES_ARM_NEON_FMA;
20349 for (size_t k = 5; k < 8; k++) {
20350 for (uint32_t m = 1; m <= 1; m++) {
20351 for (uint32_t n = 1; n <= 8; n++) {
20352 GemmMicrokernelTester()
20353 .mr(1)
20354 .nr(8)
20355 .kr(1)
20356 .sr(4)
20357 .m(m)
20358 .n(n)
20359 .k(k)
20360 .iterations(1)
20361 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20362 }
20363 }
20364 }
20365 }
20366
20367 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_div_4) {
20368 TEST_REQUIRES_ARM_NEON_FMA;
20369 for (size_t k = 8; k <= 40; k += 4) {
20370 GemmMicrokernelTester()
20371 .mr(1)
20372 .nr(8)
20373 .kr(1)
20374 .sr(4)
20375 .m(1)
20376 .n(8)
20377 .k(k)
20378 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20379 }
20380 }
20381
20382 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_div_4_strided_a) {
20383 TEST_REQUIRES_ARM_NEON_FMA;
20384 for (size_t k = 8; k <= 40; k += 4) {
20385 GemmMicrokernelTester()
20386 .mr(1)
20387 .nr(8)
20388 .kr(1)
20389 .sr(4)
20390 .m(1)
20391 .n(8)
20392 .k(k)
20393 .a_stride(43)
20394 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20395 }
20396 }
20397
20398 TEST(F32_GEMMINC_1X8S4__NEONFMA, k_div_4_subtile) {
20399 TEST_REQUIRES_ARM_NEON_FMA;
20400 for (size_t k = 8; k <= 40; k += 4) {
20401 for (uint32_t m = 1; m <= 1; m++) {
20402 for (uint32_t n = 1; n <= 8; n++) {
20403 GemmMicrokernelTester()
20404 .mr(1)
20405 .nr(8)
20406 .kr(1)
20407 .sr(4)
20408 .m(m)
20409 .n(n)
20410 .k(k)
20411 .iterations(1)
20412 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20413 }
20414 }
20415 }
20416 }
20417
20418 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8) {
20419 TEST_REQUIRES_ARM_NEON_FMA;
20420 for (uint32_t n = 9; n < 16; n++) {
20421 for (size_t k = 1; k <= 20; k += 5) {
20422 GemmMicrokernelTester()
20423 .mr(1)
20424 .nr(8)
20425 .kr(1)
20426 .sr(4)
20427 .m(1)
20428 .n(8)
20429 .k(k)
20430 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20431 }
20432 }
20433 }
20434
20435 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8_strided_cn) {
20436 TEST_REQUIRES_ARM_NEON_FMA;
20437 for (uint32_t n = 9; n < 16; n++) {
20438 for (size_t k = 1; k <= 20; k += 5) {
20439 GemmMicrokernelTester()
20440 .mr(1)
20441 .nr(8)
20442 .kr(1)
20443 .sr(4)
20444 .m(1)
20445 .n(8)
20446 .k(k)
20447 .cn_stride(11)
20448 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20449 }
20450 }
20451 }
20452
20453 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8_strided_a) {
20454 TEST_REQUIRES_ARM_NEON_FMA;
20455 for (uint32_t n = 9; n < 16; n++) {
20456 for (size_t k = 1; k <= 20; k += 5) {
20457 GemmMicrokernelTester()
20458 .mr(1)
20459 .nr(8)
20460 .kr(1)
20461 .sr(4)
20462 .m(1)
20463 .n(n)
20464 .k(k)
20465 .a_stride(23)
20466 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20467 }
20468 }
20469 }
20470
20471 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_gt_8_subtile) {
20472 TEST_REQUIRES_ARM_NEON_FMA;
20473 for (uint32_t n = 9; n < 16; n++) {
20474 for (size_t k = 1; k <= 20; k += 5) {
20475 for (uint32_t m = 1; m <= 1; m++) {
20476 GemmMicrokernelTester()
20477 .mr(1)
20478 .nr(8)
20479 .kr(1)
20480 .sr(4)
20481 .m(m)
20482 .n(n)
20483 .k(k)
20484 .iterations(1)
20485 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20486 }
20487 }
20488 }
20489 }
20490
20491 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8) {
20492 TEST_REQUIRES_ARM_NEON_FMA;
20493 for (uint32_t n = 16; n <= 24; n += 8) {
20494 for (size_t k = 1; k <= 20; k += 5) {
20495 GemmMicrokernelTester()
20496 .mr(1)
20497 .nr(8)
20498 .kr(1)
20499 .sr(4)
20500 .m(1)
20501 .n(8)
20502 .k(k)
20503 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20504 }
20505 }
20506 }
20507
20508 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8_strided_cn) {
20509 TEST_REQUIRES_ARM_NEON_FMA;
20510 for (uint32_t n = 16; n <= 24; n += 8) {
20511 for (size_t k = 1; k <= 20; k += 5) {
20512 GemmMicrokernelTester()
20513 .mr(1)
20514 .nr(8)
20515 .kr(1)
20516 .sr(4)
20517 .m(1)
20518 .n(n)
20519 .k(k)
20520 .cn_stride(11)
20521 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20522 }
20523 }
20524 }
20525
20526 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8_strided_a) {
20527 TEST_REQUIRES_ARM_NEON_FMA;
20528 for (uint32_t n = 16; n <= 24; n += 8) {
20529 for (size_t k = 1; k <= 20; k += 5) {
20530 GemmMicrokernelTester()
20531 .mr(1)
20532 .nr(8)
20533 .kr(1)
20534 .sr(4)
20535 .m(1)
20536 .n(n)
20537 .k(k)
20538 .a_stride(23)
20539 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20540 }
20541 }
20542 }
20543
20544 TEST(F32_GEMMINC_1X8S4__NEONFMA, n_div_8_subtile) {
20545 TEST_REQUIRES_ARM_NEON_FMA;
20546 for (uint32_t n = 16; n <= 24; n += 8) {
20547 for (size_t k = 1; k <= 20; k += 5) {
20548 for (uint32_t m = 1; m <= 1; m++) {
20549 GemmMicrokernelTester()
20550 .mr(1)
20551 .nr(8)
20552 .kr(1)
20553 .sr(4)
20554 .m(m)
20555 .n(n)
20556 .k(k)
20557 .iterations(1)
20558 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20559 }
20560 }
20561 }
20562 }
20563
20564 TEST(F32_GEMMINC_1X8S4__NEONFMA, strided_cm_subtile) {
20565 TEST_REQUIRES_ARM_NEON_FMA;
20566 for (size_t k = 1; k <= 20; k += 5) {
20567 for (uint32_t m = 1; m <= 1; m++) {
20568 for (uint32_t n = 1; n <= 8; n++) {
20569 GemmMicrokernelTester()
20570 .mr(1)
20571 .nr(8)
20572 .kr(1)
20573 .sr(4)
20574 .m(m)
20575 .n(n)
20576 .k(k)
20577 .cm_stride(11)
20578 .iterations(1)
20579 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20580 }
20581 }
20582 }
20583 }
20584
20585 TEST(F32_GEMMINC_1X8S4__NEONFMA, qmin) {
20586 TEST_REQUIRES_ARM_NEON_FMA;
20587 GemmMicrokernelTester()
20588 .mr(1)
20589 .nr(8)
20590 .kr(1)
20591 .sr(4)
20592 .m(1)
20593 .n(8)
20594 .k(4)
20595 .qmin(128)
20596 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20597 }
20598
20599 TEST(F32_GEMMINC_1X8S4__NEONFMA, qmax) {
20600 TEST_REQUIRES_ARM_NEON_FMA;
20601 GemmMicrokernelTester()
20602 .mr(1)
20603 .nr(8)
20604 .kr(1)
20605 .sr(4)
20606 .m(1)
20607 .n(8)
20608 .k(4)
20609 .qmax(128)
20610 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20611 }
20612
20613 TEST(F32_GEMMINC_1X8S4__NEONFMA, strided_cm) {
20614 TEST_REQUIRES_ARM_NEON_FMA;
20615 GemmMicrokernelTester()
20616 .mr(1)
20617 .nr(8)
20618 .kr(1)
20619 .sr(4)
20620 .m(1)
20621 .n(8)
20622 .k(4)
20623 .cm_stride(11)
20624 .Test(xnn_f32_gemminc_ukernel_1x8s4__neonfma);
20625 }
20626#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20627
20628
20629#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20630 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4) {
20631 TEST_REQUIRES_ARM_NEON_FMA;
20632 GemmMicrokernelTester()
20633 .mr(4)
20634 .nr(8)
20635 .kr(1)
20636 .sr(4)
20637 .m(4)
20638 .n(8)
20639 .k(4)
20640 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20641 }
20642
20643 TEST(F32_GEMMINC_4X8S4__NEONFMA, strided_cn) {
20644 TEST_REQUIRES_ARM_NEON_FMA;
20645 GemmMicrokernelTester()
20646 .mr(4)
20647 .nr(8)
20648 .kr(1)
20649 .sr(4)
20650 .m(4)
20651 .n(8)
20652 .k(4)
20653 .cn_stride(11)
20654 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20655 }
20656
20657 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_strided_a) {
20658 TEST_REQUIRES_ARM_NEON_FMA;
20659 GemmMicrokernelTester()
20660 .mr(4)
20661 .nr(8)
20662 .kr(1)
20663 .sr(4)
20664 .m(4)
20665 .n(8)
20666 .k(4)
20667 .a_stride(7)
20668 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20669 }
20670
20671 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_subtile) {
20672 TEST_REQUIRES_ARM_NEON_FMA;
20673 for (uint32_t m = 1; m <= 4; m++) {
20674 for (uint32_t n = 1; n <= 8; n++) {
20675 GemmMicrokernelTester()
20676 .mr(4)
20677 .nr(8)
20678 .kr(1)
20679 .sr(4)
20680 .m(m)
20681 .n(n)
20682 .k(4)
20683 .iterations(1)
20684 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20685 }
20686 }
20687 }
20688
20689 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_subtile_m) {
20690 TEST_REQUIRES_ARM_NEON_FMA;
20691 for (uint32_t m = 1; m <= 4; m++) {
20692 GemmMicrokernelTester()
20693 .mr(4)
20694 .nr(8)
20695 .kr(1)
20696 .sr(4)
20697 .m(m)
20698 .n(8)
20699 .k(4)
20700 .iterations(1)
20701 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20702 }
20703 }
20704
20705 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_eq_4_subtile_n) {
20706 TEST_REQUIRES_ARM_NEON_FMA;
20707 for (uint32_t n = 1; n <= 8; n++) {
20708 GemmMicrokernelTester()
20709 .mr(4)
20710 .nr(8)
20711 .kr(1)
20712 .sr(4)
20713 .m(4)
20714 .n(n)
20715 .k(4)
20716 .iterations(1)
20717 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20718 }
20719 }
20720
20721 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_lt_4) {
20722 TEST_REQUIRES_ARM_NEON_FMA;
20723 for (size_t k = 1; k < 4; k++) {
20724 GemmMicrokernelTester()
20725 .mr(4)
20726 .nr(8)
20727 .kr(1)
20728 .sr(4)
20729 .m(4)
20730 .n(8)
20731 .k(k)
20732 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20733 }
20734 }
20735
20736 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_lt_4_strided_a) {
20737 TEST_REQUIRES_ARM_NEON_FMA;
20738 for (size_t k = 1; k < 4; k++) {
20739 GemmMicrokernelTester()
20740 .mr(4)
20741 .nr(8)
20742 .kr(1)
20743 .sr(4)
20744 .m(4)
20745 .n(8)
20746 .k(k)
20747 .a_stride(7)
20748 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20749 }
20750 }
20751
20752 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_lt_4_subtile) {
20753 TEST_REQUIRES_ARM_NEON_FMA;
20754 for (size_t k = 1; k < 4; k++) {
20755 for (uint32_t m = 1; m <= 4; m++) {
20756 for (uint32_t n = 1; n <= 8; n++) {
20757 GemmMicrokernelTester()
20758 .mr(4)
20759 .nr(8)
20760 .kr(1)
20761 .sr(4)
20762 .m(m)
20763 .n(n)
20764 .k(k)
20765 .iterations(1)
20766 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20767 }
20768 }
20769 }
20770 }
20771
20772 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_gt_4) {
20773 TEST_REQUIRES_ARM_NEON_FMA;
20774 for (size_t k = 5; k < 8; k++) {
20775 GemmMicrokernelTester()
20776 .mr(4)
20777 .nr(8)
20778 .kr(1)
20779 .sr(4)
20780 .m(4)
20781 .n(8)
20782 .k(k)
20783 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20784 }
20785 }
20786
20787 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_gt_4_strided_a) {
20788 TEST_REQUIRES_ARM_NEON_FMA;
20789 for (size_t k = 5; k < 8; k++) {
20790 GemmMicrokernelTester()
20791 .mr(4)
20792 .nr(8)
20793 .kr(1)
20794 .sr(4)
20795 .m(4)
20796 .n(8)
20797 .k(k)
20798 .a_stride(11)
20799 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20800 }
20801 }
20802
20803 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_gt_4_subtile) {
20804 TEST_REQUIRES_ARM_NEON_FMA;
20805 for (size_t k = 5; k < 8; k++) {
20806 for (uint32_t m = 1; m <= 4; m++) {
20807 for (uint32_t n = 1; n <= 8; n++) {
20808 GemmMicrokernelTester()
20809 .mr(4)
20810 .nr(8)
20811 .kr(1)
20812 .sr(4)
20813 .m(m)
20814 .n(n)
20815 .k(k)
20816 .iterations(1)
20817 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20818 }
20819 }
20820 }
20821 }
20822
20823 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_div_4) {
20824 TEST_REQUIRES_ARM_NEON_FMA;
20825 for (size_t k = 8; k <= 40; k += 4) {
20826 GemmMicrokernelTester()
20827 .mr(4)
20828 .nr(8)
20829 .kr(1)
20830 .sr(4)
20831 .m(4)
20832 .n(8)
20833 .k(k)
20834 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20835 }
20836 }
20837
20838 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_div_4_strided_a) {
20839 TEST_REQUIRES_ARM_NEON_FMA;
20840 for (size_t k = 8; k <= 40; k += 4) {
20841 GemmMicrokernelTester()
20842 .mr(4)
20843 .nr(8)
20844 .kr(1)
20845 .sr(4)
20846 .m(4)
20847 .n(8)
20848 .k(k)
20849 .a_stride(43)
20850 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20851 }
20852 }
20853
20854 TEST(F32_GEMMINC_4X8S4__NEONFMA, k_div_4_subtile) {
20855 TEST_REQUIRES_ARM_NEON_FMA;
20856 for (size_t k = 8; k <= 40; k += 4) {
20857 for (uint32_t m = 1; m <= 4; m++) {
20858 for (uint32_t n = 1; n <= 8; n++) {
20859 GemmMicrokernelTester()
20860 .mr(4)
20861 .nr(8)
20862 .kr(1)
20863 .sr(4)
20864 .m(m)
20865 .n(n)
20866 .k(k)
20867 .iterations(1)
20868 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20869 }
20870 }
20871 }
20872 }
20873
20874 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8) {
20875 TEST_REQUIRES_ARM_NEON_FMA;
20876 for (uint32_t n = 9; n < 16; n++) {
20877 for (size_t k = 1; k <= 20; k += 5) {
20878 GemmMicrokernelTester()
20879 .mr(4)
20880 .nr(8)
20881 .kr(1)
20882 .sr(4)
20883 .m(4)
20884 .n(8)
20885 .k(k)
20886 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20887 }
20888 }
20889 }
20890
20891 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8_strided_cn) {
20892 TEST_REQUIRES_ARM_NEON_FMA;
20893 for (uint32_t n = 9; n < 16; n++) {
20894 for (size_t k = 1; k <= 20; k += 5) {
20895 GemmMicrokernelTester()
20896 .mr(4)
20897 .nr(8)
20898 .kr(1)
20899 .sr(4)
20900 .m(4)
20901 .n(8)
20902 .k(k)
20903 .cn_stride(11)
20904 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20905 }
20906 }
20907 }
20908
20909 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8_strided_a) {
20910 TEST_REQUIRES_ARM_NEON_FMA;
20911 for (uint32_t n = 9; n < 16; n++) {
20912 for (size_t k = 1; k <= 20; k += 5) {
20913 GemmMicrokernelTester()
20914 .mr(4)
20915 .nr(8)
20916 .kr(1)
20917 .sr(4)
20918 .m(4)
20919 .n(n)
20920 .k(k)
20921 .a_stride(23)
20922 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20923 }
20924 }
20925 }
20926
20927 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_gt_8_subtile) {
20928 TEST_REQUIRES_ARM_NEON_FMA;
20929 for (uint32_t n = 9; n < 16; n++) {
20930 for (size_t k = 1; k <= 20; k += 5) {
20931 for (uint32_t m = 1; m <= 4; m++) {
20932 GemmMicrokernelTester()
20933 .mr(4)
20934 .nr(8)
20935 .kr(1)
20936 .sr(4)
20937 .m(m)
20938 .n(n)
20939 .k(k)
20940 .iterations(1)
20941 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20942 }
20943 }
20944 }
20945 }
20946
20947 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8) {
20948 TEST_REQUIRES_ARM_NEON_FMA;
20949 for (uint32_t n = 16; n <= 24; n += 8) {
20950 for (size_t k = 1; k <= 20; k += 5) {
20951 GemmMicrokernelTester()
20952 .mr(4)
20953 .nr(8)
20954 .kr(1)
20955 .sr(4)
20956 .m(4)
20957 .n(8)
20958 .k(k)
20959 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20960 }
20961 }
20962 }
20963
20964 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8_strided_cn) {
20965 TEST_REQUIRES_ARM_NEON_FMA;
20966 for (uint32_t n = 16; n <= 24; n += 8) {
20967 for (size_t k = 1; k <= 20; k += 5) {
20968 GemmMicrokernelTester()
20969 .mr(4)
20970 .nr(8)
20971 .kr(1)
20972 .sr(4)
20973 .m(4)
20974 .n(n)
20975 .k(k)
20976 .cn_stride(11)
20977 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20978 }
20979 }
20980 }
20981
20982 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8_strided_a) {
20983 TEST_REQUIRES_ARM_NEON_FMA;
20984 for (uint32_t n = 16; n <= 24; n += 8) {
20985 for (size_t k = 1; k <= 20; k += 5) {
20986 GemmMicrokernelTester()
20987 .mr(4)
20988 .nr(8)
20989 .kr(1)
20990 .sr(4)
20991 .m(4)
20992 .n(n)
20993 .k(k)
20994 .a_stride(23)
20995 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
20996 }
20997 }
20998 }
20999
21000 TEST(F32_GEMMINC_4X8S4__NEONFMA, n_div_8_subtile) {
21001 TEST_REQUIRES_ARM_NEON_FMA;
21002 for (uint32_t n = 16; n <= 24; n += 8) {
21003 for (size_t k = 1; k <= 20; k += 5) {
21004 for (uint32_t m = 1; m <= 4; m++) {
21005 GemmMicrokernelTester()
21006 .mr(4)
21007 .nr(8)
21008 .kr(1)
21009 .sr(4)
21010 .m(m)
21011 .n(n)
21012 .k(k)
21013 .iterations(1)
21014 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
21015 }
21016 }
21017 }
21018 }
21019
21020 TEST(F32_GEMMINC_4X8S4__NEONFMA, strided_cm_subtile) {
21021 TEST_REQUIRES_ARM_NEON_FMA;
21022 for (size_t k = 1; k <= 20; k += 5) {
21023 for (uint32_t m = 1; m <= 4; m++) {
21024 for (uint32_t n = 1; n <= 8; n++) {
21025 GemmMicrokernelTester()
21026 .mr(4)
21027 .nr(8)
21028 .kr(1)
21029 .sr(4)
21030 .m(m)
21031 .n(n)
21032 .k(k)
21033 .cm_stride(11)
21034 .iterations(1)
21035 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
21036 }
21037 }
21038 }
21039 }
21040
21041 TEST(F32_GEMMINC_4X8S4__NEONFMA, qmin) {
21042 TEST_REQUIRES_ARM_NEON_FMA;
21043 GemmMicrokernelTester()
21044 .mr(4)
21045 .nr(8)
21046 .kr(1)
21047 .sr(4)
21048 .m(4)
21049 .n(8)
21050 .k(4)
21051 .qmin(128)
21052 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
21053 }
21054
21055 TEST(F32_GEMMINC_4X8S4__NEONFMA, qmax) {
21056 TEST_REQUIRES_ARM_NEON_FMA;
21057 GemmMicrokernelTester()
21058 .mr(4)
21059 .nr(8)
21060 .kr(1)
21061 .sr(4)
21062 .m(4)
21063 .n(8)
21064 .k(4)
21065 .qmax(128)
21066 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
21067 }
21068
21069 TEST(F32_GEMMINC_4X8S4__NEONFMA, strided_cm) {
21070 TEST_REQUIRES_ARM_NEON_FMA;
21071 GemmMicrokernelTester()
21072 .mr(4)
21073 .nr(8)
21074 .kr(1)
21075 .sr(4)
21076 .m(4)
21077 .n(8)
21078 .k(4)
21079 .cm_stride(11)
21080 .Test(xnn_f32_gemminc_ukernel_4x8s4__neonfma);
21081 }
21082#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21083
21084
21085#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21086 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4) {
21087 TEST_REQUIRES_ARM_NEON_FMA;
21088 GemmMicrokernelTester()
21089 .mr(6)
21090 .nr(8)
21091 .kr(1)
21092 .sr(4)
21093 .m(6)
21094 .n(8)
21095 .k(4)
21096 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21097 }
21098
21099 TEST(F32_GEMMINC_6X8S4__NEONFMA, strided_cn) {
21100 TEST_REQUIRES_ARM_NEON_FMA;
21101 GemmMicrokernelTester()
21102 .mr(6)
21103 .nr(8)
21104 .kr(1)
21105 .sr(4)
21106 .m(6)
21107 .n(8)
21108 .k(4)
21109 .cn_stride(11)
21110 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21111 }
21112
21113 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_strided_a) {
21114 TEST_REQUIRES_ARM_NEON_FMA;
21115 GemmMicrokernelTester()
21116 .mr(6)
21117 .nr(8)
21118 .kr(1)
21119 .sr(4)
21120 .m(6)
21121 .n(8)
21122 .k(4)
21123 .a_stride(7)
21124 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21125 }
21126
21127 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_subtile) {
21128 TEST_REQUIRES_ARM_NEON_FMA;
21129 for (uint32_t m = 1; m <= 6; m++) {
21130 for (uint32_t n = 1; n <= 8; n++) {
21131 GemmMicrokernelTester()
21132 .mr(6)
21133 .nr(8)
21134 .kr(1)
21135 .sr(4)
21136 .m(m)
21137 .n(n)
21138 .k(4)
21139 .iterations(1)
21140 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21141 }
21142 }
21143 }
21144
21145 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_subtile_m) {
21146 TEST_REQUIRES_ARM_NEON_FMA;
21147 for (uint32_t m = 1; m <= 6; m++) {
21148 GemmMicrokernelTester()
21149 .mr(6)
21150 .nr(8)
21151 .kr(1)
21152 .sr(4)
21153 .m(m)
21154 .n(8)
21155 .k(4)
21156 .iterations(1)
21157 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21158 }
21159 }
21160
21161 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_eq_4_subtile_n) {
21162 TEST_REQUIRES_ARM_NEON_FMA;
21163 for (uint32_t n = 1; n <= 8; n++) {
21164 GemmMicrokernelTester()
21165 .mr(6)
21166 .nr(8)
21167 .kr(1)
21168 .sr(4)
21169 .m(6)
21170 .n(n)
21171 .k(4)
21172 .iterations(1)
21173 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21174 }
21175 }
21176
21177 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_lt_4) {
21178 TEST_REQUIRES_ARM_NEON_FMA;
21179 for (size_t k = 1; k < 4; k++) {
21180 GemmMicrokernelTester()
21181 .mr(6)
21182 .nr(8)
21183 .kr(1)
21184 .sr(4)
21185 .m(6)
21186 .n(8)
21187 .k(k)
21188 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21189 }
21190 }
21191
21192 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_lt_4_strided_a) {
21193 TEST_REQUIRES_ARM_NEON_FMA;
21194 for (size_t k = 1; k < 4; k++) {
21195 GemmMicrokernelTester()
21196 .mr(6)
21197 .nr(8)
21198 .kr(1)
21199 .sr(4)
21200 .m(6)
21201 .n(8)
21202 .k(k)
21203 .a_stride(7)
21204 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21205 }
21206 }
21207
21208 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_lt_4_subtile) {
21209 TEST_REQUIRES_ARM_NEON_FMA;
21210 for (size_t k = 1; k < 4; k++) {
21211 for (uint32_t m = 1; m <= 6; m++) {
21212 for (uint32_t n = 1; n <= 8; n++) {
21213 GemmMicrokernelTester()
21214 .mr(6)
21215 .nr(8)
21216 .kr(1)
21217 .sr(4)
21218 .m(m)
21219 .n(n)
21220 .k(k)
21221 .iterations(1)
21222 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21223 }
21224 }
21225 }
21226 }
21227
21228 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_gt_4) {
21229 TEST_REQUIRES_ARM_NEON_FMA;
21230 for (size_t k = 5; k < 8; k++) {
21231 GemmMicrokernelTester()
21232 .mr(6)
21233 .nr(8)
21234 .kr(1)
21235 .sr(4)
21236 .m(6)
21237 .n(8)
21238 .k(k)
21239 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21240 }
21241 }
21242
21243 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_gt_4_strided_a) {
21244 TEST_REQUIRES_ARM_NEON_FMA;
21245 for (size_t k = 5; k < 8; k++) {
21246 GemmMicrokernelTester()
21247 .mr(6)
21248 .nr(8)
21249 .kr(1)
21250 .sr(4)
21251 .m(6)
21252 .n(8)
21253 .k(k)
21254 .a_stride(11)
21255 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21256 }
21257 }
21258
21259 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_gt_4_subtile) {
21260 TEST_REQUIRES_ARM_NEON_FMA;
21261 for (size_t k = 5; k < 8; k++) {
21262 for (uint32_t m = 1; m <= 6; m++) {
21263 for (uint32_t n = 1; n <= 8; n++) {
21264 GemmMicrokernelTester()
21265 .mr(6)
21266 .nr(8)
21267 .kr(1)
21268 .sr(4)
21269 .m(m)
21270 .n(n)
21271 .k(k)
21272 .iterations(1)
21273 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21274 }
21275 }
21276 }
21277 }
21278
21279 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_div_4) {
21280 TEST_REQUIRES_ARM_NEON_FMA;
21281 for (size_t k = 8; k <= 40; k += 4) {
21282 GemmMicrokernelTester()
21283 .mr(6)
21284 .nr(8)
21285 .kr(1)
21286 .sr(4)
21287 .m(6)
21288 .n(8)
21289 .k(k)
21290 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21291 }
21292 }
21293
21294 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_div_4_strided_a) {
21295 TEST_REQUIRES_ARM_NEON_FMA;
21296 for (size_t k = 8; k <= 40; k += 4) {
21297 GemmMicrokernelTester()
21298 .mr(6)
21299 .nr(8)
21300 .kr(1)
21301 .sr(4)
21302 .m(6)
21303 .n(8)
21304 .k(k)
21305 .a_stride(43)
21306 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21307 }
21308 }
21309
21310 TEST(F32_GEMMINC_6X8S4__NEONFMA, k_div_4_subtile) {
21311 TEST_REQUIRES_ARM_NEON_FMA;
21312 for (size_t k = 8; k <= 40; k += 4) {
21313 for (uint32_t m = 1; m <= 6; m++) {
21314 for (uint32_t n = 1; n <= 8; n++) {
21315 GemmMicrokernelTester()
21316 .mr(6)
21317 .nr(8)
21318 .kr(1)
21319 .sr(4)
21320 .m(m)
21321 .n(n)
21322 .k(k)
21323 .iterations(1)
21324 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21325 }
21326 }
21327 }
21328 }
21329
21330 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8) {
21331 TEST_REQUIRES_ARM_NEON_FMA;
21332 for (uint32_t n = 9; n < 16; n++) {
21333 for (size_t k = 1; k <= 20; k += 5) {
21334 GemmMicrokernelTester()
21335 .mr(6)
21336 .nr(8)
21337 .kr(1)
21338 .sr(4)
21339 .m(6)
21340 .n(8)
21341 .k(k)
21342 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21343 }
21344 }
21345 }
21346
21347 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8_strided_cn) {
21348 TEST_REQUIRES_ARM_NEON_FMA;
21349 for (uint32_t n = 9; n < 16; n++) {
21350 for (size_t k = 1; k <= 20; k += 5) {
21351 GemmMicrokernelTester()
21352 .mr(6)
21353 .nr(8)
21354 .kr(1)
21355 .sr(4)
21356 .m(6)
21357 .n(8)
21358 .k(k)
21359 .cn_stride(11)
21360 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21361 }
21362 }
21363 }
21364
21365 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8_strided_a) {
21366 TEST_REQUIRES_ARM_NEON_FMA;
21367 for (uint32_t n = 9; n < 16; n++) {
21368 for (size_t k = 1; k <= 20; k += 5) {
21369 GemmMicrokernelTester()
21370 .mr(6)
21371 .nr(8)
21372 .kr(1)
21373 .sr(4)
21374 .m(6)
21375 .n(n)
21376 .k(k)
21377 .a_stride(23)
21378 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21379 }
21380 }
21381 }
21382
21383 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_gt_8_subtile) {
21384 TEST_REQUIRES_ARM_NEON_FMA;
21385 for (uint32_t n = 9; n < 16; n++) {
21386 for (size_t k = 1; k <= 20; k += 5) {
21387 for (uint32_t m = 1; m <= 6; m++) {
21388 GemmMicrokernelTester()
21389 .mr(6)
21390 .nr(8)
21391 .kr(1)
21392 .sr(4)
21393 .m(m)
21394 .n(n)
21395 .k(k)
21396 .iterations(1)
21397 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21398 }
21399 }
21400 }
21401 }
21402
21403 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8) {
21404 TEST_REQUIRES_ARM_NEON_FMA;
21405 for (uint32_t n = 16; n <= 24; n += 8) {
21406 for (size_t k = 1; k <= 20; k += 5) {
21407 GemmMicrokernelTester()
21408 .mr(6)
21409 .nr(8)
21410 .kr(1)
21411 .sr(4)
21412 .m(6)
21413 .n(8)
21414 .k(k)
21415 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21416 }
21417 }
21418 }
21419
21420 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8_strided_cn) {
21421 TEST_REQUIRES_ARM_NEON_FMA;
21422 for (uint32_t n = 16; n <= 24; n += 8) {
21423 for (size_t k = 1; k <= 20; k += 5) {
21424 GemmMicrokernelTester()
21425 .mr(6)
21426 .nr(8)
21427 .kr(1)
21428 .sr(4)
21429 .m(6)
21430 .n(n)
21431 .k(k)
21432 .cn_stride(11)
21433 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21434 }
21435 }
21436 }
21437
21438 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8_strided_a) {
21439 TEST_REQUIRES_ARM_NEON_FMA;
21440 for (uint32_t n = 16; n <= 24; n += 8) {
21441 for (size_t k = 1; k <= 20; k += 5) {
21442 GemmMicrokernelTester()
21443 .mr(6)
21444 .nr(8)
21445 .kr(1)
21446 .sr(4)
21447 .m(6)
21448 .n(n)
21449 .k(k)
21450 .a_stride(23)
21451 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21452 }
21453 }
21454 }
21455
21456 TEST(F32_GEMMINC_6X8S4__NEONFMA, n_div_8_subtile) {
21457 TEST_REQUIRES_ARM_NEON_FMA;
21458 for (uint32_t n = 16; n <= 24; n += 8) {
21459 for (size_t k = 1; k <= 20; k += 5) {
21460 for (uint32_t m = 1; m <= 6; m++) {
21461 GemmMicrokernelTester()
21462 .mr(6)
21463 .nr(8)
21464 .kr(1)
21465 .sr(4)
21466 .m(m)
21467 .n(n)
21468 .k(k)
21469 .iterations(1)
21470 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21471 }
21472 }
21473 }
21474 }
21475
21476 TEST(F32_GEMMINC_6X8S4__NEONFMA, strided_cm_subtile) {
21477 TEST_REQUIRES_ARM_NEON_FMA;
21478 for (size_t k = 1; k <= 20; k += 5) {
21479 for (uint32_t m = 1; m <= 6; m++) {
21480 for (uint32_t n = 1; n <= 8; n++) {
21481 GemmMicrokernelTester()
21482 .mr(6)
21483 .nr(8)
21484 .kr(1)
21485 .sr(4)
21486 .m(m)
21487 .n(n)
21488 .k(k)
21489 .cm_stride(11)
21490 .iterations(1)
21491 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21492 }
21493 }
21494 }
21495 }
21496
21497 TEST(F32_GEMMINC_6X8S4__NEONFMA, qmin) {
21498 TEST_REQUIRES_ARM_NEON_FMA;
21499 GemmMicrokernelTester()
21500 .mr(6)
21501 .nr(8)
21502 .kr(1)
21503 .sr(4)
21504 .m(6)
21505 .n(8)
21506 .k(4)
21507 .qmin(128)
21508 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21509 }
21510
21511 TEST(F32_GEMMINC_6X8S4__NEONFMA, qmax) {
21512 TEST_REQUIRES_ARM_NEON_FMA;
21513 GemmMicrokernelTester()
21514 .mr(6)
21515 .nr(8)
21516 .kr(1)
21517 .sr(4)
21518 .m(6)
21519 .n(8)
21520 .k(4)
21521 .qmax(128)
21522 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21523 }
21524
21525 TEST(F32_GEMMINC_6X8S4__NEONFMA, strided_cm) {
21526 TEST_REQUIRES_ARM_NEON_FMA;
21527 GemmMicrokernelTester()
21528 .mr(6)
21529 .nr(8)
21530 .kr(1)
21531 .sr(4)
21532 .m(6)
21533 .n(8)
21534 .k(4)
21535 .cm_stride(11)
21536 .Test(xnn_f32_gemminc_ukernel_6x8s4__neonfma);
21537 }
21538#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21539
21540
21541#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21542 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4) {
21543 TEST_REQUIRES_ARM_NEON_FMA;
21544 GemmMicrokernelTester()
21545 .mr(8)
21546 .nr(8)
21547 .kr(1)
21548 .sr(4)
21549 .m(8)
21550 .n(8)
21551 .k(4)
21552 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21553 }
21554
21555 TEST(F32_GEMMINC_8X8S4__NEONFMA, strided_cn) {
21556 TEST_REQUIRES_ARM_NEON_FMA;
21557 GemmMicrokernelTester()
21558 .mr(8)
21559 .nr(8)
21560 .kr(1)
21561 .sr(4)
21562 .m(8)
21563 .n(8)
21564 .k(4)
21565 .cn_stride(11)
21566 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21567 }
21568
21569 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_strided_a) {
21570 TEST_REQUIRES_ARM_NEON_FMA;
21571 GemmMicrokernelTester()
21572 .mr(8)
21573 .nr(8)
21574 .kr(1)
21575 .sr(4)
21576 .m(8)
21577 .n(8)
21578 .k(4)
21579 .a_stride(7)
21580 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21581 }
21582
21583 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_subtile) {
21584 TEST_REQUIRES_ARM_NEON_FMA;
21585 for (uint32_t m = 1; m <= 8; m++) {
21586 for (uint32_t n = 1; n <= 8; n++) {
21587 GemmMicrokernelTester()
21588 .mr(8)
21589 .nr(8)
21590 .kr(1)
21591 .sr(4)
21592 .m(m)
21593 .n(n)
21594 .k(4)
21595 .iterations(1)
21596 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21597 }
21598 }
21599 }
21600
21601 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_subtile_m) {
21602 TEST_REQUIRES_ARM_NEON_FMA;
21603 for (uint32_t m = 1; m <= 8; m++) {
21604 GemmMicrokernelTester()
21605 .mr(8)
21606 .nr(8)
21607 .kr(1)
21608 .sr(4)
21609 .m(m)
21610 .n(8)
21611 .k(4)
21612 .iterations(1)
21613 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21614 }
21615 }
21616
21617 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_eq_4_subtile_n) {
21618 TEST_REQUIRES_ARM_NEON_FMA;
21619 for (uint32_t n = 1; n <= 8; n++) {
21620 GemmMicrokernelTester()
21621 .mr(8)
21622 .nr(8)
21623 .kr(1)
21624 .sr(4)
21625 .m(8)
21626 .n(n)
21627 .k(4)
21628 .iterations(1)
21629 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21630 }
21631 }
21632
21633 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_lt_4) {
21634 TEST_REQUIRES_ARM_NEON_FMA;
21635 for (size_t k = 1; k < 4; k++) {
21636 GemmMicrokernelTester()
21637 .mr(8)
21638 .nr(8)
21639 .kr(1)
21640 .sr(4)
21641 .m(8)
21642 .n(8)
21643 .k(k)
21644 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21645 }
21646 }
21647
21648 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_lt_4_strided_a) {
21649 TEST_REQUIRES_ARM_NEON_FMA;
21650 for (size_t k = 1; k < 4; k++) {
21651 GemmMicrokernelTester()
21652 .mr(8)
21653 .nr(8)
21654 .kr(1)
21655 .sr(4)
21656 .m(8)
21657 .n(8)
21658 .k(k)
21659 .a_stride(7)
21660 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21661 }
21662 }
21663
21664 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_lt_4_subtile) {
21665 TEST_REQUIRES_ARM_NEON_FMA;
21666 for (size_t k = 1; k < 4; k++) {
21667 for (uint32_t m = 1; m <= 8; m++) {
21668 for (uint32_t n = 1; n <= 8; n++) {
21669 GemmMicrokernelTester()
21670 .mr(8)
21671 .nr(8)
21672 .kr(1)
21673 .sr(4)
21674 .m(m)
21675 .n(n)
21676 .k(k)
21677 .iterations(1)
21678 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21679 }
21680 }
21681 }
21682 }
21683
21684 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_gt_4) {
21685 TEST_REQUIRES_ARM_NEON_FMA;
21686 for (size_t k = 5; k < 8; k++) {
21687 GemmMicrokernelTester()
21688 .mr(8)
21689 .nr(8)
21690 .kr(1)
21691 .sr(4)
21692 .m(8)
21693 .n(8)
21694 .k(k)
21695 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21696 }
21697 }
21698
21699 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_gt_4_strided_a) {
21700 TEST_REQUIRES_ARM_NEON_FMA;
21701 for (size_t k = 5; k < 8; k++) {
21702 GemmMicrokernelTester()
21703 .mr(8)
21704 .nr(8)
21705 .kr(1)
21706 .sr(4)
21707 .m(8)
21708 .n(8)
21709 .k(k)
21710 .a_stride(11)
21711 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21712 }
21713 }
21714
21715 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_gt_4_subtile) {
21716 TEST_REQUIRES_ARM_NEON_FMA;
21717 for (size_t k = 5; k < 8; k++) {
21718 for (uint32_t m = 1; m <= 8; m++) {
21719 for (uint32_t n = 1; n <= 8; n++) {
21720 GemmMicrokernelTester()
21721 .mr(8)
21722 .nr(8)
21723 .kr(1)
21724 .sr(4)
21725 .m(m)
21726 .n(n)
21727 .k(k)
21728 .iterations(1)
21729 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21730 }
21731 }
21732 }
21733 }
21734
21735 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_div_4) {
21736 TEST_REQUIRES_ARM_NEON_FMA;
21737 for (size_t k = 8; k <= 40; k += 4) {
21738 GemmMicrokernelTester()
21739 .mr(8)
21740 .nr(8)
21741 .kr(1)
21742 .sr(4)
21743 .m(8)
21744 .n(8)
21745 .k(k)
21746 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21747 }
21748 }
21749
21750 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_div_4_strided_a) {
21751 TEST_REQUIRES_ARM_NEON_FMA;
21752 for (size_t k = 8; k <= 40; k += 4) {
21753 GemmMicrokernelTester()
21754 .mr(8)
21755 .nr(8)
21756 .kr(1)
21757 .sr(4)
21758 .m(8)
21759 .n(8)
21760 .k(k)
21761 .a_stride(43)
21762 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21763 }
21764 }
21765
21766 TEST(F32_GEMMINC_8X8S4__NEONFMA, k_div_4_subtile) {
21767 TEST_REQUIRES_ARM_NEON_FMA;
21768 for (size_t k = 8; k <= 40; k += 4) {
21769 for (uint32_t m = 1; m <= 8; m++) {
21770 for (uint32_t n = 1; n <= 8; n++) {
21771 GemmMicrokernelTester()
21772 .mr(8)
21773 .nr(8)
21774 .kr(1)
21775 .sr(4)
21776 .m(m)
21777 .n(n)
21778 .k(k)
21779 .iterations(1)
21780 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21781 }
21782 }
21783 }
21784 }
21785
21786 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8) {
21787 TEST_REQUIRES_ARM_NEON_FMA;
21788 for (uint32_t n = 9; n < 16; n++) {
21789 for (size_t k = 1; k <= 20; k += 5) {
21790 GemmMicrokernelTester()
21791 .mr(8)
21792 .nr(8)
21793 .kr(1)
21794 .sr(4)
21795 .m(8)
21796 .n(8)
21797 .k(k)
21798 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21799 }
21800 }
21801 }
21802
21803 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8_strided_cn) {
21804 TEST_REQUIRES_ARM_NEON_FMA;
21805 for (uint32_t n = 9; n < 16; n++) {
21806 for (size_t k = 1; k <= 20; k += 5) {
21807 GemmMicrokernelTester()
21808 .mr(8)
21809 .nr(8)
21810 .kr(1)
21811 .sr(4)
21812 .m(8)
21813 .n(8)
21814 .k(k)
21815 .cn_stride(11)
21816 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21817 }
21818 }
21819 }
21820
21821 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8_strided_a) {
21822 TEST_REQUIRES_ARM_NEON_FMA;
21823 for (uint32_t n = 9; n < 16; n++) {
21824 for (size_t k = 1; k <= 20; k += 5) {
21825 GemmMicrokernelTester()
21826 .mr(8)
21827 .nr(8)
21828 .kr(1)
21829 .sr(4)
21830 .m(8)
21831 .n(n)
21832 .k(k)
21833 .a_stride(23)
21834 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21835 }
21836 }
21837 }
21838
21839 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_gt_8_subtile) {
21840 TEST_REQUIRES_ARM_NEON_FMA;
21841 for (uint32_t n = 9; n < 16; n++) {
21842 for (size_t k = 1; k <= 20; k += 5) {
21843 for (uint32_t m = 1; m <= 8; m++) {
21844 GemmMicrokernelTester()
21845 .mr(8)
21846 .nr(8)
21847 .kr(1)
21848 .sr(4)
21849 .m(m)
21850 .n(n)
21851 .k(k)
21852 .iterations(1)
21853 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21854 }
21855 }
21856 }
21857 }
21858
21859 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8) {
21860 TEST_REQUIRES_ARM_NEON_FMA;
21861 for (uint32_t n = 16; n <= 24; n += 8) {
21862 for (size_t k = 1; k <= 20; k += 5) {
21863 GemmMicrokernelTester()
21864 .mr(8)
21865 .nr(8)
21866 .kr(1)
21867 .sr(4)
21868 .m(8)
21869 .n(8)
21870 .k(k)
21871 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21872 }
21873 }
21874 }
21875
21876 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8_strided_cn) {
21877 TEST_REQUIRES_ARM_NEON_FMA;
21878 for (uint32_t n = 16; n <= 24; n += 8) {
21879 for (size_t k = 1; k <= 20; k += 5) {
21880 GemmMicrokernelTester()
21881 .mr(8)
21882 .nr(8)
21883 .kr(1)
21884 .sr(4)
21885 .m(8)
21886 .n(n)
21887 .k(k)
21888 .cn_stride(11)
21889 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21890 }
21891 }
21892 }
21893
21894 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8_strided_a) {
21895 TEST_REQUIRES_ARM_NEON_FMA;
21896 for (uint32_t n = 16; n <= 24; n += 8) {
21897 for (size_t k = 1; k <= 20; k += 5) {
21898 GemmMicrokernelTester()
21899 .mr(8)
21900 .nr(8)
21901 .kr(1)
21902 .sr(4)
21903 .m(8)
21904 .n(n)
21905 .k(k)
21906 .a_stride(23)
21907 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21908 }
21909 }
21910 }
21911
21912 TEST(F32_GEMMINC_8X8S4__NEONFMA, n_div_8_subtile) {
21913 TEST_REQUIRES_ARM_NEON_FMA;
21914 for (uint32_t n = 16; n <= 24; n += 8) {
21915 for (size_t k = 1; k <= 20; k += 5) {
21916 for (uint32_t m = 1; m <= 8; m++) {
21917 GemmMicrokernelTester()
21918 .mr(8)
21919 .nr(8)
21920 .kr(1)
21921 .sr(4)
21922 .m(m)
21923 .n(n)
21924 .k(k)
21925 .iterations(1)
21926 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21927 }
21928 }
21929 }
21930 }
21931
21932 TEST(F32_GEMMINC_8X8S4__NEONFMA, strided_cm_subtile) {
21933 TEST_REQUIRES_ARM_NEON_FMA;
21934 for (size_t k = 1; k <= 20; k += 5) {
21935 for (uint32_t m = 1; m <= 8; m++) {
21936 for (uint32_t n = 1; n <= 8; n++) {
21937 GemmMicrokernelTester()
21938 .mr(8)
21939 .nr(8)
21940 .kr(1)
21941 .sr(4)
21942 .m(m)
21943 .n(n)
21944 .k(k)
21945 .cm_stride(11)
21946 .iterations(1)
21947 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21948 }
21949 }
21950 }
21951 }
21952
21953 TEST(F32_GEMMINC_8X8S4__NEONFMA, qmin) {
21954 TEST_REQUIRES_ARM_NEON_FMA;
21955 GemmMicrokernelTester()
21956 .mr(8)
21957 .nr(8)
21958 .kr(1)
21959 .sr(4)
21960 .m(8)
21961 .n(8)
21962 .k(4)
21963 .qmin(128)
21964 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21965 }
21966
21967 TEST(F32_GEMMINC_8X8S4__NEONFMA, qmax) {
21968 TEST_REQUIRES_ARM_NEON_FMA;
21969 GemmMicrokernelTester()
21970 .mr(8)
21971 .nr(8)
21972 .kr(1)
21973 .sr(4)
21974 .m(8)
21975 .n(8)
21976 .k(4)
21977 .qmax(128)
21978 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21979 }
21980
21981 TEST(F32_GEMMINC_8X8S4__NEONFMA, strided_cm) {
21982 TEST_REQUIRES_ARM_NEON_FMA;
21983 GemmMicrokernelTester()
21984 .mr(8)
21985 .nr(8)
21986 .kr(1)
21987 .sr(4)
21988 .m(8)
21989 .n(8)
21990 .k(4)
21991 .cm_stride(11)
21992 .Test(xnn_f32_gemminc_ukernel_8x8s4__neonfma);
21993 }
21994#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21995
21996
Marat Dukhan1dadbf72019-10-01 10:46:20 -070021997#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070021998 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1) {
21999 TEST_REQUIRES_X86_SSE;
22000 GemmMicrokernelTester()
22001 .mr(1)
22002 .nr(8)
22003 .kr(1)
22004 .sr(1)
22005 .m(1)
22006 .n(8)
22007 .k(1)
22008 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22009 }
22010
22011 TEST(F32_GEMMINC_1X8__SSE_LOAD1, strided_cn) {
22012 TEST_REQUIRES_X86_SSE;
22013 GemmMicrokernelTester()
22014 .mr(1)
22015 .nr(8)
22016 .kr(1)
22017 .sr(1)
22018 .m(1)
22019 .n(8)
22020 .k(1)
22021 .cn_stride(11)
22022 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22023 }
22024
22025 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_strided_a) {
22026 TEST_REQUIRES_X86_SSE;
22027 GemmMicrokernelTester()
22028 .mr(1)
22029 .nr(8)
22030 .kr(1)
22031 .sr(1)
22032 .m(1)
22033 .n(8)
22034 .k(1)
22035 .a_stride(3)
22036 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22037 }
22038
22039 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_subtile) {
22040 TEST_REQUIRES_X86_SSE;
22041 for (uint32_t m = 1; m <= 1; m++) {
22042 for (uint32_t n = 1; n <= 8; n++) {
22043 GemmMicrokernelTester()
22044 .mr(1)
22045 .nr(8)
22046 .kr(1)
22047 .sr(1)
22048 .m(m)
22049 .n(n)
22050 .k(1)
22051 .iterations(1)
22052 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22053 }
22054 }
22055 }
22056
22057 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_subtile_m) {
22058 TEST_REQUIRES_X86_SSE;
22059 for (uint32_t m = 1; m <= 1; m++) {
22060 GemmMicrokernelTester()
22061 .mr(1)
22062 .nr(8)
22063 .kr(1)
22064 .sr(1)
22065 .m(m)
22066 .n(8)
22067 .k(1)
22068 .iterations(1)
22069 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22070 }
22071 }
22072
22073 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_eq_1_subtile_n) {
22074 TEST_REQUIRES_X86_SSE;
22075 for (uint32_t n = 1; n <= 8; n++) {
22076 GemmMicrokernelTester()
22077 .mr(1)
22078 .nr(8)
22079 .kr(1)
22080 .sr(1)
22081 .m(1)
22082 .n(n)
22083 .k(1)
22084 .iterations(1)
22085 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22086 }
22087 }
22088
22089 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_gt_1) {
22090 TEST_REQUIRES_X86_SSE;
22091 for (size_t k = 2; k < 10; k++) {
22092 GemmMicrokernelTester()
22093 .mr(1)
22094 .nr(8)
22095 .kr(1)
22096 .sr(1)
22097 .m(1)
22098 .n(8)
22099 .k(k)
22100 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22101 }
22102 }
22103
22104 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_gt_1_strided_a) {
22105 TEST_REQUIRES_X86_SSE;
22106 for (size_t k = 2; k < 10; k++) {
22107 GemmMicrokernelTester()
22108 .mr(1)
22109 .nr(8)
22110 .kr(1)
22111 .sr(1)
22112 .m(1)
22113 .n(8)
22114 .k(k)
22115 .a_stride(11)
22116 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22117 }
22118 }
22119
22120 TEST(F32_GEMMINC_1X8__SSE_LOAD1, k_gt_1_subtile) {
22121 TEST_REQUIRES_X86_SSE;
22122 for (size_t k = 2; k < 10; k++) {
22123 for (uint32_t m = 1; m <= 1; m++) {
22124 for (uint32_t n = 1; n <= 8; n++) {
22125 GemmMicrokernelTester()
22126 .mr(1)
22127 .nr(8)
22128 .kr(1)
22129 .sr(1)
22130 .m(m)
22131 .n(n)
22132 .k(k)
22133 .iterations(1)
22134 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22135 }
22136 }
22137 }
22138 }
22139
22140 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8) {
22141 TEST_REQUIRES_X86_SSE;
22142 for (uint32_t n = 9; n < 16; n++) {
22143 for (size_t k = 1; k <= 5; k += 2) {
22144 GemmMicrokernelTester()
22145 .mr(1)
22146 .nr(8)
22147 .kr(1)
22148 .sr(1)
22149 .m(1)
22150 .n(8)
22151 .k(k)
22152 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22153 }
22154 }
22155 }
22156
22157 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8_strided_cn) {
22158 TEST_REQUIRES_X86_SSE;
22159 for (uint32_t n = 9; n < 16; n++) {
22160 for (size_t k = 1; k <= 5; k += 2) {
22161 GemmMicrokernelTester()
22162 .mr(1)
22163 .nr(8)
22164 .kr(1)
22165 .sr(1)
22166 .m(1)
22167 .n(8)
22168 .k(k)
22169 .cn_stride(11)
22170 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22171 }
22172 }
22173 }
22174
22175 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8_strided_a) {
22176 TEST_REQUIRES_X86_SSE;
22177 for (uint32_t n = 9; n < 16; n++) {
22178 for (size_t k = 1; k <= 5; k += 2) {
22179 GemmMicrokernelTester()
22180 .mr(1)
22181 .nr(8)
22182 .kr(1)
22183 .sr(1)
22184 .m(1)
22185 .n(n)
22186 .k(k)
22187 .a_stride(7)
22188 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22189 }
22190 }
22191 }
22192
22193 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_gt_8_subtile) {
22194 TEST_REQUIRES_X86_SSE;
22195 for (uint32_t n = 9; n < 16; n++) {
22196 for (size_t k = 1; k <= 5; k += 2) {
22197 for (uint32_t m = 1; m <= 1; m++) {
22198 GemmMicrokernelTester()
22199 .mr(1)
22200 .nr(8)
22201 .kr(1)
22202 .sr(1)
22203 .m(m)
22204 .n(n)
22205 .k(k)
22206 .iterations(1)
22207 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22208 }
22209 }
22210 }
22211 }
22212
22213 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8) {
22214 TEST_REQUIRES_X86_SSE;
22215 for (uint32_t n = 16; n <= 24; n += 8) {
22216 for (size_t k = 1; k <= 5; k += 2) {
22217 GemmMicrokernelTester()
22218 .mr(1)
22219 .nr(8)
22220 .kr(1)
22221 .sr(1)
22222 .m(1)
22223 .n(8)
22224 .k(k)
22225 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22226 }
22227 }
22228 }
22229
22230 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8_strided_cn) {
22231 TEST_REQUIRES_X86_SSE;
22232 for (uint32_t n = 16; n <= 24; n += 8) {
22233 for (size_t k = 1; k <= 5; k += 2) {
22234 GemmMicrokernelTester()
22235 .mr(1)
22236 .nr(8)
22237 .kr(1)
22238 .sr(1)
22239 .m(1)
22240 .n(n)
22241 .k(k)
22242 .cn_stride(11)
22243 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22244 }
22245 }
22246 }
22247
22248 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8_strided_a) {
22249 TEST_REQUIRES_X86_SSE;
22250 for (uint32_t n = 16; n <= 24; n += 8) {
22251 for (size_t k = 1; k <= 5; k += 2) {
22252 GemmMicrokernelTester()
22253 .mr(1)
22254 .nr(8)
22255 .kr(1)
22256 .sr(1)
22257 .m(1)
22258 .n(n)
22259 .k(k)
22260 .a_stride(7)
22261 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22262 }
22263 }
22264 }
22265
22266 TEST(F32_GEMMINC_1X8__SSE_LOAD1, n_div_8_subtile) {
22267 TEST_REQUIRES_X86_SSE;
22268 for (uint32_t n = 16; n <= 24; n += 8) {
22269 for (size_t k = 1; k <= 5; k += 2) {
22270 for (uint32_t m = 1; m <= 1; m++) {
22271 GemmMicrokernelTester()
22272 .mr(1)
22273 .nr(8)
22274 .kr(1)
22275 .sr(1)
22276 .m(m)
22277 .n(n)
22278 .k(k)
22279 .iterations(1)
22280 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22281 }
22282 }
22283 }
22284 }
22285
22286 TEST(F32_GEMMINC_1X8__SSE_LOAD1, strided_cm_subtile) {
22287 TEST_REQUIRES_X86_SSE;
22288 for (size_t k = 1; k <= 5; k += 2) {
22289 for (uint32_t m = 1; m <= 1; m++) {
22290 for (uint32_t n = 1; n <= 8; n++) {
22291 GemmMicrokernelTester()
22292 .mr(1)
22293 .nr(8)
22294 .kr(1)
22295 .sr(1)
22296 .m(m)
22297 .n(n)
22298 .k(k)
22299 .cm_stride(11)
22300 .iterations(1)
22301 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22302 }
22303 }
22304 }
22305 }
22306
22307 TEST(F32_GEMMINC_1X8__SSE_LOAD1, qmin) {
22308 TEST_REQUIRES_X86_SSE;
22309 GemmMicrokernelTester()
22310 .mr(1)
22311 .nr(8)
22312 .kr(1)
22313 .sr(1)
22314 .m(1)
22315 .n(8)
22316 .k(1)
22317 .qmin(128)
22318 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22319 }
22320
22321 TEST(F32_GEMMINC_1X8__SSE_LOAD1, qmax) {
22322 TEST_REQUIRES_X86_SSE;
22323 GemmMicrokernelTester()
22324 .mr(1)
22325 .nr(8)
22326 .kr(1)
22327 .sr(1)
22328 .m(1)
22329 .n(8)
22330 .k(1)
22331 .qmax(128)
22332 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22333 }
22334
22335 TEST(F32_GEMMINC_1X8__SSE_LOAD1, strided_cm) {
22336 TEST_REQUIRES_X86_SSE;
22337 GemmMicrokernelTester()
22338 .mr(1)
22339 .nr(8)
22340 .kr(1)
22341 .sr(1)
22342 .m(1)
22343 .n(8)
22344 .k(1)
22345 .cm_stride(11)
22346 .Test(xnn_f32_gemminc_ukernel_1x8__sse_load1);
22347 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070022348#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070022349
22350
Marat Dukhan1dadbf72019-10-01 10:46:20 -070022351#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070022352 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1) {
22353 TEST_REQUIRES_X86_SSE;
22354 GemmMicrokernelTester()
22355 .mr(4)
22356 .nr(8)
22357 .kr(1)
22358 .sr(1)
22359 .m(4)
22360 .n(8)
22361 .k(1)
22362 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22363 }
22364
22365 TEST(F32_GEMMINC_4X8__SSE_LOAD1, strided_cn) {
22366 TEST_REQUIRES_X86_SSE;
22367 GemmMicrokernelTester()
22368 .mr(4)
22369 .nr(8)
22370 .kr(1)
22371 .sr(1)
22372 .m(4)
22373 .n(8)
22374 .k(1)
22375 .cn_stride(11)
22376 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22377 }
22378
22379 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_strided_a) {
22380 TEST_REQUIRES_X86_SSE;
22381 GemmMicrokernelTester()
22382 .mr(4)
22383 .nr(8)
22384 .kr(1)
22385 .sr(1)
22386 .m(4)
22387 .n(8)
22388 .k(1)
22389 .a_stride(3)
22390 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22391 }
22392
22393 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_subtile) {
22394 TEST_REQUIRES_X86_SSE;
22395 for (uint32_t m = 1; m <= 4; m++) {
22396 for (uint32_t n = 1; n <= 8; n++) {
22397 GemmMicrokernelTester()
22398 .mr(4)
22399 .nr(8)
22400 .kr(1)
22401 .sr(1)
22402 .m(m)
22403 .n(n)
22404 .k(1)
22405 .iterations(1)
22406 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22407 }
22408 }
22409 }
22410
22411 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
22412 TEST_REQUIRES_X86_SSE;
22413 for (uint32_t m = 1; m <= 4; m++) {
22414 GemmMicrokernelTester()
22415 .mr(4)
22416 .nr(8)
22417 .kr(1)
22418 .sr(1)
22419 .m(m)
22420 .n(8)
22421 .k(1)
22422 .iterations(1)
22423 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22424 }
22425 }
22426
22427 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
22428 TEST_REQUIRES_X86_SSE;
22429 for (uint32_t n = 1; n <= 8; n++) {
22430 GemmMicrokernelTester()
22431 .mr(4)
22432 .nr(8)
22433 .kr(1)
22434 .sr(1)
22435 .m(4)
22436 .n(n)
22437 .k(1)
22438 .iterations(1)
22439 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22440 }
22441 }
22442
22443 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_gt_1) {
22444 TEST_REQUIRES_X86_SSE;
22445 for (size_t k = 2; k < 10; k++) {
22446 GemmMicrokernelTester()
22447 .mr(4)
22448 .nr(8)
22449 .kr(1)
22450 .sr(1)
22451 .m(4)
22452 .n(8)
22453 .k(k)
22454 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22455 }
22456 }
22457
22458 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_gt_1_strided_a) {
22459 TEST_REQUIRES_X86_SSE;
22460 for (size_t k = 2; k < 10; k++) {
22461 GemmMicrokernelTester()
22462 .mr(4)
22463 .nr(8)
22464 .kr(1)
22465 .sr(1)
22466 .m(4)
22467 .n(8)
22468 .k(k)
22469 .a_stride(11)
22470 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22471 }
22472 }
22473
22474 TEST(F32_GEMMINC_4X8__SSE_LOAD1, k_gt_1_subtile) {
22475 TEST_REQUIRES_X86_SSE;
22476 for (size_t k = 2; k < 10; k++) {
22477 for (uint32_t m = 1; m <= 4; m++) {
22478 for (uint32_t n = 1; n <= 8; n++) {
22479 GemmMicrokernelTester()
22480 .mr(4)
22481 .nr(8)
22482 .kr(1)
22483 .sr(1)
22484 .m(m)
22485 .n(n)
22486 .k(k)
22487 .iterations(1)
22488 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22489 }
22490 }
22491 }
22492 }
22493
22494 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8) {
22495 TEST_REQUIRES_X86_SSE;
22496 for (uint32_t n = 9; n < 16; n++) {
22497 for (size_t k = 1; k <= 5; k += 2) {
22498 GemmMicrokernelTester()
22499 .mr(4)
22500 .nr(8)
22501 .kr(1)
22502 .sr(1)
22503 .m(4)
22504 .n(8)
22505 .k(k)
22506 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22507 }
22508 }
22509 }
22510
22511 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
22512 TEST_REQUIRES_X86_SSE;
22513 for (uint32_t n = 9; n < 16; n++) {
22514 for (size_t k = 1; k <= 5; k += 2) {
22515 GemmMicrokernelTester()
22516 .mr(4)
22517 .nr(8)
22518 .kr(1)
22519 .sr(1)
22520 .m(4)
22521 .n(8)
22522 .k(k)
22523 .cn_stride(11)
22524 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22525 }
22526 }
22527 }
22528
22529 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8_strided_a) {
22530 TEST_REQUIRES_X86_SSE;
22531 for (uint32_t n = 9; n < 16; n++) {
22532 for (size_t k = 1; k <= 5; k += 2) {
22533 GemmMicrokernelTester()
22534 .mr(4)
22535 .nr(8)
22536 .kr(1)
22537 .sr(1)
22538 .m(4)
22539 .n(n)
22540 .k(k)
22541 .a_stride(7)
22542 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22543 }
22544 }
22545 }
22546
22547 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_gt_8_subtile) {
22548 TEST_REQUIRES_X86_SSE;
22549 for (uint32_t n = 9; n < 16; n++) {
22550 for (size_t k = 1; k <= 5; k += 2) {
22551 for (uint32_t m = 1; m <= 4; m++) {
22552 GemmMicrokernelTester()
22553 .mr(4)
22554 .nr(8)
22555 .kr(1)
22556 .sr(1)
22557 .m(m)
22558 .n(n)
22559 .k(k)
22560 .iterations(1)
22561 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22562 }
22563 }
22564 }
22565 }
22566
22567 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8) {
22568 TEST_REQUIRES_X86_SSE;
22569 for (uint32_t n = 16; n <= 24; n += 8) {
22570 for (size_t k = 1; k <= 5; k += 2) {
22571 GemmMicrokernelTester()
22572 .mr(4)
22573 .nr(8)
22574 .kr(1)
22575 .sr(1)
22576 .m(4)
22577 .n(8)
22578 .k(k)
22579 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22580 }
22581 }
22582 }
22583
22584 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8_strided_cn) {
22585 TEST_REQUIRES_X86_SSE;
22586 for (uint32_t n = 16; n <= 24; n += 8) {
22587 for (size_t k = 1; k <= 5; k += 2) {
22588 GemmMicrokernelTester()
22589 .mr(4)
22590 .nr(8)
22591 .kr(1)
22592 .sr(1)
22593 .m(4)
22594 .n(n)
22595 .k(k)
22596 .cn_stride(11)
22597 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22598 }
22599 }
22600 }
22601
22602 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8_strided_a) {
22603 TEST_REQUIRES_X86_SSE;
22604 for (uint32_t n = 16; n <= 24; n += 8) {
22605 for (size_t k = 1; k <= 5; k += 2) {
22606 GemmMicrokernelTester()
22607 .mr(4)
22608 .nr(8)
22609 .kr(1)
22610 .sr(1)
22611 .m(4)
22612 .n(n)
22613 .k(k)
22614 .a_stride(7)
22615 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22616 }
22617 }
22618 }
22619
22620 TEST(F32_GEMMINC_4X8__SSE_LOAD1, n_div_8_subtile) {
22621 TEST_REQUIRES_X86_SSE;
22622 for (uint32_t n = 16; n <= 24; n += 8) {
22623 for (size_t k = 1; k <= 5; k += 2) {
22624 for (uint32_t m = 1; m <= 4; m++) {
22625 GemmMicrokernelTester()
22626 .mr(4)
22627 .nr(8)
22628 .kr(1)
22629 .sr(1)
22630 .m(m)
22631 .n(n)
22632 .k(k)
22633 .iterations(1)
22634 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22635 }
22636 }
22637 }
22638 }
22639
22640 TEST(F32_GEMMINC_4X8__SSE_LOAD1, strided_cm_subtile) {
22641 TEST_REQUIRES_X86_SSE;
22642 for (size_t k = 1; k <= 5; k += 2) {
22643 for (uint32_t m = 1; m <= 4; m++) {
22644 for (uint32_t n = 1; n <= 8; n++) {
22645 GemmMicrokernelTester()
22646 .mr(4)
22647 .nr(8)
22648 .kr(1)
22649 .sr(1)
22650 .m(m)
22651 .n(n)
22652 .k(k)
22653 .cm_stride(11)
22654 .iterations(1)
22655 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22656 }
22657 }
22658 }
22659 }
22660
22661 TEST(F32_GEMMINC_4X8__SSE_LOAD1, qmin) {
22662 TEST_REQUIRES_X86_SSE;
22663 GemmMicrokernelTester()
22664 .mr(4)
22665 .nr(8)
22666 .kr(1)
22667 .sr(1)
22668 .m(4)
22669 .n(8)
22670 .k(1)
22671 .qmin(128)
22672 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22673 }
22674
22675 TEST(F32_GEMMINC_4X8__SSE_LOAD1, qmax) {
22676 TEST_REQUIRES_X86_SSE;
22677 GemmMicrokernelTester()
22678 .mr(4)
22679 .nr(8)
22680 .kr(1)
22681 .sr(1)
22682 .m(4)
22683 .n(8)
22684 .k(1)
22685 .qmax(128)
22686 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22687 }
22688
22689 TEST(F32_GEMMINC_4X8__SSE_LOAD1, strided_cm) {
22690 TEST_REQUIRES_X86_SSE;
22691 GemmMicrokernelTester()
22692 .mr(4)
22693 .nr(8)
22694 .kr(1)
22695 .sr(1)
22696 .m(4)
22697 .n(8)
22698 .k(1)
22699 .cm_stride(11)
22700 .Test(xnn_f32_gemminc_ukernel_4x8__sse_load1);
22701 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070022702#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070022703
22704
Marat Dukhan1dadbf72019-10-01 10:46:20 -070022705#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070022706 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4) {
22707 TEST_REQUIRES_X86_SSE;
22708 GemmMicrokernelTester()
22709 .mr(1)
22710 .nr(8)
22711 .kr(1)
22712 .sr(1)
22713 .m(1)
22714 .n(8)
22715 .k(4)
22716 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22717 }
22718
22719 TEST(F32_GEMMINC_1X8__SSE_DUP, strided_cn) {
22720 TEST_REQUIRES_X86_SSE;
22721 GemmMicrokernelTester()
22722 .mr(1)
22723 .nr(8)
22724 .kr(1)
22725 .sr(1)
22726 .m(1)
22727 .n(8)
22728 .k(4)
22729 .cn_stride(11)
22730 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22731 }
22732
22733 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_strided_a) {
22734 TEST_REQUIRES_X86_SSE;
22735 GemmMicrokernelTester()
22736 .mr(1)
22737 .nr(8)
22738 .kr(1)
22739 .sr(1)
22740 .m(1)
22741 .n(8)
22742 .k(4)
22743 .a_stride(7)
22744 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22745 }
22746
22747 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_subtile) {
22748 TEST_REQUIRES_X86_SSE;
22749 for (uint32_t m = 1; m <= 1; m++) {
22750 for (uint32_t n = 1; n <= 8; n++) {
22751 GemmMicrokernelTester()
22752 .mr(1)
22753 .nr(8)
22754 .kr(1)
22755 .sr(1)
22756 .m(m)
22757 .n(n)
22758 .k(4)
22759 .iterations(1)
22760 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22761 }
22762 }
22763 }
22764
22765 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_subtile_m) {
22766 TEST_REQUIRES_X86_SSE;
22767 for (uint32_t m = 1; m <= 1; m++) {
22768 GemmMicrokernelTester()
22769 .mr(1)
22770 .nr(8)
22771 .kr(1)
22772 .sr(1)
22773 .m(m)
22774 .n(8)
22775 .k(4)
22776 .iterations(1)
22777 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22778 }
22779 }
22780
22781 TEST(F32_GEMMINC_1X8__SSE_DUP, k_eq_4_subtile_n) {
22782 TEST_REQUIRES_X86_SSE;
22783 for (uint32_t n = 1; n <= 8; n++) {
22784 GemmMicrokernelTester()
22785 .mr(1)
22786 .nr(8)
22787 .kr(1)
22788 .sr(1)
22789 .m(1)
22790 .n(n)
22791 .k(4)
22792 .iterations(1)
22793 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22794 }
22795 }
22796
22797 TEST(F32_GEMMINC_1X8__SSE_DUP, k_lt_4) {
22798 TEST_REQUIRES_X86_SSE;
22799 for (size_t k = 1; k < 4; k++) {
22800 GemmMicrokernelTester()
22801 .mr(1)
22802 .nr(8)
22803 .kr(1)
22804 .sr(1)
22805 .m(1)
22806 .n(8)
22807 .k(k)
22808 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22809 }
22810 }
22811
22812 TEST(F32_GEMMINC_1X8__SSE_DUP, k_lt_4_strided_a) {
22813 TEST_REQUIRES_X86_SSE;
22814 for (size_t k = 1; k < 4; k++) {
22815 GemmMicrokernelTester()
22816 .mr(1)
22817 .nr(8)
22818 .kr(1)
22819 .sr(1)
22820 .m(1)
22821 .n(8)
22822 .k(k)
22823 .a_stride(7)
22824 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22825 }
22826 }
22827
22828 TEST(F32_GEMMINC_1X8__SSE_DUP, k_lt_4_subtile) {
22829 TEST_REQUIRES_X86_SSE;
22830 for (size_t k = 1; k < 4; k++) {
22831 for (uint32_t m = 1; m <= 1; m++) {
22832 for (uint32_t n = 1; n <= 8; n++) {
22833 GemmMicrokernelTester()
22834 .mr(1)
22835 .nr(8)
22836 .kr(1)
22837 .sr(1)
22838 .m(m)
22839 .n(n)
22840 .k(k)
22841 .iterations(1)
22842 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22843 }
22844 }
22845 }
22846 }
22847
22848 TEST(F32_GEMMINC_1X8__SSE_DUP, k_gt_4) {
22849 TEST_REQUIRES_X86_SSE;
22850 for (size_t k = 5; k < 8; k++) {
22851 GemmMicrokernelTester()
22852 .mr(1)
22853 .nr(8)
22854 .kr(1)
22855 .sr(1)
22856 .m(1)
22857 .n(8)
22858 .k(k)
22859 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22860 }
22861 }
22862
22863 TEST(F32_GEMMINC_1X8__SSE_DUP, k_gt_4_strided_a) {
22864 TEST_REQUIRES_X86_SSE;
22865 for (size_t k = 5; k < 8; k++) {
22866 GemmMicrokernelTester()
22867 .mr(1)
22868 .nr(8)
22869 .kr(1)
22870 .sr(1)
22871 .m(1)
22872 .n(8)
22873 .k(k)
22874 .a_stride(11)
22875 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22876 }
22877 }
22878
22879 TEST(F32_GEMMINC_1X8__SSE_DUP, k_gt_4_subtile) {
22880 TEST_REQUIRES_X86_SSE;
22881 for (size_t k = 5; k < 8; k++) {
22882 for (uint32_t m = 1; m <= 1; m++) {
22883 for (uint32_t n = 1; n <= 8; n++) {
22884 GemmMicrokernelTester()
22885 .mr(1)
22886 .nr(8)
22887 .kr(1)
22888 .sr(1)
22889 .m(m)
22890 .n(n)
22891 .k(k)
22892 .iterations(1)
22893 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22894 }
22895 }
22896 }
22897 }
22898
22899 TEST(F32_GEMMINC_1X8__SSE_DUP, k_div_4) {
22900 TEST_REQUIRES_X86_SSE;
22901 for (size_t k = 8; k <= 40; k += 4) {
22902 GemmMicrokernelTester()
22903 .mr(1)
22904 .nr(8)
22905 .kr(1)
22906 .sr(1)
22907 .m(1)
22908 .n(8)
22909 .k(k)
22910 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22911 }
22912 }
22913
22914 TEST(F32_GEMMINC_1X8__SSE_DUP, k_div_4_strided_a) {
22915 TEST_REQUIRES_X86_SSE;
22916 for (size_t k = 8; k <= 40; k += 4) {
22917 GemmMicrokernelTester()
22918 .mr(1)
22919 .nr(8)
22920 .kr(1)
22921 .sr(1)
22922 .m(1)
22923 .n(8)
22924 .k(k)
22925 .a_stride(43)
22926 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22927 }
22928 }
22929
22930 TEST(F32_GEMMINC_1X8__SSE_DUP, k_div_4_subtile) {
22931 TEST_REQUIRES_X86_SSE;
22932 for (size_t k = 8; k <= 40; k += 4) {
22933 for (uint32_t m = 1; m <= 1; m++) {
22934 for (uint32_t n = 1; n <= 8; n++) {
22935 GemmMicrokernelTester()
22936 .mr(1)
22937 .nr(8)
22938 .kr(1)
22939 .sr(1)
22940 .m(m)
22941 .n(n)
22942 .k(k)
22943 .iterations(1)
22944 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22945 }
22946 }
22947 }
22948 }
22949
22950 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8) {
22951 TEST_REQUIRES_X86_SSE;
22952 for (uint32_t n = 9; n < 16; n++) {
22953 for (size_t k = 1; k <= 20; k += 5) {
22954 GemmMicrokernelTester()
22955 .mr(1)
22956 .nr(8)
22957 .kr(1)
22958 .sr(1)
22959 .m(1)
22960 .n(8)
22961 .k(k)
22962 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22963 }
22964 }
22965 }
22966
22967 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8_strided_cn) {
22968 TEST_REQUIRES_X86_SSE;
22969 for (uint32_t n = 9; n < 16; n++) {
22970 for (size_t k = 1; k <= 20; k += 5) {
22971 GemmMicrokernelTester()
22972 .mr(1)
22973 .nr(8)
22974 .kr(1)
22975 .sr(1)
22976 .m(1)
22977 .n(8)
22978 .k(k)
22979 .cn_stride(11)
22980 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22981 }
22982 }
22983 }
22984
22985 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8_strided_a) {
22986 TEST_REQUIRES_X86_SSE;
22987 for (uint32_t n = 9; n < 16; n++) {
22988 for (size_t k = 1; k <= 20; k += 5) {
22989 GemmMicrokernelTester()
22990 .mr(1)
22991 .nr(8)
22992 .kr(1)
22993 .sr(1)
22994 .m(1)
22995 .n(n)
22996 .k(k)
22997 .a_stride(23)
22998 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
22999 }
23000 }
23001 }
23002
23003 TEST(F32_GEMMINC_1X8__SSE_DUP, n_gt_8_subtile) {
23004 TEST_REQUIRES_X86_SSE;
23005 for (uint32_t n = 9; n < 16; n++) {
23006 for (size_t k = 1; k <= 20; k += 5) {
23007 for (uint32_t m = 1; m <= 1; m++) {
23008 GemmMicrokernelTester()
23009 .mr(1)
23010 .nr(8)
23011 .kr(1)
23012 .sr(1)
23013 .m(m)
23014 .n(n)
23015 .k(k)
23016 .iterations(1)
23017 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23018 }
23019 }
23020 }
23021 }
23022
23023 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8) {
23024 TEST_REQUIRES_X86_SSE;
23025 for (uint32_t n = 16; n <= 24; n += 8) {
23026 for (size_t k = 1; k <= 20; k += 5) {
23027 GemmMicrokernelTester()
23028 .mr(1)
23029 .nr(8)
23030 .kr(1)
23031 .sr(1)
23032 .m(1)
23033 .n(8)
23034 .k(k)
23035 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23036 }
23037 }
23038 }
23039
23040 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8_strided_cn) {
23041 TEST_REQUIRES_X86_SSE;
23042 for (uint32_t n = 16; n <= 24; n += 8) {
23043 for (size_t k = 1; k <= 20; k += 5) {
23044 GemmMicrokernelTester()
23045 .mr(1)
23046 .nr(8)
23047 .kr(1)
23048 .sr(1)
23049 .m(1)
23050 .n(n)
23051 .k(k)
23052 .cn_stride(11)
23053 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23054 }
23055 }
23056 }
23057
23058 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8_strided_a) {
23059 TEST_REQUIRES_X86_SSE;
23060 for (uint32_t n = 16; n <= 24; n += 8) {
23061 for (size_t k = 1; k <= 20; k += 5) {
23062 GemmMicrokernelTester()
23063 .mr(1)
23064 .nr(8)
23065 .kr(1)
23066 .sr(1)
23067 .m(1)
23068 .n(n)
23069 .k(k)
23070 .a_stride(23)
23071 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23072 }
23073 }
23074 }
23075
23076 TEST(F32_GEMMINC_1X8__SSE_DUP, n_div_8_subtile) {
23077 TEST_REQUIRES_X86_SSE;
23078 for (uint32_t n = 16; n <= 24; n += 8) {
23079 for (size_t k = 1; k <= 20; k += 5) {
23080 for (uint32_t m = 1; m <= 1; m++) {
23081 GemmMicrokernelTester()
23082 .mr(1)
23083 .nr(8)
23084 .kr(1)
23085 .sr(1)
23086 .m(m)
23087 .n(n)
23088 .k(k)
23089 .iterations(1)
23090 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23091 }
23092 }
23093 }
23094 }
23095
23096 TEST(F32_GEMMINC_1X8__SSE_DUP, strided_cm_subtile) {
23097 TEST_REQUIRES_X86_SSE;
23098 for (size_t k = 1; k <= 20; k += 5) {
23099 for (uint32_t m = 1; m <= 1; m++) {
23100 for (uint32_t n = 1; n <= 8; n++) {
23101 GemmMicrokernelTester()
23102 .mr(1)
23103 .nr(8)
23104 .kr(1)
23105 .sr(1)
23106 .m(m)
23107 .n(n)
23108 .k(k)
23109 .cm_stride(11)
23110 .iterations(1)
23111 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23112 }
23113 }
23114 }
23115 }
23116
23117 TEST(F32_GEMMINC_1X8__SSE_DUP, qmin) {
23118 TEST_REQUIRES_X86_SSE;
23119 GemmMicrokernelTester()
23120 .mr(1)
23121 .nr(8)
23122 .kr(1)
23123 .sr(1)
23124 .m(1)
23125 .n(8)
23126 .k(4)
23127 .qmin(128)
23128 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23129 }
23130
23131 TEST(F32_GEMMINC_1X8__SSE_DUP, qmax) {
23132 TEST_REQUIRES_X86_SSE;
23133 GemmMicrokernelTester()
23134 .mr(1)
23135 .nr(8)
23136 .kr(1)
23137 .sr(1)
23138 .m(1)
23139 .n(8)
23140 .k(4)
23141 .qmax(128)
23142 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23143 }
23144
23145 TEST(F32_GEMMINC_1X8__SSE_DUP, strided_cm) {
23146 TEST_REQUIRES_X86_SSE;
23147 GemmMicrokernelTester()
23148 .mr(1)
23149 .nr(8)
23150 .kr(1)
23151 .sr(1)
23152 .m(1)
23153 .n(8)
23154 .k(4)
23155 .cm_stride(11)
23156 .Test(xnn_f32_gemminc_ukernel_1x8__sse_dup);
23157 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023158#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023159
23160
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023161#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023162 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4) {
23163 TEST_REQUIRES_X86_SSE;
23164 GemmMicrokernelTester()
23165 .mr(4)
23166 .nr(8)
23167 .kr(1)
23168 .sr(1)
23169 .m(4)
23170 .n(8)
23171 .k(4)
23172 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23173 }
23174
23175 TEST(F32_GEMMINC_4X8__SSE_DUP, strided_cn) {
23176 TEST_REQUIRES_X86_SSE;
23177 GemmMicrokernelTester()
23178 .mr(4)
23179 .nr(8)
23180 .kr(1)
23181 .sr(1)
23182 .m(4)
23183 .n(8)
23184 .k(4)
23185 .cn_stride(11)
23186 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23187 }
23188
23189 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_strided_a) {
23190 TEST_REQUIRES_X86_SSE;
23191 GemmMicrokernelTester()
23192 .mr(4)
23193 .nr(8)
23194 .kr(1)
23195 .sr(1)
23196 .m(4)
23197 .n(8)
23198 .k(4)
23199 .a_stride(7)
23200 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23201 }
23202
23203 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_subtile) {
23204 TEST_REQUIRES_X86_SSE;
23205 for (uint32_t m = 1; m <= 4; m++) {
23206 for (uint32_t n = 1; n <= 8; n++) {
23207 GemmMicrokernelTester()
23208 .mr(4)
23209 .nr(8)
23210 .kr(1)
23211 .sr(1)
23212 .m(m)
23213 .n(n)
23214 .k(4)
23215 .iterations(1)
23216 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23217 }
23218 }
23219 }
23220
23221 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_subtile_m) {
23222 TEST_REQUIRES_X86_SSE;
23223 for (uint32_t m = 1; m <= 4; m++) {
23224 GemmMicrokernelTester()
23225 .mr(4)
23226 .nr(8)
23227 .kr(1)
23228 .sr(1)
23229 .m(m)
23230 .n(8)
23231 .k(4)
23232 .iterations(1)
23233 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23234 }
23235 }
23236
23237 TEST(F32_GEMMINC_4X8__SSE_DUP, k_eq_4_subtile_n) {
23238 TEST_REQUIRES_X86_SSE;
23239 for (uint32_t n = 1; n <= 8; n++) {
23240 GemmMicrokernelTester()
23241 .mr(4)
23242 .nr(8)
23243 .kr(1)
23244 .sr(1)
23245 .m(4)
23246 .n(n)
23247 .k(4)
23248 .iterations(1)
23249 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23250 }
23251 }
23252
23253 TEST(F32_GEMMINC_4X8__SSE_DUP, k_lt_4) {
23254 TEST_REQUIRES_X86_SSE;
23255 for (size_t k = 1; k < 4; k++) {
23256 GemmMicrokernelTester()
23257 .mr(4)
23258 .nr(8)
23259 .kr(1)
23260 .sr(1)
23261 .m(4)
23262 .n(8)
23263 .k(k)
23264 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23265 }
23266 }
23267
23268 TEST(F32_GEMMINC_4X8__SSE_DUP, k_lt_4_strided_a) {
23269 TEST_REQUIRES_X86_SSE;
23270 for (size_t k = 1; k < 4; k++) {
23271 GemmMicrokernelTester()
23272 .mr(4)
23273 .nr(8)
23274 .kr(1)
23275 .sr(1)
23276 .m(4)
23277 .n(8)
23278 .k(k)
23279 .a_stride(7)
23280 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23281 }
23282 }
23283
23284 TEST(F32_GEMMINC_4X8__SSE_DUP, k_lt_4_subtile) {
23285 TEST_REQUIRES_X86_SSE;
23286 for (size_t k = 1; k < 4; k++) {
23287 for (uint32_t m = 1; m <= 4; m++) {
23288 for (uint32_t n = 1; n <= 8; n++) {
23289 GemmMicrokernelTester()
23290 .mr(4)
23291 .nr(8)
23292 .kr(1)
23293 .sr(1)
23294 .m(m)
23295 .n(n)
23296 .k(k)
23297 .iterations(1)
23298 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23299 }
23300 }
23301 }
23302 }
23303
23304 TEST(F32_GEMMINC_4X8__SSE_DUP, k_gt_4) {
23305 TEST_REQUIRES_X86_SSE;
23306 for (size_t k = 5; k < 8; k++) {
23307 GemmMicrokernelTester()
23308 .mr(4)
23309 .nr(8)
23310 .kr(1)
23311 .sr(1)
23312 .m(4)
23313 .n(8)
23314 .k(k)
23315 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23316 }
23317 }
23318
23319 TEST(F32_GEMMINC_4X8__SSE_DUP, k_gt_4_strided_a) {
23320 TEST_REQUIRES_X86_SSE;
23321 for (size_t k = 5; k < 8; k++) {
23322 GemmMicrokernelTester()
23323 .mr(4)
23324 .nr(8)
23325 .kr(1)
23326 .sr(1)
23327 .m(4)
23328 .n(8)
23329 .k(k)
23330 .a_stride(11)
23331 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23332 }
23333 }
23334
23335 TEST(F32_GEMMINC_4X8__SSE_DUP, k_gt_4_subtile) {
23336 TEST_REQUIRES_X86_SSE;
23337 for (size_t k = 5; k < 8; k++) {
23338 for (uint32_t m = 1; m <= 4; m++) {
23339 for (uint32_t n = 1; n <= 8; n++) {
23340 GemmMicrokernelTester()
23341 .mr(4)
23342 .nr(8)
23343 .kr(1)
23344 .sr(1)
23345 .m(m)
23346 .n(n)
23347 .k(k)
23348 .iterations(1)
23349 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23350 }
23351 }
23352 }
23353 }
23354
23355 TEST(F32_GEMMINC_4X8__SSE_DUP, k_div_4) {
23356 TEST_REQUIRES_X86_SSE;
23357 for (size_t k = 8; k <= 40; k += 4) {
23358 GemmMicrokernelTester()
23359 .mr(4)
23360 .nr(8)
23361 .kr(1)
23362 .sr(1)
23363 .m(4)
23364 .n(8)
23365 .k(k)
23366 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23367 }
23368 }
23369
23370 TEST(F32_GEMMINC_4X8__SSE_DUP, k_div_4_strided_a) {
23371 TEST_REQUIRES_X86_SSE;
23372 for (size_t k = 8; k <= 40; k += 4) {
23373 GemmMicrokernelTester()
23374 .mr(4)
23375 .nr(8)
23376 .kr(1)
23377 .sr(1)
23378 .m(4)
23379 .n(8)
23380 .k(k)
23381 .a_stride(43)
23382 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23383 }
23384 }
23385
23386 TEST(F32_GEMMINC_4X8__SSE_DUP, k_div_4_subtile) {
23387 TEST_REQUIRES_X86_SSE;
23388 for (size_t k = 8; k <= 40; k += 4) {
23389 for (uint32_t m = 1; m <= 4; m++) {
23390 for (uint32_t n = 1; n <= 8; n++) {
23391 GemmMicrokernelTester()
23392 .mr(4)
23393 .nr(8)
23394 .kr(1)
23395 .sr(1)
23396 .m(m)
23397 .n(n)
23398 .k(k)
23399 .iterations(1)
23400 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23401 }
23402 }
23403 }
23404 }
23405
23406 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8) {
23407 TEST_REQUIRES_X86_SSE;
23408 for (uint32_t n = 9; n < 16; n++) {
23409 for (size_t k = 1; k <= 20; k += 5) {
23410 GemmMicrokernelTester()
23411 .mr(4)
23412 .nr(8)
23413 .kr(1)
23414 .sr(1)
23415 .m(4)
23416 .n(8)
23417 .k(k)
23418 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23419 }
23420 }
23421 }
23422
23423 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8_strided_cn) {
23424 TEST_REQUIRES_X86_SSE;
23425 for (uint32_t n = 9; n < 16; n++) {
23426 for (size_t k = 1; k <= 20; k += 5) {
23427 GemmMicrokernelTester()
23428 .mr(4)
23429 .nr(8)
23430 .kr(1)
23431 .sr(1)
23432 .m(4)
23433 .n(8)
23434 .k(k)
23435 .cn_stride(11)
23436 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23437 }
23438 }
23439 }
23440
23441 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8_strided_a) {
23442 TEST_REQUIRES_X86_SSE;
23443 for (uint32_t n = 9; n < 16; n++) {
23444 for (size_t k = 1; k <= 20; k += 5) {
23445 GemmMicrokernelTester()
23446 .mr(4)
23447 .nr(8)
23448 .kr(1)
23449 .sr(1)
23450 .m(4)
23451 .n(n)
23452 .k(k)
23453 .a_stride(23)
23454 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23455 }
23456 }
23457 }
23458
23459 TEST(F32_GEMMINC_4X8__SSE_DUP, n_gt_8_subtile) {
23460 TEST_REQUIRES_X86_SSE;
23461 for (uint32_t n = 9; n < 16; n++) {
23462 for (size_t k = 1; k <= 20; k += 5) {
23463 for (uint32_t m = 1; m <= 4; m++) {
23464 GemmMicrokernelTester()
23465 .mr(4)
23466 .nr(8)
23467 .kr(1)
23468 .sr(1)
23469 .m(m)
23470 .n(n)
23471 .k(k)
23472 .iterations(1)
23473 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23474 }
23475 }
23476 }
23477 }
23478
23479 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8) {
23480 TEST_REQUIRES_X86_SSE;
23481 for (uint32_t n = 16; n <= 24; n += 8) {
23482 for (size_t k = 1; k <= 20; k += 5) {
23483 GemmMicrokernelTester()
23484 .mr(4)
23485 .nr(8)
23486 .kr(1)
23487 .sr(1)
23488 .m(4)
23489 .n(8)
23490 .k(k)
23491 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23492 }
23493 }
23494 }
23495
23496 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8_strided_cn) {
23497 TEST_REQUIRES_X86_SSE;
23498 for (uint32_t n = 16; n <= 24; n += 8) {
23499 for (size_t k = 1; k <= 20; k += 5) {
23500 GemmMicrokernelTester()
23501 .mr(4)
23502 .nr(8)
23503 .kr(1)
23504 .sr(1)
23505 .m(4)
23506 .n(n)
23507 .k(k)
23508 .cn_stride(11)
23509 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23510 }
23511 }
23512 }
23513
23514 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8_strided_a) {
23515 TEST_REQUIRES_X86_SSE;
23516 for (uint32_t n = 16; n <= 24; n += 8) {
23517 for (size_t k = 1; k <= 20; k += 5) {
23518 GemmMicrokernelTester()
23519 .mr(4)
23520 .nr(8)
23521 .kr(1)
23522 .sr(1)
23523 .m(4)
23524 .n(n)
23525 .k(k)
23526 .a_stride(23)
23527 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23528 }
23529 }
23530 }
23531
23532 TEST(F32_GEMMINC_4X8__SSE_DUP, n_div_8_subtile) {
23533 TEST_REQUIRES_X86_SSE;
23534 for (uint32_t n = 16; n <= 24; n += 8) {
23535 for (size_t k = 1; k <= 20; k += 5) {
23536 for (uint32_t m = 1; m <= 4; m++) {
23537 GemmMicrokernelTester()
23538 .mr(4)
23539 .nr(8)
23540 .kr(1)
23541 .sr(1)
23542 .m(m)
23543 .n(n)
23544 .k(k)
23545 .iterations(1)
23546 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23547 }
23548 }
23549 }
23550 }
23551
23552 TEST(F32_GEMMINC_4X8__SSE_DUP, strided_cm_subtile) {
23553 TEST_REQUIRES_X86_SSE;
23554 for (size_t k = 1; k <= 20; k += 5) {
23555 for (uint32_t m = 1; m <= 4; m++) {
23556 for (uint32_t n = 1; n <= 8; n++) {
23557 GemmMicrokernelTester()
23558 .mr(4)
23559 .nr(8)
23560 .kr(1)
23561 .sr(1)
23562 .m(m)
23563 .n(n)
23564 .k(k)
23565 .cm_stride(11)
23566 .iterations(1)
23567 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23568 }
23569 }
23570 }
23571 }
23572
23573 TEST(F32_GEMMINC_4X8__SSE_DUP, qmin) {
23574 TEST_REQUIRES_X86_SSE;
23575 GemmMicrokernelTester()
23576 .mr(4)
23577 .nr(8)
23578 .kr(1)
23579 .sr(1)
23580 .m(4)
23581 .n(8)
23582 .k(4)
23583 .qmin(128)
23584 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23585 }
23586
23587 TEST(F32_GEMMINC_4X8__SSE_DUP, qmax) {
23588 TEST_REQUIRES_X86_SSE;
23589 GemmMicrokernelTester()
23590 .mr(4)
23591 .nr(8)
23592 .kr(1)
23593 .sr(1)
23594 .m(4)
23595 .n(8)
23596 .k(4)
23597 .qmax(128)
23598 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23599 }
23600
23601 TEST(F32_GEMMINC_4X8__SSE_DUP, strided_cm) {
23602 TEST_REQUIRES_X86_SSE;
23603 GemmMicrokernelTester()
23604 .mr(4)
23605 .nr(8)
23606 .kr(1)
23607 .sr(1)
23608 .m(4)
23609 .n(8)
23610 .k(4)
23611 .cm_stride(11)
23612 .Test(xnn_f32_gemminc_ukernel_4x8__sse_dup);
23613 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023614#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023615
23616
Marat Dukhan1dadbf72019-10-01 10:46:20 -070023617#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070023618 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4) {
23619 TEST_REQUIRES_X86_SSE;
23620 GemmMicrokernelTester()
23621 .mr(1)
23622 .nr(8)
23623 .kr(1)
23624 .sr(4)
23625 .m(1)
23626 .n(8)
23627 .k(4)
23628 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23629 }
23630
23631 TEST(F32_GEMMINC_1X8S4__SSE, strided_cn) {
23632 TEST_REQUIRES_X86_SSE;
23633 GemmMicrokernelTester()
23634 .mr(1)
23635 .nr(8)
23636 .kr(1)
23637 .sr(4)
23638 .m(1)
23639 .n(8)
23640 .k(4)
23641 .cn_stride(11)
23642 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23643 }
23644
23645 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_strided_a) {
23646 TEST_REQUIRES_X86_SSE;
23647 GemmMicrokernelTester()
23648 .mr(1)
23649 .nr(8)
23650 .kr(1)
23651 .sr(4)
23652 .m(1)
23653 .n(8)
23654 .k(4)
23655 .a_stride(7)
23656 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23657 }
23658
23659 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_subtile) {
23660 TEST_REQUIRES_X86_SSE;
23661 for (uint32_t m = 1; m <= 1; m++) {
23662 for (uint32_t n = 1; n <= 8; n++) {
23663 GemmMicrokernelTester()
23664 .mr(1)
23665 .nr(8)
23666 .kr(1)
23667 .sr(4)
23668 .m(m)
23669 .n(n)
23670 .k(4)
23671 .iterations(1)
23672 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23673 }
23674 }
23675 }
23676
23677 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_subtile_m) {
23678 TEST_REQUIRES_X86_SSE;
23679 for (uint32_t m = 1; m <= 1; m++) {
23680 GemmMicrokernelTester()
23681 .mr(1)
23682 .nr(8)
23683 .kr(1)
23684 .sr(4)
23685 .m(m)
23686 .n(8)
23687 .k(4)
23688 .iterations(1)
23689 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23690 }
23691 }
23692
23693 TEST(F32_GEMMINC_1X8S4__SSE, k_eq_4_subtile_n) {
23694 TEST_REQUIRES_X86_SSE;
23695 for (uint32_t n = 1; n <= 8; n++) {
23696 GemmMicrokernelTester()
23697 .mr(1)
23698 .nr(8)
23699 .kr(1)
23700 .sr(4)
23701 .m(1)
23702 .n(n)
23703 .k(4)
23704 .iterations(1)
23705 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23706 }
23707 }
23708
23709 TEST(F32_GEMMINC_1X8S4__SSE, k_lt_4) {
23710 TEST_REQUIRES_X86_SSE;
23711 for (size_t k = 1; k < 4; k++) {
23712 GemmMicrokernelTester()
23713 .mr(1)
23714 .nr(8)
23715 .kr(1)
23716 .sr(4)
23717 .m(1)
23718 .n(8)
23719 .k(k)
23720 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23721 }
23722 }
23723
23724 TEST(F32_GEMMINC_1X8S4__SSE, k_lt_4_strided_a) {
23725 TEST_REQUIRES_X86_SSE;
23726 for (size_t k = 1; k < 4; k++) {
23727 GemmMicrokernelTester()
23728 .mr(1)
23729 .nr(8)
23730 .kr(1)
23731 .sr(4)
23732 .m(1)
23733 .n(8)
23734 .k(k)
23735 .a_stride(7)
23736 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23737 }
23738 }
23739
23740 TEST(F32_GEMMINC_1X8S4__SSE, k_lt_4_subtile) {
23741 TEST_REQUIRES_X86_SSE;
23742 for (size_t k = 1; k < 4; k++) {
23743 for (uint32_t m = 1; m <= 1; m++) {
23744 for (uint32_t n = 1; n <= 8; n++) {
23745 GemmMicrokernelTester()
23746 .mr(1)
23747 .nr(8)
23748 .kr(1)
23749 .sr(4)
23750 .m(m)
23751 .n(n)
23752 .k(k)
23753 .iterations(1)
23754 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23755 }
23756 }
23757 }
23758 }
23759
23760 TEST(F32_GEMMINC_1X8S4__SSE, k_gt_4) {
23761 TEST_REQUIRES_X86_SSE;
23762 for (size_t k = 5; k < 8; k++) {
23763 GemmMicrokernelTester()
23764 .mr(1)
23765 .nr(8)
23766 .kr(1)
23767 .sr(4)
23768 .m(1)
23769 .n(8)
23770 .k(k)
23771 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23772 }
23773 }
23774
23775 TEST(F32_GEMMINC_1X8S4__SSE, k_gt_4_strided_a) {
23776 TEST_REQUIRES_X86_SSE;
23777 for (size_t k = 5; k < 8; k++) {
23778 GemmMicrokernelTester()
23779 .mr(1)
23780 .nr(8)
23781 .kr(1)
23782 .sr(4)
23783 .m(1)
23784 .n(8)
23785 .k(k)
23786 .a_stride(11)
23787 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23788 }
23789 }
23790
23791 TEST(F32_GEMMINC_1X8S4__SSE, k_gt_4_subtile) {
23792 TEST_REQUIRES_X86_SSE;
23793 for (size_t k = 5; k < 8; k++) {
23794 for (uint32_t m = 1; m <= 1; m++) {
23795 for (uint32_t n = 1; n <= 8; n++) {
23796 GemmMicrokernelTester()
23797 .mr(1)
23798 .nr(8)
23799 .kr(1)
23800 .sr(4)
23801 .m(m)
23802 .n(n)
23803 .k(k)
23804 .iterations(1)
23805 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23806 }
23807 }
23808 }
23809 }
23810
23811 TEST(F32_GEMMINC_1X8S4__SSE, k_div_4) {
23812 TEST_REQUIRES_X86_SSE;
23813 for (size_t k = 8; k <= 40; k += 4) {
23814 GemmMicrokernelTester()
23815 .mr(1)
23816 .nr(8)
23817 .kr(1)
23818 .sr(4)
23819 .m(1)
23820 .n(8)
23821 .k(k)
23822 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23823 }
23824 }
23825
23826 TEST(F32_GEMMINC_1X8S4__SSE, k_div_4_strided_a) {
23827 TEST_REQUIRES_X86_SSE;
23828 for (size_t k = 8; k <= 40; k += 4) {
23829 GemmMicrokernelTester()
23830 .mr(1)
23831 .nr(8)
23832 .kr(1)
23833 .sr(4)
23834 .m(1)
23835 .n(8)
23836 .k(k)
23837 .a_stride(43)
23838 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23839 }
23840 }
23841
23842 TEST(F32_GEMMINC_1X8S4__SSE, k_div_4_subtile) {
23843 TEST_REQUIRES_X86_SSE;
23844 for (size_t k = 8; k <= 40; k += 4) {
23845 for (uint32_t m = 1; m <= 1; m++) {
23846 for (uint32_t n = 1; n <= 8; n++) {
23847 GemmMicrokernelTester()
23848 .mr(1)
23849 .nr(8)
23850 .kr(1)
23851 .sr(4)
23852 .m(m)
23853 .n(n)
23854 .k(k)
23855 .iterations(1)
23856 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23857 }
23858 }
23859 }
23860 }
23861
23862 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8) {
23863 TEST_REQUIRES_X86_SSE;
23864 for (uint32_t n = 9; n < 16; n++) {
23865 for (size_t k = 1; k <= 20; k += 5) {
23866 GemmMicrokernelTester()
23867 .mr(1)
23868 .nr(8)
23869 .kr(1)
23870 .sr(4)
23871 .m(1)
23872 .n(8)
23873 .k(k)
23874 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23875 }
23876 }
23877 }
23878
23879 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8_strided_cn) {
23880 TEST_REQUIRES_X86_SSE;
23881 for (uint32_t n = 9; n < 16; n++) {
23882 for (size_t k = 1; k <= 20; k += 5) {
23883 GemmMicrokernelTester()
23884 .mr(1)
23885 .nr(8)
23886 .kr(1)
23887 .sr(4)
23888 .m(1)
23889 .n(8)
23890 .k(k)
23891 .cn_stride(11)
23892 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23893 }
23894 }
23895 }
23896
23897 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8_strided_a) {
23898 TEST_REQUIRES_X86_SSE;
23899 for (uint32_t n = 9; n < 16; n++) {
23900 for (size_t k = 1; k <= 20; k += 5) {
23901 GemmMicrokernelTester()
23902 .mr(1)
23903 .nr(8)
23904 .kr(1)
23905 .sr(4)
23906 .m(1)
23907 .n(n)
23908 .k(k)
23909 .a_stride(23)
23910 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23911 }
23912 }
23913 }
23914
23915 TEST(F32_GEMMINC_1X8S4__SSE, n_gt_8_subtile) {
23916 TEST_REQUIRES_X86_SSE;
23917 for (uint32_t n = 9; n < 16; n++) {
23918 for (size_t k = 1; k <= 20; k += 5) {
23919 for (uint32_t m = 1; m <= 1; m++) {
23920 GemmMicrokernelTester()
23921 .mr(1)
23922 .nr(8)
23923 .kr(1)
23924 .sr(4)
23925 .m(m)
23926 .n(n)
23927 .k(k)
23928 .iterations(1)
23929 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23930 }
23931 }
23932 }
23933 }
23934
23935 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8) {
23936 TEST_REQUIRES_X86_SSE;
23937 for (uint32_t n = 16; n <= 24; n += 8) {
23938 for (size_t k = 1; k <= 20; k += 5) {
23939 GemmMicrokernelTester()
23940 .mr(1)
23941 .nr(8)
23942 .kr(1)
23943 .sr(4)
23944 .m(1)
23945 .n(8)
23946 .k(k)
23947 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23948 }
23949 }
23950 }
23951
23952 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8_strided_cn) {
23953 TEST_REQUIRES_X86_SSE;
23954 for (uint32_t n = 16; n <= 24; n += 8) {
23955 for (size_t k = 1; k <= 20; k += 5) {
23956 GemmMicrokernelTester()
23957 .mr(1)
23958 .nr(8)
23959 .kr(1)
23960 .sr(4)
23961 .m(1)
23962 .n(n)
23963 .k(k)
23964 .cn_stride(11)
23965 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23966 }
23967 }
23968 }
23969
23970 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8_strided_a) {
23971 TEST_REQUIRES_X86_SSE;
23972 for (uint32_t n = 16; n <= 24; n += 8) {
23973 for (size_t k = 1; k <= 20; k += 5) {
23974 GemmMicrokernelTester()
23975 .mr(1)
23976 .nr(8)
23977 .kr(1)
23978 .sr(4)
23979 .m(1)
23980 .n(n)
23981 .k(k)
23982 .a_stride(23)
23983 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
23984 }
23985 }
23986 }
23987
23988 TEST(F32_GEMMINC_1X8S4__SSE, n_div_8_subtile) {
23989 TEST_REQUIRES_X86_SSE;
23990 for (uint32_t n = 16; n <= 24; n += 8) {
23991 for (size_t k = 1; k <= 20; k += 5) {
23992 for (uint32_t m = 1; m <= 1; m++) {
23993 GemmMicrokernelTester()
23994 .mr(1)
23995 .nr(8)
23996 .kr(1)
23997 .sr(4)
23998 .m(m)
23999 .n(n)
24000 .k(k)
24001 .iterations(1)
24002 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
24003 }
24004 }
24005 }
24006 }
24007
24008 TEST(F32_GEMMINC_1X8S4__SSE, strided_cm_subtile) {
24009 TEST_REQUIRES_X86_SSE;
24010 for (size_t k = 1; k <= 20; k += 5) {
24011 for (uint32_t m = 1; m <= 1; m++) {
24012 for (uint32_t n = 1; n <= 8; n++) {
24013 GemmMicrokernelTester()
24014 .mr(1)
24015 .nr(8)
24016 .kr(1)
24017 .sr(4)
24018 .m(m)
24019 .n(n)
24020 .k(k)
24021 .cm_stride(11)
24022 .iterations(1)
24023 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
24024 }
24025 }
24026 }
24027 }
24028
24029 TEST(F32_GEMMINC_1X8S4__SSE, qmin) {
24030 TEST_REQUIRES_X86_SSE;
24031 GemmMicrokernelTester()
24032 .mr(1)
24033 .nr(8)
24034 .kr(1)
24035 .sr(4)
24036 .m(1)
24037 .n(8)
24038 .k(4)
24039 .qmin(128)
24040 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
24041 }
24042
24043 TEST(F32_GEMMINC_1X8S4__SSE, qmax) {
24044 TEST_REQUIRES_X86_SSE;
24045 GemmMicrokernelTester()
24046 .mr(1)
24047 .nr(8)
24048 .kr(1)
24049 .sr(4)
24050 .m(1)
24051 .n(8)
24052 .k(4)
24053 .qmax(128)
24054 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
24055 }
24056
24057 TEST(F32_GEMMINC_1X8S4__SSE, strided_cm) {
24058 TEST_REQUIRES_X86_SSE;
24059 GemmMicrokernelTester()
24060 .mr(1)
24061 .nr(8)
24062 .kr(1)
24063 .sr(4)
24064 .m(1)
24065 .n(8)
24066 .k(4)
24067 .cm_stride(11)
24068 .Test(xnn_f32_gemminc_ukernel_1x8s4__sse);
24069 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024070#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024071
24072
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024073#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024074 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4) {
24075 TEST_REQUIRES_X86_SSE;
24076 GemmMicrokernelTester()
24077 .mr(4)
24078 .nr(8)
24079 .kr(1)
24080 .sr(4)
24081 .m(4)
24082 .n(8)
24083 .k(4)
24084 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24085 }
24086
24087 TEST(F32_GEMMINC_4X8S4__SSE, strided_cn) {
24088 TEST_REQUIRES_X86_SSE;
24089 GemmMicrokernelTester()
24090 .mr(4)
24091 .nr(8)
24092 .kr(1)
24093 .sr(4)
24094 .m(4)
24095 .n(8)
24096 .k(4)
24097 .cn_stride(11)
24098 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24099 }
24100
24101 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_strided_a) {
24102 TEST_REQUIRES_X86_SSE;
24103 GemmMicrokernelTester()
24104 .mr(4)
24105 .nr(8)
24106 .kr(1)
24107 .sr(4)
24108 .m(4)
24109 .n(8)
24110 .k(4)
24111 .a_stride(7)
24112 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24113 }
24114
24115 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_subtile) {
24116 TEST_REQUIRES_X86_SSE;
24117 for (uint32_t m = 1; m <= 4; m++) {
24118 for (uint32_t n = 1; n <= 8; n++) {
24119 GemmMicrokernelTester()
24120 .mr(4)
24121 .nr(8)
24122 .kr(1)
24123 .sr(4)
24124 .m(m)
24125 .n(n)
24126 .k(4)
24127 .iterations(1)
24128 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24129 }
24130 }
24131 }
24132
24133 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_subtile_m) {
24134 TEST_REQUIRES_X86_SSE;
24135 for (uint32_t m = 1; m <= 4; m++) {
24136 GemmMicrokernelTester()
24137 .mr(4)
24138 .nr(8)
24139 .kr(1)
24140 .sr(4)
24141 .m(m)
24142 .n(8)
24143 .k(4)
24144 .iterations(1)
24145 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24146 }
24147 }
24148
24149 TEST(F32_GEMMINC_4X8S4__SSE, k_eq_4_subtile_n) {
24150 TEST_REQUIRES_X86_SSE;
24151 for (uint32_t n = 1; n <= 8; n++) {
24152 GemmMicrokernelTester()
24153 .mr(4)
24154 .nr(8)
24155 .kr(1)
24156 .sr(4)
24157 .m(4)
24158 .n(n)
24159 .k(4)
24160 .iterations(1)
24161 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24162 }
24163 }
24164
24165 TEST(F32_GEMMINC_4X8S4__SSE, k_lt_4) {
24166 TEST_REQUIRES_X86_SSE;
24167 for (size_t k = 1; k < 4; k++) {
24168 GemmMicrokernelTester()
24169 .mr(4)
24170 .nr(8)
24171 .kr(1)
24172 .sr(4)
24173 .m(4)
24174 .n(8)
24175 .k(k)
24176 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24177 }
24178 }
24179
24180 TEST(F32_GEMMINC_4X8S4__SSE, k_lt_4_strided_a) {
24181 TEST_REQUIRES_X86_SSE;
24182 for (size_t k = 1; k < 4; k++) {
24183 GemmMicrokernelTester()
24184 .mr(4)
24185 .nr(8)
24186 .kr(1)
24187 .sr(4)
24188 .m(4)
24189 .n(8)
24190 .k(k)
24191 .a_stride(7)
24192 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24193 }
24194 }
24195
24196 TEST(F32_GEMMINC_4X8S4__SSE, k_lt_4_subtile) {
24197 TEST_REQUIRES_X86_SSE;
24198 for (size_t k = 1; k < 4; k++) {
24199 for (uint32_t m = 1; m <= 4; m++) {
24200 for (uint32_t n = 1; n <= 8; n++) {
24201 GemmMicrokernelTester()
24202 .mr(4)
24203 .nr(8)
24204 .kr(1)
24205 .sr(4)
24206 .m(m)
24207 .n(n)
24208 .k(k)
24209 .iterations(1)
24210 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24211 }
24212 }
24213 }
24214 }
24215
24216 TEST(F32_GEMMINC_4X8S4__SSE, k_gt_4) {
24217 TEST_REQUIRES_X86_SSE;
24218 for (size_t k = 5; k < 8; k++) {
24219 GemmMicrokernelTester()
24220 .mr(4)
24221 .nr(8)
24222 .kr(1)
24223 .sr(4)
24224 .m(4)
24225 .n(8)
24226 .k(k)
24227 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24228 }
24229 }
24230
24231 TEST(F32_GEMMINC_4X8S4__SSE, k_gt_4_strided_a) {
24232 TEST_REQUIRES_X86_SSE;
24233 for (size_t k = 5; k < 8; k++) {
24234 GemmMicrokernelTester()
24235 .mr(4)
24236 .nr(8)
24237 .kr(1)
24238 .sr(4)
24239 .m(4)
24240 .n(8)
24241 .k(k)
24242 .a_stride(11)
24243 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24244 }
24245 }
24246
24247 TEST(F32_GEMMINC_4X8S4__SSE, k_gt_4_subtile) {
24248 TEST_REQUIRES_X86_SSE;
24249 for (size_t k = 5; k < 8; k++) {
24250 for (uint32_t m = 1; m <= 4; m++) {
24251 for (uint32_t n = 1; n <= 8; n++) {
24252 GemmMicrokernelTester()
24253 .mr(4)
24254 .nr(8)
24255 .kr(1)
24256 .sr(4)
24257 .m(m)
24258 .n(n)
24259 .k(k)
24260 .iterations(1)
24261 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24262 }
24263 }
24264 }
24265 }
24266
24267 TEST(F32_GEMMINC_4X8S4__SSE, k_div_4) {
24268 TEST_REQUIRES_X86_SSE;
24269 for (size_t k = 8; k <= 40; k += 4) {
24270 GemmMicrokernelTester()
24271 .mr(4)
24272 .nr(8)
24273 .kr(1)
24274 .sr(4)
24275 .m(4)
24276 .n(8)
24277 .k(k)
24278 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24279 }
24280 }
24281
24282 TEST(F32_GEMMINC_4X8S4__SSE, k_div_4_strided_a) {
24283 TEST_REQUIRES_X86_SSE;
24284 for (size_t k = 8; k <= 40; k += 4) {
24285 GemmMicrokernelTester()
24286 .mr(4)
24287 .nr(8)
24288 .kr(1)
24289 .sr(4)
24290 .m(4)
24291 .n(8)
24292 .k(k)
24293 .a_stride(43)
24294 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24295 }
24296 }
24297
24298 TEST(F32_GEMMINC_4X8S4__SSE, k_div_4_subtile) {
24299 TEST_REQUIRES_X86_SSE;
24300 for (size_t k = 8; k <= 40; k += 4) {
24301 for (uint32_t m = 1; m <= 4; m++) {
24302 for (uint32_t n = 1; n <= 8; n++) {
24303 GemmMicrokernelTester()
24304 .mr(4)
24305 .nr(8)
24306 .kr(1)
24307 .sr(4)
24308 .m(m)
24309 .n(n)
24310 .k(k)
24311 .iterations(1)
24312 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24313 }
24314 }
24315 }
24316 }
24317
24318 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8) {
24319 TEST_REQUIRES_X86_SSE;
24320 for (uint32_t n = 9; n < 16; n++) {
24321 for (size_t k = 1; k <= 20; k += 5) {
24322 GemmMicrokernelTester()
24323 .mr(4)
24324 .nr(8)
24325 .kr(1)
24326 .sr(4)
24327 .m(4)
24328 .n(8)
24329 .k(k)
24330 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24331 }
24332 }
24333 }
24334
24335 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8_strided_cn) {
24336 TEST_REQUIRES_X86_SSE;
24337 for (uint32_t n = 9; n < 16; n++) {
24338 for (size_t k = 1; k <= 20; k += 5) {
24339 GemmMicrokernelTester()
24340 .mr(4)
24341 .nr(8)
24342 .kr(1)
24343 .sr(4)
24344 .m(4)
24345 .n(8)
24346 .k(k)
24347 .cn_stride(11)
24348 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24349 }
24350 }
24351 }
24352
24353 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8_strided_a) {
24354 TEST_REQUIRES_X86_SSE;
24355 for (uint32_t n = 9; n < 16; n++) {
24356 for (size_t k = 1; k <= 20; k += 5) {
24357 GemmMicrokernelTester()
24358 .mr(4)
24359 .nr(8)
24360 .kr(1)
24361 .sr(4)
24362 .m(4)
24363 .n(n)
24364 .k(k)
24365 .a_stride(23)
24366 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24367 }
24368 }
24369 }
24370
24371 TEST(F32_GEMMINC_4X8S4__SSE, n_gt_8_subtile) {
24372 TEST_REQUIRES_X86_SSE;
24373 for (uint32_t n = 9; n < 16; n++) {
24374 for (size_t k = 1; k <= 20; k += 5) {
24375 for (uint32_t m = 1; m <= 4; m++) {
24376 GemmMicrokernelTester()
24377 .mr(4)
24378 .nr(8)
24379 .kr(1)
24380 .sr(4)
24381 .m(m)
24382 .n(n)
24383 .k(k)
24384 .iterations(1)
24385 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24386 }
24387 }
24388 }
24389 }
24390
24391 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8) {
24392 TEST_REQUIRES_X86_SSE;
24393 for (uint32_t n = 16; n <= 24; n += 8) {
24394 for (size_t k = 1; k <= 20; k += 5) {
24395 GemmMicrokernelTester()
24396 .mr(4)
24397 .nr(8)
24398 .kr(1)
24399 .sr(4)
24400 .m(4)
24401 .n(8)
24402 .k(k)
24403 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24404 }
24405 }
24406 }
24407
24408 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8_strided_cn) {
24409 TEST_REQUIRES_X86_SSE;
24410 for (uint32_t n = 16; n <= 24; n += 8) {
24411 for (size_t k = 1; k <= 20; k += 5) {
24412 GemmMicrokernelTester()
24413 .mr(4)
24414 .nr(8)
24415 .kr(1)
24416 .sr(4)
24417 .m(4)
24418 .n(n)
24419 .k(k)
24420 .cn_stride(11)
24421 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24422 }
24423 }
24424 }
24425
24426 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8_strided_a) {
24427 TEST_REQUIRES_X86_SSE;
24428 for (uint32_t n = 16; n <= 24; n += 8) {
24429 for (size_t k = 1; k <= 20; k += 5) {
24430 GemmMicrokernelTester()
24431 .mr(4)
24432 .nr(8)
24433 .kr(1)
24434 .sr(4)
24435 .m(4)
24436 .n(n)
24437 .k(k)
24438 .a_stride(23)
24439 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24440 }
24441 }
24442 }
24443
24444 TEST(F32_GEMMINC_4X8S4__SSE, n_div_8_subtile) {
24445 TEST_REQUIRES_X86_SSE;
24446 for (uint32_t n = 16; n <= 24; n += 8) {
24447 for (size_t k = 1; k <= 20; k += 5) {
24448 for (uint32_t m = 1; m <= 4; m++) {
24449 GemmMicrokernelTester()
24450 .mr(4)
24451 .nr(8)
24452 .kr(1)
24453 .sr(4)
24454 .m(m)
24455 .n(n)
24456 .k(k)
24457 .iterations(1)
24458 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24459 }
24460 }
24461 }
24462 }
24463
24464 TEST(F32_GEMMINC_4X8S4__SSE, strided_cm_subtile) {
24465 TEST_REQUIRES_X86_SSE;
24466 for (size_t k = 1; k <= 20; k += 5) {
24467 for (uint32_t m = 1; m <= 4; m++) {
24468 for (uint32_t n = 1; n <= 8; n++) {
24469 GemmMicrokernelTester()
24470 .mr(4)
24471 .nr(8)
24472 .kr(1)
24473 .sr(4)
24474 .m(m)
24475 .n(n)
24476 .k(k)
24477 .cm_stride(11)
24478 .iterations(1)
24479 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24480 }
24481 }
24482 }
24483 }
24484
24485 TEST(F32_GEMMINC_4X8S4__SSE, qmin) {
24486 TEST_REQUIRES_X86_SSE;
24487 GemmMicrokernelTester()
24488 .mr(4)
24489 .nr(8)
24490 .kr(1)
24491 .sr(4)
24492 .m(4)
24493 .n(8)
24494 .k(4)
24495 .qmin(128)
24496 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24497 }
24498
24499 TEST(F32_GEMMINC_4X8S4__SSE, qmax) {
24500 TEST_REQUIRES_X86_SSE;
24501 GemmMicrokernelTester()
24502 .mr(4)
24503 .nr(8)
24504 .kr(1)
24505 .sr(4)
24506 .m(4)
24507 .n(8)
24508 .k(4)
24509 .qmax(128)
24510 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24511 }
24512
24513 TEST(F32_GEMMINC_4X8S4__SSE, strided_cm) {
24514 TEST_REQUIRES_X86_SSE;
24515 GemmMicrokernelTester()
24516 .mr(4)
24517 .nr(8)
24518 .kr(1)
24519 .sr(4)
24520 .m(4)
24521 .n(8)
24522 .k(4)
24523 .cm_stride(11)
24524 .Test(xnn_f32_gemminc_ukernel_4x8s4__sse);
24525 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024526#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024527
24528
Marat Dukhanfda12b82019-11-21 12:27:59 -080024529#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24530 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1) {
24531 TEST_REQUIRES_X86_AVX;
24532 GemmMicrokernelTester()
24533 .mr(1)
24534 .nr(8)
24535 .kr(1)
24536 .sr(1)
24537 .m(1)
24538 .n(8)
24539 .k(1)
24540 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24541 }
24542
24543 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, strided_cn) {
24544 TEST_REQUIRES_X86_AVX;
24545 GemmMicrokernelTester()
24546 .mr(1)
24547 .nr(8)
24548 .kr(1)
24549 .sr(1)
24550 .m(1)
24551 .n(8)
24552 .k(1)
24553 .cn_stride(11)
24554 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24555 }
24556
24557 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_strided_a) {
24558 TEST_REQUIRES_X86_AVX;
24559 GemmMicrokernelTester()
24560 .mr(1)
24561 .nr(8)
24562 .kr(1)
24563 .sr(1)
24564 .m(1)
24565 .n(8)
24566 .k(1)
24567 .a_stride(3)
24568 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24569 }
24570
24571 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_subtile) {
24572 TEST_REQUIRES_X86_AVX;
24573 for (uint32_t m = 1; m <= 1; m++) {
24574 for (uint32_t n = 1; n <= 8; n++) {
24575 GemmMicrokernelTester()
24576 .mr(1)
24577 .nr(8)
24578 .kr(1)
24579 .sr(1)
24580 .m(m)
24581 .n(n)
24582 .k(1)
24583 .iterations(1)
24584 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24585 }
24586 }
24587 }
24588
24589 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
24590 TEST_REQUIRES_X86_AVX;
24591 for (uint32_t m = 1; m <= 1; m++) {
24592 GemmMicrokernelTester()
24593 .mr(1)
24594 .nr(8)
24595 .kr(1)
24596 .sr(1)
24597 .m(m)
24598 .n(8)
24599 .k(1)
24600 .iterations(1)
24601 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24602 }
24603 }
24604
24605 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
24606 TEST_REQUIRES_X86_AVX;
24607 for (uint32_t n = 1; n <= 8; n++) {
24608 GemmMicrokernelTester()
24609 .mr(1)
24610 .nr(8)
24611 .kr(1)
24612 .sr(1)
24613 .m(1)
24614 .n(n)
24615 .k(1)
24616 .iterations(1)
24617 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24618 }
24619 }
24620
24621 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_gt_1) {
24622 TEST_REQUIRES_X86_AVX;
24623 for (size_t k = 2; k < 10; k++) {
24624 GemmMicrokernelTester()
24625 .mr(1)
24626 .nr(8)
24627 .kr(1)
24628 .sr(1)
24629 .m(1)
24630 .n(8)
24631 .k(k)
24632 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24633 }
24634 }
24635
24636 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_gt_1_strided_a) {
24637 TEST_REQUIRES_X86_AVX;
24638 for (size_t k = 2; k < 10; k++) {
24639 GemmMicrokernelTester()
24640 .mr(1)
24641 .nr(8)
24642 .kr(1)
24643 .sr(1)
24644 .m(1)
24645 .n(8)
24646 .k(k)
24647 .a_stride(11)
24648 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24649 }
24650 }
24651
24652 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, k_gt_1_subtile) {
24653 TEST_REQUIRES_X86_AVX;
24654 for (size_t k = 2; k < 10; k++) {
24655 for (uint32_t m = 1; m <= 1; m++) {
24656 for (uint32_t n = 1; n <= 8; n++) {
24657 GemmMicrokernelTester()
24658 .mr(1)
24659 .nr(8)
24660 .kr(1)
24661 .sr(1)
24662 .m(m)
24663 .n(n)
24664 .k(k)
24665 .iterations(1)
24666 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24667 }
24668 }
24669 }
24670 }
24671
24672 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8) {
24673 TEST_REQUIRES_X86_AVX;
24674 for (uint32_t n = 9; n < 16; n++) {
24675 for (size_t k = 1; k <= 5; k += 2) {
24676 GemmMicrokernelTester()
24677 .mr(1)
24678 .nr(8)
24679 .kr(1)
24680 .sr(1)
24681 .m(1)
24682 .n(8)
24683 .k(k)
24684 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24685 }
24686 }
24687 }
24688
24689 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
24690 TEST_REQUIRES_X86_AVX;
24691 for (uint32_t n = 9; n < 16; n++) {
24692 for (size_t k = 1; k <= 5; k += 2) {
24693 GemmMicrokernelTester()
24694 .mr(1)
24695 .nr(8)
24696 .kr(1)
24697 .sr(1)
24698 .m(1)
24699 .n(8)
24700 .k(k)
24701 .cn_stride(11)
24702 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24703 }
24704 }
24705 }
24706
24707 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8_strided_a) {
24708 TEST_REQUIRES_X86_AVX;
24709 for (uint32_t n = 9; n < 16; n++) {
24710 for (size_t k = 1; k <= 5; k += 2) {
24711 GemmMicrokernelTester()
24712 .mr(1)
24713 .nr(8)
24714 .kr(1)
24715 .sr(1)
24716 .m(1)
24717 .n(n)
24718 .k(k)
24719 .a_stride(7)
24720 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24721 }
24722 }
24723 }
24724
24725 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_gt_8_subtile) {
24726 TEST_REQUIRES_X86_AVX;
24727 for (uint32_t n = 9; n < 16; n++) {
24728 for (size_t k = 1; k <= 5; k += 2) {
24729 for (uint32_t m = 1; m <= 1; m++) {
24730 GemmMicrokernelTester()
24731 .mr(1)
24732 .nr(8)
24733 .kr(1)
24734 .sr(1)
24735 .m(m)
24736 .n(n)
24737 .k(k)
24738 .iterations(1)
24739 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24740 }
24741 }
24742 }
24743 }
24744
24745 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8) {
24746 TEST_REQUIRES_X86_AVX;
24747 for (uint32_t n = 16; n <= 24; n += 8) {
24748 for (size_t k = 1; k <= 5; k += 2) {
24749 GemmMicrokernelTester()
24750 .mr(1)
24751 .nr(8)
24752 .kr(1)
24753 .sr(1)
24754 .m(1)
24755 .n(8)
24756 .k(k)
24757 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24758 }
24759 }
24760 }
24761
24762 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
24763 TEST_REQUIRES_X86_AVX;
24764 for (uint32_t n = 16; n <= 24; n += 8) {
24765 for (size_t k = 1; k <= 5; k += 2) {
24766 GemmMicrokernelTester()
24767 .mr(1)
24768 .nr(8)
24769 .kr(1)
24770 .sr(1)
24771 .m(1)
24772 .n(n)
24773 .k(k)
24774 .cn_stride(11)
24775 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24776 }
24777 }
24778 }
24779
24780 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8_strided_a) {
24781 TEST_REQUIRES_X86_AVX;
24782 for (uint32_t n = 16; n <= 24; n += 8) {
24783 for (size_t k = 1; k <= 5; k += 2) {
24784 GemmMicrokernelTester()
24785 .mr(1)
24786 .nr(8)
24787 .kr(1)
24788 .sr(1)
24789 .m(1)
24790 .n(n)
24791 .k(k)
24792 .a_stride(7)
24793 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24794 }
24795 }
24796 }
24797
24798 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, n_div_8_subtile) {
24799 TEST_REQUIRES_X86_AVX;
24800 for (uint32_t n = 16; n <= 24; n += 8) {
24801 for (size_t k = 1; k <= 5; k += 2) {
24802 for (uint32_t m = 1; m <= 1; m++) {
24803 GemmMicrokernelTester()
24804 .mr(1)
24805 .nr(8)
24806 .kr(1)
24807 .sr(1)
24808 .m(m)
24809 .n(n)
24810 .k(k)
24811 .iterations(1)
24812 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24813 }
24814 }
24815 }
24816 }
24817
24818 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, strided_cm_subtile) {
24819 TEST_REQUIRES_X86_AVX;
24820 for (size_t k = 1; k <= 5; k += 2) {
24821 for (uint32_t m = 1; m <= 1; m++) {
24822 for (uint32_t n = 1; n <= 8; n++) {
24823 GemmMicrokernelTester()
24824 .mr(1)
24825 .nr(8)
24826 .kr(1)
24827 .sr(1)
24828 .m(m)
24829 .n(n)
24830 .k(k)
24831 .cm_stride(11)
24832 .iterations(1)
24833 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24834 }
24835 }
24836 }
24837 }
24838
24839 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, qmin) {
24840 TEST_REQUIRES_X86_AVX;
24841 GemmMicrokernelTester()
24842 .mr(1)
24843 .nr(8)
24844 .kr(1)
24845 .sr(1)
24846 .m(1)
24847 .n(8)
24848 .k(1)
24849 .qmin(128)
24850 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24851 }
24852
24853 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, qmax) {
24854 TEST_REQUIRES_X86_AVX;
24855 GemmMicrokernelTester()
24856 .mr(1)
24857 .nr(8)
24858 .kr(1)
24859 .sr(1)
24860 .m(1)
24861 .n(8)
24862 .k(1)
24863 .qmax(128)
24864 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24865 }
24866
24867 TEST(F32_GEMMINC_1X8__AVX_BROADCAST, strided_cm) {
24868 TEST_REQUIRES_X86_AVX;
24869 GemmMicrokernelTester()
24870 .mr(1)
24871 .nr(8)
24872 .kr(1)
24873 .sr(1)
24874 .m(1)
24875 .n(8)
24876 .k(1)
24877 .cm_stride(11)
24878 .Test(xnn_f32_gemminc_ukernel_1x8__avx_broadcast);
24879 }
24880#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
24881
24882
24883#if XNN_ARCH_X86 || XNN_ARCH_X86_64
24884 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1) {
24885 TEST_REQUIRES_X86_AVX;
24886 GemmMicrokernelTester()
24887 .mr(4)
24888 .nr(8)
24889 .kr(1)
24890 .sr(1)
24891 .m(4)
24892 .n(8)
24893 .k(1)
24894 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24895 }
24896
24897 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, strided_cn) {
24898 TEST_REQUIRES_X86_AVX;
24899 GemmMicrokernelTester()
24900 .mr(4)
24901 .nr(8)
24902 .kr(1)
24903 .sr(1)
24904 .m(4)
24905 .n(8)
24906 .k(1)
24907 .cn_stride(11)
24908 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24909 }
24910
24911 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_strided_a) {
24912 TEST_REQUIRES_X86_AVX;
24913 GemmMicrokernelTester()
24914 .mr(4)
24915 .nr(8)
24916 .kr(1)
24917 .sr(1)
24918 .m(4)
24919 .n(8)
24920 .k(1)
24921 .a_stride(3)
24922 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24923 }
24924
24925 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_subtile) {
24926 TEST_REQUIRES_X86_AVX;
24927 for (uint32_t m = 1; m <= 4; m++) {
24928 for (uint32_t n = 1; n <= 8; n++) {
24929 GemmMicrokernelTester()
24930 .mr(4)
24931 .nr(8)
24932 .kr(1)
24933 .sr(1)
24934 .m(m)
24935 .n(n)
24936 .k(1)
24937 .iterations(1)
24938 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24939 }
24940 }
24941 }
24942
24943 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_subtile_m) {
24944 TEST_REQUIRES_X86_AVX;
24945 for (uint32_t m = 1; m <= 4; m++) {
24946 GemmMicrokernelTester()
24947 .mr(4)
24948 .nr(8)
24949 .kr(1)
24950 .sr(1)
24951 .m(m)
24952 .n(8)
24953 .k(1)
24954 .iterations(1)
24955 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24956 }
24957 }
24958
24959 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_eq_1_subtile_n) {
24960 TEST_REQUIRES_X86_AVX;
24961 for (uint32_t n = 1; n <= 8; n++) {
24962 GemmMicrokernelTester()
24963 .mr(4)
24964 .nr(8)
24965 .kr(1)
24966 .sr(1)
24967 .m(4)
24968 .n(n)
24969 .k(1)
24970 .iterations(1)
24971 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24972 }
24973 }
24974
24975 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_gt_1) {
24976 TEST_REQUIRES_X86_AVX;
24977 for (size_t k = 2; k < 10; k++) {
24978 GemmMicrokernelTester()
24979 .mr(4)
24980 .nr(8)
24981 .kr(1)
24982 .sr(1)
24983 .m(4)
24984 .n(8)
24985 .k(k)
24986 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
24987 }
24988 }
24989
24990 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_gt_1_strided_a) {
24991 TEST_REQUIRES_X86_AVX;
24992 for (size_t k = 2; k < 10; k++) {
24993 GemmMicrokernelTester()
24994 .mr(4)
24995 .nr(8)
24996 .kr(1)
24997 .sr(1)
24998 .m(4)
24999 .n(8)
25000 .k(k)
25001 .a_stride(11)
25002 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25003 }
25004 }
25005
25006 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, k_gt_1_subtile) {
25007 TEST_REQUIRES_X86_AVX;
25008 for (size_t k = 2; k < 10; k++) {
25009 for (uint32_t m = 1; m <= 4; m++) {
25010 for (uint32_t n = 1; n <= 8; n++) {
25011 GemmMicrokernelTester()
25012 .mr(4)
25013 .nr(8)
25014 .kr(1)
25015 .sr(1)
25016 .m(m)
25017 .n(n)
25018 .k(k)
25019 .iterations(1)
25020 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25021 }
25022 }
25023 }
25024 }
25025
25026 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8) {
25027 TEST_REQUIRES_X86_AVX;
25028 for (uint32_t n = 9; n < 16; n++) {
25029 for (size_t k = 1; k <= 5; k += 2) {
25030 GemmMicrokernelTester()
25031 .mr(4)
25032 .nr(8)
25033 .kr(1)
25034 .sr(1)
25035 .m(4)
25036 .n(8)
25037 .k(k)
25038 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25039 }
25040 }
25041 }
25042
25043 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8_strided_cn) {
25044 TEST_REQUIRES_X86_AVX;
25045 for (uint32_t n = 9; n < 16; n++) {
25046 for (size_t k = 1; k <= 5; k += 2) {
25047 GemmMicrokernelTester()
25048 .mr(4)
25049 .nr(8)
25050 .kr(1)
25051 .sr(1)
25052 .m(4)
25053 .n(8)
25054 .k(k)
25055 .cn_stride(11)
25056 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25057 }
25058 }
25059 }
25060
25061 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8_strided_a) {
25062 TEST_REQUIRES_X86_AVX;
25063 for (uint32_t n = 9; n < 16; n++) {
25064 for (size_t k = 1; k <= 5; k += 2) {
25065 GemmMicrokernelTester()
25066 .mr(4)
25067 .nr(8)
25068 .kr(1)
25069 .sr(1)
25070 .m(4)
25071 .n(n)
25072 .k(k)
25073 .a_stride(7)
25074 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25075 }
25076 }
25077 }
25078
25079 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_gt_8_subtile) {
25080 TEST_REQUIRES_X86_AVX;
25081 for (uint32_t n = 9; n < 16; n++) {
25082 for (size_t k = 1; k <= 5; k += 2) {
25083 for (uint32_t m = 1; m <= 4; m++) {
25084 GemmMicrokernelTester()
25085 .mr(4)
25086 .nr(8)
25087 .kr(1)
25088 .sr(1)
25089 .m(m)
25090 .n(n)
25091 .k(k)
25092 .iterations(1)
25093 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25094 }
25095 }
25096 }
25097 }
25098
25099 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8) {
25100 TEST_REQUIRES_X86_AVX;
25101 for (uint32_t n = 16; n <= 24; n += 8) {
25102 for (size_t k = 1; k <= 5; k += 2) {
25103 GemmMicrokernelTester()
25104 .mr(4)
25105 .nr(8)
25106 .kr(1)
25107 .sr(1)
25108 .m(4)
25109 .n(8)
25110 .k(k)
25111 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25112 }
25113 }
25114 }
25115
25116 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8_strided_cn) {
25117 TEST_REQUIRES_X86_AVX;
25118 for (uint32_t n = 16; n <= 24; n += 8) {
25119 for (size_t k = 1; k <= 5; k += 2) {
25120 GemmMicrokernelTester()
25121 .mr(4)
25122 .nr(8)
25123 .kr(1)
25124 .sr(1)
25125 .m(4)
25126 .n(n)
25127 .k(k)
25128 .cn_stride(11)
25129 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25130 }
25131 }
25132 }
25133
25134 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8_strided_a) {
25135 TEST_REQUIRES_X86_AVX;
25136 for (uint32_t n = 16; n <= 24; n += 8) {
25137 for (size_t k = 1; k <= 5; k += 2) {
25138 GemmMicrokernelTester()
25139 .mr(4)
25140 .nr(8)
25141 .kr(1)
25142 .sr(1)
25143 .m(4)
25144 .n(n)
25145 .k(k)
25146 .a_stride(7)
25147 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25148 }
25149 }
25150 }
25151
25152 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, n_div_8_subtile) {
25153 TEST_REQUIRES_X86_AVX;
25154 for (uint32_t n = 16; n <= 24; n += 8) {
25155 for (size_t k = 1; k <= 5; k += 2) {
25156 for (uint32_t m = 1; m <= 4; m++) {
25157 GemmMicrokernelTester()
25158 .mr(4)
25159 .nr(8)
25160 .kr(1)
25161 .sr(1)
25162 .m(m)
25163 .n(n)
25164 .k(k)
25165 .iterations(1)
25166 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25167 }
25168 }
25169 }
25170 }
25171
25172 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, strided_cm_subtile) {
25173 TEST_REQUIRES_X86_AVX;
25174 for (size_t k = 1; k <= 5; k += 2) {
25175 for (uint32_t m = 1; m <= 4; m++) {
25176 for (uint32_t n = 1; n <= 8; n++) {
25177 GemmMicrokernelTester()
25178 .mr(4)
25179 .nr(8)
25180 .kr(1)
25181 .sr(1)
25182 .m(m)
25183 .n(n)
25184 .k(k)
25185 .cm_stride(11)
25186 .iterations(1)
25187 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25188 }
25189 }
25190 }
25191 }
25192
25193 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, qmin) {
25194 TEST_REQUIRES_X86_AVX;
25195 GemmMicrokernelTester()
25196 .mr(4)
25197 .nr(8)
25198 .kr(1)
25199 .sr(1)
25200 .m(4)
25201 .n(8)
25202 .k(1)
25203 .qmin(128)
25204 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25205 }
25206
25207 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, qmax) {
25208 TEST_REQUIRES_X86_AVX;
25209 GemmMicrokernelTester()
25210 .mr(4)
25211 .nr(8)
25212 .kr(1)
25213 .sr(1)
25214 .m(4)
25215 .n(8)
25216 .k(1)
25217 .qmax(128)
25218 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25219 }
25220
25221 TEST(F32_GEMMINC_4X8__AVX_BROADCAST, strided_cm) {
25222 TEST_REQUIRES_X86_AVX;
25223 GemmMicrokernelTester()
25224 .mr(4)
25225 .nr(8)
25226 .kr(1)
25227 .sr(1)
25228 .m(4)
25229 .n(8)
25230 .k(1)
25231 .cm_stride(11)
25232 .Test(xnn_f32_gemminc_ukernel_4x8__avx_broadcast);
25233 }
25234#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25235
25236
25237#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25238 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1) {
25239 TEST_REQUIRES_X86_AVX;
25240 GemmMicrokernelTester()
25241 .mr(5)
25242 .nr(8)
25243 .kr(1)
25244 .sr(1)
25245 .m(5)
25246 .n(8)
25247 .k(1)
25248 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25249 }
25250
25251 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, strided_cn) {
25252 TEST_REQUIRES_X86_AVX;
25253 GemmMicrokernelTester()
25254 .mr(5)
25255 .nr(8)
25256 .kr(1)
25257 .sr(1)
25258 .m(5)
25259 .n(8)
25260 .k(1)
25261 .cn_stride(11)
25262 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25263 }
25264
25265 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_strided_a) {
25266 TEST_REQUIRES_X86_AVX;
25267 GemmMicrokernelTester()
25268 .mr(5)
25269 .nr(8)
25270 .kr(1)
25271 .sr(1)
25272 .m(5)
25273 .n(8)
25274 .k(1)
25275 .a_stride(3)
25276 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25277 }
25278
25279 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_subtile) {
25280 TEST_REQUIRES_X86_AVX;
25281 for (uint32_t m = 1; m <= 5; m++) {
25282 for (uint32_t n = 1; n <= 8; n++) {
25283 GemmMicrokernelTester()
25284 .mr(5)
25285 .nr(8)
25286 .kr(1)
25287 .sr(1)
25288 .m(m)
25289 .n(n)
25290 .k(1)
25291 .iterations(1)
25292 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25293 }
25294 }
25295 }
25296
25297 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
25298 TEST_REQUIRES_X86_AVX;
25299 for (uint32_t m = 1; m <= 5; m++) {
25300 GemmMicrokernelTester()
25301 .mr(5)
25302 .nr(8)
25303 .kr(1)
25304 .sr(1)
25305 .m(m)
25306 .n(8)
25307 .k(1)
25308 .iterations(1)
25309 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25310 }
25311 }
25312
25313 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
25314 TEST_REQUIRES_X86_AVX;
25315 for (uint32_t n = 1; n <= 8; n++) {
25316 GemmMicrokernelTester()
25317 .mr(5)
25318 .nr(8)
25319 .kr(1)
25320 .sr(1)
25321 .m(5)
25322 .n(n)
25323 .k(1)
25324 .iterations(1)
25325 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25326 }
25327 }
25328
25329 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_gt_1) {
25330 TEST_REQUIRES_X86_AVX;
25331 for (size_t k = 2; k < 10; k++) {
25332 GemmMicrokernelTester()
25333 .mr(5)
25334 .nr(8)
25335 .kr(1)
25336 .sr(1)
25337 .m(5)
25338 .n(8)
25339 .k(k)
25340 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25341 }
25342 }
25343
25344 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_gt_1_strided_a) {
25345 TEST_REQUIRES_X86_AVX;
25346 for (size_t k = 2; k < 10; k++) {
25347 GemmMicrokernelTester()
25348 .mr(5)
25349 .nr(8)
25350 .kr(1)
25351 .sr(1)
25352 .m(5)
25353 .n(8)
25354 .k(k)
25355 .a_stride(11)
25356 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25357 }
25358 }
25359
25360 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, k_gt_1_subtile) {
25361 TEST_REQUIRES_X86_AVX;
25362 for (size_t k = 2; k < 10; k++) {
25363 for (uint32_t m = 1; m <= 5; m++) {
25364 for (uint32_t n = 1; n <= 8; n++) {
25365 GemmMicrokernelTester()
25366 .mr(5)
25367 .nr(8)
25368 .kr(1)
25369 .sr(1)
25370 .m(m)
25371 .n(n)
25372 .k(k)
25373 .iterations(1)
25374 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25375 }
25376 }
25377 }
25378 }
25379
25380 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8) {
25381 TEST_REQUIRES_X86_AVX;
25382 for (uint32_t n = 9; n < 16; n++) {
25383 for (size_t k = 1; k <= 5; k += 2) {
25384 GemmMicrokernelTester()
25385 .mr(5)
25386 .nr(8)
25387 .kr(1)
25388 .sr(1)
25389 .m(5)
25390 .n(8)
25391 .k(k)
25392 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25393 }
25394 }
25395 }
25396
25397 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
25398 TEST_REQUIRES_X86_AVX;
25399 for (uint32_t n = 9; n < 16; n++) {
25400 for (size_t k = 1; k <= 5; k += 2) {
25401 GemmMicrokernelTester()
25402 .mr(5)
25403 .nr(8)
25404 .kr(1)
25405 .sr(1)
25406 .m(5)
25407 .n(8)
25408 .k(k)
25409 .cn_stride(11)
25410 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25411 }
25412 }
25413 }
25414
25415 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8_strided_a) {
25416 TEST_REQUIRES_X86_AVX;
25417 for (uint32_t n = 9; n < 16; n++) {
25418 for (size_t k = 1; k <= 5; k += 2) {
25419 GemmMicrokernelTester()
25420 .mr(5)
25421 .nr(8)
25422 .kr(1)
25423 .sr(1)
25424 .m(5)
25425 .n(n)
25426 .k(k)
25427 .a_stride(7)
25428 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25429 }
25430 }
25431 }
25432
25433 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_gt_8_subtile) {
25434 TEST_REQUIRES_X86_AVX;
25435 for (uint32_t n = 9; n < 16; n++) {
25436 for (size_t k = 1; k <= 5; k += 2) {
25437 for (uint32_t m = 1; m <= 5; m++) {
25438 GemmMicrokernelTester()
25439 .mr(5)
25440 .nr(8)
25441 .kr(1)
25442 .sr(1)
25443 .m(m)
25444 .n(n)
25445 .k(k)
25446 .iterations(1)
25447 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25448 }
25449 }
25450 }
25451 }
25452
25453 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8) {
25454 TEST_REQUIRES_X86_AVX;
25455 for (uint32_t n = 16; n <= 24; n += 8) {
25456 for (size_t k = 1; k <= 5; k += 2) {
25457 GemmMicrokernelTester()
25458 .mr(5)
25459 .nr(8)
25460 .kr(1)
25461 .sr(1)
25462 .m(5)
25463 .n(8)
25464 .k(k)
25465 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25466 }
25467 }
25468 }
25469
25470 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
25471 TEST_REQUIRES_X86_AVX;
25472 for (uint32_t n = 16; n <= 24; n += 8) {
25473 for (size_t k = 1; k <= 5; k += 2) {
25474 GemmMicrokernelTester()
25475 .mr(5)
25476 .nr(8)
25477 .kr(1)
25478 .sr(1)
25479 .m(5)
25480 .n(n)
25481 .k(k)
25482 .cn_stride(11)
25483 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25484 }
25485 }
25486 }
25487
25488 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8_strided_a) {
25489 TEST_REQUIRES_X86_AVX;
25490 for (uint32_t n = 16; n <= 24; n += 8) {
25491 for (size_t k = 1; k <= 5; k += 2) {
25492 GemmMicrokernelTester()
25493 .mr(5)
25494 .nr(8)
25495 .kr(1)
25496 .sr(1)
25497 .m(5)
25498 .n(n)
25499 .k(k)
25500 .a_stride(7)
25501 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25502 }
25503 }
25504 }
25505
25506 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, n_div_8_subtile) {
25507 TEST_REQUIRES_X86_AVX;
25508 for (uint32_t n = 16; n <= 24; n += 8) {
25509 for (size_t k = 1; k <= 5; k += 2) {
25510 for (uint32_t m = 1; m <= 5; m++) {
25511 GemmMicrokernelTester()
25512 .mr(5)
25513 .nr(8)
25514 .kr(1)
25515 .sr(1)
25516 .m(m)
25517 .n(n)
25518 .k(k)
25519 .iterations(1)
25520 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25521 }
25522 }
25523 }
25524 }
25525
25526 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, strided_cm_subtile) {
25527 TEST_REQUIRES_X86_AVX;
25528 for (size_t k = 1; k <= 5; k += 2) {
25529 for (uint32_t m = 1; m <= 5; m++) {
25530 for (uint32_t n = 1; n <= 8; n++) {
25531 GemmMicrokernelTester()
25532 .mr(5)
25533 .nr(8)
25534 .kr(1)
25535 .sr(1)
25536 .m(m)
25537 .n(n)
25538 .k(k)
25539 .cm_stride(11)
25540 .iterations(1)
25541 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25542 }
25543 }
25544 }
25545 }
25546
25547 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, qmin) {
25548 TEST_REQUIRES_X86_AVX;
25549 GemmMicrokernelTester()
25550 .mr(5)
25551 .nr(8)
25552 .kr(1)
25553 .sr(1)
25554 .m(5)
25555 .n(8)
25556 .k(1)
25557 .qmin(128)
25558 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25559 }
25560
25561 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, qmax) {
25562 TEST_REQUIRES_X86_AVX;
25563 GemmMicrokernelTester()
25564 .mr(5)
25565 .nr(8)
25566 .kr(1)
25567 .sr(1)
25568 .m(5)
25569 .n(8)
25570 .k(1)
25571 .qmax(128)
25572 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25573 }
25574
25575 TEST(F32_GEMMINC_5X8__AVX_BROADCAST, strided_cm) {
25576 TEST_REQUIRES_X86_AVX;
25577 GemmMicrokernelTester()
25578 .mr(5)
25579 .nr(8)
25580 .kr(1)
25581 .sr(1)
25582 .m(5)
25583 .n(8)
25584 .k(1)
25585 .cm_stride(11)
25586 .Test(xnn_f32_gemminc_ukernel_5x8__avx_broadcast);
25587 }
25588#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25589
25590
25591#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25592 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1) {
25593 TEST_REQUIRES_X86_AVX;
25594 GemmMicrokernelTester()
25595 .mr(6)
25596 .nr(8)
25597 .kr(1)
25598 .sr(1)
25599 .m(6)
25600 .n(8)
25601 .k(1)
25602 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25603 }
25604
25605 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, strided_cn) {
25606 TEST_REQUIRES_X86_AVX;
25607 GemmMicrokernelTester()
25608 .mr(6)
25609 .nr(8)
25610 .kr(1)
25611 .sr(1)
25612 .m(6)
25613 .n(8)
25614 .k(1)
25615 .cn_stride(11)
25616 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25617 }
25618
25619 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_strided_a) {
25620 TEST_REQUIRES_X86_AVX;
25621 GemmMicrokernelTester()
25622 .mr(6)
25623 .nr(8)
25624 .kr(1)
25625 .sr(1)
25626 .m(6)
25627 .n(8)
25628 .k(1)
25629 .a_stride(3)
25630 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25631 }
25632
25633 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_subtile) {
25634 TEST_REQUIRES_X86_AVX;
25635 for (uint32_t m = 1; m <= 6; m++) {
25636 for (uint32_t n = 1; n <= 8; n++) {
25637 GemmMicrokernelTester()
25638 .mr(6)
25639 .nr(8)
25640 .kr(1)
25641 .sr(1)
25642 .m(m)
25643 .n(n)
25644 .k(1)
25645 .iterations(1)
25646 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25647 }
25648 }
25649 }
25650
25651 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
25652 TEST_REQUIRES_X86_AVX;
25653 for (uint32_t m = 1; m <= 6; m++) {
25654 GemmMicrokernelTester()
25655 .mr(6)
25656 .nr(8)
25657 .kr(1)
25658 .sr(1)
25659 .m(m)
25660 .n(8)
25661 .k(1)
25662 .iterations(1)
25663 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25664 }
25665 }
25666
25667 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
25668 TEST_REQUIRES_X86_AVX;
25669 for (uint32_t n = 1; n <= 8; n++) {
25670 GemmMicrokernelTester()
25671 .mr(6)
25672 .nr(8)
25673 .kr(1)
25674 .sr(1)
25675 .m(6)
25676 .n(n)
25677 .k(1)
25678 .iterations(1)
25679 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25680 }
25681 }
25682
25683 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_gt_1) {
25684 TEST_REQUIRES_X86_AVX;
25685 for (size_t k = 2; k < 10; k++) {
25686 GemmMicrokernelTester()
25687 .mr(6)
25688 .nr(8)
25689 .kr(1)
25690 .sr(1)
25691 .m(6)
25692 .n(8)
25693 .k(k)
25694 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25695 }
25696 }
25697
25698 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_gt_1_strided_a) {
25699 TEST_REQUIRES_X86_AVX;
25700 for (size_t k = 2; k < 10; k++) {
25701 GemmMicrokernelTester()
25702 .mr(6)
25703 .nr(8)
25704 .kr(1)
25705 .sr(1)
25706 .m(6)
25707 .n(8)
25708 .k(k)
25709 .a_stride(11)
25710 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25711 }
25712 }
25713
25714 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, k_gt_1_subtile) {
25715 TEST_REQUIRES_X86_AVX;
25716 for (size_t k = 2; k < 10; k++) {
25717 for (uint32_t m = 1; m <= 6; m++) {
25718 for (uint32_t n = 1; n <= 8; n++) {
25719 GemmMicrokernelTester()
25720 .mr(6)
25721 .nr(8)
25722 .kr(1)
25723 .sr(1)
25724 .m(m)
25725 .n(n)
25726 .k(k)
25727 .iterations(1)
25728 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25729 }
25730 }
25731 }
25732 }
25733
25734 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8) {
25735 TEST_REQUIRES_X86_AVX;
25736 for (uint32_t n = 9; n < 16; n++) {
25737 for (size_t k = 1; k <= 5; k += 2) {
25738 GemmMicrokernelTester()
25739 .mr(6)
25740 .nr(8)
25741 .kr(1)
25742 .sr(1)
25743 .m(6)
25744 .n(8)
25745 .k(k)
25746 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25747 }
25748 }
25749 }
25750
25751 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
25752 TEST_REQUIRES_X86_AVX;
25753 for (uint32_t n = 9; n < 16; n++) {
25754 for (size_t k = 1; k <= 5; k += 2) {
25755 GemmMicrokernelTester()
25756 .mr(6)
25757 .nr(8)
25758 .kr(1)
25759 .sr(1)
25760 .m(6)
25761 .n(8)
25762 .k(k)
25763 .cn_stride(11)
25764 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25765 }
25766 }
25767 }
25768
25769 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8_strided_a) {
25770 TEST_REQUIRES_X86_AVX;
25771 for (uint32_t n = 9; n < 16; n++) {
25772 for (size_t k = 1; k <= 5; k += 2) {
25773 GemmMicrokernelTester()
25774 .mr(6)
25775 .nr(8)
25776 .kr(1)
25777 .sr(1)
25778 .m(6)
25779 .n(n)
25780 .k(k)
25781 .a_stride(7)
25782 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25783 }
25784 }
25785 }
25786
25787 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_gt_8_subtile) {
25788 TEST_REQUIRES_X86_AVX;
25789 for (uint32_t n = 9; n < 16; n++) {
25790 for (size_t k = 1; k <= 5; k += 2) {
25791 for (uint32_t m = 1; m <= 6; m++) {
25792 GemmMicrokernelTester()
25793 .mr(6)
25794 .nr(8)
25795 .kr(1)
25796 .sr(1)
25797 .m(m)
25798 .n(n)
25799 .k(k)
25800 .iterations(1)
25801 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25802 }
25803 }
25804 }
25805 }
25806
25807 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8) {
25808 TEST_REQUIRES_X86_AVX;
25809 for (uint32_t n = 16; n <= 24; n += 8) {
25810 for (size_t k = 1; k <= 5; k += 2) {
25811 GemmMicrokernelTester()
25812 .mr(6)
25813 .nr(8)
25814 .kr(1)
25815 .sr(1)
25816 .m(6)
25817 .n(8)
25818 .k(k)
25819 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25820 }
25821 }
25822 }
25823
25824 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
25825 TEST_REQUIRES_X86_AVX;
25826 for (uint32_t n = 16; n <= 24; n += 8) {
25827 for (size_t k = 1; k <= 5; k += 2) {
25828 GemmMicrokernelTester()
25829 .mr(6)
25830 .nr(8)
25831 .kr(1)
25832 .sr(1)
25833 .m(6)
25834 .n(n)
25835 .k(k)
25836 .cn_stride(11)
25837 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25838 }
25839 }
25840 }
25841
25842 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8_strided_a) {
25843 TEST_REQUIRES_X86_AVX;
25844 for (uint32_t n = 16; n <= 24; n += 8) {
25845 for (size_t k = 1; k <= 5; k += 2) {
25846 GemmMicrokernelTester()
25847 .mr(6)
25848 .nr(8)
25849 .kr(1)
25850 .sr(1)
25851 .m(6)
25852 .n(n)
25853 .k(k)
25854 .a_stride(7)
25855 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25856 }
25857 }
25858 }
25859
25860 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, n_div_8_subtile) {
25861 TEST_REQUIRES_X86_AVX;
25862 for (uint32_t n = 16; n <= 24; n += 8) {
25863 for (size_t k = 1; k <= 5; k += 2) {
25864 for (uint32_t m = 1; m <= 6; m++) {
25865 GemmMicrokernelTester()
25866 .mr(6)
25867 .nr(8)
25868 .kr(1)
25869 .sr(1)
25870 .m(m)
25871 .n(n)
25872 .k(k)
25873 .iterations(1)
25874 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25875 }
25876 }
25877 }
25878 }
25879
25880 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, strided_cm_subtile) {
25881 TEST_REQUIRES_X86_AVX;
25882 for (size_t k = 1; k <= 5; k += 2) {
25883 for (uint32_t m = 1; m <= 6; m++) {
25884 for (uint32_t n = 1; n <= 8; n++) {
25885 GemmMicrokernelTester()
25886 .mr(6)
25887 .nr(8)
25888 .kr(1)
25889 .sr(1)
25890 .m(m)
25891 .n(n)
25892 .k(k)
25893 .cm_stride(11)
25894 .iterations(1)
25895 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25896 }
25897 }
25898 }
25899 }
25900
25901 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, qmin) {
25902 TEST_REQUIRES_X86_AVX;
25903 GemmMicrokernelTester()
25904 .mr(6)
25905 .nr(8)
25906 .kr(1)
25907 .sr(1)
25908 .m(6)
25909 .n(8)
25910 .k(1)
25911 .qmin(128)
25912 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25913 }
25914
25915 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, qmax) {
25916 TEST_REQUIRES_X86_AVX;
25917 GemmMicrokernelTester()
25918 .mr(6)
25919 .nr(8)
25920 .kr(1)
25921 .sr(1)
25922 .m(6)
25923 .n(8)
25924 .k(1)
25925 .qmax(128)
25926 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25927 }
25928
25929 TEST(F32_GEMMINC_6X8__AVX_BROADCAST, strided_cm) {
25930 TEST_REQUIRES_X86_AVX;
25931 GemmMicrokernelTester()
25932 .mr(6)
25933 .nr(8)
25934 .kr(1)
25935 .sr(1)
25936 .m(6)
25937 .n(8)
25938 .k(1)
25939 .cm_stride(11)
25940 .Test(xnn_f32_gemminc_ukernel_6x8__avx_broadcast);
25941 }
25942#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
25943
25944
25945#if XNN_ARCH_X86 || XNN_ARCH_X86_64
25946 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1) {
25947 TEST_REQUIRES_X86_AVX;
25948 GemmMicrokernelTester()
25949 .mr(7)
25950 .nr(8)
25951 .kr(1)
25952 .sr(1)
25953 .m(7)
25954 .n(8)
25955 .k(1)
25956 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
25957 }
25958
25959 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, strided_cn) {
25960 TEST_REQUIRES_X86_AVX;
25961 GemmMicrokernelTester()
25962 .mr(7)
25963 .nr(8)
25964 .kr(1)
25965 .sr(1)
25966 .m(7)
25967 .n(8)
25968 .k(1)
25969 .cn_stride(11)
25970 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
25971 }
25972
25973 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_strided_a) {
25974 TEST_REQUIRES_X86_AVX;
25975 GemmMicrokernelTester()
25976 .mr(7)
25977 .nr(8)
25978 .kr(1)
25979 .sr(1)
25980 .m(7)
25981 .n(8)
25982 .k(1)
25983 .a_stride(3)
25984 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
25985 }
25986
25987 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_subtile) {
25988 TEST_REQUIRES_X86_AVX;
25989 for (uint32_t m = 1; m <= 7; m++) {
25990 for (uint32_t n = 1; n <= 8; n++) {
25991 GemmMicrokernelTester()
25992 .mr(7)
25993 .nr(8)
25994 .kr(1)
25995 .sr(1)
25996 .m(m)
25997 .n(n)
25998 .k(1)
25999 .iterations(1)
26000 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26001 }
26002 }
26003 }
26004
26005 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_subtile_m) {
26006 TEST_REQUIRES_X86_AVX;
26007 for (uint32_t m = 1; m <= 7; m++) {
26008 GemmMicrokernelTester()
26009 .mr(7)
26010 .nr(8)
26011 .kr(1)
26012 .sr(1)
26013 .m(m)
26014 .n(8)
26015 .k(1)
26016 .iterations(1)
26017 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26018 }
26019 }
26020
26021 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_eq_1_subtile_n) {
26022 TEST_REQUIRES_X86_AVX;
26023 for (uint32_t n = 1; n <= 8; n++) {
26024 GemmMicrokernelTester()
26025 .mr(7)
26026 .nr(8)
26027 .kr(1)
26028 .sr(1)
26029 .m(7)
26030 .n(n)
26031 .k(1)
26032 .iterations(1)
26033 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26034 }
26035 }
26036
26037 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_gt_1) {
26038 TEST_REQUIRES_X86_AVX;
26039 for (size_t k = 2; k < 10; k++) {
26040 GemmMicrokernelTester()
26041 .mr(7)
26042 .nr(8)
26043 .kr(1)
26044 .sr(1)
26045 .m(7)
26046 .n(8)
26047 .k(k)
26048 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26049 }
26050 }
26051
26052 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_gt_1_strided_a) {
26053 TEST_REQUIRES_X86_AVX;
26054 for (size_t k = 2; k < 10; k++) {
26055 GemmMicrokernelTester()
26056 .mr(7)
26057 .nr(8)
26058 .kr(1)
26059 .sr(1)
26060 .m(7)
26061 .n(8)
26062 .k(k)
26063 .a_stride(11)
26064 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26065 }
26066 }
26067
26068 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, k_gt_1_subtile) {
26069 TEST_REQUIRES_X86_AVX;
26070 for (size_t k = 2; k < 10; k++) {
26071 for (uint32_t m = 1; m <= 7; m++) {
26072 for (uint32_t n = 1; n <= 8; n++) {
26073 GemmMicrokernelTester()
26074 .mr(7)
26075 .nr(8)
26076 .kr(1)
26077 .sr(1)
26078 .m(m)
26079 .n(n)
26080 .k(k)
26081 .iterations(1)
26082 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26083 }
26084 }
26085 }
26086 }
26087
26088 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8) {
26089 TEST_REQUIRES_X86_AVX;
26090 for (uint32_t n = 9; n < 16; n++) {
26091 for (size_t k = 1; k <= 5; k += 2) {
26092 GemmMicrokernelTester()
26093 .mr(7)
26094 .nr(8)
26095 .kr(1)
26096 .sr(1)
26097 .m(7)
26098 .n(8)
26099 .k(k)
26100 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26101 }
26102 }
26103 }
26104
26105 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8_strided_cn) {
26106 TEST_REQUIRES_X86_AVX;
26107 for (uint32_t n = 9; n < 16; n++) {
26108 for (size_t k = 1; k <= 5; k += 2) {
26109 GemmMicrokernelTester()
26110 .mr(7)
26111 .nr(8)
26112 .kr(1)
26113 .sr(1)
26114 .m(7)
26115 .n(8)
26116 .k(k)
26117 .cn_stride(11)
26118 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26119 }
26120 }
26121 }
26122
26123 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8_strided_a) {
26124 TEST_REQUIRES_X86_AVX;
26125 for (uint32_t n = 9; n < 16; n++) {
26126 for (size_t k = 1; k <= 5; k += 2) {
26127 GemmMicrokernelTester()
26128 .mr(7)
26129 .nr(8)
26130 .kr(1)
26131 .sr(1)
26132 .m(7)
26133 .n(n)
26134 .k(k)
26135 .a_stride(7)
26136 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26137 }
26138 }
26139 }
26140
26141 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_gt_8_subtile) {
26142 TEST_REQUIRES_X86_AVX;
26143 for (uint32_t n = 9; n < 16; n++) {
26144 for (size_t k = 1; k <= 5; k += 2) {
26145 for (uint32_t m = 1; m <= 7; m++) {
26146 GemmMicrokernelTester()
26147 .mr(7)
26148 .nr(8)
26149 .kr(1)
26150 .sr(1)
26151 .m(m)
26152 .n(n)
26153 .k(k)
26154 .iterations(1)
26155 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26156 }
26157 }
26158 }
26159 }
26160
26161 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8) {
26162 TEST_REQUIRES_X86_AVX;
26163 for (uint32_t n = 16; n <= 24; n += 8) {
26164 for (size_t k = 1; k <= 5; k += 2) {
26165 GemmMicrokernelTester()
26166 .mr(7)
26167 .nr(8)
26168 .kr(1)
26169 .sr(1)
26170 .m(7)
26171 .n(8)
26172 .k(k)
26173 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26174 }
26175 }
26176 }
26177
26178 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8_strided_cn) {
26179 TEST_REQUIRES_X86_AVX;
26180 for (uint32_t n = 16; n <= 24; n += 8) {
26181 for (size_t k = 1; k <= 5; k += 2) {
26182 GemmMicrokernelTester()
26183 .mr(7)
26184 .nr(8)
26185 .kr(1)
26186 .sr(1)
26187 .m(7)
26188 .n(n)
26189 .k(k)
26190 .cn_stride(11)
26191 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26192 }
26193 }
26194 }
26195
26196 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8_strided_a) {
26197 TEST_REQUIRES_X86_AVX;
26198 for (uint32_t n = 16; n <= 24; n += 8) {
26199 for (size_t k = 1; k <= 5; k += 2) {
26200 GemmMicrokernelTester()
26201 .mr(7)
26202 .nr(8)
26203 .kr(1)
26204 .sr(1)
26205 .m(7)
26206 .n(n)
26207 .k(k)
26208 .a_stride(7)
26209 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26210 }
26211 }
26212 }
26213
26214 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, n_div_8_subtile) {
26215 TEST_REQUIRES_X86_AVX;
26216 for (uint32_t n = 16; n <= 24; n += 8) {
26217 for (size_t k = 1; k <= 5; k += 2) {
26218 for (uint32_t m = 1; m <= 7; m++) {
26219 GemmMicrokernelTester()
26220 .mr(7)
26221 .nr(8)
26222 .kr(1)
26223 .sr(1)
26224 .m(m)
26225 .n(n)
26226 .k(k)
26227 .iterations(1)
26228 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26229 }
26230 }
26231 }
26232 }
26233
26234 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, strided_cm_subtile) {
26235 TEST_REQUIRES_X86_AVX;
26236 for (size_t k = 1; k <= 5; k += 2) {
26237 for (uint32_t m = 1; m <= 7; m++) {
26238 for (uint32_t n = 1; n <= 8; n++) {
26239 GemmMicrokernelTester()
26240 .mr(7)
26241 .nr(8)
26242 .kr(1)
26243 .sr(1)
26244 .m(m)
26245 .n(n)
26246 .k(k)
26247 .cm_stride(11)
26248 .iterations(1)
26249 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26250 }
26251 }
26252 }
26253 }
26254
26255 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, qmin) {
26256 TEST_REQUIRES_X86_AVX;
26257 GemmMicrokernelTester()
26258 .mr(7)
26259 .nr(8)
26260 .kr(1)
26261 .sr(1)
26262 .m(7)
26263 .n(8)
26264 .k(1)
26265 .qmin(128)
26266 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26267 }
26268
26269 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, qmax) {
26270 TEST_REQUIRES_X86_AVX;
26271 GemmMicrokernelTester()
26272 .mr(7)
26273 .nr(8)
26274 .kr(1)
26275 .sr(1)
26276 .m(7)
26277 .n(8)
26278 .k(1)
26279 .qmax(128)
26280 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26281 }
26282
26283 TEST(F32_GEMMINC_7X8__AVX_BROADCAST, strided_cm) {
26284 TEST_REQUIRES_X86_AVX;
26285 GemmMicrokernelTester()
26286 .mr(7)
26287 .nr(8)
26288 .kr(1)
26289 .sr(1)
26290 .m(7)
26291 .n(8)
26292 .k(1)
26293 .cm_stride(11)
26294 .Test(xnn_f32_gemminc_ukernel_7x8__avx_broadcast);
26295 }
26296#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26297
26298
26299#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan27121322019-12-09 14:57:40 -080026300 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1) {
26301 TEST_REQUIRES_X86_AVX;
26302 GemmMicrokernelTester()
26303 .mr(1)
26304 .nr(16)
26305 .kr(1)
26306 .sr(1)
26307 .m(1)
26308 .n(16)
26309 .k(1)
26310 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26311 }
26312
26313 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, strided_cn) {
26314 TEST_REQUIRES_X86_AVX;
26315 GemmMicrokernelTester()
26316 .mr(1)
26317 .nr(16)
26318 .kr(1)
26319 .sr(1)
26320 .m(1)
26321 .n(16)
26322 .k(1)
26323 .cn_stride(19)
26324 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26325 }
26326
26327 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_strided_a) {
26328 TEST_REQUIRES_X86_AVX;
26329 GemmMicrokernelTester()
26330 .mr(1)
26331 .nr(16)
26332 .kr(1)
26333 .sr(1)
26334 .m(1)
26335 .n(16)
26336 .k(1)
26337 .a_stride(3)
26338 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26339 }
26340
26341 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_subtile) {
26342 TEST_REQUIRES_X86_AVX;
26343 for (uint32_t m = 1; m <= 1; m++) {
26344 for (uint32_t n = 1; n <= 16; n++) {
26345 GemmMicrokernelTester()
26346 .mr(1)
26347 .nr(16)
26348 .kr(1)
26349 .sr(1)
26350 .m(m)
26351 .n(n)
26352 .k(1)
26353 .iterations(1)
26354 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26355 }
26356 }
26357 }
26358
26359 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_subtile_m) {
26360 TEST_REQUIRES_X86_AVX;
26361 for (uint32_t m = 1; m <= 1; m++) {
26362 GemmMicrokernelTester()
26363 .mr(1)
26364 .nr(16)
26365 .kr(1)
26366 .sr(1)
26367 .m(m)
26368 .n(16)
26369 .k(1)
26370 .iterations(1)
26371 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26372 }
26373 }
26374
26375 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_eq_1_subtile_n) {
26376 TEST_REQUIRES_X86_AVX;
26377 for (uint32_t n = 1; n <= 16; n++) {
26378 GemmMicrokernelTester()
26379 .mr(1)
26380 .nr(16)
26381 .kr(1)
26382 .sr(1)
26383 .m(1)
26384 .n(n)
26385 .k(1)
26386 .iterations(1)
26387 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26388 }
26389 }
26390
26391 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_gt_1) {
26392 TEST_REQUIRES_X86_AVX;
26393 for (size_t k = 2; k < 10; k++) {
26394 GemmMicrokernelTester()
26395 .mr(1)
26396 .nr(16)
26397 .kr(1)
26398 .sr(1)
26399 .m(1)
26400 .n(16)
26401 .k(k)
26402 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26403 }
26404 }
26405
26406 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_gt_1_strided_a) {
26407 TEST_REQUIRES_X86_AVX;
26408 for (size_t k = 2; k < 10; k++) {
26409 GemmMicrokernelTester()
26410 .mr(1)
26411 .nr(16)
26412 .kr(1)
26413 .sr(1)
26414 .m(1)
26415 .n(16)
26416 .k(k)
26417 .a_stride(11)
26418 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26419 }
26420 }
26421
26422 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, k_gt_1_subtile) {
26423 TEST_REQUIRES_X86_AVX;
26424 for (size_t k = 2; k < 10; k++) {
26425 for (uint32_t m = 1; m <= 1; m++) {
26426 for (uint32_t n = 1; n <= 16; n++) {
26427 GemmMicrokernelTester()
26428 .mr(1)
26429 .nr(16)
26430 .kr(1)
26431 .sr(1)
26432 .m(m)
26433 .n(n)
26434 .k(k)
26435 .iterations(1)
26436 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26437 }
26438 }
26439 }
26440 }
26441
26442 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16) {
26443 TEST_REQUIRES_X86_AVX;
26444 for (uint32_t n = 17; n < 32; n++) {
26445 for (size_t k = 1; k <= 5; k += 2) {
26446 GemmMicrokernelTester()
26447 .mr(1)
26448 .nr(16)
26449 .kr(1)
26450 .sr(1)
26451 .m(1)
26452 .n(16)
26453 .k(k)
26454 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26455 }
26456 }
26457 }
26458
26459 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16_strided_cn) {
26460 TEST_REQUIRES_X86_AVX;
26461 for (uint32_t n = 17; n < 32; n++) {
26462 for (size_t k = 1; k <= 5; k += 2) {
26463 GemmMicrokernelTester()
26464 .mr(1)
26465 .nr(16)
26466 .kr(1)
26467 .sr(1)
26468 .m(1)
26469 .n(16)
26470 .k(k)
26471 .cn_stride(19)
26472 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26473 }
26474 }
26475 }
26476
26477 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16_strided_a) {
26478 TEST_REQUIRES_X86_AVX;
26479 for (uint32_t n = 17; n < 32; n++) {
26480 for (size_t k = 1; k <= 5; k += 2) {
26481 GemmMicrokernelTester()
26482 .mr(1)
26483 .nr(16)
26484 .kr(1)
26485 .sr(1)
26486 .m(1)
26487 .n(n)
26488 .k(k)
26489 .a_stride(7)
26490 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26491 }
26492 }
26493 }
26494
26495 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_gt_16_subtile) {
26496 TEST_REQUIRES_X86_AVX;
26497 for (uint32_t n = 17; n < 32; n++) {
26498 for (size_t k = 1; k <= 5; k += 2) {
26499 for (uint32_t m = 1; m <= 1; m++) {
26500 GemmMicrokernelTester()
26501 .mr(1)
26502 .nr(16)
26503 .kr(1)
26504 .sr(1)
26505 .m(m)
26506 .n(n)
26507 .k(k)
26508 .iterations(1)
26509 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26510 }
26511 }
26512 }
26513 }
26514
26515 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16) {
26516 TEST_REQUIRES_X86_AVX;
26517 for (uint32_t n = 32; n <= 48; n += 16) {
26518 for (size_t k = 1; k <= 5; k += 2) {
26519 GemmMicrokernelTester()
26520 .mr(1)
26521 .nr(16)
26522 .kr(1)
26523 .sr(1)
26524 .m(1)
26525 .n(16)
26526 .k(k)
26527 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26528 }
26529 }
26530 }
26531
26532 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16_strided_cn) {
26533 TEST_REQUIRES_X86_AVX;
26534 for (uint32_t n = 32; n <= 48; n += 16) {
26535 for (size_t k = 1; k <= 5; k += 2) {
26536 GemmMicrokernelTester()
26537 .mr(1)
26538 .nr(16)
26539 .kr(1)
26540 .sr(1)
26541 .m(1)
26542 .n(n)
26543 .k(k)
26544 .cn_stride(19)
26545 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26546 }
26547 }
26548 }
26549
26550 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16_strided_a) {
26551 TEST_REQUIRES_X86_AVX;
26552 for (uint32_t n = 32; n <= 48; n += 16) {
26553 for (size_t k = 1; k <= 5; k += 2) {
26554 GemmMicrokernelTester()
26555 .mr(1)
26556 .nr(16)
26557 .kr(1)
26558 .sr(1)
26559 .m(1)
26560 .n(n)
26561 .k(k)
26562 .a_stride(7)
26563 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26564 }
26565 }
26566 }
26567
26568 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, n_div_16_subtile) {
26569 TEST_REQUIRES_X86_AVX;
26570 for (uint32_t n = 32; n <= 48; n += 16) {
26571 for (size_t k = 1; k <= 5; k += 2) {
26572 for (uint32_t m = 1; m <= 1; m++) {
26573 GemmMicrokernelTester()
26574 .mr(1)
26575 .nr(16)
26576 .kr(1)
26577 .sr(1)
26578 .m(m)
26579 .n(n)
26580 .k(k)
26581 .iterations(1)
26582 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26583 }
26584 }
26585 }
26586 }
26587
26588 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, strided_cm_subtile) {
26589 TEST_REQUIRES_X86_AVX;
26590 for (size_t k = 1; k <= 5; k += 2) {
26591 for (uint32_t m = 1; m <= 1; m++) {
26592 for (uint32_t n = 1; n <= 16; n++) {
26593 GemmMicrokernelTester()
26594 .mr(1)
26595 .nr(16)
26596 .kr(1)
26597 .sr(1)
26598 .m(m)
26599 .n(n)
26600 .k(k)
26601 .cm_stride(19)
26602 .iterations(1)
26603 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26604 }
26605 }
26606 }
26607 }
26608
26609 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, qmin) {
26610 TEST_REQUIRES_X86_AVX;
26611 GemmMicrokernelTester()
26612 .mr(1)
26613 .nr(16)
26614 .kr(1)
26615 .sr(1)
26616 .m(1)
26617 .n(16)
26618 .k(1)
26619 .qmin(128)
26620 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26621 }
26622
26623 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, qmax) {
26624 TEST_REQUIRES_X86_AVX;
26625 GemmMicrokernelTester()
26626 .mr(1)
26627 .nr(16)
26628 .kr(1)
26629 .sr(1)
26630 .m(1)
26631 .n(16)
26632 .k(1)
26633 .qmax(128)
26634 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26635 }
26636
26637 TEST(F32_GEMMINC_1X16__AVX_BROADCAST, strided_cm) {
26638 TEST_REQUIRES_X86_AVX;
26639 GemmMicrokernelTester()
26640 .mr(1)
26641 .nr(16)
26642 .kr(1)
26643 .sr(1)
26644 .m(1)
26645 .n(16)
26646 .k(1)
26647 .cm_stride(19)
26648 .Test(xnn_f32_gemminc_ukernel_1x16__avx_broadcast);
26649 }
26650#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
26651
26652
26653#if XNN_ARCH_X86 || XNN_ARCH_X86_64
26654 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1) {
26655 TEST_REQUIRES_X86_AVX;
26656 GemmMicrokernelTester()
26657 .mr(3)
26658 .nr(16)
26659 .kr(1)
26660 .sr(1)
26661 .m(3)
26662 .n(16)
26663 .k(1)
26664 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26665 }
26666
26667 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, strided_cn) {
26668 TEST_REQUIRES_X86_AVX;
26669 GemmMicrokernelTester()
26670 .mr(3)
26671 .nr(16)
26672 .kr(1)
26673 .sr(1)
26674 .m(3)
26675 .n(16)
26676 .k(1)
26677 .cn_stride(19)
26678 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26679 }
26680
26681 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_strided_a) {
26682 TEST_REQUIRES_X86_AVX;
26683 GemmMicrokernelTester()
26684 .mr(3)
26685 .nr(16)
26686 .kr(1)
26687 .sr(1)
26688 .m(3)
26689 .n(16)
26690 .k(1)
26691 .a_stride(3)
26692 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26693 }
26694
26695 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_subtile) {
26696 TEST_REQUIRES_X86_AVX;
26697 for (uint32_t m = 1; m <= 3; m++) {
26698 for (uint32_t n = 1; n <= 16; n++) {
26699 GemmMicrokernelTester()
26700 .mr(3)
26701 .nr(16)
26702 .kr(1)
26703 .sr(1)
26704 .m(m)
26705 .n(n)
26706 .k(1)
26707 .iterations(1)
26708 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26709 }
26710 }
26711 }
26712
26713 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
26714 TEST_REQUIRES_X86_AVX;
26715 for (uint32_t m = 1; m <= 3; m++) {
26716 GemmMicrokernelTester()
26717 .mr(3)
26718 .nr(16)
26719 .kr(1)
26720 .sr(1)
26721 .m(m)
26722 .n(16)
26723 .k(1)
26724 .iterations(1)
26725 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26726 }
26727 }
26728
26729 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
26730 TEST_REQUIRES_X86_AVX;
26731 for (uint32_t n = 1; n <= 16; n++) {
26732 GemmMicrokernelTester()
26733 .mr(3)
26734 .nr(16)
26735 .kr(1)
26736 .sr(1)
26737 .m(3)
26738 .n(n)
26739 .k(1)
26740 .iterations(1)
26741 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26742 }
26743 }
26744
26745 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_gt_1) {
26746 TEST_REQUIRES_X86_AVX;
26747 for (size_t k = 2; k < 10; k++) {
26748 GemmMicrokernelTester()
26749 .mr(3)
26750 .nr(16)
26751 .kr(1)
26752 .sr(1)
26753 .m(3)
26754 .n(16)
26755 .k(k)
26756 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26757 }
26758 }
26759
26760 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_gt_1_strided_a) {
26761 TEST_REQUIRES_X86_AVX;
26762 for (size_t k = 2; k < 10; k++) {
26763 GemmMicrokernelTester()
26764 .mr(3)
26765 .nr(16)
26766 .kr(1)
26767 .sr(1)
26768 .m(3)
26769 .n(16)
26770 .k(k)
26771 .a_stride(11)
26772 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26773 }
26774 }
26775
26776 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, k_gt_1_subtile) {
26777 TEST_REQUIRES_X86_AVX;
26778 for (size_t k = 2; k < 10; k++) {
26779 for (uint32_t m = 1; m <= 3; m++) {
26780 for (uint32_t n = 1; n <= 16; n++) {
26781 GemmMicrokernelTester()
26782 .mr(3)
26783 .nr(16)
26784 .kr(1)
26785 .sr(1)
26786 .m(m)
26787 .n(n)
26788 .k(k)
26789 .iterations(1)
26790 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26791 }
26792 }
26793 }
26794 }
26795
26796 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16) {
26797 TEST_REQUIRES_X86_AVX;
26798 for (uint32_t n = 17; n < 32; n++) {
26799 for (size_t k = 1; k <= 5; k += 2) {
26800 GemmMicrokernelTester()
26801 .mr(3)
26802 .nr(16)
26803 .kr(1)
26804 .sr(1)
26805 .m(3)
26806 .n(16)
26807 .k(k)
26808 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26809 }
26810 }
26811 }
26812
26813 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
26814 TEST_REQUIRES_X86_AVX;
26815 for (uint32_t n = 17; n < 32; n++) {
26816 for (size_t k = 1; k <= 5; k += 2) {
26817 GemmMicrokernelTester()
26818 .mr(3)
26819 .nr(16)
26820 .kr(1)
26821 .sr(1)
26822 .m(3)
26823 .n(16)
26824 .k(k)
26825 .cn_stride(19)
26826 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26827 }
26828 }
26829 }
26830
26831 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16_strided_a) {
26832 TEST_REQUIRES_X86_AVX;
26833 for (uint32_t n = 17; n < 32; n++) {
26834 for (size_t k = 1; k <= 5; k += 2) {
26835 GemmMicrokernelTester()
26836 .mr(3)
26837 .nr(16)
26838 .kr(1)
26839 .sr(1)
26840 .m(3)
26841 .n(n)
26842 .k(k)
26843 .a_stride(7)
26844 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26845 }
26846 }
26847 }
26848
26849 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_gt_16_subtile) {
26850 TEST_REQUIRES_X86_AVX;
26851 for (uint32_t n = 17; n < 32; n++) {
26852 for (size_t k = 1; k <= 5; k += 2) {
26853 for (uint32_t m = 1; m <= 3; m++) {
26854 GemmMicrokernelTester()
26855 .mr(3)
26856 .nr(16)
26857 .kr(1)
26858 .sr(1)
26859 .m(m)
26860 .n(n)
26861 .k(k)
26862 .iterations(1)
26863 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26864 }
26865 }
26866 }
26867 }
26868
26869 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16) {
26870 TEST_REQUIRES_X86_AVX;
26871 for (uint32_t n = 32; n <= 48; n += 16) {
26872 for (size_t k = 1; k <= 5; k += 2) {
26873 GemmMicrokernelTester()
26874 .mr(3)
26875 .nr(16)
26876 .kr(1)
26877 .sr(1)
26878 .m(3)
26879 .n(16)
26880 .k(k)
26881 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26882 }
26883 }
26884 }
26885
26886 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
26887 TEST_REQUIRES_X86_AVX;
26888 for (uint32_t n = 32; n <= 48; n += 16) {
26889 for (size_t k = 1; k <= 5; k += 2) {
26890 GemmMicrokernelTester()
26891 .mr(3)
26892 .nr(16)
26893 .kr(1)
26894 .sr(1)
26895 .m(3)
26896 .n(n)
26897 .k(k)
26898 .cn_stride(19)
26899 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26900 }
26901 }
26902 }
26903
26904 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16_strided_a) {
26905 TEST_REQUIRES_X86_AVX;
26906 for (uint32_t n = 32; n <= 48; n += 16) {
26907 for (size_t k = 1; k <= 5; k += 2) {
26908 GemmMicrokernelTester()
26909 .mr(3)
26910 .nr(16)
26911 .kr(1)
26912 .sr(1)
26913 .m(3)
26914 .n(n)
26915 .k(k)
26916 .a_stride(7)
26917 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26918 }
26919 }
26920 }
26921
26922 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, n_div_16_subtile) {
26923 TEST_REQUIRES_X86_AVX;
26924 for (uint32_t n = 32; n <= 48; n += 16) {
26925 for (size_t k = 1; k <= 5; k += 2) {
26926 for (uint32_t m = 1; m <= 3; m++) {
26927 GemmMicrokernelTester()
26928 .mr(3)
26929 .nr(16)
26930 .kr(1)
26931 .sr(1)
26932 .m(m)
26933 .n(n)
26934 .k(k)
26935 .iterations(1)
26936 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26937 }
26938 }
26939 }
26940 }
26941
26942 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, strided_cm_subtile) {
26943 TEST_REQUIRES_X86_AVX;
26944 for (size_t k = 1; k <= 5; k += 2) {
26945 for (uint32_t m = 1; m <= 3; m++) {
26946 for (uint32_t n = 1; n <= 16; n++) {
26947 GemmMicrokernelTester()
26948 .mr(3)
26949 .nr(16)
26950 .kr(1)
26951 .sr(1)
26952 .m(m)
26953 .n(n)
26954 .k(k)
26955 .cm_stride(19)
26956 .iterations(1)
26957 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26958 }
26959 }
26960 }
26961 }
26962
26963 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, qmin) {
26964 TEST_REQUIRES_X86_AVX;
26965 GemmMicrokernelTester()
26966 .mr(3)
26967 .nr(16)
26968 .kr(1)
26969 .sr(1)
26970 .m(3)
26971 .n(16)
26972 .k(1)
26973 .qmin(128)
26974 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26975 }
26976
26977 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, qmax) {
26978 TEST_REQUIRES_X86_AVX;
26979 GemmMicrokernelTester()
26980 .mr(3)
26981 .nr(16)
26982 .kr(1)
26983 .sr(1)
26984 .m(3)
26985 .n(16)
26986 .k(1)
26987 .qmax(128)
26988 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
26989 }
26990
26991 TEST(F32_GEMMINC_3X16__AVX_BROADCAST, strided_cm) {
26992 TEST_REQUIRES_X86_AVX;
26993 GemmMicrokernelTester()
26994 .mr(3)
26995 .nr(16)
26996 .kr(1)
26997 .sr(1)
26998 .m(3)
26999 .n(16)
27000 .k(1)
27001 .cm_stride(19)
27002 .Test(xnn_f32_gemminc_ukernel_3x16__avx_broadcast);
27003 }
27004#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27005
27006
27007#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27008 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1) {
27009 TEST_REQUIRES_X86_AVX;
27010 GemmMicrokernelTester()
27011 .mr(4)
27012 .nr(16)
27013 .kr(1)
27014 .sr(1)
27015 .m(4)
27016 .n(16)
27017 .k(1)
27018 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27019 }
27020
27021 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, strided_cn) {
27022 TEST_REQUIRES_X86_AVX;
27023 GemmMicrokernelTester()
27024 .mr(4)
27025 .nr(16)
27026 .kr(1)
27027 .sr(1)
27028 .m(4)
27029 .n(16)
27030 .k(1)
27031 .cn_stride(19)
27032 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27033 }
27034
27035 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_strided_a) {
27036 TEST_REQUIRES_X86_AVX;
27037 GemmMicrokernelTester()
27038 .mr(4)
27039 .nr(16)
27040 .kr(1)
27041 .sr(1)
27042 .m(4)
27043 .n(16)
27044 .k(1)
27045 .a_stride(3)
27046 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27047 }
27048
27049 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_subtile) {
27050 TEST_REQUIRES_X86_AVX;
27051 for (uint32_t m = 1; m <= 4; m++) {
27052 for (uint32_t n = 1; n <= 16; n++) {
27053 GemmMicrokernelTester()
27054 .mr(4)
27055 .nr(16)
27056 .kr(1)
27057 .sr(1)
27058 .m(m)
27059 .n(n)
27060 .k(1)
27061 .iterations(1)
27062 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27063 }
27064 }
27065 }
27066
27067 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_subtile_m) {
27068 TEST_REQUIRES_X86_AVX;
27069 for (uint32_t m = 1; m <= 4; m++) {
27070 GemmMicrokernelTester()
27071 .mr(4)
27072 .nr(16)
27073 .kr(1)
27074 .sr(1)
27075 .m(m)
27076 .n(16)
27077 .k(1)
27078 .iterations(1)
27079 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27080 }
27081 }
27082
27083 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_eq_1_subtile_n) {
27084 TEST_REQUIRES_X86_AVX;
27085 for (uint32_t n = 1; n <= 16; n++) {
27086 GemmMicrokernelTester()
27087 .mr(4)
27088 .nr(16)
27089 .kr(1)
27090 .sr(1)
27091 .m(4)
27092 .n(n)
27093 .k(1)
27094 .iterations(1)
27095 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27096 }
27097 }
27098
27099 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_gt_1) {
27100 TEST_REQUIRES_X86_AVX;
27101 for (size_t k = 2; k < 10; k++) {
27102 GemmMicrokernelTester()
27103 .mr(4)
27104 .nr(16)
27105 .kr(1)
27106 .sr(1)
27107 .m(4)
27108 .n(16)
27109 .k(k)
27110 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27111 }
27112 }
27113
27114 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_gt_1_strided_a) {
27115 TEST_REQUIRES_X86_AVX;
27116 for (size_t k = 2; k < 10; k++) {
27117 GemmMicrokernelTester()
27118 .mr(4)
27119 .nr(16)
27120 .kr(1)
27121 .sr(1)
27122 .m(4)
27123 .n(16)
27124 .k(k)
27125 .a_stride(11)
27126 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27127 }
27128 }
27129
27130 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, k_gt_1_subtile) {
27131 TEST_REQUIRES_X86_AVX;
27132 for (size_t k = 2; k < 10; k++) {
27133 for (uint32_t m = 1; m <= 4; m++) {
27134 for (uint32_t n = 1; n <= 16; n++) {
27135 GemmMicrokernelTester()
27136 .mr(4)
27137 .nr(16)
27138 .kr(1)
27139 .sr(1)
27140 .m(m)
27141 .n(n)
27142 .k(k)
27143 .iterations(1)
27144 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27145 }
27146 }
27147 }
27148 }
27149
27150 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16) {
27151 TEST_REQUIRES_X86_AVX;
27152 for (uint32_t n = 17; n < 32; n++) {
27153 for (size_t k = 1; k <= 5; k += 2) {
27154 GemmMicrokernelTester()
27155 .mr(4)
27156 .nr(16)
27157 .kr(1)
27158 .sr(1)
27159 .m(4)
27160 .n(16)
27161 .k(k)
27162 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27163 }
27164 }
27165 }
27166
27167 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16_strided_cn) {
27168 TEST_REQUIRES_X86_AVX;
27169 for (uint32_t n = 17; n < 32; n++) {
27170 for (size_t k = 1; k <= 5; k += 2) {
27171 GemmMicrokernelTester()
27172 .mr(4)
27173 .nr(16)
27174 .kr(1)
27175 .sr(1)
27176 .m(4)
27177 .n(16)
27178 .k(k)
27179 .cn_stride(19)
27180 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27181 }
27182 }
27183 }
27184
27185 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16_strided_a) {
27186 TEST_REQUIRES_X86_AVX;
27187 for (uint32_t n = 17; n < 32; n++) {
27188 for (size_t k = 1; k <= 5; k += 2) {
27189 GemmMicrokernelTester()
27190 .mr(4)
27191 .nr(16)
27192 .kr(1)
27193 .sr(1)
27194 .m(4)
27195 .n(n)
27196 .k(k)
27197 .a_stride(7)
27198 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27199 }
27200 }
27201 }
27202
27203 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_gt_16_subtile) {
27204 TEST_REQUIRES_X86_AVX;
27205 for (uint32_t n = 17; n < 32; n++) {
27206 for (size_t k = 1; k <= 5; k += 2) {
27207 for (uint32_t m = 1; m <= 4; m++) {
27208 GemmMicrokernelTester()
27209 .mr(4)
27210 .nr(16)
27211 .kr(1)
27212 .sr(1)
27213 .m(m)
27214 .n(n)
27215 .k(k)
27216 .iterations(1)
27217 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27218 }
27219 }
27220 }
27221 }
27222
27223 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16) {
27224 TEST_REQUIRES_X86_AVX;
27225 for (uint32_t n = 32; n <= 48; n += 16) {
27226 for (size_t k = 1; k <= 5; k += 2) {
27227 GemmMicrokernelTester()
27228 .mr(4)
27229 .nr(16)
27230 .kr(1)
27231 .sr(1)
27232 .m(4)
27233 .n(16)
27234 .k(k)
27235 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27236 }
27237 }
27238 }
27239
27240 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16_strided_cn) {
27241 TEST_REQUIRES_X86_AVX;
27242 for (uint32_t n = 32; n <= 48; n += 16) {
27243 for (size_t k = 1; k <= 5; k += 2) {
27244 GemmMicrokernelTester()
27245 .mr(4)
27246 .nr(16)
27247 .kr(1)
27248 .sr(1)
27249 .m(4)
27250 .n(n)
27251 .k(k)
27252 .cn_stride(19)
27253 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27254 }
27255 }
27256 }
27257
27258 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16_strided_a) {
27259 TEST_REQUIRES_X86_AVX;
27260 for (uint32_t n = 32; n <= 48; n += 16) {
27261 for (size_t k = 1; k <= 5; k += 2) {
27262 GemmMicrokernelTester()
27263 .mr(4)
27264 .nr(16)
27265 .kr(1)
27266 .sr(1)
27267 .m(4)
27268 .n(n)
27269 .k(k)
27270 .a_stride(7)
27271 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27272 }
27273 }
27274 }
27275
27276 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, n_div_16_subtile) {
27277 TEST_REQUIRES_X86_AVX;
27278 for (uint32_t n = 32; n <= 48; n += 16) {
27279 for (size_t k = 1; k <= 5; k += 2) {
27280 for (uint32_t m = 1; m <= 4; m++) {
27281 GemmMicrokernelTester()
27282 .mr(4)
27283 .nr(16)
27284 .kr(1)
27285 .sr(1)
27286 .m(m)
27287 .n(n)
27288 .k(k)
27289 .iterations(1)
27290 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27291 }
27292 }
27293 }
27294 }
27295
27296 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, strided_cm_subtile) {
27297 TEST_REQUIRES_X86_AVX;
27298 for (size_t k = 1; k <= 5; k += 2) {
27299 for (uint32_t m = 1; m <= 4; m++) {
27300 for (uint32_t n = 1; n <= 16; n++) {
27301 GemmMicrokernelTester()
27302 .mr(4)
27303 .nr(16)
27304 .kr(1)
27305 .sr(1)
27306 .m(m)
27307 .n(n)
27308 .k(k)
27309 .cm_stride(19)
27310 .iterations(1)
27311 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27312 }
27313 }
27314 }
27315 }
27316
27317 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, qmin) {
27318 TEST_REQUIRES_X86_AVX;
27319 GemmMicrokernelTester()
27320 .mr(4)
27321 .nr(16)
27322 .kr(1)
27323 .sr(1)
27324 .m(4)
27325 .n(16)
27326 .k(1)
27327 .qmin(128)
27328 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27329 }
27330
27331 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, qmax) {
27332 TEST_REQUIRES_X86_AVX;
27333 GemmMicrokernelTester()
27334 .mr(4)
27335 .nr(16)
27336 .kr(1)
27337 .sr(1)
27338 .m(4)
27339 .n(16)
27340 .k(1)
27341 .qmax(128)
27342 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27343 }
27344
27345 TEST(F32_GEMMINC_4X16__AVX_BROADCAST, strided_cm) {
27346 TEST_REQUIRES_X86_AVX;
27347 GemmMicrokernelTester()
27348 .mr(4)
27349 .nr(16)
27350 .kr(1)
27351 .sr(1)
27352 .m(4)
27353 .n(16)
27354 .k(1)
27355 .cm_stride(19)
27356 .Test(xnn_f32_gemminc_ukernel_4x16__avx_broadcast);
27357 }
27358#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27359
27360
27361#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27362 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1) {
27363 TEST_REQUIRES_X86_AVX;
27364 GemmMicrokernelTester()
27365 .mr(5)
27366 .nr(16)
27367 .kr(1)
27368 .sr(1)
27369 .m(5)
27370 .n(16)
27371 .k(1)
27372 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27373 }
27374
27375 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, strided_cn) {
27376 TEST_REQUIRES_X86_AVX;
27377 GemmMicrokernelTester()
27378 .mr(5)
27379 .nr(16)
27380 .kr(1)
27381 .sr(1)
27382 .m(5)
27383 .n(16)
27384 .k(1)
27385 .cn_stride(19)
27386 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27387 }
27388
27389 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_strided_a) {
27390 TEST_REQUIRES_X86_AVX;
27391 GemmMicrokernelTester()
27392 .mr(5)
27393 .nr(16)
27394 .kr(1)
27395 .sr(1)
27396 .m(5)
27397 .n(16)
27398 .k(1)
27399 .a_stride(3)
27400 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27401 }
27402
27403 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_subtile) {
27404 TEST_REQUIRES_X86_AVX;
27405 for (uint32_t m = 1; m <= 5; m++) {
27406 for (uint32_t n = 1; n <= 16; n++) {
27407 GemmMicrokernelTester()
27408 .mr(5)
27409 .nr(16)
27410 .kr(1)
27411 .sr(1)
27412 .m(m)
27413 .n(n)
27414 .k(1)
27415 .iterations(1)
27416 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27417 }
27418 }
27419 }
27420
27421 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_subtile_m) {
27422 TEST_REQUIRES_X86_AVX;
27423 for (uint32_t m = 1; m <= 5; m++) {
27424 GemmMicrokernelTester()
27425 .mr(5)
27426 .nr(16)
27427 .kr(1)
27428 .sr(1)
27429 .m(m)
27430 .n(16)
27431 .k(1)
27432 .iterations(1)
27433 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27434 }
27435 }
27436
27437 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_eq_1_subtile_n) {
27438 TEST_REQUIRES_X86_AVX;
27439 for (uint32_t n = 1; n <= 16; n++) {
27440 GemmMicrokernelTester()
27441 .mr(5)
27442 .nr(16)
27443 .kr(1)
27444 .sr(1)
27445 .m(5)
27446 .n(n)
27447 .k(1)
27448 .iterations(1)
27449 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27450 }
27451 }
27452
27453 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_gt_1) {
27454 TEST_REQUIRES_X86_AVX;
27455 for (size_t k = 2; k < 10; k++) {
27456 GemmMicrokernelTester()
27457 .mr(5)
27458 .nr(16)
27459 .kr(1)
27460 .sr(1)
27461 .m(5)
27462 .n(16)
27463 .k(k)
27464 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27465 }
27466 }
27467
27468 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_gt_1_strided_a) {
27469 TEST_REQUIRES_X86_AVX;
27470 for (size_t k = 2; k < 10; k++) {
27471 GemmMicrokernelTester()
27472 .mr(5)
27473 .nr(16)
27474 .kr(1)
27475 .sr(1)
27476 .m(5)
27477 .n(16)
27478 .k(k)
27479 .a_stride(11)
27480 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27481 }
27482 }
27483
27484 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, k_gt_1_subtile) {
27485 TEST_REQUIRES_X86_AVX;
27486 for (size_t k = 2; k < 10; k++) {
27487 for (uint32_t m = 1; m <= 5; m++) {
27488 for (uint32_t n = 1; n <= 16; n++) {
27489 GemmMicrokernelTester()
27490 .mr(5)
27491 .nr(16)
27492 .kr(1)
27493 .sr(1)
27494 .m(m)
27495 .n(n)
27496 .k(k)
27497 .iterations(1)
27498 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27499 }
27500 }
27501 }
27502 }
27503
27504 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16) {
27505 TEST_REQUIRES_X86_AVX;
27506 for (uint32_t n = 17; n < 32; n++) {
27507 for (size_t k = 1; k <= 5; k += 2) {
27508 GemmMicrokernelTester()
27509 .mr(5)
27510 .nr(16)
27511 .kr(1)
27512 .sr(1)
27513 .m(5)
27514 .n(16)
27515 .k(k)
27516 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27517 }
27518 }
27519 }
27520
27521 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16_strided_cn) {
27522 TEST_REQUIRES_X86_AVX;
27523 for (uint32_t n = 17; n < 32; n++) {
27524 for (size_t k = 1; k <= 5; k += 2) {
27525 GemmMicrokernelTester()
27526 .mr(5)
27527 .nr(16)
27528 .kr(1)
27529 .sr(1)
27530 .m(5)
27531 .n(16)
27532 .k(k)
27533 .cn_stride(19)
27534 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27535 }
27536 }
27537 }
27538
27539 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16_strided_a) {
27540 TEST_REQUIRES_X86_AVX;
27541 for (uint32_t n = 17; n < 32; n++) {
27542 for (size_t k = 1; k <= 5; k += 2) {
27543 GemmMicrokernelTester()
27544 .mr(5)
27545 .nr(16)
27546 .kr(1)
27547 .sr(1)
27548 .m(5)
27549 .n(n)
27550 .k(k)
27551 .a_stride(7)
27552 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27553 }
27554 }
27555 }
27556
27557 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_gt_16_subtile) {
27558 TEST_REQUIRES_X86_AVX;
27559 for (uint32_t n = 17; n < 32; n++) {
27560 for (size_t k = 1; k <= 5; k += 2) {
27561 for (uint32_t m = 1; m <= 5; m++) {
27562 GemmMicrokernelTester()
27563 .mr(5)
27564 .nr(16)
27565 .kr(1)
27566 .sr(1)
27567 .m(m)
27568 .n(n)
27569 .k(k)
27570 .iterations(1)
27571 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27572 }
27573 }
27574 }
27575 }
27576
27577 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16) {
27578 TEST_REQUIRES_X86_AVX;
27579 for (uint32_t n = 32; n <= 48; n += 16) {
27580 for (size_t k = 1; k <= 5; k += 2) {
27581 GemmMicrokernelTester()
27582 .mr(5)
27583 .nr(16)
27584 .kr(1)
27585 .sr(1)
27586 .m(5)
27587 .n(16)
27588 .k(k)
27589 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27590 }
27591 }
27592 }
27593
27594 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16_strided_cn) {
27595 TEST_REQUIRES_X86_AVX;
27596 for (uint32_t n = 32; n <= 48; n += 16) {
27597 for (size_t k = 1; k <= 5; k += 2) {
27598 GemmMicrokernelTester()
27599 .mr(5)
27600 .nr(16)
27601 .kr(1)
27602 .sr(1)
27603 .m(5)
27604 .n(n)
27605 .k(k)
27606 .cn_stride(19)
27607 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27608 }
27609 }
27610 }
27611
27612 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16_strided_a) {
27613 TEST_REQUIRES_X86_AVX;
27614 for (uint32_t n = 32; n <= 48; n += 16) {
27615 for (size_t k = 1; k <= 5; k += 2) {
27616 GemmMicrokernelTester()
27617 .mr(5)
27618 .nr(16)
27619 .kr(1)
27620 .sr(1)
27621 .m(5)
27622 .n(n)
27623 .k(k)
27624 .a_stride(7)
27625 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27626 }
27627 }
27628 }
27629
27630 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, n_div_16_subtile) {
27631 TEST_REQUIRES_X86_AVX;
27632 for (uint32_t n = 32; n <= 48; n += 16) {
27633 for (size_t k = 1; k <= 5; k += 2) {
27634 for (uint32_t m = 1; m <= 5; m++) {
27635 GemmMicrokernelTester()
27636 .mr(5)
27637 .nr(16)
27638 .kr(1)
27639 .sr(1)
27640 .m(m)
27641 .n(n)
27642 .k(k)
27643 .iterations(1)
27644 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27645 }
27646 }
27647 }
27648 }
27649
27650 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, strided_cm_subtile) {
27651 TEST_REQUIRES_X86_AVX;
27652 for (size_t k = 1; k <= 5; k += 2) {
27653 for (uint32_t m = 1; m <= 5; m++) {
27654 for (uint32_t n = 1; n <= 16; n++) {
27655 GemmMicrokernelTester()
27656 .mr(5)
27657 .nr(16)
27658 .kr(1)
27659 .sr(1)
27660 .m(m)
27661 .n(n)
27662 .k(k)
27663 .cm_stride(19)
27664 .iterations(1)
27665 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27666 }
27667 }
27668 }
27669 }
27670
27671 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, qmin) {
27672 TEST_REQUIRES_X86_AVX;
27673 GemmMicrokernelTester()
27674 .mr(5)
27675 .nr(16)
27676 .kr(1)
27677 .sr(1)
27678 .m(5)
27679 .n(16)
27680 .k(1)
27681 .qmin(128)
27682 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27683 }
27684
27685 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, qmax) {
27686 TEST_REQUIRES_X86_AVX;
27687 GemmMicrokernelTester()
27688 .mr(5)
27689 .nr(16)
27690 .kr(1)
27691 .sr(1)
27692 .m(5)
27693 .n(16)
27694 .k(1)
27695 .qmax(128)
27696 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27697 }
27698
27699 TEST(F32_GEMMINC_5X16__AVX_BROADCAST, strided_cm) {
27700 TEST_REQUIRES_X86_AVX;
27701 GemmMicrokernelTester()
27702 .mr(5)
27703 .nr(16)
27704 .kr(1)
27705 .sr(1)
27706 .m(5)
27707 .n(16)
27708 .k(1)
27709 .cm_stride(19)
27710 .Test(xnn_f32_gemminc_ukernel_5x16__avx_broadcast);
27711 }
27712#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27713
27714
27715#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhanfda12b82019-11-21 12:27:59 -080027716 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1) {
27717 TEST_REQUIRES_X86_FMA3;
27718 GemmMicrokernelTester()
27719 .mr(1)
27720 .nr(8)
27721 .kr(1)
27722 .sr(1)
27723 .m(1)
27724 .n(8)
27725 .k(1)
27726 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27727 }
27728
27729 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, strided_cn) {
27730 TEST_REQUIRES_X86_FMA3;
27731 GemmMicrokernelTester()
27732 .mr(1)
27733 .nr(8)
27734 .kr(1)
27735 .sr(1)
27736 .m(1)
27737 .n(8)
27738 .k(1)
27739 .cn_stride(11)
27740 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27741 }
27742
27743 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_strided_a) {
27744 TEST_REQUIRES_X86_FMA3;
27745 GemmMicrokernelTester()
27746 .mr(1)
27747 .nr(8)
27748 .kr(1)
27749 .sr(1)
27750 .m(1)
27751 .n(8)
27752 .k(1)
27753 .a_stride(3)
27754 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27755 }
27756
27757 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
27758 TEST_REQUIRES_X86_FMA3;
27759 for (uint32_t m = 1; m <= 1; m++) {
27760 for (uint32_t n = 1; n <= 8; n++) {
27761 GemmMicrokernelTester()
27762 .mr(1)
27763 .nr(8)
27764 .kr(1)
27765 .sr(1)
27766 .m(m)
27767 .n(n)
27768 .k(1)
27769 .iterations(1)
27770 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27771 }
27772 }
27773 }
27774
27775 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
27776 TEST_REQUIRES_X86_FMA3;
27777 for (uint32_t m = 1; m <= 1; m++) {
27778 GemmMicrokernelTester()
27779 .mr(1)
27780 .nr(8)
27781 .kr(1)
27782 .sr(1)
27783 .m(m)
27784 .n(8)
27785 .k(1)
27786 .iterations(1)
27787 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27788 }
27789 }
27790
27791 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
27792 TEST_REQUIRES_X86_FMA3;
27793 for (uint32_t n = 1; n <= 8; n++) {
27794 GemmMicrokernelTester()
27795 .mr(1)
27796 .nr(8)
27797 .kr(1)
27798 .sr(1)
27799 .m(1)
27800 .n(n)
27801 .k(1)
27802 .iterations(1)
27803 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27804 }
27805 }
27806
27807 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_gt_1) {
27808 TEST_REQUIRES_X86_FMA3;
27809 for (size_t k = 2; k < 10; k++) {
27810 GemmMicrokernelTester()
27811 .mr(1)
27812 .nr(8)
27813 .kr(1)
27814 .sr(1)
27815 .m(1)
27816 .n(8)
27817 .k(k)
27818 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27819 }
27820 }
27821
27822 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_gt_1_strided_a) {
27823 TEST_REQUIRES_X86_FMA3;
27824 for (size_t k = 2; k < 10; k++) {
27825 GemmMicrokernelTester()
27826 .mr(1)
27827 .nr(8)
27828 .kr(1)
27829 .sr(1)
27830 .m(1)
27831 .n(8)
27832 .k(k)
27833 .a_stride(11)
27834 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27835 }
27836 }
27837
27838 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
27839 TEST_REQUIRES_X86_FMA3;
27840 for (size_t k = 2; k < 10; k++) {
27841 for (uint32_t m = 1; m <= 1; m++) {
27842 for (uint32_t n = 1; n <= 8; n++) {
27843 GemmMicrokernelTester()
27844 .mr(1)
27845 .nr(8)
27846 .kr(1)
27847 .sr(1)
27848 .m(m)
27849 .n(n)
27850 .k(k)
27851 .iterations(1)
27852 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27853 }
27854 }
27855 }
27856 }
27857
27858 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8) {
27859 TEST_REQUIRES_X86_FMA3;
27860 for (uint32_t n = 9; n < 16; n++) {
27861 for (size_t k = 1; k <= 5; k += 2) {
27862 GemmMicrokernelTester()
27863 .mr(1)
27864 .nr(8)
27865 .kr(1)
27866 .sr(1)
27867 .m(1)
27868 .n(8)
27869 .k(k)
27870 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27871 }
27872 }
27873 }
27874
27875 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
27876 TEST_REQUIRES_X86_FMA3;
27877 for (uint32_t n = 9; n < 16; n++) {
27878 for (size_t k = 1; k <= 5; k += 2) {
27879 GemmMicrokernelTester()
27880 .mr(1)
27881 .nr(8)
27882 .kr(1)
27883 .sr(1)
27884 .m(1)
27885 .n(8)
27886 .k(k)
27887 .cn_stride(11)
27888 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27889 }
27890 }
27891 }
27892
27893 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8_strided_a) {
27894 TEST_REQUIRES_X86_FMA3;
27895 for (uint32_t n = 9; n < 16; n++) {
27896 for (size_t k = 1; k <= 5; k += 2) {
27897 GemmMicrokernelTester()
27898 .mr(1)
27899 .nr(8)
27900 .kr(1)
27901 .sr(1)
27902 .m(1)
27903 .n(n)
27904 .k(k)
27905 .a_stride(7)
27906 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27907 }
27908 }
27909 }
27910
27911 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
27912 TEST_REQUIRES_X86_FMA3;
27913 for (uint32_t n = 9; n < 16; n++) {
27914 for (size_t k = 1; k <= 5; k += 2) {
27915 for (uint32_t m = 1; m <= 1; m++) {
27916 GemmMicrokernelTester()
27917 .mr(1)
27918 .nr(8)
27919 .kr(1)
27920 .sr(1)
27921 .m(m)
27922 .n(n)
27923 .k(k)
27924 .iterations(1)
27925 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27926 }
27927 }
27928 }
27929 }
27930
27931 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8) {
27932 TEST_REQUIRES_X86_FMA3;
27933 for (uint32_t n = 16; n <= 24; n += 8) {
27934 for (size_t k = 1; k <= 5; k += 2) {
27935 GemmMicrokernelTester()
27936 .mr(1)
27937 .nr(8)
27938 .kr(1)
27939 .sr(1)
27940 .m(1)
27941 .n(8)
27942 .k(k)
27943 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27944 }
27945 }
27946 }
27947
27948 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
27949 TEST_REQUIRES_X86_FMA3;
27950 for (uint32_t n = 16; n <= 24; n += 8) {
27951 for (size_t k = 1; k <= 5; k += 2) {
27952 GemmMicrokernelTester()
27953 .mr(1)
27954 .nr(8)
27955 .kr(1)
27956 .sr(1)
27957 .m(1)
27958 .n(n)
27959 .k(k)
27960 .cn_stride(11)
27961 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27962 }
27963 }
27964 }
27965
27966 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8_strided_a) {
27967 TEST_REQUIRES_X86_FMA3;
27968 for (uint32_t n = 16; n <= 24; n += 8) {
27969 for (size_t k = 1; k <= 5; k += 2) {
27970 GemmMicrokernelTester()
27971 .mr(1)
27972 .nr(8)
27973 .kr(1)
27974 .sr(1)
27975 .m(1)
27976 .n(n)
27977 .k(k)
27978 .a_stride(7)
27979 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27980 }
27981 }
27982 }
27983
27984 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, n_div_8_subtile) {
27985 TEST_REQUIRES_X86_FMA3;
27986 for (uint32_t n = 16; n <= 24; n += 8) {
27987 for (size_t k = 1; k <= 5; k += 2) {
27988 for (uint32_t m = 1; m <= 1; m++) {
27989 GemmMicrokernelTester()
27990 .mr(1)
27991 .nr(8)
27992 .kr(1)
27993 .sr(1)
27994 .m(m)
27995 .n(n)
27996 .k(k)
27997 .iterations(1)
27998 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
27999 }
28000 }
28001 }
28002 }
28003
28004 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, strided_cm_subtile) {
28005 TEST_REQUIRES_X86_FMA3;
28006 for (size_t k = 1; k <= 5; k += 2) {
28007 for (uint32_t m = 1; m <= 1; m++) {
28008 for (uint32_t n = 1; n <= 8; n++) {
28009 GemmMicrokernelTester()
28010 .mr(1)
28011 .nr(8)
28012 .kr(1)
28013 .sr(1)
28014 .m(m)
28015 .n(n)
28016 .k(k)
28017 .cm_stride(11)
28018 .iterations(1)
28019 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
28020 }
28021 }
28022 }
28023 }
28024
28025 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, qmin) {
28026 TEST_REQUIRES_X86_FMA3;
28027 GemmMicrokernelTester()
28028 .mr(1)
28029 .nr(8)
28030 .kr(1)
28031 .sr(1)
28032 .m(1)
28033 .n(8)
28034 .k(1)
28035 .qmin(128)
28036 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
28037 }
28038
28039 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, qmax) {
28040 TEST_REQUIRES_X86_FMA3;
28041 GemmMicrokernelTester()
28042 .mr(1)
28043 .nr(8)
28044 .kr(1)
28045 .sr(1)
28046 .m(1)
28047 .n(8)
28048 .k(1)
28049 .qmax(128)
28050 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
28051 }
28052
28053 TEST(F32_GEMMINC_1X8__FMA3_BROADCAST, strided_cm) {
28054 TEST_REQUIRES_X86_FMA3;
28055 GemmMicrokernelTester()
28056 .mr(1)
28057 .nr(8)
28058 .kr(1)
28059 .sr(1)
28060 .m(1)
28061 .n(8)
28062 .k(1)
28063 .cm_stride(11)
28064 .Test(xnn_f32_gemminc_ukernel_1x8__fma3_broadcast);
28065 }
28066#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28067
28068
28069#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28070 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1) {
28071 TEST_REQUIRES_X86_FMA3;
28072 GemmMicrokernelTester()
28073 .mr(4)
28074 .nr(8)
28075 .kr(1)
28076 .sr(1)
28077 .m(4)
28078 .n(8)
28079 .k(1)
28080 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28081 }
28082
28083 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, strided_cn) {
28084 TEST_REQUIRES_X86_FMA3;
28085 GemmMicrokernelTester()
28086 .mr(4)
28087 .nr(8)
28088 .kr(1)
28089 .sr(1)
28090 .m(4)
28091 .n(8)
28092 .k(1)
28093 .cn_stride(11)
28094 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28095 }
28096
28097 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_strided_a) {
28098 TEST_REQUIRES_X86_FMA3;
28099 GemmMicrokernelTester()
28100 .mr(4)
28101 .nr(8)
28102 .kr(1)
28103 .sr(1)
28104 .m(4)
28105 .n(8)
28106 .k(1)
28107 .a_stride(3)
28108 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28109 }
28110
28111 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
28112 TEST_REQUIRES_X86_FMA3;
28113 for (uint32_t m = 1; m <= 4; m++) {
28114 for (uint32_t n = 1; n <= 8; n++) {
28115 GemmMicrokernelTester()
28116 .mr(4)
28117 .nr(8)
28118 .kr(1)
28119 .sr(1)
28120 .m(m)
28121 .n(n)
28122 .k(1)
28123 .iterations(1)
28124 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28125 }
28126 }
28127 }
28128
28129 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
28130 TEST_REQUIRES_X86_FMA3;
28131 for (uint32_t m = 1; m <= 4; m++) {
28132 GemmMicrokernelTester()
28133 .mr(4)
28134 .nr(8)
28135 .kr(1)
28136 .sr(1)
28137 .m(m)
28138 .n(8)
28139 .k(1)
28140 .iterations(1)
28141 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28142 }
28143 }
28144
28145 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
28146 TEST_REQUIRES_X86_FMA3;
28147 for (uint32_t n = 1; n <= 8; n++) {
28148 GemmMicrokernelTester()
28149 .mr(4)
28150 .nr(8)
28151 .kr(1)
28152 .sr(1)
28153 .m(4)
28154 .n(n)
28155 .k(1)
28156 .iterations(1)
28157 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28158 }
28159 }
28160
28161 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_gt_1) {
28162 TEST_REQUIRES_X86_FMA3;
28163 for (size_t k = 2; k < 10; k++) {
28164 GemmMicrokernelTester()
28165 .mr(4)
28166 .nr(8)
28167 .kr(1)
28168 .sr(1)
28169 .m(4)
28170 .n(8)
28171 .k(k)
28172 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28173 }
28174 }
28175
28176 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_gt_1_strided_a) {
28177 TEST_REQUIRES_X86_FMA3;
28178 for (size_t k = 2; k < 10; k++) {
28179 GemmMicrokernelTester()
28180 .mr(4)
28181 .nr(8)
28182 .kr(1)
28183 .sr(1)
28184 .m(4)
28185 .n(8)
28186 .k(k)
28187 .a_stride(11)
28188 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28189 }
28190 }
28191
28192 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
28193 TEST_REQUIRES_X86_FMA3;
28194 for (size_t k = 2; k < 10; k++) {
28195 for (uint32_t m = 1; m <= 4; m++) {
28196 for (uint32_t n = 1; n <= 8; n++) {
28197 GemmMicrokernelTester()
28198 .mr(4)
28199 .nr(8)
28200 .kr(1)
28201 .sr(1)
28202 .m(m)
28203 .n(n)
28204 .k(k)
28205 .iterations(1)
28206 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28207 }
28208 }
28209 }
28210 }
28211
28212 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8) {
28213 TEST_REQUIRES_X86_FMA3;
28214 for (uint32_t n = 9; n < 16; n++) {
28215 for (size_t k = 1; k <= 5; k += 2) {
28216 GemmMicrokernelTester()
28217 .mr(4)
28218 .nr(8)
28219 .kr(1)
28220 .sr(1)
28221 .m(4)
28222 .n(8)
28223 .k(k)
28224 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28225 }
28226 }
28227 }
28228
28229 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
28230 TEST_REQUIRES_X86_FMA3;
28231 for (uint32_t n = 9; n < 16; n++) {
28232 for (size_t k = 1; k <= 5; k += 2) {
28233 GemmMicrokernelTester()
28234 .mr(4)
28235 .nr(8)
28236 .kr(1)
28237 .sr(1)
28238 .m(4)
28239 .n(8)
28240 .k(k)
28241 .cn_stride(11)
28242 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28243 }
28244 }
28245 }
28246
28247 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8_strided_a) {
28248 TEST_REQUIRES_X86_FMA3;
28249 for (uint32_t n = 9; n < 16; n++) {
28250 for (size_t k = 1; k <= 5; k += 2) {
28251 GemmMicrokernelTester()
28252 .mr(4)
28253 .nr(8)
28254 .kr(1)
28255 .sr(1)
28256 .m(4)
28257 .n(n)
28258 .k(k)
28259 .a_stride(7)
28260 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28261 }
28262 }
28263 }
28264
28265 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
28266 TEST_REQUIRES_X86_FMA3;
28267 for (uint32_t n = 9; n < 16; n++) {
28268 for (size_t k = 1; k <= 5; k += 2) {
28269 for (uint32_t m = 1; m <= 4; m++) {
28270 GemmMicrokernelTester()
28271 .mr(4)
28272 .nr(8)
28273 .kr(1)
28274 .sr(1)
28275 .m(m)
28276 .n(n)
28277 .k(k)
28278 .iterations(1)
28279 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28280 }
28281 }
28282 }
28283 }
28284
28285 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8) {
28286 TEST_REQUIRES_X86_FMA3;
28287 for (uint32_t n = 16; n <= 24; n += 8) {
28288 for (size_t k = 1; k <= 5; k += 2) {
28289 GemmMicrokernelTester()
28290 .mr(4)
28291 .nr(8)
28292 .kr(1)
28293 .sr(1)
28294 .m(4)
28295 .n(8)
28296 .k(k)
28297 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28298 }
28299 }
28300 }
28301
28302 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
28303 TEST_REQUIRES_X86_FMA3;
28304 for (uint32_t n = 16; n <= 24; n += 8) {
28305 for (size_t k = 1; k <= 5; k += 2) {
28306 GemmMicrokernelTester()
28307 .mr(4)
28308 .nr(8)
28309 .kr(1)
28310 .sr(1)
28311 .m(4)
28312 .n(n)
28313 .k(k)
28314 .cn_stride(11)
28315 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28316 }
28317 }
28318 }
28319
28320 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8_strided_a) {
28321 TEST_REQUIRES_X86_FMA3;
28322 for (uint32_t n = 16; n <= 24; n += 8) {
28323 for (size_t k = 1; k <= 5; k += 2) {
28324 GemmMicrokernelTester()
28325 .mr(4)
28326 .nr(8)
28327 .kr(1)
28328 .sr(1)
28329 .m(4)
28330 .n(n)
28331 .k(k)
28332 .a_stride(7)
28333 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28334 }
28335 }
28336 }
28337
28338 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, n_div_8_subtile) {
28339 TEST_REQUIRES_X86_FMA3;
28340 for (uint32_t n = 16; n <= 24; n += 8) {
28341 for (size_t k = 1; k <= 5; k += 2) {
28342 for (uint32_t m = 1; m <= 4; m++) {
28343 GemmMicrokernelTester()
28344 .mr(4)
28345 .nr(8)
28346 .kr(1)
28347 .sr(1)
28348 .m(m)
28349 .n(n)
28350 .k(k)
28351 .iterations(1)
28352 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28353 }
28354 }
28355 }
28356 }
28357
28358 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, strided_cm_subtile) {
28359 TEST_REQUIRES_X86_FMA3;
28360 for (size_t k = 1; k <= 5; k += 2) {
28361 for (uint32_t m = 1; m <= 4; m++) {
28362 for (uint32_t n = 1; n <= 8; n++) {
28363 GemmMicrokernelTester()
28364 .mr(4)
28365 .nr(8)
28366 .kr(1)
28367 .sr(1)
28368 .m(m)
28369 .n(n)
28370 .k(k)
28371 .cm_stride(11)
28372 .iterations(1)
28373 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28374 }
28375 }
28376 }
28377 }
28378
28379 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, qmin) {
28380 TEST_REQUIRES_X86_FMA3;
28381 GemmMicrokernelTester()
28382 .mr(4)
28383 .nr(8)
28384 .kr(1)
28385 .sr(1)
28386 .m(4)
28387 .n(8)
28388 .k(1)
28389 .qmin(128)
28390 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28391 }
28392
28393 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, qmax) {
28394 TEST_REQUIRES_X86_FMA3;
28395 GemmMicrokernelTester()
28396 .mr(4)
28397 .nr(8)
28398 .kr(1)
28399 .sr(1)
28400 .m(4)
28401 .n(8)
28402 .k(1)
28403 .qmax(128)
28404 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28405 }
28406
28407 TEST(F32_GEMMINC_4X8__FMA3_BROADCAST, strided_cm) {
28408 TEST_REQUIRES_X86_FMA3;
28409 GemmMicrokernelTester()
28410 .mr(4)
28411 .nr(8)
28412 .kr(1)
28413 .sr(1)
28414 .m(4)
28415 .n(8)
28416 .k(1)
28417 .cm_stride(11)
28418 .Test(xnn_f32_gemminc_ukernel_4x8__fma3_broadcast);
28419 }
28420#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28421
28422
28423#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28424 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1) {
28425 TEST_REQUIRES_X86_FMA3;
28426 GemmMicrokernelTester()
28427 .mr(5)
28428 .nr(8)
28429 .kr(1)
28430 .sr(1)
28431 .m(5)
28432 .n(8)
28433 .k(1)
28434 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28435 }
28436
28437 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, strided_cn) {
28438 TEST_REQUIRES_X86_FMA3;
28439 GemmMicrokernelTester()
28440 .mr(5)
28441 .nr(8)
28442 .kr(1)
28443 .sr(1)
28444 .m(5)
28445 .n(8)
28446 .k(1)
28447 .cn_stride(11)
28448 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28449 }
28450
28451 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_strided_a) {
28452 TEST_REQUIRES_X86_FMA3;
28453 GemmMicrokernelTester()
28454 .mr(5)
28455 .nr(8)
28456 .kr(1)
28457 .sr(1)
28458 .m(5)
28459 .n(8)
28460 .k(1)
28461 .a_stride(3)
28462 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28463 }
28464
28465 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
28466 TEST_REQUIRES_X86_FMA3;
28467 for (uint32_t m = 1; m <= 5; m++) {
28468 for (uint32_t n = 1; n <= 8; n++) {
28469 GemmMicrokernelTester()
28470 .mr(5)
28471 .nr(8)
28472 .kr(1)
28473 .sr(1)
28474 .m(m)
28475 .n(n)
28476 .k(1)
28477 .iterations(1)
28478 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28479 }
28480 }
28481 }
28482
28483 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
28484 TEST_REQUIRES_X86_FMA3;
28485 for (uint32_t m = 1; m <= 5; m++) {
28486 GemmMicrokernelTester()
28487 .mr(5)
28488 .nr(8)
28489 .kr(1)
28490 .sr(1)
28491 .m(m)
28492 .n(8)
28493 .k(1)
28494 .iterations(1)
28495 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28496 }
28497 }
28498
28499 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
28500 TEST_REQUIRES_X86_FMA3;
28501 for (uint32_t n = 1; n <= 8; n++) {
28502 GemmMicrokernelTester()
28503 .mr(5)
28504 .nr(8)
28505 .kr(1)
28506 .sr(1)
28507 .m(5)
28508 .n(n)
28509 .k(1)
28510 .iterations(1)
28511 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28512 }
28513 }
28514
28515 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_gt_1) {
28516 TEST_REQUIRES_X86_FMA3;
28517 for (size_t k = 2; k < 10; k++) {
28518 GemmMicrokernelTester()
28519 .mr(5)
28520 .nr(8)
28521 .kr(1)
28522 .sr(1)
28523 .m(5)
28524 .n(8)
28525 .k(k)
28526 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28527 }
28528 }
28529
28530 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_gt_1_strided_a) {
28531 TEST_REQUIRES_X86_FMA3;
28532 for (size_t k = 2; k < 10; k++) {
28533 GemmMicrokernelTester()
28534 .mr(5)
28535 .nr(8)
28536 .kr(1)
28537 .sr(1)
28538 .m(5)
28539 .n(8)
28540 .k(k)
28541 .a_stride(11)
28542 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28543 }
28544 }
28545
28546 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
28547 TEST_REQUIRES_X86_FMA3;
28548 for (size_t k = 2; k < 10; k++) {
28549 for (uint32_t m = 1; m <= 5; m++) {
28550 for (uint32_t n = 1; n <= 8; n++) {
28551 GemmMicrokernelTester()
28552 .mr(5)
28553 .nr(8)
28554 .kr(1)
28555 .sr(1)
28556 .m(m)
28557 .n(n)
28558 .k(k)
28559 .iterations(1)
28560 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28561 }
28562 }
28563 }
28564 }
28565
28566 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8) {
28567 TEST_REQUIRES_X86_FMA3;
28568 for (uint32_t n = 9; n < 16; n++) {
28569 for (size_t k = 1; k <= 5; k += 2) {
28570 GemmMicrokernelTester()
28571 .mr(5)
28572 .nr(8)
28573 .kr(1)
28574 .sr(1)
28575 .m(5)
28576 .n(8)
28577 .k(k)
28578 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28579 }
28580 }
28581 }
28582
28583 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
28584 TEST_REQUIRES_X86_FMA3;
28585 for (uint32_t n = 9; n < 16; n++) {
28586 for (size_t k = 1; k <= 5; k += 2) {
28587 GemmMicrokernelTester()
28588 .mr(5)
28589 .nr(8)
28590 .kr(1)
28591 .sr(1)
28592 .m(5)
28593 .n(8)
28594 .k(k)
28595 .cn_stride(11)
28596 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28597 }
28598 }
28599 }
28600
28601 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8_strided_a) {
28602 TEST_REQUIRES_X86_FMA3;
28603 for (uint32_t n = 9; n < 16; n++) {
28604 for (size_t k = 1; k <= 5; k += 2) {
28605 GemmMicrokernelTester()
28606 .mr(5)
28607 .nr(8)
28608 .kr(1)
28609 .sr(1)
28610 .m(5)
28611 .n(n)
28612 .k(k)
28613 .a_stride(7)
28614 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28615 }
28616 }
28617 }
28618
28619 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
28620 TEST_REQUIRES_X86_FMA3;
28621 for (uint32_t n = 9; n < 16; n++) {
28622 for (size_t k = 1; k <= 5; k += 2) {
28623 for (uint32_t m = 1; m <= 5; m++) {
28624 GemmMicrokernelTester()
28625 .mr(5)
28626 .nr(8)
28627 .kr(1)
28628 .sr(1)
28629 .m(m)
28630 .n(n)
28631 .k(k)
28632 .iterations(1)
28633 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28634 }
28635 }
28636 }
28637 }
28638
28639 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8) {
28640 TEST_REQUIRES_X86_FMA3;
28641 for (uint32_t n = 16; n <= 24; n += 8) {
28642 for (size_t k = 1; k <= 5; k += 2) {
28643 GemmMicrokernelTester()
28644 .mr(5)
28645 .nr(8)
28646 .kr(1)
28647 .sr(1)
28648 .m(5)
28649 .n(8)
28650 .k(k)
28651 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28652 }
28653 }
28654 }
28655
28656 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
28657 TEST_REQUIRES_X86_FMA3;
28658 for (uint32_t n = 16; n <= 24; n += 8) {
28659 for (size_t k = 1; k <= 5; k += 2) {
28660 GemmMicrokernelTester()
28661 .mr(5)
28662 .nr(8)
28663 .kr(1)
28664 .sr(1)
28665 .m(5)
28666 .n(n)
28667 .k(k)
28668 .cn_stride(11)
28669 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28670 }
28671 }
28672 }
28673
28674 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8_strided_a) {
28675 TEST_REQUIRES_X86_FMA3;
28676 for (uint32_t n = 16; n <= 24; n += 8) {
28677 for (size_t k = 1; k <= 5; k += 2) {
28678 GemmMicrokernelTester()
28679 .mr(5)
28680 .nr(8)
28681 .kr(1)
28682 .sr(1)
28683 .m(5)
28684 .n(n)
28685 .k(k)
28686 .a_stride(7)
28687 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28688 }
28689 }
28690 }
28691
28692 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, n_div_8_subtile) {
28693 TEST_REQUIRES_X86_FMA3;
28694 for (uint32_t n = 16; n <= 24; n += 8) {
28695 for (size_t k = 1; k <= 5; k += 2) {
28696 for (uint32_t m = 1; m <= 5; m++) {
28697 GemmMicrokernelTester()
28698 .mr(5)
28699 .nr(8)
28700 .kr(1)
28701 .sr(1)
28702 .m(m)
28703 .n(n)
28704 .k(k)
28705 .iterations(1)
28706 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28707 }
28708 }
28709 }
28710 }
28711
28712 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, strided_cm_subtile) {
28713 TEST_REQUIRES_X86_FMA3;
28714 for (size_t k = 1; k <= 5; k += 2) {
28715 for (uint32_t m = 1; m <= 5; m++) {
28716 for (uint32_t n = 1; n <= 8; n++) {
28717 GemmMicrokernelTester()
28718 .mr(5)
28719 .nr(8)
28720 .kr(1)
28721 .sr(1)
28722 .m(m)
28723 .n(n)
28724 .k(k)
28725 .cm_stride(11)
28726 .iterations(1)
28727 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28728 }
28729 }
28730 }
28731 }
28732
28733 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, qmin) {
28734 TEST_REQUIRES_X86_FMA3;
28735 GemmMicrokernelTester()
28736 .mr(5)
28737 .nr(8)
28738 .kr(1)
28739 .sr(1)
28740 .m(5)
28741 .n(8)
28742 .k(1)
28743 .qmin(128)
28744 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28745 }
28746
28747 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, qmax) {
28748 TEST_REQUIRES_X86_FMA3;
28749 GemmMicrokernelTester()
28750 .mr(5)
28751 .nr(8)
28752 .kr(1)
28753 .sr(1)
28754 .m(5)
28755 .n(8)
28756 .k(1)
28757 .qmax(128)
28758 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28759 }
28760
28761 TEST(F32_GEMMINC_5X8__FMA3_BROADCAST, strided_cm) {
28762 TEST_REQUIRES_X86_FMA3;
28763 GemmMicrokernelTester()
28764 .mr(5)
28765 .nr(8)
28766 .kr(1)
28767 .sr(1)
28768 .m(5)
28769 .n(8)
28770 .k(1)
28771 .cm_stride(11)
28772 .Test(xnn_f32_gemminc_ukernel_5x8__fma3_broadcast);
28773 }
28774#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28775
28776
28777#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28778 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1) {
28779 TEST_REQUIRES_X86_FMA3;
28780 GemmMicrokernelTester()
28781 .mr(6)
28782 .nr(8)
28783 .kr(1)
28784 .sr(1)
28785 .m(6)
28786 .n(8)
28787 .k(1)
28788 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28789 }
28790
28791 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, strided_cn) {
28792 TEST_REQUIRES_X86_FMA3;
28793 GemmMicrokernelTester()
28794 .mr(6)
28795 .nr(8)
28796 .kr(1)
28797 .sr(1)
28798 .m(6)
28799 .n(8)
28800 .k(1)
28801 .cn_stride(11)
28802 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28803 }
28804
28805 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_strided_a) {
28806 TEST_REQUIRES_X86_FMA3;
28807 GemmMicrokernelTester()
28808 .mr(6)
28809 .nr(8)
28810 .kr(1)
28811 .sr(1)
28812 .m(6)
28813 .n(8)
28814 .k(1)
28815 .a_stride(3)
28816 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28817 }
28818
28819 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_subtile) {
28820 TEST_REQUIRES_X86_FMA3;
28821 for (uint32_t m = 1; m <= 6; m++) {
28822 for (uint32_t n = 1; n <= 8; n++) {
28823 GemmMicrokernelTester()
28824 .mr(6)
28825 .nr(8)
28826 .kr(1)
28827 .sr(1)
28828 .m(m)
28829 .n(n)
28830 .k(1)
28831 .iterations(1)
28832 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28833 }
28834 }
28835 }
28836
28837 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
28838 TEST_REQUIRES_X86_FMA3;
28839 for (uint32_t m = 1; m <= 6; m++) {
28840 GemmMicrokernelTester()
28841 .mr(6)
28842 .nr(8)
28843 .kr(1)
28844 .sr(1)
28845 .m(m)
28846 .n(8)
28847 .k(1)
28848 .iterations(1)
28849 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28850 }
28851 }
28852
28853 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
28854 TEST_REQUIRES_X86_FMA3;
28855 for (uint32_t n = 1; n <= 8; n++) {
28856 GemmMicrokernelTester()
28857 .mr(6)
28858 .nr(8)
28859 .kr(1)
28860 .sr(1)
28861 .m(6)
28862 .n(n)
28863 .k(1)
28864 .iterations(1)
28865 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28866 }
28867 }
28868
28869 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_gt_1) {
28870 TEST_REQUIRES_X86_FMA3;
28871 for (size_t k = 2; k < 10; k++) {
28872 GemmMicrokernelTester()
28873 .mr(6)
28874 .nr(8)
28875 .kr(1)
28876 .sr(1)
28877 .m(6)
28878 .n(8)
28879 .k(k)
28880 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28881 }
28882 }
28883
28884 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_gt_1_strided_a) {
28885 TEST_REQUIRES_X86_FMA3;
28886 for (size_t k = 2; k < 10; k++) {
28887 GemmMicrokernelTester()
28888 .mr(6)
28889 .nr(8)
28890 .kr(1)
28891 .sr(1)
28892 .m(6)
28893 .n(8)
28894 .k(k)
28895 .a_stride(11)
28896 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28897 }
28898 }
28899
28900 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, k_gt_1_subtile) {
28901 TEST_REQUIRES_X86_FMA3;
28902 for (size_t k = 2; k < 10; k++) {
28903 for (uint32_t m = 1; m <= 6; m++) {
28904 for (uint32_t n = 1; n <= 8; n++) {
28905 GemmMicrokernelTester()
28906 .mr(6)
28907 .nr(8)
28908 .kr(1)
28909 .sr(1)
28910 .m(m)
28911 .n(n)
28912 .k(k)
28913 .iterations(1)
28914 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28915 }
28916 }
28917 }
28918 }
28919
28920 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8) {
28921 TEST_REQUIRES_X86_FMA3;
28922 for (uint32_t n = 9; n < 16; n++) {
28923 for (size_t k = 1; k <= 5; k += 2) {
28924 GemmMicrokernelTester()
28925 .mr(6)
28926 .nr(8)
28927 .kr(1)
28928 .sr(1)
28929 .m(6)
28930 .n(8)
28931 .k(k)
28932 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28933 }
28934 }
28935 }
28936
28937 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
28938 TEST_REQUIRES_X86_FMA3;
28939 for (uint32_t n = 9; n < 16; n++) {
28940 for (size_t k = 1; k <= 5; k += 2) {
28941 GemmMicrokernelTester()
28942 .mr(6)
28943 .nr(8)
28944 .kr(1)
28945 .sr(1)
28946 .m(6)
28947 .n(8)
28948 .k(k)
28949 .cn_stride(11)
28950 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28951 }
28952 }
28953 }
28954
28955 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8_strided_a) {
28956 TEST_REQUIRES_X86_FMA3;
28957 for (uint32_t n = 9; n < 16; n++) {
28958 for (size_t k = 1; k <= 5; k += 2) {
28959 GemmMicrokernelTester()
28960 .mr(6)
28961 .nr(8)
28962 .kr(1)
28963 .sr(1)
28964 .m(6)
28965 .n(n)
28966 .k(k)
28967 .a_stride(7)
28968 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28969 }
28970 }
28971 }
28972
28973 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_gt_8_subtile) {
28974 TEST_REQUIRES_X86_FMA3;
28975 for (uint32_t n = 9; n < 16; n++) {
28976 for (size_t k = 1; k <= 5; k += 2) {
28977 for (uint32_t m = 1; m <= 6; m++) {
28978 GemmMicrokernelTester()
28979 .mr(6)
28980 .nr(8)
28981 .kr(1)
28982 .sr(1)
28983 .m(m)
28984 .n(n)
28985 .k(k)
28986 .iterations(1)
28987 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
28988 }
28989 }
28990 }
28991 }
28992
28993 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8) {
28994 TEST_REQUIRES_X86_FMA3;
28995 for (uint32_t n = 16; n <= 24; n += 8) {
28996 for (size_t k = 1; k <= 5; k += 2) {
28997 GemmMicrokernelTester()
28998 .mr(6)
28999 .nr(8)
29000 .kr(1)
29001 .sr(1)
29002 .m(6)
29003 .n(8)
29004 .k(k)
29005 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29006 }
29007 }
29008 }
29009
29010 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8_strided_cn) {
29011 TEST_REQUIRES_X86_FMA3;
29012 for (uint32_t n = 16; n <= 24; n += 8) {
29013 for (size_t k = 1; k <= 5; k += 2) {
29014 GemmMicrokernelTester()
29015 .mr(6)
29016 .nr(8)
29017 .kr(1)
29018 .sr(1)
29019 .m(6)
29020 .n(n)
29021 .k(k)
29022 .cn_stride(11)
29023 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29024 }
29025 }
29026 }
29027
29028 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8_strided_a) {
29029 TEST_REQUIRES_X86_FMA3;
29030 for (uint32_t n = 16; n <= 24; n += 8) {
29031 for (size_t k = 1; k <= 5; k += 2) {
29032 GemmMicrokernelTester()
29033 .mr(6)
29034 .nr(8)
29035 .kr(1)
29036 .sr(1)
29037 .m(6)
29038 .n(n)
29039 .k(k)
29040 .a_stride(7)
29041 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29042 }
29043 }
29044 }
29045
29046 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, n_div_8_subtile) {
29047 TEST_REQUIRES_X86_FMA3;
29048 for (uint32_t n = 16; n <= 24; n += 8) {
29049 for (size_t k = 1; k <= 5; k += 2) {
29050 for (uint32_t m = 1; m <= 6; m++) {
29051 GemmMicrokernelTester()
29052 .mr(6)
29053 .nr(8)
29054 .kr(1)
29055 .sr(1)
29056 .m(m)
29057 .n(n)
29058 .k(k)
29059 .iterations(1)
29060 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29061 }
29062 }
29063 }
29064 }
29065
29066 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, strided_cm_subtile) {
29067 TEST_REQUIRES_X86_FMA3;
29068 for (size_t k = 1; k <= 5; k += 2) {
29069 for (uint32_t m = 1; m <= 6; m++) {
29070 for (uint32_t n = 1; n <= 8; n++) {
29071 GemmMicrokernelTester()
29072 .mr(6)
29073 .nr(8)
29074 .kr(1)
29075 .sr(1)
29076 .m(m)
29077 .n(n)
29078 .k(k)
29079 .cm_stride(11)
29080 .iterations(1)
29081 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29082 }
29083 }
29084 }
29085 }
29086
29087 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, qmin) {
29088 TEST_REQUIRES_X86_FMA3;
29089 GemmMicrokernelTester()
29090 .mr(6)
29091 .nr(8)
29092 .kr(1)
29093 .sr(1)
29094 .m(6)
29095 .n(8)
29096 .k(1)
29097 .qmin(128)
29098 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29099 }
29100
29101 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, qmax) {
29102 TEST_REQUIRES_X86_FMA3;
29103 GemmMicrokernelTester()
29104 .mr(6)
29105 .nr(8)
29106 .kr(1)
29107 .sr(1)
29108 .m(6)
29109 .n(8)
29110 .k(1)
29111 .qmax(128)
29112 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29113 }
29114
29115 TEST(F32_GEMMINC_6X8__FMA3_BROADCAST, strided_cm) {
29116 TEST_REQUIRES_X86_FMA3;
29117 GemmMicrokernelTester()
29118 .mr(6)
29119 .nr(8)
29120 .kr(1)
29121 .sr(1)
29122 .m(6)
29123 .n(8)
29124 .k(1)
29125 .cm_stride(11)
29126 .Test(xnn_f32_gemminc_ukernel_6x8__fma3_broadcast);
29127 }
29128#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29129
29130
29131#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29132 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1) {
29133 TEST_REQUIRES_X86_FMA3;
29134 GemmMicrokernelTester()
29135 .mr(7)
29136 .nr(8)
29137 .kr(1)
29138 .sr(1)
29139 .m(7)
29140 .n(8)
29141 .k(1)
29142 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29143 }
29144
29145 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, strided_cn) {
29146 TEST_REQUIRES_X86_FMA3;
29147 GemmMicrokernelTester()
29148 .mr(7)
29149 .nr(8)
29150 .kr(1)
29151 .sr(1)
29152 .m(7)
29153 .n(8)
29154 .k(1)
29155 .cn_stride(11)
29156 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29157 }
29158
29159 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_strided_a) {
29160 TEST_REQUIRES_X86_FMA3;
29161 GemmMicrokernelTester()
29162 .mr(7)
29163 .nr(8)
29164 .kr(1)
29165 .sr(1)
29166 .m(7)
29167 .n(8)
29168 .k(1)
29169 .a_stride(3)
29170 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29171 }
29172
29173 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_subtile) {
29174 TEST_REQUIRES_X86_FMA3;
29175 for (uint32_t m = 1; m <= 7; m++) {
29176 for (uint32_t n = 1; n <= 8; n++) {
29177 GemmMicrokernelTester()
29178 .mr(7)
29179 .nr(8)
29180 .kr(1)
29181 .sr(1)
29182 .m(m)
29183 .n(n)
29184 .k(1)
29185 .iterations(1)
29186 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29187 }
29188 }
29189 }
29190
29191 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
29192 TEST_REQUIRES_X86_FMA3;
29193 for (uint32_t m = 1; m <= 7; m++) {
29194 GemmMicrokernelTester()
29195 .mr(7)
29196 .nr(8)
29197 .kr(1)
29198 .sr(1)
29199 .m(m)
29200 .n(8)
29201 .k(1)
29202 .iterations(1)
29203 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29204 }
29205 }
29206
29207 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
29208 TEST_REQUIRES_X86_FMA3;
29209 for (uint32_t n = 1; n <= 8; n++) {
29210 GemmMicrokernelTester()
29211 .mr(7)
29212 .nr(8)
29213 .kr(1)
29214 .sr(1)
29215 .m(7)
29216 .n(n)
29217 .k(1)
29218 .iterations(1)
29219 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29220 }
29221 }
29222
29223 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_gt_1) {
29224 TEST_REQUIRES_X86_FMA3;
29225 for (size_t k = 2; k < 10; k++) {
29226 GemmMicrokernelTester()
29227 .mr(7)
29228 .nr(8)
29229 .kr(1)
29230 .sr(1)
29231 .m(7)
29232 .n(8)
29233 .k(k)
29234 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29235 }
29236 }
29237
29238 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_gt_1_strided_a) {
29239 TEST_REQUIRES_X86_FMA3;
29240 for (size_t k = 2; k < 10; k++) {
29241 GemmMicrokernelTester()
29242 .mr(7)
29243 .nr(8)
29244 .kr(1)
29245 .sr(1)
29246 .m(7)
29247 .n(8)
29248 .k(k)
29249 .a_stride(11)
29250 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29251 }
29252 }
29253
29254 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, k_gt_1_subtile) {
29255 TEST_REQUIRES_X86_FMA3;
29256 for (size_t k = 2; k < 10; k++) {
29257 for (uint32_t m = 1; m <= 7; m++) {
29258 for (uint32_t n = 1; n <= 8; n++) {
29259 GemmMicrokernelTester()
29260 .mr(7)
29261 .nr(8)
29262 .kr(1)
29263 .sr(1)
29264 .m(m)
29265 .n(n)
29266 .k(k)
29267 .iterations(1)
29268 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29269 }
29270 }
29271 }
29272 }
29273
29274 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8) {
29275 TEST_REQUIRES_X86_FMA3;
29276 for (uint32_t n = 9; n < 16; n++) {
29277 for (size_t k = 1; k <= 5; k += 2) {
29278 GemmMicrokernelTester()
29279 .mr(7)
29280 .nr(8)
29281 .kr(1)
29282 .sr(1)
29283 .m(7)
29284 .n(8)
29285 .k(k)
29286 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29287 }
29288 }
29289 }
29290
29291 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
29292 TEST_REQUIRES_X86_FMA3;
29293 for (uint32_t n = 9; n < 16; n++) {
29294 for (size_t k = 1; k <= 5; k += 2) {
29295 GemmMicrokernelTester()
29296 .mr(7)
29297 .nr(8)
29298 .kr(1)
29299 .sr(1)
29300 .m(7)
29301 .n(8)
29302 .k(k)
29303 .cn_stride(11)
29304 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29305 }
29306 }
29307 }
29308
29309 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8_strided_a) {
29310 TEST_REQUIRES_X86_FMA3;
29311 for (uint32_t n = 9; n < 16; n++) {
29312 for (size_t k = 1; k <= 5; k += 2) {
29313 GemmMicrokernelTester()
29314 .mr(7)
29315 .nr(8)
29316 .kr(1)
29317 .sr(1)
29318 .m(7)
29319 .n(n)
29320 .k(k)
29321 .a_stride(7)
29322 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29323 }
29324 }
29325 }
29326
29327 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_gt_8_subtile) {
29328 TEST_REQUIRES_X86_FMA3;
29329 for (uint32_t n = 9; n < 16; n++) {
29330 for (size_t k = 1; k <= 5; k += 2) {
29331 for (uint32_t m = 1; m <= 7; m++) {
29332 GemmMicrokernelTester()
29333 .mr(7)
29334 .nr(8)
29335 .kr(1)
29336 .sr(1)
29337 .m(m)
29338 .n(n)
29339 .k(k)
29340 .iterations(1)
29341 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29342 }
29343 }
29344 }
29345 }
29346
29347 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8) {
29348 TEST_REQUIRES_X86_FMA3;
29349 for (uint32_t n = 16; n <= 24; n += 8) {
29350 for (size_t k = 1; k <= 5; k += 2) {
29351 GemmMicrokernelTester()
29352 .mr(7)
29353 .nr(8)
29354 .kr(1)
29355 .sr(1)
29356 .m(7)
29357 .n(8)
29358 .k(k)
29359 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29360 }
29361 }
29362 }
29363
29364 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8_strided_cn) {
29365 TEST_REQUIRES_X86_FMA3;
29366 for (uint32_t n = 16; n <= 24; n += 8) {
29367 for (size_t k = 1; k <= 5; k += 2) {
29368 GemmMicrokernelTester()
29369 .mr(7)
29370 .nr(8)
29371 .kr(1)
29372 .sr(1)
29373 .m(7)
29374 .n(n)
29375 .k(k)
29376 .cn_stride(11)
29377 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29378 }
29379 }
29380 }
29381
29382 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8_strided_a) {
29383 TEST_REQUIRES_X86_FMA3;
29384 for (uint32_t n = 16; n <= 24; n += 8) {
29385 for (size_t k = 1; k <= 5; k += 2) {
29386 GemmMicrokernelTester()
29387 .mr(7)
29388 .nr(8)
29389 .kr(1)
29390 .sr(1)
29391 .m(7)
29392 .n(n)
29393 .k(k)
29394 .a_stride(7)
29395 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29396 }
29397 }
29398 }
29399
29400 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, n_div_8_subtile) {
29401 TEST_REQUIRES_X86_FMA3;
29402 for (uint32_t n = 16; n <= 24; n += 8) {
29403 for (size_t k = 1; k <= 5; k += 2) {
29404 for (uint32_t m = 1; m <= 7; m++) {
29405 GemmMicrokernelTester()
29406 .mr(7)
29407 .nr(8)
29408 .kr(1)
29409 .sr(1)
29410 .m(m)
29411 .n(n)
29412 .k(k)
29413 .iterations(1)
29414 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29415 }
29416 }
29417 }
29418 }
29419
29420 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, strided_cm_subtile) {
29421 TEST_REQUIRES_X86_FMA3;
29422 for (size_t k = 1; k <= 5; k += 2) {
29423 for (uint32_t m = 1; m <= 7; m++) {
29424 for (uint32_t n = 1; n <= 8; n++) {
29425 GemmMicrokernelTester()
29426 .mr(7)
29427 .nr(8)
29428 .kr(1)
29429 .sr(1)
29430 .m(m)
29431 .n(n)
29432 .k(k)
29433 .cm_stride(11)
29434 .iterations(1)
29435 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29436 }
29437 }
29438 }
29439 }
29440
29441 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, qmin) {
29442 TEST_REQUIRES_X86_FMA3;
29443 GemmMicrokernelTester()
29444 .mr(7)
29445 .nr(8)
29446 .kr(1)
29447 .sr(1)
29448 .m(7)
29449 .n(8)
29450 .k(1)
29451 .qmin(128)
29452 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29453 }
29454
29455 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, qmax) {
29456 TEST_REQUIRES_X86_FMA3;
29457 GemmMicrokernelTester()
29458 .mr(7)
29459 .nr(8)
29460 .kr(1)
29461 .sr(1)
29462 .m(7)
29463 .n(8)
29464 .k(1)
29465 .qmax(128)
29466 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29467 }
29468
29469 TEST(F32_GEMMINC_7X8__FMA3_BROADCAST, strided_cm) {
29470 TEST_REQUIRES_X86_FMA3;
29471 GemmMicrokernelTester()
29472 .mr(7)
29473 .nr(8)
29474 .kr(1)
29475 .sr(1)
29476 .m(7)
29477 .n(8)
29478 .k(1)
29479 .cm_stride(11)
29480 .Test(xnn_f32_gemminc_ukernel_7x8__fma3_broadcast);
29481 }
29482#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29483
29484
29485#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29486 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1) {
29487 TEST_REQUIRES_X86_FMA3;
29488 GemmMicrokernelTester()
29489 .mr(8)
29490 .nr(8)
29491 .kr(1)
29492 .sr(1)
29493 .m(8)
29494 .n(8)
29495 .k(1)
29496 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29497 }
29498
29499 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, strided_cn) {
29500 TEST_REQUIRES_X86_FMA3;
29501 GemmMicrokernelTester()
29502 .mr(8)
29503 .nr(8)
29504 .kr(1)
29505 .sr(1)
29506 .m(8)
29507 .n(8)
29508 .k(1)
29509 .cn_stride(11)
29510 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29511 }
29512
29513 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_strided_a) {
29514 TEST_REQUIRES_X86_FMA3;
29515 GemmMicrokernelTester()
29516 .mr(8)
29517 .nr(8)
29518 .kr(1)
29519 .sr(1)
29520 .m(8)
29521 .n(8)
29522 .k(1)
29523 .a_stride(3)
29524 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29525 }
29526
29527 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_subtile) {
29528 TEST_REQUIRES_X86_FMA3;
29529 for (uint32_t m = 1; m <= 8; m++) {
29530 for (uint32_t n = 1; n <= 8; n++) {
29531 GemmMicrokernelTester()
29532 .mr(8)
29533 .nr(8)
29534 .kr(1)
29535 .sr(1)
29536 .m(m)
29537 .n(n)
29538 .k(1)
29539 .iterations(1)
29540 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29541 }
29542 }
29543 }
29544
29545 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
29546 TEST_REQUIRES_X86_FMA3;
29547 for (uint32_t m = 1; m <= 8; m++) {
29548 GemmMicrokernelTester()
29549 .mr(8)
29550 .nr(8)
29551 .kr(1)
29552 .sr(1)
29553 .m(m)
29554 .n(8)
29555 .k(1)
29556 .iterations(1)
29557 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29558 }
29559 }
29560
29561 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
29562 TEST_REQUIRES_X86_FMA3;
29563 for (uint32_t n = 1; n <= 8; n++) {
29564 GemmMicrokernelTester()
29565 .mr(8)
29566 .nr(8)
29567 .kr(1)
29568 .sr(1)
29569 .m(8)
29570 .n(n)
29571 .k(1)
29572 .iterations(1)
29573 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29574 }
29575 }
29576
29577 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_gt_1) {
29578 TEST_REQUIRES_X86_FMA3;
29579 for (size_t k = 2; k < 10; k++) {
29580 GemmMicrokernelTester()
29581 .mr(8)
29582 .nr(8)
29583 .kr(1)
29584 .sr(1)
29585 .m(8)
29586 .n(8)
29587 .k(k)
29588 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29589 }
29590 }
29591
29592 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_gt_1_strided_a) {
29593 TEST_REQUIRES_X86_FMA3;
29594 for (size_t k = 2; k < 10; k++) {
29595 GemmMicrokernelTester()
29596 .mr(8)
29597 .nr(8)
29598 .kr(1)
29599 .sr(1)
29600 .m(8)
29601 .n(8)
29602 .k(k)
29603 .a_stride(11)
29604 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29605 }
29606 }
29607
29608 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, k_gt_1_subtile) {
29609 TEST_REQUIRES_X86_FMA3;
29610 for (size_t k = 2; k < 10; k++) {
29611 for (uint32_t m = 1; m <= 8; m++) {
29612 for (uint32_t n = 1; n <= 8; n++) {
29613 GemmMicrokernelTester()
29614 .mr(8)
29615 .nr(8)
29616 .kr(1)
29617 .sr(1)
29618 .m(m)
29619 .n(n)
29620 .k(k)
29621 .iterations(1)
29622 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29623 }
29624 }
29625 }
29626 }
29627
29628 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8) {
29629 TEST_REQUIRES_X86_FMA3;
29630 for (uint32_t n = 9; n < 16; n++) {
29631 for (size_t k = 1; k <= 5; k += 2) {
29632 GemmMicrokernelTester()
29633 .mr(8)
29634 .nr(8)
29635 .kr(1)
29636 .sr(1)
29637 .m(8)
29638 .n(8)
29639 .k(k)
29640 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29641 }
29642 }
29643 }
29644
29645 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
29646 TEST_REQUIRES_X86_FMA3;
29647 for (uint32_t n = 9; n < 16; n++) {
29648 for (size_t k = 1; k <= 5; k += 2) {
29649 GemmMicrokernelTester()
29650 .mr(8)
29651 .nr(8)
29652 .kr(1)
29653 .sr(1)
29654 .m(8)
29655 .n(8)
29656 .k(k)
29657 .cn_stride(11)
29658 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29659 }
29660 }
29661 }
29662
29663 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8_strided_a) {
29664 TEST_REQUIRES_X86_FMA3;
29665 for (uint32_t n = 9; n < 16; n++) {
29666 for (size_t k = 1; k <= 5; k += 2) {
29667 GemmMicrokernelTester()
29668 .mr(8)
29669 .nr(8)
29670 .kr(1)
29671 .sr(1)
29672 .m(8)
29673 .n(n)
29674 .k(k)
29675 .a_stride(7)
29676 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29677 }
29678 }
29679 }
29680
29681 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_gt_8_subtile) {
29682 TEST_REQUIRES_X86_FMA3;
29683 for (uint32_t n = 9; n < 16; n++) {
29684 for (size_t k = 1; k <= 5; k += 2) {
29685 for (uint32_t m = 1; m <= 8; m++) {
29686 GemmMicrokernelTester()
29687 .mr(8)
29688 .nr(8)
29689 .kr(1)
29690 .sr(1)
29691 .m(m)
29692 .n(n)
29693 .k(k)
29694 .iterations(1)
29695 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29696 }
29697 }
29698 }
29699 }
29700
29701 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8) {
29702 TEST_REQUIRES_X86_FMA3;
29703 for (uint32_t n = 16; n <= 24; n += 8) {
29704 for (size_t k = 1; k <= 5; k += 2) {
29705 GemmMicrokernelTester()
29706 .mr(8)
29707 .nr(8)
29708 .kr(1)
29709 .sr(1)
29710 .m(8)
29711 .n(8)
29712 .k(k)
29713 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29714 }
29715 }
29716 }
29717
29718 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8_strided_cn) {
29719 TEST_REQUIRES_X86_FMA3;
29720 for (uint32_t n = 16; n <= 24; n += 8) {
29721 for (size_t k = 1; k <= 5; k += 2) {
29722 GemmMicrokernelTester()
29723 .mr(8)
29724 .nr(8)
29725 .kr(1)
29726 .sr(1)
29727 .m(8)
29728 .n(n)
29729 .k(k)
29730 .cn_stride(11)
29731 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29732 }
29733 }
29734 }
29735
29736 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8_strided_a) {
29737 TEST_REQUIRES_X86_FMA3;
29738 for (uint32_t n = 16; n <= 24; n += 8) {
29739 for (size_t k = 1; k <= 5; k += 2) {
29740 GemmMicrokernelTester()
29741 .mr(8)
29742 .nr(8)
29743 .kr(1)
29744 .sr(1)
29745 .m(8)
29746 .n(n)
29747 .k(k)
29748 .a_stride(7)
29749 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29750 }
29751 }
29752 }
29753
29754 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, n_div_8_subtile) {
29755 TEST_REQUIRES_X86_FMA3;
29756 for (uint32_t n = 16; n <= 24; n += 8) {
29757 for (size_t k = 1; k <= 5; k += 2) {
29758 for (uint32_t m = 1; m <= 8; m++) {
29759 GemmMicrokernelTester()
29760 .mr(8)
29761 .nr(8)
29762 .kr(1)
29763 .sr(1)
29764 .m(m)
29765 .n(n)
29766 .k(k)
29767 .iterations(1)
29768 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29769 }
29770 }
29771 }
29772 }
29773
29774 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, strided_cm_subtile) {
29775 TEST_REQUIRES_X86_FMA3;
29776 for (size_t k = 1; k <= 5; k += 2) {
29777 for (uint32_t m = 1; m <= 8; m++) {
29778 for (uint32_t n = 1; n <= 8; n++) {
29779 GemmMicrokernelTester()
29780 .mr(8)
29781 .nr(8)
29782 .kr(1)
29783 .sr(1)
29784 .m(m)
29785 .n(n)
29786 .k(k)
29787 .cm_stride(11)
29788 .iterations(1)
29789 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29790 }
29791 }
29792 }
29793 }
29794
29795 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, qmin) {
29796 TEST_REQUIRES_X86_FMA3;
29797 GemmMicrokernelTester()
29798 .mr(8)
29799 .nr(8)
29800 .kr(1)
29801 .sr(1)
29802 .m(8)
29803 .n(8)
29804 .k(1)
29805 .qmin(128)
29806 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29807 }
29808
29809 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, qmax) {
29810 TEST_REQUIRES_X86_FMA3;
29811 GemmMicrokernelTester()
29812 .mr(8)
29813 .nr(8)
29814 .kr(1)
29815 .sr(1)
29816 .m(8)
29817 .n(8)
29818 .k(1)
29819 .qmax(128)
29820 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29821 }
29822
29823 TEST(F32_GEMMINC_8X8__FMA3_BROADCAST, strided_cm) {
29824 TEST_REQUIRES_X86_FMA3;
29825 GemmMicrokernelTester()
29826 .mr(8)
29827 .nr(8)
29828 .kr(1)
29829 .sr(1)
29830 .m(8)
29831 .n(8)
29832 .k(1)
29833 .cm_stride(11)
29834 .Test(xnn_f32_gemminc_ukernel_8x8__fma3_broadcast);
29835 }
29836#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29837
29838
Marat Dukhan0f349c42019-11-27 11:58:54 -080029839#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan27121322019-12-09 14:57:40 -080029840 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1) {
29841 TEST_REQUIRES_X86_FMA3;
29842 GemmMicrokernelTester()
29843 .mr(1)
29844 .nr(16)
29845 .kr(1)
29846 .sr(1)
29847 .m(1)
29848 .n(16)
29849 .k(1)
29850 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29851 }
29852
29853 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, strided_cn) {
29854 TEST_REQUIRES_X86_FMA3;
29855 GemmMicrokernelTester()
29856 .mr(1)
29857 .nr(16)
29858 .kr(1)
29859 .sr(1)
29860 .m(1)
29861 .n(16)
29862 .k(1)
29863 .cn_stride(19)
29864 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29865 }
29866
29867 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_strided_a) {
29868 TEST_REQUIRES_X86_FMA3;
29869 GemmMicrokernelTester()
29870 .mr(1)
29871 .nr(16)
29872 .kr(1)
29873 .sr(1)
29874 .m(1)
29875 .n(16)
29876 .k(1)
29877 .a_stride(3)
29878 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29879 }
29880
29881 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
29882 TEST_REQUIRES_X86_FMA3;
29883 for (uint32_t m = 1; m <= 1; m++) {
29884 for (uint32_t n = 1; n <= 16; n++) {
29885 GemmMicrokernelTester()
29886 .mr(1)
29887 .nr(16)
29888 .kr(1)
29889 .sr(1)
29890 .m(m)
29891 .n(n)
29892 .k(1)
29893 .iterations(1)
29894 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29895 }
29896 }
29897 }
29898
29899 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
29900 TEST_REQUIRES_X86_FMA3;
29901 for (uint32_t m = 1; m <= 1; m++) {
29902 GemmMicrokernelTester()
29903 .mr(1)
29904 .nr(16)
29905 .kr(1)
29906 .sr(1)
29907 .m(m)
29908 .n(16)
29909 .k(1)
29910 .iterations(1)
29911 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29912 }
29913 }
29914
29915 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
29916 TEST_REQUIRES_X86_FMA3;
29917 for (uint32_t n = 1; n <= 16; n++) {
29918 GemmMicrokernelTester()
29919 .mr(1)
29920 .nr(16)
29921 .kr(1)
29922 .sr(1)
29923 .m(1)
29924 .n(n)
29925 .k(1)
29926 .iterations(1)
29927 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29928 }
29929 }
29930
29931 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_gt_1) {
29932 TEST_REQUIRES_X86_FMA3;
29933 for (size_t k = 2; k < 10; k++) {
29934 GemmMicrokernelTester()
29935 .mr(1)
29936 .nr(16)
29937 .kr(1)
29938 .sr(1)
29939 .m(1)
29940 .n(16)
29941 .k(k)
29942 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29943 }
29944 }
29945
29946 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_gt_1_strided_a) {
29947 TEST_REQUIRES_X86_FMA3;
29948 for (size_t k = 2; k < 10; k++) {
29949 GemmMicrokernelTester()
29950 .mr(1)
29951 .nr(16)
29952 .kr(1)
29953 .sr(1)
29954 .m(1)
29955 .n(16)
29956 .k(k)
29957 .a_stride(11)
29958 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29959 }
29960 }
29961
29962 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
29963 TEST_REQUIRES_X86_FMA3;
29964 for (size_t k = 2; k < 10; k++) {
29965 for (uint32_t m = 1; m <= 1; m++) {
29966 for (uint32_t n = 1; n <= 16; n++) {
29967 GemmMicrokernelTester()
29968 .mr(1)
29969 .nr(16)
29970 .kr(1)
29971 .sr(1)
29972 .m(m)
29973 .n(n)
29974 .k(k)
29975 .iterations(1)
29976 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29977 }
29978 }
29979 }
29980 }
29981
29982 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16) {
29983 TEST_REQUIRES_X86_FMA3;
29984 for (uint32_t n = 17; n < 32; n++) {
29985 for (size_t k = 1; k <= 5; k += 2) {
29986 GemmMicrokernelTester()
29987 .mr(1)
29988 .nr(16)
29989 .kr(1)
29990 .sr(1)
29991 .m(1)
29992 .n(16)
29993 .k(k)
29994 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
29995 }
29996 }
29997 }
29998
29999 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
30000 TEST_REQUIRES_X86_FMA3;
30001 for (uint32_t n = 17; n < 32; n++) {
30002 for (size_t k = 1; k <= 5; k += 2) {
30003 GemmMicrokernelTester()
30004 .mr(1)
30005 .nr(16)
30006 .kr(1)
30007 .sr(1)
30008 .m(1)
30009 .n(16)
30010 .k(k)
30011 .cn_stride(19)
30012 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30013 }
30014 }
30015 }
30016
30017 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16_strided_a) {
30018 TEST_REQUIRES_X86_FMA3;
30019 for (uint32_t n = 17; n < 32; n++) {
30020 for (size_t k = 1; k <= 5; k += 2) {
30021 GemmMicrokernelTester()
30022 .mr(1)
30023 .nr(16)
30024 .kr(1)
30025 .sr(1)
30026 .m(1)
30027 .n(n)
30028 .k(k)
30029 .a_stride(7)
30030 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30031 }
30032 }
30033 }
30034
30035 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
30036 TEST_REQUIRES_X86_FMA3;
30037 for (uint32_t n = 17; n < 32; n++) {
30038 for (size_t k = 1; k <= 5; k += 2) {
30039 for (uint32_t m = 1; m <= 1; m++) {
30040 GemmMicrokernelTester()
30041 .mr(1)
30042 .nr(16)
30043 .kr(1)
30044 .sr(1)
30045 .m(m)
30046 .n(n)
30047 .k(k)
30048 .iterations(1)
30049 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30050 }
30051 }
30052 }
30053 }
30054
30055 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16) {
30056 TEST_REQUIRES_X86_FMA3;
30057 for (uint32_t n = 32; n <= 48; n += 16) {
30058 for (size_t k = 1; k <= 5; k += 2) {
30059 GemmMicrokernelTester()
30060 .mr(1)
30061 .nr(16)
30062 .kr(1)
30063 .sr(1)
30064 .m(1)
30065 .n(16)
30066 .k(k)
30067 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30068 }
30069 }
30070 }
30071
30072 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
30073 TEST_REQUIRES_X86_FMA3;
30074 for (uint32_t n = 32; n <= 48; n += 16) {
30075 for (size_t k = 1; k <= 5; k += 2) {
30076 GemmMicrokernelTester()
30077 .mr(1)
30078 .nr(16)
30079 .kr(1)
30080 .sr(1)
30081 .m(1)
30082 .n(n)
30083 .k(k)
30084 .cn_stride(19)
30085 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30086 }
30087 }
30088 }
30089
30090 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16_strided_a) {
30091 TEST_REQUIRES_X86_FMA3;
30092 for (uint32_t n = 32; n <= 48; n += 16) {
30093 for (size_t k = 1; k <= 5; k += 2) {
30094 GemmMicrokernelTester()
30095 .mr(1)
30096 .nr(16)
30097 .kr(1)
30098 .sr(1)
30099 .m(1)
30100 .n(n)
30101 .k(k)
30102 .a_stride(7)
30103 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30104 }
30105 }
30106 }
30107
30108 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, n_div_16_subtile) {
30109 TEST_REQUIRES_X86_FMA3;
30110 for (uint32_t n = 32; n <= 48; n += 16) {
30111 for (size_t k = 1; k <= 5; k += 2) {
30112 for (uint32_t m = 1; m <= 1; m++) {
30113 GemmMicrokernelTester()
30114 .mr(1)
30115 .nr(16)
30116 .kr(1)
30117 .sr(1)
30118 .m(m)
30119 .n(n)
30120 .k(k)
30121 .iterations(1)
30122 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30123 }
30124 }
30125 }
30126 }
30127
30128 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, strided_cm_subtile) {
30129 TEST_REQUIRES_X86_FMA3;
30130 for (size_t k = 1; k <= 5; k += 2) {
30131 for (uint32_t m = 1; m <= 1; m++) {
30132 for (uint32_t n = 1; n <= 16; n++) {
30133 GemmMicrokernelTester()
30134 .mr(1)
30135 .nr(16)
30136 .kr(1)
30137 .sr(1)
30138 .m(m)
30139 .n(n)
30140 .k(k)
30141 .cm_stride(19)
30142 .iterations(1)
30143 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30144 }
30145 }
30146 }
30147 }
30148
30149 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, qmin) {
30150 TEST_REQUIRES_X86_FMA3;
30151 GemmMicrokernelTester()
30152 .mr(1)
30153 .nr(16)
30154 .kr(1)
30155 .sr(1)
30156 .m(1)
30157 .n(16)
30158 .k(1)
30159 .qmin(128)
30160 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30161 }
30162
30163 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, qmax) {
30164 TEST_REQUIRES_X86_FMA3;
30165 GemmMicrokernelTester()
30166 .mr(1)
30167 .nr(16)
30168 .kr(1)
30169 .sr(1)
30170 .m(1)
30171 .n(16)
30172 .k(1)
30173 .qmax(128)
30174 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30175 }
30176
30177 TEST(F32_GEMMINC_1X16__FMA3_BROADCAST, strided_cm) {
30178 TEST_REQUIRES_X86_FMA3;
30179 GemmMicrokernelTester()
30180 .mr(1)
30181 .nr(16)
30182 .kr(1)
30183 .sr(1)
30184 .m(1)
30185 .n(16)
30186 .k(1)
30187 .cm_stride(19)
30188 .Test(xnn_f32_gemminc_ukernel_1x16__fma3_broadcast);
30189 }
30190#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30191
30192
30193#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30194 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1) {
30195 TEST_REQUIRES_X86_FMA3;
30196 GemmMicrokernelTester()
30197 .mr(3)
30198 .nr(16)
30199 .kr(1)
30200 .sr(1)
30201 .m(3)
30202 .n(16)
30203 .k(1)
30204 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30205 }
30206
30207 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, strided_cn) {
30208 TEST_REQUIRES_X86_FMA3;
30209 GemmMicrokernelTester()
30210 .mr(3)
30211 .nr(16)
30212 .kr(1)
30213 .sr(1)
30214 .m(3)
30215 .n(16)
30216 .k(1)
30217 .cn_stride(19)
30218 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30219 }
30220
30221 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_strided_a) {
30222 TEST_REQUIRES_X86_FMA3;
30223 GemmMicrokernelTester()
30224 .mr(3)
30225 .nr(16)
30226 .kr(1)
30227 .sr(1)
30228 .m(3)
30229 .n(16)
30230 .k(1)
30231 .a_stride(3)
30232 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30233 }
30234
30235 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
30236 TEST_REQUIRES_X86_FMA3;
30237 for (uint32_t m = 1; m <= 3; m++) {
30238 for (uint32_t n = 1; n <= 16; n++) {
30239 GemmMicrokernelTester()
30240 .mr(3)
30241 .nr(16)
30242 .kr(1)
30243 .sr(1)
30244 .m(m)
30245 .n(n)
30246 .k(1)
30247 .iterations(1)
30248 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30249 }
30250 }
30251 }
30252
30253 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
30254 TEST_REQUIRES_X86_FMA3;
30255 for (uint32_t m = 1; m <= 3; m++) {
30256 GemmMicrokernelTester()
30257 .mr(3)
30258 .nr(16)
30259 .kr(1)
30260 .sr(1)
30261 .m(m)
30262 .n(16)
30263 .k(1)
30264 .iterations(1)
30265 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30266 }
30267 }
30268
30269 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
30270 TEST_REQUIRES_X86_FMA3;
30271 for (uint32_t n = 1; n <= 16; n++) {
30272 GemmMicrokernelTester()
30273 .mr(3)
30274 .nr(16)
30275 .kr(1)
30276 .sr(1)
30277 .m(3)
30278 .n(n)
30279 .k(1)
30280 .iterations(1)
30281 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30282 }
30283 }
30284
30285 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_gt_1) {
30286 TEST_REQUIRES_X86_FMA3;
30287 for (size_t k = 2; k < 10; k++) {
30288 GemmMicrokernelTester()
30289 .mr(3)
30290 .nr(16)
30291 .kr(1)
30292 .sr(1)
30293 .m(3)
30294 .n(16)
30295 .k(k)
30296 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30297 }
30298 }
30299
30300 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_gt_1_strided_a) {
30301 TEST_REQUIRES_X86_FMA3;
30302 for (size_t k = 2; k < 10; k++) {
30303 GemmMicrokernelTester()
30304 .mr(3)
30305 .nr(16)
30306 .kr(1)
30307 .sr(1)
30308 .m(3)
30309 .n(16)
30310 .k(k)
30311 .a_stride(11)
30312 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30313 }
30314 }
30315
30316 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
30317 TEST_REQUIRES_X86_FMA3;
30318 for (size_t k = 2; k < 10; k++) {
30319 for (uint32_t m = 1; m <= 3; m++) {
30320 for (uint32_t n = 1; n <= 16; n++) {
30321 GemmMicrokernelTester()
30322 .mr(3)
30323 .nr(16)
30324 .kr(1)
30325 .sr(1)
30326 .m(m)
30327 .n(n)
30328 .k(k)
30329 .iterations(1)
30330 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30331 }
30332 }
30333 }
30334 }
30335
30336 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16) {
30337 TEST_REQUIRES_X86_FMA3;
30338 for (uint32_t n = 17; n < 32; n++) {
30339 for (size_t k = 1; k <= 5; k += 2) {
30340 GemmMicrokernelTester()
30341 .mr(3)
30342 .nr(16)
30343 .kr(1)
30344 .sr(1)
30345 .m(3)
30346 .n(16)
30347 .k(k)
30348 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30349 }
30350 }
30351 }
30352
30353 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
30354 TEST_REQUIRES_X86_FMA3;
30355 for (uint32_t n = 17; n < 32; n++) {
30356 for (size_t k = 1; k <= 5; k += 2) {
30357 GemmMicrokernelTester()
30358 .mr(3)
30359 .nr(16)
30360 .kr(1)
30361 .sr(1)
30362 .m(3)
30363 .n(16)
30364 .k(k)
30365 .cn_stride(19)
30366 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30367 }
30368 }
30369 }
30370
30371 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16_strided_a) {
30372 TEST_REQUIRES_X86_FMA3;
30373 for (uint32_t n = 17; n < 32; n++) {
30374 for (size_t k = 1; k <= 5; k += 2) {
30375 GemmMicrokernelTester()
30376 .mr(3)
30377 .nr(16)
30378 .kr(1)
30379 .sr(1)
30380 .m(3)
30381 .n(n)
30382 .k(k)
30383 .a_stride(7)
30384 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30385 }
30386 }
30387 }
30388
30389 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
30390 TEST_REQUIRES_X86_FMA3;
30391 for (uint32_t n = 17; n < 32; n++) {
30392 for (size_t k = 1; k <= 5; k += 2) {
30393 for (uint32_t m = 1; m <= 3; m++) {
30394 GemmMicrokernelTester()
30395 .mr(3)
30396 .nr(16)
30397 .kr(1)
30398 .sr(1)
30399 .m(m)
30400 .n(n)
30401 .k(k)
30402 .iterations(1)
30403 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30404 }
30405 }
30406 }
30407 }
30408
30409 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16) {
30410 TEST_REQUIRES_X86_FMA3;
30411 for (uint32_t n = 32; n <= 48; n += 16) {
30412 for (size_t k = 1; k <= 5; k += 2) {
30413 GemmMicrokernelTester()
30414 .mr(3)
30415 .nr(16)
30416 .kr(1)
30417 .sr(1)
30418 .m(3)
30419 .n(16)
30420 .k(k)
30421 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30422 }
30423 }
30424 }
30425
30426 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
30427 TEST_REQUIRES_X86_FMA3;
30428 for (uint32_t n = 32; n <= 48; n += 16) {
30429 for (size_t k = 1; k <= 5; k += 2) {
30430 GemmMicrokernelTester()
30431 .mr(3)
30432 .nr(16)
30433 .kr(1)
30434 .sr(1)
30435 .m(3)
30436 .n(n)
30437 .k(k)
30438 .cn_stride(19)
30439 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30440 }
30441 }
30442 }
30443
30444 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16_strided_a) {
30445 TEST_REQUIRES_X86_FMA3;
30446 for (uint32_t n = 32; n <= 48; n += 16) {
30447 for (size_t k = 1; k <= 5; k += 2) {
30448 GemmMicrokernelTester()
30449 .mr(3)
30450 .nr(16)
30451 .kr(1)
30452 .sr(1)
30453 .m(3)
30454 .n(n)
30455 .k(k)
30456 .a_stride(7)
30457 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30458 }
30459 }
30460 }
30461
30462 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, n_div_16_subtile) {
30463 TEST_REQUIRES_X86_FMA3;
30464 for (uint32_t n = 32; n <= 48; n += 16) {
30465 for (size_t k = 1; k <= 5; k += 2) {
30466 for (uint32_t m = 1; m <= 3; m++) {
30467 GemmMicrokernelTester()
30468 .mr(3)
30469 .nr(16)
30470 .kr(1)
30471 .sr(1)
30472 .m(m)
30473 .n(n)
30474 .k(k)
30475 .iterations(1)
30476 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30477 }
30478 }
30479 }
30480 }
30481
30482 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, strided_cm_subtile) {
30483 TEST_REQUIRES_X86_FMA3;
30484 for (size_t k = 1; k <= 5; k += 2) {
30485 for (uint32_t m = 1; m <= 3; m++) {
30486 for (uint32_t n = 1; n <= 16; n++) {
30487 GemmMicrokernelTester()
30488 .mr(3)
30489 .nr(16)
30490 .kr(1)
30491 .sr(1)
30492 .m(m)
30493 .n(n)
30494 .k(k)
30495 .cm_stride(19)
30496 .iterations(1)
30497 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30498 }
30499 }
30500 }
30501 }
30502
30503 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, qmin) {
30504 TEST_REQUIRES_X86_FMA3;
30505 GemmMicrokernelTester()
30506 .mr(3)
30507 .nr(16)
30508 .kr(1)
30509 .sr(1)
30510 .m(3)
30511 .n(16)
30512 .k(1)
30513 .qmin(128)
30514 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30515 }
30516
30517 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, qmax) {
30518 TEST_REQUIRES_X86_FMA3;
30519 GemmMicrokernelTester()
30520 .mr(3)
30521 .nr(16)
30522 .kr(1)
30523 .sr(1)
30524 .m(3)
30525 .n(16)
30526 .k(1)
30527 .qmax(128)
30528 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30529 }
30530
30531 TEST(F32_GEMMINC_3X16__FMA3_BROADCAST, strided_cm) {
30532 TEST_REQUIRES_X86_FMA3;
30533 GemmMicrokernelTester()
30534 .mr(3)
30535 .nr(16)
30536 .kr(1)
30537 .sr(1)
30538 .m(3)
30539 .n(16)
30540 .k(1)
30541 .cm_stride(19)
30542 .Test(xnn_f32_gemminc_ukernel_3x16__fma3_broadcast);
30543 }
30544#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30545
30546
30547#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30548 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1) {
30549 TEST_REQUIRES_X86_FMA3;
30550 GemmMicrokernelTester()
30551 .mr(4)
30552 .nr(16)
30553 .kr(1)
30554 .sr(1)
30555 .m(4)
30556 .n(16)
30557 .k(1)
30558 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30559 }
30560
30561 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, strided_cn) {
30562 TEST_REQUIRES_X86_FMA3;
30563 GemmMicrokernelTester()
30564 .mr(4)
30565 .nr(16)
30566 .kr(1)
30567 .sr(1)
30568 .m(4)
30569 .n(16)
30570 .k(1)
30571 .cn_stride(19)
30572 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30573 }
30574
30575 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_strided_a) {
30576 TEST_REQUIRES_X86_FMA3;
30577 GemmMicrokernelTester()
30578 .mr(4)
30579 .nr(16)
30580 .kr(1)
30581 .sr(1)
30582 .m(4)
30583 .n(16)
30584 .k(1)
30585 .a_stride(3)
30586 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30587 }
30588
30589 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_subtile) {
30590 TEST_REQUIRES_X86_FMA3;
30591 for (uint32_t m = 1; m <= 4; m++) {
30592 for (uint32_t n = 1; n <= 16; n++) {
30593 GemmMicrokernelTester()
30594 .mr(4)
30595 .nr(16)
30596 .kr(1)
30597 .sr(1)
30598 .m(m)
30599 .n(n)
30600 .k(1)
30601 .iterations(1)
30602 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30603 }
30604 }
30605 }
30606
30607 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
30608 TEST_REQUIRES_X86_FMA3;
30609 for (uint32_t m = 1; m <= 4; m++) {
30610 GemmMicrokernelTester()
30611 .mr(4)
30612 .nr(16)
30613 .kr(1)
30614 .sr(1)
30615 .m(m)
30616 .n(16)
30617 .k(1)
30618 .iterations(1)
30619 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30620 }
30621 }
30622
30623 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
30624 TEST_REQUIRES_X86_FMA3;
30625 for (uint32_t n = 1; n <= 16; n++) {
30626 GemmMicrokernelTester()
30627 .mr(4)
30628 .nr(16)
30629 .kr(1)
30630 .sr(1)
30631 .m(4)
30632 .n(n)
30633 .k(1)
30634 .iterations(1)
30635 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30636 }
30637 }
30638
30639 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_gt_1) {
30640 TEST_REQUIRES_X86_FMA3;
30641 for (size_t k = 2; k < 10; k++) {
30642 GemmMicrokernelTester()
30643 .mr(4)
30644 .nr(16)
30645 .kr(1)
30646 .sr(1)
30647 .m(4)
30648 .n(16)
30649 .k(k)
30650 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30651 }
30652 }
30653
30654 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_gt_1_strided_a) {
30655 TEST_REQUIRES_X86_FMA3;
30656 for (size_t k = 2; k < 10; k++) {
30657 GemmMicrokernelTester()
30658 .mr(4)
30659 .nr(16)
30660 .kr(1)
30661 .sr(1)
30662 .m(4)
30663 .n(16)
30664 .k(k)
30665 .a_stride(11)
30666 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30667 }
30668 }
30669
30670 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, k_gt_1_subtile) {
30671 TEST_REQUIRES_X86_FMA3;
30672 for (size_t k = 2; k < 10; k++) {
30673 for (uint32_t m = 1; m <= 4; m++) {
30674 for (uint32_t n = 1; n <= 16; n++) {
30675 GemmMicrokernelTester()
30676 .mr(4)
30677 .nr(16)
30678 .kr(1)
30679 .sr(1)
30680 .m(m)
30681 .n(n)
30682 .k(k)
30683 .iterations(1)
30684 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30685 }
30686 }
30687 }
30688 }
30689
30690 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16) {
30691 TEST_REQUIRES_X86_FMA3;
30692 for (uint32_t n = 17; n < 32; n++) {
30693 for (size_t k = 1; k <= 5; k += 2) {
30694 GemmMicrokernelTester()
30695 .mr(4)
30696 .nr(16)
30697 .kr(1)
30698 .sr(1)
30699 .m(4)
30700 .n(16)
30701 .k(k)
30702 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30703 }
30704 }
30705 }
30706
30707 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
30708 TEST_REQUIRES_X86_FMA3;
30709 for (uint32_t n = 17; n < 32; n++) {
30710 for (size_t k = 1; k <= 5; k += 2) {
30711 GemmMicrokernelTester()
30712 .mr(4)
30713 .nr(16)
30714 .kr(1)
30715 .sr(1)
30716 .m(4)
30717 .n(16)
30718 .k(k)
30719 .cn_stride(19)
30720 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30721 }
30722 }
30723 }
30724
30725 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16_strided_a) {
30726 TEST_REQUIRES_X86_FMA3;
30727 for (uint32_t n = 17; n < 32; n++) {
30728 for (size_t k = 1; k <= 5; k += 2) {
30729 GemmMicrokernelTester()
30730 .mr(4)
30731 .nr(16)
30732 .kr(1)
30733 .sr(1)
30734 .m(4)
30735 .n(n)
30736 .k(k)
30737 .a_stride(7)
30738 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30739 }
30740 }
30741 }
30742
30743 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_gt_16_subtile) {
30744 TEST_REQUIRES_X86_FMA3;
30745 for (uint32_t n = 17; n < 32; n++) {
30746 for (size_t k = 1; k <= 5; k += 2) {
30747 for (uint32_t m = 1; m <= 4; m++) {
30748 GemmMicrokernelTester()
30749 .mr(4)
30750 .nr(16)
30751 .kr(1)
30752 .sr(1)
30753 .m(m)
30754 .n(n)
30755 .k(k)
30756 .iterations(1)
30757 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30758 }
30759 }
30760 }
30761 }
30762
30763 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16) {
30764 TEST_REQUIRES_X86_FMA3;
30765 for (uint32_t n = 32; n <= 48; n += 16) {
30766 for (size_t k = 1; k <= 5; k += 2) {
30767 GemmMicrokernelTester()
30768 .mr(4)
30769 .nr(16)
30770 .kr(1)
30771 .sr(1)
30772 .m(4)
30773 .n(16)
30774 .k(k)
30775 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30776 }
30777 }
30778 }
30779
30780 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16_strided_cn) {
30781 TEST_REQUIRES_X86_FMA3;
30782 for (uint32_t n = 32; n <= 48; n += 16) {
30783 for (size_t k = 1; k <= 5; k += 2) {
30784 GemmMicrokernelTester()
30785 .mr(4)
30786 .nr(16)
30787 .kr(1)
30788 .sr(1)
30789 .m(4)
30790 .n(n)
30791 .k(k)
30792 .cn_stride(19)
30793 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30794 }
30795 }
30796 }
30797
30798 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16_strided_a) {
30799 TEST_REQUIRES_X86_FMA3;
30800 for (uint32_t n = 32; n <= 48; n += 16) {
30801 for (size_t k = 1; k <= 5; k += 2) {
30802 GemmMicrokernelTester()
30803 .mr(4)
30804 .nr(16)
30805 .kr(1)
30806 .sr(1)
30807 .m(4)
30808 .n(n)
30809 .k(k)
30810 .a_stride(7)
30811 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30812 }
30813 }
30814 }
30815
30816 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, n_div_16_subtile) {
30817 TEST_REQUIRES_X86_FMA3;
30818 for (uint32_t n = 32; n <= 48; n += 16) {
30819 for (size_t k = 1; k <= 5; k += 2) {
30820 for (uint32_t m = 1; m <= 4; m++) {
30821 GemmMicrokernelTester()
30822 .mr(4)
30823 .nr(16)
30824 .kr(1)
30825 .sr(1)
30826 .m(m)
30827 .n(n)
30828 .k(k)
30829 .iterations(1)
30830 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30831 }
30832 }
30833 }
30834 }
30835
30836 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, strided_cm_subtile) {
30837 TEST_REQUIRES_X86_FMA3;
30838 for (size_t k = 1; k <= 5; k += 2) {
30839 for (uint32_t m = 1; m <= 4; m++) {
30840 for (uint32_t n = 1; n <= 16; n++) {
30841 GemmMicrokernelTester()
30842 .mr(4)
30843 .nr(16)
30844 .kr(1)
30845 .sr(1)
30846 .m(m)
30847 .n(n)
30848 .k(k)
30849 .cm_stride(19)
30850 .iterations(1)
30851 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30852 }
30853 }
30854 }
30855 }
30856
30857 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, qmin) {
30858 TEST_REQUIRES_X86_FMA3;
30859 GemmMicrokernelTester()
30860 .mr(4)
30861 .nr(16)
30862 .kr(1)
30863 .sr(1)
30864 .m(4)
30865 .n(16)
30866 .k(1)
30867 .qmin(128)
30868 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30869 }
30870
30871 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, qmax) {
30872 TEST_REQUIRES_X86_FMA3;
30873 GemmMicrokernelTester()
30874 .mr(4)
30875 .nr(16)
30876 .kr(1)
30877 .sr(1)
30878 .m(4)
30879 .n(16)
30880 .k(1)
30881 .qmax(128)
30882 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30883 }
30884
30885 TEST(F32_GEMMINC_4X16__FMA3_BROADCAST, strided_cm) {
30886 TEST_REQUIRES_X86_FMA3;
30887 GemmMicrokernelTester()
30888 .mr(4)
30889 .nr(16)
30890 .kr(1)
30891 .sr(1)
30892 .m(4)
30893 .n(16)
30894 .k(1)
30895 .cm_stride(19)
30896 .Test(xnn_f32_gemminc_ukernel_4x16__fma3_broadcast);
30897 }
30898#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30899
30900
30901#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30902 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1) {
30903 TEST_REQUIRES_X86_FMA3;
30904 GemmMicrokernelTester()
30905 .mr(5)
30906 .nr(16)
30907 .kr(1)
30908 .sr(1)
30909 .m(5)
30910 .n(16)
30911 .k(1)
30912 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
30913 }
30914
30915 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, strided_cn) {
30916 TEST_REQUIRES_X86_FMA3;
30917 GemmMicrokernelTester()
30918 .mr(5)
30919 .nr(16)
30920 .kr(1)
30921 .sr(1)
30922 .m(5)
30923 .n(16)
30924 .k(1)
30925 .cn_stride(19)
30926 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
30927 }
30928
30929 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_strided_a) {
30930 TEST_REQUIRES_X86_FMA3;
30931 GemmMicrokernelTester()
30932 .mr(5)
30933 .nr(16)
30934 .kr(1)
30935 .sr(1)
30936 .m(5)
30937 .n(16)
30938 .k(1)
30939 .a_stride(3)
30940 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
30941 }
30942
30943 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_subtile) {
30944 TEST_REQUIRES_X86_FMA3;
30945 for (uint32_t m = 1; m <= 5; m++) {
30946 for (uint32_t n = 1; n <= 16; n++) {
30947 GemmMicrokernelTester()
30948 .mr(5)
30949 .nr(16)
30950 .kr(1)
30951 .sr(1)
30952 .m(m)
30953 .n(n)
30954 .k(1)
30955 .iterations(1)
30956 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
30957 }
30958 }
30959 }
30960
30961 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
30962 TEST_REQUIRES_X86_FMA3;
30963 for (uint32_t m = 1; m <= 5; m++) {
30964 GemmMicrokernelTester()
30965 .mr(5)
30966 .nr(16)
30967 .kr(1)
30968 .sr(1)
30969 .m(m)
30970 .n(16)
30971 .k(1)
30972 .iterations(1)
30973 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
30974 }
30975 }
30976
30977 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
30978 TEST_REQUIRES_X86_FMA3;
30979 for (uint32_t n = 1; n <= 16; n++) {
30980 GemmMicrokernelTester()
30981 .mr(5)
30982 .nr(16)
30983 .kr(1)
30984 .sr(1)
30985 .m(5)
30986 .n(n)
30987 .k(1)
30988 .iterations(1)
30989 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
30990 }
30991 }
30992
30993 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_gt_1) {
30994 TEST_REQUIRES_X86_FMA3;
30995 for (size_t k = 2; k < 10; k++) {
30996 GemmMicrokernelTester()
30997 .mr(5)
30998 .nr(16)
30999 .kr(1)
31000 .sr(1)
31001 .m(5)
31002 .n(16)
31003 .k(k)
31004 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31005 }
31006 }
31007
31008 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_gt_1_strided_a) {
31009 TEST_REQUIRES_X86_FMA3;
31010 for (size_t k = 2; k < 10; k++) {
31011 GemmMicrokernelTester()
31012 .mr(5)
31013 .nr(16)
31014 .kr(1)
31015 .sr(1)
31016 .m(5)
31017 .n(16)
31018 .k(k)
31019 .a_stride(11)
31020 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31021 }
31022 }
31023
31024 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, k_gt_1_subtile) {
31025 TEST_REQUIRES_X86_FMA3;
31026 for (size_t k = 2; k < 10; k++) {
31027 for (uint32_t m = 1; m <= 5; m++) {
31028 for (uint32_t n = 1; n <= 16; n++) {
31029 GemmMicrokernelTester()
31030 .mr(5)
31031 .nr(16)
31032 .kr(1)
31033 .sr(1)
31034 .m(m)
31035 .n(n)
31036 .k(k)
31037 .iterations(1)
31038 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31039 }
31040 }
31041 }
31042 }
31043
31044 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16) {
31045 TEST_REQUIRES_X86_FMA3;
31046 for (uint32_t n = 17; n < 32; n++) {
31047 for (size_t k = 1; k <= 5; k += 2) {
31048 GemmMicrokernelTester()
31049 .mr(5)
31050 .nr(16)
31051 .kr(1)
31052 .sr(1)
31053 .m(5)
31054 .n(16)
31055 .k(k)
31056 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31057 }
31058 }
31059 }
31060
31061 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
31062 TEST_REQUIRES_X86_FMA3;
31063 for (uint32_t n = 17; n < 32; n++) {
31064 for (size_t k = 1; k <= 5; k += 2) {
31065 GemmMicrokernelTester()
31066 .mr(5)
31067 .nr(16)
31068 .kr(1)
31069 .sr(1)
31070 .m(5)
31071 .n(16)
31072 .k(k)
31073 .cn_stride(19)
31074 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31075 }
31076 }
31077 }
31078
31079 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16_strided_a) {
31080 TEST_REQUIRES_X86_FMA3;
31081 for (uint32_t n = 17; n < 32; n++) {
31082 for (size_t k = 1; k <= 5; k += 2) {
31083 GemmMicrokernelTester()
31084 .mr(5)
31085 .nr(16)
31086 .kr(1)
31087 .sr(1)
31088 .m(5)
31089 .n(n)
31090 .k(k)
31091 .a_stride(7)
31092 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31093 }
31094 }
31095 }
31096
31097 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_gt_16_subtile) {
31098 TEST_REQUIRES_X86_FMA3;
31099 for (uint32_t n = 17; n < 32; n++) {
31100 for (size_t k = 1; k <= 5; k += 2) {
31101 for (uint32_t m = 1; m <= 5; m++) {
31102 GemmMicrokernelTester()
31103 .mr(5)
31104 .nr(16)
31105 .kr(1)
31106 .sr(1)
31107 .m(m)
31108 .n(n)
31109 .k(k)
31110 .iterations(1)
31111 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31112 }
31113 }
31114 }
31115 }
31116
31117 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16) {
31118 TEST_REQUIRES_X86_FMA3;
31119 for (uint32_t n = 32; n <= 48; n += 16) {
31120 for (size_t k = 1; k <= 5; k += 2) {
31121 GemmMicrokernelTester()
31122 .mr(5)
31123 .nr(16)
31124 .kr(1)
31125 .sr(1)
31126 .m(5)
31127 .n(16)
31128 .k(k)
31129 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31130 }
31131 }
31132 }
31133
31134 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16_strided_cn) {
31135 TEST_REQUIRES_X86_FMA3;
31136 for (uint32_t n = 32; n <= 48; n += 16) {
31137 for (size_t k = 1; k <= 5; k += 2) {
31138 GemmMicrokernelTester()
31139 .mr(5)
31140 .nr(16)
31141 .kr(1)
31142 .sr(1)
31143 .m(5)
31144 .n(n)
31145 .k(k)
31146 .cn_stride(19)
31147 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31148 }
31149 }
31150 }
31151
31152 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16_strided_a) {
31153 TEST_REQUIRES_X86_FMA3;
31154 for (uint32_t n = 32; n <= 48; n += 16) {
31155 for (size_t k = 1; k <= 5; k += 2) {
31156 GemmMicrokernelTester()
31157 .mr(5)
31158 .nr(16)
31159 .kr(1)
31160 .sr(1)
31161 .m(5)
31162 .n(n)
31163 .k(k)
31164 .a_stride(7)
31165 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31166 }
31167 }
31168 }
31169
31170 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, n_div_16_subtile) {
31171 TEST_REQUIRES_X86_FMA3;
31172 for (uint32_t n = 32; n <= 48; n += 16) {
31173 for (size_t k = 1; k <= 5; k += 2) {
31174 for (uint32_t m = 1; m <= 5; m++) {
31175 GemmMicrokernelTester()
31176 .mr(5)
31177 .nr(16)
31178 .kr(1)
31179 .sr(1)
31180 .m(m)
31181 .n(n)
31182 .k(k)
31183 .iterations(1)
31184 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31185 }
31186 }
31187 }
31188 }
31189
31190 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, strided_cm_subtile) {
31191 TEST_REQUIRES_X86_FMA3;
31192 for (size_t k = 1; k <= 5; k += 2) {
31193 for (uint32_t m = 1; m <= 5; m++) {
31194 for (uint32_t n = 1; n <= 16; n++) {
31195 GemmMicrokernelTester()
31196 .mr(5)
31197 .nr(16)
31198 .kr(1)
31199 .sr(1)
31200 .m(m)
31201 .n(n)
31202 .k(k)
31203 .cm_stride(19)
31204 .iterations(1)
31205 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31206 }
31207 }
31208 }
31209 }
31210
31211 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, qmin) {
31212 TEST_REQUIRES_X86_FMA3;
31213 GemmMicrokernelTester()
31214 .mr(5)
31215 .nr(16)
31216 .kr(1)
31217 .sr(1)
31218 .m(5)
31219 .n(16)
31220 .k(1)
31221 .qmin(128)
31222 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31223 }
31224
31225 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, qmax) {
31226 TEST_REQUIRES_X86_FMA3;
31227 GemmMicrokernelTester()
31228 .mr(5)
31229 .nr(16)
31230 .kr(1)
31231 .sr(1)
31232 .m(5)
31233 .n(16)
31234 .k(1)
31235 .qmax(128)
31236 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31237 }
31238
31239 TEST(F32_GEMMINC_5X16__FMA3_BROADCAST, strided_cm) {
31240 TEST_REQUIRES_X86_FMA3;
31241 GemmMicrokernelTester()
31242 .mr(5)
31243 .nr(16)
31244 .kr(1)
31245 .sr(1)
31246 .m(5)
31247 .n(16)
31248 .k(1)
31249 .cm_stride(19)
31250 .Test(xnn_f32_gemminc_ukernel_5x16__fma3_broadcast);
31251 }
31252#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31253
31254
31255#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31256 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4) {
31257 TEST_REQUIRES_X86_FMA3;
31258 GemmMicrokernelTester()
31259 .mr(1)
31260 .nr(16)
31261 .kr(1)
31262 .sr(4)
31263 .m(1)
31264 .n(16)
31265 .k(4)
31266 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31267 }
31268
31269 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, strided_cn) {
31270 TEST_REQUIRES_X86_FMA3;
31271 GemmMicrokernelTester()
31272 .mr(1)
31273 .nr(16)
31274 .kr(1)
31275 .sr(4)
31276 .m(1)
31277 .n(16)
31278 .k(4)
31279 .cn_stride(19)
31280 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31281 }
31282
31283 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
31284 TEST_REQUIRES_X86_FMA3;
31285 GemmMicrokernelTester()
31286 .mr(1)
31287 .nr(16)
31288 .kr(1)
31289 .sr(4)
31290 .m(1)
31291 .n(16)
31292 .k(4)
31293 .a_stride(7)
31294 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31295 }
31296
31297 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
31298 TEST_REQUIRES_X86_FMA3;
31299 for (uint32_t m = 1; m <= 1; m++) {
31300 for (uint32_t n = 1; n <= 16; n++) {
31301 GemmMicrokernelTester()
31302 .mr(1)
31303 .nr(16)
31304 .kr(1)
31305 .sr(4)
31306 .m(m)
31307 .n(n)
31308 .k(4)
31309 .iterations(1)
31310 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31311 }
31312 }
31313 }
31314
31315 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
31316 TEST_REQUIRES_X86_FMA3;
31317 for (uint32_t m = 1; m <= 1; m++) {
31318 GemmMicrokernelTester()
31319 .mr(1)
31320 .nr(16)
31321 .kr(1)
31322 .sr(4)
31323 .m(m)
31324 .n(16)
31325 .k(4)
31326 .iterations(1)
31327 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31328 }
31329 }
31330
31331 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
31332 TEST_REQUIRES_X86_FMA3;
31333 for (uint32_t n = 1; n <= 16; n++) {
31334 GemmMicrokernelTester()
31335 .mr(1)
31336 .nr(16)
31337 .kr(1)
31338 .sr(4)
31339 .m(1)
31340 .n(n)
31341 .k(4)
31342 .iterations(1)
31343 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31344 }
31345 }
31346
31347 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_lt_4) {
31348 TEST_REQUIRES_X86_FMA3;
31349 for (size_t k = 1; k < 4; k++) {
31350 GemmMicrokernelTester()
31351 .mr(1)
31352 .nr(16)
31353 .kr(1)
31354 .sr(4)
31355 .m(1)
31356 .n(16)
31357 .k(k)
31358 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31359 }
31360 }
31361
31362 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
31363 TEST_REQUIRES_X86_FMA3;
31364 for (size_t k = 1; k < 4; k++) {
31365 GemmMicrokernelTester()
31366 .mr(1)
31367 .nr(16)
31368 .kr(1)
31369 .sr(4)
31370 .m(1)
31371 .n(16)
31372 .k(k)
31373 .a_stride(7)
31374 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31375 }
31376 }
31377
31378 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
31379 TEST_REQUIRES_X86_FMA3;
31380 for (size_t k = 1; k < 4; k++) {
31381 for (uint32_t m = 1; m <= 1; m++) {
31382 for (uint32_t n = 1; n <= 16; n++) {
31383 GemmMicrokernelTester()
31384 .mr(1)
31385 .nr(16)
31386 .kr(1)
31387 .sr(4)
31388 .m(m)
31389 .n(n)
31390 .k(k)
31391 .iterations(1)
31392 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31393 }
31394 }
31395 }
31396 }
31397
31398 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_gt_4) {
31399 TEST_REQUIRES_X86_FMA3;
31400 for (size_t k = 5; k < 8; k++) {
31401 GemmMicrokernelTester()
31402 .mr(1)
31403 .nr(16)
31404 .kr(1)
31405 .sr(4)
31406 .m(1)
31407 .n(16)
31408 .k(k)
31409 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31410 }
31411 }
31412
31413 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
31414 TEST_REQUIRES_X86_FMA3;
31415 for (size_t k = 5; k < 8; k++) {
31416 GemmMicrokernelTester()
31417 .mr(1)
31418 .nr(16)
31419 .kr(1)
31420 .sr(4)
31421 .m(1)
31422 .n(16)
31423 .k(k)
31424 .a_stride(11)
31425 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31426 }
31427 }
31428
31429 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
31430 TEST_REQUIRES_X86_FMA3;
31431 for (size_t k = 5; k < 8; k++) {
31432 for (uint32_t m = 1; m <= 1; m++) {
31433 for (uint32_t n = 1; n <= 16; n++) {
31434 GemmMicrokernelTester()
31435 .mr(1)
31436 .nr(16)
31437 .kr(1)
31438 .sr(4)
31439 .m(m)
31440 .n(n)
31441 .k(k)
31442 .iterations(1)
31443 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31444 }
31445 }
31446 }
31447 }
31448
31449 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_div_4) {
31450 TEST_REQUIRES_X86_FMA3;
31451 for (size_t k = 8; k <= 40; k += 4) {
31452 GemmMicrokernelTester()
31453 .mr(1)
31454 .nr(16)
31455 .kr(1)
31456 .sr(4)
31457 .m(1)
31458 .n(16)
31459 .k(k)
31460 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31461 }
31462 }
31463
31464 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
31465 TEST_REQUIRES_X86_FMA3;
31466 for (size_t k = 8; k <= 40; k += 4) {
31467 GemmMicrokernelTester()
31468 .mr(1)
31469 .nr(16)
31470 .kr(1)
31471 .sr(4)
31472 .m(1)
31473 .n(16)
31474 .k(k)
31475 .a_stride(43)
31476 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31477 }
31478 }
31479
31480 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, k_div_4_subtile) {
31481 TEST_REQUIRES_X86_FMA3;
31482 for (size_t k = 8; k <= 40; k += 4) {
31483 for (uint32_t m = 1; m <= 1; m++) {
31484 for (uint32_t n = 1; n <= 16; n++) {
31485 GemmMicrokernelTester()
31486 .mr(1)
31487 .nr(16)
31488 .kr(1)
31489 .sr(4)
31490 .m(m)
31491 .n(n)
31492 .k(k)
31493 .iterations(1)
31494 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31495 }
31496 }
31497 }
31498 }
31499
31500 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16) {
31501 TEST_REQUIRES_X86_FMA3;
31502 for (uint32_t n = 17; n < 32; n++) {
31503 for (size_t k = 1; k <= 20; k += 5) {
31504 GemmMicrokernelTester()
31505 .mr(1)
31506 .nr(16)
31507 .kr(1)
31508 .sr(4)
31509 .m(1)
31510 .n(16)
31511 .k(k)
31512 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31513 }
31514 }
31515 }
31516
31517 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
31518 TEST_REQUIRES_X86_FMA3;
31519 for (uint32_t n = 17; n < 32; n++) {
31520 for (size_t k = 1; k <= 20; k += 5) {
31521 GemmMicrokernelTester()
31522 .mr(1)
31523 .nr(16)
31524 .kr(1)
31525 .sr(4)
31526 .m(1)
31527 .n(16)
31528 .k(k)
31529 .cn_stride(19)
31530 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31531 }
31532 }
31533 }
31534
31535 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
31536 TEST_REQUIRES_X86_FMA3;
31537 for (uint32_t n = 17; n < 32; n++) {
31538 for (size_t k = 1; k <= 20; k += 5) {
31539 GemmMicrokernelTester()
31540 .mr(1)
31541 .nr(16)
31542 .kr(1)
31543 .sr(4)
31544 .m(1)
31545 .n(n)
31546 .k(k)
31547 .a_stride(23)
31548 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31549 }
31550 }
31551 }
31552
31553 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
31554 TEST_REQUIRES_X86_FMA3;
31555 for (uint32_t n = 17; n < 32; n++) {
31556 for (size_t k = 1; k <= 20; k += 5) {
31557 for (uint32_t m = 1; m <= 1; m++) {
31558 GemmMicrokernelTester()
31559 .mr(1)
31560 .nr(16)
31561 .kr(1)
31562 .sr(4)
31563 .m(m)
31564 .n(n)
31565 .k(k)
31566 .iterations(1)
31567 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31568 }
31569 }
31570 }
31571 }
31572
31573 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16) {
31574 TEST_REQUIRES_X86_FMA3;
31575 for (uint32_t n = 32; n <= 48; n += 16) {
31576 for (size_t k = 1; k <= 20; k += 5) {
31577 GemmMicrokernelTester()
31578 .mr(1)
31579 .nr(16)
31580 .kr(1)
31581 .sr(4)
31582 .m(1)
31583 .n(16)
31584 .k(k)
31585 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31586 }
31587 }
31588 }
31589
31590 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
31591 TEST_REQUIRES_X86_FMA3;
31592 for (uint32_t n = 32; n <= 48; n += 16) {
31593 for (size_t k = 1; k <= 20; k += 5) {
31594 GemmMicrokernelTester()
31595 .mr(1)
31596 .nr(16)
31597 .kr(1)
31598 .sr(4)
31599 .m(1)
31600 .n(n)
31601 .k(k)
31602 .cn_stride(19)
31603 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31604 }
31605 }
31606 }
31607
31608 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
31609 TEST_REQUIRES_X86_FMA3;
31610 for (uint32_t n = 32; n <= 48; n += 16) {
31611 for (size_t k = 1; k <= 20; k += 5) {
31612 GemmMicrokernelTester()
31613 .mr(1)
31614 .nr(16)
31615 .kr(1)
31616 .sr(4)
31617 .m(1)
31618 .n(n)
31619 .k(k)
31620 .a_stride(23)
31621 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31622 }
31623 }
31624 }
31625
31626 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, n_div_16_subtile) {
31627 TEST_REQUIRES_X86_FMA3;
31628 for (uint32_t n = 32; n <= 48; n += 16) {
31629 for (size_t k = 1; k <= 20; k += 5) {
31630 for (uint32_t m = 1; m <= 1; m++) {
31631 GemmMicrokernelTester()
31632 .mr(1)
31633 .nr(16)
31634 .kr(1)
31635 .sr(4)
31636 .m(m)
31637 .n(n)
31638 .k(k)
31639 .iterations(1)
31640 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31641 }
31642 }
31643 }
31644 }
31645
31646 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, strided_cm_subtile) {
31647 TEST_REQUIRES_X86_FMA3;
31648 for (size_t k = 1; k <= 20; k += 5) {
31649 for (uint32_t m = 1; m <= 1; m++) {
31650 for (uint32_t n = 1; n <= 16; n++) {
31651 GemmMicrokernelTester()
31652 .mr(1)
31653 .nr(16)
31654 .kr(1)
31655 .sr(4)
31656 .m(m)
31657 .n(n)
31658 .k(k)
31659 .cm_stride(19)
31660 .iterations(1)
31661 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31662 }
31663 }
31664 }
31665 }
31666
31667 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, qmin) {
31668 TEST_REQUIRES_X86_FMA3;
31669 GemmMicrokernelTester()
31670 .mr(1)
31671 .nr(16)
31672 .kr(1)
31673 .sr(4)
31674 .m(1)
31675 .n(16)
31676 .k(4)
31677 .qmin(128)
31678 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31679 }
31680
31681 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, qmax) {
31682 TEST_REQUIRES_X86_FMA3;
31683 GemmMicrokernelTester()
31684 .mr(1)
31685 .nr(16)
31686 .kr(1)
31687 .sr(4)
31688 .m(1)
31689 .n(16)
31690 .k(4)
31691 .qmax(128)
31692 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31693 }
31694
31695 TEST(F32_GEMMINC_1X16S4__FMA3_BROADCAST, strided_cm) {
31696 TEST_REQUIRES_X86_FMA3;
31697 GemmMicrokernelTester()
31698 .mr(1)
31699 .nr(16)
31700 .kr(1)
31701 .sr(4)
31702 .m(1)
31703 .n(16)
31704 .k(4)
31705 .cm_stride(19)
31706 .Test(xnn_f32_gemminc_ukernel_1x16s4__fma3_broadcast);
31707 }
31708#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31709
31710
31711#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31712 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4) {
31713 TEST_REQUIRES_X86_FMA3;
31714 GemmMicrokernelTester()
31715 .mr(3)
31716 .nr(16)
31717 .kr(1)
31718 .sr(4)
31719 .m(3)
31720 .n(16)
31721 .k(4)
31722 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31723 }
31724
31725 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, strided_cn) {
31726 TEST_REQUIRES_X86_FMA3;
31727 GemmMicrokernelTester()
31728 .mr(3)
31729 .nr(16)
31730 .kr(1)
31731 .sr(4)
31732 .m(3)
31733 .n(16)
31734 .k(4)
31735 .cn_stride(19)
31736 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31737 }
31738
31739 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
31740 TEST_REQUIRES_X86_FMA3;
31741 GemmMicrokernelTester()
31742 .mr(3)
31743 .nr(16)
31744 .kr(1)
31745 .sr(4)
31746 .m(3)
31747 .n(16)
31748 .k(4)
31749 .a_stride(7)
31750 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31751 }
31752
31753 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
31754 TEST_REQUIRES_X86_FMA3;
31755 for (uint32_t m = 1; m <= 3; m++) {
31756 for (uint32_t n = 1; n <= 16; n++) {
31757 GemmMicrokernelTester()
31758 .mr(3)
31759 .nr(16)
31760 .kr(1)
31761 .sr(4)
31762 .m(m)
31763 .n(n)
31764 .k(4)
31765 .iterations(1)
31766 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31767 }
31768 }
31769 }
31770
31771 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
31772 TEST_REQUIRES_X86_FMA3;
31773 for (uint32_t m = 1; m <= 3; m++) {
31774 GemmMicrokernelTester()
31775 .mr(3)
31776 .nr(16)
31777 .kr(1)
31778 .sr(4)
31779 .m(m)
31780 .n(16)
31781 .k(4)
31782 .iterations(1)
31783 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31784 }
31785 }
31786
31787 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
31788 TEST_REQUIRES_X86_FMA3;
31789 for (uint32_t n = 1; n <= 16; n++) {
31790 GemmMicrokernelTester()
31791 .mr(3)
31792 .nr(16)
31793 .kr(1)
31794 .sr(4)
31795 .m(3)
31796 .n(n)
31797 .k(4)
31798 .iterations(1)
31799 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31800 }
31801 }
31802
31803 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_lt_4) {
31804 TEST_REQUIRES_X86_FMA3;
31805 for (size_t k = 1; k < 4; k++) {
31806 GemmMicrokernelTester()
31807 .mr(3)
31808 .nr(16)
31809 .kr(1)
31810 .sr(4)
31811 .m(3)
31812 .n(16)
31813 .k(k)
31814 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31815 }
31816 }
31817
31818 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
31819 TEST_REQUIRES_X86_FMA3;
31820 for (size_t k = 1; k < 4; k++) {
31821 GemmMicrokernelTester()
31822 .mr(3)
31823 .nr(16)
31824 .kr(1)
31825 .sr(4)
31826 .m(3)
31827 .n(16)
31828 .k(k)
31829 .a_stride(7)
31830 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31831 }
31832 }
31833
31834 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
31835 TEST_REQUIRES_X86_FMA3;
31836 for (size_t k = 1; k < 4; k++) {
31837 for (uint32_t m = 1; m <= 3; m++) {
31838 for (uint32_t n = 1; n <= 16; n++) {
31839 GemmMicrokernelTester()
31840 .mr(3)
31841 .nr(16)
31842 .kr(1)
31843 .sr(4)
31844 .m(m)
31845 .n(n)
31846 .k(k)
31847 .iterations(1)
31848 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31849 }
31850 }
31851 }
31852 }
31853
31854 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_gt_4) {
31855 TEST_REQUIRES_X86_FMA3;
31856 for (size_t k = 5; k < 8; k++) {
31857 GemmMicrokernelTester()
31858 .mr(3)
31859 .nr(16)
31860 .kr(1)
31861 .sr(4)
31862 .m(3)
31863 .n(16)
31864 .k(k)
31865 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31866 }
31867 }
31868
31869 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
31870 TEST_REQUIRES_X86_FMA3;
31871 for (size_t k = 5; k < 8; k++) {
31872 GemmMicrokernelTester()
31873 .mr(3)
31874 .nr(16)
31875 .kr(1)
31876 .sr(4)
31877 .m(3)
31878 .n(16)
31879 .k(k)
31880 .a_stride(11)
31881 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31882 }
31883 }
31884
31885 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
31886 TEST_REQUIRES_X86_FMA3;
31887 for (size_t k = 5; k < 8; k++) {
31888 for (uint32_t m = 1; m <= 3; m++) {
31889 for (uint32_t n = 1; n <= 16; n++) {
31890 GemmMicrokernelTester()
31891 .mr(3)
31892 .nr(16)
31893 .kr(1)
31894 .sr(4)
31895 .m(m)
31896 .n(n)
31897 .k(k)
31898 .iterations(1)
31899 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31900 }
31901 }
31902 }
31903 }
31904
31905 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_div_4) {
31906 TEST_REQUIRES_X86_FMA3;
31907 for (size_t k = 8; k <= 40; k += 4) {
31908 GemmMicrokernelTester()
31909 .mr(3)
31910 .nr(16)
31911 .kr(1)
31912 .sr(4)
31913 .m(3)
31914 .n(16)
31915 .k(k)
31916 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31917 }
31918 }
31919
31920 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
31921 TEST_REQUIRES_X86_FMA3;
31922 for (size_t k = 8; k <= 40; k += 4) {
31923 GemmMicrokernelTester()
31924 .mr(3)
31925 .nr(16)
31926 .kr(1)
31927 .sr(4)
31928 .m(3)
31929 .n(16)
31930 .k(k)
31931 .a_stride(43)
31932 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31933 }
31934 }
31935
31936 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, k_div_4_subtile) {
31937 TEST_REQUIRES_X86_FMA3;
31938 for (size_t k = 8; k <= 40; k += 4) {
31939 for (uint32_t m = 1; m <= 3; m++) {
31940 for (uint32_t n = 1; n <= 16; n++) {
31941 GemmMicrokernelTester()
31942 .mr(3)
31943 .nr(16)
31944 .kr(1)
31945 .sr(4)
31946 .m(m)
31947 .n(n)
31948 .k(k)
31949 .iterations(1)
31950 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31951 }
31952 }
31953 }
31954 }
31955
31956 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16) {
31957 TEST_REQUIRES_X86_FMA3;
31958 for (uint32_t n = 17; n < 32; n++) {
31959 for (size_t k = 1; k <= 20; k += 5) {
31960 GemmMicrokernelTester()
31961 .mr(3)
31962 .nr(16)
31963 .kr(1)
31964 .sr(4)
31965 .m(3)
31966 .n(16)
31967 .k(k)
31968 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31969 }
31970 }
31971 }
31972
31973 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
31974 TEST_REQUIRES_X86_FMA3;
31975 for (uint32_t n = 17; n < 32; n++) {
31976 for (size_t k = 1; k <= 20; k += 5) {
31977 GemmMicrokernelTester()
31978 .mr(3)
31979 .nr(16)
31980 .kr(1)
31981 .sr(4)
31982 .m(3)
31983 .n(16)
31984 .k(k)
31985 .cn_stride(19)
31986 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
31987 }
31988 }
31989 }
31990
31991 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
31992 TEST_REQUIRES_X86_FMA3;
31993 for (uint32_t n = 17; n < 32; n++) {
31994 for (size_t k = 1; k <= 20; k += 5) {
31995 GemmMicrokernelTester()
31996 .mr(3)
31997 .nr(16)
31998 .kr(1)
31999 .sr(4)
32000 .m(3)
32001 .n(n)
32002 .k(k)
32003 .a_stride(23)
32004 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32005 }
32006 }
32007 }
32008
32009 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
32010 TEST_REQUIRES_X86_FMA3;
32011 for (uint32_t n = 17; n < 32; n++) {
32012 for (size_t k = 1; k <= 20; k += 5) {
32013 for (uint32_t m = 1; m <= 3; m++) {
32014 GemmMicrokernelTester()
32015 .mr(3)
32016 .nr(16)
32017 .kr(1)
32018 .sr(4)
32019 .m(m)
32020 .n(n)
32021 .k(k)
32022 .iterations(1)
32023 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32024 }
32025 }
32026 }
32027 }
32028
32029 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16) {
32030 TEST_REQUIRES_X86_FMA3;
32031 for (uint32_t n = 32; n <= 48; n += 16) {
32032 for (size_t k = 1; k <= 20; k += 5) {
32033 GemmMicrokernelTester()
32034 .mr(3)
32035 .nr(16)
32036 .kr(1)
32037 .sr(4)
32038 .m(3)
32039 .n(16)
32040 .k(k)
32041 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32042 }
32043 }
32044 }
32045
32046 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
32047 TEST_REQUIRES_X86_FMA3;
32048 for (uint32_t n = 32; n <= 48; n += 16) {
32049 for (size_t k = 1; k <= 20; k += 5) {
32050 GemmMicrokernelTester()
32051 .mr(3)
32052 .nr(16)
32053 .kr(1)
32054 .sr(4)
32055 .m(3)
32056 .n(n)
32057 .k(k)
32058 .cn_stride(19)
32059 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32060 }
32061 }
32062 }
32063
32064 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
32065 TEST_REQUIRES_X86_FMA3;
32066 for (uint32_t n = 32; n <= 48; n += 16) {
32067 for (size_t k = 1; k <= 20; k += 5) {
32068 GemmMicrokernelTester()
32069 .mr(3)
32070 .nr(16)
32071 .kr(1)
32072 .sr(4)
32073 .m(3)
32074 .n(n)
32075 .k(k)
32076 .a_stride(23)
32077 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32078 }
32079 }
32080 }
32081
32082 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, n_div_16_subtile) {
32083 TEST_REQUIRES_X86_FMA3;
32084 for (uint32_t n = 32; n <= 48; n += 16) {
32085 for (size_t k = 1; k <= 20; k += 5) {
32086 for (uint32_t m = 1; m <= 3; m++) {
32087 GemmMicrokernelTester()
32088 .mr(3)
32089 .nr(16)
32090 .kr(1)
32091 .sr(4)
32092 .m(m)
32093 .n(n)
32094 .k(k)
32095 .iterations(1)
32096 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32097 }
32098 }
32099 }
32100 }
32101
32102 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, strided_cm_subtile) {
32103 TEST_REQUIRES_X86_FMA3;
32104 for (size_t k = 1; k <= 20; k += 5) {
32105 for (uint32_t m = 1; m <= 3; m++) {
32106 for (uint32_t n = 1; n <= 16; n++) {
32107 GemmMicrokernelTester()
32108 .mr(3)
32109 .nr(16)
32110 .kr(1)
32111 .sr(4)
32112 .m(m)
32113 .n(n)
32114 .k(k)
32115 .cm_stride(19)
32116 .iterations(1)
32117 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32118 }
32119 }
32120 }
32121 }
32122
32123 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, qmin) {
32124 TEST_REQUIRES_X86_FMA3;
32125 GemmMicrokernelTester()
32126 .mr(3)
32127 .nr(16)
32128 .kr(1)
32129 .sr(4)
32130 .m(3)
32131 .n(16)
32132 .k(4)
32133 .qmin(128)
32134 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32135 }
32136
32137 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, qmax) {
32138 TEST_REQUIRES_X86_FMA3;
32139 GemmMicrokernelTester()
32140 .mr(3)
32141 .nr(16)
32142 .kr(1)
32143 .sr(4)
32144 .m(3)
32145 .n(16)
32146 .k(4)
32147 .qmax(128)
32148 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32149 }
32150
32151 TEST(F32_GEMMINC_3X16S4__FMA3_BROADCAST, strided_cm) {
32152 TEST_REQUIRES_X86_FMA3;
32153 GemmMicrokernelTester()
32154 .mr(3)
32155 .nr(16)
32156 .kr(1)
32157 .sr(4)
32158 .m(3)
32159 .n(16)
32160 .k(4)
32161 .cm_stride(19)
32162 .Test(xnn_f32_gemminc_ukernel_3x16s4__fma3_broadcast);
32163 }
32164#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32165
32166
32167#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32168 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4) {
32169 TEST_REQUIRES_X86_FMA3;
32170 GemmMicrokernelTester()
32171 .mr(4)
32172 .nr(16)
32173 .kr(1)
32174 .sr(4)
32175 .m(4)
32176 .n(16)
32177 .k(4)
32178 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32179 }
32180
32181 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, strided_cn) {
32182 TEST_REQUIRES_X86_FMA3;
32183 GemmMicrokernelTester()
32184 .mr(4)
32185 .nr(16)
32186 .kr(1)
32187 .sr(4)
32188 .m(4)
32189 .n(16)
32190 .k(4)
32191 .cn_stride(19)
32192 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32193 }
32194
32195 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
32196 TEST_REQUIRES_X86_FMA3;
32197 GemmMicrokernelTester()
32198 .mr(4)
32199 .nr(16)
32200 .kr(1)
32201 .sr(4)
32202 .m(4)
32203 .n(16)
32204 .k(4)
32205 .a_stride(7)
32206 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32207 }
32208
32209 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
32210 TEST_REQUIRES_X86_FMA3;
32211 for (uint32_t m = 1; m <= 4; m++) {
32212 for (uint32_t n = 1; n <= 16; n++) {
32213 GemmMicrokernelTester()
32214 .mr(4)
32215 .nr(16)
32216 .kr(1)
32217 .sr(4)
32218 .m(m)
32219 .n(n)
32220 .k(4)
32221 .iterations(1)
32222 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32223 }
32224 }
32225 }
32226
32227 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
32228 TEST_REQUIRES_X86_FMA3;
32229 for (uint32_t m = 1; m <= 4; m++) {
32230 GemmMicrokernelTester()
32231 .mr(4)
32232 .nr(16)
32233 .kr(1)
32234 .sr(4)
32235 .m(m)
32236 .n(16)
32237 .k(4)
32238 .iterations(1)
32239 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32240 }
32241 }
32242
32243 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
32244 TEST_REQUIRES_X86_FMA3;
32245 for (uint32_t n = 1; n <= 16; n++) {
32246 GemmMicrokernelTester()
32247 .mr(4)
32248 .nr(16)
32249 .kr(1)
32250 .sr(4)
32251 .m(4)
32252 .n(n)
32253 .k(4)
32254 .iterations(1)
32255 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32256 }
32257 }
32258
32259 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_lt_4) {
32260 TEST_REQUIRES_X86_FMA3;
32261 for (size_t k = 1; k < 4; k++) {
32262 GemmMicrokernelTester()
32263 .mr(4)
32264 .nr(16)
32265 .kr(1)
32266 .sr(4)
32267 .m(4)
32268 .n(16)
32269 .k(k)
32270 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32271 }
32272 }
32273
32274 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
32275 TEST_REQUIRES_X86_FMA3;
32276 for (size_t k = 1; k < 4; k++) {
32277 GemmMicrokernelTester()
32278 .mr(4)
32279 .nr(16)
32280 .kr(1)
32281 .sr(4)
32282 .m(4)
32283 .n(16)
32284 .k(k)
32285 .a_stride(7)
32286 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32287 }
32288 }
32289
32290 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
32291 TEST_REQUIRES_X86_FMA3;
32292 for (size_t k = 1; k < 4; k++) {
32293 for (uint32_t m = 1; m <= 4; m++) {
32294 for (uint32_t n = 1; n <= 16; n++) {
32295 GemmMicrokernelTester()
32296 .mr(4)
32297 .nr(16)
32298 .kr(1)
32299 .sr(4)
32300 .m(m)
32301 .n(n)
32302 .k(k)
32303 .iterations(1)
32304 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32305 }
32306 }
32307 }
32308 }
32309
32310 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_gt_4) {
32311 TEST_REQUIRES_X86_FMA3;
32312 for (size_t k = 5; k < 8; k++) {
32313 GemmMicrokernelTester()
32314 .mr(4)
32315 .nr(16)
32316 .kr(1)
32317 .sr(4)
32318 .m(4)
32319 .n(16)
32320 .k(k)
32321 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32322 }
32323 }
32324
32325 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
32326 TEST_REQUIRES_X86_FMA3;
32327 for (size_t k = 5; k < 8; k++) {
32328 GemmMicrokernelTester()
32329 .mr(4)
32330 .nr(16)
32331 .kr(1)
32332 .sr(4)
32333 .m(4)
32334 .n(16)
32335 .k(k)
32336 .a_stride(11)
32337 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32338 }
32339 }
32340
32341 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
32342 TEST_REQUIRES_X86_FMA3;
32343 for (size_t k = 5; k < 8; k++) {
32344 for (uint32_t m = 1; m <= 4; m++) {
32345 for (uint32_t n = 1; n <= 16; n++) {
32346 GemmMicrokernelTester()
32347 .mr(4)
32348 .nr(16)
32349 .kr(1)
32350 .sr(4)
32351 .m(m)
32352 .n(n)
32353 .k(k)
32354 .iterations(1)
32355 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32356 }
32357 }
32358 }
32359 }
32360
32361 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_div_4) {
32362 TEST_REQUIRES_X86_FMA3;
32363 for (size_t k = 8; k <= 40; k += 4) {
32364 GemmMicrokernelTester()
32365 .mr(4)
32366 .nr(16)
32367 .kr(1)
32368 .sr(4)
32369 .m(4)
32370 .n(16)
32371 .k(k)
32372 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32373 }
32374 }
32375
32376 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
32377 TEST_REQUIRES_X86_FMA3;
32378 for (size_t k = 8; k <= 40; k += 4) {
32379 GemmMicrokernelTester()
32380 .mr(4)
32381 .nr(16)
32382 .kr(1)
32383 .sr(4)
32384 .m(4)
32385 .n(16)
32386 .k(k)
32387 .a_stride(43)
32388 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32389 }
32390 }
32391
32392 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, k_div_4_subtile) {
32393 TEST_REQUIRES_X86_FMA3;
32394 for (size_t k = 8; k <= 40; k += 4) {
32395 for (uint32_t m = 1; m <= 4; m++) {
32396 for (uint32_t n = 1; n <= 16; n++) {
32397 GemmMicrokernelTester()
32398 .mr(4)
32399 .nr(16)
32400 .kr(1)
32401 .sr(4)
32402 .m(m)
32403 .n(n)
32404 .k(k)
32405 .iterations(1)
32406 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32407 }
32408 }
32409 }
32410 }
32411
32412 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16) {
32413 TEST_REQUIRES_X86_FMA3;
32414 for (uint32_t n = 17; n < 32; n++) {
32415 for (size_t k = 1; k <= 20; k += 5) {
32416 GemmMicrokernelTester()
32417 .mr(4)
32418 .nr(16)
32419 .kr(1)
32420 .sr(4)
32421 .m(4)
32422 .n(16)
32423 .k(k)
32424 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32425 }
32426 }
32427 }
32428
32429 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
32430 TEST_REQUIRES_X86_FMA3;
32431 for (uint32_t n = 17; n < 32; n++) {
32432 for (size_t k = 1; k <= 20; k += 5) {
32433 GemmMicrokernelTester()
32434 .mr(4)
32435 .nr(16)
32436 .kr(1)
32437 .sr(4)
32438 .m(4)
32439 .n(16)
32440 .k(k)
32441 .cn_stride(19)
32442 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32443 }
32444 }
32445 }
32446
32447 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
32448 TEST_REQUIRES_X86_FMA3;
32449 for (uint32_t n = 17; n < 32; n++) {
32450 for (size_t k = 1; k <= 20; k += 5) {
32451 GemmMicrokernelTester()
32452 .mr(4)
32453 .nr(16)
32454 .kr(1)
32455 .sr(4)
32456 .m(4)
32457 .n(n)
32458 .k(k)
32459 .a_stride(23)
32460 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32461 }
32462 }
32463 }
32464
32465 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
32466 TEST_REQUIRES_X86_FMA3;
32467 for (uint32_t n = 17; n < 32; n++) {
32468 for (size_t k = 1; k <= 20; k += 5) {
32469 for (uint32_t m = 1; m <= 4; m++) {
32470 GemmMicrokernelTester()
32471 .mr(4)
32472 .nr(16)
32473 .kr(1)
32474 .sr(4)
32475 .m(m)
32476 .n(n)
32477 .k(k)
32478 .iterations(1)
32479 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32480 }
32481 }
32482 }
32483 }
32484
32485 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16) {
32486 TEST_REQUIRES_X86_FMA3;
32487 for (uint32_t n = 32; n <= 48; n += 16) {
32488 for (size_t k = 1; k <= 20; k += 5) {
32489 GemmMicrokernelTester()
32490 .mr(4)
32491 .nr(16)
32492 .kr(1)
32493 .sr(4)
32494 .m(4)
32495 .n(16)
32496 .k(k)
32497 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32498 }
32499 }
32500 }
32501
32502 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
32503 TEST_REQUIRES_X86_FMA3;
32504 for (uint32_t n = 32; n <= 48; n += 16) {
32505 for (size_t k = 1; k <= 20; k += 5) {
32506 GemmMicrokernelTester()
32507 .mr(4)
32508 .nr(16)
32509 .kr(1)
32510 .sr(4)
32511 .m(4)
32512 .n(n)
32513 .k(k)
32514 .cn_stride(19)
32515 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32516 }
32517 }
32518 }
32519
32520 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
32521 TEST_REQUIRES_X86_FMA3;
32522 for (uint32_t n = 32; n <= 48; n += 16) {
32523 for (size_t k = 1; k <= 20; k += 5) {
32524 GemmMicrokernelTester()
32525 .mr(4)
32526 .nr(16)
32527 .kr(1)
32528 .sr(4)
32529 .m(4)
32530 .n(n)
32531 .k(k)
32532 .a_stride(23)
32533 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32534 }
32535 }
32536 }
32537
32538 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, n_div_16_subtile) {
32539 TEST_REQUIRES_X86_FMA3;
32540 for (uint32_t n = 32; n <= 48; n += 16) {
32541 for (size_t k = 1; k <= 20; k += 5) {
32542 for (uint32_t m = 1; m <= 4; m++) {
32543 GemmMicrokernelTester()
32544 .mr(4)
32545 .nr(16)
32546 .kr(1)
32547 .sr(4)
32548 .m(m)
32549 .n(n)
32550 .k(k)
32551 .iterations(1)
32552 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32553 }
32554 }
32555 }
32556 }
32557
32558 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, strided_cm_subtile) {
32559 TEST_REQUIRES_X86_FMA3;
32560 for (size_t k = 1; k <= 20; k += 5) {
32561 for (uint32_t m = 1; m <= 4; m++) {
32562 for (uint32_t n = 1; n <= 16; n++) {
32563 GemmMicrokernelTester()
32564 .mr(4)
32565 .nr(16)
32566 .kr(1)
32567 .sr(4)
32568 .m(m)
32569 .n(n)
32570 .k(k)
32571 .cm_stride(19)
32572 .iterations(1)
32573 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32574 }
32575 }
32576 }
32577 }
32578
32579 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, qmin) {
32580 TEST_REQUIRES_X86_FMA3;
32581 GemmMicrokernelTester()
32582 .mr(4)
32583 .nr(16)
32584 .kr(1)
32585 .sr(4)
32586 .m(4)
32587 .n(16)
32588 .k(4)
32589 .qmin(128)
32590 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32591 }
32592
32593 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, qmax) {
32594 TEST_REQUIRES_X86_FMA3;
32595 GemmMicrokernelTester()
32596 .mr(4)
32597 .nr(16)
32598 .kr(1)
32599 .sr(4)
32600 .m(4)
32601 .n(16)
32602 .k(4)
32603 .qmax(128)
32604 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32605 }
32606
32607 TEST(F32_GEMMINC_4X16S4__FMA3_BROADCAST, strided_cm) {
32608 TEST_REQUIRES_X86_FMA3;
32609 GemmMicrokernelTester()
32610 .mr(4)
32611 .nr(16)
32612 .kr(1)
32613 .sr(4)
32614 .m(4)
32615 .n(16)
32616 .k(4)
32617 .cm_stride(19)
32618 .Test(xnn_f32_gemminc_ukernel_4x16s4__fma3_broadcast);
32619 }
32620#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32621
32622
32623#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32624 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4) {
32625 TEST_REQUIRES_X86_FMA3;
32626 GemmMicrokernelTester()
32627 .mr(5)
32628 .nr(16)
32629 .kr(1)
32630 .sr(4)
32631 .m(5)
32632 .n(16)
32633 .k(4)
32634 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32635 }
32636
32637 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, strided_cn) {
32638 TEST_REQUIRES_X86_FMA3;
32639 GemmMicrokernelTester()
32640 .mr(5)
32641 .nr(16)
32642 .kr(1)
32643 .sr(4)
32644 .m(5)
32645 .n(16)
32646 .k(4)
32647 .cn_stride(19)
32648 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32649 }
32650
32651 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
32652 TEST_REQUIRES_X86_FMA3;
32653 GemmMicrokernelTester()
32654 .mr(5)
32655 .nr(16)
32656 .kr(1)
32657 .sr(4)
32658 .m(5)
32659 .n(16)
32660 .k(4)
32661 .a_stride(7)
32662 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32663 }
32664
32665 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
32666 TEST_REQUIRES_X86_FMA3;
32667 for (uint32_t m = 1; m <= 5; m++) {
32668 for (uint32_t n = 1; n <= 16; n++) {
32669 GemmMicrokernelTester()
32670 .mr(5)
32671 .nr(16)
32672 .kr(1)
32673 .sr(4)
32674 .m(m)
32675 .n(n)
32676 .k(4)
32677 .iterations(1)
32678 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32679 }
32680 }
32681 }
32682
32683 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
32684 TEST_REQUIRES_X86_FMA3;
32685 for (uint32_t m = 1; m <= 5; m++) {
32686 GemmMicrokernelTester()
32687 .mr(5)
32688 .nr(16)
32689 .kr(1)
32690 .sr(4)
32691 .m(m)
32692 .n(16)
32693 .k(4)
32694 .iterations(1)
32695 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32696 }
32697 }
32698
32699 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
32700 TEST_REQUIRES_X86_FMA3;
32701 for (uint32_t n = 1; n <= 16; n++) {
32702 GemmMicrokernelTester()
32703 .mr(5)
32704 .nr(16)
32705 .kr(1)
32706 .sr(4)
32707 .m(5)
32708 .n(n)
32709 .k(4)
32710 .iterations(1)
32711 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32712 }
32713 }
32714
32715 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_lt_4) {
32716 TEST_REQUIRES_X86_FMA3;
32717 for (size_t k = 1; k < 4; k++) {
32718 GemmMicrokernelTester()
32719 .mr(5)
32720 .nr(16)
32721 .kr(1)
32722 .sr(4)
32723 .m(5)
32724 .n(16)
32725 .k(k)
32726 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32727 }
32728 }
32729
32730 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
32731 TEST_REQUIRES_X86_FMA3;
32732 for (size_t k = 1; k < 4; k++) {
32733 GemmMicrokernelTester()
32734 .mr(5)
32735 .nr(16)
32736 .kr(1)
32737 .sr(4)
32738 .m(5)
32739 .n(16)
32740 .k(k)
32741 .a_stride(7)
32742 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32743 }
32744 }
32745
32746 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
32747 TEST_REQUIRES_X86_FMA3;
32748 for (size_t k = 1; k < 4; k++) {
32749 for (uint32_t m = 1; m <= 5; m++) {
32750 for (uint32_t n = 1; n <= 16; n++) {
32751 GemmMicrokernelTester()
32752 .mr(5)
32753 .nr(16)
32754 .kr(1)
32755 .sr(4)
32756 .m(m)
32757 .n(n)
32758 .k(k)
32759 .iterations(1)
32760 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32761 }
32762 }
32763 }
32764 }
32765
32766 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_gt_4) {
32767 TEST_REQUIRES_X86_FMA3;
32768 for (size_t k = 5; k < 8; k++) {
32769 GemmMicrokernelTester()
32770 .mr(5)
32771 .nr(16)
32772 .kr(1)
32773 .sr(4)
32774 .m(5)
32775 .n(16)
32776 .k(k)
32777 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32778 }
32779 }
32780
32781 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
32782 TEST_REQUIRES_X86_FMA3;
32783 for (size_t k = 5; k < 8; k++) {
32784 GemmMicrokernelTester()
32785 .mr(5)
32786 .nr(16)
32787 .kr(1)
32788 .sr(4)
32789 .m(5)
32790 .n(16)
32791 .k(k)
32792 .a_stride(11)
32793 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32794 }
32795 }
32796
32797 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
32798 TEST_REQUIRES_X86_FMA3;
32799 for (size_t k = 5; k < 8; k++) {
32800 for (uint32_t m = 1; m <= 5; m++) {
32801 for (uint32_t n = 1; n <= 16; n++) {
32802 GemmMicrokernelTester()
32803 .mr(5)
32804 .nr(16)
32805 .kr(1)
32806 .sr(4)
32807 .m(m)
32808 .n(n)
32809 .k(k)
32810 .iterations(1)
32811 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32812 }
32813 }
32814 }
32815 }
32816
32817 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_div_4) {
32818 TEST_REQUIRES_X86_FMA3;
32819 for (size_t k = 8; k <= 40; k += 4) {
32820 GemmMicrokernelTester()
32821 .mr(5)
32822 .nr(16)
32823 .kr(1)
32824 .sr(4)
32825 .m(5)
32826 .n(16)
32827 .k(k)
32828 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32829 }
32830 }
32831
32832 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
32833 TEST_REQUIRES_X86_FMA3;
32834 for (size_t k = 8; k <= 40; k += 4) {
32835 GemmMicrokernelTester()
32836 .mr(5)
32837 .nr(16)
32838 .kr(1)
32839 .sr(4)
32840 .m(5)
32841 .n(16)
32842 .k(k)
32843 .a_stride(43)
32844 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32845 }
32846 }
32847
32848 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, k_div_4_subtile) {
32849 TEST_REQUIRES_X86_FMA3;
32850 for (size_t k = 8; k <= 40; k += 4) {
32851 for (uint32_t m = 1; m <= 5; m++) {
32852 for (uint32_t n = 1; n <= 16; n++) {
32853 GemmMicrokernelTester()
32854 .mr(5)
32855 .nr(16)
32856 .kr(1)
32857 .sr(4)
32858 .m(m)
32859 .n(n)
32860 .k(k)
32861 .iterations(1)
32862 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32863 }
32864 }
32865 }
32866 }
32867
32868 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16) {
32869 TEST_REQUIRES_X86_FMA3;
32870 for (uint32_t n = 17; n < 32; n++) {
32871 for (size_t k = 1; k <= 20; k += 5) {
32872 GemmMicrokernelTester()
32873 .mr(5)
32874 .nr(16)
32875 .kr(1)
32876 .sr(4)
32877 .m(5)
32878 .n(16)
32879 .k(k)
32880 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32881 }
32882 }
32883 }
32884
32885 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
32886 TEST_REQUIRES_X86_FMA3;
32887 for (uint32_t n = 17; n < 32; n++) {
32888 for (size_t k = 1; k <= 20; k += 5) {
32889 GemmMicrokernelTester()
32890 .mr(5)
32891 .nr(16)
32892 .kr(1)
32893 .sr(4)
32894 .m(5)
32895 .n(16)
32896 .k(k)
32897 .cn_stride(19)
32898 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32899 }
32900 }
32901 }
32902
32903 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
32904 TEST_REQUIRES_X86_FMA3;
32905 for (uint32_t n = 17; n < 32; n++) {
32906 for (size_t k = 1; k <= 20; k += 5) {
32907 GemmMicrokernelTester()
32908 .mr(5)
32909 .nr(16)
32910 .kr(1)
32911 .sr(4)
32912 .m(5)
32913 .n(n)
32914 .k(k)
32915 .a_stride(23)
32916 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32917 }
32918 }
32919 }
32920
32921 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
32922 TEST_REQUIRES_X86_FMA3;
32923 for (uint32_t n = 17; n < 32; n++) {
32924 for (size_t k = 1; k <= 20; k += 5) {
32925 for (uint32_t m = 1; m <= 5; m++) {
32926 GemmMicrokernelTester()
32927 .mr(5)
32928 .nr(16)
32929 .kr(1)
32930 .sr(4)
32931 .m(m)
32932 .n(n)
32933 .k(k)
32934 .iterations(1)
32935 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32936 }
32937 }
32938 }
32939 }
32940
32941 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16) {
32942 TEST_REQUIRES_X86_FMA3;
32943 for (uint32_t n = 32; n <= 48; n += 16) {
32944 for (size_t k = 1; k <= 20; k += 5) {
32945 GemmMicrokernelTester()
32946 .mr(5)
32947 .nr(16)
32948 .kr(1)
32949 .sr(4)
32950 .m(5)
32951 .n(16)
32952 .k(k)
32953 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32954 }
32955 }
32956 }
32957
32958 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
32959 TEST_REQUIRES_X86_FMA3;
32960 for (uint32_t n = 32; n <= 48; n += 16) {
32961 for (size_t k = 1; k <= 20; k += 5) {
32962 GemmMicrokernelTester()
32963 .mr(5)
32964 .nr(16)
32965 .kr(1)
32966 .sr(4)
32967 .m(5)
32968 .n(n)
32969 .k(k)
32970 .cn_stride(19)
32971 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32972 }
32973 }
32974 }
32975
32976 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
32977 TEST_REQUIRES_X86_FMA3;
32978 for (uint32_t n = 32; n <= 48; n += 16) {
32979 for (size_t k = 1; k <= 20; k += 5) {
32980 GemmMicrokernelTester()
32981 .mr(5)
32982 .nr(16)
32983 .kr(1)
32984 .sr(4)
32985 .m(5)
32986 .n(n)
32987 .k(k)
32988 .a_stride(23)
32989 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
32990 }
32991 }
32992 }
32993
32994 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, n_div_16_subtile) {
32995 TEST_REQUIRES_X86_FMA3;
32996 for (uint32_t n = 32; n <= 48; n += 16) {
32997 for (size_t k = 1; k <= 20; k += 5) {
32998 for (uint32_t m = 1; m <= 5; m++) {
32999 GemmMicrokernelTester()
33000 .mr(5)
33001 .nr(16)
33002 .kr(1)
33003 .sr(4)
33004 .m(m)
33005 .n(n)
33006 .k(k)
33007 .iterations(1)
33008 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
33009 }
33010 }
33011 }
33012 }
33013
33014 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, strided_cm_subtile) {
33015 TEST_REQUIRES_X86_FMA3;
33016 for (size_t k = 1; k <= 20; k += 5) {
33017 for (uint32_t m = 1; m <= 5; m++) {
33018 for (uint32_t n = 1; n <= 16; n++) {
33019 GemmMicrokernelTester()
33020 .mr(5)
33021 .nr(16)
33022 .kr(1)
33023 .sr(4)
33024 .m(m)
33025 .n(n)
33026 .k(k)
33027 .cm_stride(19)
33028 .iterations(1)
33029 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
33030 }
33031 }
33032 }
33033 }
33034
33035 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, qmin) {
33036 TEST_REQUIRES_X86_FMA3;
33037 GemmMicrokernelTester()
33038 .mr(5)
33039 .nr(16)
33040 .kr(1)
33041 .sr(4)
33042 .m(5)
33043 .n(16)
33044 .k(4)
33045 .qmin(128)
33046 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
33047 }
33048
33049 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, qmax) {
33050 TEST_REQUIRES_X86_FMA3;
33051 GemmMicrokernelTester()
33052 .mr(5)
33053 .nr(16)
33054 .kr(1)
33055 .sr(4)
33056 .m(5)
33057 .n(16)
33058 .k(4)
33059 .qmax(128)
33060 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
33061 }
33062
33063 TEST(F32_GEMMINC_5X16S4__FMA3_BROADCAST, strided_cm) {
33064 TEST_REQUIRES_X86_FMA3;
33065 GemmMicrokernelTester()
33066 .mr(5)
33067 .nr(16)
33068 .kr(1)
33069 .sr(4)
33070 .m(5)
33071 .n(16)
33072 .k(4)
33073 .cm_stride(19)
33074 .Test(xnn_f32_gemminc_ukernel_5x16s4__fma3_broadcast);
33075 }
33076#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33077
33078
33079#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan0f349c42019-11-27 11:58:54 -080033080 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1) {
33081 TEST_REQUIRES_X86_AVX512F;
33082 GemmMicrokernelTester()
33083 .mr(1)
33084 .nr(16)
33085 .kr(1)
33086 .sr(1)
33087 .m(1)
33088 .n(16)
33089 .k(1)
33090 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33091 }
33092
33093 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, strided_cn) {
33094 TEST_REQUIRES_X86_AVX512F;
33095 GemmMicrokernelTester()
33096 .mr(1)
33097 .nr(16)
33098 .kr(1)
33099 .sr(1)
33100 .m(1)
33101 .n(16)
33102 .k(1)
33103 .cn_stride(19)
33104 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33105 }
33106
33107 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
33108 TEST_REQUIRES_X86_AVX512F;
33109 GemmMicrokernelTester()
33110 .mr(1)
33111 .nr(16)
33112 .kr(1)
33113 .sr(1)
33114 .m(1)
33115 .n(16)
33116 .k(1)
33117 .a_stride(3)
33118 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33119 }
33120
33121 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
33122 TEST_REQUIRES_X86_AVX512F;
33123 for (uint32_t m = 1; m <= 1; m++) {
33124 for (uint32_t n = 1; n <= 16; n++) {
33125 GemmMicrokernelTester()
33126 .mr(1)
33127 .nr(16)
33128 .kr(1)
33129 .sr(1)
33130 .m(m)
33131 .n(n)
33132 .k(1)
33133 .iterations(1)
33134 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33135 }
33136 }
33137 }
33138
33139 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
33140 TEST_REQUIRES_X86_AVX512F;
33141 for (uint32_t m = 1; m <= 1; m++) {
33142 GemmMicrokernelTester()
33143 .mr(1)
33144 .nr(16)
33145 .kr(1)
33146 .sr(1)
33147 .m(m)
33148 .n(16)
33149 .k(1)
33150 .iterations(1)
33151 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33152 }
33153 }
33154
33155 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
33156 TEST_REQUIRES_X86_AVX512F;
33157 for (uint32_t n = 1; n <= 16; n++) {
33158 GemmMicrokernelTester()
33159 .mr(1)
33160 .nr(16)
33161 .kr(1)
33162 .sr(1)
33163 .m(1)
33164 .n(n)
33165 .k(1)
33166 .iterations(1)
33167 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33168 }
33169 }
33170
33171 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_gt_1) {
33172 TEST_REQUIRES_X86_AVX512F;
33173 for (size_t k = 2; k < 10; k++) {
33174 GemmMicrokernelTester()
33175 .mr(1)
33176 .nr(16)
33177 .kr(1)
33178 .sr(1)
33179 .m(1)
33180 .n(16)
33181 .k(k)
33182 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33183 }
33184 }
33185
33186 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
33187 TEST_REQUIRES_X86_AVX512F;
33188 for (size_t k = 2; k < 10; k++) {
33189 GemmMicrokernelTester()
33190 .mr(1)
33191 .nr(16)
33192 .kr(1)
33193 .sr(1)
33194 .m(1)
33195 .n(16)
33196 .k(k)
33197 .a_stride(11)
33198 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33199 }
33200 }
33201
33202 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
33203 TEST_REQUIRES_X86_AVX512F;
33204 for (size_t k = 2; k < 10; k++) {
33205 for (uint32_t m = 1; m <= 1; m++) {
33206 for (uint32_t n = 1; n <= 16; n++) {
33207 GemmMicrokernelTester()
33208 .mr(1)
33209 .nr(16)
33210 .kr(1)
33211 .sr(1)
33212 .m(m)
33213 .n(n)
33214 .k(k)
33215 .iterations(1)
33216 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33217 }
33218 }
33219 }
33220 }
33221
33222 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16) {
33223 TEST_REQUIRES_X86_AVX512F;
33224 for (uint32_t n = 17; n < 32; n++) {
33225 for (size_t k = 1; k <= 5; k += 2) {
33226 GemmMicrokernelTester()
33227 .mr(1)
33228 .nr(16)
33229 .kr(1)
33230 .sr(1)
33231 .m(1)
33232 .n(16)
33233 .k(k)
33234 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33235 }
33236 }
33237 }
33238
33239 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
33240 TEST_REQUIRES_X86_AVX512F;
33241 for (uint32_t n = 17; n < 32; n++) {
33242 for (size_t k = 1; k <= 5; k += 2) {
33243 GemmMicrokernelTester()
33244 .mr(1)
33245 .nr(16)
33246 .kr(1)
33247 .sr(1)
33248 .m(1)
33249 .n(16)
33250 .k(k)
33251 .cn_stride(19)
33252 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33253 }
33254 }
33255 }
33256
33257 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
33258 TEST_REQUIRES_X86_AVX512F;
33259 for (uint32_t n = 17; n < 32; n++) {
33260 for (size_t k = 1; k <= 5; k += 2) {
33261 GemmMicrokernelTester()
33262 .mr(1)
33263 .nr(16)
33264 .kr(1)
33265 .sr(1)
33266 .m(1)
33267 .n(n)
33268 .k(k)
33269 .a_stride(7)
33270 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33271 }
33272 }
33273 }
33274
33275 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
33276 TEST_REQUIRES_X86_AVX512F;
33277 for (uint32_t n = 17; n < 32; n++) {
33278 for (size_t k = 1; k <= 5; k += 2) {
33279 for (uint32_t m = 1; m <= 1; m++) {
33280 GemmMicrokernelTester()
33281 .mr(1)
33282 .nr(16)
33283 .kr(1)
33284 .sr(1)
33285 .m(m)
33286 .n(n)
33287 .k(k)
33288 .iterations(1)
33289 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33290 }
33291 }
33292 }
33293 }
33294
33295 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16) {
33296 TEST_REQUIRES_X86_AVX512F;
33297 for (uint32_t n = 32; n <= 48; n += 16) {
33298 for (size_t k = 1; k <= 5; k += 2) {
33299 GemmMicrokernelTester()
33300 .mr(1)
33301 .nr(16)
33302 .kr(1)
33303 .sr(1)
33304 .m(1)
33305 .n(16)
33306 .k(k)
33307 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33308 }
33309 }
33310 }
33311
33312 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
33313 TEST_REQUIRES_X86_AVX512F;
33314 for (uint32_t n = 32; n <= 48; n += 16) {
33315 for (size_t k = 1; k <= 5; k += 2) {
33316 GemmMicrokernelTester()
33317 .mr(1)
33318 .nr(16)
33319 .kr(1)
33320 .sr(1)
33321 .m(1)
33322 .n(n)
33323 .k(k)
33324 .cn_stride(19)
33325 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33326 }
33327 }
33328 }
33329
33330 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16_strided_a) {
33331 TEST_REQUIRES_X86_AVX512F;
33332 for (uint32_t n = 32; n <= 48; n += 16) {
33333 for (size_t k = 1; k <= 5; k += 2) {
33334 GemmMicrokernelTester()
33335 .mr(1)
33336 .nr(16)
33337 .kr(1)
33338 .sr(1)
33339 .m(1)
33340 .n(n)
33341 .k(k)
33342 .a_stride(7)
33343 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33344 }
33345 }
33346 }
33347
33348 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
33349 TEST_REQUIRES_X86_AVX512F;
33350 for (uint32_t n = 32; n <= 48; n += 16) {
33351 for (size_t k = 1; k <= 5; k += 2) {
33352 for (uint32_t m = 1; m <= 1; m++) {
33353 GemmMicrokernelTester()
33354 .mr(1)
33355 .nr(16)
33356 .kr(1)
33357 .sr(1)
33358 .m(m)
33359 .n(n)
33360 .k(k)
33361 .iterations(1)
33362 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33363 }
33364 }
33365 }
33366 }
33367
33368 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
33369 TEST_REQUIRES_X86_AVX512F;
33370 for (size_t k = 1; k <= 5; k += 2) {
33371 for (uint32_t m = 1; m <= 1; m++) {
33372 for (uint32_t n = 1; n <= 16; n++) {
33373 GemmMicrokernelTester()
33374 .mr(1)
33375 .nr(16)
33376 .kr(1)
33377 .sr(1)
33378 .m(m)
33379 .n(n)
33380 .k(k)
33381 .cm_stride(19)
33382 .iterations(1)
33383 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33384 }
33385 }
33386 }
33387 }
33388
33389 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, qmin) {
33390 TEST_REQUIRES_X86_AVX512F;
33391 GemmMicrokernelTester()
33392 .mr(1)
33393 .nr(16)
33394 .kr(1)
33395 .sr(1)
33396 .m(1)
33397 .n(16)
33398 .k(1)
33399 .qmin(128)
33400 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33401 }
33402
33403 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, qmax) {
33404 TEST_REQUIRES_X86_AVX512F;
33405 GemmMicrokernelTester()
33406 .mr(1)
33407 .nr(16)
33408 .kr(1)
33409 .sr(1)
33410 .m(1)
33411 .n(16)
33412 .k(1)
33413 .qmax(128)
33414 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33415 }
33416
33417 TEST(F32_GEMMINC_1X16__AVX512F_BROADCAST, strided_cm) {
33418 TEST_REQUIRES_X86_AVX512F;
33419 GemmMicrokernelTester()
33420 .mr(1)
33421 .nr(16)
33422 .kr(1)
33423 .sr(1)
33424 .m(1)
33425 .n(16)
33426 .k(1)
33427 .cm_stride(19)
33428 .Test(xnn_f32_gemminc_ukernel_1x16__avx512f_broadcast);
33429 }
33430#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33431
33432
33433#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33434 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1) {
33435 TEST_REQUIRES_X86_AVX512F;
33436 GemmMicrokernelTester()
33437 .mr(4)
33438 .nr(16)
33439 .kr(1)
33440 .sr(1)
33441 .m(4)
33442 .n(16)
33443 .k(1)
33444 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33445 }
33446
33447 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, strided_cn) {
33448 TEST_REQUIRES_X86_AVX512F;
33449 GemmMicrokernelTester()
33450 .mr(4)
33451 .nr(16)
33452 .kr(1)
33453 .sr(1)
33454 .m(4)
33455 .n(16)
33456 .k(1)
33457 .cn_stride(19)
33458 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33459 }
33460
33461 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
33462 TEST_REQUIRES_X86_AVX512F;
33463 GemmMicrokernelTester()
33464 .mr(4)
33465 .nr(16)
33466 .kr(1)
33467 .sr(1)
33468 .m(4)
33469 .n(16)
33470 .k(1)
33471 .a_stride(3)
33472 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33473 }
33474
33475 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_subtile) {
33476 TEST_REQUIRES_X86_AVX512F;
33477 for (uint32_t m = 1; m <= 4; m++) {
33478 for (uint32_t n = 1; n <= 16; n++) {
33479 GemmMicrokernelTester()
33480 .mr(4)
33481 .nr(16)
33482 .kr(1)
33483 .sr(1)
33484 .m(m)
33485 .n(n)
33486 .k(1)
33487 .iterations(1)
33488 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33489 }
33490 }
33491 }
33492
33493 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
33494 TEST_REQUIRES_X86_AVX512F;
33495 for (uint32_t m = 1; m <= 4; m++) {
33496 GemmMicrokernelTester()
33497 .mr(4)
33498 .nr(16)
33499 .kr(1)
33500 .sr(1)
33501 .m(m)
33502 .n(16)
33503 .k(1)
33504 .iterations(1)
33505 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33506 }
33507 }
33508
33509 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
33510 TEST_REQUIRES_X86_AVX512F;
33511 for (uint32_t n = 1; n <= 16; n++) {
33512 GemmMicrokernelTester()
33513 .mr(4)
33514 .nr(16)
33515 .kr(1)
33516 .sr(1)
33517 .m(4)
33518 .n(n)
33519 .k(1)
33520 .iterations(1)
33521 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33522 }
33523 }
33524
33525 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_gt_1) {
33526 TEST_REQUIRES_X86_AVX512F;
33527 for (size_t k = 2; k < 10; k++) {
33528 GemmMicrokernelTester()
33529 .mr(4)
33530 .nr(16)
33531 .kr(1)
33532 .sr(1)
33533 .m(4)
33534 .n(16)
33535 .k(k)
33536 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33537 }
33538 }
33539
33540 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
33541 TEST_REQUIRES_X86_AVX512F;
33542 for (size_t k = 2; k < 10; k++) {
33543 GemmMicrokernelTester()
33544 .mr(4)
33545 .nr(16)
33546 .kr(1)
33547 .sr(1)
33548 .m(4)
33549 .n(16)
33550 .k(k)
33551 .a_stride(11)
33552 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33553 }
33554 }
33555
33556 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, k_gt_1_subtile) {
33557 TEST_REQUIRES_X86_AVX512F;
33558 for (size_t k = 2; k < 10; k++) {
33559 for (uint32_t m = 1; m <= 4; m++) {
33560 for (uint32_t n = 1; n <= 16; n++) {
33561 GemmMicrokernelTester()
33562 .mr(4)
33563 .nr(16)
33564 .kr(1)
33565 .sr(1)
33566 .m(m)
33567 .n(n)
33568 .k(k)
33569 .iterations(1)
33570 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33571 }
33572 }
33573 }
33574 }
33575
33576 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16) {
33577 TEST_REQUIRES_X86_AVX512F;
33578 for (uint32_t n = 17; n < 32; n++) {
33579 for (size_t k = 1; k <= 5; k += 2) {
33580 GemmMicrokernelTester()
33581 .mr(4)
33582 .nr(16)
33583 .kr(1)
33584 .sr(1)
33585 .m(4)
33586 .n(16)
33587 .k(k)
33588 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33589 }
33590 }
33591 }
33592
33593 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
33594 TEST_REQUIRES_X86_AVX512F;
33595 for (uint32_t n = 17; n < 32; n++) {
33596 for (size_t k = 1; k <= 5; k += 2) {
33597 GemmMicrokernelTester()
33598 .mr(4)
33599 .nr(16)
33600 .kr(1)
33601 .sr(1)
33602 .m(4)
33603 .n(16)
33604 .k(k)
33605 .cn_stride(19)
33606 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33607 }
33608 }
33609 }
33610
33611 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
33612 TEST_REQUIRES_X86_AVX512F;
33613 for (uint32_t n = 17; n < 32; n++) {
33614 for (size_t k = 1; k <= 5; k += 2) {
33615 GemmMicrokernelTester()
33616 .mr(4)
33617 .nr(16)
33618 .kr(1)
33619 .sr(1)
33620 .m(4)
33621 .n(n)
33622 .k(k)
33623 .a_stride(7)
33624 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33625 }
33626 }
33627 }
33628
33629 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_gt_16_subtile) {
33630 TEST_REQUIRES_X86_AVX512F;
33631 for (uint32_t n = 17; n < 32; n++) {
33632 for (size_t k = 1; k <= 5; k += 2) {
33633 for (uint32_t m = 1; m <= 4; m++) {
33634 GemmMicrokernelTester()
33635 .mr(4)
33636 .nr(16)
33637 .kr(1)
33638 .sr(1)
33639 .m(m)
33640 .n(n)
33641 .k(k)
33642 .iterations(1)
33643 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33644 }
33645 }
33646 }
33647 }
33648
33649 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16) {
33650 TEST_REQUIRES_X86_AVX512F;
33651 for (uint32_t n = 32; n <= 48; n += 16) {
33652 for (size_t k = 1; k <= 5; k += 2) {
33653 GemmMicrokernelTester()
33654 .mr(4)
33655 .nr(16)
33656 .kr(1)
33657 .sr(1)
33658 .m(4)
33659 .n(16)
33660 .k(k)
33661 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33662 }
33663 }
33664 }
33665
33666 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
33667 TEST_REQUIRES_X86_AVX512F;
33668 for (uint32_t n = 32; n <= 48; n += 16) {
33669 for (size_t k = 1; k <= 5; k += 2) {
33670 GemmMicrokernelTester()
33671 .mr(4)
33672 .nr(16)
33673 .kr(1)
33674 .sr(1)
33675 .m(4)
33676 .n(n)
33677 .k(k)
33678 .cn_stride(19)
33679 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33680 }
33681 }
33682 }
33683
33684 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16_strided_a) {
33685 TEST_REQUIRES_X86_AVX512F;
33686 for (uint32_t n = 32; n <= 48; n += 16) {
33687 for (size_t k = 1; k <= 5; k += 2) {
33688 GemmMicrokernelTester()
33689 .mr(4)
33690 .nr(16)
33691 .kr(1)
33692 .sr(1)
33693 .m(4)
33694 .n(n)
33695 .k(k)
33696 .a_stride(7)
33697 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33698 }
33699 }
33700 }
33701
33702 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, n_div_16_subtile) {
33703 TEST_REQUIRES_X86_AVX512F;
33704 for (uint32_t n = 32; n <= 48; n += 16) {
33705 for (size_t k = 1; k <= 5; k += 2) {
33706 for (uint32_t m = 1; m <= 4; m++) {
33707 GemmMicrokernelTester()
33708 .mr(4)
33709 .nr(16)
33710 .kr(1)
33711 .sr(1)
33712 .m(m)
33713 .n(n)
33714 .k(k)
33715 .iterations(1)
33716 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33717 }
33718 }
33719 }
33720 }
33721
33722 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, strided_cm_subtile) {
33723 TEST_REQUIRES_X86_AVX512F;
33724 for (size_t k = 1; k <= 5; k += 2) {
33725 for (uint32_t m = 1; m <= 4; m++) {
33726 for (uint32_t n = 1; n <= 16; n++) {
33727 GemmMicrokernelTester()
33728 .mr(4)
33729 .nr(16)
33730 .kr(1)
33731 .sr(1)
33732 .m(m)
33733 .n(n)
33734 .k(k)
33735 .cm_stride(19)
33736 .iterations(1)
33737 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33738 }
33739 }
33740 }
33741 }
33742
33743 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, qmin) {
33744 TEST_REQUIRES_X86_AVX512F;
33745 GemmMicrokernelTester()
33746 .mr(4)
33747 .nr(16)
33748 .kr(1)
33749 .sr(1)
33750 .m(4)
33751 .n(16)
33752 .k(1)
33753 .qmin(128)
33754 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33755 }
33756
33757 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, qmax) {
33758 TEST_REQUIRES_X86_AVX512F;
33759 GemmMicrokernelTester()
33760 .mr(4)
33761 .nr(16)
33762 .kr(1)
33763 .sr(1)
33764 .m(4)
33765 .n(16)
33766 .k(1)
33767 .qmax(128)
33768 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33769 }
33770
33771 TEST(F32_GEMMINC_4X16__AVX512F_BROADCAST, strided_cm) {
33772 TEST_REQUIRES_X86_AVX512F;
33773 GemmMicrokernelTester()
33774 .mr(4)
33775 .nr(16)
33776 .kr(1)
33777 .sr(1)
33778 .m(4)
33779 .n(16)
33780 .k(1)
33781 .cm_stride(19)
33782 .Test(xnn_f32_gemminc_ukernel_4x16__avx512f_broadcast);
33783 }
33784#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33785
33786
33787#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33788 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1) {
33789 TEST_REQUIRES_X86_AVX512F;
33790 GemmMicrokernelTester()
33791 .mr(5)
33792 .nr(16)
33793 .kr(1)
33794 .sr(1)
33795 .m(5)
33796 .n(16)
33797 .k(1)
33798 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33799 }
33800
33801 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, strided_cn) {
33802 TEST_REQUIRES_X86_AVX512F;
33803 GemmMicrokernelTester()
33804 .mr(5)
33805 .nr(16)
33806 .kr(1)
33807 .sr(1)
33808 .m(5)
33809 .n(16)
33810 .k(1)
33811 .cn_stride(19)
33812 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33813 }
33814
33815 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
33816 TEST_REQUIRES_X86_AVX512F;
33817 GemmMicrokernelTester()
33818 .mr(5)
33819 .nr(16)
33820 .kr(1)
33821 .sr(1)
33822 .m(5)
33823 .n(16)
33824 .k(1)
33825 .a_stride(3)
33826 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33827 }
33828
33829 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_subtile) {
33830 TEST_REQUIRES_X86_AVX512F;
33831 for (uint32_t m = 1; m <= 5; m++) {
33832 for (uint32_t n = 1; n <= 16; n++) {
33833 GemmMicrokernelTester()
33834 .mr(5)
33835 .nr(16)
33836 .kr(1)
33837 .sr(1)
33838 .m(m)
33839 .n(n)
33840 .k(1)
33841 .iterations(1)
33842 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33843 }
33844 }
33845 }
33846
33847 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
33848 TEST_REQUIRES_X86_AVX512F;
33849 for (uint32_t m = 1; m <= 5; m++) {
33850 GemmMicrokernelTester()
33851 .mr(5)
33852 .nr(16)
33853 .kr(1)
33854 .sr(1)
33855 .m(m)
33856 .n(16)
33857 .k(1)
33858 .iterations(1)
33859 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33860 }
33861 }
33862
33863 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
33864 TEST_REQUIRES_X86_AVX512F;
33865 for (uint32_t n = 1; n <= 16; n++) {
33866 GemmMicrokernelTester()
33867 .mr(5)
33868 .nr(16)
33869 .kr(1)
33870 .sr(1)
33871 .m(5)
33872 .n(n)
33873 .k(1)
33874 .iterations(1)
33875 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33876 }
33877 }
33878
33879 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_gt_1) {
33880 TEST_REQUIRES_X86_AVX512F;
33881 for (size_t k = 2; k < 10; k++) {
33882 GemmMicrokernelTester()
33883 .mr(5)
33884 .nr(16)
33885 .kr(1)
33886 .sr(1)
33887 .m(5)
33888 .n(16)
33889 .k(k)
33890 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33891 }
33892 }
33893
33894 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
33895 TEST_REQUIRES_X86_AVX512F;
33896 for (size_t k = 2; k < 10; k++) {
33897 GemmMicrokernelTester()
33898 .mr(5)
33899 .nr(16)
33900 .kr(1)
33901 .sr(1)
33902 .m(5)
33903 .n(16)
33904 .k(k)
33905 .a_stride(11)
33906 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33907 }
33908 }
33909
33910 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, k_gt_1_subtile) {
33911 TEST_REQUIRES_X86_AVX512F;
33912 for (size_t k = 2; k < 10; k++) {
33913 for (uint32_t m = 1; m <= 5; m++) {
33914 for (uint32_t n = 1; n <= 16; n++) {
33915 GemmMicrokernelTester()
33916 .mr(5)
33917 .nr(16)
33918 .kr(1)
33919 .sr(1)
33920 .m(m)
33921 .n(n)
33922 .k(k)
33923 .iterations(1)
33924 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33925 }
33926 }
33927 }
33928 }
33929
33930 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16) {
33931 TEST_REQUIRES_X86_AVX512F;
33932 for (uint32_t n = 17; n < 32; n++) {
33933 for (size_t k = 1; k <= 5; k += 2) {
33934 GemmMicrokernelTester()
33935 .mr(5)
33936 .nr(16)
33937 .kr(1)
33938 .sr(1)
33939 .m(5)
33940 .n(16)
33941 .k(k)
33942 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33943 }
33944 }
33945 }
33946
33947 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
33948 TEST_REQUIRES_X86_AVX512F;
33949 for (uint32_t n = 17; n < 32; n++) {
33950 for (size_t k = 1; k <= 5; k += 2) {
33951 GemmMicrokernelTester()
33952 .mr(5)
33953 .nr(16)
33954 .kr(1)
33955 .sr(1)
33956 .m(5)
33957 .n(16)
33958 .k(k)
33959 .cn_stride(19)
33960 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33961 }
33962 }
33963 }
33964
33965 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
33966 TEST_REQUIRES_X86_AVX512F;
33967 for (uint32_t n = 17; n < 32; n++) {
33968 for (size_t k = 1; k <= 5; k += 2) {
33969 GemmMicrokernelTester()
33970 .mr(5)
33971 .nr(16)
33972 .kr(1)
33973 .sr(1)
33974 .m(5)
33975 .n(n)
33976 .k(k)
33977 .a_stride(7)
33978 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33979 }
33980 }
33981 }
33982
33983 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_gt_16_subtile) {
33984 TEST_REQUIRES_X86_AVX512F;
33985 for (uint32_t n = 17; n < 32; n++) {
33986 for (size_t k = 1; k <= 5; k += 2) {
33987 for (uint32_t m = 1; m <= 5; m++) {
33988 GemmMicrokernelTester()
33989 .mr(5)
33990 .nr(16)
33991 .kr(1)
33992 .sr(1)
33993 .m(m)
33994 .n(n)
33995 .k(k)
33996 .iterations(1)
33997 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
33998 }
33999 }
34000 }
34001 }
34002
34003 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16) {
34004 TEST_REQUIRES_X86_AVX512F;
34005 for (uint32_t n = 32; n <= 48; n += 16) {
34006 for (size_t k = 1; k <= 5; k += 2) {
34007 GemmMicrokernelTester()
34008 .mr(5)
34009 .nr(16)
34010 .kr(1)
34011 .sr(1)
34012 .m(5)
34013 .n(16)
34014 .k(k)
34015 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34016 }
34017 }
34018 }
34019
34020 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
34021 TEST_REQUIRES_X86_AVX512F;
34022 for (uint32_t n = 32; n <= 48; n += 16) {
34023 for (size_t k = 1; k <= 5; k += 2) {
34024 GemmMicrokernelTester()
34025 .mr(5)
34026 .nr(16)
34027 .kr(1)
34028 .sr(1)
34029 .m(5)
34030 .n(n)
34031 .k(k)
34032 .cn_stride(19)
34033 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34034 }
34035 }
34036 }
34037
34038 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16_strided_a) {
34039 TEST_REQUIRES_X86_AVX512F;
34040 for (uint32_t n = 32; n <= 48; n += 16) {
34041 for (size_t k = 1; k <= 5; k += 2) {
34042 GemmMicrokernelTester()
34043 .mr(5)
34044 .nr(16)
34045 .kr(1)
34046 .sr(1)
34047 .m(5)
34048 .n(n)
34049 .k(k)
34050 .a_stride(7)
34051 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34052 }
34053 }
34054 }
34055
34056 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, n_div_16_subtile) {
34057 TEST_REQUIRES_X86_AVX512F;
34058 for (uint32_t n = 32; n <= 48; n += 16) {
34059 for (size_t k = 1; k <= 5; k += 2) {
34060 for (uint32_t m = 1; m <= 5; m++) {
34061 GemmMicrokernelTester()
34062 .mr(5)
34063 .nr(16)
34064 .kr(1)
34065 .sr(1)
34066 .m(m)
34067 .n(n)
34068 .k(k)
34069 .iterations(1)
34070 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34071 }
34072 }
34073 }
34074 }
34075
34076 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, strided_cm_subtile) {
34077 TEST_REQUIRES_X86_AVX512F;
34078 for (size_t k = 1; k <= 5; k += 2) {
34079 for (uint32_t m = 1; m <= 5; m++) {
34080 for (uint32_t n = 1; n <= 16; n++) {
34081 GemmMicrokernelTester()
34082 .mr(5)
34083 .nr(16)
34084 .kr(1)
34085 .sr(1)
34086 .m(m)
34087 .n(n)
34088 .k(k)
34089 .cm_stride(19)
34090 .iterations(1)
34091 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34092 }
34093 }
34094 }
34095 }
34096
34097 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, qmin) {
34098 TEST_REQUIRES_X86_AVX512F;
34099 GemmMicrokernelTester()
34100 .mr(5)
34101 .nr(16)
34102 .kr(1)
34103 .sr(1)
34104 .m(5)
34105 .n(16)
34106 .k(1)
34107 .qmin(128)
34108 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34109 }
34110
34111 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, qmax) {
34112 TEST_REQUIRES_X86_AVX512F;
34113 GemmMicrokernelTester()
34114 .mr(5)
34115 .nr(16)
34116 .kr(1)
34117 .sr(1)
34118 .m(5)
34119 .n(16)
34120 .k(1)
34121 .qmax(128)
34122 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34123 }
34124
34125 TEST(F32_GEMMINC_5X16__AVX512F_BROADCAST, strided_cm) {
34126 TEST_REQUIRES_X86_AVX512F;
34127 GemmMicrokernelTester()
34128 .mr(5)
34129 .nr(16)
34130 .kr(1)
34131 .sr(1)
34132 .m(5)
34133 .n(16)
34134 .k(1)
34135 .cm_stride(19)
34136 .Test(xnn_f32_gemminc_ukernel_5x16__avx512f_broadcast);
34137 }
34138#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34139
34140
34141#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34142 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1) {
34143 TEST_REQUIRES_X86_AVX512F;
34144 GemmMicrokernelTester()
34145 .mr(6)
34146 .nr(16)
34147 .kr(1)
34148 .sr(1)
34149 .m(6)
34150 .n(16)
34151 .k(1)
34152 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34153 }
34154
34155 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, strided_cn) {
34156 TEST_REQUIRES_X86_AVX512F;
34157 GemmMicrokernelTester()
34158 .mr(6)
34159 .nr(16)
34160 .kr(1)
34161 .sr(1)
34162 .m(6)
34163 .n(16)
34164 .k(1)
34165 .cn_stride(19)
34166 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34167 }
34168
34169 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
34170 TEST_REQUIRES_X86_AVX512F;
34171 GemmMicrokernelTester()
34172 .mr(6)
34173 .nr(16)
34174 .kr(1)
34175 .sr(1)
34176 .m(6)
34177 .n(16)
34178 .k(1)
34179 .a_stride(3)
34180 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34181 }
34182
34183 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
34184 TEST_REQUIRES_X86_AVX512F;
34185 for (uint32_t m = 1; m <= 6; m++) {
34186 for (uint32_t n = 1; n <= 16; n++) {
34187 GemmMicrokernelTester()
34188 .mr(6)
34189 .nr(16)
34190 .kr(1)
34191 .sr(1)
34192 .m(m)
34193 .n(n)
34194 .k(1)
34195 .iterations(1)
34196 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34197 }
34198 }
34199 }
34200
34201 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
34202 TEST_REQUIRES_X86_AVX512F;
34203 for (uint32_t m = 1; m <= 6; m++) {
34204 GemmMicrokernelTester()
34205 .mr(6)
34206 .nr(16)
34207 .kr(1)
34208 .sr(1)
34209 .m(m)
34210 .n(16)
34211 .k(1)
34212 .iterations(1)
34213 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34214 }
34215 }
34216
34217 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
34218 TEST_REQUIRES_X86_AVX512F;
34219 for (uint32_t n = 1; n <= 16; n++) {
34220 GemmMicrokernelTester()
34221 .mr(6)
34222 .nr(16)
34223 .kr(1)
34224 .sr(1)
34225 .m(6)
34226 .n(n)
34227 .k(1)
34228 .iterations(1)
34229 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34230 }
34231 }
34232
34233 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_gt_1) {
34234 TEST_REQUIRES_X86_AVX512F;
34235 for (size_t k = 2; k < 10; k++) {
34236 GemmMicrokernelTester()
34237 .mr(6)
34238 .nr(16)
34239 .kr(1)
34240 .sr(1)
34241 .m(6)
34242 .n(16)
34243 .k(k)
34244 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34245 }
34246 }
34247
34248 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
34249 TEST_REQUIRES_X86_AVX512F;
34250 for (size_t k = 2; k < 10; k++) {
34251 GemmMicrokernelTester()
34252 .mr(6)
34253 .nr(16)
34254 .kr(1)
34255 .sr(1)
34256 .m(6)
34257 .n(16)
34258 .k(k)
34259 .a_stride(11)
34260 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34261 }
34262 }
34263
34264 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
34265 TEST_REQUIRES_X86_AVX512F;
34266 for (size_t k = 2; k < 10; k++) {
34267 for (uint32_t m = 1; m <= 6; m++) {
34268 for (uint32_t n = 1; n <= 16; n++) {
34269 GemmMicrokernelTester()
34270 .mr(6)
34271 .nr(16)
34272 .kr(1)
34273 .sr(1)
34274 .m(m)
34275 .n(n)
34276 .k(k)
34277 .iterations(1)
34278 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34279 }
34280 }
34281 }
34282 }
34283
34284 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16) {
34285 TEST_REQUIRES_X86_AVX512F;
34286 for (uint32_t n = 17; n < 32; n++) {
34287 for (size_t k = 1; k <= 5; k += 2) {
34288 GemmMicrokernelTester()
34289 .mr(6)
34290 .nr(16)
34291 .kr(1)
34292 .sr(1)
34293 .m(6)
34294 .n(16)
34295 .k(k)
34296 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34297 }
34298 }
34299 }
34300
34301 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
34302 TEST_REQUIRES_X86_AVX512F;
34303 for (uint32_t n = 17; n < 32; n++) {
34304 for (size_t k = 1; k <= 5; k += 2) {
34305 GemmMicrokernelTester()
34306 .mr(6)
34307 .nr(16)
34308 .kr(1)
34309 .sr(1)
34310 .m(6)
34311 .n(16)
34312 .k(k)
34313 .cn_stride(19)
34314 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34315 }
34316 }
34317 }
34318
34319 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
34320 TEST_REQUIRES_X86_AVX512F;
34321 for (uint32_t n = 17; n < 32; n++) {
34322 for (size_t k = 1; k <= 5; k += 2) {
34323 GemmMicrokernelTester()
34324 .mr(6)
34325 .nr(16)
34326 .kr(1)
34327 .sr(1)
34328 .m(6)
34329 .n(n)
34330 .k(k)
34331 .a_stride(7)
34332 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34333 }
34334 }
34335 }
34336
34337 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
34338 TEST_REQUIRES_X86_AVX512F;
34339 for (uint32_t n = 17; n < 32; n++) {
34340 for (size_t k = 1; k <= 5; k += 2) {
34341 for (uint32_t m = 1; m <= 6; m++) {
34342 GemmMicrokernelTester()
34343 .mr(6)
34344 .nr(16)
34345 .kr(1)
34346 .sr(1)
34347 .m(m)
34348 .n(n)
34349 .k(k)
34350 .iterations(1)
34351 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34352 }
34353 }
34354 }
34355 }
34356
34357 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16) {
34358 TEST_REQUIRES_X86_AVX512F;
34359 for (uint32_t n = 32; n <= 48; n += 16) {
34360 for (size_t k = 1; k <= 5; k += 2) {
34361 GemmMicrokernelTester()
34362 .mr(6)
34363 .nr(16)
34364 .kr(1)
34365 .sr(1)
34366 .m(6)
34367 .n(16)
34368 .k(k)
34369 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34370 }
34371 }
34372 }
34373
34374 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
34375 TEST_REQUIRES_X86_AVX512F;
34376 for (uint32_t n = 32; n <= 48; n += 16) {
34377 for (size_t k = 1; k <= 5; k += 2) {
34378 GemmMicrokernelTester()
34379 .mr(6)
34380 .nr(16)
34381 .kr(1)
34382 .sr(1)
34383 .m(6)
34384 .n(n)
34385 .k(k)
34386 .cn_stride(19)
34387 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34388 }
34389 }
34390 }
34391
34392 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16_strided_a) {
34393 TEST_REQUIRES_X86_AVX512F;
34394 for (uint32_t n = 32; n <= 48; n += 16) {
34395 for (size_t k = 1; k <= 5; k += 2) {
34396 GemmMicrokernelTester()
34397 .mr(6)
34398 .nr(16)
34399 .kr(1)
34400 .sr(1)
34401 .m(6)
34402 .n(n)
34403 .k(k)
34404 .a_stride(7)
34405 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34406 }
34407 }
34408 }
34409
34410 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
34411 TEST_REQUIRES_X86_AVX512F;
34412 for (uint32_t n = 32; n <= 48; n += 16) {
34413 for (size_t k = 1; k <= 5; k += 2) {
34414 for (uint32_t m = 1; m <= 6; m++) {
34415 GemmMicrokernelTester()
34416 .mr(6)
34417 .nr(16)
34418 .kr(1)
34419 .sr(1)
34420 .m(m)
34421 .n(n)
34422 .k(k)
34423 .iterations(1)
34424 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34425 }
34426 }
34427 }
34428 }
34429
34430 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
34431 TEST_REQUIRES_X86_AVX512F;
34432 for (size_t k = 1; k <= 5; k += 2) {
34433 for (uint32_t m = 1; m <= 6; m++) {
34434 for (uint32_t n = 1; n <= 16; n++) {
34435 GemmMicrokernelTester()
34436 .mr(6)
34437 .nr(16)
34438 .kr(1)
34439 .sr(1)
34440 .m(m)
34441 .n(n)
34442 .k(k)
34443 .cm_stride(19)
34444 .iterations(1)
34445 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34446 }
34447 }
34448 }
34449 }
34450
34451 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, qmin) {
34452 TEST_REQUIRES_X86_AVX512F;
34453 GemmMicrokernelTester()
34454 .mr(6)
34455 .nr(16)
34456 .kr(1)
34457 .sr(1)
34458 .m(6)
34459 .n(16)
34460 .k(1)
34461 .qmin(128)
34462 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34463 }
34464
34465 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, qmax) {
34466 TEST_REQUIRES_X86_AVX512F;
34467 GemmMicrokernelTester()
34468 .mr(6)
34469 .nr(16)
34470 .kr(1)
34471 .sr(1)
34472 .m(6)
34473 .n(16)
34474 .k(1)
34475 .qmax(128)
34476 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34477 }
34478
34479 TEST(F32_GEMMINC_6X16__AVX512F_BROADCAST, strided_cm) {
34480 TEST_REQUIRES_X86_AVX512F;
34481 GemmMicrokernelTester()
34482 .mr(6)
34483 .nr(16)
34484 .kr(1)
34485 .sr(1)
34486 .m(6)
34487 .n(16)
34488 .k(1)
34489 .cm_stride(19)
34490 .Test(xnn_f32_gemminc_ukernel_6x16__avx512f_broadcast);
34491 }
34492#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34493
34494
34495#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34496 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1) {
34497 TEST_REQUIRES_X86_AVX512F;
34498 GemmMicrokernelTester()
34499 .mr(7)
34500 .nr(16)
34501 .kr(1)
34502 .sr(1)
34503 .m(7)
34504 .n(16)
34505 .k(1)
34506 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34507 }
34508
34509 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, strided_cn) {
34510 TEST_REQUIRES_X86_AVX512F;
34511 GemmMicrokernelTester()
34512 .mr(7)
34513 .nr(16)
34514 .kr(1)
34515 .sr(1)
34516 .m(7)
34517 .n(16)
34518 .k(1)
34519 .cn_stride(19)
34520 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34521 }
34522
34523 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
34524 TEST_REQUIRES_X86_AVX512F;
34525 GemmMicrokernelTester()
34526 .mr(7)
34527 .nr(16)
34528 .kr(1)
34529 .sr(1)
34530 .m(7)
34531 .n(16)
34532 .k(1)
34533 .a_stride(3)
34534 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34535 }
34536
34537 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
34538 TEST_REQUIRES_X86_AVX512F;
34539 for (uint32_t m = 1; m <= 7; m++) {
34540 for (uint32_t n = 1; n <= 16; n++) {
34541 GemmMicrokernelTester()
34542 .mr(7)
34543 .nr(16)
34544 .kr(1)
34545 .sr(1)
34546 .m(m)
34547 .n(n)
34548 .k(1)
34549 .iterations(1)
34550 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34551 }
34552 }
34553 }
34554
34555 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
34556 TEST_REQUIRES_X86_AVX512F;
34557 for (uint32_t m = 1; m <= 7; m++) {
34558 GemmMicrokernelTester()
34559 .mr(7)
34560 .nr(16)
34561 .kr(1)
34562 .sr(1)
34563 .m(m)
34564 .n(16)
34565 .k(1)
34566 .iterations(1)
34567 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34568 }
34569 }
34570
34571 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
34572 TEST_REQUIRES_X86_AVX512F;
34573 for (uint32_t n = 1; n <= 16; n++) {
34574 GemmMicrokernelTester()
34575 .mr(7)
34576 .nr(16)
34577 .kr(1)
34578 .sr(1)
34579 .m(7)
34580 .n(n)
34581 .k(1)
34582 .iterations(1)
34583 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34584 }
34585 }
34586
34587 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_gt_1) {
34588 TEST_REQUIRES_X86_AVX512F;
34589 for (size_t k = 2; k < 10; k++) {
34590 GemmMicrokernelTester()
34591 .mr(7)
34592 .nr(16)
34593 .kr(1)
34594 .sr(1)
34595 .m(7)
34596 .n(16)
34597 .k(k)
34598 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34599 }
34600 }
34601
34602 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
34603 TEST_REQUIRES_X86_AVX512F;
34604 for (size_t k = 2; k < 10; k++) {
34605 GemmMicrokernelTester()
34606 .mr(7)
34607 .nr(16)
34608 .kr(1)
34609 .sr(1)
34610 .m(7)
34611 .n(16)
34612 .k(k)
34613 .a_stride(11)
34614 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34615 }
34616 }
34617
34618 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
34619 TEST_REQUIRES_X86_AVX512F;
34620 for (size_t k = 2; k < 10; k++) {
34621 for (uint32_t m = 1; m <= 7; m++) {
34622 for (uint32_t n = 1; n <= 16; n++) {
34623 GemmMicrokernelTester()
34624 .mr(7)
34625 .nr(16)
34626 .kr(1)
34627 .sr(1)
34628 .m(m)
34629 .n(n)
34630 .k(k)
34631 .iterations(1)
34632 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34633 }
34634 }
34635 }
34636 }
34637
34638 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16) {
34639 TEST_REQUIRES_X86_AVX512F;
34640 for (uint32_t n = 17; n < 32; n++) {
34641 for (size_t k = 1; k <= 5; k += 2) {
34642 GemmMicrokernelTester()
34643 .mr(7)
34644 .nr(16)
34645 .kr(1)
34646 .sr(1)
34647 .m(7)
34648 .n(16)
34649 .k(k)
34650 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34651 }
34652 }
34653 }
34654
34655 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
34656 TEST_REQUIRES_X86_AVX512F;
34657 for (uint32_t n = 17; n < 32; n++) {
34658 for (size_t k = 1; k <= 5; k += 2) {
34659 GemmMicrokernelTester()
34660 .mr(7)
34661 .nr(16)
34662 .kr(1)
34663 .sr(1)
34664 .m(7)
34665 .n(16)
34666 .k(k)
34667 .cn_stride(19)
34668 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34669 }
34670 }
34671 }
34672
34673 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
34674 TEST_REQUIRES_X86_AVX512F;
34675 for (uint32_t n = 17; n < 32; n++) {
34676 for (size_t k = 1; k <= 5; k += 2) {
34677 GemmMicrokernelTester()
34678 .mr(7)
34679 .nr(16)
34680 .kr(1)
34681 .sr(1)
34682 .m(7)
34683 .n(n)
34684 .k(k)
34685 .a_stride(7)
34686 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34687 }
34688 }
34689 }
34690
34691 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
34692 TEST_REQUIRES_X86_AVX512F;
34693 for (uint32_t n = 17; n < 32; n++) {
34694 for (size_t k = 1; k <= 5; k += 2) {
34695 for (uint32_t m = 1; m <= 7; m++) {
34696 GemmMicrokernelTester()
34697 .mr(7)
34698 .nr(16)
34699 .kr(1)
34700 .sr(1)
34701 .m(m)
34702 .n(n)
34703 .k(k)
34704 .iterations(1)
34705 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34706 }
34707 }
34708 }
34709 }
34710
34711 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16) {
34712 TEST_REQUIRES_X86_AVX512F;
34713 for (uint32_t n = 32; n <= 48; n += 16) {
34714 for (size_t k = 1; k <= 5; k += 2) {
34715 GemmMicrokernelTester()
34716 .mr(7)
34717 .nr(16)
34718 .kr(1)
34719 .sr(1)
34720 .m(7)
34721 .n(16)
34722 .k(k)
34723 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34724 }
34725 }
34726 }
34727
34728 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
34729 TEST_REQUIRES_X86_AVX512F;
34730 for (uint32_t n = 32; n <= 48; n += 16) {
34731 for (size_t k = 1; k <= 5; k += 2) {
34732 GemmMicrokernelTester()
34733 .mr(7)
34734 .nr(16)
34735 .kr(1)
34736 .sr(1)
34737 .m(7)
34738 .n(n)
34739 .k(k)
34740 .cn_stride(19)
34741 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34742 }
34743 }
34744 }
34745
34746 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16_strided_a) {
34747 TEST_REQUIRES_X86_AVX512F;
34748 for (uint32_t n = 32; n <= 48; n += 16) {
34749 for (size_t k = 1; k <= 5; k += 2) {
34750 GemmMicrokernelTester()
34751 .mr(7)
34752 .nr(16)
34753 .kr(1)
34754 .sr(1)
34755 .m(7)
34756 .n(n)
34757 .k(k)
34758 .a_stride(7)
34759 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34760 }
34761 }
34762 }
34763
34764 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
34765 TEST_REQUIRES_X86_AVX512F;
34766 for (uint32_t n = 32; n <= 48; n += 16) {
34767 for (size_t k = 1; k <= 5; k += 2) {
34768 for (uint32_t m = 1; m <= 7; m++) {
34769 GemmMicrokernelTester()
34770 .mr(7)
34771 .nr(16)
34772 .kr(1)
34773 .sr(1)
34774 .m(m)
34775 .n(n)
34776 .k(k)
34777 .iterations(1)
34778 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34779 }
34780 }
34781 }
34782 }
34783
34784 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
34785 TEST_REQUIRES_X86_AVX512F;
34786 for (size_t k = 1; k <= 5; k += 2) {
34787 for (uint32_t m = 1; m <= 7; m++) {
34788 for (uint32_t n = 1; n <= 16; n++) {
34789 GemmMicrokernelTester()
34790 .mr(7)
34791 .nr(16)
34792 .kr(1)
34793 .sr(1)
34794 .m(m)
34795 .n(n)
34796 .k(k)
34797 .cm_stride(19)
34798 .iterations(1)
34799 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34800 }
34801 }
34802 }
34803 }
34804
34805 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, qmin) {
34806 TEST_REQUIRES_X86_AVX512F;
34807 GemmMicrokernelTester()
34808 .mr(7)
34809 .nr(16)
34810 .kr(1)
34811 .sr(1)
34812 .m(7)
34813 .n(16)
34814 .k(1)
34815 .qmin(128)
34816 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34817 }
34818
34819 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, qmax) {
34820 TEST_REQUIRES_X86_AVX512F;
34821 GemmMicrokernelTester()
34822 .mr(7)
34823 .nr(16)
34824 .kr(1)
34825 .sr(1)
34826 .m(7)
34827 .n(16)
34828 .k(1)
34829 .qmax(128)
34830 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34831 }
34832
34833 TEST(F32_GEMMINC_7X16__AVX512F_BROADCAST, strided_cm) {
34834 TEST_REQUIRES_X86_AVX512F;
34835 GemmMicrokernelTester()
34836 .mr(7)
34837 .nr(16)
34838 .kr(1)
34839 .sr(1)
34840 .m(7)
34841 .n(16)
34842 .k(1)
34843 .cm_stride(19)
34844 .Test(xnn_f32_gemminc_ukernel_7x16__avx512f_broadcast);
34845 }
34846#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34847
34848
34849#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34850 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1) {
34851 TEST_REQUIRES_X86_AVX512F;
34852 GemmMicrokernelTester()
34853 .mr(8)
34854 .nr(16)
34855 .kr(1)
34856 .sr(1)
34857 .m(8)
34858 .n(16)
34859 .k(1)
34860 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34861 }
34862
34863 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, strided_cn) {
34864 TEST_REQUIRES_X86_AVX512F;
34865 GemmMicrokernelTester()
34866 .mr(8)
34867 .nr(16)
34868 .kr(1)
34869 .sr(1)
34870 .m(8)
34871 .n(16)
34872 .k(1)
34873 .cn_stride(19)
34874 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34875 }
34876
34877 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
34878 TEST_REQUIRES_X86_AVX512F;
34879 GemmMicrokernelTester()
34880 .mr(8)
34881 .nr(16)
34882 .kr(1)
34883 .sr(1)
34884 .m(8)
34885 .n(16)
34886 .k(1)
34887 .a_stride(3)
34888 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34889 }
34890
34891 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
34892 TEST_REQUIRES_X86_AVX512F;
34893 for (uint32_t m = 1; m <= 8; m++) {
34894 for (uint32_t n = 1; n <= 16; n++) {
34895 GemmMicrokernelTester()
34896 .mr(8)
34897 .nr(16)
34898 .kr(1)
34899 .sr(1)
34900 .m(m)
34901 .n(n)
34902 .k(1)
34903 .iterations(1)
34904 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34905 }
34906 }
34907 }
34908
34909 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
34910 TEST_REQUIRES_X86_AVX512F;
34911 for (uint32_t m = 1; m <= 8; m++) {
34912 GemmMicrokernelTester()
34913 .mr(8)
34914 .nr(16)
34915 .kr(1)
34916 .sr(1)
34917 .m(m)
34918 .n(16)
34919 .k(1)
34920 .iterations(1)
34921 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34922 }
34923 }
34924
34925 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
34926 TEST_REQUIRES_X86_AVX512F;
34927 for (uint32_t n = 1; n <= 16; n++) {
34928 GemmMicrokernelTester()
34929 .mr(8)
34930 .nr(16)
34931 .kr(1)
34932 .sr(1)
34933 .m(8)
34934 .n(n)
34935 .k(1)
34936 .iterations(1)
34937 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34938 }
34939 }
34940
34941 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_gt_1) {
34942 TEST_REQUIRES_X86_AVX512F;
34943 for (size_t k = 2; k < 10; k++) {
34944 GemmMicrokernelTester()
34945 .mr(8)
34946 .nr(16)
34947 .kr(1)
34948 .sr(1)
34949 .m(8)
34950 .n(16)
34951 .k(k)
34952 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34953 }
34954 }
34955
34956 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
34957 TEST_REQUIRES_X86_AVX512F;
34958 for (size_t k = 2; k < 10; k++) {
34959 GemmMicrokernelTester()
34960 .mr(8)
34961 .nr(16)
34962 .kr(1)
34963 .sr(1)
34964 .m(8)
34965 .n(16)
34966 .k(k)
34967 .a_stride(11)
34968 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34969 }
34970 }
34971
34972 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
34973 TEST_REQUIRES_X86_AVX512F;
34974 for (size_t k = 2; k < 10; k++) {
34975 for (uint32_t m = 1; m <= 8; m++) {
34976 for (uint32_t n = 1; n <= 16; n++) {
34977 GemmMicrokernelTester()
34978 .mr(8)
34979 .nr(16)
34980 .kr(1)
34981 .sr(1)
34982 .m(m)
34983 .n(n)
34984 .k(k)
34985 .iterations(1)
34986 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
34987 }
34988 }
34989 }
34990 }
34991
34992 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16) {
34993 TEST_REQUIRES_X86_AVX512F;
34994 for (uint32_t n = 17; n < 32; n++) {
34995 for (size_t k = 1; k <= 5; k += 2) {
34996 GemmMicrokernelTester()
34997 .mr(8)
34998 .nr(16)
34999 .kr(1)
35000 .sr(1)
35001 .m(8)
35002 .n(16)
35003 .k(k)
35004 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35005 }
35006 }
35007 }
35008
35009 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
35010 TEST_REQUIRES_X86_AVX512F;
35011 for (uint32_t n = 17; n < 32; n++) {
35012 for (size_t k = 1; k <= 5; k += 2) {
35013 GemmMicrokernelTester()
35014 .mr(8)
35015 .nr(16)
35016 .kr(1)
35017 .sr(1)
35018 .m(8)
35019 .n(16)
35020 .k(k)
35021 .cn_stride(19)
35022 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35023 }
35024 }
35025 }
35026
35027 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
35028 TEST_REQUIRES_X86_AVX512F;
35029 for (uint32_t n = 17; n < 32; n++) {
35030 for (size_t k = 1; k <= 5; k += 2) {
35031 GemmMicrokernelTester()
35032 .mr(8)
35033 .nr(16)
35034 .kr(1)
35035 .sr(1)
35036 .m(8)
35037 .n(n)
35038 .k(k)
35039 .a_stride(7)
35040 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35041 }
35042 }
35043 }
35044
35045 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
35046 TEST_REQUIRES_X86_AVX512F;
35047 for (uint32_t n = 17; n < 32; n++) {
35048 for (size_t k = 1; k <= 5; k += 2) {
35049 for (uint32_t m = 1; m <= 8; m++) {
35050 GemmMicrokernelTester()
35051 .mr(8)
35052 .nr(16)
35053 .kr(1)
35054 .sr(1)
35055 .m(m)
35056 .n(n)
35057 .k(k)
35058 .iterations(1)
35059 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35060 }
35061 }
35062 }
35063 }
35064
35065 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16) {
35066 TEST_REQUIRES_X86_AVX512F;
35067 for (uint32_t n = 32; n <= 48; n += 16) {
35068 for (size_t k = 1; k <= 5; k += 2) {
35069 GemmMicrokernelTester()
35070 .mr(8)
35071 .nr(16)
35072 .kr(1)
35073 .sr(1)
35074 .m(8)
35075 .n(16)
35076 .k(k)
35077 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35078 }
35079 }
35080 }
35081
35082 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
35083 TEST_REQUIRES_X86_AVX512F;
35084 for (uint32_t n = 32; n <= 48; n += 16) {
35085 for (size_t k = 1; k <= 5; k += 2) {
35086 GemmMicrokernelTester()
35087 .mr(8)
35088 .nr(16)
35089 .kr(1)
35090 .sr(1)
35091 .m(8)
35092 .n(n)
35093 .k(k)
35094 .cn_stride(19)
35095 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35096 }
35097 }
35098 }
35099
35100 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16_strided_a) {
35101 TEST_REQUIRES_X86_AVX512F;
35102 for (uint32_t n = 32; n <= 48; n += 16) {
35103 for (size_t k = 1; k <= 5; k += 2) {
35104 GemmMicrokernelTester()
35105 .mr(8)
35106 .nr(16)
35107 .kr(1)
35108 .sr(1)
35109 .m(8)
35110 .n(n)
35111 .k(k)
35112 .a_stride(7)
35113 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35114 }
35115 }
35116 }
35117
35118 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
35119 TEST_REQUIRES_X86_AVX512F;
35120 for (uint32_t n = 32; n <= 48; n += 16) {
35121 for (size_t k = 1; k <= 5; k += 2) {
35122 for (uint32_t m = 1; m <= 8; m++) {
35123 GemmMicrokernelTester()
35124 .mr(8)
35125 .nr(16)
35126 .kr(1)
35127 .sr(1)
35128 .m(m)
35129 .n(n)
35130 .k(k)
35131 .iterations(1)
35132 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35133 }
35134 }
35135 }
35136 }
35137
35138 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
35139 TEST_REQUIRES_X86_AVX512F;
35140 for (size_t k = 1; k <= 5; k += 2) {
35141 for (uint32_t m = 1; m <= 8; m++) {
35142 for (uint32_t n = 1; n <= 16; n++) {
35143 GemmMicrokernelTester()
35144 .mr(8)
35145 .nr(16)
35146 .kr(1)
35147 .sr(1)
35148 .m(m)
35149 .n(n)
35150 .k(k)
35151 .cm_stride(19)
35152 .iterations(1)
35153 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35154 }
35155 }
35156 }
35157 }
35158
35159 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, qmin) {
35160 TEST_REQUIRES_X86_AVX512F;
35161 GemmMicrokernelTester()
35162 .mr(8)
35163 .nr(16)
35164 .kr(1)
35165 .sr(1)
35166 .m(8)
35167 .n(16)
35168 .k(1)
35169 .qmin(128)
35170 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35171 }
35172
35173 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, qmax) {
35174 TEST_REQUIRES_X86_AVX512F;
35175 GemmMicrokernelTester()
35176 .mr(8)
35177 .nr(16)
35178 .kr(1)
35179 .sr(1)
35180 .m(8)
35181 .n(16)
35182 .k(1)
35183 .qmax(128)
35184 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35185 }
35186
35187 TEST(F32_GEMMINC_8X16__AVX512F_BROADCAST, strided_cm) {
35188 TEST_REQUIRES_X86_AVX512F;
35189 GemmMicrokernelTester()
35190 .mr(8)
35191 .nr(16)
35192 .kr(1)
35193 .sr(1)
35194 .m(8)
35195 .n(16)
35196 .k(1)
35197 .cm_stride(19)
35198 .Test(xnn_f32_gemminc_ukernel_8x16__avx512f_broadcast);
35199 }
35200#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35201
35202
Marat Dukhan1dadbf72019-10-01 10:46:20 -070035203#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070035204 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1) {
35205 TEST_REQUIRES_PSIMD;
35206 GemmMicrokernelTester()
35207 .mr(1)
35208 .nr(8)
35209 .kr(1)
35210 .sr(1)
35211 .m(1)
35212 .n(8)
35213 .k(1)
35214 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35215 }
35216
35217 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, strided_cn) {
35218 TEST_REQUIRES_PSIMD;
35219 GemmMicrokernelTester()
35220 .mr(1)
35221 .nr(8)
35222 .kr(1)
35223 .sr(1)
35224 .m(1)
35225 .n(8)
35226 .k(1)
35227 .cn_stride(11)
35228 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35229 }
35230
35231 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
35232 TEST_REQUIRES_PSIMD;
35233 GemmMicrokernelTester()
35234 .mr(1)
35235 .nr(8)
35236 .kr(1)
35237 .sr(1)
35238 .m(1)
35239 .n(8)
35240 .k(1)
35241 .a_stride(3)
35242 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35243 }
35244
35245 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
35246 TEST_REQUIRES_PSIMD;
35247 for (uint32_t m = 1; m <= 1; m++) {
35248 for (uint32_t n = 1; n <= 8; n++) {
35249 GemmMicrokernelTester()
35250 .mr(1)
35251 .nr(8)
35252 .kr(1)
35253 .sr(1)
35254 .m(m)
35255 .n(n)
35256 .k(1)
35257 .iterations(1)
35258 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35259 }
35260 }
35261 }
35262
35263 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
35264 TEST_REQUIRES_PSIMD;
35265 for (uint32_t m = 1; m <= 1; m++) {
35266 GemmMicrokernelTester()
35267 .mr(1)
35268 .nr(8)
35269 .kr(1)
35270 .sr(1)
35271 .m(m)
35272 .n(8)
35273 .k(1)
35274 .iterations(1)
35275 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35276 }
35277 }
35278
35279 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
35280 TEST_REQUIRES_PSIMD;
35281 for (uint32_t n = 1; n <= 8; n++) {
35282 GemmMicrokernelTester()
35283 .mr(1)
35284 .nr(8)
35285 .kr(1)
35286 .sr(1)
35287 .m(1)
35288 .n(n)
35289 .k(1)
35290 .iterations(1)
35291 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35292 }
35293 }
35294
35295 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_gt_1) {
35296 TEST_REQUIRES_PSIMD;
35297 for (size_t k = 2; k < 10; k++) {
35298 GemmMicrokernelTester()
35299 .mr(1)
35300 .nr(8)
35301 .kr(1)
35302 .sr(1)
35303 .m(1)
35304 .n(8)
35305 .k(k)
35306 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35307 }
35308 }
35309
35310 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
35311 TEST_REQUIRES_PSIMD;
35312 for (size_t k = 2; k < 10; k++) {
35313 GemmMicrokernelTester()
35314 .mr(1)
35315 .nr(8)
35316 .kr(1)
35317 .sr(1)
35318 .m(1)
35319 .n(8)
35320 .k(k)
35321 .a_stride(11)
35322 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35323 }
35324 }
35325
35326 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
35327 TEST_REQUIRES_PSIMD;
35328 for (size_t k = 2; k < 10; k++) {
35329 for (uint32_t m = 1; m <= 1; m++) {
35330 for (uint32_t n = 1; n <= 8; n++) {
35331 GemmMicrokernelTester()
35332 .mr(1)
35333 .nr(8)
35334 .kr(1)
35335 .sr(1)
35336 .m(m)
35337 .n(n)
35338 .k(k)
35339 .iterations(1)
35340 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35341 }
35342 }
35343 }
35344 }
35345
35346 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8) {
35347 TEST_REQUIRES_PSIMD;
35348 for (uint32_t n = 9; n < 16; n++) {
35349 for (size_t k = 1; k <= 5; k += 2) {
35350 GemmMicrokernelTester()
35351 .mr(1)
35352 .nr(8)
35353 .kr(1)
35354 .sr(1)
35355 .m(1)
35356 .n(8)
35357 .k(k)
35358 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35359 }
35360 }
35361 }
35362
35363 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
35364 TEST_REQUIRES_PSIMD;
35365 for (uint32_t n = 9; n < 16; n++) {
35366 for (size_t k = 1; k <= 5; k += 2) {
35367 GemmMicrokernelTester()
35368 .mr(1)
35369 .nr(8)
35370 .kr(1)
35371 .sr(1)
35372 .m(1)
35373 .n(8)
35374 .k(k)
35375 .cn_stride(11)
35376 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35377 }
35378 }
35379 }
35380
35381 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
35382 TEST_REQUIRES_PSIMD;
35383 for (uint32_t n = 9; n < 16; n++) {
35384 for (size_t k = 1; k <= 5; k += 2) {
35385 GemmMicrokernelTester()
35386 .mr(1)
35387 .nr(8)
35388 .kr(1)
35389 .sr(1)
35390 .m(1)
35391 .n(n)
35392 .k(k)
35393 .a_stride(7)
35394 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35395 }
35396 }
35397 }
35398
35399 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
35400 TEST_REQUIRES_PSIMD;
35401 for (uint32_t n = 9; n < 16; n++) {
35402 for (size_t k = 1; k <= 5; k += 2) {
35403 for (uint32_t m = 1; m <= 1; m++) {
35404 GemmMicrokernelTester()
35405 .mr(1)
35406 .nr(8)
35407 .kr(1)
35408 .sr(1)
35409 .m(m)
35410 .n(n)
35411 .k(k)
35412 .iterations(1)
35413 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35414 }
35415 }
35416 }
35417 }
35418
35419 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8) {
35420 TEST_REQUIRES_PSIMD;
35421 for (uint32_t n = 16; n <= 24; n += 8) {
35422 for (size_t k = 1; k <= 5; k += 2) {
35423 GemmMicrokernelTester()
35424 .mr(1)
35425 .nr(8)
35426 .kr(1)
35427 .sr(1)
35428 .m(1)
35429 .n(8)
35430 .k(k)
35431 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35432 }
35433 }
35434 }
35435
35436 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
35437 TEST_REQUIRES_PSIMD;
35438 for (uint32_t n = 16; n <= 24; n += 8) {
35439 for (size_t k = 1; k <= 5; k += 2) {
35440 GemmMicrokernelTester()
35441 .mr(1)
35442 .nr(8)
35443 .kr(1)
35444 .sr(1)
35445 .m(1)
35446 .n(n)
35447 .k(k)
35448 .cn_stride(11)
35449 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35450 }
35451 }
35452 }
35453
35454 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
35455 TEST_REQUIRES_PSIMD;
35456 for (uint32_t n = 16; n <= 24; n += 8) {
35457 for (size_t k = 1; k <= 5; k += 2) {
35458 GemmMicrokernelTester()
35459 .mr(1)
35460 .nr(8)
35461 .kr(1)
35462 .sr(1)
35463 .m(1)
35464 .n(n)
35465 .k(k)
35466 .a_stride(7)
35467 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35468 }
35469 }
35470 }
35471
35472 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
35473 TEST_REQUIRES_PSIMD;
35474 for (uint32_t n = 16; n <= 24; n += 8) {
35475 for (size_t k = 1; k <= 5; k += 2) {
35476 for (uint32_t m = 1; m <= 1; m++) {
35477 GemmMicrokernelTester()
35478 .mr(1)
35479 .nr(8)
35480 .kr(1)
35481 .sr(1)
35482 .m(m)
35483 .n(n)
35484 .k(k)
35485 .iterations(1)
35486 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35487 }
35488 }
35489 }
35490 }
35491
35492 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
35493 TEST_REQUIRES_PSIMD;
35494 for (size_t k = 1; k <= 5; k += 2) {
35495 for (uint32_t m = 1; m <= 1; m++) {
35496 for (uint32_t n = 1; n <= 8; n++) {
35497 GemmMicrokernelTester()
35498 .mr(1)
35499 .nr(8)
35500 .kr(1)
35501 .sr(1)
35502 .m(m)
35503 .n(n)
35504 .k(k)
35505 .cm_stride(11)
35506 .iterations(1)
35507 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35508 }
35509 }
35510 }
35511 }
35512
35513 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, qmin) {
35514 TEST_REQUIRES_PSIMD;
35515 GemmMicrokernelTester()
35516 .mr(1)
35517 .nr(8)
35518 .kr(1)
35519 .sr(1)
35520 .m(1)
35521 .n(8)
35522 .k(1)
35523 .qmin(128)
35524 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35525 }
35526
35527 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, qmax) {
35528 TEST_REQUIRES_PSIMD;
35529 GemmMicrokernelTester()
35530 .mr(1)
35531 .nr(8)
35532 .kr(1)
35533 .sr(1)
35534 .m(1)
35535 .n(8)
35536 .k(1)
35537 .qmax(128)
35538 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35539 }
35540
35541 TEST(F32_GEMMINC_1X8__PSIMD_LOADSPLAT, strided_cm) {
35542 TEST_REQUIRES_PSIMD;
35543 GemmMicrokernelTester()
35544 .mr(1)
35545 .nr(8)
35546 .kr(1)
35547 .sr(1)
35548 .m(1)
35549 .n(8)
35550 .k(1)
35551 .cm_stride(11)
35552 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35553 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070035554#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070035555
35556
Marat Dukhan1dadbf72019-10-01 10:46:20 -070035557#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070035558 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1) {
35559 TEST_REQUIRES_PSIMD;
35560 GemmMicrokernelTester()
35561 .mr(4)
35562 .nr(8)
35563 .kr(1)
35564 .sr(1)
35565 .m(4)
35566 .n(8)
35567 .k(1)
35568 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35569 }
35570
35571 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, strided_cn) {
35572 TEST_REQUIRES_PSIMD;
35573 GemmMicrokernelTester()
35574 .mr(4)
35575 .nr(8)
35576 .kr(1)
35577 .sr(1)
35578 .m(4)
35579 .n(8)
35580 .k(1)
35581 .cn_stride(11)
35582 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35583 }
35584
35585 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
35586 TEST_REQUIRES_PSIMD;
35587 GemmMicrokernelTester()
35588 .mr(4)
35589 .nr(8)
35590 .kr(1)
35591 .sr(1)
35592 .m(4)
35593 .n(8)
35594 .k(1)
35595 .a_stride(3)
35596 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35597 }
35598
35599 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
35600 TEST_REQUIRES_PSIMD;
35601 for (uint32_t m = 1; m <= 4; m++) {
35602 for (uint32_t n = 1; n <= 8; n++) {
35603 GemmMicrokernelTester()
35604 .mr(4)
35605 .nr(8)
35606 .kr(1)
35607 .sr(1)
35608 .m(m)
35609 .n(n)
35610 .k(1)
35611 .iterations(1)
35612 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35613 }
35614 }
35615 }
35616
35617 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
35618 TEST_REQUIRES_PSIMD;
35619 for (uint32_t m = 1; m <= 4; m++) {
35620 GemmMicrokernelTester()
35621 .mr(4)
35622 .nr(8)
35623 .kr(1)
35624 .sr(1)
35625 .m(m)
35626 .n(8)
35627 .k(1)
35628 .iterations(1)
35629 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35630 }
35631 }
35632
35633 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
35634 TEST_REQUIRES_PSIMD;
35635 for (uint32_t n = 1; n <= 8; n++) {
35636 GemmMicrokernelTester()
35637 .mr(4)
35638 .nr(8)
35639 .kr(1)
35640 .sr(1)
35641 .m(4)
35642 .n(n)
35643 .k(1)
35644 .iterations(1)
35645 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35646 }
35647 }
35648
35649 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_gt_1) {
35650 TEST_REQUIRES_PSIMD;
35651 for (size_t k = 2; k < 10; k++) {
35652 GemmMicrokernelTester()
35653 .mr(4)
35654 .nr(8)
35655 .kr(1)
35656 .sr(1)
35657 .m(4)
35658 .n(8)
35659 .k(k)
35660 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35661 }
35662 }
35663
35664 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
35665 TEST_REQUIRES_PSIMD;
35666 for (size_t k = 2; k < 10; k++) {
35667 GemmMicrokernelTester()
35668 .mr(4)
35669 .nr(8)
35670 .kr(1)
35671 .sr(1)
35672 .m(4)
35673 .n(8)
35674 .k(k)
35675 .a_stride(11)
35676 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35677 }
35678 }
35679
35680 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
35681 TEST_REQUIRES_PSIMD;
35682 for (size_t k = 2; k < 10; k++) {
35683 for (uint32_t m = 1; m <= 4; m++) {
35684 for (uint32_t n = 1; n <= 8; n++) {
35685 GemmMicrokernelTester()
35686 .mr(4)
35687 .nr(8)
35688 .kr(1)
35689 .sr(1)
35690 .m(m)
35691 .n(n)
35692 .k(k)
35693 .iterations(1)
35694 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35695 }
35696 }
35697 }
35698 }
35699
35700 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8) {
35701 TEST_REQUIRES_PSIMD;
35702 for (uint32_t n = 9; n < 16; n++) {
35703 for (size_t k = 1; k <= 5; k += 2) {
35704 GemmMicrokernelTester()
35705 .mr(4)
35706 .nr(8)
35707 .kr(1)
35708 .sr(1)
35709 .m(4)
35710 .n(8)
35711 .k(k)
35712 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35713 }
35714 }
35715 }
35716
35717 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
35718 TEST_REQUIRES_PSIMD;
35719 for (uint32_t n = 9; n < 16; n++) {
35720 for (size_t k = 1; k <= 5; k += 2) {
35721 GemmMicrokernelTester()
35722 .mr(4)
35723 .nr(8)
35724 .kr(1)
35725 .sr(1)
35726 .m(4)
35727 .n(8)
35728 .k(k)
35729 .cn_stride(11)
35730 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35731 }
35732 }
35733 }
35734
35735 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
35736 TEST_REQUIRES_PSIMD;
35737 for (uint32_t n = 9; n < 16; n++) {
35738 for (size_t k = 1; k <= 5; k += 2) {
35739 GemmMicrokernelTester()
35740 .mr(4)
35741 .nr(8)
35742 .kr(1)
35743 .sr(1)
35744 .m(4)
35745 .n(n)
35746 .k(k)
35747 .a_stride(7)
35748 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35749 }
35750 }
35751 }
35752
35753 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
35754 TEST_REQUIRES_PSIMD;
35755 for (uint32_t n = 9; n < 16; n++) {
35756 for (size_t k = 1; k <= 5; k += 2) {
35757 for (uint32_t m = 1; m <= 4; m++) {
35758 GemmMicrokernelTester()
35759 .mr(4)
35760 .nr(8)
35761 .kr(1)
35762 .sr(1)
35763 .m(m)
35764 .n(n)
35765 .k(k)
35766 .iterations(1)
35767 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35768 }
35769 }
35770 }
35771 }
35772
35773 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8) {
35774 TEST_REQUIRES_PSIMD;
35775 for (uint32_t n = 16; n <= 24; n += 8) {
35776 for (size_t k = 1; k <= 5; k += 2) {
35777 GemmMicrokernelTester()
35778 .mr(4)
35779 .nr(8)
35780 .kr(1)
35781 .sr(1)
35782 .m(4)
35783 .n(8)
35784 .k(k)
35785 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35786 }
35787 }
35788 }
35789
35790 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
35791 TEST_REQUIRES_PSIMD;
35792 for (uint32_t n = 16; n <= 24; n += 8) {
35793 for (size_t k = 1; k <= 5; k += 2) {
35794 GemmMicrokernelTester()
35795 .mr(4)
35796 .nr(8)
35797 .kr(1)
35798 .sr(1)
35799 .m(4)
35800 .n(n)
35801 .k(k)
35802 .cn_stride(11)
35803 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35804 }
35805 }
35806 }
35807
35808 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
35809 TEST_REQUIRES_PSIMD;
35810 for (uint32_t n = 16; n <= 24; n += 8) {
35811 for (size_t k = 1; k <= 5; k += 2) {
35812 GemmMicrokernelTester()
35813 .mr(4)
35814 .nr(8)
35815 .kr(1)
35816 .sr(1)
35817 .m(4)
35818 .n(n)
35819 .k(k)
35820 .a_stride(7)
35821 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35822 }
35823 }
35824 }
35825
35826 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
35827 TEST_REQUIRES_PSIMD;
35828 for (uint32_t n = 16; n <= 24; n += 8) {
35829 for (size_t k = 1; k <= 5; k += 2) {
35830 for (uint32_t m = 1; m <= 4; m++) {
35831 GemmMicrokernelTester()
35832 .mr(4)
35833 .nr(8)
35834 .kr(1)
35835 .sr(1)
35836 .m(m)
35837 .n(n)
35838 .k(k)
35839 .iterations(1)
35840 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35841 }
35842 }
35843 }
35844 }
35845
35846 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
35847 TEST_REQUIRES_PSIMD;
35848 for (size_t k = 1; k <= 5; k += 2) {
35849 for (uint32_t m = 1; m <= 4; m++) {
35850 for (uint32_t n = 1; n <= 8; n++) {
35851 GemmMicrokernelTester()
35852 .mr(4)
35853 .nr(8)
35854 .kr(1)
35855 .sr(1)
35856 .m(m)
35857 .n(n)
35858 .k(k)
35859 .cm_stride(11)
35860 .iterations(1)
35861 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35862 }
35863 }
35864 }
35865 }
35866
35867 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, qmin) {
35868 TEST_REQUIRES_PSIMD;
35869 GemmMicrokernelTester()
35870 .mr(4)
35871 .nr(8)
35872 .kr(1)
35873 .sr(1)
35874 .m(4)
35875 .n(8)
35876 .k(1)
35877 .qmin(128)
35878 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35879 }
35880
35881 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, qmax) {
35882 TEST_REQUIRES_PSIMD;
35883 GemmMicrokernelTester()
35884 .mr(4)
35885 .nr(8)
35886 .kr(1)
35887 .sr(1)
35888 .m(4)
35889 .n(8)
35890 .k(1)
35891 .qmax(128)
35892 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35893 }
35894
35895 TEST(F32_GEMMINC_4X8__PSIMD_LOADSPLAT, strided_cm) {
35896 TEST_REQUIRES_PSIMD;
35897 GemmMicrokernelTester()
35898 .mr(4)
35899 .nr(8)
35900 .kr(1)
35901 .sr(1)
35902 .m(4)
35903 .n(8)
35904 .k(1)
35905 .cm_stride(11)
35906 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35907 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070035908#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070035909
35910
Marat Dukhan1dadbf72019-10-01 10:46:20 -070035911#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070035912 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1) {
35913 TEST_REQUIRES_PSIMD;
35914 GemmMicrokernelTester()
35915 .mr(6)
35916 .nr(8)
35917 .kr(1)
35918 .sr(1)
35919 .m(6)
35920 .n(8)
35921 .k(1)
35922 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35923 }
35924
35925 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, strided_cn) {
35926 TEST_REQUIRES_PSIMD;
35927 GemmMicrokernelTester()
35928 .mr(6)
35929 .nr(8)
35930 .kr(1)
35931 .sr(1)
35932 .m(6)
35933 .n(8)
35934 .k(1)
35935 .cn_stride(11)
35936 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35937 }
35938
35939 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
35940 TEST_REQUIRES_PSIMD;
35941 GemmMicrokernelTester()
35942 .mr(6)
35943 .nr(8)
35944 .kr(1)
35945 .sr(1)
35946 .m(6)
35947 .n(8)
35948 .k(1)
35949 .a_stride(3)
35950 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35951 }
35952
35953 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
35954 TEST_REQUIRES_PSIMD;
35955 for (uint32_t m = 1; m <= 6; m++) {
35956 for (uint32_t n = 1; n <= 8; n++) {
35957 GemmMicrokernelTester()
35958 .mr(6)
35959 .nr(8)
35960 .kr(1)
35961 .sr(1)
35962 .m(m)
35963 .n(n)
35964 .k(1)
35965 .iterations(1)
35966 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35967 }
35968 }
35969 }
35970
35971 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
35972 TEST_REQUIRES_PSIMD;
35973 for (uint32_t m = 1; m <= 6; m++) {
35974 GemmMicrokernelTester()
35975 .mr(6)
35976 .nr(8)
35977 .kr(1)
35978 .sr(1)
35979 .m(m)
35980 .n(8)
35981 .k(1)
35982 .iterations(1)
35983 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
35984 }
35985 }
35986
35987 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
35988 TEST_REQUIRES_PSIMD;
35989 for (uint32_t n = 1; n <= 8; n++) {
35990 GemmMicrokernelTester()
35991 .mr(6)
35992 .nr(8)
35993 .kr(1)
35994 .sr(1)
35995 .m(6)
35996 .n(n)
35997 .k(1)
35998 .iterations(1)
35999 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36000 }
36001 }
36002
36003 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_gt_1) {
36004 TEST_REQUIRES_PSIMD;
36005 for (size_t k = 2; k < 10; k++) {
36006 GemmMicrokernelTester()
36007 .mr(6)
36008 .nr(8)
36009 .kr(1)
36010 .sr(1)
36011 .m(6)
36012 .n(8)
36013 .k(k)
36014 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36015 }
36016 }
36017
36018 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
36019 TEST_REQUIRES_PSIMD;
36020 for (size_t k = 2; k < 10; k++) {
36021 GemmMicrokernelTester()
36022 .mr(6)
36023 .nr(8)
36024 .kr(1)
36025 .sr(1)
36026 .m(6)
36027 .n(8)
36028 .k(k)
36029 .a_stride(11)
36030 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36031 }
36032 }
36033
36034 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
36035 TEST_REQUIRES_PSIMD;
36036 for (size_t k = 2; k < 10; k++) {
36037 for (uint32_t m = 1; m <= 6; m++) {
36038 for (uint32_t n = 1; n <= 8; n++) {
36039 GemmMicrokernelTester()
36040 .mr(6)
36041 .nr(8)
36042 .kr(1)
36043 .sr(1)
36044 .m(m)
36045 .n(n)
36046 .k(k)
36047 .iterations(1)
36048 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36049 }
36050 }
36051 }
36052 }
36053
36054 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8) {
36055 TEST_REQUIRES_PSIMD;
36056 for (uint32_t n = 9; n < 16; n++) {
36057 for (size_t k = 1; k <= 5; k += 2) {
36058 GemmMicrokernelTester()
36059 .mr(6)
36060 .nr(8)
36061 .kr(1)
36062 .sr(1)
36063 .m(6)
36064 .n(8)
36065 .k(k)
36066 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36067 }
36068 }
36069 }
36070
36071 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
36072 TEST_REQUIRES_PSIMD;
36073 for (uint32_t n = 9; n < 16; n++) {
36074 for (size_t k = 1; k <= 5; k += 2) {
36075 GemmMicrokernelTester()
36076 .mr(6)
36077 .nr(8)
36078 .kr(1)
36079 .sr(1)
36080 .m(6)
36081 .n(8)
36082 .k(k)
36083 .cn_stride(11)
36084 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36085 }
36086 }
36087 }
36088
36089 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
36090 TEST_REQUIRES_PSIMD;
36091 for (uint32_t n = 9; n < 16; n++) {
36092 for (size_t k = 1; k <= 5; k += 2) {
36093 GemmMicrokernelTester()
36094 .mr(6)
36095 .nr(8)
36096 .kr(1)
36097 .sr(1)
36098 .m(6)
36099 .n(n)
36100 .k(k)
36101 .a_stride(7)
36102 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36103 }
36104 }
36105 }
36106
36107 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
36108 TEST_REQUIRES_PSIMD;
36109 for (uint32_t n = 9; n < 16; n++) {
36110 for (size_t k = 1; k <= 5; k += 2) {
36111 for (uint32_t m = 1; m <= 6; m++) {
36112 GemmMicrokernelTester()
36113 .mr(6)
36114 .nr(8)
36115 .kr(1)
36116 .sr(1)
36117 .m(m)
36118 .n(n)
36119 .k(k)
36120 .iterations(1)
36121 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36122 }
36123 }
36124 }
36125 }
36126
36127 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8) {
36128 TEST_REQUIRES_PSIMD;
36129 for (uint32_t n = 16; n <= 24; n += 8) {
36130 for (size_t k = 1; k <= 5; k += 2) {
36131 GemmMicrokernelTester()
36132 .mr(6)
36133 .nr(8)
36134 .kr(1)
36135 .sr(1)
36136 .m(6)
36137 .n(8)
36138 .k(k)
36139 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36140 }
36141 }
36142 }
36143
36144 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
36145 TEST_REQUIRES_PSIMD;
36146 for (uint32_t n = 16; n <= 24; n += 8) {
36147 for (size_t k = 1; k <= 5; k += 2) {
36148 GemmMicrokernelTester()
36149 .mr(6)
36150 .nr(8)
36151 .kr(1)
36152 .sr(1)
36153 .m(6)
36154 .n(n)
36155 .k(k)
36156 .cn_stride(11)
36157 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36158 }
36159 }
36160 }
36161
36162 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
36163 TEST_REQUIRES_PSIMD;
36164 for (uint32_t n = 16; n <= 24; n += 8) {
36165 for (size_t k = 1; k <= 5; k += 2) {
36166 GemmMicrokernelTester()
36167 .mr(6)
36168 .nr(8)
36169 .kr(1)
36170 .sr(1)
36171 .m(6)
36172 .n(n)
36173 .k(k)
36174 .a_stride(7)
36175 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36176 }
36177 }
36178 }
36179
36180 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
36181 TEST_REQUIRES_PSIMD;
36182 for (uint32_t n = 16; n <= 24; n += 8) {
36183 for (size_t k = 1; k <= 5; k += 2) {
36184 for (uint32_t m = 1; m <= 6; m++) {
36185 GemmMicrokernelTester()
36186 .mr(6)
36187 .nr(8)
36188 .kr(1)
36189 .sr(1)
36190 .m(m)
36191 .n(n)
36192 .k(k)
36193 .iterations(1)
36194 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36195 }
36196 }
36197 }
36198 }
36199
36200 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
36201 TEST_REQUIRES_PSIMD;
36202 for (size_t k = 1; k <= 5; k += 2) {
36203 for (uint32_t m = 1; m <= 6; m++) {
36204 for (uint32_t n = 1; n <= 8; n++) {
36205 GemmMicrokernelTester()
36206 .mr(6)
36207 .nr(8)
36208 .kr(1)
36209 .sr(1)
36210 .m(m)
36211 .n(n)
36212 .k(k)
36213 .cm_stride(11)
36214 .iterations(1)
36215 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36216 }
36217 }
36218 }
36219 }
36220
36221 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, qmin) {
36222 TEST_REQUIRES_PSIMD;
36223 GemmMicrokernelTester()
36224 .mr(6)
36225 .nr(8)
36226 .kr(1)
36227 .sr(1)
36228 .m(6)
36229 .n(8)
36230 .k(1)
36231 .qmin(128)
36232 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36233 }
36234
36235 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, qmax) {
36236 TEST_REQUIRES_PSIMD;
36237 GemmMicrokernelTester()
36238 .mr(6)
36239 .nr(8)
36240 .kr(1)
36241 .sr(1)
36242 .m(6)
36243 .n(8)
36244 .k(1)
36245 .qmax(128)
36246 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36247 }
36248
36249 TEST(F32_GEMMINC_6X8__PSIMD_LOADSPLAT, strided_cm) {
36250 TEST_REQUIRES_PSIMD;
36251 GemmMicrokernelTester()
36252 .mr(6)
36253 .nr(8)
36254 .kr(1)
36255 .sr(1)
36256 .m(6)
36257 .n(8)
36258 .k(1)
36259 .cm_stride(11)
36260 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
36261 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070036262#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070036263
36264
Marat Dukhan1dadbf72019-10-01 10:46:20 -070036265#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070036266 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4) {
36267 TEST_REQUIRES_PSIMD;
36268 GemmMicrokernelTester()
36269 .mr(1)
36270 .nr(8)
36271 .kr(1)
36272 .sr(1)
36273 .m(1)
36274 .n(8)
36275 .k(4)
36276 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36277 }
36278
36279 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, strided_cn) {
36280 TEST_REQUIRES_PSIMD;
36281 GemmMicrokernelTester()
36282 .mr(1)
36283 .nr(8)
36284 .kr(1)
36285 .sr(1)
36286 .m(1)
36287 .n(8)
36288 .k(4)
36289 .cn_stride(11)
36290 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36291 }
36292
36293 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_strided_a) {
36294 TEST_REQUIRES_PSIMD;
36295 GemmMicrokernelTester()
36296 .mr(1)
36297 .nr(8)
36298 .kr(1)
36299 .sr(1)
36300 .m(1)
36301 .n(8)
36302 .k(4)
36303 .a_stride(7)
36304 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36305 }
36306
36307 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_subtile) {
36308 TEST_REQUIRES_PSIMD;
36309 for (uint32_t m = 1; m <= 1; m++) {
36310 for (uint32_t n = 1; n <= 8; n++) {
36311 GemmMicrokernelTester()
36312 .mr(1)
36313 .nr(8)
36314 .kr(1)
36315 .sr(1)
36316 .m(m)
36317 .n(n)
36318 .k(4)
36319 .iterations(1)
36320 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36321 }
36322 }
36323 }
36324
36325 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
36326 TEST_REQUIRES_PSIMD;
36327 for (uint32_t m = 1; m <= 1; m++) {
36328 GemmMicrokernelTester()
36329 .mr(1)
36330 .nr(8)
36331 .kr(1)
36332 .sr(1)
36333 .m(m)
36334 .n(8)
36335 .k(4)
36336 .iterations(1)
36337 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36338 }
36339 }
36340
36341 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
36342 TEST_REQUIRES_PSIMD;
36343 for (uint32_t n = 1; n <= 8; n++) {
36344 GemmMicrokernelTester()
36345 .mr(1)
36346 .nr(8)
36347 .kr(1)
36348 .sr(1)
36349 .m(1)
36350 .n(n)
36351 .k(4)
36352 .iterations(1)
36353 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36354 }
36355 }
36356
36357 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_lt_4) {
36358 TEST_REQUIRES_PSIMD;
36359 for (size_t k = 1; k < 4; k++) {
36360 GemmMicrokernelTester()
36361 .mr(1)
36362 .nr(8)
36363 .kr(1)
36364 .sr(1)
36365 .m(1)
36366 .n(8)
36367 .k(k)
36368 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36369 }
36370 }
36371
36372 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_lt_4_strided_a) {
36373 TEST_REQUIRES_PSIMD;
36374 for (size_t k = 1; k < 4; k++) {
36375 GemmMicrokernelTester()
36376 .mr(1)
36377 .nr(8)
36378 .kr(1)
36379 .sr(1)
36380 .m(1)
36381 .n(8)
36382 .k(k)
36383 .a_stride(7)
36384 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36385 }
36386 }
36387
36388 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_lt_4_subtile) {
36389 TEST_REQUIRES_PSIMD;
36390 for (size_t k = 1; k < 4; k++) {
36391 for (uint32_t m = 1; m <= 1; m++) {
36392 for (uint32_t n = 1; n <= 8; n++) {
36393 GemmMicrokernelTester()
36394 .mr(1)
36395 .nr(8)
36396 .kr(1)
36397 .sr(1)
36398 .m(m)
36399 .n(n)
36400 .k(k)
36401 .iterations(1)
36402 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36403 }
36404 }
36405 }
36406 }
36407
36408 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_gt_4) {
36409 TEST_REQUIRES_PSIMD;
36410 for (size_t k = 5; k < 8; k++) {
36411 GemmMicrokernelTester()
36412 .mr(1)
36413 .nr(8)
36414 .kr(1)
36415 .sr(1)
36416 .m(1)
36417 .n(8)
36418 .k(k)
36419 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36420 }
36421 }
36422
36423 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_gt_4_strided_a) {
36424 TEST_REQUIRES_PSIMD;
36425 for (size_t k = 5; k < 8; k++) {
36426 GemmMicrokernelTester()
36427 .mr(1)
36428 .nr(8)
36429 .kr(1)
36430 .sr(1)
36431 .m(1)
36432 .n(8)
36433 .k(k)
36434 .a_stride(11)
36435 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36436 }
36437 }
36438
36439 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_gt_4_subtile) {
36440 TEST_REQUIRES_PSIMD;
36441 for (size_t k = 5; k < 8; k++) {
36442 for (uint32_t m = 1; m <= 1; m++) {
36443 for (uint32_t n = 1; n <= 8; n++) {
36444 GemmMicrokernelTester()
36445 .mr(1)
36446 .nr(8)
36447 .kr(1)
36448 .sr(1)
36449 .m(m)
36450 .n(n)
36451 .k(k)
36452 .iterations(1)
36453 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36454 }
36455 }
36456 }
36457 }
36458
36459 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_div_4) {
36460 TEST_REQUIRES_PSIMD;
36461 for (size_t k = 8; k <= 40; k += 4) {
36462 GemmMicrokernelTester()
36463 .mr(1)
36464 .nr(8)
36465 .kr(1)
36466 .sr(1)
36467 .m(1)
36468 .n(8)
36469 .k(k)
36470 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36471 }
36472 }
36473
36474 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_div_4_strided_a) {
36475 TEST_REQUIRES_PSIMD;
36476 for (size_t k = 8; k <= 40; k += 4) {
36477 GemmMicrokernelTester()
36478 .mr(1)
36479 .nr(8)
36480 .kr(1)
36481 .sr(1)
36482 .m(1)
36483 .n(8)
36484 .k(k)
36485 .a_stride(43)
36486 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36487 }
36488 }
36489
36490 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, k_div_4_subtile) {
36491 TEST_REQUIRES_PSIMD;
36492 for (size_t k = 8; k <= 40; k += 4) {
36493 for (uint32_t m = 1; m <= 1; m++) {
36494 for (uint32_t n = 1; n <= 8; n++) {
36495 GemmMicrokernelTester()
36496 .mr(1)
36497 .nr(8)
36498 .kr(1)
36499 .sr(1)
36500 .m(m)
36501 .n(n)
36502 .k(k)
36503 .iterations(1)
36504 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36505 }
36506 }
36507 }
36508 }
36509
36510 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8) {
36511 TEST_REQUIRES_PSIMD;
36512 for (uint32_t n = 9; n < 16; n++) {
36513 for (size_t k = 1; k <= 20; k += 5) {
36514 GemmMicrokernelTester()
36515 .mr(1)
36516 .nr(8)
36517 .kr(1)
36518 .sr(1)
36519 .m(1)
36520 .n(8)
36521 .k(k)
36522 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36523 }
36524 }
36525 }
36526
36527 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
36528 TEST_REQUIRES_PSIMD;
36529 for (uint32_t n = 9; n < 16; n++) {
36530 for (size_t k = 1; k <= 20; k += 5) {
36531 GemmMicrokernelTester()
36532 .mr(1)
36533 .nr(8)
36534 .kr(1)
36535 .sr(1)
36536 .m(1)
36537 .n(8)
36538 .k(k)
36539 .cn_stride(11)
36540 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36541 }
36542 }
36543 }
36544
36545 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8_strided_a) {
36546 TEST_REQUIRES_PSIMD;
36547 for (uint32_t n = 9; n < 16; n++) {
36548 for (size_t k = 1; k <= 20; k += 5) {
36549 GemmMicrokernelTester()
36550 .mr(1)
36551 .nr(8)
36552 .kr(1)
36553 .sr(1)
36554 .m(1)
36555 .n(n)
36556 .k(k)
36557 .a_stride(23)
36558 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36559 }
36560 }
36561 }
36562
36563 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_gt_8_subtile) {
36564 TEST_REQUIRES_PSIMD;
36565 for (uint32_t n = 9; n < 16; n++) {
36566 for (size_t k = 1; k <= 20; k += 5) {
36567 for (uint32_t m = 1; m <= 1; m++) {
36568 GemmMicrokernelTester()
36569 .mr(1)
36570 .nr(8)
36571 .kr(1)
36572 .sr(1)
36573 .m(m)
36574 .n(n)
36575 .k(k)
36576 .iterations(1)
36577 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36578 }
36579 }
36580 }
36581 }
36582
36583 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8) {
36584 TEST_REQUIRES_PSIMD;
36585 for (uint32_t n = 16; n <= 24; n += 8) {
36586 for (size_t k = 1; k <= 20; k += 5) {
36587 GemmMicrokernelTester()
36588 .mr(1)
36589 .nr(8)
36590 .kr(1)
36591 .sr(1)
36592 .m(1)
36593 .n(8)
36594 .k(k)
36595 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36596 }
36597 }
36598 }
36599
36600 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8_strided_cn) {
36601 TEST_REQUIRES_PSIMD;
36602 for (uint32_t n = 16; n <= 24; n += 8) {
36603 for (size_t k = 1; k <= 20; k += 5) {
36604 GemmMicrokernelTester()
36605 .mr(1)
36606 .nr(8)
36607 .kr(1)
36608 .sr(1)
36609 .m(1)
36610 .n(n)
36611 .k(k)
36612 .cn_stride(11)
36613 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36614 }
36615 }
36616 }
36617
36618 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8_strided_a) {
36619 TEST_REQUIRES_PSIMD;
36620 for (uint32_t n = 16; n <= 24; n += 8) {
36621 for (size_t k = 1; k <= 20; k += 5) {
36622 GemmMicrokernelTester()
36623 .mr(1)
36624 .nr(8)
36625 .kr(1)
36626 .sr(1)
36627 .m(1)
36628 .n(n)
36629 .k(k)
36630 .a_stride(23)
36631 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36632 }
36633 }
36634 }
36635
36636 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, n_div_8_subtile) {
36637 TEST_REQUIRES_PSIMD;
36638 for (uint32_t n = 16; n <= 24; n += 8) {
36639 for (size_t k = 1; k <= 20; k += 5) {
36640 for (uint32_t m = 1; m <= 1; m++) {
36641 GemmMicrokernelTester()
36642 .mr(1)
36643 .nr(8)
36644 .kr(1)
36645 .sr(1)
36646 .m(m)
36647 .n(n)
36648 .k(k)
36649 .iterations(1)
36650 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36651 }
36652 }
36653 }
36654 }
36655
36656 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, strided_cm_subtile) {
36657 TEST_REQUIRES_PSIMD;
36658 for (size_t k = 1; k <= 20; k += 5) {
36659 for (uint32_t m = 1; m <= 1; m++) {
36660 for (uint32_t n = 1; n <= 8; n++) {
36661 GemmMicrokernelTester()
36662 .mr(1)
36663 .nr(8)
36664 .kr(1)
36665 .sr(1)
36666 .m(m)
36667 .n(n)
36668 .k(k)
36669 .cm_stride(11)
36670 .iterations(1)
36671 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36672 }
36673 }
36674 }
36675 }
36676
36677 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, qmin) {
36678 TEST_REQUIRES_PSIMD;
36679 GemmMicrokernelTester()
36680 .mr(1)
36681 .nr(8)
36682 .kr(1)
36683 .sr(1)
36684 .m(1)
36685 .n(8)
36686 .k(4)
36687 .qmin(128)
36688 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36689 }
36690
36691 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, qmax) {
36692 TEST_REQUIRES_PSIMD;
36693 GemmMicrokernelTester()
36694 .mr(1)
36695 .nr(8)
36696 .kr(1)
36697 .sr(1)
36698 .m(1)
36699 .n(8)
36700 .k(4)
36701 .qmax(128)
36702 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36703 }
36704
36705 TEST(F32_GEMMINC_1X8__PSIMD_SPLAT, strided_cm) {
36706 TEST_REQUIRES_PSIMD;
36707 GemmMicrokernelTester()
36708 .mr(1)
36709 .nr(8)
36710 .kr(1)
36711 .sr(1)
36712 .m(1)
36713 .n(8)
36714 .k(4)
36715 .cm_stride(11)
36716 .Test(xnn_f32_gemminc_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36717 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070036718#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070036719
36720
Marat Dukhan1dadbf72019-10-01 10:46:20 -070036721#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070036722 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4) {
36723 TEST_REQUIRES_PSIMD;
36724 GemmMicrokernelTester()
36725 .mr(4)
36726 .nr(8)
36727 .kr(1)
36728 .sr(1)
36729 .m(4)
36730 .n(8)
36731 .k(4)
36732 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36733 }
36734
36735 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, strided_cn) {
36736 TEST_REQUIRES_PSIMD;
36737 GemmMicrokernelTester()
36738 .mr(4)
36739 .nr(8)
36740 .kr(1)
36741 .sr(1)
36742 .m(4)
36743 .n(8)
36744 .k(4)
36745 .cn_stride(11)
36746 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36747 }
36748
36749 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_strided_a) {
36750 TEST_REQUIRES_PSIMD;
36751 GemmMicrokernelTester()
36752 .mr(4)
36753 .nr(8)
36754 .kr(1)
36755 .sr(1)
36756 .m(4)
36757 .n(8)
36758 .k(4)
36759 .a_stride(7)
36760 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36761 }
36762
36763 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_subtile) {
36764 TEST_REQUIRES_PSIMD;
36765 for (uint32_t m = 1; m <= 4; m++) {
36766 for (uint32_t n = 1; n <= 8; n++) {
36767 GemmMicrokernelTester()
36768 .mr(4)
36769 .nr(8)
36770 .kr(1)
36771 .sr(1)
36772 .m(m)
36773 .n(n)
36774 .k(4)
36775 .iterations(1)
36776 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36777 }
36778 }
36779 }
36780
36781 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
36782 TEST_REQUIRES_PSIMD;
36783 for (uint32_t m = 1; m <= 4; m++) {
36784 GemmMicrokernelTester()
36785 .mr(4)
36786 .nr(8)
36787 .kr(1)
36788 .sr(1)
36789 .m(m)
36790 .n(8)
36791 .k(4)
36792 .iterations(1)
36793 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36794 }
36795 }
36796
36797 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
36798 TEST_REQUIRES_PSIMD;
36799 for (uint32_t n = 1; n <= 8; n++) {
36800 GemmMicrokernelTester()
36801 .mr(4)
36802 .nr(8)
36803 .kr(1)
36804 .sr(1)
36805 .m(4)
36806 .n(n)
36807 .k(4)
36808 .iterations(1)
36809 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36810 }
36811 }
36812
36813 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_lt_4) {
36814 TEST_REQUIRES_PSIMD;
36815 for (size_t k = 1; k < 4; k++) {
36816 GemmMicrokernelTester()
36817 .mr(4)
36818 .nr(8)
36819 .kr(1)
36820 .sr(1)
36821 .m(4)
36822 .n(8)
36823 .k(k)
36824 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36825 }
36826 }
36827
36828 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_lt_4_strided_a) {
36829 TEST_REQUIRES_PSIMD;
36830 for (size_t k = 1; k < 4; k++) {
36831 GemmMicrokernelTester()
36832 .mr(4)
36833 .nr(8)
36834 .kr(1)
36835 .sr(1)
36836 .m(4)
36837 .n(8)
36838 .k(k)
36839 .a_stride(7)
36840 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36841 }
36842 }
36843
36844 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_lt_4_subtile) {
36845 TEST_REQUIRES_PSIMD;
36846 for (size_t k = 1; k < 4; k++) {
36847 for (uint32_t m = 1; m <= 4; m++) {
36848 for (uint32_t n = 1; n <= 8; n++) {
36849 GemmMicrokernelTester()
36850 .mr(4)
36851 .nr(8)
36852 .kr(1)
36853 .sr(1)
36854 .m(m)
36855 .n(n)
36856 .k(k)
36857 .iterations(1)
36858 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36859 }
36860 }
36861 }
36862 }
36863
36864 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_gt_4) {
36865 TEST_REQUIRES_PSIMD;
36866 for (size_t k = 5; k < 8; k++) {
36867 GemmMicrokernelTester()
36868 .mr(4)
36869 .nr(8)
36870 .kr(1)
36871 .sr(1)
36872 .m(4)
36873 .n(8)
36874 .k(k)
36875 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36876 }
36877 }
36878
36879 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_gt_4_strided_a) {
36880 TEST_REQUIRES_PSIMD;
36881 for (size_t k = 5; k < 8; k++) {
36882 GemmMicrokernelTester()
36883 .mr(4)
36884 .nr(8)
36885 .kr(1)
36886 .sr(1)
36887 .m(4)
36888 .n(8)
36889 .k(k)
36890 .a_stride(11)
36891 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36892 }
36893 }
36894
36895 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_gt_4_subtile) {
36896 TEST_REQUIRES_PSIMD;
36897 for (size_t k = 5; k < 8; k++) {
36898 for (uint32_t m = 1; m <= 4; m++) {
36899 for (uint32_t n = 1; n <= 8; n++) {
36900 GemmMicrokernelTester()
36901 .mr(4)
36902 .nr(8)
36903 .kr(1)
36904 .sr(1)
36905 .m(m)
36906 .n(n)
36907 .k(k)
36908 .iterations(1)
36909 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36910 }
36911 }
36912 }
36913 }
36914
36915 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_div_4) {
36916 TEST_REQUIRES_PSIMD;
36917 for (size_t k = 8; k <= 40; k += 4) {
36918 GemmMicrokernelTester()
36919 .mr(4)
36920 .nr(8)
36921 .kr(1)
36922 .sr(1)
36923 .m(4)
36924 .n(8)
36925 .k(k)
36926 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36927 }
36928 }
36929
36930 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_div_4_strided_a) {
36931 TEST_REQUIRES_PSIMD;
36932 for (size_t k = 8; k <= 40; k += 4) {
36933 GemmMicrokernelTester()
36934 .mr(4)
36935 .nr(8)
36936 .kr(1)
36937 .sr(1)
36938 .m(4)
36939 .n(8)
36940 .k(k)
36941 .a_stride(43)
36942 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36943 }
36944 }
36945
36946 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, k_div_4_subtile) {
36947 TEST_REQUIRES_PSIMD;
36948 for (size_t k = 8; k <= 40; k += 4) {
36949 for (uint32_t m = 1; m <= 4; m++) {
36950 for (uint32_t n = 1; n <= 8; n++) {
36951 GemmMicrokernelTester()
36952 .mr(4)
36953 .nr(8)
36954 .kr(1)
36955 .sr(1)
36956 .m(m)
36957 .n(n)
36958 .k(k)
36959 .iterations(1)
36960 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36961 }
36962 }
36963 }
36964 }
36965
36966 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8) {
36967 TEST_REQUIRES_PSIMD;
36968 for (uint32_t n = 9; n < 16; n++) {
36969 for (size_t k = 1; k <= 20; k += 5) {
36970 GemmMicrokernelTester()
36971 .mr(4)
36972 .nr(8)
36973 .kr(1)
36974 .sr(1)
36975 .m(4)
36976 .n(8)
36977 .k(k)
36978 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36979 }
36980 }
36981 }
36982
36983 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
36984 TEST_REQUIRES_PSIMD;
36985 for (uint32_t n = 9; n < 16; n++) {
36986 for (size_t k = 1; k <= 20; k += 5) {
36987 GemmMicrokernelTester()
36988 .mr(4)
36989 .nr(8)
36990 .kr(1)
36991 .sr(1)
36992 .m(4)
36993 .n(8)
36994 .k(k)
36995 .cn_stride(11)
36996 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
36997 }
36998 }
36999 }
37000
37001 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8_strided_a) {
37002 TEST_REQUIRES_PSIMD;
37003 for (uint32_t n = 9; n < 16; n++) {
37004 for (size_t k = 1; k <= 20; k += 5) {
37005 GemmMicrokernelTester()
37006 .mr(4)
37007 .nr(8)
37008 .kr(1)
37009 .sr(1)
37010 .m(4)
37011 .n(n)
37012 .k(k)
37013 .a_stride(23)
37014 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37015 }
37016 }
37017 }
37018
37019 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_gt_8_subtile) {
37020 TEST_REQUIRES_PSIMD;
37021 for (uint32_t n = 9; n < 16; n++) {
37022 for (size_t k = 1; k <= 20; k += 5) {
37023 for (uint32_t m = 1; m <= 4; m++) {
37024 GemmMicrokernelTester()
37025 .mr(4)
37026 .nr(8)
37027 .kr(1)
37028 .sr(1)
37029 .m(m)
37030 .n(n)
37031 .k(k)
37032 .iterations(1)
37033 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37034 }
37035 }
37036 }
37037 }
37038
37039 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8) {
37040 TEST_REQUIRES_PSIMD;
37041 for (uint32_t n = 16; n <= 24; n += 8) {
37042 for (size_t k = 1; k <= 20; k += 5) {
37043 GemmMicrokernelTester()
37044 .mr(4)
37045 .nr(8)
37046 .kr(1)
37047 .sr(1)
37048 .m(4)
37049 .n(8)
37050 .k(k)
37051 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37052 }
37053 }
37054 }
37055
37056 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8_strided_cn) {
37057 TEST_REQUIRES_PSIMD;
37058 for (uint32_t n = 16; n <= 24; n += 8) {
37059 for (size_t k = 1; k <= 20; k += 5) {
37060 GemmMicrokernelTester()
37061 .mr(4)
37062 .nr(8)
37063 .kr(1)
37064 .sr(1)
37065 .m(4)
37066 .n(n)
37067 .k(k)
37068 .cn_stride(11)
37069 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37070 }
37071 }
37072 }
37073
37074 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8_strided_a) {
37075 TEST_REQUIRES_PSIMD;
37076 for (uint32_t n = 16; n <= 24; n += 8) {
37077 for (size_t k = 1; k <= 20; k += 5) {
37078 GemmMicrokernelTester()
37079 .mr(4)
37080 .nr(8)
37081 .kr(1)
37082 .sr(1)
37083 .m(4)
37084 .n(n)
37085 .k(k)
37086 .a_stride(23)
37087 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37088 }
37089 }
37090 }
37091
37092 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, n_div_8_subtile) {
37093 TEST_REQUIRES_PSIMD;
37094 for (uint32_t n = 16; n <= 24; n += 8) {
37095 for (size_t k = 1; k <= 20; k += 5) {
37096 for (uint32_t m = 1; m <= 4; m++) {
37097 GemmMicrokernelTester()
37098 .mr(4)
37099 .nr(8)
37100 .kr(1)
37101 .sr(1)
37102 .m(m)
37103 .n(n)
37104 .k(k)
37105 .iterations(1)
37106 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37107 }
37108 }
37109 }
37110 }
37111
37112 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, strided_cm_subtile) {
37113 TEST_REQUIRES_PSIMD;
37114 for (size_t k = 1; k <= 20; k += 5) {
37115 for (uint32_t m = 1; m <= 4; m++) {
37116 for (uint32_t n = 1; n <= 8; n++) {
37117 GemmMicrokernelTester()
37118 .mr(4)
37119 .nr(8)
37120 .kr(1)
37121 .sr(1)
37122 .m(m)
37123 .n(n)
37124 .k(k)
37125 .cm_stride(11)
37126 .iterations(1)
37127 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37128 }
37129 }
37130 }
37131 }
37132
37133 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, qmin) {
37134 TEST_REQUIRES_PSIMD;
37135 GemmMicrokernelTester()
37136 .mr(4)
37137 .nr(8)
37138 .kr(1)
37139 .sr(1)
37140 .m(4)
37141 .n(8)
37142 .k(4)
37143 .qmin(128)
37144 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37145 }
37146
37147 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, qmax) {
37148 TEST_REQUIRES_PSIMD;
37149 GemmMicrokernelTester()
37150 .mr(4)
37151 .nr(8)
37152 .kr(1)
37153 .sr(1)
37154 .m(4)
37155 .n(8)
37156 .k(4)
37157 .qmax(128)
37158 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37159 }
37160
37161 TEST(F32_GEMMINC_4X8__PSIMD_SPLAT, strided_cm) {
37162 TEST_REQUIRES_PSIMD;
37163 GemmMicrokernelTester()
37164 .mr(4)
37165 .nr(8)
37166 .kr(1)
37167 .sr(1)
37168 .m(4)
37169 .n(8)
37170 .k(4)
37171 .cm_stride(11)
37172 .Test(xnn_f32_gemminc_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37173 }
#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM


#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070037178 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4) {
37179 TEST_REQUIRES_PSIMD;
37180 GemmMicrokernelTester()
37181 .mr(6)
37182 .nr(8)
37183 .kr(1)
37184 .sr(1)
37185 .m(6)
37186 .n(8)
37187 .k(4)
37188 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37189 }
37190
37191 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, strided_cn) {
37192 TEST_REQUIRES_PSIMD;
37193 GemmMicrokernelTester()
37194 .mr(6)
37195 .nr(8)
37196 .kr(1)
37197 .sr(1)
37198 .m(6)
37199 .n(8)
37200 .k(4)
37201 .cn_stride(11)
37202 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37203 }
37204
37205 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_strided_a) {
37206 TEST_REQUIRES_PSIMD;
37207 GemmMicrokernelTester()
37208 .mr(6)
37209 .nr(8)
37210 .kr(1)
37211 .sr(1)
37212 .m(6)
37213 .n(8)
37214 .k(4)
37215 .a_stride(7)
37216 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37217 }
37218
37219 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_subtile) {
37220 TEST_REQUIRES_PSIMD;
37221 for (uint32_t m = 1; m <= 6; m++) {
37222 for (uint32_t n = 1; n <= 8; n++) {
37223 GemmMicrokernelTester()
37224 .mr(6)
37225 .nr(8)
37226 .kr(1)
37227 .sr(1)
37228 .m(m)
37229 .n(n)
37230 .k(4)
37231 .iterations(1)
37232 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37233 }
37234 }
37235 }
37236
37237 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
37238 TEST_REQUIRES_PSIMD;
37239 for (uint32_t m = 1; m <= 6; m++) {
37240 GemmMicrokernelTester()
37241 .mr(6)
37242 .nr(8)
37243 .kr(1)
37244 .sr(1)
37245 .m(m)
37246 .n(8)
37247 .k(4)
37248 .iterations(1)
37249 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37250 }
37251 }
37252
37253 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
37254 TEST_REQUIRES_PSIMD;
37255 for (uint32_t n = 1; n <= 8; n++) {
37256 GemmMicrokernelTester()
37257 .mr(6)
37258 .nr(8)
37259 .kr(1)
37260 .sr(1)
37261 .m(6)
37262 .n(n)
37263 .k(4)
37264 .iterations(1)
37265 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37266 }
37267 }
37268
37269 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_lt_4) {
37270 TEST_REQUIRES_PSIMD;
37271 for (size_t k = 1; k < 4; k++) {
37272 GemmMicrokernelTester()
37273 .mr(6)
37274 .nr(8)
37275 .kr(1)
37276 .sr(1)
37277 .m(6)
37278 .n(8)
37279 .k(k)
37280 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37281 }
37282 }
37283
37284 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_lt_4_strided_a) {
37285 TEST_REQUIRES_PSIMD;
37286 for (size_t k = 1; k < 4; k++) {
37287 GemmMicrokernelTester()
37288 .mr(6)
37289 .nr(8)
37290 .kr(1)
37291 .sr(1)
37292 .m(6)
37293 .n(8)
37294 .k(k)
37295 .a_stride(7)
37296 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37297 }
37298 }
37299
37300 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_lt_4_subtile) {
37301 TEST_REQUIRES_PSIMD;
37302 for (size_t k = 1; k < 4; k++) {
37303 for (uint32_t m = 1; m <= 6; m++) {
37304 for (uint32_t n = 1; n <= 8; n++) {
37305 GemmMicrokernelTester()
37306 .mr(6)
37307 .nr(8)
37308 .kr(1)
37309 .sr(1)
37310 .m(m)
37311 .n(n)
37312 .k(k)
37313 .iterations(1)
37314 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37315 }
37316 }
37317 }
37318 }
37319
37320 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_gt_4) {
37321 TEST_REQUIRES_PSIMD;
37322 for (size_t k = 5; k < 8; k++) {
37323 GemmMicrokernelTester()
37324 .mr(6)
37325 .nr(8)
37326 .kr(1)
37327 .sr(1)
37328 .m(6)
37329 .n(8)
37330 .k(k)
37331 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37332 }
37333 }
37334
37335 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_gt_4_strided_a) {
37336 TEST_REQUIRES_PSIMD;
37337 for (size_t k = 5; k < 8; k++) {
37338 GemmMicrokernelTester()
37339 .mr(6)
37340 .nr(8)
37341 .kr(1)
37342 .sr(1)
37343 .m(6)
37344 .n(8)
37345 .k(k)
37346 .a_stride(11)
37347 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37348 }
37349 }
37350
37351 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_gt_4_subtile) {
37352 TEST_REQUIRES_PSIMD;
37353 for (size_t k = 5; k < 8; k++) {
37354 for (uint32_t m = 1; m <= 6; m++) {
37355 for (uint32_t n = 1; n <= 8; n++) {
37356 GemmMicrokernelTester()
37357 .mr(6)
37358 .nr(8)
37359 .kr(1)
37360 .sr(1)
37361 .m(m)
37362 .n(n)
37363 .k(k)
37364 .iterations(1)
37365 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37366 }
37367 }
37368 }
37369 }
37370
37371 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_div_4) {
37372 TEST_REQUIRES_PSIMD;
37373 for (size_t k = 8; k <= 40; k += 4) {
37374 GemmMicrokernelTester()
37375 .mr(6)
37376 .nr(8)
37377 .kr(1)
37378 .sr(1)
37379 .m(6)
37380 .n(8)
37381 .k(k)
37382 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37383 }
37384 }
37385
37386 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_div_4_strided_a) {
37387 TEST_REQUIRES_PSIMD;
37388 for (size_t k = 8; k <= 40; k += 4) {
37389 GemmMicrokernelTester()
37390 .mr(6)
37391 .nr(8)
37392 .kr(1)
37393 .sr(1)
37394 .m(6)
37395 .n(8)
37396 .k(k)
37397 .a_stride(43)
37398 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37399 }
37400 }
37401
37402 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, k_div_4_subtile) {
37403 TEST_REQUIRES_PSIMD;
37404 for (size_t k = 8; k <= 40; k += 4) {
37405 for (uint32_t m = 1; m <= 6; m++) {
37406 for (uint32_t n = 1; n <= 8; n++) {
37407 GemmMicrokernelTester()
37408 .mr(6)
37409 .nr(8)
37410 .kr(1)
37411 .sr(1)
37412 .m(m)
37413 .n(n)
37414 .k(k)
37415 .iterations(1)
37416 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37417 }
37418 }
37419 }
37420 }
37421
37422 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8) {
37423 TEST_REQUIRES_PSIMD;
37424 for (uint32_t n = 9; n < 16; n++) {
37425 for (size_t k = 1; k <= 20; k += 5) {
37426 GemmMicrokernelTester()
37427 .mr(6)
37428 .nr(8)
37429 .kr(1)
37430 .sr(1)
37431 .m(6)
37432 .n(8)
37433 .k(k)
37434 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37435 }
37436 }
37437 }
37438
37439 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
37440 TEST_REQUIRES_PSIMD;
37441 for (uint32_t n = 9; n < 16; n++) {
37442 for (size_t k = 1; k <= 20; k += 5) {
37443 GemmMicrokernelTester()
37444 .mr(6)
37445 .nr(8)
37446 .kr(1)
37447 .sr(1)
37448 .m(6)
37449 .n(8)
37450 .k(k)
37451 .cn_stride(11)
37452 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37453 }
37454 }
37455 }
37456
37457 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8_strided_a) {
37458 TEST_REQUIRES_PSIMD;
37459 for (uint32_t n = 9; n < 16; n++) {
37460 for (size_t k = 1; k <= 20; k += 5) {
37461 GemmMicrokernelTester()
37462 .mr(6)
37463 .nr(8)
37464 .kr(1)
37465 .sr(1)
37466 .m(6)
37467 .n(n)
37468 .k(k)
37469 .a_stride(23)
37470 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37471 }
37472 }
37473 }
37474
37475 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_gt_8_subtile) {
37476 TEST_REQUIRES_PSIMD;
37477 for (uint32_t n = 9; n < 16; n++) {
37478 for (size_t k = 1; k <= 20; k += 5) {
37479 for (uint32_t m = 1; m <= 6; m++) {
37480 GemmMicrokernelTester()
37481 .mr(6)
37482 .nr(8)
37483 .kr(1)
37484 .sr(1)
37485 .m(m)
37486 .n(n)
37487 .k(k)
37488 .iterations(1)
37489 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37490 }
37491 }
37492 }
37493 }
37494
37495 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8) {
37496 TEST_REQUIRES_PSIMD;
37497 for (uint32_t n = 16; n <= 24; n += 8) {
37498 for (size_t k = 1; k <= 20; k += 5) {
37499 GemmMicrokernelTester()
37500 .mr(6)
37501 .nr(8)
37502 .kr(1)
37503 .sr(1)
37504 .m(6)
37505 .n(8)
37506 .k(k)
37507 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37508 }
37509 }
37510 }
37511
37512 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8_strided_cn) {
37513 TEST_REQUIRES_PSIMD;
37514 for (uint32_t n = 16; n <= 24; n += 8) {
37515 for (size_t k = 1; k <= 20; k += 5) {
37516 GemmMicrokernelTester()
37517 .mr(6)
37518 .nr(8)
37519 .kr(1)
37520 .sr(1)
37521 .m(6)
37522 .n(n)
37523 .k(k)
37524 .cn_stride(11)
37525 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37526 }
37527 }
37528 }
37529
37530 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8_strided_a) {
37531 TEST_REQUIRES_PSIMD;
37532 for (uint32_t n = 16; n <= 24; n += 8) {
37533 for (size_t k = 1; k <= 20; k += 5) {
37534 GemmMicrokernelTester()
37535 .mr(6)
37536 .nr(8)
37537 .kr(1)
37538 .sr(1)
37539 .m(6)
37540 .n(n)
37541 .k(k)
37542 .a_stride(23)
37543 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37544 }
37545 }
37546 }
37547
37548 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, n_div_8_subtile) {
37549 TEST_REQUIRES_PSIMD;
37550 for (uint32_t n = 16; n <= 24; n += 8) {
37551 for (size_t k = 1; k <= 20; k += 5) {
37552 for (uint32_t m = 1; m <= 6; m++) {
37553 GemmMicrokernelTester()
37554 .mr(6)
37555 .nr(8)
37556 .kr(1)
37557 .sr(1)
37558 .m(m)
37559 .n(n)
37560 .k(k)
37561 .iterations(1)
37562 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37563 }
37564 }
37565 }
37566 }
37567
37568 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, strided_cm_subtile) {
37569 TEST_REQUIRES_PSIMD;
37570 for (size_t k = 1; k <= 20; k += 5) {
37571 for (uint32_t m = 1; m <= 6; m++) {
37572 for (uint32_t n = 1; n <= 8; n++) {
37573 GemmMicrokernelTester()
37574 .mr(6)
37575 .nr(8)
37576 .kr(1)
37577 .sr(1)
37578 .m(m)
37579 .n(n)
37580 .k(k)
37581 .cm_stride(11)
37582 .iterations(1)
37583 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37584 }
37585 }
37586 }
37587 }
37588
37589 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, qmin) {
37590 TEST_REQUIRES_PSIMD;
37591 GemmMicrokernelTester()
37592 .mr(6)
37593 .nr(8)
37594 .kr(1)
37595 .sr(1)
37596 .m(6)
37597 .n(8)
37598 .k(4)
37599 .qmin(128)
37600 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37601 }
37602
37603 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, qmax) {
37604 TEST_REQUIRES_PSIMD;
37605 GemmMicrokernelTester()
37606 .mr(6)
37607 .nr(8)
37608 .kr(1)
37609 .sr(1)
37610 .m(6)
37611 .n(8)
37612 .k(4)
37613 .qmax(128)
37614 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37615 }
37616
37617 TEST(F32_GEMMINC_6X8__PSIMD_SPLAT, strided_cm) {
37618 TEST_REQUIRES_PSIMD;
37619 GemmMicrokernelTester()
37620 .mr(6)
37621 .nr(8)
37622 .kr(1)
37623 .sr(1)
37624 .m(6)
37625 .n(8)
37626 .k(4)
37627 .cm_stride(11)
37628 .Test(xnn_f32_gemminc_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
37629 }
#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM


#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070037634 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4) {
37635 TEST_REQUIRES_PSIMD;
37636 GemmMicrokernelTester()
37637 .mr(1)
37638 .nr(8)
37639 .kr(1)
37640 .sr(4)
37641 .m(1)
37642 .n(8)
37643 .k(4)
37644 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37645 }
37646
37647 TEST(F32_GEMMINC_1X8S4__PSIMD, strided_cn) {
37648 TEST_REQUIRES_PSIMD;
37649 GemmMicrokernelTester()
37650 .mr(1)
37651 .nr(8)
37652 .kr(1)
37653 .sr(4)
37654 .m(1)
37655 .n(8)
37656 .k(4)
37657 .cn_stride(11)
37658 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37659 }
37660
37661 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_strided_a) {
37662 TEST_REQUIRES_PSIMD;
37663 GemmMicrokernelTester()
37664 .mr(1)
37665 .nr(8)
37666 .kr(1)
37667 .sr(4)
37668 .m(1)
37669 .n(8)
37670 .k(4)
37671 .a_stride(7)
37672 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37673 }
37674
37675 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_subtile) {
37676 TEST_REQUIRES_PSIMD;
37677 for (uint32_t m = 1; m <= 1; m++) {
37678 for (uint32_t n = 1; n <= 8; n++) {
37679 GemmMicrokernelTester()
37680 .mr(1)
37681 .nr(8)
37682 .kr(1)
37683 .sr(4)
37684 .m(m)
37685 .n(n)
37686 .k(4)
37687 .iterations(1)
37688 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37689 }
37690 }
37691 }
37692
37693 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_subtile_m) {
37694 TEST_REQUIRES_PSIMD;
37695 for (uint32_t m = 1; m <= 1; m++) {
37696 GemmMicrokernelTester()
37697 .mr(1)
37698 .nr(8)
37699 .kr(1)
37700 .sr(4)
37701 .m(m)
37702 .n(8)
37703 .k(4)
37704 .iterations(1)
37705 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37706 }
37707 }
37708
37709 TEST(F32_GEMMINC_1X8S4__PSIMD, k_eq_4_subtile_n) {
37710 TEST_REQUIRES_PSIMD;
37711 for (uint32_t n = 1; n <= 8; n++) {
37712 GemmMicrokernelTester()
37713 .mr(1)
37714 .nr(8)
37715 .kr(1)
37716 .sr(4)
37717 .m(1)
37718 .n(n)
37719 .k(4)
37720 .iterations(1)
37721 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37722 }
37723 }
37724
37725 TEST(F32_GEMMINC_1X8S4__PSIMD, k_lt_4) {
37726 TEST_REQUIRES_PSIMD;
37727 for (size_t k = 1; k < 4; k++) {
37728 GemmMicrokernelTester()
37729 .mr(1)
37730 .nr(8)
37731 .kr(1)
37732 .sr(4)
37733 .m(1)
37734 .n(8)
37735 .k(k)
37736 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37737 }
37738 }
37739
37740 TEST(F32_GEMMINC_1X8S4__PSIMD, k_lt_4_strided_a) {
37741 TEST_REQUIRES_PSIMD;
37742 for (size_t k = 1; k < 4; k++) {
37743 GemmMicrokernelTester()
37744 .mr(1)
37745 .nr(8)
37746 .kr(1)
37747 .sr(4)
37748 .m(1)
37749 .n(8)
37750 .k(k)
37751 .a_stride(7)
37752 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37753 }
37754 }
37755
37756 TEST(F32_GEMMINC_1X8S4__PSIMD, k_lt_4_subtile) {
37757 TEST_REQUIRES_PSIMD;
37758 for (size_t k = 1; k < 4; k++) {
37759 for (uint32_t m = 1; m <= 1; m++) {
37760 for (uint32_t n = 1; n <= 8; n++) {
37761 GemmMicrokernelTester()
37762 .mr(1)
37763 .nr(8)
37764 .kr(1)
37765 .sr(4)
37766 .m(m)
37767 .n(n)
37768 .k(k)
37769 .iterations(1)
37770 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37771 }
37772 }
37773 }
37774 }
37775
37776 TEST(F32_GEMMINC_1X8S4__PSIMD, k_gt_4) {
37777 TEST_REQUIRES_PSIMD;
37778 for (size_t k = 5; k < 8; k++) {
37779 GemmMicrokernelTester()
37780 .mr(1)
37781 .nr(8)
37782 .kr(1)
37783 .sr(4)
37784 .m(1)
37785 .n(8)
37786 .k(k)
37787 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37788 }
37789 }
37790
37791 TEST(F32_GEMMINC_1X8S4__PSIMD, k_gt_4_strided_a) {
37792 TEST_REQUIRES_PSIMD;
37793 for (size_t k = 5; k < 8; k++) {
37794 GemmMicrokernelTester()
37795 .mr(1)
37796 .nr(8)
37797 .kr(1)
37798 .sr(4)
37799 .m(1)
37800 .n(8)
37801 .k(k)
37802 .a_stride(11)
37803 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37804 }
37805 }
37806
37807 TEST(F32_GEMMINC_1X8S4__PSIMD, k_gt_4_subtile) {
37808 TEST_REQUIRES_PSIMD;
37809 for (size_t k = 5; k < 8; k++) {
37810 for (uint32_t m = 1; m <= 1; m++) {
37811 for (uint32_t n = 1; n <= 8; n++) {
37812 GemmMicrokernelTester()
37813 .mr(1)
37814 .nr(8)
37815 .kr(1)
37816 .sr(4)
37817 .m(m)
37818 .n(n)
37819 .k(k)
37820 .iterations(1)
37821 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37822 }
37823 }
37824 }
37825 }
37826
37827 TEST(F32_GEMMINC_1X8S4__PSIMD, k_div_4) {
37828 TEST_REQUIRES_PSIMD;
37829 for (size_t k = 8; k <= 40; k += 4) {
37830 GemmMicrokernelTester()
37831 .mr(1)
37832 .nr(8)
37833 .kr(1)
37834 .sr(4)
37835 .m(1)
37836 .n(8)
37837 .k(k)
37838 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37839 }
37840 }
37841
37842 TEST(F32_GEMMINC_1X8S4__PSIMD, k_div_4_strided_a) {
37843 TEST_REQUIRES_PSIMD;
37844 for (size_t k = 8; k <= 40; k += 4) {
37845 GemmMicrokernelTester()
37846 .mr(1)
37847 .nr(8)
37848 .kr(1)
37849 .sr(4)
37850 .m(1)
37851 .n(8)
37852 .k(k)
37853 .a_stride(43)
37854 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37855 }
37856 }
37857
37858 TEST(F32_GEMMINC_1X8S4__PSIMD, k_div_4_subtile) {
37859 TEST_REQUIRES_PSIMD;
37860 for (size_t k = 8; k <= 40; k += 4) {
37861 for (uint32_t m = 1; m <= 1; m++) {
37862 for (uint32_t n = 1; n <= 8; n++) {
37863 GemmMicrokernelTester()
37864 .mr(1)
37865 .nr(8)
37866 .kr(1)
37867 .sr(4)
37868 .m(m)
37869 .n(n)
37870 .k(k)
37871 .iterations(1)
37872 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37873 }
37874 }
37875 }
37876 }
37877
37878 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8) {
37879 TEST_REQUIRES_PSIMD;
37880 for (uint32_t n = 9; n < 16; n++) {
37881 for (size_t k = 1; k <= 20; k += 5) {
37882 GemmMicrokernelTester()
37883 .mr(1)
37884 .nr(8)
37885 .kr(1)
37886 .sr(4)
37887 .m(1)
37888 .n(8)
37889 .k(k)
37890 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37891 }
37892 }
37893 }
37894
37895 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8_strided_cn) {
37896 TEST_REQUIRES_PSIMD;
37897 for (uint32_t n = 9; n < 16; n++) {
37898 for (size_t k = 1; k <= 20; k += 5) {
37899 GemmMicrokernelTester()
37900 .mr(1)
37901 .nr(8)
37902 .kr(1)
37903 .sr(4)
37904 .m(1)
37905 .n(8)
37906 .k(k)
37907 .cn_stride(11)
37908 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37909 }
37910 }
37911 }
37912
37913 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8_strided_a) {
37914 TEST_REQUIRES_PSIMD;
37915 for (uint32_t n = 9; n < 16; n++) {
37916 for (size_t k = 1; k <= 20; k += 5) {
37917 GemmMicrokernelTester()
37918 .mr(1)
37919 .nr(8)
37920 .kr(1)
37921 .sr(4)
37922 .m(1)
37923 .n(n)
37924 .k(k)
37925 .a_stride(23)
37926 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37927 }
37928 }
37929 }
37930
37931 TEST(F32_GEMMINC_1X8S4__PSIMD, n_gt_8_subtile) {
37932 TEST_REQUIRES_PSIMD;
37933 for (uint32_t n = 9; n < 16; n++) {
37934 for (size_t k = 1; k <= 20; k += 5) {
37935 for (uint32_t m = 1; m <= 1; m++) {
37936 GemmMicrokernelTester()
37937 .mr(1)
37938 .nr(8)
37939 .kr(1)
37940 .sr(4)
37941 .m(m)
37942 .n(n)
37943 .k(k)
37944 .iterations(1)
37945 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37946 }
37947 }
37948 }
37949 }
37950
37951 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8) {
37952 TEST_REQUIRES_PSIMD;
37953 for (uint32_t n = 16; n <= 24; n += 8) {
37954 for (size_t k = 1; k <= 20; k += 5) {
37955 GemmMicrokernelTester()
37956 .mr(1)
37957 .nr(8)
37958 .kr(1)
37959 .sr(4)
37960 .m(1)
37961 .n(8)
37962 .k(k)
37963 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37964 }
37965 }
37966 }
37967
37968 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8_strided_cn) {
37969 TEST_REQUIRES_PSIMD;
37970 for (uint32_t n = 16; n <= 24; n += 8) {
37971 for (size_t k = 1; k <= 20; k += 5) {
37972 GemmMicrokernelTester()
37973 .mr(1)
37974 .nr(8)
37975 .kr(1)
37976 .sr(4)
37977 .m(1)
37978 .n(n)
37979 .k(k)
37980 .cn_stride(11)
37981 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
37982 }
37983 }
37984 }
37985
37986 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8_strided_a) {
37987 TEST_REQUIRES_PSIMD;
37988 for (uint32_t n = 16; n <= 24; n += 8) {
37989 for (size_t k = 1; k <= 20; k += 5) {
37990 GemmMicrokernelTester()
37991 .mr(1)
37992 .nr(8)
37993 .kr(1)
37994 .sr(4)
37995 .m(1)
37996 .n(n)
37997 .k(k)
37998 .a_stride(23)
37999 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38000 }
38001 }
38002 }
38003
38004 TEST(F32_GEMMINC_1X8S4__PSIMD, n_div_8_subtile) {
38005 TEST_REQUIRES_PSIMD;
38006 for (uint32_t n = 16; n <= 24; n += 8) {
38007 for (size_t k = 1; k <= 20; k += 5) {
38008 for (uint32_t m = 1; m <= 1; m++) {
38009 GemmMicrokernelTester()
38010 .mr(1)
38011 .nr(8)
38012 .kr(1)
38013 .sr(4)
38014 .m(m)
38015 .n(n)
38016 .k(k)
38017 .iterations(1)
38018 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38019 }
38020 }
38021 }
38022 }
38023
38024 TEST(F32_GEMMINC_1X8S4__PSIMD, strided_cm_subtile) {
38025 TEST_REQUIRES_PSIMD;
38026 for (size_t k = 1; k <= 20; k += 5) {
38027 for (uint32_t m = 1; m <= 1; m++) {
38028 for (uint32_t n = 1; n <= 8; n++) {
38029 GemmMicrokernelTester()
38030 .mr(1)
38031 .nr(8)
38032 .kr(1)
38033 .sr(4)
38034 .m(m)
38035 .n(n)
38036 .k(k)
38037 .cm_stride(11)
38038 .iterations(1)
38039 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38040 }
38041 }
38042 }
38043 }
38044
38045 TEST(F32_GEMMINC_1X8S4__PSIMD, qmin) {
38046 TEST_REQUIRES_PSIMD;
38047 GemmMicrokernelTester()
38048 .mr(1)
38049 .nr(8)
38050 .kr(1)
38051 .sr(4)
38052 .m(1)
38053 .n(8)
38054 .k(4)
38055 .qmin(128)
38056 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38057 }
38058
38059 TEST(F32_GEMMINC_1X8S4__PSIMD, qmax) {
38060 TEST_REQUIRES_PSIMD;
38061 GemmMicrokernelTester()
38062 .mr(1)
38063 .nr(8)
38064 .kr(1)
38065 .sr(4)
38066 .m(1)
38067 .n(8)
38068 .k(4)
38069 .qmax(128)
38070 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38071 }
38072
38073 TEST(F32_GEMMINC_1X8S4__PSIMD, strided_cm) {
38074 TEST_REQUIRES_PSIMD;
38075 GemmMicrokernelTester()
38076 .mr(1)
38077 .nr(8)
38078 .kr(1)
38079 .sr(4)
38080 .m(1)
38081 .n(8)
38082 .k(4)
38083 .cm_stride(11)
38084 .Test(xnn_f32_gemminc_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38085 }
#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM


#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038090 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4) {
38091 TEST_REQUIRES_PSIMD;
38092 GemmMicrokernelTester()
38093 .mr(4)
38094 .nr(8)
38095 .kr(1)
38096 .sr(4)
38097 .m(4)
38098 .n(8)
38099 .k(4)
38100 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38101 }
38102
38103 TEST(F32_GEMMINC_4X8S4__PSIMD, strided_cn) {
38104 TEST_REQUIRES_PSIMD;
38105 GemmMicrokernelTester()
38106 .mr(4)
38107 .nr(8)
38108 .kr(1)
38109 .sr(4)
38110 .m(4)
38111 .n(8)
38112 .k(4)
38113 .cn_stride(11)
38114 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38115 }
38116
38117 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_strided_a) {
38118 TEST_REQUIRES_PSIMD;
38119 GemmMicrokernelTester()
38120 .mr(4)
38121 .nr(8)
38122 .kr(1)
38123 .sr(4)
38124 .m(4)
38125 .n(8)
38126 .k(4)
38127 .a_stride(7)
38128 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38129 }
38130
38131 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_subtile) {
38132 TEST_REQUIRES_PSIMD;
38133 for (uint32_t m = 1; m <= 4; m++) {
38134 for (uint32_t n = 1; n <= 8; n++) {
38135 GemmMicrokernelTester()
38136 .mr(4)
38137 .nr(8)
38138 .kr(1)
38139 .sr(4)
38140 .m(m)
38141 .n(n)
38142 .k(4)
38143 .iterations(1)
38144 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38145 }
38146 }
38147 }
38148
38149 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_subtile_m) {
38150 TEST_REQUIRES_PSIMD;
38151 for (uint32_t m = 1; m <= 4; m++) {
38152 GemmMicrokernelTester()
38153 .mr(4)
38154 .nr(8)
38155 .kr(1)
38156 .sr(4)
38157 .m(m)
38158 .n(8)
38159 .k(4)
38160 .iterations(1)
38161 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38162 }
38163 }
38164
38165 TEST(F32_GEMMINC_4X8S4__PSIMD, k_eq_4_subtile_n) {
38166 TEST_REQUIRES_PSIMD;
38167 for (uint32_t n = 1; n <= 8; n++) {
38168 GemmMicrokernelTester()
38169 .mr(4)
38170 .nr(8)
38171 .kr(1)
38172 .sr(4)
38173 .m(4)
38174 .n(n)
38175 .k(4)
38176 .iterations(1)
38177 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38178 }
38179 }
38180
38181 TEST(F32_GEMMINC_4X8S4__PSIMD, k_lt_4) {
38182 TEST_REQUIRES_PSIMD;
38183 for (size_t k = 1; k < 4; k++) {
38184 GemmMicrokernelTester()
38185 .mr(4)
38186 .nr(8)
38187 .kr(1)
38188 .sr(4)
38189 .m(4)
38190 .n(8)
38191 .k(k)
38192 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38193 }
38194 }
38195
38196 TEST(F32_GEMMINC_4X8S4__PSIMD, k_lt_4_strided_a) {
38197 TEST_REQUIRES_PSIMD;
38198 for (size_t k = 1; k < 4; k++) {
38199 GemmMicrokernelTester()
38200 .mr(4)
38201 .nr(8)
38202 .kr(1)
38203 .sr(4)
38204 .m(4)
38205 .n(8)
38206 .k(k)
38207 .a_stride(7)
38208 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38209 }
38210 }
38211
38212 TEST(F32_GEMMINC_4X8S4__PSIMD, k_lt_4_subtile) {
38213 TEST_REQUIRES_PSIMD;
38214 for (size_t k = 1; k < 4; k++) {
38215 for (uint32_t m = 1; m <= 4; m++) {
38216 for (uint32_t n = 1; n <= 8; n++) {
38217 GemmMicrokernelTester()
38218 .mr(4)
38219 .nr(8)
38220 .kr(1)
38221 .sr(4)
38222 .m(m)
38223 .n(n)
38224 .k(k)
38225 .iterations(1)
38226 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38227 }
38228 }
38229 }
38230 }
38231
38232 TEST(F32_GEMMINC_4X8S4__PSIMD, k_gt_4) {
38233 TEST_REQUIRES_PSIMD;
38234 for (size_t k = 5; k < 8; k++) {
38235 GemmMicrokernelTester()
38236 .mr(4)
38237 .nr(8)
38238 .kr(1)
38239 .sr(4)
38240 .m(4)
38241 .n(8)
38242 .k(k)
38243 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38244 }
38245 }
38246
38247 TEST(F32_GEMMINC_4X8S4__PSIMD, k_gt_4_strided_a) {
38248 TEST_REQUIRES_PSIMD;
38249 for (size_t k = 5; k < 8; k++) {
38250 GemmMicrokernelTester()
38251 .mr(4)
38252 .nr(8)
38253 .kr(1)
38254 .sr(4)
38255 .m(4)
38256 .n(8)
38257 .k(k)
38258 .a_stride(11)
38259 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38260 }
38261 }
38262
38263 TEST(F32_GEMMINC_4X8S4__PSIMD, k_gt_4_subtile) {
38264 TEST_REQUIRES_PSIMD;
38265 for (size_t k = 5; k < 8; k++) {
38266 for (uint32_t m = 1; m <= 4; m++) {
38267 for (uint32_t n = 1; n <= 8; n++) {
38268 GemmMicrokernelTester()
38269 .mr(4)
38270 .nr(8)
38271 .kr(1)
38272 .sr(4)
38273 .m(m)
38274 .n(n)
38275 .k(k)
38276 .iterations(1)
38277 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38278 }
38279 }
38280 }
38281 }
38282
38283 TEST(F32_GEMMINC_4X8S4__PSIMD, k_div_4) {
38284 TEST_REQUIRES_PSIMD;
38285 for (size_t k = 8; k <= 40; k += 4) {
38286 GemmMicrokernelTester()
38287 .mr(4)
38288 .nr(8)
38289 .kr(1)
38290 .sr(4)
38291 .m(4)
38292 .n(8)
38293 .k(k)
38294 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38295 }
38296 }
38297
38298 TEST(F32_GEMMINC_4X8S4__PSIMD, k_div_4_strided_a) {
38299 TEST_REQUIRES_PSIMD;
38300 for (size_t k = 8; k <= 40; k += 4) {
38301 GemmMicrokernelTester()
38302 .mr(4)
38303 .nr(8)
38304 .kr(1)
38305 .sr(4)
38306 .m(4)
38307 .n(8)
38308 .k(k)
38309 .a_stride(43)
38310 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38311 }
38312 }
38313
38314 TEST(F32_GEMMINC_4X8S4__PSIMD, k_div_4_subtile) {
38315 TEST_REQUIRES_PSIMD;
38316 for (size_t k = 8; k <= 40; k += 4) {
38317 for (uint32_t m = 1; m <= 4; m++) {
38318 for (uint32_t n = 1; n <= 8; n++) {
38319 GemmMicrokernelTester()
38320 .mr(4)
38321 .nr(8)
38322 .kr(1)
38323 .sr(4)
38324 .m(m)
38325 .n(n)
38326 .k(k)
38327 .iterations(1)
38328 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38329 }
38330 }
38331 }
38332 }
38333
38334 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8) {
38335 TEST_REQUIRES_PSIMD;
38336 for (uint32_t n = 9; n < 16; n++) {
38337 for (size_t k = 1; k <= 20; k += 5) {
38338 GemmMicrokernelTester()
38339 .mr(4)
38340 .nr(8)
38341 .kr(1)
38342 .sr(4)
38343 .m(4)
38344 .n(8)
38345 .k(k)
38346 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38347 }
38348 }
38349 }
38350
38351 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8_strided_cn) {
38352 TEST_REQUIRES_PSIMD;
38353 for (uint32_t n = 9; n < 16; n++) {
38354 for (size_t k = 1; k <= 20; k += 5) {
38355 GemmMicrokernelTester()
38356 .mr(4)
38357 .nr(8)
38358 .kr(1)
38359 .sr(4)
38360 .m(4)
38361 .n(8)
38362 .k(k)
38363 .cn_stride(11)
38364 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38365 }
38366 }
38367 }
38368
38369 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8_strided_a) {
38370 TEST_REQUIRES_PSIMD;
38371 for (uint32_t n = 9; n < 16; n++) {
38372 for (size_t k = 1; k <= 20; k += 5) {
38373 GemmMicrokernelTester()
38374 .mr(4)
38375 .nr(8)
38376 .kr(1)
38377 .sr(4)
38378 .m(4)
38379 .n(n)
38380 .k(k)
38381 .a_stride(23)
38382 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38383 }
38384 }
38385 }
38386
38387 TEST(F32_GEMMINC_4X8S4__PSIMD, n_gt_8_subtile) {
38388 TEST_REQUIRES_PSIMD;
38389 for (uint32_t n = 9; n < 16; n++) {
38390 for (size_t k = 1; k <= 20; k += 5) {
38391 for (uint32_t m = 1; m <= 4; m++) {
38392 GemmMicrokernelTester()
38393 .mr(4)
38394 .nr(8)
38395 .kr(1)
38396 .sr(4)
38397 .m(m)
38398 .n(n)
38399 .k(k)
38400 .iterations(1)
38401 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38402 }
38403 }
38404 }
38405 }
38406
38407 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8) {
38408 TEST_REQUIRES_PSIMD;
38409 for (uint32_t n = 16; n <= 24; n += 8) {
38410 for (size_t k = 1; k <= 20; k += 5) {
38411 GemmMicrokernelTester()
38412 .mr(4)
38413 .nr(8)
38414 .kr(1)
38415 .sr(4)
38416 .m(4)
38417 .n(8)
38418 .k(k)
38419 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38420 }
38421 }
38422 }
38423
38424 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8_strided_cn) {
38425 TEST_REQUIRES_PSIMD;
38426 for (uint32_t n = 16; n <= 24; n += 8) {
38427 for (size_t k = 1; k <= 20; k += 5) {
38428 GemmMicrokernelTester()
38429 .mr(4)
38430 .nr(8)
38431 .kr(1)
38432 .sr(4)
38433 .m(4)
38434 .n(n)
38435 .k(k)
38436 .cn_stride(11)
38437 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38438 }
38439 }
38440 }
38441
38442 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8_strided_a) {
38443 TEST_REQUIRES_PSIMD;
38444 for (uint32_t n = 16; n <= 24; n += 8) {
38445 for (size_t k = 1; k <= 20; k += 5) {
38446 GemmMicrokernelTester()
38447 .mr(4)
38448 .nr(8)
38449 .kr(1)
38450 .sr(4)
38451 .m(4)
38452 .n(n)
38453 .k(k)
38454 .a_stride(23)
38455 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38456 }
38457 }
38458 }
38459
38460 TEST(F32_GEMMINC_4X8S4__PSIMD, n_div_8_subtile) {
38461 TEST_REQUIRES_PSIMD;
38462 for (uint32_t n = 16; n <= 24; n += 8) {
38463 for (size_t k = 1; k <= 20; k += 5) {
38464 for (uint32_t m = 1; m <= 4; m++) {
38465 GemmMicrokernelTester()
38466 .mr(4)
38467 .nr(8)
38468 .kr(1)
38469 .sr(4)
38470 .m(m)
38471 .n(n)
38472 .k(k)
38473 .iterations(1)
38474 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38475 }
38476 }
38477 }
38478 }
38479
38480 TEST(F32_GEMMINC_4X8S4__PSIMD, strided_cm_subtile) {
38481 TEST_REQUIRES_PSIMD;
38482 for (size_t k = 1; k <= 20; k += 5) {
38483 for (uint32_t m = 1; m <= 4; m++) {
38484 for (uint32_t n = 1; n <= 8; n++) {
38485 GemmMicrokernelTester()
38486 .mr(4)
38487 .nr(8)
38488 .kr(1)
38489 .sr(4)
38490 .m(m)
38491 .n(n)
38492 .k(k)
38493 .cm_stride(11)
38494 .iterations(1)
38495 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38496 }
38497 }
38498 }
38499 }
38500
38501 TEST(F32_GEMMINC_4X8S4__PSIMD, qmin) {
38502 TEST_REQUIRES_PSIMD;
38503 GemmMicrokernelTester()
38504 .mr(4)
38505 .nr(8)
38506 .kr(1)
38507 .sr(4)
38508 .m(4)
38509 .n(8)
38510 .k(4)
38511 .qmin(128)
38512 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38513 }
38514
38515 TEST(F32_GEMMINC_4X8S4__PSIMD, qmax) {
38516 TEST_REQUIRES_PSIMD;
38517 GemmMicrokernelTester()
38518 .mr(4)
38519 .nr(8)
38520 .kr(1)
38521 .sr(4)
38522 .m(4)
38523 .n(8)
38524 .k(4)
38525 .qmax(128)
38526 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38527 }
38528
38529 TEST(F32_GEMMINC_4X8S4__PSIMD, strided_cm) {
38530 TEST_REQUIRES_PSIMD;
38531 GemmMicrokernelTester()
38532 .mr(4)
38533 .nr(8)
38534 .kr(1)
38535 .sr(4)
38536 .m(4)
38537 .n(8)
38538 .k(4)
38539 .cm_stride(11)
38540 .Test(xnn_f32_gemminc_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38541 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038542#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038543
38544
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038545#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038546 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4) {
38547 TEST_REQUIRES_PSIMD;
38548 GemmMicrokernelTester()
38549 .mr(6)
38550 .nr(8)
38551 .kr(1)
38552 .sr(4)
38553 .m(6)
38554 .n(8)
38555 .k(4)
38556 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38557 }
38558
38559 TEST(F32_GEMMINC_6X8S4__PSIMD, strided_cn) {
38560 TEST_REQUIRES_PSIMD;
38561 GemmMicrokernelTester()
38562 .mr(6)
38563 .nr(8)
38564 .kr(1)
38565 .sr(4)
38566 .m(6)
38567 .n(8)
38568 .k(4)
38569 .cn_stride(11)
38570 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38571 }
38572
38573 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_strided_a) {
38574 TEST_REQUIRES_PSIMD;
38575 GemmMicrokernelTester()
38576 .mr(6)
38577 .nr(8)
38578 .kr(1)
38579 .sr(4)
38580 .m(6)
38581 .n(8)
38582 .k(4)
38583 .a_stride(7)
38584 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38585 }
38586
38587 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_subtile) {
38588 TEST_REQUIRES_PSIMD;
38589 for (uint32_t m = 1; m <= 6; m++) {
38590 for (uint32_t n = 1; n <= 8; n++) {
38591 GemmMicrokernelTester()
38592 .mr(6)
38593 .nr(8)
38594 .kr(1)
38595 .sr(4)
38596 .m(m)
38597 .n(n)
38598 .k(4)
38599 .iterations(1)
38600 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38601 }
38602 }
38603 }
38604
38605 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_subtile_m) {
38606 TEST_REQUIRES_PSIMD;
38607 for (uint32_t m = 1; m <= 6; m++) {
38608 GemmMicrokernelTester()
38609 .mr(6)
38610 .nr(8)
38611 .kr(1)
38612 .sr(4)
38613 .m(m)
38614 .n(8)
38615 .k(4)
38616 .iterations(1)
38617 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38618 }
38619 }
38620
38621 TEST(F32_GEMMINC_6X8S4__PSIMD, k_eq_4_subtile_n) {
38622 TEST_REQUIRES_PSIMD;
38623 for (uint32_t n = 1; n <= 8; n++) {
38624 GemmMicrokernelTester()
38625 .mr(6)
38626 .nr(8)
38627 .kr(1)
38628 .sr(4)
38629 .m(6)
38630 .n(n)
38631 .k(4)
38632 .iterations(1)
38633 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38634 }
38635 }
38636
38637 TEST(F32_GEMMINC_6X8S4__PSIMD, k_lt_4) {
38638 TEST_REQUIRES_PSIMD;
38639 for (size_t k = 1; k < 4; k++) {
38640 GemmMicrokernelTester()
38641 .mr(6)
38642 .nr(8)
38643 .kr(1)
38644 .sr(4)
38645 .m(6)
38646 .n(8)
38647 .k(k)
38648 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38649 }
38650 }
38651
38652 TEST(F32_GEMMINC_6X8S4__PSIMD, k_lt_4_strided_a) {
38653 TEST_REQUIRES_PSIMD;
38654 for (size_t k = 1; k < 4; k++) {
38655 GemmMicrokernelTester()
38656 .mr(6)
38657 .nr(8)
38658 .kr(1)
38659 .sr(4)
38660 .m(6)
38661 .n(8)
38662 .k(k)
38663 .a_stride(7)
38664 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38665 }
38666 }
38667
38668 TEST(F32_GEMMINC_6X8S4__PSIMD, k_lt_4_subtile) {
38669 TEST_REQUIRES_PSIMD;
38670 for (size_t k = 1; k < 4; k++) {
38671 for (uint32_t m = 1; m <= 6; m++) {
38672 for (uint32_t n = 1; n <= 8; n++) {
38673 GemmMicrokernelTester()
38674 .mr(6)
38675 .nr(8)
38676 .kr(1)
38677 .sr(4)
38678 .m(m)
38679 .n(n)
38680 .k(k)
38681 .iterations(1)
38682 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38683 }
38684 }
38685 }
38686 }
38687
38688 TEST(F32_GEMMINC_6X8S4__PSIMD, k_gt_4) {
38689 TEST_REQUIRES_PSIMD;
38690 for (size_t k = 5; k < 8; k++) {
38691 GemmMicrokernelTester()
38692 .mr(6)
38693 .nr(8)
38694 .kr(1)
38695 .sr(4)
38696 .m(6)
38697 .n(8)
38698 .k(k)
38699 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38700 }
38701 }
38702
38703 TEST(F32_GEMMINC_6X8S4__PSIMD, k_gt_4_strided_a) {
38704 TEST_REQUIRES_PSIMD;
38705 for (size_t k = 5; k < 8; k++) {
38706 GemmMicrokernelTester()
38707 .mr(6)
38708 .nr(8)
38709 .kr(1)
38710 .sr(4)
38711 .m(6)
38712 .n(8)
38713 .k(k)
38714 .a_stride(11)
38715 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38716 }
38717 }
38718
38719 TEST(F32_GEMMINC_6X8S4__PSIMD, k_gt_4_subtile) {
38720 TEST_REQUIRES_PSIMD;
38721 for (size_t k = 5; k < 8; k++) {
38722 for (uint32_t m = 1; m <= 6; m++) {
38723 for (uint32_t n = 1; n <= 8; n++) {
38724 GemmMicrokernelTester()
38725 .mr(6)
38726 .nr(8)
38727 .kr(1)
38728 .sr(4)
38729 .m(m)
38730 .n(n)
38731 .k(k)
38732 .iterations(1)
38733 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38734 }
38735 }
38736 }
38737 }
38738
38739 TEST(F32_GEMMINC_6X8S4__PSIMD, k_div_4) {
38740 TEST_REQUIRES_PSIMD;
38741 for (size_t k = 8; k <= 40; k += 4) {
38742 GemmMicrokernelTester()
38743 .mr(6)
38744 .nr(8)
38745 .kr(1)
38746 .sr(4)
38747 .m(6)
38748 .n(8)
38749 .k(k)
38750 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38751 }
38752 }
38753
38754 TEST(F32_GEMMINC_6X8S4__PSIMD, k_div_4_strided_a) {
38755 TEST_REQUIRES_PSIMD;
38756 for (size_t k = 8; k <= 40; k += 4) {
38757 GemmMicrokernelTester()
38758 .mr(6)
38759 .nr(8)
38760 .kr(1)
38761 .sr(4)
38762 .m(6)
38763 .n(8)
38764 .k(k)
38765 .a_stride(43)
38766 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38767 }
38768 }
38769
38770 TEST(F32_GEMMINC_6X8S4__PSIMD, k_div_4_subtile) {
38771 TEST_REQUIRES_PSIMD;
38772 for (size_t k = 8; k <= 40; k += 4) {
38773 for (uint32_t m = 1; m <= 6; m++) {
38774 for (uint32_t n = 1; n <= 8; n++) {
38775 GemmMicrokernelTester()
38776 .mr(6)
38777 .nr(8)
38778 .kr(1)
38779 .sr(4)
38780 .m(m)
38781 .n(n)
38782 .k(k)
38783 .iterations(1)
38784 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38785 }
38786 }
38787 }
38788 }
38789
38790 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8) {
38791 TEST_REQUIRES_PSIMD;
38792 for (uint32_t n = 9; n < 16; n++) {
38793 for (size_t k = 1; k <= 20; k += 5) {
38794 GemmMicrokernelTester()
38795 .mr(6)
38796 .nr(8)
38797 .kr(1)
38798 .sr(4)
38799 .m(6)
38800 .n(8)
38801 .k(k)
38802 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38803 }
38804 }
38805 }
38806
38807 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8_strided_cn) {
38808 TEST_REQUIRES_PSIMD;
38809 for (uint32_t n = 9; n < 16; n++) {
38810 for (size_t k = 1; k <= 20; k += 5) {
38811 GemmMicrokernelTester()
38812 .mr(6)
38813 .nr(8)
38814 .kr(1)
38815 .sr(4)
38816 .m(6)
38817 .n(8)
38818 .k(k)
38819 .cn_stride(11)
38820 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38821 }
38822 }
38823 }
38824
38825 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8_strided_a) {
38826 TEST_REQUIRES_PSIMD;
38827 for (uint32_t n = 9; n < 16; n++) {
38828 for (size_t k = 1; k <= 20; k += 5) {
38829 GemmMicrokernelTester()
38830 .mr(6)
38831 .nr(8)
38832 .kr(1)
38833 .sr(4)
38834 .m(6)
38835 .n(n)
38836 .k(k)
38837 .a_stride(23)
38838 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38839 }
38840 }
38841 }
38842
38843 TEST(F32_GEMMINC_6X8S4__PSIMD, n_gt_8_subtile) {
38844 TEST_REQUIRES_PSIMD;
38845 for (uint32_t n = 9; n < 16; n++) {
38846 for (size_t k = 1; k <= 20; k += 5) {
38847 for (uint32_t m = 1; m <= 6; m++) {
38848 GemmMicrokernelTester()
38849 .mr(6)
38850 .nr(8)
38851 .kr(1)
38852 .sr(4)
38853 .m(m)
38854 .n(n)
38855 .k(k)
38856 .iterations(1)
38857 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38858 }
38859 }
38860 }
38861 }
38862
38863 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8) {
38864 TEST_REQUIRES_PSIMD;
38865 for (uint32_t n = 16; n <= 24; n += 8) {
38866 for (size_t k = 1; k <= 20; k += 5) {
38867 GemmMicrokernelTester()
38868 .mr(6)
38869 .nr(8)
38870 .kr(1)
38871 .sr(4)
38872 .m(6)
38873 .n(8)
38874 .k(k)
38875 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38876 }
38877 }
38878 }
38879
38880 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8_strided_cn) {
38881 TEST_REQUIRES_PSIMD;
38882 for (uint32_t n = 16; n <= 24; n += 8) {
38883 for (size_t k = 1; k <= 20; k += 5) {
38884 GemmMicrokernelTester()
38885 .mr(6)
38886 .nr(8)
38887 .kr(1)
38888 .sr(4)
38889 .m(6)
38890 .n(n)
38891 .k(k)
38892 .cn_stride(11)
38893 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38894 }
38895 }
38896 }
38897
38898 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8_strided_a) {
38899 TEST_REQUIRES_PSIMD;
38900 for (uint32_t n = 16; n <= 24; n += 8) {
38901 for (size_t k = 1; k <= 20; k += 5) {
38902 GemmMicrokernelTester()
38903 .mr(6)
38904 .nr(8)
38905 .kr(1)
38906 .sr(4)
38907 .m(6)
38908 .n(n)
38909 .k(k)
38910 .a_stride(23)
38911 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38912 }
38913 }
38914 }
38915
38916 TEST(F32_GEMMINC_6X8S4__PSIMD, n_div_8_subtile) {
38917 TEST_REQUIRES_PSIMD;
38918 for (uint32_t n = 16; n <= 24; n += 8) {
38919 for (size_t k = 1; k <= 20; k += 5) {
38920 for (uint32_t m = 1; m <= 6; m++) {
38921 GemmMicrokernelTester()
38922 .mr(6)
38923 .nr(8)
38924 .kr(1)
38925 .sr(4)
38926 .m(m)
38927 .n(n)
38928 .k(k)
38929 .iterations(1)
38930 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38931 }
38932 }
38933 }
38934 }
38935
38936 TEST(F32_GEMMINC_6X8S4__PSIMD, strided_cm_subtile) {
38937 TEST_REQUIRES_PSIMD;
38938 for (size_t k = 1; k <= 20; k += 5) {
38939 for (uint32_t m = 1; m <= 6; m++) {
38940 for (uint32_t n = 1; n <= 8; n++) {
38941 GemmMicrokernelTester()
38942 .mr(6)
38943 .nr(8)
38944 .kr(1)
38945 .sr(4)
38946 .m(m)
38947 .n(n)
38948 .k(k)
38949 .cm_stride(11)
38950 .iterations(1)
38951 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38952 }
38953 }
38954 }
38955 }
38956
38957 TEST(F32_GEMMINC_6X8S4__PSIMD, qmin) {
38958 TEST_REQUIRES_PSIMD;
38959 GemmMicrokernelTester()
38960 .mr(6)
38961 .nr(8)
38962 .kr(1)
38963 .sr(4)
38964 .m(6)
38965 .n(8)
38966 .k(4)
38967 .qmin(128)
38968 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38969 }
38970
38971 TEST(F32_GEMMINC_6X8S4__PSIMD, qmax) {
38972 TEST_REQUIRES_PSIMD;
38973 GemmMicrokernelTester()
38974 .mr(6)
38975 .nr(8)
38976 .kr(1)
38977 .sr(4)
38978 .m(6)
38979 .n(8)
38980 .k(4)
38981 .qmax(128)
38982 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38983 }
38984
38985 TEST(F32_GEMMINC_6X8S4__PSIMD, strided_cm) {
38986 TEST_REQUIRES_PSIMD;
38987 GemmMicrokernelTester()
38988 .mr(6)
38989 .nr(8)
38990 .kr(1)
38991 .sr(4)
38992 .m(6)
38993 .n(8)
38994 .k(4)
38995 .cm_stride(11)
38996 .Test(xnn_f32_gemminc_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
38997 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038998#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038999
39000
Marat Dukhan436ebe62019-12-04 15:10:12 -080039001#if XNN_ARCH_WASM
39002 TEST(F32_GEMMINC_1X4__WASM, k_eq_1) {
39003 GemmMicrokernelTester()
39004 .mr(1)
39005 .nr(4)
39006 .kr(1)
39007 .sr(1)
39008 .m(1)
39009 .n(4)
39010 .k(1)
39011 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39012 }
39013
39014 TEST(F32_GEMMINC_1X4__WASM, strided_cn) {
39015 GemmMicrokernelTester()
39016 .mr(1)
39017 .nr(4)
39018 .kr(1)
39019 .sr(1)
39020 .m(1)
39021 .n(4)
39022 .k(1)
39023 .cn_stride(7)
39024 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39025 }
39026
39027 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_strided_a) {
39028 GemmMicrokernelTester()
39029 .mr(1)
39030 .nr(4)
39031 .kr(1)
39032 .sr(1)
39033 .m(1)
39034 .n(4)
39035 .k(1)
39036 .a_stride(3)
39037 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39038 }
39039
39040 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_subtile) {
39041 for (uint32_t m = 1; m <= 1; m++) {
39042 for (uint32_t n = 1; n <= 4; n++) {
39043 GemmMicrokernelTester()
39044 .mr(1)
39045 .nr(4)
39046 .kr(1)
39047 .sr(1)
39048 .m(m)
39049 .n(n)
39050 .k(1)
39051 .iterations(1)
39052 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39053 }
39054 }
39055 }
39056
39057 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_subtile_m) {
39058 for (uint32_t m = 1; m <= 1; m++) {
39059 GemmMicrokernelTester()
39060 .mr(1)
39061 .nr(4)
39062 .kr(1)
39063 .sr(1)
39064 .m(m)
39065 .n(4)
39066 .k(1)
39067 .iterations(1)
39068 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39069 }
39070 }
39071
39072 TEST(F32_GEMMINC_1X4__WASM, k_eq_1_subtile_n) {
39073 for (uint32_t n = 1; n <= 4; n++) {
39074 GemmMicrokernelTester()
39075 .mr(1)
39076 .nr(4)
39077 .kr(1)
39078 .sr(1)
39079 .m(1)
39080 .n(n)
39081 .k(1)
39082 .iterations(1)
39083 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39084 }
39085 }
39086
39087 TEST(F32_GEMMINC_1X4__WASM, k_gt_1) {
39088 for (size_t k = 2; k < 10; k++) {
39089 GemmMicrokernelTester()
39090 .mr(1)
39091 .nr(4)
39092 .kr(1)
39093 .sr(1)
39094 .m(1)
39095 .n(4)
39096 .k(k)
39097 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39098 }
39099 }
39100
39101 TEST(F32_GEMMINC_1X4__WASM, k_gt_1_strided_a) {
39102 for (size_t k = 2; k < 10; k++) {
39103 GemmMicrokernelTester()
39104 .mr(1)
39105 .nr(4)
39106 .kr(1)
39107 .sr(1)
39108 .m(1)
39109 .n(4)
39110 .k(k)
39111 .a_stride(11)
39112 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39113 }
39114 }
39115
39116 TEST(F32_GEMMINC_1X4__WASM, k_gt_1_subtile) {
39117 for (size_t k = 2; k < 10; k++) {
39118 for (uint32_t m = 1; m <= 1; m++) {
39119 for (uint32_t n = 1; n <= 4; n++) {
39120 GemmMicrokernelTester()
39121 .mr(1)
39122 .nr(4)
39123 .kr(1)
39124 .sr(1)
39125 .m(m)
39126 .n(n)
39127 .k(k)
39128 .iterations(1)
39129 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39130 }
39131 }
39132 }
39133 }
39134
39135 TEST(F32_GEMMINC_1X4__WASM, n_gt_4) {
39136 for (uint32_t n = 5; n < 8; n++) {
39137 for (size_t k = 1; k <= 5; k += 2) {
39138 GemmMicrokernelTester()
39139 .mr(1)
39140 .nr(4)
39141 .kr(1)
39142 .sr(1)
39143 .m(1)
39144 .n(4)
39145 .k(k)
39146 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39147 }
39148 }
39149 }
39150
39151 TEST(F32_GEMMINC_1X4__WASM, n_gt_4_strided_cn) {
39152 for (uint32_t n = 5; n < 8; n++) {
39153 for (size_t k = 1; k <= 5; k += 2) {
39154 GemmMicrokernelTester()
39155 .mr(1)
39156 .nr(4)
39157 .kr(1)
39158 .sr(1)
39159 .m(1)
39160 .n(4)
39161 .k(k)
39162 .cn_stride(7)
39163 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39164 }
39165 }
39166 }
39167
39168 TEST(F32_GEMMINC_1X4__WASM, n_gt_4_strided_a) {
39169 for (uint32_t n = 5; n < 8; n++) {
39170 for (size_t k = 1; k <= 5; k += 2) {
39171 GemmMicrokernelTester()
39172 .mr(1)
39173 .nr(4)
39174 .kr(1)
39175 .sr(1)
39176 .m(1)
39177 .n(n)
39178 .k(k)
39179 .a_stride(7)
39180 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39181 }
39182 }
39183 }
39184
39185 TEST(F32_GEMMINC_1X4__WASM, n_gt_4_subtile) {
39186 for (uint32_t n = 5; n < 8; n++) {
39187 for (size_t k = 1; k <= 5; k += 2) {
39188 for (uint32_t m = 1; m <= 1; m++) {
39189 GemmMicrokernelTester()
39190 .mr(1)
39191 .nr(4)
39192 .kr(1)
39193 .sr(1)
39194 .m(m)
39195 .n(n)
39196 .k(k)
39197 .iterations(1)
39198 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39199 }
39200 }
39201 }
39202 }
39203
39204 TEST(F32_GEMMINC_1X4__WASM, n_div_4) {
39205 for (uint32_t n = 8; n <= 12; n += 4) {
39206 for (size_t k = 1; k <= 5; k += 2) {
39207 GemmMicrokernelTester()
39208 .mr(1)
39209 .nr(4)
39210 .kr(1)
39211 .sr(1)
39212 .m(1)
39213 .n(4)
39214 .k(k)
39215 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39216 }
39217 }
39218 }
39219
39220 TEST(F32_GEMMINC_1X4__WASM, n_div_4_strided_cn) {
39221 for (uint32_t n = 8; n <= 12; n += 4) {
39222 for (size_t k = 1; k <= 5; k += 2) {
39223 GemmMicrokernelTester()
39224 .mr(1)
39225 .nr(4)
39226 .kr(1)
39227 .sr(1)
39228 .m(1)
39229 .n(n)
39230 .k(k)
39231 .cn_stride(7)
39232 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39233 }
39234 }
39235 }
39236
39237 TEST(F32_GEMMINC_1X4__WASM, n_div_4_strided_a) {
39238 for (uint32_t n = 8; n <= 12; n += 4) {
39239 for (size_t k = 1; k <= 5; k += 2) {
39240 GemmMicrokernelTester()
39241 .mr(1)
39242 .nr(4)
39243 .kr(1)
39244 .sr(1)
39245 .m(1)
39246 .n(n)
39247 .k(k)
39248 .a_stride(7)
39249 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39250 }
39251 }
39252 }
39253
39254 TEST(F32_GEMMINC_1X4__WASM, n_div_4_subtile) {
39255 for (uint32_t n = 8; n <= 12; n += 4) {
39256 for (size_t k = 1; k <= 5; k += 2) {
39257 for (uint32_t m = 1; m <= 1; m++) {
39258 GemmMicrokernelTester()
39259 .mr(1)
39260 .nr(4)
39261 .kr(1)
39262 .sr(1)
39263 .m(m)
39264 .n(n)
39265 .k(k)
39266 .iterations(1)
39267 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39268 }
39269 }
39270 }
39271 }
39272
39273 TEST(F32_GEMMINC_1X4__WASM, strided_cm_subtile) {
39274 for (size_t k = 1; k <= 5; k += 2) {
39275 for (uint32_t m = 1; m <= 1; m++) {
39276 for (uint32_t n = 1; n <= 4; n++) {
39277 GemmMicrokernelTester()
39278 .mr(1)
39279 .nr(4)
39280 .kr(1)
39281 .sr(1)
39282 .m(m)
39283 .n(n)
39284 .k(k)
39285 .cm_stride(7)
39286 .iterations(1)
39287 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39288 }
39289 }
39290 }
39291 }
39292
39293 TEST(F32_GEMMINC_1X4__WASM, qmin) {
39294 GemmMicrokernelTester()
39295 .mr(1)
39296 .nr(4)
39297 .kr(1)
39298 .sr(1)
39299 .m(1)
39300 .n(4)
39301 .k(1)
39302 .qmin(128)
39303 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39304 }
39305
39306 TEST(F32_GEMMINC_1X4__WASM, qmax) {
39307 GemmMicrokernelTester()
39308 .mr(1)
39309 .nr(4)
39310 .kr(1)
39311 .sr(1)
39312 .m(1)
39313 .n(4)
39314 .k(1)
39315 .qmax(128)
39316 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39317 }
39318
39319 TEST(F32_GEMMINC_1X4__WASM, strided_cm) {
39320 GemmMicrokernelTester()
39321 .mr(1)
39322 .nr(4)
39323 .kr(1)
39324 .sr(1)
39325 .m(1)
39326 .n(4)
39327 .k(1)
39328 .cm_stride(7)
39329 .Test(xnn_f32_gemminc_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39330 }
39331#endif // XNN_ARCH_WASM
39332
39333
39334#if XNN_ARCH_WASM
39335 TEST(F32_GEMMINC_2X4__WASM, k_eq_1) {
39336 GemmMicrokernelTester()
39337 .mr(2)
39338 .nr(4)
39339 .kr(1)
39340 .sr(1)
39341 .m(2)
39342 .n(4)
39343 .k(1)
39344 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39345 }
39346
39347 TEST(F32_GEMMINC_2X4__WASM, strided_cn) {
39348 GemmMicrokernelTester()
39349 .mr(2)
39350 .nr(4)
39351 .kr(1)
39352 .sr(1)
39353 .m(2)
39354 .n(4)
39355 .k(1)
39356 .cn_stride(7)
39357 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39358 }
39359
39360 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_strided_a) {
39361 GemmMicrokernelTester()
39362 .mr(2)
39363 .nr(4)
39364 .kr(1)
39365 .sr(1)
39366 .m(2)
39367 .n(4)
39368 .k(1)
39369 .a_stride(3)
39370 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39371 }
39372
39373 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_subtile) {
39374 for (uint32_t m = 1; m <= 2; m++) {
39375 for (uint32_t n = 1; n <= 4; n++) {
39376 GemmMicrokernelTester()
39377 .mr(2)
39378 .nr(4)
39379 .kr(1)
39380 .sr(1)
39381 .m(m)
39382 .n(n)
39383 .k(1)
39384 .iterations(1)
39385 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39386 }
39387 }
39388 }
39389
39390 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_subtile_m) {
39391 for (uint32_t m = 1; m <= 2; m++) {
39392 GemmMicrokernelTester()
39393 .mr(2)
39394 .nr(4)
39395 .kr(1)
39396 .sr(1)
39397 .m(m)
39398 .n(4)
39399 .k(1)
39400 .iterations(1)
39401 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39402 }
39403 }
39404
39405 TEST(F32_GEMMINC_2X4__WASM, k_eq_1_subtile_n) {
39406 for (uint32_t n = 1; n <= 4; n++) {
39407 GemmMicrokernelTester()
39408 .mr(2)
39409 .nr(4)
39410 .kr(1)
39411 .sr(1)
39412 .m(2)
39413 .n(n)
39414 .k(1)
39415 .iterations(1)
39416 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39417 }
39418 }
39419
39420 TEST(F32_GEMMINC_2X4__WASM, k_gt_1) {
39421 for (size_t k = 2; k < 10; k++) {
39422 GemmMicrokernelTester()
39423 .mr(2)
39424 .nr(4)
39425 .kr(1)
39426 .sr(1)
39427 .m(2)
39428 .n(4)
39429 .k(k)
39430 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39431 }
39432 }
39433
39434 TEST(F32_GEMMINC_2X4__WASM, k_gt_1_strided_a) {
39435 for (size_t k = 2; k < 10; k++) {
39436 GemmMicrokernelTester()
39437 .mr(2)
39438 .nr(4)
39439 .kr(1)
39440 .sr(1)
39441 .m(2)
39442 .n(4)
39443 .k(k)
39444 .a_stride(11)
39445 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39446 }
39447 }
39448
39449 TEST(F32_GEMMINC_2X4__WASM, k_gt_1_subtile) {
39450 for (size_t k = 2; k < 10; k++) {
39451 for (uint32_t m = 1; m <= 2; m++) {
39452 for (uint32_t n = 1; n <= 4; n++) {
39453 GemmMicrokernelTester()
39454 .mr(2)
39455 .nr(4)
39456 .kr(1)
39457 .sr(1)
39458 .m(m)
39459 .n(n)
39460 .k(k)
39461 .iterations(1)
39462 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39463 }
39464 }
39465 }
39466 }
39467
39468 TEST(F32_GEMMINC_2X4__WASM, n_gt_4) {
39469 for (uint32_t n = 5; n < 8; n++) {
39470 for (size_t k = 1; k <= 5; k += 2) {
39471 GemmMicrokernelTester()
39472 .mr(2)
39473 .nr(4)
39474 .kr(1)
39475 .sr(1)
39476 .m(2)
39477 .n(4)
39478 .k(k)
39479 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39480 }
39481 }
39482 }
39483
39484 TEST(F32_GEMMINC_2X4__WASM, n_gt_4_strided_cn) {
39485 for (uint32_t n = 5; n < 8; n++) {
39486 for (size_t k = 1; k <= 5; k += 2) {
39487 GemmMicrokernelTester()
39488 .mr(2)
39489 .nr(4)
39490 .kr(1)
39491 .sr(1)
39492 .m(2)
39493 .n(4)
39494 .k(k)
39495 .cn_stride(7)
39496 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39497 }
39498 }
39499 }
39500
39501 TEST(F32_GEMMINC_2X4__WASM, n_gt_4_strided_a) {
39502 for (uint32_t n = 5; n < 8; n++) {
39503 for (size_t k = 1; k <= 5; k += 2) {
39504 GemmMicrokernelTester()
39505 .mr(2)
39506 .nr(4)
39507 .kr(1)
39508 .sr(1)
39509 .m(2)
39510 .n(n)
39511 .k(k)
39512 .a_stride(7)
39513 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39514 }
39515 }
39516 }
39517
39518 TEST(F32_GEMMINC_2X4__WASM, n_gt_4_subtile) {
39519 for (uint32_t n = 5; n < 8; n++) {
39520 for (size_t k = 1; k <= 5; k += 2) {
39521 for (uint32_t m = 1; m <= 2; m++) {
39522 GemmMicrokernelTester()
39523 .mr(2)
39524 .nr(4)
39525 .kr(1)
39526 .sr(1)
39527 .m(m)
39528 .n(n)
39529 .k(k)
39530 .iterations(1)
39531 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39532 }
39533 }
39534 }
39535 }
39536
39537 TEST(F32_GEMMINC_2X4__WASM, n_div_4) {
39538 for (uint32_t n = 8; n <= 12; n += 4) {
39539 for (size_t k = 1; k <= 5; k += 2) {
39540 GemmMicrokernelTester()
39541 .mr(2)
39542 .nr(4)
39543 .kr(1)
39544 .sr(1)
39545 .m(2)
39546 .n(4)
39547 .k(k)
39548 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39549 }
39550 }
39551 }
39552
39553 TEST(F32_GEMMINC_2X4__WASM, n_div_4_strided_cn) {
39554 for (uint32_t n = 8; n <= 12; n += 4) {
39555 for (size_t k = 1; k <= 5; k += 2) {
39556 GemmMicrokernelTester()
39557 .mr(2)
39558 .nr(4)
39559 .kr(1)
39560 .sr(1)
39561 .m(2)
39562 .n(n)
39563 .k(k)
39564 .cn_stride(7)
39565 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39566 }
39567 }
39568 }
39569
39570 TEST(F32_GEMMINC_2X4__WASM, n_div_4_strided_a) {
39571 for (uint32_t n = 8; n <= 12; n += 4) {
39572 for (size_t k = 1; k <= 5; k += 2) {
39573 GemmMicrokernelTester()
39574 .mr(2)
39575 .nr(4)
39576 .kr(1)
39577 .sr(1)
39578 .m(2)
39579 .n(n)
39580 .k(k)
39581 .a_stride(7)
39582 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39583 }
39584 }
39585 }
39586
39587 TEST(F32_GEMMINC_2X4__WASM, n_div_4_subtile) {
39588 for (uint32_t n = 8; n <= 12; n += 4) {
39589 for (size_t k = 1; k <= 5; k += 2) {
39590 for (uint32_t m = 1; m <= 2; m++) {
39591 GemmMicrokernelTester()
39592 .mr(2)
39593 .nr(4)
39594 .kr(1)
39595 .sr(1)
39596 .m(m)
39597 .n(n)
39598 .k(k)
39599 .iterations(1)
39600 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39601 }
39602 }
39603 }
39604 }
39605
39606 TEST(F32_GEMMINC_2X4__WASM, strided_cm_subtile) {
39607 for (size_t k = 1; k <= 5; k += 2) {
39608 for (uint32_t m = 1; m <= 2; m++) {
39609 for (uint32_t n = 1; n <= 4; n++) {
39610 GemmMicrokernelTester()
39611 .mr(2)
39612 .nr(4)
39613 .kr(1)
39614 .sr(1)
39615 .m(m)
39616 .n(n)
39617 .k(k)
39618 .cm_stride(7)
39619 .iterations(1)
39620 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39621 }
39622 }
39623 }
39624 }
39625
39626 TEST(F32_GEMMINC_2X4__WASM, qmin) {
39627 GemmMicrokernelTester()
39628 .mr(2)
39629 .nr(4)
39630 .kr(1)
39631 .sr(1)
39632 .m(2)
39633 .n(4)
39634 .k(1)
39635 .qmin(128)
39636 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39637 }
39638
39639 TEST(F32_GEMMINC_2X4__WASM, qmax) {
39640 GemmMicrokernelTester()
39641 .mr(2)
39642 .nr(4)
39643 .kr(1)
39644 .sr(1)
39645 .m(2)
39646 .n(4)
39647 .k(1)
39648 .qmax(128)
39649 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39650 }
39651
39652 TEST(F32_GEMMINC_2X4__WASM, strided_cm) {
39653 GemmMicrokernelTester()
39654 .mr(2)
39655 .nr(4)
39656 .kr(1)
39657 .sr(1)
39658 .m(2)
39659 .n(4)
39660 .k(1)
39661 .cm_stride(7)
39662 .Test(xnn_f32_gemminc_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39663 }
39664#endif // XNN_ARCH_WASM
39665
39666
39667#if XNN_ARCH_WASM
39668 TEST(F32_GEMMINC_4X4__WASM, k_eq_1) {
39669 GemmMicrokernelTester()
39670 .mr(4)
39671 .nr(4)
39672 .kr(1)
39673 .sr(1)
39674 .m(4)
39675 .n(4)
39676 .k(1)
39677 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39678 }
39679
39680 TEST(F32_GEMMINC_4X4__WASM, strided_cn) {
39681 GemmMicrokernelTester()
39682 .mr(4)
39683 .nr(4)
39684 .kr(1)
39685 .sr(1)
39686 .m(4)
39687 .n(4)
39688 .k(1)
39689 .cn_stride(7)
39690 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39691 }
39692
39693 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_strided_a) {
39694 GemmMicrokernelTester()
39695 .mr(4)
39696 .nr(4)
39697 .kr(1)
39698 .sr(1)
39699 .m(4)
39700 .n(4)
39701 .k(1)
39702 .a_stride(3)
39703 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39704 }
39705
39706 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_subtile) {
39707 for (uint32_t m = 1; m <= 4; m++) {
39708 for (uint32_t n = 1; n <= 4; n++) {
39709 GemmMicrokernelTester()
39710 .mr(4)
39711 .nr(4)
39712 .kr(1)
39713 .sr(1)
39714 .m(m)
39715 .n(n)
39716 .k(1)
39717 .iterations(1)
39718 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39719 }
39720 }
39721 }
39722
39723 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_subtile_m) {
39724 for (uint32_t m = 1; m <= 4; m++) {
39725 GemmMicrokernelTester()
39726 .mr(4)
39727 .nr(4)
39728 .kr(1)
39729 .sr(1)
39730 .m(m)
39731 .n(4)
39732 .k(1)
39733 .iterations(1)
39734 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39735 }
39736 }
39737
39738 TEST(F32_GEMMINC_4X4__WASM, k_eq_1_subtile_n) {
39739 for (uint32_t n = 1; n <= 4; n++) {
39740 GemmMicrokernelTester()
39741 .mr(4)
39742 .nr(4)
39743 .kr(1)
39744 .sr(1)
39745 .m(4)
39746 .n(n)
39747 .k(1)
39748 .iterations(1)
39749 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39750 }
39751 }
39752
39753 TEST(F32_GEMMINC_4X4__WASM, k_gt_1) {
39754 for (size_t k = 2; k < 10; k++) {
39755 GemmMicrokernelTester()
39756 .mr(4)
39757 .nr(4)
39758 .kr(1)
39759 .sr(1)
39760 .m(4)
39761 .n(4)
39762 .k(k)
39763 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39764 }
39765 }
39766
39767 TEST(F32_GEMMINC_4X4__WASM, k_gt_1_strided_a) {
39768 for (size_t k = 2; k < 10; k++) {
39769 GemmMicrokernelTester()
39770 .mr(4)
39771 .nr(4)
39772 .kr(1)
39773 .sr(1)
39774 .m(4)
39775 .n(4)
39776 .k(k)
39777 .a_stride(11)
39778 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39779 }
39780 }
39781
39782 TEST(F32_GEMMINC_4X4__WASM, k_gt_1_subtile) {
39783 for (size_t k = 2; k < 10; k++) {
39784 for (uint32_t m = 1; m <= 4; m++) {
39785 for (uint32_t n = 1; n <= 4; n++) {
39786 GemmMicrokernelTester()
39787 .mr(4)
39788 .nr(4)
39789 .kr(1)
39790 .sr(1)
39791 .m(m)
39792 .n(n)
39793 .k(k)
39794 .iterations(1)
39795 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39796 }
39797 }
39798 }
39799 }
39800
39801 TEST(F32_GEMMINC_4X4__WASM, n_gt_4) {
39802 for (uint32_t n = 5; n < 8; n++) {
39803 for (size_t k = 1; k <= 5; k += 2) {
39804 GemmMicrokernelTester()
39805 .mr(4)
39806 .nr(4)
39807 .kr(1)
39808 .sr(1)
39809 .m(4)
39810 .n(4)
39811 .k(k)
39812 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39813 }
39814 }
39815 }
39816
39817 TEST(F32_GEMMINC_4X4__WASM, n_gt_4_strided_cn) {
39818 for (uint32_t n = 5; n < 8; n++) {
39819 for (size_t k = 1; k <= 5; k += 2) {
39820 GemmMicrokernelTester()
39821 .mr(4)
39822 .nr(4)
39823 .kr(1)
39824 .sr(1)
39825 .m(4)
39826 .n(4)
39827 .k(k)
39828 .cn_stride(7)
39829 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39830 }
39831 }
39832 }
39833
39834 TEST(F32_GEMMINC_4X4__WASM, n_gt_4_strided_a) {
39835 for (uint32_t n = 5; n < 8; n++) {
39836 for (size_t k = 1; k <= 5; k += 2) {
39837 GemmMicrokernelTester()
39838 .mr(4)
39839 .nr(4)
39840 .kr(1)
39841 .sr(1)
39842 .m(4)
39843 .n(n)
39844 .k(k)
39845 .a_stride(7)
39846 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39847 }
39848 }
39849 }
39850
39851 TEST(F32_GEMMINC_4X4__WASM, n_gt_4_subtile) {
39852 for (uint32_t n = 5; n < 8; n++) {
39853 for (size_t k = 1; k <= 5; k += 2) {
39854 for (uint32_t m = 1; m <= 4; m++) {
39855 GemmMicrokernelTester()
39856 .mr(4)
39857 .nr(4)
39858 .kr(1)
39859 .sr(1)
39860 .m(m)
39861 .n(n)
39862 .k(k)
39863 .iterations(1)
39864 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39865 }
39866 }
39867 }
39868 }
39869
39870 TEST(F32_GEMMINC_4X4__WASM, n_div_4) {
39871 for (uint32_t n = 8; n <= 12; n += 4) {
39872 for (size_t k = 1; k <= 5; k += 2) {
39873 GemmMicrokernelTester()
39874 .mr(4)
39875 .nr(4)
39876 .kr(1)
39877 .sr(1)
39878 .m(4)
39879 .n(4)
39880 .k(k)
39881 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39882 }
39883 }
39884 }
39885
39886 TEST(F32_GEMMINC_4X4__WASM, n_div_4_strided_cn) {
39887 for (uint32_t n = 8; n <= 12; n += 4) {
39888 for (size_t k = 1; k <= 5; k += 2) {
39889 GemmMicrokernelTester()
39890 .mr(4)
39891 .nr(4)
39892 .kr(1)
39893 .sr(1)
39894 .m(4)
39895 .n(n)
39896 .k(k)
39897 .cn_stride(7)
39898 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39899 }
39900 }
39901 }
39902
39903 TEST(F32_GEMMINC_4X4__WASM, n_div_4_strided_a) {
39904 for (uint32_t n = 8; n <= 12; n += 4) {
39905 for (size_t k = 1; k <= 5; k += 2) {
39906 GemmMicrokernelTester()
39907 .mr(4)
39908 .nr(4)
39909 .kr(1)
39910 .sr(1)
39911 .m(4)
39912 .n(n)
39913 .k(k)
39914 .a_stride(7)
39915 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39916 }
39917 }
39918 }
39919
39920 TEST(F32_GEMMINC_4X4__WASM, n_div_4_subtile) {
39921 for (uint32_t n = 8; n <= 12; n += 4) {
39922 for (size_t k = 1; k <= 5; k += 2) {
39923 for (uint32_t m = 1; m <= 4; m++) {
39924 GemmMicrokernelTester()
39925 .mr(4)
39926 .nr(4)
39927 .kr(1)
39928 .sr(1)
39929 .m(m)
39930 .n(n)
39931 .k(k)
39932 .iterations(1)
39933 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39934 }
39935 }
39936 }
39937 }
39938
39939 TEST(F32_GEMMINC_4X4__WASM, strided_cm_subtile) {
39940 for (size_t k = 1; k <= 5; k += 2) {
39941 for (uint32_t m = 1; m <= 4; m++) {
39942 for (uint32_t n = 1; n <= 4; n++) {
39943 GemmMicrokernelTester()
39944 .mr(4)
39945 .nr(4)
39946 .kr(1)
39947 .sr(1)
39948 .m(m)
39949 .n(n)
39950 .k(k)
39951 .cm_stride(7)
39952 .iterations(1)
39953 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39954 }
39955 }
39956 }
39957 }
39958
39959 TEST(F32_GEMMINC_4X4__WASM, qmin) {
39960 GemmMicrokernelTester()
39961 .mr(4)
39962 .nr(4)
39963 .kr(1)
39964 .sr(1)
39965 .m(4)
39966 .n(4)
39967 .k(1)
39968 .qmin(128)
39969 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39970 }
39971
39972 TEST(F32_GEMMINC_4X4__WASM, qmax) {
39973 GemmMicrokernelTester()
39974 .mr(4)
39975 .nr(4)
39976 .kr(1)
39977 .sr(1)
39978 .m(4)
39979 .n(4)
39980 .k(1)
39981 .qmax(128)
39982 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39983 }
39984
39985 TEST(F32_GEMMINC_4X4__WASM, strided_cm) {
39986 GemmMicrokernelTester()
39987 .mr(4)
39988 .nr(4)
39989 .kr(1)
39990 .sr(1)
39991 .m(4)
39992 .n(4)
39993 .k(1)
39994 .cm_stride(7)
39995 .Test(xnn_f32_gemminc_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
39996 }
39997#endif // XNN_ARCH_WASM
39998
39999
XNNPACK Teamb455b122019-09-27 18:10:33 -070040000TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1) {
40001 GemmMicrokernelTester()
40002 .mr(1)
40003 .nr(4)
40004 .kr(1)
40005 .sr(1)
40006 .m(1)
40007 .n(4)
40008 .k(1)
40009 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40010}
40011
40012TEST(F32_GEMMINC_1X4__SCALAR, strided_cn) {
40013 GemmMicrokernelTester()
40014 .mr(1)
40015 .nr(4)
40016 .kr(1)
40017 .sr(1)
40018 .m(1)
40019 .n(4)
40020 .k(1)
40021 .cn_stride(7)
40022 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40023}
40024
40025TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_strided_a) {
40026 GemmMicrokernelTester()
40027 .mr(1)
40028 .nr(4)
40029 .kr(1)
40030 .sr(1)
40031 .m(1)
40032 .n(4)
40033 .k(1)
40034 .a_stride(3)
40035 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40036}
40037
40038TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_subtile) {
40039 for (uint32_t m = 1; m <= 1; m++) {
40040 for (uint32_t n = 1; n <= 4; n++) {
40041 GemmMicrokernelTester()
40042 .mr(1)
40043 .nr(4)
40044 .kr(1)
40045 .sr(1)
40046 .m(m)
40047 .n(n)
40048 .k(1)
40049 .iterations(1)
40050 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40051 }
40052 }
40053}
40054
40055TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_subtile_m) {
40056 for (uint32_t m = 1; m <= 1; m++) {
40057 GemmMicrokernelTester()
40058 .mr(1)
40059 .nr(4)
40060 .kr(1)
40061 .sr(1)
40062 .m(m)
40063 .n(4)
40064 .k(1)
40065 .iterations(1)
40066 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40067 }
40068}
40069
40070TEST(F32_GEMMINC_1X4__SCALAR, k_eq_1_subtile_n) {
40071 for (uint32_t n = 1; n <= 4; n++) {
40072 GemmMicrokernelTester()
40073 .mr(1)
40074 .nr(4)
40075 .kr(1)
40076 .sr(1)
40077 .m(1)
40078 .n(n)
40079 .k(1)
40080 .iterations(1)
40081 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40082 }
40083}
40084
40085TEST(F32_GEMMINC_1X4__SCALAR, k_gt_1) {
40086 for (size_t k = 2; k < 10; k++) {
40087 GemmMicrokernelTester()
40088 .mr(1)
40089 .nr(4)
40090 .kr(1)
40091 .sr(1)
40092 .m(1)
40093 .n(4)
40094 .k(k)
40095 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40096 }
40097}
40098
40099TEST(F32_GEMMINC_1X4__SCALAR, k_gt_1_strided_a) {
40100 for (size_t k = 2; k < 10; k++) {
40101 GemmMicrokernelTester()
40102 .mr(1)
40103 .nr(4)
40104 .kr(1)
40105 .sr(1)
40106 .m(1)
40107 .n(4)
40108 .k(k)
40109 .a_stride(11)
40110 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40111 }
40112}
40113
40114TEST(F32_GEMMINC_1X4__SCALAR, k_gt_1_subtile) {
40115 for (size_t k = 2; k < 10; k++) {
40116 for (uint32_t m = 1; m <= 1; m++) {
40117 for (uint32_t n = 1; n <= 4; n++) {
40118 GemmMicrokernelTester()
40119 .mr(1)
40120 .nr(4)
40121 .kr(1)
40122 .sr(1)
40123 .m(m)
40124 .n(n)
40125 .k(k)
40126 .iterations(1)
40127 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40128 }
40129 }
40130 }
40131}
40132
40133TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4) {
40134 for (uint32_t n = 5; n < 8; n++) {
40135 for (size_t k = 1; k <= 5; k += 2) {
40136 GemmMicrokernelTester()
40137 .mr(1)
40138 .nr(4)
40139 .kr(1)
40140 .sr(1)
40141 .m(1)
40142 .n(4)
40143 .k(k)
40144 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40145 }
40146 }
40147}
40148
40149TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4_strided_cn) {
40150 for (uint32_t n = 5; n < 8; n++) {
40151 for (size_t k = 1; k <= 5; k += 2) {
40152 GemmMicrokernelTester()
40153 .mr(1)
40154 .nr(4)
40155 .kr(1)
40156 .sr(1)
40157 .m(1)
40158 .n(4)
40159 .k(k)
40160 .cn_stride(7)
40161 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40162 }
40163 }
40164}
40165
40166TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4_strided_a) {
40167 for (uint32_t n = 5; n < 8; n++) {
40168 for (size_t k = 1; k <= 5; k += 2) {
40169 GemmMicrokernelTester()
40170 .mr(1)
40171 .nr(4)
40172 .kr(1)
40173 .sr(1)
40174 .m(1)
40175 .n(n)
40176 .k(k)
40177 .a_stride(7)
40178 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40179 }
40180 }
40181}
40182
40183TEST(F32_GEMMINC_1X4__SCALAR, n_gt_4_subtile) {
40184 for (uint32_t n = 5; n < 8; n++) {
40185 for (size_t k = 1; k <= 5; k += 2) {
40186 for (uint32_t m = 1; m <= 1; m++) {
40187 GemmMicrokernelTester()
40188 .mr(1)
40189 .nr(4)
40190 .kr(1)
40191 .sr(1)
40192 .m(m)
40193 .n(n)
40194 .k(k)
40195 .iterations(1)
40196 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40197 }
40198 }
40199 }
40200}
40201
40202TEST(F32_GEMMINC_1X4__SCALAR, n_div_4) {
40203 for (uint32_t n = 8; n <= 12; n += 4) {
40204 for (size_t k = 1; k <= 5; k += 2) {
40205 GemmMicrokernelTester()
40206 .mr(1)
40207 .nr(4)
40208 .kr(1)
40209 .sr(1)
40210 .m(1)
40211 .n(4)
40212 .k(k)
40213 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40214 }
40215 }
40216}
40217
40218TEST(F32_GEMMINC_1X4__SCALAR, n_div_4_strided_cn) {
40219 for (uint32_t n = 8; n <= 12; n += 4) {
40220 for (size_t k = 1; k <= 5; k += 2) {
40221 GemmMicrokernelTester()
40222 .mr(1)
40223 .nr(4)
40224 .kr(1)
40225 .sr(1)
40226 .m(1)
40227 .n(n)
40228 .k(k)
40229 .cn_stride(7)
40230 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40231 }
40232 }
40233}
40234
40235TEST(F32_GEMMINC_1X4__SCALAR, n_div_4_strided_a) {
40236 for (uint32_t n = 8; n <= 12; n += 4) {
40237 for (size_t k = 1; k <= 5; k += 2) {
40238 GemmMicrokernelTester()
40239 .mr(1)
40240 .nr(4)
40241 .kr(1)
40242 .sr(1)
40243 .m(1)
40244 .n(n)
40245 .k(k)
40246 .a_stride(7)
40247 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40248 }
40249 }
40250}
40251
40252TEST(F32_GEMMINC_1X4__SCALAR, n_div_4_subtile) {
40253 for (uint32_t n = 8; n <= 12; n += 4) {
40254 for (size_t k = 1; k <= 5; k += 2) {
40255 for (uint32_t m = 1; m <= 1; m++) {
40256 GemmMicrokernelTester()
40257 .mr(1)
40258 .nr(4)
40259 .kr(1)
40260 .sr(1)
40261 .m(m)
40262 .n(n)
40263 .k(k)
40264 .iterations(1)
40265 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40266 }
40267 }
40268 }
40269}
40270
40271TEST(F32_GEMMINC_1X4__SCALAR, strided_cm_subtile) {
40272 for (size_t k = 1; k <= 5; k += 2) {
40273 for (uint32_t m = 1; m <= 1; m++) {
40274 for (uint32_t n = 1; n <= 4; n++) {
40275 GemmMicrokernelTester()
40276 .mr(1)
40277 .nr(4)
40278 .kr(1)
40279 .sr(1)
40280 .m(m)
40281 .n(n)
40282 .k(k)
40283 .cm_stride(7)
40284 .iterations(1)
40285 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40286 }
40287 }
40288 }
40289}
40290
40291TEST(F32_GEMMINC_1X4__SCALAR, qmin) {
40292 GemmMicrokernelTester()
40293 .mr(1)
40294 .nr(4)
40295 .kr(1)
40296 .sr(1)
40297 .m(1)
40298 .n(4)
40299 .k(1)
40300 .qmin(128)
40301 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40302}
40303
40304TEST(F32_GEMMINC_1X4__SCALAR, qmax) {
40305 GemmMicrokernelTester()
40306 .mr(1)
40307 .nr(4)
40308 .kr(1)
40309 .sr(1)
40310 .m(1)
40311 .n(4)
40312 .k(1)
40313 .qmax(128)
40314 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40315}
40316
40317TEST(F32_GEMMINC_1X4__SCALAR, strided_cm) {
40318 GemmMicrokernelTester()
40319 .mr(1)
40320 .nr(4)
40321 .kr(1)
40322 .sr(1)
40323 .m(1)
40324 .n(4)
40325 .k(1)
40326 .cm_stride(7)
40327 .Test(xnn_f32_gemminc_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40328}
40329
40330
40331TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1) {
40332 GemmMicrokernelTester()
40333 .mr(2)
40334 .nr(4)
40335 .kr(1)
40336 .sr(1)
40337 .m(2)
40338 .n(4)
40339 .k(1)
40340 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40341}
40342
40343TEST(F32_GEMMINC_2X4__SCALAR, strided_cn) {
40344 GemmMicrokernelTester()
40345 .mr(2)
40346 .nr(4)
40347 .kr(1)
40348 .sr(1)
40349 .m(2)
40350 .n(4)
40351 .k(1)
40352 .cn_stride(7)
40353 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40354}
40355
40356TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_strided_a) {
40357 GemmMicrokernelTester()
40358 .mr(2)
40359 .nr(4)
40360 .kr(1)
40361 .sr(1)
40362 .m(2)
40363 .n(4)
40364 .k(1)
40365 .a_stride(3)
40366 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40367}
40368
40369TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_subtile) {
40370 for (uint32_t m = 1; m <= 2; m++) {
40371 for (uint32_t n = 1; n <= 4; n++) {
40372 GemmMicrokernelTester()
40373 .mr(2)
40374 .nr(4)
40375 .kr(1)
40376 .sr(1)
40377 .m(m)
40378 .n(n)
40379 .k(1)
40380 .iterations(1)
40381 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40382 }
40383 }
40384}
40385
40386TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_subtile_m) {
40387 for (uint32_t m = 1; m <= 2; m++) {
40388 GemmMicrokernelTester()
40389 .mr(2)
40390 .nr(4)
40391 .kr(1)
40392 .sr(1)
40393 .m(m)
40394 .n(4)
40395 .k(1)
40396 .iterations(1)
40397 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40398 }
40399}
40400
40401TEST(F32_GEMMINC_2X4__SCALAR, k_eq_1_subtile_n) {
40402 for (uint32_t n = 1; n <= 4; n++) {
40403 GemmMicrokernelTester()
40404 .mr(2)
40405 .nr(4)
40406 .kr(1)
40407 .sr(1)
40408 .m(2)
40409 .n(n)
40410 .k(1)
40411 .iterations(1)
40412 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40413 }
40414}
40415
40416TEST(F32_GEMMINC_2X4__SCALAR, k_gt_1) {
40417 for (size_t k = 2; k < 10; k++) {
40418 GemmMicrokernelTester()
40419 .mr(2)
40420 .nr(4)
40421 .kr(1)
40422 .sr(1)
40423 .m(2)
40424 .n(4)
40425 .k(k)
40426 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40427 }
40428}
40429
40430TEST(F32_GEMMINC_2X4__SCALAR, k_gt_1_strided_a) {
40431 for (size_t k = 2; k < 10; k++) {
40432 GemmMicrokernelTester()
40433 .mr(2)
40434 .nr(4)
40435 .kr(1)
40436 .sr(1)
40437 .m(2)
40438 .n(4)
40439 .k(k)
40440 .a_stride(11)
40441 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40442 }
40443}
40444
40445TEST(F32_GEMMINC_2X4__SCALAR, k_gt_1_subtile) {
40446 for (size_t k = 2; k < 10; k++) {
40447 for (uint32_t m = 1; m <= 2; m++) {
40448 for (uint32_t n = 1; n <= 4; n++) {
40449 GemmMicrokernelTester()
40450 .mr(2)
40451 .nr(4)
40452 .kr(1)
40453 .sr(1)
40454 .m(m)
40455 .n(n)
40456 .k(k)
40457 .iterations(1)
40458 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40459 }
40460 }
40461 }
40462}
40463
40464TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4) {
40465 for (uint32_t n = 5; n < 8; n++) {
40466 for (size_t k = 1; k <= 5; k += 2) {
40467 GemmMicrokernelTester()
40468 .mr(2)
40469 .nr(4)
40470 .kr(1)
40471 .sr(1)
40472 .m(2)
40473 .n(4)
40474 .k(k)
40475 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40476 }
40477 }
40478}
40479
40480TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4_strided_cn) {
40481 for (uint32_t n = 5; n < 8; n++) {
40482 for (size_t k = 1; k <= 5; k += 2) {
40483 GemmMicrokernelTester()
40484 .mr(2)
40485 .nr(4)
40486 .kr(1)
40487 .sr(1)
40488 .m(2)
40489 .n(4)
40490 .k(k)
40491 .cn_stride(7)
40492 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40493 }
40494 }
40495}
40496
40497TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4_strided_a) {
40498 for (uint32_t n = 5; n < 8; n++) {
40499 for (size_t k = 1; k <= 5; k += 2) {
40500 GemmMicrokernelTester()
40501 .mr(2)
40502 .nr(4)
40503 .kr(1)
40504 .sr(1)
40505 .m(2)
40506 .n(n)
40507 .k(k)
40508 .a_stride(7)
40509 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40510 }
40511 }
40512}
40513
40514TEST(F32_GEMMINC_2X4__SCALAR, n_gt_4_subtile) {
40515 for (uint32_t n = 5; n < 8; n++) {
40516 for (size_t k = 1; k <= 5; k += 2) {
40517 for (uint32_t m = 1; m <= 2; m++) {
40518 GemmMicrokernelTester()
40519 .mr(2)
40520 .nr(4)
40521 .kr(1)
40522 .sr(1)
40523 .m(m)
40524 .n(n)
40525 .k(k)
40526 .iterations(1)
40527 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40528 }
40529 }
40530 }
40531}
40532
40533TEST(F32_GEMMINC_2X4__SCALAR, n_div_4) {
40534 for (uint32_t n = 8; n <= 12; n += 4) {
40535 for (size_t k = 1; k <= 5; k += 2) {
40536 GemmMicrokernelTester()
40537 .mr(2)
40538 .nr(4)
40539 .kr(1)
40540 .sr(1)
40541 .m(2)
40542 .n(4)
40543 .k(k)
40544 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40545 }
40546 }
40547}
40548
40549TEST(F32_GEMMINC_2X4__SCALAR, n_div_4_strided_cn) {
40550 for (uint32_t n = 8; n <= 12; n += 4) {
40551 for (size_t k = 1; k <= 5; k += 2) {
40552 GemmMicrokernelTester()
40553 .mr(2)
40554 .nr(4)
40555 .kr(1)
40556 .sr(1)
40557 .m(2)
40558 .n(n)
40559 .k(k)
40560 .cn_stride(7)
40561 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40562 }
40563 }
40564}
40565
40566TEST(F32_GEMMINC_2X4__SCALAR, n_div_4_strided_a) {
40567 for (uint32_t n = 8; n <= 12; n += 4) {
40568 for (size_t k = 1; k <= 5; k += 2) {
40569 GemmMicrokernelTester()
40570 .mr(2)
40571 .nr(4)
40572 .kr(1)
40573 .sr(1)
40574 .m(2)
40575 .n(n)
40576 .k(k)
40577 .a_stride(7)
40578 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40579 }
40580 }
40581}
40582
40583TEST(F32_GEMMINC_2X4__SCALAR, n_div_4_subtile) {
40584 for (uint32_t n = 8; n <= 12; n += 4) {
40585 for (size_t k = 1; k <= 5; k += 2) {
40586 for (uint32_t m = 1; m <= 2; m++) {
40587 GemmMicrokernelTester()
40588 .mr(2)
40589 .nr(4)
40590 .kr(1)
40591 .sr(1)
40592 .m(m)
40593 .n(n)
40594 .k(k)
40595 .iterations(1)
40596 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40597 }
40598 }
40599 }
40600}
40601
40602TEST(F32_GEMMINC_2X4__SCALAR, strided_cm_subtile) {
40603 for (size_t k = 1; k <= 5; k += 2) {
40604 for (uint32_t m = 1; m <= 2; m++) {
40605 for (uint32_t n = 1; n <= 4; n++) {
40606 GemmMicrokernelTester()
40607 .mr(2)
40608 .nr(4)
40609 .kr(1)
40610 .sr(1)
40611 .m(m)
40612 .n(n)
40613 .k(k)
40614 .cm_stride(7)
40615 .iterations(1)
40616 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40617 }
40618 }
40619 }
40620}
40621
40622TEST(F32_GEMMINC_2X4__SCALAR, qmin) {
40623 GemmMicrokernelTester()
40624 .mr(2)
40625 .nr(4)
40626 .kr(1)
40627 .sr(1)
40628 .m(2)
40629 .n(4)
40630 .k(1)
40631 .qmin(128)
40632 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40633}
40634
40635TEST(F32_GEMMINC_2X4__SCALAR, qmax) {
40636 GemmMicrokernelTester()
40637 .mr(2)
40638 .nr(4)
40639 .kr(1)
40640 .sr(1)
40641 .m(2)
40642 .n(4)
40643 .k(1)
40644 .qmax(128)
40645 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40646}
40647
40648TEST(F32_GEMMINC_2X4__SCALAR, strided_cm) {
40649 GemmMicrokernelTester()
40650 .mr(2)
40651 .nr(4)
40652 .kr(1)
40653 .sr(1)
40654 .m(2)
40655 .n(4)
40656 .k(1)
40657 .cm_stride(7)
40658 .Test(xnn_f32_gemminc_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40659}
40660
40661
40662TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1) {
40663 GemmMicrokernelTester()
40664 .mr(4)
40665 .nr(4)
40666 .kr(1)
40667 .sr(1)
40668 .m(4)
40669 .n(4)
40670 .k(1)
40671 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40672}
40673
40674TEST(F32_GEMMINC_4X4__SCALAR, strided_cn) {
40675 GemmMicrokernelTester()
40676 .mr(4)
40677 .nr(4)
40678 .kr(1)
40679 .sr(1)
40680 .m(4)
40681 .n(4)
40682 .k(1)
40683 .cn_stride(7)
40684 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40685}
40686
40687TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_strided_a) {
40688 GemmMicrokernelTester()
40689 .mr(4)
40690 .nr(4)
40691 .kr(1)
40692 .sr(1)
40693 .m(4)
40694 .n(4)
40695 .k(1)
40696 .a_stride(3)
40697 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40698}
40699
40700TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_subtile) {
40701 for (uint32_t m = 1; m <= 4; m++) {
40702 for (uint32_t n = 1; n <= 4; n++) {
40703 GemmMicrokernelTester()
40704 .mr(4)
40705 .nr(4)
40706 .kr(1)
40707 .sr(1)
40708 .m(m)
40709 .n(n)
40710 .k(1)
40711 .iterations(1)
40712 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40713 }
40714 }
40715}
40716
40717TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_subtile_m) {
40718 for (uint32_t m = 1; m <= 4; m++) {
40719 GemmMicrokernelTester()
40720 .mr(4)
40721 .nr(4)
40722 .kr(1)
40723 .sr(1)
40724 .m(m)
40725 .n(4)
40726 .k(1)
40727 .iterations(1)
40728 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40729 }
40730}
40731
40732TEST(F32_GEMMINC_4X4__SCALAR, k_eq_1_subtile_n) {
40733 for (uint32_t n = 1; n <= 4; n++) {
40734 GemmMicrokernelTester()
40735 .mr(4)
40736 .nr(4)
40737 .kr(1)
40738 .sr(1)
40739 .m(4)
40740 .n(n)
40741 .k(1)
40742 .iterations(1)
40743 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40744 }
40745}
40746
40747TEST(F32_GEMMINC_4X4__SCALAR, k_gt_1) {
40748 for (size_t k = 2; k < 10; k++) {
40749 GemmMicrokernelTester()
40750 .mr(4)
40751 .nr(4)
40752 .kr(1)
40753 .sr(1)
40754 .m(4)
40755 .n(4)
40756 .k(k)
40757 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40758 }
40759}
40760
40761TEST(F32_GEMMINC_4X4__SCALAR, k_gt_1_strided_a) {
40762 for (size_t k = 2; k < 10; k++) {
40763 GemmMicrokernelTester()
40764 .mr(4)
40765 .nr(4)
40766 .kr(1)
40767 .sr(1)
40768 .m(4)
40769 .n(4)
40770 .k(k)
40771 .a_stride(11)
40772 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40773 }
40774}
40775
40776TEST(F32_GEMMINC_4X4__SCALAR, k_gt_1_subtile) {
40777 for (size_t k = 2; k < 10; k++) {
40778 for (uint32_t m = 1; m <= 4; m++) {
40779 for (uint32_t n = 1; n <= 4; n++) {
40780 GemmMicrokernelTester()
40781 .mr(4)
40782 .nr(4)
40783 .kr(1)
40784 .sr(1)
40785 .m(m)
40786 .n(n)
40787 .k(k)
40788 .iterations(1)
40789 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40790 }
40791 }
40792 }
40793}
40794
40795TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4) {
40796 for (uint32_t n = 5; n < 8; n++) {
40797 for (size_t k = 1; k <= 5; k += 2) {
40798 GemmMicrokernelTester()
40799 .mr(4)
40800 .nr(4)
40801 .kr(1)
40802 .sr(1)
40803 .m(4)
40804 .n(4)
40805 .k(k)
40806 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40807 }
40808 }
40809}
40810
40811TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4_strided_cn) {
40812 for (uint32_t n = 5; n < 8; n++) {
40813 for (size_t k = 1; k <= 5; k += 2) {
40814 GemmMicrokernelTester()
40815 .mr(4)
40816 .nr(4)
40817 .kr(1)
40818 .sr(1)
40819 .m(4)
40820 .n(4)
40821 .k(k)
40822 .cn_stride(7)
40823 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40824 }
40825 }
40826}
40827
40828TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4_strided_a) {
40829 for (uint32_t n = 5; n < 8; n++) {
40830 for (size_t k = 1; k <= 5; k += 2) {
40831 GemmMicrokernelTester()
40832 .mr(4)
40833 .nr(4)
40834 .kr(1)
40835 .sr(1)
40836 .m(4)
40837 .n(n)
40838 .k(k)
40839 .a_stride(7)
40840 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40841 }
40842 }
40843}
40844
40845TEST(F32_GEMMINC_4X4__SCALAR, n_gt_4_subtile) {
40846 for (uint32_t n = 5; n < 8; n++) {
40847 for (size_t k = 1; k <= 5; k += 2) {
40848 for (uint32_t m = 1; m <= 4; m++) {
40849 GemmMicrokernelTester()
40850 .mr(4)
40851 .nr(4)
40852 .kr(1)
40853 .sr(1)
40854 .m(m)
40855 .n(n)
40856 .k(k)
40857 .iterations(1)
40858 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40859 }
40860 }
40861 }
40862}
40863
40864TEST(F32_GEMMINC_4X4__SCALAR, n_div_4) {
40865 for (uint32_t n = 8; n <= 12; n += 4) {
40866 for (size_t k = 1; k <= 5; k += 2) {
40867 GemmMicrokernelTester()
40868 .mr(4)
40869 .nr(4)
40870 .kr(1)
40871 .sr(1)
40872 .m(4)
40873 .n(4)
40874 .k(k)
40875 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40876 }
40877 }
40878}
40879
40880TEST(F32_GEMMINC_4X4__SCALAR, n_div_4_strided_cn) {
40881 for (uint32_t n = 8; n <= 12; n += 4) {
40882 for (size_t k = 1; k <= 5; k += 2) {
40883 GemmMicrokernelTester()
40884 .mr(4)
40885 .nr(4)
40886 .kr(1)
40887 .sr(1)
40888 .m(4)
40889 .n(n)
40890 .k(k)
40891 .cn_stride(7)
40892 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40893 }
40894 }
40895}
40896
40897TEST(F32_GEMMINC_4X4__SCALAR, n_div_4_strided_a) {
40898 for (uint32_t n = 8; n <= 12; n += 4) {
40899 for (size_t k = 1; k <= 5; k += 2) {
40900 GemmMicrokernelTester()
40901 .mr(4)
40902 .nr(4)
40903 .kr(1)
40904 .sr(1)
40905 .m(4)
40906 .n(n)
40907 .k(k)
40908 .a_stride(7)
40909 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40910 }
40911 }
40912}
40913
40914TEST(F32_GEMMINC_4X4__SCALAR, n_div_4_subtile) {
40915 for (uint32_t n = 8; n <= 12; n += 4) {
40916 for (size_t k = 1; k <= 5; k += 2) {
40917 for (uint32_t m = 1; m <= 4; m++) {
40918 GemmMicrokernelTester()
40919 .mr(4)
40920 .nr(4)
40921 .kr(1)
40922 .sr(1)
40923 .m(m)
40924 .n(n)
40925 .k(k)
40926 .iterations(1)
40927 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40928 }
40929 }
40930 }
40931}
40932
40933TEST(F32_GEMMINC_4X4__SCALAR, strided_cm_subtile) {
40934 for (size_t k = 1; k <= 5; k += 2) {
40935 for (uint32_t m = 1; m <= 4; m++) {
40936 for (uint32_t n = 1; n <= 4; n++) {
40937 GemmMicrokernelTester()
40938 .mr(4)
40939 .nr(4)
40940 .kr(1)
40941 .sr(1)
40942 .m(m)
40943 .n(n)
40944 .k(k)
40945 .cm_stride(7)
40946 .iterations(1)
40947 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40948 }
40949 }
40950 }
40951}
40952
40953TEST(F32_GEMMINC_4X4__SCALAR, qmin) {
40954 GemmMicrokernelTester()
40955 .mr(4)
40956 .nr(4)
40957 .kr(1)
40958 .sr(1)
40959 .m(4)
40960 .n(4)
40961 .k(1)
40962 .qmin(128)
40963 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40964}
40965
40966TEST(F32_GEMMINC_4X4__SCALAR, qmax) {
40967 GemmMicrokernelTester()
40968 .mr(4)
40969 .nr(4)
40970 .kr(1)
40971 .sr(1)
40972 .m(4)
40973 .n(4)
40974 .k(1)
40975 .qmax(128)
40976 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40977}
40978
40979TEST(F32_GEMMINC_4X4__SCALAR, strided_cm) {
40980 GemmMicrokernelTester()
40981 .mr(4)
40982 .nr(4)
40983 .kr(1)
40984 .sr(1)
40985 .m(4)
40986 .n(4)
40987 .k(1)
40988 .cm_stride(7)
40989 .Test(xnn_f32_gemminc_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
40990}