blob: 1350a23f7b061913c8ad9c584c0c07d4946db9e0 [file] [log] [blame]
// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
//
// Auto-generated file. Do not edit!
// Specification: test/f32-gemm.yaml
// Generator: tools/generate-gemm-test.py
12
13
XNNPACK Teamb455b122019-09-27 18:10:33 -070014#include <gtest/gtest.h>
15
Marat Dukhan1dadbf72019-10-01 10:46:20 -070016#include <xnnpack/common.h>
17#include <xnnpack/isa-checks.h>
18
XNNPACK Teamb455b122019-09-27 18:10:33 -070019#include <xnnpack/gemm.h>
20#include <xnnpack/igemm.h>
21#include <xnnpack/ppmm.h>
XNNPACK Teamb455b122019-09-27 18:10:33 -070022#include "gemm-microkernel-tester.h"
23
24
Frank Barchard7e955972019-10-11 10:34:25 -070025#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard21be34f2019-10-09 19:32:19 -070026 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
27 TEST_REQUIRES_ARM_NEON_FMA;
28 GemmMicrokernelTester()
29 .mr(1)
30 .nr(8)
31 .kr(1)
32 .sr(1)
33 .m(1)
34 .n(8)
35 .k(8)
36 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
37 }
38
39 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
40 TEST_REQUIRES_ARM_NEON_FMA;
41 GemmMicrokernelTester()
42 .mr(1)
43 .nr(8)
44 .kr(1)
45 .sr(1)
46 .m(1)
47 .n(8)
48 .k(8)
49 .cn_stride(11)
50 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
51 }
52
53 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
54 TEST_REQUIRES_ARM_NEON_FMA;
55 GemmMicrokernelTester()
56 .mr(1)
57 .nr(8)
58 .kr(1)
59 .sr(1)
60 .m(1)
61 .n(8)
62 .k(8)
63 .a_stride(11)
64 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
65 }
66
67 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
68 TEST_REQUIRES_ARM_NEON_FMA;
69 for (uint32_t m = 1; m <= 1; m++) {
70 for (uint32_t n = 1; n <= 8; n++) {
71 GemmMicrokernelTester()
72 .mr(1)
73 .nr(8)
74 .kr(1)
75 .sr(1)
76 .m(m)
77 .n(n)
78 .k(8)
79 .iterations(1)
80 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
81 }
82 }
83 }
84
85 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_m) {
86 TEST_REQUIRES_ARM_NEON_FMA;
87 for (uint32_t m = 1; m <= 1; m++) {
88 GemmMicrokernelTester()
89 .mr(1)
90 .nr(8)
91 .kr(1)
92 .sr(1)
93 .m(m)
94 .n(8)
95 .k(8)
96 .iterations(1)
97 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
98 }
99 }
100
101 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile_n) {
102 TEST_REQUIRES_ARM_NEON_FMA;
103 for (uint32_t n = 1; n <= 8; n++) {
104 GemmMicrokernelTester()
105 .mr(1)
106 .nr(8)
107 .kr(1)
108 .sr(1)
109 .m(1)
110 .n(n)
111 .k(8)
112 .iterations(1)
113 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
114 }
115 }
116
117 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16) {
118 TEST_REQUIRES_ARM_NEON_FMA;
119 GemmMicrokernelTester()
120 .mr(1)
121 .nr(8)
122 .kr(1)
123 .sr(1)
124 .m(1)
125 .n(8)
126 .k(16)
127 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
128 }
129
130 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_strided_a) {
131 TEST_REQUIRES_ARM_NEON_FMA;
132 GemmMicrokernelTester()
133 .mr(1)
134 .nr(8)
135 .kr(1)
136 .sr(1)
137 .m(1)
138 .n(8)
139 .k(16)
140 .a_stride(19)
141 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
142 }
143
144 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_16_subtile) {
145 TEST_REQUIRES_ARM_NEON_FMA;
146 for (uint32_t m = 1; m <= 1; m++) {
147 for (uint32_t n = 1; n <= 8; n++) {
148 GemmMicrokernelTester()
149 .mr(1)
150 .nr(8)
151 .kr(1)
152 .sr(1)
153 .m(m)
154 .n(n)
155 .k(16)
156 .iterations(1)
157 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
158 }
159 }
160 }
161
162 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16) {
163 TEST_REQUIRES_ARM_NEON_FMA;
164 for (size_t k = 1; k < 16; k++) {
165 GemmMicrokernelTester()
166 .mr(1)
167 .nr(8)
168 .kr(1)
169 .sr(1)
170 .m(1)
171 .n(8)
172 .k(k)
173 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
174 }
175 }
176
177 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_strided_a) {
178 TEST_REQUIRES_ARM_NEON_FMA;
179 for (size_t k = 1; k < 16; k++) {
180 GemmMicrokernelTester()
181 .mr(1)
182 .nr(8)
183 .kr(1)
184 .sr(1)
185 .m(1)
186 .n(8)
187 .k(k)
188 .a_stride(19)
189 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
190 }
191 }
192
193 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_16_subtile) {
194 TEST_REQUIRES_ARM_NEON_FMA;
195 for (size_t k = 1; k < 16; k++) {
196 for (uint32_t m = 1; m <= 1; m++) {
197 for (uint32_t n = 1; n <= 8; n++) {
198 GemmMicrokernelTester()
199 .mr(1)
200 .nr(8)
201 .kr(1)
202 .sr(1)
203 .m(m)
204 .n(n)
205 .k(k)
206 .iterations(1)
207 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
208 }
209 }
210 }
211 }
212
213 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_16) {
214 TEST_REQUIRES_ARM_NEON_FMA;
215 for (size_t k = 17; k < 16; k++) {
216 GemmMicrokernelTester()
217 .mr(1)
218 .nr(8)
219 .kr(1)
220 .sr(1)
221 .m(1)
222 .n(8)
223 .k(k)
224 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
225 }
226 }
227
228 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_strided_a) {
229 TEST_REQUIRES_ARM_NEON_FMA;
230 for (size_t k = 17; k < 16; k++) {
231 GemmMicrokernelTester()
232 .mr(1)
233 .nr(8)
234 .kr(1)
235 .sr(1)
236 .m(1)
237 .n(8)
238 .k(k)
239 .a_stride(19)
240 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
241 }
242 }
243
244 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8_subtile) {
245 TEST_REQUIRES_ARM_NEON_FMA;
246 for (size_t k = 17; k < 16; k++) {
247 for (uint32_t m = 1; m <= 1; m++) {
248 for (uint32_t n = 1; n <= 8; n++) {
249 GemmMicrokernelTester()
250 .mr(1)
251 .nr(8)
252 .kr(1)
253 .sr(1)
254 .m(m)
255 .n(n)
256 .k(k)
257 .iterations(1)
258 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
259 }
260 }
261 }
262 }
263
264 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8) {
265 TEST_REQUIRES_ARM_NEON_FMA;
266 for (size_t k = 24; k <= 80; k += 8) {
267 GemmMicrokernelTester()
268 .mr(1)
269 .nr(8)
270 .kr(1)
271 .sr(1)
272 .m(1)
273 .n(8)
274 .k(k)
275 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
276 }
277 }
278
279 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_strided_a) {
280 TEST_REQUIRES_ARM_NEON_FMA;
281 for (size_t k = 24; k <= 80; k += 8) {
282 GemmMicrokernelTester()
283 .mr(1)
284 .nr(8)
285 .kr(1)
286 .sr(1)
287 .m(1)
288 .n(8)
289 .k(k)
290 .a_stride(83)
291 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
292 }
293 }
294
295 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, k_div_8_subtile) {
296 TEST_REQUIRES_ARM_NEON_FMA;
297 for (size_t k = 24; k <= 80; k += 8) {
298 for (uint32_t m = 1; m <= 1; m++) {
299 for (uint32_t n = 1; n <= 8; n++) {
300 GemmMicrokernelTester()
301 .mr(1)
302 .nr(8)
303 .kr(1)
304 .sr(1)
305 .m(m)
306 .n(n)
307 .k(k)
308 .iterations(1)
309 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
310 }
311 }
312 }
313 }
314
315 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
316 TEST_REQUIRES_ARM_NEON_FMA;
317 for (uint32_t n = 9; n < 16; n++) {
318 for (size_t k = 1; k <= 40; k += 9) {
319 GemmMicrokernelTester()
320 .mr(1)
321 .nr(8)
322 .kr(1)
323 .sr(1)
324 .m(1)
325 .n(8)
326 .k(k)
327 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
328 }
329 }
330 }
331
332 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
333 TEST_REQUIRES_ARM_NEON_FMA;
334 for (uint32_t n = 9; n < 16; n++) {
335 for (size_t k = 1; k <= 40; k += 9) {
336 GemmMicrokernelTester()
337 .mr(1)
338 .nr(8)
339 .kr(1)
340 .sr(1)
341 .m(1)
342 .n(8)
343 .k(k)
344 .cn_stride(11)
345 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
346 }
347 }
348 }
349
350 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
351 TEST_REQUIRES_ARM_NEON_FMA;
352 for (uint32_t n = 9; n < 16; n++) {
353 for (size_t k = 1; k <= 40; k += 9) {
354 GemmMicrokernelTester()
355 .mr(1)
356 .nr(8)
357 .kr(1)
358 .sr(1)
359 .m(1)
360 .n(n)
361 .k(k)
362 .a_stride(43)
363 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
364 }
365 }
366 }
367
368 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
369 TEST_REQUIRES_ARM_NEON_FMA;
370 for (uint32_t n = 9; n < 16; n++) {
371 for (size_t k = 1; k <= 40; k += 9) {
372 for (uint32_t m = 1; m <= 1; m++) {
373 GemmMicrokernelTester()
374 .mr(1)
375 .nr(8)
376 .kr(1)
377 .sr(1)
378 .m(m)
379 .n(n)
380 .k(k)
381 .iterations(1)
382 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
383 }
384 }
385 }
386 }
387
388 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
389 TEST_REQUIRES_ARM_NEON_FMA;
390 for (uint32_t n = 16; n <= 24; n += 8) {
391 for (size_t k = 1; k <= 40; k += 9) {
392 GemmMicrokernelTester()
393 .mr(1)
394 .nr(8)
395 .kr(1)
396 .sr(1)
397 .m(1)
398 .n(8)
399 .k(k)
400 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
401 }
402 }
403 }
404
405 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
406 TEST_REQUIRES_ARM_NEON_FMA;
407 for (uint32_t n = 16; n <= 24; n += 8) {
408 for (size_t k = 1; k <= 40; k += 9) {
409 GemmMicrokernelTester()
410 .mr(1)
411 .nr(8)
412 .kr(1)
413 .sr(1)
414 .m(1)
415 .n(n)
416 .k(k)
417 .cn_stride(11)
418 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
419 }
420 }
421 }
422
423 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
424 TEST_REQUIRES_ARM_NEON_FMA;
425 for (uint32_t n = 16; n <= 24; n += 8) {
426 for (size_t k = 1; k <= 40; k += 9) {
427 GemmMicrokernelTester()
428 .mr(1)
429 .nr(8)
430 .kr(1)
431 .sr(1)
432 .m(1)
433 .n(n)
434 .k(k)
435 .a_stride(43)
436 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
437 }
438 }
439 }
440
441 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
442 TEST_REQUIRES_ARM_NEON_FMA;
443 for (uint32_t n = 16; n <= 24; n += 8) {
444 for (size_t k = 1; k <= 40; k += 9) {
445 for (uint32_t m = 1; m <= 1; m++) {
446 GemmMicrokernelTester()
447 .mr(1)
448 .nr(8)
449 .kr(1)
450 .sr(1)
451 .m(m)
452 .n(n)
453 .k(k)
454 .iterations(1)
455 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
456 }
457 }
458 }
459 }
460
461 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
462 TEST_REQUIRES_ARM_NEON_FMA;
463 for (size_t k = 1; k <= 40; k += 9) {
464 for (uint32_t m = 1; m <= 1; m++) {
465 for (uint32_t n = 1; n <= 8; n++) {
466 GemmMicrokernelTester()
467 .mr(1)
468 .nr(8)
469 .kr(1)
470 .sr(1)
471 .m(m)
472 .n(n)
473 .k(k)
474 .cm_stride(11)
475 .iterations(1)
476 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
477 }
478 }
479 }
480 }
481
482 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
483 TEST_REQUIRES_ARM_NEON_FMA;
484 GemmMicrokernelTester()
485 .mr(1)
486 .nr(8)
487 .kr(1)
488 .sr(1)
489 .m(1)
490 .n(8)
491 .k(8)
492 .qmin(128)
493 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
494 }
495
496 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
497 TEST_REQUIRES_ARM_NEON_FMA;
498 GemmMicrokernelTester()
499 .mr(1)
500 .nr(8)
501 .kr(1)
502 .sr(1)
503 .m(1)
504 .n(8)
505 .k(8)
506 .qmax(128)
507 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
508 }
509
510 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
511 TEST_REQUIRES_ARM_NEON_FMA;
512 GemmMicrokernelTester()
513 .mr(1)
514 .nr(8)
515 .kr(1)
516 .sr(1)
517 .m(1)
518 .n(8)
519 .k(8)
520 .cm_stride(11)
521 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a53);
522 }
Frank Barchard7e955972019-10-11 10:34:25 -0700523#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard21be34f2019-10-09 19:32:19 -0700524
525
Frank Barchard7e955972019-10-11 10:34:25 -0700526#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -0700527 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
528 TEST_REQUIRES_ARM_NEON_FMA;
529 GemmMicrokernelTester()
530 .mr(1)
531 .nr(8)
532 .kr(1)
533 .sr(1)
534 .m(1)
535 .n(8)
536 .k(8)
537 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
538 }
539
540 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
541 TEST_REQUIRES_ARM_NEON_FMA;
542 GemmMicrokernelTester()
543 .mr(1)
544 .nr(8)
545 .kr(1)
546 .sr(1)
547 .m(1)
548 .n(8)
549 .k(8)
550 .cn_stride(11)
551 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
552 }
553
554 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
555 TEST_REQUIRES_ARM_NEON_FMA;
556 GemmMicrokernelTester()
557 .mr(1)
558 .nr(8)
559 .kr(1)
560 .sr(1)
561 .m(1)
562 .n(8)
563 .k(8)
564 .a_stride(11)
565 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
566 }
567
568 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
569 TEST_REQUIRES_ARM_NEON_FMA;
570 for (uint32_t m = 1; m <= 1; m++) {
571 for (uint32_t n = 1; n <= 8; n++) {
572 GemmMicrokernelTester()
573 .mr(1)
574 .nr(8)
575 .kr(1)
576 .sr(1)
577 .m(m)
578 .n(n)
579 .k(8)
580 .iterations(1)
581 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
582 }
583 }
584 }
585
586 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
587 TEST_REQUIRES_ARM_NEON_FMA;
588 for (uint32_t m = 1; m <= 1; m++) {
589 GemmMicrokernelTester()
590 .mr(1)
591 .nr(8)
592 .kr(1)
593 .sr(1)
594 .m(m)
595 .n(8)
596 .k(8)
597 .iterations(1)
598 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
599 }
600 }
601
602 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
603 TEST_REQUIRES_ARM_NEON_FMA;
604 for (uint32_t n = 1; n <= 8; n++) {
605 GemmMicrokernelTester()
606 .mr(1)
607 .nr(8)
608 .kr(1)
609 .sr(1)
610 .m(1)
611 .n(n)
612 .k(8)
613 .iterations(1)
614 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
615 }
616 }
617
618 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
619 TEST_REQUIRES_ARM_NEON_FMA;
620 GemmMicrokernelTester()
621 .mr(1)
622 .nr(8)
623 .kr(1)
624 .sr(1)
625 .m(1)
626 .n(8)
627 .k(16)
628 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
629 }
630
631 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
632 TEST_REQUIRES_ARM_NEON_FMA;
633 GemmMicrokernelTester()
634 .mr(1)
635 .nr(8)
636 .kr(1)
637 .sr(1)
638 .m(1)
639 .n(8)
640 .k(16)
641 .a_stride(19)
642 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
643 }
644
645 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
646 TEST_REQUIRES_ARM_NEON_FMA;
647 for (uint32_t m = 1; m <= 1; m++) {
648 for (uint32_t n = 1; n <= 8; n++) {
649 GemmMicrokernelTester()
650 .mr(1)
651 .nr(8)
652 .kr(1)
653 .sr(1)
654 .m(m)
655 .n(n)
656 .k(16)
657 .iterations(1)
658 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
659 }
660 }
661 }
662
663 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
664 TEST_REQUIRES_ARM_NEON_FMA;
665 for (size_t k = 1; k < 16; k++) {
666 GemmMicrokernelTester()
667 .mr(1)
668 .nr(8)
669 .kr(1)
670 .sr(1)
671 .m(1)
672 .n(8)
673 .k(k)
674 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
675 }
676 }
677
678 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
679 TEST_REQUIRES_ARM_NEON_FMA;
680 for (size_t k = 1; k < 16; k++) {
681 GemmMicrokernelTester()
682 .mr(1)
683 .nr(8)
684 .kr(1)
685 .sr(1)
686 .m(1)
687 .n(8)
688 .k(k)
689 .a_stride(19)
690 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
691 }
692 }
693
694 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
695 TEST_REQUIRES_ARM_NEON_FMA;
696 for (size_t k = 1; k < 16; k++) {
697 for (uint32_t m = 1; m <= 1; m++) {
698 for (uint32_t n = 1; n <= 8; n++) {
699 GemmMicrokernelTester()
700 .mr(1)
701 .nr(8)
702 .kr(1)
703 .sr(1)
704 .m(m)
705 .n(n)
706 .k(k)
707 .iterations(1)
708 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
709 }
710 }
711 }
712 }
713
714 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
715 TEST_REQUIRES_ARM_NEON_FMA;
716 for (size_t k = 17; k < 16; k++) {
717 GemmMicrokernelTester()
718 .mr(1)
719 .nr(8)
720 .kr(1)
721 .sr(1)
722 .m(1)
723 .n(8)
724 .k(k)
725 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
726 }
727 }
728
729 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
730 TEST_REQUIRES_ARM_NEON_FMA;
731 for (size_t k = 17; k < 16; k++) {
732 GemmMicrokernelTester()
733 .mr(1)
734 .nr(8)
735 .kr(1)
736 .sr(1)
737 .m(1)
738 .n(8)
739 .k(k)
740 .a_stride(19)
741 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
742 }
743 }
744
745 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
746 TEST_REQUIRES_ARM_NEON_FMA;
747 for (size_t k = 17; k < 16; k++) {
748 for (uint32_t m = 1; m <= 1; m++) {
749 for (uint32_t n = 1; n <= 8; n++) {
750 GemmMicrokernelTester()
751 .mr(1)
752 .nr(8)
753 .kr(1)
754 .sr(1)
755 .m(m)
756 .n(n)
757 .k(k)
758 .iterations(1)
759 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
760 }
761 }
762 }
763 }
764
765 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
766 TEST_REQUIRES_ARM_NEON_FMA;
767 for (size_t k = 24; k <= 80; k += 8) {
768 GemmMicrokernelTester()
769 .mr(1)
770 .nr(8)
771 .kr(1)
772 .sr(1)
773 .m(1)
774 .n(8)
775 .k(k)
776 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
777 }
778 }
779
780 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
781 TEST_REQUIRES_ARM_NEON_FMA;
782 for (size_t k = 24; k <= 80; k += 8) {
783 GemmMicrokernelTester()
784 .mr(1)
785 .nr(8)
786 .kr(1)
787 .sr(1)
788 .m(1)
789 .n(8)
790 .k(k)
791 .a_stride(83)
792 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
793 }
794 }
795
796 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
797 TEST_REQUIRES_ARM_NEON_FMA;
798 for (size_t k = 24; k <= 80; k += 8) {
799 for (uint32_t m = 1; m <= 1; m++) {
800 for (uint32_t n = 1; n <= 8; n++) {
801 GemmMicrokernelTester()
802 .mr(1)
803 .nr(8)
804 .kr(1)
805 .sr(1)
806 .m(m)
807 .n(n)
808 .k(k)
809 .iterations(1)
810 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
811 }
812 }
813 }
814 }
815
816 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
817 TEST_REQUIRES_ARM_NEON_FMA;
818 for (uint32_t n = 9; n < 16; n++) {
819 for (size_t k = 1; k <= 40; k += 9) {
820 GemmMicrokernelTester()
821 .mr(1)
822 .nr(8)
823 .kr(1)
824 .sr(1)
825 .m(1)
826 .n(8)
827 .k(k)
828 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
829 }
830 }
831 }
832
833 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
834 TEST_REQUIRES_ARM_NEON_FMA;
835 for (uint32_t n = 9; n < 16; n++) {
836 for (size_t k = 1; k <= 40; k += 9) {
837 GemmMicrokernelTester()
838 .mr(1)
839 .nr(8)
840 .kr(1)
841 .sr(1)
842 .m(1)
843 .n(8)
844 .k(k)
845 .cn_stride(11)
846 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
847 }
848 }
849 }
850
851 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
852 TEST_REQUIRES_ARM_NEON_FMA;
853 for (uint32_t n = 9; n < 16; n++) {
854 for (size_t k = 1; k <= 40; k += 9) {
855 GemmMicrokernelTester()
856 .mr(1)
857 .nr(8)
858 .kr(1)
859 .sr(1)
860 .m(1)
861 .n(n)
862 .k(k)
863 .a_stride(43)
864 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
865 }
866 }
867 }
868
869 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
870 TEST_REQUIRES_ARM_NEON_FMA;
871 for (uint32_t n = 9; n < 16; n++) {
872 for (size_t k = 1; k <= 40; k += 9) {
873 for (uint32_t m = 1; m <= 1; m++) {
874 GemmMicrokernelTester()
875 .mr(1)
876 .nr(8)
877 .kr(1)
878 .sr(1)
879 .m(m)
880 .n(n)
881 .k(k)
882 .iterations(1)
883 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
884 }
885 }
886 }
887 }
888
889 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
890 TEST_REQUIRES_ARM_NEON_FMA;
891 for (uint32_t n = 16; n <= 24; n += 8) {
892 for (size_t k = 1; k <= 40; k += 9) {
893 GemmMicrokernelTester()
894 .mr(1)
895 .nr(8)
896 .kr(1)
897 .sr(1)
898 .m(1)
899 .n(8)
900 .k(k)
901 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
902 }
903 }
904 }
905
906 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
907 TEST_REQUIRES_ARM_NEON_FMA;
908 for (uint32_t n = 16; n <= 24; n += 8) {
909 for (size_t k = 1; k <= 40; k += 9) {
910 GemmMicrokernelTester()
911 .mr(1)
912 .nr(8)
913 .kr(1)
914 .sr(1)
915 .m(1)
916 .n(n)
917 .k(k)
918 .cn_stride(11)
919 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
920 }
921 }
922 }
923
924 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
925 TEST_REQUIRES_ARM_NEON_FMA;
926 for (uint32_t n = 16; n <= 24; n += 8) {
927 for (size_t k = 1; k <= 40; k += 9) {
928 GemmMicrokernelTester()
929 .mr(1)
930 .nr(8)
931 .kr(1)
932 .sr(1)
933 .m(1)
934 .n(n)
935 .k(k)
936 .a_stride(43)
937 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
938 }
939 }
940 }
941
942 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
943 TEST_REQUIRES_ARM_NEON_FMA;
944 for (uint32_t n = 16; n <= 24; n += 8) {
945 for (size_t k = 1; k <= 40; k += 9) {
946 for (uint32_t m = 1; m <= 1; m++) {
947 GemmMicrokernelTester()
948 .mr(1)
949 .nr(8)
950 .kr(1)
951 .sr(1)
952 .m(m)
953 .n(n)
954 .k(k)
955 .iterations(1)
956 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
957 }
958 }
959 }
960 }
961
962 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
963 TEST_REQUIRES_ARM_NEON_FMA;
964 for (size_t k = 1; k <= 40; k += 9) {
965 for (uint32_t m = 1; m <= 1; m++) {
966 for (uint32_t n = 1; n <= 8; n++) {
967 GemmMicrokernelTester()
968 .mr(1)
969 .nr(8)
970 .kr(1)
971 .sr(1)
972 .m(m)
973 .n(n)
974 .k(k)
975 .cm_stride(11)
976 .iterations(1)
977 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
978 }
979 }
980 }
981 }
982
983 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
984 TEST_REQUIRES_ARM_NEON_FMA;
985 GemmMicrokernelTester()
986 .mr(1)
987 .nr(8)
988 .kr(1)
989 .sr(1)
990 .m(1)
991 .n(8)
992 .k(8)
993 .qmin(128)
994 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
995 }
996
997 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
998 TEST_REQUIRES_ARM_NEON_FMA;
999 GemmMicrokernelTester()
1000 .mr(1)
1001 .nr(8)
1002 .kr(1)
1003 .sr(1)
1004 .m(1)
1005 .n(8)
1006 .k(8)
1007 .qmax(128)
1008 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1009 }
1010
1011 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
1012 TEST_REQUIRES_ARM_NEON_FMA;
1013 GemmMicrokernelTester()
1014 .mr(1)
1015 .nr(8)
1016 .kr(1)
1017 .sr(1)
1018 .m(1)
1019 .n(8)
1020 .k(8)
1021 .cm_stride(11)
1022 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a57);
1023 }
Frank Barchard7e955972019-10-11 10:34:25 -07001024#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001025
1026
Frank Barchard7e955972019-10-11 10:34:25 -07001027#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001028 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
1029 TEST_REQUIRES_ARM_NEON_FMA;
1030 GemmMicrokernelTester()
1031 .mr(1)
1032 .nr(8)
1033 .kr(1)
1034 .sr(1)
1035 .m(1)
1036 .n(8)
1037 .k(8)
1038 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1039 }
1040
1041 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
1042 TEST_REQUIRES_ARM_NEON_FMA;
1043 GemmMicrokernelTester()
1044 .mr(1)
1045 .nr(8)
1046 .kr(1)
1047 .sr(1)
1048 .m(1)
1049 .n(8)
1050 .k(8)
1051 .cn_stride(11)
1052 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1053 }
1054
1055 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
1056 TEST_REQUIRES_ARM_NEON_FMA;
1057 GemmMicrokernelTester()
1058 .mr(1)
1059 .nr(8)
1060 .kr(1)
1061 .sr(1)
1062 .m(1)
1063 .n(8)
1064 .k(8)
1065 .a_stride(11)
1066 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1067 }
1068
1069 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
1070 TEST_REQUIRES_ARM_NEON_FMA;
1071 for (uint32_t m = 1; m <= 1; m++) {
1072 for (uint32_t n = 1; n <= 8; n++) {
1073 GemmMicrokernelTester()
1074 .mr(1)
1075 .nr(8)
1076 .kr(1)
1077 .sr(1)
1078 .m(m)
1079 .n(n)
1080 .k(8)
1081 .iterations(1)
1082 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1083 }
1084 }
1085 }
1086
1087 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
1088 TEST_REQUIRES_ARM_NEON_FMA;
1089 for (uint32_t m = 1; m <= 1; m++) {
1090 GemmMicrokernelTester()
1091 .mr(1)
1092 .nr(8)
1093 .kr(1)
1094 .sr(1)
1095 .m(m)
1096 .n(8)
1097 .k(8)
1098 .iterations(1)
1099 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1100 }
1101 }
1102
1103 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
1104 TEST_REQUIRES_ARM_NEON_FMA;
1105 for (uint32_t n = 1; n <= 8; n++) {
1106 GemmMicrokernelTester()
1107 .mr(1)
1108 .nr(8)
1109 .kr(1)
1110 .sr(1)
1111 .m(1)
1112 .n(n)
1113 .k(8)
1114 .iterations(1)
1115 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1116 }
1117 }
1118
1119 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
1120 TEST_REQUIRES_ARM_NEON_FMA;
1121 GemmMicrokernelTester()
1122 .mr(1)
1123 .nr(8)
1124 .kr(1)
1125 .sr(1)
1126 .m(1)
1127 .n(8)
1128 .k(16)
1129 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1130 }
1131
1132 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
1133 TEST_REQUIRES_ARM_NEON_FMA;
1134 GemmMicrokernelTester()
1135 .mr(1)
1136 .nr(8)
1137 .kr(1)
1138 .sr(1)
1139 .m(1)
1140 .n(8)
1141 .k(16)
1142 .a_stride(19)
1143 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1144 }
1145
1146 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
1147 TEST_REQUIRES_ARM_NEON_FMA;
1148 for (uint32_t m = 1; m <= 1; m++) {
1149 for (uint32_t n = 1; n <= 8; n++) {
1150 GemmMicrokernelTester()
1151 .mr(1)
1152 .nr(8)
1153 .kr(1)
1154 .sr(1)
1155 .m(m)
1156 .n(n)
1157 .k(16)
1158 .iterations(1)
1159 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1160 }
1161 }
1162 }
1163
1164 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
1165 TEST_REQUIRES_ARM_NEON_FMA;
1166 for (size_t k = 1; k < 16; k++) {
1167 GemmMicrokernelTester()
1168 .mr(1)
1169 .nr(8)
1170 .kr(1)
1171 .sr(1)
1172 .m(1)
1173 .n(8)
1174 .k(k)
1175 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1176 }
1177 }
1178
1179 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
1180 TEST_REQUIRES_ARM_NEON_FMA;
1181 for (size_t k = 1; k < 16; k++) {
1182 GemmMicrokernelTester()
1183 .mr(1)
1184 .nr(8)
1185 .kr(1)
1186 .sr(1)
1187 .m(1)
1188 .n(8)
1189 .k(k)
1190 .a_stride(19)
1191 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1192 }
1193 }
1194
1195 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
1196 TEST_REQUIRES_ARM_NEON_FMA;
1197 for (size_t k = 1; k < 16; k++) {
1198 for (uint32_t m = 1; m <= 1; m++) {
1199 for (uint32_t n = 1; n <= 8; n++) {
1200 GemmMicrokernelTester()
1201 .mr(1)
1202 .nr(8)
1203 .kr(1)
1204 .sr(1)
1205 .m(m)
1206 .n(n)
1207 .k(k)
1208 .iterations(1)
1209 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1210 }
1211 }
1212 }
1213 }
1214
1215 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
1216 TEST_REQUIRES_ARM_NEON_FMA;
1217 for (size_t k = 17; k < 16; k++) {
1218 GemmMicrokernelTester()
1219 .mr(1)
1220 .nr(8)
1221 .kr(1)
1222 .sr(1)
1223 .m(1)
1224 .n(8)
1225 .k(k)
1226 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1227 }
1228 }
1229
1230 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
1231 TEST_REQUIRES_ARM_NEON_FMA;
1232 for (size_t k = 17; k < 16; k++) {
1233 GemmMicrokernelTester()
1234 .mr(1)
1235 .nr(8)
1236 .kr(1)
1237 .sr(1)
1238 .m(1)
1239 .n(8)
1240 .k(k)
1241 .a_stride(19)
1242 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1243 }
1244 }
1245
1246 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
1247 TEST_REQUIRES_ARM_NEON_FMA;
1248 for (size_t k = 17; k < 16; k++) {
1249 for (uint32_t m = 1; m <= 1; m++) {
1250 for (uint32_t n = 1; n <= 8; n++) {
1251 GemmMicrokernelTester()
1252 .mr(1)
1253 .nr(8)
1254 .kr(1)
1255 .sr(1)
1256 .m(m)
1257 .n(n)
1258 .k(k)
1259 .iterations(1)
1260 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1261 }
1262 }
1263 }
1264 }
1265
1266 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
1267 TEST_REQUIRES_ARM_NEON_FMA;
1268 for (size_t k = 24; k <= 80; k += 8) {
1269 GemmMicrokernelTester()
1270 .mr(1)
1271 .nr(8)
1272 .kr(1)
1273 .sr(1)
1274 .m(1)
1275 .n(8)
1276 .k(k)
1277 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1278 }
1279 }
1280
1281 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
1282 TEST_REQUIRES_ARM_NEON_FMA;
1283 for (size_t k = 24; k <= 80; k += 8) {
1284 GemmMicrokernelTester()
1285 .mr(1)
1286 .nr(8)
1287 .kr(1)
1288 .sr(1)
1289 .m(1)
1290 .n(8)
1291 .k(k)
1292 .a_stride(83)
1293 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1294 }
1295 }
1296
1297 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
1298 TEST_REQUIRES_ARM_NEON_FMA;
1299 for (size_t k = 24; k <= 80; k += 8) {
1300 for (uint32_t m = 1; m <= 1; m++) {
1301 for (uint32_t n = 1; n <= 8; n++) {
1302 GemmMicrokernelTester()
1303 .mr(1)
1304 .nr(8)
1305 .kr(1)
1306 .sr(1)
1307 .m(m)
1308 .n(n)
1309 .k(k)
1310 .iterations(1)
1311 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1312 }
1313 }
1314 }
1315 }
1316
1317 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
1318 TEST_REQUIRES_ARM_NEON_FMA;
1319 for (uint32_t n = 9; n < 16; n++) {
1320 for (size_t k = 1; k <= 40; k += 9) {
1321 GemmMicrokernelTester()
1322 .mr(1)
1323 .nr(8)
1324 .kr(1)
1325 .sr(1)
1326 .m(1)
1327 .n(8)
1328 .k(k)
1329 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1330 }
1331 }
1332 }
1333
1334 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
1335 TEST_REQUIRES_ARM_NEON_FMA;
1336 for (uint32_t n = 9; n < 16; n++) {
1337 for (size_t k = 1; k <= 40; k += 9) {
1338 GemmMicrokernelTester()
1339 .mr(1)
1340 .nr(8)
1341 .kr(1)
1342 .sr(1)
1343 .m(1)
1344 .n(8)
1345 .k(k)
1346 .cn_stride(11)
1347 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1348 }
1349 }
1350 }
1351
1352 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
1353 TEST_REQUIRES_ARM_NEON_FMA;
1354 for (uint32_t n = 9; n < 16; n++) {
1355 for (size_t k = 1; k <= 40; k += 9) {
1356 GemmMicrokernelTester()
1357 .mr(1)
1358 .nr(8)
1359 .kr(1)
1360 .sr(1)
1361 .m(1)
1362 .n(n)
1363 .k(k)
1364 .a_stride(43)
1365 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1366 }
1367 }
1368 }
1369
1370 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
1371 TEST_REQUIRES_ARM_NEON_FMA;
1372 for (uint32_t n = 9; n < 16; n++) {
1373 for (size_t k = 1; k <= 40; k += 9) {
1374 for (uint32_t m = 1; m <= 1; m++) {
1375 GemmMicrokernelTester()
1376 .mr(1)
1377 .nr(8)
1378 .kr(1)
1379 .sr(1)
1380 .m(m)
1381 .n(n)
1382 .k(k)
1383 .iterations(1)
1384 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1385 }
1386 }
1387 }
1388 }
1389
1390 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
1391 TEST_REQUIRES_ARM_NEON_FMA;
1392 for (uint32_t n = 16; n <= 24; n += 8) {
1393 for (size_t k = 1; k <= 40; k += 9) {
1394 GemmMicrokernelTester()
1395 .mr(1)
1396 .nr(8)
1397 .kr(1)
1398 .sr(1)
1399 .m(1)
1400 .n(8)
1401 .k(k)
1402 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1403 }
1404 }
1405 }
1406
1407 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
1408 TEST_REQUIRES_ARM_NEON_FMA;
1409 for (uint32_t n = 16; n <= 24; n += 8) {
1410 for (size_t k = 1; k <= 40; k += 9) {
1411 GemmMicrokernelTester()
1412 .mr(1)
1413 .nr(8)
1414 .kr(1)
1415 .sr(1)
1416 .m(1)
1417 .n(n)
1418 .k(k)
1419 .cn_stride(11)
1420 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1421 }
1422 }
1423 }
1424
1425 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
1426 TEST_REQUIRES_ARM_NEON_FMA;
1427 for (uint32_t n = 16; n <= 24; n += 8) {
1428 for (size_t k = 1; k <= 40; k += 9) {
1429 GemmMicrokernelTester()
1430 .mr(1)
1431 .nr(8)
1432 .kr(1)
1433 .sr(1)
1434 .m(1)
1435 .n(n)
1436 .k(k)
1437 .a_stride(43)
1438 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1439 }
1440 }
1441 }
1442
1443 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
1444 TEST_REQUIRES_ARM_NEON_FMA;
1445 for (uint32_t n = 16; n <= 24; n += 8) {
1446 for (size_t k = 1; k <= 40; k += 9) {
1447 for (uint32_t m = 1; m <= 1; m++) {
1448 GemmMicrokernelTester()
1449 .mr(1)
1450 .nr(8)
1451 .kr(1)
1452 .sr(1)
1453 .m(m)
1454 .n(n)
1455 .k(k)
1456 .iterations(1)
1457 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1458 }
1459 }
1460 }
1461 }
1462
1463 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
1464 TEST_REQUIRES_ARM_NEON_FMA;
1465 for (size_t k = 1; k <= 40; k += 9) {
1466 for (uint32_t m = 1; m <= 1; m++) {
1467 for (uint32_t n = 1; n <= 8; n++) {
1468 GemmMicrokernelTester()
1469 .mr(1)
1470 .nr(8)
1471 .kr(1)
1472 .sr(1)
1473 .m(m)
1474 .n(n)
1475 .k(k)
1476 .cm_stride(11)
1477 .iterations(1)
1478 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1479 }
1480 }
1481 }
1482 }
1483
1484 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
1485 TEST_REQUIRES_ARM_NEON_FMA;
1486 GemmMicrokernelTester()
1487 .mr(1)
1488 .nr(8)
1489 .kr(1)
1490 .sr(1)
1491 .m(1)
1492 .n(8)
1493 .k(8)
1494 .qmin(128)
1495 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1496 }
1497
1498 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
1499 TEST_REQUIRES_ARM_NEON_FMA;
1500 GemmMicrokernelTester()
1501 .mr(1)
1502 .nr(8)
1503 .kr(1)
1504 .sr(1)
1505 .m(1)
1506 .n(8)
1507 .k(8)
1508 .qmax(128)
1509 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1510 }
1511
1512 TEST(F32_GEMM_1X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
1513 TEST_REQUIRES_ARM_NEON_FMA;
1514 GemmMicrokernelTester()
1515 .mr(1)
1516 .nr(8)
1517 .kr(1)
1518 .sr(1)
1519 .m(1)
1520 .n(8)
1521 .k(8)
1522 .cm_stride(11)
1523 .Test(xnn_f32_gemm_ukernel_1x8__aarch64_neonfma_cortex_a75);
1524 }
Frank Barchard7e955972019-10-11 10:34:25 -07001525#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07001526
1527
Frank Barchard7e955972019-10-11 10:34:25 -07001528#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001529 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001530 TEST_REQUIRES_ARM_NEON_FMA;
1531 GemmMicrokernelTester()
1532 .mr(4)
1533 .nr(8)
1534 .kr(1)
1535 .sr(1)
1536 .m(4)
1537 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001538 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001539 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1540 }
1541
1542 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
1543 TEST_REQUIRES_ARM_NEON_FMA;
1544 GemmMicrokernelTester()
1545 .mr(4)
1546 .nr(8)
1547 .kr(1)
1548 .sr(1)
1549 .m(4)
1550 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001551 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001552 .cn_stride(11)
1553 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1554 }
1555
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001556 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001557 TEST_REQUIRES_ARM_NEON_FMA;
1558 GemmMicrokernelTester()
1559 .mr(4)
1560 .nr(8)
1561 .kr(1)
1562 .sr(1)
1563 .m(4)
1564 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001565 .k(4)
1566 .a_stride(7)
Frank Barchard46fb8072019-10-25 12:54:22 -07001567 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1568 }
1569
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001570 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001571 TEST_REQUIRES_ARM_NEON_FMA;
1572 for (uint32_t m = 1; m <= 4; m++) {
1573 for (uint32_t n = 1; n <= 8; n++) {
1574 GemmMicrokernelTester()
1575 .mr(4)
1576 .nr(8)
1577 .kr(1)
1578 .sr(1)
1579 .m(m)
1580 .n(n)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001581 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001582 .iterations(1)
1583 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1584 }
1585 }
1586 }
1587
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001588 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001589 TEST_REQUIRES_ARM_NEON_FMA;
1590 for (uint32_t m = 1; m <= 4; m++) {
1591 GemmMicrokernelTester()
1592 .mr(4)
1593 .nr(8)
1594 .kr(1)
1595 .sr(1)
1596 .m(m)
1597 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001598 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001599 .iterations(1)
1600 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1601 }
1602 }
1603
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001604 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001605 TEST_REQUIRES_ARM_NEON_FMA;
1606 for (uint32_t n = 1; n <= 8; n++) {
1607 GemmMicrokernelTester()
1608 .mr(4)
1609 .nr(8)
1610 .kr(1)
1611 .sr(1)
1612 .m(4)
1613 .n(n)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001614 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001615 .iterations(1)
1616 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1617 }
1618 }
1619
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001620 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001621 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001622 GemmMicrokernelTester()
1623 .mr(4)
1624 .nr(8)
1625 .kr(1)
1626 .sr(1)
1627 .m(4)
1628 .n(8)
1629 .k(8)
1630 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1631 }
1632
1633 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
1634 TEST_REQUIRES_ARM_NEON_FMA;
1635 GemmMicrokernelTester()
1636 .mr(4)
1637 .nr(8)
1638 .kr(1)
1639 .sr(1)
1640 .m(4)
1641 .n(8)
1642 .k(8)
1643 .a_stride(11)
1644 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1645 }
1646
1647 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
1648 TEST_REQUIRES_ARM_NEON_FMA;
1649 for (uint32_t m = 1; m <= 4; m++) {
1650 for (uint32_t n = 1; n <= 8; n++) {
1651 GemmMicrokernelTester()
1652 .mr(4)
1653 .nr(8)
1654 .kr(1)
1655 .sr(1)
1656 .m(m)
1657 .n(n)
1658 .k(8)
1659 .iterations(1)
1660 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1661 }
1662 }
1663 }
1664
1665 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
1666 TEST_REQUIRES_ARM_NEON_FMA;
1667 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001668 GemmMicrokernelTester()
1669 .mr(4)
1670 .nr(8)
1671 .kr(1)
1672 .sr(1)
1673 .m(4)
1674 .n(8)
1675 .k(k)
1676 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1677 }
1678 }
1679
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001680 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001681 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001682 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001683 GemmMicrokernelTester()
1684 .mr(4)
1685 .nr(8)
1686 .kr(1)
1687 .sr(1)
1688 .m(4)
1689 .n(8)
1690 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001691 .a_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07001692 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1693 }
1694 }
1695
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001696 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001697 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001698 for (size_t k = 1; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001699 for (uint32_t m = 1; m <= 4; m++) {
1700 for (uint32_t n = 1; n <= 8; n++) {
1701 GemmMicrokernelTester()
1702 .mr(4)
1703 .nr(8)
1704 .kr(1)
1705 .sr(1)
1706 .m(m)
1707 .n(n)
1708 .k(k)
1709 .iterations(1)
1710 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1711 }
1712 }
1713 }
1714 }
1715
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001716 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001717 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001718 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001719 GemmMicrokernelTester()
1720 .mr(4)
1721 .nr(8)
1722 .kr(1)
1723 .sr(1)
1724 .m(4)
1725 .n(8)
1726 .k(k)
1727 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1728 }
1729 }
1730
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001731 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001732 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001733 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001734 GemmMicrokernelTester()
1735 .mr(4)
1736 .nr(8)
1737 .kr(1)
1738 .sr(1)
1739 .m(4)
1740 .n(8)
1741 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001742 .a_stride(11)
Frank Barchard46fb8072019-10-25 12:54:22 -07001743 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1744 }
1745 }
1746
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001747 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001748 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001749 for (size_t k = 9; k < 8; k++) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001750 for (uint32_t m = 1; m <= 4; m++) {
1751 for (uint32_t n = 1; n <= 8; n++) {
1752 GemmMicrokernelTester()
1753 .mr(4)
1754 .nr(8)
1755 .kr(1)
1756 .sr(1)
1757 .m(m)
1758 .n(n)
1759 .k(k)
1760 .iterations(1)
1761 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1762 }
1763 }
1764 }
1765 }
1766
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001767 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001768 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001769 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001770 GemmMicrokernelTester()
1771 .mr(4)
1772 .nr(8)
1773 .kr(1)
1774 .sr(1)
1775 .m(4)
1776 .n(8)
1777 .k(k)
1778 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1779 }
1780 }
1781
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001782 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001783 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001784 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001785 GemmMicrokernelTester()
1786 .mr(4)
1787 .nr(8)
1788 .kr(1)
1789 .sr(1)
1790 .m(4)
1791 .n(8)
1792 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001793 .a_stride(43)
Frank Barchard46fb8072019-10-25 12:54:22 -07001794 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1795 }
1796 }
1797
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001798 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001799 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001800 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001801 for (uint32_t m = 1; m <= 4; m++) {
1802 for (uint32_t n = 1; n <= 8; n++) {
1803 GemmMicrokernelTester()
1804 .mr(4)
1805 .nr(8)
1806 .kr(1)
1807 .sr(1)
1808 .m(m)
1809 .n(n)
1810 .k(k)
1811 .iterations(1)
1812 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1813 }
1814 }
1815 }
1816 }
1817
1818 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
1819 TEST_REQUIRES_ARM_NEON_FMA;
1820 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001821 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001822 GemmMicrokernelTester()
1823 .mr(4)
1824 .nr(8)
1825 .kr(1)
1826 .sr(1)
1827 .m(4)
1828 .n(8)
1829 .k(k)
1830 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1831 }
1832 }
1833 }
1834
1835 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
1836 TEST_REQUIRES_ARM_NEON_FMA;
1837 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001838 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001839 GemmMicrokernelTester()
1840 .mr(4)
1841 .nr(8)
1842 .kr(1)
1843 .sr(1)
1844 .m(4)
1845 .n(8)
1846 .k(k)
1847 .cn_stride(11)
1848 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1849 }
1850 }
1851 }
1852
1853 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
1854 TEST_REQUIRES_ARM_NEON_FMA;
1855 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001856 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001857 GemmMicrokernelTester()
1858 .mr(4)
1859 .nr(8)
1860 .kr(1)
1861 .sr(1)
1862 .m(4)
1863 .n(n)
1864 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001865 .a_stride(23)
Frank Barchard46fb8072019-10-25 12:54:22 -07001866 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1867 }
1868 }
1869 }
1870
1871 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
1872 TEST_REQUIRES_ARM_NEON_FMA;
1873 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001874 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001875 for (uint32_t m = 1; m <= 4; m++) {
1876 GemmMicrokernelTester()
1877 .mr(4)
1878 .nr(8)
1879 .kr(1)
1880 .sr(1)
1881 .m(m)
1882 .n(n)
1883 .k(k)
1884 .iterations(1)
1885 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1886 }
1887 }
1888 }
1889 }
1890
1891 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
1892 TEST_REQUIRES_ARM_NEON_FMA;
1893 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001894 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001895 GemmMicrokernelTester()
1896 .mr(4)
1897 .nr(8)
1898 .kr(1)
1899 .sr(1)
1900 .m(4)
1901 .n(8)
1902 .k(k)
1903 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1904 }
1905 }
1906 }
1907
1908 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
1909 TEST_REQUIRES_ARM_NEON_FMA;
1910 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001911 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001912 GemmMicrokernelTester()
1913 .mr(4)
1914 .nr(8)
1915 .kr(1)
1916 .sr(1)
1917 .m(4)
1918 .n(n)
1919 .k(k)
1920 .cn_stride(11)
1921 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1922 }
1923 }
1924 }
1925
1926 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
1927 TEST_REQUIRES_ARM_NEON_FMA;
1928 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001929 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001930 GemmMicrokernelTester()
1931 .mr(4)
1932 .nr(8)
1933 .kr(1)
1934 .sr(1)
1935 .m(4)
1936 .n(n)
1937 .k(k)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001938 .a_stride(23)
Frank Barchard46fb8072019-10-25 12:54:22 -07001939 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1940 }
1941 }
1942 }
1943
1944 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
1945 TEST_REQUIRES_ARM_NEON_FMA;
1946 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001947 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001948 for (uint32_t m = 1; m <= 4; m++) {
1949 GemmMicrokernelTester()
1950 .mr(4)
1951 .nr(8)
1952 .kr(1)
1953 .sr(1)
1954 .m(m)
1955 .n(n)
1956 .k(k)
1957 .iterations(1)
1958 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1959 }
1960 }
1961 }
1962 }
1963
1964 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
1965 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001966 for (size_t k = 1; k <= 20; k += 5) {
Frank Barchard46fb8072019-10-25 12:54:22 -07001967 for (uint32_t m = 1; m <= 4; m++) {
1968 for (uint32_t n = 1; n <= 8; n++) {
1969 GemmMicrokernelTester()
1970 .mr(4)
1971 .nr(8)
1972 .kr(1)
1973 .sr(1)
1974 .m(m)
1975 .n(n)
1976 .k(k)
1977 .cm_stride(11)
1978 .iterations(1)
1979 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1980 }
1981 }
1982 }
1983 }
1984
1985 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
1986 TEST_REQUIRES_ARM_NEON_FMA;
1987 GemmMicrokernelTester()
1988 .mr(4)
1989 .nr(8)
1990 .kr(1)
1991 .sr(1)
1992 .m(4)
1993 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08001994 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07001995 .qmin(128)
1996 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
1997 }
1998
1999 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
2000 TEST_REQUIRES_ARM_NEON_FMA;
2001 GemmMicrokernelTester()
2002 .mr(4)
2003 .nr(8)
2004 .kr(1)
2005 .sr(1)
2006 .m(4)
2007 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08002008 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07002009 .qmax(128)
2010 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2011 }
2012
2013 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
2014 TEST_REQUIRES_ARM_NEON_FMA;
2015 GemmMicrokernelTester()
2016 .mr(4)
2017 .nr(8)
2018 .kr(1)
2019 .sr(1)
2020 .m(4)
2021 .n(8)
Frank Barchard0ecc2ab2019-11-14 10:57:48 -08002022 .k(4)
Frank Barchard46fb8072019-10-25 12:54:22 -07002023 .cm_stride(11)
2024 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a53);
2025 }
2026#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
2027
2028
2029#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002030 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
2031 TEST_REQUIRES_ARM_NEON_FMA;
2032 GemmMicrokernelTester()
2033 .mr(4)
2034 .nr(8)
2035 .kr(1)
2036 .sr(1)
2037 .m(4)
2038 .n(8)
2039 .k(8)
2040 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2041 }
2042
2043 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
2044 TEST_REQUIRES_ARM_NEON_FMA;
2045 GemmMicrokernelTester()
2046 .mr(4)
2047 .nr(8)
2048 .kr(1)
2049 .sr(1)
2050 .m(4)
2051 .n(8)
2052 .k(8)
2053 .cn_stride(11)
2054 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2055 }
2056
2057 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
2058 TEST_REQUIRES_ARM_NEON_FMA;
2059 GemmMicrokernelTester()
2060 .mr(4)
2061 .nr(8)
2062 .kr(1)
2063 .sr(1)
2064 .m(4)
2065 .n(8)
2066 .k(8)
2067 .a_stride(11)
2068 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2069 }
2070
2071 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
2072 TEST_REQUIRES_ARM_NEON_FMA;
2073 for (uint32_t m = 1; m <= 4; m++) {
2074 for (uint32_t n = 1; n <= 8; n++) {
2075 GemmMicrokernelTester()
2076 .mr(4)
2077 .nr(8)
2078 .kr(1)
2079 .sr(1)
2080 .m(m)
2081 .n(n)
2082 .k(8)
2083 .iterations(1)
2084 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2085 }
2086 }
2087 }
2088
2089 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
2090 TEST_REQUIRES_ARM_NEON_FMA;
2091 for (uint32_t m = 1; m <= 4; m++) {
2092 GemmMicrokernelTester()
2093 .mr(4)
2094 .nr(8)
2095 .kr(1)
2096 .sr(1)
2097 .m(m)
2098 .n(8)
2099 .k(8)
2100 .iterations(1)
2101 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2102 }
2103 }
2104
2105 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
2106 TEST_REQUIRES_ARM_NEON_FMA;
2107 for (uint32_t n = 1; n <= 8; n++) {
2108 GemmMicrokernelTester()
2109 .mr(4)
2110 .nr(8)
2111 .kr(1)
2112 .sr(1)
2113 .m(4)
2114 .n(n)
2115 .k(8)
2116 .iterations(1)
2117 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2118 }
2119 }
2120
2121 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
2122 TEST_REQUIRES_ARM_NEON_FMA;
2123 GemmMicrokernelTester()
2124 .mr(4)
2125 .nr(8)
2126 .kr(1)
2127 .sr(1)
2128 .m(4)
2129 .n(8)
2130 .k(16)
2131 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2132 }
2133
2134 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
2135 TEST_REQUIRES_ARM_NEON_FMA;
2136 GemmMicrokernelTester()
2137 .mr(4)
2138 .nr(8)
2139 .kr(1)
2140 .sr(1)
2141 .m(4)
2142 .n(8)
2143 .k(16)
2144 .a_stride(19)
2145 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2146 }
2147
2148 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
2149 TEST_REQUIRES_ARM_NEON_FMA;
2150 for (uint32_t m = 1; m <= 4; m++) {
2151 for (uint32_t n = 1; n <= 8; n++) {
2152 GemmMicrokernelTester()
2153 .mr(4)
2154 .nr(8)
2155 .kr(1)
2156 .sr(1)
2157 .m(m)
2158 .n(n)
2159 .k(16)
2160 .iterations(1)
2161 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2162 }
2163 }
2164 }
2165
2166 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
2167 TEST_REQUIRES_ARM_NEON_FMA;
2168 for (size_t k = 1; k < 16; k++) {
2169 GemmMicrokernelTester()
2170 .mr(4)
2171 .nr(8)
2172 .kr(1)
2173 .sr(1)
2174 .m(4)
2175 .n(8)
2176 .k(k)
2177 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2178 }
2179 }
2180
2181 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
2182 TEST_REQUIRES_ARM_NEON_FMA;
2183 for (size_t k = 1; k < 16; k++) {
2184 GemmMicrokernelTester()
2185 .mr(4)
2186 .nr(8)
2187 .kr(1)
2188 .sr(1)
2189 .m(4)
2190 .n(8)
2191 .k(k)
2192 .a_stride(19)
2193 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2194 }
2195 }
2196
2197 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
2198 TEST_REQUIRES_ARM_NEON_FMA;
2199 for (size_t k = 1; k < 16; k++) {
2200 for (uint32_t m = 1; m <= 4; m++) {
2201 for (uint32_t n = 1; n <= 8; n++) {
2202 GemmMicrokernelTester()
2203 .mr(4)
2204 .nr(8)
2205 .kr(1)
2206 .sr(1)
2207 .m(m)
2208 .n(n)
2209 .k(k)
2210 .iterations(1)
2211 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2212 }
2213 }
2214 }
2215 }
2216
2217 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
2218 TEST_REQUIRES_ARM_NEON_FMA;
2219 for (size_t k = 17; k < 16; k++) {
2220 GemmMicrokernelTester()
2221 .mr(4)
2222 .nr(8)
2223 .kr(1)
2224 .sr(1)
2225 .m(4)
2226 .n(8)
2227 .k(k)
2228 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2229 }
2230 }
2231
2232 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
2233 TEST_REQUIRES_ARM_NEON_FMA;
2234 for (size_t k = 17; k < 16; k++) {
2235 GemmMicrokernelTester()
2236 .mr(4)
2237 .nr(8)
2238 .kr(1)
2239 .sr(1)
2240 .m(4)
2241 .n(8)
2242 .k(k)
2243 .a_stride(19)
2244 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2245 }
2246 }
2247
2248 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
2249 TEST_REQUIRES_ARM_NEON_FMA;
2250 for (size_t k = 17; k < 16; k++) {
2251 for (uint32_t m = 1; m <= 4; m++) {
2252 for (uint32_t n = 1; n <= 8; n++) {
2253 GemmMicrokernelTester()
2254 .mr(4)
2255 .nr(8)
2256 .kr(1)
2257 .sr(1)
2258 .m(m)
2259 .n(n)
2260 .k(k)
2261 .iterations(1)
2262 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2263 }
2264 }
2265 }
2266 }
2267
2268 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
2269 TEST_REQUIRES_ARM_NEON_FMA;
2270 for (size_t k = 24; k <= 80; k += 8) {
2271 GemmMicrokernelTester()
2272 .mr(4)
2273 .nr(8)
2274 .kr(1)
2275 .sr(1)
2276 .m(4)
2277 .n(8)
2278 .k(k)
2279 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2280 }
2281 }
2282
2283 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
2284 TEST_REQUIRES_ARM_NEON_FMA;
2285 for (size_t k = 24; k <= 80; k += 8) {
2286 GemmMicrokernelTester()
2287 .mr(4)
2288 .nr(8)
2289 .kr(1)
2290 .sr(1)
2291 .m(4)
2292 .n(8)
2293 .k(k)
2294 .a_stride(83)
2295 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2296 }
2297 }
2298
2299 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
2300 TEST_REQUIRES_ARM_NEON_FMA;
2301 for (size_t k = 24; k <= 80; k += 8) {
2302 for (uint32_t m = 1; m <= 4; m++) {
2303 for (uint32_t n = 1; n <= 8; n++) {
2304 GemmMicrokernelTester()
2305 .mr(4)
2306 .nr(8)
2307 .kr(1)
2308 .sr(1)
2309 .m(m)
2310 .n(n)
2311 .k(k)
2312 .iterations(1)
2313 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2314 }
2315 }
2316 }
2317 }
2318
2319 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
2320 TEST_REQUIRES_ARM_NEON_FMA;
2321 for (uint32_t n = 9; n < 16; n++) {
2322 for (size_t k = 1; k <= 40; k += 9) {
2323 GemmMicrokernelTester()
2324 .mr(4)
2325 .nr(8)
2326 .kr(1)
2327 .sr(1)
2328 .m(4)
2329 .n(8)
2330 .k(k)
2331 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2332 }
2333 }
2334 }
2335
2336 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
2337 TEST_REQUIRES_ARM_NEON_FMA;
2338 for (uint32_t n = 9; n < 16; n++) {
2339 for (size_t k = 1; k <= 40; k += 9) {
2340 GemmMicrokernelTester()
2341 .mr(4)
2342 .nr(8)
2343 .kr(1)
2344 .sr(1)
2345 .m(4)
2346 .n(8)
2347 .k(k)
2348 .cn_stride(11)
2349 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2350 }
2351 }
2352 }
2353
2354 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
2355 TEST_REQUIRES_ARM_NEON_FMA;
2356 for (uint32_t n = 9; n < 16; n++) {
2357 for (size_t k = 1; k <= 40; k += 9) {
2358 GemmMicrokernelTester()
2359 .mr(4)
2360 .nr(8)
2361 .kr(1)
2362 .sr(1)
2363 .m(4)
2364 .n(n)
2365 .k(k)
2366 .a_stride(43)
2367 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2368 }
2369 }
2370 }
2371
2372 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
2373 TEST_REQUIRES_ARM_NEON_FMA;
2374 for (uint32_t n = 9; n < 16; n++) {
2375 for (size_t k = 1; k <= 40; k += 9) {
2376 for (uint32_t m = 1; m <= 4; m++) {
2377 GemmMicrokernelTester()
2378 .mr(4)
2379 .nr(8)
2380 .kr(1)
2381 .sr(1)
2382 .m(m)
2383 .n(n)
2384 .k(k)
2385 .iterations(1)
2386 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2387 }
2388 }
2389 }
2390 }
2391
2392 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
2393 TEST_REQUIRES_ARM_NEON_FMA;
2394 for (uint32_t n = 16; n <= 24; n += 8) {
2395 for (size_t k = 1; k <= 40; k += 9) {
2396 GemmMicrokernelTester()
2397 .mr(4)
2398 .nr(8)
2399 .kr(1)
2400 .sr(1)
2401 .m(4)
2402 .n(8)
2403 .k(k)
2404 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2405 }
2406 }
2407 }
2408
2409 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
2410 TEST_REQUIRES_ARM_NEON_FMA;
2411 for (uint32_t n = 16; n <= 24; n += 8) {
2412 for (size_t k = 1; k <= 40; k += 9) {
2413 GemmMicrokernelTester()
2414 .mr(4)
2415 .nr(8)
2416 .kr(1)
2417 .sr(1)
2418 .m(4)
2419 .n(n)
2420 .k(k)
2421 .cn_stride(11)
2422 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2423 }
2424 }
2425 }
2426
2427 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
2428 TEST_REQUIRES_ARM_NEON_FMA;
2429 for (uint32_t n = 16; n <= 24; n += 8) {
2430 for (size_t k = 1; k <= 40; k += 9) {
2431 GemmMicrokernelTester()
2432 .mr(4)
2433 .nr(8)
2434 .kr(1)
2435 .sr(1)
2436 .m(4)
2437 .n(n)
2438 .k(k)
2439 .a_stride(43)
2440 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2441 }
2442 }
2443 }
2444
2445 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
2446 TEST_REQUIRES_ARM_NEON_FMA;
2447 for (uint32_t n = 16; n <= 24; n += 8) {
2448 for (size_t k = 1; k <= 40; k += 9) {
2449 for (uint32_t m = 1; m <= 4; m++) {
2450 GemmMicrokernelTester()
2451 .mr(4)
2452 .nr(8)
2453 .kr(1)
2454 .sr(1)
2455 .m(m)
2456 .n(n)
2457 .k(k)
2458 .iterations(1)
2459 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2460 }
2461 }
2462 }
2463 }
2464
2465 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
2466 TEST_REQUIRES_ARM_NEON_FMA;
2467 for (size_t k = 1; k <= 40; k += 9) {
2468 for (uint32_t m = 1; m <= 4; m++) {
2469 for (uint32_t n = 1; n <= 8; n++) {
2470 GemmMicrokernelTester()
2471 .mr(4)
2472 .nr(8)
2473 .kr(1)
2474 .sr(1)
2475 .m(m)
2476 .n(n)
2477 .k(k)
2478 .cm_stride(11)
2479 .iterations(1)
2480 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2481 }
2482 }
2483 }
2484 }
2485
2486 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
2487 TEST_REQUIRES_ARM_NEON_FMA;
2488 GemmMicrokernelTester()
2489 .mr(4)
2490 .nr(8)
2491 .kr(1)
2492 .sr(1)
2493 .m(4)
2494 .n(8)
2495 .k(8)
2496 .qmin(128)
2497 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2498 }
2499
2500 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
2501 TEST_REQUIRES_ARM_NEON_FMA;
2502 GemmMicrokernelTester()
2503 .mr(4)
2504 .nr(8)
2505 .kr(1)
2506 .sr(1)
2507 .m(4)
2508 .n(8)
2509 .k(8)
2510 .qmax(128)
2511 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2512 }
2513
2514 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
2515 TEST_REQUIRES_ARM_NEON_FMA;
2516 GemmMicrokernelTester()
2517 .mr(4)
2518 .nr(8)
2519 .kr(1)
2520 .sr(1)
2521 .m(4)
2522 .n(8)
2523 .k(8)
2524 .cm_stride(11)
2525 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a57);
2526 }
Frank Barchard7e955972019-10-11 10:34:25 -07002527#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002528
2529
Frank Barchard7e955972019-10-11 10:34:25 -07002530#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07002531 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
2532 TEST_REQUIRES_ARM_NEON_FMA;
2533 GemmMicrokernelTester()
2534 .mr(4)
2535 .nr(8)
2536 .kr(1)
2537 .sr(1)
2538 .m(4)
2539 .n(8)
2540 .k(8)
2541 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2542 }
2543
2544 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
2545 TEST_REQUIRES_ARM_NEON_FMA;
2546 GemmMicrokernelTester()
2547 .mr(4)
2548 .nr(8)
2549 .kr(1)
2550 .sr(1)
2551 .m(4)
2552 .n(8)
2553 .k(8)
2554 .cn_stride(11)
2555 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2556 }
2557
2558 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
2559 TEST_REQUIRES_ARM_NEON_FMA;
2560 GemmMicrokernelTester()
2561 .mr(4)
2562 .nr(8)
2563 .kr(1)
2564 .sr(1)
2565 .m(4)
2566 .n(8)
2567 .k(8)
2568 .a_stride(11)
2569 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2570 }
2571
2572 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
2573 TEST_REQUIRES_ARM_NEON_FMA;
2574 for (uint32_t m = 1; m <= 4; m++) {
2575 for (uint32_t n = 1; n <= 8; n++) {
2576 GemmMicrokernelTester()
2577 .mr(4)
2578 .nr(8)
2579 .kr(1)
2580 .sr(1)
2581 .m(m)
2582 .n(n)
2583 .k(8)
2584 .iterations(1)
2585 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2586 }
2587 }
2588 }
2589
2590 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
2591 TEST_REQUIRES_ARM_NEON_FMA;
2592 for (uint32_t m = 1; m <= 4; m++) {
2593 GemmMicrokernelTester()
2594 .mr(4)
2595 .nr(8)
2596 .kr(1)
2597 .sr(1)
2598 .m(m)
2599 .n(8)
2600 .k(8)
2601 .iterations(1)
2602 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2603 }
2604 }
2605
2606 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
2607 TEST_REQUIRES_ARM_NEON_FMA;
2608 for (uint32_t n = 1; n <= 8; n++) {
2609 GemmMicrokernelTester()
2610 .mr(4)
2611 .nr(8)
2612 .kr(1)
2613 .sr(1)
2614 .m(4)
2615 .n(n)
2616 .k(8)
2617 .iterations(1)
2618 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2619 }
2620 }
2621
2622 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
2623 TEST_REQUIRES_ARM_NEON_FMA;
2624 GemmMicrokernelTester()
2625 .mr(4)
2626 .nr(8)
2627 .kr(1)
2628 .sr(1)
2629 .m(4)
2630 .n(8)
2631 .k(16)
2632 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2633 }
2634
2635 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
2636 TEST_REQUIRES_ARM_NEON_FMA;
2637 GemmMicrokernelTester()
2638 .mr(4)
2639 .nr(8)
2640 .kr(1)
2641 .sr(1)
2642 .m(4)
2643 .n(8)
2644 .k(16)
2645 .a_stride(19)
2646 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2647 }
2648
2649 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
2650 TEST_REQUIRES_ARM_NEON_FMA;
2651 for (uint32_t m = 1; m <= 4; m++) {
2652 for (uint32_t n = 1; n <= 8; n++) {
2653 GemmMicrokernelTester()
2654 .mr(4)
2655 .nr(8)
2656 .kr(1)
2657 .sr(1)
2658 .m(m)
2659 .n(n)
2660 .k(16)
2661 .iterations(1)
2662 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2663 }
2664 }
2665 }
2666
2667 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
2668 TEST_REQUIRES_ARM_NEON_FMA;
2669 for (size_t k = 1; k < 16; k++) {
2670 GemmMicrokernelTester()
2671 .mr(4)
2672 .nr(8)
2673 .kr(1)
2674 .sr(1)
2675 .m(4)
2676 .n(8)
2677 .k(k)
2678 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2679 }
2680 }
2681
2682 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
2683 TEST_REQUIRES_ARM_NEON_FMA;
2684 for (size_t k = 1; k < 16; k++) {
2685 GemmMicrokernelTester()
2686 .mr(4)
2687 .nr(8)
2688 .kr(1)
2689 .sr(1)
2690 .m(4)
2691 .n(8)
2692 .k(k)
2693 .a_stride(19)
2694 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2695 }
2696 }
2697
2698 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
2699 TEST_REQUIRES_ARM_NEON_FMA;
2700 for (size_t k = 1; k < 16; k++) {
2701 for (uint32_t m = 1; m <= 4; m++) {
2702 for (uint32_t n = 1; n <= 8; n++) {
2703 GemmMicrokernelTester()
2704 .mr(4)
2705 .nr(8)
2706 .kr(1)
2707 .sr(1)
2708 .m(m)
2709 .n(n)
2710 .k(k)
2711 .iterations(1)
2712 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2713 }
2714 }
2715 }
2716 }
2717
2718 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
2719 TEST_REQUIRES_ARM_NEON_FMA;
2720 for (size_t k = 17; k < 16; k++) {
2721 GemmMicrokernelTester()
2722 .mr(4)
2723 .nr(8)
2724 .kr(1)
2725 .sr(1)
2726 .m(4)
2727 .n(8)
2728 .k(k)
2729 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2730 }
2731 }
2732
2733 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
2734 TEST_REQUIRES_ARM_NEON_FMA;
2735 for (size_t k = 17; k < 16; k++) {
2736 GemmMicrokernelTester()
2737 .mr(4)
2738 .nr(8)
2739 .kr(1)
2740 .sr(1)
2741 .m(4)
2742 .n(8)
2743 .k(k)
2744 .a_stride(19)
2745 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2746 }
2747 }
2748
2749 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
2750 TEST_REQUIRES_ARM_NEON_FMA;
2751 for (size_t k = 17; k < 16; k++) {
2752 for (uint32_t m = 1; m <= 4; m++) {
2753 for (uint32_t n = 1; n <= 8; n++) {
2754 GemmMicrokernelTester()
2755 .mr(4)
2756 .nr(8)
2757 .kr(1)
2758 .sr(1)
2759 .m(m)
2760 .n(n)
2761 .k(k)
2762 .iterations(1)
2763 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2764 }
2765 }
2766 }
2767 }
2768
2769 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
2770 TEST_REQUIRES_ARM_NEON_FMA;
2771 for (size_t k = 24; k <= 80; k += 8) {
2772 GemmMicrokernelTester()
2773 .mr(4)
2774 .nr(8)
2775 .kr(1)
2776 .sr(1)
2777 .m(4)
2778 .n(8)
2779 .k(k)
2780 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2781 }
2782 }
2783
2784 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
2785 TEST_REQUIRES_ARM_NEON_FMA;
2786 for (size_t k = 24; k <= 80; k += 8) {
2787 GemmMicrokernelTester()
2788 .mr(4)
2789 .nr(8)
2790 .kr(1)
2791 .sr(1)
2792 .m(4)
2793 .n(8)
2794 .k(k)
2795 .a_stride(83)
2796 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2797 }
2798 }
2799
2800 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
2801 TEST_REQUIRES_ARM_NEON_FMA;
2802 for (size_t k = 24; k <= 80; k += 8) {
2803 for (uint32_t m = 1; m <= 4; m++) {
2804 for (uint32_t n = 1; n <= 8; n++) {
2805 GemmMicrokernelTester()
2806 .mr(4)
2807 .nr(8)
2808 .kr(1)
2809 .sr(1)
2810 .m(m)
2811 .n(n)
2812 .k(k)
2813 .iterations(1)
2814 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2815 }
2816 }
2817 }
2818 }
2819
2820 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
2821 TEST_REQUIRES_ARM_NEON_FMA;
2822 for (uint32_t n = 9; n < 16; n++) {
2823 for (size_t k = 1; k <= 40; k += 9) {
2824 GemmMicrokernelTester()
2825 .mr(4)
2826 .nr(8)
2827 .kr(1)
2828 .sr(1)
2829 .m(4)
2830 .n(8)
2831 .k(k)
2832 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2833 }
2834 }
2835 }
2836
2837 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
2838 TEST_REQUIRES_ARM_NEON_FMA;
2839 for (uint32_t n = 9; n < 16; n++) {
2840 for (size_t k = 1; k <= 40; k += 9) {
2841 GemmMicrokernelTester()
2842 .mr(4)
2843 .nr(8)
2844 .kr(1)
2845 .sr(1)
2846 .m(4)
2847 .n(8)
2848 .k(k)
2849 .cn_stride(11)
2850 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2851 }
2852 }
2853 }
2854
2855 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
2856 TEST_REQUIRES_ARM_NEON_FMA;
2857 for (uint32_t n = 9; n < 16; n++) {
2858 for (size_t k = 1; k <= 40; k += 9) {
2859 GemmMicrokernelTester()
2860 .mr(4)
2861 .nr(8)
2862 .kr(1)
2863 .sr(1)
2864 .m(4)
2865 .n(n)
2866 .k(k)
2867 .a_stride(43)
2868 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2869 }
2870 }
2871 }
2872
2873 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
2874 TEST_REQUIRES_ARM_NEON_FMA;
2875 for (uint32_t n = 9; n < 16; n++) {
2876 for (size_t k = 1; k <= 40; k += 9) {
2877 for (uint32_t m = 1; m <= 4; m++) {
2878 GemmMicrokernelTester()
2879 .mr(4)
2880 .nr(8)
2881 .kr(1)
2882 .sr(1)
2883 .m(m)
2884 .n(n)
2885 .k(k)
2886 .iterations(1)
2887 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2888 }
2889 }
2890 }
2891 }
2892
2893 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
2894 TEST_REQUIRES_ARM_NEON_FMA;
2895 for (uint32_t n = 16; n <= 24; n += 8) {
2896 for (size_t k = 1; k <= 40; k += 9) {
2897 GemmMicrokernelTester()
2898 .mr(4)
2899 .nr(8)
2900 .kr(1)
2901 .sr(1)
2902 .m(4)
2903 .n(8)
2904 .k(k)
2905 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2906 }
2907 }
2908 }
2909
2910 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
2911 TEST_REQUIRES_ARM_NEON_FMA;
2912 for (uint32_t n = 16; n <= 24; n += 8) {
2913 for (size_t k = 1; k <= 40; k += 9) {
2914 GemmMicrokernelTester()
2915 .mr(4)
2916 .nr(8)
2917 .kr(1)
2918 .sr(1)
2919 .m(4)
2920 .n(n)
2921 .k(k)
2922 .cn_stride(11)
2923 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2924 }
2925 }
2926 }
2927
2928 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
2929 TEST_REQUIRES_ARM_NEON_FMA;
2930 for (uint32_t n = 16; n <= 24; n += 8) {
2931 for (size_t k = 1; k <= 40; k += 9) {
2932 GemmMicrokernelTester()
2933 .mr(4)
2934 .nr(8)
2935 .kr(1)
2936 .sr(1)
2937 .m(4)
2938 .n(n)
2939 .k(k)
2940 .a_stride(43)
2941 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2942 }
2943 }
2944 }
2945
2946 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
2947 TEST_REQUIRES_ARM_NEON_FMA;
2948 for (uint32_t n = 16; n <= 24; n += 8) {
2949 for (size_t k = 1; k <= 40; k += 9) {
2950 for (uint32_t m = 1; m <= 4; m++) {
2951 GemmMicrokernelTester()
2952 .mr(4)
2953 .nr(8)
2954 .kr(1)
2955 .sr(1)
2956 .m(m)
2957 .n(n)
2958 .k(k)
2959 .iterations(1)
2960 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2961 }
2962 }
2963 }
2964 }
2965
2966 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
2967 TEST_REQUIRES_ARM_NEON_FMA;
2968 for (size_t k = 1; k <= 40; k += 9) {
2969 for (uint32_t m = 1; m <= 4; m++) {
2970 for (uint32_t n = 1; n <= 8; n++) {
2971 GemmMicrokernelTester()
2972 .mr(4)
2973 .nr(8)
2974 .kr(1)
2975 .sr(1)
2976 .m(m)
2977 .n(n)
2978 .k(k)
2979 .cm_stride(11)
2980 .iterations(1)
2981 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2982 }
2983 }
2984 }
2985 }
2986
2987 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
2988 TEST_REQUIRES_ARM_NEON_FMA;
2989 GemmMicrokernelTester()
2990 .mr(4)
2991 .nr(8)
2992 .kr(1)
2993 .sr(1)
2994 .m(4)
2995 .n(8)
2996 .k(8)
2997 .qmin(128)
2998 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
2999 }
3000
3001 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
3002 TEST_REQUIRES_ARM_NEON_FMA;
3003 GemmMicrokernelTester()
3004 .mr(4)
3005 .nr(8)
3006 .kr(1)
3007 .sr(1)
3008 .m(4)
3009 .n(8)
3010 .k(8)
3011 .qmax(128)
3012 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3013 }
3014
3015 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
3016 TEST_REQUIRES_ARM_NEON_FMA;
3017 GemmMicrokernelTester()
3018 .mr(4)
3019 .nr(8)
3020 .kr(1)
3021 .sr(1)
3022 .m(4)
3023 .n(8)
3024 .k(8)
3025 .cm_stride(11)
3026 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_cortex_a75);
3027 }
Frank Barchard7e955972019-10-11 10:34:25 -07003028#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07003029
3030
Frank Barchard7e955972019-10-11 10:34:25 -07003031#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard387c2d12019-12-16 19:14:07 -08003032 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
3033 TEST_REQUIRES_ARM_NEON_FMA;
3034 GemmMicrokernelTester()
3035 .mr(5)
3036 .nr(8)
3037 .kr(1)
3038 .sr(1)
3039 .m(5)
3040 .n(8)
3041 .k(8)
3042 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3043 }
3044
3045 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
3046 TEST_REQUIRES_ARM_NEON_FMA;
3047 GemmMicrokernelTester()
3048 .mr(5)
3049 .nr(8)
3050 .kr(1)
3051 .sr(1)
3052 .m(5)
3053 .n(8)
3054 .k(8)
3055 .cn_stride(11)
3056 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3057 }
3058
3059 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
3060 TEST_REQUIRES_ARM_NEON_FMA;
3061 GemmMicrokernelTester()
3062 .mr(5)
3063 .nr(8)
3064 .kr(1)
3065 .sr(1)
3066 .m(5)
3067 .n(8)
3068 .k(8)
3069 .a_stride(11)
3070 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3071 }
3072
3073 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
3074 TEST_REQUIRES_ARM_NEON_FMA;
3075 for (uint32_t m = 1; m <= 5; m++) {
3076 for (uint32_t n = 1; n <= 8; n++) {
3077 GemmMicrokernelTester()
3078 .mr(5)
3079 .nr(8)
3080 .kr(1)
3081 .sr(1)
3082 .m(m)
3083 .n(n)
3084 .k(8)
3085 .iterations(1)
3086 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3087 }
3088 }
3089 }
3090
3091 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
3092 TEST_REQUIRES_ARM_NEON_FMA;
3093 for (uint32_t m = 1; m <= 5; m++) {
3094 GemmMicrokernelTester()
3095 .mr(5)
3096 .nr(8)
3097 .kr(1)
3098 .sr(1)
3099 .m(m)
3100 .n(8)
3101 .k(8)
3102 .iterations(1)
3103 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3104 }
3105 }
3106
3107 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
3108 TEST_REQUIRES_ARM_NEON_FMA;
3109 for (uint32_t n = 1; n <= 8; n++) {
3110 GemmMicrokernelTester()
3111 .mr(5)
3112 .nr(8)
3113 .kr(1)
3114 .sr(1)
3115 .m(5)
3116 .n(n)
3117 .k(8)
3118 .iterations(1)
3119 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3120 }
3121 }
3122
3123 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
3124 TEST_REQUIRES_ARM_NEON_FMA;
3125 GemmMicrokernelTester()
3126 .mr(5)
3127 .nr(8)
3128 .kr(1)
3129 .sr(1)
3130 .m(5)
3131 .n(8)
3132 .k(16)
3133 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3134 }
3135
3136 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
3137 TEST_REQUIRES_ARM_NEON_FMA;
3138 GemmMicrokernelTester()
3139 .mr(5)
3140 .nr(8)
3141 .kr(1)
3142 .sr(1)
3143 .m(5)
3144 .n(8)
3145 .k(16)
3146 .a_stride(19)
3147 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3148 }
3149
3150 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
3151 TEST_REQUIRES_ARM_NEON_FMA;
3152 for (uint32_t m = 1; m <= 5; m++) {
3153 for (uint32_t n = 1; n <= 8; n++) {
3154 GemmMicrokernelTester()
3155 .mr(5)
3156 .nr(8)
3157 .kr(1)
3158 .sr(1)
3159 .m(m)
3160 .n(n)
3161 .k(16)
3162 .iterations(1)
3163 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3164 }
3165 }
3166 }
3167
3168 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
3169 TEST_REQUIRES_ARM_NEON_FMA;
3170 for (size_t k = 1; k < 16; k++) {
3171 GemmMicrokernelTester()
3172 .mr(5)
3173 .nr(8)
3174 .kr(1)
3175 .sr(1)
3176 .m(5)
3177 .n(8)
3178 .k(k)
3179 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3180 }
3181 }
3182
3183 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
3184 TEST_REQUIRES_ARM_NEON_FMA;
3185 for (size_t k = 1; k < 16; k++) {
3186 GemmMicrokernelTester()
3187 .mr(5)
3188 .nr(8)
3189 .kr(1)
3190 .sr(1)
3191 .m(5)
3192 .n(8)
3193 .k(k)
3194 .a_stride(19)
3195 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3196 }
3197 }
3198
3199 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
3200 TEST_REQUIRES_ARM_NEON_FMA;
3201 for (size_t k = 1; k < 16; k++) {
3202 for (uint32_t m = 1; m <= 5; m++) {
3203 for (uint32_t n = 1; n <= 8; n++) {
3204 GemmMicrokernelTester()
3205 .mr(5)
3206 .nr(8)
3207 .kr(1)
3208 .sr(1)
3209 .m(m)
3210 .n(n)
3211 .k(k)
3212 .iterations(1)
3213 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3214 }
3215 }
3216 }
3217 }
3218
3219 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
3220 TEST_REQUIRES_ARM_NEON_FMA;
3221 for (size_t k = 17; k < 16; k++) {
3222 GemmMicrokernelTester()
3223 .mr(5)
3224 .nr(8)
3225 .kr(1)
3226 .sr(1)
3227 .m(5)
3228 .n(8)
3229 .k(k)
3230 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3231 }
3232 }
3233
3234 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
3235 TEST_REQUIRES_ARM_NEON_FMA;
3236 for (size_t k = 17; k < 16; k++) {
3237 GemmMicrokernelTester()
3238 .mr(5)
3239 .nr(8)
3240 .kr(1)
3241 .sr(1)
3242 .m(5)
3243 .n(8)
3244 .k(k)
3245 .a_stride(19)
3246 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3247 }
3248 }
3249
3250 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
3251 TEST_REQUIRES_ARM_NEON_FMA;
3252 for (size_t k = 17; k < 16; k++) {
3253 for (uint32_t m = 1; m <= 5; m++) {
3254 for (uint32_t n = 1; n <= 8; n++) {
3255 GemmMicrokernelTester()
3256 .mr(5)
3257 .nr(8)
3258 .kr(1)
3259 .sr(1)
3260 .m(m)
3261 .n(n)
3262 .k(k)
3263 .iterations(1)
3264 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3265 }
3266 }
3267 }
3268 }
3269
3270 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
3271 TEST_REQUIRES_ARM_NEON_FMA;
3272 for (size_t k = 24; k <= 80; k += 8) {
3273 GemmMicrokernelTester()
3274 .mr(5)
3275 .nr(8)
3276 .kr(1)
3277 .sr(1)
3278 .m(5)
3279 .n(8)
3280 .k(k)
3281 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3282 }
3283 }
3284
3285 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
3286 TEST_REQUIRES_ARM_NEON_FMA;
3287 for (size_t k = 24; k <= 80; k += 8) {
3288 GemmMicrokernelTester()
3289 .mr(5)
3290 .nr(8)
3291 .kr(1)
3292 .sr(1)
3293 .m(5)
3294 .n(8)
3295 .k(k)
3296 .a_stride(83)
3297 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3298 }
3299 }
3300
3301 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
3302 TEST_REQUIRES_ARM_NEON_FMA;
3303 for (size_t k = 24; k <= 80; k += 8) {
3304 for (uint32_t m = 1; m <= 5; m++) {
3305 for (uint32_t n = 1; n <= 8; n++) {
3306 GemmMicrokernelTester()
3307 .mr(5)
3308 .nr(8)
3309 .kr(1)
3310 .sr(1)
3311 .m(m)
3312 .n(n)
3313 .k(k)
3314 .iterations(1)
3315 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3316 }
3317 }
3318 }
3319 }
3320
3321 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
3322 TEST_REQUIRES_ARM_NEON_FMA;
3323 for (uint32_t n = 9; n < 16; n++) {
3324 for (size_t k = 1; k <= 40; k += 9) {
3325 GemmMicrokernelTester()
3326 .mr(5)
3327 .nr(8)
3328 .kr(1)
3329 .sr(1)
3330 .m(5)
3331 .n(8)
3332 .k(k)
3333 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3334 }
3335 }
3336 }
3337
3338 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
3339 TEST_REQUIRES_ARM_NEON_FMA;
3340 for (uint32_t n = 9; n < 16; n++) {
3341 for (size_t k = 1; k <= 40; k += 9) {
3342 GemmMicrokernelTester()
3343 .mr(5)
3344 .nr(8)
3345 .kr(1)
3346 .sr(1)
3347 .m(5)
3348 .n(8)
3349 .k(k)
3350 .cn_stride(11)
3351 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3352 }
3353 }
3354 }
3355
3356 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
3357 TEST_REQUIRES_ARM_NEON_FMA;
3358 for (uint32_t n = 9; n < 16; n++) {
3359 for (size_t k = 1; k <= 40; k += 9) {
3360 GemmMicrokernelTester()
3361 .mr(5)
3362 .nr(8)
3363 .kr(1)
3364 .sr(1)
3365 .m(5)
3366 .n(n)
3367 .k(k)
3368 .a_stride(43)
3369 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3370 }
3371 }
3372 }
3373
3374 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
3375 TEST_REQUIRES_ARM_NEON_FMA;
3376 for (uint32_t n = 9; n < 16; n++) {
3377 for (size_t k = 1; k <= 40; k += 9) {
3378 for (uint32_t m = 1; m <= 5; m++) {
3379 GemmMicrokernelTester()
3380 .mr(5)
3381 .nr(8)
3382 .kr(1)
3383 .sr(1)
3384 .m(m)
3385 .n(n)
3386 .k(k)
3387 .iterations(1)
3388 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3389 }
3390 }
3391 }
3392 }
3393
3394 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
3395 TEST_REQUIRES_ARM_NEON_FMA;
3396 for (uint32_t n = 16; n <= 24; n += 8) {
3397 for (size_t k = 1; k <= 40; k += 9) {
3398 GemmMicrokernelTester()
3399 .mr(5)
3400 .nr(8)
3401 .kr(1)
3402 .sr(1)
3403 .m(5)
3404 .n(8)
3405 .k(k)
3406 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3407 }
3408 }
3409 }
3410
3411 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
3412 TEST_REQUIRES_ARM_NEON_FMA;
3413 for (uint32_t n = 16; n <= 24; n += 8) {
3414 for (size_t k = 1; k <= 40; k += 9) {
3415 GemmMicrokernelTester()
3416 .mr(5)
3417 .nr(8)
3418 .kr(1)
3419 .sr(1)
3420 .m(5)
3421 .n(n)
3422 .k(k)
3423 .cn_stride(11)
3424 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3425 }
3426 }
3427 }
3428
3429 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
3430 TEST_REQUIRES_ARM_NEON_FMA;
3431 for (uint32_t n = 16; n <= 24; n += 8) {
3432 for (size_t k = 1; k <= 40; k += 9) {
3433 GemmMicrokernelTester()
3434 .mr(5)
3435 .nr(8)
3436 .kr(1)
3437 .sr(1)
3438 .m(5)
3439 .n(n)
3440 .k(k)
3441 .a_stride(43)
3442 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3443 }
3444 }
3445 }
3446
3447 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
3448 TEST_REQUIRES_ARM_NEON_FMA;
3449 for (uint32_t n = 16; n <= 24; n += 8) {
3450 for (size_t k = 1; k <= 40; k += 9) {
3451 for (uint32_t m = 1; m <= 5; m++) {
3452 GemmMicrokernelTester()
3453 .mr(5)
3454 .nr(8)
3455 .kr(1)
3456 .sr(1)
3457 .m(m)
3458 .n(n)
3459 .k(k)
3460 .iterations(1)
3461 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3462 }
3463 }
3464 }
3465 }
3466
3467 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
3468 TEST_REQUIRES_ARM_NEON_FMA;
3469 for (size_t k = 1; k <= 40; k += 9) {
3470 for (uint32_t m = 1; m <= 5; m++) {
3471 for (uint32_t n = 1; n <= 8; n++) {
3472 GemmMicrokernelTester()
3473 .mr(5)
3474 .nr(8)
3475 .kr(1)
3476 .sr(1)
3477 .m(m)
3478 .n(n)
3479 .k(k)
3480 .cm_stride(11)
3481 .iterations(1)
3482 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3483 }
3484 }
3485 }
3486 }
3487
3488 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
3489 TEST_REQUIRES_ARM_NEON_FMA;
3490 GemmMicrokernelTester()
3491 .mr(5)
3492 .nr(8)
3493 .kr(1)
3494 .sr(1)
3495 .m(5)
3496 .n(8)
3497 .k(8)
3498 .qmin(128)
3499 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3500 }
3501
3502 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
3503 TEST_REQUIRES_ARM_NEON_FMA;
3504 GemmMicrokernelTester()
3505 .mr(5)
3506 .nr(8)
3507 .kr(1)
3508 .sr(1)
3509 .m(5)
3510 .n(8)
3511 .k(8)
3512 .qmax(128)
3513 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3514 }
3515
3516 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
3517 TEST_REQUIRES_ARM_NEON_FMA;
3518 GemmMicrokernelTester()
3519 .mr(5)
3520 .nr(8)
3521 .kr(1)
3522 .sr(1)
3523 .m(5)
3524 .n(8)
3525 .k(8)
3526 .cm_stride(11)
3527 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a57);
3528 }
3529#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
3530
3531
3532#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07003533 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
3534 TEST_REQUIRES_ARM_NEON_FMA;
3535 GemmMicrokernelTester()
3536 .mr(5)
3537 .nr(8)
3538 .kr(1)
3539 .sr(1)
3540 .m(5)
3541 .n(8)
3542 .k(8)
3543 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3544 }
3545
3546 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
3547 TEST_REQUIRES_ARM_NEON_FMA;
3548 GemmMicrokernelTester()
3549 .mr(5)
3550 .nr(8)
3551 .kr(1)
3552 .sr(1)
3553 .m(5)
3554 .n(8)
3555 .k(8)
3556 .cn_stride(11)
3557 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3558 }
3559
3560 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
3561 TEST_REQUIRES_ARM_NEON_FMA;
3562 GemmMicrokernelTester()
3563 .mr(5)
3564 .nr(8)
3565 .kr(1)
3566 .sr(1)
3567 .m(5)
3568 .n(8)
3569 .k(8)
3570 .a_stride(11)
3571 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3572 }
3573
3574 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
3575 TEST_REQUIRES_ARM_NEON_FMA;
3576 for (uint32_t m = 1; m <= 5; m++) {
3577 for (uint32_t n = 1; n <= 8; n++) {
3578 GemmMicrokernelTester()
3579 .mr(5)
3580 .nr(8)
3581 .kr(1)
3582 .sr(1)
3583 .m(m)
3584 .n(n)
3585 .k(8)
3586 .iterations(1)
3587 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3588 }
3589 }
3590 }
3591
3592 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
3593 TEST_REQUIRES_ARM_NEON_FMA;
3594 for (uint32_t m = 1; m <= 5; m++) {
3595 GemmMicrokernelTester()
3596 .mr(5)
3597 .nr(8)
3598 .kr(1)
3599 .sr(1)
3600 .m(m)
3601 .n(8)
3602 .k(8)
3603 .iterations(1)
3604 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3605 }
3606 }
3607
3608 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
3609 TEST_REQUIRES_ARM_NEON_FMA;
3610 for (uint32_t n = 1; n <= 8; n++) {
3611 GemmMicrokernelTester()
3612 .mr(5)
3613 .nr(8)
3614 .kr(1)
3615 .sr(1)
3616 .m(5)
3617 .n(n)
3618 .k(8)
3619 .iterations(1)
3620 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3621 }
3622 }
3623
3624 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
3625 TEST_REQUIRES_ARM_NEON_FMA;
3626 GemmMicrokernelTester()
3627 .mr(5)
3628 .nr(8)
3629 .kr(1)
3630 .sr(1)
3631 .m(5)
3632 .n(8)
3633 .k(16)
3634 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3635 }
3636
3637 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
3638 TEST_REQUIRES_ARM_NEON_FMA;
3639 GemmMicrokernelTester()
3640 .mr(5)
3641 .nr(8)
3642 .kr(1)
3643 .sr(1)
3644 .m(5)
3645 .n(8)
3646 .k(16)
3647 .a_stride(19)
3648 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3649 }
3650
3651 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
3652 TEST_REQUIRES_ARM_NEON_FMA;
3653 for (uint32_t m = 1; m <= 5; m++) {
3654 for (uint32_t n = 1; n <= 8; n++) {
3655 GemmMicrokernelTester()
3656 .mr(5)
3657 .nr(8)
3658 .kr(1)
3659 .sr(1)
3660 .m(m)
3661 .n(n)
3662 .k(16)
3663 .iterations(1)
3664 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3665 }
3666 }
3667 }
3668
3669 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
3670 TEST_REQUIRES_ARM_NEON_FMA;
3671 for (size_t k = 1; k < 16; k++) {
3672 GemmMicrokernelTester()
3673 .mr(5)
3674 .nr(8)
3675 .kr(1)
3676 .sr(1)
3677 .m(5)
3678 .n(8)
3679 .k(k)
3680 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3681 }
3682 }
3683
3684 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
3685 TEST_REQUIRES_ARM_NEON_FMA;
3686 for (size_t k = 1; k < 16; k++) {
3687 GemmMicrokernelTester()
3688 .mr(5)
3689 .nr(8)
3690 .kr(1)
3691 .sr(1)
3692 .m(5)
3693 .n(8)
3694 .k(k)
3695 .a_stride(19)
3696 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3697 }
3698 }
3699
3700 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
3701 TEST_REQUIRES_ARM_NEON_FMA;
3702 for (size_t k = 1; k < 16; k++) {
3703 for (uint32_t m = 1; m <= 5; m++) {
3704 for (uint32_t n = 1; n <= 8; n++) {
3705 GemmMicrokernelTester()
3706 .mr(5)
3707 .nr(8)
3708 .kr(1)
3709 .sr(1)
3710 .m(m)
3711 .n(n)
3712 .k(k)
3713 .iterations(1)
3714 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3715 }
3716 }
3717 }
3718 }
3719
3720 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
3721 TEST_REQUIRES_ARM_NEON_FMA;
3722 for (size_t k = 17; k < 16; k++) {
3723 GemmMicrokernelTester()
3724 .mr(5)
3725 .nr(8)
3726 .kr(1)
3727 .sr(1)
3728 .m(5)
3729 .n(8)
3730 .k(k)
3731 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3732 }
3733 }
3734
3735 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
3736 TEST_REQUIRES_ARM_NEON_FMA;
3737 for (size_t k = 17; k < 16; k++) {
3738 GemmMicrokernelTester()
3739 .mr(5)
3740 .nr(8)
3741 .kr(1)
3742 .sr(1)
3743 .m(5)
3744 .n(8)
3745 .k(k)
3746 .a_stride(19)
3747 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3748 }
3749 }
3750
3751 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
3752 TEST_REQUIRES_ARM_NEON_FMA;
3753 for (size_t k = 17; k < 16; k++) {
3754 for (uint32_t m = 1; m <= 5; m++) {
3755 for (uint32_t n = 1; n <= 8; n++) {
3756 GemmMicrokernelTester()
3757 .mr(5)
3758 .nr(8)
3759 .kr(1)
3760 .sr(1)
3761 .m(m)
3762 .n(n)
3763 .k(k)
3764 .iterations(1)
3765 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3766 }
3767 }
3768 }
3769 }
3770
3771 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
3772 TEST_REQUIRES_ARM_NEON_FMA;
3773 for (size_t k = 24; k <= 80; k += 8) {
3774 GemmMicrokernelTester()
3775 .mr(5)
3776 .nr(8)
3777 .kr(1)
3778 .sr(1)
3779 .m(5)
3780 .n(8)
3781 .k(k)
3782 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3783 }
3784 }
3785
3786 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
3787 TEST_REQUIRES_ARM_NEON_FMA;
3788 for (size_t k = 24; k <= 80; k += 8) {
3789 GemmMicrokernelTester()
3790 .mr(5)
3791 .nr(8)
3792 .kr(1)
3793 .sr(1)
3794 .m(5)
3795 .n(8)
3796 .k(k)
3797 .a_stride(83)
3798 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3799 }
3800 }
3801
3802 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
3803 TEST_REQUIRES_ARM_NEON_FMA;
3804 for (size_t k = 24; k <= 80; k += 8) {
3805 for (uint32_t m = 1; m <= 5; m++) {
3806 for (uint32_t n = 1; n <= 8; n++) {
3807 GemmMicrokernelTester()
3808 .mr(5)
3809 .nr(8)
3810 .kr(1)
3811 .sr(1)
3812 .m(m)
3813 .n(n)
3814 .k(k)
3815 .iterations(1)
3816 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3817 }
3818 }
3819 }
3820 }
3821
3822 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
3823 TEST_REQUIRES_ARM_NEON_FMA;
3824 for (uint32_t n = 9; n < 16; n++) {
3825 for (size_t k = 1; k <= 40; k += 9) {
3826 GemmMicrokernelTester()
3827 .mr(5)
3828 .nr(8)
3829 .kr(1)
3830 .sr(1)
3831 .m(5)
3832 .n(8)
3833 .k(k)
3834 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3835 }
3836 }
3837 }
3838
3839 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
3840 TEST_REQUIRES_ARM_NEON_FMA;
3841 for (uint32_t n = 9; n < 16; n++) {
3842 for (size_t k = 1; k <= 40; k += 9) {
3843 GemmMicrokernelTester()
3844 .mr(5)
3845 .nr(8)
3846 .kr(1)
3847 .sr(1)
3848 .m(5)
3849 .n(8)
3850 .k(k)
3851 .cn_stride(11)
3852 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3853 }
3854 }
3855 }
3856
3857 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
3858 TEST_REQUIRES_ARM_NEON_FMA;
3859 for (uint32_t n = 9; n < 16; n++) {
3860 for (size_t k = 1; k <= 40; k += 9) {
3861 GemmMicrokernelTester()
3862 .mr(5)
3863 .nr(8)
3864 .kr(1)
3865 .sr(1)
3866 .m(5)
3867 .n(n)
3868 .k(k)
3869 .a_stride(43)
3870 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3871 }
3872 }
3873 }
3874
3875 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
3876 TEST_REQUIRES_ARM_NEON_FMA;
3877 for (uint32_t n = 9; n < 16; n++) {
3878 for (size_t k = 1; k <= 40; k += 9) {
3879 for (uint32_t m = 1; m <= 5; m++) {
3880 GemmMicrokernelTester()
3881 .mr(5)
3882 .nr(8)
3883 .kr(1)
3884 .sr(1)
3885 .m(m)
3886 .n(n)
3887 .k(k)
3888 .iterations(1)
3889 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3890 }
3891 }
3892 }
3893 }
3894
3895 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
3896 TEST_REQUIRES_ARM_NEON_FMA;
3897 for (uint32_t n = 16; n <= 24; n += 8) {
3898 for (size_t k = 1; k <= 40; k += 9) {
3899 GemmMicrokernelTester()
3900 .mr(5)
3901 .nr(8)
3902 .kr(1)
3903 .sr(1)
3904 .m(5)
3905 .n(8)
3906 .k(k)
3907 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3908 }
3909 }
3910 }
3911
3912 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
3913 TEST_REQUIRES_ARM_NEON_FMA;
3914 for (uint32_t n = 16; n <= 24; n += 8) {
3915 for (size_t k = 1; k <= 40; k += 9) {
3916 GemmMicrokernelTester()
3917 .mr(5)
3918 .nr(8)
3919 .kr(1)
3920 .sr(1)
3921 .m(5)
3922 .n(n)
3923 .k(k)
3924 .cn_stride(11)
3925 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3926 }
3927 }
3928 }
3929
3930 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
3931 TEST_REQUIRES_ARM_NEON_FMA;
3932 for (uint32_t n = 16; n <= 24; n += 8) {
3933 for (size_t k = 1; k <= 40; k += 9) {
3934 GemmMicrokernelTester()
3935 .mr(5)
3936 .nr(8)
3937 .kr(1)
3938 .sr(1)
3939 .m(5)
3940 .n(n)
3941 .k(k)
3942 .a_stride(43)
3943 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3944 }
3945 }
3946 }
3947
3948 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
3949 TEST_REQUIRES_ARM_NEON_FMA;
3950 for (uint32_t n = 16; n <= 24; n += 8) {
3951 for (size_t k = 1; k <= 40; k += 9) {
3952 for (uint32_t m = 1; m <= 5; m++) {
3953 GemmMicrokernelTester()
3954 .mr(5)
3955 .nr(8)
3956 .kr(1)
3957 .sr(1)
3958 .m(m)
3959 .n(n)
3960 .k(k)
3961 .iterations(1)
3962 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3963 }
3964 }
3965 }
3966 }
3967
3968 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
3969 TEST_REQUIRES_ARM_NEON_FMA;
3970 for (size_t k = 1; k <= 40; k += 9) {
3971 for (uint32_t m = 1; m <= 5; m++) {
3972 for (uint32_t n = 1; n <= 8; n++) {
3973 GemmMicrokernelTester()
3974 .mr(5)
3975 .nr(8)
3976 .kr(1)
3977 .sr(1)
3978 .m(m)
3979 .n(n)
3980 .k(k)
3981 .cm_stride(11)
3982 .iterations(1)
3983 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
3984 }
3985 }
3986 }
3987 }
3988
3989 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
3990 TEST_REQUIRES_ARM_NEON_FMA;
3991 GemmMicrokernelTester()
3992 .mr(5)
3993 .nr(8)
3994 .kr(1)
3995 .sr(1)
3996 .m(5)
3997 .n(8)
3998 .k(8)
3999 .qmin(128)
4000 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4001 }
4002
4003 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
4004 TEST_REQUIRES_ARM_NEON_FMA;
4005 GemmMicrokernelTester()
4006 .mr(5)
4007 .nr(8)
4008 .kr(1)
4009 .sr(1)
4010 .m(5)
4011 .n(8)
4012 .k(8)
4013 .qmax(128)
4014 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4015 }
4016
4017 TEST(F32_GEMM_5X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
4018 TEST_REQUIRES_ARM_NEON_FMA;
4019 GemmMicrokernelTester()
4020 .mr(5)
4021 .nr(8)
4022 .kr(1)
4023 .sr(1)
4024 .m(5)
4025 .n(8)
4026 .k(8)
4027 .cm_stride(11)
4028 .Test(xnn_f32_gemm_ukernel_5x8__aarch64_neonfma_cortex_a75);
4029 }
Frank Barchard7e955972019-10-11 10:34:25 -07004030#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004031
4032
Frank Barchard7e955972019-10-11 10:34:25 -07004033#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard00bf68e2019-10-27 03:00:09 -07004034 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004035 TEST_REQUIRES_ARM_NEON_FMA;
4036 GemmMicrokernelTester()
4037 .mr(6)
4038 .nr(8)
4039 .kr(1)
4040 .sr(1)
4041 .m(6)
4042 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004043 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004044 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4045 }
4046
4047 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
4048 TEST_REQUIRES_ARM_NEON_FMA;
4049 GemmMicrokernelTester()
4050 .mr(6)
4051 .nr(8)
4052 .kr(1)
4053 .sr(1)
4054 .m(6)
4055 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004056 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004057 .cn_stride(11)
4058 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4059 }
4060
Frank Barchard00bf68e2019-10-27 03:00:09 -07004061 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004062 TEST_REQUIRES_ARM_NEON_FMA;
4063 GemmMicrokernelTester()
4064 .mr(6)
4065 .nr(8)
4066 .kr(1)
4067 .sr(1)
4068 .m(6)
4069 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004070 .k(4)
4071 .a_stride(7)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004072 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4073 }
4074
Frank Barchard00bf68e2019-10-27 03:00:09 -07004075 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004076 TEST_REQUIRES_ARM_NEON_FMA;
4077 for (uint32_t m = 1; m <= 6; m++) {
4078 for (uint32_t n = 1; n <= 8; n++) {
4079 GemmMicrokernelTester()
4080 .mr(6)
4081 .nr(8)
4082 .kr(1)
4083 .sr(1)
4084 .m(m)
4085 .n(n)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004086 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004087 .iterations(1)
4088 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4089 }
4090 }
4091 }
4092
Frank Barchard00bf68e2019-10-27 03:00:09 -07004093 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004094 TEST_REQUIRES_ARM_NEON_FMA;
4095 for (uint32_t m = 1; m <= 6; m++) {
4096 GemmMicrokernelTester()
4097 .mr(6)
4098 .nr(8)
4099 .kr(1)
4100 .sr(1)
4101 .m(m)
4102 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004103 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004104 .iterations(1)
4105 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4106 }
4107 }
4108
Frank Barchard00bf68e2019-10-27 03:00:09 -07004109 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004110 TEST_REQUIRES_ARM_NEON_FMA;
4111 for (uint32_t n = 1; n <= 8; n++) {
4112 GemmMicrokernelTester()
4113 .mr(6)
4114 .nr(8)
4115 .kr(1)
4116 .sr(1)
4117 .m(6)
4118 .n(n)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004119 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004120 .iterations(1)
4121 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4122 }
4123 }
4124
Frank Barcharde64f91a2019-11-11 13:18:00 -08004125 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004126 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004127 GemmMicrokernelTester()
4128 .mr(6)
4129 .nr(8)
4130 .kr(1)
4131 .sr(1)
4132 .m(6)
4133 .n(8)
4134 .k(8)
4135 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4136 }
4137
4138 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
4139 TEST_REQUIRES_ARM_NEON_FMA;
4140 GemmMicrokernelTester()
4141 .mr(6)
4142 .nr(8)
4143 .kr(1)
4144 .sr(1)
4145 .m(6)
4146 .n(8)
4147 .k(8)
4148 .a_stride(11)
4149 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4150 }
4151
4152 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
4153 TEST_REQUIRES_ARM_NEON_FMA;
4154 for (uint32_t m = 1; m <= 6; m++) {
4155 for (uint32_t n = 1; n <= 8; n++) {
4156 GemmMicrokernelTester()
4157 .mr(6)
4158 .nr(8)
4159 .kr(1)
4160 .sr(1)
4161 .m(m)
4162 .n(n)
4163 .k(8)
4164 .iterations(1)
4165 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4166 }
4167 }
4168 }
4169
4170 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
4171 TEST_REQUIRES_ARM_NEON_FMA;
4172 for (size_t k = 1; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004173 GemmMicrokernelTester()
4174 .mr(6)
4175 .nr(8)
4176 .kr(1)
4177 .sr(1)
4178 .m(6)
4179 .n(8)
4180 .k(k)
4181 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4182 }
4183 }
4184
Frank Barcharde64f91a2019-11-11 13:18:00 -08004185 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004186 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004187 for (size_t k = 1; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004188 GemmMicrokernelTester()
4189 .mr(6)
4190 .nr(8)
4191 .kr(1)
4192 .sr(1)
4193 .m(6)
4194 .n(8)
4195 .k(k)
Frank Barcharde64f91a2019-11-11 13:18:00 -08004196 .a_stride(11)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004197 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4198 }
4199 }
4200
Frank Barcharde64f91a2019-11-11 13:18:00 -08004201 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004202 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004203 for (size_t k = 1; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004204 for (uint32_t m = 1; m <= 6; m++) {
4205 for (uint32_t n = 1; n <= 8; n++) {
4206 GemmMicrokernelTester()
4207 .mr(6)
4208 .nr(8)
4209 .kr(1)
4210 .sr(1)
4211 .m(m)
4212 .n(n)
4213 .k(k)
4214 .iterations(1)
4215 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4216 }
4217 }
4218 }
4219 }
4220
Frank Barcharde64f91a2019-11-11 13:18:00 -08004221 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004222 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004223 for (size_t k = 9; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004224 GemmMicrokernelTester()
4225 .mr(6)
4226 .nr(8)
4227 .kr(1)
4228 .sr(1)
4229 .m(6)
4230 .n(8)
4231 .k(k)
4232 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4233 }
4234 }
4235
Frank Barchard00bf68e2019-10-27 03:00:09 -07004236 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004237 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004238 for (size_t k = 9; k < 8; k++) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004239 GemmMicrokernelTester()
4240 .mr(6)
4241 .nr(8)
4242 .kr(1)
4243 .sr(1)
4244 .m(6)
4245 .n(8)
4246 .k(k)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004247 .a_stride(11)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004248 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4249 }
4250 }
4251
Frank Barchard00bf68e2019-10-27 03:00:09 -07004252 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004253 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004254 for (size_t k = 9; k < 8; k++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004255 for (uint32_t m = 1; m <= 6; m++) {
4256 for (uint32_t n = 1; n <= 8; n++) {
4257 GemmMicrokernelTester()
4258 .mr(6)
4259 .nr(8)
4260 .kr(1)
4261 .sr(1)
4262 .m(m)
4263 .n(n)
4264 .k(k)
4265 .iterations(1)
4266 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4267 }
4268 }
4269 }
4270 }
4271
4272 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
4273 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004274 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004275 GemmMicrokernelTester()
4276 .mr(6)
4277 .nr(8)
4278 .kr(1)
4279 .sr(1)
4280 .m(6)
4281 .n(8)
4282 .k(k)
4283 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4284 }
4285 }
4286
4287 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
4288 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004289 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004290 GemmMicrokernelTester()
4291 .mr(6)
4292 .nr(8)
4293 .kr(1)
4294 .sr(1)
4295 .m(6)
4296 .n(8)
4297 .k(k)
4298 .a_stride(43)
4299 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4300 }
4301 }
4302
4303 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
4304 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barcharde64f91a2019-11-11 13:18:00 -08004305 for (size_t k = 12; k <= 40; k += 4) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004306 for (uint32_t m = 1; m <= 6; m++) {
4307 for (uint32_t n = 1; n <= 8; n++) {
4308 GemmMicrokernelTester()
4309 .mr(6)
4310 .nr(8)
4311 .kr(1)
4312 .sr(1)
4313 .m(m)
4314 .n(n)
4315 .k(k)
4316 .iterations(1)
4317 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4318 }
4319 }
4320 }
4321 }
4322
4323 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8) {
4324 TEST_REQUIRES_ARM_NEON_FMA;
4325 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004326 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004327 GemmMicrokernelTester()
4328 .mr(6)
4329 .nr(8)
4330 .kr(1)
4331 .sr(1)
4332 .m(6)
4333 .n(8)
4334 .k(k)
4335 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4336 }
4337 }
4338 }
4339
4340 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_cn) {
4341 TEST_REQUIRES_ARM_NEON_FMA;
4342 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004343 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004344 GemmMicrokernelTester()
4345 .mr(6)
4346 .nr(8)
4347 .kr(1)
4348 .sr(1)
4349 .m(6)
4350 .n(8)
4351 .k(k)
4352 .cn_stride(11)
4353 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4354 }
4355 }
4356 }
4357
4358 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_strided_a) {
4359 TEST_REQUIRES_ARM_NEON_FMA;
4360 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004361 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004362 GemmMicrokernelTester()
4363 .mr(6)
4364 .nr(8)
4365 .kr(1)
4366 .sr(1)
4367 .m(6)
4368 .n(n)
4369 .k(k)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004370 .a_stride(23)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004371 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4372 }
4373 }
4374 }
4375
4376 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_gt_8_subtile) {
4377 TEST_REQUIRES_ARM_NEON_FMA;
4378 for (uint32_t n = 9; n < 16; n++) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004379 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004380 for (uint32_t m = 1; m <= 6; m++) {
4381 GemmMicrokernelTester()
4382 .mr(6)
4383 .nr(8)
4384 .kr(1)
4385 .sr(1)
4386 .m(m)
4387 .n(n)
4388 .k(k)
4389 .iterations(1)
4390 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4391 }
4392 }
4393 }
4394 }
4395
4396 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8) {
4397 TEST_REQUIRES_ARM_NEON_FMA;
4398 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004399 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004400 GemmMicrokernelTester()
4401 .mr(6)
4402 .nr(8)
4403 .kr(1)
4404 .sr(1)
4405 .m(6)
4406 .n(8)
4407 .k(k)
4408 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4409 }
4410 }
4411 }
4412
4413 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_cn) {
4414 TEST_REQUIRES_ARM_NEON_FMA;
4415 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004416 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004417 GemmMicrokernelTester()
4418 .mr(6)
4419 .nr(8)
4420 .kr(1)
4421 .sr(1)
4422 .m(6)
4423 .n(n)
4424 .k(k)
4425 .cn_stride(11)
4426 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4427 }
4428 }
4429 }
4430
4431 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_strided_a) {
4432 TEST_REQUIRES_ARM_NEON_FMA;
4433 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004434 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004435 GemmMicrokernelTester()
4436 .mr(6)
4437 .nr(8)
4438 .kr(1)
4439 .sr(1)
4440 .m(6)
4441 .n(n)
4442 .k(k)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004443 .a_stride(23)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004444 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4445 }
4446 }
4447 }
4448
4449 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, n_div_8_subtile) {
4450 TEST_REQUIRES_ARM_NEON_FMA;
4451 for (uint32_t n = 16; n <= 24; n += 8) {
Frank Barchard00bf68e2019-10-27 03:00:09 -07004452 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004453 for (uint32_t m = 1; m <= 6; m++) {
4454 GemmMicrokernelTester()
4455 .mr(6)
4456 .nr(8)
4457 .kr(1)
4458 .sr(1)
4459 .m(m)
4460 .n(n)
4461 .k(k)
4462 .iterations(1)
4463 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4464 }
4465 }
4466 }
4467 }
4468
4469 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
4470 TEST_REQUIRES_ARM_NEON_FMA;
Frank Barchard00bf68e2019-10-27 03:00:09 -07004471 for (size_t k = 1; k <= 20; k += 5) {
Frank Barcharda7fb8552019-10-23 17:14:17 -07004472 for (uint32_t m = 1; m <= 6; m++) {
4473 for (uint32_t n = 1; n <= 8; n++) {
4474 GemmMicrokernelTester()
4475 .mr(6)
4476 .nr(8)
4477 .kr(1)
4478 .sr(1)
4479 .m(m)
4480 .n(n)
4481 .k(k)
4482 .cm_stride(11)
4483 .iterations(1)
4484 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4485 }
4486 }
4487 }
4488 }
4489
4490 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmin) {
4491 TEST_REQUIRES_ARM_NEON_FMA;
4492 GemmMicrokernelTester()
4493 .mr(6)
4494 .nr(8)
4495 .kr(1)
4496 .sr(1)
4497 .m(6)
4498 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004499 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004500 .qmin(128)
4501 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4502 }
4503
4504 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, qmax) {
4505 TEST_REQUIRES_ARM_NEON_FMA;
4506 GemmMicrokernelTester()
4507 .mr(6)
4508 .nr(8)
4509 .kr(1)
4510 .sr(1)
4511 .m(6)
4512 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004513 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004514 .qmax(128)
4515 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4516 }
4517
4518 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
4519 TEST_REQUIRES_ARM_NEON_FMA;
4520 GemmMicrokernelTester()
4521 .mr(6)
4522 .nr(8)
4523 .kr(1)
4524 .sr(1)
4525 .m(6)
4526 .n(8)
Frank Barchard00bf68e2019-10-27 03:00:09 -07004527 .k(4)
Frank Barcharda7fb8552019-10-23 17:14:17 -07004528 .cm_stride(11)
4529 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a53);
4530 }
4531#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
4532
4533
4534#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07004535 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8) {
4536 TEST_REQUIRES_ARM_NEON_FMA;
4537 GemmMicrokernelTester()
4538 .mr(6)
4539 .nr(8)
4540 .kr(1)
4541 .sr(1)
4542 .m(6)
4543 .n(8)
4544 .k(8)
4545 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4546 }
4547
4548 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cn) {
4549 TEST_REQUIRES_ARM_NEON_FMA;
4550 GemmMicrokernelTester()
4551 .mr(6)
4552 .nr(8)
4553 .kr(1)
4554 .sr(1)
4555 .m(6)
4556 .n(8)
4557 .k(8)
4558 .cn_stride(11)
4559 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4560 }
4561
4562 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_strided_a) {
4563 TEST_REQUIRES_ARM_NEON_FMA;
4564 GemmMicrokernelTester()
4565 .mr(6)
4566 .nr(8)
4567 .kr(1)
4568 .sr(1)
4569 .m(6)
4570 .n(8)
4571 .k(8)
4572 .a_stride(11)
4573 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4574 }
4575
4576 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile) {
4577 TEST_REQUIRES_ARM_NEON_FMA;
4578 for (uint32_t m = 1; m <= 6; m++) {
4579 for (uint32_t n = 1; n <= 8; n++) {
4580 GemmMicrokernelTester()
4581 .mr(6)
4582 .nr(8)
4583 .kr(1)
4584 .sr(1)
4585 .m(m)
4586 .n(n)
4587 .k(8)
4588 .iterations(1)
4589 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4590 }
4591 }
4592 }
4593
4594 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_m) {
4595 TEST_REQUIRES_ARM_NEON_FMA;
4596 for (uint32_t m = 1; m <= 6; m++) {
4597 GemmMicrokernelTester()
4598 .mr(6)
4599 .nr(8)
4600 .kr(1)
4601 .sr(1)
4602 .m(m)
4603 .n(8)
4604 .k(8)
4605 .iterations(1)
4606 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4607 }
4608 }
4609
4610 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_8_subtile_n) {
4611 TEST_REQUIRES_ARM_NEON_FMA;
4612 for (uint32_t n = 1; n <= 8; n++) {
4613 GemmMicrokernelTester()
4614 .mr(6)
4615 .nr(8)
4616 .kr(1)
4617 .sr(1)
4618 .m(6)
4619 .n(n)
4620 .k(8)
4621 .iterations(1)
4622 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4623 }
4624 }
4625
4626 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16) {
4627 TEST_REQUIRES_ARM_NEON_FMA;
4628 GemmMicrokernelTester()
4629 .mr(6)
4630 .nr(8)
4631 .kr(1)
4632 .sr(1)
4633 .m(6)
4634 .n(8)
4635 .k(16)
4636 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4637 }
4638
4639 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_strided_a) {
4640 TEST_REQUIRES_ARM_NEON_FMA;
4641 GemmMicrokernelTester()
4642 .mr(6)
4643 .nr(8)
4644 .kr(1)
4645 .sr(1)
4646 .m(6)
4647 .n(8)
4648 .k(16)
4649 .a_stride(19)
4650 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4651 }
4652
4653 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_eq_16_subtile) {
4654 TEST_REQUIRES_ARM_NEON_FMA;
4655 for (uint32_t m = 1; m <= 6; m++) {
4656 for (uint32_t n = 1; n <= 8; n++) {
4657 GemmMicrokernelTester()
4658 .mr(6)
4659 .nr(8)
4660 .kr(1)
4661 .sr(1)
4662 .m(m)
4663 .n(n)
4664 .k(16)
4665 .iterations(1)
4666 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4667 }
4668 }
4669 }
4670
4671 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16) {
4672 TEST_REQUIRES_ARM_NEON_FMA;
4673 for (size_t k = 1; k < 16; k++) {
4674 GemmMicrokernelTester()
4675 .mr(6)
4676 .nr(8)
4677 .kr(1)
4678 .sr(1)
4679 .m(6)
4680 .n(8)
4681 .k(k)
4682 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4683 }
4684 }
4685
4686 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_strided_a) {
4687 TEST_REQUIRES_ARM_NEON_FMA;
4688 for (size_t k = 1; k < 16; k++) {
4689 GemmMicrokernelTester()
4690 .mr(6)
4691 .nr(8)
4692 .kr(1)
4693 .sr(1)
4694 .m(6)
4695 .n(8)
4696 .k(k)
4697 .a_stride(19)
4698 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4699 }
4700 }
4701
4702 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_lt_16_subtile) {
4703 TEST_REQUIRES_ARM_NEON_FMA;
4704 for (size_t k = 1; k < 16; k++) {
4705 for (uint32_t m = 1; m <= 6; m++) {
4706 for (uint32_t n = 1; n <= 8; n++) {
4707 GemmMicrokernelTester()
4708 .mr(6)
4709 .nr(8)
4710 .kr(1)
4711 .sr(1)
4712 .m(m)
4713 .n(n)
4714 .k(k)
4715 .iterations(1)
4716 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4717 }
4718 }
4719 }
4720 }
4721
4722 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_16) {
4723 TEST_REQUIRES_ARM_NEON_FMA;
4724 for (size_t k = 17; k < 16; k++) {
4725 GemmMicrokernelTester()
4726 .mr(6)
4727 .nr(8)
4728 .kr(1)
4729 .sr(1)
4730 .m(6)
4731 .n(8)
4732 .k(k)
4733 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4734 }
4735 }
4736
4737 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_strided_a) {
4738 TEST_REQUIRES_ARM_NEON_FMA;
4739 for (size_t k = 17; k < 16; k++) {
4740 GemmMicrokernelTester()
4741 .mr(6)
4742 .nr(8)
4743 .kr(1)
4744 .sr(1)
4745 .m(6)
4746 .n(8)
4747 .k(k)
4748 .a_stride(19)
4749 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4750 }
4751 }
4752
4753 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_gt_8_subtile) {
4754 TEST_REQUIRES_ARM_NEON_FMA;
4755 for (size_t k = 17; k < 16; k++) {
4756 for (uint32_t m = 1; m <= 6; m++) {
4757 for (uint32_t n = 1; n <= 8; n++) {
4758 GemmMicrokernelTester()
4759 .mr(6)
4760 .nr(8)
4761 .kr(1)
4762 .sr(1)
4763 .m(m)
4764 .n(n)
4765 .k(k)
4766 .iterations(1)
4767 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4768 }
4769 }
4770 }
4771 }
4772
4773 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8) {
4774 TEST_REQUIRES_ARM_NEON_FMA;
4775 for (size_t k = 24; k <= 80; k += 8) {
4776 GemmMicrokernelTester()
4777 .mr(6)
4778 .nr(8)
4779 .kr(1)
4780 .sr(1)
4781 .m(6)
4782 .n(8)
4783 .k(k)
4784 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4785 }
4786 }
4787
4788 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_strided_a) {
4789 TEST_REQUIRES_ARM_NEON_FMA;
4790 for (size_t k = 24; k <= 80; k += 8) {
4791 GemmMicrokernelTester()
4792 .mr(6)
4793 .nr(8)
4794 .kr(1)
4795 .sr(1)
4796 .m(6)
4797 .n(8)
4798 .k(k)
4799 .a_stride(83)
4800 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4801 }
4802 }
4803
4804 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, k_div_8_subtile) {
4805 TEST_REQUIRES_ARM_NEON_FMA;
4806 for (size_t k = 24; k <= 80; k += 8) {
4807 for (uint32_t m = 1; m <= 6; m++) {
4808 for (uint32_t n = 1; n <= 8; n++) {
4809 GemmMicrokernelTester()
4810 .mr(6)
4811 .nr(8)
4812 .kr(1)
4813 .sr(1)
4814 .m(m)
4815 .n(n)
4816 .k(k)
4817 .iterations(1)
4818 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4819 }
4820 }
4821 }
4822 }
4823
4824 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8) {
4825 TEST_REQUIRES_ARM_NEON_FMA;
4826 for (uint32_t n = 9; n < 16; n++) {
4827 for (size_t k = 1; k <= 40; k += 9) {
4828 GemmMicrokernelTester()
4829 .mr(6)
4830 .nr(8)
4831 .kr(1)
4832 .sr(1)
4833 .m(6)
4834 .n(8)
4835 .k(k)
4836 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4837 }
4838 }
4839 }
4840
4841 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_cn) {
4842 TEST_REQUIRES_ARM_NEON_FMA;
4843 for (uint32_t n = 9; n < 16; n++) {
4844 for (size_t k = 1; k <= 40; k += 9) {
4845 GemmMicrokernelTester()
4846 .mr(6)
4847 .nr(8)
4848 .kr(1)
4849 .sr(1)
4850 .m(6)
4851 .n(8)
4852 .k(k)
4853 .cn_stride(11)
4854 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4855 }
4856 }
4857 }
4858
4859 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_strided_a) {
4860 TEST_REQUIRES_ARM_NEON_FMA;
4861 for (uint32_t n = 9; n < 16; n++) {
4862 for (size_t k = 1; k <= 40; k += 9) {
4863 GemmMicrokernelTester()
4864 .mr(6)
4865 .nr(8)
4866 .kr(1)
4867 .sr(1)
4868 .m(6)
4869 .n(n)
4870 .k(k)
4871 .a_stride(43)
4872 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4873 }
4874 }
4875 }
4876
4877 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_gt_8_subtile) {
4878 TEST_REQUIRES_ARM_NEON_FMA;
4879 for (uint32_t n = 9; n < 16; n++) {
4880 for (size_t k = 1; k <= 40; k += 9) {
4881 for (uint32_t m = 1; m <= 6; m++) {
4882 GemmMicrokernelTester()
4883 .mr(6)
4884 .nr(8)
4885 .kr(1)
4886 .sr(1)
4887 .m(m)
4888 .n(n)
4889 .k(k)
4890 .iterations(1)
4891 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4892 }
4893 }
4894 }
4895 }
4896
4897 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8) {
4898 TEST_REQUIRES_ARM_NEON_FMA;
4899 for (uint32_t n = 16; n <= 24; n += 8) {
4900 for (size_t k = 1; k <= 40; k += 9) {
4901 GemmMicrokernelTester()
4902 .mr(6)
4903 .nr(8)
4904 .kr(1)
4905 .sr(1)
4906 .m(6)
4907 .n(8)
4908 .k(k)
4909 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4910 }
4911 }
4912 }
4913
4914 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_cn) {
4915 TEST_REQUIRES_ARM_NEON_FMA;
4916 for (uint32_t n = 16; n <= 24; n += 8) {
4917 for (size_t k = 1; k <= 40; k += 9) {
4918 GemmMicrokernelTester()
4919 .mr(6)
4920 .nr(8)
4921 .kr(1)
4922 .sr(1)
4923 .m(6)
4924 .n(n)
4925 .k(k)
4926 .cn_stride(11)
4927 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4928 }
4929 }
4930 }
4931
4932 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_strided_a) {
4933 TEST_REQUIRES_ARM_NEON_FMA;
4934 for (uint32_t n = 16; n <= 24; n += 8) {
4935 for (size_t k = 1; k <= 40; k += 9) {
4936 GemmMicrokernelTester()
4937 .mr(6)
4938 .nr(8)
4939 .kr(1)
4940 .sr(1)
4941 .m(6)
4942 .n(n)
4943 .k(k)
4944 .a_stride(43)
4945 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4946 }
4947 }
4948 }
4949
4950 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, n_div_8_subtile) {
4951 TEST_REQUIRES_ARM_NEON_FMA;
4952 for (uint32_t n = 16; n <= 24; n += 8) {
4953 for (size_t k = 1; k <= 40; k += 9) {
4954 for (uint32_t m = 1; m <= 6; m++) {
4955 GemmMicrokernelTester()
4956 .mr(6)
4957 .nr(8)
4958 .kr(1)
4959 .sr(1)
4960 .m(m)
4961 .n(n)
4962 .k(k)
4963 .iterations(1)
4964 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4965 }
4966 }
4967 }
4968 }
4969
4970 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm_subtile) {
4971 TEST_REQUIRES_ARM_NEON_FMA;
4972 for (size_t k = 1; k <= 40; k += 9) {
4973 for (uint32_t m = 1; m <= 6; m++) {
4974 for (uint32_t n = 1; n <= 8; n++) {
4975 GemmMicrokernelTester()
4976 .mr(6)
4977 .nr(8)
4978 .kr(1)
4979 .sr(1)
4980 .m(m)
4981 .n(n)
4982 .k(k)
4983 .cm_stride(11)
4984 .iterations(1)
4985 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
4986 }
4987 }
4988 }
4989 }
4990
4991 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmin) {
4992 TEST_REQUIRES_ARM_NEON_FMA;
4993 GemmMicrokernelTester()
4994 .mr(6)
4995 .nr(8)
4996 .kr(1)
4997 .sr(1)
4998 .m(6)
4999 .n(8)
5000 .k(8)
5001 .qmin(128)
5002 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5003 }
5004
5005 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, qmax) {
5006 TEST_REQUIRES_ARM_NEON_FMA;
5007 GemmMicrokernelTester()
5008 .mr(6)
5009 .nr(8)
5010 .kr(1)
5011 .sr(1)
5012 .m(6)
5013 .n(8)
5014 .k(8)
5015 .qmax(128)
5016 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5017 }
5018
5019 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A73, strided_cm) {
5020 TEST_REQUIRES_ARM_NEON_FMA;
5021 GemmMicrokernelTester()
5022 .mr(6)
5023 .nr(8)
5024 .kr(1)
5025 .sr(1)
5026 .m(6)
5027 .n(8)
5028 .k(8)
5029 .cm_stride(11)
5030 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a73);
5031 }
Frank Barchard7e955972019-10-11 10:34:25 -07005032#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005033
5034
Frank Barchard7e955972019-10-11 10:34:25 -07005035#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
Frank Barchard387c2d12019-12-16 19:14:07 -08005036 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8) {
5037 TEST_REQUIRES_ARM_NEON_FMA;
5038 GemmMicrokernelTester()
5039 .mr(6)
5040 .nr(8)
5041 .kr(1)
5042 .sr(1)
5043 .m(6)
5044 .n(8)
5045 .k(8)
5046 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5047 }
5048
5049 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cn) {
5050 TEST_REQUIRES_ARM_NEON_FMA;
5051 GemmMicrokernelTester()
5052 .mr(6)
5053 .nr(8)
5054 .kr(1)
5055 .sr(1)
5056 .m(6)
5057 .n(8)
5058 .k(8)
5059 .cn_stride(11)
5060 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5061 }
5062
5063 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_strided_a) {
5064 TEST_REQUIRES_ARM_NEON_FMA;
5065 GemmMicrokernelTester()
5066 .mr(6)
5067 .nr(8)
5068 .kr(1)
5069 .sr(1)
5070 .m(6)
5071 .n(8)
5072 .k(8)
5073 .a_stride(11)
5074 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5075 }
5076
5077 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile) {
5078 TEST_REQUIRES_ARM_NEON_FMA;
5079 for (uint32_t m = 1; m <= 6; m++) {
5080 for (uint32_t n = 1; n <= 8; n++) {
5081 GemmMicrokernelTester()
5082 .mr(6)
5083 .nr(8)
5084 .kr(1)
5085 .sr(1)
5086 .m(m)
5087 .n(n)
5088 .k(8)
5089 .iterations(1)
5090 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5091 }
5092 }
5093 }
5094
5095 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_m) {
5096 TEST_REQUIRES_ARM_NEON_FMA;
5097 for (uint32_t m = 1; m <= 6; m++) {
5098 GemmMicrokernelTester()
5099 .mr(6)
5100 .nr(8)
5101 .kr(1)
5102 .sr(1)
5103 .m(m)
5104 .n(8)
5105 .k(8)
5106 .iterations(1)
5107 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5108 }
5109 }
5110
5111 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_8_subtile_n) {
5112 TEST_REQUIRES_ARM_NEON_FMA;
5113 for (uint32_t n = 1; n <= 8; n++) {
5114 GemmMicrokernelTester()
5115 .mr(6)
5116 .nr(8)
5117 .kr(1)
5118 .sr(1)
5119 .m(6)
5120 .n(n)
5121 .k(8)
5122 .iterations(1)
5123 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5124 }
5125 }
5126
5127 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16) {
5128 TEST_REQUIRES_ARM_NEON_FMA;
5129 GemmMicrokernelTester()
5130 .mr(6)
5131 .nr(8)
5132 .kr(1)
5133 .sr(1)
5134 .m(6)
5135 .n(8)
5136 .k(16)
5137 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5138 }
5139
5140 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_strided_a) {
5141 TEST_REQUIRES_ARM_NEON_FMA;
5142 GemmMicrokernelTester()
5143 .mr(6)
5144 .nr(8)
5145 .kr(1)
5146 .sr(1)
5147 .m(6)
5148 .n(8)
5149 .k(16)
5150 .a_stride(19)
5151 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5152 }
5153
5154 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_eq_16_subtile) {
5155 TEST_REQUIRES_ARM_NEON_FMA;
5156 for (uint32_t m = 1; m <= 6; m++) {
5157 for (uint32_t n = 1; n <= 8; n++) {
5158 GemmMicrokernelTester()
5159 .mr(6)
5160 .nr(8)
5161 .kr(1)
5162 .sr(1)
5163 .m(m)
5164 .n(n)
5165 .k(16)
5166 .iterations(1)
5167 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5168 }
5169 }
5170 }
5171
5172 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16) {
5173 TEST_REQUIRES_ARM_NEON_FMA;
5174 for (size_t k = 1; k < 16; k++) {
5175 GemmMicrokernelTester()
5176 .mr(6)
5177 .nr(8)
5178 .kr(1)
5179 .sr(1)
5180 .m(6)
5181 .n(8)
5182 .k(k)
5183 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5184 }
5185 }
5186
5187 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_strided_a) {
5188 TEST_REQUIRES_ARM_NEON_FMA;
5189 for (size_t k = 1; k < 16; k++) {
5190 GemmMicrokernelTester()
5191 .mr(6)
5192 .nr(8)
5193 .kr(1)
5194 .sr(1)
5195 .m(6)
5196 .n(8)
5197 .k(k)
5198 .a_stride(19)
5199 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5200 }
5201 }
5202
5203 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_lt_16_subtile) {
5204 TEST_REQUIRES_ARM_NEON_FMA;
5205 for (size_t k = 1; k < 16; k++) {
5206 for (uint32_t m = 1; m <= 6; m++) {
5207 for (uint32_t n = 1; n <= 8; n++) {
5208 GemmMicrokernelTester()
5209 .mr(6)
5210 .nr(8)
5211 .kr(1)
5212 .sr(1)
5213 .m(m)
5214 .n(n)
5215 .k(k)
5216 .iterations(1)
5217 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5218 }
5219 }
5220 }
5221 }
5222
5223 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_16) {
5224 TEST_REQUIRES_ARM_NEON_FMA;
5225 for (size_t k = 17; k < 16; k++) {
5226 GemmMicrokernelTester()
5227 .mr(6)
5228 .nr(8)
5229 .kr(1)
5230 .sr(1)
5231 .m(6)
5232 .n(8)
5233 .k(k)
5234 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5235 }
5236 }
5237
5238 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_strided_a) {
5239 TEST_REQUIRES_ARM_NEON_FMA;
5240 for (size_t k = 17; k < 16; k++) {
5241 GemmMicrokernelTester()
5242 .mr(6)
5243 .nr(8)
5244 .kr(1)
5245 .sr(1)
5246 .m(6)
5247 .n(8)
5248 .k(k)
5249 .a_stride(19)
5250 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5251 }
5252 }
5253
5254 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_gt_8_subtile) {
5255 TEST_REQUIRES_ARM_NEON_FMA;
5256 for (size_t k = 17; k < 16; k++) {
5257 for (uint32_t m = 1; m <= 6; m++) {
5258 for (uint32_t n = 1; n <= 8; n++) {
5259 GemmMicrokernelTester()
5260 .mr(6)
5261 .nr(8)
5262 .kr(1)
5263 .sr(1)
5264 .m(m)
5265 .n(n)
5266 .k(k)
5267 .iterations(1)
5268 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5269 }
5270 }
5271 }
5272 }
5273
5274 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8) {
5275 TEST_REQUIRES_ARM_NEON_FMA;
5276 for (size_t k = 24; k <= 80; k += 8) {
5277 GemmMicrokernelTester()
5278 .mr(6)
5279 .nr(8)
5280 .kr(1)
5281 .sr(1)
5282 .m(6)
5283 .n(8)
5284 .k(k)
5285 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5286 }
5287 }
5288
5289 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_strided_a) {
5290 TEST_REQUIRES_ARM_NEON_FMA;
5291 for (size_t k = 24; k <= 80; k += 8) {
5292 GemmMicrokernelTester()
5293 .mr(6)
5294 .nr(8)
5295 .kr(1)
5296 .sr(1)
5297 .m(6)
5298 .n(8)
5299 .k(k)
5300 .a_stride(83)
5301 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5302 }
5303 }
5304
5305 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, k_div_8_subtile) {
5306 TEST_REQUIRES_ARM_NEON_FMA;
5307 for (size_t k = 24; k <= 80; k += 8) {
5308 for (uint32_t m = 1; m <= 6; m++) {
5309 for (uint32_t n = 1; n <= 8; n++) {
5310 GemmMicrokernelTester()
5311 .mr(6)
5312 .nr(8)
5313 .kr(1)
5314 .sr(1)
5315 .m(m)
5316 .n(n)
5317 .k(k)
5318 .iterations(1)
5319 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5320 }
5321 }
5322 }
5323 }
5324
5325 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8) {
5326 TEST_REQUIRES_ARM_NEON_FMA;
5327 for (uint32_t n = 9; n < 16; n++) {
5328 for (size_t k = 1; k <= 40; k += 9) {
5329 GemmMicrokernelTester()
5330 .mr(6)
5331 .nr(8)
5332 .kr(1)
5333 .sr(1)
5334 .m(6)
5335 .n(8)
5336 .k(k)
5337 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5338 }
5339 }
5340 }
5341
5342 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_cn) {
5343 TEST_REQUIRES_ARM_NEON_FMA;
5344 for (uint32_t n = 9; n < 16; n++) {
5345 for (size_t k = 1; k <= 40; k += 9) {
5346 GemmMicrokernelTester()
5347 .mr(6)
5348 .nr(8)
5349 .kr(1)
5350 .sr(1)
5351 .m(6)
5352 .n(8)
5353 .k(k)
5354 .cn_stride(11)
5355 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5356 }
5357 }
5358 }
5359
5360 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_strided_a) {
5361 TEST_REQUIRES_ARM_NEON_FMA;
5362 for (uint32_t n = 9; n < 16; n++) {
5363 for (size_t k = 1; k <= 40; k += 9) {
5364 GemmMicrokernelTester()
5365 .mr(6)
5366 .nr(8)
5367 .kr(1)
5368 .sr(1)
5369 .m(6)
5370 .n(n)
5371 .k(k)
5372 .a_stride(43)
5373 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5374 }
5375 }
5376 }
5377
5378 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_gt_8_subtile) {
5379 TEST_REQUIRES_ARM_NEON_FMA;
5380 for (uint32_t n = 9; n < 16; n++) {
5381 for (size_t k = 1; k <= 40; k += 9) {
5382 for (uint32_t m = 1; m <= 6; m++) {
5383 GemmMicrokernelTester()
5384 .mr(6)
5385 .nr(8)
5386 .kr(1)
5387 .sr(1)
5388 .m(m)
5389 .n(n)
5390 .k(k)
5391 .iterations(1)
5392 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5393 }
5394 }
5395 }
5396 }
5397
5398 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8) {
5399 TEST_REQUIRES_ARM_NEON_FMA;
5400 for (uint32_t n = 16; n <= 24; n += 8) {
5401 for (size_t k = 1; k <= 40; k += 9) {
5402 GemmMicrokernelTester()
5403 .mr(6)
5404 .nr(8)
5405 .kr(1)
5406 .sr(1)
5407 .m(6)
5408 .n(8)
5409 .k(k)
5410 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5411 }
5412 }
5413 }
5414
5415 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_cn) {
5416 TEST_REQUIRES_ARM_NEON_FMA;
5417 for (uint32_t n = 16; n <= 24; n += 8) {
5418 for (size_t k = 1; k <= 40; k += 9) {
5419 GemmMicrokernelTester()
5420 .mr(6)
5421 .nr(8)
5422 .kr(1)
5423 .sr(1)
5424 .m(6)
5425 .n(n)
5426 .k(k)
5427 .cn_stride(11)
5428 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5429 }
5430 }
5431 }
5432
5433 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_strided_a) {
5434 TEST_REQUIRES_ARM_NEON_FMA;
5435 for (uint32_t n = 16; n <= 24; n += 8) {
5436 for (size_t k = 1; k <= 40; k += 9) {
5437 GemmMicrokernelTester()
5438 .mr(6)
5439 .nr(8)
5440 .kr(1)
5441 .sr(1)
5442 .m(6)
5443 .n(n)
5444 .k(k)
5445 .a_stride(43)
5446 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5447 }
5448 }
5449 }
5450
5451 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, n_div_8_subtile) {
5452 TEST_REQUIRES_ARM_NEON_FMA;
5453 for (uint32_t n = 16; n <= 24; n += 8) {
5454 for (size_t k = 1; k <= 40; k += 9) {
5455 for (uint32_t m = 1; m <= 6; m++) {
5456 GemmMicrokernelTester()
5457 .mr(6)
5458 .nr(8)
5459 .kr(1)
5460 .sr(1)
5461 .m(m)
5462 .n(n)
5463 .k(k)
5464 .iterations(1)
5465 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5466 }
5467 }
5468 }
5469 }
5470
5471 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm_subtile) {
5472 TEST_REQUIRES_ARM_NEON_FMA;
5473 for (size_t k = 1; k <= 40; k += 9) {
5474 for (uint32_t m = 1; m <= 6; m++) {
5475 for (uint32_t n = 1; n <= 8; n++) {
5476 GemmMicrokernelTester()
5477 .mr(6)
5478 .nr(8)
5479 .kr(1)
5480 .sr(1)
5481 .m(m)
5482 .n(n)
5483 .k(k)
5484 .cm_stride(11)
5485 .iterations(1)
5486 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5487 }
5488 }
5489 }
5490 }
5491
5492 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmin) {
5493 TEST_REQUIRES_ARM_NEON_FMA;
5494 GemmMicrokernelTester()
5495 .mr(6)
5496 .nr(8)
5497 .kr(1)
5498 .sr(1)
5499 .m(6)
5500 .n(8)
5501 .k(8)
5502 .qmin(128)
5503 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5504 }
5505
5506 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, qmax) {
5507 TEST_REQUIRES_ARM_NEON_FMA;
5508 GemmMicrokernelTester()
5509 .mr(6)
5510 .nr(8)
5511 .kr(1)
5512 .sr(1)
5513 .m(6)
5514 .n(8)
5515 .k(8)
5516 .qmax(128)
5517 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5518 }
5519
5520 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A57, strided_cm) {
5521 TEST_REQUIRES_ARM_NEON_FMA;
5522 GemmMicrokernelTester()
5523 .mr(6)
5524 .nr(8)
5525 .kr(1)
5526 .sr(1)
5527 .m(6)
5528 .n(8)
5529 .k(8)
5530 .cm_stride(11)
5531 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a57);
5532 }
5533#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
5534
5535
5536#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07005537 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8) {
5538 TEST_REQUIRES_ARM_NEON_FMA;
5539 GemmMicrokernelTester()
5540 .mr(6)
5541 .nr(8)
5542 .kr(1)
5543 .sr(1)
5544 .m(6)
5545 .n(8)
5546 .k(8)
5547 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5548 }
5549
5550 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cn) {
5551 TEST_REQUIRES_ARM_NEON_FMA;
5552 GemmMicrokernelTester()
5553 .mr(6)
5554 .nr(8)
5555 .kr(1)
5556 .sr(1)
5557 .m(6)
5558 .n(8)
5559 .k(8)
5560 .cn_stride(11)
5561 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5562 }
5563
5564 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_strided_a) {
5565 TEST_REQUIRES_ARM_NEON_FMA;
5566 GemmMicrokernelTester()
5567 .mr(6)
5568 .nr(8)
5569 .kr(1)
5570 .sr(1)
5571 .m(6)
5572 .n(8)
5573 .k(8)
5574 .a_stride(11)
5575 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5576 }
5577
5578 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile) {
5579 TEST_REQUIRES_ARM_NEON_FMA;
5580 for (uint32_t m = 1; m <= 6; m++) {
5581 for (uint32_t n = 1; n <= 8; n++) {
5582 GemmMicrokernelTester()
5583 .mr(6)
5584 .nr(8)
5585 .kr(1)
5586 .sr(1)
5587 .m(m)
5588 .n(n)
5589 .k(8)
5590 .iterations(1)
5591 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5592 }
5593 }
5594 }
5595
5596 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_m) {
5597 TEST_REQUIRES_ARM_NEON_FMA;
5598 for (uint32_t m = 1; m <= 6; m++) {
5599 GemmMicrokernelTester()
5600 .mr(6)
5601 .nr(8)
5602 .kr(1)
5603 .sr(1)
5604 .m(m)
5605 .n(8)
5606 .k(8)
5607 .iterations(1)
5608 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5609 }
5610 }
5611
5612 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_8_subtile_n) {
5613 TEST_REQUIRES_ARM_NEON_FMA;
5614 for (uint32_t n = 1; n <= 8; n++) {
5615 GemmMicrokernelTester()
5616 .mr(6)
5617 .nr(8)
5618 .kr(1)
5619 .sr(1)
5620 .m(6)
5621 .n(n)
5622 .k(8)
5623 .iterations(1)
5624 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5625 }
5626 }
5627
5628 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16) {
5629 TEST_REQUIRES_ARM_NEON_FMA;
5630 GemmMicrokernelTester()
5631 .mr(6)
5632 .nr(8)
5633 .kr(1)
5634 .sr(1)
5635 .m(6)
5636 .n(8)
5637 .k(16)
5638 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5639 }
5640
5641 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_strided_a) {
5642 TEST_REQUIRES_ARM_NEON_FMA;
5643 GemmMicrokernelTester()
5644 .mr(6)
5645 .nr(8)
5646 .kr(1)
5647 .sr(1)
5648 .m(6)
5649 .n(8)
5650 .k(16)
5651 .a_stride(19)
5652 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5653 }
5654
5655 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_eq_16_subtile) {
5656 TEST_REQUIRES_ARM_NEON_FMA;
5657 for (uint32_t m = 1; m <= 6; m++) {
5658 for (uint32_t n = 1; n <= 8; n++) {
5659 GemmMicrokernelTester()
5660 .mr(6)
5661 .nr(8)
5662 .kr(1)
5663 .sr(1)
5664 .m(m)
5665 .n(n)
5666 .k(16)
5667 .iterations(1)
5668 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5669 }
5670 }
5671 }
5672
5673 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16) {
5674 TEST_REQUIRES_ARM_NEON_FMA;
5675 for (size_t k = 1; k < 16; k++) {
5676 GemmMicrokernelTester()
5677 .mr(6)
5678 .nr(8)
5679 .kr(1)
5680 .sr(1)
5681 .m(6)
5682 .n(8)
5683 .k(k)
5684 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5685 }
5686 }
5687
5688 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_strided_a) {
5689 TEST_REQUIRES_ARM_NEON_FMA;
5690 for (size_t k = 1; k < 16; k++) {
5691 GemmMicrokernelTester()
5692 .mr(6)
5693 .nr(8)
5694 .kr(1)
5695 .sr(1)
5696 .m(6)
5697 .n(8)
5698 .k(k)
5699 .a_stride(19)
5700 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5701 }
5702 }
5703
5704 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_lt_16_subtile) {
5705 TEST_REQUIRES_ARM_NEON_FMA;
5706 for (size_t k = 1; k < 16; k++) {
5707 for (uint32_t m = 1; m <= 6; m++) {
5708 for (uint32_t n = 1; n <= 8; n++) {
5709 GemmMicrokernelTester()
5710 .mr(6)
5711 .nr(8)
5712 .kr(1)
5713 .sr(1)
5714 .m(m)
5715 .n(n)
5716 .k(k)
5717 .iterations(1)
5718 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5719 }
5720 }
5721 }
5722 }
5723
5724 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_16) {
5725 TEST_REQUIRES_ARM_NEON_FMA;
5726 for (size_t k = 17; k < 16; k++) {
5727 GemmMicrokernelTester()
5728 .mr(6)
5729 .nr(8)
5730 .kr(1)
5731 .sr(1)
5732 .m(6)
5733 .n(8)
5734 .k(k)
5735 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5736 }
5737 }
5738
5739 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_strided_a) {
5740 TEST_REQUIRES_ARM_NEON_FMA;
5741 for (size_t k = 17; k < 16; k++) {
5742 GemmMicrokernelTester()
5743 .mr(6)
5744 .nr(8)
5745 .kr(1)
5746 .sr(1)
5747 .m(6)
5748 .n(8)
5749 .k(k)
5750 .a_stride(19)
5751 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5752 }
5753 }
5754
5755 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_gt_8_subtile) {
5756 TEST_REQUIRES_ARM_NEON_FMA;
5757 for (size_t k = 17; k < 16; k++) {
5758 for (uint32_t m = 1; m <= 6; m++) {
5759 for (uint32_t n = 1; n <= 8; n++) {
5760 GemmMicrokernelTester()
5761 .mr(6)
5762 .nr(8)
5763 .kr(1)
5764 .sr(1)
5765 .m(m)
5766 .n(n)
5767 .k(k)
5768 .iterations(1)
5769 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5770 }
5771 }
5772 }
5773 }
5774
5775 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8) {
5776 TEST_REQUIRES_ARM_NEON_FMA;
5777 for (size_t k = 24; k <= 80; k += 8) {
5778 GemmMicrokernelTester()
5779 .mr(6)
5780 .nr(8)
5781 .kr(1)
5782 .sr(1)
5783 .m(6)
5784 .n(8)
5785 .k(k)
5786 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5787 }
5788 }
5789
5790 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_strided_a) {
5791 TEST_REQUIRES_ARM_NEON_FMA;
5792 for (size_t k = 24; k <= 80; k += 8) {
5793 GemmMicrokernelTester()
5794 .mr(6)
5795 .nr(8)
5796 .kr(1)
5797 .sr(1)
5798 .m(6)
5799 .n(8)
5800 .k(k)
5801 .a_stride(83)
5802 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5803 }
5804 }
5805
5806 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, k_div_8_subtile) {
5807 TEST_REQUIRES_ARM_NEON_FMA;
5808 for (size_t k = 24; k <= 80; k += 8) {
5809 for (uint32_t m = 1; m <= 6; m++) {
5810 for (uint32_t n = 1; n <= 8; n++) {
5811 GemmMicrokernelTester()
5812 .mr(6)
5813 .nr(8)
5814 .kr(1)
5815 .sr(1)
5816 .m(m)
5817 .n(n)
5818 .k(k)
5819 .iterations(1)
5820 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5821 }
5822 }
5823 }
5824 }
5825
5826 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8) {
5827 TEST_REQUIRES_ARM_NEON_FMA;
5828 for (uint32_t n = 9; n < 16; n++) {
5829 for (size_t k = 1; k <= 40; k += 9) {
5830 GemmMicrokernelTester()
5831 .mr(6)
5832 .nr(8)
5833 .kr(1)
5834 .sr(1)
5835 .m(6)
5836 .n(8)
5837 .k(k)
5838 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5839 }
5840 }
5841 }
5842
5843 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_cn) {
5844 TEST_REQUIRES_ARM_NEON_FMA;
5845 for (uint32_t n = 9; n < 16; n++) {
5846 for (size_t k = 1; k <= 40; k += 9) {
5847 GemmMicrokernelTester()
5848 .mr(6)
5849 .nr(8)
5850 .kr(1)
5851 .sr(1)
5852 .m(6)
5853 .n(8)
5854 .k(k)
5855 .cn_stride(11)
5856 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5857 }
5858 }
5859 }
5860
5861 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_strided_a) {
5862 TEST_REQUIRES_ARM_NEON_FMA;
5863 for (uint32_t n = 9; n < 16; n++) {
5864 for (size_t k = 1; k <= 40; k += 9) {
5865 GemmMicrokernelTester()
5866 .mr(6)
5867 .nr(8)
5868 .kr(1)
5869 .sr(1)
5870 .m(6)
5871 .n(n)
5872 .k(k)
5873 .a_stride(43)
5874 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5875 }
5876 }
5877 }
5878
5879 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_gt_8_subtile) {
5880 TEST_REQUIRES_ARM_NEON_FMA;
5881 for (uint32_t n = 9; n < 16; n++) {
5882 for (size_t k = 1; k <= 40; k += 9) {
5883 for (uint32_t m = 1; m <= 6; m++) {
5884 GemmMicrokernelTester()
5885 .mr(6)
5886 .nr(8)
5887 .kr(1)
5888 .sr(1)
5889 .m(m)
5890 .n(n)
5891 .k(k)
5892 .iterations(1)
5893 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5894 }
5895 }
5896 }
5897 }
5898
5899 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8) {
5900 TEST_REQUIRES_ARM_NEON_FMA;
5901 for (uint32_t n = 16; n <= 24; n += 8) {
5902 for (size_t k = 1; k <= 40; k += 9) {
5903 GemmMicrokernelTester()
5904 .mr(6)
5905 .nr(8)
5906 .kr(1)
5907 .sr(1)
5908 .m(6)
5909 .n(8)
5910 .k(k)
5911 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5912 }
5913 }
5914 }
5915
5916 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_cn) {
5917 TEST_REQUIRES_ARM_NEON_FMA;
5918 for (uint32_t n = 16; n <= 24; n += 8) {
5919 for (size_t k = 1; k <= 40; k += 9) {
5920 GemmMicrokernelTester()
5921 .mr(6)
5922 .nr(8)
5923 .kr(1)
5924 .sr(1)
5925 .m(6)
5926 .n(n)
5927 .k(k)
5928 .cn_stride(11)
5929 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5930 }
5931 }
5932 }
5933
5934 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_strided_a) {
5935 TEST_REQUIRES_ARM_NEON_FMA;
5936 for (uint32_t n = 16; n <= 24; n += 8) {
5937 for (size_t k = 1; k <= 40; k += 9) {
5938 GemmMicrokernelTester()
5939 .mr(6)
5940 .nr(8)
5941 .kr(1)
5942 .sr(1)
5943 .m(6)
5944 .n(n)
5945 .k(k)
5946 .a_stride(43)
5947 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5948 }
5949 }
5950 }
5951
5952 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, n_div_8_subtile) {
5953 TEST_REQUIRES_ARM_NEON_FMA;
5954 for (uint32_t n = 16; n <= 24; n += 8) {
5955 for (size_t k = 1; k <= 40; k += 9) {
5956 for (uint32_t m = 1; m <= 6; m++) {
5957 GemmMicrokernelTester()
5958 .mr(6)
5959 .nr(8)
5960 .kr(1)
5961 .sr(1)
5962 .m(m)
5963 .n(n)
5964 .k(k)
5965 .iterations(1)
5966 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5967 }
5968 }
5969 }
5970 }
5971
5972 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm_subtile) {
5973 TEST_REQUIRES_ARM_NEON_FMA;
5974 for (size_t k = 1; k <= 40; k += 9) {
5975 for (uint32_t m = 1; m <= 6; m++) {
5976 for (uint32_t n = 1; n <= 8; n++) {
5977 GemmMicrokernelTester()
5978 .mr(6)
5979 .nr(8)
5980 .kr(1)
5981 .sr(1)
5982 .m(m)
5983 .n(n)
5984 .k(k)
5985 .cm_stride(11)
5986 .iterations(1)
5987 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
5988 }
5989 }
5990 }
5991 }
5992
5993 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmin) {
5994 TEST_REQUIRES_ARM_NEON_FMA;
5995 GemmMicrokernelTester()
5996 .mr(6)
5997 .nr(8)
5998 .kr(1)
5999 .sr(1)
6000 .m(6)
6001 .n(8)
6002 .k(8)
6003 .qmin(128)
6004 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6005 }
6006
6007 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, qmax) {
6008 TEST_REQUIRES_ARM_NEON_FMA;
6009 GemmMicrokernelTester()
6010 .mr(6)
6011 .nr(8)
6012 .kr(1)
6013 .sr(1)
6014 .m(6)
6015 .n(8)
6016 .k(8)
6017 .qmax(128)
6018 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6019 }
6020
6021 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_CORTEX_A75, strided_cm) {
6022 TEST_REQUIRES_ARM_NEON_FMA;
6023 GemmMicrokernelTester()
6024 .mr(6)
6025 .nr(8)
6026 .kr(1)
6027 .sr(1)
6028 .m(6)
6029 .n(8)
6030 .k(8)
6031 .cm_stride(11)
6032 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_cortex_a75);
6033 }
Frank Barchard7e955972019-10-11 10:34:25 -07006034#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006035
6036
Frank Barchard7e955972019-10-11 10:34:25 -07006037#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006038 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
6039 TEST_REQUIRES_ARM_NEON_FMA;
6040 GemmMicrokernelTester()
6041 .mr(1)
6042 .nr(12)
6043 .kr(1)
6044 .sr(1)
6045 .m(1)
6046 .n(12)
6047 .k(4)
6048 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6049 }
6050
6051 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
6052 TEST_REQUIRES_ARM_NEON_FMA;
6053 GemmMicrokernelTester()
6054 .mr(1)
6055 .nr(12)
6056 .kr(1)
6057 .sr(1)
6058 .m(1)
6059 .n(12)
6060 .k(4)
6061 .cn_stride(17)
6062 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6063 }
6064
6065 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
6066 TEST_REQUIRES_ARM_NEON_FMA;
6067 GemmMicrokernelTester()
6068 .mr(1)
6069 .nr(12)
6070 .kr(1)
6071 .sr(1)
6072 .m(1)
6073 .n(12)
6074 .k(4)
6075 .a_stride(7)
6076 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6077 }
6078
6079 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
6080 TEST_REQUIRES_ARM_NEON_FMA;
6081 for (uint32_t m = 1; m <= 1; m++) {
6082 for (uint32_t n = 1; n <= 12; n++) {
6083 GemmMicrokernelTester()
6084 .mr(1)
6085 .nr(12)
6086 .kr(1)
6087 .sr(1)
6088 .m(m)
6089 .n(n)
6090 .k(4)
6091 .iterations(1)
6092 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6093 }
6094 }
6095 }
6096
6097 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
6098 TEST_REQUIRES_ARM_NEON_FMA;
6099 for (uint32_t m = 1; m <= 1; m++) {
6100 GemmMicrokernelTester()
6101 .mr(1)
6102 .nr(12)
6103 .kr(1)
6104 .sr(1)
6105 .m(m)
6106 .n(12)
6107 .k(4)
6108 .iterations(1)
6109 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6110 }
6111 }
6112
6113 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
6114 TEST_REQUIRES_ARM_NEON_FMA;
6115 for (uint32_t n = 1; n <= 12; n++) {
6116 GemmMicrokernelTester()
6117 .mr(1)
6118 .nr(12)
6119 .kr(1)
6120 .sr(1)
6121 .m(1)
6122 .n(n)
6123 .k(4)
6124 .iterations(1)
6125 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6126 }
6127 }
6128
6129 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
6130 TEST_REQUIRES_ARM_NEON_FMA;
6131 GemmMicrokernelTester()
6132 .mr(1)
6133 .nr(12)
6134 .kr(1)
6135 .sr(1)
6136 .m(1)
6137 .n(12)
6138 .k(8)
6139 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6140 }
6141
6142 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
6143 TEST_REQUIRES_ARM_NEON_FMA;
6144 GemmMicrokernelTester()
6145 .mr(1)
6146 .nr(12)
6147 .kr(1)
6148 .sr(1)
6149 .m(1)
6150 .n(12)
6151 .k(8)
6152 .a_stride(11)
6153 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6154 }
6155
6156 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
6157 TEST_REQUIRES_ARM_NEON_FMA;
6158 for (uint32_t m = 1; m <= 1; m++) {
6159 for (uint32_t n = 1; n <= 12; n++) {
6160 GemmMicrokernelTester()
6161 .mr(1)
6162 .nr(12)
6163 .kr(1)
6164 .sr(1)
6165 .m(m)
6166 .n(n)
6167 .k(8)
6168 .iterations(1)
6169 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6170 }
6171 }
6172 }
6173
6174 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
6175 TEST_REQUIRES_ARM_NEON_FMA;
6176 for (size_t k = 1; k < 8; k++) {
6177 GemmMicrokernelTester()
6178 .mr(1)
6179 .nr(12)
6180 .kr(1)
6181 .sr(1)
6182 .m(1)
6183 .n(12)
6184 .k(k)
6185 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6186 }
6187 }
6188
6189 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
6190 TEST_REQUIRES_ARM_NEON_FMA;
6191 for (size_t k = 1; k < 8; k++) {
6192 GemmMicrokernelTester()
6193 .mr(1)
6194 .nr(12)
6195 .kr(1)
6196 .sr(1)
6197 .m(1)
6198 .n(12)
6199 .k(k)
6200 .a_stride(11)
6201 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6202 }
6203 }
6204
6205 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
6206 TEST_REQUIRES_ARM_NEON_FMA;
6207 for (size_t k = 1; k < 8; k++) {
6208 for (uint32_t m = 1; m <= 1; m++) {
6209 for (uint32_t n = 1; n <= 12; n++) {
6210 GemmMicrokernelTester()
6211 .mr(1)
6212 .nr(12)
6213 .kr(1)
6214 .sr(1)
6215 .m(m)
6216 .n(n)
6217 .k(k)
6218 .iterations(1)
6219 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6220 }
6221 }
6222 }
6223 }
6224
6225 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
6226 TEST_REQUIRES_ARM_NEON_FMA;
6227 for (size_t k = 9; k < 8; k++) {
6228 GemmMicrokernelTester()
6229 .mr(1)
6230 .nr(12)
6231 .kr(1)
6232 .sr(1)
6233 .m(1)
6234 .n(12)
6235 .k(k)
6236 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6237 }
6238 }
6239
6240 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
6241 TEST_REQUIRES_ARM_NEON_FMA;
6242 for (size_t k = 9; k < 8; k++) {
6243 GemmMicrokernelTester()
6244 .mr(1)
6245 .nr(12)
6246 .kr(1)
6247 .sr(1)
6248 .m(1)
6249 .n(12)
6250 .k(k)
6251 .a_stride(11)
6252 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6253 }
6254 }
6255
6256 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
6257 TEST_REQUIRES_ARM_NEON_FMA;
6258 for (size_t k = 9; k < 8; k++) {
6259 for (uint32_t m = 1; m <= 1; m++) {
6260 for (uint32_t n = 1; n <= 12; n++) {
6261 GemmMicrokernelTester()
6262 .mr(1)
6263 .nr(12)
6264 .kr(1)
6265 .sr(1)
6266 .m(m)
6267 .n(n)
6268 .k(k)
6269 .iterations(1)
6270 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6271 }
6272 }
6273 }
6274 }
6275
6276 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
6277 TEST_REQUIRES_ARM_NEON_FMA;
6278 for (size_t k = 12; k <= 40; k += 4) {
6279 GemmMicrokernelTester()
6280 .mr(1)
6281 .nr(12)
6282 .kr(1)
6283 .sr(1)
6284 .m(1)
6285 .n(12)
6286 .k(k)
6287 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6288 }
6289 }
6290
6291 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
6292 TEST_REQUIRES_ARM_NEON_FMA;
6293 for (size_t k = 12; k <= 40; k += 4) {
6294 GemmMicrokernelTester()
6295 .mr(1)
6296 .nr(12)
6297 .kr(1)
6298 .sr(1)
6299 .m(1)
6300 .n(12)
6301 .k(k)
6302 .a_stride(43)
6303 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6304 }
6305 }
6306
6307 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
6308 TEST_REQUIRES_ARM_NEON_FMA;
6309 for (size_t k = 12; k <= 40; k += 4) {
6310 for (uint32_t m = 1; m <= 1; m++) {
6311 for (uint32_t n = 1; n <= 12; n++) {
6312 GemmMicrokernelTester()
6313 .mr(1)
6314 .nr(12)
6315 .kr(1)
6316 .sr(1)
6317 .m(m)
6318 .n(n)
6319 .k(k)
6320 .iterations(1)
6321 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6322 }
6323 }
6324 }
6325 }
6326
6327 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
6328 TEST_REQUIRES_ARM_NEON_FMA;
6329 for (uint32_t n = 13; n < 24; n++) {
6330 for (size_t k = 1; k <= 20; k += 5) {
6331 GemmMicrokernelTester()
6332 .mr(1)
6333 .nr(12)
6334 .kr(1)
6335 .sr(1)
6336 .m(1)
6337 .n(12)
6338 .k(k)
6339 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6340 }
6341 }
6342 }
6343
6344 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
6345 TEST_REQUIRES_ARM_NEON_FMA;
6346 for (uint32_t n = 13; n < 24; n++) {
6347 for (size_t k = 1; k <= 20; k += 5) {
6348 GemmMicrokernelTester()
6349 .mr(1)
6350 .nr(12)
6351 .kr(1)
6352 .sr(1)
6353 .m(1)
6354 .n(12)
6355 .k(k)
6356 .cn_stride(17)
6357 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6358 }
6359 }
6360 }
6361
6362 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
6363 TEST_REQUIRES_ARM_NEON_FMA;
6364 for (uint32_t n = 13; n < 24; n++) {
6365 for (size_t k = 1; k <= 20; k += 5) {
6366 GemmMicrokernelTester()
6367 .mr(1)
6368 .nr(12)
6369 .kr(1)
6370 .sr(1)
6371 .m(1)
6372 .n(n)
6373 .k(k)
6374 .a_stride(23)
6375 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6376 }
6377 }
6378 }
6379
6380 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
6381 TEST_REQUIRES_ARM_NEON_FMA;
6382 for (uint32_t n = 13; n < 24; n++) {
6383 for (size_t k = 1; k <= 20; k += 5) {
6384 for (uint32_t m = 1; m <= 1; m++) {
6385 GemmMicrokernelTester()
6386 .mr(1)
6387 .nr(12)
6388 .kr(1)
6389 .sr(1)
6390 .m(m)
6391 .n(n)
6392 .k(k)
6393 .iterations(1)
6394 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6395 }
6396 }
6397 }
6398 }
6399
6400 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
6401 TEST_REQUIRES_ARM_NEON_FMA;
6402 for (uint32_t n = 24; n <= 36; n += 12) {
6403 for (size_t k = 1; k <= 20; k += 5) {
6404 GemmMicrokernelTester()
6405 .mr(1)
6406 .nr(12)
6407 .kr(1)
6408 .sr(1)
6409 .m(1)
6410 .n(12)
6411 .k(k)
6412 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6413 }
6414 }
6415 }
6416
6417 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
6418 TEST_REQUIRES_ARM_NEON_FMA;
6419 for (uint32_t n = 24; n <= 36; n += 12) {
6420 for (size_t k = 1; k <= 20; k += 5) {
6421 GemmMicrokernelTester()
6422 .mr(1)
6423 .nr(12)
6424 .kr(1)
6425 .sr(1)
6426 .m(1)
6427 .n(n)
6428 .k(k)
6429 .cn_stride(17)
6430 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6431 }
6432 }
6433 }
6434
6435 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
6436 TEST_REQUIRES_ARM_NEON_FMA;
6437 for (uint32_t n = 24; n <= 36; n += 12) {
6438 for (size_t k = 1; k <= 20; k += 5) {
6439 GemmMicrokernelTester()
6440 .mr(1)
6441 .nr(12)
6442 .kr(1)
6443 .sr(1)
6444 .m(1)
6445 .n(n)
6446 .k(k)
6447 .a_stride(23)
6448 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6449 }
6450 }
6451 }
6452
6453 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
6454 TEST_REQUIRES_ARM_NEON_FMA;
6455 for (uint32_t n = 24; n <= 36; n += 12) {
6456 for (size_t k = 1; k <= 20; k += 5) {
6457 for (uint32_t m = 1; m <= 1; m++) {
6458 GemmMicrokernelTester()
6459 .mr(1)
6460 .nr(12)
6461 .kr(1)
6462 .sr(1)
6463 .m(m)
6464 .n(n)
6465 .k(k)
6466 .iterations(1)
6467 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6468 }
6469 }
6470 }
6471 }
6472
6473 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
6474 TEST_REQUIRES_ARM_NEON_FMA;
6475 for (size_t k = 1; k <= 20; k += 5) {
6476 for (uint32_t m = 1; m <= 1; m++) {
6477 for (uint32_t n = 1; n <= 12; n++) {
6478 GemmMicrokernelTester()
6479 .mr(1)
6480 .nr(12)
6481 .kr(1)
6482 .sr(1)
6483 .m(m)
6484 .n(n)
6485 .k(k)
6486 .cm_stride(17)
6487 .iterations(1)
6488 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6489 }
6490 }
6491 }
6492 }
6493
6494 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
6495 TEST_REQUIRES_ARM_NEON_FMA;
6496 GemmMicrokernelTester()
6497 .mr(1)
6498 .nr(12)
6499 .kr(1)
6500 .sr(1)
6501 .m(1)
6502 .n(12)
6503 .k(4)
6504 .qmin(128)
6505 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6506 }
6507
6508 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
6509 TEST_REQUIRES_ARM_NEON_FMA;
6510 GemmMicrokernelTester()
6511 .mr(1)
6512 .nr(12)
6513 .kr(1)
6514 .sr(1)
6515 .m(1)
6516 .n(12)
6517 .k(4)
6518 .qmax(128)
6519 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6520 }
6521
6522 TEST(F32_GEMM_1X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
6523 TEST_REQUIRES_ARM_NEON_FMA;
6524 GemmMicrokernelTester()
6525 .mr(1)
6526 .nr(12)
6527 .kr(1)
6528 .sr(1)
6529 .m(1)
6530 .n(12)
6531 .k(4)
6532 .cm_stride(17)
6533 .Test(xnn_f32_gemm_ukernel_1x12__aarch64_neonfma_cortex_a53);
6534 }
Frank Barchard7e955972019-10-11 10:34:25 -07006535#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006536
6537
Frank Barchard7e955972019-10-11 10:34:25 -07006538#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07006539 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4) {
6540 TEST_REQUIRES_ARM_NEON_FMA;
6541 GemmMicrokernelTester()
6542 .mr(4)
6543 .nr(12)
6544 .kr(1)
6545 .sr(1)
6546 .m(4)
6547 .n(12)
6548 .k(4)
6549 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6550 }
6551
6552 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cn) {
6553 TEST_REQUIRES_ARM_NEON_FMA;
6554 GemmMicrokernelTester()
6555 .mr(4)
6556 .nr(12)
6557 .kr(1)
6558 .sr(1)
6559 .m(4)
6560 .n(12)
6561 .k(4)
6562 .cn_stride(17)
6563 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6564 }
6565
6566 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_strided_a) {
6567 TEST_REQUIRES_ARM_NEON_FMA;
6568 GemmMicrokernelTester()
6569 .mr(4)
6570 .nr(12)
6571 .kr(1)
6572 .sr(1)
6573 .m(4)
6574 .n(12)
6575 .k(4)
6576 .a_stride(7)
6577 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6578 }
6579
6580 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile) {
6581 TEST_REQUIRES_ARM_NEON_FMA;
6582 for (uint32_t m = 1; m <= 4; m++) {
6583 for (uint32_t n = 1; n <= 12; n++) {
6584 GemmMicrokernelTester()
6585 .mr(4)
6586 .nr(12)
6587 .kr(1)
6588 .sr(1)
6589 .m(m)
6590 .n(n)
6591 .k(4)
6592 .iterations(1)
6593 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6594 }
6595 }
6596 }
6597
6598 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_m) {
6599 TEST_REQUIRES_ARM_NEON_FMA;
6600 for (uint32_t m = 1; m <= 4; m++) {
6601 GemmMicrokernelTester()
6602 .mr(4)
6603 .nr(12)
6604 .kr(1)
6605 .sr(1)
6606 .m(m)
6607 .n(12)
6608 .k(4)
6609 .iterations(1)
6610 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6611 }
6612 }
6613
6614 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_4_subtile_n) {
6615 TEST_REQUIRES_ARM_NEON_FMA;
6616 for (uint32_t n = 1; n <= 12; n++) {
6617 GemmMicrokernelTester()
6618 .mr(4)
6619 .nr(12)
6620 .kr(1)
6621 .sr(1)
6622 .m(4)
6623 .n(n)
6624 .k(4)
6625 .iterations(1)
6626 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6627 }
6628 }
6629
6630 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8) {
6631 TEST_REQUIRES_ARM_NEON_FMA;
6632 GemmMicrokernelTester()
6633 .mr(4)
6634 .nr(12)
6635 .kr(1)
6636 .sr(1)
6637 .m(4)
6638 .n(12)
6639 .k(8)
6640 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6641 }
6642
6643 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_strided_a) {
6644 TEST_REQUIRES_ARM_NEON_FMA;
6645 GemmMicrokernelTester()
6646 .mr(4)
6647 .nr(12)
6648 .kr(1)
6649 .sr(1)
6650 .m(4)
6651 .n(12)
6652 .k(8)
6653 .a_stride(11)
6654 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6655 }
6656
6657 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_eq_8_subtile) {
6658 TEST_REQUIRES_ARM_NEON_FMA;
6659 for (uint32_t m = 1; m <= 4; m++) {
6660 for (uint32_t n = 1; n <= 12; n++) {
6661 GemmMicrokernelTester()
6662 .mr(4)
6663 .nr(12)
6664 .kr(1)
6665 .sr(1)
6666 .m(m)
6667 .n(n)
6668 .k(8)
6669 .iterations(1)
6670 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6671 }
6672 }
6673 }
6674
6675 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8) {
6676 TEST_REQUIRES_ARM_NEON_FMA;
6677 for (size_t k = 1; k < 8; k++) {
6678 GemmMicrokernelTester()
6679 .mr(4)
6680 .nr(12)
6681 .kr(1)
6682 .sr(1)
6683 .m(4)
6684 .n(12)
6685 .k(k)
6686 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6687 }
6688 }
6689
6690 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_strided_a) {
6691 TEST_REQUIRES_ARM_NEON_FMA;
6692 for (size_t k = 1; k < 8; k++) {
6693 GemmMicrokernelTester()
6694 .mr(4)
6695 .nr(12)
6696 .kr(1)
6697 .sr(1)
6698 .m(4)
6699 .n(12)
6700 .k(k)
6701 .a_stride(11)
6702 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6703 }
6704 }
6705
6706 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_lt_8_subtile) {
6707 TEST_REQUIRES_ARM_NEON_FMA;
6708 for (size_t k = 1; k < 8; k++) {
6709 for (uint32_t m = 1; m <= 4; m++) {
6710 for (uint32_t n = 1; n <= 12; n++) {
6711 GemmMicrokernelTester()
6712 .mr(4)
6713 .nr(12)
6714 .kr(1)
6715 .sr(1)
6716 .m(m)
6717 .n(n)
6718 .k(k)
6719 .iterations(1)
6720 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6721 }
6722 }
6723 }
6724 }
6725
6726 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_8) {
6727 TEST_REQUIRES_ARM_NEON_FMA;
6728 for (size_t k = 9; k < 8; k++) {
6729 GemmMicrokernelTester()
6730 .mr(4)
6731 .nr(12)
6732 .kr(1)
6733 .sr(1)
6734 .m(4)
6735 .n(12)
6736 .k(k)
6737 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6738 }
6739 }
6740
6741 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_strided_a) {
6742 TEST_REQUIRES_ARM_NEON_FMA;
6743 for (size_t k = 9; k < 8; k++) {
6744 GemmMicrokernelTester()
6745 .mr(4)
6746 .nr(12)
6747 .kr(1)
6748 .sr(1)
6749 .m(4)
6750 .n(12)
6751 .k(k)
6752 .a_stride(11)
6753 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6754 }
6755 }
6756
6757 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_gt_4_subtile) {
6758 TEST_REQUIRES_ARM_NEON_FMA;
6759 for (size_t k = 9; k < 8; k++) {
6760 for (uint32_t m = 1; m <= 4; m++) {
6761 for (uint32_t n = 1; n <= 12; n++) {
6762 GemmMicrokernelTester()
6763 .mr(4)
6764 .nr(12)
6765 .kr(1)
6766 .sr(1)
6767 .m(m)
6768 .n(n)
6769 .k(k)
6770 .iterations(1)
6771 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6772 }
6773 }
6774 }
6775 }
6776
6777 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4) {
6778 TEST_REQUIRES_ARM_NEON_FMA;
6779 for (size_t k = 12; k <= 40; k += 4) {
6780 GemmMicrokernelTester()
6781 .mr(4)
6782 .nr(12)
6783 .kr(1)
6784 .sr(1)
6785 .m(4)
6786 .n(12)
6787 .k(k)
6788 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6789 }
6790 }
6791
6792 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_strided_a) {
6793 TEST_REQUIRES_ARM_NEON_FMA;
6794 for (size_t k = 12; k <= 40; k += 4) {
6795 GemmMicrokernelTester()
6796 .mr(4)
6797 .nr(12)
6798 .kr(1)
6799 .sr(1)
6800 .m(4)
6801 .n(12)
6802 .k(k)
6803 .a_stride(43)
6804 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6805 }
6806 }
6807
6808 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, k_div_4_subtile) {
6809 TEST_REQUIRES_ARM_NEON_FMA;
6810 for (size_t k = 12; k <= 40; k += 4) {
6811 for (uint32_t m = 1; m <= 4; m++) {
6812 for (uint32_t n = 1; n <= 12; n++) {
6813 GemmMicrokernelTester()
6814 .mr(4)
6815 .nr(12)
6816 .kr(1)
6817 .sr(1)
6818 .m(m)
6819 .n(n)
6820 .k(k)
6821 .iterations(1)
6822 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6823 }
6824 }
6825 }
6826 }
6827
6828 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12) {
6829 TEST_REQUIRES_ARM_NEON_FMA;
6830 for (uint32_t n = 13; n < 24; n++) {
6831 for (size_t k = 1; k <= 20; k += 5) {
6832 GemmMicrokernelTester()
6833 .mr(4)
6834 .nr(12)
6835 .kr(1)
6836 .sr(1)
6837 .m(4)
6838 .n(12)
6839 .k(k)
6840 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6841 }
6842 }
6843 }
6844
6845 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_cn) {
6846 TEST_REQUIRES_ARM_NEON_FMA;
6847 for (uint32_t n = 13; n < 24; n++) {
6848 for (size_t k = 1; k <= 20; k += 5) {
6849 GemmMicrokernelTester()
6850 .mr(4)
6851 .nr(12)
6852 .kr(1)
6853 .sr(1)
6854 .m(4)
6855 .n(12)
6856 .k(k)
6857 .cn_stride(17)
6858 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6859 }
6860 }
6861 }
6862
6863 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_strided_a) {
6864 TEST_REQUIRES_ARM_NEON_FMA;
6865 for (uint32_t n = 13; n < 24; n++) {
6866 for (size_t k = 1; k <= 20; k += 5) {
6867 GemmMicrokernelTester()
6868 .mr(4)
6869 .nr(12)
6870 .kr(1)
6871 .sr(1)
6872 .m(4)
6873 .n(n)
6874 .k(k)
6875 .a_stride(23)
6876 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6877 }
6878 }
6879 }
6880
6881 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_gt_12_subtile) {
6882 TEST_REQUIRES_ARM_NEON_FMA;
6883 for (uint32_t n = 13; n < 24; n++) {
6884 for (size_t k = 1; k <= 20; k += 5) {
6885 for (uint32_t m = 1; m <= 4; m++) {
6886 GemmMicrokernelTester()
6887 .mr(4)
6888 .nr(12)
6889 .kr(1)
6890 .sr(1)
6891 .m(m)
6892 .n(n)
6893 .k(k)
6894 .iterations(1)
6895 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6896 }
6897 }
6898 }
6899 }
6900
6901 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12) {
6902 TEST_REQUIRES_ARM_NEON_FMA;
6903 for (uint32_t n = 24; n <= 36; n += 12) {
6904 for (size_t k = 1; k <= 20; k += 5) {
6905 GemmMicrokernelTester()
6906 .mr(4)
6907 .nr(12)
6908 .kr(1)
6909 .sr(1)
6910 .m(4)
6911 .n(12)
6912 .k(k)
6913 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6914 }
6915 }
6916 }
6917
6918 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_cn) {
6919 TEST_REQUIRES_ARM_NEON_FMA;
6920 for (uint32_t n = 24; n <= 36; n += 12) {
6921 for (size_t k = 1; k <= 20; k += 5) {
6922 GemmMicrokernelTester()
6923 .mr(4)
6924 .nr(12)
6925 .kr(1)
6926 .sr(1)
6927 .m(4)
6928 .n(n)
6929 .k(k)
6930 .cn_stride(17)
6931 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6932 }
6933 }
6934 }
6935
6936 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_strided_a) {
6937 TEST_REQUIRES_ARM_NEON_FMA;
6938 for (uint32_t n = 24; n <= 36; n += 12) {
6939 for (size_t k = 1; k <= 20; k += 5) {
6940 GemmMicrokernelTester()
6941 .mr(4)
6942 .nr(12)
6943 .kr(1)
6944 .sr(1)
6945 .m(4)
6946 .n(n)
6947 .k(k)
6948 .a_stride(23)
6949 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6950 }
6951 }
6952 }
6953
6954 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, n_div_12_subtile) {
6955 TEST_REQUIRES_ARM_NEON_FMA;
6956 for (uint32_t n = 24; n <= 36; n += 12) {
6957 for (size_t k = 1; k <= 20; k += 5) {
6958 for (uint32_t m = 1; m <= 4; m++) {
6959 GemmMicrokernelTester()
6960 .mr(4)
6961 .nr(12)
6962 .kr(1)
6963 .sr(1)
6964 .m(m)
6965 .n(n)
6966 .k(k)
6967 .iterations(1)
6968 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6969 }
6970 }
6971 }
6972 }
6973
6974 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm_subtile) {
6975 TEST_REQUIRES_ARM_NEON_FMA;
6976 for (size_t k = 1; k <= 20; k += 5) {
6977 for (uint32_t m = 1; m <= 4; m++) {
6978 for (uint32_t n = 1; n <= 12; n++) {
6979 GemmMicrokernelTester()
6980 .mr(4)
6981 .nr(12)
6982 .kr(1)
6983 .sr(1)
6984 .m(m)
6985 .n(n)
6986 .k(k)
6987 .cm_stride(17)
6988 .iterations(1)
6989 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
6990 }
6991 }
6992 }
6993 }
6994
6995 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmin) {
6996 TEST_REQUIRES_ARM_NEON_FMA;
6997 GemmMicrokernelTester()
6998 .mr(4)
6999 .nr(12)
7000 .kr(1)
7001 .sr(1)
7002 .m(4)
7003 .n(12)
7004 .k(4)
7005 .qmin(128)
7006 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
7007 }
7008
7009 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, qmax) {
7010 TEST_REQUIRES_ARM_NEON_FMA;
7011 GemmMicrokernelTester()
7012 .mr(4)
7013 .nr(12)
7014 .kr(1)
7015 .sr(1)
7016 .m(4)
7017 .n(12)
7018 .k(4)
7019 .qmax(128)
7020 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
7021 }
7022
7023 TEST(F32_GEMM_4X12__AARCH64_NEONFMA_CORTEX_A53, strided_cm) {
7024 TEST_REQUIRES_ARM_NEON_FMA;
7025 GemmMicrokernelTester()
7026 .mr(4)
7027 .nr(12)
7028 .kr(1)
7029 .sr(1)
7030 .m(4)
7031 .n(12)
7032 .k(4)
7033 .cm_stride(17)
7034 .Test(xnn_f32_gemm_ukernel_4x12__aarch64_neonfma_cortex_a53);
7035 }
Frank Barchard7e955972019-10-11 10:34:25 -07007036#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007037
7038
Frank Barchard7e955972019-10-11 10:34:25 -07007039#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007040 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2) {
7041 TEST_REQUIRES_ARM_NEON_FMA;
7042 GemmMicrokernelTester()
7043 .mr(4)
7044 .nr(8)
7045 .kr(1)
7046 .sr(1)
7047 .m(4)
7048 .n(8)
7049 .k(2)
7050 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7051 }
7052
7053 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, strided_cn) {
7054 TEST_REQUIRES_ARM_NEON_FMA;
7055 GemmMicrokernelTester()
7056 .mr(4)
7057 .nr(8)
7058 .kr(1)
7059 .sr(1)
7060 .m(4)
7061 .n(8)
7062 .k(2)
7063 .cn_stride(11)
7064 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7065 }
7066
7067 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
7068 TEST_REQUIRES_ARM_NEON_FMA;
7069 GemmMicrokernelTester()
7070 .mr(4)
7071 .nr(8)
7072 .kr(1)
7073 .sr(1)
7074 .m(4)
7075 .n(8)
7076 .k(2)
7077 .a_stride(5)
7078 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7079 }
7080
7081 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
7082 TEST_REQUIRES_ARM_NEON_FMA;
7083 for (uint32_t m = 1; m <= 4; m++) {
7084 for (uint32_t n = 1; n <= 8; n++) {
7085 GemmMicrokernelTester()
7086 .mr(4)
7087 .nr(8)
7088 .kr(1)
7089 .sr(1)
7090 .m(m)
7091 .n(n)
7092 .k(2)
7093 .iterations(1)
7094 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7095 }
7096 }
7097 }
7098
7099 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
7100 TEST_REQUIRES_ARM_NEON_FMA;
7101 for (uint32_t m = 1; m <= 4; m++) {
7102 GemmMicrokernelTester()
7103 .mr(4)
7104 .nr(8)
7105 .kr(1)
7106 .sr(1)
7107 .m(m)
7108 .n(8)
7109 .k(2)
7110 .iterations(1)
7111 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7112 }
7113 }
7114
7115 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
7116 TEST_REQUIRES_ARM_NEON_FMA;
7117 for (uint32_t n = 1; n <= 8; n++) {
7118 GemmMicrokernelTester()
7119 .mr(4)
7120 .nr(8)
7121 .kr(1)
7122 .sr(1)
7123 .m(4)
7124 .n(n)
7125 .k(2)
7126 .iterations(1)
7127 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7128 }
7129 }
7130
7131 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_lt_2) {
7132 TEST_REQUIRES_ARM_NEON_FMA;
7133 for (size_t k = 1; k < 2; k++) {
7134 GemmMicrokernelTester()
7135 .mr(4)
7136 .nr(8)
7137 .kr(1)
7138 .sr(1)
7139 .m(4)
7140 .n(8)
7141 .k(k)
7142 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7143 }
7144 }
7145
7146 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
7147 TEST_REQUIRES_ARM_NEON_FMA;
7148 for (size_t k = 1; k < 2; k++) {
7149 GemmMicrokernelTester()
7150 .mr(4)
7151 .nr(8)
7152 .kr(1)
7153 .sr(1)
7154 .m(4)
7155 .n(8)
7156 .k(k)
7157 .a_stride(5)
7158 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7159 }
7160 }
7161
7162 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
7163 TEST_REQUIRES_ARM_NEON_FMA;
7164 for (size_t k = 1; k < 2; k++) {
7165 for (uint32_t m = 1; m <= 4; m++) {
7166 for (uint32_t n = 1; n <= 8; n++) {
7167 GemmMicrokernelTester()
7168 .mr(4)
7169 .nr(8)
7170 .kr(1)
7171 .sr(1)
7172 .m(m)
7173 .n(n)
7174 .k(k)
7175 .iterations(1)
7176 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7177 }
7178 }
7179 }
7180 }
7181
7182 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_gt_2) {
7183 TEST_REQUIRES_ARM_NEON_FMA;
7184 for (size_t k = 3; k < 4; k++) {
7185 GemmMicrokernelTester()
7186 .mr(4)
7187 .nr(8)
7188 .kr(1)
7189 .sr(1)
7190 .m(4)
7191 .n(8)
7192 .k(k)
7193 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7194 }
7195 }
7196
7197 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
7198 TEST_REQUIRES_ARM_NEON_FMA;
7199 for (size_t k = 3; k < 4; k++) {
7200 GemmMicrokernelTester()
7201 .mr(4)
7202 .nr(8)
7203 .kr(1)
7204 .sr(1)
7205 .m(4)
7206 .n(8)
7207 .k(k)
7208 .a_stride(7)
7209 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7210 }
7211 }
7212
7213 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
7214 TEST_REQUIRES_ARM_NEON_FMA;
7215 for (size_t k = 3; k < 4; k++) {
7216 for (uint32_t m = 1; m <= 4; m++) {
7217 for (uint32_t n = 1; n <= 8; n++) {
7218 GemmMicrokernelTester()
7219 .mr(4)
7220 .nr(8)
7221 .kr(1)
7222 .sr(1)
7223 .m(m)
7224 .n(n)
7225 .k(k)
7226 .iterations(1)
7227 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7228 }
7229 }
7230 }
7231 }
7232
7233 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_div_2) {
7234 TEST_REQUIRES_ARM_NEON_FMA;
7235 for (size_t k = 4; k <= 20; k += 2) {
7236 GemmMicrokernelTester()
7237 .mr(4)
7238 .nr(8)
7239 .kr(1)
7240 .sr(1)
7241 .m(4)
7242 .n(8)
7243 .k(k)
7244 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7245 }
7246 }
7247
7248 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
7249 TEST_REQUIRES_ARM_NEON_FMA;
7250 for (size_t k = 4; k <= 20; k += 2) {
7251 GemmMicrokernelTester()
7252 .mr(4)
7253 .nr(8)
7254 .kr(1)
7255 .sr(1)
7256 .m(4)
7257 .n(8)
7258 .k(k)
7259 .a_stride(23)
7260 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7261 }
7262 }
7263
7264 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
7265 TEST_REQUIRES_ARM_NEON_FMA;
7266 for (size_t k = 4; k <= 20; k += 2) {
7267 for (uint32_t m = 1; m <= 4; m++) {
7268 for (uint32_t n = 1; n <= 8; n++) {
7269 GemmMicrokernelTester()
7270 .mr(4)
7271 .nr(8)
7272 .kr(1)
7273 .sr(1)
7274 .m(m)
7275 .n(n)
7276 .k(k)
7277 .iterations(1)
7278 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7279 }
7280 }
7281 }
7282 }
7283
7284 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8) {
7285 TEST_REQUIRES_ARM_NEON_FMA;
7286 for (uint32_t n = 9; n < 16; n++) {
7287 for (size_t k = 1; k <= 10; k += 3) {
7288 GemmMicrokernelTester()
7289 .mr(4)
7290 .nr(8)
7291 .kr(1)
7292 .sr(1)
7293 .m(4)
7294 .n(8)
7295 .k(k)
7296 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7297 }
7298 }
7299 }
7300
7301 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
7302 TEST_REQUIRES_ARM_NEON_FMA;
7303 for (uint32_t n = 9; n < 16; n++) {
7304 for (size_t k = 1; k <= 10; k += 3) {
7305 GemmMicrokernelTester()
7306 .mr(4)
7307 .nr(8)
7308 .kr(1)
7309 .sr(1)
7310 .m(4)
7311 .n(8)
7312 .k(k)
7313 .cn_stride(11)
7314 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7315 }
7316 }
7317 }
7318
7319 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
7320 TEST_REQUIRES_ARM_NEON_FMA;
7321 for (uint32_t n = 9; n < 16; n++) {
7322 for (size_t k = 1; k <= 10; k += 3) {
7323 GemmMicrokernelTester()
7324 .mr(4)
7325 .nr(8)
7326 .kr(1)
7327 .sr(1)
7328 .m(4)
7329 .n(n)
7330 .k(k)
7331 .a_stride(13)
7332 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7333 }
7334 }
7335 }
7336
7337 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
7338 TEST_REQUIRES_ARM_NEON_FMA;
7339 for (uint32_t n = 9; n < 16; n++) {
7340 for (size_t k = 1; k <= 10; k += 3) {
7341 for (uint32_t m = 1; m <= 4; m++) {
7342 GemmMicrokernelTester()
7343 .mr(4)
7344 .nr(8)
7345 .kr(1)
7346 .sr(1)
7347 .m(m)
7348 .n(n)
7349 .k(k)
7350 .iterations(1)
7351 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7352 }
7353 }
7354 }
7355 }
7356
7357 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8) {
7358 TEST_REQUIRES_ARM_NEON_FMA;
7359 for (uint32_t n = 16; n <= 24; n += 8) {
7360 for (size_t k = 1; k <= 10; k += 3) {
7361 GemmMicrokernelTester()
7362 .mr(4)
7363 .nr(8)
7364 .kr(1)
7365 .sr(1)
7366 .m(4)
7367 .n(8)
7368 .k(k)
7369 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7370 }
7371 }
7372 }
7373
7374 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
7375 TEST_REQUIRES_ARM_NEON_FMA;
7376 for (uint32_t n = 16; n <= 24; n += 8) {
7377 for (size_t k = 1; k <= 10; k += 3) {
7378 GemmMicrokernelTester()
7379 .mr(4)
7380 .nr(8)
7381 .kr(1)
7382 .sr(1)
7383 .m(4)
7384 .n(n)
7385 .k(k)
7386 .cn_stride(11)
7387 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7388 }
7389 }
7390 }
7391
7392 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
7393 TEST_REQUIRES_ARM_NEON_FMA;
7394 for (uint32_t n = 16; n <= 24; n += 8) {
7395 for (size_t k = 1; k <= 10; k += 3) {
7396 GemmMicrokernelTester()
7397 .mr(4)
7398 .nr(8)
7399 .kr(1)
7400 .sr(1)
7401 .m(4)
7402 .n(n)
7403 .k(k)
7404 .a_stride(13)
7405 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7406 }
7407 }
7408 }
7409
7410 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
7411 TEST_REQUIRES_ARM_NEON_FMA;
7412 for (uint32_t n = 16; n <= 24; n += 8) {
7413 for (size_t k = 1; k <= 10; k += 3) {
7414 for (uint32_t m = 1; m <= 4; m++) {
7415 GemmMicrokernelTester()
7416 .mr(4)
7417 .nr(8)
7418 .kr(1)
7419 .sr(1)
7420 .m(m)
7421 .n(n)
7422 .k(k)
7423 .iterations(1)
7424 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7425 }
7426 }
7427 }
7428 }
7429
7430 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
7431 TEST_REQUIRES_ARM_NEON_FMA;
7432 for (size_t k = 1; k <= 10; k += 3) {
7433 for (uint32_t m = 1; m <= 4; m++) {
7434 for (uint32_t n = 1; n <= 8; n++) {
7435 GemmMicrokernelTester()
7436 .mr(4)
7437 .nr(8)
7438 .kr(1)
7439 .sr(1)
7440 .m(m)
7441 .n(n)
7442 .k(k)
7443 .cm_stride(11)
7444 .iterations(1)
7445 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7446 }
7447 }
7448 }
7449 }
7450
7451 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, qmin) {
7452 TEST_REQUIRES_ARM_NEON_FMA;
7453 GemmMicrokernelTester()
7454 .mr(4)
7455 .nr(8)
7456 .kr(1)
7457 .sr(1)
7458 .m(4)
7459 .n(8)
7460 .k(2)
7461 .qmin(128)
7462 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7463 }
7464
7465 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, qmax) {
7466 TEST_REQUIRES_ARM_NEON_FMA;
7467 GemmMicrokernelTester()
7468 .mr(4)
7469 .nr(8)
7470 .kr(1)
7471 .sr(1)
7472 .m(4)
7473 .n(8)
7474 .k(2)
7475 .qmax(128)
7476 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7477 }
7478
7479 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD64, strided_cm) {
7480 TEST_REQUIRES_ARM_NEON_FMA;
7481 GemmMicrokernelTester()
7482 .mr(4)
7483 .nr(8)
7484 .kr(1)
7485 .sr(1)
7486 .m(4)
7487 .n(8)
7488 .k(2)
7489 .cm_stride(11)
7490 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld64);
7491 }
Frank Barchard7e955972019-10-11 10:34:25 -07007492#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07007493
7494
Frank Barchard8b0f0262019-11-27 23:18:40 -08007495#if XNN_ARCH_ARM
Frank Barchard13916042019-12-11 10:56:34 -08007496 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4) {
7497 TEST_REQUIRES_ARM_NEON;
7498 GemmMicrokernelTester()
7499 .mr(4)
7500 .nr(8)
7501 .kr(1)
7502 .sr(1)
7503 .m(4)
7504 .n(8)
7505 .k(4)
7506 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7507 }
7508
7509 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cn) {
7510 TEST_REQUIRES_ARM_NEON;
7511 GemmMicrokernelTester()
7512 .mr(4)
7513 .nr(8)
7514 .kr(1)
7515 .sr(1)
7516 .m(4)
7517 .n(8)
7518 .k(4)
7519 .cn_stride(11)
7520 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7521 }
7522
7523 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_strided_a) {
7524 TEST_REQUIRES_ARM_NEON;
7525 GemmMicrokernelTester()
7526 .mr(4)
7527 .nr(8)
7528 .kr(1)
7529 .sr(1)
7530 .m(4)
7531 .n(8)
7532 .k(4)
7533 .a_stride(7)
7534 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7535 }
7536
7537 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile) {
7538 TEST_REQUIRES_ARM_NEON;
7539 for (uint32_t m = 1; m <= 4; m++) {
7540 for (uint32_t n = 1; n <= 8; n++) {
7541 GemmMicrokernelTester()
7542 .mr(4)
7543 .nr(8)
7544 .kr(1)
7545 .sr(1)
7546 .m(m)
7547 .n(n)
7548 .k(4)
7549 .iterations(1)
7550 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7551 }
7552 }
7553 }
7554
7555 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile_m) {
7556 TEST_REQUIRES_ARM_NEON;
7557 for (uint32_t m = 1; m <= 4; m++) {
7558 GemmMicrokernelTester()
7559 .mr(4)
7560 .nr(8)
7561 .kr(1)
7562 .sr(1)
7563 .m(m)
7564 .n(8)
7565 .k(4)
7566 .iterations(1)
7567 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7568 }
7569 }
7570
7571 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_4_subtile_n) {
7572 TEST_REQUIRES_ARM_NEON;
7573 for (uint32_t n = 1; n <= 8; n++) {
7574 GemmMicrokernelTester()
7575 .mr(4)
7576 .nr(8)
7577 .kr(1)
7578 .sr(1)
7579 .m(4)
7580 .n(n)
7581 .k(4)
7582 .iterations(1)
7583 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7584 }
7585 }
7586
Frank Barchardca27b402020-02-03 17:47:32 -08007587 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8) {
Frank Barchard13916042019-12-11 10:56:34 -08007588 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007589 GemmMicrokernelTester()
7590 .mr(4)
7591 .nr(8)
7592 .kr(1)
7593 .sr(1)
7594 .m(4)
7595 .n(8)
7596 .k(8)
7597 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7598 }
7599
7600 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8_strided_a) {
7601 TEST_REQUIRES_ARM_NEON;
7602 GemmMicrokernelTester()
7603 .mr(4)
7604 .nr(8)
7605 .kr(1)
7606 .sr(1)
7607 .m(4)
7608 .n(8)
7609 .k(8)
7610 .a_stride(11)
7611 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7612 }
7613
7614 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_eq_8_subtile) {
7615 TEST_REQUIRES_ARM_NEON;
7616 for (uint32_t m = 1; m <= 4; m++) {
7617 for (uint32_t n = 1; n <= 8; n++) {
7618 GemmMicrokernelTester()
7619 .mr(4)
7620 .nr(8)
7621 .kr(1)
7622 .sr(1)
7623 .m(m)
7624 .n(n)
7625 .k(8)
7626 .iterations(1)
7627 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7628 }
7629 }
7630 }
7631
7632 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8) {
7633 TEST_REQUIRES_ARM_NEON;
7634 for (size_t k = 1; k < 8; k++) {
Frank Barchard13916042019-12-11 10:56:34 -08007635 GemmMicrokernelTester()
7636 .mr(4)
7637 .nr(8)
7638 .kr(1)
7639 .sr(1)
7640 .m(4)
7641 .n(8)
7642 .k(k)
7643 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7644 }
7645 }
7646
Frank Barchardca27b402020-02-03 17:47:32 -08007647 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8_strided_a) {
Frank Barchard13916042019-12-11 10:56:34 -08007648 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007649 for (size_t k = 1; k < 8; k++) {
Frank Barchard13916042019-12-11 10:56:34 -08007650 GemmMicrokernelTester()
7651 .mr(4)
7652 .nr(8)
7653 .kr(1)
7654 .sr(1)
7655 .m(4)
7656 .n(8)
7657 .k(k)
Frank Barchardca27b402020-02-03 17:47:32 -08007658 .a_stride(11)
Frank Barchard13916042019-12-11 10:56:34 -08007659 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7660 }
7661 }
7662
Frank Barchardca27b402020-02-03 17:47:32 -08007663 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_lt_8_subtile) {
Frank Barchard13916042019-12-11 10:56:34 -08007664 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007665 for (size_t k = 1; k < 8; k++) {
Frank Barchard13916042019-12-11 10:56:34 -08007666 for (uint32_t m = 1; m <= 4; m++) {
7667 for (uint32_t n = 1; n <= 8; n++) {
7668 GemmMicrokernelTester()
7669 .mr(4)
7670 .nr(8)
7671 .kr(1)
7672 .sr(1)
7673 .m(m)
7674 .n(n)
7675 .k(k)
7676 .iterations(1)
7677 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7678 }
7679 }
7680 }
7681 }
7682
Frank Barchardca27b402020-02-03 17:47:32 -08007683 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_8) {
Frank Barchard13916042019-12-11 10:56:34 -08007684 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007685 for (size_t k = 9; k < 8; k++) {
Frank Barchard13916042019-12-11 10:56:34 -08007686 GemmMicrokernelTester()
7687 .mr(4)
7688 .nr(8)
7689 .kr(1)
7690 .sr(1)
7691 .m(4)
7692 .n(8)
7693 .k(k)
7694 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7695 }
7696 }
7697
7698 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_4_strided_a) {
7699 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007700 for (size_t k = 9; k < 8; k++) {
Frank Barchard13916042019-12-11 10:56:34 -08007701 GemmMicrokernelTester()
7702 .mr(4)
7703 .nr(8)
7704 .kr(1)
7705 .sr(1)
7706 .m(4)
7707 .n(8)
7708 .k(k)
7709 .a_stride(11)
7710 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7711 }
7712 }
7713
7714 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_gt_4_subtile) {
7715 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007716 for (size_t k = 9; k < 8; k++) {
Frank Barchard13916042019-12-11 10:56:34 -08007717 for (uint32_t m = 1; m <= 4; m++) {
7718 for (uint32_t n = 1; n <= 8; n++) {
7719 GemmMicrokernelTester()
7720 .mr(4)
7721 .nr(8)
7722 .kr(1)
7723 .sr(1)
7724 .m(m)
7725 .n(n)
7726 .k(k)
7727 .iterations(1)
7728 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7729 }
7730 }
7731 }
7732 }
7733
7734 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4) {
7735 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007736 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard13916042019-12-11 10:56:34 -08007737 GemmMicrokernelTester()
7738 .mr(4)
7739 .nr(8)
7740 .kr(1)
7741 .sr(1)
7742 .m(4)
7743 .n(8)
7744 .k(k)
7745 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7746 }
7747 }
7748
7749 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4_strided_a) {
7750 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007751 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard13916042019-12-11 10:56:34 -08007752 GemmMicrokernelTester()
7753 .mr(4)
7754 .nr(8)
7755 .kr(1)
7756 .sr(1)
7757 .m(4)
7758 .n(8)
7759 .k(k)
7760 .a_stride(43)
7761 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7762 }
7763 }
7764
7765 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, k_div_4_subtile) {
7766 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08007767 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard13916042019-12-11 10:56:34 -08007768 for (uint32_t m = 1; m <= 4; m++) {
7769 for (uint32_t n = 1; n <= 8; n++) {
7770 GemmMicrokernelTester()
7771 .mr(4)
7772 .nr(8)
7773 .kr(1)
7774 .sr(1)
7775 .m(m)
7776 .n(n)
7777 .k(k)
7778 .iterations(1)
7779 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7780 }
7781 }
7782 }
7783 }
7784
7785 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8) {
7786 TEST_REQUIRES_ARM_NEON;
7787 for (uint32_t n = 9; n < 16; n++) {
7788 for (size_t k = 1; k <= 20; k += 5) {
7789 GemmMicrokernelTester()
7790 .mr(4)
7791 .nr(8)
7792 .kr(1)
7793 .sr(1)
7794 .m(4)
7795 .n(8)
7796 .k(k)
7797 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7798 }
7799 }
7800 }
7801
7802 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_strided_cn) {
7803 TEST_REQUIRES_ARM_NEON;
7804 for (uint32_t n = 9; n < 16; n++) {
7805 for (size_t k = 1; k <= 20; k += 5) {
7806 GemmMicrokernelTester()
7807 .mr(4)
7808 .nr(8)
7809 .kr(1)
7810 .sr(1)
7811 .m(4)
7812 .n(8)
7813 .k(k)
7814 .cn_stride(11)
7815 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7816 }
7817 }
7818 }
7819
7820 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_strided_a) {
7821 TEST_REQUIRES_ARM_NEON;
7822 for (uint32_t n = 9; n < 16; n++) {
7823 for (size_t k = 1; k <= 20; k += 5) {
7824 GemmMicrokernelTester()
7825 .mr(4)
7826 .nr(8)
7827 .kr(1)
7828 .sr(1)
7829 .m(4)
7830 .n(n)
7831 .k(k)
7832 .a_stride(23)
7833 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7834 }
7835 }
7836 }
7837
7838 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_gt_8_subtile) {
7839 TEST_REQUIRES_ARM_NEON;
7840 for (uint32_t n = 9; n < 16; n++) {
7841 for (size_t k = 1; k <= 20; k += 5) {
7842 for (uint32_t m = 1; m <= 4; m++) {
7843 GemmMicrokernelTester()
7844 .mr(4)
7845 .nr(8)
7846 .kr(1)
7847 .sr(1)
7848 .m(m)
7849 .n(n)
7850 .k(k)
7851 .iterations(1)
7852 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7853 }
7854 }
7855 }
7856 }
7857
7858 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8) {
7859 TEST_REQUIRES_ARM_NEON;
7860 for (uint32_t n = 16; n <= 24; n += 8) {
7861 for (size_t k = 1; k <= 20; k += 5) {
7862 GemmMicrokernelTester()
7863 .mr(4)
7864 .nr(8)
7865 .kr(1)
7866 .sr(1)
7867 .m(4)
7868 .n(8)
7869 .k(k)
7870 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7871 }
7872 }
7873 }
7874
7875 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_strided_cn) {
7876 TEST_REQUIRES_ARM_NEON;
7877 for (uint32_t n = 16; n <= 24; n += 8) {
7878 for (size_t k = 1; k <= 20; k += 5) {
7879 GemmMicrokernelTester()
7880 .mr(4)
7881 .nr(8)
7882 .kr(1)
7883 .sr(1)
7884 .m(4)
7885 .n(n)
7886 .k(k)
7887 .cn_stride(11)
7888 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7889 }
7890 }
7891 }
7892
7893 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_strided_a) {
7894 TEST_REQUIRES_ARM_NEON;
7895 for (uint32_t n = 16; n <= 24; n += 8) {
7896 for (size_t k = 1; k <= 20; k += 5) {
7897 GemmMicrokernelTester()
7898 .mr(4)
7899 .nr(8)
7900 .kr(1)
7901 .sr(1)
7902 .m(4)
7903 .n(n)
7904 .k(k)
7905 .a_stride(23)
7906 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7907 }
7908 }
7909 }
7910
7911 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, n_div_8_subtile) {
7912 TEST_REQUIRES_ARM_NEON;
7913 for (uint32_t n = 16; n <= 24; n += 8) {
7914 for (size_t k = 1; k <= 20; k += 5) {
7915 for (uint32_t m = 1; m <= 4; m++) {
7916 GemmMicrokernelTester()
7917 .mr(4)
7918 .nr(8)
7919 .kr(1)
7920 .sr(1)
7921 .m(m)
7922 .n(n)
7923 .k(k)
7924 .iterations(1)
7925 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7926 }
7927 }
7928 }
7929 }
7930
7931 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cm_subtile) {
7932 TEST_REQUIRES_ARM_NEON;
7933 for (size_t k = 1; k <= 20; k += 5) {
7934 for (uint32_t m = 1; m <= 4; m++) {
7935 for (uint32_t n = 1; n <= 8; n++) {
7936 GemmMicrokernelTester()
7937 .mr(4)
7938 .nr(8)
7939 .kr(1)
7940 .sr(1)
7941 .m(m)
7942 .n(n)
7943 .k(k)
7944 .cm_stride(11)
7945 .iterations(1)
7946 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7947 }
7948 }
7949 }
7950 }
7951
7952 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, qmin) {
7953 TEST_REQUIRES_ARM_NEON;
7954 GemmMicrokernelTester()
7955 .mr(4)
7956 .nr(8)
7957 .kr(1)
7958 .sr(1)
7959 .m(4)
7960 .n(8)
7961 .k(4)
7962 .qmin(128)
7963 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7964 }
7965
7966 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, qmax) {
7967 TEST_REQUIRES_ARM_NEON;
7968 GemmMicrokernelTester()
7969 .mr(4)
7970 .nr(8)
7971 .kr(1)
7972 .sr(1)
7973 .m(4)
7974 .n(8)
7975 .k(4)
7976 .qmax(128)
7977 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7978 }
7979
7980 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A53, strided_cm) {
7981 TEST_REQUIRES_ARM_NEON;
7982 GemmMicrokernelTester()
7983 .mr(4)
7984 .nr(8)
7985 .kr(1)
7986 .sr(1)
7987 .m(4)
7988 .n(8)
7989 .k(4)
7990 .cm_stride(11)
7991 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a53);
7992 }
7993#endif // XNN_ARCH_ARM
7994
7995
7996#if XNN_ARCH_ARM
Frank Barchard3e237f22019-12-04 23:08:51 -08007997 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4) {
7998 TEST_REQUIRES_ARM_NEON;
7999 GemmMicrokernelTester()
8000 .mr(4)
8001 .nr(8)
8002 .kr(1)
8003 .sr(1)
8004 .m(4)
8005 .n(8)
8006 .k(4)
8007 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8008 }
8009
8010 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cn) {
8011 TEST_REQUIRES_ARM_NEON;
8012 GemmMicrokernelTester()
8013 .mr(4)
8014 .nr(8)
8015 .kr(1)
8016 .sr(1)
8017 .m(4)
8018 .n(8)
8019 .k(4)
8020 .cn_stride(11)
8021 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8022 }
8023
8024 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_strided_a) {
8025 TEST_REQUIRES_ARM_NEON;
8026 GemmMicrokernelTester()
8027 .mr(4)
8028 .nr(8)
8029 .kr(1)
8030 .sr(1)
8031 .m(4)
8032 .n(8)
8033 .k(4)
8034 .a_stride(7)
8035 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8036 }
8037
8038 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile) {
8039 TEST_REQUIRES_ARM_NEON;
8040 for (uint32_t m = 1; m <= 4; m++) {
8041 for (uint32_t n = 1; n <= 8; n++) {
8042 GemmMicrokernelTester()
8043 .mr(4)
8044 .nr(8)
8045 .kr(1)
8046 .sr(1)
8047 .m(m)
8048 .n(n)
8049 .k(4)
8050 .iterations(1)
8051 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8052 }
8053 }
8054 }
8055
8056 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_m) {
8057 TEST_REQUIRES_ARM_NEON;
8058 for (uint32_t m = 1; m <= 4; m++) {
8059 GemmMicrokernelTester()
8060 .mr(4)
8061 .nr(8)
8062 .kr(1)
8063 .sr(1)
8064 .m(m)
8065 .n(8)
8066 .k(4)
8067 .iterations(1)
8068 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8069 }
8070 }
8071
8072 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_4_subtile_n) {
8073 TEST_REQUIRES_ARM_NEON;
8074 for (uint32_t n = 1; n <= 8; n++) {
8075 GemmMicrokernelTester()
8076 .mr(4)
8077 .nr(8)
8078 .kr(1)
8079 .sr(1)
8080 .m(4)
8081 .n(n)
8082 .k(4)
8083 .iterations(1)
8084 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8085 }
8086 }
8087
Frank Barchardca27b402020-02-03 17:47:32 -08008088 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008089 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008090 GemmMicrokernelTester()
8091 .mr(4)
8092 .nr(8)
8093 .kr(1)
8094 .sr(1)
8095 .m(4)
8096 .n(8)
8097 .k(8)
8098 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8099 }
8100
8101 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_strided_a) {
8102 TEST_REQUIRES_ARM_NEON;
8103 GemmMicrokernelTester()
8104 .mr(4)
8105 .nr(8)
8106 .kr(1)
8107 .sr(1)
8108 .m(4)
8109 .n(8)
8110 .k(8)
8111 .a_stride(11)
8112 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8113 }
8114
8115 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_eq_8_subtile) {
8116 TEST_REQUIRES_ARM_NEON;
8117 for (uint32_t m = 1; m <= 4; m++) {
8118 for (uint32_t n = 1; n <= 8; n++) {
8119 GemmMicrokernelTester()
8120 .mr(4)
8121 .nr(8)
8122 .kr(1)
8123 .sr(1)
8124 .m(m)
8125 .n(n)
8126 .k(8)
8127 .iterations(1)
8128 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8129 }
8130 }
8131 }
8132
8133 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8) {
8134 TEST_REQUIRES_ARM_NEON;
8135 for (size_t k = 1; k < 8; k++) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008136 GemmMicrokernelTester()
8137 .mr(4)
8138 .nr(8)
8139 .kr(1)
8140 .sr(1)
8141 .m(4)
8142 .n(8)
8143 .k(k)
8144 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8145 }
8146 }
8147
Frank Barchardca27b402020-02-03 17:47:32 -08008148 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_strided_a) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008149 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008150 for (size_t k = 1; k < 8; k++) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008151 GemmMicrokernelTester()
8152 .mr(4)
8153 .nr(8)
8154 .kr(1)
8155 .sr(1)
8156 .m(4)
8157 .n(8)
8158 .k(k)
Frank Barchardca27b402020-02-03 17:47:32 -08008159 .a_stride(11)
Frank Barchard3e237f22019-12-04 23:08:51 -08008160 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8161 }
8162 }
8163
Frank Barchardca27b402020-02-03 17:47:32 -08008164 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_lt_8_subtile) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008165 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008166 for (size_t k = 1; k < 8; k++) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008167 for (uint32_t m = 1; m <= 4; m++) {
8168 for (uint32_t n = 1; n <= 8; n++) {
8169 GemmMicrokernelTester()
8170 .mr(4)
8171 .nr(8)
8172 .kr(1)
8173 .sr(1)
8174 .m(m)
8175 .n(n)
8176 .k(k)
8177 .iterations(1)
8178 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8179 }
8180 }
8181 }
8182 }
8183
Frank Barchardca27b402020-02-03 17:47:32 -08008184 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_8) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008185 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008186 for (size_t k = 9; k < 8; k++) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008187 GemmMicrokernelTester()
8188 .mr(4)
8189 .nr(8)
8190 .kr(1)
8191 .sr(1)
8192 .m(4)
8193 .n(8)
8194 .k(k)
8195 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8196 }
8197 }
8198
8199 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_4_strided_a) {
8200 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008201 for (size_t k = 9; k < 8; k++) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008202 GemmMicrokernelTester()
8203 .mr(4)
8204 .nr(8)
8205 .kr(1)
8206 .sr(1)
8207 .m(4)
8208 .n(8)
8209 .k(k)
8210 .a_stride(11)
8211 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8212 }
8213 }
8214
8215 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_gt_4_subtile) {
8216 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008217 for (size_t k = 9; k < 8; k++) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008218 for (uint32_t m = 1; m <= 4; m++) {
8219 for (uint32_t n = 1; n <= 8; n++) {
8220 GemmMicrokernelTester()
8221 .mr(4)
8222 .nr(8)
8223 .kr(1)
8224 .sr(1)
8225 .m(m)
8226 .n(n)
8227 .k(k)
8228 .iterations(1)
8229 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8230 }
8231 }
8232 }
8233 }
8234
8235 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4) {
8236 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008237 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008238 GemmMicrokernelTester()
8239 .mr(4)
8240 .nr(8)
8241 .kr(1)
8242 .sr(1)
8243 .m(4)
8244 .n(8)
8245 .k(k)
8246 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8247 }
8248 }
8249
8250 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_strided_a) {
8251 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008252 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008253 GemmMicrokernelTester()
8254 .mr(4)
8255 .nr(8)
8256 .kr(1)
8257 .sr(1)
8258 .m(4)
8259 .n(8)
8260 .k(k)
8261 .a_stride(43)
8262 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8263 }
8264 }
8265
8266 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, k_div_4_subtile) {
8267 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008268 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard3e237f22019-12-04 23:08:51 -08008269 for (uint32_t m = 1; m <= 4; m++) {
8270 for (uint32_t n = 1; n <= 8; n++) {
8271 GemmMicrokernelTester()
8272 .mr(4)
8273 .nr(8)
8274 .kr(1)
8275 .sr(1)
8276 .m(m)
8277 .n(n)
8278 .k(k)
8279 .iterations(1)
8280 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8281 }
8282 }
8283 }
8284 }
8285
8286 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8) {
8287 TEST_REQUIRES_ARM_NEON;
8288 for (uint32_t n = 9; n < 16; n++) {
8289 for (size_t k = 1; k <= 20; k += 5) {
8290 GemmMicrokernelTester()
8291 .mr(4)
8292 .nr(8)
8293 .kr(1)
8294 .sr(1)
8295 .m(4)
8296 .n(8)
8297 .k(k)
8298 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8299 }
8300 }
8301 }
8302
8303 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_cn) {
8304 TEST_REQUIRES_ARM_NEON;
8305 for (uint32_t n = 9; n < 16; n++) {
8306 for (size_t k = 1; k <= 20; k += 5) {
8307 GemmMicrokernelTester()
8308 .mr(4)
8309 .nr(8)
8310 .kr(1)
8311 .sr(1)
8312 .m(4)
8313 .n(8)
8314 .k(k)
8315 .cn_stride(11)
8316 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8317 }
8318 }
8319 }
8320
8321 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_strided_a) {
8322 TEST_REQUIRES_ARM_NEON;
8323 for (uint32_t n = 9; n < 16; n++) {
8324 for (size_t k = 1; k <= 20; k += 5) {
8325 GemmMicrokernelTester()
8326 .mr(4)
8327 .nr(8)
8328 .kr(1)
8329 .sr(1)
8330 .m(4)
8331 .n(n)
8332 .k(k)
8333 .a_stride(23)
8334 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8335 }
8336 }
8337 }
8338
8339 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_gt_8_subtile) {
8340 TEST_REQUIRES_ARM_NEON;
8341 for (uint32_t n = 9; n < 16; n++) {
8342 for (size_t k = 1; k <= 20; k += 5) {
8343 for (uint32_t m = 1; m <= 4; m++) {
8344 GemmMicrokernelTester()
8345 .mr(4)
8346 .nr(8)
8347 .kr(1)
8348 .sr(1)
8349 .m(m)
8350 .n(n)
8351 .k(k)
8352 .iterations(1)
8353 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8354 }
8355 }
8356 }
8357 }
8358
8359 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8) {
8360 TEST_REQUIRES_ARM_NEON;
8361 for (uint32_t n = 16; n <= 24; n += 8) {
8362 for (size_t k = 1; k <= 20; k += 5) {
8363 GemmMicrokernelTester()
8364 .mr(4)
8365 .nr(8)
8366 .kr(1)
8367 .sr(1)
8368 .m(4)
8369 .n(8)
8370 .k(k)
8371 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8372 }
8373 }
8374 }
8375
8376 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_cn) {
8377 TEST_REQUIRES_ARM_NEON;
8378 for (uint32_t n = 16; n <= 24; n += 8) {
8379 for (size_t k = 1; k <= 20; k += 5) {
8380 GemmMicrokernelTester()
8381 .mr(4)
8382 .nr(8)
8383 .kr(1)
8384 .sr(1)
8385 .m(4)
8386 .n(n)
8387 .k(k)
8388 .cn_stride(11)
8389 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8390 }
8391 }
8392 }
8393
8394 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_strided_a) {
8395 TEST_REQUIRES_ARM_NEON;
8396 for (uint32_t n = 16; n <= 24; n += 8) {
8397 for (size_t k = 1; k <= 20; k += 5) {
8398 GemmMicrokernelTester()
8399 .mr(4)
8400 .nr(8)
8401 .kr(1)
8402 .sr(1)
8403 .m(4)
8404 .n(n)
8405 .k(k)
8406 .a_stride(23)
8407 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8408 }
8409 }
8410 }
8411
8412 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, n_div_8_subtile) {
8413 TEST_REQUIRES_ARM_NEON;
8414 for (uint32_t n = 16; n <= 24; n += 8) {
8415 for (size_t k = 1; k <= 20; k += 5) {
8416 for (uint32_t m = 1; m <= 4; m++) {
8417 GemmMicrokernelTester()
8418 .mr(4)
8419 .nr(8)
8420 .kr(1)
8421 .sr(1)
8422 .m(m)
8423 .n(n)
8424 .k(k)
8425 .iterations(1)
8426 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8427 }
8428 }
8429 }
8430 }
8431
8432 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm_subtile) {
8433 TEST_REQUIRES_ARM_NEON;
8434 for (size_t k = 1; k <= 20; k += 5) {
8435 for (uint32_t m = 1; m <= 4; m++) {
8436 for (uint32_t n = 1; n <= 8; n++) {
8437 GemmMicrokernelTester()
8438 .mr(4)
8439 .nr(8)
8440 .kr(1)
8441 .sr(1)
8442 .m(m)
8443 .n(n)
8444 .k(k)
8445 .cm_stride(11)
8446 .iterations(1)
8447 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8448 }
8449 }
8450 }
8451 }
8452
8453 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, qmin) {
8454 TEST_REQUIRES_ARM_NEON;
8455 GemmMicrokernelTester()
8456 .mr(4)
8457 .nr(8)
8458 .kr(1)
8459 .sr(1)
8460 .m(4)
8461 .n(8)
8462 .k(4)
8463 .qmin(128)
8464 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8465 }
8466
8467 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, qmax) {
8468 TEST_REQUIRES_ARM_NEON;
8469 GemmMicrokernelTester()
8470 .mr(4)
8471 .nr(8)
8472 .kr(1)
8473 .sr(1)
8474 .m(4)
8475 .n(8)
8476 .k(4)
8477 .qmax(128)
8478 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8479 }
8480
8481 TEST(F32_GEMM_4X8__AARCH32_NEON_CORTEX_A75, strided_cm) {
8482 TEST_REQUIRES_ARM_NEON;
8483 GemmMicrokernelTester()
8484 .mr(4)
8485 .nr(8)
8486 .kr(1)
8487 .sr(1)
8488 .m(4)
8489 .n(8)
8490 .k(4)
8491 .cm_stride(11)
8492 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_cortex_a75);
8493 }
8494#endif // XNN_ARCH_ARM
8495
8496
8497#if XNN_ARCH_ARM
Frank Barchard9f7d5552019-12-12 10:58:10 -08008498 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4) {
8499 TEST_REQUIRES_ARM_NEON;
8500 GemmMicrokernelTester()
8501 .mr(4)
8502 .nr(8)
8503 .kr(1)
8504 .sr(1)
8505 .m(4)
8506 .n(8)
8507 .k(4)
8508 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8509 }
8510
8511 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cn) {
8512 TEST_REQUIRES_ARM_NEON;
8513 GemmMicrokernelTester()
8514 .mr(4)
8515 .nr(8)
8516 .kr(1)
8517 .sr(1)
8518 .m(4)
8519 .n(8)
8520 .k(4)
8521 .cn_stride(11)
8522 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8523 }
8524
8525 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_strided_a) {
8526 TEST_REQUIRES_ARM_NEON;
8527 GemmMicrokernelTester()
8528 .mr(4)
8529 .nr(8)
8530 .kr(1)
8531 .sr(1)
8532 .m(4)
8533 .n(8)
8534 .k(4)
8535 .a_stride(7)
8536 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8537 }
8538
8539 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile) {
8540 TEST_REQUIRES_ARM_NEON;
8541 for (uint32_t m = 1; m <= 4; m++) {
8542 for (uint32_t n = 1; n <= 8; n++) {
8543 GemmMicrokernelTester()
8544 .mr(4)
8545 .nr(8)
8546 .kr(1)
8547 .sr(1)
8548 .m(m)
8549 .n(n)
8550 .k(4)
8551 .iterations(1)
8552 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8553 }
8554 }
8555 }
8556
8557 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_m) {
8558 TEST_REQUIRES_ARM_NEON;
8559 for (uint32_t m = 1; m <= 4; m++) {
8560 GemmMicrokernelTester()
8561 .mr(4)
8562 .nr(8)
8563 .kr(1)
8564 .sr(1)
8565 .m(m)
8566 .n(8)
8567 .k(4)
8568 .iterations(1)
8569 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8570 }
8571 }
8572
8573 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_4_subtile_n) {
8574 TEST_REQUIRES_ARM_NEON;
8575 for (uint32_t n = 1; n <= 8; n++) {
8576 GemmMicrokernelTester()
8577 .mr(4)
8578 .nr(8)
8579 .kr(1)
8580 .sr(1)
8581 .m(4)
8582 .n(n)
8583 .k(4)
8584 .iterations(1)
8585 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8586 }
8587 }
8588
Frank Barchardca27b402020-02-03 17:47:32 -08008589 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008590 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008591 GemmMicrokernelTester()
8592 .mr(4)
8593 .nr(8)
8594 .kr(1)
8595 .sr(1)
8596 .m(4)
8597 .n(8)
8598 .k(8)
8599 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8600 }
8601
8602 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8_strided_a) {
8603 TEST_REQUIRES_ARM_NEON;
8604 GemmMicrokernelTester()
8605 .mr(4)
8606 .nr(8)
8607 .kr(1)
8608 .sr(1)
8609 .m(4)
8610 .n(8)
8611 .k(8)
8612 .a_stride(11)
8613 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8614 }
8615
8616 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_eq_8_subtile) {
8617 TEST_REQUIRES_ARM_NEON;
8618 for (uint32_t m = 1; m <= 4; m++) {
8619 for (uint32_t n = 1; n <= 8; n++) {
8620 GemmMicrokernelTester()
8621 .mr(4)
8622 .nr(8)
8623 .kr(1)
8624 .sr(1)
8625 .m(m)
8626 .n(n)
8627 .k(8)
8628 .iterations(1)
8629 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8630 }
8631 }
8632 }
8633
8634 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8) {
8635 TEST_REQUIRES_ARM_NEON;
8636 for (size_t k = 1; k < 8; k++) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008637 GemmMicrokernelTester()
8638 .mr(4)
8639 .nr(8)
8640 .kr(1)
8641 .sr(1)
8642 .m(4)
8643 .n(8)
8644 .k(k)
8645 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8646 }
8647 }
8648
Frank Barchardca27b402020-02-03 17:47:32 -08008649 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8_strided_a) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008650 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008651 for (size_t k = 1; k < 8; k++) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008652 GemmMicrokernelTester()
8653 .mr(4)
8654 .nr(8)
8655 .kr(1)
8656 .sr(1)
8657 .m(4)
8658 .n(8)
8659 .k(k)
Frank Barchardca27b402020-02-03 17:47:32 -08008660 .a_stride(11)
Frank Barchard9f7d5552019-12-12 10:58:10 -08008661 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8662 }
8663 }
8664
Frank Barchardca27b402020-02-03 17:47:32 -08008665 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_lt_8_subtile) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008666 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008667 for (size_t k = 1; k < 8; k++) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008668 for (uint32_t m = 1; m <= 4; m++) {
8669 for (uint32_t n = 1; n <= 8; n++) {
8670 GemmMicrokernelTester()
8671 .mr(4)
8672 .nr(8)
8673 .kr(1)
8674 .sr(1)
8675 .m(m)
8676 .n(n)
8677 .k(k)
8678 .iterations(1)
8679 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8680 }
8681 }
8682 }
8683 }
8684
Frank Barchardca27b402020-02-03 17:47:32 -08008685 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_8) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008686 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008687 for (size_t k = 9; k < 8; k++) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008688 GemmMicrokernelTester()
8689 .mr(4)
8690 .nr(8)
8691 .kr(1)
8692 .sr(1)
8693 .m(4)
8694 .n(8)
8695 .k(k)
8696 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8697 }
8698 }
8699
8700 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_4_strided_a) {
8701 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008702 for (size_t k = 9; k < 8; k++) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008703 GemmMicrokernelTester()
8704 .mr(4)
8705 .nr(8)
8706 .kr(1)
8707 .sr(1)
8708 .m(4)
8709 .n(8)
8710 .k(k)
8711 .a_stride(11)
8712 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8713 }
8714 }
8715
8716 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_gt_4_subtile) {
8717 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008718 for (size_t k = 9; k < 8; k++) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008719 for (uint32_t m = 1; m <= 4; m++) {
8720 for (uint32_t n = 1; n <= 8; n++) {
8721 GemmMicrokernelTester()
8722 .mr(4)
8723 .nr(8)
8724 .kr(1)
8725 .sr(1)
8726 .m(m)
8727 .n(n)
8728 .k(k)
8729 .iterations(1)
8730 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8731 }
8732 }
8733 }
8734 }
8735
8736 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4) {
8737 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008738 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008739 GemmMicrokernelTester()
8740 .mr(4)
8741 .nr(8)
8742 .kr(1)
8743 .sr(1)
8744 .m(4)
8745 .n(8)
8746 .k(k)
8747 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8748 }
8749 }
8750
8751 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4_strided_a) {
8752 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008753 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008754 GemmMicrokernelTester()
8755 .mr(4)
8756 .nr(8)
8757 .kr(1)
8758 .sr(1)
8759 .m(4)
8760 .n(8)
8761 .k(k)
8762 .a_stride(43)
8763 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8764 }
8765 }
8766
8767 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, k_div_4_subtile) {
8768 TEST_REQUIRES_ARM_NEON;
Frank Barchardca27b402020-02-03 17:47:32 -08008769 for (size_t k = 12; k <= 40; k += 4) {
Frank Barchard9f7d5552019-12-12 10:58:10 -08008770 for (uint32_t m = 1; m <= 4; m++) {
8771 for (uint32_t n = 1; n <= 8; n++) {
8772 GemmMicrokernelTester()
8773 .mr(4)
8774 .nr(8)
8775 .kr(1)
8776 .sr(1)
8777 .m(m)
8778 .n(n)
8779 .k(k)
8780 .iterations(1)
8781 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8782 }
8783 }
8784 }
8785 }
8786
8787 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8) {
8788 TEST_REQUIRES_ARM_NEON;
8789 for (uint32_t n = 9; n < 16; n++) {
8790 for (size_t k = 1; k <= 20; k += 5) {
8791 GemmMicrokernelTester()
8792 .mr(4)
8793 .nr(8)
8794 .kr(1)
8795 .sr(1)
8796 .m(4)
8797 .n(8)
8798 .k(k)
8799 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8800 }
8801 }
8802 }
8803
8804 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_strided_cn) {
8805 TEST_REQUIRES_ARM_NEON;
8806 for (uint32_t n = 9; n < 16; n++) {
8807 for (size_t k = 1; k <= 20; k += 5) {
8808 GemmMicrokernelTester()
8809 .mr(4)
8810 .nr(8)
8811 .kr(1)
8812 .sr(1)
8813 .m(4)
8814 .n(8)
8815 .k(k)
8816 .cn_stride(11)
8817 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8818 }
8819 }
8820 }
8821
8822 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_strided_a) {
8823 TEST_REQUIRES_ARM_NEON;
8824 for (uint32_t n = 9; n < 16; n++) {
8825 for (size_t k = 1; k <= 20; k += 5) {
8826 GemmMicrokernelTester()
8827 .mr(4)
8828 .nr(8)
8829 .kr(1)
8830 .sr(1)
8831 .m(4)
8832 .n(n)
8833 .k(k)
8834 .a_stride(23)
8835 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8836 }
8837 }
8838 }
8839
8840 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_gt_8_subtile) {
8841 TEST_REQUIRES_ARM_NEON;
8842 for (uint32_t n = 9; n < 16; n++) {
8843 for (size_t k = 1; k <= 20; k += 5) {
8844 for (uint32_t m = 1; m <= 4; m++) {
8845 GemmMicrokernelTester()
8846 .mr(4)
8847 .nr(8)
8848 .kr(1)
8849 .sr(1)
8850 .m(m)
8851 .n(n)
8852 .k(k)
8853 .iterations(1)
8854 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8855 }
8856 }
8857 }
8858 }
8859
8860 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8) {
8861 TEST_REQUIRES_ARM_NEON;
8862 for (uint32_t n = 16; n <= 24; n += 8) {
8863 for (size_t k = 1; k <= 20; k += 5) {
8864 GemmMicrokernelTester()
8865 .mr(4)
8866 .nr(8)
8867 .kr(1)
8868 .sr(1)
8869 .m(4)
8870 .n(8)
8871 .k(k)
8872 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8873 }
8874 }
8875 }
8876
8877 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_strided_cn) {
8878 TEST_REQUIRES_ARM_NEON;
8879 for (uint32_t n = 16; n <= 24; n += 8) {
8880 for (size_t k = 1; k <= 20; k += 5) {
8881 GemmMicrokernelTester()
8882 .mr(4)
8883 .nr(8)
8884 .kr(1)
8885 .sr(1)
8886 .m(4)
8887 .n(n)
8888 .k(k)
8889 .cn_stride(11)
8890 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8891 }
8892 }
8893 }
8894
8895 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_strided_a) {
8896 TEST_REQUIRES_ARM_NEON;
8897 for (uint32_t n = 16; n <= 24; n += 8) {
8898 for (size_t k = 1; k <= 20; k += 5) {
8899 GemmMicrokernelTester()
8900 .mr(4)
8901 .nr(8)
8902 .kr(1)
8903 .sr(1)
8904 .m(4)
8905 .n(n)
8906 .k(k)
8907 .a_stride(23)
8908 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8909 }
8910 }
8911 }
8912
8913 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, n_div_8_subtile) {
8914 TEST_REQUIRES_ARM_NEON;
8915 for (uint32_t n = 16; n <= 24; n += 8) {
8916 for (size_t k = 1; k <= 20; k += 5) {
8917 for (uint32_t m = 1; m <= 4; m++) {
8918 GemmMicrokernelTester()
8919 .mr(4)
8920 .nr(8)
8921 .kr(1)
8922 .sr(1)
8923 .m(m)
8924 .n(n)
8925 .k(k)
8926 .iterations(1)
8927 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8928 }
8929 }
8930 }
8931 }
8932
8933 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm_subtile) {
8934 TEST_REQUIRES_ARM_NEON;
8935 for (size_t k = 1; k <= 20; k += 5) {
8936 for (uint32_t m = 1; m <= 4; m++) {
8937 for (uint32_t n = 1; n <= 8; n++) {
8938 GemmMicrokernelTester()
8939 .mr(4)
8940 .nr(8)
8941 .kr(1)
8942 .sr(1)
8943 .m(m)
8944 .n(n)
8945 .k(k)
8946 .cm_stride(11)
8947 .iterations(1)
8948 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8949 }
8950 }
8951 }
8952 }
8953
8954 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmin) {
8955 TEST_REQUIRES_ARM_NEON;
8956 GemmMicrokernelTester()
8957 .mr(4)
8958 .nr(8)
8959 .kr(1)
8960 .sr(1)
8961 .m(4)
8962 .n(8)
8963 .k(4)
8964 .qmin(128)
8965 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8966 }
8967
8968 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, qmax) {
8969 TEST_REQUIRES_ARM_NEON;
8970 GemmMicrokernelTester()
8971 .mr(4)
8972 .nr(8)
8973 .kr(1)
8974 .sr(1)
8975 .m(4)
8976 .n(8)
8977 .k(4)
8978 .qmax(128)
8979 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8980 }
8981
8982 TEST(F32_GEMM_4X8__AARCH32_NEON_PLD_CORTEX_A75, strided_cm) {
8983 TEST_REQUIRES_ARM_NEON;
8984 GemmMicrokernelTester()
8985 .mr(4)
8986 .nr(8)
8987 .kr(1)
8988 .sr(1)
8989 .m(4)
8990 .n(8)
8991 .k(4)
8992 .cm_stride(11)
8993 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_pld_cortex_a75);
8994 }
8995#endif // XNN_ARCH_ARM
8996
8997
8998#if XNN_ARCH_ARM
Frank Barchard8b0f0262019-11-27 23:18:40 -08008999 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2) {
9000 TEST_REQUIRES_ARM_NEON;
9001 GemmMicrokernelTester()
9002 .mr(4)
9003 .nr(8)
9004 .kr(1)
9005 .sr(1)
9006 .m(4)
9007 .n(8)
9008 .k(2)
9009 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9010 }
9011
9012 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, strided_cn) {
9013 TEST_REQUIRES_ARM_NEON;
9014 GemmMicrokernelTester()
9015 .mr(4)
9016 .nr(8)
9017 .kr(1)
9018 .sr(1)
9019 .m(4)
9020 .n(8)
9021 .k(2)
9022 .cn_stride(11)
9023 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9024 }
9025
9026 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_strided_a) {
9027 TEST_REQUIRES_ARM_NEON;
9028 GemmMicrokernelTester()
9029 .mr(4)
9030 .nr(8)
9031 .kr(1)
9032 .sr(1)
9033 .m(4)
9034 .n(8)
9035 .k(2)
9036 .a_stride(5)
9037 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9038 }
9039
9040 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile) {
9041 TEST_REQUIRES_ARM_NEON;
9042 for (uint32_t m = 1; m <= 4; m++) {
9043 for (uint32_t n = 1; n <= 8; n++) {
9044 GemmMicrokernelTester()
9045 .mr(4)
9046 .nr(8)
9047 .kr(1)
9048 .sr(1)
9049 .m(m)
9050 .n(n)
9051 .k(2)
9052 .iterations(1)
9053 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9054 }
9055 }
9056 }
9057
9058 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_m) {
9059 TEST_REQUIRES_ARM_NEON;
9060 for (uint32_t m = 1; m <= 4; m++) {
9061 GemmMicrokernelTester()
9062 .mr(4)
9063 .nr(8)
9064 .kr(1)
9065 .sr(1)
9066 .m(m)
9067 .n(8)
9068 .k(2)
9069 .iterations(1)
9070 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9071 }
9072 }
9073
9074 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_eq_2_subtile_n) {
9075 TEST_REQUIRES_ARM_NEON;
9076 for (uint32_t n = 1; n <= 8; n++) {
9077 GemmMicrokernelTester()
9078 .mr(4)
9079 .nr(8)
9080 .kr(1)
9081 .sr(1)
9082 .m(4)
9083 .n(n)
9084 .k(2)
9085 .iterations(1)
9086 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9087 }
9088 }
9089
9090 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_lt_2) {
9091 TEST_REQUIRES_ARM_NEON;
9092 for (size_t k = 1; k < 2; k++) {
9093 GemmMicrokernelTester()
9094 .mr(4)
9095 .nr(8)
9096 .kr(1)
9097 .sr(1)
9098 .m(4)
9099 .n(8)
9100 .k(k)
9101 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9102 }
9103 }
9104
9105 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_lt_2_strided_a) {
9106 TEST_REQUIRES_ARM_NEON;
9107 for (size_t k = 1; k < 2; k++) {
9108 GemmMicrokernelTester()
9109 .mr(4)
9110 .nr(8)
9111 .kr(1)
9112 .sr(1)
9113 .m(4)
9114 .n(8)
9115 .k(k)
9116 .a_stride(5)
9117 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9118 }
9119 }
9120
9121 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_lt_2_subtile) {
9122 TEST_REQUIRES_ARM_NEON;
9123 for (size_t k = 1; k < 2; k++) {
9124 for (uint32_t m = 1; m <= 4; m++) {
9125 for (uint32_t n = 1; n <= 8; n++) {
9126 GemmMicrokernelTester()
9127 .mr(4)
9128 .nr(8)
9129 .kr(1)
9130 .sr(1)
9131 .m(m)
9132 .n(n)
9133 .k(k)
9134 .iterations(1)
9135 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9136 }
9137 }
9138 }
9139 }
9140
9141 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_gt_2) {
9142 TEST_REQUIRES_ARM_NEON;
9143 for (size_t k = 3; k < 4; k++) {
9144 GemmMicrokernelTester()
9145 .mr(4)
9146 .nr(8)
9147 .kr(1)
9148 .sr(1)
9149 .m(4)
9150 .n(8)
9151 .k(k)
9152 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9153 }
9154 }
9155
9156 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_gt_2_strided_a) {
9157 TEST_REQUIRES_ARM_NEON;
9158 for (size_t k = 3; k < 4; k++) {
9159 GemmMicrokernelTester()
9160 .mr(4)
9161 .nr(8)
9162 .kr(1)
9163 .sr(1)
9164 .m(4)
9165 .n(8)
9166 .k(k)
9167 .a_stride(7)
9168 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9169 }
9170 }
9171
9172 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_gt_2_subtile) {
9173 TEST_REQUIRES_ARM_NEON;
9174 for (size_t k = 3; k < 4; k++) {
9175 for (uint32_t m = 1; m <= 4; m++) {
9176 for (uint32_t n = 1; n <= 8; n++) {
9177 GemmMicrokernelTester()
9178 .mr(4)
9179 .nr(8)
9180 .kr(1)
9181 .sr(1)
9182 .m(m)
9183 .n(n)
9184 .k(k)
9185 .iterations(1)
9186 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9187 }
9188 }
9189 }
9190 }
9191
9192 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_div_2) {
9193 TEST_REQUIRES_ARM_NEON;
9194 for (size_t k = 4; k <= 20; k += 2) {
9195 GemmMicrokernelTester()
9196 .mr(4)
9197 .nr(8)
9198 .kr(1)
9199 .sr(1)
9200 .m(4)
9201 .n(8)
9202 .k(k)
9203 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9204 }
9205 }
9206
9207 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_div_2_strided_a) {
9208 TEST_REQUIRES_ARM_NEON;
9209 for (size_t k = 4; k <= 20; k += 2) {
9210 GemmMicrokernelTester()
9211 .mr(4)
9212 .nr(8)
9213 .kr(1)
9214 .sr(1)
9215 .m(4)
9216 .n(8)
9217 .k(k)
9218 .a_stride(23)
9219 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9220 }
9221 }
9222
9223 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, k_div_2_subtile) {
9224 TEST_REQUIRES_ARM_NEON;
9225 for (size_t k = 4; k <= 20; k += 2) {
9226 for (uint32_t m = 1; m <= 4; m++) {
9227 for (uint32_t n = 1; n <= 8; n++) {
9228 GemmMicrokernelTester()
9229 .mr(4)
9230 .nr(8)
9231 .kr(1)
9232 .sr(1)
9233 .m(m)
9234 .n(n)
9235 .k(k)
9236 .iterations(1)
9237 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9238 }
9239 }
9240 }
9241 }
9242
9243 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8) {
9244 TEST_REQUIRES_ARM_NEON;
9245 for (uint32_t n = 9; n < 16; n++) {
9246 for (size_t k = 1; k <= 10; k += 3) {
9247 GemmMicrokernelTester()
9248 .mr(4)
9249 .nr(8)
9250 .kr(1)
9251 .sr(1)
9252 .m(4)
9253 .n(8)
9254 .k(k)
9255 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9256 }
9257 }
9258 }
9259
9260 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8_strided_cn) {
9261 TEST_REQUIRES_ARM_NEON;
9262 for (uint32_t n = 9; n < 16; n++) {
9263 for (size_t k = 1; k <= 10; k += 3) {
9264 GemmMicrokernelTester()
9265 .mr(4)
9266 .nr(8)
9267 .kr(1)
9268 .sr(1)
9269 .m(4)
9270 .n(8)
9271 .k(k)
9272 .cn_stride(11)
9273 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9274 }
9275 }
9276 }
9277
9278 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8_strided_a) {
9279 TEST_REQUIRES_ARM_NEON;
9280 for (uint32_t n = 9; n < 16; n++) {
9281 for (size_t k = 1; k <= 10; k += 3) {
9282 GemmMicrokernelTester()
9283 .mr(4)
9284 .nr(8)
9285 .kr(1)
9286 .sr(1)
9287 .m(4)
9288 .n(n)
9289 .k(k)
9290 .a_stride(13)
9291 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9292 }
9293 }
9294 }
9295
9296 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_gt_8_subtile) {
9297 TEST_REQUIRES_ARM_NEON;
9298 for (uint32_t n = 9; n < 16; n++) {
9299 for (size_t k = 1; k <= 10; k += 3) {
9300 for (uint32_t m = 1; m <= 4; m++) {
9301 GemmMicrokernelTester()
9302 .mr(4)
9303 .nr(8)
9304 .kr(1)
9305 .sr(1)
9306 .m(m)
9307 .n(n)
9308 .k(k)
9309 .iterations(1)
9310 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9311 }
9312 }
9313 }
9314 }
9315
9316 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8) {
9317 TEST_REQUIRES_ARM_NEON;
9318 for (uint32_t n = 16; n <= 24; n += 8) {
9319 for (size_t k = 1; k <= 10; k += 3) {
9320 GemmMicrokernelTester()
9321 .mr(4)
9322 .nr(8)
9323 .kr(1)
9324 .sr(1)
9325 .m(4)
9326 .n(8)
9327 .k(k)
9328 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9329 }
9330 }
9331 }
9332
9333 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8_strided_cn) {
9334 TEST_REQUIRES_ARM_NEON;
9335 for (uint32_t n = 16; n <= 24; n += 8) {
9336 for (size_t k = 1; k <= 10; k += 3) {
9337 GemmMicrokernelTester()
9338 .mr(4)
9339 .nr(8)
9340 .kr(1)
9341 .sr(1)
9342 .m(4)
9343 .n(n)
9344 .k(k)
9345 .cn_stride(11)
9346 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9347 }
9348 }
9349 }
9350
9351 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8_strided_a) {
9352 TEST_REQUIRES_ARM_NEON;
9353 for (uint32_t n = 16; n <= 24; n += 8) {
9354 for (size_t k = 1; k <= 10; k += 3) {
9355 GemmMicrokernelTester()
9356 .mr(4)
9357 .nr(8)
9358 .kr(1)
9359 .sr(1)
9360 .m(4)
9361 .n(n)
9362 .k(k)
9363 .a_stride(13)
9364 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9365 }
9366 }
9367 }
9368
9369 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, n_div_8_subtile) {
9370 TEST_REQUIRES_ARM_NEON;
9371 for (uint32_t n = 16; n <= 24; n += 8) {
9372 for (size_t k = 1; k <= 10; k += 3) {
9373 for (uint32_t m = 1; m <= 4; m++) {
9374 GemmMicrokernelTester()
9375 .mr(4)
9376 .nr(8)
9377 .kr(1)
9378 .sr(1)
9379 .m(m)
9380 .n(n)
9381 .k(k)
9382 .iterations(1)
9383 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9384 }
9385 }
9386 }
9387 }
9388
9389 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, strided_cm_subtile) {
9390 TEST_REQUIRES_ARM_NEON;
9391 for (size_t k = 1; k <= 10; k += 3) {
9392 for (uint32_t m = 1; m <= 4; m++) {
9393 for (uint32_t n = 1; n <= 8; n++) {
9394 GemmMicrokernelTester()
9395 .mr(4)
9396 .nr(8)
9397 .kr(1)
9398 .sr(1)
9399 .m(m)
9400 .n(n)
9401 .k(k)
9402 .cm_stride(11)
9403 .iterations(1)
9404 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9405 }
9406 }
9407 }
9408 }
9409
9410 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, qmin) {
9411 TEST_REQUIRES_ARM_NEON;
9412 GemmMicrokernelTester()
9413 .mr(4)
9414 .nr(8)
9415 .kr(1)
9416 .sr(1)
9417 .m(4)
9418 .n(8)
9419 .k(2)
9420 .qmin(128)
9421 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9422 }
9423
9424 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, qmax) {
9425 TEST_REQUIRES_ARM_NEON;
9426 GemmMicrokernelTester()
9427 .mr(4)
9428 .nr(8)
9429 .kr(1)
9430 .sr(1)
9431 .m(4)
9432 .n(8)
9433 .k(2)
9434 .qmax(128)
9435 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9436 }
9437
9438 TEST(F32_GEMM_4X8__AARCH32_NEON_LD64, strided_cm) {
9439 TEST_REQUIRES_ARM_NEON;
9440 GemmMicrokernelTester()
9441 .mr(4)
9442 .nr(8)
9443 .kr(1)
9444 .sr(1)
9445 .m(4)
9446 .n(8)
9447 .k(2)
9448 .cm_stride(11)
9449 .Test(xnn_f32_gemm_ukernel_4x8__aarch32_neon_ld64);
9450 }
9451#endif // XNN_ARCH_ARM
9452
9453
Frank Barchard7e955972019-10-11 10:34:25 -07009454#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07009455 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4) {
9456 TEST_REQUIRES_ARM_NEON_FMA;
9457 GemmMicrokernelTester()
9458 .mr(4)
9459 .nr(8)
9460 .kr(1)
9461 .sr(1)
9462 .m(4)
9463 .n(8)
9464 .k(4)
9465 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9466 }
9467
9468 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, strided_cn) {
9469 TEST_REQUIRES_ARM_NEON_FMA;
9470 GemmMicrokernelTester()
9471 .mr(4)
9472 .nr(8)
9473 .kr(1)
9474 .sr(1)
9475 .m(4)
9476 .n(8)
9477 .k(4)
9478 .cn_stride(11)
9479 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9480 }
9481
9482 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
9483 TEST_REQUIRES_ARM_NEON_FMA;
9484 GemmMicrokernelTester()
9485 .mr(4)
9486 .nr(8)
9487 .kr(1)
9488 .sr(1)
9489 .m(4)
9490 .n(8)
9491 .k(4)
9492 .a_stride(7)
9493 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9494 }
9495
9496 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
9497 TEST_REQUIRES_ARM_NEON_FMA;
9498 for (uint32_t m = 1; m <= 4; m++) {
9499 for (uint32_t n = 1; n <= 8; n++) {
9500 GemmMicrokernelTester()
9501 .mr(4)
9502 .nr(8)
9503 .kr(1)
9504 .sr(1)
9505 .m(m)
9506 .n(n)
9507 .k(4)
9508 .iterations(1)
9509 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9510 }
9511 }
9512 }
9513
9514 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
9515 TEST_REQUIRES_ARM_NEON_FMA;
9516 for (uint32_t m = 1; m <= 4; m++) {
9517 GemmMicrokernelTester()
9518 .mr(4)
9519 .nr(8)
9520 .kr(1)
9521 .sr(1)
9522 .m(m)
9523 .n(8)
9524 .k(4)
9525 .iterations(1)
9526 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9527 }
9528 }
9529
9530 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
9531 TEST_REQUIRES_ARM_NEON_FMA;
9532 for (uint32_t n = 1; n <= 8; n++) {
9533 GemmMicrokernelTester()
9534 .mr(4)
9535 .nr(8)
9536 .kr(1)
9537 .sr(1)
9538 .m(4)
9539 .n(n)
9540 .k(4)
9541 .iterations(1)
9542 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9543 }
9544 }
9545
9546 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_lt_4) {
9547 TEST_REQUIRES_ARM_NEON_FMA;
9548 for (size_t k = 1; k < 4; k++) {
9549 GemmMicrokernelTester()
9550 .mr(4)
9551 .nr(8)
9552 .kr(1)
9553 .sr(1)
9554 .m(4)
9555 .n(8)
9556 .k(k)
9557 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9558 }
9559 }
9560
9561 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
9562 TEST_REQUIRES_ARM_NEON_FMA;
9563 for (size_t k = 1; k < 4; k++) {
9564 GemmMicrokernelTester()
9565 .mr(4)
9566 .nr(8)
9567 .kr(1)
9568 .sr(1)
9569 .m(4)
9570 .n(8)
9571 .k(k)
9572 .a_stride(7)
9573 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9574 }
9575 }
9576
9577 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
9578 TEST_REQUIRES_ARM_NEON_FMA;
9579 for (size_t k = 1; k < 4; k++) {
9580 for (uint32_t m = 1; m <= 4; m++) {
9581 for (uint32_t n = 1; n <= 8; n++) {
9582 GemmMicrokernelTester()
9583 .mr(4)
9584 .nr(8)
9585 .kr(1)
9586 .sr(1)
9587 .m(m)
9588 .n(n)
9589 .k(k)
9590 .iterations(1)
9591 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9592 }
9593 }
9594 }
9595 }
9596
9597 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_gt_4) {
9598 TEST_REQUIRES_ARM_NEON_FMA;
9599 for (size_t k = 5; k < 8; k++) {
9600 GemmMicrokernelTester()
9601 .mr(4)
9602 .nr(8)
9603 .kr(1)
9604 .sr(1)
9605 .m(4)
9606 .n(8)
9607 .k(k)
9608 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9609 }
9610 }
9611
9612 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
9613 TEST_REQUIRES_ARM_NEON_FMA;
9614 for (size_t k = 5; k < 8; k++) {
9615 GemmMicrokernelTester()
9616 .mr(4)
9617 .nr(8)
9618 .kr(1)
9619 .sr(1)
9620 .m(4)
9621 .n(8)
9622 .k(k)
9623 .a_stride(11)
9624 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9625 }
9626 }
9627
9628 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
9629 TEST_REQUIRES_ARM_NEON_FMA;
9630 for (size_t k = 5; k < 8; k++) {
9631 for (uint32_t m = 1; m <= 4; m++) {
9632 for (uint32_t n = 1; n <= 8; n++) {
9633 GemmMicrokernelTester()
9634 .mr(4)
9635 .nr(8)
9636 .kr(1)
9637 .sr(1)
9638 .m(m)
9639 .n(n)
9640 .k(k)
9641 .iterations(1)
9642 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9643 }
9644 }
9645 }
9646 }
9647
9648 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_div_4) {
9649 TEST_REQUIRES_ARM_NEON_FMA;
9650 for (size_t k = 8; k <= 40; k += 4) {
9651 GemmMicrokernelTester()
9652 .mr(4)
9653 .nr(8)
9654 .kr(1)
9655 .sr(1)
9656 .m(4)
9657 .n(8)
9658 .k(k)
9659 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9660 }
9661 }
9662
9663 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
9664 TEST_REQUIRES_ARM_NEON_FMA;
9665 for (size_t k = 8; k <= 40; k += 4) {
9666 GemmMicrokernelTester()
9667 .mr(4)
9668 .nr(8)
9669 .kr(1)
9670 .sr(1)
9671 .m(4)
9672 .n(8)
9673 .k(k)
9674 .a_stride(43)
9675 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9676 }
9677 }
9678
9679 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
9680 TEST_REQUIRES_ARM_NEON_FMA;
9681 for (size_t k = 8; k <= 40; k += 4) {
9682 for (uint32_t m = 1; m <= 4; m++) {
9683 for (uint32_t n = 1; n <= 8; n++) {
9684 GemmMicrokernelTester()
9685 .mr(4)
9686 .nr(8)
9687 .kr(1)
9688 .sr(1)
9689 .m(m)
9690 .n(n)
9691 .k(k)
9692 .iterations(1)
9693 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9694 }
9695 }
9696 }
9697 }
9698
9699 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8) {
9700 TEST_REQUIRES_ARM_NEON_FMA;
9701 for (uint32_t n = 9; n < 16; n++) {
9702 for (size_t k = 1; k <= 20; k += 5) {
9703 GemmMicrokernelTester()
9704 .mr(4)
9705 .nr(8)
9706 .kr(1)
9707 .sr(1)
9708 .m(4)
9709 .n(8)
9710 .k(k)
9711 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9712 }
9713 }
9714 }
9715
9716 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
9717 TEST_REQUIRES_ARM_NEON_FMA;
9718 for (uint32_t n = 9; n < 16; n++) {
9719 for (size_t k = 1; k <= 20; k += 5) {
9720 GemmMicrokernelTester()
9721 .mr(4)
9722 .nr(8)
9723 .kr(1)
9724 .sr(1)
9725 .m(4)
9726 .n(8)
9727 .k(k)
9728 .cn_stride(11)
9729 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9730 }
9731 }
9732 }
9733
9734 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
9735 TEST_REQUIRES_ARM_NEON_FMA;
9736 for (uint32_t n = 9; n < 16; n++) {
9737 for (size_t k = 1; k <= 20; k += 5) {
9738 GemmMicrokernelTester()
9739 .mr(4)
9740 .nr(8)
9741 .kr(1)
9742 .sr(1)
9743 .m(4)
9744 .n(n)
9745 .k(k)
9746 .a_stride(23)
9747 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9748 }
9749 }
9750 }
9751
9752 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
9753 TEST_REQUIRES_ARM_NEON_FMA;
9754 for (uint32_t n = 9; n < 16; n++) {
9755 for (size_t k = 1; k <= 20; k += 5) {
9756 for (uint32_t m = 1; m <= 4; m++) {
9757 GemmMicrokernelTester()
9758 .mr(4)
9759 .nr(8)
9760 .kr(1)
9761 .sr(1)
9762 .m(m)
9763 .n(n)
9764 .k(k)
9765 .iterations(1)
9766 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9767 }
9768 }
9769 }
9770 }
9771
9772 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8) {
9773 TEST_REQUIRES_ARM_NEON_FMA;
9774 for (uint32_t n = 16; n <= 24; n += 8) {
9775 for (size_t k = 1; k <= 20; k += 5) {
9776 GemmMicrokernelTester()
9777 .mr(4)
9778 .nr(8)
9779 .kr(1)
9780 .sr(1)
9781 .m(4)
9782 .n(8)
9783 .k(k)
9784 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9785 }
9786 }
9787 }
9788
9789 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
9790 TEST_REQUIRES_ARM_NEON_FMA;
9791 for (uint32_t n = 16; n <= 24; n += 8) {
9792 for (size_t k = 1; k <= 20; k += 5) {
9793 GemmMicrokernelTester()
9794 .mr(4)
9795 .nr(8)
9796 .kr(1)
9797 .sr(1)
9798 .m(4)
9799 .n(n)
9800 .k(k)
9801 .cn_stride(11)
9802 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9803 }
9804 }
9805 }
9806
9807 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
9808 TEST_REQUIRES_ARM_NEON_FMA;
9809 for (uint32_t n = 16; n <= 24; n += 8) {
9810 for (size_t k = 1; k <= 20; k += 5) {
9811 GemmMicrokernelTester()
9812 .mr(4)
9813 .nr(8)
9814 .kr(1)
9815 .sr(1)
9816 .m(4)
9817 .n(n)
9818 .k(k)
9819 .a_stride(23)
9820 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9821 }
9822 }
9823 }
9824
9825 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
9826 TEST_REQUIRES_ARM_NEON_FMA;
9827 for (uint32_t n = 16; n <= 24; n += 8) {
9828 for (size_t k = 1; k <= 20; k += 5) {
9829 for (uint32_t m = 1; m <= 4; m++) {
9830 GemmMicrokernelTester()
9831 .mr(4)
9832 .nr(8)
9833 .kr(1)
9834 .sr(1)
9835 .m(m)
9836 .n(n)
9837 .k(k)
9838 .iterations(1)
9839 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9840 }
9841 }
9842 }
9843 }
9844
9845 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
9846 TEST_REQUIRES_ARM_NEON_FMA;
9847 for (size_t k = 1; k <= 20; k += 5) {
9848 for (uint32_t m = 1; m <= 4; m++) {
9849 for (uint32_t n = 1; n <= 8; n++) {
9850 GemmMicrokernelTester()
9851 .mr(4)
9852 .nr(8)
9853 .kr(1)
9854 .sr(1)
9855 .m(m)
9856 .n(n)
9857 .k(k)
9858 .cm_stride(11)
9859 .iterations(1)
9860 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9861 }
9862 }
9863 }
9864 }
9865
9866 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, qmin) {
9867 TEST_REQUIRES_ARM_NEON_FMA;
9868 GemmMicrokernelTester()
9869 .mr(4)
9870 .nr(8)
9871 .kr(1)
9872 .sr(1)
9873 .m(4)
9874 .n(8)
9875 .k(4)
9876 .qmin(128)
9877 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9878 }
9879
9880 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, qmax) {
9881 TEST_REQUIRES_ARM_NEON_FMA;
9882 GemmMicrokernelTester()
9883 .mr(4)
9884 .nr(8)
9885 .kr(1)
9886 .sr(1)
9887 .m(4)
9888 .n(8)
9889 .k(4)
9890 .qmax(128)
9891 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9892 }
9893
9894 TEST(F32_GEMM_4X8__AARCH64_NEONFMA_LD128, strided_cm) {
9895 TEST_REQUIRES_ARM_NEON_FMA;
9896 GemmMicrokernelTester()
9897 .mr(4)
9898 .nr(8)
9899 .kr(1)
9900 .sr(1)
9901 .m(4)
9902 .n(8)
9903 .k(4)
9904 .cm_stride(11)
9905 .Test(xnn_f32_gemm_ukernel_4x8__aarch64_neonfma_ld128);
9906 }
Frank Barchard7e955972019-10-11 10:34:25 -07009907#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07009908
9909
Frank Barchard7e955972019-10-11 10:34:25 -07009910#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -07009911 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2) {
9912 TEST_REQUIRES_ARM_NEON_FMA;
9913 GemmMicrokernelTester()
9914 .mr(6)
9915 .nr(8)
9916 .kr(1)
9917 .sr(1)
9918 .m(6)
9919 .n(8)
9920 .k(2)
9921 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
9922 }
9923
9924 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, strided_cn) {
9925 TEST_REQUIRES_ARM_NEON_FMA;
9926 GemmMicrokernelTester()
9927 .mr(6)
9928 .nr(8)
9929 .kr(1)
9930 .sr(1)
9931 .m(6)
9932 .n(8)
9933 .k(2)
9934 .cn_stride(11)
9935 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
9936 }
9937
9938 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_strided_a) {
9939 TEST_REQUIRES_ARM_NEON_FMA;
9940 GemmMicrokernelTester()
9941 .mr(6)
9942 .nr(8)
9943 .kr(1)
9944 .sr(1)
9945 .m(6)
9946 .n(8)
9947 .k(2)
9948 .a_stride(5)
9949 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
9950 }
9951
9952 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile) {
9953 TEST_REQUIRES_ARM_NEON_FMA;
9954 for (uint32_t m = 1; m <= 6; m++) {
9955 for (uint32_t n = 1; n <= 8; n++) {
9956 GemmMicrokernelTester()
9957 .mr(6)
9958 .nr(8)
9959 .kr(1)
9960 .sr(1)
9961 .m(m)
9962 .n(n)
9963 .k(2)
9964 .iterations(1)
9965 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
9966 }
9967 }
9968 }
9969
9970 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_m) {
9971 TEST_REQUIRES_ARM_NEON_FMA;
9972 for (uint32_t m = 1; m <= 6; m++) {
9973 GemmMicrokernelTester()
9974 .mr(6)
9975 .nr(8)
9976 .kr(1)
9977 .sr(1)
9978 .m(m)
9979 .n(8)
9980 .k(2)
9981 .iterations(1)
9982 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
9983 }
9984 }
9985
9986 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_eq_2_subtile_n) {
9987 TEST_REQUIRES_ARM_NEON_FMA;
9988 for (uint32_t n = 1; n <= 8; n++) {
9989 GemmMicrokernelTester()
9990 .mr(6)
9991 .nr(8)
9992 .kr(1)
9993 .sr(1)
9994 .m(6)
9995 .n(n)
9996 .k(2)
9997 .iterations(1)
9998 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
9999 }
10000 }
10001
10002 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_lt_2) {
10003 TEST_REQUIRES_ARM_NEON_FMA;
10004 for (size_t k = 1; k < 2; k++) {
10005 GemmMicrokernelTester()
10006 .mr(6)
10007 .nr(8)
10008 .kr(1)
10009 .sr(1)
10010 .m(6)
10011 .n(8)
10012 .k(k)
10013 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10014 }
10015 }
10016
10017 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_lt_2_strided_a) {
10018 TEST_REQUIRES_ARM_NEON_FMA;
10019 for (size_t k = 1; k < 2; k++) {
10020 GemmMicrokernelTester()
10021 .mr(6)
10022 .nr(8)
10023 .kr(1)
10024 .sr(1)
10025 .m(6)
10026 .n(8)
10027 .k(k)
10028 .a_stride(5)
10029 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10030 }
10031 }
10032
10033 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_lt_2_subtile) {
10034 TEST_REQUIRES_ARM_NEON_FMA;
10035 for (size_t k = 1; k < 2; k++) {
10036 for (uint32_t m = 1; m <= 6; m++) {
10037 for (uint32_t n = 1; n <= 8; n++) {
10038 GemmMicrokernelTester()
10039 .mr(6)
10040 .nr(8)
10041 .kr(1)
10042 .sr(1)
10043 .m(m)
10044 .n(n)
10045 .k(k)
10046 .iterations(1)
10047 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10048 }
10049 }
10050 }
10051 }
10052
10053 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_gt_2) {
10054 TEST_REQUIRES_ARM_NEON_FMA;
10055 for (size_t k = 3; k < 4; k++) {
10056 GemmMicrokernelTester()
10057 .mr(6)
10058 .nr(8)
10059 .kr(1)
10060 .sr(1)
10061 .m(6)
10062 .n(8)
10063 .k(k)
10064 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10065 }
10066 }
10067
10068 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_gt_2_strided_a) {
10069 TEST_REQUIRES_ARM_NEON_FMA;
10070 for (size_t k = 3; k < 4; k++) {
10071 GemmMicrokernelTester()
10072 .mr(6)
10073 .nr(8)
10074 .kr(1)
10075 .sr(1)
10076 .m(6)
10077 .n(8)
10078 .k(k)
10079 .a_stride(7)
10080 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10081 }
10082 }
10083
10084 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_gt_2_subtile) {
10085 TEST_REQUIRES_ARM_NEON_FMA;
10086 for (size_t k = 3; k < 4; k++) {
10087 for (uint32_t m = 1; m <= 6; m++) {
10088 for (uint32_t n = 1; n <= 8; n++) {
10089 GemmMicrokernelTester()
10090 .mr(6)
10091 .nr(8)
10092 .kr(1)
10093 .sr(1)
10094 .m(m)
10095 .n(n)
10096 .k(k)
10097 .iterations(1)
10098 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10099 }
10100 }
10101 }
10102 }
10103
10104 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_div_2) {
10105 TEST_REQUIRES_ARM_NEON_FMA;
10106 for (size_t k = 4; k <= 20; k += 2) {
10107 GemmMicrokernelTester()
10108 .mr(6)
10109 .nr(8)
10110 .kr(1)
10111 .sr(1)
10112 .m(6)
10113 .n(8)
10114 .k(k)
10115 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10116 }
10117 }
10118
10119 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_div_2_strided_a) {
10120 TEST_REQUIRES_ARM_NEON_FMA;
10121 for (size_t k = 4; k <= 20; k += 2) {
10122 GemmMicrokernelTester()
10123 .mr(6)
10124 .nr(8)
10125 .kr(1)
10126 .sr(1)
10127 .m(6)
10128 .n(8)
10129 .k(k)
10130 .a_stride(23)
10131 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10132 }
10133 }
10134
10135 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, k_div_2_subtile) {
10136 TEST_REQUIRES_ARM_NEON_FMA;
10137 for (size_t k = 4; k <= 20; k += 2) {
10138 for (uint32_t m = 1; m <= 6; m++) {
10139 for (uint32_t n = 1; n <= 8; n++) {
10140 GemmMicrokernelTester()
10141 .mr(6)
10142 .nr(8)
10143 .kr(1)
10144 .sr(1)
10145 .m(m)
10146 .n(n)
10147 .k(k)
10148 .iterations(1)
10149 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10150 }
10151 }
10152 }
10153 }
10154
10155 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8) {
10156 TEST_REQUIRES_ARM_NEON_FMA;
10157 for (uint32_t n = 9; n < 16; n++) {
10158 for (size_t k = 1; k <= 10; k += 3) {
10159 GemmMicrokernelTester()
10160 .mr(6)
10161 .nr(8)
10162 .kr(1)
10163 .sr(1)
10164 .m(6)
10165 .n(8)
10166 .k(k)
10167 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10168 }
10169 }
10170 }
10171
10172 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_cn) {
10173 TEST_REQUIRES_ARM_NEON_FMA;
10174 for (uint32_t n = 9; n < 16; n++) {
10175 for (size_t k = 1; k <= 10; k += 3) {
10176 GemmMicrokernelTester()
10177 .mr(6)
10178 .nr(8)
10179 .kr(1)
10180 .sr(1)
10181 .m(6)
10182 .n(8)
10183 .k(k)
10184 .cn_stride(11)
10185 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10186 }
10187 }
10188 }
10189
10190 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8_strided_a) {
10191 TEST_REQUIRES_ARM_NEON_FMA;
10192 for (uint32_t n = 9; n < 16; n++) {
10193 for (size_t k = 1; k <= 10; k += 3) {
10194 GemmMicrokernelTester()
10195 .mr(6)
10196 .nr(8)
10197 .kr(1)
10198 .sr(1)
10199 .m(6)
10200 .n(n)
10201 .k(k)
10202 .a_stride(13)
10203 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10204 }
10205 }
10206 }
10207
10208 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_gt_8_subtile) {
10209 TEST_REQUIRES_ARM_NEON_FMA;
10210 for (uint32_t n = 9; n < 16; n++) {
10211 for (size_t k = 1; k <= 10; k += 3) {
10212 for (uint32_t m = 1; m <= 6; m++) {
10213 GemmMicrokernelTester()
10214 .mr(6)
10215 .nr(8)
10216 .kr(1)
10217 .sr(1)
10218 .m(m)
10219 .n(n)
10220 .k(k)
10221 .iterations(1)
10222 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10223 }
10224 }
10225 }
10226 }
10227
10228 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8) {
10229 TEST_REQUIRES_ARM_NEON_FMA;
10230 for (uint32_t n = 16; n <= 24; n += 8) {
10231 for (size_t k = 1; k <= 10; k += 3) {
10232 GemmMicrokernelTester()
10233 .mr(6)
10234 .nr(8)
10235 .kr(1)
10236 .sr(1)
10237 .m(6)
10238 .n(8)
10239 .k(k)
10240 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10241 }
10242 }
10243 }
10244
10245 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_cn) {
10246 TEST_REQUIRES_ARM_NEON_FMA;
10247 for (uint32_t n = 16; n <= 24; n += 8) {
10248 for (size_t k = 1; k <= 10; k += 3) {
10249 GemmMicrokernelTester()
10250 .mr(6)
10251 .nr(8)
10252 .kr(1)
10253 .sr(1)
10254 .m(6)
10255 .n(n)
10256 .k(k)
10257 .cn_stride(11)
10258 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10259 }
10260 }
10261 }
10262
10263 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8_strided_a) {
10264 TEST_REQUIRES_ARM_NEON_FMA;
10265 for (uint32_t n = 16; n <= 24; n += 8) {
10266 for (size_t k = 1; k <= 10; k += 3) {
10267 GemmMicrokernelTester()
10268 .mr(6)
10269 .nr(8)
10270 .kr(1)
10271 .sr(1)
10272 .m(6)
10273 .n(n)
10274 .k(k)
10275 .a_stride(13)
10276 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10277 }
10278 }
10279 }
10280
10281 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, n_div_8_subtile) {
10282 TEST_REQUIRES_ARM_NEON_FMA;
10283 for (uint32_t n = 16; n <= 24; n += 8) {
10284 for (size_t k = 1; k <= 10; k += 3) {
10285 for (uint32_t m = 1; m <= 6; m++) {
10286 GemmMicrokernelTester()
10287 .mr(6)
10288 .nr(8)
10289 .kr(1)
10290 .sr(1)
10291 .m(m)
10292 .n(n)
10293 .k(k)
10294 .iterations(1)
10295 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10296 }
10297 }
10298 }
10299 }
10300
10301 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, strided_cm_subtile) {
10302 TEST_REQUIRES_ARM_NEON_FMA;
10303 for (size_t k = 1; k <= 10; k += 3) {
10304 for (uint32_t m = 1; m <= 6; m++) {
10305 for (uint32_t n = 1; n <= 8; n++) {
10306 GemmMicrokernelTester()
10307 .mr(6)
10308 .nr(8)
10309 .kr(1)
10310 .sr(1)
10311 .m(m)
10312 .n(n)
10313 .k(k)
10314 .cm_stride(11)
10315 .iterations(1)
10316 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10317 }
10318 }
10319 }
10320 }
10321
10322 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, qmin) {
10323 TEST_REQUIRES_ARM_NEON_FMA;
10324 GemmMicrokernelTester()
10325 .mr(6)
10326 .nr(8)
10327 .kr(1)
10328 .sr(1)
10329 .m(6)
10330 .n(8)
10331 .k(2)
10332 .qmin(128)
10333 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10334 }
10335
10336 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, qmax) {
10337 TEST_REQUIRES_ARM_NEON_FMA;
10338 GemmMicrokernelTester()
10339 .mr(6)
10340 .nr(8)
10341 .kr(1)
10342 .sr(1)
10343 .m(6)
10344 .n(8)
10345 .k(2)
10346 .qmax(128)
10347 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10348 }
10349
10350 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD64, strided_cm) {
10351 TEST_REQUIRES_ARM_NEON_FMA;
10352 GemmMicrokernelTester()
10353 .mr(6)
10354 .nr(8)
10355 .kr(1)
10356 .sr(1)
10357 .m(6)
10358 .n(8)
10359 .k(2)
10360 .cm_stride(11)
10361 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld64);
10362 }
Frank Barchard7e955972019-10-11 10:34:25 -070010363#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -070010364
10365
Frank Barchard7e955972019-10-11 10:34:25 -070010366#if XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -070010367 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4) {
10368 TEST_REQUIRES_ARM_NEON_FMA;
10369 GemmMicrokernelTester()
10370 .mr(6)
10371 .nr(8)
10372 .kr(1)
10373 .sr(1)
10374 .m(6)
10375 .n(8)
10376 .k(4)
10377 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10378 }
10379
10380 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cn) {
10381 TEST_REQUIRES_ARM_NEON_FMA;
10382 GemmMicrokernelTester()
10383 .mr(6)
10384 .nr(8)
10385 .kr(1)
10386 .sr(1)
10387 .m(6)
10388 .n(8)
10389 .k(4)
10390 .cn_stride(11)
10391 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10392 }
10393
10394 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_strided_a) {
10395 TEST_REQUIRES_ARM_NEON_FMA;
10396 GemmMicrokernelTester()
10397 .mr(6)
10398 .nr(8)
10399 .kr(1)
10400 .sr(1)
10401 .m(6)
10402 .n(8)
10403 .k(4)
10404 .a_stride(7)
10405 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10406 }
10407
10408 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile) {
10409 TEST_REQUIRES_ARM_NEON_FMA;
10410 for (uint32_t m = 1; m <= 6; m++) {
10411 for (uint32_t n = 1; n <= 8; n++) {
10412 GemmMicrokernelTester()
10413 .mr(6)
10414 .nr(8)
10415 .kr(1)
10416 .sr(1)
10417 .m(m)
10418 .n(n)
10419 .k(4)
10420 .iterations(1)
10421 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10422 }
10423 }
10424 }
10425
10426 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_m) {
10427 TEST_REQUIRES_ARM_NEON_FMA;
10428 for (uint32_t m = 1; m <= 6; m++) {
10429 GemmMicrokernelTester()
10430 .mr(6)
10431 .nr(8)
10432 .kr(1)
10433 .sr(1)
10434 .m(m)
10435 .n(8)
10436 .k(4)
10437 .iterations(1)
10438 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10439 }
10440 }
10441
10442 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_eq_4_subtile_n) {
10443 TEST_REQUIRES_ARM_NEON_FMA;
10444 for (uint32_t n = 1; n <= 8; n++) {
10445 GemmMicrokernelTester()
10446 .mr(6)
10447 .nr(8)
10448 .kr(1)
10449 .sr(1)
10450 .m(6)
10451 .n(n)
10452 .k(4)
10453 .iterations(1)
10454 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10455 }
10456 }
10457
10458 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4) {
10459 TEST_REQUIRES_ARM_NEON_FMA;
10460 for (size_t k = 1; k < 4; k++) {
10461 GemmMicrokernelTester()
10462 .mr(6)
10463 .nr(8)
10464 .kr(1)
10465 .sr(1)
10466 .m(6)
10467 .n(8)
10468 .k(k)
10469 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10470 }
10471 }
10472
10473 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4_strided_a) {
10474 TEST_REQUIRES_ARM_NEON_FMA;
10475 for (size_t k = 1; k < 4; k++) {
10476 GemmMicrokernelTester()
10477 .mr(6)
10478 .nr(8)
10479 .kr(1)
10480 .sr(1)
10481 .m(6)
10482 .n(8)
10483 .k(k)
10484 .a_stride(7)
10485 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10486 }
10487 }
10488
10489 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_lt_4_subtile) {
10490 TEST_REQUIRES_ARM_NEON_FMA;
10491 for (size_t k = 1; k < 4; k++) {
10492 for (uint32_t m = 1; m <= 6; m++) {
10493 for (uint32_t n = 1; n <= 8; n++) {
10494 GemmMicrokernelTester()
10495 .mr(6)
10496 .nr(8)
10497 .kr(1)
10498 .sr(1)
10499 .m(m)
10500 .n(n)
10501 .k(k)
10502 .iterations(1)
10503 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10504 }
10505 }
10506 }
10507 }
10508
10509 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4) {
10510 TEST_REQUIRES_ARM_NEON_FMA;
10511 for (size_t k = 5; k < 8; k++) {
10512 GemmMicrokernelTester()
10513 .mr(6)
10514 .nr(8)
10515 .kr(1)
10516 .sr(1)
10517 .m(6)
10518 .n(8)
10519 .k(k)
10520 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10521 }
10522 }
10523
10524 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4_strided_a) {
10525 TEST_REQUIRES_ARM_NEON_FMA;
10526 for (size_t k = 5; k < 8; k++) {
10527 GemmMicrokernelTester()
10528 .mr(6)
10529 .nr(8)
10530 .kr(1)
10531 .sr(1)
10532 .m(6)
10533 .n(8)
10534 .k(k)
10535 .a_stride(11)
10536 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10537 }
10538 }
10539
10540 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_gt_4_subtile) {
10541 TEST_REQUIRES_ARM_NEON_FMA;
10542 for (size_t k = 5; k < 8; k++) {
10543 for (uint32_t m = 1; m <= 6; m++) {
10544 for (uint32_t n = 1; n <= 8; n++) {
10545 GemmMicrokernelTester()
10546 .mr(6)
10547 .nr(8)
10548 .kr(1)
10549 .sr(1)
10550 .m(m)
10551 .n(n)
10552 .k(k)
10553 .iterations(1)
10554 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10555 }
10556 }
10557 }
10558 }
10559
10560 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4) {
10561 TEST_REQUIRES_ARM_NEON_FMA;
10562 for (size_t k = 8; k <= 40; k += 4) {
10563 GemmMicrokernelTester()
10564 .mr(6)
10565 .nr(8)
10566 .kr(1)
10567 .sr(1)
10568 .m(6)
10569 .n(8)
10570 .k(k)
10571 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10572 }
10573 }
10574
10575 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4_strided_a) {
10576 TEST_REQUIRES_ARM_NEON_FMA;
10577 for (size_t k = 8; k <= 40; k += 4) {
10578 GemmMicrokernelTester()
10579 .mr(6)
10580 .nr(8)
10581 .kr(1)
10582 .sr(1)
10583 .m(6)
10584 .n(8)
10585 .k(k)
10586 .a_stride(43)
10587 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10588 }
10589 }
10590
10591 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, k_div_4_subtile) {
10592 TEST_REQUIRES_ARM_NEON_FMA;
10593 for (size_t k = 8; k <= 40; k += 4) {
10594 for (uint32_t m = 1; m <= 6; m++) {
10595 for (uint32_t n = 1; n <= 8; n++) {
10596 GemmMicrokernelTester()
10597 .mr(6)
10598 .nr(8)
10599 .kr(1)
10600 .sr(1)
10601 .m(m)
10602 .n(n)
10603 .k(k)
10604 .iterations(1)
10605 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10606 }
10607 }
10608 }
10609 }
10610
10611 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8) {
10612 TEST_REQUIRES_ARM_NEON_FMA;
10613 for (uint32_t n = 9; n < 16; n++) {
10614 for (size_t k = 1; k <= 20; k += 5) {
10615 GemmMicrokernelTester()
10616 .mr(6)
10617 .nr(8)
10618 .kr(1)
10619 .sr(1)
10620 .m(6)
10621 .n(8)
10622 .k(k)
10623 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10624 }
10625 }
10626 }
10627
10628 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_cn) {
10629 TEST_REQUIRES_ARM_NEON_FMA;
10630 for (uint32_t n = 9; n < 16; n++) {
10631 for (size_t k = 1; k <= 20; k += 5) {
10632 GemmMicrokernelTester()
10633 .mr(6)
10634 .nr(8)
10635 .kr(1)
10636 .sr(1)
10637 .m(6)
10638 .n(8)
10639 .k(k)
10640 .cn_stride(11)
10641 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10642 }
10643 }
10644 }
10645
10646 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_strided_a) {
10647 TEST_REQUIRES_ARM_NEON_FMA;
10648 for (uint32_t n = 9; n < 16; n++) {
10649 for (size_t k = 1; k <= 20; k += 5) {
10650 GemmMicrokernelTester()
10651 .mr(6)
10652 .nr(8)
10653 .kr(1)
10654 .sr(1)
10655 .m(6)
10656 .n(n)
10657 .k(k)
10658 .a_stride(23)
10659 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10660 }
10661 }
10662 }
10663
10664 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_gt_8_subtile) {
10665 TEST_REQUIRES_ARM_NEON_FMA;
10666 for (uint32_t n = 9; n < 16; n++) {
10667 for (size_t k = 1; k <= 20; k += 5) {
10668 for (uint32_t m = 1; m <= 6; m++) {
10669 GemmMicrokernelTester()
10670 .mr(6)
10671 .nr(8)
10672 .kr(1)
10673 .sr(1)
10674 .m(m)
10675 .n(n)
10676 .k(k)
10677 .iterations(1)
10678 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10679 }
10680 }
10681 }
10682 }
10683
10684 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8) {
10685 TEST_REQUIRES_ARM_NEON_FMA;
10686 for (uint32_t n = 16; n <= 24; n += 8) {
10687 for (size_t k = 1; k <= 20; k += 5) {
10688 GemmMicrokernelTester()
10689 .mr(6)
10690 .nr(8)
10691 .kr(1)
10692 .sr(1)
10693 .m(6)
10694 .n(8)
10695 .k(k)
10696 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10697 }
10698 }
10699 }
10700
10701 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_cn) {
10702 TEST_REQUIRES_ARM_NEON_FMA;
10703 for (uint32_t n = 16; n <= 24; n += 8) {
10704 for (size_t k = 1; k <= 20; k += 5) {
10705 GemmMicrokernelTester()
10706 .mr(6)
10707 .nr(8)
10708 .kr(1)
10709 .sr(1)
10710 .m(6)
10711 .n(n)
10712 .k(k)
10713 .cn_stride(11)
10714 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10715 }
10716 }
10717 }
10718
10719 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_strided_a) {
10720 TEST_REQUIRES_ARM_NEON_FMA;
10721 for (uint32_t n = 16; n <= 24; n += 8) {
10722 for (size_t k = 1; k <= 20; k += 5) {
10723 GemmMicrokernelTester()
10724 .mr(6)
10725 .nr(8)
10726 .kr(1)
10727 .sr(1)
10728 .m(6)
10729 .n(n)
10730 .k(k)
10731 .a_stride(23)
10732 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10733 }
10734 }
10735 }
10736
10737 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, n_div_8_subtile) {
10738 TEST_REQUIRES_ARM_NEON_FMA;
10739 for (uint32_t n = 16; n <= 24; n += 8) {
10740 for (size_t k = 1; k <= 20; k += 5) {
10741 for (uint32_t m = 1; m <= 6; m++) {
10742 GemmMicrokernelTester()
10743 .mr(6)
10744 .nr(8)
10745 .kr(1)
10746 .sr(1)
10747 .m(m)
10748 .n(n)
10749 .k(k)
10750 .iterations(1)
10751 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10752 }
10753 }
10754 }
10755 }
10756
10757 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cm_subtile) {
10758 TEST_REQUIRES_ARM_NEON_FMA;
10759 for (size_t k = 1; k <= 20; k += 5) {
10760 for (uint32_t m = 1; m <= 6; m++) {
10761 for (uint32_t n = 1; n <= 8; n++) {
10762 GemmMicrokernelTester()
10763 .mr(6)
10764 .nr(8)
10765 .kr(1)
10766 .sr(1)
10767 .m(m)
10768 .n(n)
10769 .k(k)
10770 .cm_stride(11)
10771 .iterations(1)
10772 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10773 }
10774 }
10775 }
10776 }
10777
10778 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, qmin) {
10779 TEST_REQUIRES_ARM_NEON_FMA;
10780 GemmMicrokernelTester()
10781 .mr(6)
10782 .nr(8)
10783 .kr(1)
10784 .sr(1)
10785 .m(6)
10786 .n(8)
10787 .k(4)
10788 .qmin(128)
10789 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10790 }
10791
10792 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, qmax) {
10793 TEST_REQUIRES_ARM_NEON_FMA;
10794 GemmMicrokernelTester()
10795 .mr(6)
10796 .nr(8)
10797 .kr(1)
10798 .sr(1)
10799 .m(6)
10800 .n(8)
10801 .k(4)
10802 .qmax(128)
10803 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10804 }
10805
10806 TEST(F32_GEMM_6X8__AARCH64_NEONFMA_LD128, strided_cm) {
10807 TEST_REQUIRES_ARM_NEON_FMA;
10808 GemmMicrokernelTester()
10809 .mr(6)
10810 .nr(8)
10811 .kr(1)
10812 .sr(1)
10813 .m(6)
10814 .n(8)
10815 .k(4)
10816 .cm_stride(11)
10817 .Test(xnn_f32_gemm_ukernel_6x8__aarch64_neonfma_ld128);
10818 }
Frank Barchard7e955972019-10-11 10:34:25 -070010819#endif // XNN_ARCH_ARM64 && XNN_ENABLE_ASSEMBLY
XNNPACK Teamb455b122019-09-27 18:10:33 -070010820
10821
Marat Dukhan1dadbf72019-10-01 10:46:20 -070010822#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080010823 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010824 TEST_REQUIRES_ARM_NEON;
10825 GemmMicrokernelTester()
10826 .mr(1)
10827 .nr(8)
10828 .kr(1)
10829 .sr(1)
10830 .m(1)
10831 .n(8)
10832 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080010833 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010834 }
10835
Frank Barchard91317c52019-11-22 10:54:35 -080010836 TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010837 TEST_REQUIRES_ARM_NEON;
10838 GemmMicrokernelTester()
10839 .mr(1)
10840 .nr(8)
10841 .kr(1)
10842 .sr(1)
10843 .m(1)
10844 .n(8)
10845 .k(2)
10846 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080010847 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010848 }
10849
Frank Barchard91317c52019-11-22 10:54:35 -080010850 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010851 TEST_REQUIRES_ARM_NEON;
10852 GemmMicrokernelTester()
10853 .mr(1)
10854 .nr(8)
10855 .kr(1)
10856 .sr(1)
10857 .m(1)
10858 .n(8)
10859 .k(2)
10860 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080010861 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010862 }
10863
Frank Barchard91317c52019-11-22 10:54:35 -080010864 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010865 TEST_REQUIRES_ARM_NEON;
10866 for (uint32_t m = 1; m <= 1; m++) {
10867 for (uint32_t n = 1; n <= 8; n++) {
10868 GemmMicrokernelTester()
10869 .mr(1)
10870 .nr(8)
10871 .kr(1)
10872 .sr(1)
10873 .m(m)
10874 .n(n)
10875 .k(2)
10876 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010877 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010878 }
10879 }
10880 }
10881
Frank Barchard91317c52019-11-22 10:54:35 -080010882 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010883 TEST_REQUIRES_ARM_NEON;
10884 for (uint32_t m = 1; m <= 1; m++) {
10885 GemmMicrokernelTester()
10886 .mr(1)
10887 .nr(8)
10888 .kr(1)
10889 .sr(1)
10890 .m(m)
10891 .n(8)
10892 .k(2)
10893 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010894 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010895 }
10896 }
10897
Frank Barchard91317c52019-11-22 10:54:35 -080010898 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010899 TEST_REQUIRES_ARM_NEON;
10900 for (uint32_t n = 1; n <= 8; n++) {
10901 GemmMicrokernelTester()
10902 .mr(1)
10903 .nr(8)
10904 .kr(1)
10905 .sr(1)
10906 .m(1)
10907 .n(n)
10908 .k(2)
10909 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010910 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010911 }
10912 }
10913
Frank Barchard91317c52019-11-22 10:54:35 -080010914 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010915 TEST_REQUIRES_ARM_NEON;
10916 for (size_t k = 1; k < 2; k++) {
10917 GemmMicrokernelTester()
10918 .mr(1)
10919 .nr(8)
10920 .kr(1)
10921 .sr(1)
10922 .m(1)
10923 .n(8)
10924 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010925 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010926 }
10927 }
10928
Frank Barchard91317c52019-11-22 10:54:35 -080010929 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010930 TEST_REQUIRES_ARM_NEON;
10931 for (size_t k = 1; k < 2; k++) {
10932 GemmMicrokernelTester()
10933 .mr(1)
10934 .nr(8)
10935 .kr(1)
10936 .sr(1)
10937 .m(1)
10938 .n(8)
10939 .k(k)
10940 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080010941 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010942 }
10943 }
10944
Frank Barchard91317c52019-11-22 10:54:35 -080010945 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010946 TEST_REQUIRES_ARM_NEON;
10947 for (size_t k = 1; k < 2; k++) {
10948 for (uint32_t m = 1; m <= 1; m++) {
10949 for (uint32_t n = 1; n <= 8; n++) {
10950 GemmMicrokernelTester()
10951 .mr(1)
10952 .nr(8)
10953 .kr(1)
10954 .sr(1)
10955 .m(m)
10956 .n(n)
10957 .k(k)
10958 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080010959 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010960 }
10961 }
10962 }
10963 }
10964
Frank Barchard91317c52019-11-22 10:54:35 -080010965 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010966 TEST_REQUIRES_ARM_NEON;
10967 for (size_t k = 3; k < 4; k++) {
10968 GemmMicrokernelTester()
10969 .mr(1)
10970 .nr(8)
10971 .kr(1)
10972 .sr(1)
10973 .m(1)
10974 .n(8)
10975 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080010976 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010977 }
10978 }
10979
Frank Barchard91317c52019-11-22 10:54:35 -080010980 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010981 TEST_REQUIRES_ARM_NEON;
10982 for (size_t k = 3; k < 4; k++) {
10983 GemmMicrokernelTester()
10984 .mr(1)
10985 .nr(8)
10986 .kr(1)
10987 .sr(1)
10988 .m(1)
10989 .n(8)
10990 .k(k)
10991 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080010992 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070010993 }
10994 }
10995
Frank Barchard91317c52019-11-22 10:54:35 -080010996 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070010997 TEST_REQUIRES_ARM_NEON;
10998 for (size_t k = 3; k < 4; k++) {
10999 for (uint32_t m = 1; m <= 1; m++) {
11000 for (uint32_t n = 1; n <= 8; n++) {
11001 GemmMicrokernelTester()
11002 .mr(1)
11003 .nr(8)
11004 .kr(1)
11005 .sr(1)
11006 .m(m)
11007 .n(n)
11008 .k(k)
11009 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011010 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011011 }
11012 }
11013 }
11014 }
11015
Frank Barchard91317c52019-11-22 10:54:35 -080011016 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011017 TEST_REQUIRES_ARM_NEON;
11018 for (size_t k = 4; k <= 20; k += 2) {
11019 GemmMicrokernelTester()
11020 .mr(1)
11021 .nr(8)
11022 .kr(1)
11023 .sr(1)
11024 .m(1)
11025 .n(8)
11026 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011027 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011028 }
11029 }
11030
Frank Barchard91317c52019-11-22 10:54:35 -080011031 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011032 TEST_REQUIRES_ARM_NEON;
11033 for (size_t k = 4; k <= 20; k += 2) {
11034 GemmMicrokernelTester()
11035 .mr(1)
11036 .nr(8)
11037 .kr(1)
11038 .sr(1)
11039 .m(1)
11040 .n(8)
11041 .k(k)
11042 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080011043 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011044 }
11045 }
11046
Frank Barchard91317c52019-11-22 10:54:35 -080011047 TEST(F32_GEMM_1X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011048 TEST_REQUIRES_ARM_NEON;
11049 for (size_t k = 4; k <= 20; k += 2) {
11050 for (uint32_t m = 1; m <= 1; m++) {
11051 for (uint32_t n = 1; n <= 8; n++) {
11052 GemmMicrokernelTester()
11053 .mr(1)
11054 .nr(8)
11055 .kr(1)
11056 .sr(1)
11057 .m(m)
11058 .n(n)
11059 .k(k)
11060 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011061 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011062 }
11063 }
11064 }
11065 }
11066
Frank Barchard91317c52019-11-22 10:54:35 -080011067 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011068 TEST_REQUIRES_ARM_NEON;
11069 for (uint32_t n = 9; n < 16; n++) {
11070 for (size_t k = 1; k <= 10; k += 3) {
11071 GemmMicrokernelTester()
11072 .mr(1)
11073 .nr(8)
11074 .kr(1)
11075 .sr(1)
11076 .m(1)
11077 .n(8)
11078 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011079 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011080 }
11081 }
11082 }
11083
Frank Barchard91317c52019-11-22 10:54:35 -080011084 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011085 TEST_REQUIRES_ARM_NEON;
11086 for (uint32_t n = 9; n < 16; n++) {
11087 for (size_t k = 1; k <= 10; k += 3) {
11088 GemmMicrokernelTester()
11089 .mr(1)
11090 .nr(8)
11091 .kr(1)
11092 .sr(1)
11093 .m(1)
11094 .n(8)
11095 .k(k)
11096 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011097 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011098 }
11099 }
11100 }
11101
Frank Barchard91317c52019-11-22 10:54:35 -080011102 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011103 TEST_REQUIRES_ARM_NEON;
11104 for (uint32_t n = 9; n < 16; n++) {
11105 for (size_t k = 1; k <= 10; k += 3) {
11106 GemmMicrokernelTester()
11107 .mr(1)
11108 .nr(8)
11109 .kr(1)
11110 .sr(1)
11111 .m(1)
11112 .n(n)
11113 .k(k)
11114 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011115 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011116 }
11117 }
11118 }
11119
Frank Barchard91317c52019-11-22 10:54:35 -080011120 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011121 TEST_REQUIRES_ARM_NEON;
11122 for (uint32_t n = 9; n < 16; n++) {
11123 for (size_t k = 1; k <= 10; k += 3) {
11124 for (uint32_t m = 1; m <= 1; m++) {
11125 GemmMicrokernelTester()
11126 .mr(1)
11127 .nr(8)
11128 .kr(1)
11129 .sr(1)
11130 .m(m)
11131 .n(n)
11132 .k(k)
11133 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011134 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011135 }
11136 }
11137 }
11138 }
11139
Frank Barchard91317c52019-11-22 10:54:35 -080011140 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011141 TEST_REQUIRES_ARM_NEON;
11142 for (uint32_t n = 16; n <= 24; n += 8) {
11143 for (size_t k = 1; k <= 10; k += 3) {
11144 GemmMicrokernelTester()
11145 .mr(1)
11146 .nr(8)
11147 .kr(1)
11148 .sr(1)
11149 .m(1)
11150 .n(8)
11151 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011152 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011153 }
11154 }
11155 }
11156
Frank Barchard91317c52019-11-22 10:54:35 -080011157 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011158 TEST_REQUIRES_ARM_NEON;
11159 for (uint32_t n = 16; n <= 24; n += 8) {
11160 for (size_t k = 1; k <= 10; k += 3) {
11161 GemmMicrokernelTester()
11162 .mr(1)
11163 .nr(8)
11164 .kr(1)
11165 .sr(1)
11166 .m(1)
11167 .n(n)
11168 .k(k)
11169 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011170 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011171 }
11172 }
11173 }
11174
Frank Barchard91317c52019-11-22 10:54:35 -080011175 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011176 TEST_REQUIRES_ARM_NEON;
11177 for (uint32_t n = 16; n <= 24; n += 8) {
11178 for (size_t k = 1; k <= 10; k += 3) {
11179 GemmMicrokernelTester()
11180 .mr(1)
11181 .nr(8)
11182 .kr(1)
11183 .sr(1)
11184 .m(1)
11185 .n(n)
11186 .k(k)
11187 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011188 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011189 }
11190 }
11191 }
11192
Frank Barchard91317c52019-11-22 10:54:35 -080011193 TEST(F32_GEMM_1X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011194 TEST_REQUIRES_ARM_NEON;
11195 for (uint32_t n = 16; n <= 24; n += 8) {
11196 for (size_t k = 1; k <= 10; k += 3) {
11197 for (uint32_t m = 1; m <= 1; m++) {
11198 GemmMicrokernelTester()
11199 .mr(1)
11200 .nr(8)
11201 .kr(1)
11202 .sr(1)
11203 .m(m)
11204 .n(n)
11205 .k(k)
11206 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011207 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011208 }
11209 }
11210 }
11211 }
11212
Frank Barchard91317c52019-11-22 10:54:35 -080011213 TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011214 TEST_REQUIRES_ARM_NEON;
11215 for (size_t k = 1; k <= 10; k += 3) {
11216 for (uint32_t m = 1; m <= 1; m++) {
11217 for (uint32_t n = 1; n <= 8; n++) {
11218 GemmMicrokernelTester()
11219 .mr(1)
11220 .nr(8)
11221 .kr(1)
11222 .sr(1)
11223 .m(m)
11224 .n(n)
11225 .k(k)
11226 .cm_stride(11)
11227 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011228 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011229 }
11230 }
11231 }
11232 }
11233
Frank Barchard91317c52019-11-22 10:54:35 -080011234 TEST(F32_GEMM_1X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011235 TEST_REQUIRES_ARM_NEON;
11236 GemmMicrokernelTester()
11237 .mr(1)
11238 .nr(8)
11239 .kr(1)
11240 .sr(1)
11241 .m(1)
11242 .n(8)
11243 .k(2)
11244 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011245 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011246 }
11247
Frank Barchard91317c52019-11-22 10:54:35 -080011248 TEST(F32_GEMM_1X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011249 TEST_REQUIRES_ARM_NEON;
11250 GemmMicrokernelTester()
11251 .mr(1)
11252 .nr(8)
11253 .kr(1)
11254 .sr(1)
11255 .m(1)
11256 .n(8)
11257 .k(2)
11258 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011259 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011260 }
11261
Frank Barchard91317c52019-11-22 10:54:35 -080011262 TEST(F32_GEMM_1X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011263 TEST_REQUIRES_ARM_NEON;
11264 GemmMicrokernelTester()
11265 .mr(1)
11266 .nr(8)
11267 .kr(1)
11268 .sr(1)
11269 .m(1)
11270 .n(8)
11271 .k(2)
11272 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011273 .Test(xnn_f32_gemm_ukernel_1x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011274 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070011275#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070011276
11277
Marat Dukhan1dadbf72019-10-01 10:46:20 -070011278#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080011279 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011280 TEST_REQUIRES_ARM_NEON;
11281 GemmMicrokernelTester()
11282 .mr(4)
11283 .nr(2)
11284 .kr(1)
11285 .sr(1)
11286 .m(4)
11287 .n(2)
11288 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080011289 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011290 }
11291
Frank Barchard91317c52019-11-22 10:54:35 -080011292 TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011293 TEST_REQUIRES_ARM_NEON;
11294 GemmMicrokernelTester()
11295 .mr(4)
11296 .nr(2)
11297 .kr(1)
11298 .sr(1)
11299 .m(4)
11300 .n(2)
11301 .k(2)
11302 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011303 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011304 }
11305
Frank Barchard91317c52019-11-22 10:54:35 -080011306 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011307 TEST_REQUIRES_ARM_NEON;
11308 GemmMicrokernelTester()
11309 .mr(4)
11310 .nr(2)
11311 .kr(1)
11312 .sr(1)
11313 .m(4)
11314 .n(2)
11315 .k(2)
11316 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011317 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011318 }
11319
Frank Barchard91317c52019-11-22 10:54:35 -080011320 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011321 TEST_REQUIRES_ARM_NEON;
11322 for (uint32_t m = 1; m <= 4; m++) {
11323 for (uint32_t n = 1; n <= 2; n++) {
11324 GemmMicrokernelTester()
11325 .mr(4)
11326 .nr(2)
11327 .kr(1)
11328 .sr(1)
11329 .m(m)
11330 .n(n)
11331 .k(2)
11332 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011333 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011334 }
11335 }
11336 }
11337
Frank Barchard91317c52019-11-22 10:54:35 -080011338 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011339 TEST_REQUIRES_ARM_NEON;
11340 for (uint32_t m = 1; m <= 4; m++) {
11341 GemmMicrokernelTester()
11342 .mr(4)
11343 .nr(2)
11344 .kr(1)
11345 .sr(1)
11346 .m(m)
11347 .n(2)
11348 .k(2)
11349 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011350 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011351 }
11352 }
11353
Frank Barchard91317c52019-11-22 10:54:35 -080011354 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011355 TEST_REQUIRES_ARM_NEON;
11356 for (uint32_t n = 1; n <= 2; n++) {
11357 GemmMicrokernelTester()
11358 .mr(4)
11359 .nr(2)
11360 .kr(1)
11361 .sr(1)
11362 .m(4)
11363 .n(n)
11364 .k(2)
11365 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011366 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011367 }
11368 }
11369
Frank Barchard91317c52019-11-22 10:54:35 -080011370 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011371 TEST_REQUIRES_ARM_NEON;
11372 for (size_t k = 1; k < 2; k++) {
11373 GemmMicrokernelTester()
11374 .mr(4)
11375 .nr(2)
11376 .kr(1)
11377 .sr(1)
11378 .m(4)
11379 .n(2)
11380 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011381 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011382 }
11383 }
11384
Frank Barchard91317c52019-11-22 10:54:35 -080011385 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011386 TEST_REQUIRES_ARM_NEON;
11387 for (size_t k = 1; k < 2; k++) {
11388 GemmMicrokernelTester()
11389 .mr(4)
11390 .nr(2)
11391 .kr(1)
11392 .sr(1)
11393 .m(4)
11394 .n(2)
11395 .k(k)
11396 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011397 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011398 }
11399 }
11400
Frank Barchard91317c52019-11-22 10:54:35 -080011401 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011402 TEST_REQUIRES_ARM_NEON;
11403 for (size_t k = 1; k < 2; k++) {
11404 for (uint32_t m = 1; m <= 4; m++) {
11405 for (uint32_t n = 1; n <= 2; n++) {
11406 GemmMicrokernelTester()
11407 .mr(4)
11408 .nr(2)
11409 .kr(1)
11410 .sr(1)
11411 .m(m)
11412 .n(n)
11413 .k(k)
11414 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011415 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011416 }
11417 }
11418 }
11419 }
11420
Frank Barchard91317c52019-11-22 10:54:35 -080011421 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011422 TEST_REQUIRES_ARM_NEON;
11423 for (size_t k = 3; k < 4; k++) {
11424 GemmMicrokernelTester()
11425 .mr(4)
11426 .nr(2)
11427 .kr(1)
11428 .sr(1)
11429 .m(4)
11430 .n(2)
11431 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011432 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011433 }
11434 }
11435
Frank Barchard91317c52019-11-22 10:54:35 -080011436 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011437 TEST_REQUIRES_ARM_NEON;
11438 for (size_t k = 3; k < 4; k++) {
11439 GemmMicrokernelTester()
11440 .mr(4)
11441 .nr(2)
11442 .kr(1)
11443 .sr(1)
11444 .m(4)
11445 .n(2)
11446 .k(k)
11447 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080011448 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011449 }
11450 }
11451
Frank Barchard91317c52019-11-22 10:54:35 -080011452 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011453 TEST_REQUIRES_ARM_NEON;
11454 for (size_t k = 3; k < 4; k++) {
11455 for (uint32_t m = 1; m <= 4; m++) {
11456 for (uint32_t n = 1; n <= 2; n++) {
11457 GemmMicrokernelTester()
11458 .mr(4)
11459 .nr(2)
11460 .kr(1)
11461 .sr(1)
11462 .m(m)
11463 .n(n)
11464 .k(k)
11465 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011466 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011467 }
11468 }
11469 }
11470 }
11471
Frank Barchard91317c52019-11-22 10:54:35 -080011472 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011473 TEST_REQUIRES_ARM_NEON;
11474 for (size_t k = 4; k <= 20; k += 2) {
11475 GemmMicrokernelTester()
11476 .mr(4)
11477 .nr(2)
11478 .kr(1)
11479 .sr(1)
11480 .m(4)
11481 .n(2)
11482 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011483 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011484 }
11485 }
11486
Frank Barchard91317c52019-11-22 10:54:35 -080011487 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011488 TEST_REQUIRES_ARM_NEON;
11489 for (size_t k = 4; k <= 20; k += 2) {
11490 GemmMicrokernelTester()
11491 .mr(4)
11492 .nr(2)
11493 .kr(1)
11494 .sr(1)
11495 .m(4)
11496 .n(2)
11497 .k(k)
11498 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080011499 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011500 }
11501 }
11502
Frank Barchard91317c52019-11-22 10:54:35 -080011503 TEST(F32_GEMM_4X2__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011504 TEST_REQUIRES_ARM_NEON;
11505 for (size_t k = 4; k <= 20; k += 2) {
11506 for (uint32_t m = 1; m <= 4; m++) {
11507 for (uint32_t n = 1; n <= 2; n++) {
11508 GemmMicrokernelTester()
11509 .mr(4)
11510 .nr(2)
11511 .kr(1)
11512 .sr(1)
11513 .m(m)
11514 .n(n)
11515 .k(k)
11516 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011517 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011518 }
11519 }
11520 }
11521 }
11522
Frank Barchard91317c52019-11-22 10:54:35 -080011523 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011524 TEST_REQUIRES_ARM_NEON;
11525 for (uint32_t n = 3; n < 4; n++) {
11526 for (size_t k = 1; k <= 10; k += 3) {
11527 GemmMicrokernelTester()
11528 .mr(4)
11529 .nr(2)
11530 .kr(1)
11531 .sr(1)
11532 .m(4)
11533 .n(2)
11534 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011535 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011536 }
11537 }
11538 }
11539
Frank Barchard91317c52019-11-22 10:54:35 -080011540 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011541 TEST_REQUIRES_ARM_NEON;
11542 for (uint32_t n = 3; n < 4; n++) {
11543 for (size_t k = 1; k <= 10; k += 3) {
11544 GemmMicrokernelTester()
11545 .mr(4)
11546 .nr(2)
11547 .kr(1)
11548 .sr(1)
11549 .m(4)
11550 .n(2)
11551 .k(k)
11552 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011553 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011554 }
11555 }
11556 }
11557
Frank Barchard91317c52019-11-22 10:54:35 -080011558 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011559 TEST_REQUIRES_ARM_NEON;
11560 for (uint32_t n = 3; n < 4; n++) {
11561 for (size_t k = 1; k <= 10; k += 3) {
11562 GemmMicrokernelTester()
11563 .mr(4)
11564 .nr(2)
11565 .kr(1)
11566 .sr(1)
11567 .m(4)
11568 .n(n)
11569 .k(k)
11570 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011571 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011572 }
11573 }
11574 }
11575
Frank Barchard91317c52019-11-22 10:54:35 -080011576 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011577 TEST_REQUIRES_ARM_NEON;
11578 for (uint32_t n = 3; n < 4; n++) {
11579 for (size_t k = 1; k <= 10; k += 3) {
11580 for (uint32_t m = 1; m <= 4; m++) {
11581 GemmMicrokernelTester()
11582 .mr(4)
11583 .nr(2)
11584 .kr(1)
11585 .sr(1)
11586 .m(m)
11587 .n(n)
11588 .k(k)
11589 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011590 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011591 }
11592 }
11593 }
11594 }
11595
Frank Barchard91317c52019-11-22 10:54:35 -080011596 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011597 TEST_REQUIRES_ARM_NEON;
11598 for (uint32_t n = 4; n <= 6; n += 2) {
11599 for (size_t k = 1; k <= 10; k += 3) {
11600 GemmMicrokernelTester()
11601 .mr(4)
11602 .nr(2)
11603 .kr(1)
11604 .sr(1)
11605 .m(4)
11606 .n(2)
11607 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011608 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011609 }
11610 }
11611 }
11612
Frank Barchard91317c52019-11-22 10:54:35 -080011613 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011614 TEST_REQUIRES_ARM_NEON;
11615 for (uint32_t n = 4; n <= 6; n += 2) {
11616 for (size_t k = 1; k <= 10; k += 3) {
11617 GemmMicrokernelTester()
11618 .mr(4)
11619 .nr(2)
11620 .kr(1)
11621 .sr(1)
11622 .m(4)
11623 .n(n)
11624 .k(k)
11625 .cn_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011626 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011627 }
11628 }
11629 }
11630
Frank Barchard91317c52019-11-22 10:54:35 -080011631 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011632 TEST_REQUIRES_ARM_NEON;
11633 for (uint32_t n = 4; n <= 6; n += 2) {
11634 for (size_t k = 1; k <= 10; k += 3) {
11635 GemmMicrokernelTester()
11636 .mr(4)
11637 .nr(2)
11638 .kr(1)
11639 .sr(1)
11640 .m(4)
11641 .n(n)
11642 .k(k)
11643 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080011644 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011645 }
11646 }
11647 }
11648
Frank Barchard91317c52019-11-22 10:54:35 -080011649 TEST(F32_GEMM_4X2__NEON_LANE_LD64, n_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011650 TEST_REQUIRES_ARM_NEON;
11651 for (uint32_t n = 4; n <= 6; n += 2) {
11652 for (size_t k = 1; k <= 10; k += 3) {
11653 for (uint32_t m = 1; m <= 4; m++) {
11654 GemmMicrokernelTester()
11655 .mr(4)
11656 .nr(2)
11657 .kr(1)
11658 .sr(1)
11659 .m(m)
11660 .n(n)
11661 .k(k)
11662 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011663 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011664 }
11665 }
11666 }
11667 }
11668
Frank Barchard91317c52019-11-22 10:54:35 -080011669 TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011670 TEST_REQUIRES_ARM_NEON;
11671 for (size_t k = 1; k <= 10; k += 3) {
11672 for (uint32_t m = 1; m <= 4; m++) {
11673 for (uint32_t n = 1; n <= 2; n++) {
11674 GemmMicrokernelTester()
11675 .mr(4)
11676 .nr(2)
11677 .kr(1)
11678 .sr(1)
11679 .m(m)
11680 .n(n)
11681 .k(k)
11682 .cm_stride(5)
11683 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011684 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011685 }
11686 }
11687 }
11688 }
11689
Frank Barchard91317c52019-11-22 10:54:35 -080011690 TEST(F32_GEMM_4X2__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011691 TEST_REQUIRES_ARM_NEON;
11692 GemmMicrokernelTester()
11693 .mr(4)
11694 .nr(2)
11695 .kr(1)
11696 .sr(1)
11697 .m(4)
11698 .n(2)
11699 .k(2)
11700 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011701 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011702 }
11703
Frank Barchard91317c52019-11-22 10:54:35 -080011704 TEST(F32_GEMM_4X2__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011705 TEST_REQUIRES_ARM_NEON;
11706 GemmMicrokernelTester()
11707 .mr(4)
11708 .nr(2)
11709 .kr(1)
11710 .sr(1)
11711 .m(4)
11712 .n(2)
11713 .k(2)
11714 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080011715 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011716 }
11717
Frank Barchard91317c52019-11-22 10:54:35 -080011718 TEST(F32_GEMM_4X2__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011719 TEST_REQUIRES_ARM_NEON;
11720 GemmMicrokernelTester()
11721 .mr(4)
11722 .nr(2)
11723 .kr(1)
11724 .sr(1)
11725 .m(4)
11726 .n(2)
11727 .k(2)
11728 .cm_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011729 .Test(xnn_f32_gemm_ukernel_4x2__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011730 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070011731#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070011732
11733
Marat Dukhan1dadbf72019-10-01 10:46:20 -070011734#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080011735 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011736 TEST_REQUIRES_ARM_NEON;
11737 GemmMicrokernelTester()
11738 .mr(4)
11739 .nr(8)
11740 .kr(1)
11741 .sr(1)
11742 .m(4)
11743 .n(8)
11744 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080011745 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011746 }
11747
Frank Barchard91317c52019-11-22 10:54:35 -080011748 TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011749 TEST_REQUIRES_ARM_NEON;
11750 GemmMicrokernelTester()
11751 .mr(4)
11752 .nr(8)
11753 .kr(1)
11754 .sr(1)
11755 .m(4)
11756 .n(8)
11757 .k(2)
11758 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080011759 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011760 }
11761
Frank Barchard91317c52019-11-22 10:54:35 -080011762 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011763 TEST_REQUIRES_ARM_NEON;
11764 GemmMicrokernelTester()
11765 .mr(4)
11766 .nr(8)
11767 .kr(1)
11768 .sr(1)
11769 .m(4)
11770 .n(8)
11771 .k(2)
11772 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011773 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011774 }
11775
Frank Barchard91317c52019-11-22 10:54:35 -080011776 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011777 TEST_REQUIRES_ARM_NEON;
11778 for (uint32_t m = 1; m <= 4; m++) {
11779 for (uint32_t n = 1; n <= 8; n++) {
11780 GemmMicrokernelTester()
11781 .mr(4)
11782 .nr(8)
11783 .kr(1)
11784 .sr(1)
11785 .m(m)
11786 .n(n)
11787 .k(2)
11788 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011789 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011790 }
11791 }
11792 }
11793
Frank Barchard91317c52019-11-22 10:54:35 -080011794 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011795 TEST_REQUIRES_ARM_NEON;
11796 for (uint32_t m = 1; m <= 4; m++) {
11797 GemmMicrokernelTester()
11798 .mr(4)
11799 .nr(8)
11800 .kr(1)
11801 .sr(1)
11802 .m(m)
11803 .n(8)
11804 .k(2)
11805 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011806 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011807 }
11808 }
11809
Frank Barchard91317c52019-11-22 10:54:35 -080011810 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011811 TEST_REQUIRES_ARM_NEON;
11812 for (uint32_t n = 1; n <= 8; n++) {
11813 GemmMicrokernelTester()
11814 .mr(4)
11815 .nr(8)
11816 .kr(1)
11817 .sr(1)
11818 .m(4)
11819 .n(n)
11820 .k(2)
11821 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011822 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011823 }
11824 }
11825
Frank Barchard91317c52019-11-22 10:54:35 -080011826 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011827 TEST_REQUIRES_ARM_NEON;
11828 for (size_t k = 1; k < 2; k++) {
11829 GemmMicrokernelTester()
11830 .mr(4)
11831 .nr(8)
11832 .kr(1)
11833 .sr(1)
11834 .m(4)
11835 .n(8)
11836 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011837 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011838 }
11839 }
11840
Frank Barchard91317c52019-11-22 10:54:35 -080011841 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011842 TEST_REQUIRES_ARM_NEON;
11843 for (size_t k = 1; k < 2; k++) {
11844 GemmMicrokernelTester()
11845 .mr(4)
11846 .nr(8)
11847 .kr(1)
11848 .sr(1)
11849 .m(4)
11850 .n(8)
11851 .k(k)
11852 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080011853 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011854 }
11855 }
11856
Frank Barchard91317c52019-11-22 10:54:35 -080011857 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011858 TEST_REQUIRES_ARM_NEON;
11859 for (size_t k = 1; k < 2; k++) {
11860 for (uint32_t m = 1; m <= 4; m++) {
11861 for (uint32_t n = 1; n <= 8; n++) {
11862 GemmMicrokernelTester()
11863 .mr(4)
11864 .nr(8)
11865 .kr(1)
11866 .sr(1)
11867 .m(m)
11868 .n(n)
11869 .k(k)
11870 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011871 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011872 }
11873 }
11874 }
11875 }
11876
Frank Barchard91317c52019-11-22 10:54:35 -080011877 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011878 TEST_REQUIRES_ARM_NEON;
11879 for (size_t k = 3; k < 4; k++) {
11880 GemmMicrokernelTester()
11881 .mr(4)
11882 .nr(8)
11883 .kr(1)
11884 .sr(1)
11885 .m(4)
11886 .n(8)
11887 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011888 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011889 }
11890 }
11891
Frank Barchard91317c52019-11-22 10:54:35 -080011892 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011893 TEST_REQUIRES_ARM_NEON;
11894 for (size_t k = 3; k < 4; k++) {
11895 GemmMicrokernelTester()
11896 .mr(4)
11897 .nr(8)
11898 .kr(1)
11899 .sr(1)
11900 .m(4)
11901 .n(8)
11902 .k(k)
11903 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080011904 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011905 }
11906 }
11907
Frank Barchard91317c52019-11-22 10:54:35 -080011908 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011909 TEST_REQUIRES_ARM_NEON;
11910 for (size_t k = 3; k < 4; k++) {
11911 for (uint32_t m = 1; m <= 4; m++) {
11912 for (uint32_t n = 1; n <= 8; n++) {
11913 GemmMicrokernelTester()
11914 .mr(4)
11915 .nr(8)
11916 .kr(1)
11917 .sr(1)
11918 .m(m)
11919 .n(n)
11920 .k(k)
11921 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011922 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011923 }
11924 }
11925 }
11926 }
11927
Frank Barchard91317c52019-11-22 10:54:35 -080011928 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011929 TEST_REQUIRES_ARM_NEON;
11930 for (size_t k = 4; k <= 20; k += 2) {
11931 GemmMicrokernelTester()
11932 .mr(4)
11933 .nr(8)
11934 .kr(1)
11935 .sr(1)
11936 .m(4)
11937 .n(8)
11938 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011939 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011940 }
11941 }
11942
Frank Barchard91317c52019-11-22 10:54:35 -080011943 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011944 TEST_REQUIRES_ARM_NEON;
11945 for (size_t k = 4; k <= 20; k += 2) {
11946 GemmMicrokernelTester()
11947 .mr(4)
11948 .nr(8)
11949 .kr(1)
11950 .sr(1)
11951 .m(4)
11952 .n(8)
11953 .k(k)
11954 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080011955 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011956 }
11957 }
11958
Frank Barchard91317c52019-11-22 10:54:35 -080011959 TEST(F32_GEMM_4X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011960 TEST_REQUIRES_ARM_NEON;
11961 for (size_t k = 4; k <= 20; k += 2) {
11962 for (uint32_t m = 1; m <= 4; m++) {
11963 for (uint32_t n = 1; n <= 8; n++) {
11964 GemmMicrokernelTester()
11965 .mr(4)
11966 .nr(8)
11967 .kr(1)
11968 .sr(1)
11969 .m(m)
11970 .n(n)
11971 .k(k)
11972 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080011973 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011974 }
11975 }
11976 }
11977 }
11978
Frank Barchard91317c52019-11-22 10:54:35 -080011979 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011980 TEST_REQUIRES_ARM_NEON;
11981 for (uint32_t n = 9; n < 16; n++) {
11982 for (size_t k = 1; k <= 10; k += 3) {
11983 GemmMicrokernelTester()
11984 .mr(4)
11985 .nr(8)
11986 .kr(1)
11987 .sr(1)
11988 .m(4)
11989 .n(8)
11990 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080011991 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070011992 }
11993 }
11994 }
11995
Frank Barchard91317c52019-11-22 10:54:35 -080011996 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070011997 TEST_REQUIRES_ARM_NEON;
11998 for (uint32_t n = 9; n < 16; n++) {
11999 for (size_t k = 1; k <= 10; k += 3) {
12000 GemmMicrokernelTester()
12001 .mr(4)
12002 .nr(8)
12003 .kr(1)
12004 .sr(1)
12005 .m(4)
12006 .n(8)
12007 .k(k)
12008 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012009 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012010 }
12011 }
12012 }
12013
Frank Barchard91317c52019-11-22 10:54:35 -080012014 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012015 TEST_REQUIRES_ARM_NEON;
12016 for (uint32_t n = 9; n < 16; n++) {
12017 for (size_t k = 1; k <= 10; k += 3) {
12018 GemmMicrokernelTester()
12019 .mr(4)
12020 .nr(8)
12021 .kr(1)
12022 .sr(1)
12023 .m(4)
12024 .n(n)
12025 .k(k)
12026 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080012027 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012028 }
12029 }
12030 }
12031
Frank Barchard91317c52019-11-22 10:54:35 -080012032 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012033 TEST_REQUIRES_ARM_NEON;
12034 for (uint32_t n = 9; n < 16; n++) {
12035 for (size_t k = 1; k <= 10; k += 3) {
12036 for (uint32_t m = 1; m <= 4; m++) {
12037 GemmMicrokernelTester()
12038 .mr(4)
12039 .nr(8)
12040 .kr(1)
12041 .sr(1)
12042 .m(m)
12043 .n(n)
12044 .k(k)
12045 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012046 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012047 }
12048 }
12049 }
12050 }
12051
Frank Barchard91317c52019-11-22 10:54:35 -080012052 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012053 TEST_REQUIRES_ARM_NEON;
12054 for (uint32_t n = 16; n <= 24; n += 8) {
12055 for (size_t k = 1; k <= 10; k += 3) {
12056 GemmMicrokernelTester()
12057 .mr(4)
12058 .nr(8)
12059 .kr(1)
12060 .sr(1)
12061 .m(4)
12062 .n(8)
12063 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012064 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012065 }
12066 }
12067 }
12068
Frank Barchard91317c52019-11-22 10:54:35 -080012069 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012070 TEST_REQUIRES_ARM_NEON;
12071 for (uint32_t n = 16; n <= 24; n += 8) {
12072 for (size_t k = 1; k <= 10; k += 3) {
12073 GemmMicrokernelTester()
12074 .mr(4)
12075 .nr(8)
12076 .kr(1)
12077 .sr(1)
12078 .m(4)
12079 .n(n)
12080 .k(k)
12081 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012082 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012083 }
12084 }
12085 }
12086
Frank Barchard91317c52019-11-22 10:54:35 -080012087 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012088 TEST_REQUIRES_ARM_NEON;
12089 for (uint32_t n = 16; n <= 24; n += 8) {
12090 for (size_t k = 1; k <= 10; k += 3) {
12091 GemmMicrokernelTester()
12092 .mr(4)
12093 .nr(8)
12094 .kr(1)
12095 .sr(1)
12096 .m(4)
12097 .n(n)
12098 .k(k)
12099 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080012100 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012101 }
12102 }
12103 }
12104
Frank Barchard91317c52019-11-22 10:54:35 -080012105 TEST(F32_GEMM_4X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012106 TEST_REQUIRES_ARM_NEON;
12107 for (uint32_t n = 16; n <= 24; n += 8) {
12108 for (size_t k = 1; k <= 10; k += 3) {
12109 for (uint32_t m = 1; m <= 4; m++) {
12110 GemmMicrokernelTester()
12111 .mr(4)
12112 .nr(8)
12113 .kr(1)
12114 .sr(1)
12115 .m(m)
12116 .n(n)
12117 .k(k)
12118 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012119 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012120 }
12121 }
12122 }
12123 }
12124
Frank Barchard91317c52019-11-22 10:54:35 -080012125 TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012126 TEST_REQUIRES_ARM_NEON;
12127 for (size_t k = 1; k <= 10; k += 3) {
12128 for (uint32_t m = 1; m <= 4; m++) {
12129 for (uint32_t n = 1; n <= 8; n++) {
12130 GemmMicrokernelTester()
12131 .mr(4)
12132 .nr(8)
12133 .kr(1)
12134 .sr(1)
12135 .m(m)
12136 .n(n)
12137 .k(k)
12138 .cm_stride(11)
12139 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012140 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012141 }
12142 }
12143 }
12144 }
12145
Frank Barchard91317c52019-11-22 10:54:35 -080012146 TEST(F32_GEMM_4X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012147 TEST_REQUIRES_ARM_NEON;
12148 GemmMicrokernelTester()
12149 .mr(4)
12150 .nr(8)
12151 .kr(1)
12152 .sr(1)
12153 .m(4)
12154 .n(8)
12155 .k(2)
12156 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012157 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012158 }
12159
Frank Barchard91317c52019-11-22 10:54:35 -080012160 TEST(F32_GEMM_4X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012161 TEST_REQUIRES_ARM_NEON;
12162 GemmMicrokernelTester()
12163 .mr(4)
12164 .nr(8)
12165 .kr(1)
12166 .sr(1)
12167 .m(4)
12168 .n(8)
12169 .k(2)
12170 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012171 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012172 }
12173
Frank Barchard91317c52019-11-22 10:54:35 -080012174 TEST(F32_GEMM_4X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012175 TEST_REQUIRES_ARM_NEON;
12176 GemmMicrokernelTester()
12177 .mr(4)
12178 .nr(8)
12179 .kr(1)
12180 .sr(1)
12181 .m(4)
12182 .n(8)
12183 .k(2)
12184 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012185 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012186 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070012187#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070012188
12189
Marat Dukhan1dadbf72019-10-01 10:46:20 -070012190#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080012191 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012192 TEST_REQUIRES_ARM_NEON;
12193 GemmMicrokernelTester()
12194 .mr(4)
12195 .nr(8)
12196 .kr(1)
12197 .sr(1)
12198 .m(4)
12199 .n(8)
12200 .k(4)
Frank Barchard91317c52019-11-22 10:54:35 -080012201 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012202 }
12203
Frank Barchard91317c52019-11-22 10:54:35 -080012204 TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012205 TEST_REQUIRES_ARM_NEON;
12206 GemmMicrokernelTester()
12207 .mr(4)
12208 .nr(8)
12209 .kr(1)
12210 .sr(1)
12211 .m(4)
12212 .n(8)
12213 .k(4)
12214 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012215 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012216 }
12217
Frank Barchard91317c52019-11-22 10:54:35 -080012218 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012219 TEST_REQUIRES_ARM_NEON;
12220 GemmMicrokernelTester()
12221 .mr(4)
12222 .nr(8)
12223 .kr(1)
12224 .sr(1)
12225 .m(4)
12226 .n(8)
12227 .k(4)
12228 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012229 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012230 }
12231
Frank Barchard91317c52019-11-22 10:54:35 -080012232 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012233 TEST_REQUIRES_ARM_NEON;
12234 for (uint32_t m = 1; m <= 4; m++) {
12235 for (uint32_t n = 1; n <= 8; n++) {
12236 GemmMicrokernelTester()
12237 .mr(4)
12238 .nr(8)
12239 .kr(1)
12240 .sr(1)
12241 .m(m)
12242 .n(n)
12243 .k(4)
12244 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012245 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012246 }
12247 }
12248 }
12249
Frank Barchard91317c52019-11-22 10:54:35 -080012250 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012251 TEST_REQUIRES_ARM_NEON;
12252 for (uint32_t m = 1; m <= 4; m++) {
12253 GemmMicrokernelTester()
12254 .mr(4)
12255 .nr(8)
12256 .kr(1)
12257 .sr(1)
12258 .m(m)
12259 .n(8)
12260 .k(4)
12261 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012262 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012263 }
12264 }
12265
Frank Barchard91317c52019-11-22 10:54:35 -080012266 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012267 TEST_REQUIRES_ARM_NEON;
12268 for (uint32_t n = 1; n <= 8; n++) {
12269 GemmMicrokernelTester()
12270 .mr(4)
12271 .nr(8)
12272 .kr(1)
12273 .sr(1)
12274 .m(4)
12275 .n(n)
12276 .k(4)
12277 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012278 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012279 }
12280 }
12281
Frank Barchard91317c52019-11-22 10:54:35 -080012282 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012283 TEST_REQUIRES_ARM_NEON;
12284 for (size_t k = 1; k < 4; k++) {
12285 GemmMicrokernelTester()
12286 .mr(4)
12287 .nr(8)
12288 .kr(1)
12289 .sr(1)
12290 .m(4)
12291 .n(8)
12292 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012293 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012294 }
12295 }
12296
Frank Barchard91317c52019-11-22 10:54:35 -080012297 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012298 TEST_REQUIRES_ARM_NEON;
12299 for (size_t k = 1; k < 4; k++) {
12300 GemmMicrokernelTester()
12301 .mr(4)
12302 .nr(8)
12303 .kr(1)
12304 .sr(1)
12305 .m(4)
12306 .n(8)
12307 .k(k)
12308 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012309 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012310 }
12311 }
12312
Frank Barchard91317c52019-11-22 10:54:35 -080012313 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012314 TEST_REQUIRES_ARM_NEON;
12315 for (size_t k = 1; k < 4; k++) {
12316 for (uint32_t m = 1; m <= 4; m++) {
12317 for (uint32_t n = 1; n <= 8; n++) {
12318 GemmMicrokernelTester()
12319 .mr(4)
12320 .nr(8)
12321 .kr(1)
12322 .sr(1)
12323 .m(m)
12324 .n(n)
12325 .k(k)
12326 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012327 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012328 }
12329 }
12330 }
12331 }
12332
Frank Barchard91317c52019-11-22 10:54:35 -080012333 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012334 TEST_REQUIRES_ARM_NEON;
12335 for (size_t k = 5; k < 8; k++) {
12336 GemmMicrokernelTester()
12337 .mr(4)
12338 .nr(8)
12339 .kr(1)
12340 .sr(1)
12341 .m(4)
12342 .n(8)
12343 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012344 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012345 }
12346 }
12347
Frank Barchard91317c52019-11-22 10:54:35 -080012348 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012349 TEST_REQUIRES_ARM_NEON;
12350 for (size_t k = 5; k < 8; k++) {
12351 GemmMicrokernelTester()
12352 .mr(4)
12353 .nr(8)
12354 .kr(1)
12355 .sr(1)
12356 .m(4)
12357 .n(8)
12358 .k(k)
12359 .a_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012360 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012361 }
12362 }
12363
Frank Barchard91317c52019-11-22 10:54:35 -080012364 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012365 TEST_REQUIRES_ARM_NEON;
12366 for (size_t k = 5; k < 8; k++) {
12367 for (uint32_t m = 1; m <= 4; m++) {
12368 for (uint32_t n = 1; n <= 8; n++) {
12369 GemmMicrokernelTester()
12370 .mr(4)
12371 .nr(8)
12372 .kr(1)
12373 .sr(1)
12374 .m(m)
12375 .n(n)
12376 .k(k)
12377 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012378 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012379 }
12380 }
12381 }
12382 }
12383
Frank Barchard91317c52019-11-22 10:54:35 -080012384 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012385 TEST_REQUIRES_ARM_NEON;
12386 for (size_t k = 8; k <= 40; k += 4) {
12387 GemmMicrokernelTester()
12388 .mr(4)
12389 .nr(8)
12390 .kr(1)
12391 .sr(1)
12392 .m(4)
12393 .n(8)
12394 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012395 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012396 }
12397 }
12398
Frank Barchard91317c52019-11-22 10:54:35 -080012399 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012400 TEST_REQUIRES_ARM_NEON;
12401 for (size_t k = 8; k <= 40; k += 4) {
12402 GemmMicrokernelTester()
12403 .mr(4)
12404 .nr(8)
12405 .kr(1)
12406 .sr(1)
12407 .m(4)
12408 .n(8)
12409 .k(k)
12410 .a_stride(43)
Frank Barchard91317c52019-11-22 10:54:35 -080012411 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012412 }
12413 }
12414
Frank Barchard91317c52019-11-22 10:54:35 -080012415 TEST(F32_GEMM_4X8__NEON_LANE_LD128, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012416 TEST_REQUIRES_ARM_NEON;
12417 for (size_t k = 8; k <= 40; k += 4) {
12418 for (uint32_t m = 1; m <= 4; m++) {
12419 for (uint32_t n = 1; n <= 8; n++) {
12420 GemmMicrokernelTester()
12421 .mr(4)
12422 .nr(8)
12423 .kr(1)
12424 .sr(1)
12425 .m(m)
12426 .n(n)
12427 .k(k)
12428 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012429 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012430 }
12431 }
12432 }
12433 }
12434
Frank Barchard91317c52019-11-22 10:54:35 -080012435 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012436 TEST_REQUIRES_ARM_NEON;
12437 for (uint32_t n = 9; n < 16; n++) {
12438 for (size_t k = 1; k <= 20; k += 5) {
12439 GemmMicrokernelTester()
12440 .mr(4)
12441 .nr(8)
12442 .kr(1)
12443 .sr(1)
12444 .m(4)
12445 .n(8)
12446 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012447 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012448 }
12449 }
12450 }
12451
Frank Barchard91317c52019-11-22 10:54:35 -080012452 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012453 TEST_REQUIRES_ARM_NEON;
12454 for (uint32_t n = 9; n < 16; n++) {
12455 for (size_t k = 1; k <= 20; k += 5) {
12456 GemmMicrokernelTester()
12457 .mr(4)
12458 .nr(8)
12459 .kr(1)
12460 .sr(1)
12461 .m(4)
12462 .n(8)
12463 .k(k)
12464 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012465 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012466 }
12467 }
12468 }
12469
Frank Barchard91317c52019-11-22 10:54:35 -080012470 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012471 TEST_REQUIRES_ARM_NEON;
12472 for (uint32_t n = 9; n < 16; n++) {
12473 for (size_t k = 1; k <= 20; k += 5) {
12474 GemmMicrokernelTester()
12475 .mr(4)
12476 .nr(8)
12477 .kr(1)
12478 .sr(1)
12479 .m(4)
12480 .n(n)
12481 .k(k)
12482 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080012483 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012484 }
12485 }
12486 }
12487
Frank Barchard91317c52019-11-22 10:54:35 -080012488 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012489 TEST_REQUIRES_ARM_NEON;
12490 for (uint32_t n = 9; n < 16; n++) {
12491 for (size_t k = 1; k <= 20; k += 5) {
12492 for (uint32_t m = 1; m <= 4; m++) {
12493 GemmMicrokernelTester()
12494 .mr(4)
12495 .nr(8)
12496 .kr(1)
12497 .sr(1)
12498 .m(m)
12499 .n(n)
12500 .k(k)
12501 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012502 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012503 }
12504 }
12505 }
12506 }
12507
Frank Barchard91317c52019-11-22 10:54:35 -080012508 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012509 TEST_REQUIRES_ARM_NEON;
12510 for (uint32_t n = 16; n <= 24; n += 8) {
12511 for (size_t k = 1; k <= 20; k += 5) {
12512 GemmMicrokernelTester()
12513 .mr(4)
12514 .nr(8)
12515 .kr(1)
12516 .sr(1)
12517 .m(4)
12518 .n(8)
12519 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012520 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012521 }
12522 }
12523 }
12524
Frank Barchard91317c52019-11-22 10:54:35 -080012525 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012526 TEST_REQUIRES_ARM_NEON;
12527 for (uint32_t n = 16; n <= 24; n += 8) {
12528 for (size_t k = 1; k <= 20; k += 5) {
12529 GemmMicrokernelTester()
12530 .mr(4)
12531 .nr(8)
12532 .kr(1)
12533 .sr(1)
12534 .m(4)
12535 .n(n)
12536 .k(k)
12537 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012538 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012539 }
12540 }
12541 }
12542
Frank Barchard91317c52019-11-22 10:54:35 -080012543 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012544 TEST_REQUIRES_ARM_NEON;
12545 for (uint32_t n = 16; n <= 24; n += 8) {
12546 for (size_t k = 1; k <= 20; k += 5) {
12547 GemmMicrokernelTester()
12548 .mr(4)
12549 .nr(8)
12550 .kr(1)
12551 .sr(1)
12552 .m(4)
12553 .n(n)
12554 .k(k)
12555 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080012556 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012557 }
12558 }
12559 }
12560
Frank Barchard91317c52019-11-22 10:54:35 -080012561 TEST(F32_GEMM_4X8__NEON_LANE_LD128, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012562 TEST_REQUIRES_ARM_NEON;
12563 for (uint32_t n = 16; n <= 24; n += 8) {
12564 for (size_t k = 1; k <= 20; k += 5) {
12565 for (uint32_t m = 1; m <= 4; m++) {
12566 GemmMicrokernelTester()
12567 .mr(4)
12568 .nr(8)
12569 .kr(1)
12570 .sr(1)
12571 .m(m)
12572 .n(n)
12573 .k(k)
12574 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012575 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012576 }
12577 }
12578 }
12579 }
12580
Frank Barchard91317c52019-11-22 10:54:35 -080012581 TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012582 TEST_REQUIRES_ARM_NEON;
12583 for (size_t k = 1; k <= 20; k += 5) {
12584 for (uint32_t m = 1; m <= 4; m++) {
12585 for (uint32_t n = 1; n <= 8; n++) {
12586 GemmMicrokernelTester()
12587 .mr(4)
12588 .nr(8)
12589 .kr(1)
12590 .sr(1)
12591 .m(m)
12592 .n(n)
12593 .k(k)
12594 .cm_stride(11)
12595 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012596 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012597 }
12598 }
12599 }
12600 }
12601
Frank Barchard91317c52019-11-22 10:54:35 -080012602 TEST(F32_GEMM_4X8__NEON_LANE_LD128, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012603 TEST_REQUIRES_ARM_NEON;
12604 GemmMicrokernelTester()
12605 .mr(4)
12606 .nr(8)
12607 .kr(1)
12608 .sr(1)
12609 .m(4)
12610 .n(8)
12611 .k(4)
12612 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012613 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012614 }
12615
Frank Barchard91317c52019-11-22 10:54:35 -080012616 TEST(F32_GEMM_4X8__NEON_LANE_LD128, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012617 TEST_REQUIRES_ARM_NEON;
12618 GemmMicrokernelTester()
12619 .mr(4)
12620 .nr(8)
12621 .kr(1)
12622 .sr(1)
12623 .m(4)
12624 .n(8)
12625 .k(4)
12626 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080012627 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012628 }
12629
Frank Barchard91317c52019-11-22 10:54:35 -080012630 TEST(F32_GEMM_4X8__NEON_LANE_LD128, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012631 TEST_REQUIRES_ARM_NEON;
12632 GemmMicrokernelTester()
12633 .mr(4)
12634 .nr(8)
12635 .kr(1)
12636 .sr(1)
12637 .m(4)
12638 .n(8)
12639 .k(4)
12640 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012641 .Test(xnn_f32_gemm_ukernel_4x8__neon_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012642 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070012643#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070012644
12645
Marat Dukhan1dadbf72019-10-01 10:46:20 -070012646#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080012647 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012648 TEST_REQUIRES_ARM_NEON;
12649 GemmMicrokernelTester()
12650 .mr(5)
12651 .nr(8)
12652 .kr(1)
12653 .sr(1)
12654 .m(5)
12655 .n(8)
12656 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080012657 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012658 }
12659
Frank Barchard91317c52019-11-22 10:54:35 -080012660 TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012661 TEST_REQUIRES_ARM_NEON;
12662 GemmMicrokernelTester()
12663 .mr(5)
12664 .nr(8)
12665 .kr(1)
12666 .sr(1)
12667 .m(5)
12668 .n(8)
12669 .k(2)
12670 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012671 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012672 }
12673
Frank Barchard91317c52019-11-22 10:54:35 -080012674 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012675 TEST_REQUIRES_ARM_NEON;
12676 GemmMicrokernelTester()
12677 .mr(5)
12678 .nr(8)
12679 .kr(1)
12680 .sr(1)
12681 .m(5)
12682 .n(8)
12683 .k(2)
12684 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012685 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012686 }
12687
Frank Barchard91317c52019-11-22 10:54:35 -080012688 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012689 TEST_REQUIRES_ARM_NEON;
12690 for (uint32_t m = 1; m <= 5; m++) {
12691 for (uint32_t n = 1; n <= 8; n++) {
12692 GemmMicrokernelTester()
12693 .mr(5)
12694 .nr(8)
12695 .kr(1)
12696 .sr(1)
12697 .m(m)
12698 .n(n)
12699 .k(2)
12700 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012701 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012702 }
12703 }
12704 }
12705
Frank Barchard91317c52019-11-22 10:54:35 -080012706 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012707 TEST_REQUIRES_ARM_NEON;
12708 for (uint32_t m = 1; m <= 5; m++) {
12709 GemmMicrokernelTester()
12710 .mr(5)
12711 .nr(8)
12712 .kr(1)
12713 .sr(1)
12714 .m(m)
12715 .n(8)
12716 .k(2)
12717 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012718 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012719 }
12720 }
12721
Frank Barchard91317c52019-11-22 10:54:35 -080012722 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012723 TEST_REQUIRES_ARM_NEON;
12724 for (uint32_t n = 1; n <= 8; n++) {
12725 GemmMicrokernelTester()
12726 .mr(5)
12727 .nr(8)
12728 .kr(1)
12729 .sr(1)
12730 .m(5)
12731 .n(n)
12732 .k(2)
12733 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012734 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012735 }
12736 }
12737
Frank Barchard91317c52019-11-22 10:54:35 -080012738 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012739 TEST_REQUIRES_ARM_NEON;
12740 for (size_t k = 1; k < 2; k++) {
12741 GemmMicrokernelTester()
12742 .mr(5)
12743 .nr(8)
12744 .kr(1)
12745 .sr(1)
12746 .m(5)
12747 .n(8)
12748 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012749 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012750 }
12751 }
12752
Frank Barchard91317c52019-11-22 10:54:35 -080012753 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012754 TEST_REQUIRES_ARM_NEON;
12755 for (size_t k = 1; k < 2; k++) {
12756 GemmMicrokernelTester()
12757 .mr(5)
12758 .nr(8)
12759 .kr(1)
12760 .sr(1)
12761 .m(5)
12762 .n(8)
12763 .k(k)
12764 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080012765 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012766 }
12767 }
12768
Frank Barchard91317c52019-11-22 10:54:35 -080012769 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012770 TEST_REQUIRES_ARM_NEON;
12771 for (size_t k = 1; k < 2; k++) {
12772 for (uint32_t m = 1; m <= 5; m++) {
12773 for (uint32_t n = 1; n <= 8; n++) {
12774 GemmMicrokernelTester()
12775 .mr(5)
12776 .nr(8)
12777 .kr(1)
12778 .sr(1)
12779 .m(m)
12780 .n(n)
12781 .k(k)
12782 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012783 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012784 }
12785 }
12786 }
12787 }
12788
Frank Barchard91317c52019-11-22 10:54:35 -080012789 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012790 TEST_REQUIRES_ARM_NEON;
12791 for (size_t k = 3; k < 4; k++) {
12792 GemmMicrokernelTester()
12793 .mr(5)
12794 .nr(8)
12795 .kr(1)
12796 .sr(1)
12797 .m(5)
12798 .n(8)
12799 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012800 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012801 }
12802 }
12803
Frank Barchard91317c52019-11-22 10:54:35 -080012804 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012805 TEST_REQUIRES_ARM_NEON;
12806 for (size_t k = 3; k < 4; k++) {
12807 GemmMicrokernelTester()
12808 .mr(5)
12809 .nr(8)
12810 .kr(1)
12811 .sr(1)
12812 .m(5)
12813 .n(8)
12814 .k(k)
12815 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080012816 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012817 }
12818 }
12819
Frank Barchard91317c52019-11-22 10:54:35 -080012820 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012821 TEST_REQUIRES_ARM_NEON;
12822 for (size_t k = 3; k < 4; k++) {
12823 for (uint32_t m = 1; m <= 5; m++) {
12824 for (uint32_t n = 1; n <= 8; n++) {
12825 GemmMicrokernelTester()
12826 .mr(5)
12827 .nr(8)
12828 .kr(1)
12829 .sr(1)
12830 .m(m)
12831 .n(n)
12832 .k(k)
12833 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012834 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012835 }
12836 }
12837 }
12838 }
12839
Frank Barchard91317c52019-11-22 10:54:35 -080012840 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012841 TEST_REQUIRES_ARM_NEON;
12842 for (size_t k = 4; k <= 20; k += 2) {
12843 GemmMicrokernelTester()
12844 .mr(5)
12845 .nr(8)
12846 .kr(1)
12847 .sr(1)
12848 .m(5)
12849 .n(8)
12850 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012851 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012852 }
12853 }
12854
Frank Barchard91317c52019-11-22 10:54:35 -080012855 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012856 TEST_REQUIRES_ARM_NEON;
12857 for (size_t k = 4; k <= 20; k += 2) {
12858 GemmMicrokernelTester()
12859 .mr(5)
12860 .nr(8)
12861 .kr(1)
12862 .sr(1)
12863 .m(5)
12864 .n(8)
12865 .k(k)
12866 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080012867 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012868 }
12869 }
12870
Frank Barchard91317c52019-11-22 10:54:35 -080012871 TEST(F32_GEMM_5X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012872 TEST_REQUIRES_ARM_NEON;
12873 for (size_t k = 4; k <= 20; k += 2) {
12874 for (uint32_t m = 1; m <= 5; m++) {
12875 for (uint32_t n = 1; n <= 8; n++) {
12876 GemmMicrokernelTester()
12877 .mr(5)
12878 .nr(8)
12879 .kr(1)
12880 .sr(1)
12881 .m(m)
12882 .n(n)
12883 .k(k)
12884 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012885 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012886 }
12887 }
12888 }
12889 }
12890
Frank Barchard91317c52019-11-22 10:54:35 -080012891 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012892 TEST_REQUIRES_ARM_NEON;
12893 for (uint32_t n = 9; n < 16; n++) {
12894 for (size_t k = 1; k <= 10; k += 3) {
12895 GemmMicrokernelTester()
12896 .mr(5)
12897 .nr(8)
12898 .kr(1)
12899 .sr(1)
12900 .m(5)
12901 .n(8)
12902 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012903 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012904 }
12905 }
12906 }
12907
Frank Barchard91317c52019-11-22 10:54:35 -080012908 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012909 TEST_REQUIRES_ARM_NEON;
12910 for (uint32_t n = 9; n < 16; n++) {
12911 for (size_t k = 1; k <= 10; k += 3) {
12912 GemmMicrokernelTester()
12913 .mr(5)
12914 .nr(8)
12915 .kr(1)
12916 .sr(1)
12917 .m(5)
12918 .n(8)
12919 .k(k)
12920 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012921 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012922 }
12923 }
12924 }
12925
Frank Barchard91317c52019-11-22 10:54:35 -080012926 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012927 TEST_REQUIRES_ARM_NEON;
12928 for (uint32_t n = 9; n < 16; n++) {
12929 for (size_t k = 1; k <= 10; k += 3) {
12930 GemmMicrokernelTester()
12931 .mr(5)
12932 .nr(8)
12933 .kr(1)
12934 .sr(1)
12935 .m(5)
12936 .n(n)
12937 .k(k)
12938 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080012939 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012940 }
12941 }
12942 }
12943
Frank Barchard91317c52019-11-22 10:54:35 -080012944 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012945 TEST_REQUIRES_ARM_NEON;
12946 for (uint32_t n = 9; n < 16; n++) {
12947 for (size_t k = 1; k <= 10; k += 3) {
12948 for (uint32_t m = 1; m <= 5; m++) {
12949 GemmMicrokernelTester()
12950 .mr(5)
12951 .nr(8)
12952 .kr(1)
12953 .sr(1)
12954 .m(m)
12955 .n(n)
12956 .k(k)
12957 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080012958 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012959 }
12960 }
12961 }
12962 }
12963
Frank Barchard91317c52019-11-22 10:54:35 -080012964 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012965 TEST_REQUIRES_ARM_NEON;
12966 for (uint32_t n = 16; n <= 24; n += 8) {
12967 for (size_t k = 1; k <= 10; k += 3) {
12968 GemmMicrokernelTester()
12969 .mr(5)
12970 .nr(8)
12971 .kr(1)
12972 .sr(1)
12973 .m(5)
12974 .n(8)
12975 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080012976 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012977 }
12978 }
12979 }
12980
Frank Barchard91317c52019-11-22 10:54:35 -080012981 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070012982 TEST_REQUIRES_ARM_NEON;
12983 for (uint32_t n = 16; n <= 24; n += 8) {
12984 for (size_t k = 1; k <= 10; k += 3) {
12985 GemmMicrokernelTester()
12986 .mr(5)
12987 .nr(8)
12988 .kr(1)
12989 .sr(1)
12990 .m(5)
12991 .n(n)
12992 .k(k)
12993 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080012994 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070012995 }
12996 }
12997 }
12998
Frank Barchard91317c52019-11-22 10:54:35 -080012999 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013000 TEST_REQUIRES_ARM_NEON;
13001 for (uint32_t n = 16; n <= 24; n += 8) {
13002 for (size_t k = 1; k <= 10; k += 3) {
13003 GemmMicrokernelTester()
13004 .mr(5)
13005 .nr(8)
13006 .kr(1)
13007 .sr(1)
13008 .m(5)
13009 .n(n)
13010 .k(k)
13011 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080013012 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013013 }
13014 }
13015 }
13016
Frank Barchard91317c52019-11-22 10:54:35 -080013017 TEST(F32_GEMM_5X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013018 TEST_REQUIRES_ARM_NEON;
13019 for (uint32_t n = 16; n <= 24; n += 8) {
13020 for (size_t k = 1; k <= 10; k += 3) {
13021 for (uint32_t m = 1; m <= 5; m++) {
13022 GemmMicrokernelTester()
13023 .mr(5)
13024 .nr(8)
13025 .kr(1)
13026 .sr(1)
13027 .m(m)
13028 .n(n)
13029 .k(k)
13030 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013031 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013032 }
13033 }
13034 }
13035 }
13036
Frank Barchard91317c52019-11-22 10:54:35 -080013037 TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013038 TEST_REQUIRES_ARM_NEON;
13039 for (size_t k = 1; k <= 10; k += 3) {
13040 for (uint32_t m = 1; m <= 5; m++) {
13041 for (uint32_t n = 1; n <= 8; n++) {
13042 GemmMicrokernelTester()
13043 .mr(5)
13044 .nr(8)
13045 .kr(1)
13046 .sr(1)
13047 .m(m)
13048 .n(n)
13049 .k(k)
13050 .cm_stride(11)
13051 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013052 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013053 }
13054 }
13055 }
13056 }
13057
Frank Barchard91317c52019-11-22 10:54:35 -080013058 TEST(F32_GEMM_5X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013059 TEST_REQUIRES_ARM_NEON;
13060 GemmMicrokernelTester()
13061 .mr(5)
13062 .nr(8)
13063 .kr(1)
13064 .sr(1)
13065 .m(5)
13066 .n(8)
13067 .k(2)
13068 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013069 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013070 }
13071
Frank Barchard91317c52019-11-22 10:54:35 -080013072 TEST(F32_GEMM_5X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013073 TEST_REQUIRES_ARM_NEON;
13074 GemmMicrokernelTester()
13075 .mr(5)
13076 .nr(8)
13077 .kr(1)
13078 .sr(1)
13079 .m(5)
13080 .n(8)
13081 .k(2)
13082 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013083 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013084 }
13085
Frank Barchard91317c52019-11-22 10:54:35 -080013086 TEST(F32_GEMM_5X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013087 TEST_REQUIRES_ARM_NEON;
13088 GemmMicrokernelTester()
13089 .mr(5)
13090 .nr(8)
13091 .kr(1)
13092 .sr(1)
13093 .m(5)
13094 .n(8)
13095 .k(2)
13096 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013097 .Test(xnn_f32_gemm_ukernel_5x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013098 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070013099#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070013100
13101
Marat Dukhan1dadbf72019-10-01 10:46:20 -070013102#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard91317c52019-11-22 10:54:35 -080013103 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013104 TEST_REQUIRES_ARM_NEON;
13105 GemmMicrokernelTester()
13106 .mr(6)
13107 .nr(8)
13108 .kr(1)
13109 .sr(1)
13110 .m(6)
13111 .n(8)
13112 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080013113 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013114 }
13115
Frank Barchard91317c52019-11-22 10:54:35 -080013116 TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013117 TEST_REQUIRES_ARM_NEON;
13118 GemmMicrokernelTester()
13119 .mr(6)
13120 .nr(8)
13121 .kr(1)
13122 .sr(1)
13123 .m(6)
13124 .n(8)
13125 .k(2)
13126 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013127 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013128 }
13129
Frank Barchard91317c52019-11-22 10:54:35 -080013130 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013131 TEST_REQUIRES_ARM_NEON;
13132 GemmMicrokernelTester()
13133 .mr(6)
13134 .nr(8)
13135 .kr(1)
13136 .sr(1)
13137 .m(6)
13138 .n(8)
13139 .k(2)
13140 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080013141 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013142 }
13143
Frank Barchard91317c52019-11-22 10:54:35 -080013144 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013145 TEST_REQUIRES_ARM_NEON;
13146 for (uint32_t m = 1; m <= 6; m++) {
13147 for (uint32_t n = 1; n <= 8; n++) {
13148 GemmMicrokernelTester()
13149 .mr(6)
13150 .nr(8)
13151 .kr(1)
13152 .sr(1)
13153 .m(m)
13154 .n(n)
13155 .k(2)
13156 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013157 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013158 }
13159 }
13160 }
13161
Frank Barchard91317c52019-11-22 10:54:35 -080013162 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013163 TEST_REQUIRES_ARM_NEON;
13164 for (uint32_t m = 1; m <= 6; m++) {
13165 GemmMicrokernelTester()
13166 .mr(6)
13167 .nr(8)
13168 .kr(1)
13169 .sr(1)
13170 .m(m)
13171 .n(8)
13172 .k(2)
13173 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013174 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013175 }
13176 }
13177
Frank Barchard91317c52019-11-22 10:54:35 -080013178 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013179 TEST_REQUIRES_ARM_NEON;
13180 for (uint32_t n = 1; n <= 8; n++) {
13181 GemmMicrokernelTester()
13182 .mr(6)
13183 .nr(8)
13184 .kr(1)
13185 .sr(1)
13186 .m(6)
13187 .n(n)
13188 .k(2)
13189 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013190 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013191 }
13192 }
13193
Frank Barchard91317c52019-11-22 10:54:35 -080013194 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013195 TEST_REQUIRES_ARM_NEON;
13196 for (size_t k = 1; k < 2; k++) {
13197 GemmMicrokernelTester()
13198 .mr(6)
13199 .nr(8)
13200 .kr(1)
13201 .sr(1)
13202 .m(6)
13203 .n(8)
13204 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013205 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013206 }
13207 }
13208
Frank Barchard91317c52019-11-22 10:54:35 -080013209 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013210 TEST_REQUIRES_ARM_NEON;
13211 for (size_t k = 1; k < 2; k++) {
13212 GemmMicrokernelTester()
13213 .mr(6)
13214 .nr(8)
13215 .kr(1)
13216 .sr(1)
13217 .m(6)
13218 .n(8)
13219 .k(k)
13220 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080013221 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013222 }
13223 }
13224
Frank Barchard91317c52019-11-22 10:54:35 -080013225 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013226 TEST_REQUIRES_ARM_NEON;
13227 for (size_t k = 1; k < 2; k++) {
13228 for (uint32_t m = 1; m <= 6; m++) {
13229 for (uint32_t n = 1; n <= 8; n++) {
13230 GemmMicrokernelTester()
13231 .mr(6)
13232 .nr(8)
13233 .kr(1)
13234 .sr(1)
13235 .m(m)
13236 .n(n)
13237 .k(k)
13238 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013239 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013240 }
13241 }
13242 }
13243 }
13244
Frank Barchard91317c52019-11-22 10:54:35 -080013245 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013246 TEST_REQUIRES_ARM_NEON;
13247 for (size_t k = 3; k < 4; k++) {
13248 GemmMicrokernelTester()
13249 .mr(6)
13250 .nr(8)
13251 .kr(1)
13252 .sr(1)
13253 .m(6)
13254 .n(8)
13255 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013256 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013257 }
13258 }
13259
Frank Barchard91317c52019-11-22 10:54:35 -080013260 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013261 TEST_REQUIRES_ARM_NEON;
13262 for (size_t k = 3; k < 4; k++) {
13263 GemmMicrokernelTester()
13264 .mr(6)
13265 .nr(8)
13266 .kr(1)
13267 .sr(1)
13268 .m(6)
13269 .n(8)
13270 .k(k)
13271 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080013272 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013273 }
13274 }
13275
Frank Barchard91317c52019-11-22 10:54:35 -080013276 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013277 TEST_REQUIRES_ARM_NEON;
13278 for (size_t k = 3; k < 4; k++) {
13279 for (uint32_t m = 1; m <= 6; m++) {
13280 for (uint32_t n = 1; n <= 8; n++) {
13281 GemmMicrokernelTester()
13282 .mr(6)
13283 .nr(8)
13284 .kr(1)
13285 .sr(1)
13286 .m(m)
13287 .n(n)
13288 .k(k)
13289 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013290 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013291 }
13292 }
13293 }
13294 }
13295
Frank Barchard91317c52019-11-22 10:54:35 -080013296 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013297 TEST_REQUIRES_ARM_NEON;
13298 for (size_t k = 4; k <= 20; k += 2) {
13299 GemmMicrokernelTester()
13300 .mr(6)
13301 .nr(8)
13302 .kr(1)
13303 .sr(1)
13304 .m(6)
13305 .n(8)
13306 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013307 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013308 }
13309 }
13310
Frank Barchard91317c52019-11-22 10:54:35 -080013311 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013312 TEST_REQUIRES_ARM_NEON;
13313 for (size_t k = 4; k <= 20; k += 2) {
13314 GemmMicrokernelTester()
13315 .mr(6)
13316 .nr(8)
13317 .kr(1)
13318 .sr(1)
13319 .m(6)
13320 .n(8)
13321 .k(k)
13322 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080013323 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013324 }
13325 }
13326
Frank Barchard91317c52019-11-22 10:54:35 -080013327 TEST(F32_GEMM_6X8__NEON_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013328 TEST_REQUIRES_ARM_NEON;
13329 for (size_t k = 4; k <= 20; k += 2) {
13330 for (uint32_t m = 1; m <= 6; m++) {
13331 for (uint32_t n = 1; n <= 8; n++) {
13332 GemmMicrokernelTester()
13333 .mr(6)
13334 .nr(8)
13335 .kr(1)
13336 .sr(1)
13337 .m(m)
13338 .n(n)
13339 .k(k)
13340 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013341 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013342 }
13343 }
13344 }
13345 }
13346
Frank Barchard91317c52019-11-22 10:54:35 -080013347 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013348 TEST_REQUIRES_ARM_NEON;
13349 for (uint32_t n = 9; n < 16; n++) {
13350 for (size_t k = 1; k <= 10; k += 3) {
13351 GemmMicrokernelTester()
13352 .mr(6)
13353 .nr(8)
13354 .kr(1)
13355 .sr(1)
13356 .m(6)
13357 .n(8)
13358 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013359 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013360 }
13361 }
13362 }
13363
Frank Barchard91317c52019-11-22 10:54:35 -080013364 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013365 TEST_REQUIRES_ARM_NEON;
13366 for (uint32_t n = 9; n < 16; n++) {
13367 for (size_t k = 1; k <= 10; k += 3) {
13368 GemmMicrokernelTester()
13369 .mr(6)
13370 .nr(8)
13371 .kr(1)
13372 .sr(1)
13373 .m(6)
13374 .n(8)
13375 .k(k)
13376 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013377 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013378 }
13379 }
13380 }
13381
Frank Barchard91317c52019-11-22 10:54:35 -080013382 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013383 TEST_REQUIRES_ARM_NEON;
13384 for (uint32_t n = 9; n < 16; n++) {
13385 for (size_t k = 1; k <= 10; k += 3) {
13386 GemmMicrokernelTester()
13387 .mr(6)
13388 .nr(8)
13389 .kr(1)
13390 .sr(1)
13391 .m(6)
13392 .n(n)
13393 .k(k)
13394 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080013395 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013396 }
13397 }
13398 }
13399
Frank Barchard91317c52019-11-22 10:54:35 -080013400 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013401 TEST_REQUIRES_ARM_NEON;
13402 for (uint32_t n = 9; n < 16; n++) {
13403 for (size_t k = 1; k <= 10; k += 3) {
13404 for (uint32_t m = 1; m <= 6; m++) {
13405 GemmMicrokernelTester()
13406 .mr(6)
13407 .nr(8)
13408 .kr(1)
13409 .sr(1)
13410 .m(m)
13411 .n(n)
13412 .k(k)
13413 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013414 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013415 }
13416 }
13417 }
13418 }
13419
Frank Barchard91317c52019-11-22 10:54:35 -080013420 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013421 TEST_REQUIRES_ARM_NEON;
13422 for (uint32_t n = 16; n <= 24; n += 8) {
13423 for (size_t k = 1; k <= 10; k += 3) {
13424 GemmMicrokernelTester()
13425 .mr(6)
13426 .nr(8)
13427 .kr(1)
13428 .sr(1)
13429 .m(6)
13430 .n(8)
13431 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080013432 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013433 }
13434 }
13435 }
13436
Frank Barchard91317c52019-11-22 10:54:35 -080013437 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013438 TEST_REQUIRES_ARM_NEON;
13439 for (uint32_t n = 16; n <= 24; n += 8) {
13440 for (size_t k = 1; k <= 10; k += 3) {
13441 GemmMicrokernelTester()
13442 .mr(6)
13443 .nr(8)
13444 .kr(1)
13445 .sr(1)
13446 .m(6)
13447 .n(n)
13448 .k(k)
13449 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013450 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013451 }
13452 }
13453 }
13454
Frank Barchard91317c52019-11-22 10:54:35 -080013455 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013456 TEST_REQUIRES_ARM_NEON;
13457 for (uint32_t n = 16; n <= 24; n += 8) {
13458 for (size_t k = 1; k <= 10; k += 3) {
13459 GemmMicrokernelTester()
13460 .mr(6)
13461 .nr(8)
13462 .kr(1)
13463 .sr(1)
13464 .m(6)
13465 .n(n)
13466 .k(k)
13467 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080013468 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013469 }
13470 }
13471 }
13472
Frank Barchard91317c52019-11-22 10:54:35 -080013473 TEST(F32_GEMM_6X8__NEON_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013474 TEST_REQUIRES_ARM_NEON;
13475 for (uint32_t n = 16; n <= 24; n += 8) {
13476 for (size_t k = 1; k <= 10; k += 3) {
13477 for (uint32_t m = 1; m <= 6; m++) {
13478 GemmMicrokernelTester()
13479 .mr(6)
13480 .nr(8)
13481 .kr(1)
13482 .sr(1)
13483 .m(m)
13484 .n(n)
13485 .k(k)
13486 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013487 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013488 }
13489 }
13490 }
13491 }
13492
Frank Barchard91317c52019-11-22 10:54:35 -080013493 TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013494 TEST_REQUIRES_ARM_NEON;
13495 for (size_t k = 1; k <= 10; k += 3) {
13496 for (uint32_t m = 1; m <= 6; m++) {
13497 for (uint32_t n = 1; n <= 8; n++) {
13498 GemmMicrokernelTester()
13499 .mr(6)
13500 .nr(8)
13501 .kr(1)
13502 .sr(1)
13503 .m(m)
13504 .n(n)
13505 .k(k)
13506 .cm_stride(11)
13507 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080013508 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013509 }
13510 }
13511 }
13512 }
13513
Frank Barchard91317c52019-11-22 10:54:35 -080013514 TEST(F32_GEMM_6X8__NEON_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013515 TEST_REQUIRES_ARM_NEON;
13516 GemmMicrokernelTester()
13517 .mr(6)
13518 .nr(8)
13519 .kr(1)
13520 .sr(1)
13521 .m(6)
13522 .n(8)
13523 .k(2)
13524 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013525 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013526 }
13527
Frank Barchard91317c52019-11-22 10:54:35 -080013528 TEST(F32_GEMM_6X8__NEON_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013529 TEST_REQUIRES_ARM_NEON;
13530 GemmMicrokernelTester()
13531 .mr(6)
13532 .nr(8)
13533 .kr(1)
13534 .sr(1)
13535 .m(6)
13536 .n(8)
13537 .k(2)
13538 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080013539 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013540 }
13541
Frank Barchard91317c52019-11-22 10:54:35 -080013542 TEST(F32_GEMM_6X8__NEON_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070013543 TEST_REQUIRES_ARM_NEON;
13544 GemmMicrokernelTester()
13545 .mr(6)
13546 .nr(8)
13547 .kr(1)
13548 .sr(1)
13549 .m(6)
13550 .n(8)
13551 .k(2)
13552 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080013553 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070013554 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070013555#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070013556
13557
Frank Barchard69172d92019-11-26 16:22:39 -080013558#if XNN_ARCH_ARM || XNN_ARCH_ARM64
13559 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4) {
13560 TEST_REQUIRES_ARM_NEON;
13561 GemmMicrokernelTester()
13562 .mr(6)
13563 .nr(8)
13564 .kr(1)
13565 .sr(1)
13566 .m(6)
13567 .n(8)
13568 .k(4)
13569 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13570 }
13571
13572 TEST(F32_GEMM_6X8__NEON_LANE_LD128, strided_cn) {
13573 TEST_REQUIRES_ARM_NEON;
13574 GemmMicrokernelTester()
13575 .mr(6)
13576 .nr(8)
13577 .kr(1)
13578 .sr(1)
13579 .m(6)
13580 .n(8)
13581 .k(4)
13582 .cn_stride(11)
13583 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13584 }
13585
13586 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_strided_a) {
13587 TEST_REQUIRES_ARM_NEON;
13588 GemmMicrokernelTester()
13589 .mr(6)
13590 .nr(8)
13591 .kr(1)
13592 .sr(1)
13593 .m(6)
13594 .n(8)
13595 .k(4)
13596 .a_stride(7)
13597 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13598 }
13599
13600 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile) {
13601 TEST_REQUIRES_ARM_NEON;
13602 for (uint32_t m = 1; m <= 6; m++) {
13603 for (uint32_t n = 1; n <= 8; n++) {
13604 GemmMicrokernelTester()
13605 .mr(6)
13606 .nr(8)
13607 .kr(1)
13608 .sr(1)
13609 .m(m)
13610 .n(n)
13611 .k(4)
13612 .iterations(1)
13613 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13614 }
13615 }
13616 }
13617
13618 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_m) {
13619 TEST_REQUIRES_ARM_NEON;
13620 for (uint32_t m = 1; m <= 6; m++) {
13621 GemmMicrokernelTester()
13622 .mr(6)
13623 .nr(8)
13624 .kr(1)
13625 .sr(1)
13626 .m(m)
13627 .n(8)
13628 .k(4)
13629 .iterations(1)
13630 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13631 }
13632 }
13633
13634 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_eq_4_subtile_n) {
13635 TEST_REQUIRES_ARM_NEON;
13636 for (uint32_t n = 1; n <= 8; n++) {
13637 GemmMicrokernelTester()
13638 .mr(6)
13639 .nr(8)
13640 .kr(1)
13641 .sr(1)
13642 .m(6)
13643 .n(n)
13644 .k(4)
13645 .iterations(1)
13646 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13647 }
13648 }
13649
13650 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_lt_4) {
13651 TEST_REQUIRES_ARM_NEON;
13652 for (size_t k = 1; k < 4; k++) {
13653 GemmMicrokernelTester()
13654 .mr(6)
13655 .nr(8)
13656 .kr(1)
13657 .sr(1)
13658 .m(6)
13659 .n(8)
13660 .k(k)
13661 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13662 }
13663 }
13664
13665 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_lt_4_strided_a) {
13666 TEST_REQUIRES_ARM_NEON;
13667 for (size_t k = 1; k < 4; k++) {
13668 GemmMicrokernelTester()
13669 .mr(6)
13670 .nr(8)
13671 .kr(1)
13672 .sr(1)
13673 .m(6)
13674 .n(8)
13675 .k(k)
13676 .a_stride(7)
13677 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13678 }
13679 }
13680
13681 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_lt_4_subtile) {
13682 TEST_REQUIRES_ARM_NEON;
13683 for (size_t k = 1; k < 4; k++) {
13684 for (uint32_t m = 1; m <= 6; m++) {
13685 for (uint32_t n = 1; n <= 8; n++) {
13686 GemmMicrokernelTester()
13687 .mr(6)
13688 .nr(8)
13689 .kr(1)
13690 .sr(1)
13691 .m(m)
13692 .n(n)
13693 .k(k)
13694 .iterations(1)
13695 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13696 }
13697 }
13698 }
13699 }
13700
13701 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_gt_4) {
13702 TEST_REQUIRES_ARM_NEON;
13703 for (size_t k = 5; k < 8; k++) {
13704 GemmMicrokernelTester()
13705 .mr(6)
13706 .nr(8)
13707 .kr(1)
13708 .sr(1)
13709 .m(6)
13710 .n(8)
13711 .k(k)
13712 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13713 }
13714 }
13715
13716 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_gt_4_strided_a) {
13717 TEST_REQUIRES_ARM_NEON;
13718 for (size_t k = 5; k < 8; k++) {
13719 GemmMicrokernelTester()
13720 .mr(6)
13721 .nr(8)
13722 .kr(1)
13723 .sr(1)
13724 .m(6)
13725 .n(8)
13726 .k(k)
13727 .a_stride(11)
13728 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13729 }
13730 }
13731
13732 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_gt_4_subtile) {
13733 TEST_REQUIRES_ARM_NEON;
13734 for (size_t k = 5; k < 8; k++) {
13735 for (uint32_t m = 1; m <= 6; m++) {
13736 for (uint32_t n = 1; n <= 8; n++) {
13737 GemmMicrokernelTester()
13738 .mr(6)
13739 .nr(8)
13740 .kr(1)
13741 .sr(1)
13742 .m(m)
13743 .n(n)
13744 .k(k)
13745 .iterations(1)
13746 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13747 }
13748 }
13749 }
13750 }
13751
13752 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_div_4) {
13753 TEST_REQUIRES_ARM_NEON;
13754 for (size_t k = 8; k <= 40; k += 4) {
13755 GemmMicrokernelTester()
13756 .mr(6)
13757 .nr(8)
13758 .kr(1)
13759 .sr(1)
13760 .m(6)
13761 .n(8)
13762 .k(k)
13763 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13764 }
13765 }
13766
13767 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_div_4_strided_a) {
13768 TEST_REQUIRES_ARM_NEON;
13769 for (size_t k = 8; k <= 40; k += 4) {
13770 GemmMicrokernelTester()
13771 .mr(6)
13772 .nr(8)
13773 .kr(1)
13774 .sr(1)
13775 .m(6)
13776 .n(8)
13777 .k(k)
13778 .a_stride(43)
13779 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13780 }
13781 }
13782
13783 TEST(F32_GEMM_6X8__NEON_LANE_LD128, k_div_4_subtile) {
13784 TEST_REQUIRES_ARM_NEON;
13785 for (size_t k = 8; k <= 40; k += 4) {
13786 for (uint32_t m = 1; m <= 6; m++) {
13787 for (uint32_t n = 1; n <= 8; n++) {
13788 GemmMicrokernelTester()
13789 .mr(6)
13790 .nr(8)
13791 .kr(1)
13792 .sr(1)
13793 .m(m)
13794 .n(n)
13795 .k(k)
13796 .iterations(1)
13797 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13798 }
13799 }
13800 }
13801 }
13802
13803 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8) {
13804 TEST_REQUIRES_ARM_NEON;
13805 for (uint32_t n = 9; n < 16; n++) {
13806 for (size_t k = 1; k <= 20; k += 5) {
13807 GemmMicrokernelTester()
13808 .mr(6)
13809 .nr(8)
13810 .kr(1)
13811 .sr(1)
13812 .m(6)
13813 .n(8)
13814 .k(k)
13815 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13816 }
13817 }
13818 }
13819
13820 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8_strided_cn) {
13821 TEST_REQUIRES_ARM_NEON;
13822 for (uint32_t n = 9; n < 16; n++) {
13823 for (size_t k = 1; k <= 20; k += 5) {
13824 GemmMicrokernelTester()
13825 .mr(6)
13826 .nr(8)
13827 .kr(1)
13828 .sr(1)
13829 .m(6)
13830 .n(8)
13831 .k(k)
13832 .cn_stride(11)
13833 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13834 }
13835 }
13836 }
13837
13838 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8_strided_a) {
13839 TEST_REQUIRES_ARM_NEON;
13840 for (uint32_t n = 9; n < 16; n++) {
13841 for (size_t k = 1; k <= 20; k += 5) {
13842 GemmMicrokernelTester()
13843 .mr(6)
13844 .nr(8)
13845 .kr(1)
13846 .sr(1)
13847 .m(6)
13848 .n(n)
13849 .k(k)
13850 .a_stride(23)
13851 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13852 }
13853 }
13854 }
13855
13856 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_gt_8_subtile) {
13857 TEST_REQUIRES_ARM_NEON;
13858 for (uint32_t n = 9; n < 16; n++) {
13859 for (size_t k = 1; k <= 20; k += 5) {
13860 for (uint32_t m = 1; m <= 6; m++) {
13861 GemmMicrokernelTester()
13862 .mr(6)
13863 .nr(8)
13864 .kr(1)
13865 .sr(1)
13866 .m(m)
13867 .n(n)
13868 .k(k)
13869 .iterations(1)
13870 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13871 }
13872 }
13873 }
13874 }
13875
13876 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8) {
13877 TEST_REQUIRES_ARM_NEON;
13878 for (uint32_t n = 16; n <= 24; n += 8) {
13879 for (size_t k = 1; k <= 20; k += 5) {
13880 GemmMicrokernelTester()
13881 .mr(6)
13882 .nr(8)
13883 .kr(1)
13884 .sr(1)
13885 .m(6)
13886 .n(8)
13887 .k(k)
13888 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13889 }
13890 }
13891 }
13892
13893 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8_strided_cn) {
13894 TEST_REQUIRES_ARM_NEON;
13895 for (uint32_t n = 16; n <= 24; n += 8) {
13896 for (size_t k = 1; k <= 20; k += 5) {
13897 GemmMicrokernelTester()
13898 .mr(6)
13899 .nr(8)
13900 .kr(1)
13901 .sr(1)
13902 .m(6)
13903 .n(n)
13904 .k(k)
13905 .cn_stride(11)
13906 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13907 }
13908 }
13909 }
13910
13911 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8_strided_a) {
13912 TEST_REQUIRES_ARM_NEON;
13913 for (uint32_t n = 16; n <= 24; n += 8) {
13914 for (size_t k = 1; k <= 20; k += 5) {
13915 GemmMicrokernelTester()
13916 .mr(6)
13917 .nr(8)
13918 .kr(1)
13919 .sr(1)
13920 .m(6)
13921 .n(n)
13922 .k(k)
13923 .a_stride(23)
13924 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13925 }
13926 }
13927 }
13928
13929 TEST(F32_GEMM_6X8__NEON_LANE_LD128, n_div_8_subtile) {
13930 TEST_REQUIRES_ARM_NEON;
13931 for (uint32_t n = 16; n <= 24; n += 8) {
13932 for (size_t k = 1; k <= 20; k += 5) {
13933 for (uint32_t m = 1; m <= 6; m++) {
13934 GemmMicrokernelTester()
13935 .mr(6)
13936 .nr(8)
13937 .kr(1)
13938 .sr(1)
13939 .m(m)
13940 .n(n)
13941 .k(k)
13942 .iterations(1)
13943 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13944 }
13945 }
13946 }
13947 }
13948
13949 TEST(F32_GEMM_6X8__NEON_LANE_LD128, strided_cm_subtile) {
13950 TEST_REQUIRES_ARM_NEON;
13951 for (size_t k = 1; k <= 20; k += 5) {
13952 for (uint32_t m = 1; m <= 6; m++) {
13953 for (uint32_t n = 1; n <= 8; n++) {
13954 GemmMicrokernelTester()
13955 .mr(6)
13956 .nr(8)
13957 .kr(1)
13958 .sr(1)
13959 .m(m)
13960 .n(n)
13961 .k(k)
13962 .cm_stride(11)
13963 .iterations(1)
13964 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13965 }
13966 }
13967 }
13968 }
13969
13970 TEST(F32_GEMM_6X8__NEON_LANE_LD128, qmin) {
13971 TEST_REQUIRES_ARM_NEON;
13972 GemmMicrokernelTester()
13973 .mr(6)
13974 .nr(8)
13975 .kr(1)
13976 .sr(1)
13977 .m(6)
13978 .n(8)
13979 .k(4)
13980 .qmin(128)
13981 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13982 }
13983
13984 TEST(F32_GEMM_6X8__NEON_LANE_LD128, qmax) {
13985 TEST_REQUIRES_ARM_NEON;
13986 GemmMicrokernelTester()
13987 .mr(6)
13988 .nr(8)
13989 .kr(1)
13990 .sr(1)
13991 .m(6)
13992 .n(8)
13993 .k(4)
13994 .qmax(128)
13995 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
13996 }
13997
13998 TEST(F32_GEMM_6X8__NEON_LANE_LD128, strided_cm) {
13999 TEST_REQUIRES_ARM_NEON;
14000 GemmMicrokernelTester()
14001 .mr(6)
14002 .nr(8)
14003 .kr(1)
14004 .sr(1)
14005 .m(6)
14006 .n(8)
14007 .k(4)
14008 .cm_stride(11)
14009 .Test(xnn_f32_gemm_ukernel_6x8__neon_lane_ld128);
14010 }
14011#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
14012
14013
Frank Barchard91317c52019-11-22 10:54:35 -080014014#if XNN_ARCH_ARM64
14015 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014016 TEST_REQUIRES_ARM_NEON_FMA;
14017 GemmMicrokernelTester()
14018 .mr(1)
14019 .nr(8)
14020 .kr(1)
14021 .sr(1)
14022 .m(1)
14023 .n(8)
14024 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080014025 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014026 }
14027
Frank Barchard91317c52019-11-22 10:54:35 -080014028 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014029 TEST_REQUIRES_ARM_NEON_FMA;
14030 GemmMicrokernelTester()
14031 .mr(1)
14032 .nr(8)
14033 .kr(1)
14034 .sr(1)
14035 .m(1)
14036 .n(8)
14037 .k(2)
14038 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014039 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014040 }
14041
Frank Barchard91317c52019-11-22 10:54:35 -080014042 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014043 TEST_REQUIRES_ARM_NEON_FMA;
14044 GemmMicrokernelTester()
14045 .mr(1)
14046 .nr(8)
14047 .kr(1)
14048 .sr(1)
14049 .m(1)
14050 .n(8)
14051 .k(2)
14052 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080014053 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014054 }
14055
Frank Barchard91317c52019-11-22 10:54:35 -080014056 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014057 TEST_REQUIRES_ARM_NEON_FMA;
14058 for (uint32_t m = 1; m <= 1; m++) {
14059 for (uint32_t n = 1; n <= 8; n++) {
14060 GemmMicrokernelTester()
14061 .mr(1)
14062 .nr(8)
14063 .kr(1)
14064 .sr(1)
14065 .m(m)
14066 .n(n)
14067 .k(2)
14068 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014069 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014070 }
14071 }
14072 }
14073
Frank Barchard91317c52019-11-22 10:54:35 -080014074 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014075 TEST_REQUIRES_ARM_NEON_FMA;
14076 for (uint32_t m = 1; m <= 1; m++) {
14077 GemmMicrokernelTester()
14078 .mr(1)
14079 .nr(8)
14080 .kr(1)
14081 .sr(1)
14082 .m(m)
14083 .n(8)
14084 .k(2)
14085 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014086 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014087 }
14088 }
14089
Frank Barchard91317c52019-11-22 10:54:35 -080014090 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014091 TEST_REQUIRES_ARM_NEON_FMA;
14092 for (uint32_t n = 1; n <= 8; n++) {
14093 GemmMicrokernelTester()
14094 .mr(1)
14095 .nr(8)
14096 .kr(1)
14097 .sr(1)
14098 .m(1)
14099 .n(n)
14100 .k(2)
14101 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014102 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014103 }
14104 }
14105
Frank Barchard91317c52019-11-22 10:54:35 -080014106 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014107 TEST_REQUIRES_ARM_NEON_FMA;
14108 for (size_t k = 1; k < 2; k++) {
14109 GemmMicrokernelTester()
14110 .mr(1)
14111 .nr(8)
14112 .kr(1)
14113 .sr(1)
14114 .m(1)
14115 .n(8)
14116 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014117 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014118 }
14119 }
14120
Frank Barchard91317c52019-11-22 10:54:35 -080014121 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014122 TEST_REQUIRES_ARM_NEON_FMA;
14123 for (size_t k = 1; k < 2; k++) {
14124 GemmMicrokernelTester()
14125 .mr(1)
14126 .nr(8)
14127 .kr(1)
14128 .sr(1)
14129 .m(1)
14130 .n(8)
14131 .k(k)
14132 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080014133 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014134 }
14135 }
14136
Frank Barchard91317c52019-11-22 10:54:35 -080014137 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014138 TEST_REQUIRES_ARM_NEON_FMA;
14139 for (size_t k = 1; k < 2; k++) {
14140 for (uint32_t m = 1; m <= 1; m++) {
14141 for (uint32_t n = 1; n <= 8; n++) {
14142 GemmMicrokernelTester()
14143 .mr(1)
14144 .nr(8)
14145 .kr(1)
14146 .sr(1)
14147 .m(m)
14148 .n(n)
14149 .k(k)
14150 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014151 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014152 }
14153 }
14154 }
14155 }
14156
Frank Barchard91317c52019-11-22 10:54:35 -080014157 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014158 TEST_REQUIRES_ARM_NEON_FMA;
14159 for (size_t k = 3; k < 4; k++) {
14160 GemmMicrokernelTester()
14161 .mr(1)
14162 .nr(8)
14163 .kr(1)
14164 .sr(1)
14165 .m(1)
14166 .n(8)
14167 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014168 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014169 }
14170 }
14171
Frank Barchard91317c52019-11-22 10:54:35 -080014172 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014173 TEST_REQUIRES_ARM_NEON_FMA;
14174 for (size_t k = 3; k < 4; k++) {
14175 GemmMicrokernelTester()
14176 .mr(1)
14177 .nr(8)
14178 .kr(1)
14179 .sr(1)
14180 .m(1)
14181 .n(8)
14182 .k(k)
14183 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080014184 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014185 }
14186 }
14187
Frank Barchard91317c52019-11-22 10:54:35 -080014188 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014189 TEST_REQUIRES_ARM_NEON_FMA;
14190 for (size_t k = 3; k < 4; k++) {
14191 for (uint32_t m = 1; m <= 1; m++) {
14192 for (uint32_t n = 1; n <= 8; n++) {
14193 GemmMicrokernelTester()
14194 .mr(1)
14195 .nr(8)
14196 .kr(1)
14197 .sr(1)
14198 .m(m)
14199 .n(n)
14200 .k(k)
14201 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014202 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014203 }
14204 }
14205 }
14206 }
14207
Frank Barchard91317c52019-11-22 10:54:35 -080014208 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014209 TEST_REQUIRES_ARM_NEON_FMA;
14210 for (size_t k = 4; k <= 20; k += 2) {
14211 GemmMicrokernelTester()
14212 .mr(1)
14213 .nr(8)
14214 .kr(1)
14215 .sr(1)
14216 .m(1)
14217 .n(8)
14218 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014219 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014220 }
14221 }
14222
Frank Barchard91317c52019-11-22 10:54:35 -080014223 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014224 TEST_REQUIRES_ARM_NEON_FMA;
14225 for (size_t k = 4; k <= 20; k += 2) {
14226 GemmMicrokernelTester()
14227 .mr(1)
14228 .nr(8)
14229 .kr(1)
14230 .sr(1)
14231 .m(1)
14232 .n(8)
14233 .k(k)
14234 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080014235 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014236 }
14237 }
14238
Frank Barchard91317c52019-11-22 10:54:35 -080014239 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014240 TEST_REQUIRES_ARM_NEON_FMA;
14241 for (size_t k = 4; k <= 20; k += 2) {
14242 for (uint32_t m = 1; m <= 1; m++) {
14243 for (uint32_t n = 1; n <= 8; n++) {
14244 GemmMicrokernelTester()
14245 .mr(1)
14246 .nr(8)
14247 .kr(1)
14248 .sr(1)
14249 .m(m)
14250 .n(n)
14251 .k(k)
14252 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014253 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014254 }
14255 }
14256 }
14257 }
14258
Frank Barchard91317c52019-11-22 10:54:35 -080014259 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014260 TEST_REQUIRES_ARM_NEON_FMA;
14261 for (uint32_t n = 9; n < 16; n++) {
14262 for (size_t k = 1; k <= 10; k += 3) {
14263 GemmMicrokernelTester()
14264 .mr(1)
14265 .nr(8)
14266 .kr(1)
14267 .sr(1)
14268 .m(1)
14269 .n(8)
14270 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014271 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014272 }
14273 }
14274 }
14275
Frank Barchard91317c52019-11-22 10:54:35 -080014276 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014277 TEST_REQUIRES_ARM_NEON_FMA;
14278 for (uint32_t n = 9; n < 16; n++) {
14279 for (size_t k = 1; k <= 10; k += 3) {
14280 GemmMicrokernelTester()
14281 .mr(1)
14282 .nr(8)
14283 .kr(1)
14284 .sr(1)
14285 .m(1)
14286 .n(8)
14287 .k(k)
14288 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014289 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014290 }
14291 }
14292 }
14293
Frank Barchard91317c52019-11-22 10:54:35 -080014294 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014295 TEST_REQUIRES_ARM_NEON_FMA;
14296 for (uint32_t n = 9; n < 16; n++) {
14297 for (size_t k = 1; k <= 10; k += 3) {
14298 GemmMicrokernelTester()
14299 .mr(1)
14300 .nr(8)
14301 .kr(1)
14302 .sr(1)
14303 .m(1)
14304 .n(n)
14305 .k(k)
14306 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080014307 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014308 }
14309 }
14310 }
14311
Frank Barchard91317c52019-11-22 10:54:35 -080014312 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014313 TEST_REQUIRES_ARM_NEON_FMA;
14314 for (uint32_t n = 9; n < 16; n++) {
14315 for (size_t k = 1; k <= 10; k += 3) {
14316 for (uint32_t m = 1; m <= 1; m++) {
14317 GemmMicrokernelTester()
14318 .mr(1)
14319 .nr(8)
14320 .kr(1)
14321 .sr(1)
14322 .m(m)
14323 .n(n)
14324 .k(k)
14325 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014326 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014327 }
14328 }
14329 }
14330 }
14331
Frank Barchard91317c52019-11-22 10:54:35 -080014332 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014333 TEST_REQUIRES_ARM_NEON_FMA;
14334 for (uint32_t n = 16; n <= 24; n += 8) {
14335 for (size_t k = 1; k <= 10; k += 3) {
14336 GemmMicrokernelTester()
14337 .mr(1)
14338 .nr(8)
14339 .kr(1)
14340 .sr(1)
14341 .m(1)
14342 .n(8)
14343 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014344 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014345 }
14346 }
14347 }
14348
Frank Barchard91317c52019-11-22 10:54:35 -080014349 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014350 TEST_REQUIRES_ARM_NEON_FMA;
14351 for (uint32_t n = 16; n <= 24; n += 8) {
14352 for (size_t k = 1; k <= 10; k += 3) {
14353 GemmMicrokernelTester()
14354 .mr(1)
14355 .nr(8)
14356 .kr(1)
14357 .sr(1)
14358 .m(1)
14359 .n(n)
14360 .k(k)
14361 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014362 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014363 }
14364 }
14365 }
14366
Frank Barchard91317c52019-11-22 10:54:35 -080014367 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014368 TEST_REQUIRES_ARM_NEON_FMA;
14369 for (uint32_t n = 16; n <= 24; n += 8) {
14370 for (size_t k = 1; k <= 10; k += 3) {
14371 GemmMicrokernelTester()
14372 .mr(1)
14373 .nr(8)
14374 .kr(1)
14375 .sr(1)
14376 .m(1)
14377 .n(n)
14378 .k(k)
14379 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080014380 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014381 }
14382 }
14383 }
14384
Frank Barchard91317c52019-11-22 10:54:35 -080014385 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014386 TEST_REQUIRES_ARM_NEON_FMA;
14387 for (uint32_t n = 16; n <= 24; n += 8) {
14388 for (size_t k = 1; k <= 10; k += 3) {
14389 for (uint32_t m = 1; m <= 1; m++) {
14390 GemmMicrokernelTester()
14391 .mr(1)
14392 .nr(8)
14393 .kr(1)
14394 .sr(1)
14395 .m(m)
14396 .n(n)
14397 .k(k)
14398 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014399 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014400 }
14401 }
14402 }
14403 }
14404
Frank Barchard91317c52019-11-22 10:54:35 -080014405 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014406 TEST_REQUIRES_ARM_NEON_FMA;
14407 for (size_t k = 1; k <= 10; k += 3) {
14408 for (uint32_t m = 1; m <= 1; m++) {
14409 for (uint32_t n = 1; n <= 8; n++) {
14410 GemmMicrokernelTester()
14411 .mr(1)
14412 .nr(8)
14413 .kr(1)
14414 .sr(1)
14415 .m(m)
14416 .n(n)
14417 .k(k)
14418 .cm_stride(11)
14419 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014420 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014421 }
14422 }
14423 }
14424 }
14425
Frank Barchard91317c52019-11-22 10:54:35 -080014426 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014427 TEST_REQUIRES_ARM_NEON_FMA;
14428 GemmMicrokernelTester()
14429 .mr(1)
14430 .nr(8)
14431 .kr(1)
14432 .sr(1)
14433 .m(1)
14434 .n(8)
14435 .k(2)
14436 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014437 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014438 }
14439
Frank Barchard91317c52019-11-22 10:54:35 -080014440 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014441 TEST_REQUIRES_ARM_NEON_FMA;
14442 GemmMicrokernelTester()
14443 .mr(1)
14444 .nr(8)
14445 .kr(1)
14446 .sr(1)
14447 .m(1)
14448 .n(8)
14449 .k(2)
14450 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014451 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014452 }
14453
Frank Barchard91317c52019-11-22 10:54:35 -080014454 TEST(F32_GEMM_1X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014455 TEST_REQUIRES_ARM_NEON_FMA;
14456 GemmMicrokernelTester()
14457 .mr(1)
14458 .nr(8)
14459 .kr(1)
14460 .sr(1)
14461 .m(1)
14462 .n(8)
14463 .k(2)
14464 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014465 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014466 }
Frank Barchard91317c52019-11-22 10:54:35 -080014467#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070014468
14469
Frank Barchard91317c52019-11-22 10:54:35 -080014470#if XNN_ARCH_ARM64
14471 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014472 TEST_REQUIRES_ARM_NEON_FMA;
14473 GemmMicrokernelTester()
14474 .mr(4)
14475 .nr(8)
14476 .kr(1)
14477 .sr(1)
14478 .m(4)
14479 .n(8)
14480 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080014481 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014482 }
14483
Frank Barchard91317c52019-11-22 10:54:35 -080014484 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014485 TEST_REQUIRES_ARM_NEON_FMA;
14486 GemmMicrokernelTester()
14487 .mr(4)
14488 .nr(8)
14489 .kr(1)
14490 .sr(1)
14491 .m(4)
14492 .n(8)
14493 .k(2)
14494 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014495 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014496 }
14497
Frank Barchard91317c52019-11-22 10:54:35 -080014498 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014499 TEST_REQUIRES_ARM_NEON_FMA;
14500 GemmMicrokernelTester()
14501 .mr(4)
14502 .nr(8)
14503 .kr(1)
14504 .sr(1)
14505 .m(4)
14506 .n(8)
14507 .k(2)
14508 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080014509 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014510 }
14511
Frank Barchard91317c52019-11-22 10:54:35 -080014512 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014513 TEST_REQUIRES_ARM_NEON_FMA;
14514 for (uint32_t m = 1; m <= 4; m++) {
14515 for (uint32_t n = 1; n <= 8; n++) {
14516 GemmMicrokernelTester()
14517 .mr(4)
14518 .nr(8)
14519 .kr(1)
14520 .sr(1)
14521 .m(m)
14522 .n(n)
14523 .k(2)
14524 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014525 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014526 }
14527 }
14528 }
14529
Frank Barchard91317c52019-11-22 10:54:35 -080014530 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014531 TEST_REQUIRES_ARM_NEON_FMA;
14532 for (uint32_t m = 1; m <= 4; m++) {
14533 GemmMicrokernelTester()
14534 .mr(4)
14535 .nr(8)
14536 .kr(1)
14537 .sr(1)
14538 .m(m)
14539 .n(8)
14540 .k(2)
14541 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014542 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014543 }
14544 }
14545
Frank Barchard91317c52019-11-22 10:54:35 -080014546 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014547 TEST_REQUIRES_ARM_NEON_FMA;
14548 for (uint32_t n = 1; n <= 8; n++) {
14549 GemmMicrokernelTester()
14550 .mr(4)
14551 .nr(8)
14552 .kr(1)
14553 .sr(1)
14554 .m(4)
14555 .n(n)
14556 .k(2)
14557 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014558 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014559 }
14560 }
14561
Frank Barchard91317c52019-11-22 10:54:35 -080014562 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014563 TEST_REQUIRES_ARM_NEON_FMA;
14564 for (size_t k = 1; k < 2; k++) {
14565 GemmMicrokernelTester()
14566 .mr(4)
14567 .nr(8)
14568 .kr(1)
14569 .sr(1)
14570 .m(4)
14571 .n(8)
14572 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014573 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014574 }
14575 }
14576
Frank Barchard91317c52019-11-22 10:54:35 -080014577 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014578 TEST_REQUIRES_ARM_NEON_FMA;
14579 for (size_t k = 1; k < 2; k++) {
14580 GemmMicrokernelTester()
14581 .mr(4)
14582 .nr(8)
14583 .kr(1)
14584 .sr(1)
14585 .m(4)
14586 .n(8)
14587 .k(k)
14588 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080014589 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014590 }
14591 }
14592
Frank Barchard91317c52019-11-22 10:54:35 -080014593 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014594 TEST_REQUIRES_ARM_NEON_FMA;
14595 for (size_t k = 1; k < 2; k++) {
14596 for (uint32_t m = 1; m <= 4; m++) {
14597 for (uint32_t n = 1; n <= 8; n++) {
14598 GemmMicrokernelTester()
14599 .mr(4)
14600 .nr(8)
14601 .kr(1)
14602 .sr(1)
14603 .m(m)
14604 .n(n)
14605 .k(k)
14606 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014607 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014608 }
14609 }
14610 }
14611 }
14612
Frank Barchard91317c52019-11-22 10:54:35 -080014613 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014614 TEST_REQUIRES_ARM_NEON_FMA;
14615 for (size_t k = 3; k < 4; k++) {
14616 GemmMicrokernelTester()
14617 .mr(4)
14618 .nr(8)
14619 .kr(1)
14620 .sr(1)
14621 .m(4)
14622 .n(8)
14623 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014624 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014625 }
14626 }
14627
Frank Barchard91317c52019-11-22 10:54:35 -080014628 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014629 TEST_REQUIRES_ARM_NEON_FMA;
14630 for (size_t k = 3; k < 4; k++) {
14631 GemmMicrokernelTester()
14632 .mr(4)
14633 .nr(8)
14634 .kr(1)
14635 .sr(1)
14636 .m(4)
14637 .n(8)
14638 .k(k)
14639 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080014640 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014641 }
14642 }
14643
Frank Barchard91317c52019-11-22 10:54:35 -080014644 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014645 TEST_REQUIRES_ARM_NEON_FMA;
14646 for (size_t k = 3; k < 4; k++) {
14647 for (uint32_t m = 1; m <= 4; m++) {
14648 for (uint32_t n = 1; n <= 8; n++) {
14649 GemmMicrokernelTester()
14650 .mr(4)
14651 .nr(8)
14652 .kr(1)
14653 .sr(1)
14654 .m(m)
14655 .n(n)
14656 .k(k)
14657 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014658 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014659 }
14660 }
14661 }
14662 }
14663
Frank Barchard91317c52019-11-22 10:54:35 -080014664 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014665 TEST_REQUIRES_ARM_NEON_FMA;
14666 for (size_t k = 4; k <= 20; k += 2) {
14667 GemmMicrokernelTester()
14668 .mr(4)
14669 .nr(8)
14670 .kr(1)
14671 .sr(1)
14672 .m(4)
14673 .n(8)
14674 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014675 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014676 }
14677 }
14678
Frank Barchard91317c52019-11-22 10:54:35 -080014679 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014680 TEST_REQUIRES_ARM_NEON_FMA;
14681 for (size_t k = 4; k <= 20; k += 2) {
14682 GemmMicrokernelTester()
14683 .mr(4)
14684 .nr(8)
14685 .kr(1)
14686 .sr(1)
14687 .m(4)
14688 .n(8)
14689 .k(k)
14690 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080014691 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014692 }
14693 }
14694
Frank Barchard91317c52019-11-22 10:54:35 -080014695 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014696 TEST_REQUIRES_ARM_NEON_FMA;
14697 for (size_t k = 4; k <= 20; k += 2) {
14698 for (uint32_t m = 1; m <= 4; m++) {
14699 for (uint32_t n = 1; n <= 8; n++) {
14700 GemmMicrokernelTester()
14701 .mr(4)
14702 .nr(8)
14703 .kr(1)
14704 .sr(1)
14705 .m(m)
14706 .n(n)
14707 .k(k)
14708 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014709 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014710 }
14711 }
14712 }
14713 }
14714
Frank Barchard91317c52019-11-22 10:54:35 -080014715 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014716 TEST_REQUIRES_ARM_NEON_FMA;
14717 for (uint32_t n = 9; n < 16; n++) {
14718 for (size_t k = 1; k <= 10; k += 3) {
14719 GemmMicrokernelTester()
14720 .mr(4)
14721 .nr(8)
14722 .kr(1)
14723 .sr(1)
14724 .m(4)
14725 .n(8)
14726 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014727 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014728 }
14729 }
14730 }
14731
Frank Barchard91317c52019-11-22 10:54:35 -080014732 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014733 TEST_REQUIRES_ARM_NEON_FMA;
14734 for (uint32_t n = 9; n < 16; n++) {
14735 for (size_t k = 1; k <= 10; k += 3) {
14736 GemmMicrokernelTester()
14737 .mr(4)
14738 .nr(8)
14739 .kr(1)
14740 .sr(1)
14741 .m(4)
14742 .n(8)
14743 .k(k)
14744 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014745 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014746 }
14747 }
14748 }
14749
Frank Barchard91317c52019-11-22 10:54:35 -080014750 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014751 TEST_REQUIRES_ARM_NEON_FMA;
14752 for (uint32_t n = 9; n < 16; n++) {
14753 for (size_t k = 1; k <= 10; k += 3) {
14754 GemmMicrokernelTester()
14755 .mr(4)
14756 .nr(8)
14757 .kr(1)
14758 .sr(1)
14759 .m(4)
14760 .n(n)
14761 .k(k)
14762 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080014763 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014764 }
14765 }
14766 }
14767
Frank Barchard91317c52019-11-22 10:54:35 -080014768 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014769 TEST_REQUIRES_ARM_NEON_FMA;
14770 for (uint32_t n = 9; n < 16; n++) {
14771 for (size_t k = 1; k <= 10; k += 3) {
14772 for (uint32_t m = 1; m <= 4; m++) {
14773 GemmMicrokernelTester()
14774 .mr(4)
14775 .nr(8)
14776 .kr(1)
14777 .sr(1)
14778 .m(m)
14779 .n(n)
14780 .k(k)
14781 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014782 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014783 }
14784 }
14785 }
14786 }
14787
Frank Barchard91317c52019-11-22 10:54:35 -080014788 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014789 TEST_REQUIRES_ARM_NEON_FMA;
14790 for (uint32_t n = 16; n <= 24; n += 8) {
14791 for (size_t k = 1; k <= 10; k += 3) {
14792 GemmMicrokernelTester()
14793 .mr(4)
14794 .nr(8)
14795 .kr(1)
14796 .sr(1)
14797 .m(4)
14798 .n(8)
14799 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080014800 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014801 }
14802 }
14803 }
14804
Frank Barchard91317c52019-11-22 10:54:35 -080014805 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014806 TEST_REQUIRES_ARM_NEON_FMA;
14807 for (uint32_t n = 16; n <= 24; n += 8) {
14808 for (size_t k = 1; k <= 10; k += 3) {
14809 GemmMicrokernelTester()
14810 .mr(4)
14811 .nr(8)
14812 .kr(1)
14813 .sr(1)
14814 .m(4)
14815 .n(n)
14816 .k(k)
14817 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014818 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014819 }
14820 }
14821 }
14822
Frank Barchard91317c52019-11-22 10:54:35 -080014823 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014824 TEST_REQUIRES_ARM_NEON_FMA;
14825 for (uint32_t n = 16; n <= 24; n += 8) {
14826 for (size_t k = 1; k <= 10; k += 3) {
14827 GemmMicrokernelTester()
14828 .mr(4)
14829 .nr(8)
14830 .kr(1)
14831 .sr(1)
14832 .m(4)
14833 .n(n)
14834 .k(k)
14835 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080014836 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014837 }
14838 }
14839 }
14840
Frank Barchard91317c52019-11-22 10:54:35 -080014841 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014842 TEST_REQUIRES_ARM_NEON_FMA;
14843 for (uint32_t n = 16; n <= 24; n += 8) {
14844 for (size_t k = 1; k <= 10; k += 3) {
14845 for (uint32_t m = 1; m <= 4; m++) {
14846 GemmMicrokernelTester()
14847 .mr(4)
14848 .nr(8)
14849 .kr(1)
14850 .sr(1)
14851 .m(m)
14852 .n(n)
14853 .k(k)
14854 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014855 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014856 }
14857 }
14858 }
14859 }
14860
Frank Barchard91317c52019-11-22 10:54:35 -080014861 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014862 TEST_REQUIRES_ARM_NEON_FMA;
14863 for (size_t k = 1; k <= 10; k += 3) {
14864 for (uint32_t m = 1; m <= 4; m++) {
14865 for (uint32_t n = 1; n <= 8; n++) {
14866 GemmMicrokernelTester()
14867 .mr(4)
14868 .nr(8)
14869 .kr(1)
14870 .sr(1)
14871 .m(m)
14872 .n(n)
14873 .k(k)
14874 .cm_stride(11)
14875 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014876 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014877 }
14878 }
14879 }
14880 }
14881
Frank Barchard91317c52019-11-22 10:54:35 -080014882 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014883 TEST_REQUIRES_ARM_NEON_FMA;
14884 GemmMicrokernelTester()
14885 .mr(4)
14886 .nr(8)
14887 .kr(1)
14888 .sr(1)
14889 .m(4)
14890 .n(8)
14891 .k(2)
14892 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014893 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014894 }
14895
Frank Barchard91317c52019-11-22 10:54:35 -080014896 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014897 TEST_REQUIRES_ARM_NEON_FMA;
14898 GemmMicrokernelTester()
14899 .mr(4)
14900 .nr(8)
14901 .kr(1)
14902 .sr(1)
14903 .m(4)
14904 .n(8)
14905 .k(2)
14906 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080014907 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014908 }
14909
Frank Barchard91317c52019-11-22 10:54:35 -080014910 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014911 TEST_REQUIRES_ARM_NEON_FMA;
14912 GemmMicrokernelTester()
14913 .mr(4)
14914 .nr(8)
14915 .kr(1)
14916 .sr(1)
14917 .m(4)
14918 .n(8)
14919 .k(2)
14920 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014921 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014922 }
Frank Barchard91317c52019-11-22 10:54:35 -080014923#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070014924
14925
Frank Barchard91317c52019-11-22 10:54:35 -080014926#if XNN_ARCH_ARM64
14927 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014928 TEST_REQUIRES_ARM_NEON_FMA;
14929 GemmMicrokernelTester()
14930 .mr(4)
14931 .nr(8)
14932 .kr(1)
14933 .sr(1)
14934 .m(4)
14935 .n(8)
14936 .k(4)
Frank Barchard91317c52019-11-22 10:54:35 -080014937 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014938 }
14939
Frank Barchard91317c52019-11-22 10:54:35 -080014940 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014941 TEST_REQUIRES_ARM_NEON_FMA;
14942 GemmMicrokernelTester()
14943 .mr(4)
14944 .nr(8)
14945 .kr(1)
14946 .sr(1)
14947 .m(4)
14948 .n(8)
14949 .k(4)
14950 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080014951 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014952 }
14953
Frank Barchard91317c52019-11-22 10:54:35 -080014954 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014955 TEST_REQUIRES_ARM_NEON_FMA;
14956 GemmMicrokernelTester()
14957 .mr(4)
14958 .nr(8)
14959 .kr(1)
14960 .sr(1)
14961 .m(4)
14962 .n(8)
14963 .k(4)
14964 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080014965 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014966 }
14967
Frank Barchard91317c52019-11-22 10:54:35 -080014968 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014969 TEST_REQUIRES_ARM_NEON_FMA;
14970 for (uint32_t m = 1; m <= 4; m++) {
14971 for (uint32_t n = 1; n <= 8; n++) {
14972 GemmMicrokernelTester()
14973 .mr(4)
14974 .nr(8)
14975 .kr(1)
14976 .sr(1)
14977 .m(m)
14978 .n(n)
14979 .k(4)
14980 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014981 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014982 }
14983 }
14984 }
14985
Frank Barchard91317c52019-11-22 10:54:35 -080014986 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070014987 TEST_REQUIRES_ARM_NEON_FMA;
14988 for (uint32_t m = 1; m <= 4; m++) {
14989 GemmMicrokernelTester()
14990 .mr(4)
14991 .nr(8)
14992 .kr(1)
14993 .sr(1)
14994 .m(m)
14995 .n(8)
14996 .k(4)
14997 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080014998 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070014999 }
15000 }
15001
Frank Barchard91317c52019-11-22 10:54:35 -080015002 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015003 TEST_REQUIRES_ARM_NEON_FMA;
15004 for (uint32_t n = 1; n <= 8; n++) {
15005 GemmMicrokernelTester()
15006 .mr(4)
15007 .nr(8)
15008 .kr(1)
15009 .sr(1)
15010 .m(4)
15011 .n(n)
15012 .k(4)
15013 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015014 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015015 }
15016 }
15017
Frank Barchard91317c52019-11-22 10:54:35 -080015018 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015019 TEST_REQUIRES_ARM_NEON_FMA;
15020 for (size_t k = 1; k < 4; k++) {
15021 GemmMicrokernelTester()
15022 .mr(4)
15023 .nr(8)
15024 .kr(1)
15025 .sr(1)
15026 .m(4)
15027 .n(8)
15028 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015029 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015030 }
15031 }
15032
Frank Barchard91317c52019-11-22 10:54:35 -080015033 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015034 TEST_REQUIRES_ARM_NEON_FMA;
15035 for (size_t k = 1; k < 4; k++) {
15036 GemmMicrokernelTester()
15037 .mr(4)
15038 .nr(8)
15039 .kr(1)
15040 .sr(1)
15041 .m(4)
15042 .n(8)
15043 .k(k)
15044 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080015045 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015046 }
15047 }
15048
Frank Barchard91317c52019-11-22 10:54:35 -080015049 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015050 TEST_REQUIRES_ARM_NEON_FMA;
15051 for (size_t k = 1; k < 4; k++) {
15052 for (uint32_t m = 1; m <= 4; m++) {
15053 for (uint32_t n = 1; n <= 8; n++) {
15054 GemmMicrokernelTester()
15055 .mr(4)
15056 .nr(8)
15057 .kr(1)
15058 .sr(1)
15059 .m(m)
15060 .n(n)
15061 .k(k)
15062 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015063 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015064 }
15065 }
15066 }
15067 }
15068
Frank Barchard91317c52019-11-22 10:54:35 -080015069 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015070 TEST_REQUIRES_ARM_NEON_FMA;
15071 for (size_t k = 5; k < 8; k++) {
15072 GemmMicrokernelTester()
15073 .mr(4)
15074 .nr(8)
15075 .kr(1)
15076 .sr(1)
15077 .m(4)
15078 .n(8)
15079 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015080 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015081 }
15082 }
15083
Frank Barchard91317c52019-11-22 10:54:35 -080015084 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015085 TEST_REQUIRES_ARM_NEON_FMA;
15086 for (size_t k = 5; k < 8; k++) {
15087 GemmMicrokernelTester()
15088 .mr(4)
15089 .nr(8)
15090 .kr(1)
15091 .sr(1)
15092 .m(4)
15093 .n(8)
15094 .k(k)
15095 .a_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015096 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015097 }
15098 }
15099
Frank Barchard91317c52019-11-22 10:54:35 -080015100 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015101 TEST_REQUIRES_ARM_NEON_FMA;
15102 for (size_t k = 5; k < 8; k++) {
15103 for (uint32_t m = 1; m <= 4; m++) {
15104 for (uint32_t n = 1; n <= 8; n++) {
15105 GemmMicrokernelTester()
15106 .mr(4)
15107 .nr(8)
15108 .kr(1)
15109 .sr(1)
15110 .m(m)
15111 .n(n)
15112 .k(k)
15113 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015114 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015115 }
15116 }
15117 }
15118 }
15119
Frank Barchard91317c52019-11-22 10:54:35 -080015120 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015121 TEST_REQUIRES_ARM_NEON_FMA;
15122 for (size_t k = 8; k <= 40; k += 4) {
15123 GemmMicrokernelTester()
15124 .mr(4)
15125 .nr(8)
15126 .kr(1)
15127 .sr(1)
15128 .m(4)
15129 .n(8)
15130 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015131 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015132 }
15133 }
15134
Frank Barchard91317c52019-11-22 10:54:35 -080015135 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015136 TEST_REQUIRES_ARM_NEON_FMA;
15137 for (size_t k = 8; k <= 40; k += 4) {
15138 GemmMicrokernelTester()
15139 .mr(4)
15140 .nr(8)
15141 .kr(1)
15142 .sr(1)
15143 .m(4)
15144 .n(8)
15145 .k(k)
15146 .a_stride(43)
Frank Barchard91317c52019-11-22 10:54:35 -080015147 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015148 }
15149 }
15150
Frank Barchard91317c52019-11-22 10:54:35 -080015151 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015152 TEST_REQUIRES_ARM_NEON_FMA;
15153 for (size_t k = 8; k <= 40; k += 4) {
15154 for (uint32_t m = 1; m <= 4; m++) {
15155 for (uint32_t n = 1; n <= 8; n++) {
15156 GemmMicrokernelTester()
15157 .mr(4)
15158 .nr(8)
15159 .kr(1)
15160 .sr(1)
15161 .m(m)
15162 .n(n)
15163 .k(k)
15164 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015165 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015166 }
15167 }
15168 }
15169 }
15170
Frank Barchard91317c52019-11-22 10:54:35 -080015171 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015172 TEST_REQUIRES_ARM_NEON_FMA;
15173 for (uint32_t n = 9; n < 16; n++) {
15174 for (size_t k = 1; k <= 20; k += 5) {
15175 GemmMicrokernelTester()
15176 .mr(4)
15177 .nr(8)
15178 .kr(1)
15179 .sr(1)
15180 .m(4)
15181 .n(8)
15182 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015183 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015184 }
15185 }
15186 }
15187
Frank Barchard91317c52019-11-22 10:54:35 -080015188 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015189 TEST_REQUIRES_ARM_NEON_FMA;
15190 for (uint32_t n = 9; n < 16; n++) {
15191 for (size_t k = 1; k <= 20; k += 5) {
15192 GemmMicrokernelTester()
15193 .mr(4)
15194 .nr(8)
15195 .kr(1)
15196 .sr(1)
15197 .m(4)
15198 .n(8)
15199 .k(k)
15200 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015201 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015202 }
15203 }
15204 }
15205
Frank Barchard91317c52019-11-22 10:54:35 -080015206 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015207 TEST_REQUIRES_ARM_NEON_FMA;
15208 for (uint32_t n = 9; n < 16; n++) {
15209 for (size_t k = 1; k <= 20; k += 5) {
15210 GemmMicrokernelTester()
15211 .mr(4)
15212 .nr(8)
15213 .kr(1)
15214 .sr(1)
15215 .m(4)
15216 .n(n)
15217 .k(k)
15218 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080015219 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015220 }
15221 }
15222 }
15223
Frank Barchard91317c52019-11-22 10:54:35 -080015224 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015225 TEST_REQUIRES_ARM_NEON_FMA;
15226 for (uint32_t n = 9; n < 16; n++) {
15227 for (size_t k = 1; k <= 20; k += 5) {
15228 for (uint32_t m = 1; m <= 4; m++) {
15229 GemmMicrokernelTester()
15230 .mr(4)
15231 .nr(8)
15232 .kr(1)
15233 .sr(1)
15234 .m(m)
15235 .n(n)
15236 .k(k)
15237 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015238 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015239 }
15240 }
15241 }
15242 }
15243
Frank Barchard91317c52019-11-22 10:54:35 -080015244 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015245 TEST_REQUIRES_ARM_NEON_FMA;
15246 for (uint32_t n = 16; n <= 24; n += 8) {
15247 for (size_t k = 1; k <= 20; k += 5) {
15248 GemmMicrokernelTester()
15249 .mr(4)
15250 .nr(8)
15251 .kr(1)
15252 .sr(1)
15253 .m(4)
15254 .n(8)
15255 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015256 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015257 }
15258 }
15259 }
15260
Frank Barchard91317c52019-11-22 10:54:35 -080015261 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015262 TEST_REQUIRES_ARM_NEON_FMA;
15263 for (uint32_t n = 16; n <= 24; n += 8) {
15264 for (size_t k = 1; k <= 20; k += 5) {
15265 GemmMicrokernelTester()
15266 .mr(4)
15267 .nr(8)
15268 .kr(1)
15269 .sr(1)
15270 .m(4)
15271 .n(n)
15272 .k(k)
15273 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015274 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015275 }
15276 }
15277 }
15278
Frank Barchard91317c52019-11-22 10:54:35 -080015279 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015280 TEST_REQUIRES_ARM_NEON_FMA;
15281 for (uint32_t n = 16; n <= 24; n += 8) {
15282 for (size_t k = 1; k <= 20; k += 5) {
15283 GemmMicrokernelTester()
15284 .mr(4)
15285 .nr(8)
15286 .kr(1)
15287 .sr(1)
15288 .m(4)
15289 .n(n)
15290 .k(k)
15291 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080015292 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015293 }
15294 }
15295 }
15296
Frank Barchard91317c52019-11-22 10:54:35 -080015297 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015298 TEST_REQUIRES_ARM_NEON_FMA;
15299 for (uint32_t n = 16; n <= 24; n += 8) {
15300 for (size_t k = 1; k <= 20; k += 5) {
15301 for (uint32_t m = 1; m <= 4; m++) {
15302 GemmMicrokernelTester()
15303 .mr(4)
15304 .nr(8)
15305 .kr(1)
15306 .sr(1)
15307 .m(m)
15308 .n(n)
15309 .k(k)
15310 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015311 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015312 }
15313 }
15314 }
15315 }
15316
Frank Barchard91317c52019-11-22 10:54:35 -080015317 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015318 TEST_REQUIRES_ARM_NEON_FMA;
15319 for (size_t k = 1; k <= 20; k += 5) {
15320 for (uint32_t m = 1; m <= 4; m++) {
15321 for (uint32_t n = 1; n <= 8; n++) {
15322 GemmMicrokernelTester()
15323 .mr(4)
15324 .nr(8)
15325 .kr(1)
15326 .sr(1)
15327 .m(m)
15328 .n(n)
15329 .k(k)
15330 .cm_stride(11)
15331 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015332 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015333 }
15334 }
15335 }
15336 }
15337
Frank Barchard91317c52019-11-22 10:54:35 -080015338 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015339 TEST_REQUIRES_ARM_NEON_FMA;
15340 GemmMicrokernelTester()
15341 .mr(4)
15342 .nr(8)
15343 .kr(1)
15344 .sr(1)
15345 .m(4)
15346 .n(8)
15347 .k(4)
15348 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080015349 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015350 }
15351
Frank Barchard91317c52019-11-22 10:54:35 -080015352 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015353 TEST_REQUIRES_ARM_NEON_FMA;
15354 GemmMicrokernelTester()
15355 .mr(4)
15356 .nr(8)
15357 .kr(1)
15358 .sr(1)
15359 .m(4)
15360 .n(8)
15361 .k(4)
15362 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080015363 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015364 }
15365
Frank Barchard91317c52019-11-22 10:54:35 -080015366 TEST(F32_GEMM_4X8__NEONFMA_LANE_LD128, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015367 TEST_REQUIRES_ARM_NEON_FMA;
15368 GemmMicrokernelTester()
15369 .mr(4)
15370 .nr(8)
15371 .kr(1)
15372 .sr(1)
15373 .m(4)
15374 .n(8)
15375 .k(4)
15376 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015377 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_lane_ld128);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015378 }
Frank Barchard91317c52019-11-22 10:54:35 -080015379#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070015380
15381
Frank Barchard91317c52019-11-22 10:54:35 -080015382#if XNN_ARCH_ARM64
15383 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015384 TEST_REQUIRES_ARM_NEON_FMA;
15385 GemmMicrokernelTester()
15386 .mr(5)
15387 .nr(8)
15388 .kr(1)
15389 .sr(1)
15390 .m(5)
15391 .n(8)
15392 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080015393 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015394 }
15395
Frank Barchard91317c52019-11-22 10:54:35 -080015396 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015397 TEST_REQUIRES_ARM_NEON_FMA;
15398 GemmMicrokernelTester()
15399 .mr(5)
15400 .nr(8)
15401 .kr(1)
15402 .sr(1)
15403 .m(5)
15404 .n(8)
15405 .k(2)
15406 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015407 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015408 }
15409
Frank Barchard91317c52019-11-22 10:54:35 -080015410 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015411 TEST_REQUIRES_ARM_NEON_FMA;
15412 GemmMicrokernelTester()
15413 .mr(5)
15414 .nr(8)
15415 .kr(1)
15416 .sr(1)
15417 .m(5)
15418 .n(8)
15419 .k(2)
15420 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080015421 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015422 }
15423
Frank Barchard91317c52019-11-22 10:54:35 -080015424 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015425 TEST_REQUIRES_ARM_NEON_FMA;
15426 for (uint32_t m = 1; m <= 5; m++) {
15427 for (uint32_t n = 1; n <= 8; n++) {
15428 GemmMicrokernelTester()
15429 .mr(5)
15430 .nr(8)
15431 .kr(1)
15432 .sr(1)
15433 .m(m)
15434 .n(n)
15435 .k(2)
15436 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015437 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015438 }
15439 }
15440 }
15441
Frank Barchard91317c52019-11-22 10:54:35 -080015442 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015443 TEST_REQUIRES_ARM_NEON_FMA;
15444 for (uint32_t m = 1; m <= 5; m++) {
15445 GemmMicrokernelTester()
15446 .mr(5)
15447 .nr(8)
15448 .kr(1)
15449 .sr(1)
15450 .m(m)
15451 .n(8)
15452 .k(2)
15453 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015454 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015455 }
15456 }
15457
Frank Barchard91317c52019-11-22 10:54:35 -080015458 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015459 TEST_REQUIRES_ARM_NEON_FMA;
15460 for (uint32_t n = 1; n <= 8; n++) {
15461 GemmMicrokernelTester()
15462 .mr(5)
15463 .nr(8)
15464 .kr(1)
15465 .sr(1)
15466 .m(5)
15467 .n(n)
15468 .k(2)
15469 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015470 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015471 }
15472 }
15473
Frank Barchard91317c52019-11-22 10:54:35 -080015474 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015475 TEST_REQUIRES_ARM_NEON_FMA;
15476 for (size_t k = 1; k < 2; k++) {
15477 GemmMicrokernelTester()
15478 .mr(5)
15479 .nr(8)
15480 .kr(1)
15481 .sr(1)
15482 .m(5)
15483 .n(8)
15484 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015485 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015486 }
15487 }
15488
Frank Barchard91317c52019-11-22 10:54:35 -080015489 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015490 TEST_REQUIRES_ARM_NEON_FMA;
15491 for (size_t k = 1; k < 2; k++) {
15492 GemmMicrokernelTester()
15493 .mr(5)
15494 .nr(8)
15495 .kr(1)
15496 .sr(1)
15497 .m(5)
15498 .n(8)
15499 .k(k)
15500 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080015501 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015502 }
15503 }
15504
Frank Barchard91317c52019-11-22 10:54:35 -080015505 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015506 TEST_REQUIRES_ARM_NEON_FMA;
15507 for (size_t k = 1; k < 2; k++) {
15508 for (uint32_t m = 1; m <= 5; m++) {
15509 for (uint32_t n = 1; n <= 8; n++) {
15510 GemmMicrokernelTester()
15511 .mr(5)
15512 .nr(8)
15513 .kr(1)
15514 .sr(1)
15515 .m(m)
15516 .n(n)
15517 .k(k)
15518 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015519 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015520 }
15521 }
15522 }
15523 }
15524
Frank Barchard91317c52019-11-22 10:54:35 -080015525 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015526 TEST_REQUIRES_ARM_NEON_FMA;
15527 for (size_t k = 3; k < 4; k++) {
15528 GemmMicrokernelTester()
15529 .mr(5)
15530 .nr(8)
15531 .kr(1)
15532 .sr(1)
15533 .m(5)
15534 .n(8)
15535 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015536 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015537 }
15538 }
15539
Frank Barchard91317c52019-11-22 10:54:35 -080015540 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015541 TEST_REQUIRES_ARM_NEON_FMA;
15542 for (size_t k = 3; k < 4; k++) {
15543 GemmMicrokernelTester()
15544 .mr(5)
15545 .nr(8)
15546 .kr(1)
15547 .sr(1)
15548 .m(5)
15549 .n(8)
15550 .k(k)
15551 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080015552 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015553 }
15554 }
15555
Frank Barchard91317c52019-11-22 10:54:35 -080015556 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015557 TEST_REQUIRES_ARM_NEON_FMA;
15558 for (size_t k = 3; k < 4; k++) {
15559 for (uint32_t m = 1; m <= 5; m++) {
15560 for (uint32_t n = 1; n <= 8; n++) {
15561 GemmMicrokernelTester()
15562 .mr(5)
15563 .nr(8)
15564 .kr(1)
15565 .sr(1)
15566 .m(m)
15567 .n(n)
15568 .k(k)
15569 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015570 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015571 }
15572 }
15573 }
15574 }
15575
Frank Barchard91317c52019-11-22 10:54:35 -080015576 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015577 TEST_REQUIRES_ARM_NEON_FMA;
15578 for (size_t k = 4; k <= 20; k += 2) {
15579 GemmMicrokernelTester()
15580 .mr(5)
15581 .nr(8)
15582 .kr(1)
15583 .sr(1)
15584 .m(5)
15585 .n(8)
15586 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015587 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015588 }
15589 }
15590
Frank Barchard91317c52019-11-22 10:54:35 -080015591 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015592 TEST_REQUIRES_ARM_NEON_FMA;
15593 for (size_t k = 4; k <= 20; k += 2) {
15594 GemmMicrokernelTester()
15595 .mr(5)
15596 .nr(8)
15597 .kr(1)
15598 .sr(1)
15599 .m(5)
15600 .n(8)
15601 .k(k)
15602 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080015603 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015604 }
15605 }
15606
Frank Barchard91317c52019-11-22 10:54:35 -080015607 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015608 TEST_REQUIRES_ARM_NEON_FMA;
15609 for (size_t k = 4; k <= 20; k += 2) {
15610 for (uint32_t m = 1; m <= 5; m++) {
15611 for (uint32_t n = 1; n <= 8; n++) {
15612 GemmMicrokernelTester()
15613 .mr(5)
15614 .nr(8)
15615 .kr(1)
15616 .sr(1)
15617 .m(m)
15618 .n(n)
15619 .k(k)
15620 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015621 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015622 }
15623 }
15624 }
15625 }
15626
Frank Barchard91317c52019-11-22 10:54:35 -080015627 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015628 TEST_REQUIRES_ARM_NEON_FMA;
15629 for (uint32_t n = 9; n < 16; n++) {
15630 for (size_t k = 1; k <= 10; k += 3) {
15631 GemmMicrokernelTester()
15632 .mr(5)
15633 .nr(8)
15634 .kr(1)
15635 .sr(1)
15636 .m(5)
15637 .n(8)
15638 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015639 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015640 }
15641 }
15642 }
15643
Frank Barchard91317c52019-11-22 10:54:35 -080015644 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015645 TEST_REQUIRES_ARM_NEON_FMA;
15646 for (uint32_t n = 9; n < 16; n++) {
15647 for (size_t k = 1; k <= 10; k += 3) {
15648 GemmMicrokernelTester()
15649 .mr(5)
15650 .nr(8)
15651 .kr(1)
15652 .sr(1)
15653 .m(5)
15654 .n(8)
15655 .k(k)
15656 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015657 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015658 }
15659 }
15660 }
15661
Frank Barchard91317c52019-11-22 10:54:35 -080015662 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015663 TEST_REQUIRES_ARM_NEON_FMA;
15664 for (uint32_t n = 9; n < 16; n++) {
15665 for (size_t k = 1; k <= 10; k += 3) {
15666 GemmMicrokernelTester()
15667 .mr(5)
15668 .nr(8)
15669 .kr(1)
15670 .sr(1)
15671 .m(5)
15672 .n(n)
15673 .k(k)
15674 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080015675 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015676 }
15677 }
15678 }
15679
Frank Barchard91317c52019-11-22 10:54:35 -080015680 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015681 TEST_REQUIRES_ARM_NEON_FMA;
15682 for (uint32_t n = 9; n < 16; n++) {
15683 for (size_t k = 1; k <= 10; k += 3) {
15684 for (uint32_t m = 1; m <= 5; m++) {
15685 GemmMicrokernelTester()
15686 .mr(5)
15687 .nr(8)
15688 .kr(1)
15689 .sr(1)
15690 .m(m)
15691 .n(n)
15692 .k(k)
15693 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015694 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015695 }
15696 }
15697 }
15698 }
15699
Frank Barchard91317c52019-11-22 10:54:35 -080015700 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015701 TEST_REQUIRES_ARM_NEON_FMA;
15702 for (uint32_t n = 16; n <= 24; n += 8) {
15703 for (size_t k = 1; k <= 10; k += 3) {
15704 GemmMicrokernelTester()
15705 .mr(5)
15706 .nr(8)
15707 .kr(1)
15708 .sr(1)
15709 .m(5)
15710 .n(8)
15711 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015712 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015713 }
15714 }
15715 }
15716
Frank Barchard91317c52019-11-22 10:54:35 -080015717 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015718 TEST_REQUIRES_ARM_NEON_FMA;
15719 for (uint32_t n = 16; n <= 24; n += 8) {
15720 for (size_t k = 1; k <= 10; k += 3) {
15721 GemmMicrokernelTester()
15722 .mr(5)
15723 .nr(8)
15724 .kr(1)
15725 .sr(1)
15726 .m(5)
15727 .n(n)
15728 .k(k)
15729 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015730 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015731 }
15732 }
15733 }
15734
Frank Barchard91317c52019-11-22 10:54:35 -080015735 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015736 TEST_REQUIRES_ARM_NEON_FMA;
15737 for (uint32_t n = 16; n <= 24; n += 8) {
15738 for (size_t k = 1; k <= 10; k += 3) {
15739 GemmMicrokernelTester()
15740 .mr(5)
15741 .nr(8)
15742 .kr(1)
15743 .sr(1)
15744 .m(5)
15745 .n(n)
15746 .k(k)
15747 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080015748 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015749 }
15750 }
15751 }
15752
Frank Barchard91317c52019-11-22 10:54:35 -080015753 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015754 TEST_REQUIRES_ARM_NEON_FMA;
15755 for (uint32_t n = 16; n <= 24; n += 8) {
15756 for (size_t k = 1; k <= 10; k += 3) {
15757 for (uint32_t m = 1; m <= 5; m++) {
15758 GemmMicrokernelTester()
15759 .mr(5)
15760 .nr(8)
15761 .kr(1)
15762 .sr(1)
15763 .m(m)
15764 .n(n)
15765 .k(k)
15766 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015767 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015768 }
15769 }
15770 }
15771 }
15772
Frank Barchard91317c52019-11-22 10:54:35 -080015773 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015774 TEST_REQUIRES_ARM_NEON_FMA;
15775 for (size_t k = 1; k <= 10; k += 3) {
15776 for (uint32_t m = 1; m <= 5; m++) {
15777 for (uint32_t n = 1; n <= 8; n++) {
15778 GemmMicrokernelTester()
15779 .mr(5)
15780 .nr(8)
15781 .kr(1)
15782 .sr(1)
15783 .m(m)
15784 .n(n)
15785 .k(k)
15786 .cm_stride(11)
15787 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015788 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015789 }
15790 }
15791 }
15792 }
15793
Frank Barchard91317c52019-11-22 10:54:35 -080015794 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015795 TEST_REQUIRES_ARM_NEON_FMA;
15796 GemmMicrokernelTester()
15797 .mr(5)
15798 .nr(8)
15799 .kr(1)
15800 .sr(1)
15801 .m(5)
15802 .n(8)
15803 .k(2)
15804 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080015805 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015806 }
15807
Frank Barchard91317c52019-11-22 10:54:35 -080015808 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015809 TEST_REQUIRES_ARM_NEON_FMA;
15810 GemmMicrokernelTester()
15811 .mr(5)
15812 .nr(8)
15813 .kr(1)
15814 .sr(1)
15815 .m(5)
15816 .n(8)
15817 .k(2)
15818 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080015819 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015820 }
15821
Frank Barchard91317c52019-11-22 10:54:35 -080015822 TEST(F32_GEMM_5X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015823 TEST_REQUIRES_ARM_NEON_FMA;
15824 GemmMicrokernelTester()
15825 .mr(5)
15826 .nr(8)
15827 .kr(1)
15828 .sr(1)
15829 .m(5)
15830 .n(8)
15831 .k(2)
15832 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015833 .Test(xnn_f32_gemm_ukernel_5x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015834 }
Frank Barchard91317c52019-11-22 10:54:35 -080015835#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070015836
15837
Frank Barchard91317c52019-11-22 10:54:35 -080015838#if XNN_ARCH_ARM64
15839 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015840 TEST_REQUIRES_ARM_NEON_FMA;
15841 GemmMicrokernelTester()
15842 .mr(6)
15843 .nr(8)
15844 .kr(1)
15845 .sr(1)
15846 .m(6)
15847 .n(8)
15848 .k(2)
Frank Barchard91317c52019-11-22 10:54:35 -080015849 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015850 }
15851
Frank Barchard91317c52019-11-22 10:54:35 -080015852 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015853 TEST_REQUIRES_ARM_NEON_FMA;
15854 GemmMicrokernelTester()
15855 .mr(6)
15856 .nr(8)
15857 .kr(1)
15858 .sr(1)
15859 .m(6)
15860 .n(8)
15861 .k(2)
15862 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080015863 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015864 }
15865
Frank Barchard91317c52019-11-22 10:54:35 -080015866 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015867 TEST_REQUIRES_ARM_NEON_FMA;
15868 GemmMicrokernelTester()
15869 .mr(6)
15870 .nr(8)
15871 .kr(1)
15872 .sr(1)
15873 .m(6)
15874 .n(8)
15875 .k(2)
15876 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080015877 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015878 }
15879
Frank Barchard91317c52019-11-22 10:54:35 -080015880 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015881 TEST_REQUIRES_ARM_NEON_FMA;
15882 for (uint32_t m = 1; m <= 6; m++) {
15883 for (uint32_t n = 1; n <= 8; n++) {
15884 GemmMicrokernelTester()
15885 .mr(6)
15886 .nr(8)
15887 .kr(1)
15888 .sr(1)
15889 .m(m)
15890 .n(n)
15891 .k(2)
15892 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015893 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015894 }
15895 }
15896 }
15897
Frank Barchard91317c52019-11-22 10:54:35 -080015898 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_m) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015899 TEST_REQUIRES_ARM_NEON_FMA;
15900 for (uint32_t m = 1; m <= 6; m++) {
15901 GemmMicrokernelTester()
15902 .mr(6)
15903 .nr(8)
15904 .kr(1)
15905 .sr(1)
15906 .m(m)
15907 .n(8)
15908 .k(2)
15909 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015910 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015911 }
15912 }
15913
Frank Barchard91317c52019-11-22 10:54:35 -080015914 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_eq_2_subtile_n) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015915 TEST_REQUIRES_ARM_NEON_FMA;
15916 for (uint32_t n = 1; n <= 8; n++) {
15917 GemmMicrokernelTester()
15918 .mr(6)
15919 .nr(8)
15920 .kr(1)
15921 .sr(1)
15922 .m(6)
15923 .n(n)
15924 .k(2)
15925 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015926 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015927 }
15928 }
15929
Frank Barchard91317c52019-11-22 10:54:35 -080015930 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015931 TEST_REQUIRES_ARM_NEON_FMA;
15932 for (size_t k = 1; k < 2; k++) {
15933 GemmMicrokernelTester()
15934 .mr(6)
15935 .nr(8)
15936 .kr(1)
15937 .sr(1)
15938 .m(6)
15939 .n(8)
15940 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015941 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015942 }
15943 }
15944
Frank Barchard91317c52019-11-22 10:54:35 -080015945 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015946 TEST_REQUIRES_ARM_NEON_FMA;
15947 for (size_t k = 1; k < 2; k++) {
15948 GemmMicrokernelTester()
15949 .mr(6)
15950 .nr(8)
15951 .kr(1)
15952 .sr(1)
15953 .m(6)
15954 .n(8)
15955 .k(k)
15956 .a_stride(5)
Frank Barchard91317c52019-11-22 10:54:35 -080015957 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015958 }
15959 }
15960
Frank Barchard91317c52019-11-22 10:54:35 -080015961 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_lt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015962 TEST_REQUIRES_ARM_NEON_FMA;
15963 for (size_t k = 1; k < 2; k++) {
15964 for (uint32_t m = 1; m <= 6; m++) {
15965 for (uint32_t n = 1; n <= 8; n++) {
15966 GemmMicrokernelTester()
15967 .mr(6)
15968 .nr(8)
15969 .kr(1)
15970 .sr(1)
15971 .m(m)
15972 .n(n)
15973 .k(k)
15974 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080015975 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015976 }
15977 }
15978 }
15979 }
15980
Frank Barchard91317c52019-11-22 10:54:35 -080015981 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015982 TEST_REQUIRES_ARM_NEON_FMA;
15983 for (size_t k = 3; k < 4; k++) {
15984 GemmMicrokernelTester()
15985 .mr(6)
15986 .nr(8)
15987 .kr(1)
15988 .sr(1)
15989 .m(6)
15990 .n(8)
15991 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080015992 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070015993 }
15994 }
15995
Frank Barchard91317c52019-11-22 10:54:35 -080015996 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070015997 TEST_REQUIRES_ARM_NEON_FMA;
15998 for (size_t k = 3; k < 4; k++) {
15999 GemmMicrokernelTester()
16000 .mr(6)
16001 .nr(8)
16002 .kr(1)
16003 .sr(1)
16004 .m(6)
16005 .n(8)
16006 .k(k)
16007 .a_stride(7)
Frank Barchard91317c52019-11-22 10:54:35 -080016008 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016009 }
16010 }
16011
Frank Barchard91317c52019-11-22 10:54:35 -080016012 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_gt_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016013 TEST_REQUIRES_ARM_NEON_FMA;
16014 for (size_t k = 3; k < 4; k++) {
16015 for (uint32_t m = 1; m <= 6; m++) {
16016 for (uint32_t n = 1; n <= 8; n++) {
16017 GemmMicrokernelTester()
16018 .mr(6)
16019 .nr(8)
16020 .kr(1)
16021 .sr(1)
16022 .m(m)
16023 .n(n)
16024 .k(k)
16025 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080016026 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016027 }
16028 }
16029 }
16030 }
16031
Frank Barchard91317c52019-11-22 10:54:35 -080016032 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016033 TEST_REQUIRES_ARM_NEON_FMA;
16034 for (size_t k = 4; k <= 20; k += 2) {
16035 GemmMicrokernelTester()
16036 .mr(6)
16037 .nr(8)
16038 .kr(1)
16039 .sr(1)
16040 .m(6)
16041 .n(8)
16042 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080016043 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016044 }
16045 }
16046
Frank Barchard91317c52019-11-22 10:54:35 -080016047 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016048 TEST_REQUIRES_ARM_NEON_FMA;
16049 for (size_t k = 4; k <= 20; k += 2) {
16050 GemmMicrokernelTester()
16051 .mr(6)
16052 .nr(8)
16053 .kr(1)
16054 .sr(1)
16055 .m(6)
16056 .n(8)
16057 .k(k)
16058 .a_stride(23)
Frank Barchard91317c52019-11-22 10:54:35 -080016059 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016060 }
16061 }
16062
Frank Barchard91317c52019-11-22 10:54:35 -080016063 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, k_div_2_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016064 TEST_REQUIRES_ARM_NEON_FMA;
16065 for (size_t k = 4; k <= 20; k += 2) {
16066 for (uint32_t m = 1; m <= 6; m++) {
16067 for (uint32_t n = 1; n <= 8; n++) {
16068 GemmMicrokernelTester()
16069 .mr(6)
16070 .nr(8)
16071 .kr(1)
16072 .sr(1)
16073 .m(m)
16074 .n(n)
16075 .k(k)
16076 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080016077 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016078 }
16079 }
16080 }
16081 }
16082
Frank Barchard91317c52019-11-22 10:54:35 -080016083 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016084 TEST_REQUIRES_ARM_NEON_FMA;
16085 for (uint32_t n = 9; n < 16; n++) {
16086 for (size_t k = 1; k <= 10; k += 3) {
16087 GemmMicrokernelTester()
16088 .mr(6)
16089 .nr(8)
16090 .kr(1)
16091 .sr(1)
16092 .m(6)
16093 .n(8)
16094 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080016095 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016096 }
16097 }
16098 }
16099
Frank Barchard91317c52019-11-22 10:54:35 -080016100 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016101 TEST_REQUIRES_ARM_NEON_FMA;
16102 for (uint32_t n = 9; n < 16; n++) {
16103 for (size_t k = 1; k <= 10; k += 3) {
16104 GemmMicrokernelTester()
16105 .mr(6)
16106 .nr(8)
16107 .kr(1)
16108 .sr(1)
16109 .m(6)
16110 .n(8)
16111 .k(k)
16112 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080016113 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016114 }
16115 }
16116 }
16117
Frank Barchard91317c52019-11-22 10:54:35 -080016118 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016119 TEST_REQUIRES_ARM_NEON_FMA;
16120 for (uint32_t n = 9; n < 16; n++) {
16121 for (size_t k = 1; k <= 10; k += 3) {
16122 GemmMicrokernelTester()
16123 .mr(6)
16124 .nr(8)
16125 .kr(1)
16126 .sr(1)
16127 .m(6)
16128 .n(n)
16129 .k(k)
16130 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080016131 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016132 }
16133 }
16134 }
16135
Frank Barchard91317c52019-11-22 10:54:35 -080016136 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_gt_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016137 TEST_REQUIRES_ARM_NEON_FMA;
16138 for (uint32_t n = 9; n < 16; n++) {
16139 for (size_t k = 1; k <= 10; k += 3) {
16140 for (uint32_t m = 1; m <= 6; m++) {
16141 GemmMicrokernelTester()
16142 .mr(6)
16143 .nr(8)
16144 .kr(1)
16145 .sr(1)
16146 .m(m)
16147 .n(n)
16148 .k(k)
16149 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080016150 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016151 }
16152 }
16153 }
16154 }
16155
Frank Barchard91317c52019-11-22 10:54:35 -080016156 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016157 TEST_REQUIRES_ARM_NEON_FMA;
16158 for (uint32_t n = 16; n <= 24; n += 8) {
16159 for (size_t k = 1; k <= 10; k += 3) {
16160 GemmMicrokernelTester()
16161 .mr(6)
16162 .nr(8)
16163 .kr(1)
16164 .sr(1)
16165 .m(6)
16166 .n(8)
16167 .k(k)
Frank Barchard91317c52019-11-22 10:54:35 -080016168 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016169 }
16170 }
16171 }
16172
Frank Barchard91317c52019-11-22 10:54:35 -080016173 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_cn) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016174 TEST_REQUIRES_ARM_NEON_FMA;
16175 for (uint32_t n = 16; n <= 24; n += 8) {
16176 for (size_t k = 1; k <= 10; k += 3) {
16177 GemmMicrokernelTester()
16178 .mr(6)
16179 .nr(8)
16180 .kr(1)
16181 .sr(1)
16182 .m(6)
16183 .n(n)
16184 .k(k)
16185 .cn_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080016186 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016187 }
16188 }
16189 }
16190
Frank Barchard91317c52019-11-22 10:54:35 -080016191 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_strided_a) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016192 TEST_REQUIRES_ARM_NEON_FMA;
16193 for (uint32_t n = 16; n <= 24; n += 8) {
16194 for (size_t k = 1; k <= 10; k += 3) {
16195 GemmMicrokernelTester()
16196 .mr(6)
16197 .nr(8)
16198 .kr(1)
16199 .sr(1)
16200 .m(6)
16201 .n(n)
16202 .k(k)
16203 .a_stride(13)
Frank Barchard91317c52019-11-22 10:54:35 -080016204 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016205 }
16206 }
16207 }
16208
Frank Barchard91317c52019-11-22 10:54:35 -080016209 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, n_div_8_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016210 TEST_REQUIRES_ARM_NEON_FMA;
16211 for (uint32_t n = 16; n <= 24; n += 8) {
16212 for (size_t k = 1; k <= 10; k += 3) {
16213 for (uint32_t m = 1; m <= 6; m++) {
16214 GemmMicrokernelTester()
16215 .mr(6)
16216 .nr(8)
16217 .kr(1)
16218 .sr(1)
16219 .m(m)
16220 .n(n)
16221 .k(k)
16222 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080016223 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016224 }
16225 }
16226 }
16227 }
16228
Frank Barchard91317c52019-11-22 10:54:35 -080016229 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cm_subtile) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016230 TEST_REQUIRES_ARM_NEON_FMA;
16231 for (size_t k = 1; k <= 10; k += 3) {
16232 for (uint32_t m = 1; m <= 6; m++) {
16233 for (uint32_t n = 1; n <= 8; n++) {
16234 GemmMicrokernelTester()
16235 .mr(6)
16236 .nr(8)
16237 .kr(1)
16238 .sr(1)
16239 .m(m)
16240 .n(n)
16241 .k(k)
16242 .cm_stride(11)
16243 .iterations(1)
Frank Barchard91317c52019-11-22 10:54:35 -080016244 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016245 }
16246 }
16247 }
16248 }
16249
Frank Barchard91317c52019-11-22 10:54:35 -080016250 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, qmin) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016251 TEST_REQUIRES_ARM_NEON_FMA;
16252 GemmMicrokernelTester()
16253 .mr(6)
16254 .nr(8)
16255 .kr(1)
16256 .sr(1)
16257 .m(6)
16258 .n(8)
16259 .k(2)
16260 .qmin(128)
Frank Barchard91317c52019-11-22 10:54:35 -080016261 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016262 }
16263
Frank Barchard91317c52019-11-22 10:54:35 -080016264 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, qmax) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016265 TEST_REQUIRES_ARM_NEON_FMA;
16266 GemmMicrokernelTester()
16267 .mr(6)
16268 .nr(8)
16269 .kr(1)
16270 .sr(1)
16271 .m(6)
16272 .n(8)
16273 .k(2)
16274 .qmax(128)
Frank Barchard91317c52019-11-22 10:54:35 -080016275 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016276 }
16277
Frank Barchard91317c52019-11-22 10:54:35 -080016278 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD64, strided_cm) {
XNNPACK Teamb455b122019-09-27 18:10:33 -070016279 TEST_REQUIRES_ARM_NEON_FMA;
16280 GemmMicrokernelTester()
16281 .mr(6)
16282 .nr(8)
16283 .kr(1)
16284 .sr(1)
16285 .m(6)
16286 .n(8)
16287 .k(2)
16288 .cm_stride(11)
Frank Barchard91317c52019-11-22 10:54:35 -080016289 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld64);
XNNPACK Teamb455b122019-09-27 18:10:33 -070016290 }
Frank Barchard91317c52019-11-22 10:54:35 -080016291#endif // XNN_ARCH_ARM64
XNNPACK Teamb455b122019-09-27 18:10:33 -070016292
16293
Frank Barchard69172d92019-11-26 16:22:39 -080016294#if XNN_ARCH_ARM64
16295 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4) {
16296 TEST_REQUIRES_ARM_NEON_FMA;
16297 GemmMicrokernelTester()
16298 .mr(6)
16299 .nr(8)
16300 .kr(1)
16301 .sr(1)
16302 .m(6)
16303 .n(8)
16304 .k(4)
16305 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16306 }
16307
16308 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, strided_cn) {
16309 TEST_REQUIRES_ARM_NEON_FMA;
16310 GemmMicrokernelTester()
16311 .mr(6)
16312 .nr(8)
16313 .kr(1)
16314 .sr(1)
16315 .m(6)
16316 .n(8)
16317 .k(4)
16318 .cn_stride(11)
16319 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16320 }
16321
16322 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_strided_a) {
16323 TEST_REQUIRES_ARM_NEON_FMA;
16324 GemmMicrokernelTester()
16325 .mr(6)
16326 .nr(8)
16327 .kr(1)
16328 .sr(1)
16329 .m(6)
16330 .n(8)
16331 .k(4)
16332 .a_stride(7)
16333 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16334 }
16335
16336 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile) {
16337 TEST_REQUIRES_ARM_NEON_FMA;
16338 for (uint32_t m = 1; m <= 6; m++) {
16339 for (uint32_t n = 1; n <= 8; n++) {
16340 GemmMicrokernelTester()
16341 .mr(6)
16342 .nr(8)
16343 .kr(1)
16344 .sr(1)
16345 .m(m)
16346 .n(n)
16347 .k(4)
16348 .iterations(1)
16349 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16350 }
16351 }
16352 }
16353
16354 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_m) {
16355 TEST_REQUIRES_ARM_NEON_FMA;
16356 for (uint32_t m = 1; m <= 6; m++) {
16357 GemmMicrokernelTester()
16358 .mr(6)
16359 .nr(8)
16360 .kr(1)
16361 .sr(1)
16362 .m(m)
16363 .n(8)
16364 .k(4)
16365 .iterations(1)
16366 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16367 }
16368 }
16369
16370 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_eq_4_subtile_n) {
16371 TEST_REQUIRES_ARM_NEON_FMA;
16372 for (uint32_t n = 1; n <= 8; n++) {
16373 GemmMicrokernelTester()
16374 .mr(6)
16375 .nr(8)
16376 .kr(1)
16377 .sr(1)
16378 .m(6)
16379 .n(n)
16380 .k(4)
16381 .iterations(1)
16382 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16383 }
16384 }
16385
16386 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_lt_4) {
16387 TEST_REQUIRES_ARM_NEON_FMA;
16388 for (size_t k = 1; k < 4; k++) {
16389 GemmMicrokernelTester()
16390 .mr(6)
16391 .nr(8)
16392 .kr(1)
16393 .sr(1)
16394 .m(6)
16395 .n(8)
16396 .k(k)
16397 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16398 }
16399 }
16400
16401 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_lt_4_strided_a) {
16402 TEST_REQUIRES_ARM_NEON_FMA;
16403 for (size_t k = 1; k < 4; k++) {
16404 GemmMicrokernelTester()
16405 .mr(6)
16406 .nr(8)
16407 .kr(1)
16408 .sr(1)
16409 .m(6)
16410 .n(8)
16411 .k(k)
16412 .a_stride(7)
16413 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16414 }
16415 }
16416
16417 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_lt_4_subtile) {
16418 TEST_REQUIRES_ARM_NEON_FMA;
16419 for (size_t k = 1; k < 4; k++) {
16420 for (uint32_t m = 1; m <= 6; m++) {
16421 for (uint32_t n = 1; n <= 8; n++) {
16422 GemmMicrokernelTester()
16423 .mr(6)
16424 .nr(8)
16425 .kr(1)
16426 .sr(1)
16427 .m(m)
16428 .n(n)
16429 .k(k)
16430 .iterations(1)
16431 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16432 }
16433 }
16434 }
16435 }
16436
16437 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_gt_4) {
16438 TEST_REQUIRES_ARM_NEON_FMA;
16439 for (size_t k = 5; k < 8; k++) {
16440 GemmMicrokernelTester()
16441 .mr(6)
16442 .nr(8)
16443 .kr(1)
16444 .sr(1)
16445 .m(6)
16446 .n(8)
16447 .k(k)
16448 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16449 }
16450 }
16451
16452 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_gt_4_strided_a) {
16453 TEST_REQUIRES_ARM_NEON_FMA;
16454 for (size_t k = 5; k < 8; k++) {
16455 GemmMicrokernelTester()
16456 .mr(6)
16457 .nr(8)
16458 .kr(1)
16459 .sr(1)
16460 .m(6)
16461 .n(8)
16462 .k(k)
16463 .a_stride(11)
16464 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16465 }
16466 }
16467
16468 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_gt_4_subtile) {
16469 TEST_REQUIRES_ARM_NEON_FMA;
16470 for (size_t k = 5; k < 8; k++) {
16471 for (uint32_t m = 1; m <= 6; m++) {
16472 for (uint32_t n = 1; n <= 8; n++) {
16473 GemmMicrokernelTester()
16474 .mr(6)
16475 .nr(8)
16476 .kr(1)
16477 .sr(1)
16478 .m(m)
16479 .n(n)
16480 .k(k)
16481 .iterations(1)
16482 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16483 }
16484 }
16485 }
16486 }
16487
16488 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_div_4) {
16489 TEST_REQUIRES_ARM_NEON_FMA;
16490 for (size_t k = 8; k <= 40; k += 4) {
16491 GemmMicrokernelTester()
16492 .mr(6)
16493 .nr(8)
16494 .kr(1)
16495 .sr(1)
16496 .m(6)
16497 .n(8)
16498 .k(k)
16499 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16500 }
16501 }
16502
16503 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_div_4_strided_a) {
16504 TEST_REQUIRES_ARM_NEON_FMA;
16505 for (size_t k = 8; k <= 40; k += 4) {
16506 GemmMicrokernelTester()
16507 .mr(6)
16508 .nr(8)
16509 .kr(1)
16510 .sr(1)
16511 .m(6)
16512 .n(8)
16513 .k(k)
16514 .a_stride(43)
16515 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16516 }
16517 }
16518
16519 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, k_div_4_subtile) {
16520 TEST_REQUIRES_ARM_NEON_FMA;
16521 for (size_t k = 8; k <= 40; k += 4) {
16522 for (uint32_t m = 1; m <= 6; m++) {
16523 for (uint32_t n = 1; n <= 8; n++) {
16524 GemmMicrokernelTester()
16525 .mr(6)
16526 .nr(8)
16527 .kr(1)
16528 .sr(1)
16529 .m(m)
16530 .n(n)
16531 .k(k)
16532 .iterations(1)
16533 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16534 }
16535 }
16536 }
16537 }
16538
16539 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8) {
16540 TEST_REQUIRES_ARM_NEON_FMA;
16541 for (uint32_t n = 9; n < 16; n++) {
16542 for (size_t k = 1; k <= 20; k += 5) {
16543 GemmMicrokernelTester()
16544 .mr(6)
16545 .nr(8)
16546 .kr(1)
16547 .sr(1)
16548 .m(6)
16549 .n(8)
16550 .k(k)
16551 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16552 }
16553 }
16554 }
16555
16556 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_cn) {
16557 TEST_REQUIRES_ARM_NEON_FMA;
16558 for (uint32_t n = 9; n < 16; n++) {
16559 for (size_t k = 1; k <= 20; k += 5) {
16560 GemmMicrokernelTester()
16561 .mr(6)
16562 .nr(8)
16563 .kr(1)
16564 .sr(1)
16565 .m(6)
16566 .n(8)
16567 .k(k)
16568 .cn_stride(11)
16569 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16570 }
16571 }
16572 }
16573
16574 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_strided_a) {
16575 TEST_REQUIRES_ARM_NEON_FMA;
16576 for (uint32_t n = 9; n < 16; n++) {
16577 for (size_t k = 1; k <= 20; k += 5) {
16578 GemmMicrokernelTester()
16579 .mr(6)
16580 .nr(8)
16581 .kr(1)
16582 .sr(1)
16583 .m(6)
16584 .n(n)
16585 .k(k)
16586 .a_stride(23)
16587 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16588 }
16589 }
16590 }
16591
16592 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_gt_8_subtile) {
16593 TEST_REQUIRES_ARM_NEON_FMA;
16594 for (uint32_t n = 9; n < 16; n++) {
16595 for (size_t k = 1; k <= 20; k += 5) {
16596 for (uint32_t m = 1; m <= 6; m++) {
16597 GemmMicrokernelTester()
16598 .mr(6)
16599 .nr(8)
16600 .kr(1)
16601 .sr(1)
16602 .m(m)
16603 .n(n)
16604 .k(k)
16605 .iterations(1)
16606 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16607 }
16608 }
16609 }
16610 }
16611
16612 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8) {
16613 TEST_REQUIRES_ARM_NEON_FMA;
16614 for (uint32_t n = 16; n <= 24; n += 8) {
16615 for (size_t k = 1; k <= 20; k += 5) {
16616 GemmMicrokernelTester()
16617 .mr(6)
16618 .nr(8)
16619 .kr(1)
16620 .sr(1)
16621 .m(6)
16622 .n(8)
16623 .k(k)
16624 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16625 }
16626 }
16627 }
16628
16629 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8_strided_cn) {
16630 TEST_REQUIRES_ARM_NEON_FMA;
16631 for (uint32_t n = 16; n <= 24; n += 8) {
16632 for (size_t k = 1; k <= 20; k += 5) {
16633 GemmMicrokernelTester()
16634 .mr(6)
16635 .nr(8)
16636 .kr(1)
16637 .sr(1)
16638 .m(6)
16639 .n(n)
16640 .k(k)
16641 .cn_stride(11)
16642 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16643 }
16644 }
16645 }
16646
16647 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8_strided_a) {
16648 TEST_REQUIRES_ARM_NEON_FMA;
16649 for (uint32_t n = 16; n <= 24; n += 8) {
16650 for (size_t k = 1; k <= 20; k += 5) {
16651 GemmMicrokernelTester()
16652 .mr(6)
16653 .nr(8)
16654 .kr(1)
16655 .sr(1)
16656 .m(6)
16657 .n(n)
16658 .k(k)
16659 .a_stride(23)
16660 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16661 }
16662 }
16663 }
16664
16665 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, n_div_8_subtile) {
16666 TEST_REQUIRES_ARM_NEON_FMA;
16667 for (uint32_t n = 16; n <= 24; n += 8) {
16668 for (size_t k = 1; k <= 20; k += 5) {
16669 for (uint32_t m = 1; m <= 6; m++) {
16670 GemmMicrokernelTester()
16671 .mr(6)
16672 .nr(8)
16673 .kr(1)
16674 .sr(1)
16675 .m(m)
16676 .n(n)
16677 .k(k)
16678 .iterations(1)
16679 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16680 }
16681 }
16682 }
16683 }
16684
16685 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, strided_cm_subtile) {
16686 TEST_REQUIRES_ARM_NEON_FMA;
16687 for (size_t k = 1; k <= 20; k += 5) {
16688 for (uint32_t m = 1; m <= 6; m++) {
16689 for (uint32_t n = 1; n <= 8; n++) {
16690 GemmMicrokernelTester()
16691 .mr(6)
16692 .nr(8)
16693 .kr(1)
16694 .sr(1)
16695 .m(m)
16696 .n(n)
16697 .k(k)
16698 .cm_stride(11)
16699 .iterations(1)
16700 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16701 }
16702 }
16703 }
16704 }
16705
16706 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, qmin) {
16707 TEST_REQUIRES_ARM_NEON_FMA;
16708 GemmMicrokernelTester()
16709 .mr(6)
16710 .nr(8)
16711 .kr(1)
16712 .sr(1)
16713 .m(6)
16714 .n(8)
16715 .k(4)
16716 .qmin(128)
16717 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16718 }
16719
16720 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, qmax) {
16721 TEST_REQUIRES_ARM_NEON_FMA;
16722 GemmMicrokernelTester()
16723 .mr(6)
16724 .nr(8)
16725 .kr(1)
16726 .sr(1)
16727 .m(6)
16728 .n(8)
16729 .k(4)
16730 .qmax(128)
16731 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16732 }
16733
16734 TEST(F32_GEMM_6X8__NEONFMA_LANE_LD128, strided_cm) {
16735 TEST_REQUIRES_ARM_NEON_FMA;
16736 GemmMicrokernelTester()
16737 .mr(6)
16738 .nr(8)
16739 .kr(1)
16740 .sr(1)
16741 .m(6)
16742 .n(8)
16743 .k(4)
16744 .cm_stride(11)
16745 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_lane_ld128);
16746 }
16747#endif // XNN_ARCH_ARM64
16748
16749
Frank Barcharddf06d802019-11-20 15:53:46 -080016750#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080016751 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2) {
16752 TEST_REQUIRES_ARM_NEON;
16753 GemmMicrokernelTester()
16754 .mr(1)
16755 .nr(8)
16756 .kr(1)
16757 .sr(1)
16758 .m(1)
16759 .n(8)
16760 .k(2)
16761 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16762 }
16763
16764 TEST(F32_GEMM_1X8__NEON_DUP_LD64, strided_cn) {
16765 TEST_REQUIRES_ARM_NEON;
16766 GemmMicrokernelTester()
16767 .mr(1)
16768 .nr(8)
16769 .kr(1)
16770 .sr(1)
16771 .m(1)
16772 .n(8)
16773 .k(2)
16774 .cn_stride(11)
16775 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16776 }
16777
16778 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_strided_a) {
16779 TEST_REQUIRES_ARM_NEON;
16780 GemmMicrokernelTester()
16781 .mr(1)
16782 .nr(8)
16783 .kr(1)
16784 .sr(1)
16785 .m(1)
16786 .n(8)
16787 .k(2)
16788 .a_stride(5)
16789 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16790 }
16791
16792 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile) {
16793 TEST_REQUIRES_ARM_NEON;
16794 for (uint32_t m = 1; m <= 1; m++) {
16795 for (uint32_t n = 1; n <= 8; n++) {
16796 GemmMicrokernelTester()
16797 .mr(1)
16798 .nr(8)
16799 .kr(1)
16800 .sr(1)
16801 .m(m)
16802 .n(n)
16803 .k(2)
16804 .iterations(1)
16805 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16806 }
16807 }
16808 }
16809
16810 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
16811 TEST_REQUIRES_ARM_NEON;
16812 for (uint32_t m = 1; m <= 1; m++) {
16813 GemmMicrokernelTester()
16814 .mr(1)
16815 .nr(8)
16816 .kr(1)
16817 .sr(1)
16818 .m(m)
16819 .n(8)
16820 .k(2)
16821 .iterations(1)
16822 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16823 }
16824 }
16825
16826 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
16827 TEST_REQUIRES_ARM_NEON;
16828 for (uint32_t n = 1; n <= 8; n++) {
16829 GemmMicrokernelTester()
16830 .mr(1)
16831 .nr(8)
16832 .kr(1)
16833 .sr(1)
16834 .m(1)
16835 .n(n)
16836 .k(2)
16837 .iterations(1)
16838 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16839 }
16840 }
16841
16842 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_lt_2) {
16843 TEST_REQUIRES_ARM_NEON;
16844 for (size_t k = 1; k < 2; k++) {
16845 GemmMicrokernelTester()
16846 .mr(1)
16847 .nr(8)
16848 .kr(1)
16849 .sr(1)
16850 .m(1)
16851 .n(8)
16852 .k(k)
16853 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16854 }
16855 }
16856
16857 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_lt_2_strided_a) {
16858 TEST_REQUIRES_ARM_NEON;
16859 for (size_t k = 1; k < 2; k++) {
16860 GemmMicrokernelTester()
16861 .mr(1)
16862 .nr(8)
16863 .kr(1)
16864 .sr(1)
16865 .m(1)
16866 .n(8)
16867 .k(k)
16868 .a_stride(5)
16869 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16870 }
16871 }
16872
16873 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_lt_2_subtile) {
16874 TEST_REQUIRES_ARM_NEON;
16875 for (size_t k = 1; k < 2; k++) {
16876 for (uint32_t m = 1; m <= 1; m++) {
16877 for (uint32_t n = 1; n <= 8; n++) {
16878 GemmMicrokernelTester()
16879 .mr(1)
16880 .nr(8)
16881 .kr(1)
16882 .sr(1)
16883 .m(m)
16884 .n(n)
16885 .k(k)
16886 .iterations(1)
16887 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16888 }
16889 }
16890 }
16891 }
16892
16893 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_gt_2) {
16894 TEST_REQUIRES_ARM_NEON;
16895 for (size_t k = 3; k < 4; k++) {
16896 GemmMicrokernelTester()
16897 .mr(1)
16898 .nr(8)
16899 .kr(1)
16900 .sr(1)
16901 .m(1)
16902 .n(8)
16903 .k(k)
16904 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16905 }
16906 }
16907
16908 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_gt_2_strided_a) {
16909 TEST_REQUIRES_ARM_NEON;
16910 for (size_t k = 3; k < 4; k++) {
16911 GemmMicrokernelTester()
16912 .mr(1)
16913 .nr(8)
16914 .kr(1)
16915 .sr(1)
16916 .m(1)
16917 .n(8)
16918 .k(k)
16919 .a_stride(7)
16920 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16921 }
16922 }
16923
16924 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_gt_2_subtile) {
16925 TEST_REQUIRES_ARM_NEON;
16926 for (size_t k = 3; k < 4; k++) {
16927 for (uint32_t m = 1; m <= 1; m++) {
16928 for (uint32_t n = 1; n <= 8; n++) {
16929 GemmMicrokernelTester()
16930 .mr(1)
16931 .nr(8)
16932 .kr(1)
16933 .sr(1)
16934 .m(m)
16935 .n(n)
16936 .k(k)
16937 .iterations(1)
16938 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16939 }
16940 }
16941 }
16942 }
16943
16944 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_div_2) {
16945 TEST_REQUIRES_ARM_NEON;
16946 for (size_t k = 4; k <= 20; k += 2) {
16947 GemmMicrokernelTester()
16948 .mr(1)
16949 .nr(8)
16950 .kr(1)
16951 .sr(1)
16952 .m(1)
16953 .n(8)
16954 .k(k)
16955 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16956 }
16957 }
16958
16959 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_div_2_strided_a) {
16960 TEST_REQUIRES_ARM_NEON;
16961 for (size_t k = 4; k <= 20; k += 2) {
16962 GemmMicrokernelTester()
16963 .mr(1)
16964 .nr(8)
16965 .kr(1)
16966 .sr(1)
16967 .m(1)
16968 .n(8)
16969 .k(k)
16970 .a_stride(23)
16971 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16972 }
16973 }
16974
16975 TEST(F32_GEMM_1X8__NEON_DUP_LD64, k_div_2_subtile) {
16976 TEST_REQUIRES_ARM_NEON;
16977 for (size_t k = 4; k <= 20; k += 2) {
16978 for (uint32_t m = 1; m <= 1; m++) {
16979 for (uint32_t n = 1; n <= 8; n++) {
16980 GemmMicrokernelTester()
16981 .mr(1)
16982 .nr(8)
16983 .kr(1)
16984 .sr(1)
16985 .m(m)
16986 .n(n)
16987 .k(k)
16988 .iterations(1)
16989 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
16990 }
16991 }
16992 }
16993 }
16994
16995 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8) {
16996 TEST_REQUIRES_ARM_NEON;
16997 for (uint32_t n = 9; n < 16; n++) {
16998 for (size_t k = 1; k <= 10; k += 3) {
16999 GemmMicrokernelTester()
17000 .mr(1)
17001 .nr(8)
17002 .kr(1)
17003 .sr(1)
17004 .m(1)
17005 .n(8)
17006 .k(k)
17007 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17008 }
17009 }
17010 }
17011
17012 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
17013 TEST_REQUIRES_ARM_NEON;
17014 for (uint32_t n = 9; n < 16; n++) {
17015 for (size_t k = 1; k <= 10; k += 3) {
17016 GemmMicrokernelTester()
17017 .mr(1)
17018 .nr(8)
17019 .kr(1)
17020 .sr(1)
17021 .m(1)
17022 .n(8)
17023 .k(k)
17024 .cn_stride(11)
17025 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17026 }
17027 }
17028 }
17029
17030 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8_strided_a) {
17031 TEST_REQUIRES_ARM_NEON;
17032 for (uint32_t n = 9; n < 16; n++) {
17033 for (size_t k = 1; k <= 10; k += 3) {
17034 GemmMicrokernelTester()
17035 .mr(1)
17036 .nr(8)
17037 .kr(1)
17038 .sr(1)
17039 .m(1)
17040 .n(n)
17041 .k(k)
17042 .a_stride(13)
17043 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17044 }
17045 }
17046 }
17047
17048 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_gt_8_subtile) {
17049 TEST_REQUIRES_ARM_NEON;
17050 for (uint32_t n = 9; n < 16; n++) {
17051 for (size_t k = 1; k <= 10; k += 3) {
17052 for (uint32_t m = 1; m <= 1; m++) {
17053 GemmMicrokernelTester()
17054 .mr(1)
17055 .nr(8)
17056 .kr(1)
17057 .sr(1)
17058 .m(m)
17059 .n(n)
17060 .k(k)
17061 .iterations(1)
17062 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17063 }
17064 }
17065 }
17066 }
17067
17068 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8) {
17069 TEST_REQUIRES_ARM_NEON;
17070 for (uint32_t n = 16; n <= 24; n += 8) {
17071 for (size_t k = 1; k <= 10; k += 3) {
17072 GemmMicrokernelTester()
17073 .mr(1)
17074 .nr(8)
17075 .kr(1)
17076 .sr(1)
17077 .m(1)
17078 .n(8)
17079 .k(k)
17080 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17081 }
17082 }
17083 }
17084
17085 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8_strided_cn) {
17086 TEST_REQUIRES_ARM_NEON;
17087 for (uint32_t n = 16; n <= 24; n += 8) {
17088 for (size_t k = 1; k <= 10; k += 3) {
17089 GemmMicrokernelTester()
17090 .mr(1)
17091 .nr(8)
17092 .kr(1)
17093 .sr(1)
17094 .m(1)
17095 .n(n)
17096 .k(k)
17097 .cn_stride(11)
17098 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17099 }
17100 }
17101 }
17102
17103 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8_strided_a) {
17104 TEST_REQUIRES_ARM_NEON;
17105 for (uint32_t n = 16; n <= 24; n += 8) {
17106 for (size_t k = 1; k <= 10; k += 3) {
17107 GemmMicrokernelTester()
17108 .mr(1)
17109 .nr(8)
17110 .kr(1)
17111 .sr(1)
17112 .m(1)
17113 .n(n)
17114 .k(k)
17115 .a_stride(13)
17116 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17117 }
17118 }
17119 }
17120
17121 TEST(F32_GEMM_1X8__NEON_DUP_LD64, n_div_8_subtile) {
17122 TEST_REQUIRES_ARM_NEON;
17123 for (uint32_t n = 16; n <= 24; n += 8) {
17124 for (size_t k = 1; k <= 10; k += 3) {
17125 for (uint32_t m = 1; m <= 1; m++) {
17126 GemmMicrokernelTester()
17127 .mr(1)
17128 .nr(8)
17129 .kr(1)
17130 .sr(1)
17131 .m(m)
17132 .n(n)
17133 .k(k)
17134 .iterations(1)
17135 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17136 }
17137 }
17138 }
17139 }
17140
17141 TEST(F32_GEMM_1X8__NEON_DUP_LD64, strided_cm_subtile) {
17142 TEST_REQUIRES_ARM_NEON;
17143 for (size_t k = 1; k <= 10; k += 3) {
17144 for (uint32_t m = 1; m <= 1; m++) {
17145 for (uint32_t n = 1; n <= 8; n++) {
17146 GemmMicrokernelTester()
17147 .mr(1)
17148 .nr(8)
17149 .kr(1)
17150 .sr(1)
17151 .m(m)
17152 .n(n)
17153 .k(k)
17154 .cm_stride(11)
17155 .iterations(1)
17156 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17157 }
17158 }
17159 }
17160 }
17161
17162 TEST(F32_GEMM_1X8__NEON_DUP_LD64, qmin) {
17163 TEST_REQUIRES_ARM_NEON;
17164 GemmMicrokernelTester()
17165 .mr(1)
17166 .nr(8)
17167 .kr(1)
17168 .sr(1)
17169 .m(1)
17170 .n(8)
17171 .k(2)
17172 .qmin(128)
17173 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17174 }
17175
17176 TEST(F32_GEMM_1X8__NEON_DUP_LD64, qmax) {
17177 TEST_REQUIRES_ARM_NEON;
17178 GemmMicrokernelTester()
17179 .mr(1)
17180 .nr(8)
17181 .kr(1)
17182 .sr(1)
17183 .m(1)
17184 .n(8)
17185 .k(2)
17186 .qmax(128)
17187 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17188 }
17189
17190 TEST(F32_GEMM_1X8__NEON_DUP_LD64, strided_cm) {
17191 TEST_REQUIRES_ARM_NEON;
17192 GemmMicrokernelTester()
17193 .mr(1)
17194 .nr(8)
17195 .kr(1)
17196 .sr(1)
17197 .m(1)
17198 .n(8)
17199 .k(2)
17200 .cm_stride(11)
17201 .Test(xnn_f32_gemm_ukernel_1x8__neon_dup_ld64);
17202 }
17203#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17204
17205
17206#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17207 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2) {
17208 TEST_REQUIRES_ARM_NEON;
17209 GemmMicrokernelTester()
17210 .mr(4)
17211 .nr(8)
17212 .kr(1)
17213 .sr(1)
17214 .m(4)
17215 .n(8)
17216 .k(2)
17217 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17218 }
17219
17220 TEST(F32_GEMM_4X8__NEON_DUP_LD64, strided_cn) {
17221 TEST_REQUIRES_ARM_NEON;
17222 GemmMicrokernelTester()
17223 .mr(4)
17224 .nr(8)
17225 .kr(1)
17226 .sr(1)
17227 .m(4)
17228 .n(8)
17229 .k(2)
17230 .cn_stride(11)
17231 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17232 }
17233
17234 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_strided_a) {
17235 TEST_REQUIRES_ARM_NEON;
17236 GemmMicrokernelTester()
17237 .mr(4)
17238 .nr(8)
17239 .kr(1)
17240 .sr(1)
17241 .m(4)
17242 .n(8)
17243 .k(2)
17244 .a_stride(5)
17245 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17246 }
17247
17248 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile) {
17249 TEST_REQUIRES_ARM_NEON;
17250 for (uint32_t m = 1; m <= 4; m++) {
17251 for (uint32_t n = 1; n <= 8; n++) {
17252 GemmMicrokernelTester()
17253 .mr(4)
17254 .nr(8)
17255 .kr(1)
17256 .sr(1)
17257 .m(m)
17258 .n(n)
17259 .k(2)
17260 .iterations(1)
17261 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17262 }
17263 }
17264 }
17265
17266 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
17267 TEST_REQUIRES_ARM_NEON;
17268 for (uint32_t m = 1; m <= 4; m++) {
17269 GemmMicrokernelTester()
17270 .mr(4)
17271 .nr(8)
17272 .kr(1)
17273 .sr(1)
17274 .m(m)
17275 .n(8)
17276 .k(2)
17277 .iterations(1)
17278 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17279 }
17280 }
17281
17282 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
17283 TEST_REQUIRES_ARM_NEON;
17284 for (uint32_t n = 1; n <= 8; n++) {
17285 GemmMicrokernelTester()
17286 .mr(4)
17287 .nr(8)
17288 .kr(1)
17289 .sr(1)
17290 .m(4)
17291 .n(n)
17292 .k(2)
17293 .iterations(1)
17294 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17295 }
17296 }
17297
17298 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_lt_2) {
17299 TEST_REQUIRES_ARM_NEON;
17300 for (size_t k = 1; k < 2; k++) {
17301 GemmMicrokernelTester()
17302 .mr(4)
17303 .nr(8)
17304 .kr(1)
17305 .sr(1)
17306 .m(4)
17307 .n(8)
17308 .k(k)
17309 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17310 }
17311 }
17312
17313 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_lt_2_strided_a) {
17314 TEST_REQUIRES_ARM_NEON;
17315 for (size_t k = 1; k < 2; k++) {
17316 GemmMicrokernelTester()
17317 .mr(4)
17318 .nr(8)
17319 .kr(1)
17320 .sr(1)
17321 .m(4)
17322 .n(8)
17323 .k(k)
17324 .a_stride(5)
17325 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17326 }
17327 }
17328
17329 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_lt_2_subtile) {
17330 TEST_REQUIRES_ARM_NEON;
17331 for (size_t k = 1; k < 2; k++) {
17332 for (uint32_t m = 1; m <= 4; m++) {
17333 for (uint32_t n = 1; n <= 8; n++) {
17334 GemmMicrokernelTester()
17335 .mr(4)
17336 .nr(8)
17337 .kr(1)
17338 .sr(1)
17339 .m(m)
17340 .n(n)
17341 .k(k)
17342 .iterations(1)
17343 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17344 }
17345 }
17346 }
17347 }
17348
17349 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_gt_2) {
17350 TEST_REQUIRES_ARM_NEON;
17351 for (size_t k = 3; k < 4; k++) {
17352 GemmMicrokernelTester()
17353 .mr(4)
17354 .nr(8)
17355 .kr(1)
17356 .sr(1)
17357 .m(4)
17358 .n(8)
17359 .k(k)
17360 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17361 }
17362 }
17363
17364 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_gt_2_strided_a) {
17365 TEST_REQUIRES_ARM_NEON;
17366 for (size_t k = 3; k < 4; k++) {
17367 GemmMicrokernelTester()
17368 .mr(4)
17369 .nr(8)
17370 .kr(1)
17371 .sr(1)
17372 .m(4)
17373 .n(8)
17374 .k(k)
17375 .a_stride(7)
17376 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17377 }
17378 }
17379
17380 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_gt_2_subtile) {
17381 TEST_REQUIRES_ARM_NEON;
17382 for (size_t k = 3; k < 4; k++) {
17383 for (uint32_t m = 1; m <= 4; m++) {
17384 for (uint32_t n = 1; n <= 8; n++) {
17385 GemmMicrokernelTester()
17386 .mr(4)
17387 .nr(8)
17388 .kr(1)
17389 .sr(1)
17390 .m(m)
17391 .n(n)
17392 .k(k)
17393 .iterations(1)
17394 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17395 }
17396 }
17397 }
17398 }
17399
17400 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_div_2) {
17401 TEST_REQUIRES_ARM_NEON;
17402 for (size_t k = 4; k <= 20; k += 2) {
17403 GemmMicrokernelTester()
17404 .mr(4)
17405 .nr(8)
17406 .kr(1)
17407 .sr(1)
17408 .m(4)
17409 .n(8)
17410 .k(k)
17411 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17412 }
17413 }
17414
17415 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_div_2_strided_a) {
17416 TEST_REQUIRES_ARM_NEON;
17417 for (size_t k = 4; k <= 20; k += 2) {
17418 GemmMicrokernelTester()
17419 .mr(4)
17420 .nr(8)
17421 .kr(1)
17422 .sr(1)
17423 .m(4)
17424 .n(8)
17425 .k(k)
17426 .a_stride(23)
17427 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17428 }
17429 }
17430
17431 TEST(F32_GEMM_4X8__NEON_DUP_LD64, k_div_2_subtile) {
17432 TEST_REQUIRES_ARM_NEON;
17433 for (size_t k = 4; k <= 20; k += 2) {
17434 for (uint32_t m = 1; m <= 4; m++) {
17435 for (uint32_t n = 1; n <= 8; n++) {
17436 GemmMicrokernelTester()
17437 .mr(4)
17438 .nr(8)
17439 .kr(1)
17440 .sr(1)
17441 .m(m)
17442 .n(n)
17443 .k(k)
17444 .iterations(1)
17445 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17446 }
17447 }
17448 }
17449 }
17450
17451 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8) {
17452 TEST_REQUIRES_ARM_NEON;
17453 for (uint32_t n = 9; n < 16; n++) {
17454 for (size_t k = 1; k <= 10; k += 3) {
17455 GemmMicrokernelTester()
17456 .mr(4)
17457 .nr(8)
17458 .kr(1)
17459 .sr(1)
17460 .m(4)
17461 .n(8)
17462 .k(k)
17463 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17464 }
17465 }
17466 }
17467
17468 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
17469 TEST_REQUIRES_ARM_NEON;
17470 for (uint32_t n = 9; n < 16; n++) {
17471 for (size_t k = 1; k <= 10; k += 3) {
17472 GemmMicrokernelTester()
17473 .mr(4)
17474 .nr(8)
17475 .kr(1)
17476 .sr(1)
17477 .m(4)
17478 .n(8)
17479 .k(k)
17480 .cn_stride(11)
17481 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17482 }
17483 }
17484 }
17485
17486 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8_strided_a) {
17487 TEST_REQUIRES_ARM_NEON;
17488 for (uint32_t n = 9; n < 16; n++) {
17489 for (size_t k = 1; k <= 10; k += 3) {
17490 GemmMicrokernelTester()
17491 .mr(4)
17492 .nr(8)
17493 .kr(1)
17494 .sr(1)
17495 .m(4)
17496 .n(n)
17497 .k(k)
17498 .a_stride(13)
17499 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17500 }
17501 }
17502 }
17503
17504 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_gt_8_subtile) {
17505 TEST_REQUIRES_ARM_NEON;
17506 for (uint32_t n = 9; n < 16; n++) {
17507 for (size_t k = 1; k <= 10; k += 3) {
17508 for (uint32_t m = 1; m <= 4; m++) {
17509 GemmMicrokernelTester()
17510 .mr(4)
17511 .nr(8)
17512 .kr(1)
17513 .sr(1)
17514 .m(m)
17515 .n(n)
17516 .k(k)
17517 .iterations(1)
17518 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17519 }
17520 }
17521 }
17522 }
17523
17524 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8) {
17525 TEST_REQUIRES_ARM_NEON;
17526 for (uint32_t n = 16; n <= 24; n += 8) {
17527 for (size_t k = 1; k <= 10; k += 3) {
17528 GemmMicrokernelTester()
17529 .mr(4)
17530 .nr(8)
17531 .kr(1)
17532 .sr(1)
17533 .m(4)
17534 .n(8)
17535 .k(k)
17536 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17537 }
17538 }
17539 }
17540
17541 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8_strided_cn) {
17542 TEST_REQUIRES_ARM_NEON;
17543 for (uint32_t n = 16; n <= 24; n += 8) {
17544 for (size_t k = 1; k <= 10; k += 3) {
17545 GemmMicrokernelTester()
17546 .mr(4)
17547 .nr(8)
17548 .kr(1)
17549 .sr(1)
17550 .m(4)
17551 .n(n)
17552 .k(k)
17553 .cn_stride(11)
17554 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17555 }
17556 }
17557 }
17558
17559 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8_strided_a) {
17560 TEST_REQUIRES_ARM_NEON;
17561 for (uint32_t n = 16; n <= 24; n += 8) {
17562 for (size_t k = 1; k <= 10; k += 3) {
17563 GemmMicrokernelTester()
17564 .mr(4)
17565 .nr(8)
17566 .kr(1)
17567 .sr(1)
17568 .m(4)
17569 .n(n)
17570 .k(k)
17571 .a_stride(13)
17572 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17573 }
17574 }
17575 }
17576
17577 TEST(F32_GEMM_4X8__NEON_DUP_LD64, n_div_8_subtile) {
17578 TEST_REQUIRES_ARM_NEON;
17579 for (uint32_t n = 16; n <= 24; n += 8) {
17580 for (size_t k = 1; k <= 10; k += 3) {
17581 for (uint32_t m = 1; m <= 4; m++) {
17582 GemmMicrokernelTester()
17583 .mr(4)
17584 .nr(8)
17585 .kr(1)
17586 .sr(1)
17587 .m(m)
17588 .n(n)
17589 .k(k)
17590 .iterations(1)
17591 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17592 }
17593 }
17594 }
17595 }
17596
17597 TEST(F32_GEMM_4X8__NEON_DUP_LD64, strided_cm_subtile) {
17598 TEST_REQUIRES_ARM_NEON;
17599 for (size_t k = 1; k <= 10; k += 3) {
17600 for (uint32_t m = 1; m <= 4; m++) {
17601 for (uint32_t n = 1; n <= 8; n++) {
17602 GemmMicrokernelTester()
17603 .mr(4)
17604 .nr(8)
17605 .kr(1)
17606 .sr(1)
17607 .m(m)
17608 .n(n)
17609 .k(k)
17610 .cm_stride(11)
17611 .iterations(1)
17612 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17613 }
17614 }
17615 }
17616 }
17617
17618 TEST(F32_GEMM_4X8__NEON_DUP_LD64, qmin) {
17619 TEST_REQUIRES_ARM_NEON;
17620 GemmMicrokernelTester()
17621 .mr(4)
17622 .nr(8)
17623 .kr(1)
17624 .sr(1)
17625 .m(4)
17626 .n(8)
17627 .k(2)
17628 .qmin(128)
17629 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17630 }
17631
17632 TEST(F32_GEMM_4X8__NEON_DUP_LD64, qmax) {
17633 TEST_REQUIRES_ARM_NEON;
17634 GemmMicrokernelTester()
17635 .mr(4)
17636 .nr(8)
17637 .kr(1)
17638 .sr(1)
17639 .m(4)
17640 .n(8)
17641 .k(2)
17642 .qmax(128)
17643 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17644 }
17645
17646 TEST(F32_GEMM_4X8__NEON_DUP_LD64, strided_cm) {
17647 TEST_REQUIRES_ARM_NEON;
17648 GemmMicrokernelTester()
17649 .mr(4)
17650 .nr(8)
17651 .kr(1)
17652 .sr(1)
17653 .m(4)
17654 .n(8)
17655 .k(2)
17656 .cm_stride(11)
17657 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld64);
17658 }
17659#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
17660
17661
17662#if XNN_ARCH_ARM || XNN_ARCH_ARM64
17663 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4) {
17664 TEST_REQUIRES_ARM_NEON;
17665 GemmMicrokernelTester()
17666 .mr(4)
17667 .nr(8)
17668 .kr(1)
17669 .sr(1)
17670 .m(4)
17671 .n(8)
17672 .k(4)
17673 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17674 }
17675
17676 TEST(F32_GEMM_4X8__NEON_DUP_LD128, strided_cn) {
17677 TEST_REQUIRES_ARM_NEON;
17678 GemmMicrokernelTester()
17679 .mr(4)
17680 .nr(8)
17681 .kr(1)
17682 .sr(1)
17683 .m(4)
17684 .n(8)
17685 .k(4)
17686 .cn_stride(11)
17687 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17688 }
17689
17690 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_strided_a) {
17691 TEST_REQUIRES_ARM_NEON;
17692 GemmMicrokernelTester()
17693 .mr(4)
17694 .nr(8)
17695 .kr(1)
17696 .sr(1)
17697 .m(4)
17698 .n(8)
17699 .k(4)
17700 .a_stride(7)
17701 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17702 }
17703
17704 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile) {
17705 TEST_REQUIRES_ARM_NEON;
17706 for (uint32_t m = 1; m <= 4; m++) {
17707 for (uint32_t n = 1; n <= 8; n++) {
17708 GemmMicrokernelTester()
17709 .mr(4)
17710 .nr(8)
17711 .kr(1)
17712 .sr(1)
17713 .m(m)
17714 .n(n)
17715 .k(4)
17716 .iterations(1)
17717 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17718 }
17719 }
17720 }
17721
17722 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
17723 TEST_REQUIRES_ARM_NEON;
17724 for (uint32_t m = 1; m <= 4; m++) {
17725 GemmMicrokernelTester()
17726 .mr(4)
17727 .nr(8)
17728 .kr(1)
17729 .sr(1)
17730 .m(m)
17731 .n(8)
17732 .k(4)
17733 .iterations(1)
17734 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17735 }
17736 }
17737
17738 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
17739 TEST_REQUIRES_ARM_NEON;
17740 for (uint32_t n = 1; n <= 8; n++) {
17741 GemmMicrokernelTester()
17742 .mr(4)
17743 .nr(8)
17744 .kr(1)
17745 .sr(1)
17746 .m(4)
17747 .n(n)
17748 .k(4)
17749 .iterations(1)
17750 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17751 }
17752 }
17753
17754 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_lt_4) {
17755 TEST_REQUIRES_ARM_NEON;
17756 for (size_t k = 1; k < 4; k++) {
17757 GemmMicrokernelTester()
17758 .mr(4)
17759 .nr(8)
17760 .kr(1)
17761 .sr(1)
17762 .m(4)
17763 .n(8)
17764 .k(k)
17765 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17766 }
17767 }
17768
17769 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_lt_4_strided_a) {
17770 TEST_REQUIRES_ARM_NEON;
17771 for (size_t k = 1; k < 4; k++) {
17772 GemmMicrokernelTester()
17773 .mr(4)
17774 .nr(8)
17775 .kr(1)
17776 .sr(1)
17777 .m(4)
17778 .n(8)
17779 .k(k)
17780 .a_stride(7)
17781 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17782 }
17783 }
17784
17785 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_lt_4_subtile) {
17786 TEST_REQUIRES_ARM_NEON;
17787 for (size_t k = 1; k < 4; k++) {
17788 for (uint32_t m = 1; m <= 4; m++) {
17789 for (uint32_t n = 1; n <= 8; n++) {
17790 GemmMicrokernelTester()
17791 .mr(4)
17792 .nr(8)
17793 .kr(1)
17794 .sr(1)
17795 .m(m)
17796 .n(n)
17797 .k(k)
17798 .iterations(1)
17799 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17800 }
17801 }
17802 }
17803 }
17804
17805 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_gt_4) {
17806 TEST_REQUIRES_ARM_NEON;
17807 for (size_t k = 5; k < 8; k++) {
17808 GemmMicrokernelTester()
17809 .mr(4)
17810 .nr(8)
17811 .kr(1)
17812 .sr(1)
17813 .m(4)
17814 .n(8)
17815 .k(k)
17816 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17817 }
17818 }
17819
17820 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_gt_4_strided_a) {
17821 TEST_REQUIRES_ARM_NEON;
17822 for (size_t k = 5; k < 8; k++) {
17823 GemmMicrokernelTester()
17824 .mr(4)
17825 .nr(8)
17826 .kr(1)
17827 .sr(1)
17828 .m(4)
17829 .n(8)
17830 .k(k)
17831 .a_stride(11)
17832 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17833 }
17834 }
17835
17836 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_gt_4_subtile) {
17837 TEST_REQUIRES_ARM_NEON;
17838 for (size_t k = 5; k < 8; k++) {
17839 for (uint32_t m = 1; m <= 4; m++) {
17840 for (uint32_t n = 1; n <= 8; n++) {
17841 GemmMicrokernelTester()
17842 .mr(4)
17843 .nr(8)
17844 .kr(1)
17845 .sr(1)
17846 .m(m)
17847 .n(n)
17848 .k(k)
17849 .iterations(1)
17850 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17851 }
17852 }
17853 }
17854 }
17855
17856 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_div_4) {
17857 TEST_REQUIRES_ARM_NEON;
17858 for (size_t k = 8; k <= 40; k += 4) {
17859 GemmMicrokernelTester()
17860 .mr(4)
17861 .nr(8)
17862 .kr(1)
17863 .sr(1)
17864 .m(4)
17865 .n(8)
17866 .k(k)
17867 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17868 }
17869 }
17870
17871 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_div_4_strided_a) {
17872 TEST_REQUIRES_ARM_NEON;
17873 for (size_t k = 8; k <= 40; k += 4) {
17874 GemmMicrokernelTester()
17875 .mr(4)
17876 .nr(8)
17877 .kr(1)
17878 .sr(1)
17879 .m(4)
17880 .n(8)
17881 .k(k)
17882 .a_stride(43)
17883 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17884 }
17885 }
17886
17887 TEST(F32_GEMM_4X8__NEON_DUP_LD128, k_div_4_subtile) {
17888 TEST_REQUIRES_ARM_NEON;
17889 for (size_t k = 8; k <= 40; k += 4) {
17890 for (uint32_t m = 1; m <= 4; m++) {
17891 for (uint32_t n = 1; n <= 8; n++) {
17892 GemmMicrokernelTester()
17893 .mr(4)
17894 .nr(8)
17895 .kr(1)
17896 .sr(1)
17897 .m(m)
17898 .n(n)
17899 .k(k)
17900 .iterations(1)
17901 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17902 }
17903 }
17904 }
17905 }
17906
17907 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8) {
17908 TEST_REQUIRES_ARM_NEON;
17909 for (uint32_t n = 9; n < 16; n++) {
17910 for (size_t k = 1; k <= 20; k += 5) {
17911 GemmMicrokernelTester()
17912 .mr(4)
17913 .nr(8)
17914 .kr(1)
17915 .sr(1)
17916 .m(4)
17917 .n(8)
17918 .k(k)
17919 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17920 }
17921 }
17922 }
17923
17924 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
17925 TEST_REQUIRES_ARM_NEON;
17926 for (uint32_t n = 9; n < 16; n++) {
17927 for (size_t k = 1; k <= 20; k += 5) {
17928 GemmMicrokernelTester()
17929 .mr(4)
17930 .nr(8)
17931 .kr(1)
17932 .sr(1)
17933 .m(4)
17934 .n(8)
17935 .k(k)
17936 .cn_stride(11)
17937 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17938 }
17939 }
17940 }
17941
17942 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8_strided_a) {
17943 TEST_REQUIRES_ARM_NEON;
17944 for (uint32_t n = 9; n < 16; n++) {
17945 for (size_t k = 1; k <= 20; k += 5) {
17946 GemmMicrokernelTester()
17947 .mr(4)
17948 .nr(8)
17949 .kr(1)
17950 .sr(1)
17951 .m(4)
17952 .n(n)
17953 .k(k)
17954 .a_stride(23)
17955 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17956 }
17957 }
17958 }
17959
17960 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_gt_8_subtile) {
17961 TEST_REQUIRES_ARM_NEON;
17962 for (uint32_t n = 9; n < 16; n++) {
17963 for (size_t k = 1; k <= 20; k += 5) {
17964 for (uint32_t m = 1; m <= 4; m++) {
17965 GemmMicrokernelTester()
17966 .mr(4)
17967 .nr(8)
17968 .kr(1)
17969 .sr(1)
17970 .m(m)
17971 .n(n)
17972 .k(k)
17973 .iterations(1)
17974 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17975 }
17976 }
17977 }
17978 }
17979
17980 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8) {
17981 TEST_REQUIRES_ARM_NEON;
17982 for (uint32_t n = 16; n <= 24; n += 8) {
17983 for (size_t k = 1; k <= 20; k += 5) {
17984 GemmMicrokernelTester()
17985 .mr(4)
17986 .nr(8)
17987 .kr(1)
17988 .sr(1)
17989 .m(4)
17990 .n(8)
17991 .k(k)
17992 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
17993 }
17994 }
17995 }
17996
17997 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8_strided_cn) {
17998 TEST_REQUIRES_ARM_NEON;
17999 for (uint32_t n = 16; n <= 24; n += 8) {
18000 for (size_t k = 1; k <= 20; k += 5) {
18001 GemmMicrokernelTester()
18002 .mr(4)
18003 .nr(8)
18004 .kr(1)
18005 .sr(1)
18006 .m(4)
18007 .n(n)
18008 .k(k)
18009 .cn_stride(11)
18010 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18011 }
18012 }
18013 }
18014
18015 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8_strided_a) {
18016 TEST_REQUIRES_ARM_NEON;
18017 for (uint32_t n = 16; n <= 24; n += 8) {
18018 for (size_t k = 1; k <= 20; k += 5) {
18019 GemmMicrokernelTester()
18020 .mr(4)
18021 .nr(8)
18022 .kr(1)
18023 .sr(1)
18024 .m(4)
18025 .n(n)
18026 .k(k)
18027 .a_stride(23)
18028 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18029 }
18030 }
18031 }
18032
18033 TEST(F32_GEMM_4X8__NEON_DUP_LD128, n_div_8_subtile) {
18034 TEST_REQUIRES_ARM_NEON;
18035 for (uint32_t n = 16; n <= 24; n += 8) {
18036 for (size_t k = 1; k <= 20; k += 5) {
18037 for (uint32_t m = 1; m <= 4; m++) {
18038 GemmMicrokernelTester()
18039 .mr(4)
18040 .nr(8)
18041 .kr(1)
18042 .sr(1)
18043 .m(m)
18044 .n(n)
18045 .k(k)
18046 .iterations(1)
18047 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18048 }
18049 }
18050 }
18051 }
18052
18053 TEST(F32_GEMM_4X8__NEON_DUP_LD128, strided_cm_subtile) {
18054 TEST_REQUIRES_ARM_NEON;
18055 for (size_t k = 1; k <= 20; k += 5) {
18056 for (uint32_t m = 1; m <= 4; m++) {
18057 for (uint32_t n = 1; n <= 8; n++) {
18058 GemmMicrokernelTester()
18059 .mr(4)
18060 .nr(8)
18061 .kr(1)
18062 .sr(1)
18063 .m(m)
18064 .n(n)
18065 .k(k)
18066 .cm_stride(11)
18067 .iterations(1)
18068 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18069 }
18070 }
18071 }
18072 }
18073
18074 TEST(F32_GEMM_4X8__NEON_DUP_LD128, qmin) {
18075 TEST_REQUIRES_ARM_NEON;
18076 GemmMicrokernelTester()
18077 .mr(4)
18078 .nr(8)
18079 .kr(1)
18080 .sr(1)
18081 .m(4)
18082 .n(8)
18083 .k(4)
18084 .qmin(128)
18085 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18086 }
18087
18088 TEST(F32_GEMM_4X8__NEON_DUP_LD128, qmax) {
18089 TEST_REQUIRES_ARM_NEON;
18090 GemmMicrokernelTester()
18091 .mr(4)
18092 .nr(8)
18093 .kr(1)
18094 .sr(1)
18095 .m(4)
18096 .n(8)
18097 .k(4)
18098 .qmax(128)
18099 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18100 }
18101
18102 TEST(F32_GEMM_4X8__NEON_DUP_LD128, strided_cm) {
18103 TEST_REQUIRES_ARM_NEON;
18104 GemmMicrokernelTester()
18105 .mr(4)
18106 .nr(8)
18107 .kr(1)
18108 .sr(1)
18109 .m(4)
18110 .n(8)
18111 .k(4)
18112 .cm_stride(11)
18113 .Test(xnn_f32_gemm_ukernel_4x8__neon_dup_ld128);
18114 }
18115#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18116
18117
18118#if XNN_ARCH_ARM || XNN_ARCH_ARM64
18119 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2) {
18120 TEST_REQUIRES_ARM_NEON;
18121 GemmMicrokernelTester()
18122 .mr(6)
18123 .nr(8)
18124 .kr(1)
18125 .sr(1)
18126 .m(6)
18127 .n(8)
18128 .k(2)
18129 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18130 }
18131
18132 TEST(F32_GEMM_6X8__NEON_DUP_LD64, strided_cn) {
18133 TEST_REQUIRES_ARM_NEON;
18134 GemmMicrokernelTester()
18135 .mr(6)
18136 .nr(8)
18137 .kr(1)
18138 .sr(1)
18139 .m(6)
18140 .n(8)
18141 .k(2)
18142 .cn_stride(11)
18143 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18144 }
18145
18146 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_strided_a) {
18147 TEST_REQUIRES_ARM_NEON;
18148 GemmMicrokernelTester()
18149 .mr(6)
18150 .nr(8)
18151 .kr(1)
18152 .sr(1)
18153 .m(6)
18154 .n(8)
18155 .k(2)
18156 .a_stride(5)
18157 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18158 }
18159
18160 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile) {
18161 TEST_REQUIRES_ARM_NEON;
18162 for (uint32_t m = 1; m <= 6; m++) {
18163 for (uint32_t n = 1; n <= 8; n++) {
18164 GemmMicrokernelTester()
18165 .mr(6)
18166 .nr(8)
18167 .kr(1)
18168 .sr(1)
18169 .m(m)
18170 .n(n)
18171 .k(2)
18172 .iterations(1)
18173 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18174 }
18175 }
18176 }
18177
18178 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_m) {
18179 TEST_REQUIRES_ARM_NEON;
18180 for (uint32_t m = 1; m <= 6; m++) {
18181 GemmMicrokernelTester()
18182 .mr(6)
18183 .nr(8)
18184 .kr(1)
18185 .sr(1)
18186 .m(m)
18187 .n(8)
18188 .k(2)
18189 .iterations(1)
18190 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18191 }
18192 }
18193
18194 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_eq_2_subtile_n) {
18195 TEST_REQUIRES_ARM_NEON;
18196 for (uint32_t n = 1; n <= 8; n++) {
18197 GemmMicrokernelTester()
18198 .mr(6)
18199 .nr(8)
18200 .kr(1)
18201 .sr(1)
18202 .m(6)
18203 .n(n)
18204 .k(2)
18205 .iterations(1)
18206 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18207 }
18208 }
18209
18210 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_lt_2) {
18211 TEST_REQUIRES_ARM_NEON;
18212 for (size_t k = 1; k < 2; k++) {
18213 GemmMicrokernelTester()
18214 .mr(6)
18215 .nr(8)
18216 .kr(1)
18217 .sr(1)
18218 .m(6)
18219 .n(8)
18220 .k(k)
18221 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18222 }
18223 }
18224
18225 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_lt_2_strided_a) {
18226 TEST_REQUIRES_ARM_NEON;
18227 for (size_t k = 1; k < 2; k++) {
18228 GemmMicrokernelTester()
18229 .mr(6)
18230 .nr(8)
18231 .kr(1)
18232 .sr(1)
18233 .m(6)
18234 .n(8)
18235 .k(k)
18236 .a_stride(5)
18237 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18238 }
18239 }
18240
18241 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_lt_2_subtile) {
18242 TEST_REQUIRES_ARM_NEON;
18243 for (size_t k = 1; k < 2; k++) {
18244 for (uint32_t m = 1; m <= 6; m++) {
18245 for (uint32_t n = 1; n <= 8; n++) {
18246 GemmMicrokernelTester()
18247 .mr(6)
18248 .nr(8)
18249 .kr(1)
18250 .sr(1)
18251 .m(m)
18252 .n(n)
18253 .k(k)
18254 .iterations(1)
18255 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18256 }
18257 }
18258 }
18259 }
18260
18261 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_gt_2) {
18262 TEST_REQUIRES_ARM_NEON;
18263 for (size_t k = 3; k < 4; k++) {
18264 GemmMicrokernelTester()
18265 .mr(6)
18266 .nr(8)
18267 .kr(1)
18268 .sr(1)
18269 .m(6)
18270 .n(8)
18271 .k(k)
18272 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18273 }
18274 }
18275
18276 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_gt_2_strided_a) {
18277 TEST_REQUIRES_ARM_NEON;
18278 for (size_t k = 3; k < 4; k++) {
18279 GemmMicrokernelTester()
18280 .mr(6)
18281 .nr(8)
18282 .kr(1)
18283 .sr(1)
18284 .m(6)
18285 .n(8)
18286 .k(k)
18287 .a_stride(7)
18288 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18289 }
18290 }
18291
18292 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_gt_2_subtile) {
18293 TEST_REQUIRES_ARM_NEON;
18294 for (size_t k = 3; k < 4; k++) {
18295 for (uint32_t m = 1; m <= 6; m++) {
18296 for (uint32_t n = 1; n <= 8; n++) {
18297 GemmMicrokernelTester()
18298 .mr(6)
18299 .nr(8)
18300 .kr(1)
18301 .sr(1)
18302 .m(m)
18303 .n(n)
18304 .k(k)
18305 .iterations(1)
18306 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18307 }
18308 }
18309 }
18310 }
18311
18312 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_div_2) {
18313 TEST_REQUIRES_ARM_NEON;
18314 for (size_t k = 4; k <= 20; k += 2) {
18315 GemmMicrokernelTester()
18316 .mr(6)
18317 .nr(8)
18318 .kr(1)
18319 .sr(1)
18320 .m(6)
18321 .n(8)
18322 .k(k)
18323 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18324 }
18325 }
18326
18327 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_div_2_strided_a) {
18328 TEST_REQUIRES_ARM_NEON;
18329 for (size_t k = 4; k <= 20; k += 2) {
18330 GemmMicrokernelTester()
18331 .mr(6)
18332 .nr(8)
18333 .kr(1)
18334 .sr(1)
18335 .m(6)
18336 .n(8)
18337 .k(k)
18338 .a_stride(23)
18339 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18340 }
18341 }
18342
18343 TEST(F32_GEMM_6X8__NEON_DUP_LD64, k_div_2_subtile) {
18344 TEST_REQUIRES_ARM_NEON;
18345 for (size_t k = 4; k <= 20; k += 2) {
18346 for (uint32_t m = 1; m <= 6; m++) {
18347 for (uint32_t n = 1; n <= 8; n++) {
18348 GemmMicrokernelTester()
18349 .mr(6)
18350 .nr(8)
18351 .kr(1)
18352 .sr(1)
18353 .m(m)
18354 .n(n)
18355 .k(k)
18356 .iterations(1)
18357 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18358 }
18359 }
18360 }
18361 }
18362
18363 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8) {
18364 TEST_REQUIRES_ARM_NEON;
18365 for (uint32_t n = 9; n < 16; n++) {
18366 for (size_t k = 1; k <= 10; k += 3) {
18367 GemmMicrokernelTester()
18368 .mr(6)
18369 .nr(8)
18370 .kr(1)
18371 .sr(1)
18372 .m(6)
18373 .n(8)
18374 .k(k)
18375 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18376 }
18377 }
18378 }
18379
18380 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8_strided_cn) {
18381 TEST_REQUIRES_ARM_NEON;
18382 for (uint32_t n = 9; n < 16; n++) {
18383 for (size_t k = 1; k <= 10; k += 3) {
18384 GemmMicrokernelTester()
18385 .mr(6)
18386 .nr(8)
18387 .kr(1)
18388 .sr(1)
18389 .m(6)
18390 .n(8)
18391 .k(k)
18392 .cn_stride(11)
18393 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18394 }
18395 }
18396 }
18397
18398 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8_strided_a) {
18399 TEST_REQUIRES_ARM_NEON;
18400 for (uint32_t n = 9; n < 16; n++) {
18401 for (size_t k = 1; k <= 10; k += 3) {
18402 GemmMicrokernelTester()
18403 .mr(6)
18404 .nr(8)
18405 .kr(1)
18406 .sr(1)
18407 .m(6)
18408 .n(n)
18409 .k(k)
18410 .a_stride(13)
18411 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18412 }
18413 }
18414 }
18415
18416 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_gt_8_subtile) {
18417 TEST_REQUIRES_ARM_NEON;
18418 for (uint32_t n = 9; n < 16; n++) {
18419 for (size_t k = 1; k <= 10; k += 3) {
18420 for (uint32_t m = 1; m <= 6; m++) {
18421 GemmMicrokernelTester()
18422 .mr(6)
18423 .nr(8)
18424 .kr(1)
18425 .sr(1)
18426 .m(m)
18427 .n(n)
18428 .k(k)
18429 .iterations(1)
18430 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18431 }
18432 }
18433 }
18434 }
18435
18436 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8) {
18437 TEST_REQUIRES_ARM_NEON;
18438 for (uint32_t n = 16; n <= 24; n += 8) {
18439 for (size_t k = 1; k <= 10; k += 3) {
18440 GemmMicrokernelTester()
18441 .mr(6)
18442 .nr(8)
18443 .kr(1)
18444 .sr(1)
18445 .m(6)
18446 .n(8)
18447 .k(k)
18448 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18449 }
18450 }
18451 }
18452
18453 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8_strided_cn) {
18454 TEST_REQUIRES_ARM_NEON;
18455 for (uint32_t n = 16; n <= 24; n += 8) {
18456 for (size_t k = 1; k <= 10; k += 3) {
18457 GemmMicrokernelTester()
18458 .mr(6)
18459 .nr(8)
18460 .kr(1)
18461 .sr(1)
18462 .m(6)
18463 .n(n)
18464 .k(k)
18465 .cn_stride(11)
18466 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18467 }
18468 }
18469 }
18470
18471 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8_strided_a) {
18472 TEST_REQUIRES_ARM_NEON;
18473 for (uint32_t n = 16; n <= 24; n += 8) {
18474 for (size_t k = 1; k <= 10; k += 3) {
18475 GemmMicrokernelTester()
18476 .mr(6)
18477 .nr(8)
18478 .kr(1)
18479 .sr(1)
18480 .m(6)
18481 .n(n)
18482 .k(k)
18483 .a_stride(13)
18484 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18485 }
18486 }
18487 }
18488
18489 TEST(F32_GEMM_6X8__NEON_DUP_LD64, n_div_8_subtile) {
18490 TEST_REQUIRES_ARM_NEON;
18491 for (uint32_t n = 16; n <= 24; n += 8) {
18492 for (size_t k = 1; k <= 10; k += 3) {
18493 for (uint32_t m = 1; m <= 6; m++) {
18494 GemmMicrokernelTester()
18495 .mr(6)
18496 .nr(8)
18497 .kr(1)
18498 .sr(1)
18499 .m(m)
18500 .n(n)
18501 .k(k)
18502 .iterations(1)
18503 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18504 }
18505 }
18506 }
18507 }
18508
18509 TEST(F32_GEMM_6X8__NEON_DUP_LD64, strided_cm_subtile) {
18510 TEST_REQUIRES_ARM_NEON;
18511 for (size_t k = 1; k <= 10; k += 3) {
18512 for (uint32_t m = 1; m <= 6; m++) {
18513 for (uint32_t n = 1; n <= 8; n++) {
18514 GemmMicrokernelTester()
18515 .mr(6)
18516 .nr(8)
18517 .kr(1)
18518 .sr(1)
18519 .m(m)
18520 .n(n)
18521 .k(k)
18522 .cm_stride(11)
18523 .iterations(1)
18524 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18525 }
18526 }
18527 }
18528 }
18529
18530 TEST(F32_GEMM_6X8__NEON_DUP_LD64, qmin) {
18531 TEST_REQUIRES_ARM_NEON;
18532 GemmMicrokernelTester()
18533 .mr(6)
18534 .nr(8)
18535 .kr(1)
18536 .sr(1)
18537 .m(6)
18538 .n(8)
18539 .k(2)
18540 .qmin(128)
18541 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18542 }
18543
18544 TEST(F32_GEMM_6X8__NEON_DUP_LD64, qmax) {
18545 TEST_REQUIRES_ARM_NEON;
18546 GemmMicrokernelTester()
18547 .mr(6)
18548 .nr(8)
18549 .kr(1)
18550 .sr(1)
18551 .m(6)
18552 .n(8)
18553 .k(2)
18554 .qmax(128)
18555 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18556 }
18557
18558 TEST(F32_GEMM_6X8__NEON_DUP_LD64, strided_cm) {
18559 TEST_REQUIRES_ARM_NEON;
18560 GemmMicrokernelTester()
18561 .mr(6)
18562 .nr(8)
18563 .kr(1)
18564 .sr(1)
18565 .m(6)
18566 .n(8)
18567 .k(2)
18568 .cm_stride(11)
18569 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld64);
18570 }
18571#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
18572
18573
18574#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard69172d92019-11-26 16:22:39 -080018575 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4) {
18576 TEST_REQUIRES_ARM_NEON;
18577 GemmMicrokernelTester()
18578 .mr(6)
18579 .nr(8)
18580 .kr(1)
18581 .sr(1)
18582 .m(6)
18583 .n(8)
18584 .k(4)
18585 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18586 }
18587
18588 TEST(F32_GEMM_6X8__NEON_DUP_LD128, strided_cn) {
18589 TEST_REQUIRES_ARM_NEON;
18590 GemmMicrokernelTester()
18591 .mr(6)
18592 .nr(8)
18593 .kr(1)
18594 .sr(1)
18595 .m(6)
18596 .n(8)
18597 .k(4)
18598 .cn_stride(11)
18599 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18600 }
18601
18602 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_strided_a) {
18603 TEST_REQUIRES_ARM_NEON;
18604 GemmMicrokernelTester()
18605 .mr(6)
18606 .nr(8)
18607 .kr(1)
18608 .sr(1)
18609 .m(6)
18610 .n(8)
18611 .k(4)
18612 .a_stride(7)
18613 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18614 }
18615
18616 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile) {
18617 TEST_REQUIRES_ARM_NEON;
18618 for (uint32_t m = 1; m <= 6; m++) {
18619 for (uint32_t n = 1; n <= 8; n++) {
18620 GemmMicrokernelTester()
18621 .mr(6)
18622 .nr(8)
18623 .kr(1)
18624 .sr(1)
18625 .m(m)
18626 .n(n)
18627 .k(4)
18628 .iterations(1)
18629 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18630 }
18631 }
18632 }
18633
18634 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_m) {
18635 TEST_REQUIRES_ARM_NEON;
18636 for (uint32_t m = 1; m <= 6; m++) {
18637 GemmMicrokernelTester()
18638 .mr(6)
18639 .nr(8)
18640 .kr(1)
18641 .sr(1)
18642 .m(m)
18643 .n(8)
18644 .k(4)
18645 .iterations(1)
18646 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18647 }
18648 }
18649
18650 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_eq_4_subtile_n) {
18651 TEST_REQUIRES_ARM_NEON;
18652 for (uint32_t n = 1; n <= 8; n++) {
18653 GemmMicrokernelTester()
18654 .mr(6)
18655 .nr(8)
18656 .kr(1)
18657 .sr(1)
18658 .m(6)
18659 .n(n)
18660 .k(4)
18661 .iterations(1)
18662 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18663 }
18664 }
18665
18666 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_lt_4) {
18667 TEST_REQUIRES_ARM_NEON;
18668 for (size_t k = 1; k < 4; k++) {
18669 GemmMicrokernelTester()
18670 .mr(6)
18671 .nr(8)
18672 .kr(1)
18673 .sr(1)
18674 .m(6)
18675 .n(8)
18676 .k(k)
18677 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18678 }
18679 }
18680
18681 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_lt_4_strided_a) {
18682 TEST_REQUIRES_ARM_NEON;
18683 for (size_t k = 1; k < 4; k++) {
18684 GemmMicrokernelTester()
18685 .mr(6)
18686 .nr(8)
18687 .kr(1)
18688 .sr(1)
18689 .m(6)
18690 .n(8)
18691 .k(k)
18692 .a_stride(7)
18693 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18694 }
18695 }
18696
18697 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_lt_4_subtile) {
18698 TEST_REQUIRES_ARM_NEON;
18699 for (size_t k = 1; k < 4; k++) {
18700 for (uint32_t m = 1; m <= 6; m++) {
18701 for (uint32_t n = 1; n <= 8; n++) {
18702 GemmMicrokernelTester()
18703 .mr(6)
18704 .nr(8)
18705 .kr(1)
18706 .sr(1)
18707 .m(m)
18708 .n(n)
18709 .k(k)
18710 .iterations(1)
18711 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18712 }
18713 }
18714 }
18715 }
18716
18717 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_gt_4) {
18718 TEST_REQUIRES_ARM_NEON;
18719 for (size_t k = 5; k < 8; k++) {
18720 GemmMicrokernelTester()
18721 .mr(6)
18722 .nr(8)
18723 .kr(1)
18724 .sr(1)
18725 .m(6)
18726 .n(8)
18727 .k(k)
18728 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18729 }
18730 }
18731
18732 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_gt_4_strided_a) {
18733 TEST_REQUIRES_ARM_NEON;
18734 for (size_t k = 5; k < 8; k++) {
18735 GemmMicrokernelTester()
18736 .mr(6)
18737 .nr(8)
18738 .kr(1)
18739 .sr(1)
18740 .m(6)
18741 .n(8)
18742 .k(k)
18743 .a_stride(11)
18744 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18745 }
18746 }
18747
18748 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_gt_4_subtile) {
18749 TEST_REQUIRES_ARM_NEON;
18750 for (size_t k = 5; k < 8; k++) {
18751 for (uint32_t m = 1; m <= 6; m++) {
18752 for (uint32_t n = 1; n <= 8; n++) {
18753 GemmMicrokernelTester()
18754 .mr(6)
18755 .nr(8)
18756 .kr(1)
18757 .sr(1)
18758 .m(m)
18759 .n(n)
18760 .k(k)
18761 .iterations(1)
18762 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18763 }
18764 }
18765 }
18766 }
18767
18768 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_div_4) {
18769 TEST_REQUIRES_ARM_NEON;
18770 for (size_t k = 8; k <= 40; k += 4) {
18771 GemmMicrokernelTester()
18772 .mr(6)
18773 .nr(8)
18774 .kr(1)
18775 .sr(1)
18776 .m(6)
18777 .n(8)
18778 .k(k)
18779 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18780 }
18781 }
18782
18783 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_div_4_strided_a) {
18784 TEST_REQUIRES_ARM_NEON;
18785 for (size_t k = 8; k <= 40; k += 4) {
18786 GemmMicrokernelTester()
18787 .mr(6)
18788 .nr(8)
18789 .kr(1)
18790 .sr(1)
18791 .m(6)
18792 .n(8)
18793 .k(k)
18794 .a_stride(43)
18795 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18796 }
18797 }
18798
18799 TEST(F32_GEMM_6X8__NEON_DUP_LD128, k_div_4_subtile) {
18800 TEST_REQUIRES_ARM_NEON;
18801 for (size_t k = 8; k <= 40; k += 4) {
18802 for (uint32_t m = 1; m <= 6; m++) {
18803 for (uint32_t n = 1; n <= 8; n++) {
18804 GemmMicrokernelTester()
18805 .mr(6)
18806 .nr(8)
18807 .kr(1)
18808 .sr(1)
18809 .m(m)
18810 .n(n)
18811 .k(k)
18812 .iterations(1)
18813 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18814 }
18815 }
18816 }
18817 }
18818
18819 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8) {
18820 TEST_REQUIRES_ARM_NEON;
18821 for (uint32_t n = 9; n < 16; n++) {
18822 for (size_t k = 1; k <= 20; k += 5) {
18823 GemmMicrokernelTester()
18824 .mr(6)
18825 .nr(8)
18826 .kr(1)
18827 .sr(1)
18828 .m(6)
18829 .n(8)
18830 .k(k)
18831 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18832 }
18833 }
18834 }
18835
18836 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8_strided_cn) {
18837 TEST_REQUIRES_ARM_NEON;
18838 for (uint32_t n = 9; n < 16; n++) {
18839 for (size_t k = 1; k <= 20; k += 5) {
18840 GemmMicrokernelTester()
18841 .mr(6)
18842 .nr(8)
18843 .kr(1)
18844 .sr(1)
18845 .m(6)
18846 .n(8)
18847 .k(k)
18848 .cn_stride(11)
18849 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18850 }
18851 }
18852 }
18853
18854 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8_strided_a) {
18855 TEST_REQUIRES_ARM_NEON;
18856 for (uint32_t n = 9; n < 16; n++) {
18857 for (size_t k = 1; k <= 20; k += 5) {
18858 GemmMicrokernelTester()
18859 .mr(6)
18860 .nr(8)
18861 .kr(1)
18862 .sr(1)
18863 .m(6)
18864 .n(n)
18865 .k(k)
18866 .a_stride(23)
18867 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18868 }
18869 }
18870 }
18871
18872 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_gt_8_subtile) {
18873 TEST_REQUIRES_ARM_NEON;
18874 for (uint32_t n = 9; n < 16; n++) {
18875 for (size_t k = 1; k <= 20; k += 5) {
18876 for (uint32_t m = 1; m <= 6; m++) {
18877 GemmMicrokernelTester()
18878 .mr(6)
18879 .nr(8)
18880 .kr(1)
18881 .sr(1)
18882 .m(m)
18883 .n(n)
18884 .k(k)
18885 .iterations(1)
18886 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18887 }
18888 }
18889 }
18890 }
18891
18892 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8) {
18893 TEST_REQUIRES_ARM_NEON;
18894 for (uint32_t n = 16; n <= 24; n += 8) {
18895 for (size_t k = 1; k <= 20; k += 5) {
18896 GemmMicrokernelTester()
18897 .mr(6)
18898 .nr(8)
18899 .kr(1)
18900 .sr(1)
18901 .m(6)
18902 .n(8)
18903 .k(k)
18904 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18905 }
18906 }
18907 }
18908
18909 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8_strided_cn) {
18910 TEST_REQUIRES_ARM_NEON;
18911 for (uint32_t n = 16; n <= 24; n += 8) {
18912 for (size_t k = 1; k <= 20; k += 5) {
18913 GemmMicrokernelTester()
18914 .mr(6)
18915 .nr(8)
18916 .kr(1)
18917 .sr(1)
18918 .m(6)
18919 .n(n)
18920 .k(k)
18921 .cn_stride(11)
18922 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18923 }
18924 }
18925 }
18926
18927 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8_strided_a) {
18928 TEST_REQUIRES_ARM_NEON;
18929 for (uint32_t n = 16; n <= 24; n += 8) {
18930 for (size_t k = 1; k <= 20; k += 5) {
18931 GemmMicrokernelTester()
18932 .mr(6)
18933 .nr(8)
18934 .kr(1)
18935 .sr(1)
18936 .m(6)
18937 .n(n)
18938 .k(k)
18939 .a_stride(23)
18940 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18941 }
18942 }
18943 }
18944
18945 TEST(F32_GEMM_6X8__NEON_DUP_LD128, n_div_8_subtile) {
18946 TEST_REQUIRES_ARM_NEON;
18947 for (uint32_t n = 16; n <= 24; n += 8) {
18948 for (size_t k = 1; k <= 20; k += 5) {
18949 for (uint32_t m = 1; m <= 6; m++) {
18950 GemmMicrokernelTester()
18951 .mr(6)
18952 .nr(8)
18953 .kr(1)
18954 .sr(1)
18955 .m(m)
18956 .n(n)
18957 .k(k)
18958 .iterations(1)
18959 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18960 }
18961 }
18962 }
18963 }
18964
18965 TEST(F32_GEMM_6X8__NEON_DUP_LD128, strided_cm_subtile) {
18966 TEST_REQUIRES_ARM_NEON;
18967 for (size_t k = 1; k <= 20; k += 5) {
18968 for (uint32_t m = 1; m <= 6; m++) {
18969 for (uint32_t n = 1; n <= 8; n++) {
18970 GemmMicrokernelTester()
18971 .mr(6)
18972 .nr(8)
18973 .kr(1)
18974 .sr(1)
18975 .m(m)
18976 .n(n)
18977 .k(k)
18978 .cm_stride(11)
18979 .iterations(1)
18980 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18981 }
18982 }
18983 }
18984 }
18985
18986 TEST(F32_GEMM_6X8__NEON_DUP_LD128, qmin) {
18987 TEST_REQUIRES_ARM_NEON;
18988 GemmMicrokernelTester()
18989 .mr(6)
18990 .nr(8)
18991 .kr(1)
18992 .sr(1)
18993 .m(6)
18994 .n(8)
18995 .k(4)
18996 .qmin(128)
18997 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
18998 }
18999
19000 TEST(F32_GEMM_6X8__NEON_DUP_LD128, qmax) {
19001 TEST_REQUIRES_ARM_NEON;
19002 GemmMicrokernelTester()
19003 .mr(6)
19004 .nr(8)
19005 .kr(1)
19006 .sr(1)
19007 .m(6)
19008 .n(8)
19009 .k(4)
19010 .qmax(128)
19011 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
19012 }
19013
19014 TEST(F32_GEMM_6X8__NEON_DUP_LD128, strided_cm) {
19015 TEST_REQUIRES_ARM_NEON;
19016 GemmMicrokernelTester()
19017 .mr(6)
19018 .nr(8)
19019 .kr(1)
19020 .sr(1)
19021 .m(6)
19022 .n(8)
19023 .k(4)
19024 .cm_stride(11)
19025 .Test(xnn_f32_gemm_ukernel_6x8__neon_dup_ld128);
19026 }
19027#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19028
19029
19030#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080019031 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2) {
19032 TEST_REQUIRES_ARM_NEON_FMA;
19033 GemmMicrokernelTester()
19034 .mr(1)
19035 .nr(8)
19036 .kr(1)
19037 .sr(1)
19038 .m(1)
19039 .n(8)
19040 .k(2)
19041 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19042 }
19043
19044 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, strided_cn) {
19045 TEST_REQUIRES_ARM_NEON_FMA;
19046 GemmMicrokernelTester()
19047 .mr(1)
19048 .nr(8)
19049 .kr(1)
19050 .sr(1)
19051 .m(1)
19052 .n(8)
19053 .k(2)
19054 .cn_stride(11)
19055 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19056 }
19057
19058 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
19059 TEST_REQUIRES_ARM_NEON_FMA;
19060 GemmMicrokernelTester()
19061 .mr(1)
19062 .nr(8)
19063 .kr(1)
19064 .sr(1)
19065 .m(1)
19066 .n(8)
19067 .k(2)
19068 .a_stride(5)
19069 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19070 }
19071
19072 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
19073 TEST_REQUIRES_ARM_NEON_FMA;
19074 for (uint32_t m = 1; m <= 1; m++) {
19075 for (uint32_t n = 1; n <= 8; n++) {
19076 GemmMicrokernelTester()
19077 .mr(1)
19078 .nr(8)
19079 .kr(1)
19080 .sr(1)
19081 .m(m)
19082 .n(n)
19083 .k(2)
19084 .iterations(1)
19085 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19086 }
19087 }
19088 }
19089
19090 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
19091 TEST_REQUIRES_ARM_NEON_FMA;
19092 for (uint32_t m = 1; m <= 1; m++) {
19093 GemmMicrokernelTester()
19094 .mr(1)
19095 .nr(8)
19096 .kr(1)
19097 .sr(1)
19098 .m(m)
19099 .n(8)
19100 .k(2)
19101 .iterations(1)
19102 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19103 }
19104 }
19105
19106 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
19107 TEST_REQUIRES_ARM_NEON_FMA;
19108 for (uint32_t n = 1; n <= 8; n++) {
19109 GemmMicrokernelTester()
19110 .mr(1)
19111 .nr(8)
19112 .kr(1)
19113 .sr(1)
19114 .m(1)
19115 .n(n)
19116 .k(2)
19117 .iterations(1)
19118 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19119 }
19120 }
19121
19122 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_lt_2) {
19123 TEST_REQUIRES_ARM_NEON_FMA;
19124 for (size_t k = 1; k < 2; k++) {
19125 GemmMicrokernelTester()
19126 .mr(1)
19127 .nr(8)
19128 .kr(1)
19129 .sr(1)
19130 .m(1)
19131 .n(8)
19132 .k(k)
19133 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19134 }
19135 }
19136
19137 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
19138 TEST_REQUIRES_ARM_NEON_FMA;
19139 for (size_t k = 1; k < 2; k++) {
19140 GemmMicrokernelTester()
19141 .mr(1)
19142 .nr(8)
19143 .kr(1)
19144 .sr(1)
19145 .m(1)
19146 .n(8)
19147 .k(k)
19148 .a_stride(5)
19149 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19150 }
19151 }
19152
19153 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
19154 TEST_REQUIRES_ARM_NEON_FMA;
19155 for (size_t k = 1; k < 2; k++) {
19156 for (uint32_t m = 1; m <= 1; m++) {
19157 for (uint32_t n = 1; n <= 8; n++) {
19158 GemmMicrokernelTester()
19159 .mr(1)
19160 .nr(8)
19161 .kr(1)
19162 .sr(1)
19163 .m(m)
19164 .n(n)
19165 .k(k)
19166 .iterations(1)
19167 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19168 }
19169 }
19170 }
19171 }
19172
19173 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_gt_2) {
19174 TEST_REQUIRES_ARM_NEON_FMA;
19175 for (size_t k = 3; k < 4; k++) {
19176 GemmMicrokernelTester()
19177 .mr(1)
19178 .nr(8)
19179 .kr(1)
19180 .sr(1)
19181 .m(1)
19182 .n(8)
19183 .k(k)
19184 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19185 }
19186 }
19187
19188 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
19189 TEST_REQUIRES_ARM_NEON_FMA;
19190 for (size_t k = 3; k < 4; k++) {
19191 GemmMicrokernelTester()
19192 .mr(1)
19193 .nr(8)
19194 .kr(1)
19195 .sr(1)
19196 .m(1)
19197 .n(8)
19198 .k(k)
19199 .a_stride(7)
19200 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19201 }
19202 }
19203
19204 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
19205 TEST_REQUIRES_ARM_NEON_FMA;
19206 for (size_t k = 3; k < 4; k++) {
19207 for (uint32_t m = 1; m <= 1; m++) {
19208 for (uint32_t n = 1; n <= 8; n++) {
19209 GemmMicrokernelTester()
19210 .mr(1)
19211 .nr(8)
19212 .kr(1)
19213 .sr(1)
19214 .m(m)
19215 .n(n)
19216 .k(k)
19217 .iterations(1)
19218 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19219 }
19220 }
19221 }
19222 }
19223
19224 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_div_2) {
19225 TEST_REQUIRES_ARM_NEON_FMA;
19226 for (size_t k = 4; k <= 20; k += 2) {
19227 GemmMicrokernelTester()
19228 .mr(1)
19229 .nr(8)
19230 .kr(1)
19231 .sr(1)
19232 .m(1)
19233 .n(8)
19234 .k(k)
19235 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19236 }
19237 }
19238
19239 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
19240 TEST_REQUIRES_ARM_NEON_FMA;
19241 for (size_t k = 4; k <= 20; k += 2) {
19242 GemmMicrokernelTester()
19243 .mr(1)
19244 .nr(8)
19245 .kr(1)
19246 .sr(1)
19247 .m(1)
19248 .n(8)
19249 .k(k)
19250 .a_stride(23)
19251 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19252 }
19253 }
19254
19255 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
19256 TEST_REQUIRES_ARM_NEON_FMA;
19257 for (size_t k = 4; k <= 20; k += 2) {
19258 for (uint32_t m = 1; m <= 1; m++) {
19259 for (uint32_t n = 1; n <= 8; n++) {
19260 GemmMicrokernelTester()
19261 .mr(1)
19262 .nr(8)
19263 .kr(1)
19264 .sr(1)
19265 .m(m)
19266 .n(n)
19267 .k(k)
19268 .iterations(1)
19269 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19270 }
19271 }
19272 }
19273 }
19274
19275 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8) {
19276 TEST_REQUIRES_ARM_NEON_FMA;
19277 for (uint32_t n = 9; n < 16; n++) {
19278 for (size_t k = 1; k <= 10; k += 3) {
19279 GemmMicrokernelTester()
19280 .mr(1)
19281 .nr(8)
19282 .kr(1)
19283 .sr(1)
19284 .m(1)
19285 .n(8)
19286 .k(k)
19287 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19288 }
19289 }
19290 }
19291
19292 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
19293 TEST_REQUIRES_ARM_NEON_FMA;
19294 for (uint32_t n = 9; n < 16; n++) {
19295 for (size_t k = 1; k <= 10; k += 3) {
19296 GemmMicrokernelTester()
19297 .mr(1)
19298 .nr(8)
19299 .kr(1)
19300 .sr(1)
19301 .m(1)
19302 .n(8)
19303 .k(k)
19304 .cn_stride(11)
19305 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19306 }
19307 }
19308 }
19309
19310 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
19311 TEST_REQUIRES_ARM_NEON_FMA;
19312 for (uint32_t n = 9; n < 16; n++) {
19313 for (size_t k = 1; k <= 10; k += 3) {
19314 GemmMicrokernelTester()
19315 .mr(1)
19316 .nr(8)
19317 .kr(1)
19318 .sr(1)
19319 .m(1)
19320 .n(n)
19321 .k(k)
19322 .a_stride(13)
19323 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19324 }
19325 }
19326 }
19327
19328 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
19329 TEST_REQUIRES_ARM_NEON_FMA;
19330 for (uint32_t n = 9; n < 16; n++) {
19331 for (size_t k = 1; k <= 10; k += 3) {
19332 for (uint32_t m = 1; m <= 1; m++) {
19333 GemmMicrokernelTester()
19334 .mr(1)
19335 .nr(8)
19336 .kr(1)
19337 .sr(1)
19338 .m(m)
19339 .n(n)
19340 .k(k)
19341 .iterations(1)
19342 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19343 }
19344 }
19345 }
19346 }
19347
19348 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8) {
19349 TEST_REQUIRES_ARM_NEON_FMA;
19350 for (uint32_t n = 16; n <= 24; n += 8) {
19351 for (size_t k = 1; k <= 10; k += 3) {
19352 GemmMicrokernelTester()
19353 .mr(1)
19354 .nr(8)
19355 .kr(1)
19356 .sr(1)
19357 .m(1)
19358 .n(8)
19359 .k(k)
19360 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19361 }
19362 }
19363 }
19364
19365 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
19366 TEST_REQUIRES_ARM_NEON_FMA;
19367 for (uint32_t n = 16; n <= 24; n += 8) {
19368 for (size_t k = 1; k <= 10; k += 3) {
19369 GemmMicrokernelTester()
19370 .mr(1)
19371 .nr(8)
19372 .kr(1)
19373 .sr(1)
19374 .m(1)
19375 .n(n)
19376 .k(k)
19377 .cn_stride(11)
19378 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19379 }
19380 }
19381 }
19382
19383 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
19384 TEST_REQUIRES_ARM_NEON_FMA;
19385 for (uint32_t n = 16; n <= 24; n += 8) {
19386 for (size_t k = 1; k <= 10; k += 3) {
19387 GemmMicrokernelTester()
19388 .mr(1)
19389 .nr(8)
19390 .kr(1)
19391 .sr(1)
19392 .m(1)
19393 .n(n)
19394 .k(k)
19395 .a_stride(13)
19396 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19397 }
19398 }
19399 }
19400
19401 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
19402 TEST_REQUIRES_ARM_NEON_FMA;
19403 for (uint32_t n = 16; n <= 24; n += 8) {
19404 for (size_t k = 1; k <= 10; k += 3) {
19405 for (uint32_t m = 1; m <= 1; m++) {
19406 GemmMicrokernelTester()
19407 .mr(1)
19408 .nr(8)
19409 .kr(1)
19410 .sr(1)
19411 .m(m)
19412 .n(n)
19413 .k(k)
19414 .iterations(1)
19415 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19416 }
19417 }
19418 }
19419 }
19420
19421 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
19422 TEST_REQUIRES_ARM_NEON_FMA;
19423 for (size_t k = 1; k <= 10; k += 3) {
19424 for (uint32_t m = 1; m <= 1; m++) {
19425 for (uint32_t n = 1; n <= 8; n++) {
19426 GemmMicrokernelTester()
19427 .mr(1)
19428 .nr(8)
19429 .kr(1)
19430 .sr(1)
19431 .m(m)
19432 .n(n)
19433 .k(k)
19434 .cm_stride(11)
19435 .iterations(1)
19436 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19437 }
19438 }
19439 }
19440 }
19441
19442 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, qmin) {
19443 TEST_REQUIRES_ARM_NEON_FMA;
19444 GemmMicrokernelTester()
19445 .mr(1)
19446 .nr(8)
19447 .kr(1)
19448 .sr(1)
19449 .m(1)
19450 .n(8)
19451 .k(2)
19452 .qmin(128)
19453 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19454 }
19455
19456 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, qmax) {
19457 TEST_REQUIRES_ARM_NEON_FMA;
19458 GemmMicrokernelTester()
19459 .mr(1)
19460 .nr(8)
19461 .kr(1)
19462 .sr(1)
19463 .m(1)
19464 .n(8)
19465 .k(2)
19466 .qmax(128)
19467 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19468 }
19469
19470 TEST(F32_GEMM_1X8__NEONFMA_DUP_LD64, strided_cm) {
19471 TEST_REQUIRES_ARM_NEON_FMA;
19472 GemmMicrokernelTester()
19473 .mr(1)
19474 .nr(8)
19475 .kr(1)
19476 .sr(1)
19477 .m(1)
19478 .n(8)
19479 .k(2)
19480 .cm_stride(11)
19481 .Test(xnn_f32_gemm_ukernel_1x8__neonfma_dup_ld64);
19482 }
19483#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19484
19485
19486#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19487 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2) {
19488 TEST_REQUIRES_ARM_NEON_FMA;
19489 GemmMicrokernelTester()
19490 .mr(4)
19491 .nr(8)
19492 .kr(1)
19493 .sr(1)
19494 .m(4)
19495 .n(8)
19496 .k(2)
19497 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19498 }
19499
19500 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, strided_cn) {
19501 TEST_REQUIRES_ARM_NEON_FMA;
19502 GemmMicrokernelTester()
19503 .mr(4)
19504 .nr(8)
19505 .kr(1)
19506 .sr(1)
19507 .m(4)
19508 .n(8)
19509 .k(2)
19510 .cn_stride(11)
19511 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19512 }
19513
19514 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
19515 TEST_REQUIRES_ARM_NEON_FMA;
19516 GemmMicrokernelTester()
19517 .mr(4)
19518 .nr(8)
19519 .kr(1)
19520 .sr(1)
19521 .m(4)
19522 .n(8)
19523 .k(2)
19524 .a_stride(5)
19525 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19526 }
19527
19528 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
19529 TEST_REQUIRES_ARM_NEON_FMA;
19530 for (uint32_t m = 1; m <= 4; m++) {
19531 for (uint32_t n = 1; n <= 8; n++) {
19532 GemmMicrokernelTester()
19533 .mr(4)
19534 .nr(8)
19535 .kr(1)
19536 .sr(1)
19537 .m(m)
19538 .n(n)
19539 .k(2)
19540 .iterations(1)
19541 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19542 }
19543 }
19544 }
19545
19546 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
19547 TEST_REQUIRES_ARM_NEON_FMA;
19548 for (uint32_t m = 1; m <= 4; m++) {
19549 GemmMicrokernelTester()
19550 .mr(4)
19551 .nr(8)
19552 .kr(1)
19553 .sr(1)
19554 .m(m)
19555 .n(8)
19556 .k(2)
19557 .iterations(1)
19558 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19559 }
19560 }
19561
19562 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
19563 TEST_REQUIRES_ARM_NEON_FMA;
19564 for (uint32_t n = 1; n <= 8; n++) {
19565 GemmMicrokernelTester()
19566 .mr(4)
19567 .nr(8)
19568 .kr(1)
19569 .sr(1)
19570 .m(4)
19571 .n(n)
19572 .k(2)
19573 .iterations(1)
19574 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19575 }
19576 }
19577
19578 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_lt_2) {
19579 TEST_REQUIRES_ARM_NEON_FMA;
19580 for (size_t k = 1; k < 2; k++) {
19581 GemmMicrokernelTester()
19582 .mr(4)
19583 .nr(8)
19584 .kr(1)
19585 .sr(1)
19586 .m(4)
19587 .n(8)
19588 .k(k)
19589 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19590 }
19591 }
19592
19593 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
19594 TEST_REQUIRES_ARM_NEON_FMA;
19595 for (size_t k = 1; k < 2; k++) {
19596 GemmMicrokernelTester()
19597 .mr(4)
19598 .nr(8)
19599 .kr(1)
19600 .sr(1)
19601 .m(4)
19602 .n(8)
19603 .k(k)
19604 .a_stride(5)
19605 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19606 }
19607 }
19608
19609 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
19610 TEST_REQUIRES_ARM_NEON_FMA;
19611 for (size_t k = 1; k < 2; k++) {
19612 for (uint32_t m = 1; m <= 4; m++) {
19613 for (uint32_t n = 1; n <= 8; n++) {
19614 GemmMicrokernelTester()
19615 .mr(4)
19616 .nr(8)
19617 .kr(1)
19618 .sr(1)
19619 .m(m)
19620 .n(n)
19621 .k(k)
19622 .iterations(1)
19623 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19624 }
19625 }
19626 }
19627 }
19628
19629 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_gt_2) {
19630 TEST_REQUIRES_ARM_NEON_FMA;
19631 for (size_t k = 3; k < 4; k++) {
19632 GemmMicrokernelTester()
19633 .mr(4)
19634 .nr(8)
19635 .kr(1)
19636 .sr(1)
19637 .m(4)
19638 .n(8)
19639 .k(k)
19640 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19641 }
19642 }
19643
19644 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
19645 TEST_REQUIRES_ARM_NEON_FMA;
19646 for (size_t k = 3; k < 4; k++) {
19647 GemmMicrokernelTester()
19648 .mr(4)
19649 .nr(8)
19650 .kr(1)
19651 .sr(1)
19652 .m(4)
19653 .n(8)
19654 .k(k)
19655 .a_stride(7)
19656 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19657 }
19658 }
19659
19660 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
19661 TEST_REQUIRES_ARM_NEON_FMA;
19662 for (size_t k = 3; k < 4; k++) {
19663 for (uint32_t m = 1; m <= 4; m++) {
19664 for (uint32_t n = 1; n <= 8; n++) {
19665 GemmMicrokernelTester()
19666 .mr(4)
19667 .nr(8)
19668 .kr(1)
19669 .sr(1)
19670 .m(m)
19671 .n(n)
19672 .k(k)
19673 .iterations(1)
19674 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19675 }
19676 }
19677 }
19678 }
19679
19680 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_div_2) {
19681 TEST_REQUIRES_ARM_NEON_FMA;
19682 for (size_t k = 4; k <= 20; k += 2) {
19683 GemmMicrokernelTester()
19684 .mr(4)
19685 .nr(8)
19686 .kr(1)
19687 .sr(1)
19688 .m(4)
19689 .n(8)
19690 .k(k)
19691 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19692 }
19693 }
19694
19695 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
19696 TEST_REQUIRES_ARM_NEON_FMA;
19697 for (size_t k = 4; k <= 20; k += 2) {
19698 GemmMicrokernelTester()
19699 .mr(4)
19700 .nr(8)
19701 .kr(1)
19702 .sr(1)
19703 .m(4)
19704 .n(8)
19705 .k(k)
19706 .a_stride(23)
19707 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19708 }
19709 }
19710
19711 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
19712 TEST_REQUIRES_ARM_NEON_FMA;
19713 for (size_t k = 4; k <= 20; k += 2) {
19714 for (uint32_t m = 1; m <= 4; m++) {
19715 for (uint32_t n = 1; n <= 8; n++) {
19716 GemmMicrokernelTester()
19717 .mr(4)
19718 .nr(8)
19719 .kr(1)
19720 .sr(1)
19721 .m(m)
19722 .n(n)
19723 .k(k)
19724 .iterations(1)
19725 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19726 }
19727 }
19728 }
19729 }
19730
19731 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8) {
19732 TEST_REQUIRES_ARM_NEON_FMA;
19733 for (uint32_t n = 9; n < 16; n++) {
19734 for (size_t k = 1; k <= 10; k += 3) {
19735 GemmMicrokernelTester()
19736 .mr(4)
19737 .nr(8)
19738 .kr(1)
19739 .sr(1)
19740 .m(4)
19741 .n(8)
19742 .k(k)
19743 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19744 }
19745 }
19746 }
19747
19748 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
19749 TEST_REQUIRES_ARM_NEON_FMA;
19750 for (uint32_t n = 9; n < 16; n++) {
19751 for (size_t k = 1; k <= 10; k += 3) {
19752 GemmMicrokernelTester()
19753 .mr(4)
19754 .nr(8)
19755 .kr(1)
19756 .sr(1)
19757 .m(4)
19758 .n(8)
19759 .k(k)
19760 .cn_stride(11)
19761 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19762 }
19763 }
19764 }
19765
19766 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
19767 TEST_REQUIRES_ARM_NEON_FMA;
19768 for (uint32_t n = 9; n < 16; n++) {
19769 for (size_t k = 1; k <= 10; k += 3) {
19770 GemmMicrokernelTester()
19771 .mr(4)
19772 .nr(8)
19773 .kr(1)
19774 .sr(1)
19775 .m(4)
19776 .n(n)
19777 .k(k)
19778 .a_stride(13)
19779 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19780 }
19781 }
19782 }
19783
19784 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
19785 TEST_REQUIRES_ARM_NEON_FMA;
19786 for (uint32_t n = 9; n < 16; n++) {
19787 for (size_t k = 1; k <= 10; k += 3) {
19788 for (uint32_t m = 1; m <= 4; m++) {
19789 GemmMicrokernelTester()
19790 .mr(4)
19791 .nr(8)
19792 .kr(1)
19793 .sr(1)
19794 .m(m)
19795 .n(n)
19796 .k(k)
19797 .iterations(1)
19798 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19799 }
19800 }
19801 }
19802 }
19803
19804 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8) {
19805 TEST_REQUIRES_ARM_NEON_FMA;
19806 for (uint32_t n = 16; n <= 24; n += 8) {
19807 for (size_t k = 1; k <= 10; k += 3) {
19808 GemmMicrokernelTester()
19809 .mr(4)
19810 .nr(8)
19811 .kr(1)
19812 .sr(1)
19813 .m(4)
19814 .n(8)
19815 .k(k)
19816 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19817 }
19818 }
19819 }
19820
19821 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
19822 TEST_REQUIRES_ARM_NEON_FMA;
19823 for (uint32_t n = 16; n <= 24; n += 8) {
19824 for (size_t k = 1; k <= 10; k += 3) {
19825 GemmMicrokernelTester()
19826 .mr(4)
19827 .nr(8)
19828 .kr(1)
19829 .sr(1)
19830 .m(4)
19831 .n(n)
19832 .k(k)
19833 .cn_stride(11)
19834 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19835 }
19836 }
19837 }
19838
19839 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
19840 TEST_REQUIRES_ARM_NEON_FMA;
19841 for (uint32_t n = 16; n <= 24; n += 8) {
19842 for (size_t k = 1; k <= 10; k += 3) {
19843 GemmMicrokernelTester()
19844 .mr(4)
19845 .nr(8)
19846 .kr(1)
19847 .sr(1)
19848 .m(4)
19849 .n(n)
19850 .k(k)
19851 .a_stride(13)
19852 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19853 }
19854 }
19855 }
19856
19857 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
19858 TEST_REQUIRES_ARM_NEON_FMA;
19859 for (uint32_t n = 16; n <= 24; n += 8) {
19860 for (size_t k = 1; k <= 10; k += 3) {
19861 for (uint32_t m = 1; m <= 4; m++) {
19862 GemmMicrokernelTester()
19863 .mr(4)
19864 .nr(8)
19865 .kr(1)
19866 .sr(1)
19867 .m(m)
19868 .n(n)
19869 .k(k)
19870 .iterations(1)
19871 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19872 }
19873 }
19874 }
19875 }
19876
19877 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
19878 TEST_REQUIRES_ARM_NEON_FMA;
19879 for (size_t k = 1; k <= 10; k += 3) {
19880 for (uint32_t m = 1; m <= 4; m++) {
19881 for (uint32_t n = 1; n <= 8; n++) {
19882 GemmMicrokernelTester()
19883 .mr(4)
19884 .nr(8)
19885 .kr(1)
19886 .sr(1)
19887 .m(m)
19888 .n(n)
19889 .k(k)
19890 .cm_stride(11)
19891 .iterations(1)
19892 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19893 }
19894 }
19895 }
19896 }
19897
19898 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, qmin) {
19899 TEST_REQUIRES_ARM_NEON_FMA;
19900 GemmMicrokernelTester()
19901 .mr(4)
19902 .nr(8)
19903 .kr(1)
19904 .sr(1)
19905 .m(4)
19906 .n(8)
19907 .k(2)
19908 .qmin(128)
19909 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19910 }
19911
19912 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, qmax) {
19913 TEST_REQUIRES_ARM_NEON_FMA;
19914 GemmMicrokernelTester()
19915 .mr(4)
19916 .nr(8)
19917 .kr(1)
19918 .sr(1)
19919 .m(4)
19920 .n(8)
19921 .k(2)
19922 .qmax(128)
19923 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19924 }
19925
19926 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD64, strided_cm) {
19927 TEST_REQUIRES_ARM_NEON_FMA;
19928 GemmMicrokernelTester()
19929 .mr(4)
19930 .nr(8)
19931 .kr(1)
19932 .sr(1)
19933 .m(4)
19934 .n(8)
19935 .k(2)
19936 .cm_stride(11)
19937 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld64);
19938 }
19939#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
19940
19941
19942#if XNN_ARCH_ARM || XNN_ARCH_ARM64
19943 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4) {
19944 TEST_REQUIRES_ARM_NEON_FMA;
19945 GemmMicrokernelTester()
19946 .mr(4)
19947 .nr(8)
19948 .kr(1)
19949 .sr(1)
19950 .m(4)
19951 .n(8)
19952 .k(4)
19953 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
19954 }
19955
19956 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, strided_cn) {
19957 TEST_REQUIRES_ARM_NEON_FMA;
19958 GemmMicrokernelTester()
19959 .mr(4)
19960 .nr(8)
19961 .kr(1)
19962 .sr(1)
19963 .m(4)
19964 .n(8)
19965 .k(4)
19966 .cn_stride(11)
19967 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
19968 }
19969
19970 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
19971 TEST_REQUIRES_ARM_NEON_FMA;
19972 GemmMicrokernelTester()
19973 .mr(4)
19974 .nr(8)
19975 .kr(1)
19976 .sr(1)
19977 .m(4)
19978 .n(8)
19979 .k(4)
19980 .a_stride(7)
19981 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
19982 }
19983
19984 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
19985 TEST_REQUIRES_ARM_NEON_FMA;
19986 for (uint32_t m = 1; m <= 4; m++) {
19987 for (uint32_t n = 1; n <= 8; n++) {
19988 GemmMicrokernelTester()
19989 .mr(4)
19990 .nr(8)
19991 .kr(1)
19992 .sr(1)
19993 .m(m)
19994 .n(n)
19995 .k(4)
19996 .iterations(1)
19997 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
19998 }
19999 }
20000 }
20001
20002 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
20003 TEST_REQUIRES_ARM_NEON_FMA;
20004 for (uint32_t m = 1; m <= 4; m++) {
20005 GemmMicrokernelTester()
20006 .mr(4)
20007 .nr(8)
20008 .kr(1)
20009 .sr(1)
20010 .m(m)
20011 .n(8)
20012 .k(4)
20013 .iterations(1)
20014 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20015 }
20016 }
20017
20018 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
20019 TEST_REQUIRES_ARM_NEON_FMA;
20020 for (uint32_t n = 1; n <= 8; n++) {
20021 GemmMicrokernelTester()
20022 .mr(4)
20023 .nr(8)
20024 .kr(1)
20025 .sr(1)
20026 .m(4)
20027 .n(n)
20028 .k(4)
20029 .iterations(1)
20030 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20031 }
20032 }
20033
20034 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_lt_4) {
20035 TEST_REQUIRES_ARM_NEON_FMA;
20036 for (size_t k = 1; k < 4; k++) {
20037 GemmMicrokernelTester()
20038 .mr(4)
20039 .nr(8)
20040 .kr(1)
20041 .sr(1)
20042 .m(4)
20043 .n(8)
20044 .k(k)
20045 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20046 }
20047 }
20048
20049 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
20050 TEST_REQUIRES_ARM_NEON_FMA;
20051 for (size_t k = 1; k < 4; k++) {
20052 GemmMicrokernelTester()
20053 .mr(4)
20054 .nr(8)
20055 .kr(1)
20056 .sr(1)
20057 .m(4)
20058 .n(8)
20059 .k(k)
20060 .a_stride(7)
20061 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20062 }
20063 }
20064
20065 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
20066 TEST_REQUIRES_ARM_NEON_FMA;
20067 for (size_t k = 1; k < 4; k++) {
20068 for (uint32_t m = 1; m <= 4; m++) {
20069 for (uint32_t n = 1; n <= 8; n++) {
20070 GemmMicrokernelTester()
20071 .mr(4)
20072 .nr(8)
20073 .kr(1)
20074 .sr(1)
20075 .m(m)
20076 .n(n)
20077 .k(k)
20078 .iterations(1)
20079 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20080 }
20081 }
20082 }
20083 }
20084
20085 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_gt_4) {
20086 TEST_REQUIRES_ARM_NEON_FMA;
20087 for (size_t k = 5; k < 8; k++) {
20088 GemmMicrokernelTester()
20089 .mr(4)
20090 .nr(8)
20091 .kr(1)
20092 .sr(1)
20093 .m(4)
20094 .n(8)
20095 .k(k)
20096 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20097 }
20098 }
20099
20100 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
20101 TEST_REQUIRES_ARM_NEON_FMA;
20102 for (size_t k = 5; k < 8; k++) {
20103 GemmMicrokernelTester()
20104 .mr(4)
20105 .nr(8)
20106 .kr(1)
20107 .sr(1)
20108 .m(4)
20109 .n(8)
20110 .k(k)
20111 .a_stride(11)
20112 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20113 }
20114 }
20115
20116 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
20117 TEST_REQUIRES_ARM_NEON_FMA;
20118 for (size_t k = 5; k < 8; k++) {
20119 for (uint32_t m = 1; m <= 4; m++) {
20120 for (uint32_t n = 1; n <= 8; n++) {
20121 GemmMicrokernelTester()
20122 .mr(4)
20123 .nr(8)
20124 .kr(1)
20125 .sr(1)
20126 .m(m)
20127 .n(n)
20128 .k(k)
20129 .iterations(1)
20130 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20131 }
20132 }
20133 }
20134 }
20135
20136 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_div_4) {
20137 TEST_REQUIRES_ARM_NEON_FMA;
20138 for (size_t k = 8; k <= 40; k += 4) {
20139 GemmMicrokernelTester()
20140 .mr(4)
20141 .nr(8)
20142 .kr(1)
20143 .sr(1)
20144 .m(4)
20145 .n(8)
20146 .k(k)
20147 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20148 }
20149 }
20150
20151 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
20152 TEST_REQUIRES_ARM_NEON_FMA;
20153 for (size_t k = 8; k <= 40; k += 4) {
20154 GemmMicrokernelTester()
20155 .mr(4)
20156 .nr(8)
20157 .kr(1)
20158 .sr(1)
20159 .m(4)
20160 .n(8)
20161 .k(k)
20162 .a_stride(43)
20163 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20164 }
20165 }
20166
20167 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
20168 TEST_REQUIRES_ARM_NEON_FMA;
20169 for (size_t k = 8; k <= 40; k += 4) {
20170 for (uint32_t m = 1; m <= 4; m++) {
20171 for (uint32_t n = 1; n <= 8; n++) {
20172 GemmMicrokernelTester()
20173 .mr(4)
20174 .nr(8)
20175 .kr(1)
20176 .sr(1)
20177 .m(m)
20178 .n(n)
20179 .k(k)
20180 .iterations(1)
20181 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20182 }
20183 }
20184 }
20185 }
20186
20187 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8) {
20188 TEST_REQUIRES_ARM_NEON_FMA;
20189 for (uint32_t n = 9; n < 16; n++) {
20190 for (size_t k = 1; k <= 20; k += 5) {
20191 GemmMicrokernelTester()
20192 .mr(4)
20193 .nr(8)
20194 .kr(1)
20195 .sr(1)
20196 .m(4)
20197 .n(8)
20198 .k(k)
20199 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20200 }
20201 }
20202 }
20203
20204 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
20205 TEST_REQUIRES_ARM_NEON_FMA;
20206 for (uint32_t n = 9; n < 16; n++) {
20207 for (size_t k = 1; k <= 20; k += 5) {
20208 GemmMicrokernelTester()
20209 .mr(4)
20210 .nr(8)
20211 .kr(1)
20212 .sr(1)
20213 .m(4)
20214 .n(8)
20215 .k(k)
20216 .cn_stride(11)
20217 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20218 }
20219 }
20220 }
20221
20222 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
20223 TEST_REQUIRES_ARM_NEON_FMA;
20224 for (uint32_t n = 9; n < 16; n++) {
20225 for (size_t k = 1; k <= 20; k += 5) {
20226 GemmMicrokernelTester()
20227 .mr(4)
20228 .nr(8)
20229 .kr(1)
20230 .sr(1)
20231 .m(4)
20232 .n(n)
20233 .k(k)
20234 .a_stride(23)
20235 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20236 }
20237 }
20238 }
20239
20240 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
20241 TEST_REQUIRES_ARM_NEON_FMA;
20242 for (uint32_t n = 9; n < 16; n++) {
20243 for (size_t k = 1; k <= 20; k += 5) {
20244 for (uint32_t m = 1; m <= 4; m++) {
20245 GemmMicrokernelTester()
20246 .mr(4)
20247 .nr(8)
20248 .kr(1)
20249 .sr(1)
20250 .m(m)
20251 .n(n)
20252 .k(k)
20253 .iterations(1)
20254 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20255 }
20256 }
20257 }
20258 }
20259
20260 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8) {
20261 TEST_REQUIRES_ARM_NEON_FMA;
20262 for (uint32_t n = 16; n <= 24; n += 8) {
20263 for (size_t k = 1; k <= 20; k += 5) {
20264 GemmMicrokernelTester()
20265 .mr(4)
20266 .nr(8)
20267 .kr(1)
20268 .sr(1)
20269 .m(4)
20270 .n(8)
20271 .k(k)
20272 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20273 }
20274 }
20275 }
20276
20277 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
20278 TEST_REQUIRES_ARM_NEON_FMA;
20279 for (uint32_t n = 16; n <= 24; n += 8) {
20280 for (size_t k = 1; k <= 20; k += 5) {
20281 GemmMicrokernelTester()
20282 .mr(4)
20283 .nr(8)
20284 .kr(1)
20285 .sr(1)
20286 .m(4)
20287 .n(n)
20288 .k(k)
20289 .cn_stride(11)
20290 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20291 }
20292 }
20293 }
20294
20295 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
20296 TEST_REQUIRES_ARM_NEON_FMA;
20297 for (uint32_t n = 16; n <= 24; n += 8) {
20298 for (size_t k = 1; k <= 20; k += 5) {
20299 GemmMicrokernelTester()
20300 .mr(4)
20301 .nr(8)
20302 .kr(1)
20303 .sr(1)
20304 .m(4)
20305 .n(n)
20306 .k(k)
20307 .a_stride(23)
20308 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20309 }
20310 }
20311 }
20312
20313 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
20314 TEST_REQUIRES_ARM_NEON_FMA;
20315 for (uint32_t n = 16; n <= 24; n += 8) {
20316 for (size_t k = 1; k <= 20; k += 5) {
20317 for (uint32_t m = 1; m <= 4; m++) {
20318 GemmMicrokernelTester()
20319 .mr(4)
20320 .nr(8)
20321 .kr(1)
20322 .sr(1)
20323 .m(m)
20324 .n(n)
20325 .k(k)
20326 .iterations(1)
20327 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20328 }
20329 }
20330 }
20331 }
20332
20333 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
20334 TEST_REQUIRES_ARM_NEON_FMA;
20335 for (size_t k = 1; k <= 20; k += 5) {
20336 for (uint32_t m = 1; m <= 4; m++) {
20337 for (uint32_t n = 1; n <= 8; n++) {
20338 GemmMicrokernelTester()
20339 .mr(4)
20340 .nr(8)
20341 .kr(1)
20342 .sr(1)
20343 .m(m)
20344 .n(n)
20345 .k(k)
20346 .cm_stride(11)
20347 .iterations(1)
20348 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20349 }
20350 }
20351 }
20352 }
20353
20354 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, qmin) {
20355 TEST_REQUIRES_ARM_NEON_FMA;
20356 GemmMicrokernelTester()
20357 .mr(4)
20358 .nr(8)
20359 .kr(1)
20360 .sr(1)
20361 .m(4)
20362 .n(8)
20363 .k(4)
20364 .qmin(128)
20365 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20366 }
20367
20368 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, qmax) {
20369 TEST_REQUIRES_ARM_NEON_FMA;
20370 GemmMicrokernelTester()
20371 .mr(4)
20372 .nr(8)
20373 .kr(1)
20374 .sr(1)
20375 .m(4)
20376 .n(8)
20377 .k(4)
20378 .qmax(128)
20379 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20380 }
20381
20382 TEST(F32_GEMM_4X8__NEONFMA_DUP_LD128, strided_cm) {
20383 TEST_REQUIRES_ARM_NEON_FMA;
20384 GemmMicrokernelTester()
20385 .mr(4)
20386 .nr(8)
20387 .kr(1)
20388 .sr(1)
20389 .m(4)
20390 .n(8)
20391 .k(4)
20392 .cm_stride(11)
20393 .Test(xnn_f32_gemm_ukernel_4x8__neonfma_dup_ld128);
20394 }
20395#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20396
20397
20398#if XNN_ARCH_ARM || XNN_ARCH_ARM64
20399 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2) {
20400 TEST_REQUIRES_ARM_NEON_FMA;
20401 GemmMicrokernelTester()
20402 .mr(6)
20403 .nr(8)
20404 .kr(1)
20405 .sr(1)
20406 .m(6)
20407 .n(8)
20408 .k(2)
20409 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20410 }
20411
20412 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, strided_cn) {
20413 TEST_REQUIRES_ARM_NEON_FMA;
20414 GemmMicrokernelTester()
20415 .mr(6)
20416 .nr(8)
20417 .kr(1)
20418 .sr(1)
20419 .m(6)
20420 .n(8)
20421 .k(2)
20422 .cn_stride(11)
20423 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20424 }
20425
20426 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_strided_a) {
20427 TEST_REQUIRES_ARM_NEON_FMA;
20428 GemmMicrokernelTester()
20429 .mr(6)
20430 .nr(8)
20431 .kr(1)
20432 .sr(1)
20433 .m(6)
20434 .n(8)
20435 .k(2)
20436 .a_stride(5)
20437 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20438 }
20439
20440 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile) {
20441 TEST_REQUIRES_ARM_NEON_FMA;
20442 for (uint32_t m = 1; m <= 6; m++) {
20443 for (uint32_t n = 1; n <= 8; n++) {
20444 GemmMicrokernelTester()
20445 .mr(6)
20446 .nr(8)
20447 .kr(1)
20448 .sr(1)
20449 .m(m)
20450 .n(n)
20451 .k(2)
20452 .iterations(1)
20453 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20454 }
20455 }
20456 }
20457
20458 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_m) {
20459 TEST_REQUIRES_ARM_NEON_FMA;
20460 for (uint32_t m = 1; m <= 6; m++) {
20461 GemmMicrokernelTester()
20462 .mr(6)
20463 .nr(8)
20464 .kr(1)
20465 .sr(1)
20466 .m(m)
20467 .n(8)
20468 .k(2)
20469 .iterations(1)
20470 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20471 }
20472 }
20473
20474 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_eq_2_subtile_n) {
20475 TEST_REQUIRES_ARM_NEON_FMA;
20476 for (uint32_t n = 1; n <= 8; n++) {
20477 GemmMicrokernelTester()
20478 .mr(6)
20479 .nr(8)
20480 .kr(1)
20481 .sr(1)
20482 .m(6)
20483 .n(n)
20484 .k(2)
20485 .iterations(1)
20486 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20487 }
20488 }
20489
20490 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_lt_2) {
20491 TEST_REQUIRES_ARM_NEON_FMA;
20492 for (size_t k = 1; k < 2; k++) {
20493 GemmMicrokernelTester()
20494 .mr(6)
20495 .nr(8)
20496 .kr(1)
20497 .sr(1)
20498 .m(6)
20499 .n(8)
20500 .k(k)
20501 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20502 }
20503 }
20504
20505 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_lt_2_strided_a) {
20506 TEST_REQUIRES_ARM_NEON_FMA;
20507 for (size_t k = 1; k < 2; k++) {
20508 GemmMicrokernelTester()
20509 .mr(6)
20510 .nr(8)
20511 .kr(1)
20512 .sr(1)
20513 .m(6)
20514 .n(8)
20515 .k(k)
20516 .a_stride(5)
20517 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20518 }
20519 }
20520
20521 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_lt_2_subtile) {
20522 TEST_REQUIRES_ARM_NEON_FMA;
20523 for (size_t k = 1; k < 2; k++) {
20524 for (uint32_t m = 1; m <= 6; m++) {
20525 for (uint32_t n = 1; n <= 8; n++) {
20526 GemmMicrokernelTester()
20527 .mr(6)
20528 .nr(8)
20529 .kr(1)
20530 .sr(1)
20531 .m(m)
20532 .n(n)
20533 .k(k)
20534 .iterations(1)
20535 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20536 }
20537 }
20538 }
20539 }
20540
20541 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_gt_2) {
20542 TEST_REQUIRES_ARM_NEON_FMA;
20543 for (size_t k = 3; k < 4; k++) {
20544 GemmMicrokernelTester()
20545 .mr(6)
20546 .nr(8)
20547 .kr(1)
20548 .sr(1)
20549 .m(6)
20550 .n(8)
20551 .k(k)
20552 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20553 }
20554 }
20555
20556 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_gt_2_strided_a) {
20557 TEST_REQUIRES_ARM_NEON_FMA;
20558 for (size_t k = 3; k < 4; k++) {
20559 GemmMicrokernelTester()
20560 .mr(6)
20561 .nr(8)
20562 .kr(1)
20563 .sr(1)
20564 .m(6)
20565 .n(8)
20566 .k(k)
20567 .a_stride(7)
20568 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20569 }
20570 }
20571
20572 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_gt_2_subtile) {
20573 TEST_REQUIRES_ARM_NEON_FMA;
20574 for (size_t k = 3; k < 4; k++) {
20575 for (uint32_t m = 1; m <= 6; m++) {
20576 for (uint32_t n = 1; n <= 8; n++) {
20577 GemmMicrokernelTester()
20578 .mr(6)
20579 .nr(8)
20580 .kr(1)
20581 .sr(1)
20582 .m(m)
20583 .n(n)
20584 .k(k)
20585 .iterations(1)
20586 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20587 }
20588 }
20589 }
20590 }
20591
20592 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_div_2) {
20593 TEST_REQUIRES_ARM_NEON_FMA;
20594 for (size_t k = 4; k <= 20; k += 2) {
20595 GemmMicrokernelTester()
20596 .mr(6)
20597 .nr(8)
20598 .kr(1)
20599 .sr(1)
20600 .m(6)
20601 .n(8)
20602 .k(k)
20603 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20604 }
20605 }
20606
20607 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_div_2_strided_a) {
20608 TEST_REQUIRES_ARM_NEON_FMA;
20609 for (size_t k = 4; k <= 20; k += 2) {
20610 GemmMicrokernelTester()
20611 .mr(6)
20612 .nr(8)
20613 .kr(1)
20614 .sr(1)
20615 .m(6)
20616 .n(8)
20617 .k(k)
20618 .a_stride(23)
20619 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20620 }
20621 }
20622
20623 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, k_div_2_subtile) {
20624 TEST_REQUIRES_ARM_NEON_FMA;
20625 for (size_t k = 4; k <= 20; k += 2) {
20626 for (uint32_t m = 1; m <= 6; m++) {
20627 for (uint32_t n = 1; n <= 8; n++) {
20628 GemmMicrokernelTester()
20629 .mr(6)
20630 .nr(8)
20631 .kr(1)
20632 .sr(1)
20633 .m(m)
20634 .n(n)
20635 .k(k)
20636 .iterations(1)
20637 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20638 }
20639 }
20640 }
20641 }
20642
20643 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8) {
20644 TEST_REQUIRES_ARM_NEON_FMA;
20645 for (uint32_t n = 9; n < 16; n++) {
20646 for (size_t k = 1; k <= 10; k += 3) {
20647 GemmMicrokernelTester()
20648 .mr(6)
20649 .nr(8)
20650 .kr(1)
20651 .sr(1)
20652 .m(6)
20653 .n(8)
20654 .k(k)
20655 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20656 }
20657 }
20658 }
20659
20660 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_cn) {
20661 TEST_REQUIRES_ARM_NEON_FMA;
20662 for (uint32_t n = 9; n < 16; n++) {
20663 for (size_t k = 1; k <= 10; k += 3) {
20664 GemmMicrokernelTester()
20665 .mr(6)
20666 .nr(8)
20667 .kr(1)
20668 .sr(1)
20669 .m(6)
20670 .n(8)
20671 .k(k)
20672 .cn_stride(11)
20673 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20674 }
20675 }
20676 }
20677
20678 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_strided_a) {
20679 TEST_REQUIRES_ARM_NEON_FMA;
20680 for (uint32_t n = 9; n < 16; n++) {
20681 for (size_t k = 1; k <= 10; k += 3) {
20682 GemmMicrokernelTester()
20683 .mr(6)
20684 .nr(8)
20685 .kr(1)
20686 .sr(1)
20687 .m(6)
20688 .n(n)
20689 .k(k)
20690 .a_stride(13)
20691 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20692 }
20693 }
20694 }
20695
20696 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_gt_8_subtile) {
20697 TEST_REQUIRES_ARM_NEON_FMA;
20698 for (uint32_t n = 9; n < 16; n++) {
20699 for (size_t k = 1; k <= 10; k += 3) {
20700 for (uint32_t m = 1; m <= 6; m++) {
20701 GemmMicrokernelTester()
20702 .mr(6)
20703 .nr(8)
20704 .kr(1)
20705 .sr(1)
20706 .m(m)
20707 .n(n)
20708 .k(k)
20709 .iterations(1)
20710 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20711 }
20712 }
20713 }
20714 }
20715
20716 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8) {
20717 TEST_REQUIRES_ARM_NEON_FMA;
20718 for (uint32_t n = 16; n <= 24; n += 8) {
20719 for (size_t k = 1; k <= 10; k += 3) {
20720 GemmMicrokernelTester()
20721 .mr(6)
20722 .nr(8)
20723 .kr(1)
20724 .sr(1)
20725 .m(6)
20726 .n(8)
20727 .k(k)
20728 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20729 }
20730 }
20731 }
20732
20733 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8_strided_cn) {
20734 TEST_REQUIRES_ARM_NEON_FMA;
20735 for (uint32_t n = 16; n <= 24; n += 8) {
20736 for (size_t k = 1; k <= 10; k += 3) {
20737 GemmMicrokernelTester()
20738 .mr(6)
20739 .nr(8)
20740 .kr(1)
20741 .sr(1)
20742 .m(6)
20743 .n(n)
20744 .k(k)
20745 .cn_stride(11)
20746 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20747 }
20748 }
20749 }
20750
20751 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8_strided_a) {
20752 TEST_REQUIRES_ARM_NEON_FMA;
20753 for (uint32_t n = 16; n <= 24; n += 8) {
20754 for (size_t k = 1; k <= 10; k += 3) {
20755 GemmMicrokernelTester()
20756 .mr(6)
20757 .nr(8)
20758 .kr(1)
20759 .sr(1)
20760 .m(6)
20761 .n(n)
20762 .k(k)
20763 .a_stride(13)
20764 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20765 }
20766 }
20767 }
20768
20769 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, n_div_8_subtile) {
20770 TEST_REQUIRES_ARM_NEON_FMA;
20771 for (uint32_t n = 16; n <= 24; n += 8) {
20772 for (size_t k = 1; k <= 10; k += 3) {
20773 for (uint32_t m = 1; m <= 6; m++) {
20774 GemmMicrokernelTester()
20775 .mr(6)
20776 .nr(8)
20777 .kr(1)
20778 .sr(1)
20779 .m(m)
20780 .n(n)
20781 .k(k)
20782 .iterations(1)
20783 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20784 }
20785 }
20786 }
20787 }
20788
20789 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, strided_cm_subtile) {
20790 TEST_REQUIRES_ARM_NEON_FMA;
20791 for (size_t k = 1; k <= 10; k += 3) {
20792 for (uint32_t m = 1; m <= 6; m++) {
20793 for (uint32_t n = 1; n <= 8; n++) {
20794 GemmMicrokernelTester()
20795 .mr(6)
20796 .nr(8)
20797 .kr(1)
20798 .sr(1)
20799 .m(m)
20800 .n(n)
20801 .k(k)
20802 .cm_stride(11)
20803 .iterations(1)
20804 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20805 }
20806 }
20807 }
20808 }
20809
20810 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, qmin) {
20811 TEST_REQUIRES_ARM_NEON_FMA;
20812 GemmMicrokernelTester()
20813 .mr(6)
20814 .nr(8)
20815 .kr(1)
20816 .sr(1)
20817 .m(6)
20818 .n(8)
20819 .k(2)
20820 .qmin(128)
20821 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20822 }
20823
20824 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, qmax) {
20825 TEST_REQUIRES_ARM_NEON_FMA;
20826 GemmMicrokernelTester()
20827 .mr(6)
20828 .nr(8)
20829 .kr(1)
20830 .sr(1)
20831 .m(6)
20832 .n(8)
20833 .k(2)
20834 .qmax(128)
20835 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20836 }
20837
20838 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD64, strided_cm) {
20839 TEST_REQUIRES_ARM_NEON_FMA;
20840 GemmMicrokernelTester()
20841 .mr(6)
20842 .nr(8)
20843 .kr(1)
20844 .sr(1)
20845 .m(6)
20846 .n(8)
20847 .k(2)
20848 .cm_stride(11)
20849 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld64);
20850 }
20851#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
20852
20853
20854#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard69172d92019-11-26 16:22:39 -080020855 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4) {
20856 TEST_REQUIRES_ARM_NEON_FMA;
20857 GemmMicrokernelTester()
20858 .mr(6)
20859 .nr(8)
20860 .kr(1)
20861 .sr(1)
20862 .m(6)
20863 .n(8)
20864 .k(4)
20865 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20866 }
20867
20868 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, strided_cn) {
20869 TEST_REQUIRES_ARM_NEON_FMA;
20870 GemmMicrokernelTester()
20871 .mr(6)
20872 .nr(8)
20873 .kr(1)
20874 .sr(1)
20875 .m(6)
20876 .n(8)
20877 .k(4)
20878 .cn_stride(11)
20879 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20880 }
20881
20882 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_strided_a) {
20883 TEST_REQUIRES_ARM_NEON_FMA;
20884 GemmMicrokernelTester()
20885 .mr(6)
20886 .nr(8)
20887 .kr(1)
20888 .sr(1)
20889 .m(6)
20890 .n(8)
20891 .k(4)
20892 .a_stride(7)
20893 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20894 }
20895
20896 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile) {
20897 TEST_REQUIRES_ARM_NEON_FMA;
20898 for (uint32_t m = 1; m <= 6; m++) {
20899 for (uint32_t n = 1; n <= 8; n++) {
20900 GemmMicrokernelTester()
20901 .mr(6)
20902 .nr(8)
20903 .kr(1)
20904 .sr(1)
20905 .m(m)
20906 .n(n)
20907 .k(4)
20908 .iterations(1)
20909 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20910 }
20911 }
20912 }
20913
20914 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_m) {
20915 TEST_REQUIRES_ARM_NEON_FMA;
20916 for (uint32_t m = 1; m <= 6; m++) {
20917 GemmMicrokernelTester()
20918 .mr(6)
20919 .nr(8)
20920 .kr(1)
20921 .sr(1)
20922 .m(m)
20923 .n(8)
20924 .k(4)
20925 .iterations(1)
20926 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20927 }
20928 }
20929
20930 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_eq_4_subtile_n) {
20931 TEST_REQUIRES_ARM_NEON_FMA;
20932 for (uint32_t n = 1; n <= 8; n++) {
20933 GemmMicrokernelTester()
20934 .mr(6)
20935 .nr(8)
20936 .kr(1)
20937 .sr(1)
20938 .m(6)
20939 .n(n)
20940 .k(4)
20941 .iterations(1)
20942 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20943 }
20944 }
20945
20946 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_lt_4) {
20947 TEST_REQUIRES_ARM_NEON_FMA;
20948 for (size_t k = 1; k < 4; k++) {
20949 GemmMicrokernelTester()
20950 .mr(6)
20951 .nr(8)
20952 .kr(1)
20953 .sr(1)
20954 .m(6)
20955 .n(8)
20956 .k(k)
20957 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20958 }
20959 }
20960
20961 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_lt_4_strided_a) {
20962 TEST_REQUIRES_ARM_NEON_FMA;
20963 for (size_t k = 1; k < 4; k++) {
20964 GemmMicrokernelTester()
20965 .mr(6)
20966 .nr(8)
20967 .kr(1)
20968 .sr(1)
20969 .m(6)
20970 .n(8)
20971 .k(k)
20972 .a_stride(7)
20973 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20974 }
20975 }
20976
20977 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_lt_4_subtile) {
20978 TEST_REQUIRES_ARM_NEON_FMA;
20979 for (size_t k = 1; k < 4; k++) {
20980 for (uint32_t m = 1; m <= 6; m++) {
20981 for (uint32_t n = 1; n <= 8; n++) {
20982 GemmMicrokernelTester()
20983 .mr(6)
20984 .nr(8)
20985 .kr(1)
20986 .sr(1)
20987 .m(m)
20988 .n(n)
20989 .k(k)
20990 .iterations(1)
20991 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
20992 }
20993 }
20994 }
20995 }
20996
20997 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_gt_4) {
20998 TEST_REQUIRES_ARM_NEON_FMA;
20999 for (size_t k = 5; k < 8; k++) {
21000 GemmMicrokernelTester()
21001 .mr(6)
21002 .nr(8)
21003 .kr(1)
21004 .sr(1)
21005 .m(6)
21006 .n(8)
21007 .k(k)
21008 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21009 }
21010 }
21011
21012 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_gt_4_strided_a) {
21013 TEST_REQUIRES_ARM_NEON_FMA;
21014 for (size_t k = 5; k < 8; k++) {
21015 GemmMicrokernelTester()
21016 .mr(6)
21017 .nr(8)
21018 .kr(1)
21019 .sr(1)
21020 .m(6)
21021 .n(8)
21022 .k(k)
21023 .a_stride(11)
21024 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21025 }
21026 }
21027
21028 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_gt_4_subtile) {
21029 TEST_REQUIRES_ARM_NEON_FMA;
21030 for (size_t k = 5; k < 8; k++) {
21031 for (uint32_t m = 1; m <= 6; m++) {
21032 for (uint32_t n = 1; n <= 8; n++) {
21033 GemmMicrokernelTester()
21034 .mr(6)
21035 .nr(8)
21036 .kr(1)
21037 .sr(1)
21038 .m(m)
21039 .n(n)
21040 .k(k)
21041 .iterations(1)
21042 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21043 }
21044 }
21045 }
21046 }
21047
21048 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_div_4) {
21049 TEST_REQUIRES_ARM_NEON_FMA;
21050 for (size_t k = 8; k <= 40; k += 4) {
21051 GemmMicrokernelTester()
21052 .mr(6)
21053 .nr(8)
21054 .kr(1)
21055 .sr(1)
21056 .m(6)
21057 .n(8)
21058 .k(k)
21059 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21060 }
21061 }
21062
21063 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_div_4_strided_a) {
21064 TEST_REQUIRES_ARM_NEON_FMA;
21065 for (size_t k = 8; k <= 40; k += 4) {
21066 GemmMicrokernelTester()
21067 .mr(6)
21068 .nr(8)
21069 .kr(1)
21070 .sr(1)
21071 .m(6)
21072 .n(8)
21073 .k(k)
21074 .a_stride(43)
21075 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21076 }
21077 }
21078
21079 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, k_div_4_subtile) {
21080 TEST_REQUIRES_ARM_NEON_FMA;
21081 for (size_t k = 8; k <= 40; k += 4) {
21082 for (uint32_t m = 1; m <= 6; m++) {
21083 for (uint32_t n = 1; n <= 8; n++) {
21084 GemmMicrokernelTester()
21085 .mr(6)
21086 .nr(8)
21087 .kr(1)
21088 .sr(1)
21089 .m(m)
21090 .n(n)
21091 .k(k)
21092 .iterations(1)
21093 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21094 }
21095 }
21096 }
21097 }
21098
21099 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8) {
21100 TEST_REQUIRES_ARM_NEON_FMA;
21101 for (uint32_t n = 9; n < 16; n++) {
21102 for (size_t k = 1; k <= 20; k += 5) {
21103 GemmMicrokernelTester()
21104 .mr(6)
21105 .nr(8)
21106 .kr(1)
21107 .sr(1)
21108 .m(6)
21109 .n(8)
21110 .k(k)
21111 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21112 }
21113 }
21114 }
21115
21116 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_cn) {
21117 TEST_REQUIRES_ARM_NEON_FMA;
21118 for (uint32_t n = 9; n < 16; n++) {
21119 for (size_t k = 1; k <= 20; k += 5) {
21120 GemmMicrokernelTester()
21121 .mr(6)
21122 .nr(8)
21123 .kr(1)
21124 .sr(1)
21125 .m(6)
21126 .n(8)
21127 .k(k)
21128 .cn_stride(11)
21129 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21130 }
21131 }
21132 }
21133
21134 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_strided_a) {
21135 TEST_REQUIRES_ARM_NEON_FMA;
21136 for (uint32_t n = 9; n < 16; n++) {
21137 for (size_t k = 1; k <= 20; k += 5) {
21138 GemmMicrokernelTester()
21139 .mr(6)
21140 .nr(8)
21141 .kr(1)
21142 .sr(1)
21143 .m(6)
21144 .n(n)
21145 .k(k)
21146 .a_stride(23)
21147 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21148 }
21149 }
21150 }
21151
21152 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_gt_8_subtile) {
21153 TEST_REQUIRES_ARM_NEON_FMA;
21154 for (uint32_t n = 9; n < 16; n++) {
21155 for (size_t k = 1; k <= 20; k += 5) {
21156 for (uint32_t m = 1; m <= 6; m++) {
21157 GemmMicrokernelTester()
21158 .mr(6)
21159 .nr(8)
21160 .kr(1)
21161 .sr(1)
21162 .m(m)
21163 .n(n)
21164 .k(k)
21165 .iterations(1)
21166 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21167 }
21168 }
21169 }
21170 }
21171
21172 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8) {
21173 TEST_REQUIRES_ARM_NEON_FMA;
21174 for (uint32_t n = 16; n <= 24; n += 8) {
21175 for (size_t k = 1; k <= 20; k += 5) {
21176 GemmMicrokernelTester()
21177 .mr(6)
21178 .nr(8)
21179 .kr(1)
21180 .sr(1)
21181 .m(6)
21182 .n(8)
21183 .k(k)
21184 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21185 }
21186 }
21187 }
21188
21189 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8_strided_cn) {
21190 TEST_REQUIRES_ARM_NEON_FMA;
21191 for (uint32_t n = 16; n <= 24; n += 8) {
21192 for (size_t k = 1; k <= 20; k += 5) {
21193 GemmMicrokernelTester()
21194 .mr(6)
21195 .nr(8)
21196 .kr(1)
21197 .sr(1)
21198 .m(6)
21199 .n(n)
21200 .k(k)
21201 .cn_stride(11)
21202 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21203 }
21204 }
21205 }
21206
21207 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8_strided_a) {
21208 TEST_REQUIRES_ARM_NEON_FMA;
21209 for (uint32_t n = 16; n <= 24; n += 8) {
21210 for (size_t k = 1; k <= 20; k += 5) {
21211 GemmMicrokernelTester()
21212 .mr(6)
21213 .nr(8)
21214 .kr(1)
21215 .sr(1)
21216 .m(6)
21217 .n(n)
21218 .k(k)
21219 .a_stride(23)
21220 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21221 }
21222 }
21223 }
21224
21225 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, n_div_8_subtile) {
21226 TEST_REQUIRES_ARM_NEON_FMA;
21227 for (uint32_t n = 16; n <= 24; n += 8) {
21228 for (size_t k = 1; k <= 20; k += 5) {
21229 for (uint32_t m = 1; m <= 6; m++) {
21230 GemmMicrokernelTester()
21231 .mr(6)
21232 .nr(8)
21233 .kr(1)
21234 .sr(1)
21235 .m(m)
21236 .n(n)
21237 .k(k)
21238 .iterations(1)
21239 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21240 }
21241 }
21242 }
21243 }
21244
21245 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, strided_cm_subtile) {
21246 TEST_REQUIRES_ARM_NEON_FMA;
21247 for (size_t k = 1; k <= 20; k += 5) {
21248 for (uint32_t m = 1; m <= 6; m++) {
21249 for (uint32_t n = 1; n <= 8; n++) {
21250 GemmMicrokernelTester()
21251 .mr(6)
21252 .nr(8)
21253 .kr(1)
21254 .sr(1)
21255 .m(m)
21256 .n(n)
21257 .k(k)
21258 .cm_stride(11)
21259 .iterations(1)
21260 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21261 }
21262 }
21263 }
21264 }
21265
21266 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, qmin) {
21267 TEST_REQUIRES_ARM_NEON_FMA;
21268 GemmMicrokernelTester()
21269 .mr(6)
21270 .nr(8)
21271 .kr(1)
21272 .sr(1)
21273 .m(6)
21274 .n(8)
21275 .k(4)
21276 .qmin(128)
21277 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21278 }
21279
21280 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, qmax) {
21281 TEST_REQUIRES_ARM_NEON_FMA;
21282 GemmMicrokernelTester()
21283 .mr(6)
21284 .nr(8)
21285 .kr(1)
21286 .sr(1)
21287 .m(6)
21288 .n(8)
21289 .k(4)
21290 .qmax(128)
21291 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21292 }
21293
21294 TEST(F32_GEMM_6X8__NEONFMA_DUP_LD128, strided_cm) {
21295 TEST_REQUIRES_ARM_NEON_FMA;
21296 GemmMicrokernelTester()
21297 .mr(6)
21298 .nr(8)
21299 .kr(1)
21300 .sr(1)
21301 .m(6)
21302 .n(8)
21303 .k(4)
21304 .cm_stride(11)
21305 .Test(xnn_f32_gemm_ukernel_6x8__neonfma_dup_ld128);
21306 }
21307#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21308
21309
21310#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barchard5243bb02019-11-22 16:37:50 -080021311 TEST(F32_GEMM_1X8S4__NEON, k_eq_4) {
21312 TEST_REQUIRES_ARM_NEON;
21313 GemmMicrokernelTester()
21314 .mr(1)
21315 .nr(8)
21316 .kr(1)
21317 .sr(4)
21318 .m(1)
21319 .n(8)
21320 .k(4)
21321 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21322 }
21323
21324 TEST(F32_GEMM_1X8S4__NEON, strided_cn) {
21325 TEST_REQUIRES_ARM_NEON;
21326 GemmMicrokernelTester()
21327 .mr(1)
21328 .nr(8)
21329 .kr(1)
21330 .sr(4)
21331 .m(1)
21332 .n(8)
21333 .k(4)
21334 .cn_stride(11)
21335 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21336 }
21337
21338 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_strided_a) {
21339 TEST_REQUIRES_ARM_NEON;
21340 GemmMicrokernelTester()
21341 .mr(1)
21342 .nr(8)
21343 .kr(1)
21344 .sr(4)
21345 .m(1)
21346 .n(8)
21347 .k(4)
21348 .a_stride(7)
21349 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21350 }
21351
21352 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_subtile) {
21353 TEST_REQUIRES_ARM_NEON;
21354 for (uint32_t m = 1; m <= 1; m++) {
21355 for (uint32_t n = 1; n <= 8; n++) {
21356 GemmMicrokernelTester()
21357 .mr(1)
21358 .nr(8)
21359 .kr(1)
21360 .sr(4)
21361 .m(m)
21362 .n(n)
21363 .k(4)
21364 .iterations(1)
21365 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21366 }
21367 }
21368 }
21369
21370 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_subtile_m) {
21371 TEST_REQUIRES_ARM_NEON;
21372 for (uint32_t m = 1; m <= 1; m++) {
21373 GemmMicrokernelTester()
21374 .mr(1)
21375 .nr(8)
21376 .kr(1)
21377 .sr(4)
21378 .m(m)
21379 .n(8)
21380 .k(4)
21381 .iterations(1)
21382 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21383 }
21384 }
21385
21386 TEST(F32_GEMM_1X8S4__NEON, k_eq_4_subtile_n) {
21387 TEST_REQUIRES_ARM_NEON;
21388 for (uint32_t n = 1; n <= 8; n++) {
21389 GemmMicrokernelTester()
21390 .mr(1)
21391 .nr(8)
21392 .kr(1)
21393 .sr(4)
21394 .m(1)
21395 .n(n)
21396 .k(4)
21397 .iterations(1)
21398 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21399 }
21400 }
21401
21402 TEST(F32_GEMM_1X8S4__NEON, k_lt_4) {
21403 TEST_REQUIRES_ARM_NEON;
21404 for (size_t k = 1; k < 4; k++) {
21405 GemmMicrokernelTester()
21406 .mr(1)
21407 .nr(8)
21408 .kr(1)
21409 .sr(4)
21410 .m(1)
21411 .n(8)
21412 .k(k)
21413 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21414 }
21415 }
21416
21417 TEST(F32_GEMM_1X8S4__NEON, k_lt_4_strided_a) {
21418 TEST_REQUIRES_ARM_NEON;
21419 for (size_t k = 1; k < 4; k++) {
21420 GemmMicrokernelTester()
21421 .mr(1)
21422 .nr(8)
21423 .kr(1)
21424 .sr(4)
21425 .m(1)
21426 .n(8)
21427 .k(k)
21428 .a_stride(7)
21429 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21430 }
21431 }
21432
21433 TEST(F32_GEMM_1X8S4__NEON, k_lt_4_subtile) {
21434 TEST_REQUIRES_ARM_NEON;
21435 for (size_t k = 1; k < 4; k++) {
21436 for (uint32_t m = 1; m <= 1; m++) {
21437 for (uint32_t n = 1; n <= 8; n++) {
21438 GemmMicrokernelTester()
21439 .mr(1)
21440 .nr(8)
21441 .kr(1)
21442 .sr(4)
21443 .m(m)
21444 .n(n)
21445 .k(k)
21446 .iterations(1)
21447 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21448 }
21449 }
21450 }
21451 }
21452
21453 TEST(F32_GEMM_1X8S4__NEON, k_gt_4) {
21454 TEST_REQUIRES_ARM_NEON;
21455 for (size_t k = 5; k < 8; k++) {
21456 GemmMicrokernelTester()
21457 .mr(1)
21458 .nr(8)
21459 .kr(1)
21460 .sr(4)
21461 .m(1)
21462 .n(8)
21463 .k(k)
21464 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21465 }
21466 }
21467
21468 TEST(F32_GEMM_1X8S4__NEON, k_gt_4_strided_a) {
21469 TEST_REQUIRES_ARM_NEON;
21470 for (size_t k = 5; k < 8; k++) {
21471 GemmMicrokernelTester()
21472 .mr(1)
21473 .nr(8)
21474 .kr(1)
21475 .sr(4)
21476 .m(1)
21477 .n(8)
21478 .k(k)
21479 .a_stride(11)
21480 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21481 }
21482 }
21483
21484 TEST(F32_GEMM_1X8S4__NEON, k_gt_4_subtile) {
21485 TEST_REQUIRES_ARM_NEON;
21486 for (size_t k = 5; k < 8; k++) {
21487 for (uint32_t m = 1; m <= 1; m++) {
21488 for (uint32_t n = 1; n <= 8; n++) {
21489 GemmMicrokernelTester()
21490 .mr(1)
21491 .nr(8)
21492 .kr(1)
21493 .sr(4)
21494 .m(m)
21495 .n(n)
21496 .k(k)
21497 .iterations(1)
21498 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21499 }
21500 }
21501 }
21502 }
21503
21504 TEST(F32_GEMM_1X8S4__NEON, k_div_4) {
21505 TEST_REQUIRES_ARM_NEON;
21506 for (size_t k = 8; k <= 40; k += 4) {
21507 GemmMicrokernelTester()
21508 .mr(1)
21509 .nr(8)
21510 .kr(1)
21511 .sr(4)
21512 .m(1)
21513 .n(8)
21514 .k(k)
21515 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21516 }
21517 }
21518
21519 TEST(F32_GEMM_1X8S4__NEON, k_div_4_strided_a) {
21520 TEST_REQUIRES_ARM_NEON;
21521 for (size_t k = 8; k <= 40; k += 4) {
21522 GemmMicrokernelTester()
21523 .mr(1)
21524 .nr(8)
21525 .kr(1)
21526 .sr(4)
21527 .m(1)
21528 .n(8)
21529 .k(k)
21530 .a_stride(43)
21531 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21532 }
21533 }
21534
21535 TEST(F32_GEMM_1X8S4__NEON, k_div_4_subtile) {
21536 TEST_REQUIRES_ARM_NEON;
21537 for (size_t k = 8; k <= 40; k += 4) {
21538 for (uint32_t m = 1; m <= 1; m++) {
21539 for (uint32_t n = 1; n <= 8; n++) {
21540 GemmMicrokernelTester()
21541 .mr(1)
21542 .nr(8)
21543 .kr(1)
21544 .sr(4)
21545 .m(m)
21546 .n(n)
21547 .k(k)
21548 .iterations(1)
21549 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21550 }
21551 }
21552 }
21553 }
21554
21555 TEST(F32_GEMM_1X8S4__NEON, n_gt_8) {
21556 TEST_REQUIRES_ARM_NEON;
21557 for (uint32_t n = 9; n < 16; n++) {
21558 for (size_t k = 1; k <= 20; k += 5) {
21559 GemmMicrokernelTester()
21560 .mr(1)
21561 .nr(8)
21562 .kr(1)
21563 .sr(4)
21564 .m(1)
21565 .n(8)
21566 .k(k)
21567 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21568 }
21569 }
21570 }
21571
21572 TEST(F32_GEMM_1X8S4__NEON, n_gt_8_strided_cn) {
21573 TEST_REQUIRES_ARM_NEON;
21574 for (uint32_t n = 9; n < 16; n++) {
21575 for (size_t k = 1; k <= 20; k += 5) {
21576 GemmMicrokernelTester()
21577 .mr(1)
21578 .nr(8)
21579 .kr(1)
21580 .sr(4)
21581 .m(1)
21582 .n(8)
21583 .k(k)
21584 .cn_stride(11)
21585 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21586 }
21587 }
21588 }
21589
21590 TEST(F32_GEMM_1X8S4__NEON, n_gt_8_strided_a) {
21591 TEST_REQUIRES_ARM_NEON;
21592 for (uint32_t n = 9; n < 16; n++) {
21593 for (size_t k = 1; k <= 20; k += 5) {
21594 GemmMicrokernelTester()
21595 .mr(1)
21596 .nr(8)
21597 .kr(1)
21598 .sr(4)
21599 .m(1)
21600 .n(n)
21601 .k(k)
21602 .a_stride(23)
21603 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21604 }
21605 }
21606 }
21607
21608 TEST(F32_GEMM_1X8S4__NEON, n_gt_8_subtile) {
21609 TEST_REQUIRES_ARM_NEON;
21610 for (uint32_t n = 9; n < 16; n++) {
21611 for (size_t k = 1; k <= 20; k += 5) {
21612 for (uint32_t m = 1; m <= 1; m++) {
21613 GemmMicrokernelTester()
21614 .mr(1)
21615 .nr(8)
21616 .kr(1)
21617 .sr(4)
21618 .m(m)
21619 .n(n)
21620 .k(k)
21621 .iterations(1)
21622 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21623 }
21624 }
21625 }
21626 }
21627
21628 TEST(F32_GEMM_1X8S4__NEON, n_div_8) {
21629 TEST_REQUIRES_ARM_NEON;
21630 for (uint32_t n = 16; n <= 24; n += 8) {
21631 for (size_t k = 1; k <= 20; k += 5) {
21632 GemmMicrokernelTester()
21633 .mr(1)
21634 .nr(8)
21635 .kr(1)
21636 .sr(4)
21637 .m(1)
21638 .n(8)
21639 .k(k)
21640 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21641 }
21642 }
21643 }
21644
21645 TEST(F32_GEMM_1X8S4__NEON, n_div_8_strided_cn) {
21646 TEST_REQUIRES_ARM_NEON;
21647 for (uint32_t n = 16; n <= 24; n += 8) {
21648 for (size_t k = 1; k <= 20; k += 5) {
21649 GemmMicrokernelTester()
21650 .mr(1)
21651 .nr(8)
21652 .kr(1)
21653 .sr(4)
21654 .m(1)
21655 .n(n)
21656 .k(k)
21657 .cn_stride(11)
21658 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21659 }
21660 }
21661 }
21662
21663 TEST(F32_GEMM_1X8S4__NEON, n_div_8_strided_a) {
21664 TEST_REQUIRES_ARM_NEON;
21665 for (uint32_t n = 16; n <= 24; n += 8) {
21666 for (size_t k = 1; k <= 20; k += 5) {
21667 GemmMicrokernelTester()
21668 .mr(1)
21669 .nr(8)
21670 .kr(1)
21671 .sr(4)
21672 .m(1)
21673 .n(n)
21674 .k(k)
21675 .a_stride(23)
21676 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21677 }
21678 }
21679 }
21680
21681 TEST(F32_GEMM_1X8S4__NEON, n_div_8_subtile) {
21682 TEST_REQUIRES_ARM_NEON;
21683 for (uint32_t n = 16; n <= 24; n += 8) {
21684 for (size_t k = 1; k <= 20; k += 5) {
21685 for (uint32_t m = 1; m <= 1; m++) {
21686 GemmMicrokernelTester()
21687 .mr(1)
21688 .nr(8)
21689 .kr(1)
21690 .sr(4)
21691 .m(m)
21692 .n(n)
21693 .k(k)
21694 .iterations(1)
21695 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21696 }
21697 }
21698 }
21699 }
21700
21701 TEST(F32_GEMM_1X8S4__NEON, strided_cm_subtile) {
21702 TEST_REQUIRES_ARM_NEON;
21703 for (size_t k = 1; k <= 20; k += 5) {
21704 for (uint32_t m = 1; m <= 1; m++) {
21705 for (uint32_t n = 1; n <= 8; n++) {
21706 GemmMicrokernelTester()
21707 .mr(1)
21708 .nr(8)
21709 .kr(1)
21710 .sr(4)
21711 .m(m)
21712 .n(n)
21713 .k(k)
21714 .cm_stride(11)
21715 .iterations(1)
21716 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21717 }
21718 }
21719 }
21720 }
21721
21722 TEST(F32_GEMM_1X8S4__NEON, qmin) {
21723 TEST_REQUIRES_ARM_NEON;
21724 GemmMicrokernelTester()
21725 .mr(1)
21726 .nr(8)
21727 .kr(1)
21728 .sr(4)
21729 .m(1)
21730 .n(8)
21731 .k(4)
21732 .qmin(128)
21733 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21734 }
21735
21736 TEST(F32_GEMM_1X8S4__NEON, qmax) {
21737 TEST_REQUIRES_ARM_NEON;
21738 GemmMicrokernelTester()
21739 .mr(1)
21740 .nr(8)
21741 .kr(1)
21742 .sr(4)
21743 .m(1)
21744 .n(8)
21745 .k(4)
21746 .qmax(128)
21747 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21748 }
21749
21750 TEST(F32_GEMM_1X8S4__NEON, strided_cm) {
21751 TEST_REQUIRES_ARM_NEON;
21752 GemmMicrokernelTester()
21753 .mr(1)
21754 .nr(8)
21755 .kr(1)
21756 .sr(4)
21757 .m(1)
21758 .n(8)
21759 .k(4)
21760 .cm_stride(11)
21761 .Test(xnn_f32_gemm_ukernel_1x8s4__neon);
21762 }
21763#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
21764
21765
21766#if XNN_ARCH_ARM || XNN_ARCH_ARM64
21767 TEST(F32_GEMM_4X8S4__NEON, k_eq_4) {
21768 TEST_REQUIRES_ARM_NEON;
21769 GemmMicrokernelTester()
21770 .mr(4)
21771 .nr(8)
21772 .kr(1)
21773 .sr(4)
21774 .m(4)
21775 .n(8)
21776 .k(4)
21777 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21778 }
21779
21780 TEST(F32_GEMM_4X8S4__NEON, strided_cn) {
21781 TEST_REQUIRES_ARM_NEON;
21782 GemmMicrokernelTester()
21783 .mr(4)
21784 .nr(8)
21785 .kr(1)
21786 .sr(4)
21787 .m(4)
21788 .n(8)
21789 .k(4)
21790 .cn_stride(11)
21791 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21792 }
21793
21794 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_strided_a) {
21795 TEST_REQUIRES_ARM_NEON;
21796 GemmMicrokernelTester()
21797 .mr(4)
21798 .nr(8)
21799 .kr(1)
21800 .sr(4)
21801 .m(4)
21802 .n(8)
21803 .k(4)
21804 .a_stride(7)
21805 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21806 }
21807
21808 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_subtile) {
21809 TEST_REQUIRES_ARM_NEON;
21810 for (uint32_t m = 1; m <= 4; m++) {
21811 for (uint32_t n = 1; n <= 8; n++) {
21812 GemmMicrokernelTester()
21813 .mr(4)
21814 .nr(8)
21815 .kr(1)
21816 .sr(4)
21817 .m(m)
21818 .n(n)
21819 .k(4)
21820 .iterations(1)
21821 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21822 }
21823 }
21824 }
21825
21826 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_subtile_m) {
21827 TEST_REQUIRES_ARM_NEON;
21828 for (uint32_t m = 1; m <= 4; m++) {
21829 GemmMicrokernelTester()
21830 .mr(4)
21831 .nr(8)
21832 .kr(1)
21833 .sr(4)
21834 .m(m)
21835 .n(8)
21836 .k(4)
21837 .iterations(1)
21838 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21839 }
21840 }
21841
21842 TEST(F32_GEMM_4X8S4__NEON, k_eq_4_subtile_n) {
21843 TEST_REQUIRES_ARM_NEON;
21844 for (uint32_t n = 1; n <= 8; n++) {
21845 GemmMicrokernelTester()
21846 .mr(4)
21847 .nr(8)
21848 .kr(1)
21849 .sr(4)
21850 .m(4)
21851 .n(n)
21852 .k(4)
21853 .iterations(1)
21854 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21855 }
21856 }
21857
21858 TEST(F32_GEMM_4X8S4__NEON, k_lt_4) {
21859 TEST_REQUIRES_ARM_NEON;
21860 for (size_t k = 1; k < 4; k++) {
21861 GemmMicrokernelTester()
21862 .mr(4)
21863 .nr(8)
21864 .kr(1)
21865 .sr(4)
21866 .m(4)
21867 .n(8)
21868 .k(k)
21869 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21870 }
21871 }
21872
21873 TEST(F32_GEMM_4X8S4__NEON, k_lt_4_strided_a) {
21874 TEST_REQUIRES_ARM_NEON;
21875 for (size_t k = 1; k < 4; k++) {
21876 GemmMicrokernelTester()
21877 .mr(4)
21878 .nr(8)
21879 .kr(1)
21880 .sr(4)
21881 .m(4)
21882 .n(8)
21883 .k(k)
21884 .a_stride(7)
21885 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21886 }
21887 }
21888
21889 TEST(F32_GEMM_4X8S4__NEON, k_lt_4_subtile) {
21890 TEST_REQUIRES_ARM_NEON;
21891 for (size_t k = 1; k < 4; k++) {
21892 for (uint32_t m = 1; m <= 4; m++) {
21893 for (uint32_t n = 1; n <= 8; n++) {
21894 GemmMicrokernelTester()
21895 .mr(4)
21896 .nr(8)
21897 .kr(1)
21898 .sr(4)
21899 .m(m)
21900 .n(n)
21901 .k(k)
21902 .iterations(1)
21903 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21904 }
21905 }
21906 }
21907 }
21908
21909 TEST(F32_GEMM_4X8S4__NEON, k_gt_4) {
21910 TEST_REQUIRES_ARM_NEON;
21911 for (size_t k = 5; k < 8; k++) {
21912 GemmMicrokernelTester()
21913 .mr(4)
21914 .nr(8)
21915 .kr(1)
21916 .sr(4)
21917 .m(4)
21918 .n(8)
21919 .k(k)
21920 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21921 }
21922 }
21923
21924 TEST(F32_GEMM_4X8S4__NEON, k_gt_4_strided_a) {
21925 TEST_REQUIRES_ARM_NEON;
21926 for (size_t k = 5; k < 8; k++) {
21927 GemmMicrokernelTester()
21928 .mr(4)
21929 .nr(8)
21930 .kr(1)
21931 .sr(4)
21932 .m(4)
21933 .n(8)
21934 .k(k)
21935 .a_stride(11)
21936 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21937 }
21938 }
21939
21940 TEST(F32_GEMM_4X8S4__NEON, k_gt_4_subtile) {
21941 TEST_REQUIRES_ARM_NEON;
21942 for (size_t k = 5; k < 8; k++) {
21943 for (uint32_t m = 1; m <= 4; m++) {
21944 for (uint32_t n = 1; n <= 8; n++) {
21945 GemmMicrokernelTester()
21946 .mr(4)
21947 .nr(8)
21948 .kr(1)
21949 .sr(4)
21950 .m(m)
21951 .n(n)
21952 .k(k)
21953 .iterations(1)
21954 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21955 }
21956 }
21957 }
21958 }
21959
21960 TEST(F32_GEMM_4X8S4__NEON, k_div_4) {
21961 TEST_REQUIRES_ARM_NEON;
21962 for (size_t k = 8; k <= 40; k += 4) {
21963 GemmMicrokernelTester()
21964 .mr(4)
21965 .nr(8)
21966 .kr(1)
21967 .sr(4)
21968 .m(4)
21969 .n(8)
21970 .k(k)
21971 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21972 }
21973 }
21974
21975 TEST(F32_GEMM_4X8S4__NEON, k_div_4_strided_a) {
21976 TEST_REQUIRES_ARM_NEON;
21977 for (size_t k = 8; k <= 40; k += 4) {
21978 GemmMicrokernelTester()
21979 .mr(4)
21980 .nr(8)
21981 .kr(1)
21982 .sr(4)
21983 .m(4)
21984 .n(8)
21985 .k(k)
21986 .a_stride(43)
21987 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
21988 }
21989 }
21990
21991 TEST(F32_GEMM_4X8S4__NEON, k_div_4_subtile) {
21992 TEST_REQUIRES_ARM_NEON;
21993 for (size_t k = 8; k <= 40; k += 4) {
21994 for (uint32_t m = 1; m <= 4; m++) {
21995 for (uint32_t n = 1; n <= 8; n++) {
21996 GemmMicrokernelTester()
21997 .mr(4)
21998 .nr(8)
21999 .kr(1)
22000 .sr(4)
22001 .m(m)
22002 .n(n)
22003 .k(k)
22004 .iterations(1)
22005 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22006 }
22007 }
22008 }
22009 }
22010
22011 TEST(F32_GEMM_4X8S4__NEON, n_gt_8) {
22012 TEST_REQUIRES_ARM_NEON;
22013 for (uint32_t n = 9; n < 16; n++) {
22014 for (size_t k = 1; k <= 20; k += 5) {
22015 GemmMicrokernelTester()
22016 .mr(4)
22017 .nr(8)
22018 .kr(1)
22019 .sr(4)
22020 .m(4)
22021 .n(8)
22022 .k(k)
22023 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22024 }
22025 }
22026 }
22027
22028 TEST(F32_GEMM_4X8S4__NEON, n_gt_8_strided_cn) {
22029 TEST_REQUIRES_ARM_NEON;
22030 for (uint32_t n = 9; n < 16; n++) {
22031 for (size_t k = 1; k <= 20; k += 5) {
22032 GemmMicrokernelTester()
22033 .mr(4)
22034 .nr(8)
22035 .kr(1)
22036 .sr(4)
22037 .m(4)
22038 .n(8)
22039 .k(k)
22040 .cn_stride(11)
22041 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22042 }
22043 }
22044 }
22045
22046 TEST(F32_GEMM_4X8S4__NEON, n_gt_8_strided_a) {
22047 TEST_REQUIRES_ARM_NEON;
22048 for (uint32_t n = 9; n < 16; n++) {
22049 for (size_t k = 1; k <= 20; k += 5) {
22050 GemmMicrokernelTester()
22051 .mr(4)
22052 .nr(8)
22053 .kr(1)
22054 .sr(4)
22055 .m(4)
22056 .n(n)
22057 .k(k)
22058 .a_stride(23)
22059 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22060 }
22061 }
22062 }
22063
22064 TEST(F32_GEMM_4X8S4__NEON, n_gt_8_subtile) {
22065 TEST_REQUIRES_ARM_NEON;
22066 for (uint32_t n = 9; n < 16; n++) {
22067 for (size_t k = 1; k <= 20; k += 5) {
22068 for (uint32_t m = 1; m <= 4; m++) {
22069 GemmMicrokernelTester()
22070 .mr(4)
22071 .nr(8)
22072 .kr(1)
22073 .sr(4)
22074 .m(m)
22075 .n(n)
22076 .k(k)
22077 .iterations(1)
22078 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22079 }
22080 }
22081 }
22082 }
22083
22084 TEST(F32_GEMM_4X8S4__NEON, n_div_8) {
22085 TEST_REQUIRES_ARM_NEON;
22086 for (uint32_t n = 16; n <= 24; n += 8) {
22087 for (size_t k = 1; k <= 20; k += 5) {
22088 GemmMicrokernelTester()
22089 .mr(4)
22090 .nr(8)
22091 .kr(1)
22092 .sr(4)
22093 .m(4)
22094 .n(8)
22095 .k(k)
22096 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22097 }
22098 }
22099 }
22100
22101 TEST(F32_GEMM_4X8S4__NEON, n_div_8_strided_cn) {
22102 TEST_REQUIRES_ARM_NEON;
22103 for (uint32_t n = 16; n <= 24; n += 8) {
22104 for (size_t k = 1; k <= 20; k += 5) {
22105 GemmMicrokernelTester()
22106 .mr(4)
22107 .nr(8)
22108 .kr(1)
22109 .sr(4)
22110 .m(4)
22111 .n(n)
22112 .k(k)
22113 .cn_stride(11)
22114 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22115 }
22116 }
22117 }
22118
22119 TEST(F32_GEMM_4X8S4__NEON, n_div_8_strided_a) {
22120 TEST_REQUIRES_ARM_NEON;
22121 for (uint32_t n = 16; n <= 24; n += 8) {
22122 for (size_t k = 1; k <= 20; k += 5) {
22123 GemmMicrokernelTester()
22124 .mr(4)
22125 .nr(8)
22126 .kr(1)
22127 .sr(4)
22128 .m(4)
22129 .n(n)
22130 .k(k)
22131 .a_stride(23)
22132 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22133 }
22134 }
22135 }
22136
22137 TEST(F32_GEMM_4X8S4__NEON, n_div_8_subtile) {
22138 TEST_REQUIRES_ARM_NEON;
22139 for (uint32_t n = 16; n <= 24; n += 8) {
22140 for (size_t k = 1; k <= 20; k += 5) {
22141 for (uint32_t m = 1; m <= 4; m++) {
22142 GemmMicrokernelTester()
22143 .mr(4)
22144 .nr(8)
22145 .kr(1)
22146 .sr(4)
22147 .m(m)
22148 .n(n)
22149 .k(k)
22150 .iterations(1)
22151 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22152 }
22153 }
22154 }
22155 }
22156
22157 TEST(F32_GEMM_4X8S4__NEON, strided_cm_subtile) {
22158 TEST_REQUIRES_ARM_NEON;
22159 for (size_t k = 1; k <= 20; k += 5) {
22160 for (uint32_t m = 1; m <= 4; m++) {
22161 for (uint32_t n = 1; n <= 8; n++) {
22162 GemmMicrokernelTester()
22163 .mr(4)
22164 .nr(8)
22165 .kr(1)
22166 .sr(4)
22167 .m(m)
22168 .n(n)
22169 .k(k)
22170 .cm_stride(11)
22171 .iterations(1)
22172 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22173 }
22174 }
22175 }
22176 }
22177
22178 TEST(F32_GEMM_4X8S4__NEON, qmin) {
22179 TEST_REQUIRES_ARM_NEON;
22180 GemmMicrokernelTester()
22181 .mr(4)
22182 .nr(8)
22183 .kr(1)
22184 .sr(4)
22185 .m(4)
22186 .n(8)
22187 .k(4)
22188 .qmin(128)
22189 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22190 }
22191
22192 TEST(F32_GEMM_4X8S4__NEON, qmax) {
22193 TEST_REQUIRES_ARM_NEON;
22194 GemmMicrokernelTester()
22195 .mr(4)
22196 .nr(8)
22197 .kr(1)
22198 .sr(4)
22199 .m(4)
22200 .n(8)
22201 .k(4)
22202 .qmax(128)
22203 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22204 }
22205
22206 TEST(F32_GEMM_4X8S4__NEON, strided_cm) {
22207 TEST_REQUIRES_ARM_NEON;
22208 GemmMicrokernelTester()
22209 .mr(4)
22210 .nr(8)
22211 .kr(1)
22212 .sr(4)
22213 .m(4)
22214 .n(8)
22215 .k(4)
22216 .cm_stride(11)
22217 .Test(xnn_f32_gemm_ukernel_4x8s4__neon);
22218 }
22219#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22220
22221
22222#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22223 TEST(F32_GEMM_6X8S4__NEON, k_eq_4) {
22224 TEST_REQUIRES_ARM_NEON;
22225 GemmMicrokernelTester()
22226 .mr(6)
22227 .nr(8)
22228 .kr(1)
22229 .sr(4)
22230 .m(6)
22231 .n(8)
22232 .k(4)
22233 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22234 }
22235
22236 TEST(F32_GEMM_6X8S4__NEON, strided_cn) {
22237 TEST_REQUIRES_ARM_NEON;
22238 GemmMicrokernelTester()
22239 .mr(6)
22240 .nr(8)
22241 .kr(1)
22242 .sr(4)
22243 .m(6)
22244 .n(8)
22245 .k(4)
22246 .cn_stride(11)
22247 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22248 }
22249
22250 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_strided_a) {
22251 TEST_REQUIRES_ARM_NEON;
22252 GemmMicrokernelTester()
22253 .mr(6)
22254 .nr(8)
22255 .kr(1)
22256 .sr(4)
22257 .m(6)
22258 .n(8)
22259 .k(4)
22260 .a_stride(7)
22261 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22262 }
22263
22264 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_subtile) {
22265 TEST_REQUIRES_ARM_NEON;
22266 for (uint32_t m = 1; m <= 6; m++) {
22267 for (uint32_t n = 1; n <= 8; n++) {
22268 GemmMicrokernelTester()
22269 .mr(6)
22270 .nr(8)
22271 .kr(1)
22272 .sr(4)
22273 .m(m)
22274 .n(n)
22275 .k(4)
22276 .iterations(1)
22277 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22278 }
22279 }
22280 }
22281
22282 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_subtile_m) {
22283 TEST_REQUIRES_ARM_NEON;
22284 for (uint32_t m = 1; m <= 6; m++) {
22285 GemmMicrokernelTester()
22286 .mr(6)
22287 .nr(8)
22288 .kr(1)
22289 .sr(4)
22290 .m(m)
22291 .n(8)
22292 .k(4)
22293 .iterations(1)
22294 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22295 }
22296 }
22297
22298 TEST(F32_GEMM_6X8S4__NEON, k_eq_4_subtile_n) {
22299 TEST_REQUIRES_ARM_NEON;
22300 for (uint32_t n = 1; n <= 8; n++) {
22301 GemmMicrokernelTester()
22302 .mr(6)
22303 .nr(8)
22304 .kr(1)
22305 .sr(4)
22306 .m(6)
22307 .n(n)
22308 .k(4)
22309 .iterations(1)
22310 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22311 }
22312 }
22313
22314 TEST(F32_GEMM_6X8S4__NEON, k_lt_4) {
22315 TEST_REQUIRES_ARM_NEON;
22316 for (size_t k = 1; k < 4; k++) {
22317 GemmMicrokernelTester()
22318 .mr(6)
22319 .nr(8)
22320 .kr(1)
22321 .sr(4)
22322 .m(6)
22323 .n(8)
22324 .k(k)
22325 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22326 }
22327 }
22328
22329 TEST(F32_GEMM_6X8S4__NEON, k_lt_4_strided_a) {
22330 TEST_REQUIRES_ARM_NEON;
22331 for (size_t k = 1; k < 4; k++) {
22332 GemmMicrokernelTester()
22333 .mr(6)
22334 .nr(8)
22335 .kr(1)
22336 .sr(4)
22337 .m(6)
22338 .n(8)
22339 .k(k)
22340 .a_stride(7)
22341 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22342 }
22343 }
22344
22345 TEST(F32_GEMM_6X8S4__NEON, k_lt_4_subtile) {
22346 TEST_REQUIRES_ARM_NEON;
22347 for (size_t k = 1; k < 4; k++) {
22348 for (uint32_t m = 1; m <= 6; m++) {
22349 for (uint32_t n = 1; n <= 8; n++) {
22350 GemmMicrokernelTester()
22351 .mr(6)
22352 .nr(8)
22353 .kr(1)
22354 .sr(4)
22355 .m(m)
22356 .n(n)
22357 .k(k)
22358 .iterations(1)
22359 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22360 }
22361 }
22362 }
22363 }
22364
22365 TEST(F32_GEMM_6X8S4__NEON, k_gt_4) {
22366 TEST_REQUIRES_ARM_NEON;
22367 for (size_t k = 5; k < 8; k++) {
22368 GemmMicrokernelTester()
22369 .mr(6)
22370 .nr(8)
22371 .kr(1)
22372 .sr(4)
22373 .m(6)
22374 .n(8)
22375 .k(k)
22376 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22377 }
22378 }
22379
22380 TEST(F32_GEMM_6X8S4__NEON, k_gt_4_strided_a) {
22381 TEST_REQUIRES_ARM_NEON;
22382 for (size_t k = 5; k < 8; k++) {
22383 GemmMicrokernelTester()
22384 .mr(6)
22385 .nr(8)
22386 .kr(1)
22387 .sr(4)
22388 .m(6)
22389 .n(8)
22390 .k(k)
22391 .a_stride(11)
22392 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22393 }
22394 }
22395
22396 TEST(F32_GEMM_6X8S4__NEON, k_gt_4_subtile) {
22397 TEST_REQUIRES_ARM_NEON;
22398 for (size_t k = 5; k < 8; k++) {
22399 for (uint32_t m = 1; m <= 6; m++) {
22400 for (uint32_t n = 1; n <= 8; n++) {
22401 GemmMicrokernelTester()
22402 .mr(6)
22403 .nr(8)
22404 .kr(1)
22405 .sr(4)
22406 .m(m)
22407 .n(n)
22408 .k(k)
22409 .iterations(1)
22410 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22411 }
22412 }
22413 }
22414 }
22415
22416 TEST(F32_GEMM_6X8S4__NEON, k_div_4) {
22417 TEST_REQUIRES_ARM_NEON;
22418 for (size_t k = 8; k <= 40; k += 4) {
22419 GemmMicrokernelTester()
22420 .mr(6)
22421 .nr(8)
22422 .kr(1)
22423 .sr(4)
22424 .m(6)
22425 .n(8)
22426 .k(k)
22427 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22428 }
22429 }
22430
22431 TEST(F32_GEMM_6X8S4__NEON, k_div_4_strided_a) {
22432 TEST_REQUIRES_ARM_NEON;
22433 for (size_t k = 8; k <= 40; k += 4) {
22434 GemmMicrokernelTester()
22435 .mr(6)
22436 .nr(8)
22437 .kr(1)
22438 .sr(4)
22439 .m(6)
22440 .n(8)
22441 .k(k)
22442 .a_stride(43)
22443 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22444 }
22445 }
22446
22447 TEST(F32_GEMM_6X8S4__NEON, k_div_4_subtile) {
22448 TEST_REQUIRES_ARM_NEON;
22449 for (size_t k = 8; k <= 40; k += 4) {
22450 for (uint32_t m = 1; m <= 6; m++) {
22451 for (uint32_t n = 1; n <= 8; n++) {
22452 GemmMicrokernelTester()
22453 .mr(6)
22454 .nr(8)
22455 .kr(1)
22456 .sr(4)
22457 .m(m)
22458 .n(n)
22459 .k(k)
22460 .iterations(1)
22461 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22462 }
22463 }
22464 }
22465 }
22466
22467 TEST(F32_GEMM_6X8S4__NEON, n_gt_8) {
22468 TEST_REQUIRES_ARM_NEON;
22469 for (uint32_t n = 9; n < 16; n++) {
22470 for (size_t k = 1; k <= 20; k += 5) {
22471 GemmMicrokernelTester()
22472 .mr(6)
22473 .nr(8)
22474 .kr(1)
22475 .sr(4)
22476 .m(6)
22477 .n(8)
22478 .k(k)
22479 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22480 }
22481 }
22482 }
22483
22484 TEST(F32_GEMM_6X8S4__NEON, n_gt_8_strided_cn) {
22485 TEST_REQUIRES_ARM_NEON;
22486 for (uint32_t n = 9; n < 16; n++) {
22487 for (size_t k = 1; k <= 20; k += 5) {
22488 GemmMicrokernelTester()
22489 .mr(6)
22490 .nr(8)
22491 .kr(1)
22492 .sr(4)
22493 .m(6)
22494 .n(8)
22495 .k(k)
22496 .cn_stride(11)
22497 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22498 }
22499 }
22500 }
22501
22502 TEST(F32_GEMM_6X8S4__NEON, n_gt_8_strided_a) {
22503 TEST_REQUIRES_ARM_NEON;
22504 for (uint32_t n = 9; n < 16; n++) {
22505 for (size_t k = 1; k <= 20; k += 5) {
22506 GemmMicrokernelTester()
22507 .mr(6)
22508 .nr(8)
22509 .kr(1)
22510 .sr(4)
22511 .m(6)
22512 .n(n)
22513 .k(k)
22514 .a_stride(23)
22515 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22516 }
22517 }
22518 }
22519
22520 TEST(F32_GEMM_6X8S4__NEON, n_gt_8_subtile) {
22521 TEST_REQUIRES_ARM_NEON;
22522 for (uint32_t n = 9; n < 16; n++) {
22523 for (size_t k = 1; k <= 20; k += 5) {
22524 for (uint32_t m = 1; m <= 6; m++) {
22525 GemmMicrokernelTester()
22526 .mr(6)
22527 .nr(8)
22528 .kr(1)
22529 .sr(4)
22530 .m(m)
22531 .n(n)
22532 .k(k)
22533 .iterations(1)
22534 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22535 }
22536 }
22537 }
22538 }
22539
22540 TEST(F32_GEMM_6X8S4__NEON, n_div_8) {
22541 TEST_REQUIRES_ARM_NEON;
22542 for (uint32_t n = 16; n <= 24; n += 8) {
22543 for (size_t k = 1; k <= 20; k += 5) {
22544 GemmMicrokernelTester()
22545 .mr(6)
22546 .nr(8)
22547 .kr(1)
22548 .sr(4)
22549 .m(6)
22550 .n(8)
22551 .k(k)
22552 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22553 }
22554 }
22555 }
22556
22557 TEST(F32_GEMM_6X8S4__NEON, n_div_8_strided_cn) {
22558 TEST_REQUIRES_ARM_NEON;
22559 for (uint32_t n = 16; n <= 24; n += 8) {
22560 for (size_t k = 1; k <= 20; k += 5) {
22561 GemmMicrokernelTester()
22562 .mr(6)
22563 .nr(8)
22564 .kr(1)
22565 .sr(4)
22566 .m(6)
22567 .n(n)
22568 .k(k)
22569 .cn_stride(11)
22570 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22571 }
22572 }
22573 }
22574
22575 TEST(F32_GEMM_6X8S4__NEON, n_div_8_strided_a) {
22576 TEST_REQUIRES_ARM_NEON;
22577 for (uint32_t n = 16; n <= 24; n += 8) {
22578 for (size_t k = 1; k <= 20; k += 5) {
22579 GemmMicrokernelTester()
22580 .mr(6)
22581 .nr(8)
22582 .kr(1)
22583 .sr(4)
22584 .m(6)
22585 .n(n)
22586 .k(k)
22587 .a_stride(23)
22588 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22589 }
22590 }
22591 }
22592
22593 TEST(F32_GEMM_6X8S4__NEON, n_div_8_subtile) {
22594 TEST_REQUIRES_ARM_NEON;
22595 for (uint32_t n = 16; n <= 24; n += 8) {
22596 for (size_t k = 1; k <= 20; k += 5) {
22597 for (uint32_t m = 1; m <= 6; m++) {
22598 GemmMicrokernelTester()
22599 .mr(6)
22600 .nr(8)
22601 .kr(1)
22602 .sr(4)
22603 .m(m)
22604 .n(n)
22605 .k(k)
22606 .iterations(1)
22607 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22608 }
22609 }
22610 }
22611 }
22612
22613 TEST(F32_GEMM_6X8S4__NEON, strided_cm_subtile) {
22614 TEST_REQUIRES_ARM_NEON;
22615 for (size_t k = 1; k <= 20; k += 5) {
22616 for (uint32_t m = 1; m <= 6; m++) {
22617 for (uint32_t n = 1; n <= 8; n++) {
22618 GemmMicrokernelTester()
22619 .mr(6)
22620 .nr(8)
22621 .kr(1)
22622 .sr(4)
22623 .m(m)
22624 .n(n)
22625 .k(k)
22626 .cm_stride(11)
22627 .iterations(1)
22628 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22629 }
22630 }
22631 }
22632 }
22633
22634 TEST(F32_GEMM_6X8S4__NEON, qmin) {
22635 TEST_REQUIRES_ARM_NEON;
22636 GemmMicrokernelTester()
22637 .mr(6)
22638 .nr(8)
22639 .kr(1)
22640 .sr(4)
22641 .m(6)
22642 .n(8)
22643 .k(4)
22644 .qmin(128)
22645 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22646 }
22647
22648 TEST(F32_GEMM_6X8S4__NEON, qmax) {
22649 TEST_REQUIRES_ARM_NEON;
22650 GemmMicrokernelTester()
22651 .mr(6)
22652 .nr(8)
22653 .kr(1)
22654 .sr(4)
22655 .m(6)
22656 .n(8)
22657 .k(4)
22658 .qmax(128)
22659 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22660 }
22661
22662 TEST(F32_GEMM_6X8S4__NEON, strided_cm) {
22663 TEST_REQUIRES_ARM_NEON;
22664 GemmMicrokernelTester()
22665 .mr(6)
22666 .nr(8)
22667 .kr(1)
22668 .sr(4)
22669 .m(6)
22670 .n(8)
22671 .k(4)
22672 .cm_stride(11)
22673 .Test(xnn_f32_gemm_ukernel_6x8s4__neon);
22674 }
22675#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
22676
22677
22678#if XNN_ARCH_ARM || XNN_ARCH_ARM64
22679 TEST(F32_GEMM_8X8S4__NEON, k_eq_4) {
22680 TEST_REQUIRES_ARM_NEON;
22681 GemmMicrokernelTester()
22682 .mr(8)
22683 .nr(8)
22684 .kr(1)
22685 .sr(4)
22686 .m(8)
22687 .n(8)
22688 .k(4)
22689 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22690 }
22691
22692 TEST(F32_GEMM_8X8S4__NEON, strided_cn) {
22693 TEST_REQUIRES_ARM_NEON;
22694 GemmMicrokernelTester()
22695 .mr(8)
22696 .nr(8)
22697 .kr(1)
22698 .sr(4)
22699 .m(8)
22700 .n(8)
22701 .k(4)
22702 .cn_stride(11)
22703 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22704 }
22705
22706 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_strided_a) {
22707 TEST_REQUIRES_ARM_NEON;
22708 GemmMicrokernelTester()
22709 .mr(8)
22710 .nr(8)
22711 .kr(1)
22712 .sr(4)
22713 .m(8)
22714 .n(8)
22715 .k(4)
22716 .a_stride(7)
22717 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22718 }
22719
22720 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_subtile) {
22721 TEST_REQUIRES_ARM_NEON;
22722 for (uint32_t m = 1; m <= 8; m++) {
22723 for (uint32_t n = 1; n <= 8; n++) {
22724 GemmMicrokernelTester()
22725 .mr(8)
22726 .nr(8)
22727 .kr(1)
22728 .sr(4)
22729 .m(m)
22730 .n(n)
22731 .k(4)
22732 .iterations(1)
22733 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22734 }
22735 }
22736 }
22737
22738 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_subtile_m) {
22739 TEST_REQUIRES_ARM_NEON;
22740 for (uint32_t m = 1; m <= 8; m++) {
22741 GemmMicrokernelTester()
22742 .mr(8)
22743 .nr(8)
22744 .kr(1)
22745 .sr(4)
22746 .m(m)
22747 .n(8)
22748 .k(4)
22749 .iterations(1)
22750 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22751 }
22752 }
22753
22754 TEST(F32_GEMM_8X8S4__NEON, k_eq_4_subtile_n) {
22755 TEST_REQUIRES_ARM_NEON;
22756 for (uint32_t n = 1; n <= 8; n++) {
22757 GemmMicrokernelTester()
22758 .mr(8)
22759 .nr(8)
22760 .kr(1)
22761 .sr(4)
22762 .m(8)
22763 .n(n)
22764 .k(4)
22765 .iterations(1)
22766 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22767 }
22768 }
22769
22770 TEST(F32_GEMM_8X8S4__NEON, k_lt_4) {
22771 TEST_REQUIRES_ARM_NEON;
22772 for (size_t k = 1; k < 4; k++) {
22773 GemmMicrokernelTester()
22774 .mr(8)
22775 .nr(8)
22776 .kr(1)
22777 .sr(4)
22778 .m(8)
22779 .n(8)
22780 .k(k)
22781 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22782 }
22783 }
22784
22785 TEST(F32_GEMM_8X8S4__NEON, k_lt_4_strided_a) {
22786 TEST_REQUIRES_ARM_NEON;
22787 for (size_t k = 1; k < 4; k++) {
22788 GemmMicrokernelTester()
22789 .mr(8)
22790 .nr(8)
22791 .kr(1)
22792 .sr(4)
22793 .m(8)
22794 .n(8)
22795 .k(k)
22796 .a_stride(7)
22797 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22798 }
22799 }
22800
22801 TEST(F32_GEMM_8X8S4__NEON, k_lt_4_subtile) {
22802 TEST_REQUIRES_ARM_NEON;
22803 for (size_t k = 1; k < 4; k++) {
22804 for (uint32_t m = 1; m <= 8; m++) {
22805 for (uint32_t n = 1; n <= 8; n++) {
22806 GemmMicrokernelTester()
22807 .mr(8)
22808 .nr(8)
22809 .kr(1)
22810 .sr(4)
22811 .m(m)
22812 .n(n)
22813 .k(k)
22814 .iterations(1)
22815 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22816 }
22817 }
22818 }
22819 }
22820
22821 TEST(F32_GEMM_8X8S4__NEON, k_gt_4) {
22822 TEST_REQUIRES_ARM_NEON;
22823 for (size_t k = 5; k < 8; k++) {
22824 GemmMicrokernelTester()
22825 .mr(8)
22826 .nr(8)
22827 .kr(1)
22828 .sr(4)
22829 .m(8)
22830 .n(8)
22831 .k(k)
22832 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22833 }
22834 }
22835
22836 TEST(F32_GEMM_8X8S4__NEON, k_gt_4_strided_a) {
22837 TEST_REQUIRES_ARM_NEON;
22838 for (size_t k = 5; k < 8; k++) {
22839 GemmMicrokernelTester()
22840 .mr(8)
22841 .nr(8)
22842 .kr(1)
22843 .sr(4)
22844 .m(8)
22845 .n(8)
22846 .k(k)
22847 .a_stride(11)
22848 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22849 }
22850 }
22851
22852 TEST(F32_GEMM_8X8S4__NEON, k_gt_4_subtile) {
22853 TEST_REQUIRES_ARM_NEON;
22854 for (size_t k = 5; k < 8; k++) {
22855 for (uint32_t m = 1; m <= 8; m++) {
22856 for (uint32_t n = 1; n <= 8; n++) {
22857 GemmMicrokernelTester()
22858 .mr(8)
22859 .nr(8)
22860 .kr(1)
22861 .sr(4)
22862 .m(m)
22863 .n(n)
22864 .k(k)
22865 .iterations(1)
22866 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22867 }
22868 }
22869 }
22870 }
22871
22872 TEST(F32_GEMM_8X8S4__NEON, k_div_4) {
22873 TEST_REQUIRES_ARM_NEON;
22874 for (size_t k = 8; k <= 40; k += 4) {
22875 GemmMicrokernelTester()
22876 .mr(8)
22877 .nr(8)
22878 .kr(1)
22879 .sr(4)
22880 .m(8)
22881 .n(8)
22882 .k(k)
22883 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22884 }
22885 }
22886
22887 TEST(F32_GEMM_8X8S4__NEON, k_div_4_strided_a) {
22888 TEST_REQUIRES_ARM_NEON;
22889 for (size_t k = 8; k <= 40; k += 4) {
22890 GemmMicrokernelTester()
22891 .mr(8)
22892 .nr(8)
22893 .kr(1)
22894 .sr(4)
22895 .m(8)
22896 .n(8)
22897 .k(k)
22898 .a_stride(43)
22899 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22900 }
22901 }
22902
22903 TEST(F32_GEMM_8X8S4__NEON, k_div_4_subtile) {
22904 TEST_REQUIRES_ARM_NEON;
22905 for (size_t k = 8; k <= 40; k += 4) {
22906 for (uint32_t m = 1; m <= 8; m++) {
22907 for (uint32_t n = 1; n <= 8; n++) {
22908 GemmMicrokernelTester()
22909 .mr(8)
22910 .nr(8)
22911 .kr(1)
22912 .sr(4)
22913 .m(m)
22914 .n(n)
22915 .k(k)
22916 .iterations(1)
22917 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22918 }
22919 }
22920 }
22921 }
22922
22923 TEST(F32_GEMM_8X8S4__NEON, n_gt_8) {
22924 TEST_REQUIRES_ARM_NEON;
22925 for (uint32_t n = 9; n < 16; n++) {
22926 for (size_t k = 1; k <= 20; k += 5) {
22927 GemmMicrokernelTester()
22928 .mr(8)
22929 .nr(8)
22930 .kr(1)
22931 .sr(4)
22932 .m(8)
22933 .n(8)
22934 .k(k)
22935 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22936 }
22937 }
22938 }
22939
22940 TEST(F32_GEMM_8X8S4__NEON, n_gt_8_strided_cn) {
22941 TEST_REQUIRES_ARM_NEON;
22942 for (uint32_t n = 9; n < 16; n++) {
22943 for (size_t k = 1; k <= 20; k += 5) {
22944 GemmMicrokernelTester()
22945 .mr(8)
22946 .nr(8)
22947 .kr(1)
22948 .sr(4)
22949 .m(8)
22950 .n(8)
22951 .k(k)
22952 .cn_stride(11)
22953 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22954 }
22955 }
22956 }
22957
22958 TEST(F32_GEMM_8X8S4__NEON, n_gt_8_strided_a) {
22959 TEST_REQUIRES_ARM_NEON;
22960 for (uint32_t n = 9; n < 16; n++) {
22961 for (size_t k = 1; k <= 20; k += 5) {
22962 GemmMicrokernelTester()
22963 .mr(8)
22964 .nr(8)
22965 .kr(1)
22966 .sr(4)
22967 .m(8)
22968 .n(n)
22969 .k(k)
22970 .a_stride(23)
22971 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22972 }
22973 }
22974 }
22975
22976 TEST(F32_GEMM_8X8S4__NEON, n_gt_8_subtile) {
22977 TEST_REQUIRES_ARM_NEON;
22978 for (uint32_t n = 9; n < 16; n++) {
22979 for (size_t k = 1; k <= 20; k += 5) {
22980 for (uint32_t m = 1; m <= 8; m++) {
22981 GemmMicrokernelTester()
22982 .mr(8)
22983 .nr(8)
22984 .kr(1)
22985 .sr(4)
22986 .m(m)
22987 .n(n)
22988 .k(k)
22989 .iterations(1)
22990 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
22991 }
22992 }
22993 }
22994 }
22995
22996 TEST(F32_GEMM_8X8S4__NEON, n_div_8) {
22997 TEST_REQUIRES_ARM_NEON;
22998 for (uint32_t n = 16; n <= 24; n += 8) {
22999 for (size_t k = 1; k <= 20; k += 5) {
23000 GemmMicrokernelTester()
23001 .mr(8)
23002 .nr(8)
23003 .kr(1)
23004 .sr(4)
23005 .m(8)
23006 .n(8)
23007 .k(k)
23008 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23009 }
23010 }
23011 }
23012
23013 TEST(F32_GEMM_8X8S4__NEON, n_div_8_strided_cn) {
23014 TEST_REQUIRES_ARM_NEON;
23015 for (uint32_t n = 16; n <= 24; n += 8) {
23016 for (size_t k = 1; k <= 20; k += 5) {
23017 GemmMicrokernelTester()
23018 .mr(8)
23019 .nr(8)
23020 .kr(1)
23021 .sr(4)
23022 .m(8)
23023 .n(n)
23024 .k(k)
23025 .cn_stride(11)
23026 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23027 }
23028 }
23029 }
23030
23031 TEST(F32_GEMM_8X8S4__NEON, n_div_8_strided_a) {
23032 TEST_REQUIRES_ARM_NEON;
23033 for (uint32_t n = 16; n <= 24; n += 8) {
23034 for (size_t k = 1; k <= 20; k += 5) {
23035 GemmMicrokernelTester()
23036 .mr(8)
23037 .nr(8)
23038 .kr(1)
23039 .sr(4)
23040 .m(8)
23041 .n(n)
23042 .k(k)
23043 .a_stride(23)
23044 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23045 }
23046 }
23047 }
23048
23049 TEST(F32_GEMM_8X8S4__NEON, n_div_8_subtile) {
23050 TEST_REQUIRES_ARM_NEON;
23051 for (uint32_t n = 16; n <= 24; n += 8) {
23052 for (size_t k = 1; k <= 20; k += 5) {
23053 for (uint32_t m = 1; m <= 8; m++) {
23054 GemmMicrokernelTester()
23055 .mr(8)
23056 .nr(8)
23057 .kr(1)
23058 .sr(4)
23059 .m(m)
23060 .n(n)
23061 .k(k)
23062 .iterations(1)
23063 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23064 }
23065 }
23066 }
23067 }
23068
23069 TEST(F32_GEMM_8X8S4__NEON, strided_cm_subtile) {
23070 TEST_REQUIRES_ARM_NEON;
23071 for (size_t k = 1; k <= 20; k += 5) {
23072 for (uint32_t m = 1; m <= 8; m++) {
23073 for (uint32_t n = 1; n <= 8; n++) {
23074 GemmMicrokernelTester()
23075 .mr(8)
23076 .nr(8)
23077 .kr(1)
23078 .sr(4)
23079 .m(m)
23080 .n(n)
23081 .k(k)
23082 .cm_stride(11)
23083 .iterations(1)
23084 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23085 }
23086 }
23087 }
23088 }
23089
23090 TEST(F32_GEMM_8X8S4__NEON, qmin) {
23091 TEST_REQUIRES_ARM_NEON;
23092 GemmMicrokernelTester()
23093 .mr(8)
23094 .nr(8)
23095 .kr(1)
23096 .sr(4)
23097 .m(8)
23098 .n(8)
23099 .k(4)
23100 .qmin(128)
23101 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23102 }
23103
23104 TEST(F32_GEMM_8X8S4__NEON, qmax) {
23105 TEST_REQUIRES_ARM_NEON;
23106 GemmMicrokernelTester()
23107 .mr(8)
23108 .nr(8)
23109 .kr(1)
23110 .sr(4)
23111 .m(8)
23112 .n(8)
23113 .k(4)
23114 .qmax(128)
23115 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23116 }
23117
23118 TEST(F32_GEMM_8X8S4__NEON, strided_cm) {
23119 TEST_REQUIRES_ARM_NEON;
23120 GemmMicrokernelTester()
23121 .mr(8)
23122 .nr(8)
23123 .kr(1)
23124 .sr(4)
23125 .m(8)
23126 .n(8)
23127 .k(4)
23128 .cm_stride(11)
23129 .Test(xnn_f32_gemm_ukernel_8x8s4__neon);
23130 }
23131#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23132
23133
23134#if XNN_ARCH_ARM || XNN_ARCH_ARM64
Frank Barcharddf06d802019-11-20 15:53:46 -080023135 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4) {
23136 TEST_REQUIRES_ARM_NEON_FMA;
23137 GemmMicrokernelTester()
23138 .mr(1)
23139 .nr(8)
23140 .kr(1)
23141 .sr(4)
23142 .m(1)
23143 .n(8)
23144 .k(4)
23145 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23146 }
23147
23148 TEST(F32_GEMM_1X8S4__NEONFMA, strided_cn) {
23149 TEST_REQUIRES_ARM_NEON_FMA;
23150 GemmMicrokernelTester()
23151 .mr(1)
23152 .nr(8)
23153 .kr(1)
23154 .sr(4)
23155 .m(1)
23156 .n(8)
23157 .k(4)
23158 .cn_stride(11)
23159 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23160 }
23161
23162 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_strided_a) {
23163 TEST_REQUIRES_ARM_NEON_FMA;
23164 GemmMicrokernelTester()
23165 .mr(1)
23166 .nr(8)
23167 .kr(1)
23168 .sr(4)
23169 .m(1)
23170 .n(8)
23171 .k(4)
23172 .a_stride(7)
23173 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23174 }
23175
23176 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_subtile) {
23177 TEST_REQUIRES_ARM_NEON_FMA;
23178 for (uint32_t m = 1; m <= 1; m++) {
23179 for (uint32_t n = 1; n <= 8; n++) {
23180 GemmMicrokernelTester()
23181 .mr(1)
23182 .nr(8)
23183 .kr(1)
23184 .sr(4)
23185 .m(m)
23186 .n(n)
23187 .k(4)
23188 .iterations(1)
23189 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23190 }
23191 }
23192 }
23193
23194 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_subtile_m) {
23195 TEST_REQUIRES_ARM_NEON_FMA;
23196 for (uint32_t m = 1; m <= 1; m++) {
23197 GemmMicrokernelTester()
23198 .mr(1)
23199 .nr(8)
23200 .kr(1)
23201 .sr(4)
23202 .m(m)
23203 .n(8)
23204 .k(4)
23205 .iterations(1)
23206 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23207 }
23208 }
23209
23210 TEST(F32_GEMM_1X8S4__NEONFMA, k_eq_4_subtile_n) {
23211 TEST_REQUIRES_ARM_NEON_FMA;
23212 for (uint32_t n = 1; n <= 8; n++) {
23213 GemmMicrokernelTester()
23214 .mr(1)
23215 .nr(8)
23216 .kr(1)
23217 .sr(4)
23218 .m(1)
23219 .n(n)
23220 .k(4)
23221 .iterations(1)
23222 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23223 }
23224 }
23225
23226 TEST(F32_GEMM_1X8S4__NEONFMA, k_lt_4) {
23227 TEST_REQUIRES_ARM_NEON_FMA;
23228 for (size_t k = 1; k < 4; k++) {
23229 GemmMicrokernelTester()
23230 .mr(1)
23231 .nr(8)
23232 .kr(1)
23233 .sr(4)
23234 .m(1)
23235 .n(8)
23236 .k(k)
23237 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23238 }
23239 }
23240
23241 TEST(F32_GEMM_1X8S4__NEONFMA, k_lt_4_strided_a) {
23242 TEST_REQUIRES_ARM_NEON_FMA;
23243 for (size_t k = 1; k < 4; k++) {
23244 GemmMicrokernelTester()
23245 .mr(1)
23246 .nr(8)
23247 .kr(1)
23248 .sr(4)
23249 .m(1)
23250 .n(8)
23251 .k(k)
23252 .a_stride(7)
23253 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23254 }
23255 }
23256
23257 TEST(F32_GEMM_1X8S4__NEONFMA, k_lt_4_subtile) {
23258 TEST_REQUIRES_ARM_NEON_FMA;
23259 for (size_t k = 1; k < 4; k++) {
23260 for (uint32_t m = 1; m <= 1; m++) {
23261 for (uint32_t n = 1; n <= 8; n++) {
23262 GemmMicrokernelTester()
23263 .mr(1)
23264 .nr(8)
23265 .kr(1)
23266 .sr(4)
23267 .m(m)
23268 .n(n)
23269 .k(k)
23270 .iterations(1)
23271 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23272 }
23273 }
23274 }
23275 }
23276
23277 TEST(F32_GEMM_1X8S4__NEONFMA, k_gt_4) {
23278 TEST_REQUIRES_ARM_NEON_FMA;
23279 for (size_t k = 5; k < 8; k++) {
23280 GemmMicrokernelTester()
23281 .mr(1)
23282 .nr(8)
23283 .kr(1)
23284 .sr(4)
23285 .m(1)
23286 .n(8)
23287 .k(k)
23288 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23289 }
23290 }
23291
23292 TEST(F32_GEMM_1X8S4__NEONFMA, k_gt_4_strided_a) {
23293 TEST_REQUIRES_ARM_NEON_FMA;
23294 for (size_t k = 5; k < 8; k++) {
23295 GemmMicrokernelTester()
23296 .mr(1)
23297 .nr(8)
23298 .kr(1)
23299 .sr(4)
23300 .m(1)
23301 .n(8)
23302 .k(k)
23303 .a_stride(11)
23304 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23305 }
23306 }
23307
23308 TEST(F32_GEMM_1X8S4__NEONFMA, k_gt_4_subtile) {
23309 TEST_REQUIRES_ARM_NEON_FMA;
23310 for (size_t k = 5; k < 8; k++) {
23311 for (uint32_t m = 1; m <= 1; m++) {
23312 for (uint32_t n = 1; n <= 8; n++) {
23313 GemmMicrokernelTester()
23314 .mr(1)
23315 .nr(8)
23316 .kr(1)
23317 .sr(4)
23318 .m(m)
23319 .n(n)
23320 .k(k)
23321 .iterations(1)
23322 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23323 }
23324 }
23325 }
23326 }
23327
23328 TEST(F32_GEMM_1X8S4__NEONFMA, k_div_4) {
23329 TEST_REQUIRES_ARM_NEON_FMA;
23330 for (size_t k = 8; k <= 40; k += 4) {
23331 GemmMicrokernelTester()
23332 .mr(1)
23333 .nr(8)
23334 .kr(1)
23335 .sr(4)
23336 .m(1)
23337 .n(8)
23338 .k(k)
23339 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23340 }
23341 }
23342
23343 TEST(F32_GEMM_1X8S4__NEONFMA, k_div_4_strided_a) {
23344 TEST_REQUIRES_ARM_NEON_FMA;
23345 for (size_t k = 8; k <= 40; k += 4) {
23346 GemmMicrokernelTester()
23347 .mr(1)
23348 .nr(8)
23349 .kr(1)
23350 .sr(4)
23351 .m(1)
23352 .n(8)
23353 .k(k)
23354 .a_stride(43)
23355 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23356 }
23357 }
23358
23359 TEST(F32_GEMM_1X8S4__NEONFMA, k_div_4_subtile) {
23360 TEST_REQUIRES_ARM_NEON_FMA;
23361 for (size_t k = 8; k <= 40; k += 4) {
23362 for (uint32_t m = 1; m <= 1; m++) {
23363 for (uint32_t n = 1; n <= 8; n++) {
23364 GemmMicrokernelTester()
23365 .mr(1)
23366 .nr(8)
23367 .kr(1)
23368 .sr(4)
23369 .m(m)
23370 .n(n)
23371 .k(k)
23372 .iterations(1)
23373 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23374 }
23375 }
23376 }
23377 }
23378
23379 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8) {
23380 TEST_REQUIRES_ARM_NEON_FMA;
23381 for (uint32_t n = 9; n < 16; n++) {
23382 for (size_t k = 1; k <= 20; k += 5) {
23383 GemmMicrokernelTester()
23384 .mr(1)
23385 .nr(8)
23386 .kr(1)
23387 .sr(4)
23388 .m(1)
23389 .n(8)
23390 .k(k)
23391 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23392 }
23393 }
23394 }
23395
23396 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8_strided_cn) {
23397 TEST_REQUIRES_ARM_NEON_FMA;
23398 for (uint32_t n = 9; n < 16; n++) {
23399 for (size_t k = 1; k <= 20; k += 5) {
23400 GemmMicrokernelTester()
23401 .mr(1)
23402 .nr(8)
23403 .kr(1)
23404 .sr(4)
23405 .m(1)
23406 .n(8)
23407 .k(k)
23408 .cn_stride(11)
23409 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23410 }
23411 }
23412 }
23413
23414 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8_strided_a) {
23415 TEST_REQUIRES_ARM_NEON_FMA;
23416 for (uint32_t n = 9; n < 16; n++) {
23417 for (size_t k = 1; k <= 20; k += 5) {
23418 GemmMicrokernelTester()
23419 .mr(1)
23420 .nr(8)
23421 .kr(1)
23422 .sr(4)
23423 .m(1)
23424 .n(n)
23425 .k(k)
23426 .a_stride(23)
23427 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23428 }
23429 }
23430 }
23431
23432 TEST(F32_GEMM_1X8S4__NEONFMA, n_gt_8_subtile) {
23433 TEST_REQUIRES_ARM_NEON_FMA;
23434 for (uint32_t n = 9; n < 16; n++) {
23435 for (size_t k = 1; k <= 20; k += 5) {
23436 for (uint32_t m = 1; m <= 1; m++) {
23437 GemmMicrokernelTester()
23438 .mr(1)
23439 .nr(8)
23440 .kr(1)
23441 .sr(4)
23442 .m(m)
23443 .n(n)
23444 .k(k)
23445 .iterations(1)
23446 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23447 }
23448 }
23449 }
23450 }
23451
23452 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8) {
23453 TEST_REQUIRES_ARM_NEON_FMA;
23454 for (uint32_t n = 16; n <= 24; n += 8) {
23455 for (size_t k = 1; k <= 20; k += 5) {
23456 GemmMicrokernelTester()
23457 .mr(1)
23458 .nr(8)
23459 .kr(1)
23460 .sr(4)
23461 .m(1)
23462 .n(8)
23463 .k(k)
23464 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23465 }
23466 }
23467 }
23468
23469 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8_strided_cn) {
23470 TEST_REQUIRES_ARM_NEON_FMA;
23471 for (uint32_t n = 16; n <= 24; n += 8) {
23472 for (size_t k = 1; k <= 20; k += 5) {
23473 GemmMicrokernelTester()
23474 .mr(1)
23475 .nr(8)
23476 .kr(1)
23477 .sr(4)
23478 .m(1)
23479 .n(n)
23480 .k(k)
23481 .cn_stride(11)
23482 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23483 }
23484 }
23485 }
23486
23487 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8_strided_a) {
23488 TEST_REQUIRES_ARM_NEON_FMA;
23489 for (uint32_t n = 16; n <= 24; n += 8) {
23490 for (size_t k = 1; k <= 20; k += 5) {
23491 GemmMicrokernelTester()
23492 .mr(1)
23493 .nr(8)
23494 .kr(1)
23495 .sr(4)
23496 .m(1)
23497 .n(n)
23498 .k(k)
23499 .a_stride(23)
23500 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23501 }
23502 }
23503 }
23504
23505 TEST(F32_GEMM_1X8S4__NEONFMA, n_div_8_subtile) {
23506 TEST_REQUIRES_ARM_NEON_FMA;
23507 for (uint32_t n = 16; n <= 24; n += 8) {
23508 for (size_t k = 1; k <= 20; k += 5) {
23509 for (uint32_t m = 1; m <= 1; m++) {
23510 GemmMicrokernelTester()
23511 .mr(1)
23512 .nr(8)
23513 .kr(1)
23514 .sr(4)
23515 .m(m)
23516 .n(n)
23517 .k(k)
23518 .iterations(1)
23519 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23520 }
23521 }
23522 }
23523 }
23524
23525 TEST(F32_GEMM_1X8S4__NEONFMA, strided_cm_subtile) {
23526 TEST_REQUIRES_ARM_NEON_FMA;
23527 for (size_t k = 1; k <= 20; k += 5) {
23528 for (uint32_t m = 1; m <= 1; m++) {
23529 for (uint32_t n = 1; n <= 8; n++) {
23530 GemmMicrokernelTester()
23531 .mr(1)
23532 .nr(8)
23533 .kr(1)
23534 .sr(4)
23535 .m(m)
23536 .n(n)
23537 .k(k)
23538 .cm_stride(11)
23539 .iterations(1)
23540 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23541 }
23542 }
23543 }
23544 }
23545
23546 TEST(F32_GEMM_1X8S4__NEONFMA, qmin) {
23547 TEST_REQUIRES_ARM_NEON_FMA;
23548 GemmMicrokernelTester()
23549 .mr(1)
23550 .nr(8)
23551 .kr(1)
23552 .sr(4)
23553 .m(1)
23554 .n(8)
23555 .k(4)
23556 .qmin(128)
23557 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23558 }
23559
23560 TEST(F32_GEMM_1X8S4__NEONFMA, qmax) {
23561 TEST_REQUIRES_ARM_NEON_FMA;
23562 GemmMicrokernelTester()
23563 .mr(1)
23564 .nr(8)
23565 .kr(1)
23566 .sr(4)
23567 .m(1)
23568 .n(8)
23569 .k(4)
23570 .qmax(128)
23571 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23572 }
23573
23574 TEST(F32_GEMM_1X8S4__NEONFMA, strided_cm) {
23575 TEST_REQUIRES_ARM_NEON_FMA;
23576 GemmMicrokernelTester()
23577 .mr(1)
23578 .nr(8)
23579 .kr(1)
23580 .sr(4)
23581 .m(1)
23582 .n(8)
23583 .k(4)
23584 .cm_stride(11)
23585 .Test(xnn_f32_gemm_ukernel_1x8s4__neonfma);
23586 }
23587#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
23588
23589
23590#if XNN_ARCH_ARM || XNN_ARCH_ARM64
23591 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4) {
23592 TEST_REQUIRES_ARM_NEON_FMA;
23593 GemmMicrokernelTester()
23594 .mr(4)
23595 .nr(8)
23596 .kr(1)
23597 .sr(4)
23598 .m(4)
23599 .n(8)
23600 .k(4)
23601 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23602 }
23603
23604 TEST(F32_GEMM_4X8S4__NEONFMA, strided_cn) {
23605 TEST_REQUIRES_ARM_NEON_FMA;
23606 GemmMicrokernelTester()
23607 .mr(4)
23608 .nr(8)
23609 .kr(1)
23610 .sr(4)
23611 .m(4)
23612 .n(8)
23613 .k(4)
23614 .cn_stride(11)
23615 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23616 }
23617
23618 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_strided_a) {
23619 TEST_REQUIRES_ARM_NEON_FMA;
23620 GemmMicrokernelTester()
23621 .mr(4)
23622 .nr(8)
23623 .kr(1)
23624 .sr(4)
23625 .m(4)
23626 .n(8)
23627 .k(4)
23628 .a_stride(7)
23629 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23630 }
23631
23632 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_subtile) {
23633 TEST_REQUIRES_ARM_NEON_FMA;
23634 for (uint32_t m = 1; m <= 4; m++) {
23635 for (uint32_t n = 1; n <= 8; n++) {
23636 GemmMicrokernelTester()
23637 .mr(4)
23638 .nr(8)
23639 .kr(1)
23640 .sr(4)
23641 .m(m)
23642 .n(n)
23643 .k(4)
23644 .iterations(1)
23645 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23646 }
23647 }
23648 }
23649
23650 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_subtile_m) {
23651 TEST_REQUIRES_ARM_NEON_FMA;
23652 for (uint32_t m = 1; m <= 4; m++) {
23653 GemmMicrokernelTester()
23654 .mr(4)
23655 .nr(8)
23656 .kr(1)
23657 .sr(4)
23658 .m(m)
23659 .n(8)
23660 .k(4)
23661 .iterations(1)
23662 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23663 }
23664 }
23665
23666 TEST(F32_GEMM_4X8S4__NEONFMA, k_eq_4_subtile_n) {
23667 TEST_REQUIRES_ARM_NEON_FMA;
23668 for (uint32_t n = 1; n <= 8; n++) {
23669 GemmMicrokernelTester()
23670 .mr(4)
23671 .nr(8)
23672 .kr(1)
23673 .sr(4)
23674 .m(4)
23675 .n(n)
23676 .k(4)
23677 .iterations(1)
23678 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23679 }
23680 }
23681
23682 TEST(F32_GEMM_4X8S4__NEONFMA, k_lt_4) {
23683 TEST_REQUIRES_ARM_NEON_FMA;
23684 for (size_t k = 1; k < 4; k++) {
23685 GemmMicrokernelTester()
23686 .mr(4)
23687 .nr(8)
23688 .kr(1)
23689 .sr(4)
23690 .m(4)
23691 .n(8)
23692 .k(k)
23693 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23694 }
23695 }
23696
23697 TEST(F32_GEMM_4X8S4__NEONFMA, k_lt_4_strided_a) {
23698 TEST_REQUIRES_ARM_NEON_FMA;
23699 for (size_t k = 1; k < 4; k++) {
23700 GemmMicrokernelTester()
23701 .mr(4)
23702 .nr(8)
23703 .kr(1)
23704 .sr(4)
23705 .m(4)
23706 .n(8)
23707 .k(k)
23708 .a_stride(7)
23709 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23710 }
23711 }
23712
23713 TEST(F32_GEMM_4X8S4__NEONFMA, k_lt_4_subtile) {
23714 TEST_REQUIRES_ARM_NEON_FMA;
23715 for (size_t k = 1; k < 4; k++) {
23716 for (uint32_t m = 1; m <= 4; m++) {
23717 for (uint32_t n = 1; n <= 8; n++) {
23718 GemmMicrokernelTester()
23719 .mr(4)
23720 .nr(8)
23721 .kr(1)
23722 .sr(4)
23723 .m(m)
23724 .n(n)
23725 .k(k)
23726 .iterations(1)
23727 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23728 }
23729 }
23730 }
23731 }
23732
23733 TEST(F32_GEMM_4X8S4__NEONFMA, k_gt_4) {
23734 TEST_REQUIRES_ARM_NEON_FMA;
23735 for (size_t k = 5; k < 8; k++) {
23736 GemmMicrokernelTester()
23737 .mr(4)
23738 .nr(8)
23739 .kr(1)
23740 .sr(4)
23741 .m(4)
23742 .n(8)
23743 .k(k)
23744 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23745 }
23746 }
23747
23748 TEST(F32_GEMM_4X8S4__NEONFMA, k_gt_4_strided_a) {
23749 TEST_REQUIRES_ARM_NEON_FMA;
23750 for (size_t k = 5; k < 8; k++) {
23751 GemmMicrokernelTester()
23752 .mr(4)
23753 .nr(8)
23754 .kr(1)
23755 .sr(4)
23756 .m(4)
23757 .n(8)
23758 .k(k)
23759 .a_stride(11)
23760 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23761 }
23762 }
23763
23764 TEST(F32_GEMM_4X8S4__NEONFMA, k_gt_4_subtile) {
23765 TEST_REQUIRES_ARM_NEON_FMA;
23766 for (size_t k = 5; k < 8; k++) {
23767 for (uint32_t m = 1; m <= 4; m++) {
23768 for (uint32_t n = 1; n <= 8; n++) {
23769 GemmMicrokernelTester()
23770 .mr(4)
23771 .nr(8)
23772 .kr(1)
23773 .sr(4)
23774 .m(m)
23775 .n(n)
23776 .k(k)
23777 .iterations(1)
23778 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23779 }
23780 }
23781 }
23782 }
23783
23784 TEST(F32_GEMM_4X8S4__NEONFMA, k_div_4) {
23785 TEST_REQUIRES_ARM_NEON_FMA;
23786 for (size_t k = 8; k <= 40; k += 4) {
23787 GemmMicrokernelTester()
23788 .mr(4)
23789 .nr(8)
23790 .kr(1)
23791 .sr(4)
23792 .m(4)
23793 .n(8)
23794 .k(k)
23795 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23796 }
23797 }
23798
23799 TEST(F32_GEMM_4X8S4__NEONFMA, k_div_4_strided_a) {
23800 TEST_REQUIRES_ARM_NEON_FMA;
23801 for (size_t k = 8; k <= 40; k += 4) {
23802 GemmMicrokernelTester()
23803 .mr(4)
23804 .nr(8)
23805 .kr(1)
23806 .sr(4)
23807 .m(4)
23808 .n(8)
23809 .k(k)
23810 .a_stride(43)
23811 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23812 }
23813 }
23814
23815 TEST(F32_GEMM_4X8S4__NEONFMA, k_div_4_subtile) {
23816 TEST_REQUIRES_ARM_NEON_FMA;
23817 for (size_t k = 8; k <= 40; k += 4) {
23818 for (uint32_t m = 1; m <= 4; m++) {
23819 for (uint32_t n = 1; n <= 8; n++) {
23820 GemmMicrokernelTester()
23821 .mr(4)
23822 .nr(8)
23823 .kr(1)
23824 .sr(4)
23825 .m(m)
23826 .n(n)
23827 .k(k)
23828 .iterations(1)
23829 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23830 }
23831 }
23832 }
23833 }
23834
23835 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8) {
23836 TEST_REQUIRES_ARM_NEON_FMA;
23837 for (uint32_t n = 9; n < 16; n++) {
23838 for (size_t k = 1; k <= 20; k += 5) {
23839 GemmMicrokernelTester()
23840 .mr(4)
23841 .nr(8)
23842 .kr(1)
23843 .sr(4)
23844 .m(4)
23845 .n(8)
23846 .k(k)
23847 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23848 }
23849 }
23850 }
23851
23852 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8_strided_cn) {
23853 TEST_REQUIRES_ARM_NEON_FMA;
23854 for (uint32_t n = 9; n < 16; n++) {
23855 for (size_t k = 1; k <= 20; k += 5) {
23856 GemmMicrokernelTester()
23857 .mr(4)
23858 .nr(8)
23859 .kr(1)
23860 .sr(4)
23861 .m(4)
23862 .n(8)
23863 .k(k)
23864 .cn_stride(11)
23865 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23866 }
23867 }
23868 }
23869
23870 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8_strided_a) {
23871 TEST_REQUIRES_ARM_NEON_FMA;
23872 for (uint32_t n = 9; n < 16; n++) {
23873 for (size_t k = 1; k <= 20; k += 5) {
23874 GemmMicrokernelTester()
23875 .mr(4)
23876 .nr(8)
23877 .kr(1)
23878 .sr(4)
23879 .m(4)
23880 .n(n)
23881 .k(k)
23882 .a_stride(23)
23883 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23884 }
23885 }
23886 }
23887
23888 TEST(F32_GEMM_4X8S4__NEONFMA, n_gt_8_subtile) {
23889 TEST_REQUIRES_ARM_NEON_FMA;
23890 for (uint32_t n = 9; n < 16; n++) {
23891 for (size_t k = 1; k <= 20; k += 5) {
23892 for (uint32_t m = 1; m <= 4; m++) {
23893 GemmMicrokernelTester()
23894 .mr(4)
23895 .nr(8)
23896 .kr(1)
23897 .sr(4)
23898 .m(m)
23899 .n(n)
23900 .k(k)
23901 .iterations(1)
23902 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23903 }
23904 }
23905 }
23906 }
23907
23908 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8) {
23909 TEST_REQUIRES_ARM_NEON_FMA;
23910 for (uint32_t n = 16; n <= 24; n += 8) {
23911 for (size_t k = 1; k <= 20; k += 5) {
23912 GemmMicrokernelTester()
23913 .mr(4)
23914 .nr(8)
23915 .kr(1)
23916 .sr(4)
23917 .m(4)
23918 .n(8)
23919 .k(k)
23920 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23921 }
23922 }
23923 }
23924
23925 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8_strided_cn) {
23926 TEST_REQUIRES_ARM_NEON_FMA;
23927 for (uint32_t n = 16; n <= 24; n += 8) {
23928 for (size_t k = 1; k <= 20; k += 5) {
23929 GemmMicrokernelTester()
23930 .mr(4)
23931 .nr(8)
23932 .kr(1)
23933 .sr(4)
23934 .m(4)
23935 .n(n)
23936 .k(k)
23937 .cn_stride(11)
23938 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23939 }
23940 }
23941 }
23942
23943 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8_strided_a) {
23944 TEST_REQUIRES_ARM_NEON_FMA;
23945 for (uint32_t n = 16; n <= 24; n += 8) {
23946 for (size_t k = 1; k <= 20; k += 5) {
23947 GemmMicrokernelTester()
23948 .mr(4)
23949 .nr(8)
23950 .kr(1)
23951 .sr(4)
23952 .m(4)
23953 .n(n)
23954 .k(k)
23955 .a_stride(23)
23956 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23957 }
23958 }
23959 }
23960
23961 TEST(F32_GEMM_4X8S4__NEONFMA, n_div_8_subtile) {
23962 TEST_REQUIRES_ARM_NEON_FMA;
23963 for (uint32_t n = 16; n <= 24; n += 8) {
23964 for (size_t k = 1; k <= 20; k += 5) {
23965 for (uint32_t m = 1; m <= 4; m++) {
23966 GemmMicrokernelTester()
23967 .mr(4)
23968 .nr(8)
23969 .kr(1)
23970 .sr(4)
23971 .m(m)
23972 .n(n)
23973 .k(k)
23974 .iterations(1)
23975 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23976 }
23977 }
23978 }
23979 }
23980
23981 TEST(F32_GEMM_4X8S4__NEONFMA, strided_cm_subtile) {
23982 TEST_REQUIRES_ARM_NEON_FMA;
23983 for (size_t k = 1; k <= 20; k += 5) {
23984 for (uint32_t m = 1; m <= 4; m++) {
23985 for (uint32_t n = 1; n <= 8; n++) {
23986 GemmMicrokernelTester()
23987 .mr(4)
23988 .nr(8)
23989 .kr(1)
23990 .sr(4)
23991 .m(m)
23992 .n(n)
23993 .k(k)
23994 .cm_stride(11)
23995 .iterations(1)
23996 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
23997 }
23998 }
23999 }
24000 }
24001
24002 TEST(F32_GEMM_4X8S4__NEONFMA, qmin) {
24003 TEST_REQUIRES_ARM_NEON_FMA;
24004 GemmMicrokernelTester()
24005 .mr(4)
24006 .nr(8)
24007 .kr(1)
24008 .sr(4)
24009 .m(4)
24010 .n(8)
24011 .k(4)
24012 .qmin(128)
24013 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
24014 }
24015
24016 TEST(F32_GEMM_4X8S4__NEONFMA, qmax) {
24017 TEST_REQUIRES_ARM_NEON_FMA;
24018 GemmMicrokernelTester()
24019 .mr(4)
24020 .nr(8)
24021 .kr(1)
24022 .sr(4)
24023 .m(4)
24024 .n(8)
24025 .k(4)
24026 .qmax(128)
24027 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
24028 }
24029
24030 TEST(F32_GEMM_4X8S4__NEONFMA, strided_cm) {
24031 TEST_REQUIRES_ARM_NEON_FMA;
24032 GemmMicrokernelTester()
24033 .mr(4)
24034 .nr(8)
24035 .kr(1)
24036 .sr(4)
24037 .m(4)
24038 .n(8)
24039 .k(4)
24040 .cm_stride(11)
24041 .Test(xnn_f32_gemm_ukernel_4x8s4__neonfma);
24042 }
24043#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24044
24045
24046#if XNN_ARCH_ARM || XNN_ARCH_ARM64
24047 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4) {
24048 TEST_REQUIRES_ARM_NEON_FMA;
24049 GemmMicrokernelTester()
24050 .mr(6)
24051 .nr(8)
24052 .kr(1)
24053 .sr(4)
24054 .m(6)
24055 .n(8)
24056 .k(4)
24057 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24058 }
24059
24060 TEST(F32_GEMM_6X8S4__NEONFMA, strided_cn) {
24061 TEST_REQUIRES_ARM_NEON_FMA;
24062 GemmMicrokernelTester()
24063 .mr(6)
24064 .nr(8)
24065 .kr(1)
24066 .sr(4)
24067 .m(6)
24068 .n(8)
24069 .k(4)
24070 .cn_stride(11)
24071 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24072 }
24073
24074 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_strided_a) {
24075 TEST_REQUIRES_ARM_NEON_FMA;
24076 GemmMicrokernelTester()
24077 .mr(6)
24078 .nr(8)
24079 .kr(1)
24080 .sr(4)
24081 .m(6)
24082 .n(8)
24083 .k(4)
24084 .a_stride(7)
24085 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24086 }
24087
24088 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_subtile) {
24089 TEST_REQUIRES_ARM_NEON_FMA;
24090 for (uint32_t m = 1; m <= 6; m++) {
24091 for (uint32_t n = 1; n <= 8; n++) {
24092 GemmMicrokernelTester()
24093 .mr(6)
24094 .nr(8)
24095 .kr(1)
24096 .sr(4)
24097 .m(m)
24098 .n(n)
24099 .k(4)
24100 .iterations(1)
24101 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24102 }
24103 }
24104 }
24105
24106 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_subtile_m) {
24107 TEST_REQUIRES_ARM_NEON_FMA;
24108 for (uint32_t m = 1; m <= 6; m++) {
24109 GemmMicrokernelTester()
24110 .mr(6)
24111 .nr(8)
24112 .kr(1)
24113 .sr(4)
24114 .m(m)
24115 .n(8)
24116 .k(4)
24117 .iterations(1)
24118 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24119 }
24120 }
24121
24122 TEST(F32_GEMM_6X8S4__NEONFMA, k_eq_4_subtile_n) {
24123 TEST_REQUIRES_ARM_NEON_FMA;
24124 for (uint32_t n = 1; n <= 8; n++) {
24125 GemmMicrokernelTester()
24126 .mr(6)
24127 .nr(8)
24128 .kr(1)
24129 .sr(4)
24130 .m(6)
24131 .n(n)
24132 .k(4)
24133 .iterations(1)
24134 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24135 }
24136 }
24137
24138 TEST(F32_GEMM_6X8S4__NEONFMA, k_lt_4) {
24139 TEST_REQUIRES_ARM_NEON_FMA;
24140 for (size_t k = 1; k < 4; k++) {
24141 GemmMicrokernelTester()
24142 .mr(6)
24143 .nr(8)
24144 .kr(1)
24145 .sr(4)
24146 .m(6)
24147 .n(8)
24148 .k(k)
24149 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24150 }
24151 }
24152
24153 TEST(F32_GEMM_6X8S4__NEONFMA, k_lt_4_strided_a) {
24154 TEST_REQUIRES_ARM_NEON_FMA;
24155 for (size_t k = 1; k < 4; k++) {
24156 GemmMicrokernelTester()
24157 .mr(6)
24158 .nr(8)
24159 .kr(1)
24160 .sr(4)
24161 .m(6)
24162 .n(8)
24163 .k(k)
24164 .a_stride(7)
24165 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24166 }
24167 }
24168
24169 TEST(F32_GEMM_6X8S4__NEONFMA, k_lt_4_subtile) {
24170 TEST_REQUIRES_ARM_NEON_FMA;
24171 for (size_t k = 1; k < 4; k++) {
24172 for (uint32_t m = 1; m <= 6; m++) {
24173 for (uint32_t n = 1; n <= 8; n++) {
24174 GemmMicrokernelTester()
24175 .mr(6)
24176 .nr(8)
24177 .kr(1)
24178 .sr(4)
24179 .m(m)
24180 .n(n)
24181 .k(k)
24182 .iterations(1)
24183 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24184 }
24185 }
24186 }
24187 }
24188
24189 TEST(F32_GEMM_6X8S4__NEONFMA, k_gt_4) {
24190 TEST_REQUIRES_ARM_NEON_FMA;
24191 for (size_t k = 5; k < 8; k++) {
24192 GemmMicrokernelTester()
24193 .mr(6)
24194 .nr(8)
24195 .kr(1)
24196 .sr(4)
24197 .m(6)
24198 .n(8)
24199 .k(k)
24200 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24201 }
24202 }
24203
24204 TEST(F32_GEMM_6X8S4__NEONFMA, k_gt_4_strided_a) {
24205 TEST_REQUIRES_ARM_NEON_FMA;
24206 for (size_t k = 5; k < 8; k++) {
24207 GemmMicrokernelTester()
24208 .mr(6)
24209 .nr(8)
24210 .kr(1)
24211 .sr(4)
24212 .m(6)
24213 .n(8)
24214 .k(k)
24215 .a_stride(11)
24216 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24217 }
24218 }
24219
24220 TEST(F32_GEMM_6X8S4__NEONFMA, k_gt_4_subtile) {
24221 TEST_REQUIRES_ARM_NEON_FMA;
24222 for (size_t k = 5; k < 8; k++) {
24223 for (uint32_t m = 1; m <= 6; m++) {
24224 for (uint32_t n = 1; n <= 8; n++) {
24225 GemmMicrokernelTester()
24226 .mr(6)
24227 .nr(8)
24228 .kr(1)
24229 .sr(4)
24230 .m(m)
24231 .n(n)
24232 .k(k)
24233 .iterations(1)
24234 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24235 }
24236 }
24237 }
24238 }
24239
24240 TEST(F32_GEMM_6X8S4__NEONFMA, k_div_4) {
24241 TEST_REQUIRES_ARM_NEON_FMA;
24242 for (size_t k = 8; k <= 40; k += 4) {
24243 GemmMicrokernelTester()
24244 .mr(6)
24245 .nr(8)
24246 .kr(1)
24247 .sr(4)
24248 .m(6)
24249 .n(8)
24250 .k(k)
24251 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24252 }
24253 }
24254
24255 TEST(F32_GEMM_6X8S4__NEONFMA, k_div_4_strided_a) {
24256 TEST_REQUIRES_ARM_NEON_FMA;
24257 for (size_t k = 8; k <= 40; k += 4) {
24258 GemmMicrokernelTester()
24259 .mr(6)
24260 .nr(8)
24261 .kr(1)
24262 .sr(4)
24263 .m(6)
24264 .n(8)
24265 .k(k)
24266 .a_stride(43)
24267 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24268 }
24269 }
24270
24271 TEST(F32_GEMM_6X8S4__NEONFMA, k_div_4_subtile) {
24272 TEST_REQUIRES_ARM_NEON_FMA;
24273 for (size_t k = 8; k <= 40; k += 4) {
24274 for (uint32_t m = 1; m <= 6; m++) {
24275 for (uint32_t n = 1; n <= 8; n++) {
24276 GemmMicrokernelTester()
24277 .mr(6)
24278 .nr(8)
24279 .kr(1)
24280 .sr(4)
24281 .m(m)
24282 .n(n)
24283 .k(k)
24284 .iterations(1)
24285 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24286 }
24287 }
24288 }
24289 }
24290
24291 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8) {
24292 TEST_REQUIRES_ARM_NEON_FMA;
24293 for (uint32_t n = 9; n < 16; n++) {
24294 for (size_t k = 1; k <= 20; k += 5) {
24295 GemmMicrokernelTester()
24296 .mr(6)
24297 .nr(8)
24298 .kr(1)
24299 .sr(4)
24300 .m(6)
24301 .n(8)
24302 .k(k)
24303 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24304 }
24305 }
24306 }
24307
24308 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8_strided_cn) {
24309 TEST_REQUIRES_ARM_NEON_FMA;
24310 for (uint32_t n = 9; n < 16; n++) {
24311 for (size_t k = 1; k <= 20; k += 5) {
24312 GemmMicrokernelTester()
24313 .mr(6)
24314 .nr(8)
24315 .kr(1)
24316 .sr(4)
24317 .m(6)
24318 .n(8)
24319 .k(k)
24320 .cn_stride(11)
24321 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24322 }
24323 }
24324 }
24325
24326 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8_strided_a) {
24327 TEST_REQUIRES_ARM_NEON_FMA;
24328 for (uint32_t n = 9; n < 16; n++) {
24329 for (size_t k = 1; k <= 20; k += 5) {
24330 GemmMicrokernelTester()
24331 .mr(6)
24332 .nr(8)
24333 .kr(1)
24334 .sr(4)
24335 .m(6)
24336 .n(n)
24337 .k(k)
24338 .a_stride(23)
24339 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24340 }
24341 }
24342 }
24343
24344 TEST(F32_GEMM_6X8S4__NEONFMA, n_gt_8_subtile) {
24345 TEST_REQUIRES_ARM_NEON_FMA;
24346 for (uint32_t n = 9; n < 16; n++) {
24347 for (size_t k = 1; k <= 20; k += 5) {
24348 for (uint32_t m = 1; m <= 6; m++) {
24349 GemmMicrokernelTester()
24350 .mr(6)
24351 .nr(8)
24352 .kr(1)
24353 .sr(4)
24354 .m(m)
24355 .n(n)
24356 .k(k)
24357 .iterations(1)
24358 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24359 }
24360 }
24361 }
24362 }
24363
24364 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8) {
24365 TEST_REQUIRES_ARM_NEON_FMA;
24366 for (uint32_t n = 16; n <= 24; n += 8) {
24367 for (size_t k = 1; k <= 20; k += 5) {
24368 GemmMicrokernelTester()
24369 .mr(6)
24370 .nr(8)
24371 .kr(1)
24372 .sr(4)
24373 .m(6)
24374 .n(8)
24375 .k(k)
24376 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24377 }
24378 }
24379 }
24380
24381 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8_strided_cn) {
24382 TEST_REQUIRES_ARM_NEON_FMA;
24383 for (uint32_t n = 16; n <= 24; n += 8) {
24384 for (size_t k = 1; k <= 20; k += 5) {
24385 GemmMicrokernelTester()
24386 .mr(6)
24387 .nr(8)
24388 .kr(1)
24389 .sr(4)
24390 .m(6)
24391 .n(n)
24392 .k(k)
24393 .cn_stride(11)
24394 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24395 }
24396 }
24397 }
24398
24399 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8_strided_a) {
24400 TEST_REQUIRES_ARM_NEON_FMA;
24401 for (uint32_t n = 16; n <= 24; n += 8) {
24402 for (size_t k = 1; k <= 20; k += 5) {
24403 GemmMicrokernelTester()
24404 .mr(6)
24405 .nr(8)
24406 .kr(1)
24407 .sr(4)
24408 .m(6)
24409 .n(n)
24410 .k(k)
24411 .a_stride(23)
24412 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24413 }
24414 }
24415 }
24416
24417 TEST(F32_GEMM_6X8S4__NEONFMA, n_div_8_subtile) {
24418 TEST_REQUIRES_ARM_NEON_FMA;
24419 for (uint32_t n = 16; n <= 24; n += 8) {
24420 for (size_t k = 1; k <= 20; k += 5) {
24421 for (uint32_t m = 1; m <= 6; m++) {
24422 GemmMicrokernelTester()
24423 .mr(6)
24424 .nr(8)
24425 .kr(1)
24426 .sr(4)
24427 .m(m)
24428 .n(n)
24429 .k(k)
24430 .iterations(1)
24431 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24432 }
24433 }
24434 }
24435 }
24436
24437 TEST(F32_GEMM_6X8S4__NEONFMA, strided_cm_subtile) {
24438 TEST_REQUIRES_ARM_NEON_FMA;
24439 for (size_t k = 1; k <= 20; k += 5) {
24440 for (uint32_t m = 1; m <= 6; m++) {
24441 for (uint32_t n = 1; n <= 8; n++) {
24442 GemmMicrokernelTester()
24443 .mr(6)
24444 .nr(8)
24445 .kr(1)
24446 .sr(4)
24447 .m(m)
24448 .n(n)
24449 .k(k)
24450 .cm_stride(11)
24451 .iterations(1)
24452 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24453 }
24454 }
24455 }
24456 }
24457
24458 TEST(F32_GEMM_6X8S4__NEONFMA, qmin) {
24459 TEST_REQUIRES_ARM_NEON_FMA;
24460 GemmMicrokernelTester()
24461 .mr(6)
24462 .nr(8)
24463 .kr(1)
24464 .sr(4)
24465 .m(6)
24466 .n(8)
24467 .k(4)
24468 .qmin(128)
24469 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24470 }
24471
24472 TEST(F32_GEMM_6X8S4__NEONFMA, qmax) {
24473 TEST_REQUIRES_ARM_NEON_FMA;
24474 GemmMicrokernelTester()
24475 .mr(6)
24476 .nr(8)
24477 .kr(1)
24478 .sr(4)
24479 .m(6)
24480 .n(8)
24481 .k(4)
24482 .qmax(128)
24483 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24484 }
24485
24486 TEST(F32_GEMM_6X8S4__NEONFMA, strided_cm) {
24487 TEST_REQUIRES_ARM_NEON_FMA;
24488 GemmMicrokernelTester()
24489 .mr(6)
24490 .nr(8)
24491 .kr(1)
24492 .sr(4)
24493 .m(6)
24494 .n(8)
24495 .k(4)
24496 .cm_stride(11)
24497 .Test(xnn_f32_gemm_ukernel_6x8s4__neonfma);
24498 }
24499#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24500
24501
24502#if XNN_ARCH_ARM || XNN_ARCH_ARM64
24503 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4) {
24504 TEST_REQUIRES_ARM_NEON_FMA;
24505 GemmMicrokernelTester()
24506 .mr(8)
24507 .nr(8)
24508 .kr(1)
24509 .sr(4)
24510 .m(8)
24511 .n(8)
24512 .k(4)
24513 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24514 }
24515
24516 TEST(F32_GEMM_8X8S4__NEONFMA, strided_cn) {
24517 TEST_REQUIRES_ARM_NEON_FMA;
24518 GemmMicrokernelTester()
24519 .mr(8)
24520 .nr(8)
24521 .kr(1)
24522 .sr(4)
24523 .m(8)
24524 .n(8)
24525 .k(4)
24526 .cn_stride(11)
24527 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24528 }
24529
24530 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_strided_a) {
24531 TEST_REQUIRES_ARM_NEON_FMA;
24532 GemmMicrokernelTester()
24533 .mr(8)
24534 .nr(8)
24535 .kr(1)
24536 .sr(4)
24537 .m(8)
24538 .n(8)
24539 .k(4)
24540 .a_stride(7)
24541 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24542 }
24543
24544 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_subtile) {
24545 TEST_REQUIRES_ARM_NEON_FMA;
24546 for (uint32_t m = 1; m <= 8; m++) {
24547 for (uint32_t n = 1; n <= 8; n++) {
24548 GemmMicrokernelTester()
24549 .mr(8)
24550 .nr(8)
24551 .kr(1)
24552 .sr(4)
24553 .m(m)
24554 .n(n)
24555 .k(4)
24556 .iterations(1)
24557 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24558 }
24559 }
24560 }
24561
24562 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_subtile_m) {
24563 TEST_REQUIRES_ARM_NEON_FMA;
24564 for (uint32_t m = 1; m <= 8; m++) {
24565 GemmMicrokernelTester()
24566 .mr(8)
24567 .nr(8)
24568 .kr(1)
24569 .sr(4)
24570 .m(m)
24571 .n(8)
24572 .k(4)
24573 .iterations(1)
24574 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24575 }
24576 }
24577
24578 TEST(F32_GEMM_8X8S4__NEONFMA, k_eq_4_subtile_n) {
24579 TEST_REQUIRES_ARM_NEON_FMA;
24580 for (uint32_t n = 1; n <= 8; n++) {
24581 GemmMicrokernelTester()
24582 .mr(8)
24583 .nr(8)
24584 .kr(1)
24585 .sr(4)
24586 .m(8)
24587 .n(n)
24588 .k(4)
24589 .iterations(1)
24590 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24591 }
24592 }
24593
24594 TEST(F32_GEMM_8X8S4__NEONFMA, k_lt_4) {
24595 TEST_REQUIRES_ARM_NEON_FMA;
24596 for (size_t k = 1; k < 4; k++) {
24597 GemmMicrokernelTester()
24598 .mr(8)
24599 .nr(8)
24600 .kr(1)
24601 .sr(4)
24602 .m(8)
24603 .n(8)
24604 .k(k)
24605 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24606 }
24607 }
24608
24609 TEST(F32_GEMM_8X8S4__NEONFMA, k_lt_4_strided_a) {
24610 TEST_REQUIRES_ARM_NEON_FMA;
24611 for (size_t k = 1; k < 4; k++) {
24612 GemmMicrokernelTester()
24613 .mr(8)
24614 .nr(8)
24615 .kr(1)
24616 .sr(4)
24617 .m(8)
24618 .n(8)
24619 .k(k)
24620 .a_stride(7)
24621 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24622 }
24623 }
24624
24625 TEST(F32_GEMM_8X8S4__NEONFMA, k_lt_4_subtile) {
24626 TEST_REQUIRES_ARM_NEON_FMA;
24627 for (size_t k = 1; k < 4; k++) {
24628 for (uint32_t m = 1; m <= 8; m++) {
24629 for (uint32_t n = 1; n <= 8; n++) {
24630 GemmMicrokernelTester()
24631 .mr(8)
24632 .nr(8)
24633 .kr(1)
24634 .sr(4)
24635 .m(m)
24636 .n(n)
24637 .k(k)
24638 .iterations(1)
24639 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24640 }
24641 }
24642 }
24643 }
24644
24645 TEST(F32_GEMM_8X8S4__NEONFMA, k_gt_4) {
24646 TEST_REQUIRES_ARM_NEON_FMA;
24647 for (size_t k = 5; k < 8; k++) {
24648 GemmMicrokernelTester()
24649 .mr(8)
24650 .nr(8)
24651 .kr(1)
24652 .sr(4)
24653 .m(8)
24654 .n(8)
24655 .k(k)
24656 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24657 }
24658 }
24659
24660 TEST(F32_GEMM_8X8S4__NEONFMA, k_gt_4_strided_a) {
24661 TEST_REQUIRES_ARM_NEON_FMA;
24662 for (size_t k = 5; k < 8; k++) {
24663 GemmMicrokernelTester()
24664 .mr(8)
24665 .nr(8)
24666 .kr(1)
24667 .sr(4)
24668 .m(8)
24669 .n(8)
24670 .k(k)
24671 .a_stride(11)
24672 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24673 }
24674 }
24675
24676 TEST(F32_GEMM_8X8S4__NEONFMA, k_gt_4_subtile) {
24677 TEST_REQUIRES_ARM_NEON_FMA;
24678 for (size_t k = 5; k < 8; k++) {
24679 for (uint32_t m = 1; m <= 8; m++) {
24680 for (uint32_t n = 1; n <= 8; n++) {
24681 GemmMicrokernelTester()
24682 .mr(8)
24683 .nr(8)
24684 .kr(1)
24685 .sr(4)
24686 .m(m)
24687 .n(n)
24688 .k(k)
24689 .iterations(1)
24690 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24691 }
24692 }
24693 }
24694 }
24695
24696 TEST(F32_GEMM_8X8S4__NEONFMA, k_div_4) {
24697 TEST_REQUIRES_ARM_NEON_FMA;
24698 for (size_t k = 8; k <= 40; k += 4) {
24699 GemmMicrokernelTester()
24700 .mr(8)
24701 .nr(8)
24702 .kr(1)
24703 .sr(4)
24704 .m(8)
24705 .n(8)
24706 .k(k)
24707 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24708 }
24709 }
24710
24711 TEST(F32_GEMM_8X8S4__NEONFMA, k_div_4_strided_a) {
24712 TEST_REQUIRES_ARM_NEON_FMA;
24713 for (size_t k = 8; k <= 40; k += 4) {
24714 GemmMicrokernelTester()
24715 .mr(8)
24716 .nr(8)
24717 .kr(1)
24718 .sr(4)
24719 .m(8)
24720 .n(8)
24721 .k(k)
24722 .a_stride(43)
24723 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24724 }
24725 }
24726
24727 TEST(F32_GEMM_8X8S4__NEONFMA, k_div_4_subtile) {
24728 TEST_REQUIRES_ARM_NEON_FMA;
24729 for (size_t k = 8; k <= 40; k += 4) {
24730 for (uint32_t m = 1; m <= 8; m++) {
24731 for (uint32_t n = 1; n <= 8; n++) {
24732 GemmMicrokernelTester()
24733 .mr(8)
24734 .nr(8)
24735 .kr(1)
24736 .sr(4)
24737 .m(m)
24738 .n(n)
24739 .k(k)
24740 .iterations(1)
24741 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24742 }
24743 }
24744 }
24745 }
24746
24747 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8) {
24748 TEST_REQUIRES_ARM_NEON_FMA;
24749 for (uint32_t n = 9; n < 16; n++) {
24750 for (size_t k = 1; k <= 20; k += 5) {
24751 GemmMicrokernelTester()
24752 .mr(8)
24753 .nr(8)
24754 .kr(1)
24755 .sr(4)
24756 .m(8)
24757 .n(8)
24758 .k(k)
24759 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24760 }
24761 }
24762 }
24763
24764 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8_strided_cn) {
24765 TEST_REQUIRES_ARM_NEON_FMA;
24766 for (uint32_t n = 9; n < 16; n++) {
24767 for (size_t k = 1; k <= 20; k += 5) {
24768 GemmMicrokernelTester()
24769 .mr(8)
24770 .nr(8)
24771 .kr(1)
24772 .sr(4)
24773 .m(8)
24774 .n(8)
24775 .k(k)
24776 .cn_stride(11)
24777 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24778 }
24779 }
24780 }
24781
24782 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8_strided_a) {
24783 TEST_REQUIRES_ARM_NEON_FMA;
24784 for (uint32_t n = 9; n < 16; n++) {
24785 for (size_t k = 1; k <= 20; k += 5) {
24786 GemmMicrokernelTester()
24787 .mr(8)
24788 .nr(8)
24789 .kr(1)
24790 .sr(4)
24791 .m(8)
24792 .n(n)
24793 .k(k)
24794 .a_stride(23)
24795 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24796 }
24797 }
24798 }
24799
24800 TEST(F32_GEMM_8X8S4__NEONFMA, n_gt_8_subtile) {
24801 TEST_REQUIRES_ARM_NEON_FMA;
24802 for (uint32_t n = 9; n < 16; n++) {
24803 for (size_t k = 1; k <= 20; k += 5) {
24804 for (uint32_t m = 1; m <= 8; m++) {
24805 GemmMicrokernelTester()
24806 .mr(8)
24807 .nr(8)
24808 .kr(1)
24809 .sr(4)
24810 .m(m)
24811 .n(n)
24812 .k(k)
24813 .iterations(1)
24814 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24815 }
24816 }
24817 }
24818 }
24819
24820 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8) {
24821 TEST_REQUIRES_ARM_NEON_FMA;
24822 for (uint32_t n = 16; n <= 24; n += 8) {
24823 for (size_t k = 1; k <= 20; k += 5) {
24824 GemmMicrokernelTester()
24825 .mr(8)
24826 .nr(8)
24827 .kr(1)
24828 .sr(4)
24829 .m(8)
24830 .n(8)
24831 .k(k)
24832 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24833 }
24834 }
24835 }
24836
24837 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8_strided_cn) {
24838 TEST_REQUIRES_ARM_NEON_FMA;
24839 for (uint32_t n = 16; n <= 24; n += 8) {
24840 for (size_t k = 1; k <= 20; k += 5) {
24841 GemmMicrokernelTester()
24842 .mr(8)
24843 .nr(8)
24844 .kr(1)
24845 .sr(4)
24846 .m(8)
24847 .n(n)
24848 .k(k)
24849 .cn_stride(11)
24850 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24851 }
24852 }
24853 }
24854
24855 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8_strided_a) {
24856 TEST_REQUIRES_ARM_NEON_FMA;
24857 for (uint32_t n = 16; n <= 24; n += 8) {
24858 for (size_t k = 1; k <= 20; k += 5) {
24859 GemmMicrokernelTester()
24860 .mr(8)
24861 .nr(8)
24862 .kr(1)
24863 .sr(4)
24864 .m(8)
24865 .n(n)
24866 .k(k)
24867 .a_stride(23)
24868 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24869 }
24870 }
24871 }
24872
24873 TEST(F32_GEMM_8X8S4__NEONFMA, n_div_8_subtile) {
24874 TEST_REQUIRES_ARM_NEON_FMA;
24875 for (uint32_t n = 16; n <= 24; n += 8) {
24876 for (size_t k = 1; k <= 20; k += 5) {
24877 for (uint32_t m = 1; m <= 8; m++) {
24878 GemmMicrokernelTester()
24879 .mr(8)
24880 .nr(8)
24881 .kr(1)
24882 .sr(4)
24883 .m(m)
24884 .n(n)
24885 .k(k)
24886 .iterations(1)
24887 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24888 }
24889 }
24890 }
24891 }
24892
24893 TEST(F32_GEMM_8X8S4__NEONFMA, strided_cm_subtile) {
24894 TEST_REQUIRES_ARM_NEON_FMA;
24895 for (size_t k = 1; k <= 20; k += 5) {
24896 for (uint32_t m = 1; m <= 8; m++) {
24897 for (uint32_t n = 1; n <= 8; n++) {
24898 GemmMicrokernelTester()
24899 .mr(8)
24900 .nr(8)
24901 .kr(1)
24902 .sr(4)
24903 .m(m)
24904 .n(n)
24905 .k(k)
24906 .cm_stride(11)
24907 .iterations(1)
24908 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24909 }
24910 }
24911 }
24912 }
24913
24914 TEST(F32_GEMM_8X8S4__NEONFMA, qmin) {
24915 TEST_REQUIRES_ARM_NEON_FMA;
24916 GemmMicrokernelTester()
24917 .mr(8)
24918 .nr(8)
24919 .kr(1)
24920 .sr(4)
24921 .m(8)
24922 .n(8)
24923 .k(4)
24924 .qmin(128)
24925 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24926 }
24927
24928 TEST(F32_GEMM_8X8S4__NEONFMA, qmax) {
24929 TEST_REQUIRES_ARM_NEON_FMA;
24930 GemmMicrokernelTester()
24931 .mr(8)
24932 .nr(8)
24933 .kr(1)
24934 .sr(4)
24935 .m(8)
24936 .n(8)
24937 .k(4)
24938 .qmax(128)
24939 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24940 }
24941
24942 TEST(F32_GEMM_8X8S4__NEONFMA, strided_cm) {
24943 TEST_REQUIRES_ARM_NEON_FMA;
24944 GemmMicrokernelTester()
24945 .mr(8)
24946 .nr(8)
24947 .kr(1)
24948 .sr(4)
24949 .m(8)
24950 .n(8)
24951 .k(4)
24952 .cm_stride(11)
24953 .Test(xnn_f32_gemm_ukernel_8x8s4__neonfma);
24954 }
24955#endif // XNN_ARCH_ARM || XNN_ARCH_ARM64
24956
24957
Marat Dukhan1dadbf72019-10-01 10:46:20 -070024958#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070024959 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1) {
24960 TEST_REQUIRES_X86_SSE;
24961 GemmMicrokernelTester()
24962 .mr(1)
24963 .nr(8)
24964 .kr(1)
24965 .sr(1)
24966 .m(1)
24967 .n(8)
24968 .k(1)
24969 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
24970 }
24971
24972 TEST(F32_GEMM_1X8__SSE_LOAD1, strided_cn) {
24973 TEST_REQUIRES_X86_SSE;
24974 GemmMicrokernelTester()
24975 .mr(1)
24976 .nr(8)
24977 .kr(1)
24978 .sr(1)
24979 .m(1)
24980 .n(8)
24981 .k(1)
24982 .cn_stride(11)
24983 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
24984 }
24985
24986 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_strided_a) {
24987 TEST_REQUIRES_X86_SSE;
24988 GemmMicrokernelTester()
24989 .mr(1)
24990 .nr(8)
24991 .kr(1)
24992 .sr(1)
24993 .m(1)
24994 .n(8)
24995 .k(1)
24996 .a_stride(3)
24997 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
24998 }
24999
25000 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_subtile) {
25001 TEST_REQUIRES_X86_SSE;
25002 for (uint32_t m = 1; m <= 1; m++) {
25003 for (uint32_t n = 1; n <= 8; n++) {
25004 GemmMicrokernelTester()
25005 .mr(1)
25006 .nr(8)
25007 .kr(1)
25008 .sr(1)
25009 .m(m)
25010 .n(n)
25011 .k(1)
25012 .iterations(1)
25013 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25014 }
25015 }
25016 }
25017
25018 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_subtile_m) {
25019 TEST_REQUIRES_X86_SSE;
25020 for (uint32_t m = 1; m <= 1; m++) {
25021 GemmMicrokernelTester()
25022 .mr(1)
25023 .nr(8)
25024 .kr(1)
25025 .sr(1)
25026 .m(m)
25027 .n(8)
25028 .k(1)
25029 .iterations(1)
25030 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25031 }
25032 }
25033
25034 TEST(F32_GEMM_1X8__SSE_LOAD1, k_eq_1_subtile_n) {
25035 TEST_REQUIRES_X86_SSE;
25036 for (uint32_t n = 1; n <= 8; n++) {
25037 GemmMicrokernelTester()
25038 .mr(1)
25039 .nr(8)
25040 .kr(1)
25041 .sr(1)
25042 .m(1)
25043 .n(n)
25044 .k(1)
25045 .iterations(1)
25046 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25047 }
25048 }
25049
25050 TEST(F32_GEMM_1X8__SSE_LOAD1, k_gt_1) {
25051 TEST_REQUIRES_X86_SSE;
25052 for (size_t k = 2; k < 10; k++) {
25053 GemmMicrokernelTester()
25054 .mr(1)
25055 .nr(8)
25056 .kr(1)
25057 .sr(1)
25058 .m(1)
25059 .n(8)
25060 .k(k)
25061 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25062 }
25063 }
25064
25065 TEST(F32_GEMM_1X8__SSE_LOAD1, k_gt_1_strided_a) {
25066 TEST_REQUIRES_X86_SSE;
25067 for (size_t k = 2; k < 10; k++) {
25068 GemmMicrokernelTester()
25069 .mr(1)
25070 .nr(8)
25071 .kr(1)
25072 .sr(1)
25073 .m(1)
25074 .n(8)
25075 .k(k)
25076 .a_stride(11)
25077 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25078 }
25079 }
25080
25081 TEST(F32_GEMM_1X8__SSE_LOAD1, k_gt_1_subtile) {
25082 TEST_REQUIRES_X86_SSE;
25083 for (size_t k = 2; k < 10; k++) {
25084 for (uint32_t m = 1; m <= 1; m++) {
25085 for (uint32_t n = 1; n <= 8; n++) {
25086 GemmMicrokernelTester()
25087 .mr(1)
25088 .nr(8)
25089 .kr(1)
25090 .sr(1)
25091 .m(m)
25092 .n(n)
25093 .k(k)
25094 .iterations(1)
25095 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25096 }
25097 }
25098 }
25099 }
25100
25101 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8) {
25102 TEST_REQUIRES_X86_SSE;
25103 for (uint32_t n = 9; n < 16; n++) {
25104 for (size_t k = 1; k <= 5; k += 2) {
25105 GemmMicrokernelTester()
25106 .mr(1)
25107 .nr(8)
25108 .kr(1)
25109 .sr(1)
25110 .m(1)
25111 .n(8)
25112 .k(k)
25113 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25114 }
25115 }
25116 }
25117
25118 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8_strided_cn) {
25119 TEST_REQUIRES_X86_SSE;
25120 for (uint32_t n = 9; n < 16; n++) {
25121 for (size_t k = 1; k <= 5; k += 2) {
25122 GemmMicrokernelTester()
25123 .mr(1)
25124 .nr(8)
25125 .kr(1)
25126 .sr(1)
25127 .m(1)
25128 .n(8)
25129 .k(k)
25130 .cn_stride(11)
25131 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25132 }
25133 }
25134 }
25135
25136 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8_strided_a) {
25137 TEST_REQUIRES_X86_SSE;
25138 for (uint32_t n = 9; n < 16; n++) {
25139 for (size_t k = 1; k <= 5; k += 2) {
25140 GemmMicrokernelTester()
25141 .mr(1)
25142 .nr(8)
25143 .kr(1)
25144 .sr(1)
25145 .m(1)
25146 .n(n)
25147 .k(k)
25148 .a_stride(7)
25149 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25150 }
25151 }
25152 }
25153
25154 TEST(F32_GEMM_1X8__SSE_LOAD1, n_gt_8_subtile) {
25155 TEST_REQUIRES_X86_SSE;
25156 for (uint32_t n = 9; n < 16; n++) {
25157 for (size_t k = 1; k <= 5; k += 2) {
25158 for (uint32_t m = 1; m <= 1; m++) {
25159 GemmMicrokernelTester()
25160 .mr(1)
25161 .nr(8)
25162 .kr(1)
25163 .sr(1)
25164 .m(m)
25165 .n(n)
25166 .k(k)
25167 .iterations(1)
25168 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25169 }
25170 }
25171 }
25172 }
25173
25174 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8) {
25175 TEST_REQUIRES_X86_SSE;
25176 for (uint32_t n = 16; n <= 24; n += 8) {
25177 for (size_t k = 1; k <= 5; k += 2) {
25178 GemmMicrokernelTester()
25179 .mr(1)
25180 .nr(8)
25181 .kr(1)
25182 .sr(1)
25183 .m(1)
25184 .n(8)
25185 .k(k)
25186 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25187 }
25188 }
25189 }
25190
25191 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8_strided_cn) {
25192 TEST_REQUIRES_X86_SSE;
25193 for (uint32_t n = 16; n <= 24; n += 8) {
25194 for (size_t k = 1; k <= 5; k += 2) {
25195 GemmMicrokernelTester()
25196 .mr(1)
25197 .nr(8)
25198 .kr(1)
25199 .sr(1)
25200 .m(1)
25201 .n(n)
25202 .k(k)
25203 .cn_stride(11)
25204 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25205 }
25206 }
25207 }
25208
25209 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8_strided_a) {
25210 TEST_REQUIRES_X86_SSE;
25211 for (uint32_t n = 16; n <= 24; n += 8) {
25212 for (size_t k = 1; k <= 5; k += 2) {
25213 GemmMicrokernelTester()
25214 .mr(1)
25215 .nr(8)
25216 .kr(1)
25217 .sr(1)
25218 .m(1)
25219 .n(n)
25220 .k(k)
25221 .a_stride(7)
25222 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25223 }
25224 }
25225 }
25226
25227 TEST(F32_GEMM_1X8__SSE_LOAD1, n_div_8_subtile) {
25228 TEST_REQUIRES_X86_SSE;
25229 for (uint32_t n = 16; n <= 24; n += 8) {
25230 for (size_t k = 1; k <= 5; k += 2) {
25231 for (uint32_t m = 1; m <= 1; m++) {
25232 GemmMicrokernelTester()
25233 .mr(1)
25234 .nr(8)
25235 .kr(1)
25236 .sr(1)
25237 .m(m)
25238 .n(n)
25239 .k(k)
25240 .iterations(1)
25241 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25242 }
25243 }
25244 }
25245 }
25246
25247 TEST(F32_GEMM_1X8__SSE_LOAD1, strided_cm_subtile) {
25248 TEST_REQUIRES_X86_SSE;
25249 for (size_t k = 1; k <= 5; k += 2) {
25250 for (uint32_t m = 1; m <= 1; m++) {
25251 for (uint32_t n = 1; n <= 8; n++) {
25252 GemmMicrokernelTester()
25253 .mr(1)
25254 .nr(8)
25255 .kr(1)
25256 .sr(1)
25257 .m(m)
25258 .n(n)
25259 .k(k)
25260 .cm_stride(11)
25261 .iterations(1)
25262 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25263 }
25264 }
25265 }
25266 }
25267
25268 TEST(F32_GEMM_1X8__SSE_LOAD1, qmin) {
25269 TEST_REQUIRES_X86_SSE;
25270 GemmMicrokernelTester()
25271 .mr(1)
25272 .nr(8)
25273 .kr(1)
25274 .sr(1)
25275 .m(1)
25276 .n(8)
25277 .k(1)
25278 .qmin(128)
25279 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25280 }
25281
25282 TEST(F32_GEMM_1X8__SSE_LOAD1, qmax) {
25283 TEST_REQUIRES_X86_SSE;
25284 GemmMicrokernelTester()
25285 .mr(1)
25286 .nr(8)
25287 .kr(1)
25288 .sr(1)
25289 .m(1)
25290 .n(8)
25291 .k(1)
25292 .qmax(128)
25293 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25294 }
25295
25296 TEST(F32_GEMM_1X8__SSE_LOAD1, strided_cm) {
25297 TEST_REQUIRES_X86_SSE;
25298 GemmMicrokernelTester()
25299 .mr(1)
25300 .nr(8)
25301 .kr(1)
25302 .sr(1)
25303 .m(1)
25304 .n(8)
25305 .k(1)
25306 .cm_stride(11)
25307 .Test(xnn_f32_gemm_ukernel_1x8__sse_load1);
25308 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025309#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070025310
25311
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025312#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070025313 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1) {
25314 TEST_REQUIRES_X86_SSE;
25315 GemmMicrokernelTester()
25316 .mr(4)
25317 .nr(8)
25318 .kr(1)
25319 .sr(1)
25320 .m(4)
25321 .n(8)
25322 .k(1)
25323 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25324 }
25325
25326 TEST(F32_GEMM_4X8__SSE_LOAD1, strided_cn) {
25327 TEST_REQUIRES_X86_SSE;
25328 GemmMicrokernelTester()
25329 .mr(4)
25330 .nr(8)
25331 .kr(1)
25332 .sr(1)
25333 .m(4)
25334 .n(8)
25335 .k(1)
25336 .cn_stride(11)
25337 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25338 }
25339
25340 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_strided_a) {
25341 TEST_REQUIRES_X86_SSE;
25342 GemmMicrokernelTester()
25343 .mr(4)
25344 .nr(8)
25345 .kr(1)
25346 .sr(1)
25347 .m(4)
25348 .n(8)
25349 .k(1)
25350 .a_stride(3)
25351 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25352 }
25353
25354 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_subtile) {
25355 TEST_REQUIRES_X86_SSE;
25356 for (uint32_t m = 1; m <= 4; m++) {
25357 for (uint32_t n = 1; n <= 8; n++) {
25358 GemmMicrokernelTester()
25359 .mr(4)
25360 .nr(8)
25361 .kr(1)
25362 .sr(1)
25363 .m(m)
25364 .n(n)
25365 .k(1)
25366 .iterations(1)
25367 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25368 }
25369 }
25370 }
25371
25372 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_subtile_m) {
25373 TEST_REQUIRES_X86_SSE;
25374 for (uint32_t m = 1; m <= 4; m++) {
25375 GemmMicrokernelTester()
25376 .mr(4)
25377 .nr(8)
25378 .kr(1)
25379 .sr(1)
25380 .m(m)
25381 .n(8)
25382 .k(1)
25383 .iterations(1)
25384 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25385 }
25386 }
25387
25388 TEST(F32_GEMM_4X8__SSE_LOAD1, k_eq_1_subtile_n) {
25389 TEST_REQUIRES_X86_SSE;
25390 for (uint32_t n = 1; n <= 8; n++) {
25391 GemmMicrokernelTester()
25392 .mr(4)
25393 .nr(8)
25394 .kr(1)
25395 .sr(1)
25396 .m(4)
25397 .n(n)
25398 .k(1)
25399 .iterations(1)
25400 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25401 }
25402 }
25403
25404 TEST(F32_GEMM_4X8__SSE_LOAD1, k_gt_1) {
25405 TEST_REQUIRES_X86_SSE;
25406 for (size_t k = 2; k < 10; k++) {
25407 GemmMicrokernelTester()
25408 .mr(4)
25409 .nr(8)
25410 .kr(1)
25411 .sr(1)
25412 .m(4)
25413 .n(8)
25414 .k(k)
25415 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25416 }
25417 }
25418
25419 TEST(F32_GEMM_4X8__SSE_LOAD1, k_gt_1_strided_a) {
25420 TEST_REQUIRES_X86_SSE;
25421 for (size_t k = 2; k < 10; k++) {
25422 GemmMicrokernelTester()
25423 .mr(4)
25424 .nr(8)
25425 .kr(1)
25426 .sr(1)
25427 .m(4)
25428 .n(8)
25429 .k(k)
25430 .a_stride(11)
25431 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25432 }
25433 }
25434
25435 TEST(F32_GEMM_4X8__SSE_LOAD1, k_gt_1_subtile) {
25436 TEST_REQUIRES_X86_SSE;
25437 for (size_t k = 2; k < 10; k++) {
25438 for (uint32_t m = 1; m <= 4; m++) {
25439 for (uint32_t n = 1; n <= 8; n++) {
25440 GemmMicrokernelTester()
25441 .mr(4)
25442 .nr(8)
25443 .kr(1)
25444 .sr(1)
25445 .m(m)
25446 .n(n)
25447 .k(k)
25448 .iterations(1)
25449 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25450 }
25451 }
25452 }
25453 }
25454
25455 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8) {
25456 TEST_REQUIRES_X86_SSE;
25457 for (uint32_t n = 9; n < 16; n++) {
25458 for (size_t k = 1; k <= 5; k += 2) {
25459 GemmMicrokernelTester()
25460 .mr(4)
25461 .nr(8)
25462 .kr(1)
25463 .sr(1)
25464 .m(4)
25465 .n(8)
25466 .k(k)
25467 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25468 }
25469 }
25470 }
25471
25472 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8_strided_cn) {
25473 TEST_REQUIRES_X86_SSE;
25474 for (uint32_t n = 9; n < 16; n++) {
25475 for (size_t k = 1; k <= 5; k += 2) {
25476 GemmMicrokernelTester()
25477 .mr(4)
25478 .nr(8)
25479 .kr(1)
25480 .sr(1)
25481 .m(4)
25482 .n(8)
25483 .k(k)
25484 .cn_stride(11)
25485 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25486 }
25487 }
25488 }
25489
25490 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8_strided_a) {
25491 TEST_REQUIRES_X86_SSE;
25492 for (uint32_t n = 9; n < 16; n++) {
25493 for (size_t k = 1; k <= 5; k += 2) {
25494 GemmMicrokernelTester()
25495 .mr(4)
25496 .nr(8)
25497 .kr(1)
25498 .sr(1)
25499 .m(4)
25500 .n(n)
25501 .k(k)
25502 .a_stride(7)
25503 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25504 }
25505 }
25506 }
25507
25508 TEST(F32_GEMM_4X8__SSE_LOAD1, n_gt_8_subtile) {
25509 TEST_REQUIRES_X86_SSE;
25510 for (uint32_t n = 9; n < 16; n++) {
25511 for (size_t k = 1; k <= 5; k += 2) {
25512 for (uint32_t m = 1; m <= 4; m++) {
25513 GemmMicrokernelTester()
25514 .mr(4)
25515 .nr(8)
25516 .kr(1)
25517 .sr(1)
25518 .m(m)
25519 .n(n)
25520 .k(k)
25521 .iterations(1)
25522 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25523 }
25524 }
25525 }
25526 }
25527
25528 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8) {
25529 TEST_REQUIRES_X86_SSE;
25530 for (uint32_t n = 16; n <= 24; n += 8) {
25531 for (size_t k = 1; k <= 5; k += 2) {
25532 GemmMicrokernelTester()
25533 .mr(4)
25534 .nr(8)
25535 .kr(1)
25536 .sr(1)
25537 .m(4)
25538 .n(8)
25539 .k(k)
25540 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25541 }
25542 }
25543 }
25544
25545 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8_strided_cn) {
25546 TEST_REQUIRES_X86_SSE;
25547 for (uint32_t n = 16; n <= 24; n += 8) {
25548 for (size_t k = 1; k <= 5; k += 2) {
25549 GemmMicrokernelTester()
25550 .mr(4)
25551 .nr(8)
25552 .kr(1)
25553 .sr(1)
25554 .m(4)
25555 .n(n)
25556 .k(k)
25557 .cn_stride(11)
25558 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25559 }
25560 }
25561 }
25562
25563 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8_strided_a) {
25564 TEST_REQUIRES_X86_SSE;
25565 for (uint32_t n = 16; n <= 24; n += 8) {
25566 for (size_t k = 1; k <= 5; k += 2) {
25567 GemmMicrokernelTester()
25568 .mr(4)
25569 .nr(8)
25570 .kr(1)
25571 .sr(1)
25572 .m(4)
25573 .n(n)
25574 .k(k)
25575 .a_stride(7)
25576 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25577 }
25578 }
25579 }
25580
25581 TEST(F32_GEMM_4X8__SSE_LOAD1, n_div_8_subtile) {
25582 TEST_REQUIRES_X86_SSE;
25583 for (uint32_t n = 16; n <= 24; n += 8) {
25584 for (size_t k = 1; k <= 5; k += 2) {
25585 for (uint32_t m = 1; m <= 4; m++) {
25586 GemmMicrokernelTester()
25587 .mr(4)
25588 .nr(8)
25589 .kr(1)
25590 .sr(1)
25591 .m(m)
25592 .n(n)
25593 .k(k)
25594 .iterations(1)
25595 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25596 }
25597 }
25598 }
25599 }
25600
25601 TEST(F32_GEMM_4X8__SSE_LOAD1, strided_cm_subtile) {
25602 TEST_REQUIRES_X86_SSE;
25603 for (size_t k = 1; k <= 5; k += 2) {
25604 for (uint32_t m = 1; m <= 4; m++) {
25605 for (uint32_t n = 1; n <= 8; n++) {
25606 GemmMicrokernelTester()
25607 .mr(4)
25608 .nr(8)
25609 .kr(1)
25610 .sr(1)
25611 .m(m)
25612 .n(n)
25613 .k(k)
25614 .cm_stride(11)
25615 .iterations(1)
25616 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25617 }
25618 }
25619 }
25620 }
25621
25622 TEST(F32_GEMM_4X8__SSE_LOAD1, qmin) {
25623 TEST_REQUIRES_X86_SSE;
25624 GemmMicrokernelTester()
25625 .mr(4)
25626 .nr(8)
25627 .kr(1)
25628 .sr(1)
25629 .m(4)
25630 .n(8)
25631 .k(1)
25632 .qmin(128)
25633 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25634 }
25635
25636 TEST(F32_GEMM_4X8__SSE_LOAD1, qmax) {
25637 TEST_REQUIRES_X86_SSE;
25638 GemmMicrokernelTester()
25639 .mr(4)
25640 .nr(8)
25641 .kr(1)
25642 .sr(1)
25643 .m(4)
25644 .n(8)
25645 .k(1)
25646 .qmax(128)
25647 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25648 }
25649
25650 TEST(F32_GEMM_4X8__SSE_LOAD1, strided_cm) {
25651 TEST_REQUIRES_X86_SSE;
25652 GemmMicrokernelTester()
25653 .mr(4)
25654 .nr(8)
25655 .kr(1)
25656 .sr(1)
25657 .m(4)
25658 .n(8)
25659 .k(1)
25660 .cm_stride(11)
25661 .Test(xnn_f32_gemm_ukernel_4x8__sse_load1);
25662 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025663#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070025664
25665
Marat Dukhan1dadbf72019-10-01 10:46:20 -070025666#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070025667 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4) {
25668 TEST_REQUIRES_X86_SSE;
25669 GemmMicrokernelTester()
25670 .mr(1)
25671 .nr(8)
25672 .kr(1)
25673 .sr(1)
25674 .m(1)
25675 .n(8)
25676 .k(4)
25677 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25678 }
25679
25680 TEST(F32_GEMM_1X8__SSE_DUP, strided_cn) {
25681 TEST_REQUIRES_X86_SSE;
25682 GemmMicrokernelTester()
25683 .mr(1)
25684 .nr(8)
25685 .kr(1)
25686 .sr(1)
25687 .m(1)
25688 .n(8)
25689 .k(4)
25690 .cn_stride(11)
25691 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25692 }
25693
25694 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_strided_a) {
25695 TEST_REQUIRES_X86_SSE;
25696 GemmMicrokernelTester()
25697 .mr(1)
25698 .nr(8)
25699 .kr(1)
25700 .sr(1)
25701 .m(1)
25702 .n(8)
25703 .k(4)
25704 .a_stride(7)
25705 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25706 }
25707
25708 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_subtile) {
25709 TEST_REQUIRES_X86_SSE;
25710 for (uint32_t m = 1; m <= 1; m++) {
25711 for (uint32_t n = 1; n <= 8; n++) {
25712 GemmMicrokernelTester()
25713 .mr(1)
25714 .nr(8)
25715 .kr(1)
25716 .sr(1)
25717 .m(m)
25718 .n(n)
25719 .k(4)
25720 .iterations(1)
25721 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25722 }
25723 }
25724 }
25725
25726 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_subtile_m) {
25727 TEST_REQUIRES_X86_SSE;
25728 for (uint32_t m = 1; m <= 1; m++) {
25729 GemmMicrokernelTester()
25730 .mr(1)
25731 .nr(8)
25732 .kr(1)
25733 .sr(1)
25734 .m(m)
25735 .n(8)
25736 .k(4)
25737 .iterations(1)
25738 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25739 }
25740 }
25741
25742 TEST(F32_GEMM_1X8__SSE_DUP, k_eq_4_subtile_n) {
25743 TEST_REQUIRES_X86_SSE;
25744 for (uint32_t n = 1; n <= 8; n++) {
25745 GemmMicrokernelTester()
25746 .mr(1)
25747 .nr(8)
25748 .kr(1)
25749 .sr(1)
25750 .m(1)
25751 .n(n)
25752 .k(4)
25753 .iterations(1)
25754 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25755 }
25756 }
25757
25758 TEST(F32_GEMM_1X8__SSE_DUP, k_lt_4) {
25759 TEST_REQUIRES_X86_SSE;
25760 for (size_t k = 1; k < 4; k++) {
25761 GemmMicrokernelTester()
25762 .mr(1)
25763 .nr(8)
25764 .kr(1)
25765 .sr(1)
25766 .m(1)
25767 .n(8)
25768 .k(k)
25769 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25770 }
25771 }
25772
25773 TEST(F32_GEMM_1X8__SSE_DUP, k_lt_4_strided_a) {
25774 TEST_REQUIRES_X86_SSE;
25775 for (size_t k = 1; k < 4; k++) {
25776 GemmMicrokernelTester()
25777 .mr(1)
25778 .nr(8)
25779 .kr(1)
25780 .sr(1)
25781 .m(1)
25782 .n(8)
25783 .k(k)
25784 .a_stride(7)
25785 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25786 }
25787 }
25788
25789 TEST(F32_GEMM_1X8__SSE_DUP, k_lt_4_subtile) {
25790 TEST_REQUIRES_X86_SSE;
25791 for (size_t k = 1; k < 4; k++) {
25792 for (uint32_t m = 1; m <= 1; m++) {
25793 for (uint32_t n = 1; n <= 8; n++) {
25794 GemmMicrokernelTester()
25795 .mr(1)
25796 .nr(8)
25797 .kr(1)
25798 .sr(1)
25799 .m(m)
25800 .n(n)
25801 .k(k)
25802 .iterations(1)
25803 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25804 }
25805 }
25806 }
25807 }
25808
25809 TEST(F32_GEMM_1X8__SSE_DUP, k_gt_4) {
25810 TEST_REQUIRES_X86_SSE;
25811 for (size_t k = 5; k < 8; k++) {
25812 GemmMicrokernelTester()
25813 .mr(1)
25814 .nr(8)
25815 .kr(1)
25816 .sr(1)
25817 .m(1)
25818 .n(8)
25819 .k(k)
25820 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25821 }
25822 }
25823
25824 TEST(F32_GEMM_1X8__SSE_DUP, k_gt_4_strided_a) {
25825 TEST_REQUIRES_X86_SSE;
25826 for (size_t k = 5; k < 8; k++) {
25827 GemmMicrokernelTester()
25828 .mr(1)
25829 .nr(8)
25830 .kr(1)
25831 .sr(1)
25832 .m(1)
25833 .n(8)
25834 .k(k)
25835 .a_stride(11)
25836 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25837 }
25838 }
25839
25840 TEST(F32_GEMM_1X8__SSE_DUP, k_gt_4_subtile) {
25841 TEST_REQUIRES_X86_SSE;
25842 for (size_t k = 5; k < 8; k++) {
25843 for (uint32_t m = 1; m <= 1; m++) {
25844 for (uint32_t n = 1; n <= 8; n++) {
25845 GemmMicrokernelTester()
25846 .mr(1)
25847 .nr(8)
25848 .kr(1)
25849 .sr(1)
25850 .m(m)
25851 .n(n)
25852 .k(k)
25853 .iterations(1)
25854 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25855 }
25856 }
25857 }
25858 }
25859
25860 TEST(F32_GEMM_1X8__SSE_DUP, k_div_4) {
25861 TEST_REQUIRES_X86_SSE;
25862 for (size_t k = 8; k <= 40; k += 4) {
25863 GemmMicrokernelTester()
25864 .mr(1)
25865 .nr(8)
25866 .kr(1)
25867 .sr(1)
25868 .m(1)
25869 .n(8)
25870 .k(k)
25871 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25872 }
25873 }
25874
25875 TEST(F32_GEMM_1X8__SSE_DUP, k_div_4_strided_a) {
25876 TEST_REQUIRES_X86_SSE;
25877 for (size_t k = 8; k <= 40; k += 4) {
25878 GemmMicrokernelTester()
25879 .mr(1)
25880 .nr(8)
25881 .kr(1)
25882 .sr(1)
25883 .m(1)
25884 .n(8)
25885 .k(k)
25886 .a_stride(43)
25887 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25888 }
25889 }
25890
25891 TEST(F32_GEMM_1X8__SSE_DUP, k_div_4_subtile) {
25892 TEST_REQUIRES_X86_SSE;
25893 for (size_t k = 8; k <= 40; k += 4) {
25894 for (uint32_t m = 1; m <= 1; m++) {
25895 for (uint32_t n = 1; n <= 8; n++) {
25896 GemmMicrokernelTester()
25897 .mr(1)
25898 .nr(8)
25899 .kr(1)
25900 .sr(1)
25901 .m(m)
25902 .n(n)
25903 .k(k)
25904 .iterations(1)
25905 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25906 }
25907 }
25908 }
25909 }
25910
25911 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8) {
25912 TEST_REQUIRES_X86_SSE;
25913 for (uint32_t n = 9; n < 16; n++) {
25914 for (size_t k = 1; k <= 20; k += 5) {
25915 GemmMicrokernelTester()
25916 .mr(1)
25917 .nr(8)
25918 .kr(1)
25919 .sr(1)
25920 .m(1)
25921 .n(8)
25922 .k(k)
25923 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25924 }
25925 }
25926 }
25927
25928 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8_strided_cn) {
25929 TEST_REQUIRES_X86_SSE;
25930 for (uint32_t n = 9; n < 16; n++) {
25931 for (size_t k = 1; k <= 20; k += 5) {
25932 GemmMicrokernelTester()
25933 .mr(1)
25934 .nr(8)
25935 .kr(1)
25936 .sr(1)
25937 .m(1)
25938 .n(8)
25939 .k(k)
25940 .cn_stride(11)
25941 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25942 }
25943 }
25944 }
25945
25946 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8_strided_a) {
25947 TEST_REQUIRES_X86_SSE;
25948 for (uint32_t n = 9; n < 16; n++) {
25949 for (size_t k = 1; k <= 20; k += 5) {
25950 GemmMicrokernelTester()
25951 .mr(1)
25952 .nr(8)
25953 .kr(1)
25954 .sr(1)
25955 .m(1)
25956 .n(n)
25957 .k(k)
25958 .a_stride(23)
25959 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25960 }
25961 }
25962 }
25963
25964 TEST(F32_GEMM_1X8__SSE_DUP, n_gt_8_subtile) {
25965 TEST_REQUIRES_X86_SSE;
25966 for (uint32_t n = 9; n < 16; n++) {
25967 for (size_t k = 1; k <= 20; k += 5) {
25968 for (uint32_t m = 1; m <= 1; m++) {
25969 GemmMicrokernelTester()
25970 .mr(1)
25971 .nr(8)
25972 .kr(1)
25973 .sr(1)
25974 .m(m)
25975 .n(n)
25976 .k(k)
25977 .iterations(1)
25978 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25979 }
25980 }
25981 }
25982 }
25983
25984 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8) {
25985 TEST_REQUIRES_X86_SSE;
25986 for (uint32_t n = 16; n <= 24; n += 8) {
25987 for (size_t k = 1; k <= 20; k += 5) {
25988 GemmMicrokernelTester()
25989 .mr(1)
25990 .nr(8)
25991 .kr(1)
25992 .sr(1)
25993 .m(1)
25994 .n(8)
25995 .k(k)
25996 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
25997 }
25998 }
25999 }
26000
26001 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8_strided_cn) {
26002 TEST_REQUIRES_X86_SSE;
26003 for (uint32_t n = 16; n <= 24; n += 8) {
26004 for (size_t k = 1; k <= 20; k += 5) {
26005 GemmMicrokernelTester()
26006 .mr(1)
26007 .nr(8)
26008 .kr(1)
26009 .sr(1)
26010 .m(1)
26011 .n(n)
26012 .k(k)
26013 .cn_stride(11)
26014 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26015 }
26016 }
26017 }
26018
26019 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8_strided_a) {
26020 TEST_REQUIRES_X86_SSE;
26021 for (uint32_t n = 16; n <= 24; n += 8) {
26022 for (size_t k = 1; k <= 20; k += 5) {
26023 GemmMicrokernelTester()
26024 .mr(1)
26025 .nr(8)
26026 .kr(1)
26027 .sr(1)
26028 .m(1)
26029 .n(n)
26030 .k(k)
26031 .a_stride(23)
26032 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26033 }
26034 }
26035 }
26036
26037 TEST(F32_GEMM_1X8__SSE_DUP, n_div_8_subtile) {
26038 TEST_REQUIRES_X86_SSE;
26039 for (uint32_t n = 16; n <= 24; n += 8) {
26040 for (size_t k = 1; k <= 20; k += 5) {
26041 for (uint32_t m = 1; m <= 1; m++) {
26042 GemmMicrokernelTester()
26043 .mr(1)
26044 .nr(8)
26045 .kr(1)
26046 .sr(1)
26047 .m(m)
26048 .n(n)
26049 .k(k)
26050 .iterations(1)
26051 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26052 }
26053 }
26054 }
26055 }
26056
26057 TEST(F32_GEMM_1X8__SSE_DUP, strided_cm_subtile) {
26058 TEST_REQUIRES_X86_SSE;
26059 for (size_t k = 1; k <= 20; k += 5) {
26060 for (uint32_t m = 1; m <= 1; m++) {
26061 for (uint32_t n = 1; n <= 8; n++) {
26062 GemmMicrokernelTester()
26063 .mr(1)
26064 .nr(8)
26065 .kr(1)
26066 .sr(1)
26067 .m(m)
26068 .n(n)
26069 .k(k)
26070 .cm_stride(11)
26071 .iterations(1)
26072 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26073 }
26074 }
26075 }
26076 }
26077
26078 TEST(F32_GEMM_1X8__SSE_DUP, qmin) {
26079 TEST_REQUIRES_X86_SSE;
26080 GemmMicrokernelTester()
26081 .mr(1)
26082 .nr(8)
26083 .kr(1)
26084 .sr(1)
26085 .m(1)
26086 .n(8)
26087 .k(4)
26088 .qmin(128)
26089 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26090 }
26091
26092 TEST(F32_GEMM_1X8__SSE_DUP, qmax) {
26093 TEST_REQUIRES_X86_SSE;
26094 GemmMicrokernelTester()
26095 .mr(1)
26096 .nr(8)
26097 .kr(1)
26098 .sr(1)
26099 .m(1)
26100 .n(8)
26101 .k(4)
26102 .qmax(128)
26103 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26104 }
26105
26106 TEST(F32_GEMM_1X8__SSE_DUP, strided_cm) {
26107 TEST_REQUIRES_X86_SSE;
26108 GemmMicrokernelTester()
26109 .mr(1)
26110 .nr(8)
26111 .kr(1)
26112 .sr(1)
26113 .m(1)
26114 .n(8)
26115 .k(4)
26116 .cm_stride(11)
26117 .Test(xnn_f32_gemm_ukernel_1x8__sse_dup);
26118 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070026119#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070026120
26121
Marat Dukhan1dadbf72019-10-01 10:46:20 -070026122#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070026123 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4) {
26124 TEST_REQUIRES_X86_SSE;
26125 GemmMicrokernelTester()
26126 .mr(4)
26127 .nr(8)
26128 .kr(1)
26129 .sr(1)
26130 .m(4)
26131 .n(8)
26132 .k(4)
26133 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26134 }
26135
26136 TEST(F32_GEMM_4X8__SSE_DUP, strided_cn) {
26137 TEST_REQUIRES_X86_SSE;
26138 GemmMicrokernelTester()
26139 .mr(4)
26140 .nr(8)
26141 .kr(1)
26142 .sr(1)
26143 .m(4)
26144 .n(8)
26145 .k(4)
26146 .cn_stride(11)
26147 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26148 }
26149
26150 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_strided_a) {
26151 TEST_REQUIRES_X86_SSE;
26152 GemmMicrokernelTester()
26153 .mr(4)
26154 .nr(8)
26155 .kr(1)
26156 .sr(1)
26157 .m(4)
26158 .n(8)
26159 .k(4)
26160 .a_stride(7)
26161 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26162 }
26163
26164 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_subtile) {
26165 TEST_REQUIRES_X86_SSE;
26166 for (uint32_t m = 1; m <= 4; m++) {
26167 for (uint32_t n = 1; n <= 8; n++) {
26168 GemmMicrokernelTester()
26169 .mr(4)
26170 .nr(8)
26171 .kr(1)
26172 .sr(1)
26173 .m(m)
26174 .n(n)
26175 .k(4)
26176 .iterations(1)
26177 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26178 }
26179 }
26180 }
26181
26182 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_subtile_m) {
26183 TEST_REQUIRES_X86_SSE;
26184 for (uint32_t m = 1; m <= 4; m++) {
26185 GemmMicrokernelTester()
26186 .mr(4)
26187 .nr(8)
26188 .kr(1)
26189 .sr(1)
26190 .m(m)
26191 .n(8)
26192 .k(4)
26193 .iterations(1)
26194 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26195 }
26196 }
26197
26198 TEST(F32_GEMM_4X8__SSE_DUP, k_eq_4_subtile_n) {
26199 TEST_REQUIRES_X86_SSE;
26200 for (uint32_t n = 1; n <= 8; n++) {
26201 GemmMicrokernelTester()
26202 .mr(4)
26203 .nr(8)
26204 .kr(1)
26205 .sr(1)
26206 .m(4)
26207 .n(n)
26208 .k(4)
26209 .iterations(1)
26210 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26211 }
26212 }
26213
26214 TEST(F32_GEMM_4X8__SSE_DUP, k_lt_4) {
26215 TEST_REQUIRES_X86_SSE;
26216 for (size_t k = 1; k < 4; k++) {
26217 GemmMicrokernelTester()
26218 .mr(4)
26219 .nr(8)
26220 .kr(1)
26221 .sr(1)
26222 .m(4)
26223 .n(8)
26224 .k(k)
26225 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26226 }
26227 }
26228
26229 TEST(F32_GEMM_4X8__SSE_DUP, k_lt_4_strided_a) {
26230 TEST_REQUIRES_X86_SSE;
26231 for (size_t k = 1; k < 4; k++) {
26232 GemmMicrokernelTester()
26233 .mr(4)
26234 .nr(8)
26235 .kr(1)
26236 .sr(1)
26237 .m(4)
26238 .n(8)
26239 .k(k)
26240 .a_stride(7)
26241 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26242 }
26243 }
26244
26245 TEST(F32_GEMM_4X8__SSE_DUP, k_lt_4_subtile) {
26246 TEST_REQUIRES_X86_SSE;
26247 for (size_t k = 1; k < 4; k++) {
26248 for (uint32_t m = 1; m <= 4; m++) {
26249 for (uint32_t n = 1; n <= 8; n++) {
26250 GemmMicrokernelTester()
26251 .mr(4)
26252 .nr(8)
26253 .kr(1)
26254 .sr(1)
26255 .m(m)
26256 .n(n)
26257 .k(k)
26258 .iterations(1)
26259 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26260 }
26261 }
26262 }
26263 }
26264
26265 TEST(F32_GEMM_4X8__SSE_DUP, k_gt_4) {
26266 TEST_REQUIRES_X86_SSE;
26267 for (size_t k = 5; k < 8; k++) {
26268 GemmMicrokernelTester()
26269 .mr(4)
26270 .nr(8)
26271 .kr(1)
26272 .sr(1)
26273 .m(4)
26274 .n(8)
26275 .k(k)
26276 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26277 }
26278 }
26279
26280 TEST(F32_GEMM_4X8__SSE_DUP, k_gt_4_strided_a) {
26281 TEST_REQUIRES_X86_SSE;
26282 for (size_t k = 5; k < 8; k++) {
26283 GemmMicrokernelTester()
26284 .mr(4)
26285 .nr(8)
26286 .kr(1)
26287 .sr(1)
26288 .m(4)
26289 .n(8)
26290 .k(k)
26291 .a_stride(11)
26292 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26293 }
26294 }
26295
26296 TEST(F32_GEMM_4X8__SSE_DUP, k_gt_4_subtile) {
26297 TEST_REQUIRES_X86_SSE;
26298 for (size_t k = 5; k < 8; k++) {
26299 for (uint32_t m = 1; m <= 4; m++) {
26300 for (uint32_t n = 1; n <= 8; n++) {
26301 GemmMicrokernelTester()
26302 .mr(4)
26303 .nr(8)
26304 .kr(1)
26305 .sr(1)
26306 .m(m)
26307 .n(n)
26308 .k(k)
26309 .iterations(1)
26310 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26311 }
26312 }
26313 }
26314 }
26315
26316 TEST(F32_GEMM_4X8__SSE_DUP, k_div_4) {
26317 TEST_REQUIRES_X86_SSE;
26318 for (size_t k = 8; k <= 40; k += 4) {
26319 GemmMicrokernelTester()
26320 .mr(4)
26321 .nr(8)
26322 .kr(1)
26323 .sr(1)
26324 .m(4)
26325 .n(8)
26326 .k(k)
26327 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26328 }
26329 }
26330
26331 TEST(F32_GEMM_4X8__SSE_DUP, k_div_4_strided_a) {
26332 TEST_REQUIRES_X86_SSE;
26333 for (size_t k = 8; k <= 40; k += 4) {
26334 GemmMicrokernelTester()
26335 .mr(4)
26336 .nr(8)
26337 .kr(1)
26338 .sr(1)
26339 .m(4)
26340 .n(8)
26341 .k(k)
26342 .a_stride(43)
26343 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26344 }
26345 }
26346
26347 TEST(F32_GEMM_4X8__SSE_DUP, k_div_4_subtile) {
26348 TEST_REQUIRES_X86_SSE;
26349 for (size_t k = 8; k <= 40; k += 4) {
26350 for (uint32_t m = 1; m <= 4; m++) {
26351 for (uint32_t n = 1; n <= 8; n++) {
26352 GemmMicrokernelTester()
26353 .mr(4)
26354 .nr(8)
26355 .kr(1)
26356 .sr(1)
26357 .m(m)
26358 .n(n)
26359 .k(k)
26360 .iterations(1)
26361 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26362 }
26363 }
26364 }
26365 }
26366
26367 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8) {
26368 TEST_REQUIRES_X86_SSE;
26369 for (uint32_t n = 9; n < 16; n++) {
26370 for (size_t k = 1; k <= 20; k += 5) {
26371 GemmMicrokernelTester()
26372 .mr(4)
26373 .nr(8)
26374 .kr(1)
26375 .sr(1)
26376 .m(4)
26377 .n(8)
26378 .k(k)
26379 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26380 }
26381 }
26382 }
26383
26384 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8_strided_cn) {
26385 TEST_REQUIRES_X86_SSE;
26386 for (uint32_t n = 9; n < 16; n++) {
26387 for (size_t k = 1; k <= 20; k += 5) {
26388 GemmMicrokernelTester()
26389 .mr(4)
26390 .nr(8)
26391 .kr(1)
26392 .sr(1)
26393 .m(4)
26394 .n(8)
26395 .k(k)
26396 .cn_stride(11)
26397 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26398 }
26399 }
26400 }
26401
26402 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8_strided_a) {
26403 TEST_REQUIRES_X86_SSE;
26404 for (uint32_t n = 9; n < 16; n++) {
26405 for (size_t k = 1; k <= 20; k += 5) {
26406 GemmMicrokernelTester()
26407 .mr(4)
26408 .nr(8)
26409 .kr(1)
26410 .sr(1)
26411 .m(4)
26412 .n(n)
26413 .k(k)
26414 .a_stride(23)
26415 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26416 }
26417 }
26418 }
26419
26420 TEST(F32_GEMM_4X8__SSE_DUP, n_gt_8_subtile) {
26421 TEST_REQUIRES_X86_SSE;
26422 for (uint32_t n = 9; n < 16; n++) {
26423 for (size_t k = 1; k <= 20; k += 5) {
26424 for (uint32_t m = 1; m <= 4; m++) {
26425 GemmMicrokernelTester()
26426 .mr(4)
26427 .nr(8)
26428 .kr(1)
26429 .sr(1)
26430 .m(m)
26431 .n(n)
26432 .k(k)
26433 .iterations(1)
26434 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26435 }
26436 }
26437 }
26438 }
26439
26440 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8) {
26441 TEST_REQUIRES_X86_SSE;
26442 for (uint32_t n = 16; n <= 24; n += 8) {
26443 for (size_t k = 1; k <= 20; k += 5) {
26444 GemmMicrokernelTester()
26445 .mr(4)
26446 .nr(8)
26447 .kr(1)
26448 .sr(1)
26449 .m(4)
26450 .n(8)
26451 .k(k)
26452 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26453 }
26454 }
26455 }
26456
26457 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8_strided_cn) {
26458 TEST_REQUIRES_X86_SSE;
26459 for (uint32_t n = 16; n <= 24; n += 8) {
26460 for (size_t k = 1; k <= 20; k += 5) {
26461 GemmMicrokernelTester()
26462 .mr(4)
26463 .nr(8)
26464 .kr(1)
26465 .sr(1)
26466 .m(4)
26467 .n(n)
26468 .k(k)
26469 .cn_stride(11)
26470 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26471 }
26472 }
26473 }
26474
26475 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8_strided_a) {
26476 TEST_REQUIRES_X86_SSE;
26477 for (uint32_t n = 16; n <= 24; n += 8) {
26478 for (size_t k = 1; k <= 20; k += 5) {
26479 GemmMicrokernelTester()
26480 .mr(4)
26481 .nr(8)
26482 .kr(1)
26483 .sr(1)
26484 .m(4)
26485 .n(n)
26486 .k(k)
26487 .a_stride(23)
26488 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26489 }
26490 }
26491 }
26492
26493 TEST(F32_GEMM_4X8__SSE_DUP, n_div_8_subtile) {
26494 TEST_REQUIRES_X86_SSE;
26495 for (uint32_t n = 16; n <= 24; n += 8) {
26496 for (size_t k = 1; k <= 20; k += 5) {
26497 for (uint32_t m = 1; m <= 4; m++) {
26498 GemmMicrokernelTester()
26499 .mr(4)
26500 .nr(8)
26501 .kr(1)
26502 .sr(1)
26503 .m(m)
26504 .n(n)
26505 .k(k)
26506 .iterations(1)
26507 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26508 }
26509 }
26510 }
26511 }
26512
26513 TEST(F32_GEMM_4X8__SSE_DUP, strided_cm_subtile) {
26514 TEST_REQUIRES_X86_SSE;
26515 for (size_t k = 1; k <= 20; k += 5) {
26516 for (uint32_t m = 1; m <= 4; m++) {
26517 for (uint32_t n = 1; n <= 8; n++) {
26518 GemmMicrokernelTester()
26519 .mr(4)
26520 .nr(8)
26521 .kr(1)
26522 .sr(1)
26523 .m(m)
26524 .n(n)
26525 .k(k)
26526 .cm_stride(11)
26527 .iterations(1)
26528 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26529 }
26530 }
26531 }
26532 }
26533
26534 TEST(F32_GEMM_4X8__SSE_DUP, qmin) {
26535 TEST_REQUIRES_X86_SSE;
26536 GemmMicrokernelTester()
26537 .mr(4)
26538 .nr(8)
26539 .kr(1)
26540 .sr(1)
26541 .m(4)
26542 .n(8)
26543 .k(4)
26544 .qmin(128)
26545 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26546 }
26547
26548 TEST(F32_GEMM_4X8__SSE_DUP, qmax) {
26549 TEST_REQUIRES_X86_SSE;
26550 GemmMicrokernelTester()
26551 .mr(4)
26552 .nr(8)
26553 .kr(1)
26554 .sr(1)
26555 .m(4)
26556 .n(8)
26557 .k(4)
26558 .qmax(128)
26559 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26560 }
26561
26562 TEST(F32_GEMM_4X8__SSE_DUP, strided_cm) {
26563 TEST_REQUIRES_X86_SSE;
26564 GemmMicrokernelTester()
26565 .mr(4)
26566 .nr(8)
26567 .kr(1)
26568 .sr(1)
26569 .m(4)
26570 .n(8)
26571 .k(4)
26572 .cm_stride(11)
26573 .Test(xnn_f32_gemm_ukernel_4x8__sse_dup);
26574 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070026575#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070026576
26577
Marat Dukhan1dadbf72019-10-01 10:46:20 -070026578#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070026579 TEST(F32_GEMM_1X8S4__SSE, k_eq_4) {
26580 TEST_REQUIRES_X86_SSE;
26581 GemmMicrokernelTester()
26582 .mr(1)
26583 .nr(8)
26584 .kr(1)
26585 .sr(4)
26586 .m(1)
26587 .n(8)
26588 .k(4)
26589 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26590 }
26591
26592 TEST(F32_GEMM_1X8S4__SSE, strided_cn) {
26593 TEST_REQUIRES_X86_SSE;
26594 GemmMicrokernelTester()
26595 .mr(1)
26596 .nr(8)
26597 .kr(1)
26598 .sr(4)
26599 .m(1)
26600 .n(8)
26601 .k(4)
26602 .cn_stride(11)
26603 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26604 }
26605
26606 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_strided_a) {
26607 TEST_REQUIRES_X86_SSE;
26608 GemmMicrokernelTester()
26609 .mr(1)
26610 .nr(8)
26611 .kr(1)
26612 .sr(4)
26613 .m(1)
26614 .n(8)
26615 .k(4)
26616 .a_stride(7)
26617 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26618 }
26619
26620 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_subtile) {
26621 TEST_REQUIRES_X86_SSE;
26622 for (uint32_t m = 1; m <= 1; m++) {
26623 for (uint32_t n = 1; n <= 8; n++) {
26624 GemmMicrokernelTester()
26625 .mr(1)
26626 .nr(8)
26627 .kr(1)
26628 .sr(4)
26629 .m(m)
26630 .n(n)
26631 .k(4)
26632 .iterations(1)
26633 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26634 }
26635 }
26636 }
26637
26638 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_subtile_m) {
26639 TEST_REQUIRES_X86_SSE;
26640 for (uint32_t m = 1; m <= 1; m++) {
26641 GemmMicrokernelTester()
26642 .mr(1)
26643 .nr(8)
26644 .kr(1)
26645 .sr(4)
26646 .m(m)
26647 .n(8)
26648 .k(4)
26649 .iterations(1)
26650 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26651 }
26652 }
26653
26654 TEST(F32_GEMM_1X8S4__SSE, k_eq_4_subtile_n) {
26655 TEST_REQUIRES_X86_SSE;
26656 for (uint32_t n = 1; n <= 8; n++) {
26657 GemmMicrokernelTester()
26658 .mr(1)
26659 .nr(8)
26660 .kr(1)
26661 .sr(4)
26662 .m(1)
26663 .n(n)
26664 .k(4)
26665 .iterations(1)
26666 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26667 }
26668 }
26669
26670 TEST(F32_GEMM_1X8S4__SSE, k_lt_4) {
26671 TEST_REQUIRES_X86_SSE;
26672 for (size_t k = 1; k < 4; k++) {
26673 GemmMicrokernelTester()
26674 .mr(1)
26675 .nr(8)
26676 .kr(1)
26677 .sr(4)
26678 .m(1)
26679 .n(8)
26680 .k(k)
26681 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26682 }
26683 }
26684
26685 TEST(F32_GEMM_1X8S4__SSE, k_lt_4_strided_a) {
26686 TEST_REQUIRES_X86_SSE;
26687 for (size_t k = 1; k < 4; k++) {
26688 GemmMicrokernelTester()
26689 .mr(1)
26690 .nr(8)
26691 .kr(1)
26692 .sr(4)
26693 .m(1)
26694 .n(8)
26695 .k(k)
26696 .a_stride(7)
26697 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26698 }
26699 }
26700
26701 TEST(F32_GEMM_1X8S4__SSE, k_lt_4_subtile) {
26702 TEST_REQUIRES_X86_SSE;
26703 for (size_t k = 1; k < 4; k++) {
26704 for (uint32_t m = 1; m <= 1; m++) {
26705 for (uint32_t n = 1; n <= 8; n++) {
26706 GemmMicrokernelTester()
26707 .mr(1)
26708 .nr(8)
26709 .kr(1)
26710 .sr(4)
26711 .m(m)
26712 .n(n)
26713 .k(k)
26714 .iterations(1)
26715 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26716 }
26717 }
26718 }
26719 }
26720
26721 TEST(F32_GEMM_1X8S4__SSE, k_gt_4) {
26722 TEST_REQUIRES_X86_SSE;
26723 for (size_t k = 5; k < 8; k++) {
26724 GemmMicrokernelTester()
26725 .mr(1)
26726 .nr(8)
26727 .kr(1)
26728 .sr(4)
26729 .m(1)
26730 .n(8)
26731 .k(k)
26732 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26733 }
26734 }
26735
26736 TEST(F32_GEMM_1X8S4__SSE, k_gt_4_strided_a) {
26737 TEST_REQUIRES_X86_SSE;
26738 for (size_t k = 5; k < 8; k++) {
26739 GemmMicrokernelTester()
26740 .mr(1)
26741 .nr(8)
26742 .kr(1)
26743 .sr(4)
26744 .m(1)
26745 .n(8)
26746 .k(k)
26747 .a_stride(11)
26748 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26749 }
26750 }
26751
26752 TEST(F32_GEMM_1X8S4__SSE, k_gt_4_subtile) {
26753 TEST_REQUIRES_X86_SSE;
26754 for (size_t k = 5; k < 8; k++) {
26755 for (uint32_t m = 1; m <= 1; m++) {
26756 for (uint32_t n = 1; n <= 8; n++) {
26757 GemmMicrokernelTester()
26758 .mr(1)
26759 .nr(8)
26760 .kr(1)
26761 .sr(4)
26762 .m(m)
26763 .n(n)
26764 .k(k)
26765 .iterations(1)
26766 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26767 }
26768 }
26769 }
26770 }
26771
26772 TEST(F32_GEMM_1X8S4__SSE, k_div_4) {
26773 TEST_REQUIRES_X86_SSE;
26774 for (size_t k = 8; k <= 40; k += 4) {
26775 GemmMicrokernelTester()
26776 .mr(1)
26777 .nr(8)
26778 .kr(1)
26779 .sr(4)
26780 .m(1)
26781 .n(8)
26782 .k(k)
26783 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26784 }
26785 }
26786
26787 TEST(F32_GEMM_1X8S4__SSE, k_div_4_strided_a) {
26788 TEST_REQUIRES_X86_SSE;
26789 for (size_t k = 8; k <= 40; k += 4) {
26790 GemmMicrokernelTester()
26791 .mr(1)
26792 .nr(8)
26793 .kr(1)
26794 .sr(4)
26795 .m(1)
26796 .n(8)
26797 .k(k)
26798 .a_stride(43)
26799 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26800 }
26801 }
26802
26803 TEST(F32_GEMM_1X8S4__SSE, k_div_4_subtile) {
26804 TEST_REQUIRES_X86_SSE;
26805 for (size_t k = 8; k <= 40; k += 4) {
26806 for (uint32_t m = 1; m <= 1; m++) {
26807 for (uint32_t n = 1; n <= 8; n++) {
26808 GemmMicrokernelTester()
26809 .mr(1)
26810 .nr(8)
26811 .kr(1)
26812 .sr(4)
26813 .m(m)
26814 .n(n)
26815 .k(k)
26816 .iterations(1)
26817 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26818 }
26819 }
26820 }
26821 }
26822
26823 TEST(F32_GEMM_1X8S4__SSE, n_gt_8) {
26824 TEST_REQUIRES_X86_SSE;
26825 for (uint32_t n = 9; n < 16; n++) {
26826 for (size_t k = 1; k <= 20; k += 5) {
26827 GemmMicrokernelTester()
26828 .mr(1)
26829 .nr(8)
26830 .kr(1)
26831 .sr(4)
26832 .m(1)
26833 .n(8)
26834 .k(k)
26835 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26836 }
26837 }
26838 }
26839
26840 TEST(F32_GEMM_1X8S4__SSE, n_gt_8_strided_cn) {
26841 TEST_REQUIRES_X86_SSE;
26842 for (uint32_t n = 9; n < 16; n++) {
26843 for (size_t k = 1; k <= 20; k += 5) {
26844 GemmMicrokernelTester()
26845 .mr(1)
26846 .nr(8)
26847 .kr(1)
26848 .sr(4)
26849 .m(1)
26850 .n(8)
26851 .k(k)
26852 .cn_stride(11)
26853 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26854 }
26855 }
26856 }
26857
26858 TEST(F32_GEMM_1X8S4__SSE, n_gt_8_strided_a) {
26859 TEST_REQUIRES_X86_SSE;
26860 for (uint32_t n = 9; n < 16; n++) {
26861 for (size_t k = 1; k <= 20; k += 5) {
26862 GemmMicrokernelTester()
26863 .mr(1)
26864 .nr(8)
26865 .kr(1)
26866 .sr(4)
26867 .m(1)
26868 .n(n)
26869 .k(k)
26870 .a_stride(23)
26871 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26872 }
26873 }
26874 }
26875
26876 TEST(F32_GEMM_1X8S4__SSE, n_gt_8_subtile) {
26877 TEST_REQUIRES_X86_SSE;
26878 for (uint32_t n = 9; n < 16; n++) {
26879 for (size_t k = 1; k <= 20; k += 5) {
26880 for (uint32_t m = 1; m <= 1; m++) {
26881 GemmMicrokernelTester()
26882 .mr(1)
26883 .nr(8)
26884 .kr(1)
26885 .sr(4)
26886 .m(m)
26887 .n(n)
26888 .k(k)
26889 .iterations(1)
26890 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26891 }
26892 }
26893 }
26894 }
26895
26896 TEST(F32_GEMM_1X8S4__SSE, n_div_8) {
26897 TEST_REQUIRES_X86_SSE;
26898 for (uint32_t n = 16; n <= 24; n += 8) {
26899 for (size_t k = 1; k <= 20; k += 5) {
26900 GemmMicrokernelTester()
26901 .mr(1)
26902 .nr(8)
26903 .kr(1)
26904 .sr(4)
26905 .m(1)
26906 .n(8)
26907 .k(k)
26908 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26909 }
26910 }
26911 }
26912
26913 TEST(F32_GEMM_1X8S4__SSE, n_div_8_strided_cn) {
26914 TEST_REQUIRES_X86_SSE;
26915 for (uint32_t n = 16; n <= 24; n += 8) {
26916 for (size_t k = 1; k <= 20; k += 5) {
26917 GemmMicrokernelTester()
26918 .mr(1)
26919 .nr(8)
26920 .kr(1)
26921 .sr(4)
26922 .m(1)
26923 .n(n)
26924 .k(k)
26925 .cn_stride(11)
26926 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26927 }
26928 }
26929 }
26930
26931 TEST(F32_GEMM_1X8S4__SSE, n_div_8_strided_a) {
26932 TEST_REQUIRES_X86_SSE;
26933 for (uint32_t n = 16; n <= 24; n += 8) {
26934 for (size_t k = 1; k <= 20; k += 5) {
26935 GemmMicrokernelTester()
26936 .mr(1)
26937 .nr(8)
26938 .kr(1)
26939 .sr(4)
26940 .m(1)
26941 .n(n)
26942 .k(k)
26943 .a_stride(23)
26944 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26945 }
26946 }
26947 }
26948
26949 TEST(F32_GEMM_1X8S4__SSE, n_div_8_subtile) {
26950 TEST_REQUIRES_X86_SSE;
26951 for (uint32_t n = 16; n <= 24; n += 8) {
26952 for (size_t k = 1; k <= 20; k += 5) {
26953 for (uint32_t m = 1; m <= 1; m++) {
26954 GemmMicrokernelTester()
26955 .mr(1)
26956 .nr(8)
26957 .kr(1)
26958 .sr(4)
26959 .m(m)
26960 .n(n)
26961 .k(k)
26962 .iterations(1)
26963 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26964 }
26965 }
26966 }
26967 }
26968
26969 TEST(F32_GEMM_1X8S4__SSE, strided_cm_subtile) {
26970 TEST_REQUIRES_X86_SSE;
26971 for (size_t k = 1; k <= 20; k += 5) {
26972 for (uint32_t m = 1; m <= 1; m++) {
26973 for (uint32_t n = 1; n <= 8; n++) {
26974 GemmMicrokernelTester()
26975 .mr(1)
26976 .nr(8)
26977 .kr(1)
26978 .sr(4)
26979 .m(m)
26980 .n(n)
26981 .k(k)
26982 .cm_stride(11)
26983 .iterations(1)
26984 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
26985 }
26986 }
26987 }
26988 }
26989
26990 TEST(F32_GEMM_1X8S4__SSE, qmin) {
26991 TEST_REQUIRES_X86_SSE;
26992 GemmMicrokernelTester()
26993 .mr(1)
26994 .nr(8)
26995 .kr(1)
26996 .sr(4)
26997 .m(1)
26998 .n(8)
26999 .k(4)
27000 .qmin(128)
27001 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
27002 }
27003
27004 TEST(F32_GEMM_1X8S4__SSE, qmax) {
27005 TEST_REQUIRES_X86_SSE;
27006 GemmMicrokernelTester()
27007 .mr(1)
27008 .nr(8)
27009 .kr(1)
27010 .sr(4)
27011 .m(1)
27012 .n(8)
27013 .k(4)
27014 .qmax(128)
27015 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
27016 }
27017
27018 TEST(F32_GEMM_1X8S4__SSE, strided_cm) {
27019 TEST_REQUIRES_X86_SSE;
27020 GemmMicrokernelTester()
27021 .mr(1)
27022 .nr(8)
27023 .kr(1)
27024 .sr(4)
27025 .m(1)
27026 .n(8)
27027 .k(4)
27028 .cm_stride(11)
27029 .Test(xnn_f32_gemm_ukernel_1x8s4__sse);
27030 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070027031#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070027032
27033
Marat Dukhan1dadbf72019-10-01 10:46:20 -070027034#if XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070027035 TEST(F32_GEMM_4X8S4__SSE, k_eq_4) {
27036 TEST_REQUIRES_X86_SSE;
27037 GemmMicrokernelTester()
27038 .mr(4)
27039 .nr(8)
27040 .kr(1)
27041 .sr(4)
27042 .m(4)
27043 .n(8)
27044 .k(4)
27045 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27046 }
27047
27048 TEST(F32_GEMM_4X8S4__SSE, strided_cn) {
27049 TEST_REQUIRES_X86_SSE;
27050 GemmMicrokernelTester()
27051 .mr(4)
27052 .nr(8)
27053 .kr(1)
27054 .sr(4)
27055 .m(4)
27056 .n(8)
27057 .k(4)
27058 .cn_stride(11)
27059 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27060 }
27061
27062 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_strided_a) {
27063 TEST_REQUIRES_X86_SSE;
27064 GemmMicrokernelTester()
27065 .mr(4)
27066 .nr(8)
27067 .kr(1)
27068 .sr(4)
27069 .m(4)
27070 .n(8)
27071 .k(4)
27072 .a_stride(7)
27073 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27074 }
27075
27076 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_subtile) {
27077 TEST_REQUIRES_X86_SSE;
27078 for (uint32_t m = 1; m <= 4; m++) {
27079 for (uint32_t n = 1; n <= 8; n++) {
27080 GemmMicrokernelTester()
27081 .mr(4)
27082 .nr(8)
27083 .kr(1)
27084 .sr(4)
27085 .m(m)
27086 .n(n)
27087 .k(4)
27088 .iterations(1)
27089 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27090 }
27091 }
27092 }
27093
27094 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_subtile_m) {
27095 TEST_REQUIRES_X86_SSE;
27096 for (uint32_t m = 1; m <= 4; m++) {
27097 GemmMicrokernelTester()
27098 .mr(4)
27099 .nr(8)
27100 .kr(1)
27101 .sr(4)
27102 .m(m)
27103 .n(8)
27104 .k(4)
27105 .iterations(1)
27106 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27107 }
27108 }
27109
27110 TEST(F32_GEMM_4X8S4__SSE, k_eq_4_subtile_n) {
27111 TEST_REQUIRES_X86_SSE;
27112 for (uint32_t n = 1; n <= 8; n++) {
27113 GemmMicrokernelTester()
27114 .mr(4)
27115 .nr(8)
27116 .kr(1)
27117 .sr(4)
27118 .m(4)
27119 .n(n)
27120 .k(4)
27121 .iterations(1)
27122 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27123 }
27124 }
27125
27126 TEST(F32_GEMM_4X8S4__SSE, k_lt_4) {
27127 TEST_REQUIRES_X86_SSE;
27128 for (size_t k = 1; k < 4; k++) {
27129 GemmMicrokernelTester()
27130 .mr(4)
27131 .nr(8)
27132 .kr(1)
27133 .sr(4)
27134 .m(4)
27135 .n(8)
27136 .k(k)
27137 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27138 }
27139 }
27140
27141 TEST(F32_GEMM_4X8S4__SSE, k_lt_4_strided_a) {
27142 TEST_REQUIRES_X86_SSE;
27143 for (size_t k = 1; k < 4; k++) {
27144 GemmMicrokernelTester()
27145 .mr(4)
27146 .nr(8)
27147 .kr(1)
27148 .sr(4)
27149 .m(4)
27150 .n(8)
27151 .k(k)
27152 .a_stride(7)
27153 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27154 }
27155 }
27156
27157 TEST(F32_GEMM_4X8S4__SSE, k_lt_4_subtile) {
27158 TEST_REQUIRES_X86_SSE;
27159 for (size_t k = 1; k < 4; k++) {
27160 for (uint32_t m = 1; m <= 4; m++) {
27161 for (uint32_t n = 1; n <= 8; n++) {
27162 GemmMicrokernelTester()
27163 .mr(4)
27164 .nr(8)
27165 .kr(1)
27166 .sr(4)
27167 .m(m)
27168 .n(n)
27169 .k(k)
27170 .iterations(1)
27171 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27172 }
27173 }
27174 }
27175 }
27176
27177 TEST(F32_GEMM_4X8S4__SSE, k_gt_4) {
27178 TEST_REQUIRES_X86_SSE;
27179 for (size_t k = 5; k < 8; k++) {
27180 GemmMicrokernelTester()
27181 .mr(4)
27182 .nr(8)
27183 .kr(1)
27184 .sr(4)
27185 .m(4)
27186 .n(8)
27187 .k(k)
27188 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27189 }
27190 }
27191
27192 TEST(F32_GEMM_4X8S4__SSE, k_gt_4_strided_a) {
27193 TEST_REQUIRES_X86_SSE;
27194 for (size_t k = 5; k < 8; k++) {
27195 GemmMicrokernelTester()
27196 .mr(4)
27197 .nr(8)
27198 .kr(1)
27199 .sr(4)
27200 .m(4)
27201 .n(8)
27202 .k(k)
27203 .a_stride(11)
27204 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27205 }
27206 }
27207
27208 TEST(F32_GEMM_4X8S4__SSE, k_gt_4_subtile) {
27209 TEST_REQUIRES_X86_SSE;
27210 for (size_t k = 5; k < 8; k++) {
27211 for (uint32_t m = 1; m <= 4; m++) {
27212 for (uint32_t n = 1; n <= 8; n++) {
27213 GemmMicrokernelTester()
27214 .mr(4)
27215 .nr(8)
27216 .kr(1)
27217 .sr(4)
27218 .m(m)
27219 .n(n)
27220 .k(k)
27221 .iterations(1)
27222 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27223 }
27224 }
27225 }
27226 }
27227
27228 TEST(F32_GEMM_4X8S4__SSE, k_div_4) {
27229 TEST_REQUIRES_X86_SSE;
27230 for (size_t k = 8; k <= 40; k += 4) {
27231 GemmMicrokernelTester()
27232 .mr(4)
27233 .nr(8)
27234 .kr(1)
27235 .sr(4)
27236 .m(4)
27237 .n(8)
27238 .k(k)
27239 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27240 }
27241 }
27242
27243 TEST(F32_GEMM_4X8S4__SSE, k_div_4_strided_a) {
27244 TEST_REQUIRES_X86_SSE;
27245 for (size_t k = 8; k <= 40; k += 4) {
27246 GemmMicrokernelTester()
27247 .mr(4)
27248 .nr(8)
27249 .kr(1)
27250 .sr(4)
27251 .m(4)
27252 .n(8)
27253 .k(k)
27254 .a_stride(43)
27255 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27256 }
27257 }
27258
27259 TEST(F32_GEMM_4X8S4__SSE, k_div_4_subtile) {
27260 TEST_REQUIRES_X86_SSE;
27261 for (size_t k = 8; k <= 40; k += 4) {
27262 for (uint32_t m = 1; m <= 4; m++) {
27263 for (uint32_t n = 1; n <= 8; n++) {
27264 GemmMicrokernelTester()
27265 .mr(4)
27266 .nr(8)
27267 .kr(1)
27268 .sr(4)
27269 .m(m)
27270 .n(n)
27271 .k(k)
27272 .iterations(1)
27273 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27274 }
27275 }
27276 }
27277 }
27278
27279 TEST(F32_GEMM_4X8S4__SSE, n_gt_8) {
27280 TEST_REQUIRES_X86_SSE;
27281 for (uint32_t n = 9; n < 16; n++) {
27282 for (size_t k = 1; k <= 20; k += 5) {
27283 GemmMicrokernelTester()
27284 .mr(4)
27285 .nr(8)
27286 .kr(1)
27287 .sr(4)
27288 .m(4)
27289 .n(8)
27290 .k(k)
27291 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27292 }
27293 }
27294 }
27295
27296 TEST(F32_GEMM_4X8S4__SSE, n_gt_8_strided_cn) {
27297 TEST_REQUIRES_X86_SSE;
27298 for (uint32_t n = 9; n < 16; n++) {
27299 for (size_t k = 1; k <= 20; k += 5) {
27300 GemmMicrokernelTester()
27301 .mr(4)
27302 .nr(8)
27303 .kr(1)
27304 .sr(4)
27305 .m(4)
27306 .n(8)
27307 .k(k)
27308 .cn_stride(11)
27309 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27310 }
27311 }
27312 }
27313
27314 TEST(F32_GEMM_4X8S4__SSE, n_gt_8_strided_a) {
27315 TEST_REQUIRES_X86_SSE;
27316 for (uint32_t n = 9; n < 16; n++) {
27317 for (size_t k = 1; k <= 20; k += 5) {
27318 GemmMicrokernelTester()
27319 .mr(4)
27320 .nr(8)
27321 .kr(1)
27322 .sr(4)
27323 .m(4)
27324 .n(n)
27325 .k(k)
27326 .a_stride(23)
27327 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27328 }
27329 }
27330 }
27331
27332 TEST(F32_GEMM_4X8S4__SSE, n_gt_8_subtile) {
27333 TEST_REQUIRES_X86_SSE;
27334 for (uint32_t n = 9; n < 16; n++) {
27335 for (size_t k = 1; k <= 20; k += 5) {
27336 for (uint32_t m = 1; m <= 4; m++) {
27337 GemmMicrokernelTester()
27338 .mr(4)
27339 .nr(8)
27340 .kr(1)
27341 .sr(4)
27342 .m(m)
27343 .n(n)
27344 .k(k)
27345 .iterations(1)
27346 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27347 }
27348 }
27349 }
27350 }
27351
27352 TEST(F32_GEMM_4X8S4__SSE, n_div_8) {
27353 TEST_REQUIRES_X86_SSE;
27354 for (uint32_t n = 16; n <= 24; n += 8) {
27355 for (size_t k = 1; k <= 20; k += 5) {
27356 GemmMicrokernelTester()
27357 .mr(4)
27358 .nr(8)
27359 .kr(1)
27360 .sr(4)
27361 .m(4)
27362 .n(8)
27363 .k(k)
27364 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27365 }
27366 }
27367 }
27368
27369 TEST(F32_GEMM_4X8S4__SSE, n_div_8_strided_cn) {
27370 TEST_REQUIRES_X86_SSE;
27371 for (uint32_t n = 16; n <= 24; n += 8) {
27372 for (size_t k = 1; k <= 20; k += 5) {
27373 GemmMicrokernelTester()
27374 .mr(4)
27375 .nr(8)
27376 .kr(1)
27377 .sr(4)
27378 .m(4)
27379 .n(n)
27380 .k(k)
27381 .cn_stride(11)
27382 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27383 }
27384 }
27385 }
27386
27387 TEST(F32_GEMM_4X8S4__SSE, n_div_8_strided_a) {
27388 TEST_REQUIRES_X86_SSE;
27389 for (uint32_t n = 16; n <= 24; n += 8) {
27390 for (size_t k = 1; k <= 20; k += 5) {
27391 GemmMicrokernelTester()
27392 .mr(4)
27393 .nr(8)
27394 .kr(1)
27395 .sr(4)
27396 .m(4)
27397 .n(n)
27398 .k(k)
27399 .a_stride(23)
27400 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27401 }
27402 }
27403 }
27404
27405 TEST(F32_GEMM_4X8S4__SSE, n_div_8_subtile) {
27406 TEST_REQUIRES_X86_SSE;
27407 for (uint32_t n = 16; n <= 24; n += 8) {
27408 for (size_t k = 1; k <= 20; k += 5) {
27409 for (uint32_t m = 1; m <= 4; m++) {
27410 GemmMicrokernelTester()
27411 .mr(4)
27412 .nr(8)
27413 .kr(1)
27414 .sr(4)
27415 .m(m)
27416 .n(n)
27417 .k(k)
27418 .iterations(1)
27419 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27420 }
27421 }
27422 }
27423 }
27424
27425 TEST(F32_GEMM_4X8S4__SSE, strided_cm_subtile) {
27426 TEST_REQUIRES_X86_SSE;
27427 for (size_t k = 1; k <= 20; k += 5) {
27428 for (uint32_t m = 1; m <= 4; m++) {
27429 for (uint32_t n = 1; n <= 8; n++) {
27430 GemmMicrokernelTester()
27431 .mr(4)
27432 .nr(8)
27433 .kr(1)
27434 .sr(4)
27435 .m(m)
27436 .n(n)
27437 .k(k)
27438 .cm_stride(11)
27439 .iterations(1)
27440 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27441 }
27442 }
27443 }
27444 }
27445
27446 TEST(F32_GEMM_4X8S4__SSE, qmin) {
27447 TEST_REQUIRES_X86_SSE;
27448 GemmMicrokernelTester()
27449 .mr(4)
27450 .nr(8)
27451 .kr(1)
27452 .sr(4)
27453 .m(4)
27454 .n(8)
27455 .k(4)
27456 .qmin(128)
27457 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27458 }
27459
27460 TEST(F32_GEMM_4X8S4__SSE, qmax) {
27461 TEST_REQUIRES_X86_SSE;
27462 GemmMicrokernelTester()
27463 .mr(4)
27464 .nr(8)
27465 .kr(1)
27466 .sr(4)
27467 .m(4)
27468 .n(8)
27469 .k(4)
27470 .qmax(128)
27471 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27472 }
27473
27474 TEST(F32_GEMM_4X8S4__SSE, strided_cm) {
27475 TEST_REQUIRES_X86_SSE;
27476 GemmMicrokernelTester()
27477 .mr(4)
27478 .nr(8)
27479 .kr(1)
27480 .sr(4)
27481 .m(4)
27482 .n(8)
27483 .k(4)
27484 .cm_stride(11)
27485 .Test(xnn_f32_gemm_ukernel_4x8s4__sse);
27486 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070027487#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
XNNPACK Teamb455b122019-09-27 18:10:33 -070027488
27489
Marat Dukhanfda12b82019-11-21 12:27:59 -080027490#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27491 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1) {
27492 TEST_REQUIRES_X86_AVX;
27493 GemmMicrokernelTester()
27494 .mr(1)
27495 .nr(8)
27496 .kr(1)
27497 .sr(1)
27498 .m(1)
27499 .n(8)
27500 .k(1)
27501 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27502 }
27503
27504 TEST(F32_GEMM_1X8__AVX_BROADCAST, strided_cn) {
27505 TEST_REQUIRES_X86_AVX;
27506 GemmMicrokernelTester()
27507 .mr(1)
27508 .nr(8)
27509 .kr(1)
27510 .sr(1)
27511 .m(1)
27512 .n(8)
27513 .k(1)
27514 .cn_stride(11)
27515 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27516 }
27517
27518 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_strided_a) {
27519 TEST_REQUIRES_X86_AVX;
27520 GemmMicrokernelTester()
27521 .mr(1)
27522 .nr(8)
27523 .kr(1)
27524 .sr(1)
27525 .m(1)
27526 .n(8)
27527 .k(1)
27528 .a_stride(3)
27529 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27530 }
27531
27532 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_subtile) {
27533 TEST_REQUIRES_X86_AVX;
27534 for (uint32_t m = 1; m <= 1; m++) {
27535 for (uint32_t n = 1; n <= 8; n++) {
27536 GemmMicrokernelTester()
27537 .mr(1)
27538 .nr(8)
27539 .kr(1)
27540 .sr(1)
27541 .m(m)
27542 .n(n)
27543 .k(1)
27544 .iterations(1)
27545 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27546 }
27547 }
27548 }
27549
27550 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27551 TEST_REQUIRES_X86_AVX;
27552 for (uint32_t m = 1; m <= 1; m++) {
27553 GemmMicrokernelTester()
27554 .mr(1)
27555 .nr(8)
27556 .kr(1)
27557 .sr(1)
27558 .m(m)
27559 .n(8)
27560 .k(1)
27561 .iterations(1)
27562 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27563 }
27564 }
27565
27566 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27567 TEST_REQUIRES_X86_AVX;
27568 for (uint32_t n = 1; n <= 8; n++) {
27569 GemmMicrokernelTester()
27570 .mr(1)
27571 .nr(8)
27572 .kr(1)
27573 .sr(1)
27574 .m(1)
27575 .n(n)
27576 .k(1)
27577 .iterations(1)
27578 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27579 }
27580 }
27581
27582 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_gt_1) {
27583 TEST_REQUIRES_X86_AVX;
27584 for (size_t k = 2; k < 10; k++) {
27585 GemmMicrokernelTester()
27586 .mr(1)
27587 .nr(8)
27588 .kr(1)
27589 .sr(1)
27590 .m(1)
27591 .n(8)
27592 .k(k)
27593 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27594 }
27595 }
27596
27597 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_gt_1_strided_a) {
27598 TEST_REQUIRES_X86_AVX;
27599 for (size_t k = 2; k < 10; k++) {
27600 GemmMicrokernelTester()
27601 .mr(1)
27602 .nr(8)
27603 .kr(1)
27604 .sr(1)
27605 .m(1)
27606 .n(8)
27607 .k(k)
27608 .a_stride(11)
27609 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27610 }
27611 }
27612
27613 TEST(F32_GEMM_1X8__AVX_BROADCAST, k_gt_1_subtile) {
27614 TEST_REQUIRES_X86_AVX;
27615 for (size_t k = 2; k < 10; k++) {
27616 for (uint32_t m = 1; m <= 1; m++) {
27617 for (uint32_t n = 1; n <= 8; n++) {
27618 GemmMicrokernelTester()
27619 .mr(1)
27620 .nr(8)
27621 .kr(1)
27622 .sr(1)
27623 .m(m)
27624 .n(n)
27625 .k(k)
27626 .iterations(1)
27627 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27628 }
27629 }
27630 }
27631 }
27632
27633 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8) {
27634 TEST_REQUIRES_X86_AVX;
27635 for (uint32_t n = 9; n < 16; n++) {
27636 for (size_t k = 1; k <= 5; k += 2) {
27637 GemmMicrokernelTester()
27638 .mr(1)
27639 .nr(8)
27640 .kr(1)
27641 .sr(1)
27642 .m(1)
27643 .n(8)
27644 .k(k)
27645 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27646 }
27647 }
27648 }
27649
27650 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8_strided_cn) {
27651 TEST_REQUIRES_X86_AVX;
27652 for (uint32_t n = 9; n < 16; n++) {
27653 for (size_t k = 1; k <= 5; k += 2) {
27654 GemmMicrokernelTester()
27655 .mr(1)
27656 .nr(8)
27657 .kr(1)
27658 .sr(1)
27659 .m(1)
27660 .n(8)
27661 .k(k)
27662 .cn_stride(11)
27663 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27664 }
27665 }
27666 }
27667
27668 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8_strided_a) {
27669 TEST_REQUIRES_X86_AVX;
27670 for (uint32_t n = 9; n < 16; n++) {
27671 for (size_t k = 1; k <= 5; k += 2) {
27672 GemmMicrokernelTester()
27673 .mr(1)
27674 .nr(8)
27675 .kr(1)
27676 .sr(1)
27677 .m(1)
27678 .n(n)
27679 .k(k)
27680 .a_stride(7)
27681 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27682 }
27683 }
27684 }
27685
27686 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_gt_8_subtile) {
27687 TEST_REQUIRES_X86_AVX;
27688 for (uint32_t n = 9; n < 16; n++) {
27689 for (size_t k = 1; k <= 5; k += 2) {
27690 for (uint32_t m = 1; m <= 1; m++) {
27691 GemmMicrokernelTester()
27692 .mr(1)
27693 .nr(8)
27694 .kr(1)
27695 .sr(1)
27696 .m(m)
27697 .n(n)
27698 .k(k)
27699 .iterations(1)
27700 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27701 }
27702 }
27703 }
27704 }
27705
27706 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8) {
27707 TEST_REQUIRES_X86_AVX;
27708 for (uint32_t n = 16; n <= 24; n += 8) {
27709 for (size_t k = 1; k <= 5; k += 2) {
27710 GemmMicrokernelTester()
27711 .mr(1)
27712 .nr(8)
27713 .kr(1)
27714 .sr(1)
27715 .m(1)
27716 .n(8)
27717 .k(k)
27718 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27719 }
27720 }
27721 }
27722
27723 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8_strided_cn) {
27724 TEST_REQUIRES_X86_AVX;
27725 for (uint32_t n = 16; n <= 24; n += 8) {
27726 for (size_t k = 1; k <= 5; k += 2) {
27727 GemmMicrokernelTester()
27728 .mr(1)
27729 .nr(8)
27730 .kr(1)
27731 .sr(1)
27732 .m(1)
27733 .n(n)
27734 .k(k)
27735 .cn_stride(11)
27736 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27737 }
27738 }
27739 }
27740
27741 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8_strided_a) {
27742 TEST_REQUIRES_X86_AVX;
27743 for (uint32_t n = 16; n <= 24; n += 8) {
27744 for (size_t k = 1; k <= 5; k += 2) {
27745 GemmMicrokernelTester()
27746 .mr(1)
27747 .nr(8)
27748 .kr(1)
27749 .sr(1)
27750 .m(1)
27751 .n(n)
27752 .k(k)
27753 .a_stride(7)
27754 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27755 }
27756 }
27757 }
27758
27759 TEST(F32_GEMM_1X8__AVX_BROADCAST, n_div_8_subtile) {
27760 TEST_REQUIRES_X86_AVX;
27761 for (uint32_t n = 16; n <= 24; n += 8) {
27762 for (size_t k = 1; k <= 5; k += 2) {
27763 for (uint32_t m = 1; m <= 1; m++) {
27764 GemmMicrokernelTester()
27765 .mr(1)
27766 .nr(8)
27767 .kr(1)
27768 .sr(1)
27769 .m(m)
27770 .n(n)
27771 .k(k)
27772 .iterations(1)
27773 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27774 }
27775 }
27776 }
27777 }
27778
27779 TEST(F32_GEMM_1X8__AVX_BROADCAST, strided_cm_subtile) {
27780 TEST_REQUIRES_X86_AVX;
27781 for (size_t k = 1; k <= 5; k += 2) {
27782 for (uint32_t m = 1; m <= 1; m++) {
27783 for (uint32_t n = 1; n <= 8; n++) {
27784 GemmMicrokernelTester()
27785 .mr(1)
27786 .nr(8)
27787 .kr(1)
27788 .sr(1)
27789 .m(m)
27790 .n(n)
27791 .k(k)
27792 .cm_stride(11)
27793 .iterations(1)
27794 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27795 }
27796 }
27797 }
27798 }
27799
27800 TEST(F32_GEMM_1X8__AVX_BROADCAST, qmin) {
27801 TEST_REQUIRES_X86_AVX;
27802 GemmMicrokernelTester()
27803 .mr(1)
27804 .nr(8)
27805 .kr(1)
27806 .sr(1)
27807 .m(1)
27808 .n(8)
27809 .k(1)
27810 .qmin(128)
27811 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27812 }
27813
27814 TEST(F32_GEMM_1X8__AVX_BROADCAST, qmax) {
27815 TEST_REQUIRES_X86_AVX;
27816 GemmMicrokernelTester()
27817 .mr(1)
27818 .nr(8)
27819 .kr(1)
27820 .sr(1)
27821 .m(1)
27822 .n(8)
27823 .k(1)
27824 .qmax(128)
27825 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27826 }
27827
27828 TEST(F32_GEMM_1X8__AVX_BROADCAST, strided_cm) {
27829 TEST_REQUIRES_X86_AVX;
27830 GemmMicrokernelTester()
27831 .mr(1)
27832 .nr(8)
27833 .kr(1)
27834 .sr(1)
27835 .m(1)
27836 .n(8)
27837 .k(1)
27838 .cm_stride(11)
27839 .Test(xnn_f32_gemm_ukernel_1x8__avx_broadcast);
27840 }
27841#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
27842
27843
27844#if XNN_ARCH_X86 || XNN_ARCH_X86_64
27845 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1) {
27846 TEST_REQUIRES_X86_AVX;
27847 GemmMicrokernelTester()
27848 .mr(4)
27849 .nr(8)
27850 .kr(1)
27851 .sr(1)
27852 .m(4)
27853 .n(8)
27854 .k(1)
27855 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27856 }
27857
27858 TEST(F32_GEMM_4X8__AVX_BROADCAST, strided_cn) {
27859 TEST_REQUIRES_X86_AVX;
27860 GemmMicrokernelTester()
27861 .mr(4)
27862 .nr(8)
27863 .kr(1)
27864 .sr(1)
27865 .m(4)
27866 .n(8)
27867 .k(1)
27868 .cn_stride(11)
27869 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27870 }
27871
27872 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_strided_a) {
27873 TEST_REQUIRES_X86_AVX;
27874 GemmMicrokernelTester()
27875 .mr(4)
27876 .nr(8)
27877 .kr(1)
27878 .sr(1)
27879 .m(4)
27880 .n(8)
27881 .k(1)
27882 .a_stride(3)
27883 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27884 }
27885
27886 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_subtile) {
27887 TEST_REQUIRES_X86_AVX;
27888 for (uint32_t m = 1; m <= 4; m++) {
27889 for (uint32_t n = 1; n <= 8; n++) {
27890 GemmMicrokernelTester()
27891 .mr(4)
27892 .nr(8)
27893 .kr(1)
27894 .sr(1)
27895 .m(m)
27896 .n(n)
27897 .k(1)
27898 .iterations(1)
27899 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27900 }
27901 }
27902 }
27903
27904 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_m) {
27905 TEST_REQUIRES_X86_AVX;
27906 for (uint32_t m = 1; m <= 4; m++) {
27907 GemmMicrokernelTester()
27908 .mr(4)
27909 .nr(8)
27910 .kr(1)
27911 .sr(1)
27912 .m(m)
27913 .n(8)
27914 .k(1)
27915 .iterations(1)
27916 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27917 }
27918 }
27919
27920 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_eq_1_subtile_n) {
27921 TEST_REQUIRES_X86_AVX;
27922 for (uint32_t n = 1; n <= 8; n++) {
27923 GemmMicrokernelTester()
27924 .mr(4)
27925 .nr(8)
27926 .kr(1)
27927 .sr(1)
27928 .m(4)
27929 .n(n)
27930 .k(1)
27931 .iterations(1)
27932 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27933 }
27934 }
27935
27936 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_gt_1) {
27937 TEST_REQUIRES_X86_AVX;
27938 for (size_t k = 2; k < 10; k++) {
27939 GemmMicrokernelTester()
27940 .mr(4)
27941 .nr(8)
27942 .kr(1)
27943 .sr(1)
27944 .m(4)
27945 .n(8)
27946 .k(k)
27947 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27948 }
27949 }
27950
27951 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_gt_1_strided_a) {
27952 TEST_REQUIRES_X86_AVX;
27953 for (size_t k = 2; k < 10; k++) {
27954 GemmMicrokernelTester()
27955 .mr(4)
27956 .nr(8)
27957 .kr(1)
27958 .sr(1)
27959 .m(4)
27960 .n(8)
27961 .k(k)
27962 .a_stride(11)
27963 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27964 }
27965 }
27966
27967 TEST(F32_GEMM_4X8__AVX_BROADCAST, k_gt_1_subtile) {
27968 TEST_REQUIRES_X86_AVX;
27969 for (size_t k = 2; k < 10; k++) {
27970 for (uint32_t m = 1; m <= 4; m++) {
27971 for (uint32_t n = 1; n <= 8; n++) {
27972 GemmMicrokernelTester()
27973 .mr(4)
27974 .nr(8)
27975 .kr(1)
27976 .sr(1)
27977 .m(m)
27978 .n(n)
27979 .k(k)
27980 .iterations(1)
27981 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
27982 }
27983 }
27984 }
27985 }
27986
27987 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8) {
27988 TEST_REQUIRES_X86_AVX;
27989 for (uint32_t n = 9; n < 16; n++) {
27990 for (size_t k = 1; k <= 5; k += 2) {
27991 GemmMicrokernelTester()
27992 .mr(4)
27993 .nr(8)
27994 .kr(1)
27995 .sr(1)
27996 .m(4)
27997 .n(8)
27998 .k(k)
27999 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28000 }
28001 }
28002 }
28003
28004 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8_strided_cn) {
28005 TEST_REQUIRES_X86_AVX;
28006 for (uint32_t n = 9; n < 16; n++) {
28007 for (size_t k = 1; k <= 5; k += 2) {
28008 GemmMicrokernelTester()
28009 .mr(4)
28010 .nr(8)
28011 .kr(1)
28012 .sr(1)
28013 .m(4)
28014 .n(8)
28015 .k(k)
28016 .cn_stride(11)
28017 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28018 }
28019 }
28020 }
28021
28022 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8_strided_a) {
28023 TEST_REQUIRES_X86_AVX;
28024 for (uint32_t n = 9; n < 16; n++) {
28025 for (size_t k = 1; k <= 5; k += 2) {
28026 GemmMicrokernelTester()
28027 .mr(4)
28028 .nr(8)
28029 .kr(1)
28030 .sr(1)
28031 .m(4)
28032 .n(n)
28033 .k(k)
28034 .a_stride(7)
28035 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28036 }
28037 }
28038 }
28039
28040 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_gt_8_subtile) {
28041 TEST_REQUIRES_X86_AVX;
28042 for (uint32_t n = 9; n < 16; n++) {
28043 for (size_t k = 1; k <= 5; k += 2) {
28044 for (uint32_t m = 1; m <= 4; m++) {
28045 GemmMicrokernelTester()
28046 .mr(4)
28047 .nr(8)
28048 .kr(1)
28049 .sr(1)
28050 .m(m)
28051 .n(n)
28052 .k(k)
28053 .iterations(1)
28054 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28055 }
28056 }
28057 }
28058 }
28059
28060 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8) {
28061 TEST_REQUIRES_X86_AVX;
28062 for (uint32_t n = 16; n <= 24; n += 8) {
28063 for (size_t k = 1; k <= 5; k += 2) {
28064 GemmMicrokernelTester()
28065 .mr(4)
28066 .nr(8)
28067 .kr(1)
28068 .sr(1)
28069 .m(4)
28070 .n(8)
28071 .k(k)
28072 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28073 }
28074 }
28075 }
28076
28077 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8_strided_cn) {
28078 TEST_REQUIRES_X86_AVX;
28079 for (uint32_t n = 16; n <= 24; n += 8) {
28080 for (size_t k = 1; k <= 5; k += 2) {
28081 GemmMicrokernelTester()
28082 .mr(4)
28083 .nr(8)
28084 .kr(1)
28085 .sr(1)
28086 .m(4)
28087 .n(n)
28088 .k(k)
28089 .cn_stride(11)
28090 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28091 }
28092 }
28093 }
28094
28095 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8_strided_a) {
28096 TEST_REQUIRES_X86_AVX;
28097 for (uint32_t n = 16; n <= 24; n += 8) {
28098 for (size_t k = 1; k <= 5; k += 2) {
28099 GemmMicrokernelTester()
28100 .mr(4)
28101 .nr(8)
28102 .kr(1)
28103 .sr(1)
28104 .m(4)
28105 .n(n)
28106 .k(k)
28107 .a_stride(7)
28108 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28109 }
28110 }
28111 }
28112
28113 TEST(F32_GEMM_4X8__AVX_BROADCAST, n_div_8_subtile) {
28114 TEST_REQUIRES_X86_AVX;
28115 for (uint32_t n = 16; n <= 24; n += 8) {
28116 for (size_t k = 1; k <= 5; k += 2) {
28117 for (uint32_t m = 1; m <= 4; m++) {
28118 GemmMicrokernelTester()
28119 .mr(4)
28120 .nr(8)
28121 .kr(1)
28122 .sr(1)
28123 .m(m)
28124 .n(n)
28125 .k(k)
28126 .iterations(1)
28127 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28128 }
28129 }
28130 }
28131 }
28132
28133 TEST(F32_GEMM_4X8__AVX_BROADCAST, strided_cm_subtile) {
28134 TEST_REQUIRES_X86_AVX;
28135 for (size_t k = 1; k <= 5; k += 2) {
28136 for (uint32_t m = 1; m <= 4; m++) {
28137 for (uint32_t n = 1; n <= 8; n++) {
28138 GemmMicrokernelTester()
28139 .mr(4)
28140 .nr(8)
28141 .kr(1)
28142 .sr(1)
28143 .m(m)
28144 .n(n)
28145 .k(k)
28146 .cm_stride(11)
28147 .iterations(1)
28148 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28149 }
28150 }
28151 }
28152 }
28153
28154 TEST(F32_GEMM_4X8__AVX_BROADCAST, qmin) {
28155 TEST_REQUIRES_X86_AVX;
28156 GemmMicrokernelTester()
28157 .mr(4)
28158 .nr(8)
28159 .kr(1)
28160 .sr(1)
28161 .m(4)
28162 .n(8)
28163 .k(1)
28164 .qmin(128)
28165 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28166 }
28167
28168 TEST(F32_GEMM_4X8__AVX_BROADCAST, qmax) {
28169 TEST_REQUIRES_X86_AVX;
28170 GemmMicrokernelTester()
28171 .mr(4)
28172 .nr(8)
28173 .kr(1)
28174 .sr(1)
28175 .m(4)
28176 .n(8)
28177 .k(1)
28178 .qmax(128)
28179 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28180 }
28181
28182 TEST(F32_GEMM_4X8__AVX_BROADCAST, strided_cm) {
28183 TEST_REQUIRES_X86_AVX;
28184 GemmMicrokernelTester()
28185 .mr(4)
28186 .nr(8)
28187 .kr(1)
28188 .sr(1)
28189 .m(4)
28190 .n(8)
28191 .k(1)
28192 .cm_stride(11)
28193 .Test(xnn_f32_gemm_ukernel_4x8__avx_broadcast);
28194 }
28195#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28196
28197
28198#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28199 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1) {
28200 TEST_REQUIRES_X86_AVX;
28201 GemmMicrokernelTester()
28202 .mr(5)
28203 .nr(8)
28204 .kr(1)
28205 .sr(1)
28206 .m(5)
28207 .n(8)
28208 .k(1)
28209 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28210 }
28211
28212 TEST(F32_GEMM_5X8__AVX_BROADCAST, strided_cn) {
28213 TEST_REQUIRES_X86_AVX;
28214 GemmMicrokernelTester()
28215 .mr(5)
28216 .nr(8)
28217 .kr(1)
28218 .sr(1)
28219 .m(5)
28220 .n(8)
28221 .k(1)
28222 .cn_stride(11)
28223 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28224 }
28225
28226 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_strided_a) {
28227 TEST_REQUIRES_X86_AVX;
28228 GemmMicrokernelTester()
28229 .mr(5)
28230 .nr(8)
28231 .kr(1)
28232 .sr(1)
28233 .m(5)
28234 .n(8)
28235 .k(1)
28236 .a_stride(3)
28237 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28238 }
28239
28240 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_subtile) {
28241 TEST_REQUIRES_X86_AVX;
28242 for (uint32_t m = 1; m <= 5; m++) {
28243 for (uint32_t n = 1; n <= 8; n++) {
28244 GemmMicrokernelTester()
28245 .mr(5)
28246 .nr(8)
28247 .kr(1)
28248 .sr(1)
28249 .m(m)
28250 .n(n)
28251 .k(1)
28252 .iterations(1)
28253 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28254 }
28255 }
28256 }
28257
28258 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_m) {
28259 TEST_REQUIRES_X86_AVX;
28260 for (uint32_t m = 1; m <= 5; m++) {
28261 GemmMicrokernelTester()
28262 .mr(5)
28263 .nr(8)
28264 .kr(1)
28265 .sr(1)
28266 .m(m)
28267 .n(8)
28268 .k(1)
28269 .iterations(1)
28270 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28271 }
28272 }
28273
28274 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_eq_1_subtile_n) {
28275 TEST_REQUIRES_X86_AVX;
28276 for (uint32_t n = 1; n <= 8; n++) {
28277 GemmMicrokernelTester()
28278 .mr(5)
28279 .nr(8)
28280 .kr(1)
28281 .sr(1)
28282 .m(5)
28283 .n(n)
28284 .k(1)
28285 .iterations(1)
28286 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28287 }
28288 }
28289
28290 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_gt_1) {
28291 TEST_REQUIRES_X86_AVX;
28292 for (size_t k = 2; k < 10; k++) {
28293 GemmMicrokernelTester()
28294 .mr(5)
28295 .nr(8)
28296 .kr(1)
28297 .sr(1)
28298 .m(5)
28299 .n(8)
28300 .k(k)
28301 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28302 }
28303 }
28304
28305 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_gt_1_strided_a) {
28306 TEST_REQUIRES_X86_AVX;
28307 for (size_t k = 2; k < 10; k++) {
28308 GemmMicrokernelTester()
28309 .mr(5)
28310 .nr(8)
28311 .kr(1)
28312 .sr(1)
28313 .m(5)
28314 .n(8)
28315 .k(k)
28316 .a_stride(11)
28317 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28318 }
28319 }
28320
28321 TEST(F32_GEMM_5X8__AVX_BROADCAST, k_gt_1_subtile) {
28322 TEST_REQUIRES_X86_AVX;
28323 for (size_t k = 2; k < 10; k++) {
28324 for (uint32_t m = 1; m <= 5; m++) {
28325 for (uint32_t n = 1; n <= 8; n++) {
28326 GemmMicrokernelTester()
28327 .mr(5)
28328 .nr(8)
28329 .kr(1)
28330 .sr(1)
28331 .m(m)
28332 .n(n)
28333 .k(k)
28334 .iterations(1)
28335 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28336 }
28337 }
28338 }
28339 }
28340
28341 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8) {
28342 TEST_REQUIRES_X86_AVX;
28343 for (uint32_t n = 9; n < 16; n++) {
28344 for (size_t k = 1; k <= 5; k += 2) {
28345 GemmMicrokernelTester()
28346 .mr(5)
28347 .nr(8)
28348 .kr(1)
28349 .sr(1)
28350 .m(5)
28351 .n(8)
28352 .k(k)
28353 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28354 }
28355 }
28356 }
28357
28358 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8_strided_cn) {
28359 TEST_REQUIRES_X86_AVX;
28360 for (uint32_t n = 9; n < 16; n++) {
28361 for (size_t k = 1; k <= 5; k += 2) {
28362 GemmMicrokernelTester()
28363 .mr(5)
28364 .nr(8)
28365 .kr(1)
28366 .sr(1)
28367 .m(5)
28368 .n(8)
28369 .k(k)
28370 .cn_stride(11)
28371 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28372 }
28373 }
28374 }
28375
28376 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8_strided_a) {
28377 TEST_REQUIRES_X86_AVX;
28378 for (uint32_t n = 9; n < 16; n++) {
28379 for (size_t k = 1; k <= 5; k += 2) {
28380 GemmMicrokernelTester()
28381 .mr(5)
28382 .nr(8)
28383 .kr(1)
28384 .sr(1)
28385 .m(5)
28386 .n(n)
28387 .k(k)
28388 .a_stride(7)
28389 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28390 }
28391 }
28392 }
28393
28394 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_gt_8_subtile) {
28395 TEST_REQUIRES_X86_AVX;
28396 for (uint32_t n = 9; n < 16; n++) {
28397 for (size_t k = 1; k <= 5; k += 2) {
28398 for (uint32_t m = 1; m <= 5; m++) {
28399 GemmMicrokernelTester()
28400 .mr(5)
28401 .nr(8)
28402 .kr(1)
28403 .sr(1)
28404 .m(m)
28405 .n(n)
28406 .k(k)
28407 .iterations(1)
28408 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28409 }
28410 }
28411 }
28412 }
28413
28414 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8) {
28415 TEST_REQUIRES_X86_AVX;
28416 for (uint32_t n = 16; n <= 24; n += 8) {
28417 for (size_t k = 1; k <= 5; k += 2) {
28418 GemmMicrokernelTester()
28419 .mr(5)
28420 .nr(8)
28421 .kr(1)
28422 .sr(1)
28423 .m(5)
28424 .n(8)
28425 .k(k)
28426 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28427 }
28428 }
28429 }
28430
28431 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8_strided_cn) {
28432 TEST_REQUIRES_X86_AVX;
28433 for (uint32_t n = 16; n <= 24; n += 8) {
28434 for (size_t k = 1; k <= 5; k += 2) {
28435 GemmMicrokernelTester()
28436 .mr(5)
28437 .nr(8)
28438 .kr(1)
28439 .sr(1)
28440 .m(5)
28441 .n(n)
28442 .k(k)
28443 .cn_stride(11)
28444 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28445 }
28446 }
28447 }
28448
28449 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8_strided_a) {
28450 TEST_REQUIRES_X86_AVX;
28451 for (uint32_t n = 16; n <= 24; n += 8) {
28452 for (size_t k = 1; k <= 5; k += 2) {
28453 GemmMicrokernelTester()
28454 .mr(5)
28455 .nr(8)
28456 .kr(1)
28457 .sr(1)
28458 .m(5)
28459 .n(n)
28460 .k(k)
28461 .a_stride(7)
28462 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28463 }
28464 }
28465 }
28466
28467 TEST(F32_GEMM_5X8__AVX_BROADCAST, n_div_8_subtile) {
28468 TEST_REQUIRES_X86_AVX;
28469 for (uint32_t n = 16; n <= 24; n += 8) {
28470 for (size_t k = 1; k <= 5; k += 2) {
28471 for (uint32_t m = 1; m <= 5; m++) {
28472 GemmMicrokernelTester()
28473 .mr(5)
28474 .nr(8)
28475 .kr(1)
28476 .sr(1)
28477 .m(m)
28478 .n(n)
28479 .k(k)
28480 .iterations(1)
28481 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28482 }
28483 }
28484 }
28485 }
28486
28487 TEST(F32_GEMM_5X8__AVX_BROADCAST, strided_cm_subtile) {
28488 TEST_REQUIRES_X86_AVX;
28489 for (size_t k = 1; k <= 5; k += 2) {
28490 for (uint32_t m = 1; m <= 5; m++) {
28491 for (uint32_t n = 1; n <= 8; n++) {
28492 GemmMicrokernelTester()
28493 .mr(5)
28494 .nr(8)
28495 .kr(1)
28496 .sr(1)
28497 .m(m)
28498 .n(n)
28499 .k(k)
28500 .cm_stride(11)
28501 .iterations(1)
28502 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28503 }
28504 }
28505 }
28506 }
28507
28508 TEST(F32_GEMM_5X8__AVX_BROADCAST, qmin) {
28509 TEST_REQUIRES_X86_AVX;
28510 GemmMicrokernelTester()
28511 .mr(5)
28512 .nr(8)
28513 .kr(1)
28514 .sr(1)
28515 .m(5)
28516 .n(8)
28517 .k(1)
28518 .qmin(128)
28519 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28520 }
28521
28522 TEST(F32_GEMM_5X8__AVX_BROADCAST, qmax) {
28523 TEST_REQUIRES_X86_AVX;
28524 GemmMicrokernelTester()
28525 .mr(5)
28526 .nr(8)
28527 .kr(1)
28528 .sr(1)
28529 .m(5)
28530 .n(8)
28531 .k(1)
28532 .qmax(128)
28533 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28534 }
28535
28536 TEST(F32_GEMM_5X8__AVX_BROADCAST, strided_cm) {
28537 TEST_REQUIRES_X86_AVX;
28538 GemmMicrokernelTester()
28539 .mr(5)
28540 .nr(8)
28541 .kr(1)
28542 .sr(1)
28543 .m(5)
28544 .n(8)
28545 .k(1)
28546 .cm_stride(11)
28547 .Test(xnn_f32_gemm_ukernel_5x8__avx_broadcast);
28548 }
28549#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28550
28551
28552#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28553 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1) {
28554 TEST_REQUIRES_X86_AVX;
28555 GemmMicrokernelTester()
28556 .mr(6)
28557 .nr(8)
28558 .kr(1)
28559 .sr(1)
28560 .m(6)
28561 .n(8)
28562 .k(1)
28563 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28564 }
28565
28566 TEST(F32_GEMM_6X8__AVX_BROADCAST, strided_cn) {
28567 TEST_REQUIRES_X86_AVX;
28568 GemmMicrokernelTester()
28569 .mr(6)
28570 .nr(8)
28571 .kr(1)
28572 .sr(1)
28573 .m(6)
28574 .n(8)
28575 .k(1)
28576 .cn_stride(11)
28577 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28578 }
28579
28580 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_strided_a) {
28581 TEST_REQUIRES_X86_AVX;
28582 GemmMicrokernelTester()
28583 .mr(6)
28584 .nr(8)
28585 .kr(1)
28586 .sr(1)
28587 .m(6)
28588 .n(8)
28589 .k(1)
28590 .a_stride(3)
28591 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28592 }
28593
28594 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_subtile) {
28595 TEST_REQUIRES_X86_AVX;
28596 for (uint32_t m = 1; m <= 6; m++) {
28597 for (uint32_t n = 1; n <= 8; n++) {
28598 GemmMicrokernelTester()
28599 .mr(6)
28600 .nr(8)
28601 .kr(1)
28602 .sr(1)
28603 .m(m)
28604 .n(n)
28605 .k(1)
28606 .iterations(1)
28607 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28608 }
28609 }
28610 }
28611
28612 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_m) {
28613 TEST_REQUIRES_X86_AVX;
28614 for (uint32_t m = 1; m <= 6; m++) {
28615 GemmMicrokernelTester()
28616 .mr(6)
28617 .nr(8)
28618 .kr(1)
28619 .sr(1)
28620 .m(m)
28621 .n(8)
28622 .k(1)
28623 .iterations(1)
28624 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28625 }
28626 }
28627
28628 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_eq_1_subtile_n) {
28629 TEST_REQUIRES_X86_AVX;
28630 for (uint32_t n = 1; n <= 8; n++) {
28631 GemmMicrokernelTester()
28632 .mr(6)
28633 .nr(8)
28634 .kr(1)
28635 .sr(1)
28636 .m(6)
28637 .n(n)
28638 .k(1)
28639 .iterations(1)
28640 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28641 }
28642 }
28643
28644 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_gt_1) {
28645 TEST_REQUIRES_X86_AVX;
28646 for (size_t k = 2; k < 10; k++) {
28647 GemmMicrokernelTester()
28648 .mr(6)
28649 .nr(8)
28650 .kr(1)
28651 .sr(1)
28652 .m(6)
28653 .n(8)
28654 .k(k)
28655 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28656 }
28657 }
28658
28659 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_gt_1_strided_a) {
28660 TEST_REQUIRES_X86_AVX;
28661 for (size_t k = 2; k < 10; k++) {
28662 GemmMicrokernelTester()
28663 .mr(6)
28664 .nr(8)
28665 .kr(1)
28666 .sr(1)
28667 .m(6)
28668 .n(8)
28669 .k(k)
28670 .a_stride(11)
28671 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28672 }
28673 }
28674
28675 TEST(F32_GEMM_6X8__AVX_BROADCAST, k_gt_1_subtile) {
28676 TEST_REQUIRES_X86_AVX;
28677 for (size_t k = 2; k < 10; k++) {
28678 for (uint32_t m = 1; m <= 6; m++) {
28679 for (uint32_t n = 1; n <= 8; n++) {
28680 GemmMicrokernelTester()
28681 .mr(6)
28682 .nr(8)
28683 .kr(1)
28684 .sr(1)
28685 .m(m)
28686 .n(n)
28687 .k(k)
28688 .iterations(1)
28689 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28690 }
28691 }
28692 }
28693 }
28694
28695 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8) {
28696 TEST_REQUIRES_X86_AVX;
28697 for (uint32_t n = 9; n < 16; n++) {
28698 for (size_t k = 1; k <= 5; k += 2) {
28699 GemmMicrokernelTester()
28700 .mr(6)
28701 .nr(8)
28702 .kr(1)
28703 .sr(1)
28704 .m(6)
28705 .n(8)
28706 .k(k)
28707 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28708 }
28709 }
28710 }
28711
28712 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8_strided_cn) {
28713 TEST_REQUIRES_X86_AVX;
28714 for (uint32_t n = 9; n < 16; n++) {
28715 for (size_t k = 1; k <= 5; k += 2) {
28716 GemmMicrokernelTester()
28717 .mr(6)
28718 .nr(8)
28719 .kr(1)
28720 .sr(1)
28721 .m(6)
28722 .n(8)
28723 .k(k)
28724 .cn_stride(11)
28725 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28726 }
28727 }
28728 }
28729
28730 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8_strided_a) {
28731 TEST_REQUIRES_X86_AVX;
28732 for (uint32_t n = 9; n < 16; n++) {
28733 for (size_t k = 1; k <= 5; k += 2) {
28734 GemmMicrokernelTester()
28735 .mr(6)
28736 .nr(8)
28737 .kr(1)
28738 .sr(1)
28739 .m(6)
28740 .n(n)
28741 .k(k)
28742 .a_stride(7)
28743 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28744 }
28745 }
28746 }
28747
28748 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_gt_8_subtile) {
28749 TEST_REQUIRES_X86_AVX;
28750 for (uint32_t n = 9; n < 16; n++) {
28751 for (size_t k = 1; k <= 5; k += 2) {
28752 for (uint32_t m = 1; m <= 6; m++) {
28753 GemmMicrokernelTester()
28754 .mr(6)
28755 .nr(8)
28756 .kr(1)
28757 .sr(1)
28758 .m(m)
28759 .n(n)
28760 .k(k)
28761 .iterations(1)
28762 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28763 }
28764 }
28765 }
28766 }
28767
28768 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8) {
28769 TEST_REQUIRES_X86_AVX;
28770 for (uint32_t n = 16; n <= 24; n += 8) {
28771 for (size_t k = 1; k <= 5; k += 2) {
28772 GemmMicrokernelTester()
28773 .mr(6)
28774 .nr(8)
28775 .kr(1)
28776 .sr(1)
28777 .m(6)
28778 .n(8)
28779 .k(k)
28780 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28781 }
28782 }
28783 }
28784
28785 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8_strided_cn) {
28786 TEST_REQUIRES_X86_AVX;
28787 for (uint32_t n = 16; n <= 24; n += 8) {
28788 for (size_t k = 1; k <= 5; k += 2) {
28789 GemmMicrokernelTester()
28790 .mr(6)
28791 .nr(8)
28792 .kr(1)
28793 .sr(1)
28794 .m(6)
28795 .n(n)
28796 .k(k)
28797 .cn_stride(11)
28798 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28799 }
28800 }
28801 }
28802
28803 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8_strided_a) {
28804 TEST_REQUIRES_X86_AVX;
28805 for (uint32_t n = 16; n <= 24; n += 8) {
28806 for (size_t k = 1; k <= 5; k += 2) {
28807 GemmMicrokernelTester()
28808 .mr(6)
28809 .nr(8)
28810 .kr(1)
28811 .sr(1)
28812 .m(6)
28813 .n(n)
28814 .k(k)
28815 .a_stride(7)
28816 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28817 }
28818 }
28819 }
28820
28821 TEST(F32_GEMM_6X8__AVX_BROADCAST, n_div_8_subtile) {
28822 TEST_REQUIRES_X86_AVX;
28823 for (uint32_t n = 16; n <= 24; n += 8) {
28824 for (size_t k = 1; k <= 5; k += 2) {
28825 for (uint32_t m = 1; m <= 6; m++) {
28826 GemmMicrokernelTester()
28827 .mr(6)
28828 .nr(8)
28829 .kr(1)
28830 .sr(1)
28831 .m(m)
28832 .n(n)
28833 .k(k)
28834 .iterations(1)
28835 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28836 }
28837 }
28838 }
28839 }
28840
28841 TEST(F32_GEMM_6X8__AVX_BROADCAST, strided_cm_subtile) {
28842 TEST_REQUIRES_X86_AVX;
28843 for (size_t k = 1; k <= 5; k += 2) {
28844 for (uint32_t m = 1; m <= 6; m++) {
28845 for (uint32_t n = 1; n <= 8; n++) {
28846 GemmMicrokernelTester()
28847 .mr(6)
28848 .nr(8)
28849 .kr(1)
28850 .sr(1)
28851 .m(m)
28852 .n(n)
28853 .k(k)
28854 .cm_stride(11)
28855 .iterations(1)
28856 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28857 }
28858 }
28859 }
28860 }
28861
28862 TEST(F32_GEMM_6X8__AVX_BROADCAST, qmin) {
28863 TEST_REQUIRES_X86_AVX;
28864 GemmMicrokernelTester()
28865 .mr(6)
28866 .nr(8)
28867 .kr(1)
28868 .sr(1)
28869 .m(6)
28870 .n(8)
28871 .k(1)
28872 .qmin(128)
28873 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28874 }
28875
28876 TEST(F32_GEMM_6X8__AVX_BROADCAST, qmax) {
28877 TEST_REQUIRES_X86_AVX;
28878 GemmMicrokernelTester()
28879 .mr(6)
28880 .nr(8)
28881 .kr(1)
28882 .sr(1)
28883 .m(6)
28884 .n(8)
28885 .k(1)
28886 .qmax(128)
28887 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28888 }
28889
28890 TEST(F32_GEMM_6X8__AVX_BROADCAST, strided_cm) {
28891 TEST_REQUIRES_X86_AVX;
28892 GemmMicrokernelTester()
28893 .mr(6)
28894 .nr(8)
28895 .kr(1)
28896 .sr(1)
28897 .m(6)
28898 .n(8)
28899 .k(1)
28900 .cm_stride(11)
28901 .Test(xnn_f32_gemm_ukernel_6x8__avx_broadcast);
28902 }
28903#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
28904
28905
28906#if XNN_ARCH_X86 || XNN_ARCH_X86_64
28907 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1) {
28908 TEST_REQUIRES_X86_AVX;
28909 GemmMicrokernelTester()
28910 .mr(7)
28911 .nr(8)
28912 .kr(1)
28913 .sr(1)
28914 .m(7)
28915 .n(8)
28916 .k(1)
28917 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
28918 }
28919
28920 TEST(F32_GEMM_7X8__AVX_BROADCAST, strided_cn) {
28921 TEST_REQUIRES_X86_AVX;
28922 GemmMicrokernelTester()
28923 .mr(7)
28924 .nr(8)
28925 .kr(1)
28926 .sr(1)
28927 .m(7)
28928 .n(8)
28929 .k(1)
28930 .cn_stride(11)
28931 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
28932 }
28933
28934 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_strided_a) {
28935 TEST_REQUIRES_X86_AVX;
28936 GemmMicrokernelTester()
28937 .mr(7)
28938 .nr(8)
28939 .kr(1)
28940 .sr(1)
28941 .m(7)
28942 .n(8)
28943 .k(1)
28944 .a_stride(3)
28945 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
28946 }
28947
28948 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_subtile) {
28949 TEST_REQUIRES_X86_AVX;
28950 for (uint32_t m = 1; m <= 7; m++) {
28951 for (uint32_t n = 1; n <= 8; n++) {
28952 GemmMicrokernelTester()
28953 .mr(7)
28954 .nr(8)
28955 .kr(1)
28956 .sr(1)
28957 .m(m)
28958 .n(n)
28959 .k(1)
28960 .iterations(1)
28961 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
28962 }
28963 }
28964 }
28965
28966 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_m) {
28967 TEST_REQUIRES_X86_AVX;
28968 for (uint32_t m = 1; m <= 7; m++) {
28969 GemmMicrokernelTester()
28970 .mr(7)
28971 .nr(8)
28972 .kr(1)
28973 .sr(1)
28974 .m(m)
28975 .n(8)
28976 .k(1)
28977 .iterations(1)
28978 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
28979 }
28980 }
28981
28982 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_eq_1_subtile_n) {
28983 TEST_REQUIRES_X86_AVX;
28984 for (uint32_t n = 1; n <= 8; n++) {
28985 GemmMicrokernelTester()
28986 .mr(7)
28987 .nr(8)
28988 .kr(1)
28989 .sr(1)
28990 .m(7)
28991 .n(n)
28992 .k(1)
28993 .iterations(1)
28994 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
28995 }
28996 }
28997
28998 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_gt_1) {
28999 TEST_REQUIRES_X86_AVX;
29000 for (size_t k = 2; k < 10; k++) {
29001 GemmMicrokernelTester()
29002 .mr(7)
29003 .nr(8)
29004 .kr(1)
29005 .sr(1)
29006 .m(7)
29007 .n(8)
29008 .k(k)
29009 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29010 }
29011 }
29012
29013 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_gt_1_strided_a) {
29014 TEST_REQUIRES_X86_AVX;
29015 for (size_t k = 2; k < 10; k++) {
29016 GemmMicrokernelTester()
29017 .mr(7)
29018 .nr(8)
29019 .kr(1)
29020 .sr(1)
29021 .m(7)
29022 .n(8)
29023 .k(k)
29024 .a_stride(11)
29025 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29026 }
29027 }
29028
29029 TEST(F32_GEMM_7X8__AVX_BROADCAST, k_gt_1_subtile) {
29030 TEST_REQUIRES_X86_AVX;
29031 for (size_t k = 2; k < 10; k++) {
29032 for (uint32_t m = 1; m <= 7; m++) {
29033 for (uint32_t n = 1; n <= 8; n++) {
29034 GemmMicrokernelTester()
29035 .mr(7)
29036 .nr(8)
29037 .kr(1)
29038 .sr(1)
29039 .m(m)
29040 .n(n)
29041 .k(k)
29042 .iterations(1)
29043 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29044 }
29045 }
29046 }
29047 }
29048
29049 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8) {
29050 TEST_REQUIRES_X86_AVX;
29051 for (uint32_t n = 9; n < 16; n++) {
29052 for (size_t k = 1; k <= 5; k += 2) {
29053 GemmMicrokernelTester()
29054 .mr(7)
29055 .nr(8)
29056 .kr(1)
29057 .sr(1)
29058 .m(7)
29059 .n(8)
29060 .k(k)
29061 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29062 }
29063 }
29064 }
29065
29066 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8_strided_cn) {
29067 TEST_REQUIRES_X86_AVX;
29068 for (uint32_t n = 9; n < 16; n++) {
29069 for (size_t k = 1; k <= 5; k += 2) {
29070 GemmMicrokernelTester()
29071 .mr(7)
29072 .nr(8)
29073 .kr(1)
29074 .sr(1)
29075 .m(7)
29076 .n(8)
29077 .k(k)
29078 .cn_stride(11)
29079 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29080 }
29081 }
29082 }
29083
29084 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8_strided_a) {
29085 TEST_REQUIRES_X86_AVX;
29086 for (uint32_t n = 9; n < 16; n++) {
29087 for (size_t k = 1; k <= 5; k += 2) {
29088 GemmMicrokernelTester()
29089 .mr(7)
29090 .nr(8)
29091 .kr(1)
29092 .sr(1)
29093 .m(7)
29094 .n(n)
29095 .k(k)
29096 .a_stride(7)
29097 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29098 }
29099 }
29100 }
29101
29102 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_gt_8_subtile) {
29103 TEST_REQUIRES_X86_AVX;
29104 for (uint32_t n = 9; n < 16; n++) {
29105 for (size_t k = 1; k <= 5; k += 2) {
29106 for (uint32_t m = 1; m <= 7; m++) {
29107 GemmMicrokernelTester()
29108 .mr(7)
29109 .nr(8)
29110 .kr(1)
29111 .sr(1)
29112 .m(m)
29113 .n(n)
29114 .k(k)
29115 .iterations(1)
29116 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29117 }
29118 }
29119 }
29120 }
29121
29122 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8) {
29123 TEST_REQUIRES_X86_AVX;
29124 for (uint32_t n = 16; n <= 24; n += 8) {
29125 for (size_t k = 1; k <= 5; k += 2) {
29126 GemmMicrokernelTester()
29127 .mr(7)
29128 .nr(8)
29129 .kr(1)
29130 .sr(1)
29131 .m(7)
29132 .n(8)
29133 .k(k)
29134 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29135 }
29136 }
29137 }
29138
29139 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8_strided_cn) {
29140 TEST_REQUIRES_X86_AVX;
29141 for (uint32_t n = 16; n <= 24; n += 8) {
29142 for (size_t k = 1; k <= 5; k += 2) {
29143 GemmMicrokernelTester()
29144 .mr(7)
29145 .nr(8)
29146 .kr(1)
29147 .sr(1)
29148 .m(7)
29149 .n(n)
29150 .k(k)
29151 .cn_stride(11)
29152 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29153 }
29154 }
29155 }
29156
29157 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8_strided_a) {
29158 TEST_REQUIRES_X86_AVX;
29159 for (uint32_t n = 16; n <= 24; n += 8) {
29160 for (size_t k = 1; k <= 5; k += 2) {
29161 GemmMicrokernelTester()
29162 .mr(7)
29163 .nr(8)
29164 .kr(1)
29165 .sr(1)
29166 .m(7)
29167 .n(n)
29168 .k(k)
29169 .a_stride(7)
29170 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29171 }
29172 }
29173 }
29174
29175 TEST(F32_GEMM_7X8__AVX_BROADCAST, n_div_8_subtile) {
29176 TEST_REQUIRES_X86_AVX;
29177 for (uint32_t n = 16; n <= 24; n += 8) {
29178 for (size_t k = 1; k <= 5; k += 2) {
29179 for (uint32_t m = 1; m <= 7; m++) {
29180 GemmMicrokernelTester()
29181 .mr(7)
29182 .nr(8)
29183 .kr(1)
29184 .sr(1)
29185 .m(m)
29186 .n(n)
29187 .k(k)
29188 .iterations(1)
29189 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29190 }
29191 }
29192 }
29193 }
29194
29195 TEST(F32_GEMM_7X8__AVX_BROADCAST, strided_cm_subtile) {
29196 TEST_REQUIRES_X86_AVX;
29197 for (size_t k = 1; k <= 5; k += 2) {
29198 for (uint32_t m = 1; m <= 7; m++) {
29199 for (uint32_t n = 1; n <= 8; n++) {
29200 GemmMicrokernelTester()
29201 .mr(7)
29202 .nr(8)
29203 .kr(1)
29204 .sr(1)
29205 .m(m)
29206 .n(n)
29207 .k(k)
29208 .cm_stride(11)
29209 .iterations(1)
29210 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29211 }
29212 }
29213 }
29214 }
29215
29216 TEST(F32_GEMM_7X8__AVX_BROADCAST, qmin) {
29217 TEST_REQUIRES_X86_AVX;
29218 GemmMicrokernelTester()
29219 .mr(7)
29220 .nr(8)
29221 .kr(1)
29222 .sr(1)
29223 .m(7)
29224 .n(8)
29225 .k(1)
29226 .qmin(128)
29227 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29228 }
29229
29230 TEST(F32_GEMM_7X8__AVX_BROADCAST, qmax) {
29231 TEST_REQUIRES_X86_AVX;
29232 GemmMicrokernelTester()
29233 .mr(7)
29234 .nr(8)
29235 .kr(1)
29236 .sr(1)
29237 .m(7)
29238 .n(8)
29239 .k(1)
29240 .qmax(128)
29241 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29242 }
29243
29244 TEST(F32_GEMM_7X8__AVX_BROADCAST, strided_cm) {
29245 TEST_REQUIRES_X86_AVX;
29246 GemmMicrokernelTester()
29247 .mr(7)
29248 .nr(8)
29249 .kr(1)
29250 .sr(1)
29251 .m(7)
29252 .n(8)
29253 .k(1)
29254 .cm_stride(11)
29255 .Test(xnn_f32_gemm_ukernel_7x8__avx_broadcast);
29256 }
29257#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29258
29259
29260#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhaneccfd712019-12-08 16:49:27 -080029261 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1) {
29262 TEST_REQUIRES_X86_AVX;
29263 GemmMicrokernelTester()
29264 .mr(1)
29265 .nr(16)
29266 .kr(1)
29267 .sr(1)
29268 .m(1)
29269 .n(16)
29270 .k(1)
29271 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29272 }
29273
29274 TEST(F32_GEMM_1X16__AVX_BROADCAST, strided_cn) {
29275 TEST_REQUIRES_X86_AVX;
29276 GemmMicrokernelTester()
29277 .mr(1)
29278 .nr(16)
29279 .kr(1)
29280 .sr(1)
29281 .m(1)
29282 .n(16)
29283 .k(1)
29284 .cn_stride(19)
29285 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29286 }
29287
29288 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_strided_a) {
29289 TEST_REQUIRES_X86_AVX;
29290 GemmMicrokernelTester()
29291 .mr(1)
29292 .nr(16)
29293 .kr(1)
29294 .sr(1)
29295 .m(1)
29296 .n(16)
29297 .k(1)
29298 .a_stride(3)
29299 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29300 }
29301
29302 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_subtile) {
29303 TEST_REQUIRES_X86_AVX;
29304 for (uint32_t m = 1; m <= 1; m++) {
29305 for (uint32_t n = 1; n <= 16; n++) {
29306 GemmMicrokernelTester()
29307 .mr(1)
29308 .nr(16)
29309 .kr(1)
29310 .sr(1)
29311 .m(m)
29312 .n(n)
29313 .k(1)
29314 .iterations(1)
29315 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29316 }
29317 }
29318 }
29319
29320 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_m) {
29321 TEST_REQUIRES_X86_AVX;
29322 for (uint32_t m = 1; m <= 1; m++) {
29323 GemmMicrokernelTester()
29324 .mr(1)
29325 .nr(16)
29326 .kr(1)
29327 .sr(1)
29328 .m(m)
29329 .n(16)
29330 .k(1)
29331 .iterations(1)
29332 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29333 }
29334 }
29335
29336 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_eq_1_subtile_n) {
29337 TEST_REQUIRES_X86_AVX;
29338 for (uint32_t n = 1; n <= 16; n++) {
29339 GemmMicrokernelTester()
29340 .mr(1)
29341 .nr(16)
29342 .kr(1)
29343 .sr(1)
29344 .m(1)
29345 .n(n)
29346 .k(1)
29347 .iterations(1)
29348 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29349 }
29350 }
29351
29352 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_gt_1) {
29353 TEST_REQUIRES_X86_AVX;
29354 for (size_t k = 2; k < 10; k++) {
29355 GemmMicrokernelTester()
29356 .mr(1)
29357 .nr(16)
29358 .kr(1)
29359 .sr(1)
29360 .m(1)
29361 .n(16)
29362 .k(k)
29363 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29364 }
29365 }
29366
29367 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_gt_1_strided_a) {
29368 TEST_REQUIRES_X86_AVX;
29369 for (size_t k = 2; k < 10; k++) {
29370 GemmMicrokernelTester()
29371 .mr(1)
29372 .nr(16)
29373 .kr(1)
29374 .sr(1)
29375 .m(1)
29376 .n(16)
29377 .k(k)
29378 .a_stride(11)
29379 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29380 }
29381 }
29382
29383 TEST(F32_GEMM_1X16__AVX_BROADCAST, k_gt_1_subtile) {
29384 TEST_REQUIRES_X86_AVX;
29385 for (size_t k = 2; k < 10; k++) {
29386 for (uint32_t m = 1; m <= 1; m++) {
29387 for (uint32_t n = 1; n <= 16; n++) {
29388 GemmMicrokernelTester()
29389 .mr(1)
29390 .nr(16)
29391 .kr(1)
29392 .sr(1)
29393 .m(m)
29394 .n(n)
29395 .k(k)
29396 .iterations(1)
29397 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29398 }
29399 }
29400 }
29401 }
29402
29403 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16) {
29404 TEST_REQUIRES_X86_AVX;
29405 for (uint32_t n = 17; n < 32; n++) {
29406 for (size_t k = 1; k <= 5; k += 2) {
29407 GemmMicrokernelTester()
29408 .mr(1)
29409 .nr(16)
29410 .kr(1)
29411 .sr(1)
29412 .m(1)
29413 .n(16)
29414 .k(k)
29415 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29416 }
29417 }
29418 }
29419
29420 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16_strided_cn) {
29421 TEST_REQUIRES_X86_AVX;
29422 for (uint32_t n = 17; n < 32; n++) {
29423 for (size_t k = 1; k <= 5; k += 2) {
29424 GemmMicrokernelTester()
29425 .mr(1)
29426 .nr(16)
29427 .kr(1)
29428 .sr(1)
29429 .m(1)
29430 .n(16)
29431 .k(k)
29432 .cn_stride(19)
29433 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29434 }
29435 }
29436 }
29437
29438 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16_strided_a) {
29439 TEST_REQUIRES_X86_AVX;
29440 for (uint32_t n = 17; n < 32; n++) {
29441 for (size_t k = 1; k <= 5; k += 2) {
29442 GemmMicrokernelTester()
29443 .mr(1)
29444 .nr(16)
29445 .kr(1)
29446 .sr(1)
29447 .m(1)
29448 .n(n)
29449 .k(k)
29450 .a_stride(7)
29451 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29452 }
29453 }
29454 }
29455
29456 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_gt_16_subtile) {
29457 TEST_REQUIRES_X86_AVX;
29458 for (uint32_t n = 17; n < 32; n++) {
29459 for (size_t k = 1; k <= 5; k += 2) {
29460 for (uint32_t m = 1; m <= 1; m++) {
29461 GemmMicrokernelTester()
29462 .mr(1)
29463 .nr(16)
29464 .kr(1)
29465 .sr(1)
29466 .m(m)
29467 .n(n)
29468 .k(k)
29469 .iterations(1)
29470 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29471 }
29472 }
29473 }
29474 }
29475
29476 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16) {
29477 TEST_REQUIRES_X86_AVX;
29478 for (uint32_t n = 32; n <= 48; n += 16) {
29479 for (size_t k = 1; k <= 5; k += 2) {
29480 GemmMicrokernelTester()
29481 .mr(1)
29482 .nr(16)
29483 .kr(1)
29484 .sr(1)
29485 .m(1)
29486 .n(16)
29487 .k(k)
29488 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29489 }
29490 }
29491 }
29492
29493 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16_strided_cn) {
29494 TEST_REQUIRES_X86_AVX;
29495 for (uint32_t n = 32; n <= 48; n += 16) {
29496 for (size_t k = 1; k <= 5; k += 2) {
29497 GemmMicrokernelTester()
29498 .mr(1)
29499 .nr(16)
29500 .kr(1)
29501 .sr(1)
29502 .m(1)
29503 .n(n)
29504 .k(k)
29505 .cn_stride(19)
29506 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29507 }
29508 }
29509 }
29510
29511 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16_strided_a) {
29512 TEST_REQUIRES_X86_AVX;
29513 for (uint32_t n = 32; n <= 48; n += 16) {
29514 for (size_t k = 1; k <= 5; k += 2) {
29515 GemmMicrokernelTester()
29516 .mr(1)
29517 .nr(16)
29518 .kr(1)
29519 .sr(1)
29520 .m(1)
29521 .n(n)
29522 .k(k)
29523 .a_stride(7)
29524 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29525 }
29526 }
29527 }
29528
29529 TEST(F32_GEMM_1X16__AVX_BROADCAST, n_div_16_subtile) {
29530 TEST_REQUIRES_X86_AVX;
29531 for (uint32_t n = 32; n <= 48; n += 16) {
29532 for (size_t k = 1; k <= 5; k += 2) {
29533 for (uint32_t m = 1; m <= 1; m++) {
29534 GemmMicrokernelTester()
29535 .mr(1)
29536 .nr(16)
29537 .kr(1)
29538 .sr(1)
29539 .m(m)
29540 .n(n)
29541 .k(k)
29542 .iterations(1)
29543 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29544 }
29545 }
29546 }
29547 }
29548
29549 TEST(F32_GEMM_1X16__AVX_BROADCAST, strided_cm_subtile) {
29550 TEST_REQUIRES_X86_AVX;
29551 for (size_t k = 1; k <= 5; k += 2) {
29552 for (uint32_t m = 1; m <= 1; m++) {
29553 for (uint32_t n = 1; n <= 16; n++) {
29554 GemmMicrokernelTester()
29555 .mr(1)
29556 .nr(16)
29557 .kr(1)
29558 .sr(1)
29559 .m(m)
29560 .n(n)
29561 .k(k)
29562 .cm_stride(19)
29563 .iterations(1)
29564 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29565 }
29566 }
29567 }
29568 }
29569
29570 TEST(F32_GEMM_1X16__AVX_BROADCAST, qmin) {
29571 TEST_REQUIRES_X86_AVX;
29572 GemmMicrokernelTester()
29573 .mr(1)
29574 .nr(16)
29575 .kr(1)
29576 .sr(1)
29577 .m(1)
29578 .n(16)
29579 .k(1)
29580 .qmin(128)
29581 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29582 }
29583
29584 TEST(F32_GEMM_1X16__AVX_BROADCAST, qmax) {
29585 TEST_REQUIRES_X86_AVX;
29586 GemmMicrokernelTester()
29587 .mr(1)
29588 .nr(16)
29589 .kr(1)
29590 .sr(1)
29591 .m(1)
29592 .n(16)
29593 .k(1)
29594 .qmax(128)
29595 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29596 }
29597
29598 TEST(F32_GEMM_1X16__AVX_BROADCAST, strided_cm) {
29599 TEST_REQUIRES_X86_AVX;
29600 GemmMicrokernelTester()
29601 .mr(1)
29602 .nr(16)
29603 .kr(1)
29604 .sr(1)
29605 .m(1)
29606 .n(16)
29607 .k(1)
29608 .cm_stride(19)
29609 .Test(xnn_f32_gemm_ukernel_1x16__avx_broadcast);
29610 }
29611#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29612
29613
29614#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29615 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1) {
29616 TEST_REQUIRES_X86_AVX;
29617 GemmMicrokernelTester()
29618 .mr(3)
29619 .nr(16)
29620 .kr(1)
29621 .sr(1)
29622 .m(3)
29623 .n(16)
29624 .k(1)
29625 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29626 }
29627
29628 TEST(F32_GEMM_3X16__AVX_BROADCAST, strided_cn) {
29629 TEST_REQUIRES_X86_AVX;
29630 GemmMicrokernelTester()
29631 .mr(3)
29632 .nr(16)
29633 .kr(1)
29634 .sr(1)
29635 .m(3)
29636 .n(16)
29637 .k(1)
29638 .cn_stride(19)
29639 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29640 }
29641
29642 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_strided_a) {
29643 TEST_REQUIRES_X86_AVX;
29644 GemmMicrokernelTester()
29645 .mr(3)
29646 .nr(16)
29647 .kr(1)
29648 .sr(1)
29649 .m(3)
29650 .n(16)
29651 .k(1)
29652 .a_stride(3)
29653 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29654 }
29655
29656 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_subtile) {
29657 TEST_REQUIRES_X86_AVX;
29658 for (uint32_t m = 1; m <= 3; m++) {
29659 for (uint32_t n = 1; n <= 16; n++) {
29660 GemmMicrokernelTester()
29661 .mr(3)
29662 .nr(16)
29663 .kr(1)
29664 .sr(1)
29665 .m(m)
29666 .n(n)
29667 .k(1)
29668 .iterations(1)
29669 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29670 }
29671 }
29672 }
29673
29674 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_m) {
29675 TEST_REQUIRES_X86_AVX;
29676 for (uint32_t m = 1; m <= 3; m++) {
29677 GemmMicrokernelTester()
29678 .mr(3)
29679 .nr(16)
29680 .kr(1)
29681 .sr(1)
29682 .m(m)
29683 .n(16)
29684 .k(1)
29685 .iterations(1)
29686 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29687 }
29688 }
29689
29690 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_eq_1_subtile_n) {
29691 TEST_REQUIRES_X86_AVX;
29692 for (uint32_t n = 1; n <= 16; n++) {
29693 GemmMicrokernelTester()
29694 .mr(3)
29695 .nr(16)
29696 .kr(1)
29697 .sr(1)
29698 .m(3)
29699 .n(n)
29700 .k(1)
29701 .iterations(1)
29702 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29703 }
29704 }
29705
29706 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_gt_1) {
29707 TEST_REQUIRES_X86_AVX;
29708 for (size_t k = 2; k < 10; k++) {
29709 GemmMicrokernelTester()
29710 .mr(3)
29711 .nr(16)
29712 .kr(1)
29713 .sr(1)
29714 .m(3)
29715 .n(16)
29716 .k(k)
29717 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29718 }
29719 }
29720
29721 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_gt_1_strided_a) {
29722 TEST_REQUIRES_X86_AVX;
29723 for (size_t k = 2; k < 10; k++) {
29724 GemmMicrokernelTester()
29725 .mr(3)
29726 .nr(16)
29727 .kr(1)
29728 .sr(1)
29729 .m(3)
29730 .n(16)
29731 .k(k)
29732 .a_stride(11)
29733 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29734 }
29735 }
29736
29737 TEST(F32_GEMM_3X16__AVX_BROADCAST, k_gt_1_subtile) {
29738 TEST_REQUIRES_X86_AVX;
29739 for (size_t k = 2; k < 10; k++) {
29740 for (uint32_t m = 1; m <= 3; m++) {
29741 for (uint32_t n = 1; n <= 16; n++) {
29742 GemmMicrokernelTester()
29743 .mr(3)
29744 .nr(16)
29745 .kr(1)
29746 .sr(1)
29747 .m(m)
29748 .n(n)
29749 .k(k)
29750 .iterations(1)
29751 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29752 }
29753 }
29754 }
29755 }
29756
29757 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16) {
29758 TEST_REQUIRES_X86_AVX;
29759 for (uint32_t n = 17; n < 32; n++) {
29760 for (size_t k = 1; k <= 5; k += 2) {
29761 GemmMicrokernelTester()
29762 .mr(3)
29763 .nr(16)
29764 .kr(1)
29765 .sr(1)
29766 .m(3)
29767 .n(16)
29768 .k(k)
29769 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29770 }
29771 }
29772 }
29773
29774 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16_strided_cn) {
29775 TEST_REQUIRES_X86_AVX;
29776 for (uint32_t n = 17; n < 32; n++) {
29777 for (size_t k = 1; k <= 5; k += 2) {
29778 GemmMicrokernelTester()
29779 .mr(3)
29780 .nr(16)
29781 .kr(1)
29782 .sr(1)
29783 .m(3)
29784 .n(16)
29785 .k(k)
29786 .cn_stride(19)
29787 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29788 }
29789 }
29790 }
29791
29792 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16_strided_a) {
29793 TEST_REQUIRES_X86_AVX;
29794 for (uint32_t n = 17; n < 32; n++) {
29795 for (size_t k = 1; k <= 5; k += 2) {
29796 GemmMicrokernelTester()
29797 .mr(3)
29798 .nr(16)
29799 .kr(1)
29800 .sr(1)
29801 .m(3)
29802 .n(n)
29803 .k(k)
29804 .a_stride(7)
29805 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29806 }
29807 }
29808 }
29809
29810 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_gt_16_subtile) {
29811 TEST_REQUIRES_X86_AVX;
29812 for (uint32_t n = 17; n < 32; n++) {
29813 for (size_t k = 1; k <= 5; k += 2) {
29814 for (uint32_t m = 1; m <= 3; m++) {
29815 GemmMicrokernelTester()
29816 .mr(3)
29817 .nr(16)
29818 .kr(1)
29819 .sr(1)
29820 .m(m)
29821 .n(n)
29822 .k(k)
29823 .iterations(1)
29824 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29825 }
29826 }
29827 }
29828 }
29829
29830 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16) {
29831 TEST_REQUIRES_X86_AVX;
29832 for (uint32_t n = 32; n <= 48; n += 16) {
29833 for (size_t k = 1; k <= 5; k += 2) {
29834 GemmMicrokernelTester()
29835 .mr(3)
29836 .nr(16)
29837 .kr(1)
29838 .sr(1)
29839 .m(3)
29840 .n(16)
29841 .k(k)
29842 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29843 }
29844 }
29845 }
29846
29847 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16_strided_cn) {
29848 TEST_REQUIRES_X86_AVX;
29849 for (uint32_t n = 32; n <= 48; n += 16) {
29850 for (size_t k = 1; k <= 5; k += 2) {
29851 GemmMicrokernelTester()
29852 .mr(3)
29853 .nr(16)
29854 .kr(1)
29855 .sr(1)
29856 .m(3)
29857 .n(n)
29858 .k(k)
29859 .cn_stride(19)
29860 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29861 }
29862 }
29863 }
29864
29865 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16_strided_a) {
29866 TEST_REQUIRES_X86_AVX;
29867 for (uint32_t n = 32; n <= 48; n += 16) {
29868 for (size_t k = 1; k <= 5; k += 2) {
29869 GemmMicrokernelTester()
29870 .mr(3)
29871 .nr(16)
29872 .kr(1)
29873 .sr(1)
29874 .m(3)
29875 .n(n)
29876 .k(k)
29877 .a_stride(7)
29878 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29879 }
29880 }
29881 }
29882
29883 TEST(F32_GEMM_3X16__AVX_BROADCAST, n_div_16_subtile) {
29884 TEST_REQUIRES_X86_AVX;
29885 for (uint32_t n = 32; n <= 48; n += 16) {
29886 for (size_t k = 1; k <= 5; k += 2) {
29887 for (uint32_t m = 1; m <= 3; m++) {
29888 GemmMicrokernelTester()
29889 .mr(3)
29890 .nr(16)
29891 .kr(1)
29892 .sr(1)
29893 .m(m)
29894 .n(n)
29895 .k(k)
29896 .iterations(1)
29897 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29898 }
29899 }
29900 }
29901 }
29902
29903 TEST(F32_GEMM_3X16__AVX_BROADCAST, strided_cm_subtile) {
29904 TEST_REQUIRES_X86_AVX;
29905 for (size_t k = 1; k <= 5; k += 2) {
29906 for (uint32_t m = 1; m <= 3; m++) {
29907 for (uint32_t n = 1; n <= 16; n++) {
29908 GemmMicrokernelTester()
29909 .mr(3)
29910 .nr(16)
29911 .kr(1)
29912 .sr(1)
29913 .m(m)
29914 .n(n)
29915 .k(k)
29916 .cm_stride(19)
29917 .iterations(1)
29918 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29919 }
29920 }
29921 }
29922 }
29923
29924 TEST(F32_GEMM_3X16__AVX_BROADCAST, qmin) {
29925 TEST_REQUIRES_X86_AVX;
29926 GemmMicrokernelTester()
29927 .mr(3)
29928 .nr(16)
29929 .kr(1)
29930 .sr(1)
29931 .m(3)
29932 .n(16)
29933 .k(1)
29934 .qmin(128)
29935 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29936 }
29937
29938 TEST(F32_GEMM_3X16__AVX_BROADCAST, qmax) {
29939 TEST_REQUIRES_X86_AVX;
29940 GemmMicrokernelTester()
29941 .mr(3)
29942 .nr(16)
29943 .kr(1)
29944 .sr(1)
29945 .m(3)
29946 .n(16)
29947 .k(1)
29948 .qmax(128)
29949 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29950 }
29951
29952 TEST(F32_GEMM_3X16__AVX_BROADCAST, strided_cm) {
29953 TEST_REQUIRES_X86_AVX;
29954 GemmMicrokernelTester()
29955 .mr(3)
29956 .nr(16)
29957 .kr(1)
29958 .sr(1)
29959 .m(3)
29960 .n(16)
29961 .k(1)
29962 .cm_stride(19)
29963 .Test(xnn_f32_gemm_ukernel_3x16__avx_broadcast);
29964 }
29965#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
29966
29967
29968#if XNN_ARCH_X86 || XNN_ARCH_X86_64
29969 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1) {
29970 TEST_REQUIRES_X86_AVX;
29971 GemmMicrokernelTester()
29972 .mr(4)
29973 .nr(16)
29974 .kr(1)
29975 .sr(1)
29976 .m(4)
29977 .n(16)
29978 .k(1)
29979 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
29980 }
29981
29982 TEST(F32_GEMM_4X16__AVX_BROADCAST, strided_cn) {
29983 TEST_REQUIRES_X86_AVX;
29984 GemmMicrokernelTester()
29985 .mr(4)
29986 .nr(16)
29987 .kr(1)
29988 .sr(1)
29989 .m(4)
29990 .n(16)
29991 .k(1)
29992 .cn_stride(19)
29993 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
29994 }
29995
29996 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_strided_a) {
29997 TEST_REQUIRES_X86_AVX;
29998 GemmMicrokernelTester()
29999 .mr(4)
30000 .nr(16)
30001 .kr(1)
30002 .sr(1)
30003 .m(4)
30004 .n(16)
30005 .k(1)
30006 .a_stride(3)
30007 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30008 }
30009
30010 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_subtile) {
30011 TEST_REQUIRES_X86_AVX;
30012 for (uint32_t m = 1; m <= 4; m++) {
30013 for (uint32_t n = 1; n <= 16; n++) {
30014 GemmMicrokernelTester()
30015 .mr(4)
30016 .nr(16)
30017 .kr(1)
30018 .sr(1)
30019 .m(m)
30020 .n(n)
30021 .k(1)
30022 .iterations(1)
30023 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30024 }
30025 }
30026 }
30027
30028 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_m) {
30029 TEST_REQUIRES_X86_AVX;
30030 for (uint32_t m = 1; m <= 4; m++) {
30031 GemmMicrokernelTester()
30032 .mr(4)
30033 .nr(16)
30034 .kr(1)
30035 .sr(1)
30036 .m(m)
30037 .n(16)
30038 .k(1)
30039 .iterations(1)
30040 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30041 }
30042 }
30043
30044 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_eq_1_subtile_n) {
30045 TEST_REQUIRES_X86_AVX;
30046 for (uint32_t n = 1; n <= 16; n++) {
30047 GemmMicrokernelTester()
30048 .mr(4)
30049 .nr(16)
30050 .kr(1)
30051 .sr(1)
30052 .m(4)
30053 .n(n)
30054 .k(1)
30055 .iterations(1)
30056 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30057 }
30058 }
30059
30060 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_gt_1) {
30061 TEST_REQUIRES_X86_AVX;
30062 for (size_t k = 2; k < 10; k++) {
30063 GemmMicrokernelTester()
30064 .mr(4)
30065 .nr(16)
30066 .kr(1)
30067 .sr(1)
30068 .m(4)
30069 .n(16)
30070 .k(k)
30071 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30072 }
30073 }
30074
30075 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_gt_1_strided_a) {
30076 TEST_REQUIRES_X86_AVX;
30077 for (size_t k = 2; k < 10; k++) {
30078 GemmMicrokernelTester()
30079 .mr(4)
30080 .nr(16)
30081 .kr(1)
30082 .sr(1)
30083 .m(4)
30084 .n(16)
30085 .k(k)
30086 .a_stride(11)
30087 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30088 }
30089 }
30090
30091 TEST(F32_GEMM_4X16__AVX_BROADCAST, k_gt_1_subtile) {
30092 TEST_REQUIRES_X86_AVX;
30093 for (size_t k = 2; k < 10; k++) {
30094 for (uint32_t m = 1; m <= 4; m++) {
30095 for (uint32_t n = 1; n <= 16; n++) {
30096 GemmMicrokernelTester()
30097 .mr(4)
30098 .nr(16)
30099 .kr(1)
30100 .sr(1)
30101 .m(m)
30102 .n(n)
30103 .k(k)
30104 .iterations(1)
30105 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30106 }
30107 }
30108 }
30109 }
30110
30111 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16) {
30112 TEST_REQUIRES_X86_AVX;
30113 for (uint32_t n = 17; n < 32; n++) {
30114 for (size_t k = 1; k <= 5; k += 2) {
30115 GemmMicrokernelTester()
30116 .mr(4)
30117 .nr(16)
30118 .kr(1)
30119 .sr(1)
30120 .m(4)
30121 .n(16)
30122 .k(k)
30123 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30124 }
30125 }
30126 }
30127
30128 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16_strided_cn) {
30129 TEST_REQUIRES_X86_AVX;
30130 for (uint32_t n = 17; n < 32; n++) {
30131 for (size_t k = 1; k <= 5; k += 2) {
30132 GemmMicrokernelTester()
30133 .mr(4)
30134 .nr(16)
30135 .kr(1)
30136 .sr(1)
30137 .m(4)
30138 .n(16)
30139 .k(k)
30140 .cn_stride(19)
30141 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30142 }
30143 }
30144 }
30145
30146 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16_strided_a) {
30147 TEST_REQUIRES_X86_AVX;
30148 for (uint32_t n = 17; n < 32; n++) {
30149 for (size_t k = 1; k <= 5; k += 2) {
30150 GemmMicrokernelTester()
30151 .mr(4)
30152 .nr(16)
30153 .kr(1)
30154 .sr(1)
30155 .m(4)
30156 .n(n)
30157 .k(k)
30158 .a_stride(7)
30159 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30160 }
30161 }
30162 }
30163
30164 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_gt_16_subtile) {
30165 TEST_REQUIRES_X86_AVX;
30166 for (uint32_t n = 17; n < 32; n++) {
30167 for (size_t k = 1; k <= 5; k += 2) {
30168 for (uint32_t m = 1; m <= 4; m++) {
30169 GemmMicrokernelTester()
30170 .mr(4)
30171 .nr(16)
30172 .kr(1)
30173 .sr(1)
30174 .m(m)
30175 .n(n)
30176 .k(k)
30177 .iterations(1)
30178 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30179 }
30180 }
30181 }
30182 }
30183
30184 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16) {
30185 TEST_REQUIRES_X86_AVX;
30186 for (uint32_t n = 32; n <= 48; n += 16) {
30187 for (size_t k = 1; k <= 5; k += 2) {
30188 GemmMicrokernelTester()
30189 .mr(4)
30190 .nr(16)
30191 .kr(1)
30192 .sr(1)
30193 .m(4)
30194 .n(16)
30195 .k(k)
30196 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30197 }
30198 }
30199 }
30200
30201 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16_strided_cn) {
30202 TEST_REQUIRES_X86_AVX;
30203 for (uint32_t n = 32; n <= 48; n += 16) {
30204 for (size_t k = 1; k <= 5; k += 2) {
30205 GemmMicrokernelTester()
30206 .mr(4)
30207 .nr(16)
30208 .kr(1)
30209 .sr(1)
30210 .m(4)
30211 .n(n)
30212 .k(k)
30213 .cn_stride(19)
30214 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30215 }
30216 }
30217 }
30218
30219 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16_strided_a) {
30220 TEST_REQUIRES_X86_AVX;
30221 for (uint32_t n = 32; n <= 48; n += 16) {
30222 for (size_t k = 1; k <= 5; k += 2) {
30223 GemmMicrokernelTester()
30224 .mr(4)
30225 .nr(16)
30226 .kr(1)
30227 .sr(1)
30228 .m(4)
30229 .n(n)
30230 .k(k)
30231 .a_stride(7)
30232 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30233 }
30234 }
30235 }
30236
30237 TEST(F32_GEMM_4X16__AVX_BROADCAST, n_div_16_subtile) {
30238 TEST_REQUIRES_X86_AVX;
30239 for (uint32_t n = 32; n <= 48; n += 16) {
30240 for (size_t k = 1; k <= 5; k += 2) {
30241 for (uint32_t m = 1; m <= 4; m++) {
30242 GemmMicrokernelTester()
30243 .mr(4)
30244 .nr(16)
30245 .kr(1)
30246 .sr(1)
30247 .m(m)
30248 .n(n)
30249 .k(k)
30250 .iterations(1)
30251 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30252 }
30253 }
30254 }
30255 }
30256
30257 TEST(F32_GEMM_4X16__AVX_BROADCAST, strided_cm_subtile) {
30258 TEST_REQUIRES_X86_AVX;
30259 for (size_t k = 1; k <= 5; k += 2) {
30260 for (uint32_t m = 1; m <= 4; m++) {
30261 for (uint32_t n = 1; n <= 16; n++) {
30262 GemmMicrokernelTester()
30263 .mr(4)
30264 .nr(16)
30265 .kr(1)
30266 .sr(1)
30267 .m(m)
30268 .n(n)
30269 .k(k)
30270 .cm_stride(19)
30271 .iterations(1)
30272 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30273 }
30274 }
30275 }
30276 }
30277
30278 TEST(F32_GEMM_4X16__AVX_BROADCAST, qmin) {
30279 TEST_REQUIRES_X86_AVX;
30280 GemmMicrokernelTester()
30281 .mr(4)
30282 .nr(16)
30283 .kr(1)
30284 .sr(1)
30285 .m(4)
30286 .n(16)
30287 .k(1)
30288 .qmin(128)
30289 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30290 }
30291
30292 TEST(F32_GEMM_4X16__AVX_BROADCAST, qmax) {
30293 TEST_REQUIRES_X86_AVX;
30294 GemmMicrokernelTester()
30295 .mr(4)
30296 .nr(16)
30297 .kr(1)
30298 .sr(1)
30299 .m(4)
30300 .n(16)
30301 .k(1)
30302 .qmax(128)
30303 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30304 }
30305
30306 TEST(F32_GEMM_4X16__AVX_BROADCAST, strided_cm) {
30307 TEST_REQUIRES_X86_AVX;
30308 GemmMicrokernelTester()
30309 .mr(4)
30310 .nr(16)
30311 .kr(1)
30312 .sr(1)
30313 .m(4)
30314 .n(16)
30315 .k(1)
30316 .cm_stride(19)
30317 .Test(xnn_f32_gemm_ukernel_4x16__avx_broadcast);
30318 }
30319#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30320
30321
30322#if XNN_ARCH_X86 || XNN_ARCH_X86_64
30323 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1) {
30324 TEST_REQUIRES_X86_AVX;
30325 GemmMicrokernelTester()
30326 .mr(5)
30327 .nr(16)
30328 .kr(1)
30329 .sr(1)
30330 .m(5)
30331 .n(16)
30332 .k(1)
30333 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30334 }
30335
30336 TEST(F32_GEMM_5X16__AVX_BROADCAST, strided_cn) {
30337 TEST_REQUIRES_X86_AVX;
30338 GemmMicrokernelTester()
30339 .mr(5)
30340 .nr(16)
30341 .kr(1)
30342 .sr(1)
30343 .m(5)
30344 .n(16)
30345 .k(1)
30346 .cn_stride(19)
30347 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30348 }
30349
30350 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_strided_a) {
30351 TEST_REQUIRES_X86_AVX;
30352 GemmMicrokernelTester()
30353 .mr(5)
30354 .nr(16)
30355 .kr(1)
30356 .sr(1)
30357 .m(5)
30358 .n(16)
30359 .k(1)
30360 .a_stride(3)
30361 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30362 }
30363
30364 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_subtile) {
30365 TEST_REQUIRES_X86_AVX;
30366 for (uint32_t m = 1; m <= 5; m++) {
30367 for (uint32_t n = 1; n <= 16; n++) {
30368 GemmMicrokernelTester()
30369 .mr(5)
30370 .nr(16)
30371 .kr(1)
30372 .sr(1)
30373 .m(m)
30374 .n(n)
30375 .k(1)
30376 .iterations(1)
30377 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30378 }
30379 }
30380 }
30381
30382 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_m) {
30383 TEST_REQUIRES_X86_AVX;
30384 for (uint32_t m = 1; m <= 5; m++) {
30385 GemmMicrokernelTester()
30386 .mr(5)
30387 .nr(16)
30388 .kr(1)
30389 .sr(1)
30390 .m(m)
30391 .n(16)
30392 .k(1)
30393 .iterations(1)
30394 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30395 }
30396 }
30397
30398 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_eq_1_subtile_n) {
30399 TEST_REQUIRES_X86_AVX;
30400 for (uint32_t n = 1; n <= 16; n++) {
30401 GemmMicrokernelTester()
30402 .mr(5)
30403 .nr(16)
30404 .kr(1)
30405 .sr(1)
30406 .m(5)
30407 .n(n)
30408 .k(1)
30409 .iterations(1)
30410 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30411 }
30412 }
30413
30414 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_gt_1) {
30415 TEST_REQUIRES_X86_AVX;
30416 for (size_t k = 2; k < 10; k++) {
30417 GemmMicrokernelTester()
30418 .mr(5)
30419 .nr(16)
30420 .kr(1)
30421 .sr(1)
30422 .m(5)
30423 .n(16)
30424 .k(k)
30425 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30426 }
30427 }
30428
30429 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_gt_1_strided_a) {
30430 TEST_REQUIRES_X86_AVX;
30431 for (size_t k = 2; k < 10; k++) {
30432 GemmMicrokernelTester()
30433 .mr(5)
30434 .nr(16)
30435 .kr(1)
30436 .sr(1)
30437 .m(5)
30438 .n(16)
30439 .k(k)
30440 .a_stride(11)
30441 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30442 }
30443 }
30444
30445 TEST(F32_GEMM_5X16__AVX_BROADCAST, k_gt_1_subtile) {
30446 TEST_REQUIRES_X86_AVX;
30447 for (size_t k = 2; k < 10; k++) {
30448 for (uint32_t m = 1; m <= 5; m++) {
30449 for (uint32_t n = 1; n <= 16; n++) {
30450 GemmMicrokernelTester()
30451 .mr(5)
30452 .nr(16)
30453 .kr(1)
30454 .sr(1)
30455 .m(m)
30456 .n(n)
30457 .k(k)
30458 .iterations(1)
30459 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30460 }
30461 }
30462 }
30463 }
30464
30465 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16) {
30466 TEST_REQUIRES_X86_AVX;
30467 for (uint32_t n = 17; n < 32; n++) {
30468 for (size_t k = 1; k <= 5; k += 2) {
30469 GemmMicrokernelTester()
30470 .mr(5)
30471 .nr(16)
30472 .kr(1)
30473 .sr(1)
30474 .m(5)
30475 .n(16)
30476 .k(k)
30477 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30478 }
30479 }
30480 }
30481
30482 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16_strided_cn) {
30483 TEST_REQUIRES_X86_AVX;
30484 for (uint32_t n = 17; n < 32; n++) {
30485 for (size_t k = 1; k <= 5; k += 2) {
30486 GemmMicrokernelTester()
30487 .mr(5)
30488 .nr(16)
30489 .kr(1)
30490 .sr(1)
30491 .m(5)
30492 .n(16)
30493 .k(k)
30494 .cn_stride(19)
30495 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30496 }
30497 }
30498 }
30499
30500 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16_strided_a) {
30501 TEST_REQUIRES_X86_AVX;
30502 for (uint32_t n = 17; n < 32; n++) {
30503 for (size_t k = 1; k <= 5; k += 2) {
30504 GemmMicrokernelTester()
30505 .mr(5)
30506 .nr(16)
30507 .kr(1)
30508 .sr(1)
30509 .m(5)
30510 .n(n)
30511 .k(k)
30512 .a_stride(7)
30513 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30514 }
30515 }
30516 }
30517
30518 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_gt_16_subtile) {
30519 TEST_REQUIRES_X86_AVX;
30520 for (uint32_t n = 17; n < 32; n++) {
30521 for (size_t k = 1; k <= 5; k += 2) {
30522 for (uint32_t m = 1; m <= 5; m++) {
30523 GemmMicrokernelTester()
30524 .mr(5)
30525 .nr(16)
30526 .kr(1)
30527 .sr(1)
30528 .m(m)
30529 .n(n)
30530 .k(k)
30531 .iterations(1)
30532 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30533 }
30534 }
30535 }
30536 }
30537
30538 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16) {
30539 TEST_REQUIRES_X86_AVX;
30540 for (uint32_t n = 32; n <= 48; n += 16) {
30541 for (size_t k = 1; k <= 5; k += 2) {
30542 GemmMicrokernelTester()
30543 .mr(5)
30544 .nr(16)
30545 .kr(1)
30546 .sr(1)
30547 .m(5)
30548 .n(16)
30549 .k(k)
30550 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30551 }
30552 }
30553 }
30554
30555 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16_strided_cn) {
30556 TEST_REQUIRES_X86_AVX;
30557 for (uint32_t n = 32; n <= 48; n += 16) {
30558 for (size_t k = 1; k <= 5; k += 2) {
30559 GemmMicrokernelTester()
30560 .mr(5)
30561 .nr(16)
30562 .kr(1)
30563 .sr(1)
30564 .m(5)
30565 .n(n)
30566 .k(k)
30567 .cn_stride(19)
30568 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30569 }
30570 }
30571 }
30572
30573 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16_strided_a) {
30574 TEST_REQUIRES_X86_AVX;
30575 for (uint32_t n = 32; n <= 48; n += 16) {
30576 for (size_t k = 1; k <= 5; k += 2) {
30577 GemmMicrokernelTester()
30578 .mr(5)
30579 .nr(16)
30580 .kr(1)
30581 .sr(1)
30582 .m(5)
30583 .n(n)
30584 .k(k)
30585 .a_stride(7)
30586 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30587 }
30588 }
30589 }
30590
30591 TEST(F32_GEMM_5X16__AVX_BROADCAST, n_div_16_subtile) {
30592 TEST_REQUIRES_X86_AVX;
30593 for (uint32_t n = 32; n <= 48; n += 16) {
30594 for (size_t k = 1; k <= 5; k += 2) {
30595 for (uint32_t m = 1; m <= 5; m++) {
30596 GemmMicrokernelTester()
30597 .mr(5)
30598 .nr(16)
30599 .kr(1)
30600 .sr(1)
30601 .m(m)
30602 .n(n)
30603 .k(k)
30604 .iterations(1)
30605 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30606 }
30607 }
30608 }
30609 }
30610
30611 TEST(F32_GEMM_5X16__AVX_BROADCAST, strided_cm_subtile) {
30612 TEST_REQUIRES_X86_AVX;
30613 for (size_t k = 1; k <= 5; k += 2) {
30614 for (uint32_t m = 1; m <= 5; m++) {
30615 for (uint32_t n = 1; n <= 16; n++) {
30616 GemmMicrokernelTester()
30617 .mr(5)
30618 .nr(16)
30619 .kr(1)
30620 .sr(1)
30621 .m(m)
30622 .n(n)
30623 .k(k)
30624 .cm_stride(19)
30625 .iterations(1)
30626 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30627 }
30628 }
30629 }
30630 }
30631
30632 TEST(F32_GEMM_5X16__AVX_BROADCAST, qmin) {
30633 TEST_REQUIRES_X86_AVX;
30634 GemmMicrokernelTester()
30635 .mr(5)
30636 .nr(16)
30637 .kr(1)
30638 .sr(1)
30639 .m(5)
30640 .n(16)
30641 .k(1)
30642 .qmin(128)
30643 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30644 }
30645
30646 TEST(F32_GEMM_5X16__AVX_BROADCAST, qmax) {
30647 TEST_REQUIRES_X86_AVX;
30648 GemmMicrokernelTester()
30649 .mr(5)
30650 .nr(16)
30651 .kr(1)
30652 .sr(1)
30653 .m(5)
30654 .n(16)
30655 .k(1)
30656 .qmax(128)
30657 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30658 }
30659
30660 TEST(F32_GEMM_5X16__AVX_BROADCAST, strided_cm) {
30661 TEST_REQUIRES_X86_AVX;
30662 GemmMicrokernelTester()
30663 .mr(5)
30664 .nr(16)
30665 .kr(1)
30666 .sr(1)
30667 .m(5)
30668 .n(16)
30669 .k(1)
30670 .cm_stride(19)
30671 .Test(xnn_f32_gemm_ukernel_5x16__avx_broadcast);
30672 }
30673#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
30674
30675
30676#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhanfda12b82019-11-21 12:27:59 -080030677 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1) {
30678 TEST_REQUIRES_X86_FMA3;
30679 GemmMicrokernelTester()
30680 .mr(1)
30681 .nr(8)
30682 .kr(1)
30683 .sr(1)
30684 .m(1)
30685 .n(8)
30686 .k(1)
30687 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30688 }
30689
30690 TEST(F32_GEMM_1X8__FMA3_BROADCAST, strided_cn) {
30691 TEST_REQUIRES_X86_FMA3;
30692 GemmMicrokernelTester()
30693 .mr(1)
30694 .nr(8)
30695 .kr(1)
30696 .sr(1)
30697 .m(1)
30698 .n(8)
30699 .k(1)
30700 .cn_stride(11)
30701 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30702 }
30703
30704 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_strided_a) {
30705 TEST_REQUIRES_X86_FMA3;
30706 GemmMicrokernelTester()
30707 .mr(1)
30708 .nr(8)
30709 .kr(1)
30710 .sr(1)
30711 .m(1)
30712 .n(8)
30713 .k(1)
30714 .a_stride(3)
30715 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30716 }
30717
30718 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile) {
30719 TEST_REQUIRES_X86_FMA3;
30720 for (uint32_t m = 1; m <= 1; m++) {
30721 for (uint32_t n = 1; n <= 8; n++) {
30722 GemmMicrokernelTester()
30723 .mr(1)
30724 .nr(8)
30725 .kr(1)
30726 .sr(1)
30727 .m(m)
30728 .n(n)
30729 .k(1)
30730 .iterations(1)
30731 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30732 }
30733 }
30734 }
30735
30736 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
30737 TEST_REQUIRES_X86_FMA3;
30738 for (uint32_t m = 1; m <= 1; m++) {
30739 GemmMicrokernelTester()
30740 .mr(1)
30741 .nr(8)
30742 .kr(1)
30743 .sr(1)
30744 .m(m)
30745 .n(8)
30746 .k(1)
30747 .iterations(1)
30748 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30749 }
30750 }
30751
30752 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
30753 TEST_REQUIRES_X86_FMA3;
30754 for (uint32_t n = 1; n <= 8; n++) {
30755 GemmMicrokernelTester()
30756 .mr(1)
30757 .nr(8)
30758 .kr(1)
30759 .sr(1)
30760 .m(1)
30761 .n(n)
30762 .k(1)
30763 .iterations(1)
30764 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30765 }
30766 }
30767
30768 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_gt_1) {
30769 TEST_REQUIRES_X86_FMA3;
30770 for (size_t k = 2; k < 10; k++) {
30771 GemmMicrokernelTester()
30772 .mr(1)
30773 .nr(8)
30774 .kr(1)
30775 .sr(1)
30776 .m(1)
30777 .n(8)
30778 .k(k)
30779 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30780 }
30781 }
30782
30783 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_gt_1_strided_a) {
30784 TEST_REQUIRES_X86_FMA3;
30785 for (size_t k = 2; k < 10; k++) {
30786 GemmMicrokernelTester()
30787 .mr(1)
30788 .nr(8)
30789 .kr(1)
30790 .sr(1)
30791 .m(1)
30792 .n(8)
30793 .k(k)
30794 .a_stride(11)
30795 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30796 }
30797 }
30798
30799 TEST(F32_GEMM_1X8__FMA3_BROADCAST, k_gt_1_subtile) {
30800 TEST_REQUIRES_X86_FMA3;
30801 for (size_t k = 2; k < 10; k++) {
30802 for (uint32_t m = 1; m <= 1; m++) {
30803 for (uint32_t n = 1; n <= 8; n++) {
30804 GemmMicrokernelTester()
30805 .mr(1)
30806 .nr(8)
30807 .kr(1)
30808 .sr(1)
30809 .m(m)
30810 .n(n)
30811 .k(k)
30812 .iterations(1)
30813 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30814 }
30815 }
30816 }
30817 }
30818
30819 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8) {
30820 TEST_REQUIRES_X86_FMA3;
30821 for (uint32_t n = 9; n < 16; n++) {
30822 for (size_t k = 1; k <= 5; k += 2) {
30823 GemmMicrokernelTester()
30824 .mr(1)
30825 .nr(8)
30826 .kr(1)
30827 .sr(1)
30828 .m(1)
30829 .n(8)
30830 .k(k)
30831 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30832 }
30833 }
30834 }
30835
30836 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
30837 TEST_REQUIRES_X86_FMA3;
30838 for (uint32_t n = 9; n < 16; n++) {
30839 for (size_t k = 1; k <= 5; k += 2) {
30840 GemmMicrokernelTester()
30841 .mr(1)
30842 .nr(8)
30843 .kr(1)
30844 .sr(1)
30845 .m(1)
30846 .n(8)
30847 .k(k)
30848 .cn_stride(11)
30849 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30850 }
30851 }
30852 }
30853
30854 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8_strided_a) {
30855 TEST_REQUIRES_X86_FMA3;
30856 for (uint32_t n = 9; n < 16; n++) {
30857 for (size_t k = 1; k <= 5; k += 2) {
30858 GemmMicrokernelTester()
30859 .mr(1)
30860 .nr(8)
30861 .kr(1)
30862 .sr(1)
30863 .m(1)
30864 .n(n)
30865 .k(k)
30866 .a_stride(7)
30867 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30868 }
30869 }
30870 }
30871
30872 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_gt_8_subtile) {
30873 TEST_REQUIRES_X86_FMA3;
30874 for (uint32_t n = 9; n < 16; n++) {
30875 for (size_t k = 1; k <= 5; k += 2) {
30876 for (uint32_t m = 1; m <= 1; m++) {
30877 GemmMicrokernelTester()
30878 .mr(1)
30879 .nr(8)
30880 .kr(1)
30881 .sr(1)
30882 .m(m)
30883 .n(n)
30884 .k(k)
30885 .iterations(1)
30886 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30887 }
30888 }
30889 }
30890 }
30891
30892 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8) {
30893 TEST_REQUIRES_X86_FMA3;
30894 for (uint32_t n = 16; n <= 24; n += 8) {
30895 for (size_t k = 1; k <= 5; k += 2) {
30896 GemmMicrokernelTester()
30897 .mr(1)
30898 .nr(8)
30899 .kr(1)
30900 .sr(1)
30901 .m(1)
30902 .n(8)
30903 .k(k)
30904 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30905 }
30906 }
30907 }
30908
30909 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8_strided_cn) {
30910 TEST_REQUIRES_X86_FMA3;
30911 for (uint32_t n = 16; n <= 24; n += 8) {
30912 for (size_t k = 1; k <= 5; k += 2) {
30913 GemmMicrokernelTester()
30914 .mr(1)
30915 .nr(8)
30916 .kr(1)
30917 .sr(1)
30918 .m(1)
30919 .n(n)
30920 .k(k)
30921 .cn_stride(11)
30922 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30923 }
30924 }
30925 }
30926
30927 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8_strided_a) {
30928 TEST_REQUIRES_X86_FMA3;
30929 for (uint32_t n = 16; n <= 24; n += 8) {
30930 for (size_t k = 1; k <= 5; k += 2) {
30931 GemmMicrokernelTester()
30932 .mr(1)
30933 .nr(8)
30934 .kr(1)
30935 .sr(1)
30936 .m(1)
30937 .n(n)
30938 .k(k)
30939 .a_stride(7)
30940 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30941 }
30942 }
30943 }
30944
30945 TEST(F32_GEMM_1X8__FMA3_BROADCAST, n_div_8_subtile) {
30946 TEST_REQUIRES_X86_FMA3;
30947 for (uint32_t n = 16; n <= 24; n += 8) {
30948 for (size_t k = 1; k <= 5; k += 2) {
30949 for (uint32_t m = 1; m <= 1; m++) {
30950 GemmMicrokernelTester()
30951 .mr(1)
30952 .nr(8)
30953 .kr(1)
30954 .sr(1)
30955 .m(m)
30956 .n(n)
30957 .k(k)
30958 .iterations(1)
30959 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30960 }
30961 }
30962 }
30963 }
30964
30965 TEST(F32_GEMM_1X8__FMA3_BROADCAST, strided_cm_subtile) {
30966 TEST_REQUIRES_X86_FMA3;
30967 for (size_t k = 1; k <= 5; k += 2) {
30968 for (uint32_t m = 1; m <= 1; m++) {
30969 for (uint32_t n = 1; n <= 8; n++) {
30970 GemmMicrokernelTester()
30971 .mr(1)
30972 .nr(8)
30973 .kr(1)
30974 .sr(1)
30975 .m(m)
30976 .n(n)
30977 .k(k)
30978 .cm_stride(11)
30979 .iterations(1)
30980 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30981 }
30982 }
30983 }
30984 }
30985
30986 TEST(F32_GEMM_1X8__FMA3_BROADCAST, qmin) {
30987 TEST_REQUIRES_X86_FMA3;
30988 GemmMicrokernelTester()
30989 .mr(1)
30990 .nr(8)
30991 .kr(1)
30992 .sr(1)
30993 .m(1)
30994 .n(8)
30995 .k(1)
30996 .qmin(128)
30997 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
30998 }
30999
31000 TEST(F32_GEMM_1X8__FMA3_BROADCAST, qmax) {
31001 TEST_REQUIRES_X86_FMA3;
31002 GemmMicrokernelTester()
31003 .mr(1)
31004 .nr(8)
31005 .kr(1)
31006 .sr(1)
31007 .m(1)
31008 .n(8)
31009 .k(1)
31010 .qmax(128)
31011 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
31012 }
31013
31014 TEST(F32_GEMM_1X8__FMA3_BROADCAST, strided_cm) {
31015 TEST_REQUIRES_X86_FMA3;
31016 GemmMicrokernelTester()
31017 .mr(1)
31018 .nr(8)
31019 .kr(1)
31020 .sr(1)
31021 .m(1)
31022 .n(8)
31023 .k(1)
31024 .cm_stride(11)
31025 .Test(xnn_f32_gemm_ukernel_1x8__fma3_broadcast);
31026 }
31027#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31028
31029
31030#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31031 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1) {
31032 TEST_REQUIRES_X86_FMA3;
31033 GemmMicrokernelTester()
31034 .mr(4)
31035 .nr(8)
31036 .kr(1)
31037 .sr(1)
31038 .m(4)
31039 .n(8)
31040 .k(1)
31041 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31042 }
31043
31044 TEST(F32_GEMM_4X8__FMA3_BROADCAST, strided_cn) {
31045 TEST_REQUIRES_X86_FMA3;
31046 GemmMicrokernelTester()
31047 .mr(4)
31048 .nr(8)
31049 .kr(1)
31050 .sr(1)
31051 .m(4)
31052 .n(8)
31053 .k(1)
31054 .cn_stride(11)
31055 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31056 }
31057
31058 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_strided_a) {
31059 TEST_REQUIRES_X86_FMA3;
31060 GemmMicrokernelTester()
31061 .mr(4)
31062 .nr(8)
31063 .kr(1)
31064 .sr(1)
31065 .m(4)
31066 .n(8)
31067 .k(1)
31068 .a_stride(3)
31069 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31070 }
31071
31072 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile) {
31073 TEST_REQUIRES_X86_FMA3;
31074 for (uint32_t m = 1; m <= 4; m++) {
31075 for (uint32_t n = 1; n <= 8; n++) {
31076 GemmMicrokernelTester()
31077 .mr(4)
31078 .nr(8)
31079 .kr(1)
31080 .sr(1)
31081 .m(m)
31082 .n(n)
31083 .k(1)
31084 .iterations(1)
31085 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31086 }
31087 }
31088 }
31089
31090 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31091 TEST_REQUIRES_X86_FMA3;
31092 for (uint32_t m = 1; m <= 4; m++) {
31093 GemmMicrokernelTester()
31094 .mr(4)
31095 .nr(8)
31096 .kr(1)
31097 .sr(1)
31098 .m(m)
31099 .n(8)
31100 .k(1)
31101 .iterations(1)
31102 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31103 }
31104 }
31105
31106 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31107 TEST_REQUIRES_X86_FMA3;
31108 for (uint32_t n = 1; n <= 8; n++) {
31109 GemmMicrokernelTester()
31110 .mr(4)
31111 .nr(8)
31112 .kr(1)
31113 .sr(1)
31114 .m(4)
31115 .n(n)
31116 .k(1)
31117 .iterations(1)
31118 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31119 }
31120 }
31121
31122 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_gt_1) {
31123 TEST_REQUIRES_X86_FMA3;
31124 for (size_t k = 2; k < 10; k++) {
31125 GemmMicrokernelTester()
31126 .mr(4)
31127 .nr(8)
31128 .kr(1)
31129 .sr(1)
31130 .m(4)
31131 .n(8)
31132 .k(k)
31133 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31134 }
31135 }
31136
31137 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_gt_1_strided_a) {
31138 TEST_REQUIRES_X86_FMA3;
31139 for (size_t k = 2; k < 10; k++) {
31140 GemmMicrokernelTester()
31141 .mr(4)
31142 .nr(8)
31143 .kr(1)
31144 .sr(1)
31145 .m(4)
31146 .n(8)
31147 .k(k)
31148 .a_stride(11)
31149 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31150 }
31151 }
31152
31153 TEST(F32_GEMM_4X8__FMA3_BROADCAST, k_gt_1_subtile) {
31154 TEST_REQUIRES_X86_FMA3;
31155 for (size_t k = 2; k < 10; k++) {
31156 for (uint32_t m = 1; m <= 4; m++) {
31157 for (uint32_t n = 1; n <= 8; n++) {
31158 GemmMicrokernelTester()
31159 .mr(4)
31160 .nr(8)
31161 .kr(1)
31162 .sr(1)
31163 .m(m)
31164 .n(n)
31165 .k(k)
31166 .iterations(1)
31167 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31168 }
31169 }
31170 }
31171 }
31172
31173 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8) {
31174 TEST_REQUIRES_X86_FMA3;
31175 for (uint32_t n = 9; n < 16; n++) {
31176 for (size_t k = 1; k <= 5; k += 2) {
31177 GemmMicrokernelTester()
31178 .mr(4)
31179 .nr(8)
31180 .kr(1)
31181 .sr(1)
31182 .m(4)
31183 .n(8)
31184 .k(k)
31185 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31186 }
31187 }
31188 }
31189
31190 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31191 TEST_REQUIRES_X86_FMA3;
31192 for (uint32_t n = 9; n < 16; n++) {
31193 for (size_t k = 1; k <= 5; k += 2) {
31194 GemmMicrokernelTester()
31195 .mr(4)
31196 .nr(8)
31197 .kr(1)
31198 .sr(1)
31199 .m(4)
31200 .n(8)
31201 .k(k)
31202 .cn_stride(11)
31203 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31204 }
31205 }
31206 }
31207
31208 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8_strided_a) {
31209 TEST_REQUIRES_X86_FMA3;
31210 for (uint32_t n = 9; n < 16; n++) {
31211 for (size_t k = 1; k <= 5; k += 2) {
31212 GemmMicrokernelTester()
31213 .mr(4)
31214 .nr(8)
31215 .kr(1)
31216 .sr(1)
31217 .m(4)
31218 .n(n)
31219 .k(k)
31220 .a_stride(7)
31221 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31222 }
31223 }
31224 }
31225
31226 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_gt_8_subtile) {
31227 TEST_REQUIRES_X86_FMA3;
31228 for (uint32_t n = 9; n < 16; n++) {
31229 for (size_t k = 1; k <= 5; k += 2) {
31230 for (uint32_t m = 1; m <= 4; m++) {
31231 GemmMicrokernelTester()
31232 .mr(4)
31233 .nr(8)
31234 .kr(1)
31235 .sr(1)
31236 .m(m)
31237 .n(n)
31238 .k(k)
31239 .iterations(1)
31240 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31241 }
31242 }
31243 }
31244 }
31245
31246 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8) {
31247 TEST_REQUIRES_X86_FMA3;
31248 for (uint32_t n = 16; n <= 24; n += 8) {
31249 for (size_t k = 1; k <= 5; k += 2) {
31250 GemmMicrokernelTester()
31251 .mr(4)
31252 .nr(8)
31253 .kr(1)
31254 .sr(1)
31255 .m(4)
31256 .n(8)
31257 .k(k)
31258 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31259 }
31260 }
31261 }
31262
31263 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8_strided_cn) {
31264 TEST_REQUIRES_X86_FMA3;
31265 for (uint32_t n = 16; n <= 24; n += 8) {
31266 for (size_t k = 1; k <= 5; k += 2) {
31267 GemmMicrokernelTester()
31268 .mr(4)
31269 .nr(8)
31270 .kr(1)
31271 .sr(1)
31272 .m(4)
31273 .n(n)
31274 .k(k)
31275 .cn_stride(11)
31276 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31277 }
31278 }
31279 }
31280
31281 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8_strided_a) {
31282 TEST_REQUIRES_X86_FMA3;
31283 for (uint32_t n = 16; n <= 24; n += 8) {
31284 for (size_t k = 1; k <= 5; k += 2) {
31285 GemmMicrokernelTester()
31286 .mr(4)
31287 .nr(8)
31288 .kr(1)
31289 .sr(1)
31290 .m(4)
31291 .n(n)
31292 .k(k)
31293 .a_stride(7)
31294 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31295 }
31296 }
31297 }
31298
31299 TEST(F32_GEMM_4X8__FMA3_BROADCAST, n_div_8_subtile) {
31300 TEST_REQUIRES_X86_FMA3;
31301 for (uint32_t n = 16; n <= 24; n += 8) {
31302 for (size_t k = 1; k <= 5; k += 2) {
31303 for (uint32_t m = 1; m <= 4; m++) {
31304 GemmMicrokernelTester()
31305 .mr(4)
31306 .nr(8)
31307 .kr(1)
31308 .sr(1)
31309 .m(m)
31310 .n(n)
31311 .k(k)
31312 .iterations(1)
31313 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31314 }
31315 }
31316 }
31317 }
31318
31319 TEST(F32_GEMM_4X8__FMA3_BROADCAST, strided_cm_subtile) {
31320 TEST_REQUIRES_X86_FMA3;
31321 for (size_t k = 1; k <= 5; k += 2) {
31322 for (uint32_t m = 1; m <= 4; m++) {
31323 for (uint32_t n = 1; n <= 8; n++) {
31324 GemmMicrokernelTester()
31325 .mr(4)
31326 .nr(8)
31327 .kr(1)
31328 .sr(1)
31329 .m(m)
31330 .n(n)
31331 .k(k)
31332 .cm_stride(11)
31333 .iterations(1)
31334 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31335 }
31336 }
31337 }
31338 }
31339
31340 TEST(F32_GEMM_4X8__FMA3_BROADCAST, qmin) {
31341 TEST_REQUIRES_X86_FMA3;
31342 GemmMicrokernelTester()
31343 .mr(4)
31344 .nr(8)
31345 .kr(1)
31346 .sr(1)
31347 .m(4)
31348 .n(8)
31349 .k(1)
31350 .qmin(128)
31351 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31352 }
31353
31354 TEST(F32_GEMM_4X8__FMA3_BROADCAST, qmax) {
31355 TEST_REQUIRES_X86_FMA3;
31356 GemmMicrokernelTester()
31357 .mr(4)
31358 .nr(8)
31359 .kr(1)
31360 .sr(1)
31361 .m(4)
31362 .n(8)
31363 .k(1)
31364 .qmax(128)
31365 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31366 }
31367
31368 TEST(F32_GEMM_4X8__FMA3_BROADCAST, strided_cm) {
31369 TEST_REQUIRES_X86_FMA3;
31370 GemmMicrokernelTester()
31371 .mr(4)
31372 .nr(8)
31373 .kr(1)
31374 .sr(1)
31375 .m(4)
31376 .n(8)
31377 .k(1)
31378 .cm_stride(11)
31379 .Test(xnn_f32_gemm_ukernel_4x8__fma3_broadcast);
31380 }
31381#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31382
31383
31384#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31385 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1) {
31386 TEST_REQUIRES_X86_FMA3;
31387 GemmMicrokernelTester()
31388 .mr(5)
31389 .nr(8)
31390 .kr(1)
31391 .sr(1)
31392 .m(5)
31393 .n(8)
31394 .k(1)
31395 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31396 }
31397
31398 TEST(F32_GEMM_5X8__FMA3_BROADCAST, strided_cn) {
31399 TEST_REQUIRES_X86_FMA3;
31400 GemmMicrokernelTester()
31401 .mr(5)
31402 .nr(8)
31403 .kr(1)
31404 .sr(1)
31405 .m(5)
31406 .n(8)
31407 .k(1)
31408 .cn_stride(11)
31409 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31410 }
31411
31412 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_strided_a) {
31413 TEST_REQUIRES_X86_FMA3;
31414 GemmMicrokernelTester()
31415 .mr(5)
31416 .nr(8)
31417 .kr(1)
31418 .sr(1)
31419 .m(5)
31420 .n(8)
31421 .k(1)
31422 .a_stride(3)
31423 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31424 }
31425
31426 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile) {
31427 TEST_REQUIRES_X86_FMA3;
31428 for (uint32_t m = 1; m <= 5; m++) {
31429 for (uint32_t n = 1; n <= 8; n++) {
31430 GemmMicrokernelTester()
31431 .mr(5)
31432 .nr(8)
31433 .kr(1)
31434 .sr(1)
31435 .m(m)
31436 .n(n)
31437 .k(1)
31438 .iterations(1)
31439 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31440 }
31441 }
31442 }
31443
31444 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31445 TEST_REQUIRES_X86_FMA3;
31446 for (uint32_t m = 1; m <= 5; m++) {
31447 GemmMicrokernelTester()
31448 .mr(5)
31449 .nr(8)
31450 .kr(1)
31451 .sr(1)
31452 .m(m)
31453 .n(8)
31454 .k(1)
31455 .iterations(1)
31456 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31457 }
31458 }
31459
31460 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31461 TEST_REQUIRES_X86_FMA3;
31462 for (uint32_t n = 1; n <= 8; n++) {
31463 GemmMicrokernelTester()
31464 .mr(5)
31465 .nr(8)
31466 .kr(1)
31467 .sr(1)
31468 .m(5)
31469 .n(n)
31470 .k(1)
31471 .iterations(1)
31472 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31473 }
31474 }
31475
31476 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_gt_1) {
31477 TEST_REQUIRES_X86_FMA3;
31478 for (size_t k = 2; k < 10; k++) {
31479 GemmMicrokernelTester()
31480 .mr(5)
31481 .nr(8)
31482 .kr(1)
31483 .sr(1)
31484 .m(5)
31485 .n(8)
31486 .k(k)
31487 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31488 }
31489 }
31490
31491 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_gt_1_strided_a) {
31492 TEST_REQUIRES_X86_FMA3;
31493 for (size_t k = 2; k < 10; k++) {
31494 GemmMicrokernelTester()
31495 .mr(5)
31496 .nr(8)
31497 .kr(1)
31498 .sr(1)
31499 .m(5)
31500 .n(8)
31501 .k(k)
31502 .a_stride(11)
31503 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31504 }
31505 }
31506
31507 TEST(F32_GEMM_5X8__FMA3_BROADCAST, k_gt_1_subtile) {
31508 TEST_REQUIRES_X86_FMA3;
31509 for (size_t k = 2; k < 10; k++) {
31510 for (uint32_t m = 1; m <= 5; m++) {
31511 for (uint32_t n = 1; n <= 8; n++) {
31512 GemmMicrokernelTester()
31513 .mr(5)
31514 .nr(8)
31515 .kr(1)
31516 .sr(1)
31517 .m(m)
31518 .n(n)
31519 .k(k)
31520 .iterations(1)
31521 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31522 }
31523 }
31524 }
31525 }
31526
31527 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8) {
31528 TEST_REQUIRES_X86_FMA3;
31529 for (uint32_t n = 9; n < 16; n++) {
31530 for (size_t k = 1; k <= 5; k += 2) {
31531 GemmMicrokernelTester()
31532 .mr(5)
31533 .nr(8)
31534 .kr(1)
31535 .sr(1)
31536 .m(5)
31537 .n(8)
31538 .k(k)
31539 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31540 }
31541 }
31542 }
31543
31544 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31545 TEST_REQUIRES_X86_FMA3;
31546 for (uint32_t n = 9; n < 16; n++) {
31547 for (size_t k = 1; k <= 5; k += 2) {
31548 GemmMicrokernelTester()
31549 .mr(5)
31550 .nr(8)
31551 .kr(1)
31552 .sr(1)
31553 .m(5)
31554 .n(8)
31555 .k(k)
31556 .cn_stride(11)
31557 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31558 }
31559 }
31560 }
31561
31562 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8_strided_a) {
31563 TEST_REQUIRES_X86_FMA3;
31564 for (uint32_t n = 9; n < 16; n++) {
31565 for (size_t k = 1; k <= 5; k += 2) {
31566 GemmMicrokernelTester()
31567 .mr(5)
31568 .nr(8)
31569 .kr(1)
31570 .sr(1)
31571 .m(5)
31572 .n(n)
31573 .k(k)
31574 .a_stride(7)
31575 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31576 }
31577 }
31578 }
31579
31580 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_gt_8_subtile) {
31581 TEST_REQUIRES_X86_FMA3;
31582 for (uint32_t n = 9; n < 16; n++) {
31583 for (size_t k = 1; k <= 5; k += 2) {
31584 for (uint32_t m = 1; m <= 5; m++) {
31585 GemmMicrokernelTester()
31586 .mr(5)
31587 .nr(8)
31588 .kr(1)
31589 .sr(1)
31590 .m(m)
31591 .n(n)
31592 .k(k)
31593 .iterations(1)
31594 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31595 }
31596 }
31597 }
31598 }
31599
31600 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8) {
31601 TEST_REQUIRES_X86_FMA3;
31602 for (uint32_t n = 16; n <= 24; n += 8) {
31603 for (size_t k = 1; k <= 5; k += 2) {
31604 GemmMicrokernelTester()
31605 .mr(5)
31606 .nr(8)
31607 .kr(1)
31608 .sr(1)
31609 .m(5)
31610 .n(8)
31611 .k(k)
31612 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31613 }
31614 }
31615 }
31616
31617 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8_strided_cn) {
31618 TEST_REQUIRES_X86_FMA3;
31619 for (uint32_t n = 16; n <= 24; n += 8) {
31620 for (size_t k = 1; k <= 5; k += 2) {
31621 GemmMicrokernelTester()
31622 .mr(5)
31623 .nr(8)
31624 .kr(1)
31625 .sr(1)
31626 .m(5)
31627 .n(n)
31628 .k(k)
31629 .cn_stride(11)
31630 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31631 }
31632 }
31633 }
31634
31635 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8_strided_a) {
31636 TEST_REQUIRES_X86_FMA3;
31637 for (uint32_t n = 16; n <= 24; n += 8) {
31638 for (size_t k = 1; k <= 5; k += 2) {
31639 GemmMicrokernelTester()
31640 .mr(5)
31641 .nr(8)
31642 .kr(1)
31643 .sr(1)
31644 .m(5)
31645 .n(n)
31646 .k(k)
31647 .a_stride(7)
31648 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31649 }
31650 }
31651 }
31652
31653 TEST(F32_GEMM_5X8__FMA3_BROADCAST, n_div_8_subtile) {
31654 TEST_REQUIRES_X86_FMA3;
31655 for (uint32_t n = 16; n <= 24; n += 8) {
31656 for (size_t k = 1; k <= 5; k += 2) {
31657 for (uint32_t m = 1; m <= 5; m++) {
31658 GemmMicrokernelTester()
31659 .mr(5)
31660 .nr(8)
31661 .kr(1)
31662 .sr(1)
31663 .m(m)
31664 .n(n)
31665 .k(k)
31666 .iterations(1)
31667 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31668 }
31669 }
31670 }
31671 }
31672
31673 TEST(F32_GEMM_5X8__FMA3_BROADCAST, strided_cm_subtile) {
31674 TEST_REQUIRES_X86_FMA3;
31675 for (size_t k = 1; k <= 5; k += 2) {
31676 for (uint32_t m = 1; m <= 5; m++) {
31677 for (uint32_t n = 1; n <= 8; n++) {
31678 GemmMicrokernelTester()
31679 .mr(5)
31680 .nr(8)
31681 .kr(1)
31682 .sr(1)
31683 .m(m)
31684 .n(n)
31685 .k(k)
31686 .cm_stride(11)
31687 .iterations(1)
31688 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31689 }
31690 }
31691 }
31692 }
31693
31694 TEST(F32_GEMM_5X8__FMA3_BROADCAST, qmin) {
31695 TEST_REQUIRES_X86_FMA3;
31696 GemmMicrokernelTester()
31697 .mr(5)
31698 .nr(8)
31699 .kr(1)
31700 .sr(1)
31701 .m(5)
31702 .n(8)
31703 .k(1)
31704 .qmin(128)
31705 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31706 }
31707
31708 TEST(F32_GEMM_5X8__FMA3_BROADCAST, qmax) {
31709 TEST_REQUIRES_X86_FMA3;
31710 GemmMicrokernelTester()
31711 .mr(5)
31712 .nr(8)
31713 .kr(1)
31714 .sr(1)
31715 .m(5)
31716 .n(8)
31717 .k(1)
31718 .qmax(128)
31719 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31720 }
31721
31722 TEST(F32_GEMM_5X8__FMA3_BROADCAST, strided_cm) {
31723 TEST_REQUIRES_X86_FMA3;
31724 GemmMicrokernelTester()
31725 .mr(5)
31726 .nr(8)
31727 .kr(1)
31728 .sr(1)
31729 .m(5)
31730 .n(8)
31731 .k(1)
31732 .cm_stride(11)
31733 .Test(xnn_f32_gemm_ukernel_5x8__fma3_broadcast);
31734 }
31735#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
31736
31737
31738#if XNN_ARCH_X86 || XNN_ARCH_X86_64
31739 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1) {
31740 TEST_REQUIRES_X86_FMA3;
31741 GemmMicrokernelTester()
31742 .mr(6)
31743 .nr(8)
31744 .kr(1)
31745 .sr(1)
31746 .m(6)
31747 .n(8)
31748 .k(1)
31749 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31750 }
31751
31752 TEST(F32_GEMM_6X8__FMA3_BROADCAST, strided_cn) {
31753 TEST_REQUIRES_X86_FMA3;
31754 GemmMicrokernelTester()
31755 .mr(6)
31756 .nr(8)
31757 .kr(1)
31758 .sr(1)
31759 .m(6)
31760 .n(8)
31761 .k(1)
31762 .cn_stride(11)
31763 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31764 }
31765
31766 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_strided_a) {
31767 TEST_REQUIRES_X86_FMA3;
31768 GemmMicrokernelTester()
31769 .mr(6)
31770 .nr(8)
31771 .kr(1)
31772 .sr(1)
31773 .m(6)
31774 .n(8)
31775 .k(1)
31776 .a_stride(3)
31777 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31778 }
31779
31780 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile) {
31781 TEST_REQUIRES_X86_FMA3;
31782 for (uint32_t m = 1; m <= 6; m++) {
31783 for (uint32_t n = 1; n <= 8; n++) {
31784 GemmMicrokernelTester()
31785 .mr(6)
31786 .nr(8)
31787 .kr(1)
31788 .sr(1)
31789 .m(m)
31790 .n(n)
31791 .k(1)
31792 .iterations(1)
31793 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31794 }
31795 }
31796 }
31797
31798 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
31799 TEST_REQUIRES_X86_FMA3;
31800 for (uint32_t m = 1; m <= 6; m++) {
31801 GemmMicrokernelTester()
31802 .mr(6)
31803 .nr(8)
31804 .kr(1)
31805 .sr(1)
31806 .m(m)
31807 .n(8)
31808 .k(1)
31809 .iterations(1)
31810 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31811 }
31812 }
31813
31814 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
31815 TEST_REQUIRES_X86_FMA3;
31816 for (uint32_t n = 1; n <= 8; n++) {
31817 GemmMicrokernelTester()
31818 .mr(6)
31819 .nr(8)
31820 .kr(1)
31821 .sr(1)
31822 .m(6)
31823 .n(n)
31824 .k(1)
31825 .iterations(1)
31826 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31827 }
31828 }
31829
31830 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_gt_1) {
31831 TEST_REQUIRES_X86_FMA3;
31832 for (size_t k = 2; k < 10; k++) {
31833 GemmMicrokernelTester()
31834 .mr(6)
31835 .nr(8)
31836 .kr(1)
31837 .sr(1)
31838 .m(6)
31839 .n(8)
31840 .k(k)
31841 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31842 }
31843 }
31844
31845 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_gt_1_strided_a) {
31846 TEST_REQUIRES_X86_FMA3;
31847 for (size_t k = 2; k < 10; k++) {
31848 GemmMicrokernelTester()
31849 .mr(6)
31850 .nr(8)
31851 .kr(1)
31852 .sr(1)
31853 .m(6)
31854 .n(8)
31855 .k(k)
31856 .a_stride(11)
31857 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31858 }
31859 }
31860
31861 TEST(F32_GEMM_6X8__FMA3_BROADCAST, k_gt_1_subtile) {
31862 TEST_REQUIRES_X86_FMA3;
31863 for (size_t k = 2; k < 10; k++) {
31864 for (uint32_t m = 1; m <= 6; m++) {
31865 for (uint32_t n = 1; n <= 8; n++) {
31866 GemmMicrokernelTester()
31867 .mr(6)
31868 .nr(8)
31869 .kr(1)
31870 .sr(1)
31871 .m(m)
31872 .n(n)
31873 .k(k)
31874 .iterations(1)
31875 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31876 }
31877 }
31878 }
31879 }
31880
31881 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8) {
31882 TEST_REQUIRES_X86_FMA3;
31883 for (uint32_t n = 9; n < 16; n++) {
31884 for (size_t k = 1; k <= 5; k += 2) {
31885 GemmMicrokernelTester()
31886 .mr(6)
31887 .nr(8)
31888 .kr(1)
31889 .sr(1)
31890 .m(6)
31891 .n(8)
31892 .k(k)
31893 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31894 }
31895 }
31896 }
31897
31898 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
31899 TEST_REQUIRES_X86_FMA3;
31900 for (uint32_t n = 9; n < 16; n++) {
31901 for (size_t k = 1; k <= 5; k += 2) {
31902 GemmMicrokernelTester()
31903 .mr(6)
31904 .nr(8)
31905 .kr(1)
31906 .sr(1)
31907 .m(6)
31908 .n(8)
31909 .k(k)
31910 .cn_stride(11)
31911 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31912 }
31913 }
31914 }
31915
31916 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8_strided_a) {
31917 TEST_REQUIRES_X86_FMA3;
31918 for (uint32_t n = 9; n < 16; n++) {
31919 for (size_t k = 1; k <= 5; k += 2) {
31920 GemmMicrokernelTester()
31921 .mr(6)
31922 .nr(8)
31923 .kr(1)
31924 .sr(1)
31925 .m(6)
31926 .n(n)
31927 .k(k)
31928 .a_stride(7)
31929 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31930 }
31931 }
31932 }
31933
31934 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_gt_8_subtile) {
31935 TEST_REQUIRES_X86_FMA3;
31936 for (uint32_t n = 9; n < 16; n++) {
31937 for (size_t k = 1; k <= 5; k += 2) {
31938 for (uint32_t m = 1; m <= 6; m++) {
31939 GemmMicrokernelTester()
31940 .mr(6)
31941 .nr(8)
31942 .kr(1)
31943 .sr(1)
31944 .m(m)
31945 .n(n)
31946 .k(k)
31947 .iterations(1)
31948 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31949 }
31950 }
31951 }
31952 }
31953
31954 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8) {
31955 TEST_REQUIRES_X86_FMA3;
31956 for (uint32_t n = 16; n <= 24; n += 8) {
31957 for (size_t k = 1; k <= 5; k += 2) {
31958 GemmMicrokernelTester()
31959 .mr(6)
31960 .nr(8)
31961 .kr(1)
31962 .sr(1)
31963 .m(6)
31964 .n(8)
31965 .k(k)
31966 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31967 }
31968 }
31969 }
31970
31971 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8_strided_cn) {
31972 TEST_REQUIRES_X86_FMA3;
31973 for (uint32_t n = 16; n <= 24; n += 8) {
31974 for (size_t k = 1; k <= 5; k += 2) {
31975 GemmMicrokernelTester()
31976 .mr(6)
31977 .nr(8)
31978 .kr(1)
31979 .sr(1)
31980 .m(6)
31981 .n(n)
31982 .k(k)
31983 .cn_stride(11)
31984 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
31985 }
31986 }
31987 }
31988
31989 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8_strided_a) {
31990 TEST_REQUIRES_X86_FMA3;
31991 for (uint32_t n = 16; n <= 24; n += 8) {
31992 for (size_t k = 1; k <= 5; k += 2) {
31993 GemmMicrokernelTester()
31994 .mr(6)
31995 .nr(8)
31996 .kr(1)
31997 .sr(1)
31998 .m(6)
31999 .n(n)
32000 .k(k)
32001 .a_stride(7)
32002 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
32003 }
32004 }
32005 }
32006
32007 TEST(F32_GEMM_6X8__FMA3_BROADCAST, n_div_8_subtile) {
32008 TEST_REQUIRES_X86_FMA3;
32009 for (uint32_t n = 16; n <= 24; n += 8) {
32010 for (size_t k = 1; k <= 5; k += 2) {
32011 for (uint32_t m = 1; m <= 6; m++) {
32012 GemmMicrokernelTester()
32013 .mr(6)
32014 .nr(8)
32015 .kr(1)
32016 .sr(1)
32017 .m(m)
32018 .n(n)
32019 .k(k)
32020 .iterations(1)
32021 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
32022 }
32023 }
32024 }
32025 }
32026
32027 TEST(F32_GEMM_6X8__FMA3_BROADCAST, strided_cm_subtile) {
32028 TEST_REQUIRES_X86_FMA3;
32029 for (size_t k = 1; k <= 5; k += 2) {
32030 for (uint32_t m = 1; m <= 6; m++) {
32031 for (uint32_t n = 1; n <= 8; n++) {
32032 GemmMicrokernelTester()
32033 .mr(6)
32034 .nr(8)
32035 .kr(1)
32036 .sr(1)
32037 .m(m)
32038 .n(n)
32039 .k(k)
32040 .cm_stride(11)
32041 .iterations(1)
32042 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
32043 }
32044 }
32045 }
32046 }
32047
32048 TEST(F32_GEMM_6X8__FMA3_BROADCAST, qmin) {
32049 TEST_REQUIRES_X86_FMA3;
32050 GemmMicrokernelTester()
32051 .mr(6)
32052 .nr(8)
32053 .kr(1)
32054 .sr(1)
32055 .m(6)
32056 .n(8)
32057 .k(1)
32058 .qmin(128)
32059 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
32060 }
32061
32062 TEST(F32_GEMM_6X8__FMA3_BROADCAST, qmax) {
32063 TEST_REQUIRES_X86_FMA3;
32064 GemmMicrokernelTester()
32065 .mr(6)
32066 .nr(8)
32067 .kr(1)
32068 .sr(1)
32069 .m(6)
32070 .n(8)
32071 .k(1)
32072 .qmax(128)
32073 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
32074 }
32075
32076 TEST(F32_GEMM_6X8__FMA3_BROADCAST, strided_cm) {
32077 TEST_REQUIRES_X86_FMA3;
32078 GemmMicrokernelTester()
32079 .mr(6)
32080 .nr(8)
32081 .kr(1)
32082 .sr(1)
32083 .m(6)
32084 .n(8)
32085 .k(1)
32086 .cm_stride(11)
32087 .Test(xnn_f32_gemm_ukernel_6x8__fma3_broadcast);
32088 }
32089#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32090
32091
32092#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32093 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1) {
32094 TEST_REQUIRES_X86_FMA3;
32095 GemmMicrokernelTester()
32096 .mr(7)
32097 .nr(8)
32098 .kr(1)
32099 .sr(1)
32100 .m(7)
32101 .n(8)
32102 .k(1)
32103 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32104 }
32105
32106 TEST(F32_GEMM_7X8__FMA3_BROADCAST, strided_cn) {
32107 TEST_REQUIRES_X86_FMA3;
32108 GemmMicrokernelTester()
32109 .mr(7)
32110 .nr(8)
32111 .kr(1)
32112 .sr(1)
32113 .m(7)
32114 .n(8)
32115 .k(1)
32116 .cn_stride(11)
32117 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32118 }
32119
32120 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_strided_a) {
32121 TEST_REQUIRES_X86_FMA3;
32122 GemmMicrokernelTester()
32123 .mr(7)
32124 .nr(8)
32125 .kr(1)
32126 .sr(1)
32127 .m(7)
32128 .n(8)
32129 .k(1)
32130 .a_stride(3)
32131 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32132 }
32133
32134 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile) {
32135 TEST_REQUIRES_X86_FMA3;
32136 for (uint32_t m = 1; m <= 7; m++) {
32137 for (uint32_t n = 1; n <= 8; n++) {
32138 GemmMicrokernelTester()
32139 .mr(7)
32140 .nr(8)
32141 .kr(1)
32142 .sr(1)
32143 .m(m)
32144 .n(n)
32145 .k(1)
32146 .iterations(1)
32147 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32148 }
32149 }
32150 }
32151
32152 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
32153 TEST_REQUIRES_X86_FMA3;
32154 for (uint32_t m = 1; m <= 7; m++) {
32155 GemmMicrokernelTester()
32156 .mr(7)
32157 .nr(8)
32158 .kr(1)
32159 .sr(1)
32160 .m(m)
32161 .n(8)
32162 .k(1)
32163 .iterations(1)
32164 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32165 }
32166 }
32167
32168 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
32169 TEST_REQUIRES_X86_FMA3;
32170 for (uint32_t n = 1; n <= 8; n++) {
32171 GemmMicrokernelTester()
32172 .mr(7)
32173 .nr(8)
32174 .kr(1)
32175 .sr(1)
32176 .m(7)
32177 .n(n)
32178 .k(1)
32179 .iterations(1)
32180 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32181 }
32182 }
32183
32184 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_gt_1) {
32185 TEST_REQUIRES_X86_FMA3;
32186 for (size_t k = 2; k < 10; k++) {
32187 GemmMicrokernelTester()
32188 .mr(7)
32189 .nr(8)
32190 .kr(1)
32191 .sr(1)
32192 .m(7)
32193 .n(8)
32194 .k(k)
32195 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32196 }
32197 }
32198
32199 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_gt_1_strided_a) {
32200 TEST_REQUIRES_X86_FMA3;
32201 for (size_t k = 2; k < 10; k++) {
32202 GemmMicrokernelTester()
32203 .mr(7)
32204 .nr(8)
32205 .kr(1)
32206 .sr(1)
32207 .m(7)
32208 .n(8)
32209 .k(k)
32210 .a_stride(11)
32211 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32212 }
32213 }
32214
32215 TEST(F32_GEMM_7X8__FMA3_BROADCAST, k_gt_1_subtile) {
32216 TEST_REQUIRES_X86_FMA3;
32217 for (size_t k = 2; k < 10; k++) {
32218 for (uint32_t m = 1; m <= 7; m++) {
32219 for (uint32_t n = 1; n <= 8; n++) {
32220 GemmMicrokernelTester()
32221 .mr(7)
32222 .nr(8)
32223 .kr(1)
32224 .sr(1)
32225 .m(m)
32226 .n(n)
32227 .k(k)
32228 .iterations(1)
32229 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32230 }
32231 }
32232 }
32233 }
32234
32235 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8) {
32236 TEST_REQUIRES_X86_FMA3;
32237 for (uint32_t n = 9; n < 16; n++) {
32238 for (size_t k = 1; k <= 5; k += 2) {
32239 GemmMicrokernelTester()
32240 .mr(7)
32241 .nr(8)
32242 .kr(1)
32243 .sr(1)
32244 .m(7)
32245 .n(8)
32246 .k(k)
32247 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32248 }
32249 }
32250 }
32251
32252 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
32253 TEST_REQUIRES_X86_FMA3;
32254 for (uint32_t n = 9; n < 16; n++) {
32255 for (size_t k = 1; k <= 5; k += 2) {
32256 GemmMicrokernelTester()
32257 .mr(7)
32258 .nr(8)
32259 .kr(1)
32260 .sr(1)
32261 .m(7)
32262 .n(8)
32263 .k(k)
32264 .cn_stride(11)
32265 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32266 }
32267 }
32268 }
32269
32270 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8_strided_a) {
32271 TEST_REQUIRES_X86_FMA3;
32272 for (uint32_t n = 9; n < 16; n++) {
32273 for (size_t k = 1; k <= 5; k += 2) {
32274 GemmMicrokernelTester()
32275 .mr(7)
32276 .nr(8)
32277 .kr(1)
32278 .sr(1)
32279 .m(7)
32280 .n(n)
32281 .k(k)
32282 .a_stride(7)
32283 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32284 }
32285 }
32286 }
32287
32288 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_gt_8_subtile) {
32289 TEST_REQUIRES_X86_FMA3;
32290 for (uint32_t n = 9; n < 16; n++) {
32291 for (size_t k = 1; k <= 5; k += 2) {
32292 for (uint32_t m = 1; m <= 7; m++) {
32293 GemmMicrokernelTester()
32294 .mr(7)
32295 .nr(8)
32296 .kr(1)
32297 .sr(1)
32298 .m(m)
32299 .n(n)
32300 .k(k)
32301 .iterations(1)
32302 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32303 }
32304 }
32305 }
32306 }
32307
32308 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8) {
32309 TEST_REQUIRES_X86_FMA3;
32310 for (uint32_t n = 16; n <= 24; n += 8) {
32311 for (size_t k = 1; k <= 5; k += 2) {
32312 GemmMicrokernelTester()
32313 .mr(7)
32314 .nr(8)
32315 .kr(1)
32316 .sr(1)
32317 .m(7)
32318 .n(8)
32319 .k(k)
32320 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32321 }
32322 }
32323 }
32324
32325 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8_strided_cn) {
32326 TEST_REQUIRES_X86_FMA3;
32327 for (uint32_t n = 16; n <= 24; n += 8) {
32328 for (size_t k = 1; k <= 5; k += 2) {
32329 GemmMicrokernelTester()
32330 .mr(7)
32331 .nr(8)
32332 .kr(1)
32333 .sr(1)
32334 .m(7)
32335 .n(n)
32336 .k(k)
32337 .cn_stride(11)
32338 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32339 }
32340 }
32341 }
32342
32343 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8_strided_a) {
32344 TEST_REQUIRES_X86_FMA3;
32345 for (uint32_t n = 16; n <= 24; n += 8) {
32346 for (size_t k = 1; k <= 5; k += 2) {
32347 GemmMicrokernelTester()
32348 .mr(7)
32349 .nr(8)
32350 .kr(1)
32351 .sr(1)
32352 .m(7)
32353 .n(n)
32354 .k(k)
32355 .a_stride(7)
32356 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32357 }
32358 }
32359 }
32360
32361 TEST(F32_GEMM_7X8__FMA3_BROADCAST, n_div_8_subtile) {
32362 TEST_REQUIRES_X86_FMA3;
32363 for (uint32_t n = 16; n <= 24; n += 8) {
32364 for (size_t k = 1; k <= 5; k += 2) {
32365 for (uint32_t m = 1; m <= 7; m++) {
32366 GemmMicrokernelTester()
32367 .mr(7)
32368 .nr(8)
32369 .kr(1)
32370 .sr(1)
32371 .m(m)
32372 .n(n)
32373 .k(k)
32374 .iterations(1)
32375 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32376 }
32377 }
32378 }
32379 }
32380
32381 TEST(F32_GEMM_7X8__FMA3_BROADCAST, strided_cm_subtile) {
32382 TEST_REQUIRES_X86_FMA3;
32383 for (size_t k = 1; k <= 5; k += 2) {
32384 for (uint32_t m = 1; m <= 7; m++) {
32385 for (uint32_t n = 1; n <= 8; n++) {
32386 GemmMicrokernelTester()
32387 .mr(7)
32388 .nr(8)
32389 .kr(1)
32390 .sr(1)
32391 .m(m)
32392 .n(n)
32393 .k(k)
32394 .cm_stride(11)
32395 .iterations(1)
32396 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32397 }
32398 }
32399 }
32400 }
32401
32402 TEST(F32_GEMM_7X8__FMA3_BROADCAST, qmin) {
32403 TEST_REQUIRES_X86_FMA3;
32404 GemmMicrokernelTester()
32405 .mr(7)
32406 .nr(8)
32407 .kr(1)
32408 .sr(1)
32409 .m(7)
32410 .n(8)
32411 .k(1)
32412 .qmin(128)
32413 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32414 }
32415
32416 TEST(F32_GEMM_7X8__FMA3_BROADCAST, qmax) {
32417 TEST_REQUIRES_X86_FMA3;
32418 GemmMicrokernelTester()
32419 .mr(7)
32420 .nr(8)
32421 .kr(1)
32422 .sr(1)
32423 .m(7)
32424 .n(8)
32425 .k(1)
32426 .qmax(128)
32427 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32428 }
32429
32430 TEST(F32_GEMM_7X8__FMA3_BROADCAST, strided_cm) {
32431 TEST_REQUIRES_X86_FMA3;
32432 GemmMicrokernelTester()
32433 .mr(7)
32434 .nr(8)
32435 .kr(1)
32436 .sr(1)
32437 .m(7)
32438 .n(8)
32439 .k(1)
32440 .cm_stride(11)
32441 .Test(xnn_f32_gemm_ukernel_7x8__fma3_broadcast);
32442 }
32443#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32444
32445
32446#if XNN_ARCH_X86 || XNN_ARCH_X86_64
32447 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1) {
32448 TEST_REQUIRES_X86_FMA3;
32449 GemmMicrokernelTester()
32450 .mr(8)
32451 .nr(8)
32452 .kr(1)
32453 .sr(1)
32454 .m(8)
32455 .n(8)
32456 .k(1)
32457 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32458 }
32459
32460 TEST(F32_GEMM_8X8__FMA3_BROADCAST, strided_cn) {
32461 TEST_REQUIRES_X86_FMA3;
32462 GemmMicrokernelTester()
32463 .mr(8)
32464 .nr(8)
32465 .kr(1)
32466 .sr(1)
32467 .m(8)
32468 .n(8)
32469 .k(1)
32470 .cn_stride(11)
32471 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32472 }
32473
32474 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_strided_a) {
32475 TEST_REQUIRES_X86_FMA3;
32476 GemmMicrokernelTester()
32477 .mr(8)
32478 .nr(8)
32479 .kr(1)
32480 .sr(1)
32481 .m(8)
32482 .n(8)
32483 .k(1)
32484 .a_stride(3)
32485 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32486 }
32487
32488 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile) {
32489 TEST_REQUIRES_X86_FMA3;
32490 for (uint32_t m = 1; m <= 8; m++) {
32491 for (uint32_t n = 1; n <= 8; n++) {
32492 GemmMicrokernelTester()
32493 .mr(8)
32494 .nr(8)
32495 .kr(1)
32496 .sr(1)
32497 .m(m)
32498 .n(n)
32499 .k(1)
32500 .iterations(1)
32501 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32502 }
32503 }
32504 }
32505
32506 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_m) {
32507 TEST_REQUIRES_X86_FMA3;
32508 for (uint32_t m = 1; m <= 8; m++) {
32509 GemmMicrokernelTester()
32510 .mr(8)
32511 .nr(8)
32512 .kr(1)
32513 .sr(1)
32514 .m(m)
32515 .n(8)
32516 .k(1)
32517 .iterations(1)
32518 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32519 }
32520 }
32521
32522 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_eq_1_subtile_n) {
32523 TEST_REQUIRES_X86_FMA3;
32524 for (uint32_t n = 1; n <= 8; n++) {
32525 GemmMicrokernelTester()
32526 .mr(8)
32527 .nr(8)
32528 .kr(1)
32529 .sr(1)
32530 .m(8)
32531 .n(n)
32532 .k(1)
32533 .iterations(1)
32534 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32535 }
32536 }
32537
32538 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_gt_1) {
32539 TEST_REQUIRES_X86_FMA3;
32540 for (size_t k = 2; k < 10; k++) {
32541 GemmMicrokernelTester()
32542 .mr(8)
32543 .nr(8)
32544 .kr(1)
32545 .sr(1)
32546 .m(8)
32547 .n(8)
32548 .k(k)
32549 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32550 }
32551 }
32552
32553 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_gt_1_strided_a) {
32554 TEST_REQUIRES_X86_FMA3;
32555 for (size_t k = 2; k < 10; k++) {
32556 GemmMicrokernelTester()
32557 .mr(8)
32558 .nr(8)
32559 .kr(1)
32560 .sr(1)
32561 .m(8)
32562 .n(8)
32563 .k(k)
32564 .a_stride(11)
32565 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32566 }
32567 }
32568
32569 TEST(F32_GEMM_8X8__FMA3_BROADCAST, k_gt_1_subtile) {
32570 TEST_REQUIRES_X86_FMA3;
32571 for (size_t k = 2; k < 10; k++) {
32572 for (uint32_t m = 1; m <= 8; m++) {
32573 for (uint32_t n = 1; n <= 8; n++) {
32574 GemmMicrokernelTester()
32575 .mr(8)
32576 .nr(8)
32577 .kr(1)
32578 .sr(1)
32579 .m(m)
32580 .n(n)
32581 .k(k)
32582 .iterations(1)
32583 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32584 }
32585 }
32586 }
32587 }
32588
32589 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8) {
32590 TEST_REQUIRES_X86_FMA3;
32591 for (uint32_t n = 9; n < 16; n++) {
32592 for (size_t k = 1; k <= 5; k += 2) {
32593 GemmMicrokernelTester()
32594 .mr(8)
32595 .nr(8)
32596 .kr(1)
32597 .sr(1)
32598 .m(8)
32599 .n(8)
32600 .k(k)
32601 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32602 }
32603 }
32604 }
32605
32606 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8_strided_cn) {
32607 TEST_REQUIRES_X86_FMA3;
32608 for (uint32_t n = 9; n < 16; n++) {
32609 for (size_t k = 1; k <= 5; k += 2) {
32610 GemmMicrokernelTester()
32611 .mr(8)
32612 .nr(8)
32613 .kr(1)
32614 .sr(1)
32615 .m(8)
32616 .n(8)
32617 .k(k)
32618 .cn_stride(11)
32619 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32620 }
32621 }
32622 }
32623
32624 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8_strided_a) {
32625 TEST_REQUIRES_X86_FMA3;
32626 for (uint32_t n = 9; n < 16; n++) {
32627 for (size_t k = 1; k <= 5; k += 2) {
32628 GemmMicrokernelTester()
32629 .mr(8)
32630 .nr(8)
32631 .kr(1)
32632 .sr(1)
32633 .m(8)
32634 .n(n)
32635 .k(k)
32636 .a_stride(7)
32637 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32638 }
32639 }
32640 }
32641
32642 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_gt_8_subtile) {
32643 TEST_REQUIRES_X86_FMA3;
32644 for (uint32_t n = 9; n < 16; n++) {
32645 for (size_t k = 1; k <= 5; k += 2) {
32646 for (uint32_t m = 1; m <= 8; m++) {
32647 GemmMicrokernelTester()
32648 .mr(8)
32649 .nr(8)
32650 .kr(1)
32651 .sr(1)
32652 .m(m)
32653 .n(n)
32654 .k(k)
32655 .iterations(1)
32656 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32657 }
32658 }
32659 }
32660 }
32661
32662 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8) {
32663 TEST_REQUIRES_X86_FMA3;
32664 for (uint32_t n = 16; n <= 24; n += 8) {
32665 for (size_t k = 1; k <= 5; k += 2) {
32666 GemmMicrokernelTester()
32667 .mr(8)
32668 .nr(8)
32669 .kr(1)
32670 .sr(1)
32671 .m(8)
32672 .n(8)
32673 .k(k)
32674 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32675 }
32676 }
32677 }
32678
32679 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8_strided_cn) {
32680 TEST_REQUIRES_X86_FMA3;
32681 for (uint32_t n = 16; n <= 24; n += 8) {
32682 for (size_t k = 1; k <= 5; k += 2) {
32683 GemmMicrokernelTester()
32684 .mr(8)
32685 .nr(8)
32686 .kr(1)
32687 .sr(1)
32688 .m(8)
32689 .n(n)
32690 .k(k)
32691 .cn_stride(11)
32692 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32693 }
32694 }
32695 }
32696
32697 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8_strided_a) {
32698 TEST_REQUIRES_X86_FMA3;
32699 for (uint32_t n = 16; n <= 24; n += 8) {
32700 for (size_t k = 1; k <= 5; k += 2) {
32701 GemmMicrokernelTester()
32702 .mr(8)
32703 .nr(8)
32704 .kr(1)
32705 .sr(1)
32706 .m(8)
32707 .n(n)
32708 .k(k)
32709 .a_stride(7)
32710 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32711 }
32712 }
32713 }
32714
32715 TEST(F32_GEMM_8X8__FMA3_BROADCAST, n_div_8_subtile) {
32716 TEST_REQUIRES_X86_FMA3;
32717 for (uint32_t n = 16; n <= 24; n += 8) {
32718 for (size_t k = 1; k <= 5; k += 2) {
32719 for (uint32_t m = 1; m <= 8; m++) {
32720 GemmMicrokernelTester()
32721 .mr(8)
32722 .nr(8)
32723 .kr(1)
32724 .sr(1)
32725 .m(m)
32726 .n(n)
32727 .k(k)
32728 .iterations(1)
32729 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32730 }
32731 }
32732 }
32733 }
32734
32735 TEST(F32_GEMM_8X8__FMA3_BROADCAST, strided_cm_subtile) {
32736 TEST_REQUIRES_X86_FMA3;
32737 for (size_t k = 1; k <= 5; k += 2) {
32738 for (uint32_t m = 1; m <= 8; m++) {
32739 for (uint32_t n = 1; n <= 8; n++) {
32740 GemmMicrokernelTester()
32741 .mr(8)
32742 .nr(8)
32743 .kr(1)
32744 .sr(1)
32745 .m(m)
32746 .n(n)
32747 .k(k)
32748 .cm_stride(11)
32749 .iterations(1)
32750 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32751 }
32752 }
32753 }
32754 }
32755
32756 TEST(F32_GEMM_8X8__FMA3_BROADCAST, qmin) {
32757 TEST_REQUIRES_X86_FMA3;
32758 GemmMicrokernelTester()
32759 .mr(8)
32760 .nr(8)
32761 .kr(1)
32762 .sr(1)
32763 .m(8)
32764 .n(8)
32765 .k(1)
32766 .qmin(128)
32767 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32768 }
32769
32770 TEST(F32_GEMM_8X8__FMA3_BROADCAST, qmax) {
32771 TEST_REQUIRES_X86_FMA3;
32772 GemmMicrokernelTester()
32773 .mr(8)
32774 .nr(8)
32775 .kr(1)
32776 .sr(1)
32777 .m(8)
32778 .n(8)
32779 .k(1)
32780 .qmax(128)
32781 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32782 }
32783
32784 TEST(F32_GEMM_8X8__FMA3_BROADCAST, strided_cm) {
32785 TEST_REQUIRES_X86_FMA3;
32786 GemmMicrokernelTester()
32787 .mr(8)
32788 .nr(8)
32789 .kr(1)
32790 .sr(1)
32791 .m(8)
32792 .n(8)
32793 .k(1)
32794 .cm_stride(11)
32795 .Test(xnn_f32_gemm_ukernel_8x8__fma3_broadcast);
32796 }
32797#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
32798
32799
Marat Dukhan0f349c42019-11-27 11:58:54 -080032800#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhaneccfd712019-12-08 16:49:27 -080032801 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1) {
32802 TEST_REQUIRES_X86_FMA3;
32803 GemmMicrokernelTester()
32804 .mr(1)
32805 .nr(16)
32806 .kr(1)
32807 .sr(1)
32808 .m(1)
32809 .n(16)
32810 .k(1)
32811 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32812 }
32813
32814 TEST(F32_GEMM_1X16__FMA3_BROADCAST, strided_cn) {
32815 TEST_REQUIRES_X86_FMA3;
32816 GemmMicrokernelTester()
32817 .mr(1)
32818 .nr(16)
32819 .kr(1)
32820 .sr(1)
32821 .m(1)
32822 .n(16)
32823 .k(1)
32824 .cn_stride(19)
32825 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32826 }
32827
32828 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_strided_a) {
32829 TEST_REQUIRES_X86_FMA3;
32830 GemmMicrokernelTester()
32831 .mr(1)
32832 .nr(16)
32833 .kr(1)
32834 .sr(1)
32835 .m(1)
32836 .n(16)
32837 .k(1)
32838 .a_stride(3)
32839 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32840 }
32841
32842 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile) {
32843 TEST_REQUIRES_X86_FMA3;
32844 for (uint32_t m = 1; m <= 1; m++) {
32845 for (uint32_t n = 1; n <= 16; n++) {
32846 GemmMicrokernelTester()
32847 .mr(1)
32848 .nr(16)
32849 .kr(1)
32850 .sr(1)
32851 .m(m)
32852 .n(n)
32853 .k(1)
32854 .iterations(1)
32855 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32856 }
32857 }
32858 }
32859
32860 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
32861 TEST_REQUIRES_X86_FMA3;
32862 for (uint32_t m = 1; m <= 1; m++) {
32863 GemmMicrokernelTester()
32864 .mr(1)
32865 .nr(16)
32866 .kr(1)
32867 .sr(1)
32868 .m(m)
32869 .n(16)
32870 .k(1)
32871 .iterations(1)
32872 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32873 }
32874 }
32875
32876 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
32877 TEST_REQUIRES_X86_FMA3;
32878 for (uint32_t n = 1; n <= 16; n++) {
32879 GemmMicrokernelTester()
32880 .mr(1)
32881 .nr(16)
32882 .kr(1)
32883 .sr(1)
32884 .m(1)
32885 .n(n)
32886 .k(1)
32887 .iterations(1)
32888 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32889 }
32890 }
32891
32892 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_gt_1) {
32893 TEST_REQUIRES_X86_FMA3;
32894 for (size_t k = 2; k < 10; k++) {
32895 GemmMicrokernelTester()
32896 .mr(1)
32897 .nr(16)
32898 .kr(1)
32899 .sr(1)
32900 .m(1)
32901 .n(16)
32902 .k(k)
32903 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32904 }
32905 }
32906
32907 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_gt_1_strided_a) {
32908 TEST_REQUIRES_X86_FMA3;
32909 for (size_t k = 2; k < 10; k++) {
32910 GemmMicrokernelTester()
32911 .mr(1)
32912 .nr(16)
32913 .kr(1)
32914 .sr(1)
32915 .m(1)
32916 .n(16)
32917 .k(k)
32918 .a_stride(11)
32919 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32920 }
32921 }
32922
32923 TEST(F32_GEMM_1X16__FMA3_BROADCAST, k_gt_1_subtile) {
32924 TEST_REQUIRES_X86_FMA3;
32925 for (size_t k = 2; k < 10; k++) {
32926 for (uint32_t m = 1; m <= 1; m++) {
32927 for (uint32_t n = 1; n <= 16; n++) {
32928 GemmMicrokernelTester()
32929 .mr(1)
32930 .nr(16)
32931 .kr(1)
32932 .sr(1)
32933 .m(m)
32934 .n(n)
32935 .k(k)
32936 .iterations(1)
32937 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32938 }
32939 }
32940 }
32941 }
32942
32943 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16) {
32944 TEST_REQUIRES_X86_FMA3;
32945 for (uint32_t n = 17; n < 32; n++) {
32946 for (size_t k = 1; k <= 5; k += 2) {
32947 GemmMicrokernelTester()
32948 .mr(1)
32949 .nr(16)
32950 .kr(1)
32951 .sr(1)
32952 .m(1)
32953 .n(16)
32954 .k(k)
32955 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32956 }
32957 }
32958 }
32959
32960 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
32961 TEST_REQUIRES_X86_FMA3;
32962 for (uint32_t n = 17; n < 32; n++) {
32963 for (size_t k = 1; k <= 5; k += 2) {
32964 GemmMicrokernelTester()
32965 .mr(1)
32966 .nr(16)
32967 .kr(1)
32968 .sr(1)
32969 .m(1)
32970 .n(16)
32971 .k(k)
32972 .cn_stride(19)
32973 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32974 }
32975 }
32976 }
32977
32978 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16_strided_a) {
32979 TEST_REQUIRES_X86_FMA3;
32980 for (uint32_t n = 17; n < 32; n++) {
32981 for (size_t k = 1; k <= 5; k += 2) {
32982 GemmMicrokernelTester()
32983 .mr(1)
32984 .nr(16)
32985 .kr(1)
32986 .sr(1)
32987 .m(1)
32988 .n(n)
32989 .k(k)
32990 .a_stride(7)
32991 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
32992 }
32993 }
32994 }
32995
32996 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_gt_16_subtile) {
32997 TEST_REQUIRES_X86_FMA3;
32998 for (uint32_t n = 17; n < 32; n++) {
32999 for (size_t k = 1; k <= 5; k += 2) {
33000 for (uint32_t m = 1; m <= 1; m++) {
33001 GemmMicrokernelTester()
33002 .mr(1)
33003 .nr(16)
33004 .kr(1)
33005 .sr(1)
33006 .m(m)
33007 .n(n)
33008 .k(k)
33009 .iterations(1)
33010 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33011 }
33012 }
33013 }
33014 }
33015
33016 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16) {
33017 TEST_REQUIRES_X86_FMA3;
33018 for (uint32_t n = 32; n <= 48; n += 16) {
33019 for (size_t k = 1; k <= 5; k += 2) {
33020 GemmMicrokernelTester()
33021 .mr(1)
33022 .nr(16)
33023 .kr(1)
33024 .sr(1)
33025 .m(1)
33026 .n(16)
33027 .k(k)
33028 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33029 }
33030 }
33031 }
33032
33033 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16_strided_cn) {
33034 TEST_REQUIRES_X86_FMA3;
33035 for (uint32_t n = 32; n <= 48; n += 16) {
33036 for (size_t k = 1; k <= 5; k += 2) {
33037 GemmMicrokernelTester()
33038 .mr(1)
33039 .nr(16)
33040 .kr(1)
33041 .sr(1)
33042 .m(1)
33043 .n(n)
33044 .k(k)
33045 .cn_stride(19)
33046 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33047 }
33048 }
33049 }
33050
33051 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16_strided_a) {
33052 TEST_REQUIRES_X86_FMA3;
33053 for (uint32_t n = 32; n <= 48; n += 16) {
33054 for (size_t k = 1; k <= 5; k += 2) {
33055 GemmMicrokernelTester()
33056 .mr(1)
33057 .nr(16)
33058 .kr(1)
33059 .sr(1)
33060 .m(1)
33061 .n(n)
33062 .k(k)
33063 .a_stride(7)
33064 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33065 }
33066 }
33067 }
33068
33069 TEST(F32_GEMM_1X16__FMA3_BROADCAST, n_div_16_subtile) {
33070 TEST_REQUIRES_X86_FMA3;
33071 for (uint32_t n = 32; n <= 48; n += 16) {
33072 for (size_t k = 1; k <= 5; k += 2) {
33073 for (uint32_t m = 1; m <= 1; m++) {
33074 GemmMicrokernelTester()
33075 .mr(1)
33076 .nr(16)
33077 .kr(1)
33078 .sr(1)
33079 .m(m)
33080 .n(n)
33081 .k(k)
33082 .iterations(1)
33083 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33084 }
33085 }
33086 }
33087 }
33088
33089 TEST(F32_GEMM_1X16__FMA3_BROADCAST, strided_cm_subtile) {
33090 TEST_REQUIRES_X86_FMA3;
33091 for (size_t k = 1; k <= 5; k += 2) {
33092 for (uint32_t m = 1; m <= 1; m++) {
33093 for (uint32_t n = 1; n <= 16; n++) {
33094 GemmMicrokernelTester()
33095 .mr(1)
33096 .nr(16)
33097 .kr(1)
33098 .sr(1)
33099 .m(m)
33100 .n(n)
33101 .k(k)
33102 .cm_stride(19)
33103 .iterations(1)
33104 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33105 }
33106 }
33107 }
33108 }
33109
33110 TEST(F32_GEMM_1X16__FMA3_BROADCAST, qmin) {
33111 TEST_REQUIRES_X86_FMA3;
33112 GemmMicrokernelTester()
33113 .mr(1)
33114 .nr(16)
33115 .kr(1)
33116 .sr(1)
33117 .m(1)
33118 .n(16)
33119 .k(1)
33120 .qmin(128)
33121 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33122 }
33123
33124 TEST(F32_GEMM_1X16__FMA3_BROADCAST, qmax) {
33125 TEST_REQUIRES_X86_FMA3;
33126 GemmMicrokernelTester()
33127 .mr(1)
33128 .nr(16)
33129 .kr(1)
33130 .sr(1)
33131 .m(1)
33132 .n(16)
33133 .k(1)
33134 .qmax(128)
33135 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33136 }
33137
33138 TEST(F32_GEMM_1X16__FMA3_BROADCAST, strided_cm) {
33139 TEST_REQUIRES_X86_FMA3;
33140 GemmMicrokernelTester()
33141 .mr(1)
33142 .nr(16)
33143 .kr(1)
33144 .sr(1)
33145 .m(1)
33146 .n(16)
33147 .k(1)
33148 .cm_stride(19)
33149 .Test(xnn_f32_gemm_ukernel_1x16__fma3_broadcast);
33150 }
33151#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33152
33153
33154#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33155 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1) {
33156 TEST_REQUIRES_X86_FMA3;
33157 GemmMicrokernelTester()
33158 .mr(3)
33159 .nr(16)
33160 .kr(1)
33161 .sr(1)
33162 .m(3)
33163 .n(16)
33164 .k(1)
33165 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33166 }
33167
33168 TEST(F32_GEMM_3X16__FMA3_BROADCAST, strided_cn) {
33169 TEST_REQUIRES_X86_FMA3;
33170 GemmMicrokernelTester()
33171 .mr(3)
33172 .nr(16)
33173 .kr(1)
33174 .sr(1)
33175 .m(3)
33176 .n(16)
33177 .k(1)
33178 .cn_stride(19)
33179 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33180 }
33181
33182 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_strided_a) {
33183 TEST_REQUIRES_X86_FMA3;
33184 GemmMicrokernelTester()
33185 .mr(3)
33186 .nr(16)
33187 .kr(1)
33188 .sr(1)
33189 .m(3)
33190 .n(16)
33191 .k(1)
33192 .a_stride(3)
33193 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33194 }
33195
33196 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile) {
33197 TEST_REQUIRES_X86_FMA3;
33198 for (uint32_t m = 1; m <= 3; m++) {
33199 for (uint32_t n = 1; n <= 16; n++) {
33200 GemmMicrokernelTester()
33201 .mr(3)
33202 .nr(16)
33203 .kr(1)
33204 .sr(1)
33205 .m(m)
33206 .n(n)
33207 .k(1)
33208 .iterations(1)
33209 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33210 }
33211 }
33212 }
33213
33214 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
33215 TEST_REQUIRES_X86_FMA3;
33216 for (uint32_t m = 1; m <= 3; m++) {
33217 GemmMicrokernelTester()
33218 .mr(3)
33219 .nr(16)
33220 .kr(1)
33221 .sr(1)
33222 .m(m)
33223 .n(16)
33224 .k(1)
33225 .iterations(1)
33226 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33227 }
33228 }
33229
33230 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
33231 TEST_REQUIRES_X86_FMA3;
33232 for (uint32_t n = 1; n <= 16; n++) {
33233 GemmMicrokernelTester()
33234 .mr(3)
33235 .nr(16)
33236 .kr(1)
33237 .sr(1)
33238 .m(3)
33239 .n(n)
33240 .k(1)
33241 .iterations(1)
33242 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33243 }
33244 }
33245
33246 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_gt_1) {
33247 TEST_REQUIRES_X86_FMA3;
33248 for (size_t k = 2; k < 10; k++) {
33249 GemmMicrokernelTester()
33250 .mr(3)
33251 .nr(16)
33252 .kr(1)
33253 .sr(1)
33254 .m(3)
33255 .n(16)
33256 .k(k)
33257 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33258 }
33259 }
33260
33261 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_gt_1_strided_a) {
33262 TEST_REQUIRES_X86_FMA3;
33263 for (size_t k = 2; k < 10; k++) {
33264 GemmMicrokernelTester()
33265 .mr(3)
33266 .nr(16)
33267 .kr(1)
33268 .sr(1)
33269 .m(3)
33270 .n(16)
33271 .k(k)
33272 .a_stride(11)
33273 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33274 }
33275 }
33276
33277 TEST(F32_GEMM_3X16__FMA3_BROADCAST, k_gt_1_subtile) {
33278 TEST_REQUIRES_X86_FMA3;
33279 for (size_t k = 2; k < 10; k++) {
33280 for (uint32_t m = 1; m <= 3; m++) {
33281 for (uint32_t n = 1; n <= 16; n++) {
33282 GemmMicrokernelTester()
33283 .mr(3)
33284 .nr(16)
33285 .kr(1)
33286 .sr(1)
33287 .m(m)
33288 .n(n)
33289 .k(k)
33290 .iterations(1)
33291 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33292 }
33293 }
33294 }
33295 }
33296
33297 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16) {
33298 TEST_REQUIRES_X86_FMA3;
33299 for (uint32_t n = 17; n < 32; n++) {
33300 for (size_t k = 1; k <= 5; k += 2) {
33301 GemmMicrokernelTester()
33302 .mr(3)
33303 .nr(16)
33304 .kr(1)
33305 .sr(1)
33306 .m(3)
33307 .n(16)
33308 .k(k)
33309 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33310 }
33311 }
33312 }
33313
33314 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
33315 TEST_REQUIRES_X86_FMA3;
33316 for (uint32_t n = 17; n < 32; n++) {
33317 for (size_t k = 1; k <= 5; k += 2) {
33318 GemmMicrokernelTester()
33319 .mr(3)
33320 .nr(16)
33321 .kr(1)
33322 .sr(1)
33323 .m(3)
33324 .n(16)
33325 .k(k)
33326 .cn_stride(19)
33327 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33328 }
33329 }
33330 }
33331
33332 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16_strided_a) {
33333 TEST_REQUIRES_X86_FMA3;
33334 for (uint32_t n = 17; n < 32; n++) {
33335 for (size_t k = 1; k <= 5; k += 2) {
33336 GemmMicrokernelTester()
33337 .mr(3)
33338 .nr(16)
33339 .kr(1)
33340 .sr(1)
33341 .m(3)
33342 .n(n)
33343 .k(k)
33344 .a_stride(7)
33345 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33346 }
33347 }
33348 }
33349
33350 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_gt_16_subtile) {
33351 TEST_REQUIRES_X86_FMA3;
33352 for (uint32_t n = 17; n < 32; n++) {
33353 for (size_t k = 1; k <= 5; k += 2) {
33354 for (uint32_t m = 1; m <= 3; m++) {
33355 GemmMicrokernelTester()
33356 .mr(3)
33357 .nr(16)
33358 .kr(1)
33359 .sr(1)
33360 .m(m)
33361 .n(n)
33362 .k(k)
33363 .iterations(1)
33364 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33365 }
33366 }
33367 }
33368 }
33369
33370 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16) {
33371 TEST_REQUIRES_X86_FMA3;
33372 for (uint32_t n = 32; n <= 48; n += 16) {
33373 for (size_t k = 1; k <= 5; k += 2) {
33374 GemmMicrokernelTester()
33375 .mr(3)
33376 .nr(16)
33377 .kr(1)
33378 .sr(1)
33379 .m(3)
33380 .n(16)
33381 .k(k)
33382 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33383 }
33384 }
33385 }
33386
33387 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16_strided_cn) {
33388 TEST_REQUIRES_X86_FMA3;
33389 for (uint32_t n = 32; n <= 48; n += 16) {
33390 for (size_t k = 1; k <= 5; k += 2) {
33391 GemmMicrokernelTester()
33392 .mr(3)
33393 .nr(16)
33394 .kr(1)
33395 .sr(1)
33396 .m(3)
33397 .n(n)
33398 .k(k)
33399 .cn_stride(19)
33400 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33401 }
33402 }
33403 }
33404
33405 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16_strided_a) {
33406 TEST_REQUIRES_X86_FMA3;
33407 for (uint32_t n = 32; n <= 48; n += 16) {
33408 for (size_t k = 1; k <= 5; k += 2) {
33409 GemmMicrokernelTester()
33410 .mr(3)
33411 .nr(16)
33412 .kr(1)
33413 .sr(1)
33414 .m(3)
33415 .n(n)
33416 .k(k)
33417 .a_stride(7)
33418 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33419 }
33420 }
33421 }
33422
33423 TEST(F32_GEMM_3X16__FMA3_BROADCAST, n_div_16_subtile) {
33424 TEST_REQUIRES_X86_FMA3;
33425 for (uint32_t n = 32; n <= 48; n += 16) {
33426 for (size_t k = 1; k <= 5; k += 2) {
33427 for (uint32_t m = 1; m <= 3; m++) {
33428 GemmMicrokernelTester()
33429 .mr(3)
33430 .nr(16)
33431 .kr(1)
33432 .sr(1)
33433 .m(m)
33434 .n(n)
33435 .k(k)
33436 .iterations(1)
33437 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33438 }
33439 }
33440 }
33441 }
33442
33443 TEST(F32_GEMM_3X16__FMA3_BROADCAST, strided_cm_subtile) {
33444 TEST_REQUIRES_X86_FMA3;
33445 for (size_t k = 1; k <= 5; k += 2) {
33446 for (uint32_t m = 1; m <= 3; m++) {
33447 for (uint32_t n = 1; n <= 16; n++) {
33448 GemmMicrokernelTester()
33449 .mr(3)
33450 .nr(16)
33451 .kr(1)
33452 .sr(1)
33453 .m(m)
33454 .n(n)
33455 .k(k)
33456 .cm_stride(19)
33457 .iterations(1)
33458 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33459 }
33460 }
33461 }
33462 }
33463
33464 TEST(F32_GEMM_3X16__FMA3_BROADCAST, qmin) {
33465 TEST_REQUIRES_X86_FMA3;
33466 GemmMicrokernelTester()
33467 .mr(3)
33468 .nr(16)
33469 .kr(1)
33470 .sr(1)
33471 .m(3)
33472 .n(16)
33473 .k(1)
33474 .qmin(128)
33475 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33476 }
33477
33478 TEST(F32_GEMM_3X16__FMA3_BROADCAST, qmax) {
33479 TEST_REQUIRES_X86_FMA3;
33480 GemmMicrokernelTester()
33481 .mr(3)
33482 .nr(16)
33483 .kr(1)
33484 .sr(1)
33485 .m(3)
33486 .n(16)
33487 .k(1)
33488 .qmax(128)
33489 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33490 }
33491
33492 TEST(F32_GEMM_3X16__FMA3_BROADCAST, strided_cm) {
33493 TEST_REQUIRES_X86_FMA3;
33494 GemmMicrokernelTester()
33495 .mr(3)
33496 .nr(16)
33497 .kr(1)
33498 .sr(1)
33499 .m(3)
33500 .n(16)
33501 .k(1)
33502 .cm_stride(19)
33503 .Test(xnn_f32_gemm_ukernel_3x16__fma3_broadcast);
33504 }
33505#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33506
33507
33508#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33509 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1) {
33510 TEST_REQUIRES_X86_FMA3;
33511 GemmMicrokernelTester()
33512 .mr(4)
33513 .nr(16)
33514 .kr(1)
33515 .sr(1)
33516 .m(4)
33517 .n(16)
33518 .k(1)
33519 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33520 }
33521
33522 TEST(F32_GEMM_4X16__FMA3_BROADCAST, strided_cn) {
33523 TEST_REQUIRES_X86_FMA3;
33524 GemmMicrokernelTester()
33525 .mr(4)
33526 .nr(16)
33527 .kr(1)
33528 .sr(1)
33529 .m(4)
33530 .n(16)
33531 .k(1)
33532 .cn_stride(19)
33533 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33534 }
33535
33536 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_strided_a) {
33537 TEST_REQUIRES_X86_FMA3;
33538 GemmMicrokernelTester()
33539 .mr(4)
33540 .nr(16)
33541 .kr(1)
33542 .sr(1)
33543 .m(4)
33544 .n(16)
33545 .k(1)
33546 .a_stride(3)
33547 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33548 }
33549
33550 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile) {
33551 TEST_REQUIRES_X86_FMA3;
33552 for (uint32_t m = 1; m <= 4; m++) {
33553 for (uint32_t n = 1; n <= 16; n++) {
33554 GemmMicrokernelTester()
33555 .mr(4)
33556 .nr(16)
33557 .kr(1)
33558 .sr(1)
33559 .m(m)
33560 .n(n)
33561 .k(1)
33562 .iterations(1)
33563 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33564 }
33565 }
33566 }
33567
33568 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
33569 TEST_REQUIRES_X86_FMA3;
33570 for (uint32_t m = 1; m <= 4; m++) {
33571 GemmMicrokernelTester()
33572 .mr(4)
33573 .nr(16)
33574 .kr(1)
33575 .sr(1)
33576 .m(m)
33577 .n(16)
33578 .k(1)
33579 .iterations(1)
33580 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33581 }
33582 }
33583
33584 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
33585 TEST_REQUIRES_X86_FMA3;
33586 for (uint32_t n = 1; n <= 16; n++) {
33587 GemmMicrokernelTester()
33588 .mr(4)
33589 .nr(16)
33590 .kr(1)
33591 .sr(1)
33592 .m(4)
33593 .n(n)
33594 .k(1)
33595 .iterations(1)
33596 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33597 }
33598 }
33599
33600 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_gt_1) {
33601 TEST_REQUIRES_X86_FMA3;
33602 for (size_t k = 2; k < 10; k++) {
33603 GemmMicrokernelTester()
33604 .mr(4)
33605 .nr(16)
33606 .kr(1)
33607 .sr(1)
33608 .m(4)
33609 .n(16)
33610 .k(k)
33611 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33612 }
33613 }
33614
33615 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_gt_1_strided_a) {
33616 TEST_REQUIRES_X86_FMA3;
33617 for (size_t k = 2; k < 10; k++) {
33618 GemmMicrokernelTester()
33619 .mr(4)
33620 .nr(16)
33621 .kr(1)
33622 .sr(1)
33623 .m(4)
33624 .n(16)
33625 .k(k)
33626 .a_stride(11)
33627 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33628 }
33629 }
33630
33631 TEST(F32_GEMM_4X16__FMA3_BROADCAST, k_gt_1_subtile) {
33632 TEST_REQUIRES_X86_FMA3;
33633 for (size_t k = 2; k < 10; k++) {
33634 for (uint32_t m = 1; m <= 4; m++) {
33635 for (uint32_t n = 1; n <= 16; n++) {
33636 GemmMicrokernelTester()
33637 .mr(4)
33638 .nr(16)
33639 .kr(1)
33640 .sr(1)
33641 .m(m)
33642 .n(n)
33643 .k(k)
33644 .iterations(1)
33645 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33646 }
33647 }
33648 }
33649 }
33650
33651 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16) {
33652 TEST_REQUIRES_X86_FMA3;
33653 for (uint32_t n = 17; n < 32; n++) {
33654 for (size_t k = 1; k <= 5; k += 2) {
33655 GemmMicrokernelTester()
33656 .mr(4)
33657 .nr(16)
33658 .kr(1)
33659 .sr(1)
33660 .m(4)
33661 .n(16)
33662 .k(k)
33663 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33664 }
33665 }
33666 }
33667
33668 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
33669 TEST_REQUIRES_X86_FMA3;
33670 for (uint32_t n = 17; n < 32; n++) {
33671 for (size_t k = 1; k <= 5; k += 2) {
33672 GemmMicrokernelTester()
33673 .mr(4)
33674 .nr(16)
33675 .kr(1)
33676 .sr(1)
33677 .m(4)
33678 .n(16)
33679 .k(k)
33680 .cn_stride(19)
33681 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33682 }
33683 }
33684 }
33685
33686 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16_strided_a) {
33687 TEST_REQUIRES_X86_FMA3;
33688 for (uint32_t n = 17; n < 32; n++) {
33689 for (size_t k = 1; k <= 5; k += 2) {
33690 GemmMicrokernelTester()
33691 .mr(4)
33692 .nr(16)
33693 .kr(1)
33694 .sr(1)
33695 .m(4)
33696 .n(n)
33697 .k(k)
33698 .a_stride(7)
33699 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33700 }
33701 }
33702 }
33703
33704 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_gt_16_subtile) {
33705 TEST_REQUIRES_X86_FMA3;
33706 for (uint32_t n = 17; n < 32; n++) {
33707 for (size_t k = 1; k <= 5; k += 2) {
33708 for (uint32_t m = 1; m <= 4; m++) {
33709 GemmMicrokernelTester()
33710 .mr(4)
33711 .nr(16)
33712 .kr(1)
33713 .sr(1)
33714 .m(m)
33715 .n(n)
33716 .k(k)
33717 .iterations(1)
33718 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33719 }
33720 }
33721 }
33722 }
33723
33724 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16) {
33725 TEST_REQUIRES_X86_FMA3;
33726 for (uint32_t n = 32; n <= 48; n += 16) {
33727 for (size_t k = 1; k <= 5; k += 2) {
33728 GemmMicrokernelTester()
33729 .mr(4)
33730 .nr(16)
33731 .kr(1)
33732 .sr(1)
33733 .m(4)
33734 .n(16)
33735 .k(k)
33736 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33737 }
33738 }
33739 }
33740
33741 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16_strided_cn) {
33742 TEST_REQUIRES_X86_FMA3;
33743 for (uint32_t n = 32; n <= 48; n += 16) {
33744 for (size_t k = 1; k <= 5; k += 2) {
33745 GemmMicrokernelTester()
33746 .mr(4)
33747 .nr(16)
33748 .kr(1)
33749 .sr(1)
33750 .m(4)
33751 .n(n)
33752 .k(k)
33753 .cn_stride(19)
33754 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33755 }
33756 }
33757 }
33758
33759 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16_strided_a) {
33760 TEST_REQUIRES_X86_FMA3;
33761 for (uint32_t n = 32; n <= 48; n += 16) {
33762 for (size_t k = 1; k <= 5; k += 2) {
33763 GemmMicrokernelTester()
33764 .mr(4)
33765 .nr(16)
33766 .kr(1)
33767 .sr(1)
33768 .m(4)
33769 .n(n)
33770 .k(k)
33771 .a_stride(7)
33772 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33773 }
33774 }
33775 }
33776
33777 TEST(F32_GEMM_4X16__FMA3_BROADCAST, n_div_16_subtile) {
33778 TEST_REQUIRES_X86_FMA3;
33779 for (uint32_t n = 32; n <= 48; n += 16) {
33780 for (size_t k = 1; k <= 5; k += 2) {
33781 for (uint32_t m = 1; m <= 4; m++) {
33782 GemmMicrokernelTester()
33783 .mr(4)
33784 .nr(16)
33785 .kr(1)
33786 .sr(1)
33787 .m(m)
33788 .n(n)
33789 .k(k)
33790 .iterations(1)
33791 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33792 }
33793 }
33794 }
33795 }
33796
33797 TEST(F32_GEMM_4X16__FMA3_BROADCAST, strided_cm_subtile) {
33798 TEST_REQUIRES_X86_FMA3;
33799 for (size_t k = 1; k <= 5; k += 2) {
33800 for (uint32_t m = 1; m <= 4; m++) {
33801 for (uint32_t n = 1; n <= 16; n++) {
33802 GemmMicrokernelTester()
33803 .mr(4)
33804 .nr(16)
33805 .kr(1)
33806 .sr(1)
33807 .m(m)
33808 .n(n)
33809 .k(k)
33810 .cm_stride(19)
33811 .iterations(1)
33812 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33813 }
33814 }
33815 }
33816 }
33817
33818 TEST(F32_GEMM_4X16__FMA3_BROADCAST, qmin) {
33819 TEST_REQUIRES_X86_FMA3;
33820 GemmMicrokernelTester()
33821 .mr(4)
33822 .nr(16)
33823 .kr(1)
33824 .sr(1)
33825 .m(4)
33826 .n(16)
33827 .k(1)
33828 .qmin(128)
33829 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33830 }
33831
33832 TEST(F32_GEMM_4X16__FMA3_BROADCAST, qmax) {
33833 TEST_REQUIRES_X86_FMA3;
33834 GemmMicrokernelTester()
33835 .mr(4)
33836 .nr(16)
33837 .kr(1)
33838 .sr(1)
33839 .m(4)
33840 .n(16)
33841 .k(1)
33842 .qmax(128)
33843 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33844 }
33845
33846 TEST(F32_GEMM_4X16__FMA3_BROADCAST, strided_cm) {
33847 TEST_REQUIRES_X86_FMA3;
33848 GemmMicrokernelTester()
33849 .mr(4)
33850 .nr(16)
33851 .kr(1)
33852 .sr(1)
33853 .m(4)
33854 .n(16)
33855 .k(1)
33856 .cm_stride(19)
33857 .Test(xnn_f32_gemm_ukernel_4x16__fma3_broadcast);
33858 }
33859#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
33860
33861
33862#if XNN_ARCH_X86 || XNN_ARCH_X86_64
33863 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1) {
33864 TEST_REQUIRES_X86_FMA3;
33865 GemmMicrokernelTester()
33866 .mr(5)
33867 .nr(16)
33868 .kr(1)
33869 .sr(1)
33870 .m(5)
33871 .n(16)
33872 .k(1)
33873 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33874 }
33875
33876 TEST(F32_GEMM_5X16__FMA3_BROADCAST, strided_cn) {
33877 TEST_REQUIRES_X86_FMA3;
33878 GemmMicrokernelTester()
33879 .mr(5)
33880 .nr(16)
33881 .kr(1)
33882 .sr(1)
33883 .m(5)
33884 .n(16)
33885 .k(1)
33886 .cn_stride(19)
33887 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33888 }
33889
33890 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_strided_a) {
33891 TEST_REQUIRES_X86_FMA3;
33892 GemmMicrokernelTester()
33893 .mr(5)
33894 .nr(16)
33895 .kr(1)
33896 .sr(1)
33897 .m(5)
33898 .n(16)
33899 .k(1)
33900 .a_stride(3)
33901 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33902 }
33903
33904 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile) {
33905 TEST_REQUIRES_X86_FMA3;
33906 for (uint32_t m = 1; m <= 5; m++) {
33907 for (uint32_t n = 1; n <= 16; n++) {
33908 GemmMicrokernelTester()
33909 .mr(5)
33910 .nr(16)
33911 .kr(1)
33912 .sr(1)
33913 .m(m)
33914 .n(n)
33915 .k(1)
33916 .iterations(1)
33917 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33918 }
33919 }
33920 }
33921
33922 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_m) {
33923 TEST_REQUIRES_X86_FMA3;
33924 for (uint32_t m = 1; m <= 5; m++) {
33925 GemmMicrokernelTester()
33926 .mr(5)
33927 .nr(16)
33928 .kr(1)
33929 .sr(1)
33930 .m(m)
33931 .n(16)
33932 .k(1)
33933 .iterations(1)
33934 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33935 }
33936 }
33937
33938 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_eq_1_subtile_n) {
33939 TEST_REQUIRES_X86_FMA3;
33940 for (uint32_t n = 1; n <= 16; n++) {
33941 GemmMicrokernelTester()
33942 .mr(5)
33943 .nr(16)
33944 .kr(1)
33945 .sr(1)
33946 .m(5)
33947 .n(n)
33948 .k(1)
33949 .iterations(1)
33950 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33951 }
33952 }
33953
33954 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_gt_1) {
33955 TEST_REQUIRES_X86_FMA3;
33956 for (size_t k = 2; k < 10; k++) {
33957 GemmMicrokernelTester()
33958 .mr(5)
33959 .nr(16)
33960 .kr(1)
33961 .sr(1)
33962 .m(5)
33963 .n(16)
33964 .k(k)
33965 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33966 }
33967 }
33968
33969 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_gt_1_strided_a) {
33970 TEST_REQUIRES_X86_FMA3;
33971 for (size_t k = 2; k < 10; k++) {
33972 GemmMicrokernelTester()
33973 .mr(5)
33974 .nr(16)
33975 .kr(1)
33976 .sr(1)
33977 .m(5)
33978 .n(16)
33979 .k(k)
33980 .a_stride(11)
33981 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
33982 }
33983 }
33984
33985 TEST(F32_GEMM_5X16__FMA3_BROADCAST, k_gt_1_subtile) {
33986 TEST_REQUIRES_X86_FMA3;
33987 for (size_t k = 2; k < 10; k++) {
33988 for (uint32_t m = 1; m <= 5; m++) {
33989 for (uint32_t n = 1; n <= 16; n++) {
33990 GemmMicrokernelTester()
33991 .mr(5)
33992 .nr(16)
33993 .kr(1)
33994 .sr(1)
33995 .m(m)
33996 .n(n)
33997 .k(k)
33998 .iterations(1)
33999 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34000 }
34001 }
34002 }
34003 }
34004
34005 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16) {
34006 TEST_REQUIRES_X86_FMA3;
34007 for (uint32_t n = 17; n < 32; n++) {
34008 for (size_t k = 1; k <= 5; k += 2) {
34009 GemmMicrokernelTester()
34010 .mr(5)
34011 .nr(16)
34012 .kr(1)
34013 .sr(1)
34014 .m(5)
34015 .n(16)
34016 .k(k)
34017 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34018 }
34019 }
34020 }
34021
34022 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16_strided_cn) {
34023 TEST_REQUIRES_X86_FMA3;
34024 for (uint32_t n = 17; n < 32; n++) {
34025 for (size_t k = 1; k <= 5; k += 2) {
34026 GemmMicrokernelTester()
34027 .mr(5)
34028 .nr(16)
34029 .kr(1)
34030 .sr(1)
34031 .m(5)
34032 .n(16)
34033 .k(k)
34034 .cn_stride(19)
34035 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34036 }
34037 }
34038 }
34039
34040 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16_strided_a) {
34041 TEST_REQUIRES_X86_FMA3;
34042 for (uint32_t n = 17; n < 32; n++) {
34043 for (size_t k = 1; k <= 5; k += 2) {
34044 GemmMicrokernelTester()
34045 .mr(5)
34046 .nr(16)
34047 .kr(1)
34048 .sr(1)
34049 .m(5)
34050 .n(n)
34051 .k(k)
34052 .a_stride(7)
34053 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34054 }
34055 }
34056 }
34057
34058 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_gt_16_subtile) {
34059 TEST_REQUIRES_X86_FMA3;
34060 for (uint32_t n = 17; n < 32; n++) {
34061 for (size_t k = 1; k <= 5; k += 2) {
34062 for (uint32_t m = 1; m <= 5; m++) {
34063 GemmMicrokernelTester()
34064 .mr(5)
34065 .nr(16)
34066 .kr(1)
34067 .sr(1)
34068 .m(m)
34069 .n(n)
34070 .k(k)
34071 .iterations(1)
34072 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34073 }
34074 }
34075 }
34076 }
34077
34078 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16) {
34079 TEST_REQUIRES_X86_FMA3;
34080 for (uint32_t n = 32; n <= 48; n += 16) {
34081 for (size_t k = 1; k <= 5; k += 2) {
34082 GemmMicrokernelTester()
34083 .mr(5)
34084 .nr(16)
34085 .kr(1)
34086 .sr(1)
34087 .m(5)
34088 .n(16)
34089 .k(k)
34090 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34091 }
34092 }
34093 }
34094
34095 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16_strided_cn) {
34096 TEST_REQUIRES_X86_FMA3;
34097 for (uint32_t n = 32; n <= 48; n += 16) {
34098 for (size_t k = 1; k <= 5; k += 2) {
34099 GemmMicrokernelTester()
34100 .mr(5)
34101 .nr(16)
34102 .kr(1)
34103 .sr(1)
34104 .m(5)
34105 .n(n)
34106 .k(k)
34107 .cn_stride(19)
34108 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34109 }
34110 }
34111 }
34112
34113 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16_strided_a) {
34114 TEST_REQUIRES_X86_FMA3;
34115 for (uint32_t n = 32; n <= 48; n += 16) {
34116 for (size_t k = 1; k <= 5; k += 2) {
34117 GemmMicrokernelTester()
34118 .mr(5)
34119 .nr(16)
34120 .kr(1)
34121 .sr(1)
34122 .m(5)
34123 .n(n)
34124 .k(k)
34125 .a_stride(7)
34126 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34127 }
34128 }
34129 }
34130
34131 TEST(F32_GEMM_5X16__FMA3_BROADCAST, n_div_16_subtile) {
34132 TEST_REQUIRES_X86_FMA3;
34133 for (uint32_t n = 32; n <= 48; n += 16) {
34134 for (size_t k = 1; k <= 5; k += 2) {
34135 for (uint32_t m = 1; m <= 5; m++) {
34136 GemmMicrokernelTester()
34137 .mr(5)
34138 .nr(16)
34139 .kr(1)
34140 .sr(1)
34141 .m(m)
34142 .n(n)
34143 .k(k)
34144 .iterations(1)
34145 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34146 }
34147 }
34148 }
34149 }
34150
34151 TEST(F32_GEMM_5X16__FMA3_BROADCAST, strided_cm_subtile) {
34152 TEST_REQUIRES_X86_FMA3;
34153 for (size_t k = 1; k <= 5; k += 2) {
34154 for (uint32_t m = 1; m <= 5; m++) {
34155 for (uint32_t n = 1; n <= 16; n++) {
34156 GemmMicrokernelTester()
34157 .mr(5)
34158 .nr(16)
34159 .kr(1)
34160 .sr(1)
34161 .m(m)
34162 .n(n)
34163 .k(k)
34164 .cm_stride(19)
34165 .iterations(1)
34166 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34167 }
34168 }
34169 }
34170 }
34171
34172 TEST(F32_GEMM_5X16__FMA3_BROADCAST, qmin) {
34173 TEST_REQUIRES_X86_FMA3;
34174 GemmMicrokernelTester()
34175 .mr(5)
34176 .nr(16)
34177 .kr(1)
34178 .sr(1)
34179 .m(5)
34180 .n(16)
34181 .k(1)
34182 .qmin(128)
34183 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34184 }
34185
34186 TEST(F32_GEMM_5X16__FMA3_BROADCAST, qmax) {
34187 TEST_REQUIRES_X86_FMA3;
34188 GemmMicrokernelTester()
34189 .mr(5)
34190 .nr(16)
34191 .kr(1)
34192 .sr(1)
34193 .m(5)
34194 .n(16)
34195 .k(1)
34196 .qmax(128)
34197 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34198 }
34199
34200 TEST(F32_GEMM_5X16__FMA3_BROADCAST, strided_cm) {
34201 TEST_REQUIRES_X86_FMA3;
34202 GemmMicrokernelTester()
34203 .mr(5)
34204 .nr(16)
34205 .kr(1)
34206 .sr(1)
34207 .m(5)
34208 .n(16)
34209 .k(1)
34210 .cm_stride(19)
34211 .Test(xnn_f32_gemm_ukernel_5x16__fma3_broadcast);
34212 }
34213#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34214
34215
34216#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan27121322019-12-09 14:57:40 -080034217 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4) {
34218 TEST_REQUIRES_X86_FMA3;
34219 GemmMicrokernelTester()
34220 .mr(1)
34221 .nr(16)
34222 .kr(1)
34223 .sr(4)
34224 .m(1)
34225 .n(16)
34226 .k(4)
34227 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34228 }
34229
34230 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, strided_cn) {
34231 TEST_REQUIRES_X86_FMA3;
34232 GemmMicrokernelTester()
34233 .mr(1)
34234 .nr(16)
34235 .kr(1)
34236 .sr(4)
34237 .m(1)
34238 .n(16)
34239 .k(4)
34240 .cn_stride(19)
34241 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34242 }
34243
34244 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
34245 TEST_REQUIRES_X86_FMA3;
34246 GemmMicrokernelTester()
34247 .mr(1)
34248 .nr(16)
34249 .kr(1)
34250 .sr(4)
34251 .m(1)
34252 .n(16)
34253 .k(4)
34254 .a_stride(7)
34255 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34256 }
34257
34258 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
34259 TEST_REQUIRES_X86_FMA3;
34260 for (uint32_t m = 1; m <= 1; m++) {
34261 for (uint32_t n = 1; n <= 16; n++) {
34262 GemmMicrokernelTester()
34263 .mr(1)
34264 .nr(16)
34265 .kr(1)
34266 .sr(4)
34267 .m(m)
34268 .n(n)
34269 .k(4)
34270 .iterations(1)
34271 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34272 }
34273 }
34274 }
34275
34276 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
34277 TEST_REQUIRES_X86_FMA3;
34278 for (uint32_t m = 1; m <= 1; m++) {
34279 GemmMicrokernelTester()
34280 .mr(1)
34281 .nr(16)
34282 .kr(1)
34283 .sr(4)
34284 .m(m)
34285 .n(16)
34286 .k(4)
34287 .iterations(1)
34288 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34289 }
34290 }
34291
34292 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
34293 TEST_REQUIRES_X86_FMA3;
34294 for (uint32_t n = 1; n <= 16; n++) {
34295 GemmMicrokernelTester()
34296 .mr(1)
34297 .nr(16)
34298 .kr(1)
34299 .sr(4)
34300 .m(1)
34301 .n(n)
34302 .k(4)
34303 .iterations(1)
34304 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34305 }
34306 }
34307
34308 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_lt_4) {
34309 TEST_REQUIRES_X86_FMA3;
34310 for (size_t k = 1; k < 4; k++) {
34311 GemmMicrokernelTester()
34312 .mr(1)
34313 .nr(16)
34314 .kr(1)
34315 .sr(4)
34316 .m(1)
34317 .n(16)
34318 .k(k)
34319 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34320 }
34321 }
34322
34323 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
34324 TEST_REQUIRES_X86_FMA3;
34325 for (size_t k = 1; k < 4; k++) {
34326 GemmMicrokernelTester()
34327 .mr(1)
34328 .nr(16)
34329 .kr(1)
34330 .sr(4)
34331 .m(1)
34332 .n(16)
34333 .k(k)
34334 .a_stride(7)
34335 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34336 }
34337 }
34338
34339 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
34340 TEST_REQUIRES_X86_FMA3;
34341 for (size_t k = 1; k < 4; k++) {
34342 for (uint32_t m = 1; m <= 1; m++) {
34343 for (uint32_t n = 1; n <= 16; n++) {
34344 GemmMicrokernelTester()
34345 .mr(1)
34346 .nr(16)
34347 .kr(1)
34348 .sr(4)
34349 .m(m)
34350 .n(n)
34351 .k(k)
34352 .iterations(1)
34353 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34354 }
34355 }
34356 }
34357 }
34358
34359 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_gt_4) {
34360 TEST_REQUIRES_X86_FMA3;
34361 for (size_t k = 5; k < 8; k++) {
34362 GemmMicrokernelTester()
34363 .mr(1)
34364 .nr(16)
34365 .kr(1)
34366 .sr(4)
34367 .m(1)
34368 .n(16)
34369 .k(k)
34370 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34371 }
34372 }
34373
34374 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
34375 TEST_REQUIRES_X86_FMA3;
34376 for (size_t k = 5; k < 8; k++) {
34377 GemmMicrokernelTester()
34378 .mr(1)
34379 .nr(16)
34380 .kr(1)
34381 .sr(4)
34382 .m(1)
34383 .n(16)
34384 .k(k)
34385 .a_stride(11)
34386 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34387 }
34388 }
34389
34390 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
34391 TEST_REQUIRES_X86_FMA3;
34392 for (size_t k = 5; k < 8; k++) {
34393 for (uint32_t m = 1; m <= 1; m++) {
34394 for (uint32_t n = 1; n <= 16; n++) {
34395 GemmMicrokernelTester()
34396 .mr(1)
34397 .nr(16)
34398 .kr(1)
34399 .sr(4)
34400 .m(m)
34401 .n(n)
34402 .k(k)
34403 .iterations(1)
34404 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34405 }
34406 }
34407 }
34408 }
34409
34410 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_div_4) {
34411 TEST_REQUIRES_X86_FMA3;
34412 for (size_t k = 8; k <= 40; k += 4) {
34413 GemmMicrokernelTester()
34414 .mr(1)
34415 .nr(16)
34416 .kr(1)
34417 .sr(4)
34418 .m(1)
34419 .n(16)
34420 .k(k)
34421 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34422 }
34423 }
34424
34425 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
34426 TEST_REQUIRES_X86_FMA3;
34427 for (size_t k = 8; k <= 40; k += 4) {
34428 GemmMicrokernelTester()
34429 .mr(1)
34430 .nr(16)
34431 .kr(1)
34432 .sr(4)
34433 .m(1)
34434 .n(16)
34435 .k(k)
34436 .a_stride(43)
34437 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34438 }
34439 }
34440
34441 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, k_div_4_subtile) {
34442 TEST_REQUIRES_X86_FMA3;
34443 for (size_t k = 8; k <= 40; k += 4) {
34444 for (uint32_t m = 1; m <= 1; m++) {
34445 for (uint32_t n = 1; n <= 16; n++) {
34446 GemmMicrokernelTester()
34447 .mr(1)
34448 .nr(16)
34449 .kr(1)
34450 .sr(4)
34451 .m(m)
34452 .n(n)
34453 .k(k)
34454 .iterations(1)
34455 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34456 }
34457 }
34458 }
34459 }
34460
34461 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16) {
34462 TEST_REQUIRES_X86_FMA3;
34463 for (uint32_t n = 17; n < 32; n++) {
34464 for (size_t k = 1; k <= 20; k += 5) {
34465 GemmMicrokernelTester()
34466 .mr(1)
34467 .nr(16)
34468 .kr(1)
34469 .sr(4)
34470 .m(1)
34471 .n(16)
34472 .k(k)
34473 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34474 }
34475 }
34476 }
34477
34478 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
34479 TEST_REQUIRES_X86_FMA3;
34480 for (uint32_t n = 17; n < 32; n++) {
34481 for (size_t k = 1; k <= 20; k += 5) {
34482 GemmMicrokernelTester()
34483 .mr(1)
34484 .nr(16)
34485 .kr(1)
34486 .sr(4)
34487 .m(1)
34488 .n(16)
34489 .k(k)
34490 .cn_stride(19)
34491 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34492 }
34493 }
34494 }
34495
34496 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
34497 TEST_REQUIRES_X86_FMA3;
34498 for (uint32_t n = 17; n < 32; n++) {
34499 for (size_t k = 1; k <= 20; k += 5) {
34500 GemmMicrokernelTester()
34501 .mr(1)
34502 .nr(16)
34503 .kr(1)
34504 .sr(4)
34505 .m(1)
34506 .n(n)
34507 .k(k)
34508 .a_stride(23)
34509 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34510 }
34511 }
34512 }
34513
34514 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
34515 TEST_REQUIRES_X86_FMA3;
34516 for (uint32_t n = 17; n < 32; n++) {
34517 for (size_t k = 1; k <= 20; k += 5) {
34518 for (uint32_t m = 1; m <= 1; m++) {
34519 GemmMicrokernelTester()
34520 .mr(1)
34521 .nr(16)
34522 .kr(1)
34523 .sr(4)
34524 .m(m)
34525 .n(n)
34526 .k(k)
34527 .iterations(1)
34528 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34529 }
34530 }
34531 }
34532 }
34533
34534 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16) {
34535 TEST_REQUIRES_X86_FMA3;
34536 for (uint32_t n = 32; n <= 48; n += 16) {
34537 for (size_t k = 1; k <= 20; k += 5) {
34538 GemmMicrokernelTester()
34539 .mr(1)
34540 .nr(16)
34541 .kr(1)
34542 .sr(4)
34543 .m(1)
34544 .n(16)
34545 .k(k)
34546 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34547 }
34548 }
34549 }
34550
34551 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
34552 TEST_REQUIRES_X86_FMA3;
34553 for (uint32_t n = 32; n <= 48; n += 16) {
34554 for (size_t k = 1; k <= 20; k += 5) {
34555 GemmMicrokernelTester()
34556 .mr(1)
34557 .nr(16)
34558 .kr(1)
34559 .sr(4)
34560 .m(1)
34561 .n(n)
34562 .k(k)
34563 .cn_stride(19)
34564 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34565 }
34566 }
34567 }
34568
34569 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
34570 TEST_REQUIRES_X86_FMA3;
34571 for (uint32_t n = 32; n <= 48; n += 16) {
34572 for (size_t k = 1; k <= 20; k += 5) {
34573 GemmMicrokernelTester()
34574 .mr(1)
34575 .nr(16)
34576 .kr(1)
34577 .sr(4)
34578 .m(1)
34579 .n(n)
34580 .k(k)
34581 .a_stride(23)
34582 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34583 }
34584 }
34585 }
34586
34587 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, n_div_16_subtile) {
34588 TEST_REQUIRES_X86_FMA3;
34589 for (uint32_t n = 32; n <= 48; n += 16) {
34590 for (size_t k = 1; k <= 20; k += 5) {
34591 for (uint32_t m = 1; m <= 1; m++) {
34592 GemmMicrokernelTester()
34593 .mr(1)
34594 .nr(16)
34595 .kr(1)
34596 .sr(4)
34597 .m(m)
34598 .n(n)
34599 .k(k)
34600 .iterations(1)
34601 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34602 }
34603 }
34604 }
34605 }
34606
34607 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, strided_cm_subtile) {
34608 TEST_REQUIRES_X86_FMA3;
34609 for (size_t k = 1; k <= 20; k += 5) {
34610 for (uint32_t m = 1; m <= 1; m++) {
34611 for (uint32_t n = 1; n <= 16; n++) {
34612 GemmMicrokernelTester()
34613 .mr(1)
34614 .nr(16)
34615 .kr(1)
34616 .sr(4)
34617 .m(m)
34618 .n(n)
34619 .k(k)
34620 .cm_stride(19)
34621 .iterations(1)
34622 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34623 }
34624 }
34625 }
34626 }
34627
34628 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, qmin) {
34629 TEST_REQUIRES_X86_FMA3;
34630 GemmMicrokernelTester()
34631 .mr(1)
34632 .nr(16)
34633 .kr(1)
34634 .sr(4)
34635 .m(1)
34636 .n(16)
34637 .k(4)
34638 .qmin(128)
34639 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34640 }
34641
34642 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, qmax) {
34643 TEST_REQUIRES_X86_FMA3;
34644 GemmMicrokernelTester()
34645 .mr(1)
34646 .nr(16)
34647 .kr(1)
34648 .sr(4)
34649 .m(1)
34650 .n(16)
34651 .k(4)
34652 .qmax(128)
34653 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34654 }
34655
34656 TEST(F32_GEMM_1X16S4__FMA3_BROADCAST, strided_cm) {
34657 TEST_REQUIRES_X86_FMA3;
34658 GemmMicrokernelTester()
34659 .mr(1)
34660 .nr(16)
34661 .kr(1)
34662 .sr(4)
34663 .m(1)
34664 .n(16)
34665 .k(4)
34666 .cm_stride(19)
34667 .Test(xnn_f32_gemm_ukernel_1x16s4__fma3_broadcast);
34668 }
34669#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
34670
34671
34672#if XNN_ARCH_X86 || XNN_ARCH_X86_64
34673 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4) {
34674 TEST_REQUIRES_X86_FMA3;
34675 GemmMicrokernelTester()
34676 .mr(3)
34677 .nr(16)
34678 .kr(1)
34679 .sr(4)
34680 .m(3)
34681 .n(16)
34682 .k(4)
34683 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34684 }
34685
34686 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, strided_cn) {
34687 TEST_REQUIRES_X86_FMA3;
34688 GemmMicrokernelTester()
34689 .mr(3)
34690 .nr(16)
34691 .kr(1)
34692 .sr(4)
34693 .m(3)
34694 .n(16)
34695 .k(4)
34696 .cn_stride(19)
34697 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34698 }
34699
34700 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
34701 TEST_REQUIRES_X86_FMA3;
34702 GemmMicrokernelTester()
34703 .mr(3)
34704 .nr(16)
34705 .kr(1)
34706 .sr(4)
34707 .m(3)
34708 .n(16)
34709 .k(4)
34710 .a_stride(7)
34711 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34712 }
34713
34714 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
34715 TEST_REQUIRES_X86_FMA3;
34716 for (uint32_t m = 1; m <= 3; m++) {
34717 for (uint32_t n = 1; n <= 16; n++) {
34718 GemmMicrokernelTester()
34719 .mr(3)
34720 .nr(16)
34721 .kr(1)
34722 .sr(4)
34723 .m(m)
34724 .n(n)
34725 .k(4)
34726 .iterations(1)
34727 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34728 }
34729 }
34730 }
34731
34732 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
34733 TEST_REQUIRES_X86_FMA3;
34734 for (uint32_t m = 1; m <= 3; m++) {
34735 GemmMicrokernelTester()
34736 .mr(3)
34737 .nr(16)
34738 .kr(1)
34739 .sr(4)
34740 .m(m)
34741 .n(16)
34742 .k(4)
34743 .iterations(1)
34744 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34745 }
34746 }
34747
34748 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
34749 TEST_REQUIRES_X86_FMA3;
34750 for (uint32_t n = 1; n <= 16; n++) {
34751 GemmMicrokernelTester()
34752 .mr(3)
34753 .nr(16)
34754 .kr(1)
34755 .sr(4)
34756 .m(3)
34757 .n(n)
34758 .k(4)
34759 .iterations(1)
34760 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34761 }
34762 }
34763
34764 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_lt_4) {
34765 TEST_REQUIRES_X86_FMA3;
34766 for (size_t k = 1; k < 4; k++) {
34767 GemmMicrokernelTester()
34768 .mr(3)
34769 .nr(16)
34770 .kr(1)
34771 .sr(4)
34772 .m(3)
34773 .n(16)
34774 .k(k)
34775 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34776 }
34777 }
34778
34779 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
34780 TEST_REQUIRES_X86_FMA3;
34781 for (size_t k = 1; k < 4; k++) {
34782 GemmMicrokernelTester()
34783 .mr(3)
34784 .nr(16)
34785 .kr(1)
34786 .sr(4)
34787 .m(3)
34788 .n(16)
34789 .k(k)
34790 .a_stride(7)
34791 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34792 }
34793 }
34794
34795 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
34796 TEST_REQUIRES_X86_FMA3;
34797 for (size_t k = 1; k < 4; k++) {
34798 for (uint32_t m = 1; m <= 3; m++) {
34799 for (uint32_t n = 1; n <= 16; n++) {
34800 GemmMicrokernelTester()
34801 .mr(3)
34802 .nr(16)
34803 .kr(1)
34804 .sr(4)
34805 .m(m)
34806 .n(n)
34807 .k(k)
34808 .iterations(1)
34809 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34810 }
34811 }
34812 }
34813 }
34814
34815 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_gt_4) {
34816 TEST_REQUIRES_X86_FMA3;
34817 for (size_t k = 5; k < 8; k++) {
34818 GemmMicrokernelTester()
34819 .mr(3)
34820 .nr(16)
34821 .kr(1)
34822 .sr(4)
34823 .m(3)
34824 .n(16)
34825 .k(k)
34826 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34827 }
34828 }
34829
34830 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
34831 TEST_REQUIRES_X86_FMA3;
34832 for (size_t k = 5; k < 8; k++) {
34833 GemmMicrokernelTester()
34834 .mr(3)
34835 .nr(16)
34836 .kr(1)
34837 .sr(4)
34838 .m(3)
34839 .n(16)
34840 .k(k)
34841 .a_stride(11)
34842 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34843 }
34844 }
34845
34846 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
34847 TEST_REQUIRES_X86_FMA3;
34848 for (size_t k = 5; k < 8; k++) {
34849 for (uint32_t m = 1; m <= 3; m++) {
34850 for (uint32_t n = 1; n <= 16; n++) {
34851 GemmMicrokernelTester()
34852 .mr(3)
34853 .nr(16)
34854 .kr(1)
34855 .sr(4)
34856 .m(m)
34857 .n(n)
34858 .k(k)
34859 .iterations(1)
34860 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34861 }
34862 }
34863 }
34864 }
34865
34866 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_div_4) {
34867 TEST_REQUIRES_X86_FMA3;
34868 for (size_t k = 8; k <= 40; k += 4) {
34869 GemmMicrokernelTester()
34870 .mr(3)
34871 .nr(16)
34872 .kr(1)
34873 .sr(4)
34874 .m(3)
34875 .n(16)
34876 .k(k)
34877 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34878 }
34879 }
34880
34881 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
34882 TEST_REQUIRES_X86_FMA3;
34883 for (size_t k = 8; k <= 40; k += 4) {
34884 GemmMicrokernelTester()
34885 .mr(3)
34886 .nr(16)
34887 .kr(1)
34888 .sr(4)
34889 .m(3)
34890 .n(16)
34891 .k(k)
34892 .a_stride(43)
34893 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34894 }
34895 }
34896
34897 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, k_div_4_subtile) {
34898 TEST_REQUIRES_X86_FMA3;
34899 for (size_t k = 8; k <= 40; k += 4) {
34900 for (uint32_t m = 1; m <= 3; m++) {
34901 for (uint32_t n = 1; n <= 16; n++) {
34902 GemmMicrokernelTester()
34903 .mr(3)
34904 .nr(16)
34905 .kr(1)
34906 .sr(4)
34907 .m(m)
34908 .n(n)
34909 .k(k)
34910 .iterations(1)
34911 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34912 }
34913 }
34914 }
34915 }
34916
34917 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16) {
34918 TEST_REQUIRES_X86_FMA3;
34919 for (uint32_t n = 17; n < 32; n++) {
34920 for (size_t k = 1; k <= 20; k += 5) {
34921 GemmMicrokernelTester()
34922 .mr(3)
34923 .nr(16)
34924 .kr(1)
34925 .sr(4)
34926 .m(3)
34927 .n(16)
34928 .k(k)
34929 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34930 }
34931 }
34932 }
34933
34934 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
34935 TEST_REQUIRES_X86_FMA3;
34936 for (uint32_t n = 17; n < 32; n++) {
34937 for (size_t k = 1; k <= 20; k += 5) {
34938 GemmMicrokernelTester()
34939 .mr(3)
34940 .nr(16)
34941 .kr(1)
34942 .sr(4)
34943 .m(3)
34944 .n(16)
34945 .k(k)
34946 .cn_stride(19)
34947 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34948 }
34949 }
34950 }
34951
34952 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
34953 TEST_REQUIRES_X86_FMA3;
34954 for (uint32_t n = 17; n < 32; n++) {
34955 for (size_t k = 1; k <= 20; k += 5) {
34956 GemmMicrokernelTester()
34957 .mr(3)
34958 .nr(16)
34959 .kr(1)
34960 .sr(4)
34961 .m(3)
34962 .n(n)
34963 .k(k)
34964 .a_stride(23)
34965 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34966 }
34967 }
34968 }
34969
34970 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
34971 TEST_REQUIRES_X86_FMA3;
34972 for (uint32_t n = 17; n < 32; n++) {
34973 for (size_t k = 1; k <= 20; k += 5) {
34974 for (uint32_t m = 1; m <= 3; m++) {
34975 GemmMicrokernelTester()
34976 .mr(3)
34977 .nr(16)
34978 .kr(1)
34979 .sr(4)
34980 .m(m)
34981 .n(n)
34982 .k(k)
34983 .iterations(1)
34984 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
34985 }
34986 }
34987 }
34988 }
34989
34990 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16) {
34991 TEST_REQUIRES_X86_FMA3;
34992 for (uint32_t n = 32; n <= 48; n += 16) {
34993 for (size_t k = 1; k <= 20; k += 5) {
34994 GemmMicrokernelTester()
34995 .mr(3)
34996 .nr(16)
34997 .kr(1)
34998 .sr(4)
34999 .m(3)
35000 .n(16)
35001 .k(k)
35002 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35003 }
35004 }
35005 }
35006
35007 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
35008 TEST_REQUIRES_X86_FMA3;
35009 for (uint32_t n = 32; n <= 48; n += 16) {
35010 for (size_t k = 1; k <= 20; k += 5) {
35011 GemmMicrokernelTester()
35012 .mr(3)
35013 .nr(16)
35014 .kr(1)
35015 .sr(4)
35016 .m(3)
35017 .n(n)
35018 .k(k)
35019 .cn_stride(19)
35020 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35021 }
35022 }
35023 }
35024
35025 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
35026 TEST_REQUIRES_X86_FMA3;
35027 for (uint32_t n = 32; n <= 48; n += 16) {
35028 for (size_t k = 1; k <= 20; k += 5) {
35029 GemmMicrokernelTester()
35030 .mr(3)
35031 .nr(16)
35032 .kr(1)
35033 .sr(4)
35034 .m(3)
35035 .n(n)
35036 .k(k)
35037 .a_stride(23)
35038 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35039 }
35040 }
35041 }
35042
35043 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, n_div_16_subtile) {
35044 TEST_REQUIRES_X86_FMA3;
35045 for (uint32_t n = 32; n <= 48; n += 16) {
35046 for (size_t k = 1; k <= 20; k += 5) {
35047 for (uint32_t m = 1; m <= 3; m++) {
35048 GemmMicrokernelTester()
35049 .mr(3)
35050 .nr(16)
35051 .kr(1)
35052 .sr(4)
35053 .m(m)
35054 .n(n)
35055 .k(k)
35056 .iterations(1)
35057 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35058 }
35059 }
35060 }
35061 }
35062
35063 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, strided_cm_subtile) {
35064 TEST_REQUIRES_X86_FMA3;
35065 for (size_t k = 1; k <= 20; k += 5) {
35066 for (uint32_t m = 1; m <= 3; m++) {
35067 for (uint32_t n = 1; n <= 16; n++) {
35068 GemmMicrokernelTester()
35069 .mr(3)
35070 .nr(16)
35071 .kr(1)
35072 .sr(4)
35073 .m(m)
35074 .n(n)
35075 .k(k)
35076 .cm_stride(19)
35077 .iterations(1)
35078 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35079 }
35080 }
35081 }
35082 }
35083
35084 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, qmin) {
35085 TEST_REQUIRES_X86_FMA3;
35086 GemmMicrokernelTester()
35087 .mr(3)
35088 .nr(16)
35089 .kr(1)
35090 .sr(4)
35091 .m(3)
35092 .n(16)
35093 .k(4)
35094 .qmin(128)
35095 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35096 }
35097
35098 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, qmax) {
35099 TEST_REQUIRES_X86_FMA3;
35100 GemmMicrokernelTester()
35101 .mr(3)
35102 .nr(16)
35103 .kr(1)
35104 .sr(4)
35105 .m(3)
35106 .n(16)
35107 .k(4)
35108 .qmax(128)
35109 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35110 }
35111
35112 TEST(F32_GEMM_3X16S4__FMA3_BROADCAST, strided_cm) {
35113 TEST_REQUIRES_X86_FMA3;
35114 GemmMicrokernelTester()
35115 .mr(3)
35116 .nr(16)
35117 .kr(1)
35118 .sr(4)
35119 .m(3)
35120 .n(16)
35121 .k(4)
35122 .cm_stride(19)
35123 .Test(xnn_f32_gemm_ukernel_3x16s4__fma3_broadcast);
35124 }
35125#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35126
35127
35128#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35129 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4) {
35130 TEST_REQUIRES_X86_FMA3;
35131 GemmMicrokernelTester()
35132 .mr(4)
35133 .nr(16)
35134 .kr(1)
35135 .sr(4)
35136 .m(4)
35137 .n(16)
35138 .k(4)
35139 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35140 }
35141
35142 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, strided_cn) {
35143 TEST_REQUIRES_X86_FMA3;
35144 GemmMicrokernelTester()
35145 .mr(4)
35146 .nr(16)
35147 .kr(1)
35148 .sr(4)
35149 .m(4)
35150 .n(16)
35151 .k(4)
35152 .cn_stride(19)
35153 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35154 }
35155
35156 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
35157 TEST_REQUIRES_X86_FMA3;
35158 GemmMicrokernelTester()
35159 .mr(4)
35160 .nr(16)
35161 .kr(1)
35162 .sr(4)
35163 .m(4)
35164 .n(16)
35165 .k(4)
35166 .a_stride(7)
35167 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35168 }
35169
35170 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
35171 TEST_REQUIRES_X86_FMA3;
35172 for (uint32_t m = 1; m <= 4; m++) {
35173 for (uint32_t n = 1; n <= 16; n++) {
35174 GemmMicrokernelTester()
35175 .mr(4)
35176 .nr(16)
35177 .kr(1)
35178 .sr(4)
35179 .m(m)
35180 .n(n)
35181 .k(4)
35182 .iterations(1)
35183 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35184 }
35185 }
35186 }
35187
35188 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
35189 TEST_REQUIRES_X86_FMA3;
35190 for (uint32_t m = 1; m <= 4; m++) {
35191 GemmMicrokernelTester()
35192 .mr(4)
35193 .nr(16)
35194 .kr(1)
35195 .sr(4)
35196 .m(m)
35197 .n(16)
35198 .k(4)
35199 .iterations(1)
35200 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35201 }
35202 }
35203
35204 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
35205 TEST_REQUIRES_X86_FMA3;
35206 for (uint32_t n = 1; n <= 16; n++) {
35207 GemmMicrokernelTester()
35208 .mr(4)
35209 .nr(16)
35210 .kr(1)
35211 .sr(4)
35212 .m(4)
35213 .n(n)
35214 .k(4)
35215 .iterations(1)
35216 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35217 }
35218 }
35219
35220 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_lt_4) {
35221 TEST_REQUIRES_X86_FMA3;
35222 for (size_t k = 1; k < 4; k++) {
35223 GemmMicrokernelTester()
35224 .mr(4)
35225 .nr(16)
35226 .kr(1)
35227 .sr(4)
35228 .m(4)
35229 .n(16)
35230 .k(k)
35231 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35232 }
35233 }
35234
35235 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
35236 TEST_REQUIRES_X86_FMA3;
35237 for (size_t k = 1; k < 4; k++) {
35238 GemmMicrokernelTester()
35239 .mr(4)
35240 .nr(16)
35241 .kr(1)
35242 .sr(4)
35243 .m(4)
35244 .n(16)
35245 .k(k)
35246 .a_stride(7)
35247 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35248 }
35249 }
35250
35251 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
35252 TEST_REQUIRES_X86_FMA3;
35253 for (size_t k = 1; k < 4; k++) {
35254 for (uint32_t m = 1; m <= 4; m++) {
35255 for (uint32_t n = 1; n <= 16; n++) {
35256 GemmMicrokernelTester()
35257 .mr(4)
35258 .nr(16)
35259 .kr(1)
35260 .sr(4)
35261 .m(m)
35262 .n(n)
35263 .k(k)
35264 .iterations(1)
35265 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35266 }
35267 }
35268 }
35269 }
35270
35271 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_gt_4) {
35272 TEST_REQUIRES_X86_FMA3;
35273 for (size_t k = 5; k < 8; k++) {
35274 GemmMicrokernelTester()
35275 .mr(4)
35276 .nr(16)
35277 .kr(1)
35278 .sr(4)
35279 .m(4)
35280 .n(16)
35281 .k(k)
35282 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35283 }
35284 }
35285
35286 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
35287 TEST_REQUIRES_X86_FMA3;
35288 for (size_t k = 5; k < 8; k++) {
35289 GemmMicrokernelTester()
35290 .mr(4)
35291 .nr(16)
35292 .kr(1)
35293 .sr(4)
35294 .m(4)
35295 .n(16)
35296 .k(k)
35297 .a_stride(11)
35298 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35299 }
35300 }
35301
35302 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
35303 TEST_REQUIRES_X86_FMA3;
35304 for (size_t k = 5; k < 8; k++) {
35305 for (uint32_t m = 1; m <= 4; m++) {
35306 for (uint32_t n = 1; n <= 16; n++) {
35307 GemmMicrokernelTester()
35308 .mr(4)
35309 .nr(16)
35310 .kr(1)
35311 .sr(4)
35312 .m(m)
35313 .n(n)
35314 .k(k)
35315 .iterations(1)
35316 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35317 }
35318 }
35319 }
35320 }
35321
35322 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_div_4) {
35323 TEST_REQUIRES_X86_FMA3;
35324 for (size_t k = 8; k <= 40; k += 4) {
35325 GemmMicrokernelTester()
35326 .mr(4)
35327 .nr(16)
35328 .kr(1)
35329 .sr(4)
35330 .m(4)
35331 .n(16)
35332 .k(k)
35333 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35334 }
35335 }
35336
35337 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
35338 TEST_REQUIRES_X86_FMA3;
35339 for (size_t k = 8; k <= 40; k += 4) {
35340 GemmMicrokernelTester()
35341 .mr(4)
35342 .nr(16)
35343 .kr(1)
35344 .sr(4)
35345 .m(4)
35346 .n(16)
35347 .k(k)
35348 .a_stride(43)
35349 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35350 }
35351 }
35352
35353 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, k_div_4_subtile) {
35354 TEST_REQUIRES_X86_FMA3;
35355 for (size_t k = 8; k <= 40; k += 4) {
35356 for (uint32_t m = 1; m <= 4; m++) {
35357 for (uint32_t n = 1; n <= 16; n++) {
35358 GemmMicrokernelTester()
35359 .mr(4)
35360 .nr(16)
35361 .kr(1)
35362 .sr(4)
35363 .m(m)
35364 .n(n)
35365 .k(k)
35366 .iterations(1)
35367 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35368 }
35369 }
35370 }
35371 }
35372
35373 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16) {
35374 TEST_REQUIRES_X86_FMA3;
35375 for (uint32_t n = 17; n < 32; n++) {
35376 for (size_t k = 1; k <= 20; k += 5) {
35377 GemmMicrokernelTester()
35378 .mr(4)
35379 .nr(16)
35380 .kr(1)
35381 .sr(4)
35382 .m(4)
35383 .n(16)
35384 .k(k)
35385 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35386 }
35387 }
35388 }
35389
35390 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
35391 TEST_REQUIRES_X86_FMA3;
35392 for (uint32_t n = 17; n < 32; n++) {
35393 for (size_t k = 1; k <= 20; k += 5) {
35394 GemmMicrokernelTester()
35395 .mr(4)
35396 .nr(16)
35397 .kr(1)
35398 .sr(4)
35399 .m(4)
35400 .n(16)
35401 .k(k)
35402 .cn_stride(19)
35403 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35404 }
35405 }
35406 }
35407
35408 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
35409 TEST_REQUIRES_X86_FMA3;
35410 for (uint32_t n = 17; n < 32; n++) {
35411 for (size_t k = 1; k <= 20; k += 5) {
35412 GemmMicrokernelTester()
35413 .mr(4)
35414 .nr(16)
35415 .kr(1)
35416 .sr(4)
35417 .m(4)
35418 .n(n)
35419 .k(k)
35420 .a_stride(23)
35421 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35422 }
35423 }
35424 }
35425
35426 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
35427 TEST_REQUIRES_X86_FMA3;
35428 for (uint32_t n = 17; n < 32; n++) {
35429 for (size_t k = 1; k <= 20; k += 5) {
35430 for (uint32_t m = 1; m <= 4; m++) {
35431 GemmMicrokernelTester()
35432 .mr(4)
35433 .nr(16)
35434 .kr(1)
35435 .sr(4)
35436 .m(m)
35437 .n(n)
35438 .k(k)
35439 .iterations(1)
35440 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35441 }
35442 }
35443 }
35444 }
35445
35446 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16) {
35447 TEST_REQUIRES_X86_FMA3;
35448 for (uint32_t n = 32; n <= 48; n += 16) {
35449 for (size_t k = 1; k <= 20; k += 5) {
35450 GemmMicrokernelTester()
35451 .mr(4)
35452 .nr(16)
35453 .kr(1)
35454 .sr(4)
35455 .m(4)
35456 .n(16)
35457 .k(k)
35458 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35459 }
35460 }
35461 }
35462
35463 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
35464 TEST_REQUIRES_X86_FMA3;
35465 for (uint32_t n = 32; n <= 48; n += 16) {
35466 for (size_t k = 1; k <= 20; k += 5) {
35467 GemmMicrokernelTester()
35468 .mr(4)
35469 .nr(16)
35470 .kr(1)
35471 .sr(4)
35472 .m(4)
35473 .n(n)
35474 .k(k)
35475 .cn_stride(19)
35476 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35477 }
35478 }
35479 }
35480
35481 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
35482 TEST_REQUIRES_X86_FMA3;
35483 for (uint32_t n = 32; n <= 48; n += 16) {
35484 for (size_t k = 1; k <= 20; k += 5) {
35485 GemmMicrokernelTester()
35486 .mr(4)
35487 .nr(16)
35488 .kr(1)
35489 .sr(4)
35490 .m(4)
35491 .n(n)
35492 .k(k)
35493 .a_stride(23)
35494 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35495 }
35496 }
35497 }
35498
35499 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, n_div_16_subtile) {
35500 TEST_REQUIRES_X86_FMA3;
35501 for (uint32_t n = 32; n <= 48; n += 16) {
35502 for (size_t k = 1; k <= 20; k += 5) {
35503 for (uint32_t m = 1; m <= 4; m++) {
35504 GemmMicrokernelTester()
35505 .mr(4)
35506 .nr(16)
35507 .kr(1)
35508 .sr(4)
35509 .m(m)
35510 .n(n)
35511 .k(k)
35512 .iterations(1)
35513 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35514 }
35515 }
35516 }
35517 }
35518
35519 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, strided_cm_subtile) {
35520 TEST_REQUIRES_X86_FMA3;
35521 for (size_t k = 1; k <= 20; k += 5) {
35522 for (uint32_t m = 1; m <= 4; m++) {
35523 for (uint32_t n = 1; n <= 16; n++) {
35524 GemmMicrokernelTester()
35525 .mr(4)
35526 .nr(16)
35527 .kr(1)
35528 .sr(4)
35529 .m(m)
35530 .n(n)
35531 .k(k)
35532 .cm_stride(19)
35533 .iterations(1)
35534 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35535 }
35536 }
35537 }
35538 }
35539
35540 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, qmin) {
35541 TEST_REQUIRES_X86_FMA3;
35542 GemmMicrokernelTester()
35543 .mr(4)
35544 .nr(16)
35545 .kr(1)
35546 .sr(4)
35547 .m(4)
35548 .n(16)
35549 .k(4)
35550 .qmin(128)
35551 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35552 }
35553
35554 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, qmax) {
35555 TEST_REQUIRES_X86_FMA3;
35556 GemmMicrokernelTester()
35557 .mr(4)
35558 .nr(16)
35559 .kr(1)
35560 .sr(4)
35561 .m(4)
35562 .n(16)
35563 .k(4)
35564 .qmax(128)
35565 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35566 }
35567
35568 TEST(F32_GEMM_4X16S4__FMA3_BROADCAST, strided_cm) {
35569 TEST_REQUIRES_X86_FMA3;
35570 GemmMicrokernelTester()
35571 .mr(4)
35572 .nr(16)
35573 .kr(1)
35574 .sr(4)
35575 .m(4)
35576 .n(16)
35577 .k(4)
35578 .cm_stride(19)
35579 .Test(xnn_f32_gemm_ukernel_4x16s4__fma3_broadcast);
35580 }
35581#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
35582
35583
35584#if XNN_ARCH_X86 || XNN_ARCH_X86_64
35585 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4) {
35586 TEST_REQUIRES_X86_FMA3;
35587 GemmMicrokernelTester()
35588 .mr(5)
35589 .nr(16)
35590 .kr(1)
35591 .sr(4)
35592 .m(5)
35593 .n(16)
35594 .k(4)
35595 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35596 }
35597
35598 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, strided_cn) {
35599 TEST_REQUIRES_X86_FMA3;
35600 GemmMicrokernelTester()
35601 .mr(5)
35602 .nr(16)
35603 .kr(1)
35604 .sr(4)
35605 .m(5)
35606 .n(16)
35607 .k(4)
35608 .cn_stride(19)
35609 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35610 }
35611
35612 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_strided_a) {
35613 TEST_REQUIRES_X86_FMA3;
35614 GemmMicrokernelTester()
35615 .mr(5)
35616 .nr(16)
35617 .kr(1)
35618 .sr(4)
35619 .m(5)
35620 .n(16)
35621 .k(4)
35622 .a_stride(7)
35623 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35624 }
35625
35626 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile) {
35627 TEST_REQUIRES_X86_FMA3;
35628 for (uint32_t m = 1; m <= 5; m++) {
35629 for (uint32_t n = 1; n <= 16; n++) {
35630 GemmMicrokernelTester()
35631 .mr(5)
35632 .nr(16)
35633 .kr(1)
35634 .sr(4)
35635 .m(m)
35636 .n(n)
35637 .k(4)
35638 .iterations(1)
35639 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35640 }
35641 }
35642 }
35643
35644 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_m) {
35645 TEST_REQUIRES_X86_FMA3;
35646 for (uint32_t m = 1; m <= 5; m++) {
35647 GemmMicrokernelTester()
35648 .mr(5)
35649 .nr(16)
35650 .kr(1)
35651 .sr(4)
35652 .m(m)
35653 .n(16)
35654 .k(4)
35655 .iterations(1)
35656 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35657 }
35658 }
35659
35660 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_eq_4_subtile_n) {
35661 TEST_REQUIRES_X86_FMA3;
35662 for (uint32_t n = 1; n <= 16; n++) {
35663 GemmMicrokernelTester()
35664 .mr(5)
35665 .nr(16)
35666 .kr(1)
35667 .sr(4)
35668 .m(5)
35669 .n(n)
35670 .k(4)
35671 .iterations(1)
35672 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35673 }
35674 }
35675
35676 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_lt_4) {
35677 TEST_REQUIRES_X86_FMA3;
35678 for (size_t k = 1; k < 4; k++) {
35679 GemmMicrokernelTester()
35680 .mr(5)
35681 .nr(16)
35682 .kr(1)
35683 .sr(4)
35684 .m(5)
35685 .n(16)
35686 .k(k)
35687 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35688 }
35689 }
35690
35691 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_lt_4_strided_a) {
35692 TEST_REQUIRES_X86_FMA3;
35693 for (size_t k = 1; k < 4; k++) {
35694 GemmMicrokernelTester()
35695 .mr(5)
35696 .nr(16)
35697 .kr(1)
35698 .sr(4)
35699 .m(5)
35700 .n(16)
35701 .k(k)
35702 .a_stride(7)
35703 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35704 }
35705 }
35706
35707 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_lt_4_subtile) {
35708 TEST_REQUIRES_X86_FMA3;
35709 for (size_t k = 1; k < 4; k++) {
35710 for (uint32_t m = 1; m <= 5; m++) {
35711 for (uint32_t n = 1; n <= 16; n++) {
35712 GemmMicrokernelTester()
35713 .mr(5)
35714 .nr(16)
35715 .kr(1)
35716 .sr(4)
35717 .m(m)
35718 .n(n)
35719 .k(k)
35720 .iterations(1)
35721 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35722 }
35723 }
35724 }
35725 }
35726
35727 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_gt_4) {
35728 TEST_REQUIRES_X86_FMA3;
35729 for (size_t k = 5; k < 8; k++) {
35730 GemmMicrokernelTester()
35731 .mr(5)
35732 .nr(16)
35733 .kr(1)
35734 .sr(4)
35735 .m(5)
35736 .n(16)
35737 .k(k)
35738 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35739 }
35740 }
35741
35742 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_gt_4_strided_a) {
35743 TEST_REQUIRES_X86_FMA3;
35744 for (size_t k = 5; k < 8; k++) {
35745 GemmMicrokernelTester()
35746 .mr(5)
35747 .nr(16)
35748 .kr(1)
35749 .sr(4)
35750 .m(5)
35751 .n(16)
35752 .k(k)
35753 .a_stride(11)
35754 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35755 }
35756 }
35757
35758 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_gt_4_subtile) {
35759 TEST_REQUIRES_X86_FMA3;
35760 for (size_t k = 5; k < 8; k++) {
35761 for (uint32_t m = 1; m <= 5; m++) {
35762 for (uint32_t n = 1; n <= 16; n++) {
35763 GemmMicrokernelTester()
35764 .mr(5)
35765 .nr(16)
35766 .kr(1)
35767 .sr(4)
35768 .m(m)
35769 .n(n)
35770 .k(k)
35771 .iterations(1)
35772 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35773 }
35774 }
35775 }
35776 }
35777
35778 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_div_4) {
35779 TEST_REQUIRES_X86_FMA3;
35780 for (size_t k = 8; k <= 40; k += 4) {
35781 GemmMicrokernelTester()
35782 .mr(5)
35783 .nr(16)
35784 .kr(1)
35785 .sr(4)
35786 .m(5)
35787 .n(16)
35788 .k(k)
35789 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35790 }
35791 }
35792
35793 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_div_4_strided_a) {
35794 TEST_REQUIRES_X86_FMA3;
35795 for (size_t k = 8; k <= 40; k += 4) {
35796 GemmMicrokernelTester()
35797 .mr(5)
35798 .nr(16)
35799 .kr(1)
35800 .sr(4)
35801 .m(5)
35802 .n(16)
35803 .k(k)
35804 .a_stride(43)
35805 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35806 }
35807 }
35808
35809 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, k_div_4_subtile) {
35810 TEST_REQUIRES_X86_FMA3;
35811 for (size_t k = 8; k <= 40; k += 4) {
35812 for (uint32_t m = 1; m <= 5; m++) {
35813 for (uint32_t n = 1; n <= 16; n++) {
35814 GemmMicrokernelTester()
35815 .mr(5)
35816 .nr(16)
35817 .kr(1)
35818 .sr(4)
35819 .m(m)
35820 .n(n)
35821 .k(k)
35822 .iterations(1)
35823 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35824 }
35825 }
35826 }
35827 }
35828
35829 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16) {
35830 TEST_REQUIRES_X86_FMA3;
35831 for (uint32_t n = 17; n < 32; n++) {
35832 for (size_t k = 1; k <= 20; k += 5) {
35833 GemmMicrokernelTester()
35834 .mr(5)
35835 .nr(16)
35836 .kr(1)
35837 .sr(4)
35838 .m(5)
35839 .n(16)
35840 .k(k)
35841 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35842 }
35843 }
35844 }
35845
35846 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16_strided_cn) {
35847 TEST_REQUIRES_X86_FMA3;
35848 for (uint32_t n = 17; n < 32; n++) {
35849 for (size_t k = 1; k <= 20; k += 5) {
35850 GemmMicrokernelTester()
35851 .mr(5)
35852 .nr(16)
35853 .kr(1)
35854 .sr(4)
35855 .m(5)
35856 .n(16)
35857 .k(k)
35858 .cn_stride(19)
35859 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35860 }
35861 }
35862 }
35863
35864 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16_strided_a) {
35865 TEST_REQUIRES_X86_FMA3;
35866 for (uint32_t n = 17; n < 32; n++) {
35867 for (size_t k = 1; k <= 20; k += 5) {
35868 GemmMicrokernelTester()
35869 .mr(5)
35870 .nr(16)
35871 .kr(1)
35872 .sr(4)
35873 .m(5)
35874 .n(n)
35875 .k(k)
35876 .a_stride(23)
35877 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35878 }
35879 }
35880 }
35881
35882 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_gt_16_subtile) {
35883 TEST_REQUIRES_X86_FMA3;
35884 for (uint32_t n = 17; n < 32; n++) {
35885 for (size_t k = 1; k <= 20; k += 5) {
35886 for (uint32_t m = 1; m <= 5; m++) {
35887 GemmMicrokernelTester()
35888 .mr(5)
35889 .nr(16)
35890 .kr(1)
35891 .sr(4)
35892 .m(m)
35893 .n(n)
35894 .k(k)
35895 .iterations(1)
35896 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35897 }
35898 }
35899 }
35900 }
35901
35902 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16) {
35903 TEST_REQUIRES_X86_FMA3;
35904 for (uint32_t n = 32; n <= 48; n += 16) {
35905 for (size_t k = 1; k <= 20; k += 5) {
35906 GemmMicrokernelTester()
35907 .mr(5)
35908 .nr(16)
35909 .kr(1)
35910 .sr(4)
35911 .m(5)
35912 .n(16)
35913 .k(k)
35914 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35915 }
35916 }
35917 }
35918
35919 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16_strided_cn) {
35920 TEST_REQUIRES_X86_FMA3;
35921 for (uint32_t n = 32; n <= 48; n += 16) {
35922 for (size_t k = 1; k <= 20; k += 5) {
35923 GemmMicrokernelTester()
35924 .mr(5)
35925 .nr(16)
35926 .kr(1)
35927 .sr(4)
35928 .m(5)
35929 .n(n)
35930 .k(k)
35931 .cn_stride(19)
35932 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35933 }
35934 }
35935 }
35936
35937 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16_strided_a) {
35938 TEST_REQUIRES_X86_FMA3;
35939 for (uint32_t n = 32; n <= 48; n += 16) {
35940 for (size_t k = 1; k <= 20; k += 5) {
35941 GemmMicrokernelTester()
35942 .mr(5)
35943 .nr(16)
35944 .kr(1)
35945 .sr(4)
35946 .m(5)
35947 .n(n)
35948 .k(k)
35949 .a_stride(23)
35950 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35951 }
35952 }
35953 }
35954
35955 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, n_div_16_subtile) {
35956 TEST_REQUIRES_X86_FMA3;
35957 for (uint32_t n = 32; n <= 48; n += 16) {
35958 for (size_t k = 1; k <= 20; k += 5) {
35959 for (uint32_t m = 1; m <= 5; m++) {
35960 GemmMicrokernelTester()
35961 .mr(5)
35962 .nr(16)
35963 .kr(1)
35964 .sr(4)
35965 .m(m)
35966 .n(n)
35967 .k(k)
35968 .iterations(1)
35969 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35970 }
35971 }
35972 }
35973 }
35974
35975 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, strided_cm_subtile) {
35976 TEST_REQUIRES_X86_FMA3;
35977 for (size_t k = 1; k <= 20; k += 5) {
35978 for (uint32_t m = 1; m <= 5; m++) {
35979 for (uint32_t n = 1; n <= 16; n++) {
35980 GemmMicrokernelTester()
35981 .mr(5)
35982 .nr(16)
35983 .kr(1)
35984 .sr(4)
35985 .m(m)
35986 .n(n)
35987 .k(k)
35988 .cm_stride(19)
35989 .iterations(1)
35990 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
35991 }
35992 }
35993 }
35994 }
35995
35996 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, qmin) {
35997 TEST_REQUIRES_X86_FMA3;
35998 GemmMicrokernelTester()
35999 .mr(5)
36000 .nr(16)
36001 .kr(1)
36002 .sr(4)
36003 .m(5)
36004 .n(16)
36005 .k(4)
36006 .qmin(128)
36007 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
36008 }
36009
36010 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, qmax) {
36011 TEST_REQUIRES_X86_FMA3;
36012 GemmMicrokernelTester()
36013 .mr(5)
36014 .nr(16)
36015 .kr(1)
36016 .sr(4)
36017 .m(5)
36018 .n(16)
36019 .k(4)
36020 .qmax(128)
36021 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
36022 }
36023
36024 TEST(F32_GEMM_5X16S4__FMA3_BROADCAST, strided_cm) {
36025 TEST_REQUIRES_X86_FMA3;
36026 GemmMicrokernelTester()
36027 .mr(5)
36028 .nr(16)
36029 .kr(1)
36030 .sr(4)
36031 .m(5)
36032 .n(16)
36033 .k(4)
36034 .cm_stride(19)
36035 .Test(xnn_f32_gemm_ukernel_5x16s4__fma3_broadcast);
36036 }
36037#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36038
36039
36040#if XNN_ARCH_X86 || XNN_ARCH_X86_64
Marat Dukhan0f349c42019-11-27 11:58:54 -080036041 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1) {
36042 TEST_REQUIRES_X86_AVX512F;
36043 GemmMicrokernelTester()
36044 .mr(1)
36045 .nr(16)
36046 .kr(1)
36047 .sr(1)
36048 .m(1)
36049 .n(16)
36050 .k(1)
36051 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36052 }
36053
36054 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, strided_cn) {
36055 TEST_REQUIRES_X86_AVX512F;
36056 GemmMicrokernelTester()
36057 .mr(1)
36058 .nr(16)
36059 .kr(1)
36060 .sr(1)
36061 .m(1)
36062 .n(16)
36063 .k(1)
36064 .cn_stride(19)
36065 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36066 }
36067
36068 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
36069 TEST_REQUIRES_X86_AVX512F;
36070 GemmMicrokernelTester()
36071 .mr(1)
36072 .nr(16)
36073 .kr(1)
36074 .sr(1)
36075 .m(1)
36076 .n(16)
36077 .k(1)
36078 .a_stride(3)
36079 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36080 }
36081
36082 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36083 TEST_REQUIRES_X86_AVX512F;
36084 for (uint32_t m = 1; m <= 1; m++) {
36085 for (uint32_t n = 1; n <= 16; n++) {
36086 GemmMicrokernelTester()
36087 .mr(1)
36088 .nr(16)
36089 .kr(1)
36090 .sr(1)
36091 .m(m)
36092 .n(n)
36093 .k(1)
36094 .iterations(1)
36095 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36096 }
36097 }
36098 }
36099
36100 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36101 TEST_REQUIRES_X86_AVX512F;
36102 for (uint32_t m = 1; m <= 1; m++) {
36103 GemmMicrokernelTester()
36104 .mr(1)
36105 .nr(16)
36106 .kr(1)
36107 .sr(1)
36108 .m(m)
36109 .n(16)
36110 .k(1)
36111 .iterations(1)
36112 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36113 }
36114 }
36115
36116 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36117 TEST_REQUIRES_X86_AVX512F;
36118 for (uint32_t n = 1; n <= 16; n++) {
36119 GemmMicrokernelTester()
36120 .mr(1)
36121 .nr(16)
36122 .kr(1)
36123 .sr(1)
36124 .m(1)
36125 .n(n)
36126 .k(1)
36127 .iterations(1)
36128 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36129 }
36130 }
36131
36132 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_gt_1) {
36133 TEST_REQUIRES_X86_AVX512F;
36134 for (size_t k = 2; k < 10; k++) {
36135 GemmMicrokernelTester()
36136 .mr(1)
36137 .nr(16)
36138 .kr(1)
36139 .sr(1)
36140 .m(1)
36141 .n(16)
36142 .k(k)
36143 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36144 }
36145 }
36146
36147 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
36148 TEST_REQUIRES_X86_AVX512F;
36149 for (size_t k = 2; k < 10; k++) {
36150 GemmMicrokernelTester()
36151 .mr(1)
36152 .nr(16)
36153 .kr(1)
36154 .sr(1)
36155 .m(1)
36156 .n(16)
36157 .k(k)
36158 .a_stride(11)
36159 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36160 }
36161 }
36162
36163 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36164 TEST_REQUIRES_X86_AVX512F;
36165 for (size_t k = 2; k < 10; k++) {
36166 for (uint32_t m = 1; m <= 1; m++) {
36167 for (uint32_t n = 1; n <= 16; n++) {
36168 GemmMicrokernelTester()
36169 .mr(1)
36170 .nr(16)
36171 .kr(1)
36172 .sr(1)
36173 .m(m)
36174 .n(n)
36175 .k(k)
36176 .iterations(1)
36177 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36178 }
36179 }
36180 }
36181 }
36182
36183 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16) {
36184 TEST_REQUIRES_X86_AVX512F;
36185 for (uint32_t n = 17; n < 32; n++) {
36186 for (size_t k = 1; k <= 5; k += 2) {
36187 GemmMicrokernelTester()
36188 .mr(1)
36189 .nr(16)
36190 .kr(1)
36191 .sr(1)
36192 .m(1)
36193 .n(16)
36194 .k(k)
36195 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36196 }
36197 }
36198 }
36199
36200 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
36201 TEST_REQUIRES_X86_AVX512F;
36202 for (uint32_t n = 17; n < 32; n++) {
36203 for (size_t k = 1; k <= 5; k += 2) {
36204 GemmMicrokernelTester()
36205 .mr(1)
36206 .nr(16)
36207 .kr(1)
36208 .sr(1)
36209 .m(1)
36210 .n(16)
36211 .k(k)
36212 .cn_stride(19)
36213 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36214 }
36215 }
36216 }
36217
36218 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
36219 TEST_REQUIRES_X86_AVX512F;
36220 for (uint32_t n = 17; n < 32; n++) {
36221 for (size_t k = 1; k <= 5; k += 2) {
36222 GemmMicrokernelTester()
36223 .mr(1)
36224 .nr(16)
36225 .kr(1)
36226 .sr(1)
36227 .m(1)
36228 .n(n)
36229 .k(k)
36230 .a_stride(7)
36231 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36232 }
36233 }
36234 }
36235
36236 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_gt_16_subtile) {
36237 TEST_REQUIRES_X86_AVX512F;
36238 for (uint32_t n = 17; n < 32; n++) {
36239 for (size_t k = 1; k <= 5; k += 2) {
36240 for (uint32_t m = 1; m <= 1; m++) {
36241 GemmMicrokernelTester()
36242 .mr(1)
36243 .nr(16)
36244 .kr(1)
36245 .sr(1)
36246 .m(m)
36247 .n(n)
36248 .k(k)
36249 .iterations(1)
36250 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36251 }
36252 }
36253 }
36254 }
36255
36256 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16) {
36257 TEST_REQUIRES_X86_AVX512F;
36258 for (uint32_t n = 32; n <= 48; n += 16) {
36259 for (size_t k = 1; k <= 5; k += 2) {
36260 GemmMicrokernelTester()
36261 .mr(1)
36262 .nr(16)
36263 .kr(1)
36264 .sr(1)
36265 .m(1)
36266 .n(16)
36267 .k(k)
36268 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36269 }
36270 }
36271 }
36272
36273 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
36274 TEST_REQUIRES_X86_AVX512F;
36275 for (uint32_t n = 32; n <= 48; n += 16) {
36276 for (size_t k = 1; k <= 5; k += 2) {
36277 GemmMicrokernelTester()
36278 .mr(1)
36279 .nr(16)
36280 .kr(1)
36281 .sr(1)
36282 .m(1)
36283 .n(n)
36284 .k(k)
36285 .cn_stride(19)
36286 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36287 }
36288 }
36289 }
36290
36291 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16_strided_a) {
36292 TEST_REQUIRES_X86_AVX512F;
36293 for (uint32_t n = 32; n <= 48; n += 16) {
36294 for (size_t k = 1; k <= 5; k += 2) {
36295 GemmMicrokernelTester()
36296 .mr(1)
36297 .nr(16)
36298 .kr(1)
36299 .sr(1)
36300 .m(1)
36301 .n(n)
36302 .k(k)
36303 .a_stride(7)
36304 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36305 }
36306 }
36307 }
36308
36309 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, n_div_16_subtile) {
36310 TEST_REQUIRES_X86_AVX512F;
36311 for (uint32_t n = 32; n <= 48; n += 16) {
36312 for (size_t k = 1; k <= 5; k += 2) {
36313 for (uint32_t m = 1; m <= 1; m++) {
36314 GemmMicrokernelTester()
36315 .mr(1)
36316 .nr(16)
36317 .kr(1)
36318 .sr(1)
36319 .m(m)
36320 .n(n)
36321 .k(k)
36322 .iterations(1)
36323 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36324 }
36325 }
36326 }
36327 }
36328
36329 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, strided_cm_subtile) {
36330 TEST_REQUIRES_X86_AVX512F;
36331 for (size_t k = 1; k <= 5; k += 2) {
36332 for (uint32_t m = 1; m <= 1; m++) {
36333 for (uint32_t n = 1; n <= 16; n++) {
36334 GemmMicrokernelTester()
36335 .mr(1)
36336 .nr(16)
36337 .kr(1)
36338 .sr(1)
36339 .m(m)
36340 .n(n)
36341 .k(k)
36342 .cm_stride(19)
36343 .iterations(1)
36344 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36345 }
36346 }
36347 }
36348 }
36349
36350 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, qmin) {
36351 TEST_REQUIRES_X86_AVX512F;
36352 GemmMicrokernelTester()
36353 .mr(1)
36354 .nr(16)
36355 .kr(1)
36356 .sr(1)
36357 .m(1)
36358 .n(16)
36359 .k(1)
36360 .qmin(128)
36361 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36362 }
36363
36364 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, qmax) {
36365 TEST_REQUIRES_X86_AVX512F;
36366 GemmMicrokernelTester()
36367 .mr(1)
36368 .nr(16)
36369 .kr(1)
36370 .sr(1)
36371 .m(1)
36372 .n(16)
36373 .k(1)
36374 .qmax(128)
36375 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36376 }
36377
36378 TEST(F32_GEMM_1X16__AVX512F_BROADCAST, strided_cm) {
36379 TEST_REQUIRES_X86_AVX512F;
36380 GemmMicrokernelTester()
36381 .mr(1)
36382 .nr(16)
36383 .kr(1)
36384 .sr(1)
36385 .m(1)
36386 .n(16)
36387 .k(1)
36388 .cm_stride(19)
36389 .Test(xnn_f32_gemm_ukernel_1x16__avx512f_broadcast);
36390 }
36391#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36392
36393
36394#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36395 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1) {
36396 TEST_REQUIRES_X86_AVX512F;
36397 GemmMicrokernelTester()
36398 .mr(4)
36399 .nr(16)
36400 .kr(1)
36401 .sr(1)
36402 .m(4)
36403 .n(16)
36404 .k(1)
36405 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36406 }
36407
36408 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, strided_cn) {
36409 TEST_REQUIRES_X86_AVX512F;
36410 GemmMicrokernelTester()
36411 .mr(4)
36412 .nr(16)
36413 .kr(1)
36414 .sr(1)
36415 .m(4)
36416 .n(16)
36417 .k(1)
36418 .cn_stride(19)
36419 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36420 }
36421
36422 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
36423 TEST_REQUIRES_X86_AVX512F;
36424 GemmMicrokernelTester()
36425 .mr(4)
36426 .nr(16)
36427 .kr(1)
36428 .sr(1)
36429 .m(4)
36430 .n(16)
36431 .k(1)
36432 .a_stride(3)
36433 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36434 }
36435
36436 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36437 TEST_REQUIRES_X86_AVX512F;
36438 for (uint32_t m = 1; m <= 4; m++) {
36439 for (uint32_t n = 1; n <= 16; n++) {
36440 GemmMicrokernelTester()
36441 .mr(4)
36442 .nr(16)
36443 .kr(1)
36444 .sr(1)
36445 .m(m)
36446 .n(n)
36447 .k(1)
36448 .iterations(1)
36449 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36450 }
36451 }
36452 }
36453
36454 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36455 TEST_REQUIRES_X86_AVX512F;
36456 for (uint32_t m = 1; m <= 4; m++) {
36457 GemmMicrokernelTester()
36458 .mr(4)
36459 .nr(16)
36460 .kr(1)
36461 .sr(1)
36462 .m(m)
36463 .n(16)
36464 .k(1)
36465 .iterations(1)
36466 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36467 }
36468 }
36469
36470 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36471 TEST_REQUIRES_X86_AVX512F;
36472 for (uint32_t n = 1; n <= 16; n++) {
36473 GemmMicrokernelTester()
36474 .mr(4)
36475 .nr(16)
36476 .kr(1)
36477 .sr(1)
36478 .m(4)
36479 .n(n)
36480 .k(1)
36481 .iterations(1)
36482 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36483 }
36484 }
36485
36486 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_gt_1) {
36487 TEST_REQUIRES_X86_AVX512F;
36488 for (size_t k = 2; k < 10; k++) {
36489 GemmMicrokernelTester()
36490 .mr(4)
36491 .nr(16)
36492 .kr(1)
36493 .sr(1)
36494 .m(4)
36495 .n(16)
36496 .k(k)
36497 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36498 }
36499 }
36500
36501 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
36502 TEST_REQUIRES_X86_AVX512F;
36503 for (size_t k = 2; k < 10; k++) {
36504 GemmMicrokernelTester()
36505 .mr(4)
36506 .nr(16)
36507 .kr(1)
36508 .sr(1)
36509 .m(4)
36510 .n(16)
36511 .k(k)
36512 .a_stride(11)
36513 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36514 }
36515 }
36516
36517 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36518 TEST_REQUIRES_X86_AVX512F;
36519 for (size_t k = 2; k < 10; k++) {
36520 for (uint32_t m = 1; m <= 4; m++) {
36521 for (uint32_t n = 1; n <= 16; n++) {
36522 GemmMicrokernelTester()
36523 .mr(4)
36524 .nr(16)
36525 .kr(1)
36526 .sr(1)
36527 .m(m)
36528 .n(n)
36529 .k(k)
36530 .iterations(1)
36531 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36532 }
36533 }
36534 }
36535 }
36536
36537 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16) {
36538 TEST_REQUIRES_X86_AVX512F;
36539 for (uint32_t n = 17; n < 32; n++) {
36540 for (size_t k = 1; k <= 5; k += 2) {
36541 GemmMicrokernelTester()
36542 .mr(4)
36543 .nr(16)
36544 .kr(1)
36545 .sr(1)
36546 .m(4)
36547 .n(16)
36548 .k(k)
36549 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36550 }
36551 }
36552 }
36553
36554 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
36555 TEST_REQUIRES_X86_AVX512F;
36556 for (uint32_t n = 17; n < 32; n++) {
36557 for (size_t k = 1; k <= 5; k += 2) {
36558 GemmMicrokernelTester()
36559 .mr(4)
36560 .nr(16)
36561 .kr(1)
36562 .sr(1)
36563 .m(4)
36564 .n(16)
36565 .k(k)
36566 .cn_stride(19)
36567 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36568 }
36569 }
36570 }
36571
36572 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
36573 TEST_REQUIRES_X86_AVX512F;
36574 for (uint32_t n = 17; n < 32; n++) {
36575 for (size_t k = 1; k <= 5; k += 2) {
36576 GemmMicrokernelTester()
36577 .mr(4)
36578 .nr(16)
36579 .kr(1)
36580 .sr(1)
36581 .m(4)
36582 .n(n)
36583 .k(k)
36584 .a_stride(7)
36585 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36586 }
36587 }
36588 }
36589
36590 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_gt_16_subtile) {
36591 TEST_REQUIRES_X86_AVX512F;
36592 for (uint32_t n = 17; n < 32; n++) {
36593 for (size_t k = 1; k <= 5; k += 2) {
36594 for (uint32_t m = 1; m <= 4; m++) {
36595 GemmMicrokernelTester()
36596 .mr(4)
36597 .nr(16)
36598 .kr(1)
36599 .sr(1)
36600 .m(m)
36601 .n(n)
36602 .k(k)
36603 .iterations(1)
36604 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36605 }
36606 }
36607 }
36608 }
36609
36610 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16) {
36611 TEST_REQUIRES_X86_AVX512F;
36612 for (uint32_t n = 32; n <= 48; n += 16) {
36613 for (size_t k = 1; k <= 5; k += 2) {
36614 GemmMicrokernelTester()
36615 .mr(4)
36616 .nr(16)
36617 .kr(1)
36618 .sr(1)
36619 .m(4)
36620 .n(16)
36621 .k(k)
36622 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36623 }
36624 }
36625 }
36626
36627 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
36628 TEST_REQUIRES_X86_AVX512F;
36629 for (uint32_t n = 32; n <= 48; n += 16) {
36630 for (size_t k = 1; k <= 5; k += 2) {
36631 GemmMicrokernelTester()
36632 .mr(4)
36633 .nr(16)
36634 .kr(1)
36635 .sr(1)
36636 .m(4)
36637 .n(n)
36638 .k(k)
36639 .cn_stride(19)
36640 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36641 }
36642 }
36643 }
36644
36645 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16_strided_a) {
36646 TEST_REQUIRES_X86_AVX512F;
36647 for (uint32_t n = 32; n <= 48; n += 16) {
36648 for (size_t k = 1; k <= 5; k += 2) {
36649 GemmMicrokernelTester()
36650 .mr(4)
36651 .nr(16)
36652 .kr(1)
36653 .sr(1)
36654 .m(4)
36655 .n(n)
36656 .k(k)
36657 .a_stride(7)
36658 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36659 }
36660 }
36661 }
36662
36663 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, n_div_16_subtile) {
36664 TEST_REQUIRES_X86_AVX512F;
36665 for (uint32_t n = 32; n <= 48; n += 16) {
36666 for (size_t k = 1; k <= 5; k += 2) {
36667 for (uint32_t m = 1; m <= 4; m++) {
36668 GemmMicrokernelTester()
36669 .mr(4)
36670 .nr(16)
36671 .kr(1)
36672 .sr(1)
36673 .m(m)
36674 .n(n)
36675 .k(k)
36676 .iterations(1)
36677 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36678 }
36679 }
36680 }
36681 }
36682
36683 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, strided_cm_subtile) {
36684 TEST_REQUIRES_X86_AVX512F;
36685 for (size_t k = 1; k <= 5; k += 2) {
36686 for (uint32_t m = 1; m <= 4; m++) {
36687 for (uint32_t n = 1; n <= 16; n++) {
36688 GemmMicrokernelTester()
36689 .mr(4)
36690 .nr(16)
36691 .kr(1)
36692 .sr(1)
36693 .m(m)
36694 .n(n)
36695 .k(k)
36696 .cm_stride(19)
36697 .iterations(1)
36698 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36699 }
36700 }
36701 }
36702 }
36703
36704 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, qmin) {
36705 TEST_REQUIRES_X86_AVX512F;
36706 GemmMicrokernelTester()
36707 .mr(4)
36708 .nr(16)
36709 .kr(1)
36710 .sr(1)
36711 .m(4)
36712 .n(16)
36713 .k(1)
36714 .qmin(128)
36715 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36716 }
36717
36718 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, qmax) {
36719 TEST_REQUIRES_X86_AVX512F;
36720 GemmMicrokernelTester()
36721 .mr(4)
36722 .nr(16)
36723 .kr(1)
36724 .sr(1)
36725 .m(4)
36726 .n(16)
36727 .k(1)
36728 .qmax(128)
36729 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36730 }
36731
36732 TEST(F32_GEMM_4X16__AVX512F_BROADCAST, strided_cm) {
36733 TEST_REQUIRES_X86_AVX512F;
36734 GemmMicrokernelTester()
36735 .mr(4)
36736 .nr(16)
36737 .kr(1)
36738 .sr(1)
36739 .m(4)
36740 .n(16)
36741 .k(1)
36742 .cm_stride(19)
36743 .Test(xnn_f32_gemm_ukernel_4x16__avx512f_broadcast);
36744 }
36745#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
36746
36747
36748#if XNN_ARCH_X86 || XNN_ARCH_X86_64
36749 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1) {
36750 TEST_REQUIRES_X86_AVX512F;
36751 GemmMicrokernelTester()
36752 .mr(5)
36753 .nr(16)
36754 .kr(1)
36755 .sr(1)
36756 .m(5)
36757 .n(16)
36758 .k(1)
36759 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36760 }
36761
36762 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, strided_cn) {
36763 TEST_REQUIRES_X86_AVX512F;
36764 GemmMicrokernelTester()
36765 .mr(5)
36766 .nr(16)
36767 .kr(1)
36768 .sr(1)
36769 .m(5)
36770 .n(16)
36771 .k(1)
36772 .cn_stride(19)
36773 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36774 }
36775
36776 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
36777 TEST_REQUIRES_X86_AVX512F;
36778 GemmMicrokernelTester()
36779 .mr(5)
36780 .nr(16)
36781 .kr(1)
36782 .sr(1)
36783 .m(5)
36784 .n(16)
36785 .k(1)
36786 .a_stride(3)
36787 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36788 }
36789
36790 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile) {
36791 TEST_REQUIRES_X86_AVX512F;
36792 for (uint32_t m = 1; m <= 5; m++) {
36793 for (uint32_t n = 1; n <= 16; n++) {
36794 GemmMicrokernelTester()
36795 .mr(5)
36796 .nr(16)
36797 .kr(1)
36798 .sr(1)
36799 .m(m)
36800 .n(n)
36801 .k(1)
36802 .iterations(1)
36803 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36804 }
36805 }
36806 }
36807
36808 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
36809 TEST_REQUIRES_X86_AVX512F;
36810 for (uint32_t m = 1; m <= 5; m++) {
36811 GemmMicrokernelTester()
36812 .mr(5)
36813 .nr(16)
36814 .kr(1)
36815 .sr(1)
36816 .m(m)
36817 .n(16)
36818 .k(1)
36819 .iterations(1)
36820 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36821 }
36822 }
36823
36824 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
36825 TEST_REQUIRES_X86_AVX512F;
36826 for (uint32_t n = 1; n <= 16; n++) {
36827 GemmMicrokernelTester()
36828 .mr(5)
36829 .nr(16)
36830 .kr(1)
36831 .sr(1)
36832 .m(5)
36833 .n(n)
36834 .k(1)
36835 .iterations(1)
36836 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36837 }
36838 }
36839
36840 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_gt_1) {
36841 TEST_REQUIRES_X86_AVX512F;
36842 for (size_t k = 2; k < 10; k++) {
36843 GemmMicrokernelTester()
36844 .mr(5)
36845 .nr(16)
36846 .kr(1)
36847 .sr(1)
36848 .m(5)
36849 .n(16)
36850 .k(k)
36851 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36852 }
36853 }
36854
36855 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
36856 TEST_REQUIRES_X86_AVX512F;
36857 for (size_t k = 2; k < 10; k++) {
36858 GemmMicrokernelTester()
36859 .mr(5)
36860 .nr(16)
36861 .kr(1)
36862 .sr(1)
36863 .m(5)
36864 .n(16)
36865 .k(k)
36866 .a_stride(11)
36867 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36868 }
36869 }
36870
36871 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, k_gt_1_subtile) {
36872 TEST_REQUIRES_X86_AVX512F;
36873 for (size_t k = 2; k < 10; k++) {
36874 for (uint32_t m = 1; m <= 5; m++) {
36875 for (uint32_t n = 1; n <= 16; n++) {
36876 GemmMicrokernelTester()
36877 .mr(5)
36878 .nr(16)
36879 .kr(1)
36880 .sr(1)
36881 .m(m)
36882 .n(n)
36883 .k(k)
36884 .iterations(1)
36885 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36886 }
36887 }
36888 }
36889 }
36890
36891 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16) {
36892 TEST_REQUIRES_X86_AVX512F;
36893 for (uint32_t n = 17; n < 32; n++) {
36894 for (size_t k = 1; k <= 5; k += 2) {
36895 GemmMicrokernelTester()
36896 .mr(5)
36897 .nr(16)
36898 .kr(1)
36899 .sr(1)
36900 .m(5)
36901 .n(16)
36902 .k(k)
36903 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36904 }
36905 }
36906 }
36907
36908 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
36909 TEST_REQUIRES_X86_AVX512F;
36910 for (uint32_t n = 17; n < 32; n++) {
36911 for (size_t k = 1; k <= 5; k += 2) {
36912 GemmMicrokernelTester()
36913 .mr(5)
36914 .nr(16)
36915 .kr(1)
36916 .sr(1)
36917 .m(5)
36918 .n(16)
36919 .k(k)
36920 .cn_stride(19)
36921 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36922 }
36923 }
36924 }
36925
36926 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
36927 TEST_REQUIRES_X86_AVX512F;
36928 for (uint32_t n = 17; n < 32; n++) {
36929 for (size_t k = 1; k <= 5; k += 2) {
36930 GemmMicrokernelTester()
36931 .mr(5)
36932 .nr(16)
36933 .kr(1)
36934 .sr(1)
36935 .m(5)
36936 .n(n)
36937 .k(k)
36938 .a_stride(7)
36939 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36940 }
36941 }
36942 }
36943
36944 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_gt_16_subtile) {
36945 TEST_REQUIRES_X86_AVX512F;
36946 for (uint32_t n = 17; n < 32; n++) {
36947 for (size_t k = 1; k <= 5; k += 2) {
36948 for (uint32_t m = 1; m <= 5; m++) {
36949 GemmMicrokernelTester()
36950 .mr(5)
36951 .nr(16)
36952 .kr(1)
36953 .sr(1)
36954 .m(m)
36955 .n(n)
36956 .k(k)
36957 .iterations(1)
36958 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36959 }
36960 }
36961 }
36962 }
36963
36964 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16) {
36965 TEST_REQUIRES_X86_AVX512F;
36966 for (uint32_t n = 32; n <= 48; n += 16) {
36967 for (size_t k = 1; k <= 5; k += 2) {
36968 GemmMicrokernelTester()
36969 .mr(5)
36970 .nr(16)
36971 .kr(1)
36972 .sr(1)
36973 .m(5)
36974 .n(16)
36975 .k(k)
36976 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36977 }
36978 }
36979 }
36980
36981 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
36982 TEST_REQUIRES_X86_AVX512F;
36983 for (uint32_t n = 32; n <= 48; n += 16) {
36984 for (size_t k = 1; k <= 5; k += 2) {
36985 GemmMicrokernelTester()
36986 .mr(5)
36987 .nr(16)
36988 .kr(1)
36989 .sr(1)
36990 .m(5)
36991 .n(n)
36992 .k(k)
36993 .cn_stride(19)
36994 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
36995 }
36996 }
36997 }
36998
36999 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16_strided_a) {
37000 TEST_REQUIRES_X86_AVX512F;
37001 for (uint32_t n = 32; n <= 48; n += 16) {
37002 for (size_t k = 1; k <= 5; k += 2) {
37003 GemmMicrokernelTester()
37004 .mr(5)
37005 .nr(16)
37006 .kr(1)
37007 .sr(1)
37008 .m(5)
37009 .n(n)
37010 .k(k)
37011 .a_stride(7)
37012 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
37013 }
37014 }
37015 }
37016
37017 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, n_div_16_subtile) {
37018 TEST_REQUIRES_X86_AVX512F;
37019 for (uint32_t n = 32; n <= 48; n += 16) {
37020 for (size_t k = 1; k <= 5; k += 2) {
37021 for (uint32_t m = 1; m <= 5; m++) {
37022 GemmMicrokernelTester()
37023 .mr(5)
37024 .nr(16)
37025 .kr(1)
37026 .sr(1)
37027 .m(m)
37028 .n(n)
37029 .k(k)
37030 .iterations(1)
37031 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
37032 }
37033 }
37034 }
37035 }
37036
37037 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, strided_cm_subtile) {
37038 TEST_REQUIRES_X86_AVX512F;
37039 for (size_t k = 1; k <= 5; k += 2) {
37040 for (uint32_t m = 1; m <= 5; m++) {
37041 for (uint32_t n = 1; n <= 16; n++) {
37042 GemmMicrokernelTester()
37043 .mr(5)
37044 .nr(16)
37045 .kr(1)
37046 .sr(1)
37047 .m(m)
37048 .n(n)
37049 .k(k)
37050 .cm_stride(19)
37051 .iterations(1)
37052 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
37053 }
37054 }
37055 }
37056 }
37057
37058 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, qmin) {
37059 TEST_REQUIRES_X86_AVX512F;
37060 GemmMicrokernelTester()
37061 .mr(5)
37062 .nr(16)
37063 .kr(1)
37064 .sr(1)
37065 .m(5)
37066 .n(16)
37067 .k(1)
37068 .qmin(128)
37069 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
37070 }
37071
37072 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, qmax) {
37073 TEST_REQUIRES_X86_AVX512F;
37074 GemmMicrokernelTester()
37075 .mr(5)
37076 .nr(16)
37077 .kr(1)
37078 .sr(1)
37079 .m(5)
37080 .n(16)
37081 .k(1)
37082 .qmax(128)
37083 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
37084 }
37085
37086 TEST(F32_GEMM_5X16__AVX512F_BROADCAST, strided_cm) {
37087 TEST_REQUIRES_X86_AVX512F;
37088 GemmMicrokernelTester()
37089 .mr(5)
37090 .nr(16)
37091 .kr(1)
37092 .sr(1)
37093 .m(5)
37094 .n(16)
37095 .k(1)
37096 .cm_stride(19)
37097 .Test(xnn_f32_gemm_ukernel_5x16__avx512f_broadcast);
37098 }
37099#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37100
37101
37102#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37103 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1) {
37104 TEST_REQUIRES_X86_AVX512F;
37105 GemmMicrokernelTester()
37106 .mr(6)
37107 .nr(16)
37108 .kr(1)
37109 .sr(1)
37110 .m(6)
37111 .n(16)
37112 .k(1)
37113 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37114 }
37115
37116 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, strided_cn) {
37117 TEST_REQUIRES_X86_AVX512F;
37118 GemmMicrokernelTester()
37119 .mr(6)
37120 .nr(16)
37121 .kr(1)
37122 .sr(1)
37123 .m(6)
37124 .n(16)
37125 .k(1)
37126 .cn_stride(19)
37127 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37128 }
37129
37130 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
37131 TEST_REQUIRES_X86_AVX512F;
37132 GemmMicrokernelTester()
37133 .mr(6)
37134 .nr(16)
37135 .kr(1)
37136 .sr(1)
37137 .m(6)
37138 .n(16)
37139 .k(1)
37140 .a_stride(3)
37141 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37142 }
37143
37144 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile) {
37145 TEST_REQUIRES_X86_AVX512F;
37146 for (uint32_t m = 1; m <= 6; m++) {
37147 for (uint32_t n = 1; n <= 16; n++) {
37148 GemmMicrokernelTester()
37149 .mr(6)
37150 .nr(16)
37151 .kr(1)
37152 .sr(1)
37153 .m(m)
37154 .n(n)
37155 .k(1)
37156 .iterations(1)
37157 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37158 }
37159 }
37160 }
37161
37162 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
37163 TEST_REQUIRES_X86_AVX512F;
37164 for (uint32_t m = 1; m <= 6; m++) {
37165 GemmMicrokernelTester()
37166 .mr(6)
37167 .nr(16)
37168 .kr(1)
37169 .sr(1)
37170 .m(m)
37171 .n(16)
37172 .k(1)
37173 .iterations(1)
37174 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37175 }
37176 }
37177
37178 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
37179 TEST_REQUIRES_X86_AVX512F;
37180 for (uint32_t n = 1; n <= 16; n++) {
37181 GemmMicrokernelTester()
37182 .mr(6)
37183 .nr(16)
37184 .kr(1)
37185 .sr(1)
37186 .m(6)
37187 .n(n)
37188 .k(1)
37189 .iterations(1)
37190 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37191 }
37192 }
37193
37194 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_gt_1) {
37195 TEST_REQUIRES_X86_AVX512F;
37196 for (size_t k = 2; k < 10; k++) {
37197 GemmMicrokernelTester()
37198 .mr(6)
37199 .nr(16)
37200 .kr(1)
37201 .sr(1)
37202 .m(6)
37203 .n(16)
37204 .k(k)
37205 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37206 }
37207 }
37208
37209 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
37210 TEST_REQUIRES_X86_AVX512F;
37211 for (size_t k = 2; k < 10; k++) {
37212 GemmMicrokernelTester()
37213 .mr(6)
37214 .nr(16)
37215 .kr(1)
37216 .sr(1)
37217 .m(6)
37218 .n(16)
37219 .k(k)
37220 .a_stride(11)
37221 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37222 }
37223 }
37224
37225 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, k_gt_1_subtile) {
37226 TEST_REQUIRES_X86_AVX512F;
37227 for (size_t k = 2; k < 10; k++) {
37228 for (uint32_t m = 1; m <= 6; m++) {
37229 for (uint32_t n = 1; n <= 16; n++) {
37230 GemmMicrokernelTester()
37231 .mr(6)
37232 .nr(16)
37233 .kr(1)
37234 .sr(1)
37235 .m(m)
37236 .n(n)
37237 .k(k)
37238 .iterations(1)
37239 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37240 }
37241 }
37242 }
37243 }
37244
37245 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16) {
37246 TEST_REQUIRES_X86_AVX512F;
37247 for (uint32_t n = 17; n < 32; n++) {
37248 for (size_t k = 1; k <= 5; k += 2) {
37249 GemmMicrokernelTester()
37250 .mr(6)
37251 .nr(16)
37252 .kr(1)
37253 .sr(1)
37254 .m(6)
37255 .n(16)
37256 .k(k)
37257 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37258 }
37259 }
37260 }
37261
37262 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
37263 TEST_REQUIRES_X86_AVX512F;
37264 for (uint32_t n = 17; n < 32; n++) {
37265 for (size_t k = 1; k <= 5; k += 2) {
37266 GemmMicrokernelTester()
37267 .mr(6)
37268 .nr(16)
37269 .kr(1)
37270 .sr(1)
37271 .m(6)
37272 .n(16)
37273 .k(k)
37274 .cn_stride(19)
37275 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37276 }
37277 }
37278 }
37279
37280 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
37281 TEST_REQUIRES_X86_AVX512F;
37282 for (uint32_t n = 17; n < 32; n++) {
37283 for (size_t k = 1; k <= 5; k += 2) {
37284 GemmMicrokernelTester()
37285 .mr(6)
37286 .nr(16)
37287 .kr(1)
37288 .sr(1)
37289 .m(6)
37290 .n(n)
37291 .k(k)
37292 .a_stride(7)
37293 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37294 }
37295 }
37296 }
37297
37298 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_gt_16_subtile) {
37299 TEST_REQUIRES_X86_AVX512F;
37300 for (uint32_t n = 17; n < 32; n++) {
37301 for (size_t k = 1; k <= 5; k += 2) {
37302 for (uint32_t m = 1; m <= 6; m++) {
37303 GemmMicrokernelTester()
37304 .mr(6)
37305 .nr(16)
37306 .kr(1)
37307 .sr(1)
37308 .m(m)
37309 .n(n)
37310 .k(k)
37311 .iterations(1)
37312 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37313 }
37314 }
37315 }
37316 }
37317
37318 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16) {
37319 TEST_REQUIRES_X86_AVX512F;
37320 for (uint32_t n = 32; n <= 48; n += 16) {
37321 for (size_t k = 1; k <= 5; k += 2) {
37322 GemmMicrokernelTester()
37323 .mr(6)
37324 .nr(16)
37325 .kr(1)
37326 .sr(1)
37327 .m(6)
37328 .n(16)
37329 .k(k)
37330 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37331 }
37332 }
37333 }
37334
37335 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
37336 TEST_REQUIRES_X86_AVX512F;
37337 for (uint32_t n = 32; n <= 48; n += 16) {
37338 for (size_t k = 1; k <= 5; k += 2) {
37339 GemmMicrokernelTester()
37340 .mr(6)
37341 .nr(16)
37342 .kr(1)
37343 .sr(1)
37344 .m(6)
37345 .n(n)
37346 .k(k)
37347 .cn_stride(19)
37348 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37349 }
37350 }
37351 }
37352
37353 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16_strided_a) {
37354 TEST_REQUIRES_X86_AVX512F;
37355 for (uint32_t n = 32; n <= 48; n += 16) {
37356 for (size_t k = 1; k <= 5; k += 2) {
37357 GemmMicrokernelTester()
37358 .mr(6)
37359 .nr(16)
37360 .kr(1)
37361 .sr(1)
37362 .m(6)
37363 .n(n)
37364 .k(k)
37365 .a_stride(7)
37366 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37367 }
37368 }
37369 }
37370
37371 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, n_div_16_subtile) {
37372 TEST_REQUIRES_X86_AVX512F;
37373 for (uint32_t n = 32; n <= 48; n += 16) {
37374 for (size_t k = 1; k <= 5; k += 2) {
37375 for (uint32_t m = 1; m <= 6; m++) {
37376 GemmMicrokernelTester()
37377 .mr(6)
37378 .nr(16)
37379 .kr(1)
37380 .sr(1)
37381 .m(m)
37382 .n(n)
37383 .k(k)
37384 .iterations(1)
37385 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37386 }
37387 }
37388 }
37389 }
37390
37391 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, strided_cm_subtile) {
37392 TEST_REQUIRES_X86_AVX512F;
37393 for (size_t k = 1; k <= 5; k += 2) {
37394 for (uint32_t m = 1; m <= 6; m++) {
37395 for (uint32_t n = 1; n <= 16; n++) {
37396 GemmMicrokernelTester()
37397 .mr(6)
37398 .nr(16)
37399 .kr(1)
37400 .sr(1)
37401 .m(m)
37402 .n(n)
37403 .k(k)
37404 .cm_stride(19)
37405 .iterations(1)
37406 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37407 }
37408 }
37409 }
37410 }
37411
37412 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, qmin) {
37413 TEST_REQUIRES_X86_AVX512F;
37414 GemmMicrokernelTester()
37415 .mr(6)
37416 .nr(16)
37417 .kr(1)
37418 .sr(1)
37419 .m(6)
37420 .n(16)
37421 .k(1)
37422 .qmin(128)
37423 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37424 }
37425
37426 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, qmax) {
37427 TEST_REQUIRES_X86_AVX512F;
37428 GemmMicrokernelTester()
37429 .mr(6)
37430 .nr(16)
37431 .kr(1)
37432 .sr(1)
37433 .m(6)
37434 .n(16)
37435 .k(1)
37436 .qmax(128)
37437 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37438 }
37439
37440 TEST(F32_GEMM_6X16__AVX512F_BROADCAST, strided_cm) {
37441 TEST_REQUIRES_X86_AVX512F;
37442 GemmMicrokernelTester()
37443 .mr(6)
37444 .nr(16)
37445 .kr(1)
37446 .sr(1)
37447 .m(6)
37448 .n(16)
37449 .k(1)
37450 .cm_stride(19)
37451 .Test(xnn_f32_gemm_ukernel_6x16__avx512f_broadcast);
37452 }
37453#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37454
37455
37456#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37457 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1) {
37458 TEST_REQUIRES_X86_AVX512F;
37459 GemmMicrokernelTester()
37460 .mr(7)
37461 .nr(16)
37462 .kr(1)
37463 .sr(1)
37464 .m(7)
37465 .n(16)
37466 .k(1)
37467 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37468 }
37469
37470 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, strided_cn) {
37471 TEST_REQUIRES_X86_AVX512F;
37472 GemmMicrokernelTester()
37473 .mr(7)
37474 .nr(16)
37475 .kr(1)
37476 .sr(1)
37477 .m(7)
37478 .n(16)
37479 .k(1)
37480 .cn_stride(19)
37481 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37482 }
37483
37484 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
37485 TEST_REQUIRES_X86_AVX512F;
37486 GemmMicrokernelTester()
37487 .mr(7)
37488 .nr(16)
37489 .kr(1)
37490 .sr(1)
37491 .m(7)
37492 .n(16)
37493 .k(1)
37494 .a_stride(3)
37495 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37496 }
37497
37498 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile) {
37499 TEST_REQUIRES_X86_AVX512F;
37500 for (uint32_t m = 1; m <= 7; m++) {
37501 for (uint32_t n = 1; n <= 16; n++) {
37502 GemmMicrokernelTester()
37503 .mr(7)
37504 .nr(16)
37505 .kr(1)
37506 .sr(1)
37507 .m(m)
37508 .n(n)
37509 .k(1)
37510 .iterations(1)
37511 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37512 }
37513 }
37514 }
37515
37516 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
37517 TEST_REQUIRES_X86_AVX512F;
37518 for (uint32_t m = 1; m <= 7; m++) {
37519 GemmMicrokernelTester()
37520 .mr(7)
37521 .nr(16)
37522 .kr(1)
37523 .sr(1)
37524 .m(m)
37525 .n(16)
37526 .k(1)
37527 .iterations(1)
37528 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37529 }
37530 }
37531
37532 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
37533 TEST_REQUIRES_X86_AVX512F;
37534 for (uint32_t n = 1; n <= 16; n++) {
37535 GemmMicrokernelTester()
37536 .mr(7)
37537 .nr(16)
37538 .kr(1)
37539 .sr(1)
37540 .m(7)
37541 .n(n)
37542 .k(1)
37543 .iterations(1)
37544 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37545 }
37546 }
37547
37548 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_gt_1) {
37549 TEST_REQUIRES_X86_AVX512F;
37550 for (size_t k = 2; k < 10; k++) {
37551 GemmMicrokernelTester()
37552 .mr(7)
37553 .nr(16)
37554 .kr(1)
37555 .sr(1)
37556 .m(7)
37557 .n(16)
37558 .k(k)
37559 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37560 }
37561 }
37562
37563 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
37564 TEST_REQUIRES_X86_AVX512F;
37565 for (size_t k = 2; k < 10; k++) {
37566 GemmMicrokernelTester()
37567 .mr(7)
37568 .nr(16)
37569 .kr(1)
37570 .sr(1)
37571 .m(7)
37572 .n(16)
37573 .k(k)
37574 .a_stride(11)
37575 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37576 }
37577 }
37578
37579 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, k_gt_1_subtile) {
37580 TEST_REQUIRES_X86_AVX512F;
37581 for (size_t k = 2; k < 10; k++) {
37582 for (uint32_t m = 1; m <= 7; m++) {
37583 for (uint32_t n = 1; n <= 16; n++) {
37584 GemmMicrokernelTester()
37585 .mr(7)
37586 .nr(16)
37587 .kr(1)
37588 .sr(1)
37589 .m(m)
37590 .n(n)
37591 .k(k)
37592 .iterations(1)
37593 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37594 }
37595 }
37596 }
37597 }
37598
37599 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16) {
37600 TEST_REQUIRES_X86_AVX512F;
37601 for (uint32_t n = 17; n < 32; n++) {
37602 for (size_t k = 1; k <= 5; k += 2) {
37603 GemmMicrokernelTester()
37604 .mr(7)
37605 .nr(16)
37606 .kr(1)
37607 .sr(1)
37608 .m(7)
37609 .n(16)
37610 .k(k)
37611 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37612 }
37613 }
37614 }
37615
37616 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
37617 TEST_REQUIRES_X86_AVX512F;
37618 for (uint32_t n = 17; n < 32; n++) {
37619 for (size_t k = 1; k <= 5; k += 2) {
37620 GemmMicrokernelTester()
37621 .mr(7)
37622 .nr(16)
37623 .kr(1)
37624 .sr(1)
37625 .m(7)
37626 .n(16)
37627 .k(k)
37628 .cn_stride(19)
37629 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37630 }
37631 }
37632 }
37633
37634 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
37635 TEST_REQUIRES_X86_AVX512F;
37636 for (uint32_t n = 17; n < 32; n++) {
37637 for (size_t k = 1; k <= 5; k += 2) {
37638 GemmMicrokernelTester()
37639 .mr(7)
37640 .nr(16)
37641 .kr(1)
37642 .sr(1)
37643 .m(7)
37644 .n(n)
37645 .k(k)
37646 .a_stride(7)
37647 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37648 }
37649 }
37650 }
37651
37652 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_gt_16_subtile) {
37653 TEST_REQUIRES_X86_AVX512F;
37654 for (uint32_t n = 17; n < 32; n++) {
37655 for (size_t k = 1; k <= 5; k += 2) {
37656 for (uint32_t m = 1; m <= 7; m++) {
37657 GemmMicrokernelTester()
37658 .mr(7)
37659 .nr(16)
37660 .kr(1)
37661 .sr(1)
37662 .m(m)
37663 .n(n)
37664 .k(k)
37665 .iterations(1)
37666 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37667 }
37668 }
37669 }
37670 }
37671
37672 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16) {
37673 TEST_REQUIRES_X86_AVX512F;
37674 for (uint32_t n = 32; n <= 48; n += 16) {
37675 for (size_t k = 1; k <= 5; k += 2) {
37676 GemmMicrokernelTester()
37677 .mr(7)
37678 .nr(16)
37679 .kr(1)
37680 .sr(1)
37681 .m(7)
37682 .n(16)
37683 .k(k)
37684 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37685 }
37686 }
37687 }
37688
37689 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
37690 TEST_REQUIRES_X86_AVX512F;
37691 for (uint32_t n = 32; n <= 48; n += 16) {
37692 for (size_t k = 1; k <= 5; k += 2) {
37693 GemmMicrokernelTester()
37694 .mr(7)
37695 .nr(16)
37696 .kr(1)
37697 .sr(1)
37698 .m(7)
37699 .n(n)
37700 .k(k)
37701 .cn_stride(19)
37702 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37703 }
37704 }
37705 }
37706
37707 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16_strided_a) {
37708 TEST_REQUIRES_X86_AVX512F;
37709 for (uint32_t n = 32; n <= 48; n += 16) {
37710 for (size_t k = 1; k <= 5; k += 2) {
37711 GemmMicrokernelTester()
37712 .mr(7)
37713 .nr(16)
37714 .kr(1)
37715 .sr(1)
37716 .m(7)
37717 .n(n)
37718 .k(k)
37719 .a_stride(7)
37720 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37721 }
37722 }
37723 }
37724
37725 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, n_div_16_subtile) {
37726 TEST_REQUIRES_X86_AVX512F;
37727 for (uint32_t n = 32; n <= 48; n += 16) {
37728 for (size_t k = 1; k <= 5; k += 2) {
37729 for (uint32_t m = 1; m <= 7; m++) {
37730 GemmMicrokernelTester()
37731 .mr(7)
37732 .nr(16)
37733 .kr(1)
37734 .sr(1)
37735 .m(m)
37736 .n(n)
37737 .k(k)
37738 .iterations(1)
37739 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37740 }
37741 }
37742 }
37743 }
37744
37745 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, strided_cm_subtile) {
37746 TEST_REQUIRES_X86_AVX512F;
37747 for (size_t k = 1; k <= 5; k += 2) {
37748 for (uint32_t m = 1; m <= 7; m++) {
37749 for (uint32_t n = 1; n <= 16; n++) {
37750 GemmMicrokernelTester()
37751 .mr(7)
37752 .nr(16)
37753 .kr(1)
37754 .sr(1)
37755 .m(m)
37756 .n(n)
37757 .k(k)
37758 .cm_stride(19)
37759 .iterations(1)
37760 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37761 }
37762 }
37763 }
37764 }
37765
37766 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, qmin) {
37767 TEST_REQUIRES_X86_AVX512F;
37768 GemmMicrokernelTester()
37769 .mr(7)
37770 .nr(16)
37771 .kr(1)
37772 .sr(1)
37773 .m(7)
37774 .n(16)
37775 .k(1)
37776 .qmin(128)
37777 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37778 }
37779
37780 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, qmax) {
37781 TEST_REQUIRES_X86_AVX512F;
37782 GemmMicrokernelTester()
37783 .mr(7)
37784 .nr(16)
37785 .kr(1)
37786 .sr(1)
37787 .m(7)
37788 .n(16)
37789 .k(1)
37790 .qmax(128)
37791 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37792 }
37793
37794 TEST(F32_GEMM_7X16__AVX512F_BROADCAST, strided_cm) {
37795 TEST_REQUIRES_X86_AVX512F;
37796 GemmMicrokernelTester()
37797 .mr(7)
37798 .nr(16)
37799 .kr(1)
37800 .sr(1)
37801 .m(7)
37802 .n(16)
37803 .k(1)
37804 .cm_stride(19)
37805 .Test(xnn_f32_gemm_ukernel_7x16__avx512f_broadcast);
37806 }
37807#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
37808
37809
37810#if XNN_ARCH_X86 || XNN_ARCH_X86_64
37811 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1) {
37812 TEST_REQUIRES_X86_AVX512F;
37813 GemmMicrokernelTester()
37814 .mr(8)
37815 .nr(16)
37816 .kr(1)
37817 .sr(1)
37818 .m(8)
37819 .n(16)
37820 .k(1)
37821 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37822 }
37823
37824 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, strided_cn) {
37825 TEST_REQUIRES_X86_AVX512F;
37826 GemmMicrokernelTester()
37827 .mr(8)
37828 .nr(16)
37829 .kr(1)
37830 .sr(1)
37831 .m(8)
37832 .n(16)
37833 .k(1)
37834 .cn_stride(19)
37835 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37836 }
37837
37838 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_strided_a) {
37839 TEST_REQUIRES_X86_AVX512F;
37840 GemmMicrokernelTester()
37841 .mr(8)
37842 .nr(16)
37843 .kr(1)
37844 .sr(1)
37845 .m(8)
37846 .n(16)
37847 .k(1)
37848 .a_stride(3)
37849 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37850 }
37851
37852 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile) {
37853 TEST_REQUIRES_X86_AVX512F;
37854 for (uint32_t m = 1; m <= 8; m++) {
37855 for (uint32_t n = 1; n <= 16; n++) {
37856 GemmMicrokernelTester()
37857 .mr(8)
37858 .nr(16)
37859 .kr(1)
37860 .sr(1)
37861 .m(m)
37862 .n(n)
37863 .k(1)
37864 .iterations(1)
37865 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37866 }
37867 }
37868 }
37869
37870 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_m) {
37871 TEST_REQUIRES_X86_AVX512F;
37872 for (uint32_t m = 1; m <= 8; m++) {
37873 GemmMicrokernelTester()
37874 .mr(8)
37875 .nr(16)
37876 .kr(1)
37877 .sr(1)
37878 .m(m)
37879 .n(16)
37880 .k(1)
37881 .iterations(1)
37882 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37883 }
37884 }
37885
37886 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_eq_1_subtile_n) {
37887 TEST_REQUIRES_X86_AVX512F;
37888 for (uint32_t n = 1; n <= 16; n++) {
37889 GemmMicrokernelTester()
37890 .mr(8)
37891 .nr(16)
37892 .kr(1)
37893 .sr(1)
37894 .m(8)
37895 .n(n)
37896 .k(1)
37897 .iterations(1)
37898 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37899 }
37900 }
37901
37902 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_gt_1) {
37903 TEST_REQUIRES_X86_AVX512F;
37904 for (size_t k = 2; k < 10; k++) {
37905 GemmMicrokernelTester()
37906 .mr(8)
37907 .nr(16)
37908 .kr(1)
37909 .sr(1)
37910 .m(8)
37911 .n(16)
37912 .k(k)
37913 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37914 }
37915 }
37916
37917 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_gt_1_strided_a) {
37918 TEST_REQUIRES_X86_AVX512F;
37919 for (size_t k = 2; k < 10; k++) {
37920 GemmMicrokernelTester()
37921 .mr(8)
37922 .nr(16)
37923 .kr(1)
37924 .sr(1)
37925 .m(8)
37926 .n(16)
37927 .k(k)
37928 .a_stride(11)
37929 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37930 }
37931 }
37932
37933 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, k_gt_1_subtile) {
37934 TEST_REQUIRES_X86_AVX512F;
37935 for (size_t k = 2; k < 10; k++) {
37936 for (uint32_t m = 1; m <= 8; m++) {
37937 for (uint32_t n = 1; n <= 16; n++) {
37938 GemmMicrokernelTester()
37939 .mr(8)
37940 .nr(16)
37941 .kr(1)
37942 .sr(1)
37943 .m(m)
37944 .n(n)
37945 .k(k)
37946 .iterations(1)
37947 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37948 }
37949 }
37950 }
37951 }
37952
37953 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16) {
37954 TEST_REQUIRES_X86_AVX512F;
37955 for (uint32_t n = 17; n < 32; n++) {
37956 for (size_t k = 1; k <= 5; k += 2) {
37957 GemmMicrokernelTester()
37958 .mr(8)
37959 .nr(16)
37960 .kr(1)
37961 .sr(1)
37962 .m(8)
37963 .n(16)
37964 .k(k)
37965 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37966 }
37967 }
37968 }
37969
37970 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16_strided_cn) {
37971 TEST_REQUIRES_X86_AVX512F;
37972 for (uint32_t n = 17; n < 32; n++) {
37973 for (size_t k = 1; k <= 5; k += 2) {
37974 GemmMicrokernelTester()
37975 .mr(8)
37976 .nr(16)
37977 .kr(1)
37978 .sr(1)
37979 .m(8)
37980 .n(16)
37981 .k(k)
37982 .cn_stride(19)
37983 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
37984 }
37985 }
37986 }
37987
37988 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16_strided_a) {
37989 TEST_REQUIRES_X86_AVX512F;
37990 for (uint32_t n = 17; n < 32; n++) {
37991 for (size_t k = 1; k <= 5; k += 2) {
37992 GemmMicrokernelTester()
37993 .mr(8)
37994 .nr(16)
37995 .kr(1)
37996 .sr(1)
37997 .m(8)
37998 .n(n)
37999 .k(k)
38000 .a_stride(7)
38001 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38002 }
38003 }
38004 }
38005
38006 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_gt_16_subtile) {
38007 TEST_REQUIRES_X86_AVX512F;
38008 for (uint32_t n = 17; n < 32; n++) {
38009 for (size_t k = 1; k <= 5; k += 2) {
38010 for (uint32_t m = 1; m <= 8; m++) {
38011 GemmMicrokernelTester()
38012 .mr(8)
38013 .nr(16)
38014 .kr(1)
38015 .sr(1)
38016 .m(m)
38017 .n(n)
38018 .k(k)
38019 .iterations(1)
38020 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38021 }
38022 }
38023 }
38024 }
38025
38026 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16) {
38027 TEST_REQUIRES_X86_AVX512F;
38028 for (uint32_t n = 32; n <= 48; n += 16) {
38029 for (size_t k = 1; k <= 5; k += 2) {
38030 GemmMicrokernelTester()
38031 .mr(8)
38032 .nr(16)
38033 .kr(1)
38034 .sr(1)
38035 .m(8)
38036 .n(16)
38037 .k(k)
38038 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38039 }
38040 }
38041 }
38042
38043 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16_strided_cn) {
38044 TEST_REQUIRES_X86_AVX512F;
38045 for (uint32_t n = 32; n <= 48; n += 16) {
38046 for (size_t k = 1; k <= 5; k += 2) {
38047 GemmMicrokernelTester()
38048 .mr(8)
38049 .nr(16)
38050 .kr(1)
38051 .sr(1)
38052 .m(8)
38053 .n(n)
38054 .k(k)
38055 .cn_stride(19)
38056 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38057 }
38058 }
38059 }
38060
38061 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16_strided_a) {
38062 TEST_REQUIRES_X86_AVX512F;
38063 for (uint32_t n = 32; n <= 48; n += 16) {
38064 for (size_t k = 1; k <= 5; k += 2) {
38065 GemmMicrokernelTester()
38066 .mr(8)
38067 .nr(16)
38068 .kr(1)
38069 .sr(1)
38070 .m(8)
38071 .n(n)
38072 .k(k)
38073 .a_stride(7)
38074 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38075 }
38076 }
38077 }
38078
38079 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, n_div_16_subtile) {
38080 TEST_REQUIRES_X86_AVX512F;
38081 for (uint32_t n = 32; n <= 48; n += 16) {
38082 for (size_t k = 1; k <= 5; k += 2) {
38083 for (uint32_t m = 1; m <= 8; m++) {
38084 GemmMicrokernelTester()
38085 .mr(8)
38086 .nr(16)
38087 .kr(1)
38088 .sr(1)
38089 .m(m)
38090 .n(n)
38091 .k(k)
38092 .iterations(1)
38093 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38094 }
38095 }
38096 }
38097 }
38098
38099 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, strided_cm_subtile) {
38100 TEST_REQUIRES_X86_AVX512F;
38101 for (size_t k = 1; k <= 5; k += 2) {
38102 for (uint32_t m = 1; m <= 8; m++) {
38103 for (uint32_t n = 1; n <= 16; n++) {
38104 GemmMicrokernelTester()
38105 .mr(8)
38106 .nr(16)
38107 .kr(1)
38108 .sr(1)
38109 .m(m)
38110 .n(n)
38111 .k(k)
38112 .cm_stride(19)
38113 .iterations(1)
38114 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38115 }
38116 }
38117 }
38118 }
38119
38120 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, qmin) {
38121 TEST_REQUIRES_X86_AVX512F;
38122 GemmMicrokernelTester()
38123 .mr(8)
38124 .nr(16)
38125 .kr(1)
38126 .sr(1)
38127 .m(8)
38128 .n(16)
38129 .k(1)
38130 .qmin(128)
38131 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38132 }
38133
38134 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, qmax) {
38135 TEST_REQUIRES_X86_AVX512F;
38136 GemmMicrokernelTester()
38137 .mr(8)
38138 .nr(16)
38139 .kr(1)
38140 .sr(1)
38141 .m(8)
38142 .n(16)
38143 .k(1)
38144 .qmax(128)
38145 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38146 }
38147
38148 TEST(F32_GEMM_8X16__AVX512F_BROADCAST, strided_cm) {
38149 TEST_REQUIRES_X86_AVX512F;
38150 GemmMicrokernelTester()
38151 .mr(8)
38152 .nr(16)
38153 .kr(1)
38154 .sr(1)
38155 .m(8)
38156 .n(16)
38157 .k(1)
38158 .cm_stride(19)
38159 .Test(xnn_f32_gemm_ukernel_8x16__avx512f_broadcast);
38160 }
38161#endif // XNN_ARCH_X86 || XNN_ARCH_X86_64
38162
38163
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038164#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038165 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1) {
38166 TEST_REQUIRES_PSIMD;
38167 GemmMicrokernelTester()
38168 .mr(1)
38169 .nr(8)
38170 .kr(1)
38171 .sr(1)
38172 .m(1)
38173 .n(8)
38174 .k(1)
38175 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38176 }
38177
38178 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, strided_cn) {
38179 TEST_REQUIRES_PSIMD;
38180 GemmMicrokernelTester()
38181 .mr(1)
38182 .nr(8)
38183 .kr(1)
38184 .sr(1)
38185 .m(1)
38186 .n(8)
38187 .k(1)
38188 .cn_stride(11)
38189 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38190 }
38191
38192 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
38193 TEST_REQUIRES_PSIMD;
38194 GemmMicrokernelTester()
38195 .mr(1)
38196 .nr(8)
38197 .kr(1)
38198 .sr(1)
38199 .m(1)
38200 .n(8)
38201 .k(1)
38202 .a_stride(3)
38203 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38204 }
38205
38206 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
38207 TEST_REQUIRES_PSIMD;
38208 for (uint32_t m = 1; m <= 1; m++) {
38209 for (uint32_t n = 1; n <= 8; n++) {
38210 GemmMicrokernelTester()
38211 .mr(1)
38212 .nr(8)
38213 .kr(1)
38214 .sr(1)
38215 .m(m)
38216 .n(n)
38217 .k(1)
38218 .iterations(1)
38219 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38220 }
38221 }
38222 }
38223
38224 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
38225 TEST_REQUIRES_PSIMD;
38226 for (uint32_t m = 1; m <= 1; m++) {
38227 GemmMicrokernelTester()
38228 .mr(1)
38229 .nr(8)
38230 .kr(1)
38231 .sr(1)
38232 .m(m)
38233 .n(8)
38234 .k(1)
38235 .iterations(1)
38236 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38237 }
38238 }
38239
38240 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
38241 TEST_REQUIRES_PSIMD;
38242 for (uint32_t n = 1; n <= 8; n++) {
38243 GemmMicrokernelTester()
38244 .mr(1)
38245 .nr(8)
38246 .kr(1)
38247 .sr(1)
38248 .m(1)
38249 .n(n)
38250 .k(1)
38251 .iterations(1)
38252 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38253 }
38254 }
38255
38256 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_gt_1) {
38257 TEST_REQUIRES_PSIMD;
38258 for (size_t k = 2; k < 10; k++) {
38259 GemmMicrokernelTester()
38260 .mr(1)
38261 .nr(8)
38262 .kr(1)
38263 .sr(1)
38264 .m(1)
38265 .n(8)
38266 .k(k)
38267 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38268 }
38269 }
38270
38271 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
38272 TEST_REQUIRES_PSIMD;
38273 for (size_t k = 2; k < 10; k++) {
38274 GemmMicrokernelTester()
38275 .mr(1)
38276 .nr(8)
38277 .kr(1)
38278 .sr(1)
38279 .m(1)
38280 .n(8)
38281 .k(k)
38282 .a_stride(11)
38283 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38284 }
38285 }
38286
38287 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
38288 TEST_REQUIRES_PSIMD;
38289 for (size_t k = 2; k < 10; k++) {
38290 for (uint32_t m = 1; m <= 1; m++) {
38291 for (uint32_t n = 1; n <= 8; n++) {
38292 GemmMicrokernelTester()
38293 .mr(1)
38294 .nr(8)
38295 .kr(1)
38296 .sr(1)
38297 .m(m)
38298 .n(n)
38299 .k(k)
38300 .iterations(1)
38301 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38302 }
38303 }
38304 }
38305 }
38306
38307 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8) {
38308 TEST_REQUIRES_PSIMD;
38309 for (uint32_t n = 9; n < 16; n++) {
38310 for (size_t k = 1; k <= 5; k += 2) {
38311 GemmMicrokernelTester()
38312 .mr(1)
38313 .nr(8)
38314 .kr(1)
38315 .sr(1)
38316 .m(1)
38317 .n(8)
38318 .k(k)
38319 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38320 }
38321 }
38322 }
38323
38324 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
38325 TEST_REQUIRES_PSIMD;
38326 for (uint32_t n = 9; n < 16; n++) {
38327 for (size_t k = 1; k <= 5; k += 2) {
38328 GemmMicrokernelTester()
38329 .mr(1)
38330 .nr(8)
38331 .kr(1)
38332 .sr(1)
38333 .m(1)
38334 .n(8)
38335 .k(k)
38336 .cn_stride(11)
38337 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38338 }
38339 }
38340 }
38341
38342 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
38343 TEST_REQUIRES_PSIMD;
38344 for (uint32_t n = 9; n < 16; n++) {
38345 for (size_t k = 1; k <= 5; k += 2) {
38346 GemmMicrokernelTester()
38347 .mr(1)
38348 .nr(8)
38349 .kr(1)
38350 .sr(1)
38351 .m(1)
38352 .n(n)
38353 .k(k)
38354 .a_stride(7)
38355 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38356 }
38357 }
38358 }
38359
38360 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
38361 TEST_REQUIRES_PSIMD;
38362 for (uint32_t n = 9; n < 16; n++) {
38363 for (size_t k = 1; k <= 5; k += 2) {
38364 for (uint32_t m = 1; m <= 1; m++) {
38365 GemmMicrokernelTester()
38366 .mr(1)
38367 .nr(8)
38368 .kr(1)
38369 .sr(1)
38370 .m(m)
38371 .n(n)
38372 .k(k)
38373 .iterations(1)
38374 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38375 }
38376 }
38377 }
38378 }
38379
38380 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8) {
38381 TEST_REQUIRES_PSIMD;
38382 for (uint32_t n = 16; n <= 24; n += 8) {
38383 for (size_t k = 1; k <= 5; k += 2) {
38384 GemmMicrokernelTester()
38385 .mr(1)
38386 .nr(8)
38387 .kr(1)
38388 .sr(1)
38389 .m(1)
38390 .n(8)
38391 .k(k)
38392 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38393 }
38394 }
38395 }
38396
38397 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
38398 TEST_REQUIRES_PSIMD;
38399 for (uint32_t n = 16; n <= 24; n += 8) {
38400 for (size_t k = 1; k <= 5; k += 2) {
38401 GemmMicrokernelTester()
38402 .mr(1)
38403 .nr(8)
38404 .kr(1)
38405 .sr(1)
38406 .m(1)
38407 .n(n)
38408 .k(k)
38409 .cn_stride(11)
38410 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38411 }
38412 }
38413 }
38414
38415 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
38416 TEST_REQUIRES_PSIMD;
38417 for (uint32_t n = 16; n <= 24; n += 8) {
38418 for (size_t k = 1; k <= 5; k += 2) {
38419 GemmMicrokernelTester()
38420 .mr(1)
38421 .nr(8)
38422 .kr(1)
38423 .sr(1)
38424 .m(1)
38425 .n(n)
38426 .k(k)
38427 .a_stride(7)
38428 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38429 }
38430 }
38431 }
38432
38433 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
38434 TEST_REQUIRES_PSIMD;
38435 for (uint32_t n = 16; n <= 24; n += 8) {
38436 for (size_t k = 1; k <= 5; k += 2) {
38437 for (uint32_t m = 1; m <= 1; m++) {
38438 GemmMicrokernelTester()
38439 .mr(1)
38440 .nr(8)
38441 .kr(1)
38442 .sr(1)
38443 .m(m)
38444 .n(n)
38445 .k(k)
38446 .iterations(1)
38447 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38448 }
38449 }
38450 }
38451 }
38452
38453 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
38454 TEST_REQUIRES_PSIMD;
38455 for (size_t k = 1; k <= 5; k += 2) {
38456 for (uint32_t m = 1; m <= 1; m++) {
38457 for (uint32_t n = 1; n <= 8; n++) {
38458 GemmMicrokernelTester()
38459 .mr(1)
38460 .nr(8)
38461 .kr(1)
38462 .sr(1)
38463 .m(m)
38464 .n(n)
38465 .k(k)
38466 .cm_stride(11)
38467 .iterations(1)
38468 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38469 }
38470 }
38471 }
38472 }
38473
38474 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, qmin) {
38475 TEST_REQUIRES_PSIMD;
38476 GemmMicrokernelTester()
38477 .mr(1)
38478 .nr(8)
38479 .kr(1)
38480 .sr(1)
38481 .m(1)
38482 .n(8)
38483 .k(1)
38484 .qmin(128)
38485 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38486 }
38487
38488 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, qmax) {
38489 TEST_REQUIRES_PSIMD;
38490 GemmMicrokernelTester()
38491 .mr(1)
38492 .nr(8)
38493 .kr(1)
38494 .sr(1)
38495 .m(1)
38496 .n(8)
38497 .k(1)
38498 .qmax(128)
38499 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38500 }
38501
38502 TEST(F32_GEMM_1X8__PSIMD_LOADSPLAT, strided_cm) {
38503 TEST_REQUIRES_PSIMD;
38504 GemmMicrokernelTester()
38505 .mr(1)
38506 .nr(8)
38507 .kr(1)
38508 .sr(1)
38509 .m(1)
38510 .n(8)
38511 .k(1)
38512 .cm_stride(11)
38513 .Test(xnn_f32_gemm_ukernel_1x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38514 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038515#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038516
38517
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038518#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038519 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1) {
38520 TEST_REQUIRES_PSIMD;
38521 GemmMicrokernelTester()
38522 .mr(4)
38523 .nr(8)
38524 .kr(1)
38525 .sr(1)
38526 .m(4)
38527 .n(8)
38528 .k(1)
38529 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38530 }
38531
38532 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, strided_cn) {
38533 TEST_REQUIRES_PSIMD;
38534 GemmMicrokernelTester()
38535 .mr(4)
38536 .nr(8)
38537 .kr(1)
38538 .sr(1)
38539 .m(4)
38540 .n(8)
38541 .k(1)
38542 .cn_stride(11)
38543 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38544 }
38545
38546 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
38547 TEST_REQUIRES_PSIMD;
38548 GemmMicrokernelTester()
38549 .mr(4)
38550 .nr(8)
38551 .kr(1)
38552 .sr(1)
38553 .m(4)
38554 .n(8)
38555 .k(1)
38556 .a_stride(3)
38557 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38558 }
38559
38560 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
38561 TEST_REQUIRES_PSIMD;
38562 for (uint32_t m = 1; m <= 4; m++) {
38563 for (uint32_t n = 1; n <= 8; n++) {
38564 GemmMicrokernelTester()
38565 .mr(4)
38566 .nr(8)
38567 .kr(1)
38568 .sr(1)
38569 .m(m)
38570 .n(n)
38571 .k(1)
38572 .iterations(1)
38573 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38574 }
38575 }
38576 }
38577
38578 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
38579 TEST_REQUIRES_PSIMD;
38580 for (uint32_t m = 1; m <= 4; m++) {
38581 GemmMicrokernelTester()
38582 .mr(4)
38583 .nr(8)
38584 .kr(1)
38585 .sr(1)
38586 .m(m)
38587 .n(8)
38588 .k(1)
38589 .iterations(1)
38590 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38591 }
38592 }
38593
38594 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
38595 TEST_REQUIRES_PSIMD;
38596 for (uint32_t n = 1; n <= 8; n++) {
38597 GemmMicrokernelTester()
38598 .mr(4)
38599 .nr(8)
38600 .kr(1)
38601 .sr(1)
38602 .m(4)
38603 .n(n)
38604 .k(1)
38605 .iterations(1)
38606 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38607 }
38608 }
38609
38610 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_gt_1) {
38611 TEST_REQUIRES_PSIMD;
38612 for (size_t k = 2; k < 10; k++) {
38613 GemmMicrokernelTester()
38614 .mr(4)
38615 .nr(8)
38616 .kr(1)
38617 .sr(1)
38618 .m(4)
38619 .n(8)
38620 .k(k)
38621 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38622 }
38623 }
38624
38625 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
38626 TEST_REQUIRES_PSIMD;
38627 for (size_t k = 2; k < 10; k++) {
38628 GemmMicrokernelTester()
38629 .mr(4)
38630 .nr(8)
38631 .kr(1)
38632 .sr(1)
38633 .m(4)
38634 .n(8)
38635 .k(k)
38636 .a_stride(11)
38637 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38638 }
38639 }
38640
38641 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
38642 TEST_REQUIRES_PSIMD;
38643 for (size_t k = 2; k < 10; k++) {
38644 for (uint32_t m = 1; m <= 4; m++) {
38645 for (uint32_t n = 1; n <= 8; n++) {
38646 GemmMicrokernelTester()
38647 .mr(4)
38648 .nr(8)
38649 .kr(1)
38650 .sr(1)
38651 .m(m)
38652 .n(n)
38653 .k(k)
38654 .iterations(1)
38655 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38656 }
38657 }
38658 }
38659 }
38660
38661 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8) {
38662 TEST_REQUIRES_PSIMD;
38663 for (uint32_t n = 9; n < 16; n++) {
38664 for (size_t k = 1; k <= 5; k += 2) {
38665 GemmMicrokernelTester()
38666 .mr(4)
38667 .nr(8)
38668 .kr(1)
38669 .sr(1)
38670 .m(4)
38671 .n(8)
38672 .k(k)
38673 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38674 }
38675 }
38676 }
38677
38678 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
38679 TEST_REQUIRES_PSIMD;
38680 for (uint32_t n = 9; n < 16; n++) {
38681 for (size_t k = 1; k <= 5; k += 2) {
38682 GemmMicrokernelTester()
38683 .mr(4)
38684 .nr(8)
38685 .kr(1)
38686 .sr(1)
38687 .m(4)
38688 .n(8)
38689 .k(k)
38690 .cn_stride(11)
38691 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38692 }
38693 }
38694 }
38695
38696 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
38697 TEST_REQUIRES_PSIMD;
38698 for (uint32_t n = 9; n < 16; n++) {
38699 for (size_t k = 1; k <= 5; k += 2) {
38700 GemmMicrokernelTester()
38701 .mr(4)
38702 .nr(8)
38703 .kr(1)
38704 .sr(1)
38705 .m(4)
38706 .n(n)
38707 .k(k)
38708 .a_stride(7)
38709 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38710 }
38711 }
38712 }
38713
38714 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
38715 TEST_REQUIRES_PSIMD;
38716 for (uint32_t n = 9; n < 16; n++) {
38717 for (size_t k = 1; k <= 5; k += 2) {
38718 for (uint32_t m = 1; m <= 4; m++) {
38719 GemmMicrokernelTester()
38720 .mr(4)
38721 .nr(8)
38722 .kr(1)
38723 .sr(1)
38724 .m(m)
38725 .n(n)
38726 .k(k)
38727 .iterations(1)
38728 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38729 }
38730 }
38731 }
38732 }
38733
38734 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8) {
38735 TEST_REQUIRES_PSIMD;
38736 for (uint32_t n = 16; n <= 24; n += 8) {
38737 for (size_t k = 1; k <= 5; k += 2) {
38738 GemmMicrokernelTester()
38739 .mr(4)
38740 .nr(8)
38741 .kr(1)
38742 .sr(1)
38743 .m(4)
38744 .n(8)
38745 .k(k)
38746 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38747 }
38748 }
38749 }
38750
38751 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
38752 TEST_REQUIRES_PSIMD;
38753 for (uint32_t n = 16; n <= 24; n += 8) {
38754 for (size_t k = 1; k <= 5; k += 2) {
38755 GemmMicrokernelTester()
38756 .mr(4)
38757 .nr(8)
38758 .kr(1)
38759 .sr(1)
38760 .m(4)
38761 .n(n)
38762 .k(k)
38763 .cn_stride(11)
38764 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38765 }
38766 }
38767 }
38768
38769 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
38770 TEST_REQUIRES_PSIMD;
38771 for (uint32_t n = 16; n <= 24; n += 8) {
38772 for (size_t k = 1; k <= 5; k += 2) {
38773 GemmMicrokernelTester()
38774 .mr(4)
38775 .nr(8)
38776 .kr(1)
38777 .sr(1)
38778 .m(4)
38779 .n(n)
38780 .k(k)
38781 .a_stride(7)
38782 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38783 }
38784 }
38785 }
38786
38787 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
38788 TEST_REQUIRES_PSIMD;
38789 for (uint32_t n = 16; n <= 24; n += 8) {
38790 for (size_t k = 1; k <= 5; k += 2) {
38791 for (uint32_t m = 1; m <= 4; m++) {
38792 GemmMicrokernelTester()
38793 .mr(4)
38794 .nr(8)
38795 .kr(1)
38796 .sr(1)
38797 .m(m)
38798 .n(n)
38799 .k(k)
38800 .iterations(1)
38801 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38802 }
38803 }
38804 }
38805 }
38806
38807 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
38808 TEST_REQUIRES_PSIMD;
38809 for (size_t k = 1; k <= 5; k += 2) {
38810 for (uint32_t m = 1; m <= 4; m++) {
38811 for (uint32_t n = 1; n <= 8; n++) {
38812 GemmMicrokernelTester()
38813 .mr(4)
38814 .nr(8)
38815 .kr(1)
38816 .sr(1)
38817 .m(m)
38818 .n(n)
38819 .k(k)
38820 .cm_stride(11)
38821 .iterations(1)
38822 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38823 }
38824 }
38825 }
38826 }
38827
38828 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, qmin) {
38829 TEST_REQUIRES_PSIMD;
38830 GemmMicrokernelTester()
38831 .mr(4)
38832 .nr(8)
38833 .kr(1)
38834 .sr(1)
38835 .m(4)
38836 .n(8)
38837 .k(1)
38838 .qmin(128)
38839 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38840 }
38841
38842 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, qmax) {
38843 TEST_REQUIRES_PSIMD;
38844 GemmMicrokernelTester()
38845 .mr(4)
38846 .nr(8)
38847 .kr(1)
38848 .sr(1)
38849 .m(4)
38850 .n(8)
38851 .k(1)
38852 .qmax(128)
38853 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38854 }
38855
38856 TEST(F32_GEMM_4X8__PSIMD_LOADSPLAT, strided_cm) {
38857 TEST_REQUIRES_PSIMD;
38858 GemmMicrokernelTester()
38859 .mr(4)
38860 .nr(8)
38861 .kr(1)
38862 .sr(1)
38863 .m(4)
38864 .n(8)
38865 .k(1)
38866 .cm_stride(11)
38867 .Test(xnn_f32_gemm_ukernel_4x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38868 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038869#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038870
38871
Marat Dukhan1dadbf72019-10-01 10:46:20 -070038872#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070038873 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1) {
38874 TEST_REQUIRES_PSIMD;
38875 GemmMicrokernelTester()
38876 .mr(6)
38877 .nr(8)
38878 .kr(1)
38879 .sr(1)
38880 .m(6)
38881 .n(8)
38882 .k(1)
38883 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38884 }
38885
38886 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, strided_cn) {
38887 TEST_REQUIRES_PSIMD;
38888 GemmMicrokernelTester()
38889 .mr(6)
38890 .nr(8)
38891 .kr(1)
38892 .sr(1)
38893 .m(6)
38894 .n(8)
38895 .k(1)
38896 .cn_stride(11)
38897 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38898 }
38899
38900 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_strided_a) {
38901 TEST_REQUIRES_PSIMD;
38902 GemmMicrokernelTester()
38903 .mr(6)
38904 .nr(8)
38905 .kr(1)
38906 .sr(1)
38907 .m(6)
38908 .n(8)
38909 .k(1)
38910 .a_stride(3)
38911 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38912 }
38913
38914 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile) {
38915 TEST_REQUIRES_PSIMD;
38916 for (uint32_t m = 1; m <= 6; m++) {
38917 for (uint32_t n = 1; n <= 8; n++) {
38918 GemmMicrokernelTester()
38919 .mr(6)
38920 .nr(8)
38921 .kr(1)
38922 .sr(1)
38923 .m(m)
38924 .n(n)
38925 .k(1)
38926 .iterations(1)
38927 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38928 }
38929 }
38930 }
38931
38932 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_m) {
38933 TEST_REQUIRES_PSIMD;
38934 for (uint32_t m = 1; m <= 6; m++) {
38935 GemmMicrokernelTester()
38936 .mr(6)
38937 .nr(8)
38938 .kr(1)
38939 .sr(1)
38940 .m(m)
38941 .n(8)
38942 .k(1)
38943 .iterations(1)
38944 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38945 }
38946 }
38947
38948 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_eq_1_subtile_n) {
38949 TEST_REQUIRES_PSIMD;
38950 for (uint32_t n = 1; n <= 8; n++) {
38951 GemmMicrokernelTester()
38952 .mr(6)
38953 .nr(8)
38954 .kr(1)
38955 .sr(1)
38956 .m(6)
38957 .n(n)
38958 .k(1)
38959 .iterations(1)
38960 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38961 }
38962 }
38963
38964 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_gt_1) {
38965 TEST_REQUIRES_PSIMD;
38966 for (size_t k = 2; k < 10; k++) {
38967 GemmMicrokernelTester()
38968 .mr(6)
38969 .nr(8)
38970 .kr(1)
38971 .sr(1)
38972 .m(6)
38973 .n(8)
38974 .k(k)
38975 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38976 }
38977 }
38978
38979 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_gt_1_strided_a) {
38980 TEST_REQUIRES_PSIMD;
38981 for (size_t k = 2; k < 10; k++) {
38982 GemmMicrokernelTester()
38983 .mr(6)
38984 .nr(8)
38985 .kr(1)
38986 .sr(1)
38987 .m(6)
38988 .n(8)
38989 .k(k)
38990 .a_stride(11)
38991 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
38992 }
38993 }
38994
38995 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, k_gt_1_subtile) {
38996 TEST_REQUIRES_PSIMD;
38997 for (size_t k = 2; k < 10; k++) {
38998 for (uint32_t m = 1; m <= 6; m++) {
38999 for (uint32_t n = 1; n <= 8; n++) {
39000 GemmMicrokernelTester()
39001 .mr(6)
39002 .nr(8)
39003 .kr(1)
39004 .sr(1)
39005 .m(m)
39006 .n(n)
39007 .k(k)
39008 .iterations(1)
39009 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39010 }
39011 }
39012 }
39013 }
39014
39015 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8) {
39016 TEST_REQUIRES_PSIMD;
39017 for (uint32_t n = 9; n < 16; n++) {
39018 for (size_t k = 1; k <= 5; k += 2) {
39019 GemmMicrokernelTester()
39020 .mr(6)
39021 .nr(8)
39022 .kr(1)
39023 .sr(1)
39024 .m(6)
39025 .n(8)
39026 .k(k)
39027 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39028 }
39029 }
39030 }
39031
39032 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_cn) {
39033 TEST_REQUIRES_PSIMD;
39034 for (uint32_t n = 9; n < 16; n++) {
39035 for (size_t k = 1; k <= 5; k += 2) {
39036 GemmMicrokernelTester()
39037 .mr(6)
39038 .nr(8)
39039 .kr(1)
39040 .sr(1)
39041 .m(6)
39042 .n(8)
39043 .k(k)
39044 .cn_stride(11)
39045 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39046 }
39047 }
39048 }
39049
39050 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_strided_a) {
39051 TEST_REQUIRES_PSIMD;
39052 for (uint32_t n = 9; n < 16; n++) {
39053 for (size_t k = 1; k <= 5; k += 2) {
39054 GemmMicrokernelTester()
39055 .mr(6)
39056 .nr(8)
39057 .kr(1)
39058 .sr(1)
39059 .m(6)
39060 .n(n)
39061 .k(k)
39062 .a_stride(7)
39063 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39064 }
39065 }
39066 }
39067
39068 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_gt_8_subtile) {
39069 TEST_REQUIRES_PSIMD;
39070 for (uint32_t n = 9; n < 16; n++) {
39071 for (size_t k = 1; k <= 5; k += 2) {
39072 for (uint32_t m = 1; m <= 6; m++) {
39073 GemmMicrokernelTester()
39074 .mr(6)
39075 .nr(8)
39076 .kr(1)
39077 .sr(1)
39078 .m(m)
39079 .n(n)
39080 .k(k)
39081 .iterations(1)
39082 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39083 }
39084 }
39085 }
39086 }
39087
39088 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8) {
39089 TEST_REQUIRES_PSIMD;
39090 for (uint32_t n = 16; n <= 24; n += 8) {
39091 for (size_t k = 1; k <= 5; k += 2) {
39092 GemmMicrokernelTester()
39093 .mr(6)
39094 .nr(8)
39095 .kr(1)
39096 .sr(1)
39097 .m(6)
39098 .n(8)
39099 .k(k)
39100 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39101 }
39102 }
39103 }
39104
39105 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8_strided_cn) {
39106 TEST_REQUIRES_PSIMD;
39107 for (uint32_t n = 16; n <= 24; n += 8) {
39108 for (size_t k = 1; k <= 5; k += 2) {
39109 GemmMicrokernelTester()
39110 .mr(6)
39111 .nr(8)
39112 .kr(1)
39113 .sr(1)
39114 .m(6)
39115 .n(n)
39116 .k(k)
39117 .cn_stride(11)
39118 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39119 }
39120 }
39121 }
39122
39123 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8_strided_a) {
39124 TEST_REQUIRES_PSIMD;
39125 for (uint32_t n = 16; n <= 24; n += 8) {
39126 for (size_t k = 1; k <= 5; k += 2) {
39127 GemmMicrokernelTester()
39128 .mr(6)
39129 .nr(8)
39130 .kr(1)
39131 .sr(1)
39132 .m(6)
39133 .n(n)
39134 .k(k)
39135 .a_stride(7)
39136 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39137 }
39138 }
39139 }
39140
39141 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, n_div_8_subtile) {
39142 TEST_REQUIRES_PSIMD;
39143 for (uint32_t n = 16; n <= 24; n += 8) {
39144 for (size_t k = 1; k <= 5; k += 2) {
39145 for (uint32_t m = 1; m <= 6; m++) {
39146 GemmMicrokernelTester()
39147 .mr(6)
39148 .nr(8)
39149 .kr(1)
39150 .sr(1)
39151 .m(m)
39152 .n(n)
39153 .k(k)
39154 .iterations(1)
39155 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39156 }
39157 }
39158 }
39159 }
39160
39161 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, strided_cm_subtile) {
39162 TEST_REQUIRES_PSIMD;
39163 for (size_t k = 1; k <= 5; k += 2) {
39164 for (uint32_t m = 1; m <= 6; m++) {
39165 for (uint32_t n = 1; n <= 8; n++) {
39166 GemmMicrokernelTester()
39167 .mr(6)
39168 .nr(8)
39169 .kr(1)
39170 .sr(1)
39171 .m(m)
39172 .n(n)
39173 .k(k)
39174 .cm_stride(11)
39175 .iterations(1)
39176 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39177 }
39178 }
39179 }
39180 }
39181
39182 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, qmin) {
39183 TEST_REQUIRES_PSIMD;
39184 GemmMicrokernelTester()
39185 .mr(6)
39186 .nr(8)
39187 .kr(1)
39188 .sr(1)
39189 .m(6)
39190 .n(8)
39191 .k(1)
39192 .qmin(128)
39193 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39194 }
39195
39196 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, qmax) {
39197 TEST_REQUIRES_PSIMD;
39198 GemmMicrokernelTester()
39199 .mr(6)
39200 .nr(8)
39201 .kr(1)
39202 .sr(1)
39203 .m(6)
39204 .n(8)
39205 .k(1)
39206 .qmax(128)
39207 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39208 }
39209
39210 TEST(F32_GEMM_6X8__PSIMD_LOADSPLAT, strided_cm) {
39211 TEST_REQUIRES_PSIMD;
39212 GemmMicrokernelTester()
39213 .mr(6)
39214 .nr(8)
39215 .kr(1)
39216 .sr(1)
39217 .m(6)
39218 .n(8)
39219 .k(1)
39220 .cm_stride(11)
39221 .Test(xnn_f32_gemm_ukernel_6x8__psimd_loadsplat, GemmMicrokernelTester::Variant::Scalar);
39222 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039223#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039224
39225
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039226#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039227 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4) {
39228 TEST_REQUIRES_PSIMD;
39229 GemmMicrokernelTester()
39230 .mr(1)
39231 .nr(8)
39232 .kr(1)
39233 .sr(1)
39234 .m(1)
39235 .n(8)
39236 .k(4)
39237 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39238 }
39239
39240 TEST(F32_GEMM_1X8__PSIMD_SPLAT, strided_cn) {
39241 TEST_REQUIRES_PSIMD;
39242 GemmMicrokernelTester()
39243 .mr(1)
39244 .nr(8)
39245 .kr(1)
39246 .sr(1)
39247 .m(1)
39248 .n(8)
39249 .k(4)
39250 .cn_stride(11)
39251 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39252 }
39253
39254 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_strided_a) {
39255 TEST_REQUIRES_PSIMD;
39256 GemmMicrokernelTester()
39257 .mr(1)
39258 .nr(8)
39259 .kr(1)
39260 .sr(1)
39261 .m(1)
39262 .n(8)
39263 .k(4)
39264 .a_stride(7)
39265 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39266 }
39267
39268 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile) {
39269 TEST_REQUIRES_PSIMD;
39270 for (uint32_t m = 1; m <= 1; m++) {
39271 for (uint32_t n = 1; n <= 8; n++) {
39272 GemmMicrokernelTester()
39273 .mr(1)
39274 .nr(8)
39275 .kr(1)
39276 .sr(1)
39277 .m(m)
39278 .n(n)
39279 .k(4)
39280 .iterations(1)
39281 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39282 }
39283 }
39284 }
39285
39286 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
39287 TEST_REQUIRES_PSIMD;
39288 for (uint32_t m = 1; m <= 1; m++) {
39289 GemmMicrokernelTester()
39290 .mr(1)
39291 .nr(8)
39292 .kr(1)
39293 .sr(1)
39294 .m(m)
39295 .n(8)
39296 .k(4)
39297 .iterations(1)
39298 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39299 }
39300 }
39301
39302 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
39303 TEST_REQUIRES_PSIMD;
39304 for (uint32_t n = 1; n <= 8; n++) {
39305 GemmMicrokernelTester()
39306 .mr(1)
39307 .nr(8)
39308 .kr(1)
39309 .sr(1)
39310 .m(1)
39311 .n(n)
39312 .k(4)
39313 .iterations(1)
39314 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39315 }
39316 }
39317
39318 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_lt_4) {
39319 TEST_REQUIRES_PSIMD;
39320 for (size_t k = 1; k < 4; k++) {
39321 GemmMicrokernelTester()
39322 .mr(1)
39323 .nr(8)
39324 .kr(1)
39325 .sr(1)
39326 .m(1)
39327 .n(8)
39328 .k(k)
39329 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39330 }
39331 }
39332
39333 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_lt_4_strided_a) {
39334 TEST_REQUIRES_PSIMD;
39335 for (size_t k = 1; k < 4; k++) {
39336 GemmMicrokernelTester()
39337 .mr(1)
39338 .nr(8)
39339 .kr(1)
39340 .sr(1)
39341 .m(1)
39342 .n(8)
39343 .k(k)
39344 .a_stride(7)
39345 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39346 }
39347 }
39348
39349 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_lt_4_subtile) {
39350 TEST_REQUIRES_PSIMD;
39351 for (size_t k = 1; k < 4; k++) {
39352 for (uint32_t m = 1; m <= 1; m++) {
39353 for (uint32_t n = 1; n <= 8; n++) {
39354 GemmMicrokernelTester()
39355 .mr(1)
39356 .nr(8)
39357 .kr(1)
39358 .sr(1)
39359 .m(m)
39360 .n(n)
39361 .k(k)
39362 .iterations(1)
39363 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39364 }
39365 }
39366 }
39367 }
39368
39369 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_gt_4) {
39370 TEST_REQUIRES_PSIMD;
39371 for (size_t k = 5; k < 8; k++) {
39372 GemmMicrokernelTester()
39373 .mr(1)
39374 .nr(8)
39375 .kr(1)
39376 .sr(1)
39377 .m(1)
39378 .n(8)
39379 .k(k)
39380 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39381 }
39382 }
39383
39384 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_gt_4_strided_a) {
39385 TEST_REQUIRES_PSIMD;
39386 for (size_t k = 5; k < 8; k++) {
39387 GemmMicrokernelTester()
39388 .mr(1)
39389 .nr(8)
39390 .kr(1)
39391 .sr(1)
39392 .m(1)
39393 .n(8)
39394 .k(k)
39395 .a_stride(11)
39396 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39397 }
39398 }
39399
39400 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_gt_4_subtile) {
39401 TEST_REQUIRES_PSIMD;
39402 for (size_t k = 5; k < 8; k++) {
39403 for (uint32_t m = 1; m <= 1; m++) {
39404 for (uint32_t n = 1; n <= 8; n++) {
39405 GemmMicrokernelTester()
39406 .mr(1)
39407 .nr(8)
39408 .kr(1)
39409 .sr(1)
39410 .m(m)
39411 .n(n)
39412 .k(k)
39413 .iterations(1)
39414 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39415 }
39416 }
39417 }
39418 }
39419
39420 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_div_4) {
39421 TEST_REQUIRES_PSIMD;
39422 for (size_t k = 8; k <= 40; k += 4) {
39423 GemmMicrokernelTester()
39424 .mr(1)
39425 .nr(8)
39426 .kr(1)
39427 .sr(1)
39428 .m(1)
39429 .n(8)
39430 .k(k)
39431 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39432 }
39433 }
39434
39435 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_div_4_strided_a) {
39436 TEST_REQUIRES_PSIMD;
39437 for (size_t k = 8; k <= 40; k += 4) {
39438 GemmMicrokernelTester()
39439 .mr(1)
39440 .nr(8)
39441 .kr(1)
39442 .sr(1)
39443 .m(1)
39444 .n(8)
39445 .k(k)
39446 .a_stride(43)
39447 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39448 }
39449 }
39450
39451 TEST(F32_GEMM_1X8__PSIMD_SPLAT, k_div_4_subtile) {
39452 TEST_REQUIRES_PSIMD;
39453 for (size_t k = 8; k <= 40; k += 4) {
39454 for (uint32_t m = 1; m <= 1; m++) {
39455 for (uint32_t n = 1; n <= 8; n++) {
39456 GemmMicrokernelTester()
39457 .mr(1)
39458 .nr(8)
39459 .kr(1)
39460 .sr(1)
39461 .m(m)
39462 .n(n)
39463 .k(k)
39464 .iterations(1)
39465 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39466 }
39467 }
39468 }
39469 }
39470
39471 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8) {
39472 TEST_REQUIRES_PSIMD;
39473 for (uint32_t n = 9; n < 16; n++) {
39474 for (size_t k = 1; k <= 20; k += 5) {
39475 GemmMicrokernelTester()
39476 .mr(1)
39477 .nr(8)
39478 .kr(1)
39479 .sr(1)
39480 .m(1)
39481 .n(8)
39482 .k(k)
39483 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39484 }
39485 }
39486 }
39487
39488 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
39489 TEST_REQUIRES_PSIMD;
39490 for (uint32_t n = 9; n < 16; n++) {
39491 for (size_t k = 1; k <= 20; k += 5) {
39492 GemmMicrokernelTester()
39493 .mr(1)
39494 .nr(8)
39495 .kr(1)
39496 .sr(1)
39497 .m(1)
39498 .n(8)
39499 .k(k)
39500 .cn_stride(11)
39501 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39502 }
39503 }
39504 }
39505
39506 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8_strided_a) {
39507 TEST_REQUIRES_PSIMD;
39508 for (uint32_t n = 9; n < 16; n++) {
39509 for (size_t k = 1; k <= 20; k += 5) {
39510 GemmMicrokernelTester()
39511 .mr(1)
39512 .nr(8)
39513 .kr(1)
39514 .sr(1)
39515 .m(1)
39516 .n(n)
39517 .k(k)
39518 .a_stride(23)
39519 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39520 }
39521 }
39522 }
39523
39524 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_gt_8_subtile) {
39525 TEST_REQUIRES_PSIMD;
39526 for (uint32_t n = 9; n < 16; n++) {
39527 for (size_t k = 1; k <= 20; k += 5) {
39528 for (uint32_t m = 1; m <= 1; m++) {
39529 GemmMicrokernelTester()
39530 .mr(1)
39531 .nr(8)
39532 .kr(1)
39533 .sr(1)
39534 .m(m)
39535 .n(n)
39536 .k(k)
39537 .iterations(1)
39538 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39539 }
39540 }
39541 }
39542 }
39543
39544 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8) {
39545 TEST_REQUIRES_PSIMD;
39546 for (uint32_t n = 16; n <= 24; n += 8) {
39547 for (size_t k = 1; k <= 20; k += 5) {
39548 GemmMicrokernelTester()
39549 .mr(1)
39550 .nr(8)
39551 .kr(1)
39552 .sr(1)
39553 .m(1)
39554 .n(8)
39555 .k(k)
39556 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39557 }
39558 }
39559 }
39560
39561 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8_strided_cn) {
39562 TEST_REQUIRES_PSIMD;
39563 for (uint32_t n = 16; n <= 24; n += 8) {
39564 for (size_t k = 1; k <= 20; k += 5) {
39565 GemmMicrokernelTester()
39566 .mr(1)
39567 .nr(8)
39568 .kr(1)
39569 .sr(1)
39570 .m(1)
39571 .n(n)
39572 .k(k)
39573 .cn_stride(11)
39574 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39575 }
39576 }
39577 }
39578
39579 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8_strided_a) {
39580 TEST_REQUIRES_PSIMD;
39581 for (uint32_t n = 16; n <= 24; n += 8) {
39582 for (size_t k = 1; k <= 20; k += 5) {
39583 GemmMicrokernelTester()
39584 .mr(1)
39585 .nr(8)
39586 .kr(1)
39587 .sr(1)
39588 .m(1)
39589 .n(n)
39590 .k(k)
39591 .a_stride(23)
39592 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39593 }
39594 }
39595 }
39596
39597 TEST(F32_GEMM_1X8__PSIMD_SPLAT, n_div_8_subtile) {
39598 TEST_REQUIRES_PSIMD;
39599 for (uint32_t n = 16; n <= 24; n += 8) {
39600 for (size_t k = 1; k <= 20; k += 5) {
39601 for (uint32_t m = 1; m <= 1; m++) {
39602 GemmMicrokernelTester()
39603 .mr(1)
39604 .nr(8)
39605 .kr(1)
39606 .sr(1)
39607 .m(m)
39608 .n(n)
39609 .k(k)
39610 .iterations(1)
39611 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39612 }
39613 }
39614 }
39615 }
39616
39617 TEST(F32_GEMM_1X8__PSIMD_SPLAT, strided_cm_subtile) {
39618 TEST_REQUIRES_PSIMD;
39619 for (size_t k = 1; k <= 20; k += 5) {
39620 for (uint32_t m = 1; m <= 1; m++) {
39621 for (uint32_t n = 1; n <= 8; n++) {
39622 GemmMicrokernelTester()
39623 .mr(1)
39624 .nr(8)
39625 .kr(1)
39626 .sr(1)
39627 .m(m)
39628 .n(n)
39629 .k(k)
39630 .cm_stride(11)
39631 .iterations(1)
39632 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39633 }
39634 }
39635 }
39636 }
39637
39638 TEST(F32_GEMM_1X8__PSIMD_SPLAT, qmin) {
39639 TEST_REQUIRES_PSIMD;
39640 GemmMicrokernelTester()
39641 .mr(1)
39642 .nr(8)
39643 .kr(1)
39644 .sr(1)
39645 .m(1)
39646 .n(8)
39647 .k(4)
39648 .qmin(128)
39649 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39650 }
39651
39652 TEST(F32_GEMM_1X8__PSIMD_SPLAT, qmax) {
39653 TEST_REQUIRES_PSIMD;
39654 GemmMicrokernelTester()
39655 .mr(1)
39656 .nr(8)
39657 .kr(1)
39658 .sr(1)
39659 .m(1)
39660 .n(8)
39661 .k(4)
39662 .qmax(128)
39663 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39664 }
39665
39666 TEST(F32_GEMM_1X8__PSIMD_SPLAT, strided_cm) {
39667 TEST_REQUIRES_PSIMD;
39668 GemmMicrokernelTester()
39669 .mr(1)
39670 .nr(8)
39671 .kr(1)
39672 .sr(1)
39673 .m(1)
39674 .n(8)
39675 .k(4)
39676 .cm_stride(11)
39677 .Test(xnn_f32_gemm_ukernel_1x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39678 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039679#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039680
39681
Marat Dukhan1dadbf72019-10-01 10:46:20 -070039682#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070039683 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4) {
39684 TEST_REQUIRES_PSIMD;
39685 GemmMicrokernelTester()
39686 .mr(4)
39687 .nr(8)
39688 .kr(1)
39689 .sr(1)
39690 .m(4)
39691 .n(8)
39692 .k(4)
39693 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39694 }
39695
39696 TEST(F32_GEMM_4X8__PSIMD_SPLAT, strided_cn) {
39697 TEST_REQUIRES_PSIMD;
39698 GemmMicrokernelTester()
39699 .mr(4)
39700 .nr(8)
39701 .kr(1)
39702 .sr(1)
39703 .m(4)
39704 .n(8)
39705 .k(4)
39706 .cn_stride(11)
39707 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39708 }
39709
39710 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_strided_a) {
39711 TEST_REQUIRES_PSIMD;
39712 GemmMicrokernelTester()
39713 .mr(4)
39714 .nr(8)
39715 .kr(1)
39716 .sr(1)
39717 .m(4)
39718 .n(8)
39719 .k(4)
39720 .a_stride(7)
39721 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39722 }
39723
39724 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile) {
39725 TEST_REQUIRES_PSIMD;
39726 for (uint32_t m = 1; m <= 4; m++) {
39727 for (uint32_t n = 1; n <= 8; n++) {
39728 GemmMicrokernelTester()
39729 .mr(4)
39730 .nr(8)
39731 .kr(1)
39732 .sr(1)
39733 .m(m)
39734 .n(n)
39735 .k(4)
39736 .iterations(1)
39737 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39738 }
39739 }
39740 }
39741
39742 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
39743 TEST_REQUIRES_PSIMD;
39744 for (uint32_t m = 1; m <= 4; m++) {
39745 GemmMicrokernelTester()
39746 .mr(4)
39747 .nr(8)
39748 .kr(1)
39749 .sr(1)
39750 .m(m)
39751 .n(8)
39752 .k(4)
39753 .iterations(1)
39754 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39755 }
39756 }
39757
39758 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
39759 TEST_REQUIRES_PSIMD;
39760 for (uint32_t n = 1; n <= 8; n++) {
39761 GemmMicrokernelTester()
39762 .mr(4)
39763 .nr(8)
39764 .kr(1)
39765 .sr(1)
39766 .m(4)
39767 .n(n)
39768 .k(4)
39769 .iterations(1)
39770 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39771 }
39772 }
39773
39774 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_lt_4) {
39775 TEST_REQUIRES_PSIMD;
39776 for (size_t k = 1; k < 4; k++) {
39777 GemmMicrokernelTester()
39778 .mr(4)
39779 .nr(8)
39780 .kr(1)
39781 .sr(1)
39782 .m(4)
39783 .n(8)
39784 .k(k)
39785 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39786 }
39787 }
39788
39789 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_lt_4_strided_a) {
39790 TEST_REQUIRES_PSIMD;
39791 for (size_t k = 1; k < 4; k++) {
39792 GemmMicrokernelTester()
39793 .mr(4)
39794 .nr(8)
39795 .kr(1)
39796 .sr(1)
39797 .m(4)
39798 .n(8)
39799 .k(k)
39800 .a_stride(7)
39801 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39802 }
39803 }
39804
39805 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_lt_4_subtile) {
39806 TEST_REQUIRES_PSIMD;
39807 for (size_t k = 1; k < 4; k++) {
39808 for (uint32_t m = 1; m <= 4; m++) {
39809 for (uint32_t n = 1; n <= 8; n++) {
39810 GemmMicrokernelTester()
39811 .mr(4)
39812 .nr(8)
39813 .kr(1)
39814 .sr(1)
39815 .m(m)
39816 .n(n)
39817 .k(k)
39818 .iterations(1)
39819 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39820 }
39821 }
39822 }
39823 }
39824
39825 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_gt_4) {
39826 TEST_REQUIRES_PSIMD;
39827 for (size_t k = 5; k < 8; k++) {
39828 GemmMicrokernelTester()
39829 .mr(4)
39830 .nr(8)
39831 .kr(1)
39832 .sr(1)
39833 .m(4)
39834 .n(8)
39835 .k(k)
39836 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39837 }
39838 }
39839
39840 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_gt_4_strided_a) {
39841 TEST_REQUIRES_PSIMD;
39842 for (size_t k = 5; k < 8; k++) {
39843 GemmMicrokernelTester()
39844 .mr(4)
39845 .nr(8)
39846 .kr(1)
39847 .sr(1)
39848 .m(4)
39849 .n(8)
39850 .k(k)
39851 .a_stride(11)
39852 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39853 }
39854 }
39855
39856 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_gt_4_subtile) {
39857 TEST_REQUIRES_PSIMD;
39858 for (size_t k = 5; k < 8; k++) {
39859 for (uint32_t m = 1; m <= 4; m++) {
39860 for (uint32_t n = 1; n <= 8; n++) {
39861 GemmMicrokernelTester()
39862 .mr(4)
39863 .nr(8)
39864 .kr(1)
39865 .sr(1)
39866 .m(m)
39867 .n(n)
39868 .k(k)
39869 .iterations(1)
39870 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39871 }
39872 }
39873 }
39874 }
39875
39876 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_div_4) {
39877 TEST_REQUIRES_PSIMD;
39878 for (size_t k = 8; k <= 40; k += 4) {
39879 GemmMicrokernelTester()
39880 .mr(4)
39881 .nr(8)
39882 .kr(1)
39883 .sr(1)
39884 .m(4)
39885 .n(8)
39886 .k(k)
39887 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39888 }
39889 }
39890
39891 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_div_4_strided_a) {
39892 TEST_REQUIRES_PSIMD;
39893 for (size_t k = 8; k <= 40; k += 4) {
39894 GemmMicrokernelTester()
39895 .mr(4)
39896 .nr(8)
39897 .kr(1)
39898 .sr(1)
39899 .m(4)
39900 .n(8)
39901 .k(k)
39902 .a_stride(43)
39903 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39904 }
39905 }
39906
39907 TEST(F32_GEMM_4X8__PSIMD_SPLAT, k_div_4_subtile) {
39908 TEST_REQUIRES_PSIMD;
39909 for (size_t k = 8; k <= 40; k += 4) {
39910 for (uint32_t m = 1; m <= 4; m++) {
39911 for (uint32_t n = 1; n <= 8; n++) {
39912 GemmMicrokernelTester()
39913 .mr(4)
39914 .nr(8)
39915 .kr(1)
39916 .sr(1)
39917 .m(m)
39918 .n(n)
39919 .k(k)
39920 .iterations(1)
39921 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39922 }
39923 }
39924 }
39925 }
39926
39927 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8) {
39928 TEST_REQUIRES_PSIMD;
39929 for (uint32_t n = 9; n < 16; n++) {
39930 for (size_t k = 1; k <= 20; k += 5) {
39931 GemmMicrokernelTester()
39932 .mr(4)
39933 .nr(8)
39934 .kr(1)
39935 .sr(1)
39936 .m(4)
39937 .n(8)
39938 .k(k)
39939 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39940 }
39941 }
39942 }
39943
39944 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
39945 TEST_REQUIRES_PSIMD;
39946 for (uint32_t n = 9; n < 16; n++) {
39947 for (size_t k = 1; k <= 20; k += 5) {
39948 GemmMicrokernelTester()
39949 .mr(4)
39950 .nr(8)
39951 .kr(1)
39952 .sr(1)
39953 .m(4)
39954 .n(8)
39955 .k(k)
39956 .cn_stride(11)
39957 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39958 }
39959 }
39960 }
39961
39962 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8_strided_a) {
39963 TEST_REQUIRES_PSIMD;
39964 for (uint32_t n = 9; n < 16; n++) {
39965 for (size_t k = 1; k <= 20; k += 5) {
39966 GemmMicrokernelTester()
39967 .mr(4)
39968 .nr(8)
39969 .kr(1)
39970 .sr(1)
39971 .m(4)
39972 .n(n)
39973 .k(k)
39974 .a_stride(23)
39975 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39976 }
39977 }
39978 }
39979
39980 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_gt_8_subtile) {
39981 TEST_REQUIRES_PSIMD;
39982 for (uint32_t n = 9; n < 16; n++) {
39983 for (size_t k = 1; k <= 20; k += 5) {
39984 for (uint32_t m = 1; m <= 4; m++) {
39985 GemmMicrokernelTester()
39986 .mr(4)
39987 .nr(8)
39988 .kr(1)
39989 .sr(1)
39990 .m(m)
39991 .n(n)
39992 .k(k)
39993 .iterations(1)
39994 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
39995 }
39996 }
39997 }
39998 }
39999
40000 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8) {
40001 TEST_REQUIRES_PSIMD;
40002 for (uint32_t n = 16; n <= 24; n += 8) {
40003 for (size_t k = 1; k <= 20; k += 5) {
40004 GemmMicrokernelTester()
40005 .mr(4)
40006 .nr(8)
40007 .kr(1)
40008 .sr(1)
40009 .m(4)
40010 .n(8)
40011 .k(k)
40012 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40013 }
40014 }
40015 }
40016
40017 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8_strided_cn) {
40018 TEST_REQUIRES_PSIMD;
40019 for (uint32_t n = 16; n <= 24; n += 8) {
40020 for (size_t k = 1; k <= 20; k += 5) {
40021 GemmMicrokernelTester()
40022 .mr(4)
40023 .nr(8)
40024 .kr(1)
40025 .sr(1)
40026 .m(4)
40027 .n(n)
40028 .k(k)
40029 .cn_stride(11)
40030 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40031 }
40032 }
40033 }
40034
40035 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8_strided_a) {
40036 TEST_REQUIRES_PSIMD;
40037 for (uint32_t n = 16; n <= 24; n += 8) {
40038 for (size_t k = 1; k <= 20; k += 5) {
40039 GemmMicrokernelTester()
40040 .mr(4)
40041 .nr(8)
40042 .kr(1)
40043 .sr(1)
40044 .m(4)
40045 .n(n)
40046 .k(k)
40047 .a_stride(23)
40048 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40049 }
40050 }
40051 }
40052
40053 TEST(F32_GEMM_4X8__PSIMD_SPLAT, n_div_8_subtile) {
40054 TEST_REQUIRES_PSIMD;
40055 for (uint32_t n = 16; n <= 24; n += 8) {
40056 for (size_t k = 1; k <= 20; k += 5) {
40057 for (uint32_t m = 1; m <= 4; m++) {
40058 GemmMicrokernelTester()
40059 .mr(4)
40060 .nr(8)
40061 .kr(1)
40062 .sr(1)
40063 .m(m)
40064 .n(n)
40065 .k(k)
40066 .iterations(1)
40067 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40068 }
40069 }
40070 }
40071 }
40072
40073 TEST(F32_GEMM_4X8__PSIMD_SPLAT, strided_cm_subtile) {
40074 TEST_REQUIRES_PSIMD;
40075 for (size_t k = 1; k <= 20; k += 5) {
40076 for (uint32_t m = 1; m <= 4; m++) {
40077 for (uint32_t n = 1; n <= 8; n++) {
40078 GemmMicrokernelTester()
40079 .mr(4)
40080 .nr(8)
40081 .kr(1)
40082 .sr(1)
40083 .m(m)
40084 .n(n)
40085 .k(k)
40086 .cm_stride(11)
40087 .iterations(1)
40088 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40089 }
40090 }
40091 }
40092 }
40093
40094 TEST(F32_GEMM_4X8__PSIMD_SPLAT, qmin) {
40095 TEST_REQUIRES_PSIMD;
40096 GemmMicrokernelTester()
40097 .mr(4)
40098 .nr(8)
40099 .kr(1)
40100 .sr(1)
40101 .m(4)
40102 .n(8)
40103 .k(4)
40104 .qmin(128)
40105 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40106 }
40107
40108 TEST(F32_GEMM_4X8__PSIMD_SPLAT, qmax) {
40109 TEST_REQUIRES_PSIMD;
40110 GemmMicrokernelTester()
40111 .mr(4)
40112 .nr(8)
40113 .kr(1)
40114 .sr(1)
40115 .m(4)
40116 .n(8)
40117 .k(4)
40118 .qmax(128)
40119 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40120 }
40121
40122 TEST(F32_GEMM_4X8__PSIMD_SPLAT, strided_cm) {
40123 TEST_REQUIRES_PSIMD;
40124 GemmMicrokernelTester()
40125 .mr(4)
40126 .nr(8)
40127 .kr(1)
40128 .sr(1)
40129 .m(4)
40130 .n(8)
40131 .k(4)
40132 .cm_stride(11)
40133 .Test(xnn_f32_gemm_ukernel_4x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40134 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040135#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040136
40137
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040138#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040139 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4) {
40140 TEST_REQUIRES_PSIMD;
40141 GemmMicrokernelTester()
40142 .mr(6)
40143 .nr(8)
40144 .kr(1)
40145 .sr(1)
40146 .m(6)
40147 .n(8)
40148 .k(4)
40149 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40150 }
40151
40152 TEST(F32_GEMM_6X8__PSIMD_SPLAT, strided_cn) {
40153 TEST_REQUIRES_PSIMD;
40154 GemmMicrokernelTester()
40155 .mr(6)
40156 .nr(8)
40157 .kr(1)
40158 .sr(1)
40159 .m(6)
40160 .n(8)
40161 .k(4)
40162 .cn_stride(11)
40163 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40164 }
40165
40166 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_strided_a) {
40167 TEST_REQUIRES_PSIMD;
40168 GemmMicrokernelTester()
40169 .mr(6)
40170 .nr(8)
40171 .kr(1)
40172 .sr(1)
40173 .m(6)
40174 .n(8)
40175 .k(4)
40176 .a_stride(7)
40177 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40178 }
40179
40180 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile) {
40181 TEST_REQUIRES_PSIMD;
40182 for (uint32_t m = 1; m <= 6; m++) {
40183 for (uint32_t n = 1; n <= 8; n++) {
40184 GemmMicrokernelTester()
40185 .mr(6)
40186 .nr(8)
40187 .kr(1)
40188 .sr(1)
40189 .m(m)
40190 .n(n)
40191 .k(4)
40192 .iterations(1)
40193 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40194 }
40195 }
40196 }
40197
40198 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_m) {
40199 TEST_REQUIRES_PSIMD;
40200 for (uint32_t m = 1; m <= 6; m++) {
40201 GemmMicrokernelTester()
40202 .mr(6)
40203 .nr(8)
40204 .kr(1)
40205 .sr(1)
40206 .m(m)
40207 .n(8)
40208 .k(4)
40209 .iterations(1)
40210 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40211 }
40212 }
40213
40214 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_eq_4_subtile_n) {
40215 TEST_REQUIRES_PSIMD;
40216 for (uint32_t n = 1; n <= 8; n++) {
40217 GemmMicrokernelTester()
40218 .mr(6)
40219 .nr(8)
40220 .kr(1)
40221 .sr(1)
40222 .m(6)
40223 .n(n)
40224 .k(4)
40225 .iterations(1)
40226 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40227 }
40228 }
40229
40230 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_lt_4) {
40231 TEST_REQUIRES_PSIMD;
40232 for (size_t k = 1; k < 4; k++) {
40233 GemmMicrokernelTester()
40234 .mr(6)
40235 .nr(8)
40236 .kr(1)
40237 .sr(1)
40238 .m(6)
40239 .n(8)
40240 .k(k)
40241 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40242 }
40243 }
40244
40245 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_lt_4_strided_a) {
40246 TEST_REQUIRES_PSIMD;
40247 for (size_t k = 1; k < 4; k++) {
40248 GemmMicrokernelTester()
40249 .mr(6)
40250 .nr(8)
40251 .kr(1)
40252 .sr(1)
40253 .m(6)
40254 .n(8)
40255 .k(k)
40256 .a_stride(7)
40257 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40258 }
40259 }
40260
40261 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_lt_4_subtile) {
40262 TEST_REQUIRES_PSIMD;
40263 for (size_t k = 1; k < 4; k++) {
40264 for (uint32_t m = 1; m <= 6; m++) {
40265 for (uint32_t n = 1; n <= 8; n++) {
40266 GemmMicrokernelTester()
40267 .mr(6)
40268 .nr(8)
40269 .kr(1)
40270 .sr(1)
40271 .m(m)
40272 .n(n)
40273 .k(k)
40274 .iterations(1)
40275 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40276 }
40277 }
40278 }
40279 }
40280
40281 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_gt_4) {
40282 TEST_REQUIRES_PSIMD;
40283 for (size_t k = 5; k < 8; k++) {
40284 GemmMicrokernelTester()
40285 .mr(6)
40286 .nr(8)
40287 .kr(1)
40288 .sr(1)
40289 .m(6)
40290 .n(8)
40291 .k(k)
40292 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40293 }
40294 }
40295
40296 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_gt_4_strided_a) {
40297 TEST_REQUIRES_PSIMD;
40298 for (size_t k = 5; k < 8; k++) {
40299 GemmMicrokernelTester()
40300 .mr(6)
40301 .nr(8)
40302 .kr(1)
40303 .sr(1)
40304 .m(6)
40305 .n(8)
40306 .k(k)
40307 .a_stride(11)
40308 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40309 }
40310 }
40311
40312 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_gt_4_subtile) {
40313 TEST_REQUIRES_PSIMD;
40314 for (size_t k = 5; k < 8; k++) {
40315 for (uint32_t m = 1; m <= 6; m++) {
40316 for (uint32_t n = 1; n <= 8; n++) {
40317 GemmMicrokernelTester()
40318 .mr(6)
40319 .nr(8)
40320 .kr(1)
40321 .sr(1)
40322 .m(m)
40323 .n(n)
40324 .k(k)
40325 .iterations(1)
40326 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40327 }
40328 }
40329 }
40330 }
40331
40332 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_div_4) {
40333 TEST_REQUIRES_PSIMD;
40334 for (size_t k = 8; k <= 40; k += 4) {
40335 GemmMicrokernelTester()
40336 .mr(6)
40337 .nr(8)
40338 .kr(1)
40339 .sr(1)
40340 .m(6)
40341 .n(8)
40342 .k(k)
40343 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40344 }
40345 }
40346
40347 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_div_4_strided_a) {
40348 TEST_REQUIRES_PSIMD;
40349 for (size_t k = 8; k <= 40; k += 4) {
40350 GemmMicrokernelTester()
40351 .mr(6)
40352 .nr(8)
40353 .kr(1)
40354 .sr(1)
40355 .m(6)
40356 .n(8)
40357 .k(k)
40358 .a_stride(43)
40359 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40360 }
40361 }
40362
40363 TEST(F32_GEMM_6X8__PSIMD_SPLAT, k_div_4_subtile) {
40364 TEST_REQUIRES_PSIMD;
40365 for (size_t k = 8; k <= 40; k += 4) {
40366 for (uint32_t m = 1; m <= 6; m++) {
40367 for (uint32_t n = 1; n <= 8; n++) {
40368 GemmMicrokernelTester()
40369 .mr(6)
40370 .nr(8)
40371 .kr(1)
40372 .sr(1)
40373 .m(m)
40374 .n(n)
40375 .k(k)
40376 .iterations(1)
40377 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40378 }
40379 }
40380 }
40381 }
40382
40383 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8) {
40384 TEST_REQUIRES_PSIMD;
40385 for (uint32_t n = 9; n < 16; n++) {
40386 for (size_t k = 1; k <= 20; k += 5) {
40387 GemmMicrokernelTester()
40388 .mr(6)
40389 .nr(8)
40390 .kr(1)
40391 .sr(1)
40392 .m(6)
40393 .n(8)
40394 .k(k)
40395 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40396 }
40397 }
40398 }
40399
40400 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8_strided_cn) {
40401 TEST_REQUIRES_PSIMD;
40402 for (uint32_t n = 9; n < 16; n++) {
40403 for (size_t k = 1; k <= 20; k += 5) {
40404 GemmMicrokernelTester()
40405 .mr(6)
40406 .nr(8)
40407 .kr(1)
40408 .sr(1)
40409 .m(6)
40410 .n(8)
40411 .k(k)
40412 .cn_stride(11)
40413 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40414 }
40415 }
40416 }
40417
40418 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8_strided_a) {
40419 TEST_REQUIRES_PSIMD;
40420 for (uint32_t n = 9; n < 16; n++) {
40421 for (size_t k = 1; k <= 20; k += 5) {
40422 GemmMicrokernelTester()
40423 .mr(6)
40424 .nr(8)
40425 .kr(1)
40426 .sr(1)
40427 .m(6)
40428 .n(n)
40429 .k(k)
40430 .a_stride(23)
40431 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40432 }
40433 }
40434 }
40435
40436 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_gt_8_subtile) {
40437 TEST_REQUIRES_PSIMD;
40438 for (uint32_t n = 9; n < 16; n++) {
40439 for (size_t k = 1; k <= 20; k += 5) {
40440 for (uint32_t m = 1; m <= 6; m++) {
40441 GemmMicrokernelTester()
40442 .mr(6)
40443 .nr(8)
40444 .kr(1)
40445 .sr(1)
40446 .m(m)
40447 .n(n)
40448 .k(k)
40449 .iterations(1)
40450 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40451 }
40452 }
40453 }
40454 }
40455
40456 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8) {
40457 TEST_REQUIRES_PSIMD;
40458 for (uint32_t n = 16; n <= 24; n += 8) {
40459 for (size_t k = 1; k <= 20; k += 5) {
40460 GemmMicrokernelTester()
40461 .mr(6)
40462 .nr(8)
40463 .kr(1)
40464 .sr(1)
40465 .m(6)
40466 .n(8)
40467 .k(k)
40468 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40469 }
40470 }
40471 }
40472
40473 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8_strided_cn) {
40474 TEST_REQUIRES_PSIMD;
40475 for (uint32_t n = 16; n <= 24; n += 8) {
40476 for (size_t k = 1; k <= 20; k += 5) {
40477 GemmMicrokernelTester()
40478 .mr(6)
40479 .nr(8)
40480 .kr(1)
40481 .sr(1)
40482 .m(6)
40483 .n(n)
40484 .k(k)
40485 .cn_stride(11)
40486 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40487 }
40488 }
40489 }
40490
40491 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8_strided_a) {
40492 TEST_REQUIRES_PSIMD;
40493 for (uint32_t n = 16; n <= 24; n += 8) {
40494 for (size_t k = 1; k <= 20; k += 5) {
40495 GemmMicrokernelTester()
40496 .mr(6)
40497 .nr(8)
40498 .kr(1)
40499 .sr(1)
40500 .m(6)
40501 .n(n)
40502 .k(k)
40503 .a_stride(23)
40504 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40505 }
40506 }
40507 }
40508
40509 TEST(F32_GEMM_6X8__PSIMD_SPLAT, n_div_8_subtile) {
40510 TEST_REQUIRES_PSIMD;
40511 for (uint32_t n = 16; n <= 24; n += 8) {
40512 for (size_t k = 1; k <= 20; k += 5) {
40513 for (uint32_t m = 1; m <= 6; m++) {
40514 GemmMicrokernelTester()
40515 .mr(6)
40516 .nr(8)
40517 .kr(1)
40518 .sr(1)
40519 .m(m)
40520 .n(n)
40521 .k(k)
40522 .iterations(1)
40523 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40524 }
40525 }
40526 }
40527 }
40528
40529 TEST(F32_GEMM_6X8__PSIMD_SPLAT, strided_cm_subtile) {
40530 TEST_REQUIRES_PSIMD;
40531 for (size_t k = 1; k <= 20; k += 5) {
40532 for (uint32_t m = 1; m <= 6; m++) {
40533 for (uint32_t n = 1; n <= 8; n++) {
40534 GemmMicrokernelTester()
40535 .mr(6)
40536 .nr(8)
40537 .kr(1)
40538 .sr(1)
40539 .m(m)
40540 .n(n)
40541 .k(k)
40542 .cm_stride(11)
40543 .iterations(1)
40544 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40545 }
40546 }
40547 }
40548 }
40549
40550 TEST(F32_GEMM_6X8__PSIMD_SPLAT, qmin) {
40551 TEST_REQUIRES_PSIMD;
40552 GemmMicrokernelTester()
40553 .mr(6)
40554 .nr(8)
40555 .kr(1)
40556 .sr(1)
40557 .m(6)
40558 .n(8)
40559 .k(4)
40560 .qmin(128)
40561 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40562 }
40563
40564 TEST(F32_GEMM_6X8__PSIMD_SPLAT, qmax) {
40565 TEST_REQUIRES_PSIMD;
40566 GemmMicrokernelTester()
40567 .mr(6)
40568 .nr(8)
40569 .kr(1)
40570 .sr(1)
40571 .m(6)
40572 .n(8)
40573 .k(4)
40574 .qmax(128)
40575 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40576 }
40577
40578 TEST(F32_GEMM_6X8__PSIMD_SPLAT, strided_cm) {
40579 TEST_REQUIRES_PSIMD;
40580 GemmMicrokernelTester()
40581 .mr(6)
40582 .nr(8)
40583 .kr(1)
40584 .sr(1)
40585 .m(6)
40586 .n(8)
40587 .k(4)
40588 .cm_stride(11)
40589 .Test(xnn_f32_gemm_ukernel_6x8__psimd_splat, GemmMicrokernelTester::Variant::Scalar);
40590 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040591#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040592
40593
Marat Dukhan1dadbf72019-10-01 10:46:20 -070040594#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070040595 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4) {
40596 TEST_REQUIRES_PSIMD;
40597 GemmMicrokernelTester()
40598 .mr(1)
40599 .nr(8)
40600 .kr(1)
40601 .sr(4)
40602 .m(1)
40603 .n(8)
40604 .k(4)
40605 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40606 }
40607
40608 TEST(F32_GEMM_1X8S4__PSIMD, strided_cn) {
40609 TEST_REQUIRES_PSIMD;
40610 GemmMicrokernelTester()
40611 .mr(1)
40612 .nr(8)
40613 .kr(1)
40614 .sr(4)
40615 .m(1)
40616 .n(8)
40617 .k(4)
40618 .cn_stride(11)
40619 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40620 }
40621
40622 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_strided_a) {
40623 TEST_REQUIRES_PSIMD;
40624 GemmMicrokernelTester()
40625 .mr(1)
40626 .nr(8)
40627 .kr(1)
40628 .sr(4)
40629 .m(1)
40630 .n(8)
40631 .k(4)
40632 .a_stride(7)
40633 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40634 }
40635
40636 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_subtile) {
40637 TEST_REQUIRES_PSIMD;
40638 for (uint32_t m = 1; m <= 1; m++) {
40639 for (uint32_t n = 1; n <= 8; n++) {
40640 GemmMicrokernelTester()
40641 .mr(1)
40642 .nr(8)
40643 .kr(1)
40644 .sr(4)
40645 .m(m)
40646 .n(n)
40647 .k(4)
40648 .iterations(1)
40649 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40650 }
40651 }
40652 }
40653
40654 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_subtile_m) {
40655 TEST_REQUIRES_PSIMD;
40656 for (uint32_t m = 1; m <= 1; m++) {
40657 GemmMicrokernelTester()
40658 .mr(1)
40659 .nr(8)
40660 .kr(1)
40661 .sr(4)
40662 .m(m)
40663 .n(8)
40664 .k(4)
40665 .iterations(1)
40666 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40667 }
40668 }
40669
40670 TEST(F32_GEMM_1X8S4__PSIMD, k_eq_4_subtile_n) {
40671 TEST_REQUIRES_PSIMD;
40672 for (uint32_t n = 1; n <= 8; n++) {
40673 GemmMicrokernelTester()
40674 .mr(1)
40675 .nr(8)
40676 .kr(1)
40677 .sr(4)
40678 .m(1)
40679 .n(n)
40680 .k(4)
40681 .iterations(1)
40682 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40683 }
40684 }
40685
40686 TEST(F32_GEMM_1X8S4__PSIMD, k_lt_4) {
40687 TEST_REQUIRES_PSIMD;
40688 for (size_t k = 1; k < 4; k++) {
40689 GemmMicrokernelTester()
40690 .mr(1)
40691 .nr(8)
40692 .kr(1)
40693 .sr(4)
40694 .m(1)
40695 .n(8)
40696 .k(k)
40697 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40698 }
40699 }
40700
40701 TEST(F32_GEMM_1X8S4__PSIMD, k_lt_4_strided_a) {
40702 TEST_REQUIRES_PSIMD;
40703 for (size_t k = 1; k < 4; k++) {
40704 GemmMicrokernelTester()
40705 .mr(1)
40706 .nr(8)
40707 .kr(1)
40708 .sr(4)
40709 .m(1)
40710 .n(8)
40711 .k(k)
40712 .a_stride(7)
40713 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40714 }
40715 }
40716
40717 TEST(F32_GEMM_1X8S4__PSIMD, k_lt_4_subtile) {
40718 TEST_REQUIRES_PSIMD;
40719 for (size_t k = 1; k < 4; k++) {
40720 for (uint32_t m = 1; m <= 1; m++) {
40721 for (uint32_t n = 1; n <= 8; n++) {
40722 GemmMicrokernelTester()
40723 .mr(1)
40724 .nr(8)
40725 .kr(1)
40726 .sr(4)
40727 .m(m)
40728 .n(n)
40729 .k(k)
40730 .iterations(1)
40731 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40732 }
40733 }
40734 }
40735 }
40736
40737 TEST(F32_GEMM_1X8S4__PSIMD, k_gt_4) {
40738 TEST_REQUIRES_PSIMD;
40739 for (size_t k = 5; k < 8; k++) {
40740 GemmMicrokernelTester()
40741 .mr(1)
40742 .nr(8)
40743 .kr(1)
40744 .sr(4)
40745 .m(1)
40746 .n(8)
40747 .k(k)
40748 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40749 }
40750 }
40751
40752 TEST(F32_GEMM_1X8S4__PSIMD, k_gt_4_strided_a) {
40753 TEST_REQUIRES_PSIMD;
40754 for (size_t k = 5; k < 8; k++) {
40755 GemmMicrokernelTester()
40756 .mr(1)
40757 .nr(8)
40758 .kr(1)
40759 .sr(4)
40760 .m(1)
40761 .n(8)
40762 .k(k)
40763 .a_stride(11)
40764 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40765 }
40766 }
40767
40768 TEST(F32_GEMM_1X8S4__PSIMD, k_gt_4_subtile) {
40769 TEST_REQUIRES_PSIMD;
40770 for (size_t k = 5; k < 8; k++) {
40771 for (uint32_t m = 1; m <= 1; m++) {
40772 for (uint32_t n = 1; n <= 8; n++) {
40773 GemmMicrokernelTester()
40774 .mr(1)
40775 .nr(8)
40776 .kr(1)
40777 .sr(4)
40778 .m(m)
40779 .n(n)
40780 .k(k)
40781 .iterations(1)
40782 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40783 }
40784 }
40785 }
40786 }
40787
40788 TEST(F32_GEMM_1X8S4__PSIMD, k_div_4) {
40789 TEST_REQUIRES_PSIMD;
40790 for (size_t k = 8; k <= 40; k += 4) {
40791 GemmMicrokernelTester()
40792 .mr(1)
40793 .nr(8)
40794 .kr(1)
40795 .sr(4)
40796 .m(1)
40797 .n(8)
40798 .k(k)
40799 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40800 }
40801 }
40802
40803 TEST(F32_GEMM_1X8S4__PSIMD, k_div_4_strided_a) {
40804 TEST_REQUIRES_PSIMD;
40805 for (size_t k = 8; k <= 40; k += 4) {
40806 GemmMicrokernelTester()
40807 .mr(1)
40808 .nr(8)
40809 .kr(1)
40810 .sr(4)
40811 .m(1)
40812 .n(8)
40813 .k(k)
40814 .a_stride(43)
40815 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40816 }
40817 }
40818
40819 TEST(F32_GEMM_1X8S4__PSIMD, k_div_4_subtile) {
40820 TEST_REQUIRES_PSIMD;
40821 for (size_t k = 8; k <= 40; k += 4) {
40822 for (uint32_t m = 1; m <= 1; m++) {
40823 for (uint32_t n = 1; n <= 8; n++) {
40824 GemmMicrokernelTester()
40825 .mr(1)
40826 .nr(8)
40827 .kr(1)
40828 .sr(4)
40829 .m(m)
40830 .n(n)
40831 .k(k)
40832 .iterations(1)
40833 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40834 }
40835 }
40836 }
40837 }
40838
40839 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8) {
40840 TEST_REQUIRES_PSIMD;
40841 for (uint32_t n = 9; n < 16; n++) {
40842 for (size_t k = 1; k <= 20; k += 5) {
40843 GemmMicrokernelTester()
40844 .mr(1)
40845 .nr(8)
40846 .kr(1)
40847 .sr(4)
40848 .m(1)
40849 .n(8)
40850 .k(k)
40851 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40852 }
40853 }
40854 }
40855
40856 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8_strided_cn) {
40857 TEST_REQUIRES_PSIMD;
40858 for (uint32_t n = 9; n < 16; n++) {
40859 for (size_t k = 1; k <= 20; k += 5) {
40860 GemmMicrokernelTester()
40861 .mr(1)
40862 .nr(8)
40863 .kr(1)
40864 .sr(4)
40865 .m(1)
40866 .n(8)
40867 .k(k)
40868 .cn_stride(11)
40869 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40870 }
40871 }
40872 }
40873
40874 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8_strided_a) {
40875 TEST_REQUIRES_PSIMD;
40876 for (uint32_t n = 9; n < 16; n++) {
40877 for (size_t k = 1; k <= 20; k += 5) {
40878 GemmMicrokernelTester()
40879 .mr(1)
40880 .nr(8)
40881 .kr(1)
40882 .sr(4)
40883 .m(1)
40884 .n(n)
40885 .k(k)
40886 .a_stride(23)
40887 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40888 }
40889 }
40890 }
40891
40892 TEST(F32_GEMM_1X8S4__PSIMD, n_gt_8_subtile) {
40893 TEST_REQUIRES_PSIMD;
40894 for (uint32_t n = 9; n < 16; n++) {
40895 for (size_t k = 1; k <= 20; k += 5) {
40896 for (uint32_t m = 1; m <= 1; m++) {
40897 GemmMicrokernelTester()
40898 .mr(1)
40899 .nr(8)
40900 .kr(1)
40901 .sr(4)
40902 .m(m)
40903 .n(n)
40904 .k(k)
40905 .iterations(1)
40906 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40907 }
40908 }
40909 }
40910 }
40911
40912 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8) {
40913 TEST_REQUIRES_PSIMD;
40914 for (uint32_t n = 16; n <= 24; n += 8) {
40915 for (size_t k = 1; k <= 20; k += 5) {
40916 GemmMicrokernelTester()
40917 .mr(1)
40918 .nr(8)
40919 .kr(1)
40920 .sr(4)
40921 .m(1)
40922 .n(8)
40923 .k(k)
40924 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40925 }
40926 }
40927 }
40928
40929 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8_strided_cn) {
40930 TEST_REQUIRES_PSIMD;
40931 for (uint32_t n = 16; n <= 24; n += 8) {
40932 for (size_t k = 1; k <= 20; k += 5) {
40933 GemmMicrokernelTester()
40934 .mr(1)
40935 .nr(8)
40936 .kr(1)
40937 .sr(4)
40938 .m(1)
40939 .n(n)
40940 .k(k)
40941 .cn_stride(11)
40942 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40943 }
40944 }
40945 }
40946
40947 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8_strided_a) {
40948 TEST_REQUIRES_PSIMD;
40949 for (uint32_t n = 16; n <= 24; n += 8) {
40950 for (size_t k = 1; k <= 20; k += 5) {
40951 GemmMicrokernelTester()
40952 .mr(1)
40953 .nr(8)
40954 .kr(1)
40955 .sr(4)
40956 .m(1)
40957 .n(n)
40958 .k(k)
40959 .a_stride(23)
40960 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40961 }
40962 }
40963 }
40964
40965 TEST(F32_GEMM_1X8S4__PSIMD, n_div_8_subtile) {
40966 TEST_REQUIRES_PSIMD;
40967 for (uint32_t n = 16; n <= 24; n += 8) {
40968 for (size_t k = 1; k <= 20; k += 5) {
40969 for (uint32_t m = 1; m <= 1; m++) {
40970 GemmMicrokernelTester()
40971 .mr(1)
40972 .nr(8)
40973 .kr(1)
40974 .sr(4)
40975 .m(m)
40976 .n(n)
40977 .k(k)
40978 .iterations(1)
40979 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
40980 }
40981 }
40982 }
40983 }
40984
40985 TEST(F32_GEMM_1X8S4__PSIMD, strided_cm_subtile) {
40986 TEST_REQUIRES_PSIMD;
40987 for (size_t k = 1; k <= 20; k += 5) {
40988 for (uint32_t m = 1; m <= 1; m++) {
40989 for (uint32_t n = 1; n <= 8; n++) {
40990 GemmMicrokernelTester()
40991 .mr(1)
40992 .nr(8)
40993 .kr(1)
40994 .sr(4)
40995 .m(m)
40996 .n(n)
40997 .k(k)
40998 .cm_stride(11)
40999 .iterations(1)
41000 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41001 }
41002 }
41003 }
41004 }
41005
41006 TEST(F32_GEMM_1X8S4__PSIMD, qmin) {
41007 TEST_REQUIRES_PSIMD;
41008 GemmMicrokernelTester()
41009 .mr(1)
41010 .nr(8)
41011 .kr(1)
41012 .sr(4)
41013 .m(1)
41014 .n(8)
41015 .k(4)
41016 .qmin(128)
41017 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41018 }
41019
41020 TEST(F32_GEMM_1X8S4__PSIMD, qmax) {
41021 TEST_REQUIRES_PSIMD;
41022 GemmMicrokernelTester()
41023 .mr(1)
41024 .nr(8)
41025 .kr(1)
41026 .sr(4)
41027 .m(1)
41028 .n(8)
41029 .k(4)
41030 .qmax(128)
41031 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41032 }
41033
41034 TEST(F32_GEMM_1X8S4__PSIMD, strided_cm) {
41035 TEST_REQUIRES_PSIMD;
41036 GemmMicrokernelTester()
41037 .mr(1)
41038 .nr(8)
41039 .kr(1)
41040 .sr(4)
41041 .m(1)
41042 .n(8)
41043 .k(4)
41044 .cm_stride(11)
41045 .Test(xnn_f32_gemm_ukernel_1x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41046 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041047#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041048
41049
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041050#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041051 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4) {
41052 TEST_REQUIRES_PSIMD;
41053 GemmMicrokernelTester()
41054 .mr(4)
41055 .nr(8)
41056 .kr(1)
41057 .sr(4)
41058 .m(4)
41059 .n(8)
41060 .k(4)
41061 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41062 }
41063
41064 TEST(F32_GEMM_4X8S4__PSIMD, strided_cn) {
41065 TEST_REQUIRES_PSIMD;
41066 GemmMicrokernelTester()
41067 .mr(4)
41068 .nr(8)
41069 .kr(1)
41070 .sr(4)
41071 .m(4)
41072 .n(8)
41073 .k(4)
41074 .cn_stride(11)
41075 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41076 }
41077
41078 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_strided_a) {
41079 TEST_REQUIRES_PSIMD;
41080 GemmMicrokernelTester()
41081 .mr(4)
41082 .nr(8)
41083 .kr(1)
41084 .sr(4)
41085 .m(4)
41086 .n(8)
41087 .k(4)
41088 .a_stride(7)
41089 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41090 }
41091
41092 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_subtile) {
41093 TEST_REQUIRES_PSIMD;
41094 for (uint32_t m = 1; m <= 4; m++) {
41095 for (uint32_t n = 1; n <= 8; n++) {
41096 GemmMicrokernelTester()
41097 .mr(4)
41098 .nr(8)
41099 .kr(1)
41100 .sr(4)
41101 .m(m)
41102 .n(n)
41103 .k(4)
41104 .iterations(1)
41105 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41106 }
41107 }
41108 }
41109
41110 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_subtile_m) {
41111 TEST_REQUIRES_PSIMD;
41112 for (uint32_t m = 1; m <= 4; m++) {
41113 GemmMicrokernelTester()
41114 .mr(4)
41115 .nr(8)
41116 .kr(1)
41117 .sr(4)
41118 .m(m)
41119 .n(8)
41120 .k(4)
41121 .iterations(1)
41122 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41123 }
41124 }
41125
41126 TEST(F32_GEMM_4X8S4__PSIMD, k_eq_4_subtile_n) {
41127 TEST_REQUIRES_PSIMD;
41128 for (uint32_t n = 1; n <= 8; n++) {
41129 GemmMicrokernelTester()
41130 .mr(4)
41131 .nr(8)
41132 .kr(1)
41133 .sr(4)
41134 .m(4)
41135 .n(n)
41136 .k(4)
41137 .iterations(1)
41138 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41139 }
41140 }
41141
41142 TEST(F32_GEMM_4X8S4__PSIMD, k_lt_4) {
41143 TEST_REQUIRES_PSIMD;
41144 for (size_t k = 1; k < 4; k++) {
41145 GemmMicrokernelTester()
41146 .mr(4)
41147 .nr(8)
41148 .kr(1)
41149 .sr(4)
41150 .m(4)
41151 .n(8)
41152 .k(k)
41153 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41154 }
41155 }
41156
41157 TEST(F32_GEMM_4X8S4__PSIMD, k_lt_4_strided_a) {
41158 TEST_REQUIRES_PSIMD;
41159 for (size_t k = 1; k < 4; k++) {
41160 GemmMicrokernelTester()
41161 .mr(4)
41162 .nr(8)
41163 .kr(1)
41164 .sr(4)
41165 .m(4)
41166 .n(8)
41167 .k(k)
41168 .a_stride(7)
41169 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41170 }
41171 }
41172
41173 TEST(F32_GEMM_4X8S4__PSIMD, k_lt_4_subtile) {
41174 TEST_REQUIRES_PSIMD;
41175 for (size_t k = 1; k < 4; k++) {
41176 for (uint32_t m = 1; m <= 4; m++) {
41177 for (uint32_t n = 1; n <= 8; n++) {
41178 GemmMicrokernelTester()
41179 .mr(4)
41180 .nr(8)
41181 .kr(1)
41182 .sr(4)
41183 .m(m)
41184 .n(n)
41185 .k(k)
41186 .iterations(1)
41187 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41188 }
41189 }
41190 }
41191 }
41192
41193 TEST(F32_GEMM_4X8S4__PSIMD, k_gt_4) {
41194 TEST_REQUIRES_PSIMD;
41195 for (size_t k = 5; k < 8; k++) {
41196 GemmMicrokernelTester()
41197 .mr(4)
41198 .nr(8)
41199 .kr(1)
41200 .sr(4)
41201 .m(4)
41202 .n(8)
41203 .k(k)
41204 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41205 }
41206 }
41207
41208 TEST(F32_GEMM_4X8S4__PSIMD, k_gt_4_strided_a) {
41209 TEST_REQUIRES_PSIMD;
41210 for (size_t k = 5; k < 8; k++) {
41211 GemmMicrokernelTester()
41212 .mr(4)
41213 .nr(8)
41214 .kr(1)
41215 .sr(4)
41216 .m(4)
41217 .n(8)
41218 .k(k)
41219 .a_stride(11)
41220 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41221 }
41222 }
41223
41224 TEST(F32_GEMM_4X8S4__PSIMD, k_gt_4_subtile) {
41225 TEST_REQUIRES_PSIMD;
41226 for (size_t k = 5; k < 8; k++) {
41227 for (uint32_t m = 1; m <= 4; m++) {
41228 for (uint32_t n = 1; n <= 8; n++) {
41229 GemmMicrokernelTester()
41230 .mr(4)
41231 .nr(8)
41232 .kr(1)
41233 .sr(4)
41234 .m(m)
41235 .n(n)
41236 .k(k)
41237 .iterations(1)
41238 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41239 }
41240 }
41241 }
41242 }
41243
41244 TEST(F32_GEMM_4X8S4__PSIMD, k_div_4) {
41245 TEST_REQUIRES_PSIMD;
41246 for (size_t k = 8; k <= 40; k += 4) {
41247 GemmMicrokernelTester()
41248 .mr(4)
41249 .nr(8)
41250 .kr(1)
41251 .sr(4)
41252 .m(4)
41253 .n(8)
41254 .k(k)
41255 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41256 }
41257 }
41258
41259 TEST(F32_GEMM_4X8S4__PSIMD, k_div_4_strided_a) {
41260 TEST_REQUIRES_PSIMD;
41261 for (size_t k = 8; k <= 40; k += 4) {
41262 GemmMicrokernelTester()
41263 .mr(4)
41264 .nr(8)
41265 .kr(1)
41266 .sr(4)
41267 .m(4)
41268 .n(8)
41269 .k(k)
41270 .a_stride(43)
41271 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41272 }
41273 }
41274
41275 TEST(F32_GEMM_4X8S4__PSIMD, k_div_4_subtile) {
41276 TEST_REQUIRES_PSIMD;
41277 for (size_t k = 8; k <= 40; k += 4) {
41278 for (uint32_t m = 1; m <= 4; m++) {
41279 for (uint32_t n = 1; n <= 8; n++) {
41280 GemmMicrokernelTester()
41281 .mr(4)
41282 .nr(8)
41283 .kr(1)
41284 .sr(4)
41285 .m(m)
41286 .n(n)
41287 .k(k)
41288 .iterations(1)
41289 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41290 }
41291 }
41292 }
41293 }
41294
41295 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8) {
41296 TEST_REQUIRES_PSIMD;
41297 for (uint32_t n = 9; n < 16; n++) {
41298 for (size_t k = 1; k <= 20; k += 5) {
41299 GemmMicrokernelTester()
41300 .mr(4)
41301 .nr(8)
41302 .kr(1)
41303 .sr(4)
41304 .m(4)
41305 .n(8)
41306 .k(k)
41307 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41308 }
41309 }
41310 }
41311
41312 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8_strided_cn) {
41313 TEST_REQUIRES_PSIMD;
41314 for (uint32_t n = 9; n < 16; n++) {
41315 for (size_t k = 1; k <= 20; k += 5) {
41316 GemmMicrokernelTester()
41317 .mr(4)
41318 .nr(8)
41319 .kr(1)
41320 .sr(4)
41321 .m(4)
41322 .n(8)
41323 .k(k)
41324 .cn_stride(11)
41325 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41326 }
41327 }
41328 }
41329
41330 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8_strided_a) {
41331 TEST_REQUIRES_PSIMD;
41332 for (uint32_t n = 9; n < 16; n++) {
41333 for (size_t k = 1; k <= 20; k += 5) {
41334 GemmMicrokernelTester()
41335 .mr(4)
41336 .nr(8)
41337 .kr(1)
41338 .sr(4)
41339 .m(4)
41340 .n(n)
41341 .k(k)
41342 .a_stride(23)
41343 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41344 }
41345 }
41346 }
41347
41348 TEST(F32_GEMM_4X8S4__PSIMD, n_gt_8_subtile) {
41349 TEST_REQUIRES_PSIMD;
41350 for (uint32_t n = 9; n < 16; n++) {
41351 for (size_t k = 1; k <= 20; k += 5) {
41352 for (uint32_t m = 1; m <= 4; m++) {
41353 GemmMicrokernelTester()
41354 .mr(4)
41355 .nr(8)
41356 .kr(1)
41357 .sr(4)
41358 .m(m)
41359 .n(n)
41360 .k(k)
41361 .iterations(1)
41362 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41363 }
41364 }
41365 }
41366 }
41367
41368 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8) {
41369 TEST_REQUIRES_PSIMD;
41370 for (uint32_t n = 16; n <= 24; n += 8) {
41371 for (size_t k = 1; k <= 20; k += 5) {
41372 GemmMicrokernelTester()
41373 .mr(4)
41374 .nr(8)
41375 .kr(1)
41376 .sr(4)
41377 .m(4)
41378 .n(8)
41379 .k(k)
41380 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41381 }
41382 }
41383 }
41384
41385 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8_strided_cn) {
41386 TEST_REQUIRES_PSIMD;
41387 for (uint32_t n = 16; n <= 24; n += 8) {
41388 for (size_t k = 1; k <= 20; k += 5) {
41389 GemmMicrokernelTester()
41390 .mr(4)
41391 .nr(8)
41392 .kr(1)
41393 .sr(4)
41394 .m(4)
41395 .n(n)
41396 .k(k)
41397 .cn_stride(11)
41398 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41399 }
41400 }
41401 }
41402
41403 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8_strided_a) {
41404 TEST_REQUIRES_PSIMD;
41405 for (uint32_t n = 16; n <= 24; n += 8) {
41406 for (size_t k = 1; k <= 20; k += 5) {
41407 GemmMicrokernelTester()
41408 .mr(4)
41409 .nr(8)
41410 .kr(1)
41411 .sr(4)
41412 .m(4)
41413 .n(n)
41414 .k(k)
41415 .a_stride(23)
41416 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41417 }
41418 }
41419 }
41420
41421 TEST(F32_GEMM_4X8S4__PSIMD, n_div_8_subtile) {
41422 TEST_REQUIRES_PSIMD;
41423 for (uint32_t n = 16; n <= 24; n += 8) {
41424 for (size_t k = 1; k <= 20; k += 5) {
41425 for (uint32_t m = 1; m <= 4; m++) {
41426 GemmMicrokernelTester()
41427 .mr(4)
41428 .nr(8)
41429 .kr(1)
41430 .sr(4)
41431 .m(m)
41432 .n(n)
41433 .k(k)
41434 .iterations(1)
41435 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41436 }
41437 }
41438 }
41439 }
41440
41441 TEST(F32_GEMM_4X8S4__PSIMD, strided_cm_subtile) {
41442 TEST_REQUIRES_PSIMD;
41443 for (size_t k = 1; k <= 20; k += 5) {
41444 for (uint32_t m = 1; m <= 4; m++) {
41445 for (uint32_t n = 1; n <= 8; n++) {
41446 GemmMicrokernelTester()
41447 .mr(4)
41448 .nr(8)
41449 .kr(1)
41450 .sr(4)
41451 .m(m)
41452 .n(n)
41453 .k(k)
41454 .cm_stride(11)
41455 .iterations(1)
41456 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41457 }
41458 }
41459 }
41460 }
41461
41462 TEST(F32_GEMM_4X8S4__PSIMD, qmin) {
41463 TEST_REQUIRES_PSIMD;
41464 GemmMicrokernelTester()
41465 .mr(4)
41466 .nr(8)
41467 .kr(1)
41468 .sr(4)
41469 .m(4)
41470 .n(8)
41471 .k(4)
41472 .qmin(128)
41473 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41474 }
41475
41476 TEST(F32_GEMM_4X8S4__PSIMD, qmax) {
41477 TEST_REQUIRES_PSIMD;
41478 GemmMicrokernelTester()
41479 .mr(4)
41480 .nr(8)
41481 .kr(1)
41482 .sr(4)
41483 .m(4)
41484 .n(8)
41485 .k(4)
41486 .qmax(128)
41487 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41488 }
41489
41490 TEST(F32_GEMM_4X8S4__PSIMD, strided_cm) {
41491 TEST_REQUIRES_PSIMD;
41492 GemmMicrokernelTester()
41493 .mr(4)
41494 .nr(8)
41495 .kr(1)
41496 .sr(4)
41497 .m(4)
41498 .n(8)
41499 .k(4)
41500 .cm_stride(11)
41501 .Test(xnn_f32_gemm_ukernel_4x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41502 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041503#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041504
41505
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041506#if !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041507 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4) {
41508 TEST_REQUIRES_PSIMD;
41509 GemmMicrokernelTester()
41510 .mr(6)
41511 .nr(8)
41512 .kr(1)
41513 .sr(4)
41514 .m(6)
41515 .n(8)
41516 .k(4)
41517 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41518 }
41519
41520 TEST(F32_GEMM_6X8S4__PSIMD, strided_cn) {
41521 TEST_REQUIRES_PSIMD;
41522 GemmMicrokernelTester()
41523 .mr(6)
41524 .nr(8)
41525 .kr(1)
41526 .sr(4)
41527 .m(6)
41528 .n(8)
41529 .k(4)
41530 .cn_stride(11)
41531 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41532 }
41533
41534 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_strided_a) {
41535 TEST_REQUIRES_PSIMD;
41536 GemmMicrokernelTester()
41537 .mr(6)
41538 .nr(8)
41539 .kr(1)
41540 .sr(4)
41541 .m(6)
41542 .n(8)
41543 .k(4)
41544 .a_stride(7)
41545 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41546 }
41547
41548 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_subtile) {
41549 TEST_REQUIRES_PSIMD;
41550 for (uint32_t m = 1; m <= 6; m++) {
41551 for (uint32_t n = 1; n <= 8; n++) {
41552 GemmMicrokernelTester()
41553 .mr(6)
41554 .nr(8)
41555 .kr(1)
41556 .sr(4)
41557 .m(m)
41558 .n(n)
41559 .k(4)
41560 .iterations(1)
41561 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41562 }
41563 }
41564 }
41565
41566 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_subtile_m) {
41567 TEST_REQUIRES_PSIMD;
41568 for (uint32_t m = 1; m <= 6; m++) {
41569 GemmMicrokernelTester()
41570 .mr(6)
41571 .nr(8)
41572 .kr(1)
41573 .sr(4)
41574 .m(m)
41575 .n(8)
41576 .k(4)
41577 .iterations(1)
41578 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41579 }
41580 }
41581
41582 TEST(F32_GEMM_6X8S4__PSIMD, k_eq_4_subtile_n) {
41583 TEST_REQUIRES_PSIMD;
41584 for (uint32_t n = 1; n <= 8; n++) {
41585 GemmMicrokernelTester()
41586 .mr(6)
41587 .nr(8)
41588 .kr(1)
41589 .sr(4)
41590 .m(6)
41591 .n(n)
41592 .k(4)
41593 .iterations(1)
41594 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41595 }
41596 }
41597
41598 TEST(F32_GEMM_6X8S4__PSIMD, k_lt_4) {
41599 TEST_REQUIRES_PSIMD;
41600 for (size_t k = 1; k < 4; k++) {
41601 GemmMicrokernelTester()
41602 .mr(6)
41603 .nr(8)
41604 .kr(1)
41605 .sr(4)
41606 .m(6)
41607 .n(8)
41608 .k(k)
41609 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41610 }
41611 }
41612
41613 TEST(F32_GEMM_6X8S4__PSIMD, k_lt_4_strided_a) {
41614 TEST_REQUIRES_PSIMD;
41615 for (size_t k = 1; k < 4; k++) {
41616 GemmMicrokernelTester()
41617 .mr(6)
41618 .nr(8)
41619 .kr(1)
41620 .sr(4)
41621 .m(6)
41622 .n(8)
41623 .k(k)
41624 .a_stride(7)
41625 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41626 }
41627 }
41628
41629 TEST(F32_GEMM_6X8S4__PSIMD, k_lt_4_subtile) {
41630 TEST_REQUIRES_PSIMD;
41631 for (size_t k = 1; k < 4; k++) {
41632 for (uint32_t m = 1; m <= 6; m++) {
41633 for (uint32_t n = 1; n <= 8; n++) {
41634 GemmMicrokernelTester()
41635 .mr(6)
41636 .nr(8)
41637 .kr(1)
41638 .sr(4)
41639 .m(m)
41640 .n(n)
41641 .k(k)
41642 .iterations(1)
41643 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41644 }
41645 }
41646 }
41647 }
41648
41649 TEST(F32_GEMM_6X8S4__PSIMD, k_gt_4) {
41650 TEST_REQUIRES_PSIMD;
41651 for (size_t k = 5; k < 8; k++) {
41652 GemmMicrokernelTester()
41653 .mr(6)
41654 .nr(8)
41655 .kr(1)
41656 .sr(4)
41657 .m(6)
41658 .n(8)
41659 .k(k)
41660 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41661 }
41662 }
41663
41664 TEST(F32_GEMM_6X8S4__PSIMD, k_gt_4_strided_a) {
41665 TEST_REQUIRES_PSIMD;
41666 for (size_t k = 5; k < 8; k++) {
41667 GemmMicrokernelTester()
41668 .mr(6)
41669 .nr(8)
41670 .kr(1)
41671 .sr(4)
41672 .m(6)
41673 .n(8)
41674 .k(k)
41675 .a_stride(11)
41676 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41677 }
41678 }
41679
41680 TEST(F32_GEMM_6X8S4__PSIMD, k_gt_4_subtile) {
41681 TEST_REQUIRES_PSIMD;
41682 for (size_t k = 5; k < 8; k++) {
41683 for (uint32_t m = 1; m <= 6; m++) {
41684 for (uint32_t n = 1; n <= 8; n++) {
41685 GemmMicrokernelTester()
41686 .mr(6)
41687 .nr(8)
41688 .kr(1)
41689 .sr(4)
41690 .m(m)
41691 .n(n)
41692 .k(k)
41693 .iterations(1)
41694 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41695 }
41696 }
41697 }
41698 }
41699
41700 TEST(F32_GEMM_6X8S4__PSIMD, k_div_4) {
41701 TEST_REQUIRES_PSIMD;
41702 for (size_t k = 8; k <= 40; k += 4) {
41703 GemmMicrokernelTester()
41704 .mr(6)
41705 .nr(8)
41706 .kr(1)
41707 .sr(4)
41708 .m(6)
41709 .n(8)
41710 .k(k)
41711 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41712 }
41713 }
41714
41715 TEST(F32_GEMM_6X8S4__PSIMD, k_div_4_strided_a) {
41716 TEST_REQUIRES_PSIMD;
41717 for (size_t k = 8; k <= 40; k += 4) {
41718 GemmMicrokernelTester()
41719 .mr(6)
41720 .nr(8)
41721 .kr(1)
41722 .sr(4)
41723 .m(6)
41724 .n(8)
41725 .k(k)
41726 .a_stride(43)
41727 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41728 }
41729 }
41730
41731 TEST(F32_GEMM_6X8S4__PSIMD, k_div_4_subtile) {
41732 TEST_REQUIRES_PSIMD;
41733 for (size_t k = 8; k <= 40; k += 4) {
41734 for (uint32_t m = 1; m <= 6; m++) {
41735 for (uint32_t n = 1; n <= 8; n++) {
41736 GemmMicrokernelTester()
41737 .mr(6)
41738 .nr(8)
41739 .kr(1)
41740 .sr(4)
41741 .m(m)
41742 .n(n)
41743 .k(k)
41744 .iterations(1)
41745 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41746 }
41747 }
41748 }
41749 }
41750
41751 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8) {
41752 TEST_REQUIRES_PSIMD;
41753 for (uint32_t n = 9; n < 16; n++) {
41754 for (size_t k = 1; k <= 20; k += 5) {
41755 GemmMicrokernelTester()
41756 .mr(6)
41757 .nr(8)
41758 .kr(1)
41759 .sr(4)
41760 .m(6)
41761 .n(8)
41762 .k(k)
41763 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41764 }
41765 }
41766 }
41767
41768 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8_strided_cn) {
41769 TEST_REQUIRES_PSIMD;
41770 for (uint32_t n = 9; n < 16; n++) {
41771 for (size_t k = 1; k <= 20; k += 5) {
41772 GemmMicrokernelTester()
41773 .mr(6)
41774 .nr(8)
41775 .kr(1)
41776 .sr(4)
41777 .m(6)
41778 .n(8)
41779 .k(k)
41780 .cn_stride(11)
41781 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41782 }
41783 }
41784 }
41785
41786 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8_strided_a) {
41787 TEST_REQUIRES_PSIMD;
41788 for (uint32_t n = 9; n < 16; n++) {
41789 for (size_t k = 1; k <= 20; k += 5) {
41790 GemmMicrokernelTester()
41791 .mr(6)
41792 .nr(8)
41793 .kr(1)
41794 .sr(4)
41795 .m(6)
41796 .n(n)
41797 .k(k)
41798 .a_stride(23)
41799 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41800 }
41801 }
41802 }
41803
41804 TEST(F32_GEMM_6X8S4__PSIMD, n_gt_8_subtile) {
41805 TEST_REQUIRES_PSIMD;
41806 for (uint32_t n = 9; n < 16; n++) {
41807 for (size_t k = 1; k <= 20; k += 5) {
41808 for (uint32_t m = 1; m <= 6; m++) {
41809 GemmMicrokernelTester()
41810 .mr(6)
41811 .nr(8)
41812 .kr(1)
41813 .sr(4)
41814 .m(m)
41815 .n(n)
41816 .k(k)
41817 .iterations(1)
41818 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41819 }
41820 }
41821 }
41822 }
41823
41824 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8) {
41825 TEST_REQUIRES_PSIMD;
41826 for (uint32_t n = 16; n <= 24; n += 8) {
41827 for (size_t k = 1; k <= 20; k += 5) {
41828 GemmMicrokernelTester()
41829 .mr(6)
41830 .nr(8)
41831 .kr(1)
41832 .sr(4)
41833 .m(6)
41834 .n(8)
41835 .k(k)
41836 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41837 }
41838 }
41839 }
41840
41841 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8_strided_cn) {
41842 TEST_REQUIRES_PSIMD;
41843 for (uint32_t n = 16; n <= 24; n += 8) {
41844 for (size_t k = 1; k <= 20; k += 5) {
41845 GemmMicrokernelTester()
41846 .mr(6)
41847 .nr(8)
41848 .kr(1)
41849 .sr(4)
41850 .m(6)
41851 .n(n)
41852 .k(k)
41853 .cn_stride(11)
41854 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41855 }
41856 }
41857 }
41858
41859 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8_strided_a) {
41860 TEST_REQUIRES_PSIMD;
41861 for (uint32_t n = 16; n <= 24; n += 8) {
41862 for (size_t k = 1; k <= 20; k += 5) {
41863 GemmMicrokernelTester()
41864 .mr(6)
41865 .nr(8)
41866 .kr(1)
41867 .sr(4)
41868 .m(6)
41869 .n(n)
41870 .k(k)
41871 .a_stride(23)
41872 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41873 }
41874 }
41875 }
41876
41877 TEST(F32_GEMM_6X8S4__PSIMD, n_div_8_subtile) {
41878 TEST_REQUIRES_PSIMD;
41879 for (uint32_t n = 16; n <= 24; n += 8) {
41880 for (size_t k = 1; k <= 20; k += 5) {
41881 for (uint32_t m = 1; m <= 6; m++) {
41882 GemmMicrokernelTester()
41883 .mr(6)
41884 .nr(8)
41885 .kr(1)
41886 .sr(4)
41887 .m(m)
41888 .n(n)
41889 .k(k)
41890 .iterations(1)
41891 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41892 }
41893 }
41894 }
41895 }
41896
41897 TEST(F32_GEMM_6X8S4__PSIMD, strided_cm_subtile) {
41898 TEST_REQUIRES_PSIMD;
41899 for (size_t k = 1; k <= 20; k += 5) {
41900 for (uint32_t m = 1; m <= 6; m++) {
41901 for (uint32_t n = 1; n <= 8; n++) {
41902 GemmMicrokernelTester()
41903 .mr(6)
41904 .nr(8)
41905 .kr(1)
41906 .sr(4)
41907 .m(m)
41908 .n(n)
41909 .k(k)
41910 .cm_stride(11)
41911 .iterations(1)
41912 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41913 }
41914 }
41915 }
41916 }
41917
41918 TEST(F32_GEMM_6X8S4__PSIMD, qmin) {
41919 TEST_REQUIRES_PSIMD;
41920 GemmMicrokernelTester()
41921 .mr(6)
41922 .nr(8)
41923 .kr(1)
41924 .sr(4)
41925 .m(6)
41926 .n(8)
41927 .k(4)
41928 .qmin(128)
41929 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41930 }
41931
41932 TEST(F32_GEMM_6X8S4__PSIMD, qmax) {
41933 TEST_REQUIRES_PSIMD;
41934 GemmMicrokernelTester()
41935 .mr(6)
41936 .nr(8)
41937 .kr(1)
41938 .sr(4)
41939 .m(6)
41940 .n(8)
41941 .k(4)
41942 .qmax(128)
41943 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41944 }
41945
41946 TEST(F32_GEMM_6X8S4__PSIMD, strided_cm) {
41947 TEST_REQUIRES_PSIMD;
41948 GemmMicrokernelTester()
41949 .mr(6)
41950 .nr(8)
41951 .kr(1)
41952 .sr(4)
41953 .m(6)
41954 .n(8)
41955 .k(4)
41956 .cm_stride(11)
41957 .Test(xnn_f32_gemm_ukernel_6x8s4__psimd, GemmMicrokernelTester::Variant::Scalar);
41958 }
Marat Dukhan1dadbf72019-10-01 10:46:20 -070041959#endif // !XNN_ARCH_ASMJS && !XNN_ARCH_WASM
XNNPACK Teamb455b122019-09-27 18:10:33 -070041960
41961
Marat Dukhan436ebe62019-12-04 15:10:12 -080041962#if XNN_ARCH_WASM
41963 TEST(F32_GEMM_1X4__WASM, k_eq_1) {
41964 GemmMicrokernelTester()
41965 .mr(1)
41966 .nr(4)
41967 .kr(1)
41968 .sr(1)
41969 .m(1)
41970 .n(4)
41971 .k(1)
41972 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41973 }
41974
41975 TEST(F32_GEMM_1X4__WASM, strided_cn) {
41976 GemmMicrokernelTester()
41977 .mr(1)
41978 .nr(4)
41979 .kr(1)
41980 .sr(1)
41981 .m(1)
41982 .n(4)
41983 .k(1)
41984 .cn_stride(7)
41985 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41986 }
41987
41988 TEST(F32_GEMM_1X4__WASM, k_eq_1_strided_a) {
41989 GemmMicrokernelTester()
41990 .mr(1)
41991 .nr(4)
41992 .kr(1)
41993 .sr(1)
41994 .m(1)
41995 .n(4)
41996 .k(1)
41997 .a_stride(3)
41998 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
41999 }
42000
42001 TEST(F32_GEMM_1X4__WASM, k_eq_1_subtile) {
42002 for (uint32_t m = 1; m <= 1; m++) {
42003 for (uint32_t n = 1; n <= 4; n++) {
42004 GemmMicrokernelTester()
42005 .mr(1)
42006 .nr(4)
42007 .kr(1)
42008 .sr(1)
42009 .m(m)
42010 .n(n)
42011 .k(1)
42012 .iterations(1)
42013 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42014 }
42015 }
42016 }
42017
42018 TEST(F32_GEMM_1X4__WASM, k_eq_1_subtile_m) {
42019 for (uint32_t m = 1; m <= 1; m++) {
42020 GemmMicrokernelTester()
42021 .mr(1)
42022 .nr(4)
42023 .kr(1)
42024 .sr(1)
42025 .m(m)
42026 .n(4)
42027 .k(1)
42028 .iterations(1)
42029 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42030 }
42031 }
42032
42033 TEST(F32_GEMM_1X4__WASM, k_eq_1_subtile_n) {
42034 for (uint32_t n = 1; n <= 4; n++) {
42035 GemmMicrokernelTester()
42036 .mr(1)
42037 .nr(4)
42038 .kr(1)
42039 .sr(1)
42040 .m(1)
42041 .n(n)
42042 .k(1)
42043 .iterations(1)
42044 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42045 }
42046 }
42047
42048 TEST(F32_GEMM_1X4__WASM, k_gt_1) {
42049 for (size_t k = 2; k < 10; k++) {
42050 GemmMicrokernelTester()
42051 .mr(1)
42052 .nr(4)
42053 .kr(1)
42054 .sr(1)
42055 .m(1)
42056 .n(4)
42057 .k(k)
42058 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42059 }
42060 }
42061
42062 TEST(F32_GEMM_1X4__WASM, k_gt_1_strided_a) {
42063 for (size_t k = 2; k < 10; k++) {
42064 GemmMicrokernelTester()
42065 .mr(1)
42066 .nr(4)
42067 .kr(1)
42068 .sr(1)
42069 .m(1)
42070 .n(4)
42071 .k(k)
42072 .a_stride(11)
42073 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42074 }
42075 }
42076
42077 TEST(F32_GEMM_1X4__WASM, k_gt_1_subtile) {
42078 for (size_t k = 2; k < 10; k++) {
42079 for (uint32_t m = 1; m <= 1; m++) {
42080 for (uint32_t n = 1; n <= 4; n++) {
42081 GemmMicrokernelTester()
42082 .mr(1)
42083 .nr(4)
42084 .kr(1)
42085 .sr(1)
42086 .m(m)
42087 .n(n)
42088 .k(k)
42089 .iterations(1)
42090 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42091 }
42092 }
42093 }
42094 }
42095
42096 TEST(F32_GEMM_1X4__WASM, n_gt_4) {
42097 for (uint32_t n = 5; n < 8; n++) {
42098 for (size_t k = 1; k <= 5; k += 2) {
42099 GemmMicrokernelTester()
42100 .mr(1)
42101 .nr(4)
42102 .kr(1)
42103 .sr(1)
42104 .m(1)
42105 .n(4)
42106 .k(k)
42107 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42108 }
42109 }
42110 }
42111
42112 TEST(F32_GEMM_1X4__WASM, n_gt_4_strided_cn) {
42113 for (uint32_t n = 5; n < 8; n++) {
42114 for (size_t k = 1; k <= 5; k += 2) {
42115 GemmMicrokernelTester()
42116 .mr(1)
42117 .nr(4)
42118 .kr(1)
42119 .sr(1)
42120 .m(1)
42121 .n(4)
42122 .k(k)
42123 .cn_stride(7)
42124 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42125 }
42126 }
42127 }
42128
42129 TEST(F32_GEMM_1X4__WASM, n_gt_4_strided_a) {
42130 for (uint32_t n = 5; n < 8; n++) {
42131 for (size_t k = 1; k <= 5; k += 2) {
42132 GemmMicrokernelTester()
42133 .mr(1)
42134 .nr(4)
42135 .kr(1)
42136 .sr(1)
42137 .m(1)
42138 .n(n)
42139 .k(k)
42140 .a_stride(7)
42141 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42142 }
42143 }
42144 }
42145
42146 TEST(F32_GEMM_1X4__WASM, n_gt_4_subtile) {
42147 for (uint32_t n = 5; n < 8; n++) {
42148 for (size_t k = 1; k <= 5; k += 2) {
42149 for (uint32_t m = 1; m <= 1; m++) {
42150 GemmMicrokernelTester()
42151 .mr(1)
42152 .nr(4)
42153 .kr(1)
42154 .sr(1)
42155 .m(m)
42156 .n(n)
42157 .k(k)
42158 .iterations(1)
42159 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42160 }
42161 }
42162 }
42163 }
42164
42165 TEST(F32_GEMM_1X4__WASM, n_div_4) {
42166 for (uint32_t n = 8; n <= 12; n += 4) {
42167 for (size_t k = 1; k <= 5; k += 2) {
42168 GemmMicrokernelTester()
42169 .mr(1)
42170 .nr(4)
42171 .kr(1)
42172 .sr(1)
42173 .m(1)
42174 .n(4)
42175 .k(k)
42176 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42177 }
42178 }
42179 }
42180
42181 TEST(F32_GEMM_1X4__WASM, n_div_4_strided_cn) {
42182 for (uint32_t n = 8; n <= 12; n += 4) {
42183 for (size_t k = 1; k <= 5; k += 2) {
42184 GemmMicrokernelTester()
42185 .mr(1)
42186 .nr(4)
42187 .kr(1)
42188 .sr(1)
42189 .m(1)
42190 .n(n)
42191 .k(k)
42192 .cn_stride(7)
42193 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42194 }
42195 }
42196 }
42197
42198 TEST(F32_GEMM_1X4__WASM, n_div_4_strided_a) {
42199 for (uint32_t n = 8; n <= 12; n += 4) {
42200 for (size_t k = 1; k <= 5; k += 2) {
42201 GemmMicrokernelTester()
42202 .mr(1)
42203 .nr(4)
42204 .kr(1)
42205 .sr(1)
42206 .m(1)
42207 .n(n)
42208 .k(k)
42209 .a_stride(7)
42210 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42211 }
42212 }
42213 }
42214
42215 TEST(F32_GEMM_1X4__WASM, n_div_4_subtile) {
42216 for (uint32_t n = 8; n <= 12; n += 4) {
42217 for (size_t k = 1; k <= 5; k += 2) {
42218 for (uint32_t m = 1; m <= 1; m++) {
42219 GemmMicrokernelTester()
42220 .mr(1)
42221 .nr(4)
42222 .kr(1)
42223 .sr(1)
42224 .m(m)
42225 .n(n)
42226 .k(k)
42227 .iterations(1)
42228 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42229 }
42230 }
42231 }
42232 }
42233
42234 TEST(F32_GEMM_1X4__WASM, strided_cm_subtile) {
42235 for (size_t k = 1; k <= 5; k += 2) {
42236 for (uint32_t m = 1; m <= 1; m++) {
42237 for (uint32_t n = 1; n <= 4; n++) {
42238 GemmMicrokernelTester()
42239 .mr(1)
42240 .nr(4)
42241 .kr(1)
42242 .sr(1)
42243 .m(m)
42244 .n(n)
42245 .k(k)
42246 .cm_stride(7)
42247 .iterations(1)
42248 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42249 }
42250 }
42251 }
42252 }
42253
42254 TEST(F32_GEMM_1X4__WASM, qmin) {
42255 GemmMicrokernelTester()
42256 .mr(1)
42257 .nr(4)
42258 .kr(1)
42259 .sr(1)
42260 .m(1)
42261 .n(4)
42262 .k(1)
42263 .qmin(128)
42264 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42265 }
42266
42267 TEST(F32_GEMM_1X4__WASM, qmax) {
42268 GemmMicrokernelTester()
42269 .mr(1)
42270 .nr(4)
42271 .kr(1)
42272 .sr(1)
42273 .m(1)
42274 .n(4)
42275 .k(1)
42276 .qmax(128)
42277 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42278 }
42279
42280 TEST(F32_GEMM_1X4__WASM, strided_cm) {
42281 GemmMicrokernelTester()
42282 .mr(1)
42283 .nr(4)
42284 .kr(1)
42285 .sr(1)
42286 .m(1)
42287 .n(4)
42288 .k(1)
42289 .cm_stride(7)
42290 .Test(xnn_f32_gemm_ukernel_1x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42291 }
42292#endif // XNN_ARCH_WASM
42293
42294
42295#if XNN_ARCH_WASM
42296 TEST(F32_GEMM_2X4__WASM, k_eq_1) {
42297 GemmMicrokernelTester()
42298 .mr(2)
42299 .nr(4)
42300 .kr(1)
42301 .sr(1)
42302 .m(2)
42303 .n(4)
42304 .k(1)
42305 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42306 }
42307
42308 TEST(F32_GEMM_2X4__WASM, strided_cn) {
42309 GemmMicrokernelTester()
42310 .mr(2)
42311 .nr(4)
42312 .kr(1)
42313 .sr(1)
42314 .m(2)
42315 .n(4)
42316 .k(1)
42317 .cn_stride(7)
42318 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42319 }
42320
42321 TEST(F32_GEMM_2X4__WASM, k_eq_1_strided_a) {
42322 GemmMicrokernelTester()
42323 .mr(2)
42324 .nr(4)
42325 .kr(1)
42326 .sr(1)
42327 .m(2)
42328 .n(4)
42329 .k(1)
42330 .a_stride(3)
42331 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42332 }
42333
42334 TEST(F32_GEMM_2X4__WASM, k_eq_1_subtile) {
42335 for (uint32_t m = 1; m <= 2; m++) {
42336 for (uint32_t n = 1; n <= 4; n++) {
42337 GemmMicrokernelTester()
42338 .mr(2)
42339 .nr(4)
42340 .kr(1)
42341 .sr(1)
42342 .m(m)
42343 .n(n)
42344 .k(1)
42345 .iterations(1)
42346 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42347 }
42348 }
42349 }
42350
42351 TEST(F32_GEMM_2X4__WASM, k_eq_1_subtile_m) {
42352 for (uint32_t m = 1; m <= 2; m++) {
42353 GemmMicrokernelTester()
42354 .mr(2)
42355 .nr(4)
42356 .kr(1)
42357 .sr(1)
42358 .m(m)
42359 .n(4)
42360 .k(1)
42361 .iterations(1)
42362 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42363 }
42364 }
42365
42366 TEST(F32_GEMM_2X4__WASM, k_eq_1_subtile_n) {
42367 for (uint32_t n = 1; n <= 4; n++) {
42368 GemmMicrokernelTester()
42369 .mr(2)
42370 .nr(4)
42371 .kr(1)
42372 .sr(1)
42373 .m(2)
42374 .n(n)
42375 .k(1)
42376 .iterations(1)
42377 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42378 }
42379 }
42380
42381 TEST(F32_GEMM_2X4__WASM, k_gt_1) {
42382 for (size_t k = 2; k < 10; k++) {
42383 GemmMicrokernelTester()
42384 .mr(2)
42385 .nr(4)
42386 .kr(1)
42387 .sr(1)
42388 .m(2)
42389 .n(4)
42390 .k(k)
42391 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42392 }
42393 }
42394
42395 TEST(F32_GEMM_2X4__WASM, k_gt_1_strided_a) {
42396 for (size_t k = 2; k < 10; k++) {
42397 GemmMicrokernelTester()
42398 .mr(2)
42399 .nr(4)
42400 .kr(1)
42401 .sr(1)
42402 .m(2)
42403 .n(4)
42404 .k(k)
42405 .a_stride(11)
42406 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42407 }
42408 }
42409
42410 TEST(F32_GEMM_2X4__WASM, k_gt_1_subtile) {
42411 for (size_t k = 2; k < 10; k++) {
42412 for (uint32_t m = 1; m <= 2; m++) {
42413 for (uint32_t n = 1; n <= 4; n++) {
42414 GemmMicrokernelTester()
42415 .mr(2)
42416 .nr(4)
42417 .kr(1)
42418 .sr(1)
42419 .m(m)
42420 .n(n)
42421 .k(k)
42422 .iterations(1)
42423 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42424 }
42425 }
42426 }
42427 }
42428
42429 TEST(F32_GEMM_2X4__WASM, n_gt_4) {
42430 for (uint32_t n = 5; n < 8; n++) {
42431 for (size_t k = 1; k <= 5; k += 2) {
42432 GemmMicrokernelTester()
42433 .mr(2)
42434 .nr(4)
42435 .kr(1)
42436 .sr(1)
42437 .m(2)
42438 .n(4)
42439 .k(k)
42440 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42441 }
42442 }
42443 }
42444
42445 TEST(F32_GEMM_2X4__WASM, n_gt_4_strided_cn) {
42446 for (uint32_t n = 5; n < 8; n++) {
42447 for (size_t k = 1; k <= 5; k += 2) {
42448 GemmMicrokernelTester()
42449 .mr(2)
42450 .nr(4)
42451 .kr(1)
42452 .sr(1)
42453 .m(2)
42454 .n(4)
42455 .k(k)
42456 .cn_stride(7)
42457 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42458 }
42459 }
42460 }
42461
42462 TEST(F32_GEMM_2X4__WASM, n_gt_4_strided_a) {
42463 for (uint32_t n = 5; n < 8; n++) {
42464 for (size_t k = 1; k <= 5; k += 2) {
42465 GemmMicrokernelTester()
42466 .mr(2)
42467 .nr(4)
42468 .kr(1)
42469 .sr(1)
42470 .m(2)
42471 .n(n)
42472 .k(k)
42473 .a_stride(7)
42474 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42475 }
42476 }
42477 }
42478
42479 TEST(F32_GEMM_2X4__WASM, n_gt_4_subtile) {
42480 for (uint32_t n = 5; n < 8; n++) {
42481 for (size_t k = 1; k <= 5; k += 2) {
42482 for (uint32_t m = 1; m <= 2; m++) {
42483 GemmMicrokernelTester()
42484 .mr(2)
42485 .nr(4)
42486 .kr(1)
42487 .sr(1)
42488 .m(m)
42489 .n(n)
42490 .k(k)
42491 .iterations(1)
42492 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42493 }
42494 }
42495 }
42496 }
42497
42498 TEST(F32_GEMM_2X4__WASM, n_div_4) {
42499 for (uint32_t n = 8; n <= 12; n += 4) {
42500 for (size_t k = 1; k <= 5; k += 2) {
42501 GemmMicrokernelTester()
42502 .mr(2)
42503 .nr(4)
42504 .kr(1)
42505 .sr(1)
42506 .m(2)
42507 .n(4)
42508 .k(k)
42509 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42510 }
42511 }
42512 }
42513
42514 TEST(F32_GEMM_2X4__WASM, n_div_4_strided_cn) {
42515 for (uint32_t n = 8; n <= 12; n += 4) {
42516 for (size_t k = 1; k <= 5; k += 2) {
42517 GemmMicrokernelTester()
42518 .mr(2)
42519 .nr(4)
42520 .kr(1)
42521 .sr(1)
42522 .m(2)
42523 .n(n)
42524 .k(k)
42525 .cn_stride(7)
42526 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42527 }
42528 }
42529 }
42530
42531 TEST(F32_GEMM_2X4__WASM, n_div_4_strided_a) {
42532 for (uint32_t n = 8; n <= 12; n += 4) {
42533 for (size_t k = 1; k <= 5; k += 2) {
42534 GemmMicrokernelTester()
42535 .mr(2)
42536 .nr(4)
42537 .kr(1)
42538 .sr(1)
42539 .m(2)
42540 .n(n)
42541 .k(k)
42542 .a_stride(7)
42543 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42544 }
42545 }
42546 }
42547
42548 TEST(F32_GEMM_2X4__WASM, n_div_4_subtile) {
42549 for (uint32_t n = 8; n <= 12; n += 4) {
42550 for (size_t k = 1; k <= 5; k += 2) {
42551 for (uint32_t m = 1; m <= 2; m++) {
42552 GemmMicrokernelTester()
42553 .mr(2)
42554 .nr(4)
42555 .kr(1)
42556 .sr(1)
42557 .m(m)
42558 .n(n)
42559 .k(k)
42560 .iterations(1)
42561 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42562 }
42563 }
42564 }
42565 }
42566
42567 TEST(F32_GEMM_2X4__WASM, strided_cm_subtile) {
42568 for (size_t k = 1; k <= 5; k += 2) {
42569 for (uint32_t m = 1; m <= 2; m++) {
42570 for (uint32_t n = 1; n <= 4; n++) {
42571 GemmMicrokernelTester()
42572 .mr(2)
42573 .nr(4)
42574 .kr(1)
42575 .sr(1)
42576 .m(m)
42577 .n(n)
42578 .k(k)
42579 .cm_stride(7)
42580 .iterations(1)
42581 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42582 }
42583 }
42584 }
42585 }
42586
42587 TEST(F32_GEMM_2X4__WASM, qmin) {
42588 GemmMicrokernelTester()
42589 .mr(2)
42590 .nr(4)
42591 .kr(1)
42592 .sr(1)
42593 .m(2)
42594 .n(4)
42595 .k(1)
42596 .qmin(128)
42597 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42598 }
42599
42600 TEST(F32_GEMM_2X4__WASM, qmax) {
42601 GemmMicrokernelTester()
42602 .mr(2)
42603 .nr(4)
42604 .kr(1)
42605 .sr(1)
42606 .m(2)
42607 .n(4)
42608 .k(1)
42609 .qmax(128)
42610 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42611 }
42612
42613 TEST(F32_GEMM_2X4__WASM, strided_cm) {
42614 GemmMicrokernelTester()
42615 .mr(2)
42616 .nr(4)
42617 .kr(1)
42618 .sr(1)
42619 .m(2)
42620 .n(4)
42621 .k(1)
42622 .cm_stride(7)
42623 .Test(xnn_f32_gemm_ukernel_2x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42624 }
42625#endif // XNN_ARCH_WASM
42626
42627
42628#if XNN_ARCH_WASM
42629 TEST(F32_GEMM_4X4__WASM, k_eq_1) {
42630 GemmMicrokernelTester()
42631 .mr(4)
42632 .nr(4)
42633 .kr(1)
42634 .sr(1)
42635 .m(4)
42636 .n(4)
42637 .k(1)
42638 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42639 }
42640
42641 TEST(F32_GEMM_4X4__WASM, strided_cn) {
42642 GemmMicrokernelTester()
42643 .mr(4)
42644 .nr(4)
42645 .kr(1)
42646 .sr(1)
42647 .m(4)
42648 .n(4)
42649 .k(1)
42650 .cn_stride(7)
42651 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42652 }
42653
42654 TEST(F32_GEMM_4X4__WASM, k_eq_1_strided_a) {
42655 GemmMicrokernelTester()
42656 .mr(4)
42657 .nr(4)
42658 .kr(1)
42659 .sr(1)
42660 .m(4)
42661 .n(4)
42662 .k(1)
42663 .a_stride(3)
42664 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42665 }
42666
42667 TEST(F32_GEMM_4X4__WASM, k_eq_1_subtile) {
42668 for (uint32_t m = 1; m <= 4; m++) {
42669 for (uint32_t n = 1; n <= 4; n++) {
42670 GemmMicrokernelTester()
42671 .mr(4)
42672 .nr(4)
42673 .kr(1)
42674 .sr(1)
42675 .m(m)
42676 .n(n)
42677 .k(1)
42678 .iterations(1)
42679 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42680 }
42681 }
42682 }
42683
42684 TEST(F32_GEMM_4X4__WASM, k_eq_1_subtile_m) {
42685 for (uint32_t m = 1; m <= 4; m++) {
42686 GemmMicrokernelTester()
42687 .mr(4)
42688 .nr(4)
42689 .kr(1)
42690 .sr(1)
42691 .m(m)
42692 .n(4)
42693 .k(1)
42694 .iterations(1)
42695 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42696 }
42697 }
42698
42699 TEST(F32_GEMM_4X4__WASM, k_eq_1_subtile_n) {
42700 for (uint32_t n = 1; n <= 4; n++) {
42701 GemmMicrokernelTester()
42702 .mr(4)
42703 .nr(4)
42704 .kr(1)
42705 .sr(1)
42706 .m(4)
42707 .n(n)
42708 .k(1)
42709 .iterations(1)
42710 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42711 }
42712 }
42713
42714 TEST(F32_GEMM_4X4__WASM, k_gt_1) {
42715 for (size_t k = 2; k < 10; k++) {
42716 GemmMicrokernelTester()
42717 .mr(4)
42718 .nr(4)
42719 .kr(1)
42720 .sr(1)
42721 .m(4)
42722 .n(4)
42723 .k(k)
42724 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42725 }
42726 }
42727
42728 TEST(F32_GEMM_4X4__WASM, k_gt_1_strided_a) {
42729 for (size_t k = 2; k < 10; k++) {
42730 GemmMicrokernelTester()
42731 .mr(4)
42732 .nr(4)
42733 .kr(1)
42734 .sr(1)
42735 .m(4)
42736 .n(4)
42737 .k(k)
42738 .a_stride(11)
42739 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42740 }
42741 }
42742
42743 TEST(F32_GEMM_4X4__WASM, k_gt_1_subtile) {
42744 for (size_t k = 2; k < 10; k++) {
42745 for (uint32_t m = 1; m <= 4; m++) {
42746 for (uint32_t n = 1; n <= 4; n++) {
42747 GemmMicrokernelTester()
42748 .mr(4)
42749 .nr(4)
42750 .kr(1)
42751 .sr(1)
42752 .m(m)
42753 .n(n)
42754 .k(k)
42755 .iterations(1)
42756 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42757 }
42758 }
42759 }
42760 }
42761
42762 TEST(F32_GEMM_4X4__WASM, n_gt_4) {
42763 for (uint32_t n = 5; n < 8; n++) {
42764 for (size_t k = 1; k <= 5; k += 2) {
42765 GemmMicrokernelTester()
42766 .mr(4)
42767 .nr(4)
42768 .kr(1)
42769 .sr(1)
42770 .m(4)
42771 .n(4)
42772 .k(k)
42773 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42774 }
42775 }
42776 }
42777
42778 TEST(F32_GEMM_4X4__WASM, n_gt_4_strided_cn) {
42779 for (uint32_t n = 5; n < 8; n++) {
42780 for (size_t k = 1; k <= 5; k += 2) {
42781 GemmMicrokernelTester()
42782 .mr(4)
42783 .nr(4)
42784 .kr(1)
42785 .sr(1)
42786 .m(4)
42787 .n(4)
42788 .k(k)
42789 .cn_stride(7)
42790 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42791 }
42792 }
42793 }
42794
42795 TEST(F32_GEMM_4X4__WASM, n_gt_4_strided_a) {
42796 for (uint32_t n = 5; n < 8; n++) {
42797 for (size_t k = 1; k <= 5; k += 2) {
42798 GemmMicrokernelTester()
42799 .mr(4)
42800 .nr(4)
42801 .kr(1)
42802 .sr(1)
42803 .m(4)
42804 .n(n)
42805 .k(k)
42806 .a_stride(7)
42807 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42808 }
42809 }
42810 }
42811
42812 TEST(F32_GEMM_4X4__WASM, n_gt_4_subtile) {
42813 for (uint32_t n = 5; n < 8; n++) {
42814 for (size_t k = 1; k <= 5; k += 2) {
42815 for (uint32_t m = 1; m <= 4; m++) {
42816 GemmMicrokernelTester()
42817 .mr(4)
42818 .nr(4)
42819 .kr(1)
42820 .sr(1)
42821 .m(m)
42822 .n(n)
42823 .k(k)
42824 .iterations(1)
42825 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42826 }
42827 }
42828 }
42829 }
42830
42831 TEST(F32_GEMM_4X4__WASM, n_div_4) {
42832 for (uint32_t n = 8; n <= 12; n += 4) {
42833 for (size_t k = 1; k <= 5; k += 2) {
42834 GemmMicrokernelTester()
42835 .mr(4)
42836 .nr(4)
42837 .kr(1)
42838 .sr(1)
42839 .m(4)
42840 .n(4)
42841 .k(k)
42842 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42843 }
42844 }
42845 }
42846
42847 TEST(F32_GEMM_4X4__WASM, n_div_4_strided_cn) {
42848 for (uint32_t n = 8; n <= 12; n += 4) {
42849 for (size_t k = 1; k <= 5; k += 2) {
42850 GemmMicrokernelTester()
42851 .mr(4)
42852 .nr(4)
42853 .kr(1)
42854 .sr(1)
42855 .m(4)
42856 .n(n)
42857 .k(k)
42858 .cn_stride(7)
42859 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42860 }
42861 }
42862 }
42863
42864 TEST(F32_GEMM_4X4__WASM, n_div_4_strided_a) {
42865 for (uint32_t n = 8; n <= 12; n += 4) {
42866 for (size_t k = 1; k <= 5; k += 2) {
42867 GemmMicrokernelTester()
42868 .mr(4)
42869 .nr(4)
42870 .kr(1)
42871 .sr(1)
42872 .m(4)
42873 .n(n)
42874 .k(k)
42875 .a_stride(7)
42876 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42877 }
42878 }
42879 }
42880
42881 TEST(F32_GEMM_4X4__WASM, n_div_4_subtile) {
42882 for (uint32_t n = 8; n <= 12; n += 4) {
42883 for (size_t k = 1; k <= 5; k += 2) {
42884 for (uint32_t m = 1; m <= 4; m++) {
42885 GemmMicrokernelTester()
42886 .mr(4)
42887 .nr(4)
42888 .kr(1)
42889 .sr(1)
42890 .m(m)
42891 .n(n)
42892 .k(k)
42893 .iterations(1)
42894 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42895 }
42896 }
42897 }
42898 }
42899
42900 TEST(F32_GEMM_4X4__WASM, strided_cm_subtile) {
42901 for (size_t k = 1; k <= 5; k += 2) {
42902 for (uint32_t m = 1; m <= 4; m++) {
42903 for (uint32_t n = 1; n <= 4; n++) {
42904 GemmMicrokernelTester()
42905 .mr(4)
42906 .nr(4)
42907 .kr(1)
42908 .sr(1)
42909 .m(m)
42910 .n(n)
42911 .k(k)
42912 .cm_stride(7)
42913 .iterations(1)
42914 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42915 }
42916 }
42917 }
42918 }
42919
42920 TEST(F32_GEMM_4X4__WASM, qmin) {
42921 GemmMicrokernelTester()
42922 .mr(4)
42923 .nr(4)
42924 .kr(1)
42925 .sr(1)
42926 .m(4)
42927 .n(4)
42928 .k(1)
42929 .qmin(128)
42930 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42931 }
42932
42933 TEST(F32_GEMM_4X4__WASM, qmax) {
42934 GemmMicrokernelTester()
42935 .mr(4)
42936 .nr(4)
42937 .kr(1)
42938 .sr(1)
42939 .m(4)
42940 .n(4)
42941 .k(1)
42942 .qmax(128)
42943 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42944 }
42945
42946 TEST(F32_GEMM_4X4__WASM, strided_cm) {
42947 GemmMicrokernelTester()
42948 .mr(4)
42949 .nr(4)
42950 .kr(1)
42951 .sr(1)
42952 .m(4)
42953 .n(4)
42954 .k(1)
42955 .cm_stride(7)
42956 .Test(xnn_f32_gemm_ukernel_4x4__wasm, GemmMicrokernelTester::Variant::Scalar);
42957 }
42958#endif // XNN_ARCH_WASM
42959
42960
42961#if XNN_ARCH_WASM
42962 TEST(F32_GEMM_4X2__WASM, k_eq_1) {
42963 GemmMicrokernelTester()
42964 .mr(4)
42965 .nr(2)
42966 .kr(1)
42967 .sr(1)
42968 .m(4)
42969 .n(2)
42970 .k(1)
42971 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
42972 }
42973
42974 TEST(F32_GEMM_4X2__WASM, strided_cn) {
42975 GemmMicrokernelTester()
42976 .mr(4)
42977 .nr(2)
42978 .kr(1)
42979 .sr(1)
42980 .m(4)
42981 .n(2)
42982 .k(1)
42983 .cn_stride(5)
42984 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
42985 }
42986
42987 TEST(F32_GEMM_4X2__WASM, k_eq_1_strided_a) {
42988 GemmMicrokernelTester()
42989 .mr(4)
42990 .nr(2)
42991 .kr(1)
42992 .sr(1)
42993 .m(4)
42994 .n(2)
42995 .k(1)
42996 .a_stride(3)
42997 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
42998 }
42999
43000 TEST(F32_GEMM_4X2__WASM, k_eq_1_subtile) {
43001 for (uint32_t m = 1; m <= 4; m++) {
43002 for (uint32_t n = 1; n <= 2; n++) {
43003 GemmMicrokernelTester()
43004 .mr(4)
43005 .nr(2)
43006 .kr(1)
43007 .sr(1)
43008 .m(m)
43009 .n(n)
43010 .k(1)
43011 .iterations(1)
43012 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43013 }
43014 }
43015 }
43016
43017 TEST(F32_GEMM_4X2__WASM, k_eq_1_subtile_m) {
43018 for (uint32_t m = 1; m <= 4; m++) {
43019 GemmMicrokernelTester()
43020 .mr(4)
43021 .nr(2)
43022 .kr(1)
43023 .sr(1)
43024 .m(m)
43025 .n(2)
43026 .k(1)
43027 .iterations(1)
43028 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43029 }
43030 }
43031
43032 TEST(F32_GEMM_4X2__WASM, k_eq_1_subtile_n) {
43033 for (uint32_t n = 1; n <= 2; n++) {
43034 GemmMicrokernelTester()
43035 .mr(4)
43036 .nr(2)
43037 .kr(1)
43038 .sr(1)
43039 .m(4)
43040 .n(n)
43041 .k(1)
43042 .iterations(1)
43043 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43044 }
43045 }
43046
43047 TEST(F32_GEMM_4X2__WASM, k_gt_1) {
43048 for (size_t k = 2; k < 10; k++) {
43049 GemmMicrokernelTester()
43050 .mr(4)
43051 .nr(2)
43052 .kr(1)
43053 .sr(1)
43054 .m(4)
43055 .n(2)
43056 .k(k)
43057 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43058 }
43059 }
43060
43061 TEST(F32_GEMM_4X2__WASM, k_gt_1_strided_a) {
43062 for (size_t k = 2; k < 10; k++) {
43063 GemmMicrokernelTester()
43064 .mr(4)
43065 .nr(2)
43066 .kr(1)
43067 .sr(1)
43068 .m(4)
43069 .n(2)
43070 .k(k)
43071 .a_stride(11)
43072 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43073 }
43074 }
43075
43076 TEST(F32_GEMM_4X2__WASM, k_gt_1_subtile) {
43077 for (size_t k = 2; k < 10; k++) {
43078 for (uint32_t m = 1; m <= 4; m++) {
43079 for (uint32_t n = 1; n <= 2; n++) {
43080 GemmMicrokernelTester()
43081 .mr(4)
43082 .nr(2)
43083 .kr(1)
43084 .sr(1)
43085 .m(m)
43086 .n(n)
43087 .k(k)
43088 .iterations(1)
43089 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43090 }
43091 }
43092 }
43093 }
43094
43095 TEST(F32_GEMM_4X2__WASM, n_gt_2) {
43096 for (uint32_t n = 3; n < 4; n++) {
43097 for (size_t k = 1; k <= 5; k += 2) {
43098 GemmMicrokernelTester()
43099 .mr(4)
43100 .nr(2)
43101 .kr(1)
43102 .sr(1)
43103 .m(4)
43104 .n(2)
43105 .k(k)
43106 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43107 }
43108 }
43109 }
43110
43111 TEST(F32_GEMM_4X2__WASM, n_gt_2_strided_cn) {
43112 for (uint32_t n = 3; n < 4; n++) {
43113 for (size_t k = 1; k <= 5; k += 2) {
43114 GemmMicrokernelTester()
43115 .mr(4)
43116 .nr(2)
43117 .kr(1)
43118 .sr(1)
43119 .m(4)
43120 .n(2)
43121 .k(k)
43122 .cn_stride(5)
43123 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43124 }
43125 }
43126 }
43127
43128 TEST(F32_GEMM_4X2__WASM, n_gt_2_strided_a) {
43129 for (uint32_t n = 3; n < 4; n++) {
43130 for (size_t k = 1; k <= 5; k += 2) {
43131 GemmMicrokernelTester()
43132 .mr(4)
43133 .nr(2)
43134 .kr(1)
43135 .sr(1)
43136 .m(4)
43137 .n(n)
43138 .k(k)
43139 .a_stride(7)
43140 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43141 }
43142 }
43143 }
43144
43145 TEST(F32_GEMM_4X2__WASM, n_gt_2_subtile) {
43146 for (uint32_t n = 3; n < 4; n++) {
43147 for (size_t k = 1; k <= 5; k += 2) {
43148 for (uint32_t m = 1; m <= 4; m++) {
43149 GemmMicrokernelTester()
43150 .mr(4)
43151 .nr(2)
43152 .kr(1)
43153 .sr(1)
43154 .m(m)
43155 .n(n)
43156 .k(k)
43157 .iterations(1)
43158 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43159 }
43160 }
43161 }
43162 }
43163
43164 TEST(F32_GEMM_4X2__WASM, n_div_2) {
43165 for (uint32_t n = 4; n <= 6; n += 2) {
43166 for (size_t k = 1; k <= 5; k += 2) {
43167 GemmMicrokernelTester()
43168 .mr(4)
43169 .nr(2)
43170 .kr(1)
43171 .sr(1)
43172 .m(4)
43173 .n(2)
43174 .k(k)
43175 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43176 }
43177 }
43178 }
43179
43180 TEST(F32_GEMM_4X2__WASM, n_div_2_strided_cn) {
43181 for (uint32_t n = 4; n <= 6; n += 2) {
43182 for (size_t k = 1; k <= 5; k += 2) {
43183 GemmMicrokernelTester()
43184 .mr(4)
43185 .nr(2)
43186 .kr(1)
43187 .sr(1)
43188 .m(4)
43189 .n(n)
43190 .k(k)
43191 .cn_stride(5)
43192 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43193 }
43194 }
43195 }
43196
43197 TEST(F32_GEMM_4X2__WASM, n_div_2_strided_a) {
43198 for (uint32_t n = 4; n <= 6; n += 2) {
43199 for (size_t k = 1; k <= 5; k += 2) {
43200 GemmMicrokernelTester()
43201 .mr(4)
43202 .nr(2)
43203 .kr(1)
43204 .sr(1)
43205 .m(4)
43206 .n(n)
43207 .k(k)
43208 .a_stride(7)
43209 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43210 }
43211 }
43212 }
43213
43214 TEST(F32_GEMM_4X2__WASM, n_div_2_subtile) {
43215 for (uint32_t n = 4; n <= 6; n += 2) {
43216 for (size_t k = 1; k <= 5; k += 2) {
43217 for (uint32_t m = 1; m <= 4; m++) {
43218 GemmMicrokernelTester()
43219 .mr(4)
43220 .nr(2)
43221 .kr(1)
43222 .sr(1)
43223 .m(m)
43224 .n(n)
43225 .k(k)
43226 .iterations(1)
43227 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43228 }
43229 }
43230 }
43231 }
43232
43233 TEST(F32_GEMM_4X2__WASM, strided_cm_subtile) {
43234 for (size_t k = 1; k <= 5; k += 2) {
43235 for (uint32_t m = 1; m <= 4; m++) {
43236 for (uint32_t n = 1; n <= 2; n++) {
43237 GemmMicrokernelTester()
43238 .mr(4)
43239 .nr(2)
43240 .kr(1)
43241 .sr(1)
43242 .m(m)
43243 .n(n)
43244 .k(k)
43245 .cm_stride(5)
43246 .iterations(1)
43247 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43248 }
43249 }
43250 }
43251 }
43252
43253 TEST(F32_GEMM_4X2__WASM, qmin) {
43254 GemmMicrokernelTester()
43255 .mr(4)
43256 .nr(2)
43257 .kr(1)
43258 .sr(1)
43259 .m(4)
43260 .n(2)
43261 .k(1)
43262 .qmin(128)
43263 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43264 }
43265
43266 TEST(F32_GEMM_4X2__WASM, qmax) {
43267 GemmMicrokernelTester()
43268 .mr(4)
43269 .nr(2)
43270 .kr(1)
43271 .sr(1)
43272 .m(4)
43273 .n(2)
43274 .k(1)
43275 .qmax(128)
43276 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43277 }
43278
43279 TEST(F32_GEMM_4X2__WASM, strided_cm) {
43280 GemmMicrokernelTester()
43281 .mr(4)
43282 .nr(2)
43283 .kr(1)
43284 .sr(1)
43285 .m(4)
43286 .n(2)
43287 .k(1)
43288 .cm_stride(5)
43289 .Test(xnn_f32_gemm_ukernel_4x2__wasm, GemmMicrokernelTester::Variant::Scalar);
43290 }
43291#endif // XNN_ARCH_WASM
43292
43293
XNNPACK Teamb455b122019-09-27 18:10:33 -070043294TEST(F32_GEMM_1X4__SCALAR, k_eq_1) {
43295 GemmMicrokernelTester()
43296 .mr(1)
43297 .nr(4)
43298 .kr(1)
43299 .sr(1)
43300 .m(1)
43301 .n(4)
43302 .k(1)
43303 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43304}
43305
43306TEST(F32_GEMM_1X4__SCALAR, strided_cn) {
43307 GemmMicrokernelTester()
43308 .mr(1)
43309 .nr(4)
43310 .kr(1)
43311 .sr(1)
43312 .m(1)
43313 .n(4)
43314 .k(1)
43315 .cn_stride(7)
43316 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43317}
43318
43319TEST(F32_GEMM_1X4__SCALAR, k_eq_1_strided_a) {
43320 GemmMicrokernelTester()
43321 .mr(1)
43322 .nr(4)
43323 .kr(1)
43324 .sr(1)
43325 .m(1)
43326 .n(4)
43327 .k(1)
43328 .a_stride(3)
43329 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43330}
43331
43332TEST(F32_GEMM_1X4__SCALAR, k_eq_1_subtile) {
43333 for (uint32_t m = 1; m <= 1; m++) {
43334 for (uint32_t n = 1; n <= 4; n++) {
43335 GemmMicrokernelTester()
43336 .mr(1)
43337 .nr(4)
43338 .kr(1)
43339 .sr(1)
43340 .m(m)
43341 .n(n)
43342 .k(1)
43343 .iterations(1)
43344 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43345 }
43346 }
43347}
43348
43349TEST(F32_GEMM_1X4__SCALAR, k_eq_1_subtile_m) {
43350 for (uint32_t m = 1; m <= 1; m++) {
43351 GemmMicrokernelTester()
43352 .mr(1)
43353 .nr(4)
43354 .kr(1)
43355 .sr(1)
43356 .m(m)
43357 .n(4)
43358 .k(1)
43359 .iterations(1)
43360 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43361 }
43362}
43363
43364TEST(F32_GEMM_1X4__SCALAR, k_eq_1_subtile_n) {
43365 for (uint32_t n = 1; n <= 4; n++) {
43366 GemmMicrokernelTester()
43367 .mr(1)
43368 .nr(4)
43369 .kr(1)
43370 .sr(1)
43371 .m(1)
43372 .n(n)
43373 .k(1)
43374 .iterations(1)
43375 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43376 }
43377}
43378
43379TEST(F32_GEMM_1X4__SCALAR, k_gt_1) {
43380 for (size_t k = 2; k < 10; k++) {
43381 GemmMicrokernelTester()
43382 .mr(1)
43383 .nr(4)
43384 .kr(1)
43385 .sr(1)
43386 .m(1)
43387 .n(4)
43388 .k(k)
43389 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43390 }
43391}
43392
43393TEST(F32_GEMM_1X4__SCALAR, k_gt_1_strided_a) {
43394 for (size_t k = 2; k < 10; k++) {
43395 GemmMicrokernelTester()
43396 .mr(1)
43397 .nr(4)
43398 .kr(1)
43399 .sr(1)
43400 .m(1)
43401 .n(4)
43402 .k(k)
43403 .a_stride(11)
43404 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43405 }
43406}
43407
43408TEST(F32_GEMM_1X4__SCALAR, k_gt_1_subtile) {
43409 for (size_t k = 2; k < 10; k++) {
43410 for (uint32_t m = 1; m <= 1; m++) {
43411 for (uint32_t n = 1; n <= 4; n++) {
43412 GemmMicrokernelTester()
43413 .mr(1)
43414 .nr(4)
43415 .kr(1)
43416 .sr(1)
43417 .m(m)
43418 .n(n)
43419 .k(k)
43420 .iterations(1)
43421 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43422 }
43423 }
43424 }
43425}
43426
43427TEST(F32_GEMM_1X4__SCALAR, n_gt_4) {
43428 for (uint32_t n = 5; n < 8; n++) {
43429 for (size_t k = 1; k <= 5; k += 2) {
43430 GemmMicrokernelTester()
43431 .mr(1)
43432 .nr(4)
43433 .kr(1)
43434 .sr(1)
43435 .m(1)
43436 .n(4)
43437 .k(k)
43438 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43439 }
43440 }
43441}
43442
43443TEST(F32_GEMM_1X4__SCALAR, n_gt_4_strided_cn) {
43444 for (uint32_t n = 5; n < 8; n++) {
43445 for (size_t k = 1; k <= 5; k += 2) {
43446 GemmMicrokernelTester()
43447 .mr(1)
43448 .nr(4)
43449 .kr(1)
43450 .sr(1)
43451 .m(1)
43452 .n(4)
43453 .k(k)
43454 .cn_stride(7)
43455 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43456 }
43457 }
43458}
43459
43460TEST(F32_GEMM_1X4__SCALAR, n_gt_4_strided_a) {
43461 for (uint32_t n = 5; n < 8; n++) {
43462 for (size_t k = 1; k <= 5; k += 2) {
43463 GemmMicrokernelTester()
43464 .mr(1)
43465 .nr(4)
43466 .kr(1)
43467 .sr(1)
43468 .m(1)
43469 .n(n)
43470 .k(k)
43471 .a_stride(7)
43472 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43473 }
43474 }
43475}
43476
43477TEST(F32_GEMM_1X4__SCALAR, n_gt_4_subtile) {
43478 for (uint32_t n = 5; n < 8; n++) {
43479 for (size_t k = 1; k <= 5; k += 2) {
43480 for (uint32_t m = 1; m <= 1; m++) {
43481 GemmMicrokernelTester()
43482 .mr(1)
43483 .nr(4)
43484 .kr(1)
43485 .sr(1)
43486 .m(m)
43487 .n(n)
43488 .k(k)
43489 .iterations(1)
43490 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43491 }
43492 }
43493 }
43494}
43495
43496TEST(F32_GEMM_1X4__SCALAR, n_div_4) {
43497 for (uint32_t n = 8; n <= 12; n += 4) {
43498 for (size_t k = 1; k <= 5; k += 2) {
43499 GemmMicrokernelTester()
43500 .mr(1)
43501 .nr(4)
43502 .kr(1)
43503 .sr(1)
43504 .m(1)
43505 .n(4)
43506 .k(k)
43507 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43508 }
43509 }
43510}
43511
43512TEST(F32_GEMM_1X4__SCALAR, n_div_4_strided_cn) {
43513 for (uint32_t n = 8; n <= 12; n += 4) {
43514 for (size_t k = 1; k <= 5; k += 2) {
43515 GemmMicrokernelTester()
43516 .mr(1)
43517 .nr(4)
43518 .kr(1)
43519 .sr(1)
43520 .m(1)
43521 .n(n)
43522 .k(k)
43523 .cn_stride(7)
43524 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43525 }
43526 }
43527}
43528
43529TEST(F32_GEMM_1X4__SCALAR, n_div_4_strided_a) {
43530 for (uint32_t n = 8; n <= 12; n += 4) {
43531 for (size_t k = 1; k <= 5; k += 2) {
43532 GemmMicrokernelTester()
43533 .mr(1)
43534 .nr(4)
43535 .kr(1)
43536 .sr(1)
43537 .m(1)
43538 .n(n)
43539 .k(k)
43540 .a_stride(7)
43541 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43542 }
43543 }
43544}
43545
43546TEST(F32_GEMM_1X4__SCALAR, n_div_4_subtile) {
43547 for (uint32_t n = 8; n <= 12; n += 4) {
43548 for (size_t k = 1; k <= 5; k += 2) {
43549 for (uint32_t m = 1; m <= 1; m++) {
43550 GemmMicrokernelTester()
43551 .mr(1)
43552 .nr(4)
43553 .kr(1)
43554 .sr(1)
43555 .m(m)
43556 .n(n)
43557 .k(k)
43558 .iterations(1)
43559 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43560 }
43561 }
43562 }
43563}
43564
43565TEST(F32_GEMM_1X4__SCALAR, strided_cm_subtile) {
43566 for (size_t k = 1; k <= 5; k += 2) {
43567 for (uint32_t m = 1; m <= 1; m++) {
43568 for (uint32_t n = 1; n <= 4; n++) {
43569 GemmMicrokernelTester()
43570 .mr(1)
43571 .nr(4)
43572 .kr(1)
43573 .sr(1)
43574 .m(m)
43575 .n(n)
43576 .k(k)
43577 .cm_stride(7)
43578 .iterations(1)
43579 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43580 }
43581 }
43582 }
43583}
43584
43585TEST(F32_GEMM_1X4__SCALAR, qmin) {
43586 GemmMicrokernelTester()
43587 .mr(1)
43588 .nr(4)
43589 .kr(1)
43590 .sr(1)
43591 .m(1)
43592 .n(4)
43593 .k(1)
43594 .qmin(128)
43595 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43596}
43597
43598TEST(F32_GEMM_1X4__SCALAR, qmax) {
43599 GemmMicrokernelTester()
43600 .mr(1)
43601 .nr(4)
43602 .kr(1)
43603 .sr(1)
43604 .m(1)
43605 .n(4)
43606 .k(1)
43607 .qmax(128)
43608 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43609}
43610
43611TEST(F32_GEMM_1X4__SCALAR, strided_cm) {
43612 GemmMicrokernelTester()
43613 .mr(1)
43614 .nr(4)
43615 .kr(1)
43616 .sr(1)
43617 .m(1)
43618 .n(4)
43619 .k(1)
43620 .cm_stride(7)
43621 .Test(xnn_f32_gemm_ukernel_1x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43622}
43623
43624
43625TEST(F32_GEMM_2X4__SCALAR, k_eq_1) {
43626 GemmMicrokernelTester()
43627 .mr(2)
43628 .nr(4)
43629 .kr(1)
43630 .sr(1)
43631 .m(2)
43632 .n(4)
43633 .k(1)
43634 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43635}
43636
43637TEST(F32_GEMM_2X4__SCALAR, strided_cn) {
43638 GemmMicrokernelTester()
43639 .mr(2)
43640 .nr(4)
43641 .kr(1)
43642 .sr(1)
43643 .m(2)
43644 .n(4)
43645 .k(1)
43646 .cn_stride(7)
43647 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43648}
43649
43650TEST(F32_GEMM_2X4__SCALAR, k_eq_1_strided_a) {
43651 GemmMicrokernelTester()
43652 .mr(2)
43653 .nr(4)
43654 .kr(1)
43655 .sr(1)
43656 .m(2)
43657 .n(4)
43658 .k(1)
43659 .a_stride(3)
43660 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43661}
43662
43663TEST(F32_GEMM_2X4__SCALAR, k_eq_1_subtile) {
43664 for (uint32_t m = 1; m <= 2; m++) {
43665 for (uint32_t n = 1; n <= 4; n++) {
43666 GemmMicrokernelTester()
43667 .mr(2)
43668 .nr(4)
43669 .kr(1)
43670 .sr(1)
43671 .m(m)
43672 .n(n)
43673 .k(1)
43674 .iterations(1)
43675 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43676 }
43677 }
43678}
43679
43680TEST(F32_GEMM_2X4__SCALAR, k_eq_1_subtile_m) {
43681 for (uint32_t m = 1; m <= 2; m++) {
43682 GemmMicrokernelTester()
43683 .mr(2)
43684 .nr(4)
43685 .kr(1)
43686 .sr(1)
43687 .m(m)
43688 .n(4)
43689 .k(1)
43690 .iterations(1)
43691 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43692 }
43693}
43694
43695TEST(F32_GEMM_2X4__SCALAR, k_eq_1_subtile_n) {
43696 for (uint32_t n = 1; n <= 4; n++) {
43697 GemmMicrokernelTester()
43698 .mr(2)
43699 .nr(4)
43700 .kr(1)
43701 .sr(1)
43702 .m(2)
43703 .n(n)
43704 .k(1)
43705 .iterations(1)
43706 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43707 }
43708}
43709
43710TEST(F32_GEMM_2X4__SCALAR, k_gt_1) {
43711 for (size_t k = 2; k < 10; k++) {
43712 GemmMicrokernelTester()
43713 .mr(2)
43714 .nr(4)
43715 .kr(1)
43716 .sr(1)
43717 .m(2)
43718 .n(4)
43719 .k(k)
43720 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43721 }
43722}
43723
43724TEST(F32_GEMM_2X4__SCALAR, k_gt_1_strided_a) {
43725 for (size_t k = 2; k < 10; k++) {
43726 GemmMicrokernelTester()
43727 .mr(2)
43728 .nr(4)
43729 .kr(1)
43730 .sr(1)
43731 .m(2)
43732 .n(4)
43733 .k(k)
43734 .a_stride(11)
43735 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43736 }
43737}
43738
43739TEST(F32_GEMM_2X4__SCALAR, k_gt_1_subtile) {
43740 for (size_t k = 2; k < 10; k++) {
43741 for (uint32_t m = 1; m <= 2; m++) {
43742 for (uint32_t n = 1; n <= 4; n++) {
43743 GemmMicrokernelTester()
43744 .mr(2)
43745 .nr(4)
43746 .kr(1)
43747 .sr(1)
43748 .m(m)
43749 .n(n)
43750 .k(k)
43751 .iterations(1)
43752 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43753 }
43754 }
43755 }
43756}
43757
43758TEST(F32_GEMM_2X4__SCALAR, n_gt_4) {
43759 for (uint32_t n = 5; n < 8; n++) {
43760 for (size_t k = 1; k <= 5; k += 2) {
43761 GemmMicrokernelTester()
43762 .mr(2)
43763 .nr(4)
43764 .kr(1)
43765 .sr(1)
43766 .m(2)
43767 .n(4)
43768 .k(k)
43769 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43770 }
43771 }
43772}
43773
43774TEST(F32_GEMM_2X4__SCALAR, n_gt_4_strided_cn) {
43775 for (uint32_t n = 5; n < 8; n++) {
43776 for (size_t k = 1; k <= 5; k += 2) {
43777 GemmMicrokernelTester()
43778 .mr(2)
43779 .nr(4)
43780 .kr(1)
43781 .sr(1)
43782 .m(2)
43783 .n(4)
43784 .k(k)
43785 .cn_stride(7)
43786 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43787 }
43788 }
43789}
43790
43791TEST(F32_GEMM_2X4__SCALAR, n_gt_4_strided_a) {
43792 for (uint32_t n = 5; n < 8; n++) {
43793 for (size_t k = 1; k <= 5; k += 2) {
43794 GemmMicrokernelTester()
43795 .mr(2)
43796 .nr(4)
43797 .kr(1)
43798 .sr(1)
43799 .m(2)
43800 .n(n)
43801 .k(k)
43802 .a_stride(7)
43803 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43804 }
43805 }
43806}
43807
43808TEST(F32_GEMM_2X4__SCALAR, n_gt_4_subtile) {
43809 for (uint32_t n = 5; n < 8; n++) {
43810 for (size_t k = 1; k <= 5; k += 2) {
43811 for (uint32_t m = 1; m <= 2; m++) {
43812 GemmMicrokernelTester()
43813 .mr(2)
43814 .nr(4)
43815 .kr(1)
43816 .sr(1)
43817 .m(m)
43818 .n(n)
43819 .k(k)
43820 .iterations(1)
43821 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43822 }
43823 }
43824 }
43825}
43826
43827TEST(F32_GEMM_2X4__SCALAR, n_div_4) {
43828 for (uint32_t n = 8; n <= 12; n += 4) {
43829 for (size_t k = 1; k <= 5; k += 2) {
43830 GemmMicrokernelTester()
43831 .mr(2)
43832 .nr(4)
43833 .kr(1)
43834 .sr(1)
43835 .m(2)
43836 .n(4)
43837 .k(k)
43838 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43839 }
43840 }
43841}
43842
43843TEST(F32_GEMM_2X4__SCALAR, n_div_4_strided_cn) {
43844 for (uint32_t n = 8; n <= 12; n += 4) {
43845 for (size_t k = 1; k <= 5; k += 2) {
43846 GemmMicrokernelTester()
43847 .mr(2)
43848 .nr(4)
43849 .kr(1)
43850 .sr(1)
43851 .m(2)
43852 .n(n)
43853 .k(k)
43854 .cn_stride(7)
43855 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43856 }
43857 }
43858}
43859
43860TEST(F32_GEMM_2X4__SCALAR, n_div_4_strided_a) {
43861 for (uint32_t n = 8; n <= 12; n += 4) {
43862 for (size_t k = 1; k <= 5; k += 2) {
43863 GemmMicrokernelTester()
43864 .mr(2)
43865 .nr(4)
43866 .kr(1)
43867 .sr(1)
43868 .m(2)
43869 .n(n)
43870 .k(k)
43871 .a_stride(7)
43872 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43873 }
43874 }
43875}
43876
43877TEST(F32_GEMM_2X4__SCALAR, n_div_4_subtile) {
43878 for (uint32_t n = 8; n <= 12; n += 4) {
43879 for (size_t k = 1; k <= 5; k += 2) {
43880 for (uint32_t m = 1; m <= 2; m++) {
43881 GemmMicrokernelTester()
43882 .mr(2)
43883 .nr(4)
43884 .kr(1)
43885 .sr(1)
43886 .m(m)
43887 .n(n)
43888 .k(k)
43889 .iterations(1)
43890 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43891 }
43892 }
43893 }
43894}
43895
43896TEST(F32_GEMM_2X4__SCALAR, strided_cm_subtile) {
43897 for (size_t k = 1; k <= 5; k += 2) {
43898 for (uint32_t m = 1; m <= 2; m++) {
43899 for (uint32_t n = 1; n <= 4; n++) {
43900 GemmMicrokernelTester()
43901 .mr(2)
43902 .nr(4)
43903 .kr(1)
43904 .sr(1)
43905 .m(m)
43906 .n(n)
43907 .k(k)
43908 .cm_stride(7)
43909 .iterations(1)
43910 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43911 }
43912 }
43913 }
43914}
43915
43916TEST(F32_GEMM_2X4__SCALAR, qmin) {
43917 GemmMicrokernelTester()
43918 .mr(2)
43919 .nr(4)
43920 .kr(1)
43921 .sr(1)
43922 .m(2)
43923 .n(4)
43924 .k(1)
43925 .qmin(128)
43926 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43927}
43928
43929TEST(F32_GEMM_2X4__SCALAR, qmax) {
43930 GemmMicrokernelTester()
43931 .mr(2)
43932 .nr(4)
43933 .kr(1)
43934 .sr(1)
43935 .m(2)
43936 .n(4)
43937 .k(1)
43938 .qmax(128)
43939 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43940}
43941
43942TEST(F32_GEMM_2X4__SCALAR, strided_cm) {
43943 GemmMicrokernelTester()
43944 .mr(2)
43945 .nr(4)
43946 .kr(1)
43947 .sr(1)
43948 .m(2)
43949 .n(4)
43950 .k(1)
43951 .cm_stride(7)
43952 .Test(xnn_f32_gemm_ukernel_2x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43953}
43954
43955
43956TEST(F32_GEMM_4X4__SCALAR, k_eq_1) {
43957 GemmMicrokernelTester()
43958 .mr(4)
43959 .nr(4)
43960 .kr(1)
43961 .sr(1)
43962 .m(4)
43963 .n(4)
43964 .k(1)
43965 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43966}
43967
43968TEST(F32_GEMM_4X4__SCALAR, strided_cn) {
43969 GemmMicrokernelTester()
43970 .mr(4)
43971 .nr(4)
43972 .kr(1)
43973 .sr(1)
43974 .m(4)
43975 .n(4)
43976 .k(1)
43977 .cn_stride(7)
43978 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43979}
43980
43981TEST(F32_GEMM_4X4__SCALAR, k_eq_1_strided_a) {
43982 GemmMicrokernelTester()
43983 .mr(4)
43984 .nr(4)
43985 .kr(1)
43986 .sr(1)
43987 .m(4)
43988 .n(4)
43989 .k(1)
43990 .a_stride(3)
43991 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
43992}
43993
43994TEST(F32_GEMM_4X4__SCALAR, k_eq_1_subtile) {
43995 for (uint32_t m = 1; m <= 4; m++) {
43996 for (uint32_t n = 1; n <= 4; n++) {
43997 GemmMicrokernelTester()
43998 .mr(4)
43999 .nr(4)
44000 .kr(1)
44001 .sr(1)
44002 .m(m)
44003 .n(n)
44004 .k(1)
44005 .iterations(1)
44006 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44007 }
44008 }
44009}
44010
44011TEST(F32_GEMM_4X4__SCALAR, k_eq_1_subtile_m) {
44012 for (uint32_t m = 1; m <= 4; m++) {
44013 GemmMicrokernelTester()
44014 .mr(4)
44015 .nr(4)
44016 .kr(1)
44017 .sr(1)
44018 .m(m)
44019 .n(4)
44020 .k(1)
44021 .iterations(1)
44022 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44023 }
44024}
44025
44026TEST(F32_GEMM_4X4__SCALAR, k_eq_1_subtile_n) {
44027 for (uint32_t n = 1; n <= 4; n++) {
44028 GemmMicrokernelTester()
44029 .mr(4)
44030 .nr(4)
44031 .kr(1)
44032 .sr(1)
44033 .m(4)
44034 .n(n)
44035 .k(1)
44036 .iterations(1)
44037 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44038 }
44039}
44040
44041TEST(F32_GEMM_4X4__SCALAR, k_gt_1) {
44042 for (size_t k = 2; k < 10; k++) {
44043 GemmMicrokernelTester()
44044 .mr(4)
44045 .nr(4)
44046 .kr(1)
44047 .sr(1)
44048 .m(4)
44049 .n(4)
44050 .k(k)
44051 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44052 }
44053}
44054
44055TEST(F32_GEMM_4X4__SCALAR, k_gt_1_strided_a) {
44056 for (size_t k = 2; k < 10; k++) {
44057 GemmMicrokernelTester()
44058 .mr(4)
44059 .nr(4)
44060 .kr(1)
44061 .sr(1)
44062 .m(4)
44063 .n(4)
44064 .k(k)
44065 .a_stride(11)
44066 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44067 }
44068}
44069
44070TEST(F32_GEMM_4X4__SCALAR, k_gt_1_subtile) {
44071 for (size_t k = 2; k < 10; k++) {
44072 for (uint32_t m = 1; m <= 4; m++) {
44073 for (uint32_t n = 1; n <= 4; n++) {
44074 GemmMicrokernelTester()
44075 .mr(4)
44076 .nr(4)
44077 .kr(1)
44078 .sr(1)
44079 .m(m)
44080 .n(n)
44081 .k(k)
44082 .iterations(1)
44083 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44084 }
44085 }
44086 }
44087}
44088
44089TEST(F32_GEMM_4X4__SCALAR, n_gt_4) {
44090 for (uint32_t n = 5; n < 8; n++) {
44091 for (size_t k = 1; k <= 5; k += 2) {
44092 GemmMicrokernelTester()
44093 .mr(4)
44094 .nr(4)
44095 .kr(1)
44096 .sr(1)
44097 .m(4)
44098 .n(4)
44099 .k(k)
44100 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44101 }
44102 }
44103}
44104
44105TEST(F32_GEMM_4X4__SCALAR, n_gt_4_strided_cn) {
44106 for (uint32_t n = 5; n < 8; n++) {
44107 for (size_t k = 1; k <= 5; k += 2) {
44108 GemmMicrokernelTester()
44109 .mr(4)
44110 .nr(4)
44111 .kr(1)
44112 .sr(1)
44113 .m(4)
44114 .n(4)
44115 .k(k)
44116 .cn_stride(7)
44117 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44118 }
44119 }
44120}
44121
44122TEST(F32_GEMM_4X4__SCALAR, n_gt_4_strided_a) {
44123 for (uint32_t n = 5; n < 8; n++) {
44124 for (size_t k = 1; k <= 5; k += 2) {
44125 GemmMicrokernelTester()
44126 .mr(4)
44127 .nr(4)
44128 .kr(1)
44129 .sr(1)
44130 .m(4)
44131 .n(n)
44132 .k(k)
44133 .a_stride(7)
44134 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44135 }
44136 }
44137}
44138
44139TEST(F32_GEMM_4X4__SCALAR, n_gt_4_subtile) {
44140 for (uint32_t n = 5; n < 8; n++) {
44141 for (size_t k = 1; k <= 5; k += 2) {
44142 for (uint32_t m = 1; m <= 4; m++) {
44143 GemmMicrokernelTester()
44144 .mr(4)
44145 .nr(4)
44146 .kr(1)
44147 .sr(1)
44148 .m(m)
44149 .n(n)
44150 .k(k)
44151 .iterations(1)
44152 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44153 }
44154 }
44155 }
44156}
44157
44158TEST(F32_GEMM_4X4__SCALAR, n_div_4) {
44159 for (uint32_t n = 8; n <= 12; n += 4) {
44160 for (size_t k = 1; k <= 5; k += 2) {
44161 GemmMicrokernelTester()
44162 .mr(4)
44163 .nr(4)
44164 .kr(1)
44165 .sr(1)
44166 .m(4)
44167 .n(4)
44168 .k(k)
44169 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44170 }
44171 }
44172}
44173
44174TEST(F32_GEMM_4X4__SCALAR, n_div_4_strided_cn) {
44175 for (uint32_t n = 8; n <= 12; n += 4) {
44176 for (size_t k = 1; k <= 5; k += 2) {
44177 GemmMicrokernelTester()
44178 .mr(4)
44179 .nr(4)
44180 .kr(1)
44181 .sr(1)
44182 .m(4)
44183 .n(n)
44184 .k(k)
44185 .cn_stride(7)
44186 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44187 }
44188 }
44189}
44190
44191TEST(F32_GEMM_4X4__SCALAR, n_div_4_strided_a) {
44192 for (uint32_t n = 8; n <= 12; n += 4) {
44193 for (size_t k = 1; k <= 5; k += 2) {
44194 GemmMicrokernelTester()
44195 .mr(4)
44196 .nr(4)
44197 .kr(1)
44198 .sr(1)
44199 .m(4)
44200 .n(n)
44201 .k(k)
44202 .a_stride(7)
44203 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44204 }
44205 }
44206}
44207
44208TEST(F32_GEMM_4X4__SCALAR, n_div_4_subtile) {
44209 for (uint32_t n = 8; n <= 12; n += 4) {
44210 for (size_t k = 1; k <= 5; k += 2) {
44211 for (uint32_t m = 1; m <= 4; m++) {
44212 GemmMicrokernelTester()
44213 .mr(4)
44214 .nr(4)
44215 .kr(1)
44216 .sr(1)
44217 .m(m)
44218 .n(n)
44219 .k(k)
44220 .iterations(1)
44221 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44222 }
44223 }
44224 }
44225}
44226
44227TEST(F32_GEMM_4X4__SCALAR, strided_cm_subtile) {
44228 for (size_t k = 1; k <= 5; k += 2) {
44229 for (uint32_t m = 1; m <= 4; m++) {
44230 for (uint32_t n = 1; n <= 4; n++) {
44231 GemmMicrokernelTester()
44232 .mr(4)
44233 .nr(4)
44234 .kr(1)
44235 .sr(1)
44236 .m(m)
44237 .n(n)
44238 .k(k)
44239 .cm_stride(7)
44240 .iterations(1)
44241 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44242 }
44243 }
44244 }
44245}
44246
44247TEST(F32_GEMM_4X4__SCALAR, qmin) {
44248 GemmMicrokernelTester()
44249 .mr(4)
44250 .nr(4)
44251 .kr(1)
44252 .sr(1)
44253 .m(4)
44254 .n(4)
44255 .k(1)
44256 .qmin(128)
44257 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44258}
44259
44260TEST(F32_GEMM_4X4__SCALAR, qmax) {
44261 GemmMicrokernelTester()
44262 .mr(4)
44263 .nr(4)
44264 .kr(1)
44265 .sr(1)
44266 .m(4)
44267 .n(4)
44268 .k(1)
44269 .qmax(128)
44270 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44271}
44272
44273TEST(F32_GEMM_4X4__SCALAR, strided_cm) {
44274 GemmMicrokernelTester()
44275 .mr(4)
44276 .nr(4)
44277 .kr(1)
44278 .sr(1)
44279 .m(4)
44280 .n(4)
44281 .k(1)
44282 .cm_stride(7)
44283 .Test(xnn_f32_gemm_ukernel_4x4__scalar, GemmMicrokernelTester::Variant::Scalar);
44284}
44285
44286
44287TEST(F32_GEMM_4X2__SCALAR, k_eq_1) {
44288 GemmMicrokernelTester()
44289 .mr(4)
44290 .nr(2)
44291 .kr(1)
44292 .sr(1)
44293 .m(4)
44294 .n(2)
44295 .k(1)
44296 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44297}
44298
44299TEST(F32_GEMM_4X2__SCALAR, strided_cn) {
44300 GemmMicrokernelTester()
44301 .mr(4)
44302 .nr(2)
44303 .kr(1)
44304 .sr(1)
44305 .m(4)
44306 .n(2)
44307 .k(1)
44308 .cn_stride(5)
44309 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44310}
44311
44312TEST(F32_GEMM_4X2__SCALAR, k_eq_1_strided_a) {
44313 GemmMicrokernelTester()
44314 .mr(4)
44315 .nr(2)
44316 .kr(1)
44317 .sr(1)
44318 .m(4)
44319 .n(2)
44320 .k(1)
44321 .a_stride(3)
44322 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44323}
44324
44325TEST(F32_GEMM_4X2__SCALAR, k_eq_1_subtile) {
44326 for (uint32_t m = 1; m <= 4; m++) {
44327 for (uint32_t n = 1; n <= 2; n++) {
44328 GemmMicrokernelTester()
44329 .mr(4)
44330 .nr(2)
44331 .kr(1)
44332 .sr(1)
44333 .m(m)
44334 .n(n)
44335 .k(1)
44336 .iterations(1)
44337 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44338 }
44339 }
44340}
44341
44342TEST(F32_GEMM_4X2__SCALAR, k_eq_1_subtile_m) {
44343 for (uint32_t m = 1; m <= 4; m++) {
44344 GemmMicrokernelTester()
44345 .mr(4)
44346 .nr(2)
44347 .kr(1)
44348 .sr(1)
44349 .m(m)
44350 .n(2)
44351 .k(1)
44352 .iterations(1)
44353 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44354 }
44355}
44356
44357TEST(F32_GEMM_4X2__SCALAR, k_eq_1_subtile_n) {
44358 for (uint32_t n = 1; n <= 2; n++) {
44359 GemmMicrokernelTester()
44360 .mr(4)
44361 .nr(2)
44362 .kr(1)
44363 .sr(1)
44364 .m(4)
44365 .n(n)
44366 .k(1)
44367 .iterations(1)
44368 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44369 }
44370}
44371
44372TEST(F32_GEMM_4X2__SCALAR, k_gt_1) {
44373 for (size_t k = 2; k < 10; k++) {
44374 GemmMicrokernelTester()
44375 .mr(4)
44376 .nr(2)
44377 .kr(1)
44378 .sr(1)
44379 .m(4)
44380 .n(2)
44381 .k(k)
44382 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44383 }
44384}
44385
44386TEST(F32_GEMM_4X2__SCALAR, k_gt_1_strided_a) {
44387 for (size_t k = 2; k < 10; k++) {
44388 GemmMicrokernelTester()
44389 .mr(4)
44390 .nr(2)
44391 .kr(1)
44392 .sr(1)
44393 .m(4)
44394 .n(2)
44395 .k(k)
44396 .a_stride(11)
44397 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44398 }
44399}
44400
44401TEST(F32_GEMM_4X2__SCALAR, k_gt_1_subtile) {
44402 for (size_t k = 2; k < 10; k++) {
44403 for (uint32_t m = 1; m <= 4; m++) {
44404 for (uint32_t n = 1; n <= 2; n++) {
44405 GemmMicrokernelTester()
44406 .mr(4)
44407 .nr(2)
44408 .kr(1)
44409 .sr(1)
44410 .m(m)
44411 .n(n)
44412 .k(k)
44413 .iterations(1)
44414 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44415 }
44416 }
44417 }
44418}
44419
44420TEST(F32_GEMM_4X2__SCALAR, n_gt_2) {
44421 for (uint32_t n = 3; n < 4; n++) {
44422 for (size_t k = 1; k <= 5; k += 2) {
44423 GemmMicrokernelTester()
44424 .mr(4)
44425 .nr(2)
44426 .kr(1)
44427 .sr(1)
44428 .m(4)
44429 .n(2)
44430 .k(k)
44431 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44432 }
44433 }
44434}
44435
44436TEST(F32_GEMM_4X2__SCALAR, n_gt_2_strided_cn) {
44437 for (uint32_t n = 3; n < 4; n++) {
44438 for (size_t k = 1; k <= 5; k += 2) {
44439 GemmMicrokernelTester()
44440 .mr(4)
44441 .nr(2)
44442 .kr(1)
44443 .sr(1)
44444 .m(4)
44445 .n(2)
44446 .k(k)
44447 .cn_stride(5)
44448 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44449 }
44450 }
44451}
44452
44453TEST(F32_GEMM_4X2__SCALAR, n_gt_2_strided_a) {
44454 for (uint32_t n = 3; n < 4; n++) {
44455 for (size_t k = 1; k <= 5; k += 2) {
44456 GemmMicrokernelTester()
44457 .mr(4)
44458 .nr(2)
44459 .kr(1)
44460 .sr(1)
44461 .m(4)
44462 .n(n)
44463 .k(k)
44464 .a_stride(7)
44465 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44466 }
44467 }
44468}
44469
44470TEST(F32_GEMM_4X2__SCALAR, n_gt_2_subtile) {
44471 for (uint32_t n = 3; n < 4; n++) {
44472 for (size_t k = 1; k <= 5; k += 2) {
44473 for (uint32_t m = 1; m <= 4; m++) {
44474 GemmMicrokernelTester()
44475 .mr(4)
44476 .nr(2)
44477 .kr(1)
44478 .sr(1)
44479 .m(m)
44480 .n(n)
44481 .k(k)
44482 .iterations(1)
44483 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44484 }
44485 }
44486 }
44487}
44488
44489TEST(F32_GEMM_4X2__SCALAR, n_div_2) {
44490 for (uint32_t n = 4; n <= 6; n += 2) {
44491 for (size_t k = 1; k <= 5; k += 2) {
44492 GemmMicrokernelTester()
44493 .mr(4)
44494 .nr(2)
44495 .kr(1)
44496 .sr(1)
44497 .m(4)
44498 .n(2)
44499 .k(k)
44500 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44501 }
44502 }
44503}
44504
44505TEST(F32_GEMM_4X2__SCALAR, n_div_2_strided_cn) {
44506 for (uint32_t n = 4; n <= 6; n += 2) {
44507 for (size_t k = 1; k <= 5; k += 2) {
44508 GemmMicrokernelTester()
44509 .mr(4)
44510 .nr(2)
44511 .kr(1)
44512 .sr(1)
44513 .m(4)
44514 .n(n)
44515 .k(k)
44516 .cn_stride(5)
44517 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44518 }
44519 }
44520}
44521
44522TEST(F32_GEMM_4X2__SCALAR, n_div_2_strided_a) {
44523 for (uint32_t n = 4; n <= 6; n += 2) {
44524 for (size_t k = 1; k <= 5; k += 2) {
44525 GemmMicrokernelTester()
44526 .mr(4)
44527 .nr(2)
44528 .kr(1)
44529 .sr(1)
44530 .m(4)
44531 .n(n)
44532 .k(k)
44533 .a_stride(7)
44534 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44535 }
44536 }
44537}
44538
44539TEST(F32_GEMM_4X2__SCALAR, n_div_2_subtile) {
44540 for (uint32_t n = 4; n <= 6; n += 2) {
44541 for (size_t k = 1; k <= 5; k += 2) {
44542 for (uint32_t m = 1; m <= 4; m++) {
44543 GemmMicrokernelTester()
44544 .mr(4)
44545 .nr(2)
44546 .kr(1)
44547 .sr(1)
44548 .m(m)
44549 .n(n)
44550 .k(k)
44551 .iterations(1)
44552 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44553 }
44554 }
44555 }
44556}
44557
44558TEST(F32_GEMM_4X2__SCALAR, strided_cm_subtile) {
44559 for (size_t k = 1; k <= 5; k += 2) {
44560 for (uint32_t m = 1; m <= 4; m++) {
44561 for (uint32_t n = 1; n <= 2; n++) {
44562 GemmMicrokernelTester()
44563 .mr(4)
44564 .nr(2)
44565 .kr(1)
44566 .sr(1)
44567 .m(m)
44568 .n(n)
44569 .k(k)
44570 .cm_stride(5)
44571 .iterations(1)
44572 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44573 }
44574 }
44575 }
44576}
44577
44578TEST(F32_GEMM_4X2__SCALAR, qmin) {
44579 GemmMicrokernelTester()
44580 .mr(4)
44581 .nr(2)
44582 .kr(1)
44583 .sr(1)
44584 .m(4)
44585 .n(2)
44586 .k(1)
44587 .qmin(128)
44588 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44589}
44590
44591TEST(F32_GEMM_4X2__SCALAR, qmax) {
44592 GemmMicrokernelTester()
44593 .mr(4)
44594 .nr(2)
44595 .kr(1)
44596 .sr(1)
44597 .m(4)
44598 .n(2)
44599 .k(1)
44600 .qmax(128)
44601 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44602}
44603
44604TEST(F32_GEMM_4X2__SCALAR, strided_cm) {
44605 GemmMicrokernelTester()
44606 .mr(4)
44607 .nr(2)
44608 .kr(1)
44609 .sr(1)
44610 .m(4)
44611 .n(2)
44612 .k(1)
44613 .cm_stride(5)
44614 .Test(xnn_f32_gemm_ukernel_4x2__scalar, GemmMicrokernelTester::Variant::Scalar);
44615}